disclosure-bureau/scripts/23-smart-dedup.py

282 lines
10 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
23-smart-dedup.py — Phase 1 + Phase 3: aggressive entity cleanup.
Removes garbage entities that Haiku/extraction over-promoted:
A. Stop-list filter — single-mention bare common nouns
B. Substring/alias dedup — "FBI" vs "F-B-I" vs "Federal Bureau of Investigation"
C. Compound-name detection — entities A+B that co-occur ≥3 pages → suggest merge
D. Title-prefix recovery — "Chief Tereoken" appearing in raw page text but
dedup created only "tereoken" + "chief"
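Runs in one of three modes (exactly one of these flags must be given; there is no default):
--dry-run → report what would be deleted/merged, no writes
--apply → applies deletes (A) and alias merges (B); removing orphans and updating
affected page.md files to substitute merged names is not implemented in this script
--report-compounds → print the compound-name candidates from (C) and exit
(C) is report-only: candidates are suggested, never merged, since merging them can be aggressive.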
Usage:
./23-smart-dedup.py --dry-run
./23-smart-dedup.py --apply
"""
from __future__ import annotations
import argparse
import re
import sys
import unicodedata
from collections import Counter, defaultdict
from pathlib import Path
try:
import yaml
except ImportError:
sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)
# Hard-coded absolute root of the workspace; every path below is derived from it.
UFO_ROOT = Path("/Users/guto/ufo")
# Entity files, globbed as "<class-dir>/<entity>.md"; the directory name is used as the entity class.
ENTITIES = UFO_ROOT / "wiki" / "entities"
# Page files, globbed as "<dir>/p*.md"; their front matter carries `entities_extracted`.
PAGES = UFO_ROOT / "wiki" / "pages"
# Wiki log file. NOTE(review): not referenced by any code visible in this file.
LOG = UFO_ROOT / "wiki" / "log.md"
# Common-noun stop list — these have no value as standalone entities.
# is_trivial() matches the normalized (lower-cased, accent-stripped) name against this
# set. filter_low_signal() deletes a matching entity only when total_mentions <= 2;
# a stop noun that is mentioned more often is kept — it's a real referent.
STOP_NOUNS = {
    # Roles / positions
    "agent", "agents", "officer", "officers", "chief", "captain", "general", "major",
    "colonel", "sergeant", "commander", "director", "secretary", "lieutenant", "lcdr",
    "cdr", "lt", "lt.", "cpl", "sgt", "supervisor", "inspector",
    # Generic structures
    "base", "office", "department", "bureau", "agency", "division", "section",
    "headquarters", "command", "post", "station", "branch", "unit", "group",
    "the", "the bureau", "the agency", "the department", "the office", "the file",
    # File / document terms
    "file", "files", "memo", "memorandum", "letter", "report", "form", "page", "pages",
    "subject", "date", "time", "case", "exhibit", "attachment", "enclosure",
    "signature", "stamp", "carbon copy", "cc", "ref", "reference", "annex",
    "envelope", "bag", "folder", "transmittal", "routing", "dispatch",
    # Generic descriptors
    "the inspector", "the agent", "the officer", "the witness", "the observer",
    "the subject", "the man", "the woman", "the pilot", "the operator",
    # Things that often slip into entities by mistake
    "technicians", "personnel", "staff", "team", "crew", "members",
    "operations", "operation",  # only when very generic — collisions handled by mention_count
    "departments",
}
# Names consisting of one alphanumeric character, or of at most two letters, are
# useless as standalone entities. is_trivial() applies these with .match() to the
# normalized name.
TRIVIAL_PATTERNS = [
    re.compile(expr)
    for expr in (
        r"^[a-z0-9]$",    # a single letter or digit
        r"^[a-z]{1,2}$",  # tiny initials
    )
]
def normalize(s: str) -> str:
    """Return *s* with combining marks removed, lower-cased and edge-trimmed.

    The text is NFD-decomposed first so that an accented letter splits into a
    base character plus combining marks; the marks are then dropped. Interior
    whitespace and punctuation are left untouched.
    """
    decomposed = unicodedata.normalize("NFD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    folded = "".join(base_chars).lower()
    return folded.strip()
def read_md(path: Path) -> tuple[dict, str]:
    """Read a markdown file and split it into (front_matter, body).

    Returns:
        ({}, "")         when the file does not exist.
        ({}, full_text)  when the text does not start with "---" or has no
                         closing delimiter.
        ({}, body)       when the front matter is invalid YAML or is not a
                         mapping.
        (mapping, body)  otherwise; leading newlines are stripped from body.

    Other I/O or decoding errors from ``read_text`` propagate to the caller.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}, ""
    if not text.startswith("---"):
        return {}, text
    # The closing delimiter must begin a line. Searching for a bare "---"
    # would also match dashes inside a YAML value (e.g. "title: a---b") and
    # cut the front matter short. Starting at index 3 lets the newline that
    # ends the opening delimiter serve as the anchor for empty front matter.
    end = text.find("\n---", 3)
    if end < 0:
        return {}, text
    try:
        fm = yaml.safe_load(text[3:end].strip()) or {}
    except yaml.YAMLError:
        fm = {}
    # safe_load returns a str/list/number for scalar or sequence documents;
    # callers rely on .get(), so anything that is not a mapping counts as
    # "no front matter".
    if not isinstance(fm, dict):
        fm = {}
    body = text[end + 4:].lstrip("\n")
    return fm, body
def is_trivial(canonical_name: str, entity_id: str) -> bool:
    """Return True when *canonical_name* carries no signal as an entity.

    A name is trivial when, after normalize(), it is
      * empty,
      * an exact member of STOP_NOUNS,
      * matched by one of TRIVIAL_PATTERNS, or
      * two or more words that are all stop nouns (or "the").

    *entity_id* is accepted for call-site compatibility and is not used.
    """
    n = normalize(canonical_name)
    if not n:
        return True
    if n in STOP_NOUNS:
        # Covers single common nouns as well as listed phrases like "the bureau".
        return True
    if any(p.match(n) for p in TRIVIAL_PATTERNS):
        return True
    # All-stop-words sequence that is not listed verbatim, e.g. "the chief".
    words = n.split()
    return len(words) >= 2 and all(w in STOP_NOUNS or w == "the" for w in words)
def filter_low_signal(*, dry: bool) -> dict[str, list[Path]]:
    """A — Delete entities that are trivial AND have low total_mentions.

    Walks every ``<class>/<entity>.md`` file under ENTITIES and sorts it into:
      * "deleted"           — trivial name and total_mentions <= 2
      * "kept_high_mention" — trivial name but mentioned more than twice
      * "kept"              — non-trivial name

    Args:
        dry: when True, files are only classified; nothing is unlinked.

    Returns:
        The mapping of bucket name to the list of paths placed in it. Files
        whose front matter is missing or unusable appear in no bucket.
    """
    stats: dict[str, list[Path]] = {"deleted": [], "kept_high_mention": [], "kept": []}
    for p in ENTITIES.glob("*/*.md"):
        fm, _ = read_md(p)
        if not fm or not isinstance(fm, dict):
            continue
        # YAML parses an unquoted name such as 1947 as a number; normalize()
        # needs text, so coerce rather than crash.
        canonical = str(fm.get("canonical_name") or p.stem)
        if not is_trivial(canonical, p.stem):
            stats["kept"].append(p)
            continue
        try:
            total = int(fm.get("total_mentions") or 0)
        except (TypeError, ValueError):
            # Unreadable count: never delete on a guess, and do not abort a
            # run that may already have removed other files.
            continue
        if total <= 2:  # trivial + rarely mentioned = noise
            stats["deleted"].append(p)
            if not dry:
                p.unlink()
        else:
            stats["kept_high_mention"].append(p)
    return stats
def aliases_of(fm: dict) -> set[str]:
    """Return the normalized, non-empty names an entity is known by.

    Collects ``canonical_name`` plus every string entry of ``aliases`` from the
    front matter and runs each through normalize(). Non-string entries are
    ignored.
    """
    names: list[str] = []
    cname = fm.get("canonical_name")
    if isinstance(cname, str):
        names.append(cname)
    raw = fm.get("aliases") or []
    if isinstance(raw, str):
        # `aliases: FBI` (a scalar, not a list) must count as ONE alias.
        # Iterating the string would yield "F", "B", "I" and make unrelated
        # entities look like duplicates.
        raw = [raw]
    elif not isinstance(raw, (list, tuple, set, frozenset)):
        raw = []  # number / mapping / other junk: no usable aliases
    names.extend(a for a in raw if isinstance(a, str))
    return {n for n in (normalize(x) for x in names) if n}
def dedupe_by_alias(*, dry: bool) -> dict[str, int]:
    """B — Merge entities of the same class whose alias sets overlap.

    Strategy: for every normalized alias shared by two or more entity files in
    one class directory, keep the file with the highest total_mentions (ties
    keep glob order). Each loser's aliases and canonical name are added to the
    winner's ``aliases``, the mention totals are summed, and the loser file is
    deleted.

    Front matter is parsed once and merged in memory, so a dry run follows
    exactly the same decisions, and reports the same counts, as a real run.

    Limitation: alias groups are computed before any merge, so a chain
    (A~B via one alias, B~C via another) may need a second run to collapse
    fully.

    Args:
        dry: when True, nothing is written or deleted.

    Returns:
        {"merges": n, "deletes": n}; both count one per loser file.
    """

    def mention_count(fm: dict) -> int:
        # Front matter may hold junk; treat an unreadable count as 0.
        try:
            return int(fm.get("total_mentions") or 0)
        except (TypeError, ValueError):
            return 0

    def alias_strings(fm: dict) -> set[str]:
        # Raw (un-normalized) string aliases; a scalar string is one alias,
        # and non-string entries are dropped, as aliases_of() ignores them.
        raw = fm.get("aliases") or []
        if isinstance(raw, str):
            raw = [raw]
        elif not isinstance(raw, (list, tuple, set, frozenset)):
            raw = []
        return {a for a in raw if isinstance(a, str) and a}

    stats = {"merges": 0, "deletes": 0}

    # Pass 1: parse each entity once and index it as class → alias → [paths].
    front: dict[Path, dict] = {}
    overlap: dict[str, dict[str, list[Path]]] = defaultdict(lambda: defaultdict(list))
    for p in ENTITIES.glob("*/*.md"):
        fm, _ = read_md(p)
        if not fm or not isinstance(fm, dict):
            continue
        front[p] = fm
        for a in aliases_of(fm):
            overlap[p.parent.name][a].append(p)

    # Pass 2: collapse every alias group that still has two or more live files.
    merged_away: set[Path] = set()
    for alias_map in overlap.values():
        for paths in alias_map.values():
            live = [p for p in paths if p not in merged_away]
            if len(live) < 2:
                continue
            # Pick canonical winner: highest total_mentions (stable sort).
            live.sort(key=lambda p: mention_count(front[p]), reverse=True)
            winner_path, loser_paths = live[0], live[1:]
            winner_fm = front[winner_path]

            names = alias_strings(winner_fm)
            total = mention_count(winner_fm)
            for loser_path in loser_paths:
                loser_fm = front[loser_path]
                names |= alias_strings(loser_fm)
                names.add(str(loser_fm.get("canonical_name") or loser_path.stem))
                total += mention_count(loser_fm)
            winner_fm["aliases"] = sorted(n for n in names if n)
            winner_fm["total_mentions"] = total

            if not dry:
                # Use the shared parser for the body so it is split the same
                # way the front matter was.
                _, body = read_md(winner_path)
                new_yaml = yaml.dump(winner_fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
                # Write the winner before removing losers, so an interruption
                # never loses the merged data.
                winner_path.write_text(f"---\n{new_yaml}---\n\n{body}", encoding="utf-8")
                for loser_path in loser_paths:
                    loser_path.unlink(missing_ok=True)

            merged_away.update(loser_paths)
            stats["merges"] += len(loser_paths)
            stats["deletes"] += len(loser_paths)
    return stats
def find_compound_candidates() -> list[tuple[str, str, str, int]]:
    """C — find entity pairs (A, B) listed adjacent on ≥3 pages → likely compound name.

    For every page file under PAGES, the ``entities_extracted`` front-matter
    field is flattened into one list (people, then organizations, then
    locations) of normalized names, and each pair of neighbouring entries is
    recorded. A pair is counted at most once per page, so the count is the
    number of pages on which it occurs. Adjacency is a heuristic: it relies on
    the extractor emitting names in occurrence order, and it also pairs the
    last entry of one class with the first entry of the next.

    Returns:
        [(class_a, name_a, name_b, page_count)] for pairs seen on at least
        3 pages, highest count first. class_a is the class key of name_a.
    """
    pair_pages: Counter = Counter()
    for p in PAGES.glob("*/p*.md"):
        try:
            fm, _ = read_md(p)
        except (OSError, ValueError):
            # Unreadable or undecodable page (UnicodeDecodeError is a ValueError): skip it.
            continue
        if not fm or not isinstance(fm, dict):
            continue
        ee = fm.get("entities_extracted") or {}
        if not isinstance(ee, dict):
            continue

        # Flatten all classes into one ordered list of (class, normalized name).
        names: list[tuple[str, str]] = []
        for cls_key in ("people", "organizations", "locations"):
            entries = ee.get(cls_key) or []
            if not isinstance(entries, (list, tuple)):
                continue
            for entry in entries:
                if not isinstance(entry, dict) or not isinstance(entry.get("name"), str):
                    continue
                name = normalize(entry["name"])
                if name:
                    names.append((cls_key, name))

        # Pair up adjacent entries. dict.fromkeys de-duplicates within the page
        # while keeping first-seen order, so tie order in the result stays
        # deterministic.
        page_pairs = dict.fromkeys(
            (a_cls, a, b)
            for (a_cls, a), (_b_cls, b) in zip(names, names[1:])
            if a != b
        )
        pair_pages.update(page_pairs.keys())

    # Filter to pairs that appear together on ≥ 3 pages.
    out = [(cls_a, a, b, c) for (cls_a, a, b), c in pair_pages.items() if c >= 3]
    return sorted(out, key=lambda x: -x[3])
def main():
    """Command-line entry point.

    At least one of --dry-run, --apply or --report-compounds is required.
    --report-compounds prints up to 50 compound-name candidates and exits
    without touching any file. Otherwise phase 1A (trivial-entity filter) and
    phase 1B (alias merge) run in that order. If --dry-run and --apply are
    both given, --dry-run wins and nothing is written.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true", help="report only, don't delete")
    ap.add_argument("--apply", action="store_true", help="apply deletes + merges")
    ap.add_argument("--report-compounds", action="store_true", help="just print compound candidates and exit")
    args = ap.parse_args()
    if not args.dry_run and not args.apply and not args.report_compounds:
        ap.error("provide --dry-run, --apply, or --report-compounds")

    if args.report_compounds:
        cands = find_compound_candidates()
        print("Top compound candidates (adjacent ≥3 pages):")
        for cls_a, a, b, c in cands[:50]:
            # Show the pair followed by the compound name it suggests.
            print(f" {c:4d}× [{cls_a}] {a} + {b} → '{a} {b}'")
        print(f"\nTotal: {len(cands)} candidates")
        return

    dry = args.dry_run
    mode = "DRY-RUN" if dry else "APPLY"

    print(f"=== Phase 1A: filter trivial low-mention entities ({mode}) ===")
    a = filter_low_signal(dry=dry)
    print(f" deleted (trivial + ≤2 mentions): {len(a['deleted'])}")
    print(f" kept (trivial but ≥3 mentions): {len(a['kept_high_mention'])}")
    print(f" kept (meaningful): {len(a['kept'])}")

    print(f"\n=== Phase 1B: alias-based merge ({mode}) ===")
    b = dedupe_by_alias(dry=dry)
    print(f" pairs merged: {b['merges']}")

    # Counts files currently on disk; in a dry run nothing has been removed.
    total_remaining = sum(1 for _ in ENTITIES.glob("*/*.md"))
    print(f"\nRemaining entity files: {total_remaining}")


if __name__ == "__main__":
    main()