283 lines
10 KiB
Python
283 lines
10 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
23-smart-dedup.py — Phase 1 + Phase 3: aggressive entity cleanup.
|
|||
|
|
|
|||
|
|
Removes garbage entities that Haiku/extraction over-promoted:
|
|||
|
|
A. Stop-list filter — single-mention bare common nouns
|
|||
|
|
B. Substring/alias dedup — "FBI" vs "F-B-I" vs "Federal Bureau of Investigation"
|
|||
|
|
C. Compound-name detection — entities A+B that co-occur ≥3 pages → suggest merge
|
|||
|
|
D. Title-prefix recovery — "Chief Tereoken" appearing in raw page text but
|
|||
|
|
dedup created only "tereoken" + "chief"
|
|||
|
|
|
|||
|
|
Runs in two modes:
|
|||
|
|
--dry-run → report what would be deleted/merged, no writes
|
|||
|
|
  --apply    → applies deletes and merges to the entity files (orphan removal and
               page.md name substitution are not implemented in this script)
|
|||
|
|
|
|||
|
|
Pass --report-compounds to only list the (C) candidates; compound merges are never
applied automatically since they can be aggressive.
|
|||
|
|
|
|||
|
|
Usage:
|
|||
|
|
./23-smart-dedup.py --dry-run
|
|||
|
|
./23-smart-dedup.py --apply
|
|||
|
|
"""
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import re
|
|||
|
|
import sys
|
|||
|
|
import unicodedata
|
|||
|
|
from collections import Counter, defaultdict
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
import yaml
|
|||
|
|
except ImportError:
|
|||
|
|
sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Filesystem layout of the wiki tree this script operates on.
UFO_ROOT = Path("/Users/guto/ufo")
ENTITIES = UFO_ROOT.joinpath("wiki", "entities")  # globbed as <class>/<entity>.md
PAGES = UFO_ROOT.joinpath("wiki", "pages")  # globbed as <dir>/p*.md
LOG = UFO_ROOT.joinpath("wiki", "log.md")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Stop list of bare common nouns that carry no value as standalone entities.
# Membership alone does not delete anything: filter_low_signal() only removes a
# stop-listed entity when it is also rarely mentioned (at most 2 mentions), so a
# generic word that is used many times is kept as a real referent.
STOP_NOUNS = {
    # --- roles / positions ---
    "agent", "agents",
    "officer", "officers",
    "chief", "captain", "general", "major", "colonel", "sergeant",
    "commander", "director", "secretary", "lieutenant",
    "lcdr", "cdr", "lt", "lt.", "cpl", "sgt",
    "supervisor", "inspector",
    # --- generic structures ---
    "base", "office", "department", "bureau", "agency", "division",
    "section", "headquarters", "command", "post", "station", "branch",
    "unit", "group",
    "the",
    "the bureau", "the agency", "the department", "the office", "the file",
    # --- file / document terms ---
    "file", "files", "memo", "memorandum", "letter", "report", "form",
    "page", "pages", "subject", "date", "time", "case", "exhibit",
    "attachment", "enclosure", "signature", "stamp", "carbon copy", "cc",
    "ref", "reference", "annex", "envelope", "bag", "folder",
    "transmittal", "routing", "dispatch",
    # --- generic descriptors ---
    "the inspector", "the agent", "the officer", "the witness",
    "the observer", "the subject", "the man", "the woman", "the pilot",
    "the operator",
    # --- collective words that often slip into entities by mistake ---
    "technicians", "personnel", "staff", "team", "crew", "members",
    # Very generic on their own; real operation names survive via mention count.
    "operations", "operation",
    "departments",
}
|
|||
|
|
|
|||
|
|
# Names this short are useless as entities on their own: a lone letter or
# digit, or a one/two-letter run of lowercase initials.
TRIVIAL_PATTERNS = [
    re.compile(expr)
    for expr in (
        r"^[a-z0-9]$",
        r"^[a-z]{1,2}$",
    )
]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def normalize(s: str) -> str:
    """Return *s* folded for comparison: accents removed, lower-cased, trimmed.

    The text is decomposed to NFD so that accents become separate combining
    marks, which are then dropped before lower-casing and stripping.
    """
    decomposed = unicodedata.normalize("NFD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    folded = "".join(base_chars)
    return folded.lower().strip()
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Closing front-matter fence: a line consisting solely of "---" (trailing
# blanks / CR tolerated). Anchoring to a whole line keeps a "---" that merely
# occurs inside a YAML value from being mistaken for the end of the header.
_FRONT_MATTER_END = re.compile(r"^---[ \t]*\r?$", re.MULTILINE)


def read_md(path: Path) -> tuple[dict, str]:
    """Split a markdown file into ``(front_matter, body)``.

    Args:
        path: markdown file to read (decoded as UTF-8).

    Returns:
        A ``(dict, str)`` pair:

        * ``({}, "")`` when the file does not exist;
        * ``({}, <whole text>)`` when the text does not start with ``---`` or
          has no closing ``---`` line;
        * otherwise the parsed YAML mapping and the text after the closing
          fence with leading newlines removed. The mapping is ``{}`` when the
          YAML is invalid or is not a mapping, so callers can always use
          ``.get`` on it.

    Raises:
        OSError / UnicodeDecodeError: for read failures other than a missing
        file; these are deliberately not swallowed here.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}, ""
    if not text.startswith("---"):
        return {}, text

    # Search from just past the opening fence; "^" still only matches at real
    # line starts, so the opening "---" itself can never be the match.
    closing = _FRONT_MATTER_END.search(text, 3)
    if closing is None:
        return {}, text

    try:
        loaded = yaml.safe_load(text[3:closing.start()])
    except (yaml.YAMLError, ValueError):
        # ValueError: PyYAML builds date/datetime objects for timestamp-like
        # scalars and lets the constructor's error escape on impossible dates.
        loaded = None
    fm = loaded if isinstance(loaded, dict) else {}
    body = text[closing.end():].lstrip("\n")
    return fm, body
|
|||
|
|
|
|||
|
|
|
|||
|
|
def is_trivial(canonical_name: str, entity_id: str) -> bool:
    """Tell whether an entity name is too generic to stand on its own.

    A name is trivial when, after ``normalize``:

    * it is empty;
    * it is listed in ``STOP_NOUNS``;
    * it matches one of ``TRIVIAL_PATTERNS``; or
    * it has two or more words and every word is a stop noun or "the"
      (e.g. "the bureau", "the office").

    Args:
        canonical_name: display name of the entity.
        entity_id: not used by the check; kept so existing callers that pass
            the file stem keep working.

    Returns:
        True when the name is trivial, False otherwise.
    """
    n = normalize(canonical_name)
    # The stop-list lookup also covers the single-common-noun case, so no
    # separate one-word check is needed afterwards.
    if not n or n in STOP_NOUNS:
        return True
    if any(p.match(n) for p in TRIVIAL_PATTERNS):
        return True
    words = n.split()
    return len(words) >= 2 and all(w in STOP_NOUNS or w == "the" for w in words)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def filter_low_signal(*, dry: bool) -> dict[str, list[Path]]:
    """A — Delete entities that are trivial AND rarely mentioned.

    Every ``<class>/<entity>.md`` file under ``ENTITIES`` with readable front
    matter is put into exactly one bucket:

    * ``"deleted"``: name is trivial (see ``is_trivial``) and
      ``total_mentions`` is at most 2;
    * ``"kept_high_mention"``: name is trivial but mentioned 3+ times;
    * ``"kept"``: name is not trivial.

    Args:
        dry: when True nothing is removed; the buckets only describe what
            would happen.

    Returns:
        Mapping of bucket name to the list of entity file paths in it.
    """
    stats: dict[str, list[Path]] = {"deleted": [], "kept_high_mention": [], "kept": []}
    # Snapshot (and order) the listing up front: files get unlinked while we
    # walk it, and a fixed order keeps the report reproducible.
    for p in sorted(ENTITIES.glob("*/*.md")):
        fm, _ = read_md(p)
        if not isinstance(fm, dict) or not fm:
            continue
        # YAML may parse a bare name as a number/date/bool; like aliases_of(),
        # only trust string names and otherwise fall back to the file stem.
        name = fm.get("canonical_name")
        canonical = name if isinstance(name, str) and name else p.stem
        total = int(fm.get("total_mentions") or 0)

        if not is_trivial(canonical, p.stem):
            stats["kept"].append(p)
        elif total > 2:
            stats["kept_high_mention"].append(p)
        else:
            # trivial + rarely mentioned = noise
            stats["deleted"].append(p)
            if not dry:
                p.unlink()
    return stats
|
|||
|
|
|
|||
|
|
|
|||
|
|
def aliases_of(fm: dict) -> set[str]:
    """Collect the normalized names an entity is known by.

    Args:
        fm: entity front matter; ``canonical_name`` and ``aliases`` are read.

    Returns:
        The set of non-empty ``normalize``-d strings taken from
        ``canonical_name`` and ``aliases``. Non-string values are ignored.
    """
    candidates = [fm.get("canonical_name")]
    raw = fm.get("aliases")
    if isinstance(raw, str):
        # A scalar ``aliases: FBI`` is ONE alias; iterating it would yield
        # single letters and make unrelated entities look like duplicates.
        candidates.append(raw)
    elif isinstance(raw, (list, tuple, set)):
        candidates.extend(raw)
    # Any other shape (None, number, mapping) contributes nothing.

    names = {normalize(c) for c in candidates if isinstance(c, str)}
    names.discard("")
    return names
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _string_aliases(fm: dict) -> list[str]:
    """Return the ``aliases`` field of *fm* as a list of non-empty strings."""
    raw = fm.get("aliases") or []
    if isinstance(raw, str):
        raw = [raw]  # scalar alias, not a sequence of characters
    elif not isinstance(raw, (list, tuple, set)):
        return []
    # Stringify stray non-string scalars so the merged list stays sortable.
    return [a if isinstance(a, str) else str(a) for a in raw if a]


def dedupe_by_alias(*, dry: bool) -> dict[str, int]:
    """B — Merge entities of the same class whose alias sets overlap.

    For every normalized alias claimed by two or more entity files in one
    class directory, the entity with the highest ``total_mentions`` wins. Each
    loser's aliases and canonical name are added to the winner's ``aliases``,
    its ``total_mentions`` is added to the winner's, the winner file is
    rewritten and the loser file is deleted.

    Front matter is read once and tracked in memory together with the set of
    already-merged files, so a pair sharing several aliases is merged (and
    counted) once, and ``dry=True`` reports the same numbers an apply run
    would produce.

    Entities that only overlapped with a file merged away earlier in the run
    are not re-examined in the same run.

    Args:
        dry: when True no file is written or deleted.

    Returns:
        ``{"merges": n, "deletes": n}``; both are incremented once per loser.
    """
    stats = {"merges": 0, "deletes": 0}

    # path -> (front matter, body) for every entity with usable front matter.
    entities: dict[Path, tuple[dict, str]] = {}
    # class dir name -> normalized alias -> entity files claiming that alias.
    overlap: dict[str, dict[str, list[Path]]] = defaultdict(lambda: defaultdict(list))
    # Sorted listing and sorted aliases keep the merge order reproducible.
    for p in sorted(ENTITIES.glob("*/*.md")):
        fm, body = read_md(p)
        if not isinstance(fm, dict) or not fm:
            continue
        entities[p] = (fm, body)
        for alias in sorted(aliases_of(fm)):
            overlap[p.parent.name][alias].append(p)

    merged_away: set[Path] = set()
    for alias_map in overlap.values():
        for paths in alias_map.values():
            live = [pp for pp in paths if pp not in merged_away]
            if len(live) < 2:
                continue
            # Highest total_mentions first; the sort is stable, so ties keep
            # path order.
            live.sort(
                key=lambda pp: int(entities[pp][0].get("total_mentions") or 0),
                reverse=True,
            )
            winner_path = live[0]
            winner_fm, winner_body = entities[winner_path]

            for loser_path in live[1:]:
                loser_fm, _ = entities[loser_path]
                loser_name = loser_fm.get("canonical_name") or loser_path.stem
                merged_names = (
                    set(_string_aliases(winner_fm))
                    | set(_string_aliases(loser_fm))
                    | {str(loser_name)}
                )
                winner_fm["aliases"] = sorted(merged_names)
                # Coerce both sides like the ranking does, so a count stored
                # as a string cannot break the sum.
                winner_fm["total_mentions"] = (
                    int(winner_fm.get("total_mentions") or 0)
                    + int(loser_fm.get("total_mentions") or 0)
                )
                if not dry:
                    new_yaml = yaml.dump(
                        winner_fm,
                        allow_unicode=True,
                        sort_keys=False,
                        default_flow_style=False,
                    )
                    # Persist the winner before removing the loser so an
                    # interruption never loses the loser's data.
                    winner_path.write_text(
                        f"---\n{new_yaml}---\n\n{winner_body}", encoding="utf-8"
                    )
                    try:
                        loser_path.unlink()
                    except FileNotFoundError:
                        pass  # already gone; the merge itself is complete
                merged_away.add(loser_path)
                stats["merges"] += 1
                stats["deletes"] += 1
    return stats
|
|||
|
|
|
|||
|
|
|
|||
|
|
def find_compound_candidates() -> list[tuple[str, str, str, int]]:
    """C — Find entity pairs (A, B) listed adjacently on ≥3 pages.

    For each ``<dir>/p*.md`` file under ``PAGES`` the ``entities_extracted``
    front-matter field is read. The names under ``people``, ``organizations``
    and ``locations`` are concatenated in that order, and every pair of
    neighbouring, different names is a candidate compound name. Adjacency in
    that list is a heuristic stand-in for adjacency in the page text (the
    extractor usually returns names in occurrence order).

    Each pair is counted at most once per page, so the threshold really means
    "on at least 3 pages".

    Returns:
        ``[(class_of_a, name_a, name_b, page_count)]`` for pairs seen on 3 or
        more pages, highest count first; ties are ordered by class and names.
    """
    pair_count: Counter = Counter()
    for p in sorted(PAGES.glob("*/p*.md")):
        try:
            fm, _ = read_md(p)
        except (OSError, ValueError):
            # Unreadable or undecodable page (UnicodeDecodeError is a
            # ValueError): skip it rather than abort the whole report.
            continue
        if not isinstance(fm, dict) or not fm:
            continue
        ee = fm.get("entities_extracted") or {}
        if not isinstance(ee, dict):
            continue

        # (class, normalized name) in listing order across the three classes.
        names = []
        for cls_key in ("people", "organizations", "locations"):
            entries = ee.get(cls_key) or []
            if not isinstance(entries, (list, tuple)):
                continue
            for entry in entries:
                if not isinstance(entry, dict):
                    continue
                raw_name = entry.get("name")
                if not isinstance(raw_name, str):
                    continue
                name = normalize(raw_name)
                if name:
                    names.append((cls_key, name))

        # A set makes repeated adjacencies on one page count only once.
        page_pairs = {
            (a_cls, a, b)
            for (a_cls, a), (_b_cls, b) in zip(names, names[1:])
            if a != b
        }
        pair_count.update(page_pairs)

    out = [(cls_a, a, b, c) for (cls_a, a, b), c in pair_count.items() if c >= 3]
    out.sort(key=lambda row: (-row[3], row[0], row[1], row[2]))
    return out
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Command-line entry point.

    At least one of ``--dry-run``, ``--apply`` or ``--report-compounds`` must
    be given. ``--report-compounds`` prints up to 50 compound-name candidates
    and returns. Otherwise phase 1A (trivial-entity filter) and phase 1B
    (alias merge) run, in dry-run mode whenever ``--dry-run`` is present, and
    the number of entity files left on disk is printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true", help="report only, don't delete")
    parser.add_argument("--apply", action="store_true", help="apply deletes + merges")
    parser.add_argument(
        "--report-compounds",
        action="store_true",
        help="just print compound candidates and exit",
    )
    opts = parser.parse_args()

    if not (opts.dry_run or opts.apply or opts.report_compounds):
        parser.error("provide --dry-run, --apply, or --report-compounds")

    if opts.report_compounds:
        candidates = find_compound_candidates()
        print(f"Top compound candidates (adjacent ≥3 pages):")
        for cls_a, first, second, count in candidates[:50]:
            print(f" {count:4d}× [{cls_a}] {first} + {second} → '{first} {second}'")
        print(f"\nTotal: {len(candidates)} candidates")
        return

    # --dry-run takes precedence when both mode flags are supplied.
    dry = opts.dry_run
    mode = "DRY-RUN" if dry else "APPLY"

    print(f"=== Phase 1A: filter trivial low-mention entities ({mode}) ===")
    filtered = filter_low_signal(dry=dry)
    print(f" deleted (trivial + ≤2 mentions): {len(filtered['deleted'])}")
    print(f" kept (trivial but ≥3 mentions): {len(filtered['kept_high_mention'])}")
    print(f" kept (meaningful): {len(filtered['kept'])}")

    print(f"\n=== Phase 1B: alias-based merge ({mode}) ===")
    merged = dedupe_by_alias(dry=dry)
    print(f" pairs merged: {merged['merges']}")

    total_remaining = len(list(ENTITIES.glob("*/*.md")))
    print(f"\nRemaining entity files: {total_remaining}")


if __name__ == "__main__":
    main()
|