Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
313 lines
12 KiB
Python
313 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Aggressive entity deduplication — layer 1 (deterministic).

There are currently ~34,355 entities; many are typographic variants, carry
role prefixes (Mr./Dr./Major), or are OBJ-* ids generated per chunk instead
of per event. This script merges them using high-confidence rules only:

  A. PROPER_NAME — a person with >=2 tokens whose main proper name
     (last significant token + first name) is unique after stripping
     role prefixes. E.g. "Frank M. Brown", "Lt. Frank M. Brown",
     "Special Agent Frank M. Brown" -> 1 canonical entity.

  B. UAP_OBJECT_BY_EVENT — all OBJ-EV<year>-<EVENT>-NN ids of the same
     event are collapsed into a single entity. (The original note names
     OBJ-EV<year>-<EVENT>-00 as the canonical; the code keeps the id of
     whichever cluster member is selected as canonical and renumbers
     nothing.)

  C. EXACT_NORMALIZED — after lowercasing, stripping punctuation, stripping
     a leading stopword and stripping a trailing suffix such as " UAP" /
     " incident", identical strings become 1 entity.

In the code, A and C are a single normalization pass (printed as
"Pass B/C — proper-name normalize") and B is printed as
"Pass A — OBJ by event".

For each cluster:
  - Choose the canonical: curated summary_status first, then the entry with
    the most aliases, then the most total_mentions, then a name-based
    tie-break.
  - Union aliases[], mentioned_in[], text_mentioned_in[], referenced_by[]
    and related[].
  - Recompute signal_sources: page_refs / text_refs / cross_refs become the
    sizes of the merged lists (db_chunks keeps the canonical's value because
    it depends on entity_pk).
  - Move the duplicates to wiki/entities/_archived/.

Output: the list of merges (cluster -> canonical), to review before applying.

Run:
    python3 scripts/maintain/49_dedup_aggressive.py --dry-run
    python3 scripts/maintain/49_dedup_aggressive.py            # apply
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import re
|
|
import shutil
|
|
import sys
|
|
import unicodedata
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from typing import Iterable
|
|
|
|
import yaml
|
|
|
|
# Root of the entity wiki tree, and the sub-folder that receives merged-away
# duplicates. NOTE(review): the root is an absolute, machine-specific path.
WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
ARCHIVED = WIKI_ENT.joinpath("_archived")

# Leading honorific / rank / job title, optionally followed by ".", then
# whitespace. The groups are joined with "|" into one alternation.
_ROLE_ALTERNATIVES = (
    r"mr|mrs|ms|dr|prof|sr|sra|sir|dame|lord|lady",
    r"major|maj|colonel|col|lt|lieutenant|captain|capt",
    r"general|gen|sergeant|sgt|corporal|cpl|private|pvt|admiral|adm|commander|cmdr",
    r"agent|special agent|sa|director|deputy director|deputy",
    r"reverend|rev|professor",
    r"president|vice president|vp|chairman|secretary",
    r"detective|det|inspector",
)
ROLE_PREFIX_RE = re.compile(
    r"^(" + "|".join(_ROLE_ALTERNATIVES) + r")\.?\s+",
    flags=re.IGNORECASE,
)

# One leading article / preposition (English, Portuguese, Spanish, French).
STOPWORD_PREFIX_RE = re.compile(
    r"^(the|a|an|o|os|a|as|de|do|da|dos|das|of|los|las|el|la|le|les)\s+",
    flags=re.IGNORECASE,
)

# Punctuation characters that get replaced by a space during normalization.
PUNCT_RE = re.compile(r"[.,;:!?\"'\(\)\[\]_\-]")

# Any run of whitespace.
WS_RE = re.compile(r"\s+")

# One trailing generic word ("... incident", "... UAP") preceded by whitespace.
NOISE_SUFFIX_RE = re.compile(
    r"\s+(uap|incident|case|sighting|event|observation)$",
    flags=re.IGNORECASE,
)

# OBJ-<prefix>-<slug>-<two digits>; groups are (prefix, slug, NN).
OBJ_ID_RE = re.compile(r"^OBJ-([A-Z0-9]+)-(.+?)-(\d{2})$")
|
|
|
|
|
|
def ascii_fold(s: str) -> str:
    """Return *s* with combining marks removed.

    The string is NFD-decomposed first, so an accented letter splits into a
    base character plus combining marks; only the non-combining characters
    are kept.
    """
    decomposed = unicodedata.normalize("NFD", s)
    kept = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(kept)
|
|
|
|
|
|
def aggressive_normalize(name: str) -> str:
    """Reduce an entity name to the comparison key used for clustering.

    Steps, in order: accent-fold, trim and lowercase; strip up to three
    leading role prefixes; strip one leading stopword; turn punctuation into
    spaces and collapse whitespace; strip one trailing noise word.
    """
    text = ascii_fold(name).strip().lower()

    # Titles can stack ("Special Agent Major Brown"), so peel them one at a
    # time, at most three times, stopping as soon as nothing changes.
    passes_left = 3
    while passes_left:
        peeled = ROLE_PREFIX_RE.sub("", text)
        if peeled == text:
            break
        text = peeled
        passes_left -= 1

    text = STOPWORD_PREFIX_RE.sub("", text)
    text = WS_RE.sub(" ", PUNCT_RE.sub(" ", text)).strip()
    return NOISE_SUFFIX_RE.sub("", text).strip()
|
|
|
|
|
|
# Maps an entity folder name to an entity class name.
# NOTE(review): nothing in the visible part of this file reads this table.
FOLDER_TO_CLASS = dict(
    [
        ("people", "person"),
        ("organizations", "organization"),
        ("locations", "location"),
        ("events", "event"),
        ("uap-objects", "uap_object"),
        ("vehicles", "vehicle"),
        ("operations", "operation"),
        ("concepts", "concept"),
    ]
)
|
|
|
|
|
|
def load_entity(path: Path) -> dict | None:
|
|
try:
|
|
text = path.read_text(encoding="utf-8")
|
|
if not text.startswith("---"):
|
|
return None
|
|
parts = text.split("---", 2)
|
|
if len(parts) < 3: return None
|
|
fm = yaml.safe_load(parts[1]) or {}
|
|
body = parts[2]
|
|
return {"path": path, "fm": fm, "body": body, "raw": text}
|
|
except Exception as e:
|
|
return None
|
|
|
|
|
|
def dump_entity(entity: dict) -> str:
    """Serialize an entity back to markdown text.

    The front matter is dumped as YAML (key order kept, unicode allowed,
    width 1000) between ``---`` markers, followed by the body verbatim.
    """
    front_matter = yaml.safe_dump(
        entity["fm"],
        sort_keys=False,
        allow_unicode=True,
        width=1000,
    )
    return "".join(("---\n", front_matter, "---", entity["body"]))
|
|
|
|
|
|
def dedup_pass_obj_by_event(entities: list[dict]) -> dict[str, list[dict]]:
    """Group ``uap_object`` entities whose ids differ only in the trailing NN.

    The id is read from ``uap_object_id``, falling back to ``entity_id``.
    Ids that do not match ``OBJ_ID_RE`` are ignored. The cluster key is the
    id without its final ``-NN``; only clusters with more than one member
    are returned.
    """
    grouped: dict[str, list[dict]] = defaultdict(list)
    uap_objects = (
        ent for ent in entities
        if ent["fm"].get("entity_class") == "uap_object"
    )
    for ent in uap_objects:
        front = ent["fm"]
        object_id = front.get("uap_object_id") or front.get("entity_id") or ""
        parsed = OBJ_ID_RE.match(object_id)
        if parsed is None:
            continue
        prefix, event_slug = parsed.group(1), parsed.group(2)
        grouped[f"OBJ-{prefix}-{event_slug}"].append(ent)
    return {key: members for key, members in grouped.items() if len(members) > 1}
|
|
|
|
|
|
def dedup_pass_proper_name(entities: list[dict]) -> dict[str, list[dict]]:
    """Cluster named entities whose ``canonical_name`` normalizes identically.

    Applies to the classes person, organization, event, location, operation,
    concept and vehicle. Entities are only compared within the same class:
    the cluster key is ``"<class>::<normalized name>"``.

    A normalized name is eligible when it has at least two tokens, or is a
    single token of at least 8 characters; shorter single tokens ("smith",
    "brown") are skipped as too collision-prone.

    Returns:
        Only the clusters that contain more than one entity.
    """
    eligible_classes = (
        "person", "organization", "event", "location",
        "operation", "concept", "vehicle",
    )
    clusters: dict[str, list[dict]] = defaultdict(list)
    for e in entities:
        cls = e["fm"].get("entity_class")
        if cls not in eligible_classes:
            continue
        # YAML can yield a non-string scalar (e.g. an unquoted year); coerce
        # it, as choose_canonical does, so normalization cannot raise on it.
        name = str(e["fm"].get("canonical_name") or "")
        if not name:
            continue
        norm = aggressive_normalize(name)
        if not norm:
            continue
        # Require >=2 tokens OR >=8 chars to avoid "smith" / "brown" collisions.
        if len(norm.split()) < 2 and len(norm) < 8:
            continue
        clusters[f"{cls}::{norm}"].append(e)
    return {k: v for k, v in clusters.items() if len(v) > 1}
|
|
|
|
|
|
def choose_canonical(cluster: list[dict]) -> dict:
    """Pick the entity that survives a merge.

    Preference order:
      1. ``summary_status == "curated"``;
      2. more entries in ``aliases``;
      3. higher ``total_mentions`` (non-numeric values count as 0);
      4. alphabetically first ``canonical_name`` (case-insensitive, then
         exact), and finally the smaller path string.

    The last step compares whole names rather than just the first character,
    so the choice does not depend on the order of ``cluster``.

    Raises:
        ValueError: if ``cluster`` is empty.
    """
    def rank(entity: dict) -> tuple:
        fm = entity["fm"]
        is_curated = fm.get("summary_status") == "curated"
        alias_count = len(fm.get("aliases") or [])
        try:
            mentions = int(fm.get("total_mentions") or 0)
        except (TypeError, ValueError):
            mentions = 0
        name = str(fm.get("canonical_name") or "")
        # Smaller tuple wins, so "bigger is better" fields are negated.
        return (
            not is_curated,
            -alias_count,
            -mentions,
            name.casefold(),
            name,
            str(entity.get("path", "")),
        )

    return min(cluster, key=rank)
|
|
|
|
|
|
def merge_into(canonical: dict, duplicates: list[dict]) -> None:
    """Fold the front matter of ``duplicates`` into ``canonical`` in place.

    Effects on ``canonical["fm"]``:
      * ``aliases``, ``mentioned_in``, ``text_mentioned_in``,
        ``referenced_by`` and ``related`` become the sorted union of the
        canonical's and every duplicate's values. ``aliases`` also receives
        the canonical's own name and each duplicate's ``canonical_name``;
        empty and ``None`` aliases are dropped.
      * ``documents_count`` is the number of distinct first path segments
        (leading ``[`` stripped) among ``mentioned_in``.
      * ``signal_sources``: ``page_refs``, ``text_refs`` and ``cross_refs``
        are set to the sizes of the merged lists; ``db_chunks`` keeps the
        canonical's value (a missing or ``None`` value counts as 0).
      * ``total_mentions`` is the sum of those four counters and
        ``signal_strength`` is re-derived from them as "orphan", "strong"
        or "weak".

    Args:
        canonical: entity dict whose ``"fm"`` mapping is updated.
        duplicates: entity dicts whose ``"fm"`` mappings are read only.
    """
    cfm = canonical["fm"]
    list_fields = (
        "aliases", "mentioned_in", "text_mentioned_in", "referenced_by", "related",
    )

    merged = {field: set(cfm.get(field) or []) for field in list_fields}
    merged["aliases"].add(cfm.get("canonical_name", ""))

    for dup in duplicates:
        dfm = dup["fm"]
        dup_name = dfm.get("canonical_name")
        if dup_name:
            merged["aliases"].add(dup_name)
        for field in list_fields:
            merged[field].update(dfm.get(field) or [])

    merged["aliases"] -= {"", None}

    # Assign in a fixed order so keys that were missing are appended to the
    # front matter in the same sequence every time.
    for field in list_fields:
        cfm[field] = sorted(merged[field])

    mentions = merged["mentioned_in"]
    cfm["documents_count"] = len({m.split("/")[0].lstrip("[") for m in mentions})

    # page/text/cross refs are recounted from the merged lists; db_chunks is
    # left as the canonical's own value.
    sigs = cfm.get("signal_sources") or {"db_chunks": 0, "cross_refs": 0}
    sigs["page_refs"] = len(mentions)
    sigs["text_refs"] = len(merged["text_mentioned_in"])
    sigs["cross_refs"] = len(merged["referenced_by"])
    sigs["db_chunks"] = int(sigs.get("db_chunks") or 0)  # tolerate None
    cfm["signal_sources"] = sigs

    total = (
        sigs["db_chunks"] + sigs["page_refs"] + sigs["cross_refs"] + sigs["text_refs"]
    )
    cfm["total_mentions"] = total

    if total == 0:
        strength = "orphan"
    elif (
        sigs["db_chunks"] >= 3
        or sigs["page_refs"] >= 3
        or (sigs["db_chunks"] >= 1 and sigs["page_refs"] >= 1)
        or sigs["text_refs"] >= 5
    ):
        strength = "strong"
    else:
        strength = "weak"
    cfm["signal_strength"] = strength
|
|
|
|
|
|
def archive_path(p: Path) -> Path:
    """Return where *p* lands under ``ARCHIVED``, keeping its path relative to ``WIKI_ENT``.

    Raises ``ValueError`` (from ``Path.relative_to``) if *p* is not inside
    ``WIKI_ENT``.
    """
    return ARCHIVED.joinpath(p.relative_to(WIKI_ENT))
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: build the merge plan, print it, and optionally apply it.

    Flags:
        --dry-run     print the plan and exit without writing anything.
        --limit-pass  which clustering pass to run: "obj", "name" or "all"
                      (default).

    Applying a plan rewrites each canonical file with the merged front
    matter and moves every duplicate to its location under ``ARCHIVED``.

    Returns:
        0 on every path that completes.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--limit-pass", choices=["all", "obj", "name"], default="all")
    args = ap.parse_args()

    print(f"Loading entities from {WIKI_ENT} ...")
    all_entities: list[dict] = []
    for f in WIKI_ENT.rglob("*.md"):
        if "_archived" in f.parts:
            continue
        ent = load_entity(f)
        # Guard against front matter that parsed to something other than a mapping.
        if ent and isinstance(ent["fm"], dict) and ent["fm"].get("type") == "entity":
            all_entities.append(ent)
    print(f" loaded {len(all_entities)} entities")

    # Run dedup passes
    clusters: dict[str, list[dict]] = {}
    if args.limit_pass in ("all", "obj"):
        obj_clusters = dedup_pass_obj_by_event(all_entities)
        print(f"\nPass A — OBJ by event: {len(obj_clusters)} clusters ({sum(len(v) for v in obj_clusters.values())} entities → {len(obj_clusters)} canonicals)")
        clusters.update({f"OBJ::{k}": v for k, v in obj_clusters.items()})
    if args.limit_pass in ("all", "name"):
        name_clusters = dedup_pass_proper_name(all_entities)
        print(f"Pass B/C — proper-name normalize: {len(name_clusters)} clusters ({sum(len(v) for v in name_clusters.values())} entities → {len(name_clusters)} canonicals)")
        clusters.update({f"NAME::{k}": v for k, v in name_clusters.items()})

    # An entity may be planned only once, even if it appears in several clusters.
    seen_paths: set[str] = set()
    plans: list[tuple[str, dict, list[dict]]] = []
    for ckey, cluster in clusters.items():
        cluster = [e for e in cluster if str(e["path"]) not in seen_paths]
        if len(cluster) < 2:
            continue
        canonical = choose_canonical(cluster)
        duplicates = [e for e in cluster if e is not canonical]
        seen_paths.update(str(e["path"]) for e in cluster)
        plans.append((ckey, canonical, duplicates))

    plans.sort(key=lambda p: -len(p[2]))  # biggest clusters first
    redundant_total = sum(len(d) for _, _, d in plans)
    total_entities = len(all_entities)
    # With nothing loaded there is nothing to reduce; avoid dividing by zero.
    reduction_pct = 100 * redundant_total / total_entities if total_entities else 0.0
    print("\n=== Merge plan ===")
    print(f" clusters: {len(plans)}")
    print(f" entities removed: {redundant_total}")
    print(f" before: {total_entities} → after: {total_entities - redundant_total}")
    print(f" reduction: {reduction_pct:.1f}%\n")

    print("=== Top 20 biggest merges ===")
    for ckey, canonical, dupes in plans[:20]:
        cname = canonical["fm"].get("canonical_name", "?")
        print(f" {len(dupes)+1:>3} entities → '{cname}' ({ckey.split('::')[0]})")
        for d in dupes[:4]:
            print(f" ✗ {d['fm'].get('canonical_name', '?')}")
        if len(dupes) > 4:
            print(f" ... +{len(dupes)-4}")

    if args.dry_run:
        print("\n(dry-run; nothing written)")
        return 0

    # Apply merges
    print("\nApplying merges ...")
    merged_count = 0
    archived_count = 0
    for ckey, canonical, dupes in plans:
        merge_into(canonical, dupes)
        canonical["path"].write_text(dump_entity(canonical), encoding="utf-8")
        merged_count += 1
        for d in dupes:
            archive_to = archive_path(d["path"])
            archive_to.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(d["path"]), str(archive_to))
            archived_count += 1
    print(f" canonicals updated: {merged_count}")
    print(f" duplicates archived: {archived_count}")
    return 0
|
|
|
|
|
|
# Run the CLI when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|