Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
428 lines
17 KiB
Python
428 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Aggressive entity deduplication — layer 2 (fuzzy trigram).

For each entity_class, compare ALL remaining entities via trigram
similarity (Postgres pg_trgm). Merge automatically when:
  - similarity >= 0.85 and both names have >= 2 significant tokens, OR
  - similarity >= 0.92 when either name has a single significant token
    (short names need the stricter bar)
  - same class
  - state: NOT already archived
  - same "core" (last token after stripping role prefixes)

For ambiguous names (single-word surnames such as "Smith"), merge only if
there is shared context (same page, same document in most of the
mentions).

NOTE(review): the filter in main() only requires one shared significant
token; the "same core" and shared-context rules above do not appear to be
enforced in this script — confirm whether they are applied elsewhere.

Run:
  DATABASE_URL=postgres://... python3 scripts/maintain/50_dedup_fuzzy_trigram.py --dry-run
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import os
|
|
import re
|
|
import shutil
|
|
import sys
|
|
import unicodedata
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
import psycopg
|
|
import yaml
|
|
|
|
# Root folder holding one sub-folder of entity markdown files per class.
# NOTE(review): hard-coded absolute path — the script only works on a machine
# with this exact layout.
WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
# Duplicates are moved under this folder instead of being deleted.
ARCHIVED = WIKI_ENT.joinpath("_archived")

# Role/title words recognised as a leading prefix of a name. Order matters:
# it is the alternation order of the compiled pattern.
_ROLE_PREFIXES = (
    # honorifics
    "mr", "mrs", "ms", "dr", "prof", "sr", "sra", "sir", "dame", "lord", "lady",
    # military ranks
    "major", "maj", "colonel", "col", "lt", "lieutenant", "captain", "capt",
    "general", "gen", "sergeant", "sgt", "corporal", "cpl", "private", "pvt",
    "admiral", "adm", "commander", "cmdr",
    # agency roles
    "agent", "special agent", "sa", "director", "deputy director", "deputy",
    # clergy / academia
    "reverend", "rev", "professor",
    # offices
    "president", "vice president", "vp", "chairman", "secretary",
    # police
    "detective", "det", "inspector",
)

# Matches ONE leading role prefix (optionally followed by a dot) plus the
# whitespace after it, case-insensitively.
ROLE_PREFIX_RE = re.compile(
    r"^(" + "|".join(_ROLE_PREFIXES) + r")\.?\s+",
    re.IGNORECASE,
)
|
|
|
|
|
|
def ascii_fold(s: str) -> str:
    """Return *s* NFD-decomposed with every combining mark dropped.

    Accented letters lose their diacritics; characters that have no
    decomposition are returned unchanged.
    """
    decomposed = unicodedata.normalize("NFD", s)
    kept = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(kept)
|
|
|
|
|
|
def strip_roles(name: str) -> str:
    """Drop up to three leading role/title prefixes from *name*.

    Each pass removes one prefix matched by ROLE_PREFIX_RE; the loop stops
    early once a pass changes nothing. The result is whitespace-trimmed.
    """
    remaining = name
    passes_left = 3
    while passes_left > 0:
        stripped = ROLE_PREFIX_RE.sub("", remaining)
        if stripped == remaining:
            break
        remaining = stripped
        passes_left -= 1
    return remaining.strip()
|
|
|
|
|
|
def core_tokens(name: str) -> set[str]:
    """Return the set of significant tokens of *name*.

    Role prefixes are stripped, the text is lowercased and accent-folded,
    punctuation is replaced by spaces, and single-character tokens and
    stopwords are discarded.
    """
    stopwords = {
        "the", "of", "and", "de", "do", "da", "dos", "das", "el", "la", "los", "las",
        "a", "an", "o", "as", "os", "le", "les", "von", "van"
    }
    folded = ascii_fold(strip_roles(name).lower())
    cleaned = re.sub(r"[.,;:!?\"'\(\)\[\]_]", " ", folded)
    significant = set()
    for word in cleaned.split():
        if len(word) <= 1 or word in stopwords:
            continue
        significant.add(word)
    return significant
|
|
|
|
|
|
# Matches a whole token that mixes letters and digits (II-22, B-6, mode4,
# district17, 17th, 3rd). Such tokens are SIGNIFICANT modifiers: when they
# differ between two names, the names refer to DIFFERENT things.
# NOTE(review): this constant is not referenced by the code visible in this
# file — confirm whether anything else imports it.
NUMERIC_TOKEN_RE = re.compile(
    r"^[a-z]*\d+[a-z]*$"
    r"|^\d+[a-z]+$"
    r"|^[a-z]+-?\d+[a-z]*$"
    r"|^[ivxlcdm]+-?\d+$",
    re.IGNORECASE,
)
|
|
|
|
|
|
# Trailing code at the very end of a name: either 1-3 uppercase letters after
# a hyphen (" - Z", "-AB"), or one uppercase letter after whitespace (" M").
CODE_SUFFIX_RE = re.compile(r"(?:\s-\s|-)([A-Z]{1,3})$|\s([A-Z])$")


def code_suffix(name: str) -> str | None:
    """Return the trailing short code of *name*, or None when there is none.

    Recognised forms are ' - Z', '-R' (1-3 uppercase letters after a hyphen)
    and ' M' (one uppercase letter after whitespace). Such codes often denote
    sub-categories that differ semantically (FBI classification
    subdivisions, military variants).
    """
    found = CODE_SUFFIX_RE.search(name.strip())
    if found is None:
        return None
    hyphen_code, bare_letter = found.groups()
    code = (hyphen_code or bare_letter or "").upper()
    return code or None
|
|
|
|
|
|
# Lowercase roman numerals 1-30.
ROMAN_NUMERALS = {
    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x",
    "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx",
    "xxi", "xxii", "xxiii", "xxiv", "xxv", "xxvi", "xxvii", "xxviii", "xxix", "xxx",
}
# Lowercase, accent-free ordinal words: English first-twentieth, then
# Portuguese primeiro-duodecimo.
ORDINAL_WORDS = {
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
    "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
    "sixteenth", "seventeenth", "eighteenth", "nineteenth", "twentieth",
    "primeiro", "segundo", "terceiro", "quarto", "quinto", "sexto", "setimo",
    "oitavo", "nono", "decimo", "undecimo", "duodecimo",
}


def is_variant_marker(tok: str) -> bool:
    """Tell whether *tok* distinguishes one member of a series from another.

    True for all-digit tokens, roman numerals and ordinal words (both matched
    case-insensitively), and for a single UPPERCASE letter — e.g. 'A', 'B',
    'II', 'XIII', 'Ninth', 'Fourth', '5'.
    """
    lowered = tok.lower()
    if lowered.isdigit() or lowered in ROMAN_NUMERALS or lowered in ORDINAL_WORDS:
        return True
    # Single uppercase letter, e.g. the 'A' in 'Pioneer A'.
    return len(tok) == 1 and tok.isalpha() and tok.isupper()
|
|
|
|
|
|
def single_letter_token_diff(name_a: str, name_b: str) -> bool:
    """Tell whether two names differ ONLY by 'variant marker' tokens.

    Variant markers are single letters, roman numerals, ordinals and digits.
    Examples that return True:
        Pioneer Launch        vs PIONEER A Launch       (single letter)
        PIONEER-B Launch      vs PIONEER-C Launch
        XII Tactical Air Cmd  vs XIII Tactical Air Cmd  (romans)
        Ninth Air Force       vs Tenth Air Force        (ordinals)
        Apollo                vs Apollo 11              (digit)
    These are variants of the same program, NOT the same instance.

    Returns False when either name has no tokens, when the token multisets
    are equal ignoring case, or when any differing token is not a marker.
    """
    from collections import Counter

    def split_words(raw: str) -> list[str]:
        # Hyphens/underscores act as separators. Original case is preserved
        # so that single UPPERCASE letters can be recognised afterwards.
        return re.findall(r"\b[\w]+\b", re.sub(r"[-_]", " ", ascii_fold(raw)))

    words_a = split_words(name_a)
    words_b = split_words(name_b)
    if not words_a or not words_b:
        return False

    # Case-insensitive multiset difference in each direction.
    count_a = Counter(w.lower() for w in words_a)
    count_b = Counter(w.lower() for w in words_b)
    only_in_a = list((count_a - count_b).elements())
    only_in_b = list((count_b - count_a).elements())
    if not only_in_a and not only_in_b:
        return False

    def is_marker(lowered: str, originals: list[str]) -> bool:
        if is_variant_marker(lowered):
            return True
        # is_variant_marker only accepts UPPERCASE single letters, and the
        # token here is already lowercased: look up its original spelling.
        if len(lowered) == 1 and lowered.isalpha():
            return any(o.lower() == lowered and o.isupper() for o in originals)
        return False

    # At least one side differs at this point, so the names are variants
    # exactly when every differing token on both sides is a marker.
    return (
        all(is_marker(tok, words_a) for tok in only_in_a)
        and all(is_marker(tok, words_b) for tok in only_in_b)
    )
|
|
|
|
|
|
def numeric_signature(name: str) -> frozenset[str]:
    """Collect every digit-bearing token of *name* in normalised form.

    Two names with DIFFERENT numeric signatures must not be merged.
    Normalisation: the name is lowercased and accent-folded, punctuation
    becomes spaces, ordinal suffixes are dropped ("17th" -> "17") and
    hyphens inside mixed tokens are removed ("II-22" -> "ii22", "b-6" -> "b6").
    """
    lowered = ascii_fold(name.lower())
    cleaned = re.sub(r"[.,;:!?\"'\(\)\[\]_]", " ", lowered)
    signature = set()
    for token in re.findall(r"\b[\w-]+\b", cleaned):
        if re.fullmatch(r"\d+(st|nd|rd|th)?", token):
            # Pure number, possibly with an ordinal suffix.
            signature.add(re.sub(r"(st|nd|rd|th)$", "", token))
            continue
        if re.search(r"\d", token):
            # Letters mixed with digits (II-22, b-6, mode4).
            signature.add(re.sub(r"[-\s]", "", token))
    return frozenset(signature)
|
|
|
|
|
|
# Sub-folder name under WIKI_ENT -> entity_class value.
FOLDER_TO_CLASS = {
    "people": "person",
    "organizations": "organization",
    "locations": "location",
    "events": "event",
    "uap-objects": "uap_object",
    "vehicles": "vehicle",
    "operations": "operation",
    "concepts": "concept",
}
# Reverse lookup: entity_class value -> sub-folder name.
CLASS_TO_FOLDER = {
    entity_class: folder for folder, entity_class in FOLDER_TO_CLASS.items()
}
|
|
|
|
|
|
def load_entity(path: Path) -> dict | None:
    """Read an entity markdown file and split it into front matter and body.

    Args:
        path: File expected to start with a ``---`` delimited YAML block.

    Returns:
        ``{"path": path, "fm": <front-matter dict>, "body": <rest of file>}``,
        or None when the file cannot be read, does not start with ``---``,
        has no closing delimiter, or its front matter is not a YAML mapping.
        Read/parse failures are reported on stderr instead of being dropped
        silently.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        print(f"warning: cannot read {path}: {exc}", file=sys.stderr)
        return None
    if not text.startswith("---"):
        return None

    # Prefer a closing delimiter that starts a line, so a "---" embedded in a
    # front-matter value (e.g. an alias) does not truncate the YAML and leak
    # the remainder into the body. Fall back to the first "---" occurrence
    # for files whose delimiter is not on its own line.
    closing = re.compile(r"\n---(?=\s|\Z)").search(text, 3)
    if closing is not None:
        fm_end = closing.start() + 1  # keep the newline that ends the YAML
        body_start = closing.end()
    else:
        fm_end = text.find("---", 3)
        if fm_end < 0:
            return None
        body_start = fm_end + 3

    try:
        fm = yaml.safe_load(text[3:fm_end]) or {}
    except (yaml.YAMLError, ValueError) as exc:
        # ValueError is caught as well because value construction (e.g. an
        # impossible date) may presumably raise it outside yaml.YAMLError.
        print(f"warning: invalid front matter in {path}: {exc}", file=sys.stderr)
        return None
    if not isinstance(fm, dict):
        # The merge code calls .get() on the front matter; a scalar or list
        # would only fail later, in the middle of a merge.
        print(f"warning: front matter of {path} is not a mapping", file=sys.stderr)
        return None
    return {"path": path, "fm": fm, "body": text[body_start:]}
|
|
|
|
|
|
def dump_entity(entity: dict) -> str:
    """Serialise an entity back to markdown: YAML front matter, then body.

    Front-matter keys keep their insertion order, unicode is written as-is
    and lines are not wrapped below 1000 columns. The body is appended
    verbatim right after the closing ``---``.
    """
    front_matter = yaml.safe_dump(
        entity["fm"],
        sort_keys=False,
        allow_unicode=True,
        width=1000,
    )
    return "".join(("---\n", front_matter, "---", entity["body"]))
|
|
|
|
|
|
def entity_path_for(cls: str, entity_id: str) -> Path | None:
    """Return the markdown path of an entity, or None if it cannot be found.

    None is returned both when *cls* has no known folder and when the
    expected ``<folder>/<entity_id>.md`` file does not exist under WIKI_ENT.
    """
    folder = CLASS_TO_FOLDER.get(cls)
    if not folder:
        return None
    candidate = WIKI_ENT / folder / f"{entity_id}.md"
    if candidate.exists():
        return candidate
    return None
|
|
|
|
|
|
def _sorted_union(first, second) -> list:
    """Return the sorted, de-duplicated union of two optional iterables."""
    return sorted(set(first or []) | set(second or []))


def merge_into(canonical: dict, duplicate: dict) -> None:
    """Fold the front matter of *duplicate* into *canonical*, in place.

    Args:
        canonical: Entity dict whose ``"fm"`` mapping is updated.
        duplicate: Entity dict whose ``"fm"`` mapping is only read.

    Effects on ``canonical["fm"]``:
        - ``aliases`` becomes the sorted union of both alias lists plus both
          canonical names (empty/None entries dropped);
        - ``mentioned_in``, ``text_mentioned_in``, ``referenced_by`` and
          ``related`` become sorted unions of both sides;
        - ``documents_count``, ``signal_sources``, ``total_mentions`` and
          ``signal_strength`` are recomputed from the merged lists.

    NOTE(review): ``db_chunks`` is taken from the canonical only; the
    duplicate's count is not added — confirm that this is intended.
    """
    cfm = canonical["fm"]
    dfm = duplicate["fm"]

    # Create missing keys in a fixed order: the front matter is dumped
    # without key sorting, so insertion order is visible in the file.
    for key in ("aliases", "mentioned_in", "text_mentioned_in", "referenced_by", "related"):
        cfm.setdefault(key, [])

    names = set(cfm["aliases"] or [])
    names.add(cfm.get("canonical_name", ""))
    if dfm.get("canonical_name"):
        names.add(dfm["canonical_name"])
    names.update(dfm.get("aliases") or [])
    names.discard("")
    names.discard(None)
    cfm["aliases"] = sorted(names)

    for key in ("mentioned_in", "text_mentioned_in", "referenced_by", "related"):
        cfm[key] = _sorted_union(cfm[key], dfm.get(key))

    # The document id is the part of a mention before the first "/",
    # without leading "[" characters.
    cfm["documents_count"] = len(
        {mention.split("/")[0].lstrip("[") for mention in cfm["mentioned_in"]}
    )

    sigs = cfm.get("signal_sources") or {"db_chunks": 0, "cross_refs": 0}
    sigs["page_refs"] = len(cfm["mentioned_in"])
    sigs["text_refs"] = len(cfm["text_mentioned_in"])
    sigs["cross_refs"] = len(cfm["referenced_by"])
    # `or 0` covers an explicit null in the YAML, which int() would reject
    # with a TypeError halfway through a merge.
    sigs["db_chunks"] = int(sigs.get("db_chunks") or 0)
    cfm["signal_sources"] = sigs

    total = sigs["db_chunks"] + sigs["page_refs"] + sigs["cross_refs"] + sigs["text_refs"]
    cfm["total_mentions"] = total

    if total == 0:
        cfm["signal_strength"] = "orphan"
    elif (
        sigs["db_chunks"] >= 3
        or sigs["page_refs"] >= 3
        or (sigs["db_chunks"] >= 1 and sigs["page_refs"] >= 1)
        or sigs["text_refs"] >= 5
    ):
        cfm["signal_strength"] = "strong"
    else:
        cfm["signal_strength"] = "weak"
|
|
|
|
|
|
def choose_canonical(a: dict, b: dict) -> tuple[dict, dict]:
    """Order two entities as ``(canonical, duplicate)``.

    Preference, in decreasing priority: a curated summary, the longer alias
    list, the higher total_mentions, the longer canonical name. On a full
    tie *a* is kept as the canonical.
    """
    def rank(entity: dict) -> tuple:
        meta = entity["fm"]
        curated = 1 if meta.get("summary_status") == "curated" else 0
        alias_count = len(meta.get("aliases") or [])
        mentions = meta.get("total_mentions") or 0
        name_length = len(meta.get("canonical_name") or "")
        return (curated, alias_count, mentions, name_length)

    if rank(a) < rank(b):
        return b, a
    return a, b
|
|
|
|
|
|
def _passes_variant_safeguards(name_a: str, name_b: str) -> bool:
    """Return True when no numeric, code-suffix or variant-marker difference separates the names."""
    # Different numeric signatures mean different objects (NAVSTAR II-2 vs
    # II-24, Mode 3 vs Mode 4, 17th District vs 13th District, ...).
    if numeric_signature(name_a) != numeric_signature(name_b):
        return False
    # If EITHER name has a short code suffix, both must carry the same one:
    # 'INTERNAL SECURITY - Z' != 'INTERNAL SECURITY - X' != 'INTERNAL SECURITY'.
    suffix_a, suffix_b = code_suffix(name_a), code_suffix(name_b)
    if (suffix_a or suffix_b) and suffix_a != suffix_b:
        return False
    # 'PIONEER A Launch' vs 'PIONEER-B Launch' vs 'Pioneer Launch' are
    # distinct missions of the same program.
    return not single_letter_token_diff(name_a, name_b)


def _cluster_pairs(accepted: list) -> dict:
    """Group accepted pairs into transitive clusters keyed by their union-find root."""
    # NOTE(review): clustering is transitive, so two entities whose direct
    # pair was rejected can still share a cluster through an intermediate
    # entity — confirm that this is acceptable.
    parent: dict[tuple[str, str], tuple[str, str]] = {}

    def find(node):
        # Path halving keeps the trees shallow.
        while parent[node] != node:
            parent[node] = parent[parent[node]]
            node = parent[node]
        return node

    for cls, id_a, _, id_b, _, _ in accepted:
        a, b = (cls, id_a), (cls, id_b)
        parent.setdefault(a, a)
        parent.setdefault(b, b)
        root_a, root_b = find(a), find(b)
        if root_a != root_b:
            parent[root_b] = root_a

    clusters = defaultdict(list)
    for node in list(parent):
        clusters[find(node)].append(node)
    return {root: members for root, members in clusters.items() if len(members) > 1}


def _load_cluster(cluster: list) -> list:
    """Load every cluster member whose markdown file exists and parses."""
    loaded = []
    for cls, eid in cluster:
        path = entity_path_for(cls, eid)
        if path is None:
            continue
        entity = load_entity(path)
        if entity is not None:
            loaded.append(entity)
    return loaded


def main() -> int:
    """Find fuzzy-duplicate entities via pg_trgm and merge them on disk.

    Steps: fetch same-class pairs above the trigram threshold from Postgres,
    filter them with the token/numeric/code/variant safeguards, cluster the
    survivors, print a report and, unless --dry-run is given, merge each
    cluster into one canonical file and move the others under ARCHIVED.

    Returns:
        0 on success. Exits through sys.exit when no database URL is set and
        through argparse's error exit when --threshold is out of range.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--threshold", type=float, default=0.85,
                    help="trigram similarity threshold (0..1)")
    ap.add_argument("--threshold-short", type=float, default=0.92,
                    help="higher threshold for single-token names")
    ap.add_argument("--limit", type=int, default=None,
                    help="apply at most N merges (for cautious runs)")
    args = ap.parse_args()
    # The value is interpolated into a SET statement below; fail early with a
    # clear message instead of a database error (this also rejects NaN).
    if not 0.0 <= args.threshold <= 1.0:
        ap.error(f"--threshold must be between 0 and 1, got {args.threshold}")

    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            cur.execute(f"SET pg_trgm.similarity_threshold = {args.threshold}")
            # All entity pairs above the threshold in the SAME class; the
            # e1 < e2 ordering avoids returning each pair twice. No parameters
            # are passed, so "%" is the pg_trgm similarity operator.
            cur.execute("""
                SELECT e1.entity_class,
                       e1.entity_id, e1.canonical_name,
                       e2.entity_id, e2.canonical_name,
                       similarity(e1.canonical_name, e2.canonical_name) AS sim
                FROM entities e1
                JOIN entities e2
                  ON e1.entity_class = e2.entity_class
                 AND e1.entity_id < e2.entity_id
                 AND e1.canonical_name % e2.canonical_name
                ORDER BY sim DESC
            """)
            pairs = cur.fetchall()

    print(f"Trigram candidate pairs (sim >= {args.threshold}): {len(pairs)}")

    # Filter pairs:
    #  - they must share at least 1 significant core token (avoids
    #    "United States" matching "United Kingdom");
    #  - if either name is single-token AFTER role strip, require the higher
    #    threshold;
    #  - numeric / code-suffix / variant-marker safeguards must all pass
    #    (all three are counted under `rejected_numeric`).
    accepted = []
    rejected_short = 0
    rejected_no_overlap = 0
    rejected_numeric = 0
    for cls, id_a, name_a, id_b, name_b, sim in pairs:
        safe_a = name_a or ""
        safe_b = name_b or ""
        toks_a = core_tokens(safe_a)
        toks_b = core_tokens(safe_b)
        if not toks_a or not toks_b or not (toks_a & toks_b):
            rejected_no_overlap += 1
            continue
        if (len(toks_a) <= 1 or len(toks_b) <= 1) and sim < args.threshold_short:
            rejected_short += 1
            continue
        if not _passes_variant_safeguards(safe_a, safe_b):
            rejected_numeric += 1
            continue
        accepted.append((cls, id_a, name_a, id_b, name_b, sim))

    print(f" rejected (no token overlap): {rejected_no_overlap}")
    print(f" rejected (single-token below {args.threshold_short}): {rejected_short}")
    print(f" rejected (numeric signature mismatch): {rejected_numeric}")
    print(f" ACCEPTED for merge: {len(accepted)}")

    # Union-find over accepted pairs so transitive clusters merge correctly.
    clusters = _cluster_pairs(accepted)

    print(f"\nClusters after union-find: {len(clusters)}")
    print(f"Entities to remove: {sum(len(v) - 1 for v in clusters.values())}\n")

    # Sample the biggest clusters for the report.
    biggest = sorted(clusters.values(), key=len, reverse=True)[:15]
    print("=== Top 15 biggest fuzzy clusters ===")
    for cluster in biggest:
        # The file stem is the entity id; it is the fallback display name.
        names = [
            ent["fm"].get("canonical_name") or ent["path"].stem
            for ent in _load_cluster(cluster)
        ]
        if not names:
            continue
        cls = cluster[0][0]
        print(f" [{cls}] {len(cluster)} entities:")
        for n in names[:6]:
            print(f" - {n}")
        if len(names) > 6:
            print(f" ... +{len(names)-6}")

    if args.dry_run:
        print("\n(dry-run; nothing written)")
        return 0

    # Apply merges.
    print("\nApplying merges ...")
    applied = 0
    archived = 0
    for cluster in clusters.values():
        # `is not None` so that an explicit `--limit 0` applies no merge
        # instead of being treated as "no limit".
        if args.limit is not None and applied >= args.limit:
            break
        loaded = _load_cluster(cluster)
        if len(loaded) < 2:
            continue
        # Pick the best-scoring entity with the shared pairwise rule; on ties
        # the earlier entity wins.
        canonical = loaded[0]
        for candidate in loaded[1:]:
            canonical, _ = choose_canonical(canonical, candidate)
        dupes = [e for e in loaded if e is not canonical]
        for d in dupes:
            merge_into(canonical, d)
        canonical["path"].write_text(dump_entity(canonical), encoding="utf-8")
        for d in dupes:
            # Mirror the duplicate's class folder under ARCHIVED.
            arch = ARCHIVED / d["path"].relative_to(WIKI_ENT)
            arch.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(d["path"]), str(arch))
            archived += 1
        applied += 1

    print(f" canonicals updated: {applied}")
    print(f" duplicates archived: {archived}")
    return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|