disclosure-bureau/scripts/maintain/50_dedup_fuzzy_trigram.py

#!/usr/bin/env python3
"""
Aggressive entity deduplication — layer 2 (fuzzy trigram).

For each entity_class, compares ALL remaining entities via trigram
similarity (Postgres pg_trgm). Automatic merge when:
  - similarity >= 0.85 and both names have ≥2 significant tokens, OR
  - similarity >= 0.92 (stricter threshold, applied to short names)
  - same class
  - state: NOT already archived
  - same "core" (last token after stripping role prefixes)

For ambiguous names (a single-word surname such as "Smith"), a merge is only
performed when there is shared context (same page, same document in most of
the mentions).

Run:
  DATABASE_URL=postgres://... python3 scripts/maintain/50_dedup_fuzzy_trigram.py --dry-run
"""
from __future__ import annotations
import argparse
import os
import re
import shutil
import sys
import unicodedata
from collections import defaultdict
from pathlib import Path

import psycopg
import yaml

# Root directory of the on-disk entity wiki (one sub-folder per entity class).
# Defaults to the previously hard-coded location; set WIKI_ENTITIES_DIR to run
# the script against a different checkout without editing the source.
WIKI_ENT = Path(os.environ.get("WIKI_ENTITIES_DIR") or "/Users/guto/ufo/wiki/entities")
# Merged-away duplicates are moved under this folder, keeping their path
# relative to WIKI_ENT.
ARCHIVED = WIKI_ENT / "_archived"

# Matches ONE leading role/title prefix at the very start of a name:
# honorifics (mr, dr, sir, ...), military and police ranks (major, sgt, ...),
# and offices (director, president, ...). The prefix may be followed by an
# optional '.' and must be followed by whitespace, so a bare "General" with
# nothing after it is not matched. Matching is case-insensitive.
# strip_roles() applies this repeatedly to peel stacked prefixes.
ROLE_PREFIX_RE = re.compile(
    r"^("
    r"mr|mrs|ms|dr|prof|sr|sra|sir|dame|lord|lady|"
    r"major|maj|colonel|col|lt|lieutenant|captain|capt|"
    r"general|gen|sergeant|sgt|corporal|cpl|private|pvt|admiral|adm|commander|cmdr|"
    r"agent|special agent|sa|director|deputy director|deputy|"
    r"reverend|rev|professor|"
    r"president|vice president|vp|chairman|secretary|"
    r"detective|det|inspector"
    r")\.?\s+",
    re.IGNORECASE,
)


def ascii_fold(s: str) -> str:
    """Return ``s`` with diacritics removed.

    The string is NFD-decomposed and every combining mark is dropped, so
    "José" becomes "Jose". Characters that have no decomposition are kept
    unchanged.
    """
    decomposed = unicodedata.normalize("NFD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)


def strip_roles(name: str) -> str:
    """Remove up to three stacked role/title prefixes from the start of ``name``.

    Each pass removes one ROLE_PREFIX_RE match; the loop stops early as soon
    as a pass changes nothing. The result is whitespace-stripped.
    """
    remaining = name
    passes_left = 3
    while passes_left > 0:
        shortened = ROLE_PREFIX_RE.sub("", remaining)
        if shortened == remaining:
            break
        remaining = shortened
        passes_left -= 1
    return remaining.strip()


def core_tokens(name: str) -> set[str]:
    """Return the set of significant tokens in ``name``.

    Role prefixes are stripped, the text is lowercased and accent-folded,
    common punctuation is turned into spaces, and the result is split on
    whitespace. Tokens of length 1 and a short list of English / Portuguese /
    Spanish / French / German / Dutch articles and particles are discarded.
    """
    stopwords = {
        "the", "of", "and", "de", "do", "da", "dos", "das", "el", "la", "los", "las",
        "a", "an", "o", "as", "os", "le", "les", "von", "van"
    }
    folded = ascii_fold(strip_roles(name).lower())
    spaced = re.sub(r"[.,;:!?\"'\(\)\[\]_]", " ", folded)
    return {tok for tok in spaced.split() if len(tok) > 1 and tok not in stopwords}


# Tokens that mix letters and digits (II-22, B-6, mode4, district17, 17th, 3rd)
# These are SIGNIFICANT modifiers — if they differ between two names, the
# names refer to DIFFERENT things.
# The pattern matches a whole token (anchored with ^...$), case-insensitively.
# NOTE(review): this constant is not referenced anywhere else in this file;
# numeric_signature() does its own digit detection. Confirm whether it is
# still needed.
NUMERIC_TOKEN_RE = re.compile(r"^[a-z]*\d+[a-z]*$|^\d+[a-z]+$|^[a-z]+-?\d+[a-z]*$|^[ivxlcdm]+-?\d+$", re.IGNORECASE)


# Trailing short code at the END of a name. Two alternatives:
#   group 1: 1-3 uppercase letters preceded by " - " or "-"  ("... - Z", "...-R")
#   group 2: a single uppercase letter preceded by whitespace ("... M")
# Case-sensitive on purpose: only uppercase letters count as a code.
CODE_SUFFIX_RE = re.compile(r"(?:\s-\s|-)([A-Z]{1,3})$|\s([A-Z])$")


def code_suffix(name: str) -> str | None:
    """Return the trailing short code of ``name``, or ``None`` if there is none.

    A code is what CODE_SUFFIX_RE captures at the end of the stripped name:
    1-3 uppercase letters after " - " or "-", or a single uppercase letter
    after whitespace (e.g. ' - Z', '-R', ' M'). Such suffixes often denote
    sub-categories that differ semantically (FBI classification subdivisions,
    military variants).
    """
    match = CODE_SUFFIX_RE.search(name.strip())
    if match is None:
        return None
    captured = match.group(1) or match.group(2) or ""
    return captured.upper() or None


# Lowercase Roman numerals for 1 through 30.
ROMAN_NUMERALS = {
    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x",
    "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx",
    "xxi", "xxii", "xxiii", "xxiv", "xxv", "xxvi", "xxvii", "xxviii", "xxix", "xxx",
}

# Lowercase ordinal words.
ORDINAL_WORDS = {
    # English: first .. twentieth
    "first", "second", "third", "fourth", "fifth",
    "sixth", "seventh", "eighth", "ninth", "tenth",
    "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
    "sixteenth", "seventeenth", "eighteenth", "nineteenth", "twentieth",
    # Portuguese, spelled without accents: primeiro .. duodecimo
    "primeiro", "segundo", "terceiro", "quarto", "quinto", "sexto",
    "setimo", "oitavo", "nono", "decimo", "undecimo", "duodecimo",
}


def is_variant_marker(tok: str) -> bool:
    """Tell whether ``tok`` distinguishes one member of a series from another.

    A token qualifies when, compared case-insensitively, it is all digits, a
    Roman numeral in ROMAN_NUMERALS, or an ordinal word in ORDINAL_WORDS; or
    when, as given, it is a single UPPERCASE letter (the 'A' in 'Pioneer A').
    Examples: 'A', 'B', 'II', 'XIII', 'Ninth', 'Fourth', '5'.
    """
    lowered = tok.lower()
    if lowered.isdigit() or lowered in ROMAN_NUMERALS or lowered in ORDINAL_WORDS:
        return True
    # The single-letter rule looks at the ORIGINAL casing on purpose.
    return len(tok) == 1 and tok.isalpha() and tok.isupper()


def single_letter_token_diff(name_a: str, name_b: str) -> bool:
    """Tell whether two names differ ONLY by 'variant marker' tokens.

    Variant markers are single uppercase letters, Roman numerals, ordinal
    words and plain digits (see ``is_variant_marker``). Catches:
        Pioneer Launch          vs PIONEER A Launch       (single letter)
        PIONEER-B Launch        vs PIONEER-C Launch
        XII Tactical Air Cmd    vs XIII Tactical Air Cmd  (romans)
        Ninth Air Force         vs Tenth Air Force        (ordinals)
        Apollo                  vs Apollo 11              (digit)
    These are variants of the same program, NOT the same instance.

    Args:
        name_a: First name.
        name_b: Second name.

    Returns:
        True when the names differ and every token present in one name but
        not the other (compared case-insensitively, as a multiset) is a
        variant marker. False when either name has no tokens, when the token
        multisets are equal, or when any differing token is an ordinary word.
    """
    from collections import Counter

    def split_tokens(name: str) -> list[str]:
        # Accent-fold, treat '-' and '_' as separators, and KEEP the original
        # case so single uppercase letters can still be recognised later.
        return re.findall(r"\b[\w]+\b", re.sub(r"[-_]", " ", ascii_fold(name)))

    def is_marker(lower_tok: str, originals: list[str]) -> bool:
        if is_variant_marker(lower_tok):
            return True
        # is_variant_marker only accepts a single letter when it is
        # uppercase, and it was handed the lowercased token. Re-check against
        # the original-case tokens of the name the token came from.
        return (
            len(lower_tok) == 1
            and lower_tok.isalpha()
            and any(o.lower() == lower_tok and o.isupper() for o in originals)
        )

    orig_a = split_tokens(name_a)
    orig_b = split_tokens(name_b)
    if not orig_a or not orig_b:
        return False

    count_a = Counter(t.lower() for t in orig_a)
    count_b = Counter(t.lower() for t in orig_b)
    # Counter subtraction keeps only positive counts: tokens (with
    # multiplicity) that one side has and the other lacks.
    only_a = list((count_a - count_b).elements())
    only_b = list((count_b - count_a).elements())
    if not only_a and not only_b:
        return False

    # all() over an empty list is True, so a side with no extra tokens never
    # blocks the result; at least one side is non-empty at this point.
    return all(is_marker(t, orig_a) for t in only_a) and all(
        is_marker(t, orig_b) for t in only_b
    )


def numeric_signature(name: str) -> frozenset[str]:
    """Collect the normalised digit-bearing tokens of ``name``.

    The name is lowercased and accent-folded and common punctuation (but not
    '-') is replaced by spaces. Every token containing at least one digit
    contributes one entry:
      * plain numbers, optionally with an st/nd/rd/th suffix, lose the suffix
        ("17th" -> "17");
      * any other digit-bearing token loses its hyphens ("ii-22" -> "ii22",
        "b-6" -> "b6").
    Two names with DIFFERENT numeric signatures CANNOT be merged.
    """
    folded = ascii_fold(name.lower())
    spaced = re.sub(r"[.,;:!?\"'\(\)\[\]_]", " ", folded)
    signature = set()
    for token in re.findall(r"\b[\w-]+\b", spaced):
        if re.fullmatch(r"\d+(st|nd|rd|th)?", token):
            # Plain number or numeric ordinal: keep just the digits.
            signature.add(re.sub(r"(st|nd|rd|th)$", "", token))
            continue
        if re.search(r"\d", token):
            # Mixed letters and digits: collapse the hyphenated form.
            signature.add(re.sub(r"[-\s]", "", token))
    return frozenset(signature)


# Wiki sub-folder name -> entity_class value.
FOLDER_TO_CLASS = {
    "people":        "person",
    "organizations": "organization",
    "locations":     "location",
    "events":        "event",
    "uap-objects":   "uap_object",
    "vehicles":      "vehicle",
    "operations":    "operation",
    "concepts":      "concept",
}
# Reverse lookup: entity_class value -> wiki sub-folder name.
CLASS_TO_FOLDER = dict(zip(FOLDER_TO_CLASS.values(), FOLDER_TO_CLASS.keys()))


def load_entity(path: Path) -> dict | None:
    """Read an entity markdown file and split it into front matter and body.

    The file must start with ``---`` and contain a closing ``---`` at the
    start of a later line. The closing delimiter is located line-anchored, so
    a ``---`` that appears INSIDE a front-matter value (an alias, a name) no
    longer truncates the YAML.

    Args:
        path: Location of the ``.md`` file.

    Returns:
        ``{"path": path, "fm": <front-matter dict>, "body": <text after the
        closing delimiter>}``, or ``None`` when the file cannot be read, has
        no complete front-matter block, contains invalid YAML, or its front
        matter is not a mapping. Read and parse failures are reported on
        stderr instead of being silently swallowed.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        print(f"warning: cannot read {path}: {exc}", file=sys.stderr)
        return None

    if not text.startswith("---"):
        return None
    closing = text.find("\n---", 3)
    if closing == -1:
        return None

    try:
        # Keep the newline that precedes the closing delimiter in the YAML.
        fm = yaml.safe_load(text[3:closing + 1]) or {}
    except yaml.YAMLError as exc:
        print(f"warning: invalid front matter in {path}: {exc}", file=sys.stderr)
        return None
    if not isinstance(fm, dict):
        # Callers use fm.get(...) throughout; a list or scalar would crash them.
        print(f"warning: front matter in {path} is not a mapping", file=sys.stderr)
        return None

    # Body starts right after the closing "---" (usually with a newline),
    # which is exactly what dump_entity() appends back.
    return {"path": path, "fm": fm, "body": text[closing + 4:]}


def dump_entity(entity: dict) -> str:
    """Serialise an entity back to markdown: YAML front matter plus body.

    The front matter is dumped with keys in their current order, unicode
    kept as-is and a very wide line limit; the body is appended verbatim
    right after the closing ``---``.
    """
    front_matter = yaml.safe_dump(
        entity["fm"],
        sort_keys=False,
        allow_unicode=True,
        width=1000,
    )
    return "".join(["---\n", front_matter, "---", entity["body"]])


def entity_path_for(cls: str, entity_id: str) -> Path | None:
    """Return the markdown path of an entity, or ``None`` if it is not on disk.

    The path is ``WIKI_ENT/<folder for cls>/<entity_id>.md``. ``None`` is
    returned both for an entity class with no known folder and for a file
    that does not exist.
    """
    folder = CLASS_TO_FOLDER.get(cls)
    if not folder:
        return None
    candidate = WIKI_ENT / folder / f"{entity_id}.md"
    if candidate.exists():
        return candidate
    return None


def merge_into(canonical: dict, duplicate: dict) -> None:
    """Fold ``duplicate``'s front matter into ``canonical``'s, in place.

    Args:
        canonical: Loaded entity (``{"fm": ..., ...}``) that survives the merge.
        duplicate: Loaded entity whose data is absorbed; it is not modified.

    Effects on ``canonical["fm"]``:
      * ``aliases`` becomes the sorted union of both alias lists plus both
        canonical names (empty / missing names dropped).
      * ``mentioned_in``, ``text_mentioned_in``, ``referenced_by`` and
        ``related`` become sorted unions of both sides.
      * ``documents_count``, ``signal_sources``, ``total_mentions`` and
        ``signal_strength`` are recomputed from the merged lists.

    A null or missing ``signal_sources.db_chunks`` is treated as 0 instead of
    raising ``TypeError``.
    """
    cfm = canonical["fm"]
    dfm = duplicate["fm"]
    list_keys = ("mentioned_in", "text_mentioned_in", "referenced_by", "related")

    # Insert missing keys in a fixed order: the YAML dump preserves key order,
    # so this keeps the written file layout stable.
    cfm.setdefault("aliases", [])
    for key in list_keys:
        cfm.setdefault(key, [])

    names = set(cfm["aliases"] or [])
    names.add(cfm.get("canonical_name", ""))
    if dfm.get("canonical_name"):
        names.add(dfm["canonical_name"])
    names.update(dfm.get("aliases") or [])
    names -= {"", None}
    cfm["aliases"] = sorted(names)

    for key in list_keys:
        cfm[key] = sorted(set(cfm[key] or []) | set(dfm.get(key) or []))

    # A mention looks like "<document>/<page>", optionally wrapped in "[[".
    cfm["documents_count"] = len(
        {m.split("/")[0].lstrip("[") for m in cfm["mentioned_in"]}
    )

    sigs = cfm.get("signal_sources") or {"db_chunks": 0, "cross_refs": 0}
    sigs["page_refs"] = len(cfm["mentioned_in"])
    sigs["text_refs"] = len(cfm["text_mentioned_in"])
    sigs["cross_refs"] = len(cfm["referenced_by"])
    # NOTE(review): only the canonical's db_chunks is kept; the duplicate's
    # count is not added. Confirm whether another step refreshes it from the DB.
    sigs["db_chunks"] = int(sigs.get("db_chunks") or 0)
    cfm["signal_sources"] = sigs

    total = sigs["db_chunks"] + sigs["page_refs"] + sigs["cross_refs"] + sigs["text_refs"]
    cfm["total_mentions"] = total
    if total == 0:
        cfm["signal_strength"] = "orphan"
    elif (
        sigs["db_chunks"] >= 3
        or sigs["page_refs"] >= 3
        or (sigs["db_chunks"] >= 1 and sigs["page_refs"] >= 1)
        or sigs["text_refs"] >= 5
    ):
        cfm["signal_strength"] = "strong"
    else:
        cfm["signal_strength"] = "weak"


def choose_canonical(a: dict, b: dict) -> tuple[dict, dict]:
    """Order two loaded entities as ``(canonical, duplicate)``.

    The winner is decided by comparing, in this order: whether the entity has
    a curated summary, how many aliases it has, its ``total_mentions`` and
    finally the length of its canonical name. On a complete tie ``a`` wins.
    """
    def rank(entity: dict) -> tuple:
        meta = entity["fm"]
        is_curated = 1 if meta.get("summary_status") == "curated" else 0
        alias_count = len(meta.get("aliases") or [])
        mentions = meta.get("total_mentions") or 0
        name_length = len(meta.get("canonical_name") or "")
        return (is_curated, alias_count, mentions, name_length)

    return (a, b) if rank(a) >= rank(b) else (b, a)


def _fetch_candidate_pairs(dburl: str, threshold: float) -> list[tuple]:
    """Query Postgres for same-class entity pairs with trigram-similar names.

    Rows are ``(entity_class, id_a, name_a, id_b, name_b, sim)`` ordered by
    descending similarity. ``id_a < id_b``, so each pair is returned once.
    """
    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            # `threshold` is a float that main() has range-checked, so
            # interpolating it here cannot inject SQL.
            cur.execute(f"SET pg_trgm.similarity_threshold = {threshold}")
            cur.execute("""
                SELECT e1.entity_class,
                       e1.entity_id,  e1.canonical_name,
                       e2.entity_id,  e2.canonical_name,
                       similarity(e1.canonical_name, e2.canonical_name) AS sim
                FROM entities e1
                JOIN entities e2
                  ON e1.entity_class = e2.entity_class
                 AND e1.entity_id    < e2.entity_id
                 AND e1.canonical_name % e2.canonical_name
                ORDER BY sim DESC
            """)
            return cur.fetchall()


def _filter_pairs(pairs: list[tuple], threshold_short: float) -> tuple[list[tuple], dict[str, int]]:
    """Apply the name-based safeguards; return (accepted pairs, rejection counts)."""
    accepted = []
    rejected = {"no_overlap": 0, "short": 0, "safeguard": 0}
    for cls, id_a, name_a, id_b, name_b, sim in pairs:
        safe_a = name_a or ""
        safe_b = name_b or ""
        toks_a = core_tokens(safe_a)
        toks_b = core_tokens(safe_b)
        # Must share at least one significant token (avoids "United States"
        # matching "United Kingdom"). An empty side can never intersect.
        if not (toks_a & toks_b):
            rejected["no_overlap"] += 1
            continue
        # If one side is single-token, require the stricter threshold.
        if (len(toks_a) <= 1 or len(toks_b) <= 1) and sim < threshold_short:
            rejected["short"] += 1
            continue
        # NUMERIC SAFEGUARD: different numeric signatures mean different
        # objects (NAVSTAR II-2 vs II-24, Mode 3 vs Mode 4, 17th vs 13th).
        # CODE SUFFIX SAFEGUARD: 'INTERNAL SECURITY - Z' != '... - X' != base;
        # code_suffix() yields None or a non-empty code, so plain inequality
        # covers "either has a suffix and they differ".
        # VARIANT TOKEN SAFEGUARD: 'PIONEER A Launch' vs 'PIONEER-B Launch'
        # vs 'Pioneer Launch' are distinct missions of the same program.
        if (
            numeric_signature(safe_a) != numeric_signature(safe_b)
            or code_suffix(safe_a) != code_suffix(safe_b)
            or single_letter_token_diff(safe_a, safe_b)
        ):
            rejected["safeguard"] += 1
            continue
        accepted.append((cls, id_a, name_a, id_b, name_b, sim))
    return accepted, rejected


def _build_clusters(accepted: list[tuple]) -> list[list[tuple[str, str]]]:
    """Group accepted pairs into connected components of (class, entity_id) nodes."""
    parent: dict[tuple[str, str], tuple[str, str]] = {}

    def find(node):
        while parent[node] != node:
            parent[node] = parent[parent[node]]  # shortcut to the grandparent
            node = parent[node]
        return node

    for cls, id_a, _, id_b, _, _ in accepted:
        a, b = (cls, id_a), (cls, id_b)
        parent.setdefault(a, a)
        parent.setdefault(b, b)
        root_a, root_b = find(a), find(b)
        if root_a != root_b:
            parent[root_b] = root_a

    # NOTE(review): clustering is transitive. A~B and B~C put A and C in one
    # cluster even if the A/C pair itself was rejected by a safeguard.
    groups: dict[tuple[str, str], list[tuple[str, str]]] = defaultdict(list)
    for node in list(parent):
        groups[find(node)].append(node)
    return [members for members in groups.values() if len(members) > 1]


def _merge_cluster_on_disk(cluster: list[tuple[str, str]]) -> int | None:
    """Merge one cluster's files; return duplicates archived, or None if skipped."""
    loaded = []
    for cls, eid in cluster:
        path = entity_path_for(cls, eid)
        entity = load_entity(path) if path else None
        if entity:
            loaded.append(entity)
    if len(loaded) < 2:
        return None

    # Fold the shared scoring rule over the cluster; the earliest entity wins
    # ties, exactly as max() over the same score would.
    canonical = loaded[0]
    for candidate in loaded[1:]:
        canonical, _ = choose_canonical(canonical, candidate)
    dupes = [e for e in loaded if e is not canonical]

    for dupe in dupes:
        merge_into(canonical, dupe)
    canonical["path"].write_text(dump_entity(canonical), encoding="utf-8")
    for dupe in dupes:
        target = ARCHIVED / dupe["path"].relative_to(WIKI_ENT)
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(dupe["path"]), str(target))
    return len(dupes)


def main() -> int:
    """Command-line entry point: find, report and (unless --dry-run) apply merges.

    Reads the database URL from DATABASE_URL or SUPABASE_DB_URL. Flags:
    ``--dry-run`` (report only), ``--threshold`` (trigram similarity, 0..1),
    ``--threshold-short`` (threshold when either name has at most one
    significant token) and ``--limit`` (apply at most N cluster merges;
    ``--limit 0`` applies none).

    Returns:
        0 on success. Exits via ``sys.exit`` / ``argparse`` on bad input.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--threshold", type=float, default=0.85,
                    help="trigram similarity threshold (0..1)")
    ap.add_argument("--threshold-short", type=float, default=0.92,
                    help="higher threshold for single-token names")
    ap.add_argument("--limit", type=int, default=None,
                    help="apply at most N merges (for cautious runs)")
    args = ap.parse_args()
    # Also rejects NaN: every comparison with NaN is False.
    if not 0.0 <= args.threshold <= 1.0:
        ap.error("--threshold must be between 0 and 1")

    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    pairs = _fetch_candidate_pairs(dburl, args.threshold)
    print(f"Trigram candidate pairs (sim >= {args.threshold}): {len(pairs)}")

    accepted, rejected = _filter_pairs(pairs, args.threshold_short)
    print(f"  rejected (no token overlap):           {rejected['no_overlap']}")
    print(f"  rejected (single-token below {args.threshold_short}):   {rejected['short']}")
    # This count also includes code-suffix and variant-token rejections.
    print(f"  rejected (numeric signature mismatch): {rejected['safeguard']}")
    print(f"  ACCEPTED for merge: {len(accepted)}")

    clusters = _build_clusters(accepted)
    print(f"\nClusters after union-find: {len(clusters)}")
    print(f"Entities to remove: {sum(len(members) - 1 for members in clusters)}\n")

    # Show a sample of the largest clusters so the operator can sanity-check.
    biggest = sorted(clusters, key=lambda c: -len(c))[:15]
    print("=== Top 15 biggest fuzzy clusters ===")
    for cluster in biggest:
        names = []
        for cls, eid in cluster:
            path = entity_path_for(cls, eid)
            entity = load_entity(path) if path else None
            if entity:
                names.append(entity["fm"].get("canonical_name") or eid)
        if not names:
            continue
        print(f"  [{cluster[0][0]}] {len(cluster)} entities:")
        for n in names[:6]:
            print(f"     - {n}")
        if len(names) > 6:
            print(f"     ... +{len(names)-6}")

    if args.dry_run:
        print("\n(dry-run; nothing written)")
        return 0

    print("\nApplying merges ...")
    applied = 0
    archived = 0
    for cluster in clusters:
        # `is not None` so that an explicit --limit 0 applies nothing instead
        # of being mistaken for "no limit".
        if args.limit is not None and applied >= args.limit:
            break
        moved = _merge_cluster_on_disk(cluster)
        if moved is None:
            continue
        applied += 1
        archived += moved

    print(f"  canonicals updated: {applied}")
    print(f"  duplicates archived: {archived}")
    return 0


# Run the CLI when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())