disclosure-bureau/scripts/maintain/51_remap_entity_mentions.py

#!/usr/bin/env python3
"""
After scripts 49/50 archive duplicates, the DB's `entities` + `entity_mentions`
tables still point at the OLD (now-archived) entity_pks. This script:

  1. Re-reads every active entity YAML from wiki/entities/ (skipping _archived)
  2. For each entity, looks up the corresponding entity_pk in the DB by
     (entity_class, entity_id).
  3. Reads the aliases[] from the YAML and finds DB entities with matching
     entity_id that no longer exist on disk — those are the merged-away ones.
  4. UPDATE entity_mentions SET entity_pk = <canonical_pk> WHERE entity_pk IN (<archived_pks>)
  5. DELETE FROM entities WHERE entity_class||entity_id IS NOT in active set

Idempotent — re-running is a no-op once converged.
"""
from __future__ import annotations
import os
import sys
from pathlib import Path

import psycopg
import yaml

# Root directory of the entity wiki; entity Markdown files are searched
# recursively below it.
WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
# Sub-directory of WIKI_ENT that holds archived (merged-away) entity files.
ARCHIVED = WIKI_ENT.joinpath("_archived")


def _scan_entities(root: Path, *, skip_archived: bool) -> list[tuple[str, str, dict]]:
    """Collect ``(entity_class, entity_id, front_matter)`` for entity files under *root*.

    Every ``*.md`` file below *root* whose text starts with ``---`` has its
    YAML front matter parsed.  Files without front matter, or without both an
    entity class and an entity id, are skipped silently.  Files that cannot be
    read or parsed are skipped with a warning on stderr.

    Args:
        root: Directory to search recursively; a missing directory yields ``[]``.
        skip_archived: When true, ignore any file with ``_archived`` among its
            path components.

    Returns:
        One tuple per usable file, in sorted path order.
    """
    found: list[tuple[str, str, dict]] = []
    if not root.exists():
        return found
    # Sorted so that alias collisions between files resolve the same way on
    # every run (rglob order is filesystem-dependent).
    for path in sorted(root.rglob("*.md")):
        if skip_archived and "_archived" in path.parts:
            continue
        try:
            text = path.read_text(encoding="utf-8")
            if not text.startswith("---"):
                continue
            # Only the text between the first two "---" markers is needed.
            fm = yaml.safe_load(text.split("---", 2)[1]) or {}
        except Exception as exc:
            # Best effort: one bad file must not abort the run.  It must not
            # pass unnoticed either: an unreadable *active* file makes its DB
            # entity look archived further down.
            print(f"  WARNING: skipping unreadable {path}: {exc}", file=sys.stderr)
            continue
        if not isinstance(fm, dict):
            print(f"  WARNING: skipping {path}: front matter is not a mapping",
                  file=sys.stderr)
            continue
        cls = fm.get("entity_class")
        # Fall back to the class-specific "<class>_id" key when there is no
        # generic entity_id.
        eid = fm.get("entity_id") or (fm.get(f"{cls}_id") if cls else None)
        if cls and eid:
            found.append((cls, eid, fm))
    return found


def _build_alias_index(
    entities: list[tuple[str, str, dict]],
) -> dict[tuple[str, str], tuple[str, str]]:
    """Map ``(entity_class, lower-cased name)`` to the owning ``(entity_class, entity_id)``.

    Both the ``aliases`` entries and the ``canonical_name`` of every entity
    are indexed.  Non-string and blank names are ignored.  When two entities
    of the same class claim the same name, the later one in *entities* wins.
    """
    index: dict[tuple[str, str], tuple[str, str]] = {}
    for cls, eid, fm in entities:
        aliases = fm.get("aliases")
        if isinstance(aliases, str):
            # A scalar "aliases: foo" is one alias, not a sequence of characters.
            aliases = [aliases]
        elif not isinstance(aliases, (list, tuple)):
            aliases = []
        # canonical_name goes last so it overrides an identical alias key.
        for name in [*aliases, fm.get("canonical_name")]:
            if not isinstance(name, str):
                continue
            key = name.strip().lower()
            if key:
                index[(cls, key)] = (cls, eid)
    return index


def _resolve_remap(
    db_rows: list[tuple],
    active: set[tuple[str, str]],
    alias_index: dict[tuple[str, str], tuple[str, str]],
) -> tuple[list[tuple[int, int]], list[tuple[str, str]]]:
    """Pair every archived DB entity with the pk of its canonical replacement.

    Args:
        db_rows: ``(entity_pk, entity_class, entity_id, canonical_name)`` rows.
        active: ``(entity_class, entity_id)`` keys that still exist on disk.
        alias_index: Output of :func:`_build_alias_index`.

    Returns:
        ``(remap_pairs, orphans)``.  ``remap_pairs`` holds ``(old_pk, new_pk)``
        tuples.  ``orphans`` holds the keys of DB entities that are not active
        and have no canonical match: no usable name, no index hit, or a
        target that is missing from the DB.
    """
    db_by_key = {(cls, eid): (pk, name) for pk, cls, eid, name in db_rows}
    remap_pairs: list[tuple[int, int]] = []
    orphans: list[tuple[str, str]] = []
    for key, (db_pk, db_name) in db_by_key.items():
        if key in active:
            continue
        # Not on disk any more, so treat it as archived.  A NULL or blank
        # canonical_name can never match, which makes the entity an orphan.
        lookup = db_name.strip().lower() if isinstance(db_name, str) else ""
        target = alias_index.get((key[0], lookup)) if lookup else None
        target_row = db_by_key.get(target) if target else None
        if target_row is None:
            orphans.append(key)
            continue
        remap_pairs.append((db_pk, target_row[0]))
    return remap_pairs, orphans


def _apply_remap(cur, remap_pairs: list[tuple[int, int]]) -> None:
    """Move mentions from archived entities to canonical ones, then drop the archived rows.

    *cur* is an open psycopg cursor.  All statements run in the caller's
    transaction, and nothing is committed here.
    """
    cur.execute("CREATE TEMP TABLE _remap (old_pk BIGINT, new_pk BIGINT)")
    with cur.copy("COPY _remap (old_pk, new_pk) FROM STDIN") as cp:
        for old, new in remap_pairs:
            cp.write_row((old, new))
    # 1. Re-create each archived mention under the canonical entity.  Only
    #    chunk_pk, entity_pk and surface_form are copied; any other column on
    #    entity_mentions takes its default in the new row.  Rows that would
    #    conflict with an existing one are skipped.
    cur.execute("""
                    INSERT INTO entity_mentions (chunk_pk, entity_pk, surface_form)
                    SELECT em.chunk_pk, r.new_pk, em.surface_form
                    FROM entity_mentions em
                    JOIN _remap r ON em.entity_pk = r.old_pk
                    ON CONFLICT DO NOTHING
                """)
    print(f"  new canonical mentions inserted: {cur.rowcount}")
    # 2. Delete all old (archived-entity) mentions.
    cur.execute("""
                    DELETE FROM entity_mentions em USING _remap r
                    WHERE em.entity_pk = r.old_pk
                """)
    print(f"  archived-entity mentions removed: {cur.rowcount}")
    # 3. Delete the archived entities themselves.
    cur.execute("DELETE FROM entities WHERE entity_pk IN (SELECT old_pk FROM _remap)")
    print(f"  archived entities removed: {cur.rowcount}")


def main() -> int:
    """Remap DB mentions of archived entities onto their canonical entities.

    Reads the DB URL from ``DATABASE_URL`` (falling back to
    ``SUPABASE_DB_URL``), scans the entity wiki on disk, and pairs every DB
    entity that is no longer active with the active entity whose alias or
    canonical name equals its DB ``canonical_name``.  It then moves the
    mentions and deletes the remapped entities in one transaction.  Orphans
    (archived entities with no canonical match) are counted and left alone.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If neither environment variable is set.
    """
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    # All file I/O happens before the DB connection is opened, so the
    # connection is not held idle while the wiki is scanned.
    print("Scanning active YAMLs ...")
    active_entities = _scan_entities(WIKI_ENT, skip_archived=True)
    active = {(cls, eid) for cls, eid, _ in active_entities}
    print(f"  active entities: {len(active)}")

    print("\nScanning archived YAMLs ...")
    # Informational only: the remap itself is driven by the DB rows.
    archived_keys = {
        (cls, eid)
        for cls, eid, fm in _scan_entities(ARCHIVED, skip_archived=False)
        if isinstance(fm.get("canonical_name"), str) and fm["canonical_name"].strip()
    }
    print(f"  archived entities: {len(archived_keys)}")

    print("\nBuilding alias index from active YAMLs ...")
    alias_index = _build_alias_index(active_entities)
    print(f"  alias index size: {len(alias_index)}")

    print("\nConnecting to DB ...")
    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT entity_pk, entity_class, entity_id, canonical_name FROM entities")
            db_rows = cur.fetchall()
            print(f"  DB entities: {len(db_rows)}")

            print("\nResolving remap ...")
            remap_pairs, orphan_archived = _resolve_remap(db_rows, active, alias_index)
            print(f"  remap pairs: {len(remap_pairs)}")
            print(f"  orphans (archived but no canonical found): {len(orphan_archived)}")

            if remap_pairs:
                _apply_remap(cur, remap_pairs)
        conn.commit()
    return 0


# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())