#!/usr/bin/env python3 """ After scripts 49/50 archive duplicates, the DB's `entities` + `entity_mentions` tables still point at the OLD (now-archived) entity_pks. This script: 1. Re-reads every active entity YAML from wiki/entities/ (skipping _archived) 2. For each entity, looks up the corresponding entity_pk in the DB by (entity_class, entity_id). 3. Reads the aliases[] from the YAML and finds DB entities with matching entity_id that no longer exist on disk — those are the merged-away ones. 4. UPDATE entity_mentions SET entity_pk = WHERE entity_pk IN () 5. DELETE FROM entities WHERE entity_class||entity_id IS NOT in active set Idempotent — re-running is a no-op once converged. """ from __future__ import annotations import os import sys from pathlib import Path import psycopg import yaml WIKI_ENT = Path("/Users/guto/ufo/wiki/entities") ARCHIVED = WIKI_ENT / "_archived" def main() -> int: dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL") if not dburl: sys.exit("DATABASE_URL not set") # Build active set + the alias→canonical lookup print("Scanning active YAMLs ...") active: set[tuple[str, str]] = set() canonical_by_alias_eid: dict[tuple[str, str], tuple[str, str]] = {} for f in WIKI_ENT.rglob("*.md"): if "_archived" in f.parts: continue try: text = f.read_text(encoding="utf-8") if not text.startswith("---"): continue fm = yaml.safe_load(text.split("---")[1]) or {} except Exception: continue cls = fm.get("entity_class") eid = (fm.get("entity_id") or fm.get(f"{cls}_id") if cls else None) if not (cls and eid): continue active.add((cls, eid)) # All archived entities that ended up merged into this one likely # had entity_ids that are now in this entity's aliases list. We can't # be 100% sure, but a same-class entity with id matching an alias # slugified is a strong signal. print(f" active entities: {len(active)}") print("\nScanning archived YAMLs ...") archived_map: dict[tuple[str, str], tuple[str, str]] = {} for f in ARCHIVED.rglob("*.md") if ARCHIVED.exists() else []: try: text = f.read_text(encoding="utf-8") if not text.startswith("---"): continue fm = yaml.safe_load(text.split("---")[1]) or {} except Exception: continue cls = fm.get("entity_class") eid = (fm.get("entity_id") or (fm.get(f"{cls}_id") if cls else None)) if not (cls and eid): continue # Find canonical: an active entity with same class whose aliases contain # this entity's canonical_name. dup_name = (fm.get("canonical_name") or "").strip().lower() if not dup_name: continue archived_map[(cls, eid)] = (cls, dup_name) print(f" archived entities: {len(archived_map)}") print("\nConnecting to DB ...") with psycopg.connect(dburl) as conn: with conn.cursor() as cur: # Map active YAML entities → their entity_pk cur.execute("SELECT entity_pk, entity_class, entity_id, canonical_name FROM entities") db_rows = cur.fetchall() db_by_key: dict[tuple[str, str], tuple[int, str]] = { (cls, eid): (pk, name) for pk, cls, eid, name in db_rows } print(f" DB entities: {len(db_rows)}") # For each archived (cls, eid), find the canonical active entity in same class # whose aliases contain the archived's canonical_name OR whose entity_id matches. # Build an alias index from active YAMLs: print("\nBuilding alias index from active YAMLs ...") alias_index: dict[tuple[str, str], tuple[str, str]] = {} for f in WIKI_ENT.rglob("*.md"): if "_archived" in f.parts: continue try: text = f.read_text(encoding="utf-8") if not text.startswith("---"): continue fm = yaml.safe_load(text.split("---")[1]) or {} except Exception: continue cls = fm.get("entity_class") eid = fm.get("entity_id") or (fm.get(f"{cls}_id") if cls else None) if not (cls and eid): continue for a in (fm.get("aliases") or []): if isinstance(a, str): alias_index[(cls, a.strip().lower())] = (cls, eid) # Also index canonical_name itself cn = fm.get("canonical_name") if isinstance(cn, str): alias_index[(cls, cn.strip().lower())] = (cls, eid) print(f" alias index size: {len(alias_index)}") # Now: for each archived DB entity, find the active canonical print("\nResolving remap ...") remap_pairs: list[tuple[int, int]] = [] # (old_pk, new_pk) orphan_archived: list[tuple[str, str]] = [] for (cls, eid), (db_pk, db_name) in db_by_key.items(): if (cls, eid) in active: continue # This DB entity is no longer in active YAMLs → archived target = alias_index.get((cls, db_name.strip().lower())) if not target: orphan_archived.append((cls, eid)) continue tgt_pk_row = db_by_key.get(target) if not tgt_pk_row: orphan_archived.append((cls, eid)); continue remap_pairs.append((db_pk, tgt_pk_row[0])) print(f" remap pairs: {len(remap_pairs)}") print(f" orphans (archived but no canonical found): {len(orphan_archived)}") if remap_pairs: cur.execute("CREATE TEMP TABLE _remap (old_pk BIGINT, new_pk BIGINT)") with cur.copy("COPY _remap (old_pk, new_pk) FROM STDIN") as cp: for old, new in remap_pairs: cp.write_row((old, new)) # 1. Insert new rows for the canonical entity (skip if already exists) # This preserves any non-default columns the table may have. cur.execute(""" INSERT INTO entity_mentions (chunk_pk, entity_pk, surface_form) SELECT em.chunk_pk, r.new_pk, em.surface_form FROM entity_mentions em JOIN _remap r ON em.entity_pk = r.old_pk ON CONFLICT DO NOTHING """) inserted = cur.rowcount print(f" new canonical mentions inserted: {inserted}") # 2. Delete all old (archived-entity) mentions cur.execute(""" DELETE FROM entity_mentions em USING _remap r WHERE em.entity_pk = r.old_pk """) print(f" archived-entity mentions removed: {cur.rowcount}") # 3. Delete archived entities from `entities` table cur.execute("DELETE FROM entities WHERE entity_pk IN (SELECT old_pk FROM _remap)") print(f" archived entities removed: {cur.rowcount}") conn.commit() return 0 if __name__ == "__main__": sys.exit(main())