disclosure-bureau/scripts/maintain/51_remap_entity_mentions.py
Luiz Gustavo a7e9dce6d2 rebuild entity layer from Sonnet-vision reextract pipeline
Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity
JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page
extraction. Add synthesize scripts to regenerate wiki/entities from the 116
_reextract.json (30), aggregate missing page.md from chunks (31), and reprocess
805 pages the doc-rebuilder agent dropped on context overflow (32). Add
maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and
typed relation extraction.

Web: wire relations API + entity-relations component; entity/timeline/doc
pages consume the rebuilt layer.

Note: raw/, processing/, wiki/ remain gitignored (bulk data managed
separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on
disk only. The 27 curated anchor events under wiki/entities/events/ are
preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00

159 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
After scripts 49/50 archive duplicates, the DB's `entities` + `entity_mentions`
tables still point at the OLD (now-archived) entity_pks. This script:
1. Re-reads every active entity YAML from wiki/entities/ (skipping _archived)
2. For each entity, looks up the corresponding entity_pk in the DB by
(entity_class, entity_id).
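3. Treats every DB entity whose (entity_class, entity_id) is no longer on disk
as merged-away, and resolves its canonical target by matching the DB row's
canonical_name against the aliases[] / canonical_name of the active YAMLs.
4. Re-points entity_mentions: INSERT copies of the old mentions under
<canonical_pk> (ON CONFLICT DO NOTHING), then DELETE the rows that still
reference the <archived_pks>.
5. DELETE FROM entities for the archived rows that were remapped. Archived rows
with no canonical match ("orphans") are only counted, not deleted.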
Idempotent — re-running is a no-op once converged.
"""
from __future__ import annotations
import os
import sys
from pathlib import Path
import psycopg
import yaml
# Root of the on-disk entity wiki that main() scans for "*.md" entity files.
WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
# Subtree of WIKI_ENT holding archived entities; excluded from the active scan.
ARCHIVED = WIKI_ENT.joinpath("_archived")
def _normalize_name(value: object) -> str:
    """Return *value* stripped and lower-cased, or "" when it is not a string."""
    return value.strip().lower() if isinstance(value, str) else ""


def _load_entities(root: Path, *, skip_archived: bool) -> list[tuple[str, str, dict]]:
    """Parse every entity Markdown file under *root* into (class, id, front matter).

    Files without a leading ``---`` front-matter fence, without a mapping as
    front matter, or without both an entity class and an entity id are skipped.
    Unreadable / unparseable files are skipped too, but reported on stderr.
    """
    found: list[tuple[str, str, dict]] = []
    if not root.exists():
        return found
    # Sorted so that alias collisions resolve the same way on every run
    # (rglob order is filesystem-dependent).
    for path in sorted(root.rglob("*.md")):
        if skip_archived and "_archived" in path.parts:
            continue
        try:
            text = path.read_text(encoding="utf-8")
            if not text.startswith("---"):
                continue
            fm = yaml.safe_load(text.split("---")[1]) or {}
        except Exception as exc:
            # Deliberately broad: besides I/O and YAML syntax errors, safe_load
            # can surface e.g. ValueError for impossible dates. Stay best-effort
            # but make the skip visible — a skipped *active* file makes its DB
            # row look archived further down.
            print(f"warning: skipping {path}: {exc}", file=sys.stderr)
            continue
        if not isinstance(fm, dict):
            continue
        cls = fm.get("entity_class")
        eid = fm.get("entity_id") or (fm.get(f"{cls}_id") if cls else None)
        if not (cls and eid):
            continue
        found.append((cls, eid, fm))
    return found


def _build_alias_index(
    entities: list[tuple[str, str, dict]],
) -> dict[tuple[str, str], tuple[str, str]]:
    """Map (class, normalized alias or canonical name) -> (class, entity_id)."""
    index: dict[tuple[str, str], tuple[str, str]] = {}
    for cls, eid, fm in entities:
        names = [*(fm.get("aliases") or []), fm.get("canonical_name")]
        for raw in names:
            key = _normalize_name(raw)
            # Empty names carry no signal and must not match empty DB names.
            if key:
                index[(cls, key)] = (cls, eid)
    return index


def main() -> int:
    """Re-point entity_mentions from archived entities to their canonical ones.

    Reads the connection string from DATABASE_URL (falling back to
    SUPABASE_DB_URL), scans the active and archived entity files on disk, and
    treats every row of ``entities`` whose (entity_class, entity_id) is not
    active on disk as archived. Each such row is resolved to an active entity
    of the same class whose aliases or canonical_name equal the row's
    canonical_name (case-insensitive, trimmed). For resolved rows the mentions
    are copied to the canonical entity, the old mentions are deleted, and the
    archived ``entities`` row is removed. Unresolved rows are only counted.

    Returns:
        0 on success.

    Raises:
        SystemExit: when neither DATABASE_URL nor SUPABASE_DB_URL is set.
    """
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    print("Scanning active YAMLs ...")
    # Parsed once and reused for the alias index below.
    active_entities = _load_entities(WIKI_ENT, skip_archived=True)
    active: set[tuple[str, str]] = {(cls, eid) for cls, eid, _ in active_entities}
    print(f" active entities: {len(active)}")

    print("\nScanning archived YAMLs ...")
    # Informational only: archived files that carry a usable canonical_name.
    archived_keys = {
        (cls, eid)
        for cls, eid, fm in _load_entities(ARCHIVED, skip_archived=False)
        if _normalize_name(fm.get("canonical_name"))
    }
    print(f" archived entities: {len(archived_keys)}")

    print("\nConnecting to DB ...")
    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT entity_pk, entity_class, entity_id, canonical_name FROM entities")
            db_rows = cur.fetchall()
            db_by_key: dict[tuple[str, str], tuple[int, str]] = {
                (cls, eid): (pk, name) for pk, cls, eid, name in db_rows
            }
            print(f" DB entities: {len(db_rows)}")

            print("\nBuilding alias index from active YAMLs ...")
            alias_index = _build_alias_index(active_entities)
            print(f" alias index size: {len(alias_index)}")

            print("\nResolving remap ...")
            remap_pairs: list[tuple[int, int]] = []  # (old_pk, new_pk)
            orphan_archived: list[tuple[str, str]] = []
            for (cls, eid), (db_pk, db_name) in db_by_key.items():
                if (cls, eid) in active:
                    continue
                # Not active on disk any more -> archived. canonical_name may be
                # NULL or empty in the DB; such rows cannot be resolved.
                name_key = _normalize_name(db_name)
                target = alias_index.get((cls, name_key)) if name_key else None
                target_row = db_by_key.get(target) if target else None
                if target_row is None:
                    orphan_archived.append((cls, eid))
                    continue
                remap_pairs.append((db_pk, target_row[0]))
            print(f" remap pairs: {len(remap_pairs)}")
            print(f" orphans (archived but no canonical found): {len(orphan_archived)}")

            if remap_pairs:
                cur.execute("CREATE TEMP TABLE _remap (old_pk BIGINT, new_pk BIGINT)")
                with cur.copy("COPY _remap (old_pk, new_pk) FROM STDIN") as cp:
                    for old, new in remap_pairs:
                        cp.write_row((old, new))

                # 1. Copy the old mentions onto the canonical entity. Only
                #    chunk_pk / entity_pk / surface_form are carried over; any
                #    other column of entity_mentions gets its default. Rows that
                #    would violate a constraint (presumably a uniqueness
                #    constraint on the mention — TODO confirm against the
                #    schema) are skipped.
                cur.execute("""
                    INSERT INTO entity_mentions (chunk_pk, entity_pk, surface_form)
                    SELECT em.chunk_pk, r.new_pk, em.surface_form
                    FROM entity_mentions em
                    JOIN _remap r ON em.entity_pk = r.old_pk
                    ON CONFLICT DO NOTHING
                """)
                inserted = cur.rowcount
                print(f" new canonical mentions inserted: {inserted}")

                # 2. Delete all old (archived-entity) mentions.
                cur.execute("""
                    DELETE FROM entity_mentions em USING _remap r
                    WHERE em.entity_pk = r.old_pk
                """)
                print(f" archived-entity mentions removed: {cur.rowcount}")

                # 3. Delete the remapped archived entities themselves.
                cur.execute("DELETE FROM entities WHERE entity_pk IN (SELECT old_pk FROM _remap)")
                print(f" archived entities removed: {cur.rowcount}")
        conn.commit()
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())