Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
159 lines
7.2 KiB
Python
159 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
After scripts 49/50 archive duplicates, the DB's `entities` + `entity_mentions`
|
|
tables still point at the OLD (now-archived) entity_pks. This script:
|
|
|
|
1. Re-reads every active entity YAML from wiki/entities/ (skipping _archived)
|
|
2. For each entity, looks up the corresponding entity_pk in the DB by
|
|
(entity_class, entity_id).
|
|
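  3. Treats every DB entity whose (entity_class, entity_id) is not in the
     active set as merged-away, and resolves its canonical entity by matching
     its DB canonical_name (case-insensitively) against the aliases[] and
     canonical_name of the active YAMLs of the same entity_class.
  4. Re-points mentions: INSERT copies of the merged-away entities'
     entity_mentions rows under <canonical_pk> (ON CONFLICT DO NOTHING), then
     DELETE FROM entity_mentions WHERE entity_pk IN (<archived_pks>).
  5. DELETE FROM entities for every merged-away entity that was remapped.
     Entities for which no canonical could be resolved ("orphans") are only
     counted and are left in place.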
|
|
|
|
Idempotent — re-running is a no-op once converged.
|
|
"""
|
|
from __future__ import annotations
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import psycopg
|
|
import yaml
|
|
|
|
# Root directory that is scanned recursively for entity Markdown files
# (each one carrying YAML front matter).
WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
# Subtree holding archived entities; anything below it is excluded from the
# active scan.
ARCHIVED = WIKI_ENT / "_archived"
|
|
|
|
|
|
def _read_front_matter(path: Path) -> dict:
    """Return the YAML front matter of ``path`` as a dict.

    Returns ``{}`` when the file does not start with ``---``, when the front
    matter is not a YAML mapping, or when the file cannot be read or parsed
    (in that last case a warning is printed to stderr).
    """
    try:
        text = path.read_text(encoding="utf-8")
        if not text.startswith("---"):
            return {}
        header_lines = []
        for line in text[3:].splitlines():
            # Only a line consisting solely of "---" closes the front matter,
            # so a "---" embedded in a value does not truncate the YAML.
            if line.rstrip() == "---":
                break
            header_lines.append(line)
        parsed = yaml.safe_load("\n".join(header_lines))
    except Exception as exc:  # best-effort scan: report the file and move on
        print(f"  WARN: skipping {path}: {exc}", file=sys.stderr)
        return {}
    return parsed if isinstance(parsed, dict) else {}


def _scan_entities(paths: list[Path]) -> list[tuple[tuple[str, str], dict]]:
    """Parse ``paths`` and return ``((entity_class, entity_id), front_matter)`` pairs.

    Files without a usable class/id pair are dropped.
    """
    found = []
    for path in paths:
        fm = _read_front_matter(path)
        cls = fm.get("entity_class")
        # Fall back to a class-specific "<class>_id" key when "entity_id" is absent.
        eid = fm.get("entity_id") or (fm.get(f"{cls}_id") if cls else None)
        if cls and eid:
            found.append(((cls, eid), fm))
    return found


def _entity_names(fm: dict) -> list[str]:
    """Return the normalized (stripped, lower-cased) aliases and canonical name of ``fm``."""
    aliases = fm.get("aliases") or []
    if isinstance(aliases, str):
        # A scalar alias is one name, not a sequence of characters.
        aliases = [aliases]
    elif not isinstance(aliases, (list, tuple)):
        aliases = []
    names = [a for a in aliases if isinstance(a, str)]
    canonical = fm.get("canonical_name")
    if isinstance(canonical, str):
        names.append(canonical)
    return [n.strip().lower() for n in names if n.strip()]


def main() -> int:
    """Re-point DB mentions of merged-away entities at their canonical entity.

    Steps:
      1. Scan the active entity files under ``WIKI_ENT`` (skipping ``_archived``)
         to build the active ``(entity_class, entity_id)`` set and an index
         ``(entity_class, normalized name) -> (entity_class, entity_id)`` over
         each entity's aliases and canonical name.
      2. Count the archived entity files (informational only).
      3. Every row of ``entities`` whose key is not active is treated as
         archived; its canonical target is looked up in the index by the
         row's ``canonical_name``.
      4. For every resolved pair, copy the archived entity's mentions to the
         canonical entity (``ON CONFLICT DO NOTHING``), delete the old
         mentions, and delete the archived ``entities`` row.

    Archived rows with no resolvable canonical ("orphans") are counted and
    left untouched.

    Returns:
        ``0`` on completion.

    Raises:
        SystemExit: when neither ``DATABASE_URL`` nor ``SUPABASE_DB_URL`` is set.
    """
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    # Build the active set and the alias -> canonical index in one pass, before
    # any DB connection is opened. Files are sorted so that the winner of an
    # alias collision (last one indexed) does not depend on directory order.
    # NOTE(review): an active file that fails to parse is skipped (with a
    # warning), which makes its DB row look archived below.
    print("Scanning active YAMLs ...")
    active_files = sorted(
        p for p in WIKI_ENT.rglob("*.md") if "_archived" not in p.parts
    )
    active: set[tuple[str, str]] = set()
    alias_index: dict[tuple[str, str], tuple[str, str]] = {}
    for key, fm in _scan_entities(active_files):
        active.add(key)
        for name in _entity_names(fm):
            alias_index[(key[0], name)] = key
    print(f"  active entities: {len(active)}")
    print(f"  alias index size: {len(alias_index)}")

    # The on-disk archive is only counted; the remap itself is driven by the
    # DB rows that are no longer active.
    print("\nScanning archived YAMLs ...")
    archived_files = sorted(ARCHIVED.rglob("*.md")) if ARCHIVED.exists() else []
    archived_keys = {
        key
        for key, fm in _scan_entities(archived_files)
        if isinstance(fm.get("canonical_name"), str) and fm["canonical_name"].strip()
    }
    print(f"  archived entities: {len(archived_keys)}")

    print("\nConnecting to DB ...")
    with psycopg.connect(dburl) as conn, conn.cursor() as cur:
        cur.execute("SELECT entity_pk, entity_class, entity_id, canonical_name FROM entities")
        db_rows = cur.fetchall()
        db_by_key: dict[tuple[str, str], tuple[int, str]] = {
            (cls, eid): (pk, name) for pk, cls, eid, name in db_rows
        }
        print(f"  DB entities: {len(db_rows)}")

        print("\nResolving remap ...")
        remap_pairs: list[tuple[int, int]] = []  # (old_pk, new_pk)
        orphan_archived: list[tuple[str, str]] = []
        for (cls, eid), (db_pk, db_name) in db_by_key.items():
            if (cls, eid) in active:
                continue
            # Not on disk any more -> archived. canonical_name may be NULL in
            # the DB; such rows cannot be matched and end up as orphans.
            lookup_name = (db_name or "").strip().lower()
            target = alias_index.get((cls, lookup_name)) if lookup_name else None
            target_row = db_by_key.get(target) if target else None
            if target_row is None:
                orphan_archived.append((cls, eid))
                continue
            remap_pairs.append((db_pk, target_row[0]))

        print(f"  remap pairs: {len(remap_pairs)}")
        print(f"  orphans (archived but no canonical found): {len(orphan_archived)}")

        if remap_pairs:
            cur.execute("CREATE TEMP TABLE _remap (old_pk BIGINT, new_pk BIGINT)")
            with cur.copy("COPY _remap (old_pk, new_pk) FROM STDIN") as cp:
                for old, new in remap_pairs:
                    cp.write_row((old, new))
            # 1. Insert copies of the archived entities' mentions under the
            #    canonical entity; rows that would conflict are skipped. Only
            #    chunk_pk, entity_pk and surface_form are copied — any other
            #    column of entity_mentions takes its default on the new row.
            cur.execute("""
                INSERT INTO entity_mentions (chunk_pk, entity_pk, surface_form)
                SELECT em.chunk_pk, r.new_pk, em.surface_form
                FROM entity_mentions em
                JOIN _remap r ON em.entity_pk = r.old_pk
                ON CONFLICT DO NOTHING
            """)
            inserted = cur.rowcount
            print(f"  new canonical mentions inserted: {inserted}")
            # 2. Delete all old (archived-entity) mentions
            cur.execute("""
                DELETE FROM entity_mentions em USING _remap r
                WHERE em.entity_pk = r.old_pk
            """)
            print(f"  archived-entity mentions removed: {cur.rowcount}")
            # 3. Delete the remapped archived entities from `entities`
            cur.execute("DELETE FROM entities WHERE entity_pk IN (SELECT old_pk FROM _remap)")
            print(f"  archived entities removed: {cur.rowcount}")
        conn.commit()
    return 0
|
|
|
|
|
|
# Run the remap when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|