#!/usr/bin/env python3 """ 32-sync-mentioned-in-yaml.py — Backfill `mentioned_in[]` in wiki/entities//.md from public.entity_mentions JOIN public.chunks. After 31-populate-entity-mentions.py has run, the DB has the truth. This script syncs that truth BACK into the markdown frontmatter so the legacy entity page ("Appears in N pages") matches what the new graph/retrieval layer sees. Idempotent: rewrites mentioned_in[] in place. Preserves all other frontmatter. Skips entities with 0 mentions (leaves existing list untouched). Usage: ./32-sync-mentioned-in-yaml.py # all classes ./32-sync-mentioned-in-yaml.py --class people # one class ./32-sync-mentioned-in-yaml.py --dry-run # show diffs, no writes ./32-sync-mentioned-in-yaml.py --max-mentions 200 # cap list length per entity """ from __future__ import annotations import argparse import os import sys import time from pathlib import Path try: import yaml import psycopg except ImportError as e: sys.stderr.write(f"pip3 install pyyaml psycopg[binary] # missing: {e}\n") sys.exit(1) UFO_ROOT = Path("/Users/guto/ufo") WIKI = UFO_ROOT / "wiki" DATABASE_URL = os.getenv("DATABASE_URL") or os.getenv("SUPABASE_DB_URL") CLASSES = ["people", "organizations", "locations", "events", "uap-objects", "vehicles", "operations", "concepts"] CLASS_SINGULAR = { "people": "person", "organizations": "organization", "locations": "location", "events": "event", "uap-objects": "uap_object", "vehicles": "vehicle", "operations": "operation", "concepts": "concept", } def split_frontmatter(raw: str) -> tuple[str, str]: """Return (frontmatter_yaml, body). Raises if no frontmatter.""" if not raw.startswith("---"): return "", raw end = raw.find("---", 4) if end < 0: return "", raw return raw[3:end].strip(), raw[end + 3:].lstrip("\n") def fetch_mentions(cur, entity_class: str, entity_id: str, max_n: int) -> list[dict]: cur.execute( """ SELECT c.doc_id, c.page, c.chunk_id, em.surface_form FROM public.entities e JOIN public.entity_mentions em ON em.entity_pk = e.entity_pk JOIN public.chunks c ON c.chunk_pk = em.chunk_pk WHERE e.entity_class = %s AND e.entity_id = %s ORDER BY c.doc_id, c.order_global LIMIT %s """, (entity_class, entity_id, max_n), ) rows = cur.fetchall() return [ { "page": f"[[{r[0]}/p{r[1]:03d}]]", "page_ref": f"[[{r[0]}/p{r[1]:03d}#{r[2]}]]", "doc_id": r[0], "chunk_id": r[2], "surface_form": r[3], } for r in rows ] def main(): ap = argparse.ArgumentParser() ap.add_argument("--class", dest="cls", default=None, choices=CLASSES) ap.add_argument("--dry-run", action="store_true") ap.add_argument("--max-mentions", type=int, default=200, help="Cap per entity (default 200)") args = ap.parse_args() if not DATABASE_URL: sys.stderr.write("✗ Set DATABASE_URL\n") sys.exit(1) target_classes = [args.cls] if args.cls else CLASSES total_updated = 0 total_skipped_empty = 0 total_unchanged = 0 t0 = time.time() with psycopg.connect(DATABASE_URL) as conn: for cls_folder in target_classes: cls_dir = WIKI / "entities" / cls_folder if not cls_dir.is_dir(): continue entity_class_sg = CLASS_SINGULAR[cls_folder] files = sorted(cls_dir.glob("*.md")) print(f" ▸ {cls_folder}: {len(files)} files") for i, fpath in enumerate(files): eid = fpath.stem try: raw = fpath.read_text(encoding="utf-8") except Exception as e: print(f" ✗ {eid}: read failed ({e})") continue fm_yaml, body = split_frontmatter(raw) if not fm_yaml: continue try: fm = yaml.safe_load(fm_yaml) or {} except yaml.YAMLError as e: print(f" ✗ {eid}: yaml ({e})") continue with conn.cursor() as cur: mentions = fetch_mentions(cur, entity_class_sg, eid, args.max_mentions) if not mentions: total_skipped_empty += 1 continue # Build new mentioned_in list — preserve order, dedupe by page_ref seen = set() new_mentions = [] for m in mentions: if m["page_ref"] in seen: continue seen.add(m["page_ref"]) new_mentions.append( {"page": m["page"], "page_ref": m["page_ref"], "doc_id": m["doc_id"]} ) old_count = len(fm.get("mentioned_in") or []) fm["mentioned_in"] = new_mentions fm["total_mentions"] = len(new_mentions) new_yaml = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, width=120) new_raw = f"---\n{new_yaml}---\n{body}" if new_raw == raw: total_unchanged += 1 continue if args.dry_run: print(f" Δ {eid}: {old_count} → {len(new_mentions)} mentions") else: fpath.write_text(new_raw, encoding="utf-8") total_updated += 1 if (i + 1) % 500 == 0: elapsed = round(time.time() - t0, 0) print(f" [{i+1}/{len(files)}] updated={total_updated} · {int(elapsed)}s") print( f"\nDONE — updated={total_updated} skipped_empty={total_skipped_empty} unchanged={total_unchanged} · {round(time.time() - t0, 1)}s" ) if __name__ == "__main__": main()