disclosure-bureau/scripts/32-sync-mentioned-in-yaml.py

176 lines
5.9 KiB
Python
Executable file

#!/usr/bin/env python3
"""
32-sync-mentioned-in-yaml.py — Backfill `mentioned_in[]` in wiki/entities/<class>/<id>.md
from public.entity_mentions JOIN public.chunks.
After 31-populate-entity-mentions.py has run, the DB has the truth. This script
syncs that truth BACK into the markdown frontmatter so the legacy entity page
("Appears in N pages") matches what the new graph/retrieval layer sees.
Idempotent: rewrites mentioned_in[] and total_mentions in place. All other frontmatter
keys are preserved, but the YAML block is re-serialized, so its formatting may change.
Skips entities with 0 mentions (leaves existing list untouched).
Usage:
./32-sync-mentioned-in-yaml.py # all classes
./32-sync-mentioned-in-yaml.py --class people # one class
./32-sync-mentioned-in-yaml.py --dry-run # show diffs, no writes
./32-sync-mentioned-in-yaml.py --max-mentions 200 # cap list length per entity
"""
from __future__ import annotations
import argparse
import os
import sys
import time
from pathlib import Path
try:
import yaml
import psycopg
except ImportError as e:
sys.stderr.write(f"pip3 install pyyaml psycopg[binary] # missing: {e}\n")
sys.exit(1)
# Root of the local checkout that contains the wiki/ tree. Defaults to the
# original hard-coded location; set the UFO_ROOT environment variable to run
# the script against a checkout that lives somewhere else.
UFO_ROOT = Path(os.getenv("UFO_ROOT") or "/Users/guto/ufo")
WIKI = UFO_ROOT / "wiki"

# Postgres connection string; SUPABASE_DB_URL is accepted as a fallback name.
DATABASE_URL = os.getenv("DATABASE_URL") or os.getenv("SUPABASE_DB_URL")

# Folder name under wiki/entities/ -> entity_class value used when querying
# public.entities. Insertion order is the order classes are processed in.
CLASS_SINGULAR = {
    "people": "person",
    "organizations": "organization",
    "locations": "location",
    "events": "event",
    "uap-objects": "uap_object",
    "vehicles": "vehicle",
    "operations": "operation",
    "concepts": "concept",
}

# Derived from the mapping so the two can never drift apart.
CLASSES = list(CLASS_SINGULAR)
def split_frontmatter(raw: str) -> tuple[str, str]:
    """Split a markdown document into ``(frontmatter_yaml, body)``.

    The document must open with ``---`` and contain a closing ``---`` that
    stands alone on its own line (trailing whitespace is tolerated). A ``---``
    that merely appears inside a frontmatter value is not treated as the
    closing delimiter.

    Returns:
        ``(frontmatter, body)`` where ``frontmatter`` is the stripped YAML text
        between the delimiters and ``body`` is everything after the closing
        delimiter line with leading newlines removed. When the document has no
        opening or no closing delimiter, ``("", raw)`` is returned; this
        function never raises for missing frontmatter.
    """
    if not raw.startswith("---"):
        return "", raw

    search_from = 3
    while True:
        end = raw.find("\n---", search_from)
        if end < 0:
            return "", raw
        # The delimiter only counts if nothing but whitespace follows it on
        # the same line; otherwise it is content such as "---more" or "----".
        line_end = raw.find("\n", end + 4)
        if line_end < 0:
            line_end = len(raw)
        if not raw[end + 4:line_end].strip():
            break
        search_from = end + 1

    return raw[3:end].strip(), raw[line_end:].lstrip("\n")
def fetch_mentions(cur, entity_class: str, entity_id: str, max_n: int) -> list[dict]:
    """Load up to ``max_n`` mention rows for one entity and shape them as dicts.

    Executes one parameterized SELECT on ``cur`` that joins public.entities,
    public.entity_mentions and public.chunks, filtered by entity class and id
    and ordered by ``doc_id`` then ``order_global``.

    Returns:
        One dict per row with the keys ``page``, ``page_ref``, ``doc_id``,
        ``chunk_id`` and ``surface_form``. ``page`` and ``page_ref`` are
        wiki-link strings; the page number is zero-padded to three digits.
    """
    query = (
        "\n"
        "SELECT c.doc_id, c.page, c.chunk_id, em.surface_form\n"
        "FROM public.entities e\n"
        "JOIN public.entity_mentions em ON em.entity_pk = e.entity_pk\n"
        "JOIN public.chunks c ON c.chunk_pk = em.chunk_pk\n"
        "WHERE e.entity_class = %s AND e.entity_id = %s\n"
        "ORDER BY c.doc_id, c.order_global\n"
        "LIMIT %s\n"
    )
    cur.execute(query, (entity_class, entity_id, max_n))

    mentions = []
    for doc_id, page, chunk_id, surface_form in cur.fetchall():
        # Both links share the same "<doc>/pNNN" target; page_ref adds the chunk anchor.
        target = f"{doc_id}/p{page:03d}"
        mentions.append(
            {
                "page": f"[[{target}]]",
                "page_ref": f"[[{target}#{chunk_id}]]",
                "doc_id": doc_id,
                "chunk_id": chunk_id,
                "surface_form": surface_form,
            }
        )
    return mentions
def _unique_page_refs(mentions: list[dict]) -> list[dict]:
    """Drop repeated page_ref values (first one wins) and keep only the frontmatter keys."""
    seen = set()
    unique = []
    for m in mentions:
        if m["page_ref"] in seen:
            continue
        seen.add(m["page_ref"])
        unique.append({"page": m["page"], "page_ref": m["page_ref"], "doc_id": m["doc_id"]})
    return unique


def _sync_entity_file(cur, fpath, entity_class_sg: str, max_mentions: int, dry_run: bool) -> str:
    """Sync one entity page; return "updated", "unchanged", "empty" or "skipped"."""
    eid = fpath.stem
    try:
        raw = fpath.read_text(encoding="utf-8")
    except (OSError, UnicodeError) as e:
        print(f"{eid}: read failed ({e})")
        return "skipped"

    fm_yaml, body = split_frontmatter(raw)
    if not fm_yaml:
        return "skipped"
    try:
        fm = yaml.safe_load(fm_yaml) or {}
    except yaml.YAMLError as e:
        print(f"{eid}: yaml ({e})")
        return "skipped"
    if not isinstance(fm, dict):
        # Valid YAML that is a list or scalar cannot carry a mentioned_in key.
        print(f"{eid}: yaml (frontmatter is not a mapping)")
        return "skipped"

    mentions = fetch_mentions(cur, entity_class_sg, eid, max_mentions)
    if not mentions:
        # No rows in the DB: leave whatever list the page already has.
        return "empty"

    new_mentions = _unique_page_refs(mentions)
    old_count = len(fm.get("mentioned_in") or [])
    fm["mentioned_in"] = new_mentions
    fm["total_mentions"] = len(new_mentions)
    new_yaml = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, width=120)
    new_raw = f"---\n{new_yaml}---\n{body}"
    if new_raw == raw:
        return "unchanged"

    if dry_run:
        print(f" Δ {eid}: {old_count} → {len(new_mentions)} mentions")
    else:
        fpath.write_text(new_raw, encoding="utf-8")
    # Dry-run changes are counted too, so the summary shows what a real run would touch.
    return "updated"


def main():
    """CLI entry point: sync mentioned_in[] for every entity page of the selected classes.

    Parses ``--class``, ``--dry-run`` and ``--max-mentions``, connects to the
    database named by DATABASE_URL, walks ``wiki/entities/<class>/*.md`` and
    rewrites each page whose frontmatter differs from the database. Exits with
    status 1 when no database URL is configured, and with an argparse usage
    error when ``--max-mentions`` is below 1.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--class", dest="cls", default=None, choices=CLASSES)
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--max-mentions", type=int, default=200, help="Cap per entity (default 200)")
    args = ap.parse_args()
    if args.max_mentions < 1:
        # The value is passed straight to SQL LIMIT; reject nonsense up front.
        ap.error("--max-mentions must be >= 1")
    if not DATABASE_URL:
        sys.stderr.write("✗ Set DATABASE_URL\n")
        sys.exit(1)

    target_classes = [args.cls] if args.cls else CLASSES
    counts = {"updated": 0, "empty": 0, "unchanged": 0, "skipped": 0}
    t0 = time.time()

    # One cursor is reused for every lookup instead of opening one per file.
    with psycopg.connect(DATABASE_URL) as conn, conn.cursor() as cur:
        for cls_folder in target_classes:
            cls_dir = WIKI / "entities" / cls_folder
            if not cls_dir.is_dir():
                continue
            entity_class_sg = CLASS_SINGULAR[cls_folder]
            files = sorted(cls_dir.glob("*.md"))
            print(f"{cls_folder}: {len(files)} files")
            for i, fpath in enumerate(files, start=1):
                outcome = _sync_entity_file(
                    cur, fpath, entity_class_sg, args.max_mentions, args.dry_run
                )
                counts[outcome] += 1
                # Report on every 500th file regardless of that file's outcome.
                if i % 500 == 0:
                    elapsed = round(time.time() - t0)
                    print(f" [{i}/{len(files)}] updated={counts['updated']} · {elapsed}s")

    print(
        f"\nDONE — updated={counts['updated']} skipped_empty={counts['empty']} "
        f"unchanged={counts['unchanged']} · {round(time.time() - t0, 1)}s"
    )
# Run the sync only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()