disclosure-bureau/scripts/32-sync-mentioned-in-yaml.py

#!/usr/bin/env python3
"""
32-sync-mentioned-in-yaml.py — Backfill `mentioned_in[]` in wiki/entities/<class>/<id>.md
from public.entity_mentions JOIN public.chunks.

After 31-populate-entity-mentions.py has run, the DB has the truth. This script
syncs that truth BACK into the markdown frontmatter so the legacy entity page
("Appears in N pages") matches what the new graph/retrieval layer sees.

Idempotent: rewrites mentioned_in[] and total_mentions in place. Other frontmatter keys are kept, but the YAML block is re-serialized.
Skips entities with 0 mentions (leaves existing list untouched).

Usage:
  ./32-sync-mentioned-in-yaml.py                       # all classes
  ./32-sync-mentioned-in-yaml.py --class people        # one class
  ./32-sync-mentioned-in-yaml.py --dry-run             # show diffs, no writes
  ./32-sync-mentioned-in-yaml.py --max-mentions 200    # cap list length per entity
"""
from __future__ import annotations

import argparse
import os
import sys
import time
from pathlib import Path

try:
    import yaml
    import psycopg
except ImportError as e:
    sys.stderr.write(f"pip3 install pyyaml psycopg[binary]  # missing: {e}\n")
    sys.exit(1)


# Root of the UFO checkout. The UFO_ROOT environment variable overrides it so the
# script is not tied to a single machine; when unset or empty, the original
# hard-coded path is used.
UFO_ROOT = Path(os.getenv("UFO_ROOT") or "/Users/guto/ufo")
WIKI = UFO_ROOT / "wiki"

# Postgres connection string; DATABASE_URL takes precedence over SUPABASE_DB_URL.
DATABASE_URL = os.getenv("DATABASE_URL") or os.getenv("SUPABASE_DB_URL")

# Folder names under wiki/entities/, in the order they are processed.
CLASSES = ["people", "organizations", "locations", "events", "uap-objects", "vehicles", "operations", "concepts"]
# Folder name -> value passed as the entity_class filter in the mentions query.
CLASS_SINGULAR = {
    "people": "person",
    "organizations": "organization",
    "locations": "location",
    "events": "event",
    "uap-objects": "uap_object",
    "vehicles": "vehicle",
    "operations": "operation",
    "concepts": "concept",
}


def split_frontmatter(raw: str) -> tuple[str, str]:
    """Split a markdown document into (frontmatter_yaml, body).

    The frontmatter must open with a first line that is exactly ``---`` and
    close with a later line that is exactly ``---`` (trailing whitespace is
    tolerated on both). A ``---`` that appears inside a line, e.g. in a value
    like ``title: a---b`` or in an indented block scalar, is NOT treated as a
    delimiter.

    Returns:
        ``(frontmatter, body)`` where the frontmatter is stripped of
        surrounding whitespace and the body has its leading newlines removed.
        When there is no well-formed frontmatter, returns ``("", raw)``;
        nothing is raised.
    """
    if not raw.startswith("---"):
        return "", raw

    lines = raw.split("\n")
    if lines[0].rstrip() != "---":
        return "", raw

    # `pos` tracks the offset in `raw` of the line currently being examined;
    # each line occupies len(line) characters plus the "\n" that split removed.
    fm_start = len(lines[0]) + 1
    pos = fm_start
    for line in lines[1:]:
        if line.rstrip() == "---":
            frontmatter = raw[fm_start:pos].strip()
            body = raw[pos + len(line) + 1:].lstrip("\n")
            return frontmatter, body
        pos += len(line) + 1

    # Opening delimiter without a closing one: treat as no frontmatter.
    return "", raw


def fetch_mentions(cur, entity_class: str, entity_id: str, max_n: int) -> list[dict]:
    """Fetch up to `max_n` mention rows for one entity and shape them as dicts.

    Runs a single parameterized query on `cur` joining entities, entity_mentions
    and chunks, ordered by document then global chunk order.

    Args:
        cur: DB cursor exposing ``execute(sql, params)`` and ``fetchall()``.
        entity_class: Value matched against ``e.entity_class``.
        entity_id: Value matched against ``e.entity_id``.
        max_n: Bound to the query's ``LIMIT``.

    Returns:
        One dict per returned row with keys ``page``, ``page_ref``, ``doc_id``,
        ``chunk_id`` and ``surface_form``. ``page`` and ``page_ref`` are
        wiki-link strings; the page number is zero-padded to three digits.
    """
    query = """
        SELECT c.doc_id, c.page, c.chunk_id, em.surface_form
        FROM public.entities e
        JOIN public.entity_mentions em ON em.entity_pk = e.entity_pk
        JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
        WHERE e.entity_class = %s AND e.entity_id = %s
        ORDER BY c.doc_id, c.order_global
        LIMIT %s
        """
    cur.execute(query, (entity_class, entity_id, max_n))

    shaped: list[dict] = []
    for doc_id, page, chunk_id, surface_form in cur.fetchall():
        # Both link forms share the "<doc>/pNNN" target; page_ref adds the chunk anchor.
        target = f"{doc_id}/p{page:03d}"
        shaped.append(
            {
                "page": f"[[{target}]]",
                "page_ref": f"[[{target}#{chunk_id}]]",
                "doc_id": doc_id,
                "chunk_id": chunk_id,
                "surface_form": surface_form,
            }
        )
    return shaped


def _dedupe_mentions(mentions: list[dict]) -> list[dict]:
    """Build frontmatter entries from mention rows, keeping the first row per page_ref."""
    seen: set[str] = set()
    entries: list[dict] = []
    for m in mentions:
        ref = m["page_ref"]
        if ref in seen:
            continue
        seen.add(ref)
        entries.append({"page": m["page"], "page_ref": ref, "doc_id": m["doc_id"]})
    return entries


def _sync_entity_file(cur, fpath: Path, entity_class: str, max_mentions: int, dry_run: bool) -> str:
    """Sync one entity page and report what happened.

    Returns one of:
        "updated"        - the file was rewritten (or would be, under dry_run)
        "unchanged"      - the regenerated file is byte-identical to the current one
        "skipped_empty"  - the DB returned no mentions; the file is left untouched
        "skipped"        - unreadable file, missing frontmatter, or unusable YAML
    """
    eid = fpath.stem
    try:
        raw = fpath.read_text(encoding="utf-8")
    except (OSError, UnicodeError) as e:
        print(f"    ✗ {eid}: read failed ({e})")
        return "skipped"

    fm_yaml, body = split_frontmatter(raw)
    if not fm_yaml:
        return "skipped"
    try:
        fm = yaml.safe_load(fm_yaml) or {}
    except yaml.YAMLError as e:
        print(f"    ✗ {eid}: yaml ({e})")
        return "skipped"
    if not isinstance(fm, dict):
        # Frontmatter that parses to a list or scalar cannot hold mentioned_in;
        # skip this file instead of crashing the whole run.
        print(f"    ✗ {eid}: yaml (frontmatter is not a mapping)")
        return "skipped"

    mentions = fetch_mentions(cur, entity_class, eid, max_mentions)
    if not mentions:
        return "skipped_empty"

    new_mentions = _dedupe_mentions(mentions)
    previous = fm.get("mentioned_in")
    old_count = len(previous) if isinstance(previous, list) else 0

    fm["mentioned_in"] = new_mentions
    # NOTE(review): this is the length of the capped, de-duplicated list, not the
    # number of mention rows in the DB; confirm that is what consumers expect.
    fm["total_mentions"] = len(new_mentions)

    new_yaml = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, width=120)
    new_raw = f"---\n{new_yaml}---\n{body}"
    if new_raw == raw:
        return "unchanged"

    if dry_run:
        print(f"    Δ {eid}: {old_count} → {len(new_mentions)} mentions")
    else:
        fpath.write_text(new_raw, encoding="utf-8")
    return "updated"


def main():
    """CLI entry point: sync mentioned_in[] for every entity page in the selected classes.

    Parses --class / --dry-run / --max-mentions, exits with status 1 when no
    database URL is configured, then walks wiki/entities/<class>/*.md and
    rewrites each page whose regenerated frontmatter differs from what is on
    disk. Under --dry-run nothing is written; would-be updates are printed
    and still counted as "updated" in the summary.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--class", dest="cls", default=None, choices=CLASSES)
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--max-mentions", type=int, default=200, help="Cap per entity (default 200)")
    args = ap.parse_args()

    if args.max_mentions < 1:
        # 0 would silently skip every entity and a negative value would reach
        # the SQL LIMIT clause, so reject both up front.
        ap.error("--max-mentions must be >= 1")

    if not DATABASE_URL:
        sys.stderr.write("✗ Set DATABASE_URL\n")
        sys.exit(1)

    target_classes = [args.cls] if args.cls else CLASSES
    counts = {"updated": 0, "skipped_empty": 0, "unchanged": 0}
    t0 = time.time()

    # One cursor is reused for the whole run; every query is fully fetched
    # before the next one is issued.
    with psycopg.connect(DATABASE_URL) as conn, conn.cursor() as cur:
        for cls_folder in target_classes:
            cls_dir = WIKI / "entities" / cls_folder
            if not cls_dir.is_dir():
                continue
            entity_class_sg = CLASS_SINGULAR[cls_folder]
            files = sorted(cls_dir.glob("*.md"))
            print(f"  ▸ {cls_folder}: {len(files)} files")

            for i, fpath in enumerate(files, start=1):
                outcome = _sync_entity_file(cur, fpath, entity_class_sg, args.max_mentions, args.dry_run)
                if outcome in counts:
                    counts[outcome] += 1

                # Report progress on every 500th file regardless of its outcome.
                if i % 500 == 0:
                    elapsed = round(time.time() - t0, 0)
                    print(f"    [{i}/{len(files)}] updated={counts['updated']} · {int(elapsed)}s")

    print(
        f"\nDONE — updated={counts['updated']} skipped_empty={counts['skipped_empty']} unchanged={counts['unchanged']} · {round(time.time() - t0, 1)}s"
    )


# Run the sync only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()