176 lines
5.9 KiB
Python
Executable file
176 lines
5.9 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
32-sync-mentioned-in-yaml.py — Backfill `mentioned_in[]` in wiki/entities/<class>/<id>.md
|
|
from public.entity_mentions JOIN public.chunks.
|
|
|
|
After 31-populate-entity-mentions.py has run, the DB has the truth. This script
|
|
syncs that truth BACK into the markdown frontmatter so the legacy entity page
|
|
("Appears in N pages") matches what the new graph/retrieval layer sees.
|
|
|
|
Idempotent: rewrites mentioned_in[] in place. Preserves all other frontmatter.
|
|
Skips entities with 0 mentions (leaves existing list untouched).
|
|
|
|
Usage:
|
|
./32-sync-mentioned-in-yaml.py # all classes
|
|
./32-sync-mentioned-in-yaml.py --class people # one class
|
|
./32-sync-mentioned-in-yaml.py --dry-run # show diffs, no writes
|
|
./32-sync-mentioned-in-yaml.py --max-mentions 200 # cap list length per entity
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import yaml
|
|
import psycopg
|
|
except ImportError as e:
|
|
sys.stderr.write(f"pip3 install pyyaml psycopg[binary] # missing: {e}\n")
|
|
sys.exit(1)
|
|
|
|
|
|
# Root of the UFO checkout. Defaults to the historical hard-coded location;
# set the UFO_ROOT environment variable to run against a checkout elsewhere.
UFO_ROOT = Path(os.getenv("UFO_ROOT") or "/Users/guto/ufo")
WIKI = UFO_ROOT / "wiki"

# Postgres connection string. SUPABASE_DB_URL is the fallback when
# DATABASE_URL is unset or empty; the result is None when neither is set.
DATABASE_URL = os.getenv("DATABASE_URL") or os.getenv("SUPABASE_DB_URL")

# Folder name under wiki/entities/ -> entity_class value used in the DB query.
# Insertion order is the processing order when no --class filter is given.
CLASS_SINGULAR = {
    "people": "person",
    "organizations": "organization",
    "locations": "location",
    "events": "event",
    "uap-objects": "uap_object",
    "vehicles": "vehicle",
    "operations": "operation",
    "concepts": "concept",
}

# Derived from CLASS_SINGULAR so the two can never get out of sync.
CLASSES = list(CLASS_SINGULAR)
|
|
|
|
|
|
def split_frontmatter(raw: str) -> tuple[str, str]:
    """Split a markdown document into its YAML frontmatter and its body.

    The frontmatter is the text between a leading ``---`` and the next
    ``---`` that begins a line.

    Args:
        raw: Full text of the markdown file.

    Returns:
        ``(frontmatter_yaml, body)``. The frontmatter has surrounding
        whitespace stripped and the body has leading newlines removed.
        When the text does not start with ``---`` or has no closing
        delimiter, ``("", raw)`` is returned — nothing is raised.
    """
    if not raw.startswith("---"):
        return "", raw
    # Look for the closing delimiter only at the start of a line; a bare
    # search for "---" would also match dashes inside a frontmatter value
    # (e.g. "title: a --- b") and cut the YAML short.
    end = raw.find("\n---", 3)
    if end < 0:
        return "", raw
    # end points at the newline before the delimiter, so skip "\n---" (4 chars).
    return raw[3:end].strip(), raw[end + 4:].lstrip("\n")
|
|
|
|
|
|
def fetch_mentions(cur, entity_class: str, entity_id: str, max_n: int) -> list[dict]:
    """Load up to ``max_n`` mentions of one entity, ordered by document.

    Args:
        cur: Open DB cursor; must provide ``execute(sql, params)`` and
            ``fetchall()``.
        entity_class: Value matched against ``public.entities.entity_class``.
        entity_id: Value matched against ``public.entities.entity_id``.
        max_n: Row limit passed to the query's ``LIMIT``.

    Returns:
        One dict per row with the keys ``page``, ``page_ref``, ``doc_id``,
        ``chunk_id`` and ``surface_form``. ``page`` and ``page_ref`` are
        wiki links whose page number is zero-padded to three digits.
    """
    query = """
        SELECT c.doc_id, c.page, c.chunk_id, em.surface_form
        FROM public.entities e
        JOIN public.entity_mentions em ON em.entity_pk = e.entity_pk
        JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
        WHERE e.entity_class = %s AND e.entity_id = %s
        ORDER BY c.doc_id, c.order_global
        LIMIT %s
        """
    cur.execute(query, (entity_class, entity_id, max_n))

    mentions = []
    for row in cur.fetchall():
        doc_id, page_no, chunk_id, surface = row[0], row[1], row[2], row[3]
        # Shared link target, e.g. "some-doc/p007".
        page_target = f"{doc_id}/p{page_no:03d}"
        mentions.append(
            {
                "page": f"[[{page_target}]]",
                "page_ref": f"[[{page_target}#{chunk_id}]]",
                "doc_id": doc_id,
                "chunk_id": chunk_id,
                "surface_form": surface,
            }
        )
    return mentions
|
|
|
|
|
|
def _sync_entity_file(conn, fpath: Path, entity_class: str, max_mentions: int, dry_run: bool) -> str:
    """Sync one entity page's ``mentioned_in`` frontmatter from the database.

    Args:
        conn: Open database connection; a cursor is opened per call.
        fpath: Markdown file of the entity; its stem is used as the entity id.
        entity_class: Singular class name passed to ``fetch_mentions``.
        max_mentions: Row cap passed to ``fetch_mentions``.
        dry_run: When true, print the old/new counts instead of writing.

    Returns:
        ``"updated"`` (file written, or would be written in dry-run),
        ``"unchanged"`` (regenerated text equals the file),
        ``"skipped_empty"`` (no mentions in the DB; file left untouched), or
        ``"skipped"`` (unreadable file, no frontmatter, or unusable YAML).
    """
    eid = fpath.stem
    try:
        raw = fpath.read_text(encoding="utf-8")
    except (OSError, UnicodeError) as e:
        print(f" ✗ {eid}: read failed ({e})")
        return "skipped"

    fm_yaml, body = split_frontmatter(raw)
    if not fm_yaml:
        return "skipped"
    try:
        fm = yaml.safe_load(fm_yaml) or {}
    except yaml.YAMLError as e:
        print(f" ✗ {eid}: yaml ({e})")
        return "skipped"
    if not isinstance(fm, dict):
        # Valid YAML that is a list or scalar cannot hold a mentioned_in key.
        print(f" ✗ {eid}: yaml (frontmatter is not a mapping)")
        return "skipped"

    with conn.cursor() as cur:
        mentions = fetch_mentions(cur, entity_class, eid, max_mentions)
    if not mentions:
        return "skipped_empty"

    # Build new mentioned_in list — preserve order, dedupe by page_ref.
    seen = set()
    new_mentions = []
    for m in mentions:
        if m["page_ref"] in seen:
            continue
        seen.add(m["page_ref"])
        new_mentions.append(
            {"page": m["page"], "page_ref": m["page_ref"], "doc_id": m["doc_id"]}
        )

    old_count = len(fm.get("mentioned_in") or [])
    fm["mentioned_in"] = new_mentions
    # NOTE: this is the length of the (capped, deduplicated) list, not a DB-wide count.
    fm["total_mentions"] = len(new_mentions)

    new_yaml = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, width=120)
    new_raw = f"---\n{new_yaml}---\n{body}"
    if new_raw == raw:
        return "unchanged"

    if dry_run:
        print(f" Δ {eid}: {old_count} → {len(new_mentions)} mentions")
    else:
        fpath.write_text(new_raw, encoding="utf-8")
    return "updated"


def main():
    """Parse CLI arguments and sync ``mentioned_in`` for every entity page.

    Exits with status 1 when no database URL is configured. Class folders
    that do not exist under ``WIKI / "entities"`` are skipped silently.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--class", dest="cls", default=None, choices=CLASSES)
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--max-mentions", type=int, default=200, help="Cap per entity (default 200)")
    args = ap.parse_args()

    if not DATABASE_URL:
        sys.stderr.write("✗ Set DATABASE_URL\n")
        sys.exit(1)

    target_classes = [args.cls] if args.cls else CLASSES
    # Only these outcomes are reported; "skipped" files are not counted.
    tally = {"updated": 0, "skipped_empty": 0, "unchanged": 0}
    # Monotonic clock: elapsed time must not jump if the wall clock is adjusted.
    t0 = time.monotonic()

    with psycopg.connect(DATABASE_URL) as conn:
        for cls_folder in target_classes:
            cls_dir = WIKI / "entities" / cls_folder
            if not cls_dir.is_dir():
                continue
            entity_class_sg = CLASS_SINGULAR[cls_folder]
            files = sorted(cls_dir.glob("*.md"))
            print(f" ▸ {cls_folder}: {len(files)} files")

            for i, fpath in enumerate(files):
                outcome = _sync_entity_file(
                    conn, fpath, entity_class_sg, args.max_mentions, args.dry_run
                )
                if outcome in tally:
                    tally[outcome] += 1

                # Report progress for every 500th file, whatever its outcome.
                if (i + 1) % 500 == 0:
                    elapsed = round(time.monotonic() - t0)
                    print(f" [{i+1}/{len(files)}] updated={tally['updated']} · {elapsed}s")

    print(
        f"\nDONE — updated={tally['updated']} skipped_empty={tally['skipped_empty']} "
        f"unchanged={tally['unchanged']} · {round(time.monotonic() - t0, 1)}s"
    )


if __name__ == "__main__":
    main()
|