disclosure-bureau/scripts/maintain/54_sync_is_generic.py
Luiz Gustavo a7e9dce6d2 rebuild entity layer from Sonnet-vision reextract pipeline
Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity
JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page
extraction. Add synthesize scripts to regenerate wiki/entities from the 116
_reextract.json files (script 30), aggregate missing page.md files from chunks
(script 31), and reprocess the 805 pages the doc-rebuilder agent dropped on
context overflow (script 32). Add
maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and
typed relation extraction.

Web: wire relations API + entity-relations component; entity/timeline/doc
pages consume the rebuilt layer.

Note: raw/, processing/, wiki/ remain gitignored (bulk data managed
separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on
disk only. The 27 curated anchor events under wiki/entities/events/ are
preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00

55 lines
1.9 KiB
Python

#!/usr/bin/env python3
"""
Sync `is_generic` flag from each entity YAML to public.entities table.
"""
from __future__ import annotations
import os
import sys
from pathlib import Path
import psycopg
import yaml
# Root directory of the entity wiki; this script scans it recursively for
# "*.md" files and reads the YAML front matter of each one.
# NOTE(review): this is an absolute path that looks specific to one user's
# machine -- confirm it is correct before running the script elsewhere.
WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
def _front_matter(text: str) -> str | None:
    """Return the YAML front-matter body of *text*, or None if there is none.

    The front matter starts at a leading ``---`` and ends at the next line
    that consists solely of ``---``. Matching the closing delimiter on a whole
    line (rather than the first ``---`` substring) keeps values that contain
    ``---`` intact.
    """
    if not text.startswith("---"):
        return None
    lines = text[3:].split("\n")
    for idx, line in enumerate(lines):
        # idx 0 is the remainder of the opening delimiter line itself.
        if idx > 0 and line.rstrip() == "---":
            return "\n".join(lines[:idx])
    # No closing delimiter: hand everything after the opener to the YAML
    # parser and let it accept or reject the content.
    return "\n".join(lines)


def _load_rows(root: Path) -> list[tuple[str, str, bool]]:
    """Collect one ``(entity_class, entity_id, is_generic)`` row per entity.

    Walks *root* recursively for ``*.md`` files, skipping anything under an
    ``_archived`` directory. Files that cannot be read or parsed are skipped
    with a message on stderr; files without an entity class or id are skipped
    silently. Paths are visited in sorted order and the first file seen for a
    given (class, id) pair wins, so the result is deterministic.
    """
    found: dict[tuple[str, str], bool] = {}
    for path in sorted(root.rglob("*.md")):
        if "_archived" in path.parts:
            continue
        try:
            text = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            print(f"  skipped {path}: unreadable ({exc})", file=sys.stderr)
            continue
        raw = _front_matter(text)
        if raw is None:
            continue
        try:
            fm = yaml.safe_load(raw) or {}
        except (yaml.YAMLError, ValueError) as exc:
            print(f"  skipped {path}: invalid front matter ({exc})", file=sys.stderr)
            continue
        if not isinstance(fm, dict):
            # A scalar or list front matter has no keys to look up.
            print(f"  skipped {path}: front matter is not a mapping", file=sys.stderr)
            continue
        cls = fm.get("entity_class")
        # Fall back to the class-specific id key, e.g. "<entity_class>_id".
        eid = fm.get("entity_id") or (fm.get(f"{cls}_id") if cls else None)
        if not (cls and eid):
            continue
        # Both staging columns are TEXT, so normalise the key to strings.
        key = (str(cls), str(eid))
        flag = bool(fm.get("is_generic"))
        if key in found:
            # Two staged rows for one entity would make UPDATE ... FROM pick
            # an unpredictable one; keep the first and report disagreements.
            if found[key] != flag:
                print(
                    f"  conflicting is_generic for {key[0]}/{key[1]} in {path}; "
                    "keeping the first value seen",
                    file=sys.stderr,
                )
            continue
        found[key] = flag
    return [(cls, eid, flag) for (cls, eid), flag in found.items()]


def main() -> int:
    """Sync the ``is_generic`` flag from entity front matter into the database.

    Reads the connection string from ``DATABASE_URL`` (or ``SUPABASE_DB_URL``),
    loads the entity rows found under ``WIKI_ENT``, stages them in a temporary
    table via COPY, and updates ``entities.is_generic`` only where the stored
    value differs.

    Returns:
        0 on success (used as the process exit status).

    Raises:
        SystemExit: if neither environment variable is set.
    """
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    rows = _load_rows(WIKI_ENT)
    print(f"Loaded {len(rows)} entities from YAML")

    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            cur.execute("CREATE TEMP TABLE _gen (entity_class TEXT, entity_id TEXT, is_generic BOOL)")
            with cur.copy("COPY _gen (entity_class, entity_id, is_generic) FROM STDIN") as cp:
                for row in rows:
                    cp.write_row(row)
            # IS DISTINCT FROM is a NULL-safe comparison, so rows whose flag
            # is currently NULL are updated too, and unchanged rows are not
            # rewritten.
            cur.execute("""
                UPDATE entities e SET is_generic = g.is_generic
                FROM _gen g
                WHERE e.entity_class = g.entity_class
                AND e.entity_id = g.entity_id
                AND e.is_generic IS DISTINCT FROM g.is_generic
            """)
            print(f" rows updated: {cur.rowcount}")
            cur.execute("SELECT COUNT(*) FROM entities WHERE is_generic")
            print(f" total is_generic=TRUE in DB: {cur.fetchone()[0]}")
        conn.commit()
    return 0
if __name__ == "__main__":
    # Run the sync and use main()'s return value as the process exit status.
    raise SystemExit(main())