Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
55 lines
1.9 KiB
Python
55 lines
1.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Sync `is_generic` flag from each entity YAML to public.entities table.
|
|
"""
|
|
from __future__ import annotations
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import psycopg
|
|
import yaml
|
|
|
|
# Root directory that main() scans recursively for entity `*.md` files.
WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
|
|
|
|
|
|
def _load_generic_flags(root: Path) -> list[tuple[str, str, bool]]:
    """Collect ``(entity_class, entity_id, is_generic)`` from entity front matter.

    Walks every ``*.md`` file under *root*, ignoring any path that has an
    ``_archived`` component. A file contributes a row only when it starts
    with ``---``, its front matter parses to a mapping, and that mapping
    yields both an ``entity_class`` and an id (``entity_id``, falling back
    to ``<entity_class>_id``).

    Files that cannot be read or parsed are skipped, with a note on stderr
    so that dropped entities are visible instead of silently ignored.
    """
    rows: list[tuple[str, str, bool]] = []
    for path in root.rglob("*.md"):
        if "_archived" in path.parts:
            continue
        try:
            text = path.read_text(encoding="utf-8")
            if not text.startswith("---"):
                continue
            # The front matter is the text between the first two `---` markers.
            front = yaml.safe_load(text.split("---", 2)[1]) or {}
        except Exception as exc:
            # Best-effort scan: one bad file must not abort the whole sync.
            print(f"skipped {path}: {exc!r}", file=sys.stderr)
            continue
        if not isinstance(front, dict):
            # Front matter parsed to a list/scalar; there are no keys to read.
            continue
        cls = front.get("entity_class")
        eid = front.get("entity_id") or (front.get(f"{cls}_id") if cls else None)
        if not (cls and eid):
            continue
        # YAML may yield non-string scalars (e.g. a numeric id); the temp
        # table columns are TEXT, so normalise to str here.
        rows.append((str(cls), str(eid), bool(front.get("is_generic"))))
    return rows


def main() -> int:
    """Sync the ``is_generic`` flag from entity files into the ``entities`` table.

    Reads the connection string from ``DATABASE_URL`` (or ``SUPABASE_DB_URL``),
    loads the flags from the files under ``WIKI_ENT``, bulk-copies them into a
    temporary table and updates only the ``entities`` rows whose flag differs.

    Returns:
        0 on success.

    Raises:
        SystemExit: if neither environment variable is set.
    """
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    rows = _load_generic_flags(WIKI_ENT)
    print(f"Loaded {len(rows)} entities from YAML")

    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            # Stage the flags in a temp table so the update is one set-based
            # statement rather than one UPDATE per entity.
            cur.execute("CREATE TEMP TABLE _gen (entity_class TEXT, entity_id TEXT, is_generic BOOL)")
            with cur.copy("COPY _gen (entity_class, entity_id, is_generic) FROM STDIN") as cp:
                for row in rows:
                    cp.write_row(row)
            # IS DISTINCT FROM is NULL-safe, so rows whose current flag is
            # NULL are updated too, and unchanged rows are left alone.
            cur.execute("""
                UPDATE entities e SET is_generic = g.is_generic
                FROM _gen g
                WHERE e.entity_class = g.entity_class
                  AND e.entity_id = g.entity_id
                  AND e.is_generic IS DISTINCT FROM g.is_generic
            """)
            print(f" rows updated: {cur.rowcount}")
            cur.execute("SELECT COUNT(*) FROM entities WHERE is_generic")
            print(f" total is_generic=TRUE in DB: {cur.fetchone()[0]}")
        conn.commit()
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the sync and use main()'s return value as the process exit status.
    raise SystemExit(main())
|