disclosure-bureau/scripts/maintain/44_sync_chunk_page_to_db.py
Luiz Gustavo a7e9dce6d2 rebuild entity layer from Sonnet-vision reextract pipeline
Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity
JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page
extraction. Add synthesize scripts to regenerate wiki/entities from the 116
_reextract.json (30), aggregate missing page.md from chunks (31), and reprocess
805 pages the doc-rebuilder agent dropped on context overflow (32). Add
maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and
typed relation extraction.

Web: wire relations API + entity-relations component; entity/timeline/doc
pages consume the rebuilt layer.

Note: raw/, processing/, wiki/ remain gitignored (bulk data managed
separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on
disk only. The 27 curated anchor events under wiki/entities/events/ are
preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00

74 lines
2.3 KiB
Python

#!/usr/bin/env python3
"""
Resync `chunks.page` in Postgres from the raw chunk markdowns (after running
43_fix_chunk_page_from_source_png.py).

Every `*.md` file under `<CHUNKS_ROOT>/*--subagent/chunks/` is read. A file is
used only if it starts with a `---`-delimited front-matter block containing
both a `chunk_id:` and a `page:` line; anything else is skipped silently. The
doc id is the name of the directory above `chunks/` with `--subagent` stripped.

The collected (doc_id, chunk_id, page) rows are bulk-loaded into a temporary
table and applied with a single UPDATE that only touches rows whose `page`
actually differs.

This avoids re-embedding — we only touch the integer column.

Run:
    DATABASE_URL=postgres://... python3 scripts/maintain/44_sync_chunk_page_to_db.py

`SUPABASE_DB_URL` is used as a fallback when `DATABASE_URL` is unset or empty.
"""
from __future__ import annotations
import os
import re
import sys
from pathlib import Path
import psycopg
# Directory that is searched for `*--subagent/chunks` folders of chunk markdowns.
CHUNKS_ROOT = Path("/Users/guto/ufo/raw")

# Front-matter fields. MULTILINE anchors `^`/`$` at line boundaries so each
# pattern can be searched directly against the whole front-matter text.
CID_RE = re.compile(r"^chunk_id:\s*(\S+)\s*$", flags=re.MULTILINE)
PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", flags=re.MULTILINE)
def _load_chunk_pages(root: Path) -> dict[tuple[str, str], int]:
    """Collect one page number per ``(doc_id, chunk_id)`` from chunk markdowns.

    Scans ``<root>/*--subagent/chunks/*.md``. A file contributes a record only
    when it starts with ``---``, has a closing ``---`` delimiter, and its front
    matter matches both ``CID_RE`` and ``PAGE_RE``; other files are skipped.

    Returns:
        Mapping of ``(doc_id, chunk_id)`` to the page number read from disk.
        When the same key appears in several files the last file in sorted
        order wins, and a warning is written to stderr if the pages disagree.
    """
    pages: dict[tuple[str, str], int] = {}
    for chunks_dir in sorted(root.glob("*--subagent/chunks")):
        doc_id = chunks_dir.parent.name.replace("--subagent", "")
        # Sorted so that "last file wins" is reproducible across runs.
        for md_path in sorted(chunks_dir.glob("*.md")):
            content = md_path.read_text(encoding="utf-8")
            if not content.startswith("---"):
                continue
            # ["", front matter, body] when both delimiters are present.
            parts = content.split("---", 2)
            if len(parts) < 3:
                continue
            front_matter = parts[1]
            cid_m = CID_RE.search(front_matter)
            page_m = PAGE_RE.search(front_matter)
            if not (cid_m and page_m):
                continue
            key = (doc_id, cid_m.group(1))
            page = int(page_m.group(1))
            previous = pages.get(key)
            if previous is not None and previous != page:
                print(
                    f"WARNING: duplicate chunk {key[0]}/{key[1]}: "
                    f"page {previous} vs {page} ({md_path.name}); keeping {page}",
                    file=sys.stderr,
                )
            pages[key] = page
    return pages


def main() -> int:
    """Sync ``chunks.page`` in Postgres with the page numbers stored on disk.

    Reads the connection string from ``DATABASE_URL`` (falling back to
    ``SUPABASE_DB_URL``), loads the on-disk records into a temporary table via
    COPY, and updates only the ``chunks`` rows whose page differs.

    Returns:
        ``0`` on success, including the case where no chunk records were found.

    Raises:
        SystemExit: if neither environment variable provides a connection URL.
    """
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    # Keyed by (doc_id, chunk_id): the temp table must hold at most one row per
    # key, otherwise UPDATE ... FROM would pick among the matches unpredictably.
    pages = _load_chunk_pages(CHUNKS_ROOT)
    print(f"Loaded {len(pages)} chunk records from disk")
    if not pages:
        # Nothing to apply; most likely CHUNKS_ROOT is wrong or not mounted.
        print(f"WARNING: no chunk records found under {CHUNKS_ROOT}", file=sys.stderr)
        return 0

    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            cur.execute(
                "CREATE TEMP TABLE _chunk_pages (doc_id TEXT, chunk_id TEXT, page INT)"
            )
            with cur.copy("COPY _chunk_pages (doc_id, chunk_id, page) FROM STDIN") as cp:
                for (doc_id, chunk_id), page in pages.items():
                    cp.write_row((doc_id, chunk_id, page))
            # IS DISTINCT FROM also catches rows whose current page is NULL.
            cur.execute(
                """
                UPDATE chunks c
                SET page = t.page
                FROM _chunk_pages t
                WHERE c.doc_id = t.doc_id AND c.chunk_id = t.chunk_id
                AND c.page IS DISTINCT FROM t.page
                """
            )
            changed = cur.rowcount
        conn.commit()
        # Reported only after the commit so the message reflects durable state.
        print(f"Updated {changed} rows in chunks.page")
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())