#!/usr/bin/env python3 """ Resync `chunks.page` in Postgres from the raw chunk markdowns (after running 43_fix_chunk_page_from_source_png.py). This avoids re-embedding — we only touch the integer column. Run: DATABASE_URL=postgres://... python3 scripts/maintain/44_sync_chunk_page_to_db.py """ from __future__ import annotations import os import re import sys from pathlib import Path import psycopg CHUNKS_ROOT = Path("/Users/guto/ufo/raw") PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M) CID_RE = re.compile(r"^chunk_id:\s*(\S+)\s*$", re.M) def main() -> int: dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL") if not dburl: sys.exit("DATABASE_URL not set") updates: list[tuple[str, str, int]] = [] # (doc_id, chunk_id, page) for chunks_dir in sorted(CHUNKS_ROOT.glob("*--subagent/chunks")): doc_id = chunks_dir.parent.name.replace("--subagent", "") for f in chunks_dir.glob("*.md"): content = f.read_text(encoding="utf-8") if not content.startswith("---"): continue parts = content.split("---", 2) if len(parts) < 3: continue fm = parts[1] cid_m = CID_RE.search(fm) page_m = PAGE_RE.search(fm) if not (cid_m and page_m): continue updates.append((doc_id, cid_m.group(1), int(page_m.group(1)))) print(f"Loaded {len(updates)} chunk records from disk") with psycopg.connect(dburl) as conn: with conn.cursor() as cur: cur.execute( "CREATE TEMP TABLE _chunk_pages (doc_id TEXT, chunk_id TEXT, page INT)" ) with cur.copy("COPY _chunk_pages (doc_id, chunk_id, page) FROM STDIN") as cp: for row in updates: cp.write_row(row) cur.execute( """ UPDATE chunks c SET page = t.page FROM _chunk_pages t WHERE c.doc_id = t.doc_id AND c.chunk_id = t.chunk_id AND c.page IS DISTINCT FROM t.page """ ) changed = cur.rowcount print(f"Updated {changed} rows in chunks.page") conn.commit() return 0 if __name__ == "__main__": sys.exit(main())