Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
74 lines
2.3 KiB
Python
74 lines
2.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Resync `chunks.page` in Postgres from the raw chunk markdowns (after running
|
|
43_fix_chunk_page_from_source_png.py).
|
|
|
|
This avoids re-embedding — we only touch the integer column.
|
|
|
|
Run:
|
|
DATABASE_URL=postgres://... python3 scripts/maintain/44_sync_chunk_page_to_db.py
|
|
"""
|
|
from __future__ import annotations
|
|
import os
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import psycopg
|
|
|
|
CHUNKS_ROOT = Path("/Users/guto/ufo/raw")
|
|
PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M)
|
|
CID_RE = re.compile(r"^chunk_id:\s*(\S+)\s*$", re.M)
|
|
|
|
|
|
def main() -> int:
|
|
dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
|
|
if not dburl:
|
|
sys.exit("DATABASE_URL not set")
|
|
|
|
updates: list[tuple[str, str, int]] = [] # (doc_id, chunk_id, page)
|
|
for chunks_dir in sorted(CHUNKS_ROOT.glob("*--subagent/chunks")):
|
|
doc_id = chunks_dir.parent.name.replace("--subagent", "")
|
|
for f in chunks_dir.glob("*.md"):
|
|
content = f.read_text(encoding="utf-8")
|
|
if not content.startswith("---"):
|
|
continue
|
|
parts = content.split("---", 2)
|
|
if len(parts) < 3:
|
|
continue
|
|
fm = parts[1]
|
|
cid_m = CID_RE.search(fm)
|
|
page_m = PAGE_RE.search(fm)
|
|
if not (cid_m and page_m):
|
|
continue
|
|
updates.append((doc_id, cid_m.group(1), int(page_m.group(1))))
|
|
|
|
print(f"Loaded {len(updates)} chunk records from disk")
|
|
|
|
with psycopg.connect(dburl) as conn:
|
|
with conn.cursor() as cur:
|
|
cur.execute(
|
|
"CREATE TEMP TABLE _chunk_pages (doc_id TEXT, chunk_id TEXT, page INT)"
|
|
)
|
|
with cur.copy("COPY _chunk_pages (doc_id, chunk_id, page) FROM STDIN") as cp:
|
|
for row in updates:
|
|
cp.write_row(row)
|
|
|
|
cur.execute(
|
|
"""
|
|
UPDATE chunks c
|
|
SET page = t.page
|
|
FROM _chunk_pages t
|
|
WHERE c.doc_id = t.doc_id AND c.chunk_id = t.chunk_id
|
|
AND c.page IS DISTINCT FROM t.page
|
|
"""
|
|
)
|
|
changed = cur.rowcount
|
|
print(f"Updated {changed} rows in chunks.page")
|
|
|
|
conn.commit()
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|