disclosure-bureau/scripts/maintain/45_resync_index_json.py

#!/usr/bin/env python3
"""
Resync each `_index.json` so its embedded chunks[].page reflects the corrected
markdown frontmatter (after script 43).

Idempotent.
"""
from __future__ import annotations
import json
import re
from pathlib import Path

# Default corpus root. Each document is a "<name>--subagent" directory holding
# an `_index.json` next to a `chunks/` folder of markdown files.
CHUNKS_ROOT = Path("/Users/guto/ufo/raw")
# Whole-line `page: <digits>` and `chunk_id: <token>` keys (the markdown
# frontmatter fields named in the module docstring).
PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M)
CID_RE = re.compile(r"^chunk_id:\s*(\S+)\s*$", re.M)
# Only this many leading characters of a chunk file are scanned for the keys.
_HEAD_CHARS = 2000


def _read_chunk_pages(chunks_dir: Path) -> dict[str, int]:
    """Return ``chunk_id -> page`` as recorded at the top of each chunk file.

    Files lacking either key within the first ``_HEAD_CHARS`` characters are
    ignored.
    """
    pages: dict[str, int] = {}
    # Sorted so a duplicated chunk_id resolves the same way on every run
    # (the lexicographically last file wins) instead of in filesystem order.
    for md_path in sorted(chunks_dir.glob("*.md")):
        # Read just the head rather than loading the whole chunk into memory.
        with md_path.open(encoding="utf-8") as fh:
            head = fh.read(_HEAD_CHARS)
        cm = CID_RE.search(head)
        pm = PAGE_RE.search(head)
        if cm and pm:
            pages[cm.group(1)] = int(pm.group(1))
    return pages


def _write_json_atomically(path: Path, payload: object) -> None:
    """Serialise *payload* to *path* through a sibling temp file and a rename."""
    # Writing next to the target and renaming over it means an interrupted
    # run cannot leave a half-written index behind.
    tmp_path = path.with_name(path.name + ".tmp")
    text = json.dumps(payload, indent=2, ensure_ascii=False)
    tmp_path.write_text(text, encoding="utf-8")
    tmp_path.replace(path)


def main(root: Path = CHUNKS_ROOT) -> None:
    """Resync ``chunks[].page`` in every ``_index.json`` under *root*.

    For each ``*--subagent/chunks`` directory, the page numbers found in the
    chunk markdown files are treated as the truth and copied into the matching
    entries of the sibling ``_index.json``. Entries whose ``chunk_id`` has no
    matching chunk file are left untouched. An index is rewritten only when at
    least one entry changed, so a second run is a no-op.

    Args:
        root: Directory containing the ``*--subagent`` document folders.
            Defaults to ``CHUNKS_ROOT``.

    Raises:
        json.JSONDecodeError: If an ``_index.json`` is not valid JSON.
    """
    touched = 0
    for chunks_dir in sorted(root.glob("*--subagent/chunks")):
        idx_path = chunks_dir.parent / "_index.json"
        if not idx_path.is_file():
            continue
        idx = json.loads(idx_path.read_text(encoding="utf-8"))
        chunks = idx.get("chunks") or []
        if not chunks:
            continue
        # Build chunk_id -> page from disk
        truth = _read_chunk_pages(chunks_dir)
        changed = 0
        for entry in chunks:
            real = truth.get(entry.get("chunk_id"))
            if real is not None and entry.get("page") != real:
                entry["page"] = real
                changed += 1
        if changed:
            _write_json_atomically(idx_path, idx)
            print(f"  {idx.get('doc_id')}: updated {changed} entries")
            touched += 1
    print(f"\nDocs touched: {touched}")


# Script entry point: run the resync only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()
rebuild entity layer from Sonnet-vision reextract pipeline Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> 2026-05-21 15:20:24 +00:00			`#!/usr/bin/env python3`
			`"""`
			Resync each `_index.json` so its embedded chunks[].page reflects the corrected
			`markdown frontmatter (after script 43).`

			`Idempotent.`
			`"""`
			`from __future__ import annotations`
			`import json`
			`import re`
			`from pathlib import Path`

			`CHUNKS_ROOT = Path("/Users/guto/ufo/raw")`
			`PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M)`
			`CID_RE = re.compile(r"^chunk_id:\s*(\S+)\s*$", re.M)`


			`def main() -> None:`
			`touched = 0`
			`for chunks_dir in sorted(CHUNKS_ROOT.glob("*--subagent/chunks")):`
			`idx_path = chunks_dir.parent / "_index.json"`
			`if not idx_path.is_file():`
			`continue`
			`idx = json.loads(idx_path.read_text(encoding="utf-8"))`
			`chunks = idx.get("chunks") or []`
			`if not chunks:`
			`continue`
			`# Build chunk_id -> page from disk`
			`truth: dict[str, int] = {}`
			`for f in chunks_dir.glob("*.md"):`
			`head = f.read_text(encoding="utf-8")[:2000]`
			`cm = CID_RE.search(head)`
			`pm = PAGE_RE.search(head)`
			`if cm and pm:`
			`truth[cm.group(1)] = int(pm.group(1))`
			`changed = 0`
			`for entry in chunks:`
			`cid = entry.get("chunk_id")`
			`real = truth.get(cid)`
			`if real is not None and entry.get("page") != real:`
			`entry["page"] = real`
			`changed += 1`
			`if changed:`
			`idx_path.write_text(json.dumps(idx, indent=2, ensure_ascii=False), encoding="utf-8")`
			`print(f"  {idx.get('doc_id')}: updated {changed} entries")`
			`touched += 1`
			`print(f"\nDocs touched: {touched}")`


			`if __name__ == "__main__":`
			`main()`