#!/usr/bin/env python3
"""
Resync each `_index.json` so its embedded chunks[].page reflects the
corrected markdown frontmatter (after script 43). Idempotent.

For every `<root>/*--subagent/chunks` directory, the `chunk_id:` and `page:`
lines found near the top of each `*.md` file are treated as the source of
truth. Entries in the sibling `_index.json` whose `page` differs are updated,
and the index is rewritten only when at least one entry changed.
"""
from __future__ import annotations

import json
import re
from pathlib import Path

CHUNKS_ROOT = Path("/Users/guto/ufo/raw")
PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M)
CID_RE = re.compile(r"^chunk_id:\s*(\S+)\s*$", re.M)

# Only this many leading characters of each chunk file are scanned for the
# `chunk_id:` / `page:` lines.
_HEAD_CHARS = 2000


def _read_disk_pages(chunks_dir: Path) -> dict[str, int]:
    """Return a chunk_id -> page mapping parsed from the `*.md` files in *chunks_dir*."""
    truth: dict[str, int] = {}
    # Sorted so that, if two files declare the same chunk_id, the winner is
    # the same on every run instead of depending on directory order.
    for md_path in sorted(chunks_dir.glob("*.md")):
        # Read just the head instead of loading the entire file and slicing.
        with md_path.open(encoding="utf-8") as fh:
            head = fh.read(_HEAD_CHARS)
        cm = CID_RE.search(head)
        pm = PAGE_RE.search(head)
        if cm and pm:
            truth[cm.group(1)] = int(pm.group(1))
    return truth


def _write_json_atomic(path: Path, payload: object) -> None:
    """Serialize *payload* to *path* via a sibling temp file and a rename."""
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        tmp_path.write_text(serialized, encoding="utf-8")
        # Renaming over the target means an interrupted run never leaves a
        # half-written index behind.
        tmp_path.replace(path)
    finally:
        # Only still present when the write or the rename failed.
        if tmp_path.exists():
            tmp_path.unlink()


def _resync_index(idx_path: Path, chunks_dir: Path) -> int:
    """Update one index from its chunk files; return the number of entries changed.

    Raises:
        json.JSONDecodeError: if *idx_path* does not contain valid JSON.
    """
    idx = json.loads(idx_path.read_text(encoding="utf-8"))
    if not isinstance(idx, dict):
        return 0
    chunks = idx.get("chunks") or []
    if not isinstance(chunks, list) or not chunks:
        return 0

    # Build chunk_id -> page from disk
    truth = _read_disk_pages(chunks_dir)

    changed = 0
    for entry in chunks:
        # Skip malformed entries rather than aborting the whole run.
        if not isinstance(entry, dict):
            continue
        cid = entry.get("chunk_id")
        if not isinstance(cid, str):
            continue
        real = truth.get(cid)
        if real is not None and entry.get("page") != real:
            entry["page"] = real
            changed += 1

    if changed:
        _write_json_atomic(idx_path, idx)
        print(f" {idx.get('doc_id')}: updated {changed} entries")
    return changed


def main(root: Path | None = None) -> None:
    """Resync every `_index.json` found under *root*.

    Args:
        root: Directory containing the `*--subagent` document folders.
            Defaults to ``CHUNKS_ROOT`` when omitted.

    Raises:
        json.JSONDecodeError: if an `_index.json` is not valid JSON.
    """
    base = CHUNKS_ROOT if root is None else Path(root)
    touched = 0
    for chunks_dir in sorted(base.glob("*--subagent/chunks")):
        idx_path = chunks_dir.parent / "_index.json"
        if not idx_path.is_file():
            continue
        if _resync_index(idx_path, chunks_dir):
            touched += 1
    print(f"\nDocs touched: {touched}")


if __name__ == "__main__":
    main()