#!/usr/bin/env python3 """ Normalize each chunk's `page:` field to match the actual PNG it was rendered against (`source_png`). Background: the chunker (Sonnet) populated `page:` with the page-number it INFERRED from the document's printed footer/header — which often diverges from the PNG index after the PDF→PNG conversion (cover sheets, blank pages, FBI section markers, etc). The UI routes `/d//` by PNG index, so the chunk `page` field MUST match the PNG index for the page view to show the right chunks alongside the right scan. This script rewrites `page:` IN PLACE in every raw chunk markdown where the field disagrees with the number embedded in `source_png:`. It is idempotent — re-running it on a clean tree is a no-op. Run: python3 scripts/maintain/43_fix_chunk_page_from_source_png.py [--dry-run] """ from __future__ import annotations import re import sys from pathlib import Path from collections import defaultdict CHUNKS_ROOT = Path("/Users/guto/ufo/raw") PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M) SRC_RE = re.compile(r"source_png:\s*\"?[^\"\n]*?p-?(\d+)\.png", re.M) def main() -> int: dry = "--dry-run" in sys.argv fixed = 0 scanned = 0 by_doc: dict[str, int] = defaultdict(int) samples: list[tuple[str, int, int]] = [] for chunks_dir in sorted(CHUNKS_ROOT.glob("*--subagent/chunks")): doc_id = chunks_dir.parent.name.replace("--subagent", "") for f in chunks_dir.glob("*.md"): content = f.read_text(encoding="utf-8") if not content.startswith("---"): continue parts = content.split("---", 2) if len(parts) < 3: continue _, fm, body = parts page_m = PAGE_RE.search(fm) src_m = SRC_RE.search(fm) if not (page_m and src_m): continue scanned += 1 declared = int(page_m.group(1)) real = int(src_m.group(1)) if declared == real: continue new_fm = PAGE_RE.sub(f"page: {real}", fm, count=1) new_content = "---" + new_fm + "---" + body if not dry: f.write_text(new_content, encoding="utf-8") fixed += 1 by_doc[doc_id] += 1 if len(samples) < 5: samples.append((f"{doc_id}/{f.name}", declared, real)) print(f"Scanned: {scanned} chunks") print(f"Fixed: {fixed} chunks ({'dry-run' if dry else 'written'})") print(f"Docs touched: {len(by_doc)}") if by_doc: print("\nTop docs by fix count:") for doc, n in sorted(by_doc.items(), key=lambda x: -x[1])[:15]: print(f" {n:>5} {doc}") if samples: print("\nSample fixes:") for path, d, r in samples: print(f" {path}: page {d} -> {r}") return 0 if __name__ == "__main__": sys.exit(main())