disclosure-bureau/scripts/maintain/43_fix_chunk_page_from_source_png.py

#!/usr/bin/env python3
"""
Normalize each chunk's `page:` field to match the actual PNG it was rendered
against (`source_png`).

Background: the chunker (Sonnet) populated `page:` with the page number it
INFERRED from the document's printed footer/header — which often diverges from
the PNG index after the PDF→PNG conversion (cover sheets, blank pages, FBI
section markers, etc.).

The UI routes `/d/<doc>/<pNNN>` by PNG index, so the chunk `page` field MUST
match the PNG index for the page view to show the right chunks alongside the
right scan.

This script rewrites `page:` IN PLACE in every raw chunk markdown where the
field disagrees with the number embedded in `source_png:`. It is idempotent —
re-running it on a clean tree is a no-op.

Run:
    python3 scripts/maintain/43_fix_chunk_page_from_source_png.py [--dry-run]
"""
from __future__ import annotations
import re
import sys
from pathlib import Path
from collections import defaultdict

# Default tree to scan; chunk files are looked up as
# `<root>/<doc_id>--subagent/chunks/*.md`.
CHUNKS_ROOT = Path("/Users/guto/ufo/raw")
# Front-matter `page: <int>` line. NOTE: under re.M the trailing `\s*` may
# also consume a newline, so never substitute the whole match — only group 1.
PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M)
# Front-matter `source_png:` value; group 1 is the number embedded in the PNG
# file name (`p-005.png` / `p005.png`).
SRC_RE = re.compile(r"source_png:\s*\"?[^\"\n]*?p-?(\d+)\.png", re.M)


def _plan_page_fix(content: str) -> tuple[int, int, str] | None:
    """Compute the corrected text for one chunk file.

    Returns:
        None when the text has no `---`-delimited front matter or the front
        matter lacks a `page:` or `source_png:` entry. Otherwise a tuple
        `(declared, real, new_content)` where `declared` is the current
        `page:` value, `real` is the number taken from `source_png:`, and
        `new_content` is the file text with `page:` set to `real` (identical
        to `content` when the two already agree).
    """
    if not content.startswith("---"):
        return None
    parts = content.split("---", 2)
    if len(parts) < 3:
        return None
    _, fm, body = parts

    page_m = PAGE_RE.search(fm)
    src_m = SRC_RE.search(fm)
    if not (page_m and src_m):
        return None

    declared = int(page_m.group(1))
    real = int(src_m.group(1))
    if declared == real:
        return declared, real, content

    # Splice only the digits. Replacing the whole PAGE_RE match would drop a
    # newline whenever the pattern's trailing `\s*` swallowed one (e.g. when
    # `page:` is the last front-matter line), gluing the closing `---` onto
    # the `page:` line and corrupting the front matter.
    start, end = page_m.span(1)
    new_fm = fm[:start] + str(real) + fm[end:]
    return declared, real, "---" + new_fm + "---" + body


def main(argv: list[str] | None = None, root: Path | None = None) -> int:
    """Rewrite mismatching `page:` fields in place and print a summary.

    Args:
        argv: Command-line arguments; defaults to `sys.argv`. If it contains
            `--dry-run`, mismatches are counted and reported but no file is
            written.
        root: Directory to scan; defaults to `CHUNKS_ROOT`.

    Returns:
        Always 0 (used as the process exit status).
    """
    args = sys.argv if argv is None else argv
    dry = "--dry-run" in args
    chunks_root = CHUNKS_ROOT if root is None else Path(root)

    fixed = 0
    scanned = 0
    by_doc: dict[str, int] = defaultdict(int)
    samples: list[tuple[str, int, int]] = []

    for chunks_dir in sorted(chunks_root.glob("*--subagent/chunks")):
        doc_id = chunks_dir.parent.name.replace("--subagent", "")
        # Sorted so the "Sample fixes" listing is deterministic across runs.
        for f in sorted(chunks_dir.glob("*.md")):
            plan = _plan_page_fix(f.read_text(encoding="utf-8"))
            if plan is None:
                continue
            scanned += 1
            declared, real, new_content = plan
            if declared == real:
                continue
            if not dry:
                f.write_text(new_content, encoding="utf-8")
            fixed += 1
            by_doc[doc_id] += 1
            if len(samples) < 5:
                samples.append((f"{doc_id}/{f.name}", declared, real))

    print(f"Scanned: {scanned} chunks")
    print(f"Fixed:   {fixed} chunks  ({'dry-run' if dry else 'written'})")
    print(f"Docs touched: {len(by_doc)}")
    if by_doc:
        print("\nTop docs by fix count:")
        for doc, n in sorted(by_doc.items(), key=lambda x: -x[1])[:15]:
            print(f"  {n:>5}  {doc}")
    if samples:
        print("\nSample fixes:")
        for path, d, r in samples:
            print(f"  {path}: page {d} -> {r}")
    return 0


if __name__ == "__main__":
    # Run the script and use main()'s return value as the exit status.
    raise SystemExit(main())