disclosure-bureau/scripts/synthesize/31_aggregate_pages_from_chunks.py

#!/usr/bin/env python3
"""
31_aggregate_pages_from_chunks.py — Generate thin wiki/pages/<doc>/p<NNN>.md
files for pages where the chunks/ already have content but the per-page vision
pipeline (02-vision-page.py) never produced an aggregator file.

Source of truth: raw/<doc>--subagent/_index.json + chunks/c*.md (Sonnet-extracted)
Output:          wiki/pages/<doc>/p<NNN>.md (thin aggregator, tagged source:chunk-aggregator)

Skips pages that already have a wiki/pages/<doc>/p<NNN>.md file (idempotent).

Run:
  python3 scripts/synthesize/31_aggregate_pages_from_chunks.py
  python3 scripts/synthesize/31_aggregate_pages_from_chunks.py --doc-id <id>   # one doc
  python3 scripts/synthesize/31_aggregate_pages_from_chunks.py --dry-run
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

# Absolute root of the local workspace; every other location hangs off it.
UFO = Path("/Users/guto/ufo")
# Parent of the per-document "<doc>--subagent" directories (index + chunks).
RAW = UFO.joinpath("raw")
# Parent of the per-document directories holding the p-NNN.png page images.
PNG_BASE = UFO.joinpath("processing", "png")
# Parent of the per-document directories the aggregated page files go into.
PAGES_BASE = UFO.joinpath("wiki", "pages")

# Version strings stamped into the frontmatter of every generated page.
SCHEMA_VERSION = "0.1.0"
WIKI_VERSION = "0.1.0"
# UTC timestamp captured once at import time, so every page written during a
# run carries the same "last_ingest" value.
NOW = datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def split_frontmatter(text: str) -> tuple[dict, str]:
    """Split *text* into a flat frontmatter mapping and the remaining body.

    The frontmatter is the region between a leading ``---`` and the closing
    ``---``. Only single-line ``key: value`` entries whose key consists of
    ASCII letters and underscores are collected; values are kept as raw,
    whitespace-stripped strings (no YAML parsing, quotes are not removed).

    Returns:
        ``(fm, body)`` where *body* is everything after the closing ``---``
        (including the newline that follows it). If *text* does not start
        with ``---`` or has no closing delimiter, returns ``({}, text)``.
    """
    if not text.startswith("---"):
        return {}, text

    rest = text[3:]
    # Prefer a closing delimiter that sits alone on its line, so a literal
    # "---" inside a value (e.g. `title: a --- b`) cannot truncate the header.
    closer = re.search(r"^---[ \t]*\r?$", rest, re.M)
    # Fall back to the first "---" anywhere, which keeps single-line or
    # otherwise unusual inputs behaving as they did before.
    end = closer.start() if closer else rest.find("---")
    if end < 0:
        return {}, text

    header, body = rest[:end], rest[end + 3:]
    fm: dict = {}
    for line in header.splitlines():
        m = re.match(r"^([a-zA-Z_]+):\s*(.*)$", line)
        if m:
            fm[m.group(1)] = m.group(2).strip()
    return fm, body


def extract_bilingual(body: str) -> tuple[str, str]:
    """Pull the English and Brazilian-Portuguese passages out of a chunk body.

    The English passage is whatever follows the ``**EN:**`` marker up to a
    ``**PT-BR:**`` marker that starts a later line (or the end of the text);
    the Portuguese passage is whatever follows ``**PT-BR:**`` to the end.

    Returns:
        ``(en, pt_br)``, each stripped of surrounding whitespace and each an
        empty string when its marker is absent.
    """
    def _captured(pattern: str) -> str:
        # DOTALL lets a passage span several lines.
        hit = re.search(pattern, body, flags=re.DOTALL)
        if hit is None:
            return ""
        return hit.group(1).strip()

    english = _captured(r"\*\*EN:\*\*\s*(.*?)(?:\n\s*\*\*PT-BR:\*\*|\Z)")
    portuguese = _captured(r"\*\*PT-BR:\*\*\s*(.*?)\Z")
    return english, portuguese


def find_missing_pages() -> dict[str, list[int]]:
    """Map each doc id to the sorted page numbers that still lack a page file.

    A page counts as missing when ``PNG_BASE/<doc>/p-<digits>.png`` exists but
    ``PAGES_BASE/<doc>/p<NNN>.md`` is not a file. Docs with nothing missing do
    not appear in the result.
    """
    png_name = re.compile(r"p-(\d+)\.png$")
    gaps: dict[str, list[int]] = defaultdict(list)

    for image in PNG_BASE.glob("*/p-*.png"):
        hit = png_name.match(image.name)
        if hit is None:
            # Matches the glob but the page part is not purely numeric.
            continue
        page_no = int(hit.group(1))
        doc = image.parent.name
        if (PAGES_BASE / doc / f"p{page_no:03d}.md").is_file():
            continue
        gaps[doc].append(page_no)

    # Hand back a plain dict with each page list in ascending order.
    return {doc: sorted(numbers) for doc, numbers in gaps.items()}


def _infer_page_type(types_seen: set[str]) -> str:
    """Return a best-effort page type from the chunk types present on a page."""
    if "classification_banner" in types_seen and len(types_seen) <= 3:
        return "cover"
    if "header" in types_seen and "transcript_block" in types_seen:
        return "transcript"
    # A page is table-only when every chunk type on it is some kind of table.
    if types_seen and all("table" in t for t in types_seen):
        return "table_only"
    if "letterhead" in types_seen:
        return "memo"
    return "mixed"


def build_page_md(doc_id: str, page_num: int) -> str | None:
    """Assemble the markdown for one page from the doc's _index.json + chunks/.

    Reads ``RAW/<doc_id>--subagent/_index.json``, selects the index entries
    whose ``page`` equals *page_num*, orders them by ``order_in_page`` and
    renders one ``### Chunk`` section per chunk file found under ``chunks/``.
    The result is a YAML frontmatter block followed by the page body.

    Args:
        doc_id: Document identifier (directory name under ``RAW`` without the
            ``--subagent`` suffix).
        page_num: Page number to aggregate, compared against the ``page``
            field of each index entry.

    Returns:
        The full page text, or ``None`` when the index file does not exist or
        lists no chunks for this page.

    Raises:
        json.JSONDecodeError: If ``_index.json`` is not valid JSON.
    """
    sub = RAW / f"{doc_id}--subagent"
    idx_path = sub / "_index.json"
    if not idx_path.is_file():
        return None
    idx = json.loads(idx_path.read_text(encoding="utf-8"))
    chunks_for_page = [c for c in (idx.get("chunks") or []) if c.get("page") == page_num]
    if not chunks_for_page:
        return None  # no chunk data → can't aggregate
    # `or 0` also covers an explicit null, which would otherwise make the
    # sort raise TypeError when compared against integer positions.
    chunks_for_page.sort(key=lambda c: c.get("order_in_page") or 0)

    total_pages = idx.get("total_pages")
    rel_png = f"../../../processing/png/{doc_id}/p-{page_num:03d}.png"

    # Aggregate per-chunk EN/PT/metadata
    body_blocks: list[str] = []
    types_seen: set[str] = set()
    chunk_ids: list[str] = []
    has_redaction = has_image = has_table = has_stamp = has_signature = False
    classifications: set[str] = set()

    for c in chunks_for_page:
        cid = c.get("chunk_id")
        chunk_ids.append(cid)
        ctype = c.get("type") or "?"
        types_seen.add(ctype)
        chunk_path = sub / "chunks" / f"{cid}.md"
        # The chunk stays listed in chunks_on_page even without a file; it
        # just contributes no body section, flags or classification.
        if not chunk_path.is_file():
            continue
        text = chunk_path.read_text(encoding="utf-8")
        chunk_fm, chunk_body = split_frontmatter(text)
        en, pt = extract_bilingual(chunk_body)

        if not en and not pt:
            # fall back to extracted_text / image_description fields
            en = (chunk_fm.get("image_description_en") or chunk_fm.get("extracted_text") or "").strip().strip('"\'')
            pt = (chunk_fm.get("image_description_pt_br") or "").strip().strip('"\'')

        # Heuristic flags, derived from substrings of the chunk type
        if ctype in ("redaction", "redacted_block"):
            has_redaction = True
        if any(word in ctype for word in ("image", "photo", "diagram", "sketch", "map")):
            has_image = True
        if "table" in ctype:
            has_table = True
        if "stamp" in ctype:
            has_stamp = True
        if "signature" in ctype:
            has_signature = True

        # Frontmatter values are raw strings, so drop surrounding quotes the
        # same way the description fallbacks above do.
        cls = (chunk_fm.get("classification") or "").strip().strip('"\'')
        if cls and cls != "null":
            classifications.add(cls)

        # Body block
        parts = [f"### Chunk `{cid}` — type: {ctype}\n"]
        bbox = c.get("bbox") or {}
        if bbox:
            parts.append(
                f"_bbox_: x={bbox.get('x')}, y={bbox.get('y')}, w={bbox.get('w')}, h={bbox.get('h')}\n\n"
            )
        if en:
            parts.append(f"**EN:** {en}\n\n")
        if pt:
            parts.append(f"**PT-BR:** {pt}\n")
        body_blocks.append("".join(parts).rstrip())

    # Content classification
    content_class = []
    if has_image:
        content_class.append("contains-photos")
    if has_table:
        content_class.append("contains-tables")
    if has_stamp:
        content_class.append("contains-stamps")
    if has_signature:
        content_class.append("contains-signatures")
    if has_redaction:
        content_class.append("redaction-heavy")
    if not content_class:
        content_class.append("text-only")

    # Page-level inferred type (best-effort)
    page_type = _infer_page_type(types_seen)

    # Frontmatter (insertion order is the order written to the file)
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "page",
        "page_id": f"{doc_id}/p{page_num:03d}",
        "doc_id": doc_id,
        "page_number": page_num,
        "total_pages": total_pages,
        "png_path": rel_png,
        "page_type": page_type,
        "content_classification": content_class,
        "classification_markings": [{"level": level} for level in sorted(classifications)],
        "chunks_on_page": chunk_ids,
        "chunk_count": len(chunk_ids),
        "source": "chunk-aggregator",
        "source_note": "Page-md generated from chunks built by Sonnet vision (raw/<doc>--subagent/chunks/). Per-page vision Haiku pipeline (02-vision-page.py) never produced an output for this page.",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
    }
    # Imported here so PyYAML is only required when a page is actually built.
    import yaml
    yaml_block = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True,
                                default_flow_style=False, width=10_000).rstrip()
    body = "\n\n".join(body_blocks) if body_blocks else "_(no extractable text — see chunk files directly)_"
    return f"---\n{yaml_block}\n---\n\n# Page {page_num} of {doc_id}\n\n{body}\n"


def main():
    """Command-line entry point: report missing pages and write them.

    Options:
        --doc-id   restrict the run to a single document id
        --dry-run  print the per-document counts of missing pages and stop

    Returns 0 in every case; the value is used as the process exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--doc-id", default=None)
    parser.add_argument("--dry-run", action="store_true")
    opts = parser.parse_args()

    missing = find_missing_pages()
    if opts.doc_id:
        # Narrow to the requested doc; an unknown id yields an empty list.
        missing = {opts.doc_id: missing.get(opts.doc_id, [])}

    docs_with_gaps = sum(1 for page_list in missing.values() if page_list)
    total_missing = sum(map(len, missing.values()))
    print(f"[1/2] Inventory: {docs_with_gaps} docs, {total_missing} missing pages")

    if opts.dry_run:
        # Docs with the most missing pages are listed first.
        ranked = sorted(missing.items(), key=lambda kv: -len(kv[1]))
        for doc, page_list in ranked:
            if page_list:
                print(f"  {doc}: {len(page_list)}")
        return 0

    print("\n[2/2] Generating thin aggregator page.md files ...")
    written = skipped_no_chunks = 0
    for doc_id, page_list in missing.items():
        target_dir = PAGES_BASE / doc_id
        for page_no in page_list:
            md = build_page_md(doc_id, page_no)
            if md is None:
                skipped_no_chunks += 1
                continue
            # Create the doc directory only once there is something to write.
            target_dir.mkdir(parents=True, exist_ok=True)
            (target_dir / f"p{page_no:03d}.md").write_text(md, encoding="utf-8")
            written += 1

    print(f"      written: {written}")
    print(f"      skipped (no chunk data): {skipped_no_chunks}")
    print("\n✓ done.")
    return 0


# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())