#!/usr/bin/env python3
"""
31_aggregate_pages_from_chunks.py — Generate thin wiki/pages/{doc_id}/pNNN.md
files for pages where the chunks/ already have content but the per-page vision
pipeline (02-vision-page.py) never produced an aggregator file.

Source of truth: raw/{doc_id}--subagent/_index.json + chunks/{chunk_id}.md
                 (Sonnet-extracted)
Output:          wiki/pages/{doc_id}/pNNN.md
                 (thin aggregator, tagged source:chunk-aggregator)

Skips pages that already have a wiki/pages/{doc_id}/pNNN.md (idempotent).

Run:
    python3 scripts/synthesize/31_aggregate_pages_from_chunks.py
    python3 scripts/synthesize/31_aggregate_pages_from_chunks.py --doc-id DOC_ID  # one doc
    python3 scripts/synthesize/31_aggregate_pages_from_chunks.py --dry-run
"""
from __future__ import annotations

import argparse
import json
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from functools import lru_cache
from pathlib import Path

UFO = Path("/Users/guto/ufo")
RAW = UFO / "raw"
PNG_BASE = UFO / "processing" / "png"
PAGES_BASE = UFO / "wiki" / "pages"

SCHEMA_VERSION = "0.1.0"
WIKI_VERSION = "0.1.0"
# Captured once at import so every page written by one run shares a timestamp.
NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# A frontmatter delimiter is a line consisting solely of "---" (optionally
# followed by spaces/tabs or a stray "\r" from CRLF files).
_FM_DELIM_RE = re.compile(r"^---[ \t\r]*$", re.M)
# "key: value" frontmatter line; only flat scalar entries are understood.
_FM_KEY_RE = re.compile(r"^([A-Za-z_][A-Za-z0-9_]*):\s*(.*)$")
# Each language section runs until the other language's marker or end of text.
_EN_RE = re.compile(r"\*\*EN:\*\*\s*(.*?)(?=\s*\*\*PT-BR:\*\*|\Z)", re.S)
_PT_RE = re.compile(r"\*\*PT-BR:\*\*\s*(.*?)(?=\s*\*\*EN:\*\*|\Z)", re.S)
_PNG_NAME_RE = re.compile(r"p-(\d+)\.png$")
# Substrings of a chunk type that mark the chunk as pictorial content.
_IMAGE_TYPE_HINTS = ("image", "photo", "diagram", "sketch", "map")


def split_frontmatter(text: str) -> tuple[dict, str]:
    """Split a chunk file into (frontmatter dict, body).

    The frontmatter is parsed naively: every ``key: value`` line becomes a
    string entry (values keep any surrounding quotes); nested YAML is not
    interpreted. If *text* does not open with a ``---`` line, or the closing
    ``---`` line is missing, ``({}, text)`` is returned unchanged.

    Delimiters are matched as whole lines, so a ``---`` that appears inside a
    value (e.g. ``note: a --- b``) no longer terminates the frontmatter early.
    """
    if not text.startswith("---"):
        return {}, text

    delimiters = _FM_DELIM_RE.finditer(text)
    opening = next(delimiters, None)
    closing = next(delimiters, None)
    if opening is None or opening.start() != 0 or closing is None:
        return {}, text

    fm: dict = {}
    for line in text[opening.end():closing.start()].splitlines():
        m = _FM_KEY_RE.match(line)
        if m:
            fm[m.group(1)] = m.group(2).strip()
    # Body starts right after the closing delimiter (leading newline kept).
    return fm, text[closing.end():]


def extract_bilingual(body: str) -> tuple[str, str]:
    """Return (en, pt_br) text from a chunk body. Either may be empty.

    Sections are introduced by ``**EN:**`` and ``**PT-BR:**`` markers. Each
    section ends at the other marker or at end of text, regardless of whether
    the markers share a line or in which order they appear.
    """
    en_m = _EN_RE.search(body)
    pt_m = _PT_RE.search(body)
    en = en_m.group(1).strip() if en_m else ""
    pt = pt_m.group(1).strip() if pt_m else ""
    return en, pt


def find_missing_pages() -> dict[str, list[int]]:
    """Map each doc id to the sorted page numbers lacking a wiki page file.

    A page is "missing" when ``processing/png/{doc_id}/p-N.png`` exists but
    ``wiki/pages/{doc_id}/pNNN.md`` does not. Docs with nothing missing are
    omitted from the result.
    """
    missing: dict[str, list[int]] = defaultdict(list)
    for png in PNG_BASE.glob("*/p-*.png"):
        m = _PNG_NAME_RE.match(png.name)
        if not m:
            continue
        doc_id = png.parent.name
        page_num = int(m.group(1))
        wiki = PAGES_BASE / doc_id / f"p{page_num:03d}.md"
        if not wiki.is_file():
            missing[doc_id].append(page_num)
    return {doc: sorted(pages) for doc, pages in missing.items()}


@lru_cache(maxsize=4)
def _load_index(idx_path: Path) -> dict:
    """Parse a doc's _index.json once; callers must treat the result as read-only."""
    return json.loads(idx_path.read_text(encoding="utf-8"))


def _content_classification(
    has_image: bool,
    has_table: bool,
    has_stamp: bool,
    has_signature: bool,
    has_redaction: bool,
) -> list[str]:
    """Translate the per-page heuristic flags into content-classification tags."""
    tagged = (
        (has_image, "contains-photos"),
        (has_table, "contains-tables"),
        (has_stamp, "contains-stamps"),
        (has_signature, "contains-signatures"),
        (has_redaction, "redaction-heavy"),
    )
    tags = [tag for flag, tag in tagged if flag]
    return tags or ["text-only"]


def _infer_page_type(types_seen: set[str], has_table: bool, has_body: bool) -> str:
    """Best-effort page type from the set of chunk types present on the page."""
    if "classification_banner" in types_seen and len(types_seen) <= 3:
        return "cover"
    if "header" in types_seen and "transcript_block" in types_seen:
        return "transcript"
    if has_table and not has_body:
        return "table_only"
    if "letterhead" in types_seen:
        return "memo"
    return "mixed"


def build_page_md(doc_id: str, page_num: int) -> str | None:
    """Assemble a single page.md from the doc's _index.json + chunks/.

    Returns the full markdown text (YAML frontmatter + one section per chunk),
    or ``None`` when the doc has no ``_index.json`` or the index lists no
    chunks for *page_num*. Requires PyYAML when a page is actually built.
    """
    sub = RAW / f"{doc_id}--subagent"
    idx_path = sub / "_index.json"
    if not idx_path.is_file():
        return None
    idx = _load_index(idx_path)

    chunks_for_page = [c for c in (idx.get("chunks") or []) if c.get("page") == page_num]
    if not chunks_for_page:
        return None  # no chunk data → can't aggregate
    # `or 0` also covers an explicit null, which would otherwise make the
    # sort raise TypeError when compared against integer orders.
    chunks_for_page.sort(key=lambda c: c.get("order_in_page") or 0)

    total_pages = idx.get("total_pages")
    rel_png = f"../../../processing/png/{doc_id}/p-{page_num:03d}.png"

    # Aggregate per-chunk EN/PT/metadata
    body_blocks: list[str] = []
    types_seen: set[str] = set()
    chunk_ids: list[str] = []
    has_redaction = has_image = has_table = has_stamp = has_signature = False
    classifications: set[str] = set()

    for c in chunks_for_page:
        cid = c.get("chunk_id")
        chunk_ids.append(cid)
        ctype = c.get("type") or "?"
        types_seen.add(ctype)

        # Heuristic flags depend only on the index's chunk type, so set them
        # before the file check: a missing chunk file must not hide that the
        # page carries an image/table/etc. (This also makes the "table_only"
        # page type reachable — previously has_table implied a body block.)
        # NOTE(review): confirm "table_only" is meant for pages whose table
        # chunks have no readable chunk file.
        if ctype in ("redaction", "redacted_block"):
            has_redaction = True
        if any(hint in ctype for hint in _IMAGE_TYPE_HINTS):
            has_image = True
        if "table" in ctype:
            has_table = True
        if "stamp" in ctype:
            has_stamp = True
        if "signature" in ctype:
            has_signature = True

        chunk_path = sub / "chunks" / f"{cid}.md"
        if not chunk_path.is_file():
            continue
        fm, body = split_frontmatter(chunk_path.read_text(encoding="utf-8"))
        en, pt = extract_bilingual(body)
        if not en and not pt:
            # fall back to extracted_text / image_description fields
            en = (fm.get("image_description_en") or fm.get("extracted_text") or "").strip().strip('"\'')
            pt = (fm.get("image_description_pt_br") or "").strip().strip('"\'')

        # The naive frontmatter parser keeps quotes, so strip them the same
        # way as the description fields; skip YAML null spellings.
        cls = (fm.get("classification") or "").strip().strip('"\'')
        if cls and cls.lower() != "null" and cls != "~":
            classifications.add(cls)

        # Body block
        block = f"### Chunk `{cid}` — type: {ctype}\n"
        bbox = c.get("bbox")
        if isinstance(bbox, dict) and bbox:
            block += f"_bbox_: x={bbox.get('x')}, y={bbox.get('y')}, w={bbox.get('w')}, h={bbox.get('h')}\n\n"
        if en:
            block += f"**EN:** {en}\n\n"
        if pt:
            block += f"**PT-BR:** {pt}\n"
        body_blocks.append(block.rstrip())

    content_class = _content_classification(
        has_image, has_table, has_stamp, has_signature, has_redaction
    )
    # Page-level inferred type (best-effort)
    page_type = _infer_page_type(types_seen, has_table, bool(body_blocks))

    # Frontmatter
    # NOTE(review): the "raw/--subagent/chunks/" path in source_note looks like
    # it lost a doc-id placeholder; left untouched because it is emitted text.
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "page",
        "page_id": f"{doc_id}/p{page_num:03d}",
        "doc_id": doc_id,
        "page_number": page_num,
        "total_pages": total_pages,
        "png_path": rel_png,
        "page_type": page_type,
        "content_classification": content_class,
        "classification_markings": [{"level": level} for level in sorted(classifications)],
        "chunks_on_page": chunk_ids,
        "chunk_count": len(chunk_ids),
        "source": "chunk-aggregator",
        "source_note": "Page-md generated from chunks built by Sonnet vision (raw/--subagent/chunks/). Per-page vision Haiku pipeline (02-vision-page.py) never produced an output for this page.",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
    }

    # Third-party dependency, imported lazily so --dry-run and the pure
    # helpers keep working on machines without PyYAML installed.
    import yaml

    yaml_block = yaml.safe_dump(
        fm,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
        width=10_000,
    ).rstrip()

    body = "\n\n".join(body_blocks) if body_blocks else "_(no extractable text — see chunk files directly)_"
    return f"---\n{yaml_block}\n---\n\n# Page {page_num} of {doc_id}\n\n{body}\n"


def main() -> int:
    """CLI entry point: inventory missing pages, then write aggregator files.

    Returns the process exit code (always 0). With ``--dry-run`` only the
    per-doc counts are printed and nothing is written.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id", default=None)
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    missing = find_missing_pages()
    if args.doc_id:
        missing = {args.doc_id: missing.get(args.doc_id, [])}

    total_missing = sum(len(ps) for ps in missing.values())
    print(f"[1/2] Inventory: {sum(1 for d, ps in missing.items() if ps)} docs, {total_missing} missing pages")

    if args.dry_run:
        # Largest backlogs first.
        for d, ps in sorted(missing.items(), key=lambda kv: -len(kv[1])):
            if ps:
                print(f"  {d}: {len(ps)}")
        return 0

    print("\n[2/2] Generating thin aggregator page.md files ...")
    written = 0
    skipped_no_chunks = 0
    for doc_id, pages in missing.items():
        for n in pages:
            md = build_page_md(doc_id, n)
            if md is None:
                skipped_no_chunks += 1
                continue
            out = PAGES_BASE / doc_id / f"p{n:03d}.md"
            out.parent.mkdir(parents=True, exist_ok=True)
            out.write_text(md, encoding="utf-8")
            written += 1

    print(f"  written: {written}")
    print(f"  skipped (no chunk data): {skipped_no_chunks}")
    print("\n✓ done.")
    return 0


if __name__ == "__main__":
    sys.exit(main())