#!/usr/bin/env python3
"""
26-chunk-harness.py — Deterministic harness that assembles document.md from
raw/<doc-id>/chunks/*.md + raw/<doc-id>/_index.json.

Use to:
  - Verify chunks are losslessly assemblable
  - Re-render document.md after manual chunk edits
  - Generate alternate views (single-language, text-only prose)

Usage:
  ./26-chunk-harness.py --doc-id <doc-id>                    # rebuild document.md
  ./26-chunk-harness.py --doc-id <doc-id> --validate         # just check structure
  ./26-chunk-harness.py --doc-id <doc-id> --lang pt-br       # render only PT-BR
  ./26-chunk-harness.py --doc-id <doc-id> --lang en --prose  # text-only reading view

NOTE: HTML/PDF rendering is not implemented; there is no --format option.
"""
from __future__ import annotations

import argparse
import json
import re
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    # Do not kill the interpreter at import time: main() reports the missing
    # dependency (same message, same exit code), and the helpers that do not
    # need PyYAML stay importable.
    yaml = None

UFO_ROOT = Path("/Users/guto/ufo")

CANONICAL_TYPES = {
    "letterhead", "address_block", "classification_marking", "heading",
    "paragraph", "form_field", "bulleted_item", "numbered_item",
    "quote_block", "caption", "table_marker", "image", "stamp", "signature",
    "marginalia", "redaction", "footer", "blank_area", "unknown",
}

# Non-canonical type names mapped onto their canonical equivalent.
TYPE_NORMALIZER = {
    "body_paragraph": "paragraph",
    "narrative": "paragraph",
    "prose": "paragraph",
    "body_text": "paragraph",
    "classification_banner": "classification_marking",
    "security_banner": "classification_marking",
    "classification_label": "classification_marking",
    "header_block": "heading",
    "section_header": "heading",
    "subject_line": "heading",
    "doc_title": "heading",
    "agenda_heading": "heading",
    "addressee_block": "address_block",
    "distribution_list": "address_block",
    "routing_block": "address_block",
    "to_block": "address_block",
    "from_block": "address_block",
    "signature_block": "signature",
    "sig": "signature",
    "form_reference": "form_field",
    "field": "form_field",
    "label_value": "form_field",
    "kv_field": "form_field",
}

# Hoisted so the patterns are compiled once rather than per line / per chunk.
_KEY_VALUE_RE = re.compile(r"^([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*(.*)$")
_FLOAT_RE = re.compile(r"^-?\d+\.\d+$")
_INT_RE = re.compile(r"^-?\d+$")
# Closing front-matter delimiter: a line consisting only of `---`.
_FRONTMATTER_CLOSE_RE = re.compile(r"^---[ \t]*$", re.MULTILINE)


def _require_yaml():
    """Return the PyYAML module, raising RuntimeError if it is not installed."""
    if yaml is None:
        raise RuntimeError("PyYAML is required: pip3 install pyyaml")
    return yaml


def canonicalize_type(t: str) -> str:
    """Map a chunk type onto its canonical name.

    Canonical names are returned unchanged, known aliases are translated via
    TYPE_NORMALIZER, and anything else is passed through as-is.
    """
    if t in CANONICAL_TYPES:
        return t
    return TYPE_NORMALIZER.get(t, t)


def _shallow_yaml_extract(text: str) -> dict:
    """Best-effort key:value extraction when full yaml parse fails (broken quotes etc).

    Only handles top-level scalar fields — drops broken arrays / objects.
    Enough for the harness to render bodies + render basic metadata.
    """
    out: dict = {}
    for line in text.splitlines():
        # Only treat lines that look like `key: value` (no indentation, not a
        # list item).
        if not line or line[0] in (" ", "\t", "-"):
            continue
        m = _KEY_VALUE_RE.match(line)
        if not m:
            continue
        k, v = m.group(1), m.group(2).strip()
        if v.startswith(("{", "[")):
            # Complex value — skip rather than parse a partial structure.
            continue
        if v in ("null", ""):
            out[k] = None
        elif v.lower() == "true":
            out[k] = True
        elif v.lower() == "false":
            out[k] = False
        elif _FLOAT_RE.match(v):
            out[k] = float(v)
        elif _INT_RE.match(v):
            out[k] = int(v)
        elif len(v) >= 2 and v[0] == v[-1] and v[0] in ('"', "'"):
            # Length check: a lone quote character is not a quoted string.
            out[k] = v[1:-1]
        else:
            out[k] = v
    return out


def read_chunk(path: Path) -> tuple[dict, str]:
    """Read one chunk file and split it into (front matter, body).

    Returns:
        A `(fm, body)` tuple. `fm` is always a dict:
          - `{}` when the file has no front matter;
          - the parsed mapping when the front matter is valid YAML;
          - a shallow key/value extraction with `_yaml_error: True` when the
            YAML is malformed or is not a mapping;
          - `{"_yaml_error": True}` (body = whole file) when the opening
            `---` is never closed.

    Raises:
        OSError / UnicodeDecodeError: if the file cannot be read as UTF-8.
        RuntimeError: if front matter is present but PyYAML is not installed.
    """
    content = path.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return {}, content

    # Look for the closing delimiter as a whole line, so a `---` embedded in
    # a value does not truncate the front matter.
    closing = _FRONTMATTER_CLOSE_RE.search(content, 3)
    if closing is None:
        # Unterminated front matter: we cannot tell where the body starts, so
        # keep the whole file as body and flag the problem for validate().
        return {"_yaml_error": True}, content

    fm_text = content[3:closing.start()].strip()
    body = content[closing.end():].lstrip("\n")

    yaml_mod = _require_yaml()
    try:
        fm = yaml_mod.safe_load(fm_text) or {}
    except (yaml_mod.YAMLError, ValueError):
        # Malformed front matter (quoted strings, unclosed brackets) — degrade
        # gracefully. ValueError is included because some PyYAML constructors
        # raise it directly (e.g. for an impossible date).
        fm = None
    if not isinstance(fm, dict):
        # Parse failure, or valid YAML that is not a mapping (list / scalar):
        # callers rely on `fm.get(...)`.
        fm = _shallow_yaml_extract(fm_text)
        fm["_yaml_error"] = True
    return fm, body


def validate(doc_dir: Path) -> list[str]:
    """Return list of errors (empty if valid).

    Checks that `_index.json` exists and is a JSON object, that every indexed
    chunk has a readable file with a `type` and a non-empty body, that any
    `related_image` exists under `images/`, and that no `c*.md` file in
    `chunks/` is missing from the index.
    """
    errors: list[str] = []
    index_path = doc_dir / "_index.json"
    if not index_path.exists():
        errors.append("missing _index.json")
        return errors
    try:
        index = json.loads(index_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        errors.append(f"_index.json malformed: {e}")
        return errors
    if not isinstance(index, dict):
        errors.append("_index.json malformed: top-level value is not an object")
        return errors

    chunks_dir = doc_dir / "chunks"
    expected_ids = set()
    for entry in index.get("chunks") or []:
        if not isinstance(entry, dict):
            errors.append(f"index entry is not an object: {entry}")
            continue
        cid = entry.get("chunk_id")
        if not cid:
            errors.append(f"index entry missing chunk_id: {entry}")
            continue
        expected_ids.add(cid)
        chunk_path = chunks_dir / f"{cid}.md"
        if not chunk_path.exists():
            errors.append(f"chunk file missing: {chunk_path}")
            continue
        try:
            fm, body = read_chunk(chunk_path)
        except Exception as e:
            # Deliberately broad: validation reports problems, it must not
            # crash on a single bad chunk.
            errors.append(f"chunk {cid} unreadable: {e}")
            continue
        if fm.get("_yaml_error"):
            errors.append(f"chunk {cid}: YAML frontmatter malformed (shallow-parsed; body OK)")
        if not fm.get("type"):
            errors.append(f"chunk {cid}: missing type")
        if not body.strip():
            errors.append(f"chunk {cid}: empty body")
        related_image = fm.get("related_image")
        if related_image:
            img_path = doc_dir / "images" / str(related_image)
            if not img_path.exists():
                errors.append(f"chunk {cid}: related_image missing on disk: {related_image}")

    # Check chunk files that aren't in the index (orphans). Sorted so the
    # report order does not depend on directory listing order.
    if chunks_dir.exists():
        for chunk_file in sorted(chunks_dir.glob("c*.md")):
            cid = chunk_file.stem
            if cid not in expected_ids:
                errors.append(f"orphan chunk file (not in index): {cid}")
    return errors


TEXTUAL_TYPES = {
    # Canonical
    "letterhead", "address_block", "classification_marking", "heading",
    "paragraph", "form_field", "bulleted_item", "numbered_item",
    "quote_block", "caption", "footer",
    # Variations the agent invented (kept as-is)
    "body_paragraph", "header_block", "header", "section_header",
    "subject_line", "addressee_block", "form_reference", "distribution_list",
    "transcript_block", "to_from_line", "date_line", "list_item",
    "page_number", "title_block", "narrative_paragraph", "signature_block",
    "handwriting", "marginalia_note",
}


def _load_index(doc_dir: Path) -> dict:
    """Load and return the parsed `_index.json` of a document directory."""
    return json.loads((doc_dir / "_index.json").read_text(encoding="utf-8"))


def _group_by_page(chunks_meta: list[dict]) -> dict[int, list[dict]]:
    """Group index entries by page, each group sorted by `order_in_page`.

    A missing or null `page` / `order_in_page` is treated as 0 so that the
    keys and sort keys stay mutually comparable.
    """
    by_page: dict[int, list[dict]] = {}
    for entry in chunks_meta:
        by_page.setdefault(entry.get("page") or 0, []).append(entry)
    for page_chunks in by_page.values():
        page_chunks.sort(key=lambda e: e.get("order_in_page") or 0)
    return by_page


def _format_bbox(bbox) -> str:
    """Render a bbox mapping as `x/y/w/h` with two decimals.

    Anything that is not a mapping, and any missing or non-numeric component,
    is rendered as 0.00 instead of raising.
    """
    if not isinstance(bbox, dict):
        bbox = {}
    parts = []
    for key in ("x", "y", "w", "h"):
        try:
            value = float(bbox.get(key, 0))
        except (TypeError, ValueError):
            value = 0.0
        parts.append(f"{value:.2f}")
    return "/".join(parts)


def assemble_prose(doc_dir: Path, lang: str) -> str:
    """Clean reading view: just the textual content in the chosen language, page by page.

    For each chunk whose canonical type is in TEXTUAL_TYPES, the first body
    line starting with the language marker (`**EN:**` when `lang == "en"`,
    otherwise `**PT-BR:**`) is emitted with light markdown styling by type.
    Chunks without such a line are skipped.
    """
    index = _load_index(doc_dir)
    chunks_meta = index.get("chunks") or []
    by_page = _group_by_page(chunks_meta)
    chunks_dir = doc_dir / "chunks"

    out: list[str] = []
    # Fall back to the directory name, as assemble_markdown does.
    out.append(f"# {index.get('doc_id', doc_dir.name)}")
    out.append("")
    out.append(f"> {index.get('total_pages')} páginas · {len(chunks_meta)} chunks · idioma: {lang}")
    out.append("")

    marker = "**EN:**" if lang == "en" else "**PT-BR:**"
    for page_num in sorted(by_page):
        out.append(f"## Página {page_num}" if lang == "pt-br" else f"## Page {page_num}")
        out.append("")
        for c in by_page[page_num]:
            canonical = canonicalize_type(c.get("type", ""))
            if canonical not in TEXTUAL_TYPES:
                continue
            _, body = read_chunk(chunks_dir / f"{c['chunk_id']}.md")
            text = ""
            for line in body.split("\n"):
                s = line.strip()
                if s.startswith(marker):
                    text = s.removeprefix(marker).strip()
                    break
            if not text:
                continue
            if canonical == "heading":
                out.append(f"### {text}")
            elif canonical == "classification_marking":
                out.append(f"_{text}_")
            elif canonical in ("bulleted_item", "numbered_item"):
                out.append(f"- {text}")
            elif canonical == "quote_block":
                out.append(f"> {text}")
            else:
                out.append(text)
            out.append("")
        out.append("")
    return "\n".join(out)


def assemble_markdown(doc_dir: Path, lang: str = "both") -> str:
    """Read _index.json + chunks/, return assembled markdown.

    The result starts with a YAML front-matter block of summary statistics,
    followed by one `## Page N` section per page and one `### Chunk …`
    section per chunk. With `lang` set to "en" or "pt-br" only the body lines
    starting with the matching marker are kept; otherwise the whole body is
    emitted.

    Raises:
        RuntimeError: if PyYAML is not installed.
        OSError: if the index or a chunk file listed in it cannot be read.
    """
    yaml_mod = _require_yaml()
    index = _load_index(doc_dir)
    doc_id = index.get("doc_id", doc_dir.name)
    chunks_meta = index.get("chunks") or []
    by_page = _group_by_page(chunks_meta)
    chunks_dir = doc_dir / "chunks"

    # Parse every chunk file exactly once; both the summary and the rendering
    # pass below read from this cache.
    parsed: dict = {}
    for entry in chunks_meta:
        cid = entry.get("chunk_id")
        if cid not in parsed:
            parsed[cid] = read_chunk(chunks_dir / f"{cid}.md")

    # Compute summary stats. Converted to a plain dict so that yaml.dump and
    # the summary line render it as an ordinary mapping.
    type_hist = dict(Counter(c.get("type", "unknown") for c in chunks_meta))
    ufo_flags: list[str] = []
    cryptid_flags: list[str] = []
    for entry in chunks_meta:
        cid = entry.get("chunk_id")
        fm, _ = parsed[cid]
        if fm.get("ufo_anomaly_detected"):
            ufo_flags.append(cid)
        if fm.get("cryptid_anomaly_detected"):
            cryptid_flags.append(cid)

    out: list[str] = []
    out.append("---")
    out.append(yaml_mod.dump({
        "schema_version": "0.2.0",
        "type": "master_document",
        "doc_id": doc_id,
        "total_pages": index.get("total_pages"),
        "total_chunks": len(chunks_meta),
        "chunk_types_histogram": type_hist,
        "ufo_anomalies_flagged": ufo_flags,
        "cryptid_anomalies_flagged": cryptid_flags,
        "build_approach": "subagents+harness",
        "build_model": "claude-sonnet-4-6",
        "assembled_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }, sort_keys=False, allow_unicode=True).rstrip())
    out.append("---")
    out.append("")
    out.append(f"# {doc_id}")
    out.append("")
    out.append(f"> **{len(chunks_meta)} chunks** across **{index.get('total_pages', '?')} pages** · types: {type_hist}")
    if ufo_flags:
        out.append(f"> 🛸 **UAP anomalies flagged in chunks:** {', '.join(map(str, ufo_flags))}")
    out.append("")

    for page_num in sorted(by_page):
        out.append(f"## Page {page_num}")
        out.append("")
        for c in by_page[page_num]:
            cid = c.get("chunk_id")
            fm, body = parsed[cid]
            bbox_str = _format_bbox(fm.get("bbox"))
            # NOTE(review): this statement was an empty f-string in the
            # source (possibly an HTML anchor lost in transit — confirm).
            # It emits a blank line, which is preserved here.
            out.append("")
            out.append(f"### Chunk {cid} — {fm.get('type','?')} · p{page_num} · bbox: {bbox_str}")
            out.append("")
            # Render body — body already has **EN:** and **PT-BR:** sections
            if lang == "en":
                # Extract only EN lines
                out.extend(
                    line for line in body.split("\n")
                    if line.strip().startswith("**EN:**")
                )
            elif lang == "pt-br":
                out.extend(
                    line for line in body.split("\n")
                    if line.strip().startswith("**PT-BR:**")
                )
            else:
                out.append(body.rstrip())
            out.append("")
            # Embed image if applicable
            if fm.get("related_image"):
                out.append(f"![chunk image](./images/{fm['related_image']})")
                out.append("")
            out.append("---")
            out.append("")
    return "\n".join(out)


def main():
    """CLI entry point: validate a document, or (re)write its assembled markdown.

    Exits with status 1 when PyYAML is missing, the document directory does
    not exist, validation fails, or --prose is used without a single language.
    """
    if yaml is None:
        sys.stderr.write("pip3 install pyyaml\n")
        sys.exit(1)

    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id", required=True)
    ap.add_argument("--validate", action="store_true")
    ap.add_argument("--lang", choices=["both", "en", "pt-br"], default="both")
    ap.add_argument("--prose", action="store_true",
                    help="Produce text-only reading view (no bbox/metadata, only textual chunks)")
    ap.add_argument("--root", default=str(UFO_ROOT / "raw"))
    args = ap.parse_args()

    doc_dir = Path(args.root) / args.doc_id
    if not doc_dir.exists():
        sys.stderr.write(f"✗ Doc dir not found: {doc_dir}\n")
        sys.exit(1)

    if args.validate:
        errs = validate(doc_dir)
        if errs:
            print(f"✗ {len(errs)} validation errors:")
            # Only the first 50 are listed to keep the report readable.
            for e in errs[:50]:
                print(f" · {e}")
            sys.exit(1)
        index = _load_index(doc_dir)
        print(f"✓ {len(index.get('chunks', []))} chunks validated across {index.get('total_pages', '?')} pages")
        return

    if args.prose:
        if args.lang == "both":
            sys.stderr.write("--prose requires --lang en or --lang pt-br\n")
            sys.exit(1)
        md = assemble_prose(doc_dir, lang=args.lang)
        out_path = doc_dir / f"document.prose.{args.lang}.md"
    else:
        md = assemble_markdown(doc_dir, lang=args.lang)
        out_path = doc_dir / ("document.md" if args.lang == "both" else f"document.{args.lang}.md")
    out_path.write_text(md, encoding="utf-8")
    # Report the encoded size: len(md) counts characters, not bytes.
    print(f"✓ Wrote {out_path} ({len(md.encode('utf-8'))} bytes)")


if __name__ == "__main__":
    main()