disclosure-bureau/scripts/26-chunk-harness.py

#!/usr/bin/env python3
"""
26-chunk-harness.py — Deterministic harness that assembles document.md
from raw/<doc-id>/chunks/*.md + _index.json.

Use to:
  - Verify chunks are losslessly assemblable
  - Re-render document.md after manual chunk edits
  - Generate alternate views (single-language, text-only prose; HTML/PDF output is not implemented in this script)

Usage:
  ./26-chunk-harness.py --doc-id <id>                          # rebuild document.md
  ./26-chunk-harness.py --doc-id <id> --validate               # just check structure
  ./26-chunk-harness.py --doc-id <id> --lang pt-br             # render only PT-BR
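  ./26-chunk-harness.py --doc-id <id> --prose --lang en        # text-only reading view (needs --lang en|pt-br)
  ./26-chunk-harness.py --doc-id <id> --root <dir>             # look for <doc-id>/ under another directory
  (--format html is not implemented: the argument parser does not define --format)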
"""
from __future__ import annotations

import argparse
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)


# Default project root. main() uses UFO_ROOT / "raw" as the default for --root.
# NOTE(review): absolute path into one user's home directory — on any other
# machine the --root option has to be passed explicitly.
UFO_ROOT = Path("/Users/guto/ufo")


# Chunk type names that canonicalize_type() returns unchanged. Every value in
# TYPE_NORMALIZER is a member of this set.
CANONICAL_TYPES = {
    "letterhead", "address_block", "classification_marking", "heading",
    "paragraph", "form_field", "bulleted_item", "numbered_item", "quote_block",
    "caption", "table_marker", "image", "stamp", "signature", "marginalia",
    "redaction", "footer", "blank_area", "unknown",
}

# Alias -> canonical type name. canonicalize_type() consults this table only
# for names that are not already in CANONICAL_TYPES; names found in neither
# place are passed through unchanged.
TYPE_NORMALIZER = {
    "body_paragraph": "paragraph",
    "narrative": "paragraph",
    "prose": "paragraph",
    "body_text": "paragraph",
    "classification_banner": "classification_marking",
    "security_banner": "classification_marking",
    "classification_label": "classification_marking",
    "header_block": "heading",
    "section_header": "heading",
    "subject_line": "heading",
    "doc_title": "heading",
    "agenda_heading": "heading",
    "addressee_block": "address_block",
    "distribution_list": "address_block",
    "routing_block": "address_block",
    "to_block": "address_block",
    "from_block": "address_block",
    "signature_block": "signature",
    "sig": "signature",
    "form_reference": "form_field",
    "field": "form_field",
    "label_value": "form_field",
    "kv_field": "form_field",
}


def canonicalize_type(t: str) -> str:
    """Map a chunk type name onto its canonical form.

    Names already in CANONICAL_TYPES are returned untouched, known aliases are
    translated through TYPE_NORMALIZER, and anything else passes through
    unchanged.
    """
    return t if t in CANONICAL_TYPES else TYPE_NORMALIZER.get(t, t)


def _shallow_yaml_extract(text: str) -> dict:
    """Best-effort key:value extraction when full yaml parse fails (broken quotes etc).

    Only handles top-level scalar fields — drops broken arrays / objects.
    Enough for the harness to render bodies + render basic metadata.
    """
    out: dict = {}
    for line in text.splitlines():
        # only treat lines that look like `key: value` (no indentation)
        if not line or line[0] in (" ", "\t", "-"):
            continue
        m = re.match(r"^([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*(.*)$", line)
        if not m:
            continue
        k, v = m.group(1), m.group(2).strip()
        if v.startswith("{") or v.startswith("["):
            # complex — skip rather than parse partial
            continue
        if v == "null" or v == "":
            out[k] = None
        elif v.lower() == "true":
            out[k] = True
        elif v.lower() == "false":
            out[k] = False
        elif re.match(r"^-?\d+\.\d+$", v):
            out[k] = float(v)
        elif re.match(r"^-?\d+$", v):
            out[k] = int(v)
        elif (v[0] == v[-1]) and v[0] in ('"', "'"):
            out[k] = v[1:-1]
        else:
            out[k] = v
    return out


def read_chunk(path: Path) -> tuple[dict, str]:
    """Split a chunk file into ``(frontmatter, body)``.

    A chunk file optionally starts with a ``---`` delimited YAML frontmatter
    block, followed by the body text.

    Args:
        path: Chunk file to read (decoded as UTF-8).

    Returns:
        ``(fm, body)`` where ``fm`` is always a dict:
          * no leading ``---``: ``({}, <entire file>)``;
          * leading ``---`` but no closing ``---`` line:
            ``({"_yaml_error": True}, <entire file>)``;
          * frontmatter that fails to parse, or parses to something other
            than a mapping: the fields recovered by ``_shallow_yaml_extract``
            plus ``"_yaml_error": True``.
        Leading newlines are stripped from ``body``.

    Raises:
        OSError / UnicodeDecodeError: if the file cannot be read or decoded.
    """
    content = path.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return {}, content

    # The closing delimiter has to sit on a line of its own. A plain substring
    # search would stop at any "---" inside a frontmatter value, and would
    # return -1 (silently slicing garbage) when there is no delimiter at all.
    closing = re.compile(r"^---[ \t]*$", re.MULTILINE).search(content, 3)
    if closing is None:
        # Unterminated frontmatter: there is no reliable split point, so keep
        # the whole file as the body and flag the problem for validate().
        return {"_yaml_error": True}, content

    fm_text = content[3:closing.start()].strip()
    body = content[closing.end():].lstrip("\n")
    try:
        fm = yaml.safe_load(fm_text) or {}
    except yaml.YAMLError:
        # Malformed frontmatter (broken quotes, unclosed brackets).
        fm = None
    if not isinstance(fm, dict):
        # Degrade gracefully: callers rely on fm.get(), so never hand back a
        # scalar or a list.
        fm = _shallow_yaml_extract(fm_text)
        fm["_yaml_error"] = True
    return fm, body


def validate(doc_dir: Path) -> list[str]:
    """Check that a document directory is structurally sound.

    Checks performed:
      * ``_index.json`` exists, is valid UTF-8 JSON and is a JSON object;
      * every index entry has a ``chunk_id`` with a matching
        ``chunks/<chunk_id>.md`` file;
      * each chunk has parseable frontmatter, a ``type`` and a non-empty body;
      * a chunk's ``related_image`` (if any) exists under ``images/``;
      * no ``chunks/c*.md`` file is missing from the index (orphans).

    Args:
        doc_dir: Directory holding ``_index.json``, ``chunks/`` and ``images/``.

    Returns:
        Human-readable error strings, empty if the document is valid. Index
        entries are reported in index order, orphans in sorted filename order.
    """
    errors: list[str] = []

    index_path = doc_dir / "_index.json"
    if not index_path.exists():
        errors.append("missing _index.json")
        return errors

    try:
        index = json.loads(index_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        errors.append(f"_index.json malformed: {e}")
        return errors
    if not isinstance(index, dict):
        # Valid JSON, but e.g. a bare list — report it instead of crashing on .get().
        errors.append(
            f"_index.json malformed: expected a JSON object, got {type(index).__name__}"
        )
        return errors

    chunks_dir = doc_dir / "chunks"
    expected_ids = set()
    for entry in index.get("chunks") or []:
        cid = entry.get("chunk_id") if isinstance(entry, dict) else None
        if not cid:
            errors.append(f"index entry missing chunk_id: {entry}")
            continue
        expected_ids.add(cid)
        chunk_path = chunks_dir / f"{cid}.md"
        if not chunk_path.exists():
            errors.append(f"chunk file missing: {chunk_path}")
            continue
        try:
            fm, body = read_chunk(chunk_path)
        except Exception as e:
            # Deliberately broad: any read/decode failure is a validation
            # finding for this chunk, not a reason to abort the whole run.
            errors.append(f"chunk {cid} unreadable: {e}")
            continue
        if not isinstance(fm, dict):
            errors.append(f"chunk {cid}: frontmatter is not a mapping")
            continue
        if fm.get("_yaml_error"):
            errors.append(f"chunk {cid}: YAML frontmatter malformed (shallow-parsed; body OK)")
        if not fm.get("type"):
            errors.append(f"chunk {cid}: missing type")
        if not body.strip():
            errors.append(f"chunk {cid}: empty body")
        related_image = fm.get("related_image")
        if related_image:
            img_path = doc_dir / "images" / related_image
            if not img_path.exists():
                errors.append(f"chunk {cid}: related_image missing on disk: {related_image}")

    # Chunk files on disk that the index does not mention (orphans). Sorted so
    # the report does not depend on directory listing order.
    if chunks_dir.exists():
        for chunk_file in sorted(chunks_dir.glob("c*.md")):
            cid = chunk_file.stem
            if cid not in expected_ids:
                errors.append(f"orphan chunk file (not in index): {cid}")

    return errors


# Chunk types that assemble_prose() renders; every other type is skipped in
# the reading view. Membership is tested on the result of canonicalize_type(),
# so an alias listed in TYPE_NORMALIZER is looked up here under its canonical
# name, not its original one.
# NOTE(review): "signature_block" is normalized to "signature", which is not in
# this set, so the "signature_block" entry below never matches and such chunks
# are left out of the prose view — confirm whether that is intended.
TEXTUAL_TYPES = {
    # Canonical type names
    "letterhead", "address_block", "classification_marking", "heading",
    "paragraph", "form_field", "bulleted_item", "numbered_item", "quote_block",
    "caption", "footer",
    # Non-canonical variations invented by the agent (kept as-is)
    "body_paragraph", "header_block", "header", "section_header",
    "subject_line", "addressee_block", "form_reference", "distribution_list",
    "transcript_block", "to_from_line", "date_line", "list_item",
    "page_number", "title_block", "narrative_paragraph", "signature_block",
    "handwriting", "marginalia_note",
}


def assemble_prose(doc_dir: Path, lang: str) -> str:
    """Build the clean reading view: textual chunks only, one language, page by page.

    Chunks are grouped by their index ``page`` and ordered by
    ``order_in_page``. A chunk is rendered only if its canonicalized index
    ``type`` is in TEXTUAL_TYPES and its body has a non-empty line starting
    with the language marker; only the first such line is used.

    Args:
        doc_dir: Directory holding ``_index.json`` and ``chunks/``.
        lang: ``"en"`` selects the ``**EN:**`` line; any other value selects
            ``**PT-BR:**``. Page headings are Portuguese only for ``"pt-br"``.

    Returns:
        The assembled markdown text.

    Raises:
        OSError, json.JSONDecodeError, KeyError: if the index or a referenced
            chunk file is missing or malformed, or an entry lacks ``chunk_id``.
    """
    index = json.loads((doc_dir / "_index.json").read_text(encoding="utf-8"))
    chunks_meta = index.get("chunks", [])
    by_page: dict[int, list[dict]] = {}
    for c in chunks_meta:
        by_page.setdefault(c.get("page", 0), []).append(c)
    for page_chunks in by_page.values():
        page_chunks.sort(key=lambda x: x.get("order_in_page", 0))

    chunks_dir = doc_dir / "chunks"
    # Same fallbacks as assemble_markdown(), so a sparse index does not render
    # as "# None" / "None páginas".
    out: list[str] = [
        f"# {index.get('doc_id', doc_dir.name)}",
        "",
        f"> {index.get('total_pages', '?')} páginas · {len(chunks_meta)} chunks · idioma: {lang}",
        "",
    ]

    marker = "**EN:**" if lang == "en" else "**PT-BR:**"
    for page_num in sorted(by_page.keys()):
        out.append(f"## Página {page_num}" if lang == "pt-br" else f"## Page {page_num}")
        out.append("")
        for c in by_page[page_num]:
            canonical = canonicalize_type(c.get("type", ""))
            if canonical not in TEXTUAL_TYPES:
                continue
            _, body = read_chunk(chunks_dir / f"{c['chunk_id']}.md")
            # Text of the first line carrying the requested language marker.
            text = ""
            for line in body.split("\n"):
                s = line.strip()
                if s.startswith(marker):
                    text = s.removeprefix(marker).strip()
                    break
            if not text:
                continue
            if canonical == "heading":
                out.append(f"### {text}")
            elif canonical == "classification_marking":
                out.append(f"_{text}_")
            elif canonical in ("bulleted_item", "numbered_item"):
                out.append(f"- {text}")
            elif canonical == "quote_block":
                out.append(f"> {text}")
            else:
                out.append(text)
            out.append("")
        out.append("")
    return "\n".join(out)


def _format_bbox(bbox: object) -> str:
    """Render a bbox mapping as ``x/y/w/h`` with two decimals; unusable parts become 0.00."""
    if not isinstance(bbox, dict):
        bbox = {}
    parts: list[str] = []
    for key in ("x", "y", "w", "h"):
        try:
            parts.append(f"{float(bbox.get(key) or 0):.2f}")
        except (TypeError, ValueError):
            # Null / non-numeric coordinate (e.g. from shallow-parsed frontmatter).
            parts.append("0.00")
    return "/".join(parts)


def assemble_markdown(doc_dir: Path, lang: str = "both") -> str:
    """Assemble the master markdown document from ``_index.json`` and ``chunks/``.

    The output starts with a YAML frontmatter block (chunk count, type
    histogram taken from the index, ids of chunks whose frontmatter sets
    ``ufo_anomaly_detected`` / ``cryptid_anomaly_detected``, build metadata
    and a UTC timestamp), followed by one ``## Page N`` section per page.
    Each chunk gets an HTML anchor, a heading with its frontmatter type and
    bbox, its body, and an image link when ``related_image`` is set.

    Args:
        doc_dir: Directory holding ``_index.json`` and ``chunks/``.
        lang: ``"both"`` emits each body as-is; ``"en"`` / ``"pt-br"`` keep
            only the body lines starting with ``**EN:**`` / ``**PT-BR:**``.

    Returns:
        The assembled markdown text.

    Raises:
        OSError, json.JSONDecodeError: if the index or a referenced chunk file
            is missing or malformed.
    """
    index = json.loads((doc_dir / "_index.json").read_text(encoding="utf-8"))
    doc_id = index.get("doc_id", doc_dir.name)
    chunks_meta = index.get("chunks", [])

    # Group by page, ordered within each page.
    by_page: dict[int, list[dict]] = {}
    for c in chunks_meta:
        by_page.setdefault(c.get("page", 0), []).append(c)
    for page_chunks in by_page.values():
        page_chunks.sort(key=lambda x: x.get("order_in_page", 0))

    # Summary stats. The histogram stays a plain dict because it is both
    # dumped to YAML and interpolated into the summary line.
    type_hist: dict[str, int] = {}
    for c in chunks_meta:
        chunk_type = c.get("type", "unknown")
        type_hist[chunk_type] = type_hist.get(chunk_type, 0) + 1

    # Read every chunk file once; the flag scan and the page rendering below
    # share the parsed result.
    chunks_dir = doc_dir / "chunks"
    loaded: dict = {}
    ufo_flags: list[str] = []
    cryptid_flags: list[str] = []
    for entry in chunks_meta:
        cid = entry.get("chunk_id")
        if cid not in loaded:
            loaded[cid] = read_chunk(chunks_dir / f"{cid}.md")
        fm, _ = loaded[cid]
        if fm.get("ufo_anomaly_detected"):
            ufo_flags.append(cid)
        if fm.get("cryptid_anomaly_detected"):
            cryptid_flags.append(cid)

    out: list[str] = []
    out.append("---")
    out.append(yaml.dump({
        "schema_version": "0.2.0",
        "type": "master_document",
        "doc_id": doc_id,
        "total_pages": index.get("total_pages"),
        "total_chunks": len(chunks_meta),
        "chunk_types_histogram": type_hist,
        "ufo_anomalies_flagged": ufo_flags,
        "cryptid_anomalies_flagged": cryptid_flags,
        "build_approach": "subagents+harness",
        "build_model": "claude-sonnet-4-6",
        "assembled_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }, sort_keys=False, allow_unicode=True).rstrip())
    out.append("---")
    out.append("")
    out.append(f"# {doc_id}")
    out.append("")
    out.append(f"> **{len(chunks_meta)} chunks** across **{index.get('total_pages', '?')} pages** · types: {type_hist}")
    if ufo_flags:
        out.append(f"> 🛸 **UAP anomalies flagged in chunks:** {', '.join(ufo_flags)}")
    out.append("")

    # None means "emit the body untouched" (lang == "both").
    marker = {"en": "**EN:**", "pt-br": "**PT-BR:**"}.get(lang)

    for page_num in sorted(by_page.keys()):
        out.append(f"## Page {page_num}")
        out.append("")
        for c in by_page[page_num]:
            cid = c.get("chunk_id")
            fm, body = loaded[cid]
            bbox_str = _format_bbox(fm.get("bbox"))
            out.append(f'<a id="{cid}"></a>')
            out.append(f"### Chunk {cid} — {fm.get('type','?')} · p{page_num} · bbox: {bbox_str}")
            out.append("")

            # Bodies already carry **EN:** and **PT-BR:** lines; for a single
            # language keep only the lines starting with its marker.
            if marker is None:
                out.append(body.rstrip())
            else:
                out.extend(
                    line for line in body.split("\n") if line.strip().startswith(marker)
                )
            out.append("")

            # Embed the chunk's image when it has one.
            if fm.get("related_image"):
                out.append(f"![chunk image](./images/{fm['related_image']})")
                out.append("")
            out.append("---")
            out.append("")

    return "\n".join(out)


def main():
    """Command-line entry point.

    Modes:
      * ``--validate``: run validate() and print the result; exits 1 if any
        error is found. Nothing is written.
      * ``--prose``: write ``document.prose.<lang>.md`` (needs ``--lang en``
        or ``--lang pt-br``).
      * default: write ``document.md`` or, for a single language,
        ``document.<lang>.md``.

    Exits with status 1 when the document directory does not exist, when
    validation fails, or when ``--prose`` is used without a single language.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id", required=True)
    ap.add_argument("--validate", action="store_true")
    ap.add_argument("--lang", choices=["both", "en", "pt-br"], default="both")
    ap.add_argument("--prose", action="store_true", help="Produce text-only reading view (no bbox/metadata, only textual chunks)")
    ap.add_argument("--root", default=str(UFO_ROOT / "raw"))
    args = ap.parse_args()

    doc_dir = Path(args.root) / args.doc_id
    if not doc_dir.exists():
        sys.stderr.write(f"✗ Doc dir not found: {doc_dir}\n"); sys.exit(1)

    if args.validate:
        errs = validate(doc_dir)
        if errs:
            print(f"✗ {len(errs)} validation errors:")
            shown = errs[:50]
            for e in shown:
                print(f"  · {e}")
            if len(errs) > len(shown):
                # Make the truncation visible instead of silently dropping the rest.
                print(f"  · … and {len(errs) - len(shown)} more")
            sys.exit(1)
        index = json.loads((doc_dir / "_index.json").read_text(encoding="utf-8"))
        print(f"✓ {len(index.get('chunks', []))} chunks validated across {index.get('total_pages', '?')} pages")
        return

    if args.prose:
        if args.lang == "both":
            sys.stderr.write("--prose requires --lang en or --lang pt-br\n"); sys.exit(1)
        md = assemble_prose(doc_dir, lang=args.lang)
        out_path = doc_dir / f"document.prose.{args.lang}.md"
    else:
        md = assemble_markdown(doc_dir, lang=args.lang)
        out_path = doc_dir / ("document.md" if args.lang == "both" else f"document.{args.lang}.md")
    out_path.write_text(md, encoding="utf-8")
    # len(md) counts characters; the documents contain non-ASCII text, so
    # measure the encoded form to report real bytes.
    print(f"✓ Wrote {out_path} ({len(md.encode('utf-8'))} bytes)")


if __name__ == "__main__":
    main()