# disclosure-bureau/scripts/rebuild_doc65_assemble.py

#!/usr/bin/env python3
"""
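Assemble chunks/, _index.json, and document.md from _pages_raw.json
for doc-65-hs1-834228961-62-hq-83894-section-1.

Also:
- Crops image chunks out of the page PNGs using PIL (Pillow)
- Creates the tables/ directory; multi-page table detection/stitching is
  not implemented in this script (multi_page_tables is always written as [])
- Writes all output files (chunks/*.md, images/IMG-*.png, _index.json,
  document.md)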
"""
from __future__ import annotations

import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from collections import defaultdict

# Identity of the single document this script assembles.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-1"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 1 (FBI UAP/UFO Investigative File)"
# NOTE(review): absolute, machine-specific paths — the script only runs
# unmodified on a host that has this directory layout.
# PNG_DIR holds the page renders (p-NNN.png) that crop_image() reads.
PNG_DIR = Path("/Users/guto/ufo/processing/png/doc-65-hs1-834228961-62-hq-83894-section-1")
# OUTPUT_DIR holds the _pages_raw.json input and receives _index.json,
# document.md and the sub-directories below.
OUTPUT_DIR = Path("/Users/guto/ufo/raw/doc-65-hs1-834228961-62-hq-83894-section-1")
CHUNKS_DIR = OUTPUT_DIR / "chunks"
IMAGES_DIR = OUTPUT_DIR / "images"
# TABLES_DIR is created by main() but nothing in this script writes into it.
TABLES_DIR = OUTPUT_DIR / "tables"

# Hard-coded page count written into the index/document headers; it is not
# derived from the loaded page data.
TOTAL_PAGES = 150
# UTC timestamp captured once, when the module is imported/executed.
BUILD_AT = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
# Model label recorded verbatim in the output metadata.
BUILD_MODEL = "claude-haiku-4-5"


def load_pages() -> list[dict]:
    """Read and decode ``_pages_raw.json`` from the output directory.

    Returns:
        The decoded JSON payload. The rest of the script treats it as a list
        of per-page dicts; no validation is performed here.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    source = OUTPUT_DIR / "_pages_raw.json"
    return json.loads(source.read_text(encoding="utf-8"))


def normalize_chunk(chunk: dict, page_num: int) -> dict:
    """Return a copy of *chunk* with every expected field present and typed.

    Missing keys are filled from a fixed set of defaults; keys present in
    *chunk* win, and unknown extra keys are carried over unchanged. The
    input dict is not modified.

    Args:
        chunk: Raw chunk dict as found in the page data.
        page_num: Page number stored under ``"page"``, replacing any value
            the raw chunk carried.

    Returns:
        A new dict in which:
        * non-nullable fields (``order_in_page``, ``type``, ``content_en``,
          ``content_pt_br``, ``cross_page_hint``, ``ocr_confidence``) fall
          back to their default when the raw value is an explicit ``None``;
        * the two ``*_anomaly_detected`` flags are real booleans (the strings
          ``"true"``, ``"1"``, ``"yes"`` count as true, case-insensitively);
        * ``formatting`` and ``ocr_source_lines`` are lists;
        * ``bbox`` is a dict with exactly the float keys ``x``, ``y``, ``w``,
          ``h``; a missing, null or non-numeric component falls back to its
          default instead of raising.
    """
    defaults = {
        "order_in_page": 1,
        "type": "paragraph",
        "content_en": "",
        "content_pt_br": "",
        "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.05},
        "classification": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "ocr_confidence": 0.85,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
        "image_description_en": None,
        "image_description_pt_br": None,
        "extracted_text": None,
    }
    result = {**defaults, **chunk}

    # An explicit null in the raw data overrides the merged default above;
    # for fields that are written out unquoted or used as sort/group keys a
    # None must not survive, so restore the default.
    for required in (
        "order_in_page",
        "type",
        "content_en",
        "content_pt_br",
        "cross_page_hint",
        "ocr_confidence",
    ):
        if result[required] is None:
            result[required] = defaults[required]
    result["page"] = page_num

    # Normalize booleans
    for bool_field in ("ufo_anomaly_detected", "cryptid_anomaly_detected"):
        val = result.get(bool_field)
        if isinstance(val, str):
            result[bool_field] = val.lower() in ("true", "1", "yes")
        elif val is None:
            result[bool_field] = False
        else:
            result[bool_field] = bool(val)

    # Normalize list-valued fields: anything that is not a list is dropped.
    for list_field in ("formatting", "ocr_source_lines"):
        if not isinstance(result.get(list_field), list):
            result[list_field] = []

    # Normalize bbox: keep only x/y/w/h, as floats, one component at a time
    # so a single bad value does not discard the others or abort the run.
    raw_bbox = result.get("bbox")
    if not isinstance(raw_bbox, dict):
        raw_bbox = {}
    clean_bbox = {}
    for axis, fallback in defaults["bbox"].items():
        try:
            clean_bbox[axis] = float(raw_bbox.get(axis, fallback))
        except (TypeError, ValueError, OverflowError):
            clean_bbox[axis] = fallback
    result["bbox"] = clean_bbox

    return result


def assign_global_ids(pages: list[dict]) -> list[dict]:
    """Flatten all pages into one globally ordered, doubly linked chunk list.

    Pages are visited in the order given. Within a page, chunks are ordered
    by ``order_in_page`` (missing or null counts as 0; ties keep their
    original relative order). Each chunk is passed through
    ``normalize_chunk`` and then receives:

    * ``chunk_id``     — ``"c0001"``, ``"c0002"``, ... (zero-padded to 4)
    * ``order_global`` — the same running number as an int, starting at 1
    * ``prev_chunk`` / ``next_chunk`` — neighbouring chunk ids, ``None`` at
      the two ends of the list

    Args:
        pages: Per-page dicts; ``page_number`` (default 0) and ``chunks``
            (default empty; an explicit null is treated as empty) are read.
            The input is not modified.

    Returns:
        The flat list of normalized chunks in global order.
    """
    all_chunks: list[dict] = []

    for page_data in pages:
        page_num = page_data.get("page_number", 0)
        # sorted() is stable like list.sort() but leaves the caller's list
        # untouched; `or` maps a null chunk list / null order to a usable value.
        page_chunks = sorted(
            page_data.get("chunks") or [],
            key=lambda c: c.get("order_in_page") or 0,
        )

        for chunk in page_chunks:
            normalized = normalize_chunk(chunk, page_num)
            position = len(all_chunks) + 1
            normalized["chunk_id"] = f"c{position:04d}"
            normalized["order_global"] = position
            all_chunks.append(normalized)

    # Set prev/next pointers
    last_index = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last_index else None

    return all_chunks


def crop_image(chunk: dict) -> str | None:
    """Crop an image chunk's region out of its page PNG and save it.

    The chunk's ``bbox`` values are used as fractions of the page size
    (``x``/``w`` are scaled by the image width, ``y``/``h`` by its height).
    The rectangle is padded by 0.005 of the page size on every side and
    clamped to the image bounds before cropping.

    Args:
        chunk: Chunk dict providing ``page``, ``chunk_id`` and ``bbox``.

    Returns:
        The path of the saved ``IMG-<chunk_id>.png`` as a string, or ``None``
        when the page PNG does not exist, the resulting rectangle is empty,
        or any error occurs while cropping/saving (reported on stderr).
    """
    page_num = chunk["page"]
    chunk_id = chunk["chunk_id"]
    png_path = PNG_DIR / f"p-{page_num:03d}.png"
    if not png_path.exists():
        return None

    bbox = chunk["bbox"]
    out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"

    try:
        # Imported lazily so a missing Pillow install is reported per chunk
        # by the handler below instead of aborting the whole script.
        from PIL import Image

        # The context manager closes the underlying file handle; without it
        # one descriptor stays open for every cropped chunk.
        with Image.open(png_path) as im:
            W, H = im.size
            pad = 0.005
            x, y, w, h = bbox["x"], bbox["y"], bbox["w"], bbox["h"]
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                return None
            # Crop and save while the source image is still open.
            im.crop((left, top, right, bottom)).save(str(out_path))
        return str(out_path)
    except Exception as e:
        # Deliberate best-effort: one bad image must not stop the build.
        print(f"  Crop error for {chunk_id}: {e}", file=sys.stderr)
        return None


def write_chunk_file(chunk: dict, source_png_relative: str) -> None:
    """Write ``chunks/<chunk_id>.md``: YAML front matter plus EN/PT-BR text.

    The front matter lists the chunk's metadata in a fixed key order,
    followed by the ``**EN:**`` and ``**PT-BR:**`` content lines.

    Args:
        chunk: Chunk dict; ``chunk_id`` and ``bbox`` (with float ``x``, ``y``,
            ``w``, ``h``) are required, every other field falls back to a
            default when absent.
        source_png_relative: Value written verbatim as ``source_png``.

    Raises:
        KeyError: If ``chunk_id`` or ``bbox`` (or a bbox component) is missing.
        OSError: If the file cannot be written.
    """
    chunk_id = chunk["chunk_id"]
    chunk_type = chunk.get("type", "paragraph")
    bbox = chunk["bbox"]

    # Only image chunks reference a cropped file; the name matches the
    # IMG-<chunk_id>.png files produced by crop_image().
    related_image = f"IMG-{chunk_id}.png" if chunk_type == "image" else None

    def yaml_val(v):
        """Render *v* as an inline YAML value for the front matter."""
        if v is None:
            return "null"
        # bool must be tested before int: True/False are ints in Python.
        if isinstance(v, bool):
            return "true" if v else "false"
        if isinstance(v, (int, float)):
            return str(v)
        if isinstance(v, list):
            # Every item is emitted as a quoted string, properly escaped.
            return "[" + ", ".join(json.dumps(str(x), ensure_ascii=False) for x in v) + "]"
        # A JSON string literal is a valid YAML double-quoted scalar, and
        # json.dumps escapes backslashes, quotes, newlines and control
        # characters that would otherwise corrupt the front matter.
        return json.dumps(str(v), ensure_ascii=False)

    lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk_type}",
        f"page: {chunk.get('page', 0)}",
        f"order_in_page: {chunk.get('order_in_page', 1)}",
        f"order_global: {chunk.get('order_global', 1)}",
        f"bbox: {{x: {bbox['x']:.2f}, y: {bbox['y']:.2f}, w: {bbox['w']:.2f}, h: {bbox['h']:.2f}}}",
        f"classification: {yaml_val(chunk.get('classification'))}",
        f"formatting: {yaml_val(chunk.get('formatting', []))}",
        f"cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {yaml_val(chunk.get('prev_chunk'))}",
        f"next_chunk: {yaml_val(chunk.get('next_chunk'))}",
        f"related_image: {yaml_val(related_image)}",
        f"related_table: {yaml_val(chunk.get('related_table'))}",
        f"ocr_confidence: {chunk.get('ocr_confidence', 0.85)}",
        f"ocr_source_lines: {yaml_val(chunk.get('ocr_source_lines', []))}",
        f"redaction_code: {yaml_val(chunk.get('redaction_code'))}",
        f"redaction_inferred_content_type: {yaml_val(chunk.get('redaction_inferred_content_type'))}",
        f"image_type: {yaml_val(chunk.get('image_type'))}",
        f"ufo_anomaly_detected: {yaml_val(chunk.get('ufo_anomaly_detected', False))}",
        f"cryptid_anomaly_detected: {yaml_val(chunk.get('cryptid_anomaly_detected', False))}",
        f"ufo_anomaly_type: {yaml_val(chunk.get('ufo_anomaly_type'))}",
        f"ufo_anomaly_rationale: {yaml_val(chunk.get('ufo_anomaly_rationale'))}",
        f"cryptid_anomaly_type: {yaml_val(chunk.get('cryptid_anomaly_type'))}",
        f"cryptid_anomaly_rationale: {yaml_val(chunk.get('cryptid_anomaly_rationale'))}",
        f"image_description_en: {yaml_val(chunk.get('image_description_en'))}",
        f"image_description_pt_br: {yaml_val(chunk.get('image_description_pt_br'))}",
        f"extracted_text: {yaml_val(chunk.get('extracted_text'))}",
        f"source_png: {source_png_relative}",
        "---",
        "",
        f"**EN:** {chunk.get('content_en', '')}",
        "",
        f"**PT-BR:** {chunk.get('content_pt_br', '')}",
    ]

    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    out_path.write_text("\n".join(lines), encoding="utf-8")


def write_index(all_chunks: list[dict]) -> None:
    """Serialize the chunk manifest to ``_index.json`` in the output directory.

    The manifest carries document-level build metadata plus one entry per
    chunk (id, type, position, relative file path, bbox and the first 80
    characters of ``content_en`` as a preview).

    Args:
        all_chunks: Chunks in global order; each needs ``chunk_id`` and
            ``bbox``, other fields fall back to defaults.
    """

    def entry_for(chunk: dict) -> dict:
        # Preview first, then the entry, mirroring the per-chunk work order.
        snippet = chunk.get("content_en", "")[:80]
        return {
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 0),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": chunk["bbox"],
            "preview": snippet,
        }

    manifest = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": BUILD_MODEL,
        "build_at": BUILD_AT,
        "chunks": [entry_for(c) for c in all_chunks],
    }

    target = OUTPUT_DIR / "_index.json"
    with target.open("w", encoding="utf-8") as handle:
        json.dump(manifest, handle, ensure_ascii=False, indent=2)
    print(f"Written: {target}")


def write_document_md(all_chunks: list[dict], stats: dict) -> int:
    """Render every chunk into a single ``document.md`` and write it.

    The file starts with YAML front matter (document identity, chunk-type
    histogram, ids of chunks flagged as UFO/cryptid anomalies, build info)
    followed by one ``## Page N`` section per page. Each chunk contributes a
    heading with an anchor, its EN/PT-BR text, an embedded image when the
    cropped file exists, and a collapsible JSON metadata block.

    Args:
        all_chunks: Chunks in global order; each needs ``chunk_id``, ``page``
            and ``bbox`` (with float ``x``, ``y``, ``w``, ``h``).
        stats: Currently unused; kept so existing callers keep working.

    Returns:
        The size of the rendered document text in UTF-8 bytes.
    """
    # Compute histogram
    histogram: dict[str, int] = defaultdict(int)
    ufo_flagged: list[str] = []
    cryptid_flagged: list[str] = []
    for chunk in all_chunks:
        histogram[chunk.get("type", "paragraph")] += 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_flagged.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_flagged.append(chunk["chunk_id"])

    histogram_yaml = "\n".join(f"  {k}: {v}" for k, v in sorted(histogram.items()))
    # A JSON array of strings doubles as a YAML flow sequence.
    ufo_yaml = json.dumps(ufo_flagged, ensure_ascii=False)
    cryptid_yaml = json.dumps(cryptid_flagged, ensure_ascii=False)

    lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {len(all_chunks)}",
        "chunk_types_histogram:",
        histogram_yaml,
        # Always empty: no multi-page table detection happens in this script.
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {ufo_yaml}",
        f"cryptid_anomalies_flagged: {cryptid_yaml}",
        'build_approach: "subagents"',
        f"build_model: {BUILD_MODEL}",
        f"build_at: {BUILD_AT}",
        "---",
        "",
    ]

    # Group chunks by page
    pages_map: dict[int, list[dict]] = defaultdict(list)
    for chunk in all_chunks:
        pages_map[chunk["page"]].append(chunk)

    for page_num in sorted(pages_map.keys()):
        page_chunks = pages_map[page_num]
        lines.append(f"## Page {page_num}")
        lines.append("")

        for chunk in page_chunks:
            cid = chunk["chunk_id"]
            ctype = chunk.get("type", "paragraph")
            bbox = chunk["bbox"]
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"
            content_en = chunk.get("content_en", "")
            content_pt_br = chunk.get("content_pt_br", "")

            lines.append(f"<!-- chunk:{cid} src:./chunks/{cid}.md -->")
            lines.append(f'<a id="{cid}"></a>')
            lines.append(f"### Chunk {cid} — {ctype} · p{page_num} · bbox: {bbox_str}")
            lines.append("")
            lines.append(f"**EN:** {content_en}")
            lines.append("")
            lines.append(f"**PT-BR:** {content_pt_br}")
            lines.append("")

            # Embed image if applicable; the link is only emitted when the
            # cropped file actually exists on disk.
            if ctype == "image":
                img_path = IMAGES_DIR / f"IMG-{cid}.png"
                if img_path.exists():
                    lines.append(f"![{cid} image](./images/IMG-{cid}.png)")
                    lines.append("")
                if chunk.get("image_description_en"):
                    lines.append(f"*Image description:* {chunk['image_description_en']}")
                    lines.append("")

            # Metadata collapsible
            meta = {
                "chunk_id": cid,
                "type": ctype,
                "page": chunk.get("page"),
                "order_in_page": chunk.get("order_in_page"),
                "order_global": chunk.get("order_global"),
                "bbox": bbox,
                "classification": chunk.get("classification"),
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": chunk.get("cross_page_hint"),
                "prev_chunk": chunk.get("prev_chunk"),
                "next_chunk": chunk.get("next_chunk"),
                "related_image": f"IMG-{cid}.png" if ctype == "image" else None,
                "related_table": chunk.get("related_table"),
                "ocr_confidence": chunk.get("ocr_confidence"),
                "ocr_source_lines": chunk.get("ocr_source_lines", []),
                "redaction_code": chunk.get("redaction_code"),
                "redaction_inferred_content_type": chunk.get("redaction_inferred_content_type"),
                "image_type": chunk.get("image_type"),
                "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
                "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
                "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
                "ufo_anomaly_rationale": chunk.get("ufo_anomaly_rationale"),
                "cryptid_anomaly_type": chunk.get("cryptid_anomaly_type"),
                "cryptid_anomaly_rationale": chunk.get("cryptid_anomaly_rationale"),
                "image_description_en": chunk.get("image_description_en"),
                "image_description_pt_br": chunk.get("image_description_pt_br"),
                "extracted_text": chunk.get("extracted_text"),
            }
            lines.extend(
                [
                    "<details><summary>metadata</summary>",
                    "",
                    "```json",
                    json.dumps(meta, ensure_ascii=False, indent=2),
                    "```",
                    "",
                    "</details>",
                    "",
                    "---",
                    "",
                ]
            )

    # Join once and reuse the text for both the write and the byte count.
    document = "\n".join(lines)
    out_path = OUTPUT_DIR / "document.md"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(document)
    print(f"Written: {out_path}")
    return len(document.encode("utf-8"))


def main():
    """Run the full assembly: load, number, crop, write, and report.

    Steps, in order: load the raw pages, assign global chunk ids, create the
    output sub-directories, crop every ``image`` chunk, write one markdown
    file per chunk, write ``_index.json`` and ``document.md``, then print a
    one-line ``STATS`` summary.
    """
    started = time.time()

    print("Loading pages...")
    pages = load_pages()
    print(f"  {len(pages)} pages loaded")

    print("Assigning global IDs...")
    all_chunks = assign_global_ids(pages)
    print(f"  {len(all_chunks)} chunks total")

    # Make sure every output directory exists before anything is written.
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # Crop images; only crops that report a saved path are counted.
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"Cropping {len(image_chunks)} images...")
    images_saved = sum(1 for c in image_chunks if crop_image(c))

    # One markdown file per chunk, each pointing back at its page render.
    print("Writing chunk files...")
    for chunk in all_chunks:
        write_chunk_file(
            chunk,
            f"../../processing/png/{DOC_ID}/p-{chunk['page']:03d}.png",
        )
    print(f"  {len(all_chunks)} chunk files written")

    print("Writing _index.json...")
    write_index(all_chunks)

    print("Writing document.md...")
    doc_bytes = write_document_md(all_chunks, {})

    # Final summary numbers.
    ufo_count = len([c for c in all_chunks if c.get("ufo_anomaly_detected")])
    cryptid_count = len([c for c in all_chunks if c.get("cryptid_anomaly_detected")])
    elapsed = int(time.time() - started)

    print(f"\nDone in {elapsed}s")
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={images_saved} tables=0 ufo={ufo_count} cryptid={cryptid_count} doc_md_bytes={doc_bytes}")


# Run the assembly only when executed as a script, not when imported.
if __name__ == "__main__":
    main()