disclosure-bureau/scripts/rebuild_doc65_s2_v2.py

#!/usr/bin/env python3
"""
Rebuild script v2 for doc-65-hs1-834228961-62-hq-83894-section-2
Uses claude CLI for vision processing (no direct API key needed).
Processes 159 pages in batches of 5.
"""

import os
import sys
import json
import time
import subprocess
import concurrent.futures
import textwrap
from datetime import datetime, timezone
from pathlib import Path
from PIL import Image

# ── Config ──────────────────────────────────────────────────────────────────
# Identity of the document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-2"
DOC_TITLE = "FBI HQ-83894 Section 2 — Flying Discs Investigation (Serials 53-100)"

# Inputs: page PNGs (p-*.png) and per-page OCR text (p-NNN.txt), one folder per document.
PNG_DIR = Path("/Users/guto/ufo/processing/png", DOC_ID)
OCR_DIR = Path("/Users/guto/ufo/processing/ocr", DOC_ID)

# Outputs: everything the rebuild writes lives under OUT_DIR.
OUT_DIR = Path("/Users/guto/ufo/raw", DOC_ID)
CHUNKS_DIR = OUT_DIR.joinpath("chunks")
IMAGES_DIR = OUT_DIR.joinpath("images")
TABLES_DIR = OUT_DIR.joinpath("tables")

# Executable invoked for every model call.
CLAUDE_BIN = "/Users/guto/.local/bin/claude"

BATCH_SIZE = 5  # items submitted to the thread pool per batch
CLAUDE_TIMEOUT = 120  # seconds per page call

def build_page_map(png_dir=None):
    """Map sequential page numbers (1..N) to the numbers embedded in PNG names.

    Scans ``png_dir`` for files named ``p-<digits>.png``, sorts the distinct
    numbers ascending and assigns them sequence numbers 1, 2, 3, ... in that
    order.

    Files that match ``p-*.png`` but whose suffix is not purely ASCII digits
    (for example ``p-001-crop.png``) are reported and skipped instead of
    aborting with ``ValueError``. Two files that parse to the same number
    yield a single page.

    Args:
        png_dir: Directory to scan. Defaults to the module-level ``PNG_DIR``.

    Returns:
        dict mapping sequential page number -> number parsed from the file
        name. Empty when no file matches.
    """
    if png_dir is None:
        png_dir = PNG_DIR

    numbers = set()
    for png in Path(png_dir).glob("p-*.png"):
        suffix = png.stem[len("p-"):]
        if not (suffix.isascii() and suffix.isdigit()):
            print(f"  [SKIP] {png.name}: not a p-<number>.png page file", flush=True)
            continue
        numbers.add(int(suffix))

    return {seq: num for seq, num in enumerate(sorted(numbers), start=1)}

# Computed once at import time by scanning PNG_DIR; read by rebuild_page,
# write_chunk_file and main.
PAGE_MAP = build_page_map()
TOTAL_PAGES = len(PAGE_MAP)

def load_ocr(actual_num: int) -> str:
    """Return the OCR text for a page, capped at 2000 characters.

    Args:
        actual_num: Page number as it appears in the file name (p-NNN.txt).

    Returns:
        The stripped text of ``OCR_DIR/p-NNN.txt`` truncated to 2000
        characters, or "" when the file is missing or holds only whitespace.
    """
    candidate = OCR_DIR / f"p-{actual_num:03d}.txt"
    if not candidate.exists():
        return ""
    stripped = candidate.read_text(encoding="utf-8", errors="replace").strip()
    # Slicing an empty string is still "", so no separate emptiness branch is needed.
    return stripped[:2000]

# Prompt for the per-page rebuild call. rebuild_page fills it with str.format()
# using doc_title, actual_num, page_seq, total_pages, ocr_text and doc_id.
# The JSON skeleton's braces are doubled ({{ }}) so format() emits them as
# literal single braces.
PAGE_REBUILDER_PROMPT_TEMPLATE = """You are a page-rebuilder agent analyzing a page from a declassified FBI document about Flying Discs / UAP investigations.

Document: {doc_title}
Actual page file: p-{actual_num:03d}.png
Sequential page number: {page_seq} of {total_pages}

OCR text (may be empty or poor quality):
{ocr_text}

Use the Read tool to read this image:
/Users/guto/ufo/processing/png/{doc_id}/p-{actual_num:03d}.png

Then analyze ALL visible content and return a JSON object with this exact structure (return ONLY the JSON, no markdown fences, no explanation):
{{
  "page_number": {page_seq},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "cover",
      "content_en": "exact transcription or description in English",
      "content_pt_br": "descrição ou transcrição em português brasileiro",
      "bbox": {{"x": 0.00, "y": 0.00, "w": 1.00, "h": 0.10}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }}
  ]
}}

RULES:
- type must be ONE of: cover | letterhead | stamp | header | subheader | paragraph | redaction | signature | image | table_marker | footer | page_number | classification_marking | separator | handwriting | form_field | caption | list_item | annotation | blank
- bbox: x,y = top-left corner fraction (0.0-1.0), w,h = width/height fractions (0.0-1.0)
- Split the page into logical chunks (letterhead separate from body text, stamps separate, etc.)
- For redacted blocks: type=redaction, include redaction_code if visible e.g. "(b)(1)", "(b)(3)", "(b)(6)"
- For stamps (RECEIVED, RECORDED, etc.): type=stamp
- For photos, sketches, diagrams: type=image, set image_type to photo|diagram|sketch|map|chart|signature_block|stamp|seal|other
- For tables: type=table_marker
- cross_page_hint: self_contained | continues_to_next | continues_from_prev
- content_en: verbatim transcription when legible; describe otherwise e.g. "[Stamp: RECEIVED OCT 6 1947]"
- content_pt_br: Brazilian Portuguese translation/description
- formatting: array of applicable: bold | italic | all_caps | underline | typewritten | handwritten
- ufo_anomaly_detected: true ONLY if page has image/sketch/photo of an anomalous aerial object
- Blank pages: one chunk with type=blank
- Return ONLY valid JSON, nothing else"""

# Prompt for the per-image analysis call. analyze_image fills it with
# str.format(img_path=...); doubled braces ({{ }}) become literal JSON braces.
IMAGE_ANALYST_PROMPT_TEMPLATE = """You are an image analyst for declassified FBI UFO/UAP investigation documents.

Read this cropped image region:
{img_path}

Analyze it and return ONLY this JSON (no markdown fences):
{{
  "image_type": "photo",
  "image_description_en": "detailed description in English",
  "image_description_pt_br": "descrição detalhada em português brasileiro",
  "extracted_text": "any text visible verbatim or null",
  "ufo_anomaly_detected": false,
  "ufo_anomaly_type": null,
  "ufo_anomaly_rationale": null,
  "cryptid_anomaly_detected": false,
  "cryptid_anomaly_type": null,
  "cryptid_anomaly_rationale": null
}}

image_type: photo | diagram | sketch | map | chart | signature_block | stamp | seal | other
Return ONLY valid JSON."""

def run_claude(prompt: str, timeout: int = CLAUDE_TIMEOUT) -> str:
    """Run the claude CLI once with ``prompt`` and return its stripped stdout.

    The prompt is passed as a single argv entry (no shell is involved) and the
    child inherits this process's environment.

    Args:
        prompt: Full prompt text handed to ``claude -p``.
        timeout: Seconds to wait before the call is abandoned.

    Returns:
        The CLI's stdout with surrounding whitespace removed; "" when the call
        timed out; or a string starting with "ERROR: " when launching or
        waiting on the process raised any other exception. Callers treat both
        "" and the "ERROR:" prefix as a failed call.
    """
    try:
        result = subprocess.run(
            [CLAUDE_BIN, "-p", "--dangerously-skip-permissions",
             "--model", "claude-haiku-4-5",
             "--no-session-persistence",
             prompt],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        # Leave a trace: a silent "" is indistinguishable from an empty reply.
        print(f"  [CLAUDE TIMEOUT] no response after {timeout}s", flush=True)
        return ""
    except Exception as e:
        return f"ERROR: {e}"

    if result.returncode != 0:
        # stderr is otherwise discarded; surface it so failures can be diagnosed.
        # stdout is still returned unchanged so callers behave as before.
        detail = (result.stderr or "").strip()[:200]
        print(f"  [CLAUDE EXIT {result.returncode}] {detail}", flush=True)
    return result.stdout.strip()

def parse_json_response(raw: str):
    """Extract the first JSON object embedded in a model response.

    A leading markdown fence line (``` or ```json) and a trailing ``` line are
    removed first. The text is then scanned for "{" and a real JSON decoder is
    started at each candidate position, so braces that appear inside string
    values do not confuse the extraction, and text before or after the object
    is ignored.

    Args:
        raw: Raw response text.

    Returns:
        The decoded dict for the first "{" that starts a valid JSON object, or
        None when there is none.
    """
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line and, if present, the closing fence line.
        lines = text.split("\n")[1:]
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        text = "\n".join(lines).strip()

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses exactly one value and tolerates trailing text.
            obj, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return obj
    return None

def _failed_page_result(page_seq: int, actual_num: int) -> dict:
    """Build the single-chunk placeholder recorded when a page cannot be rebuilt."""
    return {
        "page_number": page_seq,
        "actual_num": actual_num,
        "chunks": [{
            "order_in_page": 1,
            "type": "blank",
            "page": page_seq,
            "content_en": "[Page processing failed - manual review required]",
            "content_pt_br": "[Falha no processamento da página - revisão manual necessária]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None
        }]
    }


def rebuild_page(page_seq: int) -> dict:
    """Process one page via the claude CLI, retrying up to three times.

    Args:
        page_seq: Sequential page number; must be a key of ``PAGE_MAP``.

    Returns:
        A dict with ``page_number``, ``actual_num`` and ``chunks``. Each chunk
        has ``order_in_page`` renumbered from 1 in response order and ``page``
        set to ``page_seq``. When every attempt fails, a single placeholder
        chunk asking for manual review is returned instead.

    Raises:
        KeyError: If ``page_seq`` is not in ``PAGE_MAP``.
    """
    actual_num = PAGE_MAP[page_seq]
    ocr_text = load_ocr(actual_num)

    prompt = PAGE_REBUILDER_PROMPT_TEMPLATE.format(
        doc_title=DOC_TITLE,
        actual_num=actual_num,
        page_seq=page_seq,
        total_pages=TOTAL_PAGES,
        ocr_text=ocr_text if ocr_text else "(no OCR available)",
        doc_id=DOC_ID
    )

    retries = 3
    for attempt in range(retries):
        is_last = attempt == retries - 1
        raw = run_claude(prompt, timeout=CLAUDE_TIMEOUT)

        if not raw or raw.startswith("ERROR:"):
            if is_last:
                print(f"  [FAIL] page {page_seq}: no usable response. Raw: {raw[:200]}", flush=True)
                break
            wait = 5 * (attempt + 1)  # linear backoff: 5s, then 10s
            print(f"  [RETRY {attempt+1}] page {page_seq}: empty/error, waiting {wait}s", flush=True)
            time.sleep(wait)
            continue

        data = parse_json_response(raw)
        chunks = data.get("chunks") if isinstance(data, dict) else None
        # Accept only a list of objects: anything else (null, a string, a list
        # of strings) would raise below inside the worker thread and abort the
        # whole run, so it is treated like unparseable JSON and retried.
        if isinstance(chunks, list) and all(isinstance(ch, dict) for ch in chunks):
            data["page_number"] = page_seq
            data["actual_num"] = actual_num
            for i, ch in enumerate(chunks):
                ch["order_in_page"] = i + 1
                ch["page"] = page_seq
            print(f"  [OK] page {page_seq:03d} (p-{actual_num:03d}) → {len(chunks)} chunks", flush=True)
            return data

        if is_last:
            print(f"  [FAIL] page {page_seq}: could not parse JSON. Raw: {raw[:200]}", flush=True)
        else:
            print(f"  [RETRY {attempt+1}] page {page_seq}: bad JSON, retrying", flush=True)
            time.sleep(3)

    return _failed_page_result(page_seq, actual_num)

def crop_image(page_seq: int, actual_num: int, chunk_id: str, bbox: dict) -> Path:
    """Crop the bbox region out of a page PNG and save it as IMG-<chunk_id>.png.

    Args:
        page_seq: Sequential page number. Not used by the crop itself.
        actual_num: Page number as it appears in the source file name.
        chunk_id: Chunk identifier used to name the output file.
        bbox: Fractional box with keys x, y, w, h (each 0.0-1.0 of the page).
            Missing keys default to x=0, y=0, w=1, h=0.1.

    Returns:
        The destination path under ``IMAGES_DIR``. It is returned even when
        cropping failed (the failure is only printed), so callers must check
        that the file exists before using it.
    """
    src = PNG_DIR / f"p-{actual_num:03d}.png"
    dst = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        # The context manager closes the source file handle once the crop is saved.
        with Image.open(src) as im:
            W, H = im.size
            # Clamp the box to the page and enforce a minimum 1% extent.
            x = max(0.0, min(1.0, bbox.get("x", 0.0)))
            y = max(0.0, min(1.0, bbox.get("y", 0.0)))
            w = max(0.01, min(1.0 - x, bbox.get("w", 1.0)))
            h = max(0.01, min(1.0 - y, bbox.get("h", 0.1)))
            pad = 0.005  # small margin added on every side, as a page fraction
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            cropped = im.crop((left, top, right, bottom))
            cropped.save(str(dst))
    except Exception as e:
        # Best effort: one bad bbox or unreadable PNG must not stop the run.
        print(f"  [CROP FAIL] {chunk_id}: {e}", flush=True)
    return dst

def analyze_image(chunk_id: str, img_path: Path) -> dict:
    """Describe a cropped image via the claude CLI.

    Args:
        chunk_id: Chunk identifier, used only in log lines.
        img_path: Path of the cropped image to analyse.

    Returns:
        The parsed JSON object from the model, or a placeholder dict with
        ``image_type`` "other" when the file is missing or both attempts fail
        to produce parseable JSON.
    """

    def placeholder(description_en, description_pt_br):
        # Same shape as a real analysis result, with neutral anomaly fields.
        return {
            "image_type": "other",
            "image_description_en": description_en,
            "image_description_pt_br": description_pt_br,
            "extracted_text": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None
        }

    if not img_path.exists():
        return placeholder("Image not available", "Imagem não disponível")

    prompt = IMAGE_ANALYST_PROMPT_TEMPLATE.format(img_path=str(img_path))
    attempts_left = 2
    while attempts_left:
        attempts_left -= 1
        parsed = parse_json_response(run_claude(prompt, timeout=60))
        if parsed:
            print(f"    [IMG OK] {chunk_id}", flush=True)
            return parsed
        if attempts_left:
            # Pause only between attempts, not after the final one.
            time.sleep(3)

    print(f"  [IMG FAIL] {chunk_id}", flush=True)
    return placeholder("Analysis failed", "Análise falhou")

def write_chunk_file(chunk: dict):
    """Write one chunk as ``CHUNKS_DIR/<chunk_id>.md`` (YAML front matter + body).

    Args:
        chunk: Chunk dict. ``chunk_id`` is required; every other field falls
            back to a default when missing.

    Raises:
        KeyError: If ``chunk`` has no ``chunk_id``.

    Free-text values are emitted as JSON string literals whenever they contain
    characters that are unsafe in a plain YAML scalar, so embedded quotes,
    backslashes and newlines are escaped instead of corrupting the front
    matter. A missing, null or non-numeric bbox falls back to the defaults
    (x=0, y=0, w=1, h=0.1) rather than raising during formatting.
    """
    chunk_id = chunk["chunk_id"]
    page = chunk.get("page", 1)
    actual_num = PAGE_MAP.get(page, page)
    ctype = chunk.get("type", "paragraph")

    # The model may return "bbox": null or something that is not an object.
    bbox = chunk.get("bbox")
    if not isinstance(bbox, dict):
        bbox = {}

    def coord(key, default):
        # Coerce one bbox coordinate to float, falling back to its default.
        try:
            return float(bbox.get(key, default))
        except (TypeError, ValueError):
            return default

    related_image = f"IMG-{chunk_id}.png" if ctype == "image" else "null"
    related_table = chunk.get("related_table") or "null"
    prev_chunk = chunk.get("prev_chunk") or "null"
    next_chunk = chunk.get("next_chunk") or "null"

    fmt_list = chunk.get("formatting") or []
    fmt_str = "[" + ", ".join(json.dumps(str(f), ensure_ascii=False) for f in fmt_list) + "]"

    ocr_lines = chunk.get("ocr_source_lines") or []
    ocr_lines_str = "[" + ", ".join(str(l) for l in ocr_lines) + "]"

    # Characters that force quoting of a scalar value.
    needs_quoting = set(":{}[],#&*?|-<>=!%@`\"'\\\n\r\t")

    def yv(v):
        # Render a value as a YAML scalar.
        if v is None:
            return "null"
        if isinstance(v, bool):
            return str(v).lower()
        s = str(v)
        if any(c in needs_quoting for c in s):
            # A JSON string literal is a valid YAML double-quoted scalar and
            # escapes quotes, backslashes and control characters.
            return json.dumps(s, ensure_ascii=False)
        return s

    # Null content from the model would otherwise be written as the text "None".
    content_en = chunk.get("content_en") or ""
    content_pt_br = chunk.get("content_pt_br") or ""

    content = f"""---
chunk_id: {chunk_id}
type: {ctype}
page: {page}
order_in_page: {chunk.get("order_in_page", 1)}
order_global: {chunk.get("order_global", 1)}
bbox: {{x: {coord("x", 0.0):.3f}, y: {coord("y", 0.0):.3f}, w: {coord("w", 1.0):.3f}, h: {coord("h", 0.1):.3f}}}
classification: {yv(chunk.get("classification"))}
formatting: {fmt_str}
cross_page_hint: {chunk.get("cross_page_hint", "self_contained")}
prev_chunk: {prev_chunk}
next_chunk: {next_chunk}
related_image: {related_image}
related_table: {related_table}
ocr_confidence: {chunk.get("ocr_confidence", 0.85)}
ocr_source_lines: {ocr_lines_str}
redaction_code: {yv(chunk.get("redaction_code"))}
redaction_inferred_content_type: {yv(chunk.get("redaction_inferred_content_type"))}
image_type: {yv(chunk.get("image_type"))}
ufo_anomaly_detected: {str(chunk.get("ufo_anomaly_detected", False)).lower()}
cryptid_anomaly_detected: {str(chunk.get("cryptid_anomaly_detected", False)).lower()}
ufo_anomaly_type: {yv(chunk.get("ufo_anomaly_type"))}
ufo_anomaly_rationale: {yv(chunk.get("ufo_anomaly_rationale"))}
cryptid_anomaly_type: {yv(chunk.get("cryptid_anomaly_type"))}
cryptid_anomaly_rationale: {yv(chunk.get("cryptid_anomaly_rationale"))}
image_description_en: {yv(chunk.get("image_description_en"))}
image_description_pt_br: {yv(chunk.get("image_description_pt_br"))}
extracted_text: {yv(chunk.get("extracted_text"))}
source_png: ../../processing/png/{DOC_ID}/p-{actual_num:03d}.png
---

**EN:** {content_en}

**PT-BR:** {content_pt_br}
"""
    (CHUNKS_DIR / f"{chunk_id}.md").write_text(content, encoding="utf-8")

def main():
    """Run the full rebuild pipeline for DOC_ID and write all outputs under OUT_DIR.

    Phases:
        1. Rebuild every page via the claude CLI in thread-pool batches,
           saving ``_rebuild_state.json`` after each batch.
        2. Assign global chunk ids (c0001, c0002, ...) and prev/next links.
        3. Crop every chunk of type "image" out of its page PNG.
        4. Analyse the crops and merge non-null results into the chunks.
        5. Count table markers (no stitching is performed).
        6. Write one markdown file per chunk.
        7. Write ``_index.json``.
        8. Assemble ``document.md`` and delete the intermediate state file.

    Exits with status 1, before writing anything, when no page PNGs were found.
    """
    if TOTAL_PAGES == 0:
        # An empty page map would otherwise overwrite _index.json and
        # document.md with empty shells.
        sys.exit(f"No p-*.png pages found in {PNG_DIR}; nothing to rebuild")

    def coord(bbox, key, default):
        # Coerce one bbox coordinate to float; tolerate a null/non-dict bbox
        # and non-numeric values coming back from the model.
        try:
            return float(bbox.get(key, default))
        except (AttributeError, TypeError, ValueError):
            return default

    def text_of(chunk, key):
        # Return a text field as str; null or missing becomes "".
        value = chunk.get(key) or ""
        return value if isinstance(value, str) else str(value)

    t_start = time.time()
    print(f"Starting rebuild: {DOC_ID}", flush=True)
    print(f"Total pages: {TOTAL_PAGES}", flush=True)

    CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    TABLES_DIR.mkdir(parents=True, exist_ok=True)

    # Phase 1: Rebuild pages in parallel batches of 5
    print("\n=== Phase 1: Page rebuilding ===", flush=True)
    all_page_results = {}
    page_seqs = list(range(1, TOTAL_PAGES + 1))
    state_path = OUT_DIR / "_rebuild_state.json"

    for batch_start in range(0, len(page_seqs), BATCH_SIZE):
        batch = page_seqs[batch_start:batch_start + BATCH_SIZE]
        print(f"  Batch pages {batch[0]}-{batch[-1]}...", flush=True)

        with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
            futures = {executor.submit(rebuild_page, p): p for p in batch}
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                all_page_results[result["page_number"]] = result

        # Save intermediate state after each batch
        state_path.write_text(
            json.dumps({str(k): v for k, v in all_page_results.items()}, ensure_ascii=False),
            encoding="utf-8"
        )

    # Phase 2: Global chunk numbering
    print("\n=== Phase 2: Global chunk numbering ===", flush=True)
    all_chunks = []
    order_global = 0

    for page_seq in sorted(all_page_results.keys()):
        chunks = all_page_results[page_seq].get("chunks", [])
        actual_num = all_page_results[page_seq].get("actual_num", PAGE_MAP.get(page_seq, page_seq))
        for chunk in sorted(chunks, key=lambda c: c.get("order_in_page", 0)):
            order_global += 1
            chunk_id = f"c{order_global:04d}"
            chunk["chunk_id"] = chunk_id
            chunk["order_global"] = order_global
            chunk["actual_num"] = actual_num
            all_chunks.append(chunk)

    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i-1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i+1]["chunk_id"] if i < len(all_chunks)-1 else None

    print(f"  Total chunks: {len(all_chunks)}", flush=True)

    # Phase 3: Crop all images
    print("\n=== Phase 3: Cropping images ===", flush=True)
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"  Found {len(image_chunks)} image chunks", flush=True)

    for chunk in image_chunks:
        crop_image(
            chunk["page"],
            chunk.get("actual_num", PAGE_MAP.get(chunk["page"], chunk["page"])),
            chunk["chunk_id"],
            chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1})
        )

    # Phase 4: Analyze images in parallel batches of 5
    print("\n=== Phase 4: Image analysis ===", flush=True)
    chunk_lookup = {c["chunk_id"]: c for c in all_chunks}

    for batch_start in range(0, len(image_chunks), BATCH_SIZE):
        batch = image_chunks[batch_start:batch_start + BATCH_SIZE]
        print(f"  Image batch {batch_start+1}-{batch_start+len(batch)}...", flush=True)

        with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
            futures = {}
            for chunk in batch:
                chunk_id = chunk["chunk_id"]
                img_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
                futures[executor.submit(analyze_image, chunk_id, img_path)] = chunk_id

            for future in concurrent.futures.as_completed(futures):
                chunk_id = futures[future]
                img_meta = future.result()
                chunk = chunk_lookup.get(chunk_id)
                if chunk:
                    # Only non-null analysis fields overwrite the page-level values.
                    chunk.update({k: v for k, v in img_meta.items() if v is not None})

    # Phase 5: Table stitching check
    print("\n=== Phase 5: Table stitching ===", flush=True)
    tables_stitched = 0
    table_markers = [c for c in all_chunks if c.get("type") == "table_marker"]
    print(f"  Found {len(table_markers)} table markers (no cross-page stitching needed)", flush=True)

    # Phase 6: Write chunk files
    print("\n=== Phase 6: Writing chunk files ===", flush=True)
    for chunk in all_chunks:
        write_chunk_file(chunk)
    print(f"  Wrote {len(all_chunks)} chunk files", flush=True)

    # Phase 7: Write _index.json
    print("\n=== Phase 7: Writing _index.json ===", flush=True)
    build_at = datetime.now(timezone.utc).isoformat()

    index_chunks = []
    for chunk in all_chunks:
        bbox = chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1})
        # A null content_en would make the slice below raise TypeError.
        content_en = text_of(chunk, "content_en")
        preview = content_en[:80] + ("..." if len(content_en) > 80 else "")
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": bbox,
            "preview": preview
        })

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": index_chunks
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    # Phase 8: Assemble document.md
    print("\n=== Phase 8: Assembling document.md ===", flush=True)

    type_histogram = {}
    for chunk in all_chunks:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1

    ufo_flagged = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_flagged = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]
    images_extracted = len(image_chunks)

    histogram_yaml = "\n".join(f"  {k}: {v}" for k, v in sorted(type_histogram.items()))

    def list_yaml(items):
        if not items:
            return "  []"
        return "\n".join(f"  - {i}" for i in items)

    doc_parts = [f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {TOTAL_PAGES}
total_chunks: {len(all_chunks)}
chunk_types_histogram:
{histogram_yaml}
multi_page_tables: []
ufo_anomalies_flagged:
{list_yaml(ufo_flagged)}
cryptid_anomalies_flagged:
{list_yaml(cryptid_flagged)}
build_approach: "subagents"
build_model: "claude-haiku-4-5"
build_at: "{build_at}"
---
"""]

    chunks_by_page = {}
    for chunk in all_chunks:
        p = chunk.get("page", 1)
        chunks_by_page.setdefault(p, []).append(chunk)

    for page_seq in sorted(chunks_by_page.keys()):
        page_chunks = chunks_by_page[page_seq]
        doc_parts.append(f"\n## Page {page_seq}\n")

        for chunk in sorted(page_chunks, key=lambda c: c.get("order_in_page", 1)):
            chunk_id = chunk["chunk_id"]
            ctype = chunk.get("type", "paragraph")
            bbox = chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1})
            bbox_str = (
                f"{coord(bbox, 'x', 0):.2f}/{coord(bbox, 'y', 0):.2f}/"
                f"{coord(bbox, 'w', 1):.2f}/{coord(bbox, 'h', 0.1):.2f}"
            )

            doc_parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->\n")
            doc_parts.append(f'<a id="{chunk_id}"></a>\n')
            doc_parts.append(f"### Chunk {chunk_id} — {ctype} · p{page_seq} · bbox: {bbox_str}\n\n")

            doc_parts.append(f"**EN:** {text_of(chunk, 'content_en')}\n\n")
            doc_parts.append(f"**PT-BR:** {text_of(chunk, 'content_pt_br')}\n\n")

            if ctype == "image":
                doc_parts.append(f"![{chunk_id} image](./images/IMG-{chunk_id}.png)\n\n")
                if chunk.get("image_description_en"):
                    doc_parts.append(f"**Image Description (EN):** {chunk['image_description_en']}\n\n")
                if chunk.get("image_description_pt_br"):
                    doc_parts.append(f"**Descrição da Imagem (PT-BR):** {chunk['image_description_pt_br']}\n\n")

            # Metadata block
            meta = {
                "chunk_id": chunk_id,
                "type": ctype,
                "page": chunk.get("page"),
                "order_in_page": chunk.get("order_in_page"),
                "order_global": chunk.get("order_global"),
                "bbox": bbox,
                "classification": chunk.get("classification"),
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": chunk.get("cross_page_hint"),
                "prev_chunk": chunk.get("prev_chunk"),
                "next_chunk": chunk.get("next_chunk"),
                "ocr_confidence": chunk.get("ocr_confidence"),
                "redaction_code": chunk.get("redaction_code"),
                "image_type": chunk.get("image_type"),
                "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
                "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
                "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
                "ufo_anomaly_rationale": chunk.get("ufo_anomaly_rationale"),
            }
            meta_json = json.dumps(meta, indent=2, ensure_ascii=False)
            doc_parts.append(
                f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n"
            )

    doc_md = "".join(doc_parts)
    (OUT_DIR / "document.md").write_text(doc_md, encoding="utf-8")
    doc_md_bytes = len(doc_md.encode("utf-8"))

    # Cleanup intermediate state
    if state_path.exists():
        state_path.unlink()

    t_end = time.time()
    wall_seconds = int(t_end - t_start)

    print("\n=== DONE ===", flush=True)
    final_line = f"pages_done={TOTAL_PAGES}, chunks_total={len(all_chunks)}, images_extracted={images_extracted}, tables_stitched={tables_stitched}, ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, wall_seconds={wall_seconds}"
    print(final_line, flush=True)
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={images_extracted} tables={tables_stitched} ufo={len(ufo_flagged)} cryptid={len(cryptid_flagged)} doc_md_bytes={doc_md_bytes}", flush=True)

if __name__ == "__main__":
    main()