# disclosure-bureau/scripts/rebuild_doc255.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild doc-255-413270-ufo-s-and-defense-what-should-we-prepare-for
into structured chunk files, _index.json, and document.md.

Uses `claude -p --model haiku` subprocess calls (OAuth via Max plan).
"""

import json
import os
import random
import re
import subprocess
import sys
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path

# Identifier of the document being rebuilt. It names the input/output
# directories below and is recorded in _index.json and document.md.
DOC_ID = "doc-255-413270-ufo-s-and-defense-what-should-we-prepare-for"
# Human-readable title: substituted into the page prompt and written to the
# document.md frontmatter.
DOC_TITLE = "UFO's and Defense: What Should We Prepare For?"
# Input directory holding the page scans, read as p-001.png, p-002.png, ...
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
# Output root: _index.json and document.md are written directly in here.
OUT_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
# One <chunk_id>.md file per chunk.
CHUNKS_DIR = OUT_DIR / "chunks"
# Cropped image regions, saved as IMG-<chunk_id>.png.
IMAGES_DIR = OUT_DIR / "images"
# Created by main(); nothing in the visible code writes into it.
TABLES_DIR = OUT_DIR / "tables"

# Value passed to `claude --model`. Note that the generated metadata records
# the model as a separate hard-coded string ("claude-haiku-4-5").
MODEL = "haiku"
# Pages 1..TOTAL_PAGES are processed.
TOTAL_PAGES = 93
# Used both as the thread-pool size and as the batch size.
WORKERS = 4
# Default timeout for one claude CLI call; analyze_image() passes 120 instead.
TIMEOUT = 240  # seconds per page call

# Shared by all worker threads so their console lines do not interleave.
_print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print while holding the module-wide print lock.

    Accepts the same positional and keyword arguments as ``print``. Output is
    flushed by default; a caller may still pass ``flush=`` explicitly, which
    previously raised ``TypeError`` because the keyword was supplied twice.
    """
    # setdefault keeps flush=True as the default without clashing with an
    # explicit flush= from the caller.
    kwargs.setdefault("flush", True)
    with _print_lock:
        print(*args, **kwargs)


# Prompt template for the per-page extraction call. process_page() fills the
# {doc_title}, {page_number}, {total_pages}, {doc_id} and {png_path}
# placeholders with str.format(); doubled braces ({{ }}) are therefore literal
# braces in the JSON example shown to the model.
PAGE_REBUILDER_PROMPT = '''You are a page-rebuilder agent. Analyze the scanned document page image and extract all content into structured chunks.

Document: {doc_title}
Page: {page_number} of {total_pages}
Doc ID: {doc_id}

STEP 1: Use the Read tool to view this PNG image:
{png_path}

STEP 2: Analyze every element on the page carefully.

STEP 3: Return ONE JSON object only (no markdown fence, no commentary):
{{
  "page_number": {page_number},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "paragraph",
      "content_en": "verbatim English text from page",
      "content_pt_br": "tradução em português brasileiro",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null,
      "image_description_en": null,
      "image_description_pt_br": null,
      "extracted_text": null
    }}
  ]
}}

CHUNK TYPES (use exactly one):
- letterhead: document header/letterhead
- classification_marking: classification marking (TOP SECRET, CUI, etc.)
- date_line: date field
- address_block: TO:/FROM:/distribution fields
- heading: section/chapter/subject heading
- paragraph: body text paragraph
- numbered_item: numbered list item
- bulleted_item: bullet list item
- table_marker: table content
- image: photograph, diagram, chart, sketch, map, graph
- caption: figure/image caption
- footer: page footer
- page_number: standalone page number
- signature: signature/signatory block
- redaction: blacked-out/redacted area
- stamp: official stamp or seal
- handwriting: handwritten annotation
- blank_area: empty area
- form_field: form field with label and value
- unknown: unidentifiable element

RULES:
1. Split content into logical chunks (one concept per chunk). A typical page has 3-15 chunks.
2. For image chunks: describe what you see in content_en and set image_type.
3. image_type options: photograph, diagram, chart, sketch, map, graph, seal, signature_image, other
4. bbox: normalized coordinates 0.0-1.0 (x=left, y=top, w=width, h=height)
5. content_en: verbatim text if text chunk; visual description if image chunk
6. content_pt_br: Brazilian Portuguese translation (NOT European Portuguese)
7. classification: null or the marking text (e.g. "CUI", "UNCLASSIFIED")
8. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
9. formatting: array from ["bold", "italic", "all_caps", "underline"]
10. If page is completely blank: ONE chunk of type "blank_area"
11. Preserve French text verbatim (document may contain French)
12. For redaction chunks: set redaction_code if visible (e.g. "(b)(1)")
13. ufo_anomaly_detected: true ONLY for image chunks showing actual UAP/anomalous phenomena

Output ONLY the JSON object. No preamble. No fence. No commentary.'''


# Prompt template for the per-image analysis call. analyze_image() fills
# {image_path} with str.format(); doubled braces ({{ }}) are literal braces in
# the JSON example. The JSON keys listed here are the fields that main()
# copies back onto image chunks.
IMAGE_ANALYST_PROMPT = '''You are an image analyst for a UAP/UFO declassified document.

STEP 1: Use the Read tool to view this cropped image:
{image_path}

STEP 2: Analyze it carefully.

STEP 3: Return ONE JSON object only (no markdown fence):
{{
  "image_description_en": "detailed English description",
  "image_description_pt_br": "descrição detalhada em português brasileiro",
  "image_type": "photograph",
  "extracted_text": null,
  "ufo_anomaly_detected": false,
  "ufo_anomaly_type": null,
  "ufo_anomaly_rationale": null,
  "cryptid_anomaly_detected": false,
  "cryptid_anomaly_type": null,
  "cryptid_anomaly_rationale": null
}}

image_type options: photograph, diagram, chart, sketch, map, graph, seal, signature_image, other
Set ufo_anomaly_detected=true only if the image shows an actual UAP/UFO or anomalous aerial phenomenon.
Set cryptid_anomaly_detected=true only if the image shows a cryptid or unknown creature.
extracted_text: any text visible inside the image (verbatim), or null.

Output ONLY the JSON object.'''


def extract_json(text: str) -> dict:
    """Extract the first JSON object embedded in claude CLI output.

    Leading/trailing whitespace and a wrapping markdown fence are removed,
    then the object starting at the first ``{`` is decoded. Any text after
    the object is ignored.

    Decoding is delegated to ``json.JSONDecoder.raw_decode`` so that braces
    occurring inside JSON string values (e.g. verbatim page text containing
    ``{`` or ``}``) do not confuse the object boundary, which a plain brace
    counter gets wrong.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If ``text`` contains no ``{``. Malformed or truncated
            JSON raises ``json.JSONDecodeError``, which is a ``ValueError``
            subclass.
    """
    text = text.strip()
    # Strip markdown fences if present
    if text.startswith("```"):
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```\s*$", "", text.rstrip())

    start = text.find("{")
    if start == -1:
        raise ValueError(f"No JSON found in: {text[:200]}")

    # raw_decode stops at the end of the first complete value, so trailing
    # commentary after the object does not cause a failure.
    obj, _end = json.JSONDecoder().raw_decode(text, start)
    return obj


def call_claude(prompt: str, allowed_tools: str = "Read", timeout: int = TIMEOUT) -> str:
    """Run one ``claude -p`` subprocess and return its ``result`` text.

    The CLI is invoked with ``--output-format json``, ``--max-turns 5``, the
    given ``--allowedTools`` value, and ``--add-dir`` for both PNG_DIR and
    IMAGES_DIR. Its stdout is parsed as JSON and the ``result`` field is
    returned.

    Args:
        prompt: Prompt text, passed as the final positional argument.
        allowed_tools: Value for ``--allowedTools``.
        timeout: Seconds before the subprocess is abandoned.

    Returns:
        The ``result`` string from the CLI payload, or ``""`` when the field
        is absent or null.

    Raises:
        subprocess.TimeoutExpired: If the call exceeds ``timeout``.
        RuntimeError: On a non-zero exit code, a payload that is not a JSON
            object, a payload flagged ``is_error``, or a non-string result.
        json.JSONDecodeError: If stdout is not valid JSON.
    """
    cmd = [
        "claude", "-p",
        "--model", MODEL,
        "--output-format", "json",
        "--max-turns", "5",
        "--allowedTools", allowed_tools,
        "--add-dir", str(PNG_DIR),
        "--add-dir", str(IMAGES_DIR),
        "--",
        prompt,
    ]
    res = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=timeout,
        check=False,
        env={**os.environ},
    )
    if res.returncode != 0:
        raise RuntimeError(f"claude CLI rc={res.returncode}: {res.stderr[-1000:]}")

    cli = json.loads(res.stdout)
    # Anything other than an object would fail on .get() with an
    # AttributeError that the retry loops do not treat as retryable.
    if not isinstance(cli, dict):
        raise RuntimeError(f"claude CLI returned unexpected payload type: {type(cli).__name__}")

    result = cli.get("result")
    if cli.get("is_error"):
        # result may be null or missing on errors; never slice None.
        raise RuntimeError(f"claude error: {str(result or '')[:500]}")

    if result is None:
        return ""
    if not isinstance(result, str):
        raise RuntimeError(f"claude CLI returned non-text result: {type(result).__name__}")
    return result


def _single_chunk_page(page_num: int, chunk_type: str, content_en: str, content_pt_br: str) -> dict:
    """Build a page result holding one full-page chunk with neutral metadata.

    Used for pages that cannot be extracted, so later steps still receive the
    same chunk keys the model is asked to produce.
    """
    return {
        "page_number": page_num,
        "chunks": [{
            "order_in_page": 1,
            "type": chunk_type,
            "content_en": content_en,
            "content_pt_br": content_pt_br,
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None, "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0, "ocr_source_lines": [],
            "redaction_code": None, "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
            "image_description_en": None, "image_description_pt_br": None, "extracted_text": None
        }]
    }


def process_page(page_num: int) -> dict:
    """Extract the chunks of one page via the claude CLI.

    Args:
        page_num: 1-based page number; the scan is read from
            ``PNG_DIR / "p-NNN.png"``.

    Returns:
        A dict with ``page_number`` and a non-empty ``chunks`` list of dicts.
        If the PNG is missing, a single ``blank_area`` placeholder chunk is
        returned without calling the CLI. If all attempts fail, a single
        ``unknown`` placeholder chunk is returned.

    Up to three attempts are made. Timeouts back off 10s * attempt; other
    retryable errors back off 5s * attempt plus up to 3s of jitter. No sleep
    follows the final attempt.
    """
    png_path = PNG_DIR / f"p-{page_num:03d}.png"

    if not png_path.exists():
        safe_print(f"  Page {page_num}: PNG missing — placeholder")
        return _single_chunk_page(
            page_num,
            "blank_area",
            f"[Page {page_num} — PNG not available]",
            f"[Página {page_num} — PNG não disponível]",
        )

    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_num,
        total_pages=TOTAL_PAGES,
        doc_id=DOC_ID,
        png_path=str(png_path),
    )

    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            result_text = call_claude(prompt, allowed_tools="Read")
            # A null result would otherwise fail inside extract_json with an
            # AttributeError that is not retried; treat it as "no JSON".
            data = extract_json(result_text or "")
            chunks = data.get("chunks")
            # Validate chunks exist
            if not isinstance(chunks, list) or not chunks:
                raise ValueError("No chunks in response")
            # main() calls .get() on and assigns keys to every chunk, so a
            # non-object entry must be rejected here, while a retry is possible.
            if not all(isinstance(c, dict) for c in chunks):
                raise ValueError("Malformed chunk entry in response")
            data["page_number"] = page_num
        except subprocess.TimeoutExpired:
            safe_print(f"  Page {page_num}: timeout attempt {attempt}/{max_retries}")
            delay = 10 * attempt
        except (RuntimeError, ValueError) as e:
            # json.JSONDecodeError is a ValueError subclass and lands here too.
            safe_print(f"  Page {page_num}: error attempt {attempt}/{max_retries}: {str(e)[:100]}")
            delay = 5 * attempt + random.uniform(0, 3)
        else:
            safe_print(f"  Page {page_num}: {len(chunks)} chunks")
            return data

        if attempt < max_retries:
            time.sleep(delay)

    # Return fallback
    safe_print(f"  Page {page_num}: FALLBACK after {max_retries} attempts")
    return _single_chunk_page(
        page_num,
        "unknown",
        f"[Page {page_num} — content extraction failed after {max_retries} attempts]",
        f"[Página {page_num} — extração de conteúdo falhou após {max_retries} tentativas]",
    )


def crop_image(chunk_id: str, png_path: Path, bbox: dict) -> "Path | None":
    """Crop a chunk's region out of its page PNG and save it.

    Args:
        chunk_id: Chunk identifier; the crop is saved as
            ``IMAGES_DIR / "IMG-<chunk_id>.png"``.
        png_path: Path of the full-page PNG.
        bbox: Mapping with ``x``, ``y``, ``w``, ``h`` as fractions of the page
            size. Missing or null entries fall back to x=0, y=0, w=1, h=0.1;
            a null ``bbox`` is treated like an empty one.

    Returns:
        The path of the saved crop, or ``None`` if the box is degenerate or
        any error occurs (the error is logged, not raised).
    """
    from PIL import Image

    cropped_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        box = bbox or {}

        def coord(key: str, default: float) -> float:
            # Only a missing/null value takes the default; an explicit 0 is kept.
            value = box.get(key)
            return float(default if value is None else value)

        x = max(0.0, coord("x", 0))
        y = max(0.0, coord("y", 0))
        w = max(0.01, coord("w", 1))
        h = max(0.01, coord("h", 0.1))
        pad = 0.005  # small margin added on every side, as a fraction of the page

        # The context manager closes the underlying file handle, which the
        # bare Image.open() call left open for every crop.
        with Image.open(png_path) as im:
            W, H = im.size
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))

            if right <= left or bottom <= top:
                safe_print(f"  Crop {chunk_id}: degenerate bbox {bbox}")
                return None

            cropped = im.crop((left, top, right, bottom))
            cropped.save(str(cropped_path))

        safe_print(f"  Cropped {chunk_id}: {left},{top},{right},{bottom} from {W}x{H}")
        return cropped_path
    except Exception as e:
        # Best effort: a failed crop must not abort the whole rebuild.
        safe_print(f"  Crop {chunk_id}: error: {e}")
        return None


def analyze_image(chunk_id: str, cropped_path: Path) -> dict:
    """Ask the claude CLI to describe one cropped image.

    Returns the parsed JSON object from the model, or an empty dict when
    there is no crop file to look at or when both attempts fail. Errors are
    logged and never raised.
    """
    crop_available = bool(cropped_path) and cropped_path.exists()
    if not crop_available:
        return {}

    request = IMAGE_ANALYST_PROMPT.format(image_path=str(cropped_path))

    attempts_allowed = 2
    attempt = 0
    while attempt < attempts_allowed:
        attempt += 1
        try:
            reply = call_claude(request, allowed_tools="Read", timeout=120)
            parsed = extract_json(reply)
            safe_print(f"  Image {chunk_id}: analyzed (ufo={parsed.get('ufo_anomaly_detected', False)})")
            return parsed
        except Exception as e:
            safe_print(f"  Image {chunk_id}: error attempt {attempt}: {str(e)[:80]}")
            # Pause only when another attempt will follow.
            if attempt != attempts_allowed:
                time.sleep(5)

    return {}


def write_chunk_file(chunk: dict) -> None:
    """Write one chunk as ``CHUNKS_DIR / "<chunk_id>.md"``.

    The file has a frontmatter block with the chunk's metadata followed by
    the English and Brazilian-Portuguese content.

    Args:
        chunk: Chunk dict; must contain ``chunk_id``. Other keys are optional
            and fall back to the defaults visible below.

    Notes:
        ``ocr_confidence`` falls back to 0.85 only when it is missing or
        null. An explicit ``0.0`` (used by the failure placeholders) is
        written as ``0.0`` rather than being replaced by the default.
        Null content is written as an empty string instead of ``None``.
    """
    chunk_id = chunk["chunk_id"]
    chunk_path = CHUNKS_DIR / f"{chunk_id}.md"

    bbox = chunk.get("bbox") or {"x": 0, "y": 0, "w": 1, "h": 0.1}
    page_num = chunk.get("page", 1)
    source_png = f"../../processing/png/{DOC_ID}/p-{page_num:03d}.png"

    # `value or 0.85` would treat a legitimate 0.0 as "missing".
    raw_confidence = chunk.get("ocr_confidence")
    ocr_confidence = 0.85 if raw_confidence is None else float(raw_confidence)

    raw_en = chunk.get("content_en")
    content_en = "" if raw_en is None else raw_en
    raw_pt = chunk.get("content_pt_br")
    content_pt_br = "" if raw_pt is None else raw_pt

    content = f"""---
chunk_id: {chunk_id}
type: {chunk.get("type", "paragraph")}
page: {page_num}
order_in_page: {chunk.get("order_in_page", 1)}
order_global: {chunk.get("order_global", 1)}
bbox: {{x: {float(bbox.get('x') or 0):.2f}, y: {float(bbox.get('y') or 0):.2f}, w: {float(bbox.get('w') or 1):.2f}, h: {float(bbox.get('h') or 0.1):.2f}}}
classification: {json.dumps(chunk.get("classification"))}
formatting: {json.dumps(chunk.get("formatting", []))}
cross_page_hint: {chunk.get("cross_page_hint", "self_contained")}
prev_chunk: {json.dumps(chunk.get("prev_chunk"))}
next_chunk: {json.dumps(chunk.get("next_chunk"))}
related_image: {json.dumps(chunk.get("related_image"))}
related_table: null
ocr_confidence: {ocr_confidence}
ocr_source_lines: {json.dumps(chunk.get("ocr_source_lines", []))}
redaction_code: {json.dumps(chunk.get("redaction_code"))}
redaction_inferred_content_type: {json.dumps(chunk.get("redaction_inferred_content_type"))}
image_type: {json.dumps(chunk.get("image_type"))}
ufo_anomaly_detected: {str(bool(chunk.get("ufo_anomaly_detected", False))).lower()}
cryptid_anomaly_detected: {str(bool(chunk.get("cryptid_anomaly_detected", False))).lower()}
ufo_anomaly_type: {json.dumps(chunk.get("ufo_anomaly_type"))}
ufo_anomaly_rationale: {json.dumps(chunk.get("ufo_anomaly_rationale"))}
cryptid_anomaly_type: {json.dumps(chunk.get("cryptid_anomaly_type"))}
cryptid_anomaly_rationale: {json.dumps(chunk.get("cryptid_anomaly_rationale"))}
image_description_en: {json.dumps(chunk.get("image_description_en"))}
image_description_pt_br: {json.dumps(chunk.get("image_description_pt_br"))}
extracted_text: {json.dumps(chunk.get("extracted_text"))}
source_png: {source_png}
---

**EN:** {content_en}

**PT-BR:** {content_pt_br}
"""
    chunk_path.write_text(content, encoding="utf-8")


def _critical_failure_page(page_num: int) -> dict:
    """Return a one-chunk placeholder page for a worker that raised unexpectedly."""
    return {
        "page_number": page_num,
        "chunks": [{
            "order_in_page": 1,
            "type": "unknown",
            "content_en": f"[Page {page_num} — critical failure]",
            "content_pt_br": f"[Página {page_num} — falha crítica]",
            "bbox": {"x": 0, "y": 0, "w": 1, "h": 1},
            "classification": None, "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0, "ocr_source_lines": [],
            "redaction_code": None, "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
            "image_description_en": None, "image_description_pt_br": None, "extracted_text": None
        }]
    }


def _order_in_page_key(chunk: dict) -> float:
    """Sort key for a chunk's ``order_in_page``; unusable values sort last."""
    try:
        return float(chunk.get("order_in_page", 1))
    except (TypeError, ValueError):
        # null or non-numeric order: keep the chunk, but after the ordered ones.
        return float("inf")


def main():
    """Rebuild the document end to end.

    Steps, in order:
      1. Create the output directories.
      2. Extract chunks for pages 1..TOTAL_PAGES, WORKERS pages at a time.
      3. Number chunks globally in page order and link prev/next.
      4. Crop every ``image`` chunk, then analyze the successful crops.
      5. Merge the analysis into the chunks and write one .md per chunk.
      6. Write ``_index.json`` and ``document.md`` and print a summary line.

    Model output is treated as untrusted: non-object chunk entries are
    dropped, a null or non-numeric ``order_in_page`` no longer breaks the
    sort, and null content no longer breaks the index preview.
    """
    start_time = time.time()

    CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    TABLES_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Rebuilding {DOC_ID}")
    print(f"Processing {TOTAL_PAGES} pages with {WORKERS} workers...")
    print("=" * 70)

    page_numbers = list(range(1, TOTAL_PAGES + 1))
    all_page_data = {}
    total_batches = (len(page_numbers) + WORKERS - 1) // WORKERS

    # Process pages in batches of WORKERS
    for batch_start in range(0, len(page_numbers), WORKERS):
        batch = page_numbers[batch_start:batch_start + WORKERS]
        batch_num = batch_start // WORKERS + 1
        safe_print(f"\nBatch {batch_num}/{total_batches}: pages {batch}")

        with ThreadPoolExecutor(max_workers=WORKERS) as executor:
            futures = {executor.submit(process_page, p): p for p in batch}
            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    all_page_data[page_num] = future.result()
                except Exception as e:
                    # Last line of defence: one broken page must not lose the run.
                    safe_print(f"  Page {page_num}: CRITICAL FAILURE: {e}")
                    all_page_data[page_num] = _critical_failure_page(page_num)

    print("\nAll pages processed. Assigning global chunk IDs...")

    # Assign global chunk IDs in page order
    all_chunks = []
    chunk_counter = 1
    for page_num in sorted(all_page_data):
        raw_chunks = all_page_data[page_num].get("chunks") or []
        # Only objects can carry the ids assigned below.
        chunks = [c for c in raw_chunks if isinstance(c, dict)]
        chunks.sort(key=_order_in_page_key)
        for chunk in chunks:
            chunk["chunk_id"] = f"c{chunk_counter:04d}"
            chunk["page"] = page_num
            chunk["order_global"] = chunk_counter
            chunk_counter += 1
            all_chunks.append(chunk)

    total_chunks = len(all_chunks)
    print(f"Total chunks: {total_chunks}")

    # Prev/next pointers
    last_index = total_chunks - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last_index else None

    # Identify image chunks
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"\nCropping {len(image_chunks)} images...")

    # Crop all images first
    crop_results = {}
    for chunk in image_chunks:
        chunk_id = chunk["chunk_id"]
        png_path = PNG_DIR / f"p-{chunk['page']:03d}.png"
        if png_path.exists():
            crop_results[chunk_id] = crop_image(chunk_id, png_path, chunk.get("bbox", {}))
        else:
            crop_results[chunk_id] = None

    # Analyze images in batches
    image_items = [(cid, cp) for cid, cp in crop_results.items() if cp]
    print(f"\nAnalyzing {len(image_items)} cropped images...")

    image_analysis = {}
    for batch_start in range(0, len(image_items), WORKERS):
        batch = image_items[batch_start:batch_start + WORKERS]
        with ThreadPoolExecutor(max_workers=WORKERS) as executor:
            futures = {executor.submit(analyze_image, cid, cp): cid for cid, cp in batch}
            for future in as_completed(futures):
                chunk_id = futures[future]
                try:
                    image_analysis[chunk_id] = future.result()
                except Exception as e:
                    safe_print(f"  Image analysis {chunk_id}: {e}")
                    image_analysis[chunk_id] = {}

    # Merge image analysis into chunks
    analysis_fields = (
        "image_description_en", "image_description_pt_br", "image_type",
        "extracted_text", "ufo_anomaly_detected", "ufo_anomaly_type",
        "ufo_anomaly_rationale", "cryptid_anomaly_detected",
        "cryptid_anomaly_type", "cryptid_anomaly_rationale",
    )
    for chunk in image_chunks:
        chunk_id = chunk["chunk_id"]
        # Point at the crop only if one was actually produced; otherwise the
        # chunk file would reference an image that does not exist.
        chunk["related_image"] = f"IMG-{chunk_id}.png" if crop_results.get(chunk_id) else None
        analysis = image_analysis.get(chunk_id) or {}
        for field in analysis_fields:
            if field in analysis:
                chunk[field] = analysis[field]

    # Write chunk files
    print(f"\nWriting {total_chunks} chunk files...")
    for chunk in all_chunks:
        write_chunk_file(chunk)
    print("Chunk files written.")

    # Build _index.json
    now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    index_chunks = []
    for chunk in all_chunks:
        bbox = chunk.get("bbox") or {"x": 0, "y": 0, "w": 1, "h": 0.1}
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": bbox,
            # content_en may be null in model output; slicing None would raise.
            "preview": str(chunk.get("content_en") or "")[:80]
        })

    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": now_iso,
        "chunks": index_chunks
    }
    (OUT_DIR / "_index.json").write_text(json.dumps(index_data, ensure_ascii=False, indent=2), encoding="utf-8")
    print("_index.json written.")

    # Compute stats
    chunk_types = {}
    ufo_anomalies = []
    cryptid_anomalies = []
    for chunk in all_chunks:
        t = chunk.get("type", "paragraph")
        chunk_types[t] = chunk_types.get(t, 0) + 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_anomalies.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_anomalies.append(chunk["chunk_id"])
    # "images_extracted" reports crops that were actually written, not the
    # number of chunks the model labelled as images.
    images_count = sum(1 for cp in crop_results.values() if cp)

    # Assemble document.md
    print("\nAssembling document.md...")
    parts = []

    # Frontmatter
    parts.append("---")
    parts.append('schema_version: "0.2.0"')
    parts.append("type: master_document")
    parts.append(f"doc_id: {DOC_ID}")
    # json.dumps yields a double-quoted scalar and escapes any embedded quotes.
    parts.append(f"canonical_title: {json.dumps(DOC_TITLE, ensure_ascii=False)}")
    parts.append(f"total_pages: {TOTAL_PAGES}")
    parts.append(f"total_chunks: {total_chunks}")
    parts.append("chunk_types_histogram:")
    for t, count in sorted(chunk_types.items()):
        parts.append(f"  {t}: {count}")
    parts.append("multi_page_tables: []")
    parts.append(f"ufo_anomalies_flagged: {json.dumps(ufo_anomalies)}")
    parts.append(f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomalies)}")
    parts.append('build_approach: "subagents"')
    parts.append("build_model: claude-haiku-4-5")
    parts.append(f"build_at: {now_iso}")
    parts.append("---")
    parts.append("")

    current_page = None
    for chunk in all_chunks:
        page = chunk.get("page", 1)
        if page != current_page:
            current_page = page
            parts.append(f"\n## Page {page}\n")

        chunk_id = chunk["chunk_id"]
        bbox = chunk.get("bbox") or {"x": 0, "y": 0, "w": 1, "h": 0.1}
        bbox_str = f"{float(bbox.get('x') or 0):.2f}/{float(bbox.get('y') or 0):.2f}/{float(bbox.get('w') or 1):.2f}/{float(bbox.get('h') or 0.1):.2f}"
        ctype = chunk.get("type", "paragraph")
        content_en = chunk.get("content_en")
        content_pt_br = chunk.get("content_pt_br")

        parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
        parts.append(f'<a id="{chunk_id}"></a>')
        parts.append(f"### Chunk {chunk_id} — {ctype} · p{page} · bbox: {bbox_str}")
        parts.append("")
        parts.append(f"**EN:** {'' if content_en is None else content_en}")
        parts.append("")
        parts.append(f"**PT-BR:** {'' if content_pt_br is None else content_pt_br}")
        parts.append("")

        if ctype == "image":
            img_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
            if img_path.exists():
                parts.append(f"![chunk image](./images/IMG-{chunk_id}.png)")
                parts.append("")
            if chunk.get("image_description_en"):
                parts.append(f"*{chunk['image_description_en']}*")
                parts.append("")

        # Metadata block
        meta = {k: v for k, v in chunk.items() if k not in ["content_en", "content_pt_br"]}
        parts.append("<details><summary>metadata</summary>")
        parts.append("")
        parts.append("```json")
        parts.append(json.dumps(meta, ensure_ascii=False, indent=2))
        parts.append("```")
        parts.append("")
        parts.append("</details>")
        parts.append("")
        parts.append("---")
        parts.append("")

    document_md = "\n".join(parts)
    doc_path = OUT_DIR / "document.md"
    doc_path.write_text(document_md, encoding="utf-8")
    doc_md_bytes = len(document_md.encode("utf-8"))
    print(f"document.md written ({doc_md_bytes:,} bytes)")

    wall_seconds = int(time.time() - start_time)
    print(f"\n{'='*70}")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={images_count}, tables_stitched=0, ufo_anomalies={len(ufo_anomalies)}, cryptid_anomalies={len(cryptid_anomalies)}, wall_seconds={wall_seconds}")


# Run the rebuild only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()