disclosure-bureau/scripts/rebuild_doc65_section7.py

#!/usr/bin/env python3
"""
Rebuilds doc-65-hs1-834228961-62-hq-83894-section-7 into the raw/ layout.
Uses claude CLI (OAuth via Max plan) to process each page PNG via vision.
"""

import os
import sys
import json
import base64
import time
import subprocess
import concurrent.futures
import threading
from datetime import datetime, timezone
from pathlib import Path

# Identifier of the document being rebuilt; it doubles as the directory name
# under each of the input/output roots below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-7"
# Human-readable title; inserted into the page prompt and into document.md.
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 7"
# Inputs: page PNGs (globbed as p-*.png) and optional per-page OCR text files.
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
# Outputs: chunk .md files, cropped images, _index.json and document.md all
# live under OUT_DIR.
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

# NOTE: these calls run at import time, so merely importing this module
# creates the output directories (OUT_DIR included, via parents=True).
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
IMAGES_DIR.mkdir(parents=True, exist_ok=True)
TABLES_DIR.mkdir(parents=True, exist_ok=True)

# Serializes console output so that lines printed from worker threads do not
# interleave with each other.
_print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print under a module-wide lock, flushing by default.

    Accepts exactly the arguments of the built-in ``print``. ``flush``
    defaults to ``True`` so progress lines appear immediately, but a caller
    may override it; the previous implementation raised ``TypeError``
    ("multiple values for keyword argument 'flush'") when ``flush`` was
    passed explicitly.
    """
    kwargs.setdefault("flush", True)
    with _print_lock:
        print(*args, **kwargs)


# Prompt template for the per-page pass. It is filled with str.format() in
# process_page(), which supplies: page_png_path, doc_title, doc_id,
# page_number, total_pages and page_ocr_text. Because of str.format(), every
# literal brace of the embedded JSON example is doubled ({{ and }}).
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder for a declassified UAP/UFO document reconstruction system.

STEP 1: Use the Read tool to view this page image:
{page_png_path}

STEP 2: Analyze the page carefully. The page is from document: {doc_title}
Doc ID: {doc_id}
Page number (1-indexed in document): {page_number}
Total pages: {total_pages}

OCR text (may be empty):
{page_ocr_text}

STEP 3: Return a JSON object with ALL content from the page split into chunks.

Return ONLY this JSON structure (no markdown fences, no commentary):
{{
  "page_number": {page_number},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "<type_enum>",
      "content_en": "English content or description",
      "content_pt_br": "Conteúdo em português brasileiro",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.90,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null,
      "image_description_en": null,
      "image_description_pt_br": null,
      "extracted_text": null
    }}
  ]
}}

Chunk type enum (use ONLY these):
- letterhead: agency/org header at top
- classification_banner: TOP SECRET/SECRET/CONFIDENTIAL/UNCLASSIFIED banners
- date_line: date of document
- to_from_line: TO:/FROM:/VIA: address lines
- subject_line: RE:/SUBJECT: lines
- paragraph: body text paragraph
- section_header: bold/underlined section title
- list_item: numbered or bulleted item
- redaction_block: blacked-out or whited-out region
- signature_block: signature/name/title at bottom
- image: photograph, diagram, sketch, stamp, seal
- table_marker: table content
- page_number: page number indicator
- footnote: footnote or endnote
- handwriting: handwritten annotation
- form_field: form label+value pairs
- blank: empty/whitespace page or region

Rules:
1. bbox values are NORMALIZED [0..1] (x=left, y=top, w=width, h=height)
2. Every visible region must be a chunk
3. For redaction_block: estimate redacted content type in redaction_inferred_content_type
4. For image chunks: provide detailed image_description_en AND image_description_pt_br
5. classification: extract from banners (e.g. "TOP SECRET") or null
6. formatting: array from: ["bold","italic","underline","all_caps","centered","right_aligned"]
7. cross_page_hint: "self_contained"|"continues_to_next"|"continues_from_prev"|"continues_both"
8. If blank page: one chunk of type "blank"
9. content_en: verbatim text (EN) or description; content_pt_br: PT-BR translation
10. ufo_anomaly_detected: true ONLY if page shows unidentified aerial phenomenon evidence
11. Output ONLY valid JSON, nothing else
"""

# Prompt template for the second pass over cropped image chunks. It is filled
# with str.format() in analyze_image(), which supplies only image_path; the
# literal braces of the JSON example are doubled for that reason. The keys of
# the JSON example are the ones analyze_image() copies back onto the chunk.
IMAGE_ANALYST_PROMPT = """You are an image analyst for declassified UAP/UFO document reconstruction.

STEP 1: Use the Read tool to view this cropped image:
{image_path}

STEP 2: Analyze the image carefully.

STEP 3: Return ONLY this JSON (no fences, no commentary):
{{
  "image_description_en": "Detailed description in English",
  "image_description_pt_br": "Descrição detalhada em português brasileiro",
  "image_type": "<type>",
  "extracted_text": "Any text visible in image verbatim, or null",
  "ufo_anomaly_detected": false,
  "ufo_anomaly_type": null,
  "ufo_anomaly_rationale": null,
  "cryptid_anomaly_detected": false,
  "cryptid_anomaly_type": null,
  "cryptid_anomaly_rationale": null
}}

image_type enum: photograph|diagram|sketch|map|chart|seal|stamp|signature|redacted_region|form|other
ufo_anomaly_detected: true ONLY if image shows craft/object/phenomenon that appears to be UAP
cryptid_anomaly_detected: true ONLY if image shows anomalous/non-human entity
Return ONLY valid JSON.
"""


def extract_json(text: str) -> dict:
    """Extract the first JSON object embedded in claude CLI output.

    The text may carry markdown fences or commentary around the object;
    everything before the first ``{`` and after the object's closing brace
    is ignored.

    Decoding is delegated to ``json.JSONDecoder.raw_decode`` instead of
    counting braces by hand: a hand-rolled counter miscounts braces that
    appear inside JSON string values (e.g. ``{"a": "}"}``) and then rejects
    otherwise valid output.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the text contains no ``{`` at all, or if the text
            starting at the first ``{`` is not a complete, valid JSON object
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    text = text.strip()
    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON object found")
    # Only the first "{" is tried on purpose: falling back to a later brace
    # could silently return a nested object when the outer one is truncated.
    obj, _end = json.JSONDecoder().raw_decode(text, start)
    return obj


def call_claude(prompt: str, png_dir: Path, timeout: int = 180) -> dict:
    """Run the claude CLI once with *prompt* and return the JSON it produced.

    The CLI is invoked non-interactively with only the Read tool allowed and
    *png_dir* added as an accessible directory. Its stdout is parsed as a
    JSON envelope; the envelope's ``result`` text is then handed to
    ``extract_json``.

    Raises:
        RuntimeError: If the process exits non-zero or the envelope has a
            truthy ``is_error``.
        subprocess.TimeoutExpired: If the process exceeds *timeout* seconds.
        ValueError: If stdout or the result text is not decodable JSON.
    """
    argv = ["claude", "-p"]
    argv += ["--model", "haiku"]
    argv += ["--output-format", "json"]
    argv += ["--max-turns", "3"]
    argv += ["--allowedTools", "Read"]
    argv += ["--add-dir", str(png_dir)]
    # "--" ends option parsing so the prompt is always taken as a positional.
    argv += ["--", prompt]

    completed = subprocess.run(
        argv,
        capture_output=True,
        text=True,
        timeout=timeout,
        check=False,
    )
    if completed.returncode != 0:
        raise RuntimeError(f"claude CLI failed rc={completed.returncode}: {completed.stderr[-1000:]}")

    envelope = json.loads(completed.stdout)
    if envelope.get("is_error"):
        raise RuntimeError(f"claude error: {envelope.get('result', '')[:500]}")

    return extract_json(envelope.get("result", ""))


def get_page_list():
    """Return ``(page_number, png_path)`` tuples for every ``p-*.png`` page.

    Page numbers are 1-based positions in the sorted file list, not values
    parsed from the file names.
    NOTE(review): the sort is the default path ordering, so correct page
    order presumably relies on zero-padded file names — confirm.
    """
    ordered_pngs = sorted(PNG_DIR.glob("p-*.png"))
    return list(enumerate(ordered_pngs, start=1))


def load_ocr(png_path: Path) -> str:
    """Return the stripped OCR text for a page, or ``""`` when unusable.

    The OCR file is looked up in ``OCR_DIR`` under the PNG's stem with a
    ``.txt`` suffix. A missing file, or text of two characters or fewer
    after stripping, yields the empty string.
    """
    ocr_file = OCR_DIR / (png_path.stem + ".txt")
    if not ocr_file.exists():
        return ""
    contents = ocr_file.read_text(encoding="utf-8").strip()
    if len(contents) <= 2:
        return ""
    return contents


def process_page(page_number: int, png_path: Path, total_pages: int) -> dict:
    """Rebuild one page through the claude CLI, with retries.

    Up to three attempts are made, sleeping 1s then 2s between them. Any
    exception from an attempt is logged and triggers the next attempt. When
    every attempt fails, a placeholder result holding a single full-page
    ``blank`` chunk that describes the error is returned instead of raising.
    """
    page_ocr = load_ocr(png_path) or "(no OCR available)"
    filled_prompt = PAGE_REBUILDER_PROMPT.format(
        page_png_path=str(png_path),
        page_ocr_text=page_ocr,
        page_number=page_number,
        total_pages=total_pages,
        doc_title=DOC_TITLE,
        doc_id=DOC_ID,
    )

    max_attempts = 3
    attempt_no = 0
    while attempt_no < max_attempts:
        attempt_no += 1
        try:
            page_result = call_claude(filled_prompt, png_path.parent, timeout=180)
            n_chunks = len(page_result.get("chunks", []))
            safe_print(f"  [OK] p{page_number:03d}: {n_chunks} chunks")
            return page_result
        except Exception as exc:
            safe_print(f"  [ERR] p{page_number:03d} attempt {attempt_no}: {str(exc)[:200]}")
            if attempt_no < max_attempts:
                # Exponential backoff: 1s after the first failure, 2s after the second.
                time.sleep(2 ** (attempt_no - 1))

    # All attempts failed: emit a placeholder chunk so the page still appears
    # in the output.
    placeholder = {
        "order_in_page": 1,
        "type": "blank",
        "content_en": f"[Page {page_number} — processing error]",
        "content_pt_br": f"[Página {page_number} — erro de processamento]",
        "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
        "classification": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "ocr_confidence": 0.0,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
        "image_description_en": None,
        "image_description_pt_br": None,
        "extracted_text": None,
    }
    return {"page_number": page_number, "chunks": [placeholder]}


def global_number_chunks(all_page_results: dict) -> list:
    """Flatten per-page results and assign document-wide chunk IDs.

    Pages are visited in ascending page-number order and, within a page,
    chunks are ordered by ``order_in_page`` (stable, missing treated as 0).
    Each chunk dict is mutated in place to gain ``page``, ``chunk_id``
    (``c0001``, ``c0002``, ...), ``order_global``, ``prev_chunk`` and
    ``next_chunk``.

    The page data comes straight from model output, so malformed shapes are
    tolerated rather than aborting the run after all pages were processed:
    a missing/null ``chunks`` value counts as no chunks, non-dict entries are
    skipped, and a null or non-numeric ``order_in_page`` sorts as 0.

    Args:
        all_page_results: Mapping of page number to that page's result dict.

    Returns:
        The flat, globally ordered list of (mutated) chunk dicts.
    """
    def order_key(chunk: dict) -> float:
        try:
            return float(chunk.get("order_in_page", 0))
        except (TypeError, ValueError, OverflowError):
            return 0.0

    chunks_flat = []
    for page_num in sorted(all_page_results):
        page_data = all_page_results[page_num]
        raw_chunks = page_data.get("chunks") if isinstance(page_data, dict) else None
        page_chunks = [c for c in (raw_chunks or []) if isinstance(c, dict)]
        for chunk in sorted(page_chunks, key=order_key):
            chunk["page"] = page_num
            chunks_flat.append(chunk)

    total = len(chunks_flat)
    for position, chunk in enumerate(chunks_flat, start=1):
        chunk["chunk_id"] = f"c{position:04d}"
        chunk["order_global"] = position
        chunk["prev_chunk"] = f"c{position - 1:04d}" if position > 1 else None
        chunk["next_chunk"] = f"c{position + 1:04d}" if position < total else None

    return chunks_flat


def crop_image(chunk: dict, png_path: Path):
    """Crop a chunk's bounding box out of its page PNG and save it.

    The bbox is read as normalized ``x``/``y``/``w``/``h`` fractions of the
    page (a missing or null bbox means the whole page) and is padded by 0.5%
    of the page size on each side, clamped to the image bounds. A degenerate
    box falls back to a region of up to 10x10 pixels anchored at its
    top-left corner.

    Args:
        chunk: Chunk dict; must contain ``chunk_id``.
        png_path: Path of the full-page PNG.

    Returns:
        Path of the written ``IMG-<chunk_id>.png`` inside ``IMAGES_DIR``, or
        ``None`` if the crop failed (the failure is logged, not raised).
    """
    from PIL import Image

    chunk_id = chunk["chunk_id"]
    out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        # bbox comes from model output and may be null; parse it inside the
        # try so a malformed value is reported like any other crop failure.
        bbox = chunk.get("bbox") or {}
        x = bbox.get("x", 0)
        y = bbox.get("y", 0)
        w = bbox.get("w", 1)
        h = bbox.get("h", 1)
        # The context manager closes the underlying file handle, which the
        # bare Image.open() call previously left open.
        with Image.open(png_path) as im:
            W, H = im.size
            pad = 0.005
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                right = min(W, left + 10)
                bottom = min(H, top + 10)
            cropped = im.crop((left, top, right, bottom))
            cropped.save(str(out_path))
        return out_path
    except Exception as e:
        safe_print(f"  [WARN] Crop failed {chunk_id}: {e}")
        return None


def analyze_image(chunk: dict, png_path: Path) -> dict:
    """Crop an image chunk and enrich it with a dedicated vision analysis.

    The chunk is mutated in place and also returned. If the crop cannot be
    produced the chunk is returned untouched. Otherwise up to two analysis
    attempts are made (1s pause between them); on success the analysis
    fields present in the response overwrite the chunk's and
    ``related_image`` is set to the crop's file name.
    NOTE(review): ``related_image`` is only set when the analysis succeeds,
    so a crop whose analysis fails stays on disk but unlinked — confirm this
    is intended.
    """
    crop_file = crop_image(chunk, png_path)
    if not (crop_file and crop_file.exists()):
        return chunk

    filled_prompt = IMAGE_ANALYST_PROMPT.format(image_path=str(crop_file))
    copied_fields = (
        "image_description_en",
        "image_description_pt_br",
        "image_type",
        "extracted_text",
        "ufo_anomaly_detected",
        "ufo_anomaly_type",
        "ufo_anomaly_rationale",
        "cryptid_anomaly_detected",
        "cryptid_anomaly_type",
        "cryptid_anomaly_rationale",
    )

    max_attempts = 2
    for attempt_no in range(1, max_attempts + 1):
        try:
            analysis = call_claude(filled_prompt, crop_file.parent, timeout=120)
            chunk.update({name: analysis[name] for name in copied_fields if name in analysis})
            chunk["related_image"] = f"IMG-{chunk['chunk_id']}.png"
            safe_print(f"  [IMG] {chunk['chunk_id']}: analyzed")
            return chunk
        except Exception as exc:
            safe_print(f"  [WARN] Image analysis {chunk['chunk_id']} attempt {attempt_no}: {str(exc)[:150]}")
            if attempt_no < max_attempts:
                time.sleep(1)
    return chunk


def write_chunk_file(chunk: dict, page_png_map: dict):
    """Write one chunk as ``<chunk_id>.md`` (front matter plus EN/PT-BR body).

    Chunk fields originate from model output, so values that would abort the
    run or leak Python reprs into the file are normalized:
    a null or non-dict ``bbox`` and non-numeric coordinates fall back to the
    defaults (x=0, y=0, w=1, h=0); null ``content_en``/``content_pt_br``
    become empty text instead of the literal ``None``; null anomaly flags
    are written as ``false``; a null ``ocr_confidence`` is written as
    ``null``; and an empty/null ``type`` or ``cross_page_hint`` falls back
    to its default. Well-formed chunks produce the same bytes as before.

    Args:
        chunk: Chunk dict; must contain ``chunk_id``.
        page_png_map: Mapping of page number to the page's PNG path, used
            for the ``source_png`` field (``unknown`` when absent).
    """
    chunk_id = chunk["chunk_id"]
    page = chunk.get("page", 0)
    raw_bbox = chunk.get("bbox")
    bbox = raw_bbox if isinstance(raw_bbox, dict) else {}
    png_path = page_png_map.get(page)
    source_png = f"../../processing/png/{DOC_ID}/{png_path.name}" if png_path else "unknown"

    def jv(v):
        return json.dumps(v, ensure_ascii=False)

    def coord(key, default):
        # Coerce a bbox coordinate to float, using the default when the
        # value is null or not numeric.
        try:
            return float(bbox.get(key, default))
        except (TypeError, ValueError):
            return float(default)

    def flag(key):
        value = chunk.get(key, False)
        return "false" if value is None else str(value).lower()

    def text(key):
        value = chunk.get(key, '')
        return '' if value is None else value

    yaml_lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk.get('type') or 'paragraph'}",
        f"page: {page}",
        f"order_in_page: {chunk.get('order_in_page', 1)}",
        f"order_global: {chunk.get('order_global', 1)}",
        f"bbox: {{x: {coord('x', 0):.3f}, y: {coord('y', 0):.3f}, w: {coord('w', 1):.3f}, h: {coord('h', 0):.3f}}}",
        f"classification: {jv(chunk.get('classification'))}",
        f"formatting: {jv(chunk.get('formatting', []))}",
        f"cross_page_hint: {chunk.get('cross_page_hint') or 'self_contained'}",
        f"prev_chunk: {jv(chunk.get('prev_chunk'))}",
        f"next_chunk: {jv(chunk.get('next_chunk'))}",
        f"related_image: {jv(chunk.get('related_image'))}",
        f"related_table: {jv(chunk.get('related_table'))}",
        f"ocr_confidence: {jv(chunk.get('ocr_confidence', 0.85))}",
        f"ocr_source_lines: {jv(chunk.get('ocr_source_lines', []))}",
        f"redaction_code: {jv(chunk.get('redaction_code'))}",
        f"redaction_inferred_content_type: {jv(chunk.get('redaction_inferred_content_type'))}",
        f"image_type: {jv(chunk.get('image_type'))}",
        f"ufo_anomaly_detected: {flag('ufo_anomaly_detected')}",
        f"cryptid_anomaly_detected: {flag('cryptid_anomaly_detected')}",
        f"ufo_anomaly_type: {jv(chunk.get('ufo_anomaly_type'))}",
        f"ufo_anomaly_rationale: {jv(chunk.get('ufo_anomaly_rationale'))}",
        f"cryptid_anomaly_type: {jv(chunk.get('cryptid_anomaly_type'))}",
        f"cryptid_anomaly_rationale: {jv(chunk.get('cryptid_anomaly_rationale'))}",
        f"image_description_en: {jv(chunk.get('image_description_en'))}",
        f"image_description_pt_br: {jv(chunk.get('image_description_pt_br'))}",
        f"extracted_text: {jv(chunk.get('extracted_text'))}",
        f"source_png: {source_png}",
        "---",
        "",
        f"**EN:** {text('content_en')}",
        "",
        f"**PT-BR:** {text('content_pt_br')}",
        ""
    ]
    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    out_path.write_text("\n".join(yaml_lines), encoding="utf-8")


def write_index(chunks_flat: list, total_pages: int):
    """Write ``_index.json``: document-level build info plus one entry per chunk.

    Each entry records the chunk's id, type, page, ordering, relative file
    path, bbox and an 80-character preview of its English content.
    """
    def summarize(chunk: dict) -> dict:
        cid = chunk["chunk_id"]
        english = chunk.get("content_en", "") or ""
        return {
            "chunk_id": cid,
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{cid}.md",
            "bbox": chunk.get("bbox", {}),
            "preview": english[:80],
        }

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": len(chunks_flat),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": datetime.now(timezone.utc).isoformat(),
        "chunks": [summarize(chunk) for chunk in chunks_flat],
    }
    serialized = json.dumps(index, indent=2, ensure_ascii=False)
    (OUT_DIR / "_index.json").write_text(serialized, encoding="utf-8")


def write_document_md(chunks_flat: list, total_pages: int) -> int:
    """Assemble and write the master ``document.md``.

    The file starts with a front-matter header (totals, chunk-type
    histogram, IDs of chunks flagged as UFO/cryptid anomalies, build info)
    followed by every chunk in global order, grouped under ``## Page N``
    headings. Each chunk section holds its EN/PT-BR text, an image embed and
    descriptions for analyzed image chunks, and the remaining fields as a
    collapsible JSON metadata block.

    Chunk fields originate from model output, so a null or non-dict
    ``bbox`` and non-numeric coordinates fall back to the defaults (x=0,
    y=0, w=1, h=0) instead of raising while formatting, and null
    ``content_en``/``content_pt_br`` render as empty text instead of the
    literal ``None``.

    Args:
        chunks_flat: Globally numbered chunk dicts (each has ``chunk_id``).
        total_pages: Page count recorded in the header.

    Returns:
        Size in bytes of the UTF-8 encoded document that was written.
    """
    def coord(bbox: dict, key: str, default: float) -> float:
        # Coerce a bbox coordinate to float, using the default when the
        # value is null or not numeric.
        try:
            return float(bbox.get(key, default))
        except (TypeError, ValueError):
            return float(default)

    def text(chunk: dict, key: str):
        value = chunk.get(key, '')
        return '' if value is None else value

    type_histogram = {}
    ufo_flagged = []
    cryptid_flagged = []
    for chunk in chunks_flat:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_flagged.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_flagged.append(chunk["chunk_id"])

    now_iso = datetime.now(timezone.utc).isoformat()
    lines = [
        "---",
        "schema_version: \"0.2.0\"",
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f"canonical_title: \"{DOC_TITLE}\"",
        f"total_pages: {total_pages}",
        f"total_chunks: {len(chunks_flat)}",
        f"chunk_types_histogram: {json.dumps(type_histogram, ensure_ascii=False)}",
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_flagged)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_flagged)}",
        "build_approach: \"subagents\"",
        "build_model: claude-haiku-4-5",
        f"build_at: {now_iso}",
        "---",
        ""
    ]

    current_page = None
    for chunk in chunks_flat:
        page = chunk.get("page", 1)
        if page != current_page:
            current_page = page
            lines.append(f"\n## Page {page}\n")

        chunk_id = chunk["chunk_id"]
        ctype = chunk.get("type", "paragraph")
        raw_bbox = chunk.get("bbox")
        bbox = raw_bbox if isinstance(raw_bbox, dict) else {}
        bbox_str = (
            f"{coord(bbox, 'x', 0):.2f}/{coord(bbox, 'y', 0):.2f}"
            f"/{coord(bbox, 'w', 1):.2f}/{coord(bbox, 'h', 0):.2f}"
        )

        lines.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
        lines.append(f"<a id=\"{chunk_id}\"></a>")
        lines.append(f"### Chunk {chunk_id} — {ctype} · p{page} · bbox: {bbox_str}")
        lines.append("")
        lines.append(f"**EN:** {text(chunk, 'content_en')}")
        lines.append("")
        lines.append(f"**PT-BR:** {text(chunk, 'content_pt_br')}")
        lines.append("")

        if ctype == "image" and chunk.get("related_image"):
            lines.append(f"![{chunk_id} image](./images/{chunk.get('related_image')})")
            lines.append("")
            if chunk.get("image_description_en"):
                lines.append(f"**Image Description (EN):** {chunk['image_description_en']}")
                lines.append("")
            if chunk.get("image_description_pt_br"):
                lines.append(f"**Descrição da Imagem (PT-BR):** {chunk['image_description_pt_br']}")
                lines.append("")

        # Everything except the two body texts goes into the metadata block.
        meta = {k: v for k, v in chunk.items() if k not in ("content_en", "content_pt_br")}
        lines.append("<details><summary>metadata</summary>")
        lines.append("")
        lines.append("```json")
        lines.append(json.dumps(meta, indent=2, ensure_ascii=False))
        lines.append("```")
        lines.append("")
        lines.append("</details>")
        lines.append("")
        lines.append("---")
        lines.append("")

    content = "\n".join(lines)
    (OUT_DIR / "document.md").write_text(content, encoding="utf-8")
    return len(content.encode("utf-8"))


def main():
    """Run the full rebuild: pages, global numbering, images, then outputs.

    Pages are processed five at a time on a thread pool, the resulting
    chunks are numbered across the document, image chunks are cropped and
    analyzed five at a time, and finally the chunk files, ``_index.json``
    and ``document.md`` are written and two summary lines are printed.
    """
    started_at = time.time()
    pages = get_page_list()
    total_pages = len(pages)
    page_png_map = dict(pages)
    safe_print(f"Processing {total_pages} pages for {DOC_ID}")

    # Page pass: batches of 5, each batch on its own 5-worker pool.
    batch_size = 5
    all_page_results = {}
    batches = [pages[lo:lo + batch_size] for lo in range(0, total_pages, batch_size)]
    n_batches = len(batches)

    for batch_no, batch in enumerate(batches, start=1):
        page_nums = [pnum for pnum, _ in batch]
        safe_print(f"Batch {batch_no}/{n_batches}: pages {page_nums}")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            pending = {}
            for pnum, ppath in batch:
                job = executor.submit(process_page, pnum, ppath, total_pages)
                pending[job] = pnum
            for done in concurrent.futures.as_completed(pending):
                pnum = pending[done]
                try:
                    all_page_results[pnum] = done.result()
                except Exception as exc:
                    safe_print(f"  [FATAL] Page {pnum}: {exc}")
                    placeholder = {
                        "order_in_page": 1,
                        "type": "blank",
                        "content_en": f"[Page {pnum} — fatal error]",
                        "content_pt_br": f"[Página {pnum} — erro fatal]",
                        "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
                        "classification": None,
                        "formatting": [],
                        "cross_page_hint": "self_contained",
                        "ocr_confidence": 0.0,
                        "ocr_source_lines": [],
                        "redaction_code": None,
                        "redaction_inferred_content_type": None,
                        "image_type": None,
                        "ufo_anomaly_detected": False,
                        "ufo_anomaly_type": None,
                        "ufo_anomaly_rationale": None,
                        "cryptid_anomaly_detected": False,
                        "cryptid_anomaly_type": None,
                        "cryptid_anomaly_rationale": None,
                        "image_description_en": None,
                        "image_description_pt_br": None,
                        "extracted_text": None,
                    }
                    all_page_results[pnum] = {"page_number": pnum, "chunks": [placeholder]}

    safe_print("\nAll pages processed. Numbering chunks globally...")
    chunks_flat = global_number_chunks(all_page_results)
    total_chunks = len(chunks_flat)
    safe_print(f"Total chunks: {total_chunks}")

    # Image pass: crop and analyze image chunks, again in batches of 5.
    image_chunks = [c for c in chunks_flat if c.get("type") == "image"]
    safe_print(f"\nProcessing {len(image_chunks)} image chunks...")
    img_batches = [image_chunks[lo:lo + 5] for lo in range(0, len(image_chunks), 5)]
    n_img_batches = len(img_batches)
    for img_batch_no, img_batch in enumerate(img_batches, start=1):
        safe_print(f"Image batch {img_batch_no}/{n_img_batches}")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            pending = {}
            for chunk in img_batch:
                png_path = page_png_map.get(chunk.get("page", 1))
                if not png_path:
                    # No source PNG known for this page: nothing to crop.
                    continue
                job = executor.submit(analyze_image, chunk, png_path)
                pending[job] = chunk["chunk_id"]
            for done in concurrent.futures.as_completed(pending):
                try:
                    done.result()
                except Exception as exc:
                    cid = pending[done]
                    safe_print(f"  [ERR] Image {cid}: {exc}")

    # Output pass.
    safe_print("\nWriting chunk files...")
    for chunk in chunks_flat:
        write_chunk_file(chunk, page_png_map)

    safe_print("Writing _index.json...")
    write_index(chunks_flat, total_pages)

    safe_print("Writing document.md...")
    doc_bytes = write_document_md(chunks_flat, total_pages)

    images_count = sum(1 for c in chunks_flat if c.get("type") == "image")
    ufo_count = sum(1 for c in chunks_flat if c.get("ufo_anomaly_detected"))
    cryptid_count = sum(1 for c in chunks_flat if c.get("cryptid_anomaly_detected"))
    wall_seconds = int(time.time() - started_at)

    safe_print(f"\nSTATS pages={total_pages} chunks={total_chunks} images={images_count} tables=0 ufo={ufo_count} cryptid={cryptid_count} doc_md_bytes={doc_bytes}")
    safe_print(f"pages_done={total_pages}, chunks_total={total_chunks}, images_extracted={images_count}, tables_stitched=0, ufo_anomalies={ufo_count}, cryptid_anomalies={cryptid_count}, wall_seconds={wall_seconds}")


# Run the rebuild only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()