# disclosure-bureau/scripts/rebuild_doc65_serial130_resume.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_serial130_resume.py
Resume rebuild for doc-65-hs1-834228961-62-hq-83894-serial-130.

Pages 1-50 already processed (chunks c0001-c0204 exist).
This script:
  Phase A: Process pages 51-91 via claude CLI → write c0205+
  Phase B: Read ALL chunk files → rebuild _index.json + document.md
"""

import os
import sys
import json
import time
import subprocess
import concurrent.futures
import re
from datetime import datetime, timezone
from pathlib import Path

try:
    from PIL import Image as PILImage
    PILLOW_OK = True
except ImportError:
    PILLOW_OK = False

# ── Config ──────────────────────────────────────────────────────────────────
# Identifier of the document being rebuilt; it is also the directory name used
# under each of the png / ocr / raw roots below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-serial-130"
# Human-readable title; interpolated into the page prompt and into the
# frontmatter of document.md.
DOC_TITLE = "HQ Air Defense Command – Unidentified Flying Objects Reports (65-HS1-834228961 / 62-HQ-83894 Serial 130)"
# Inputs: page PNGs and per-page OCR text. Output root for everything written.
PNG_DIR   = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR   = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
OUT_DIR   = Path("/Users/guto/ufo/raw") / DOC_ID
# Output sub-directories; main() creates them if they are missing.
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"
# Executable invoked by run_claude().
CLAUDE_BIN = "/Users/guto/.local/bin/claude"

# Page range and numbering for this resume run. Pages are 1-based here; the
# PNG/OCR files on disk are 0-based (page N lives in p-{N-1:03d}).
# BATCH_SIZE is both the number of pages per batch and the thread-pool size.
# CLAUDE_TIMEOUT is passed to subprocess.run() as its timeout (seconds).
TOTAL_PAGES     = 91
START_PAGE      = 51   # first missing page
FIRST_CHUNK_NUM = 205  # c0205 onwards for new chunks
BATCH_SIZE      = 4
CLAUDE_TIMEOUT  = 150

# ── Helpers ──────────────────────────────────────────────────────────────────
def load_ocr(page_num: int) -> str:
    """Return the OCR text for a 1-based page, capped at 2000 characters.

    The OCR files are 0-based, so page ``N`` is read from ``p-{N-1:03d}.txt``
    inside ``OCR_DIR``. Surrounding whitespace is stripped before truncation.
    An empty string is returned when the file is missing or holds only
    whitespace.
    """
    txt_file = OCR_DIR / f"p-{page_num - 1:03d}.txt"
    if not txt_file.exists():
        return ""
    stripped = txt_file.read_text(encoding="utf-8", errors="replace").strip()
    # Slicing an empty string already yields "", so no separate empty check.
    return stripped[:2000]


# Prompt template sent to the claude CLI for a single page. rebuild_page()
# fills it with str.format() using the fields doc_title, page_num,
# total_pages, doc_id, png_num and ocr_text. Literal JSON braces are doubled
# ({{ }}) so that format() leaves them in place. The PNG path is spelled out
# literally in the template rather than derived from PNG_DIR.
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder agent analyzing a page from a declassified US government document about Unidentified Flying Objects (UFO/UAP) investigations.

Document: {doc_title}
Page: {page_num} of {total_pages}
PNG file: /Users/guto/ufo/processing/png/{doc_id}/p-{png_num:03d}.png

OCR text (may be incomplete):
{ocr_text}

Use the Read tool to read the image at:
/Users/guto/ufo/processing/png/{doc_id}/p-{png_num:03d}.png

Analyze ALL visible content and return ONLY a JSON object (no markdown fences, no extra text):
{{
  "page_number": {page_num},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "letterhead",
      "content_en": "exact transcription or description in English",
      "content_pt_br": "transcrição ou descrição em português brasileiro",
      "bbox": {{"x": 0.00, "y": 0.00, "w": 1.00, "h": 0.10}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }}
  ]
}}

RULES:
- type must be ONE of: cover | letterhead | stamp | header | subheader | paragraph | redaction | signature | image | table_marker | footer | page_number | classification_marking | separator | handwriting | form_field | caption | list_item | annotation | blank | classification_banner | signature_block | redaction_block
- bbox: x,y = top-left corner fraction (0.0-1.0), w,h = width/height fractions (0.0-1.0)
- Split page into logical chunks (letterhead separate from body, stamps separate, etc.)
- For redacted blocks: type=redaction, redaction_code e.g. "(b)(1)", "(b)(3)", "(b)(6)"
- For photos/sketches/diagrams: type=image, set image_type to photo|diagram|sketch|map|chart|signature_block|stamp|seal|other
- cross_page_hint: self_contained | continues_to_next | continues_from_prev
- content_en: verbatim transcription when legible; describe otherwise e.g. "[Stamp: RECEIVED OCT 6 1947]"
- content_pt_br: Brazilian Portuguese translation/description
- ufo_anomaly_detected: true ONLY if page has image/sketch of anomalous aerial object
- Blank pages: one chunk with type=blank
- Return ONLY valid JSON, nothing else"""


def run_claude(prompt: str, timeout: int = CLAUDE_TIMEOUT) -> str:
    """Run the claude CLI once with *prompt* and return its stripped stdout.

    Args:
        prompt: Full prompt text, passed as the last command-line argument.
        timeout: Seconds handed to ``subprocess.run`` as its timeout.

    Returns:
        The captured stdout with surrounding whitespace removed; ``""`` when
        the call timed out; ``"ERROR: <message>"`` when any other exception
        was raised while launching or running the process. The exit status
        and stderr are not inspected.
    """
    command = [
        CLAUDE_BIN, "-p", "--dangerously-skip-permissions",
        "--model", "claude-haiku-4-5",
        "--no-session-persistence",
        prompt,
    ]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout,
            env=dict(os.environ),
        )
    except subprocess.TimeoutExpired:
        return ""
    except Exception as exc:
        return f"ERROR: {exc}"
    return completed.stdout.strip()


def parse_json(raw: str):
    """Extract the first JSON object embedded in *raw*.

    A leading Markdown code fence (and its closing fence, when it is the last
    line) is removed first. Parsing then starts at the first ``{`` and is
    delegated to ``json.JSONDecoder.raw_decode``, which understands string
    context: braces inside string values no longer confuse the extraction,
    and any text following the object is ignored.

    Args:
        raw: Text that is expected to contain one JSON object.

    Returns:
        The decoded object (a ``dict``), or ``None`` when *raw* is empty, has
        no ``{``, or the text at the first ``{`` is not a valid JSON object.
    """
    if not raw:
        return None
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line (e.g. ```json) and a trailing ``` line.
        lines = text.split("\n")[1:]
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        text = "\n".join(lines).strip()
    start = text.find("{")
    if start == -1:
        return None
    try:
        # raw_decode stops at the end of the first complete JSON value.
        obj, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return None
    return obj


def rebuild_page(page_num: int) -> dict:
    """Ask the claude CLI to rebuild one page and return its chunk list.

    The page is attempted up to three times. An attempt fails when the CLI
    returns nothing or an ``ERROR:`` string (back-off of 5s, then 10s), or
    when the reply does not contain a usable ``chunks`` list (3s pause).
    Entries of ``chunks`` that are not JSON objects are discarded, and a reply
    with no usable entry counts as a failed attempt, so that malformed model
    output cannot raise inside the worker thread.

    Args:
        page_num: 1-based page number.

    Returns:
        A dict with ``page_number``, ``png_num`` (0-based) and ``chunks``.
        Every chunk has ``order_in_page`` renumbered from 1 and ``page`` set
        to *page_num*. When all attempts fail, a single placeholder chunk of
        type ``blank`` is returned instead.
    """
    png_num = page_num - 1  # PNG/OCR files are 0-indexed
    ocr_text = load_ocr(page_num)
    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_num=page_num,
        total_pages=TOTAL_PAGES,
        doc_id=DOC_ID,
        png_num=png_num,
        ocr_text=ocr_text or "(no OCR available)"
    )
    max_attempts = 3
    for attempt in range(max_attempts):
        last_attempt = attempt == max_attempts - 1
        raw = run_claude(prompt, timeout=CLAUDE_TIMEOUT)
        if not raw or raw.startswith("ERROR:"):
            if last_attempt:
                print(f"  [FAIL] page {page_num}: {raw[:200] or 'no output from claude'}", flush=True)
                break
            time.sleep(5 * (attempt + 1))
            continue

        data = parse_json(raw)
        chunks = data.get("chunks") if isinstance(data, dict) else None
        if isinstance(chunks, list):
            # Keep only object entries; anything else cannot carry chunk fields.
            chunks = [c for c in chunks if isinstance(c, dict)]
        else:
            chunks = None

        if chunks:
            data["chunks"] = chunks
            data["page_number"] = page_num
            data["png_num"] = png_num
            for i, ch in enumerate(chunks):
                ch["order_in_page"] = i + 1
                ch["page"] = page_num
            print(f"  [OK] page {page_num:03d} → {len(chunks)} chunks", flush=True)
            return data

        if not last_attempt:
            print(f"  [RETRY {attempt+1}] page {page_num}: bad JSON", flush=True)
            time.sleep(3)
        else:
            print(f"  [FAIL] page {page_num}: {raw[:200]}", flush=True)

    # Fallback: a single placeholder chunk so the page still appears downstream.
    return {
        "page_number": page_num, "png_num": page_num - 1,
        "chunks": [{
            "order_in_page": 1, "type": "blank", "page": page_num,
            "content_en": "[Page processing failed]",
            "content_pt_br": "[Falha no processamento da página]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None, "formatting": [],
            "cross_page_hint": "self_contained", "ocr_confidence": 0.0,
            "ocr_source_lines": [], "redaction_code": None,
            "redaction_inferred_content_type": None, "image_type": None,
            "ufo_anomaly_detected": False, "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None, "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None
        }]
    }


# Characters that make yv() wrap a value in double quotes.
_YAML_QUOTE_TRIGGERS = ":{}[],#&*?|<>=!%@`"


def yv(v):
    """Format *v* as a single-line scalar for the chunk frontmatter.

    ``None`` becomes ``null`` and booleans become ``true`` / ``false``. Any
    other value is converted with ``str()``. Line breaks (with the whitespace
    around them) are folded into one space so that a value can never spill
    onto a second frontmatter line, where it could be read as another key or
    as the closing ``---``. The result is wrapped in double quotes when it
    contains one of the trigger characters.

    NOTE: embedded double quotes and backslashes are emitted as-is (no
    escaping); parse_frontmatter() in this file strips only the outer quotes.
    """
    if v is None:
        return "null"
    if isinstance(v, bool):
        return str(v).lower()
    s = str(v)
    if "\n" in s or "\r" in s:
        s = re.sub(r"\s*[\r\n]+\s*", " ", s).strip()
    if any(c in s for c in _YAML_QUOTE_TRIGGERS):
        return f'"{s}"'
    return s


def _coerce_flag(value) -> str:
    """Render a model-supplied flag as the literal ``true`` or ``false``."""
    if isinstance(value, str):
        return "true" if value.strip().lower() in ("true", "yes", "1") else "false"
    return "true" if value else "false"


def _bbox_fraction(bbox, key: str, default: float) -> float:
    """Return ``bbox[key]`` as a float, or *default* when it is unusable."""
    try:
        return float(bbox.get(key, default))
    except (AttributeError, TypeError, ValueError):
        # bbox is not a mapping, or the value is null / not numeric.
        return default


def write_chunk_file(chunk: dict):
    """Write one chunk as ``<chunk_id>.md`` (frontmatter + EN/PT-BR body).

    The file is written to ``CHUNKS_DIR``. Model-supplied fields are
    normalised so that malformed values cannot abort the run or change the
    meaning of the frontmatter:

    * a missing/null ``bbox`` or non-numeric bbox values fall back to the
      defaults ``x=0, y=0, w=1, h=0.1``;
    * the anomaly flags are always written as ``true`` / ``false`` (a null
      flag used to be written as ``none``, which reads back as a truthy
      string);
    * a null ``ocr_confidence`` is written as ``null``;
    * ``formatting`` items and string items of ``ocr_source_lines`` are
      JSON-quoted;
    * null ``content_en`` / ``content_pt_br`` / ``cross_page_hint`` fall back
      to their defaults instead of the text ``None``.

    Args:
        chunk: Chunk dict; ``chunk_id`` is required, everything else optional.
    """
    chunk_id = chunk["chunk_id"]
    page = chunk.get("page", 1)
    png_num = chunk.get("png_num", page - 1)
    ctype = chunk.get("type", "paragraph")

    bbox = chunk.get("bbox")
    bx = _bbox_fraction(bbox, "x", 0.0)
    by = _bbox_fraction(bbox, "y", 0.0)
    bw = _bbox_fraction(bbox, "w", 1.0)
    bh = _bbox_fraction(bbox, "h", 0.1)

    fmt_list = chunk.get("formatting") or []
    if not isinstance(fmt_list, (list, tuple)):
        fmt_list = [fmt_list]
    fmt_str = "[" + ", ".join(json.dumps(str(f), ensure_ascii=False) for f in fmt_list) + "]"

    ocr_lines = chunk.get("ocr_source_lines") or []
    if not isinstance(ocr_lines, (list, tuple)):
        ocr_lines = [ocr_lines]
    ocr_lines_str = "[" + ", ".join(
        json.dumps(item, ensure_ascii=False) if isinstance(item, str) else str(item)
        for item in ocr_lines
    ) + "]"

    related_image = f"IMG-{chunk_id}.png" if ctype == "image" else "null"
    cross_page_hint = chunk.get("cross_page_hint") or "self_contained"
    ocr_confidence = yv(chunk.get("ocr_confidence", 0.85))
    ufo_flag = _coerce_flag(chunk.get("ufo_anomaly_detected", False))
    cryptid_flag = _coerce_flag(chunk.get("cryptid_anomaly_detected", False))
    content_en = chunk.get("content_en") or ""
    content_pt_br = chunk.get("content_pt_br") or ""

    content = f"""---
chunk_id: {chunk_id}
type: {ctype}
page: {page}
order_in_page: {chunk.get("order_in_page", 1)}
order_global: {chunk.get("order_global", 1)}
bbox: {{x: {bx:.3f}, y: {by:.3f}, w: {bw:.3f}, h: {bh:.3f}}}
classification: {yv(chunk.get("classification"))}
formatting: {fmt_str}
cross_page_hint: {cross_page_hint}
prev_chunk: {chunk.get("prev_chunk") or "null"}
next_chunk: {chunk.get("next_chunk") or "null"}
related_image: {related_image}
related_table: null
ocr_confidence: {ocr_confidence}
ocr_source_lines: {ocr_lines_str}
redaction_code: {yv(chunk.get("redaction_code"))}
redaction_inferred_content_type: {yv(chunk.get("redaction_inferred_content_type"))}
image_type: {yv(chunk.get("image_type"))}
ufo_anomaly_detected: {ufo_flag}
cryptid_anomaly_detected: {cryptid_flag}
ufo_anomaly_type: {yv(chunk.get("ufo_anomaly_type"))}
ufo_anomaly_rationale: {yv(chunk.get("ufo_anomaly_rationale"))}
cryptid_anomaly_type: {yv(chunk.get("cryptid_anomaly_type"))}
cryptid_anomaly_rationale: {yv(chunk.get("cryptid_anomaly_rationale"))}
image_description_en: {yv(chunk.get("image_description_en"))}
image_description_pt_br: {yv(chunk.get("image_description_pt_br"))}
extracted_text: {yv(chunk.get("extracted_text"))}
source_png: ../../processing/png/{DOC_ID}/p-{png_num:03d}.png
---

**EN:** {content_en}

**PT-BR:** {content_pt_br}
"""
    (CHUNKS_DIR / f"{chunk_id}.md").write_text(content, encoding="utf-8")


def crop_image(chunk: dict):
    """Crop the chunk's bounding box out of its page PNG.

    The crop is saved as ``IMAGES_DIR / IMG-<chunk_id>.png``. The bbox is
    given as page fractions; it is clamped to the page, given a minimum size
    of 1% per side and padded by 0.5% before cropping. The source image is
    opened in a ``with`` block so its file handle is released even when the
    crop or save fails.

    Nothing is written when Pillow is unavailable or the page PNG is missing;
    this is reported as ``[CROP SKIP]`` because the generated markdown still
    links the image. Any error during cropping is reported as ``[CROP FAIL]``
    and otherwise ignored.

    Args:
        chunk: Chunk dict with ``chunk_id`` and, optionally, ``png_num`` /
            ``page`` and ``bbox``.
    """
    chunk_id = chunk["chunk_id"]
    png_num = chunk.get("png_num", chunk.get("page", 1) - 1)
    bbox = chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1})
    src = PNG_DIR / f"p-{png_num:03d}.png"
    dst = IMAGES_DIR / f"IMG-{chunk_id}.png"
    if not PILLOW_OK or not src.exists():
        reason = "Pillow not installed" if not PILLOW_OK else f"missing {src.name}"
        print(f"    [CROP SKIP] {chunk_id}: {reason}", flush=True)
        return
    try:
        with PILImage.open(src) as im:
            W, H = im.size
            x = max(0.0, min(1.0, bbox.get("x", 0.0)))
            y = max(0.0, min(1.0, bbox.get("y", 0.0)))
            w = max(0.01, min(1.0 - x, bbox.get("w", 1.0)))
            h = max(0.01, min(1.0 - y, bbox.get("h", 0.1)))
            pad = 0.005
            left   = max(0, int((x - pad) * W))
            top    = max(0, int((y - pad) * H))
            right  = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            im.crop((left, top, right, bottom)).save(str(dst))
        print(f"    [CROP] {chunk_id}", flush=True)
    except Exception as e:
        # Best effort: a failed crop must not stop the rebuild.
        print(f"    [CROP FAIL] {chunk_id}: {e}", flush=True)


def _frontmatter_scalar(val: str):
    """Convert one frontmatter value string to a Python value.

    Recognises ``null``, ``true`` / ``false``, integers, decimal numbers,
    JSON-compatible flow lists (``[...]``) and double-quoted strings (outer
    quotes stripped). Anything else is returned unchanged.
    """
    if val == "null":
        return None
    if val == "true":
        return True
    if val == "false":
        return False
    try:
        return int(val)
    except ValueError:
        pass
    if re.fullmatch(r"[-+]?(?:\d+\.\d*|\.\d+)(?:[eE][-+]?\d+)?", val):
        return float(val)
    if val.startswith("[") and val.endswith("]"):
        try:
            return json.loads(val)
        except ValueError:
            return val  # not JSON-compatible; keep the raw text
    if val.startswith('"') and val.endswith('"'):
        return val[1:-1]
    return val


def parse_frontmatter(path: Path) -> dict:
    """Read the frontmatter and body of a chunk ``.md`` file.

    The frontmatter is read line by line as ``key: value`` pairs (this is not
    a full YAML parser). Numbers come back as ``int`` / ``float`` and flow
    lists as ``list``, so values keep their type when re-serialised. ``bbox``
    is returned as a dict of four floats; when the bbox line is missing or
    cannot be parsed the key is omitted, so callers fall back to their own
    default. The body's ``**EN:**`` and ``**PT-BR:**`` sections are returned
    as ``content_en`` and ``content_pt_br``.

    Args:
        path: Chunk file to read.

    Returns:
        Dict of parsed fields, or ``{}`` when the file does not start with a
        ``---`` frontmatter block closed by a ``---`` line.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    if not text.startswith("---"):
        return {}
    end = text.find("\n---\n", 3)
    if end == -1:
        return {}
    fm_text = text[3:end]

    data = {}
    for line in fm_text.split("\n"):
        m = re.match(r'^(\w+):\s*(.*)', line)
        if not m:
            continue
        data[m.group(1)] = _frontmatter_scalar(m.group(2).strip())

    # bbox is an inline mapping; parse it from the frontmatter only, and
    # accept signed values such as "-0.000" produced by ".3f" formatting.
    num = r'([-+]?[\d.]+)'
    bbox_m = re.search(
        r'^bbox:\s*\{x:\s*' + num + r',\s*y:\s*' + num
        + r',\s*w:\s*' + num + r',\s*h:\s*' + num + r'\}',
        fm_text, re.MULTILINE,
    )
    bbox = None
    if bbox_m:
        try:
            bbox = dict(zip("xywh", (float(g) for g in bbox_m.groups())))
        except ValueError:
            bbox = None
    if bbox is not None:
        data["bbox"] = bbox
    else:
        # Never leave the raw "{x: ...}" text behind under the bbox key.
        data.pop("bbox", None)

    # Extract body content
    body = text[end + 5:].strip()
    en_m = re.search(r'\*\*EN:\*\*\s*(.*?)(?=\n\n\*\*PT-BR:|$)', body, re.DOTALL)
    ptbr_m = re.search(r'\*\*PT-BR:\*\*\s*(.*?)$', body, re.DOTALL)
    data["content_en"] = en_m.group(1).strip() if en_m else ""
    data["content_pt_br"] = ptbr_m.group(1).strip() if ptbr_m else ""
    return data


def build_assembly(all_chunks: list, build_at: str):
    """Write ``_index.json`` and ``document.md`` from *all_chunks*.

    Both files are written into ``OUT_DIR``. ``_index.json`` lists every chunk
    with its position, bbox and an 80-character preview of the English text.
    ``document.md`` starts with a frontmatter summary (type histogram, flagged
    anomalies) followed by one section per page with each chunk's EN / PT-BR
    text, image link (for image chunks) and a collapsible metadata block.

    A chunk whose ``bbox`` is missing or is not a dict is rendered with the
    default box ``x=0, y=0, w=1, h=0.1`` instead of raising, so one malformed
    chunk file cannot leave ``document.md`` unwritten after ``_index.json``
    has already been replaced.

    Args:
        all_chunks: Chunk dicts in global order; each needs ``chunk_id``.
        build_at: Timestamp string recorded in both output files.

    Returns:
        Tuple ``(images_extracted, ufo_flagged, cryptid_flagged)``: the number
        of chunks of type ``image`` and the chunk ids flagged as UFO /
        cryptid anomalies.
    """
    def bbox_of(chunk: dict) -> dict:
        box = chunk.get("bbox")
        if isinstance(box, dict):
            return box
        return {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}

    type_histogram = {}
    for chunk in all_chunks:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1

    ufo_flagged     = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_flagged = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]
    images_extracted = sum(1 for c in all_chunks if c.get("type") == "image")

    # _index.json
    index_chunks = []
    for chunk in all_chunks:
        content_en = chunk.get("content_en") or ""
        preview = content_en[:80] + ("..." if len(content_en) > 80 else "")
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": bbox_of(chunk),
            "preview": preview
        })

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": index_chunks
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    print(f"  Wrote _index.json ({len(all_chunks)} chunks)", flush=True)

    # document.md
    histogram_yaml = "\n".join(f"  {k}: {v}" for k, v in sorted(type_histogram.items()))

    def list_yaml(items):
        return "  []" if not items else "\n".join(f"  - {i}" for i in items)

    doc_parts = [f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {TOTAL_PAGES}
total_chunks: {len(all_chunks)}
chunk_types_histogram:
{histogram_yaml}
multi_page_tables: []
ufo_anomalies_flagged:
{list_yaml(ufo_flagged)}
cryptid_anomalies_flagged:
{list_yaml(cryptid_flagged)}
build_approach: "subagents"
build_model: "claude-haiku-4-5"
build_at: "{build_at}"
---
"""]

    chunks_by_page: dict = {}
    for chunk in all_chunks:
        p = chunk.get("page", 1)
        chunks_by_page.setdefault(p, []).append(chunk)

    for page_seq in sorted(chunks_by_page.keys()):
        png_num = page_seq - 1  # source PNGs are 0-indexed
        doc_parts.append(f"\n## Page {page_seq} (source: p-{png_num:03d}.png)\n")
        for chunk in sorted(chunks_by_page[page_seq], key=lambda c: c.get("order_in_page", 1)):
            chunk_id = chunk["chunk_id"]
            ctype    = chunk.get("type", "paragraph")
            bbox     = bbox_of(chunk)
            bbox_str = f"{bbox.get('x',0):.2f}/{bbox.get('y',0):.2f}/{bbox.get('w',1):.2f}/{bbox.get('h',0.1):.2f}"

            doc_parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->\n")
            doc_parts.append(f'<a id="{chunk_id}"></a>\n')
            doc_parts.append(f"### Chunk {chunk_id} — {ctype} · p{page_seq} · bbox: {bbox_str}\n\n")
            doc_parts.append(f"**EN:** {chunk.get('content_en', '')}\n\n")
            doc_parts.append(f"**PT-BR:** {chunk.get('content_pt_br', '')}\n\n")

            if ctype == "image":
                doc_parts.append(f"![{chunk_id} image](./images/IMG-{chunk_id}.png)\n\n")
                if chunk.get("image_description_en"):
                    doc_parts.append(f"**Image Description (EN):** {chunk['image_description_en']}\n\n")
                if chunk.get("image_description_pt_br"):
                    doc_parts.append(f"**Descrição da Imagem (PT-BR):** {chunk['image_description_pt_br']}\n\n")

            meta = {k: chunk.get(k) for k in [
                "chunk_id", "type", "page", "order_in_page", "order_global",
                "bbox", "classification", "formatting", "cross_page_hint",
                "prev_chunk", "next_chunk", "ocr_confidence", "redaction_code",
                "image_type", "ufo_anomaly_detected", "cryptid_anomaly_detected",
                "ufo_anomaly_type", "ufo_anomaly_rationale",
            ]}
            meta_json = json.dumps(meta, indent=2, ensure_ascii=False)
            doc_parts.append(
                f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n"
            )

    doc_md = "".join(doc_parts)
    (OUT_DIR / "document.md").write_text(doc_md, encoding="utf-8")
    print(f"  Wrote document.md ({len(doc_md):,} chars)", flush=True)

    return images_extracted, ufo_flagged, cryptid_flagged


def _link_resume_boundary(new_chunks: list) -> None:
    """Connect the first new chunk to the chunk file that precedes it on disk.

    When ``c{FIRST_CHUNK_NUM - 1}`` exists in ``CHUNKS_DIR``, the first new
    chunk gets it as ``prev_chunk``, and that file's ``next_chunk: null``
    frontmatter line (if present) is rewritten to point at the first new
    chunk. A predecessor that already names a successor is left untouched.
    """
    if not new_chunks:
        return
    prev_id = f"c{FIRST_CHUNK_NUM - 1:04d}"
    prev_path = CHUNKS_DIR / f"{prev_id}.md"
    if not prev_path.exists():
        return
    first_id = new_chunks[0]["chunk_id"]
    new_chunks[0]["prev_chunk"] = prev_id

    try:
        text = prev_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        print(f"  [WARN] could not update next_chunk of {prev_id}: {e}", flush=True)
        return
    head_end = text.find("\n---\n", 3)
    if not text.startswith("---") or head_end == -1:
        return
    # Only touch the frontmatter, never the body text.
    head, tail = text[:head_end], text[head_end:]
    patched = re.sub(r"(?m)^next_chunk: null$", f"next_chunk: {first_id}", head, count=1)
    if patched != head:
        prev_path.write_text(patched + tail, encoding="utf-8")
        print(f"  Linked {prev_id} → {first_id}", flush=True)


def main():
    """Resume the rebuild: process the missing pages, then reassemble all.

    Phase A runs rebuild_page() for pages ``START_PAGE``..``TOTAL_PAGES`` in
    batches of ``BATCH_SIZE`` threads, numbers the resulting chunks from
    ``FIRST_CHUNK_NUM``, links them to each other and to the last chunk
    already on disk, crops image chunks and writes the new chunk files.
    Phase B reads every ``c*.md`` file in ``CHUNKS_DIR``, orders and re-links
    the chunks in memory and rebuilds ``_index.json`` and ``document.md``.
    A one-line summary is printed at the end.
    """
    t_start = time.time()
    CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    TABLES_DIR.mkdir(parents=True, exist_ok=True)

    print(f"=== Phase A: Process pages {START_PAGE}-{TOTAL_PAGES} via claude CLI ===", flush=True)

    pages_to_process = list(range(START_PAGE, TOTAL_PAGES + 1))
    new_page_results: dict = {}

    for batch_start in range(0, len(pages_to_process), BATCH_SIZE):
        batch = pages_to_process[batch_start:batch_start + BATCH_SIZE]
        print(f"  Batch pages {batch[0]}-{batch[-1]}...", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as ex:
            futures = {ex.submit(rebuild_page, p): p for p in batch}
            for fut in concurrent.futures.as_completed(futures):
                result = fut.result()
                new_page_results[result["page_number"]] = result

    # Assign global chunk IDs (continuing from the last existing chunk)
    print(f"\n=== Phase A2: Numbering new chunks from c{FIRST_CHUNK_NUM:04d} ===", flush=True)
    new_chunks = []
    order_global = FIRST_CHUNK_NUM - 1
    for page_num in sorted(new_page_results.keys()):
        result = new_page_results[page_num]
        png_num = result.get("png_num", page_num - 1)
        for ch in sorted(result.get("chunks", []), key=lambda c: c.get("order_in_page", 0)):
            order_global += 1
            ch["chunk_id"]     = f"c{order_global:04d}"
            ch["order_global"] = order_global
            ch["png_num"]      = png_num
            new_chunks.append(ch)

    # prev/next links among the new chunks
    for i, ch in enumerate(new_chunks):
        ch["prev_chunk"] = new_chunks[i-1]["chunk_id"] if i > 0 else None
        ch["next_chunk"] = new_chunks[i+1]["chunk_id"] if i < len(new_chunks)-1 else None

    # The chunk files are written before Phase B re-links in memory, so the
    # link across the resume boundary has to be stored in the files here.
    _link_resume_boundary(new_chunks)

    print(f"  {len(new_chunks)} new chunks generated", flush=True)

    # Crop images
    image_chunks = [c for c in new_chunks if c.get("type") == "image"]
    if image_chunks:
        print(f"\n=== Phase A3: Cropping {len(image_chunks)} images ===", flush=True)
        for ch in image_chunks:
            crop_image(ch)

    # Write new chunk files
    print(f"\n=== Phase A4: Writing {len(new_chunks)} new chunk files ===", flush=True)
    for ch in new_chunks:
        write_chunk_file(ch)

    # ── Phase B: Read ALL chunks and rebuild assembly ──────────────────────
    print("\n=== Phase B: Reading all chunk files for full assembly ===", flush=True)

    all_chunk_files = sorted(CHUNKS_DIR.glob("c*.md"))
    print(f"  Found {len(all_chunk_files)} total chunk files", flush=True)

    all_chunks = []
    for path in all_chunk_files:
        fm = parse_frontmatter(path)
        if not fm.get("chunk_id"):
            fm["chunk_id"] = path.stem
        all_chunks.append(fm)

    # Sort by order_global
    all_chunks.sort(key=lambda c: (c.get("order_global", 999999), c.get("page", 0), c.get("order_in_page", 0)))

    # Re-link prev/next globally (in memory; used for document.md metadata)
    for i, ch in enumerate(all_chunks):
        ch["prev_chunk"] = all_chunks[i-1]["chunk_id"] if i > 0 else None
        ch["next_chunk"] = all_chunks[i+1]["chunk_id"] if i < len(all_chunks)-1 else None

    print(f"  Total chunks: {len(all_chunks)}", flush=True)

    print("\n=== Phase B2: Building _index.json and document.md ===", flush=True)
    build_at = datetime.now(timezone.utc).isoformat()
    images_extracted, ufo_flagged, cryptid_flagged = build_assembly(all_chunks, build_at)

    t_end = time.time()
    wall_seconds = int(t_end - t_start)

    pages_done   = TOTAL_PAGES
    chunks_total = len(all_chunks)
    tables_stitched = 0

    final = (
        f"pages_done={pages_done}, chunks_total={chunks_total}, "
        f"images_extracted={images_extracted}, tables_stitched={tables_stitched}, "
        f"ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, "
        f"wall_seconds={wall_seconds}"
    )
    print(f"\n=== DONE ===\n{final}", flush=True)


# Run the resume rebuild only when this file is executed as a script.
if __name__ == "__main__":
    main()