# disclosure-bureau/scripts/rebuild_doc65_suba_final.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_suba_final.py
Full rebuild of doc-65-hs1-834228961-62-hq-83894-sub-a
89 pages (p-000 to p-063, p-100 to p-124 PNGs)
Uses Anthropic claude-haiku-4-5 for vision processing.
"""

import os
import sys
import json
import base64
import time
import re
import threading
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from PIL import Image as PILImage
import anthropic

# Identity of the document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-sub-a"
DOC_TITLE = "FBI HQ 62-HQ-83894 Sub A — Flying Saucers / UAP Investigation File"

# Every input and output location hangs off one root and is keyed by DOC_ID.
_UFO_ROOT = Path("/Users/guto/ufo")
PNG_DIR = _UFO_ROOT / "processing" / "png" / DOC_ID   # page images (input)
OCR_DIR = _UFO_ROOT / "processing" / "ocr" / DOC_ID   # per-page OCR text (input)
OUT_DIR = _UFO_ROOT / "raw" / DOC_ID                  # rebuilt document (output)
CHUNKS_DIR = OUT_DIR.joinpath("chunks")
IMAGES_DIR = OUT_DIR.joinpath("images")
TABLES_DIR = OUT_DIR.joinpath("tables")

# Work is submitted BATCH_SIZE items at a time to a pool of MAX_WORKERS threads.
BATCH_SIZE = 4
MAX_WORKERS = 4

# Serializes console output across the worker threads.
_lock = threading.Lock()

def safe_print(*args, **kwargs):
    """Print while holding a module-wide lock so thread output does not interleave.

    Accepts the same positional and keyword arguments as ``print``. Output is
    flushed by default; pass ``flush=False`` explicitly to override that.
    """
    # setdefault rather than a literal ``flush=True`` in the call: the latter
    # raises TypeError (duplicate keyword) when the caller also supplies flush.
    kwargs.setdefault("flush", True)
    with _lock:
        print(*args, **kwargs)

# Create the output tree up front; existing directories are left untouched.
for out_subdir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    out_subdir.mkdir(parents=True, exist_ok=True)

# Shared API client, constructed with no explicit arguments.
client = anthropic.Anthropic()

# Page images, ordered by sorted path. main() enumerates this list, so a page's
# number is its position here (index + 1), not the number in its filename.
png_files = list(PNG_DIR.glob("p-*.png"))
png_files.sort()
TOTAL_PAGES = len(png_files)
safe_print(f"Found {TOTAL_PAGES} PNG pages")


def load_image_b64(path: Path) -> str:
    """Read the file at *path* and return its bytes as a base64 text string."""
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    # b64encode with no alternative characters uses the standard alphabet.
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def load_ocr(png_name: str, ocr_dir: "Path | None" = None, max_chars: int = 3000) -> str:
    """Return the OCR text that accompanies a page PNG, or a placeholder.

    Args:
        png_name: File name of the page image (e.g. ``p-000.png``); the OCR file
            is looked up under the same name with a ``.txt`` extension.
        ocr_dir: Directory holding the OCR files. Defaults to ``OCR_DIR``.
        max_chars: Upper bound on the number of characters returned.

    Returns:
        The stripped OCR text truncated to *max_chars*, or the fixed placeholder
        string when the file is missing, unreadable, or blank.
    """
    base_dir = OCR_DIR if ocr_dir is None else Path(ocr_dir)
    # with_suffix swaps only the trailing extension; str.replace(".png", ".txt")
    # would also rewrite a ".png" that appears earlier in the name.
    ocr_path = base_dir / Path(png_name).with_suffix(".txt")
    try:
        # errors="replace": OCR output is a hint for the model, so a stray bad
        # byte should degrade the text rather than abort the page.
        txt = ocr_path.read_text(encoding="utf-8", errors="replace").strip()
    except OSError:
        # Missing or unreadable file: fall through to the placeholder.
        txt = ""
    if txt:
        return txt[:max_chars]
    return "(no OCR text available — use vision only)"


def extract_json(text: str) -> dict:
    """Extract the first JSON object embedded in a model response.

    Leading/trailing Markdown code fences are stripped, then the object that
    starts at the first ``{`` is decoded. Any text after that object is ignored.

    Args:
        text: Raw response text, possibly wrapped in fences or surrounded by prose.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the text contains no ``{`` at all.
        json.JSONDecodeError: If the text from the first ``{`` onward is not a
            valid (complete) JSON object. This is a ``ValueError`` subclass.
    """
    text = text.strip()
    text = re.sub(r"^```(?:json)?\s*", "", text)
    text = re.sub(r"\s*```\s*$", "", text)
    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON found")
    # raw_decode parses exactly one JSON value and tolerates trailing data.
    # Unlike counting braces by hand, it understands string context, so a "{"
    # or "}" inside a string value cannot cut the object short.
    obj, _end = json.JSONDecoder().raw_decode(text[start:])
    return obj


# Prompt template for per-page chunk extraction. process_page() fills it with
# str.format (doc_title, doc_id, page_number, total_pages, png_filename,
# ocr_text), which is why the literal JSON braces below are doubled: format()
# turns "{{" / "}}" into single braces in the text that is actually sent.
PAGE_PROMPT = """You are a page-rebuilder for a declassified UAP/UFO FBI document archive.

Document: {doc_title}
Doc ID: {doc_id}
Page: {page_number} of {total_pages}
PNG: {png_filename}

OCR text:
---
{ocr_text}
---

Analyze this page image carefully. Extract ALL content as ordered semantic chunks.

Return ONLY valid JSON (no markdown, no fences):

{{
  "page_number": {page_number},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "cover|letterhead|classification_banner|subject_line|salutation|body_paragraph|signature_block|date_line|reference_line|redaction_block|table_marker|image|caption|footer|header|list_item|handwritten_note|stamp|page_number|section_heading|blank",
      "content_en": "verbatim text or description in English",
      "content_pt_br": "tradução em português brasileiro",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }}
  ]
}}

Rules:
1. Every visible region = its own chunk. Do not skip content.
2. For images: set image_type to photograph|diagram|map|sketch|stamp|chart|handwriting|newspaper_clipping|other
3. For redaction_block: set redaction_code to visible FOIA code if shown.
4. For classification banners/stamps: set classification field to exact text.
5. ufo_anomaly_detected=true if content has UAP/UFO sighting details, craft descriptions, anomalous phenomena.
6. cross_page_hint: "self_contained"|"continues_to_next"|"continues_from_prev"
7. bbox: normalized 0.0-1.0 (x=left, y=top, w=width, h=height).
8. formatting: ["bold","italic","all_caps","underline","strikethrough"]
9. Newspaper clippings = type "image", image_type="newspaper_clipping", ufo_anomaly_detected=true if about UFOs.
10. Return ONLY the JSON object, nothing else."""


def fallback_chunk(page_number: int, reason: str) -> dict:
    """Build a placeholder page result for a page that could not be processed.

    The result has the same shape as a successful page: a ``page_number`` and a
    ``chunks`` list, here holding one full-page ``body_paragraph`` chunk whose
    English and Portuguese text carry the first 80 characters of *reason*.
    """
    short_reason = reason[:80]

    placeholder = {
        "order_in_page": 1,
        "type": "body_paragraph",
        "content_en": f"[Page {page_number} - processing failed: {short_reason}]",
        "content_pt_br": f"[Página {page_number} - falha no processamento: {short_reason}]",
        "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
        "classification": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "ocr_confidence": 0.0,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
    }

    page = {"page_number": page_number}
    page["chunks"] = [placeholder]
    return page


def process_page(page_idx: int, png_path: Path) -> dict:
    """Extract ordered chunks from one page image via the vision model.

    Args:
        page_idx: Zero-based position of the page in the sorted PNG list; the
            reported page number is ``page_idx + 1``.
        png_path: Path to the page PNG.

    Returns:
        A page dict with ``page_number``, ``chunks``, ``png_path`` and
        ``png_filename``. When the inputs cannot be read, or every attempt
        fails, ``chunks`` holds the single placeholder from ``fallback_chunk``.
    """
    page_number = page_idx + 1
    png_filename = png_path.name

    def failed(reason: str) -> dict:
        # main() copies png_path/png_filename from each page result onto its
        # chunks, so placeholder pages must carry those keys as well.
        page = fallback_chunk(page_number, reason)
        page["png_path"] = str(png_path)
        page["png_filename"] = png_filename
        return page

    # An unreadable input should cost one page, not the whole run.
    try:
        ocr_text = load_ocr(png_filename)
        img_b64 = load_image_b64(png_path)
    except (OSError, UnicodeError) as e:
        safe_print(f"  p{page_number} input error: {e}")
        return failed(f"input read: {e}")

    prompt = PAGE_PROMPT.format(
        doc_title=DOC_TITLE, doc_id=DOC_ID,
        page_number=page_number, total_pages=TOTAL_PAGES,
        png_filename=png_filename, ocr_text=ocr_text,
    )

    max_attempts = 3
    for attempt in range(max_attempts):
        last_attempt = attempt == max_attempts - 1
        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img_b64}},
                        {"type": "text", "text": prompt},
                    ],
                }],
            )
            raw = response.content[0].text
            data = extract_json(raw)
            # Trust our own bookkeeping over whatever the model echoed back.
            data["page_number"] = page_number
            data["png_path"] = str(png_path)
            data["png_filename"] = png_filename
            safe_print(f"  p{page_number} ({png_filename}): {len(data.get('chunks',[]))} chunks")
            return data
        except json.JSONDecodeError as e:
            # Malformed model output: retry immediately, no backoff.
            safe_print(f"  p{page_number} JSON error attempt {attempt+1}: {e}")
            if last_attempt:
                return failed(f"JSON parse: {e}")
        except Exception as e:
            # Anything else (API/network errors, empty response, ...): back off
            # 1s, then 2s, before the next attempt.
            safe_print(f"  p{page_number} error attempt {attempt+1}: {e}")
            if last_attempt:
                return failed(str(e))
            time.sleep(2 ** attempt)

    # Not reached with max_attempts >= 1; keeps the return type a dict regardless.
    return failed("no attempts made")


# Prompt for analysing one cropped image. It is sent verbatim (it is never
# passed through str.format), so the JSON braces must be single: doubled
# "{{ }}" escapes would reach the model as-is and show it an invalid template.
IMAGE_ANALYST_PROMPT = """You are an image analyst for a declassified FBI UAP/UFO document archive.

Analyze this cropped image from FBI file 62-HQ-83894 about Flying Saucers/UAP.

Return ONLY valid JSON (no markdown, no fences):

{
  "image_description_en": "detailed English description",
  "image_description_pt_br": "descrição detalhada em português brasileiro",
  "image_type": "photograph|diagram|map|sketch|stamp|chart|handwriting|newspaper_clipping|other",
  "extracted_text": "visible text verbatim or null",
  "ufo_anomaly_detected": false,
  "ufo_anomaly_type": null,
  "ufo_anomaly_rationale": null,
  "cryptid_anomaly_detected": false,
  "cryptid_anomaly_type": null,
  "cryptid_anomaly_rationale": null
}"""


def crop_and_analyze_image(chunk: dict) -> dict:
    """Crop an image chunk out of its page PNG and enrich it with a vision analysis.

    The crop is the chunk's normalized ``bbox`` (plus a small padding) applied to
    the page at ``chunk["png_path"]``; it is saved as
    ``IMAGES_DIR / "IMG-<chunk_id>.png"`` and sent to the model. Analysis fields
    returned by the model are copied onto *chunk*.

    Both steps are best-effort: a crop failure returns the chunk unchanged, and
    an analysis failure leaves the saved crop in place without analysis fields.

    Args:
        chunk: Chunk dict; must contain ``chunk_id`` and ``png_path``.

    Returns:
        The same *chunk* object, mutated in place.
    """
    chunk_id = chunk["chunk_id"]
    png_path = chunk["png_path"]

    # The bbox is model output: tolerate a missing, null or non-dict value and
    # null/non-numeric components by falling back to the full page.
    raw_bbox = chunk.get("bbox")
    bbox = raw_bbox if isinstance(raw_bbox, dict) else {}

    def component(key: str, default: float) -> float:
        try:
            return float(bbox.get(key, default))
        except (TypeError, ValueError):
            return default

    # Crop
    try:
        x, y = component("x", 0.0), component("y", 0.0)
        w, h = component("w", 1.0), component("h", 1.0)
        pad = 0.005
        # The context manager closes the page file; without it every image
        # chunk leaves an open handle behind in the worker threads.
        with PILImage.open(png_path) as im:
            W, H = im.size
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                # Report the real cause instead of an opaque error from Pillow.
                raise ValueError(f"empty crop box {(left, top, right, bottom)} for bbox {bbox}")
            crop = im.crop((left, top, right, bottom))
            out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
            # Save while the source image is still open.
            crop.save(str(out_path))
        img_b64 = load_image_b64(out_path)
    except Exception as e:
        safe_print(f"  Crop error {chunk_id}: {e}")
        return chunk

    # Analyze
    try:
        response = client.messages.create(
            model="claude-haiku-4-5",
            max_tokens=1024,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img_b64}},
                    {"type": "text", "text": IMAGE_ANALYST_PROMPT},
                ],
            }],
        )
        raw = response.content[0].text
        analysis = extract_json(raw)
        # Copy only the known analysis fields; anything else the model adds is ignored.
        for key in ["image_description_en","image_description_pt_br","image_type","extracted_text",
                    "ufo_anomaly_detected","ufo_anomaly_type","ufo_anomaly_rationale",
                    "cryptid_anomaly_detected","cryptid_anomaly_type","cryptid_anomaly_rationale"]:
            if key in analysis:
                chunk[key] = analysis[key]
        safe_print(f"  image analyzed: {chunk_id} ufo={chunk.get('ufo_anomaly_detected',False)}")
    except Exception as e:
        safe_print(f"  Image analysis error {chunk_id}: {e}")

    return chunk


def yaml_val(v):
    """Render *v* as an inline scalar or flow collection for the YAML front matter.

    The rendering is JSON text with non-ASCII characters kept as-is: ``None``
    becomes ``null``, booleans ``true``/``false``, strings are double-quoted,
    and lists are written as ``[a, b]`` with each item JSON-encoded.
    """
    # json.dumps with default separators already yields exactly those forms,
    # including "[]" for an empty list and ", " between list items.
    return json.dumps(v, ensure_ascii=False)


def write_chunk_file(chunk: dict):
    """Write one chunk as ``CHUNKS_DIR/<chunk_id>.md``: YAML front matter plus EN/PT-BR text.

    Args:
        chunk: Chunk dict; must contain ``chunk_id``, ``page`` and
            ``order_global``. All other fields are optional and fall back to the
            defaults visible below.

    Model-supplied fields are rendered defensively: a missing, null or
    non-numeric ``bbox`` component falls back to the full-page default instead
    of failing the ``.2f`` format, and null text is written as an empty string
    rather than the literal ``None``.
    """
    chunk_id = chunk["chunk_id"]
    raw_bbox = chunk.get("bbox")
    bbox = raw_bbox if isinstance(raw_bbox, dict) else {}

    def bbox_num(key: str, default: float) -> float:
        try:
            return float(bbox.get(key, default))
        except (TypeError, ValueError):
            return default

    def text_of(key: str):
        value = chunk.get(key, '')
        return '' if value is None else value

    bx, by = bbox_num('x', 0.0), bbox_num('y', 0.0)
    bw, bh = bbox_num('w', 1.0), bbox_num('h', 1.0)
    chunk_type = chunk.get("type", "body_paragraph")
    # Image chunks point at the crop file name used for IMG-<chunk_id>.png.
    related_image = f"IMG-{chunk_id}.png" if chunk_type == "image" else None
    png_filename = chunk.get("png_filename", "")

    fm = f"""---
chunk_id: {chunk_id}
type: {chunk_type}
page: {chunk['page']}
order_in_page: {chunk.get('order_in_page', 1)}
order_global: {chunk['order_global']}
bbox: {{x: {bx:.2f}, y: {by:.2f}, w: {bw:.2f}, h: {bh:.2f}}}
classification: {yaml_val(chunk.get('classification'))}
formatting: {yaml_val(chunk.get('formatting', []))}
cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}
prev_chunk: {yaml_val(chunk.get('prev_chunk'))}
next_chunk: {yaml_val(chunk.get('next_chunk'))}
related_image: {yaml_val(related_image)}
related_table: {yaml_val(chunk.get('related_table'))}
ocr_confidence: {yaml_val(chunk.get('ocr_confidence', 0.8))}
ocr_source_lines: {yaml_val(chunk.get('ocr_source_lines', []))}
redaction_code: {yaml_val(chunk.get('redaction_code'))}
redaction_inferred_content_type: {yaml_val(chunk.get('redaction_inferred_content_type'))}
image_type: {yaml_val(chunk.get('image_type'))}
ufo_anomaly_detected: {yaml_val(chunk.get('ufo_anomaly_detected', False))}
cryptid_anomaly_detected: {yaml_val(chunk.get('cryptid_anomaly_detected', False))}
ufo_anomaly_type: {yaml_val(chunk.get('ufo_anomaly_type'))}
ufo_anomaly_rationale: {yaml_val(chunk.get('ufo_anomaly_rationale'))}
cryptid_anomaly_type: {yaml_val(chunk.get('cryptid_anomaly_type'))}
cryptid_anomaly_rationale: {yaml_val(chunk.get('cryptid_anomaly_rationale'))}
image_description_en: {yaml_val(chunk.get('image_description_en'))}
image_description_pt_br: {yaml_val(chunk.get('image_description_pt_br'))}
extracted_text: {yaml_val(chunk.get('extracted_text'))}
source_png: ../../processing/png/{DOC_ID}/{png_filename}
---

**EN:** {text_of('content_en')}

**PT-BR:** {text_of('content_pt_br')}
"""
    (CHUNKS_DIR / f"{chunk_id}.md").write_text(fm, encoding="utf-8")


def main():
    """Run the full rebuild and write all outputs under ``OUT_DIR``.

    Phases:
      1. Send every page PNG to ``process_page`` (threaded, in batches).
      2. Number all chunks globally and link each to its neighbours.
      3. Crop and analyse chunks of type ``image``.
      4. Write one Markdown file per chunk.
      5. Write ``_index.json``.
      6. Assemble ``document.md`` and print summary statistics.

    Page and chunk data originate from model output, so this function tolerates
    pages without PNG bookkeeping keys, non-dict chunks, null text, and null or
    non-numeric bbox components rather than aborting after the API work is done.
    """
    start = time.time()
    safe_print(f"=== Rebuild {DOC_ID} ===")
    safe_print(f"Total pages: {TOTAL_PAGES}")

    def num(value, default):
        """Coerce a model-supplied number to float; use *default* when unusable."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    # Phase 1: Process pages in batches
    all_pages = []
    page_items = list(enumerate(png_files))  # (idx, path)

    for batch_start in range(0, TOTAL_PAGES, BATCH_SIZE):
        batch = page_items[batch_start: batch_start + BATCH_SIZE]
        safe_print(f"Batch pages {[b[0]+1 for b in batch]}...")
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
            futs = {ex.submit(process_page, idx, pth): idx for idx, pth in batch}
            for fut in as_completed(futs):
                try:
                    result = fut.result()
                except Exception as e:
                    # One failing worker costs one placeholder page, not the run.
                    failed_page = futs[fut] + 1
                    safe_print(f"  p{failed_page} worker failed: {e}")
                    result = fallback_chunk(failed_page, str(e))
                all_pages.append(result)

    all_pages.sort(key=lambda p: p["page_number"])

    # Phase 2: Global chunk numbering
    global_chunks = []
    chunk_counter = 1
    for page_data in all_pages:
        page_number = page_data["page_number"]
        # Placeholder pages may lack the PNG keys; recover them from the page
        # list (page numbers are 1-based positions in png_files).
        source_png = png_files[page_number - 1]
        png_path = page_data.get("png_path", str(source_png))
        png_filename = page_data.get("png_filename", source_png.name)

        raw_chunks = page_data.get("chunks")
        usable = [c for c in raw_chunks if isinstance(c, dict)] if isinstance(raw_chunks, list) else []
        page_chunks = sorted(usable, key=lambda c: num(c.get("order_in_page", 1), 1))
        for chunk in page_chunks:
            chunk["chunk_id"] = f"c{chunk_counter:04d}"
            chunk["page"] = page_number
            chunk["png_path"] = png_path
            chunk["png_filename"] = png_filename
            chunk["order_global"] = chunk_counter
            global_chunks.append(chunk)
            chunk_counter += 1

    total_chunks = len(global_chunks)
    safe_print(f"Total chunks: {total_chunks}")

    # Set prev/next
    for i, chunk in enumerate(global_chunks):
        chunk["prev_chunk"] = global_chunks[i-1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = global_chunks[i+1]["chunk_id"] if i < total_chunks-1 else None

    # Phase 3: Crop & analyze images
    image_chunks = [c for c in global_chunks if c.get("type") == "image"]
    safe_print(f"Image chunks: {len(image_chunks)}")

    for batch_start in range(0, len(image_chunks), BATCH_SIZE):
        batch = image_chunks[batch_start: batch_start + BATCH_SIZE]
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
            futs = {ex.submit(crop_and_analyze_image, chunk): chunk["chunk_id"] for chunk in batch}
            for fut in as_completed(futs):
                fut.result()  # side-effects already applied

    # Phase 4: Write chunk files
    safe_print("Writing chunk files...")
    for chunk in global_chunks:
        write_chunk_file(chunk)

    # Phase 5: Write _index.json
    safe_print("Writing _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    index_chunks = []
    for chunk in global_chunks:
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "body_paragraph"),
            "page": chunk["page"],
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": chunk.get("bbox", {"x":0,"y":0,"w":1,"h":1}),
            # content_en may be null in model output; slicing None would raise.
            "preview": str(chunk.get("content_en") or "")[:80].replace("\n"," "),
        })
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(json.dumps(index_data, ensure_ascii=False, indent=2), encoding="utf-8")

    # Phase 6: Assemble document.md
    safe_print("Assembling document.md...")
    type_hist = {}
    for chunk in global_chunks:
        t = chunk.get("type","body_paragraph")
        type_hist[t] = type_hist.get(t,0)+1

    ufo_flagged = [c["chunk_id"] for c in global_chunks if c.get("ufo_anomaly_detected")]
    cryptid_flagged = [c["chunk_id"] for c in global_chunks if c.get("cryptid_anomaly_detected")]

    hist_yaml = "\n".join(f"  {k}: {v}" for k,v in sorted(type_hist.items()))

    doc_parts = [f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {TOTAL_PAGES}
total_chunks: {total_chunks}
chunk_types_histogram:
{hist_yaml}
multi_page_tables: []
ufo_anomalies_flagged: {json.dumps(ufo_flagged, ensure_ascii=False)}
cryptid_anomalies_flagged: {json.dumps(cryptid_flagged, ensure_ascii=False)}
build_approach: "subagents"
build_model: "claude-haiku-4-5"
build_at: "{build_at}"
---
"""]

    chunks_by_page = {}
    for chunk in global_chunks:
        p = chunk["page"]
        chunks_by_page.setdefault(p, []).append(chunk)

    for page_num in sorted(chunks_by_page.keys()):
        doc_parts.append(f"\n## Page {page_num}\n\n")
        for chunk in chunks_by_page[page_num]:
            cid = chunk["chunk_id"]
            ctype = chunk.get("type","body_paragraph")
            raw_bbox = chunk.get("bbox")
            bbox = raw_bbox if isinstance(raw_bbox, dict) else {}
            # Same x/y/w/h defaults as before, but null or non-numeric
            # components no longer break the .2f formatting.
            bbox_str = "/".join(
                f"{num(bbox.get(key, default), default):.2f}"
                for key, default in (("x", 0.0), ("y", 0.0), ("w", 1.0), ("h", 1.0))
            )

            doc_parts.append(f"<!-- chunk:{cid} src:./chunks/{cid}.md -->\n")
            doc_parts.append(f'<a id="{cid}"></a>\n')
            doc_parts.append(f"### Chunk {cid} — {ctype} · p{page_num} · bbox: {bbox_str}\n\n")

            if ctype == "image":
                doc_parts.append(f"![chunk image](./images/IMG-{cid}.png)\n\n")
                d_en = chunk.get("image_description_en")
                d_pt = chunk.get("image_description_pt_br")
                if d_en:
                    doc_parts.append(f"**Image Description (EN):** {d_en}\n\n")
                if d_pt:
                    doc_parts.append(f"**Descrição da Imagem (PT-BR):** {d_pt}\n\n")

            content_en = chunk.get('content_en','')
            content_pt = chunk.get('content_pt_br','')
            doc_parts.append(f"**EN:** {'' if content_en is None else content_en}\n\n")
            doc_parts.append(f"**PT-BR:** {'' if content_pt is None else content_pt}\n\n")

            meta = {
                "chunk_id": cid, "type": ctype,
                "page": page_num, "order_in_page": chunk.get("order_in_page",1),
                "order_global": chunk["order_global"],
                "bbox": chunk.get("bbox",{}),
                "classification": chunk.get("classification"),
                "formatting": chunk.get("formatting",[]),
                "cross_page_hint": chunk.get("cross_page_hint","self_contained"),
                "prev_chunk": chunk.get("prev_chunk"),
                "next_chunk": chunk.get("next_chunk"),
                "ocr_confidence": chunk.get("ocr_confidence",0.8),
                "redaction_code": chunk.get("redaction_code"),
                "image_type": chunk.get("image_type"),
                "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected",False),
                "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
                "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected",False),
                "source_png": f"../../processing/png/{DOC_ID}/{chunk.get('png_filename','')}",
            }
            doc_parts.append("<details><summary>metadata</summary>\n\n```json\n")
            doc_parts.append(json.dumps(meta, ensure_ascii=False, indent=2))
            doc_parts.append("\n```\n\n</details>\n\n---\n\n")

    doc_content = "".join(doc_parts)
    (OUT_DIR / "document.md").write_text(doc_content, encoding="utf-8")
    doc_md_bytes = len(doc_content.encode("utf-8"))

    elapsed = int(time.time() - start)
    safe_print(f"\nSTATS pages={TOTAL_PAGES} chunks={total_chunks} images={len(image_chunks)} tables=0 ufo={len(ufo_flagged)} cryptid={len(cryptid_flagged)} doc_md_bytes={doc_md_bytes}")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={len(image_chunks)}, tables_stitched=0, ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, wall_seconds={elapsed}")


if __name__ == "__main__":
    main()