# disclosure-bureau/scripts/rebuild_doc65_s8.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild script for doc-65-hs1-834228961-62-hq-83894-section-8
Processes all 218 pages (p-000 to p-217) using Anthropic vision API.
"""

import anthropic
import base64
import json
import os
import re
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path

# Identifier and display title of the document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-8"
DOC_TITLE = "FBI Flying Saucers Investigation — 62-HQ-83894 Section 8"

# Per-document input directories (page PNGs and OCR output).
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")

# Per-document output directory and its sub-directories.
OUT_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
CHUNKS_DIR = OUT_DIR.joinpath("chunks")
IMAGES_DIR = OUT_DIR.joinpath("images")
TABLES_DIR = OUT_DIR.joinpath("tables")

# Module-level API client used by analyze_page for every request (including
# the concurrent calls made from process_pages_batch's worker threads).
# NOTE(review): no arguments are passed, so credentials presumably come from
# the SDK's default configuration/environment — confirm before running.
client = anthropic.Anthropic()

# Instruction text sent alongside each page image. analyze_page appends a
# final sentence stating the (0-indexed) page number. The prompt asks for a
# single JSON object with a "chunks" list and nothing else.
PAGE_PROMPT = """You are an expert document archivist analyzing a page from a declassified FBI document about flying saucer investigations (62-HQ-83894 Section 8).

Analyze this page image carefully and return a JSON object with the following structure:

{
  "page_number": <int>,
  "chunks": [
    {
      "order_in_page": <int starting at 1>,
      "type": "<one of: cover, letterhead, header, paragraph, signature_block, stamp, handwriting, redaction, table_marker, image, blank, footer, marginalia, classification_banner>",
      "content_en": "<full text content in English, verbatim from document>",
      "content_pt_br": "<Brazilian Portuguese translation of content (keep verbatim quotes in original language)>",
      "bbox": {"x": <0-1 float>, "y": <0-1 float>, "w": <0-1 float>, "h": <0-1 float>},
      "classification": "<null or classification string if visible>",
      "formatting": ["<list of: bold, italic, all_caps, underline, typewritten, handwritten>"],
      "cross_page_hint": "<self_contained|continues_to_next|continues_from_prev>",
      "ocr_confidence": <0.0-1.0>,
      "ocr_source_lines": [<list of line numbers>],
      "redaction_code": "<null or redaction code like b1, b6, b7c>",
      "redaction_inferred_content_type": "<null or description of what was redacted>",
      "image_type": "<null or: photograph, diagram, sketch, map, chart>",
      "ufo_anomaly_detected": <true|false>,
      "ufo_anomaly_type": "<null or description>",
      "ufo_anomaly_rationale": "<null or rationale>",
      "cryptid_anomaly_detected": <false>,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }
  ]
}

Rules:
- Extract ALL text verbatim from the document including stamps, handwriting, headers, footers
- For redacted/blacked out areas, type="redaction" and estimate what was redacted
- For stamps (RECORDED, INDEXED, FOIPA, etc.), type="stamp"
- For handwritten annotations, type="handwriting"
- For the cover page (folder cover), type="cover"
- The bbox coordinates are normalized (0-1) relative to page dimensions: x=left, y=top, w=width, h=height
- If page is blank or nearly blank, one chunk of type="blank"
- Mark ufo_anomaly_detected=true for chunks describing UAP/UFO sightings, objects, or unusual aerial phenomena
- Always include content_pt_br as Brazilian Portuguese translation
- For document headers/letterheads, include all visible text

Return ONLY the JSON object, no other text."""


def load_image_b64(path: Path) -> str:
    """Read the file at *path* and return its bytes as a base64 text string."""
    with open(path, "rb") as image_file:
        payload = image_file.read()
    encoded = base64.b64encode(payload)
    return encoded.decode("utf-8")


def _placeholder_page(page_num: int, content_en: str, content_pt_br: str) -> dict:
    """Build a one-chunk "blank" page record standing in for a page that could not be analyzed."""
    return {"page_number": page_num, "chunks": [
        {"order_in_page": 1, "type": "blank", "content_en": content_en,
         "content_pt_br": content_pt_br, "bbox": {"x": 0, "y": 0, "w": 1, "h": 1},
         "classification": None, "formatting": [], "cross_page_hint": "self_contained",
         "ocr_confidence": 0.0, "ocr_source_lines": [], "redaction_code": None,
         "redaction_inferred_content_type": None, "image_type": None,
         "ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
         "cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None}
    ]}


def analyze_page(page_num: int) -> dict:
    """Analyze a single page via the vision API.

    Args:
        page_num: 0-indexed page number; the image is read from
            ``PNG_DIR / "p-NNN.png"``.

    Returns:
        The JSON object returned by the model, with ``"page_number"``
        overwritten by *page_num*. If the PNG is missing, the API call fails,
        or the reply cannot be parsed into a JSON object, a placeholder page
        with a single ``"blank"`` chunk describing the problem is returned
        instead (failures are also reported on stderr).
    """
    # PNG pages are 0-indexed (p-000 through p-217)
    png_path = PNG_DIR / f"p-{page_num:03d}.png"

    if not png_path.exists():
        return _placeholder_page(page_num, "(page not found)", "(página não encontrada)")

    img_b64 = load_image_b64(png_path)

    # Step 1: the API call. Kept separate from parsing so that a malformed
    # reply is never misreported as an API failure.
    try:
        response = client.messages.create(
            model="claude-haiku-4-5",
            max_tokens=4000,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64,
                            },
                        },
                        {
                            "type": "text",
                            "text": PAGE_PROMPT + f"\n\nThis is page {page_num} (0-indexed) of the document."
                        }
                    ],
                }
            ],
        )
        raw = response.content[0].text.strip()
    except Exception as e:
        print(f"  API error on page {page_num}: {e}", file=sys.stderr)
        return _placeholder_page(page_num, f"(api error: {e})", "(erro de API)")

    # A reply cut off at the token limit is the likeliest cause of a parse
    # error below, so flag it explicitly.
    if getattr(response, "stop_reason", None) == "max_tokens":
        print(f"  Warning: page {page_num} reply hit max_tokens; JSON may be truncated",
              file=sys.stderr)

    # Strip markdown code fences if present
    if raw.startswith("```"):
        raw = re.sub(r'^```[a-z]*\n?', '', raw)
        raw = re.sub(r'\n?```$', '', raw)

    # Step 2: parse the reply, falling back to the outermost {...} span when
    # the model wrapped the JSON in extra text.
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        print(f"  JSON parse error on page {page_num}: {e}", file=sys.stderr)
        data = None
        match = re.search(r'\{.*\}', raw, re.DOTALL)
        if match:
            try:
                data = json.loads(match.group())
            except ValueError:
                data = None
        if data is None:
            return _placeholder_page(page_num, f"(parse error: {e})", "(erro de análise)")

    # Valid JSON that is not an object (e.g. a bare list) cannot carry the
    # page fields, so treat it as a parse failure.
    if not isinstance(data, dict):
        problem = f"expected a JSON object, got {type(data).__name__}"
        print(f"  JSON parse error on page {page_num}: {problem}", file=sys.stderr)
        return _placeholder_page(page_num, f"(parse error: {problem})", "(erro de análise)")

    # Trust our own page number over whatever the model reported.
    data["page_number"] = page_num
    return data


def process_pages_batch(page_nums: list, max_workers: int = 4) -> list:
    """Analyze the given pages concurrently and return the results in page order.

    Each page is handed to ``analyze_page`` on a thread pool of *max_workers*
    threads. A page whose analysis raises is reported on stderr and omitted
    from the returned list.
    """
    completed = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for num in page_nums:
            pending[pool.submit(analyze_page, num)] = num

        # Report pages as they finish, regardless of submission order.
        for finished in as_completed(pending):
            page_num = pending[finished]
            try:
                page_data = finished.result()
                completed[page_num] = page_data
                chunk_count = len(page_data.get('chunks', []))
                print(f"  Page {page_num} done: {chunk_count} chunks")
            except Exception as e:
                print(f"  Page {page_num} failed: {e}", file=sys.stderr)

    ordered = []
    for key in sorted(completed):
        ordered.append(completed[key])
    return ordered


def _is_error_placeholder(page: dict) -> bool:
    """Return True if *page* is a stand-in record written after an API or parse failure."""
    chunks = page.get("chunks") or []
    if len(chunks) != 1 or not isinstance(chunks[0], dict):
        return False
    content = chunks[0].get("content_en")
    return (
        chunks[0].get("type") == "blank"
        and isinstance(content, str)
        and content.startswith(("(api error:", "(parse error:"))
    )


def main():
    """Analyze every page PNG in PNG_DIR, checkpointing to ``pages_raw.json``.

    Pages already present in ``OUT_DIR / "pages_raw.json"`` are skipped,
    except for records that are API/parse-error placeholders, which are
    retried. Progress is saved after every batch.

    Returns:
        Dict mapping page number to that page's result dict.
    """
    # Determine pages to process
    png_files = sorted(PNG_DIR.glob("p-*.png"))
    page_nums = [int(f.stem.split("-")[1]) for f in png_files]
    total_pages = len(page_nums)

    print(f"Processing {total_pages} pages for {DOC_ID}")
    if page_nums:
        print(f"Pages: {min(page_nums)} to {max(page_nums)}")
    else:
        # min()/max() would raise on an empty list; report instead.
        print(f"No page PNGs found in {PNG_DIR}", file=sys.stderr)

    # Check for already processed pages
    already_done = set()
    out_json = OUT_DIR / "pages_raw.json"
    all_page_data = {}

    if out_json.exists():
        # The file is written as UTF-8 with ensure_ascii=False, so it must be
        # read back as UTF-8 regardless of the platform default encoding.
        with open(out_json, encoding="utf-8") as f:
            existing = json.load(f)
        for page in existing:
            all_page_data[page["page_number"]] = page
            # Keep failed pages in the output for completeness, but do not
            # count them as done so that a re-run retries them.
            if not _is_error_placeholder(page):
                already_done.add(page["page_number"])
        print(f"Already processed: {len(already_done)} pages")
        to_retry = len(all_page_data) - len(already_done)
        if to_retry:
            print(f"Previously failed (will retry): {to_retry} pages")

    remaining = [p for p in page_nums if p not in already_done]
    print(f"Remaining: {len(remaining)} pages")

    if remaining:
        # Make sure the checkpoint can be written before spending API calls.
        OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Process in batches of 5
    batch_size = 5
    tmp_json = out_json.with_name(out_json.name + ".tmp")
    for i in range(0, len(remaining), batch_size):
        batch = remaining[i:i + batch_size]
        print(f"\nBatch {i//batch_size + 1}: pages {batch}")
        results = process_pages_batch(batch, max_workers=4)
        for r in results:
            all_page_data[r["page_number"]] = r

        # Save progress: write to a temp file and rename it over the real one
        # so an interrupted write cannot corrupt the resume file.
        pages_list = [all_page_data[p] for p in sorted(all_page_data)]
        with open(tmp_json, "w", encoding="utf-8") as f:
            json.dump(pages_list, f, ensure_ascii=False, indent=2)
        tmp_json.replace(out_json)
        print(f"  Saved progress: {len(all_page_data)} pages done")

    print(f"\nAll pages processed. Total: {len(all_page_data)}")
    return all_page_data


# Run the rebuild only when executed as a script, not when imported.
if __name__ == "__main__":
    main()