disclosure-bureau/scripts/rebuild_doc65_s8.py

227 lines
9.6 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild script for doc-65-hs1-834228961-62-hq-83894-section-8
Processes all 218 pages (p-000 to p-217) using Anthropic vision API.
"""
import anthropic
import base64
import json
import os
import re
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
# Identifier of the document being rebuilt; it is embedded in every path below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-8"
# Human-readable title (not referenced by the code visible in this file).
DOC_TITLE = "FBI Flying Saucers Investigation — 62-HQ-83894 Section 8"
# Directory scanned for page images named p-NNN.png (see analyze_page / main).
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
# OCR directory for the same document (not referenced by the code visible here).
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")
# Output root; main() reads and writes pages_raw.json inside it.
OUT_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
# Sub-directories of OUT_DIR (declared here; not referenced by the code visible here).
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"
# Shared API client used by analyze_page(). It is built at import time with no
# explicit arguments, so credentials come from the SDK defaults
# (presumably the ANTHROPIC_API_KEY environment variable — confirm).
client = anthropic.Anthropic()
# Instruction text sent to the vision model together with each page image.
# analyze_page() appends a sentence naming the 0-indexed page number before
# sending it. The prompt asks for a single JSON object holding a "chunks" list
# and nothing else; analyze_page() still strips markdown code fences from the
# reply in case the model adds them anyway.
PAGE_PROMPT = """You are an expert document archivist analyzing a page from a declassified FBI document about flying saucer investigations (62-HQ-83894 Section 8).
Analyze this page image carefully and return a JSON object with the following structure:
{
"page_number": <int>,
"chunks": [
{
"order_in_page": <int starting at 1>,
"type": "<one of: cover, letterhead, header, paragraph, signature_block, stamp, handwriting, redaction, table_marker, image, blank, footer, marginalia, classification_banner>",
"content_en": "<full text content in English, verbatim from document>",
"content_pt_br": "<Brazilian Portuguese translation of content (keep verbatim quotes in original language)>",
"bbox": {"x": <0-1 float>, "y": <0-1 float>, "w": <0-1 float>, "h": <0-1 float>},
"classification": "<null or classification string if visible>",
"formatting": ["<list of: bold, italic, all_caps, underline, typewritten, handwritten>"],
"cross_page_hint": "<self_contained|continues_to_next|continues_from_prev>",
"ocr_confidence": <0.0-1.0>,
"ocr_source_lines": [<list of line numbers>],
"redaction_code": "<null or redaction code like b1, b6, b7c>",
"redaction_inferred_content_type": "<null or description of what was redacted>",
"image_type": "<null or: photograph, diagram, sketch, map, chart>",
"ufo_anomaly_detected": <true|false>,
"ufo_anomaly_type": "<null or description>",
"ufo_anomaly_rationale": "<null or rationale>",
"cryptid_anomaly_detected": <false>,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null
}
]
}
Rules:
- Extract ALL text verbatim from the document including stamps, handwriting, headers, footers
- For redacted/blacked out areas, type="redaction" and estimate what was redacted
- For stamps (RECORDED, INDEXED, FOIPA, etc.), type="stamp"
- For handwritten annotations, type="handwriting"
- For the cover page (folder cover), type="cover"
- The bbox coordinates are normalized (0-1) relative to page dimensions: x=left, y=top, w=width, h=height
- If page is blank or nearly blank, one chunk of type="blank"
- Mark ufo_anomaly_detected=true for chunks describing UAP/UFO sightings, objects, or unusual aerial phenomena
- Always include content_pt_br as Brazilian Portuguese translation
- For document headers/letterheads, include all visible text
Return ONLY the JSON object, no other text."""
def load_image_b64(path: Path) -> str:
    """Return the contents of the file at *path* as a base64 text string.

    The file is read in binary mode in one go and encoded with the standard
    base64 alphabet; the encoded bytes are decoded to ``str`` so the value can
    be embedded directly in a JSON request body.
    """
    with open(path, "rb") as handle:
        payload = handle.read()
    encoded = base64.b64encode(payload)
    return encoded.decode("utf-8")
def _placeholder_page(page_num: int, content_en: str, content_pt_br: str) -> dict:
    """Build the one-chunk "blank" page record used when a page cannot be analyzed."""
    return {
        "page_number": page_num,
        "chunks": [
            {
                "order_in_page": 1,
                "type": "blank",
                "content_en": content_en,
                "content_pt_br": content_pt_br,
                "bbox": {"x": 0, "y": 0, "w": 1, "h": 1},
                "classification": None,
                "formatting": [],
                "cross_page_hint": "self_contained",
                "ocr_confidence": 0.0,
                "ocr_source_lines": [],
                "redaction_code": None,
                "redaction_inferred_content_type": None,
                "image_type": None,
                "ufo_anomaly_detected": False,
                "ufo_anomaly_type": None,
                "ufo_anomaly_rationale": None,
                "cryptid_anomaly_detected": False,
                "cryptid_anomaly_type": None,
                "cryptid_anomaly_rationale": None,
            }
        ],
    }


def _parse_page_json(raw: str) -> dict:
    """Decode the model reply into a dict, tolerating code fences and stray prose.

    Raises:
        json.JSONDecodeError: if no JSON object can be recovered from *raw*.
    """
    text = raw.strip()
    # Strip markdown code fences if present.
    if text.startswith("```"):
        text = re.sub(r'^```[a-z]*\n?', '', text)
        text = re.sub(r'\n?```$', '', text)
    try:
        data = json.loads(text)
    except json.JSONDecodeError as first_error:
        # Fall back to the outermost {...} span in case the model wrapped the
        # JSON in extra text; report the original error if that fails too.
        match = re.search(r'\{.*\}', text, re.DOTALL)
        if match is None:
            raise
        try:
            data = json.loads(match.group())
        except json.JSONDecodeError:
            raise first_error from None
    if not isinstance(data, dict):
        # A bare list/string is valid JSON but not the page object we need;
        # treat it as a parse failure instead of letting item assignment fail.
        raise json.JSONDecodeError("expected a JSON object", text, 0)
    return data


def analyze_page(page_num: int) -> dict:
    """Analyze a single page via vision API.

    Args:
        page_num: 0-indexed page number; the image is read from
            ``PNG_DIR / "p-NNN.png"``.

    Returns:
        The page dict decoded from the model reply, with ``"page_number"``
        forced to *page_num*. When the image is missing, the API call fails,
        or the reply cannot be parsed, a placeholder page with a single
        ``"blank"`` chunk describing the problem is returned instead; API and
        parse failures are also reported on stderr.
    """
    # PNG pages are 0-indexed (p-000 through p-217)
    png_path = PNG_DIR / f"p-{page_num:03d}.png"
    if not png_path.exists():
        return _placeholder_page(page_num, "(page not found)", "(página não encontrada)")

    img_b64 = load_image_b64(png_path)

    # Step 1: talk to the API. Keeping this separate from parsing guarantees
    # `raw` is bound before any parse-error handling looks at it.
    try:
        response = client.messages.create(
            model="claude-haiku-4-5",
            max_tokens=4000,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64,
                            },
                        },
                        {
                            "type": "text",
                            "text": PAGE_PROMPT + f"\n\nThis is page {page_num} (0-indexed) of the document.",
                        },
                    ],
                }
            ],
        )
        if getattr(response, "stop_reason", None) == "max_tokens":
            # A reply cut off at the token limit is the usual cause of the
            # parse error reported below, so say so up front.
            print(f" Page {page_num}: response hit max_tokens; JSON may be truncated", file=sys.stderr)
        raw = response.content[0].text
    except Exception as e:
        print(f" API error on page {page_num}: {e}", file=sys.stderr)
        return _placeholder_page(page_num, f"(api error: {e})", "(erro de API)")

    # Step 2: decode the reply.
    try:
        data = _parse_page_json(raw)
    except json.JSONDecodeError as e:
        print(f" JSON parse error on page {page_num}: {e}", file=sys.stderr)
        return _placeholder_page(page_num, f"(parse error: {e})", "(erro de análise)")

    # Trust our own page index over whatever number the model reported.
    data["page_number"] = page_num
    return data
def process_pages_batch(page_nums: list, max_workers: int = 4) -> list:
    """Run analyze_page over *page_nums* concurrently on a thread pool.

    Every page is submitted up front and handled as soon as its future
    completes: a success is stored and logged to stdout, a failure is logged
    to stderr and the page is left out of the result.

    Returns:
        The page dicts that succeeded, ordered by ascending page number.
    """
    by_page = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for number in page_nums:
            pending[pool.submit(analyze_page, number)] = number
        for finished in as_completed(pending):
            number = pending[finished]
            try:
                page_data = finished.result()
                by_page[number] = page_data
                chunk_count = len(page_data.get('chunks', []))
                print(f" Page {number} done: {chunk_count} chunks")
            except Exception as e:
                print(f" Page {number} failed: {e}", file=sys.stderr)
    ordered = sorted(by_page.items(), key=lambda item: item[0])
    return [page_data for _, page_data in ordered]
def _is_error_placeholder(page: dict) -> bool:
    """Return True if *page* is the stand-in record written for an API/parse failure."""
    chunks = page.get("chunks") if isinstance(page, dict) else None
    if not isinstance(chunks, list) or len(chunks) != 1 or not isinstance(chunks[0], dict):
        return False
    text = chunks[0].get("content_en")
    return (
        chunks[0].get("type") == "blank"
        and isinstance(text, str)
        and text.startswith(("(api error:", "(parse error:"))
    )


def main():
    """Analyze every page image in PNG_DIR and save the results to pages_raw.json.

    The run is resumable: pages already present in ``OUT_DIR/pages_raw.json``
    are skipped, except records that are API/parse error placeholders, which
    are retried. Progress is written after every batch.

    Returns:
        Dict mapping page number to page dict for all pages known so far
        (empty if PNG_DIR contains no page images).
    """
    # Determine pages to process; ignore files that do not follow p-<digits>.png.
    page_nums = sorted({
        int(m.group(1))
        for m in (re.fullmatch(r"p-(\d+)", f.stem) for f in PNG_DIR.glob("p-*.png"))
        if m
    })
    if not page_nums:
        print(f"No page images found in {PNG_DIR}", file=sys.stderr)
        return {}
    total_pages = len(page_nums)
    print(f"Processing {total_pages} pages for {DOC_ID}")
    print(f"Pages: {min(page_nums)} to {max(page_nums)}")

    # Check for already processed pages
    already_done = set()
    out_json = OUT_DIR / "pages_raw.json"
    all_page_data = {}
    if out_json.exists():
        # The file is written as UTF-8 with ensure_ascii=False, so it must be
        # read back as UTF-8 regardless of the platform default encoding.
        with open(out_json, encoding="utf-8") as f:
            existing = json.load(f)
        for page in existing:
            all_page_data[page["page_number"]] = page
            # Keep failed pages in the output, but do not count them as done
            # so a transient API failure is retried on the next run.
            if not _is_error_placeholder(page):
                already_done.add(page["page_number"])
        print(f"Already processed: {len(already_done)} pages")
    remaining = [p for p in page_nums if p not in already_done]
    print(f"Remaining: {len(remaining)} pages")

    # Make sure the first save cannot fail after API calls have been paid for.
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    tmp_json = out_json.with_name(out_json.name + ".tmp")

    # Process in batches of 5
    # NOTE(review): batch_size (5) exceeds max_workers (4), so one page per
    # batch waits for a free worker — confirm this is intended.
    batch_size = 5
    for i in range(0, len(remaining), batch_size):
        batch = remaining[i:i + batch_size]
        print(f"\nBatch {i//batch_size + 1}: pages {batch}")
        results = process_pages_batch(batch, max_workers=4)
        for r in results:
            all_page_data[r["page_number"]] = r
        # Save progress: write to a temp file and swap it in, so an interrupted
        # run never leaves a half-written pages_raw.json behind.
        pages_list = [all_page_data[p] for p in sorted(all_page_data)]
        with open(tmp_json, "w", encoding="utf-8") as f:
            json.dump(pages_list, f, ensure_ascii=False, indent=2)
        tmp_json.replace(out_json)
        print(f" Saved progress: {len(all_page_data)} pages done")
    print(f"\nAll pages processed. Total: {len(all_page_data)}")
    return all_page_data


if __name__ == "__main__":
    main()