#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Rebuild script for doc-65-hs1-834228961-62-hq-83894-section-8.

Processes all 218 pages (p-000 to p-217) using the Anthropic vision API.
"""
|
|
|
|
import anthropic
|
|
import base64
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Identity of the document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-8"
DOC_TITLE = "FBI Flying Saucers Investigation — 62-HQ-83894 Section 8"

# Per-document input directories. PNG_DIR is where analyze_page() looks for
# the p-NNN.png page images; OCR_DIR is not referenced in the visible part
# of this script.
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID

# Per-document output directory (main() writes pages_raw.json here) and its
# sub-directories, which are not referenced in the visible part of this script.
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR.joinpath("chunks")
IMAGES_DIR = OUT_DIR.joinpath("images")
TABLES_DIR = OUT_DIR.joinpath("tables")
|
|
|
|
# Module-level Anthropic API client, used by analyze_page() for every request.
# It is constructed with no arguments — presumably the SDK picks the API key
# up from the environment (ANTHROPIC_API_KEY); confirm against the SDK docs.
client = anthropic.Anthropic()
|
|
|
|
# Instruction text sent alongside every page image. analyze_page() appends a
# final "This is page N (0-indexed)" sentence to it before calling the API.
# The JSON skeleton below is illustrative (placeholders in <...>), not valid
# JSON; it tells the model which keys every chunk must carry.
PAGE_PROMPT = """You are an expert document archivist analyzing a page from a declassified FBI document about flying saucer investigations (62-HQ-83894 Section 8).

Analyze this page image carefully and return a JSON object with the following structure:

{
  "page_number": <int>,
  "chunks": [
    {
      "order_in_page": <int starting at 1>,
      "type": "<one of: cover, letterhead, header, paragraph, signature_block, stamp, handwriting, redaction, table_marker, image, blank, footer, marginalia, classification_banner>",
      "content_en": "<full text content in English, verbatim from document>",
      "content_pt_br": "<Brazilian Portuguese translation of content (keep verbatim quotes in original language)>",
      "bbox": {"x": <0-1 float>, "y": <0-1 float>, "w": <0-1 float>, "h": <0-1 float>},
      "classification": "<null or classification string if visible>",
      "formatting": ["<list of: bold, italic, all_caps, underline, typewritten, handwritten>"],
      "cross_page_hint": "<self_contained|continues_to_next|continues_from_prev>",
      "ocr_confidence": <0.0-1.0>,
      "ocr_source_lines": [<list of line numbers>],
      "redaction_code": "<null or redaction code like b1, b6, b7c>",
      "redaction_inferred_content_type": "<null or description of what was redacted>",
      "image_type": "<null or: photograph, diagram, sketch, map, chart>",
      "ufo_anomaly_detected": <true|false>,
      "ufo_anomaly_type": "<null or description>",
      "ufo_anomaly_rationale": "<null or rationale>",
      "cryptid_anomaly_detected": <false>,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }
  ]
}

Rules:
- Extract ALL text verbatim from the document including stamps, handwriting, headers, footers
- For redacted/blacked out areas, type="redaction" and estimate what was redacted
- For stamps (RECORDED, INDEXED, FOIPA, etc.), type="stamp"
- For handwritten annotations, type="handwriting"
- For the cover page (folder cover), type="cover"
- The bbox coordinates are normalized (0-1) relative to page dimensions: x=left, y=top, w=width, h=height
- If page is blank or nearly blank, one chunk of type="blank"
- Mark ufo_anomaly_detected=true for chunks describing UAP/UFO sightings, objects, or unusual aerial phenomena
- Always include content_pt_br as Brazilian Portuguese translation
- For document headers/letterheads, include all visible text

Return ONLY the JSON object, no other text."""
|
|
|
|
|
|
def load_image_b64(path: Path) -> str:
    """Read the file at *path* and return its bytes as a base64 text string."""
    with open(path, "rb") as handle:
        payload = handle.read()
    # b64encode with no altchars is the standard alphabet, same as
    # standard_b64encode; the result is pure ASCII.
    encoded = base64.b64encode(payload)
    return encoded.decode("utf-8")
|
|
|
|
|
|
def _placeholder_page(page_num: int, content_en: str, content_pt_br: str) -> dict:
    """Build the one-chunk "blank" page record used when a page cannot be analyzed."""
    return {
        "page_number": page_num,
        "chunks": [
            {
                "order_in_page": 1,
                "type": "blank",
                "content_en": content_en,
                "content_pt_br": content_pt_br,
                "bbox": {"x": 0, "y": 0, "w": 1, "h": 1},
                "classification": None,
                "formatting": [],
                "cross_page_hint": "self_contained",
                "ocr_confidence": 0.0,
                "ocr_source_lines": [],
                "redaction_code": None,
                "redaction_inferred_content_type": None,
                "image_type": None,
                "ufo_anomaly_detected": False,
                "ufo_anomaly_type": None,
                "ufo_anomaly_rationale": None,
                "cryptid_anomaly_detected": False,
                "cryptid_anomaly_type": None,
                "cryptid_anomaly_rationale": None,
            }
        ],
    }


def _strip_code_fences(text: str) -> str:
    """Remove a leading ```lang fence and a trailing ``` fence, if the text starts with one."""
    if text.startswith("```"):
        text = re.sub(r'^```[a-z]*\n?', '', text)
        text = re.sub(r'\n?```$', '', text)
    return text


def _extract_json_object(text: str):
    """Return the outermost {...} span of *text* parsed as JSON, or None if that fails."""
    match = re.search(r'\{.*\}', text, re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group())
    except (ValueError, RecursionError):
        return None


def analyze_page(page_num: int) -> dict:
    """Analyze a single page via the vision API.

    Args:
        page_num: 0-based page index; the image is read from
            ``PNG_DIR / "p-NNN.png"`` (three-digit, zero-padded).

    Returns:
        A dict with ``"page_number"`` (always overwritten with *page_num*)
        and whatever else the model returned (the prompt asks for a
        ``"chunks"`` list). When the image is missing, the API call fails,
        or the reply cannot be parsed into a JSON object, a placeholder
        record with a single ``"blank"`` chunk is returned instead; its
        ``content_en`` is ``"(page not found)"``, ``"(api error: ...)"`` or
        ``"(parse error: ...)"`` respectively.

    Raises:
        OSError: if the PNG exists but cannot be read (the read is not
            wrapped in the error handling below).
    """
    # PNG pages are 0-indexed (p-000 through p-217)
    png_path = PNG_DIR / f"p-{page_num:03d}.png"

    if not png_path.exists():
        return _placeholder_page(page_num, "(page not found)", "(página não encontrada)")

    img_b64 = load_image_b64(png_path)

    # Only the remote call and the reply access live in this try, so parsing
    # problems are never mislabelled as API errors.
    try:
        response = client.messages.create(
            model="claude-haiku-4-5",
            # NOTE(review): a reply cut off at this limit would presumably
            # fail JSON parsing and surface as a parse-error placeholder.
            max_tokens=4000,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64,
                            },
                        },
                        {
                            "type": "text",
                            "text": PAGE_PROMPT + f"\n\nThis is page {page_num} (0-indexed) of the document."
                        }
                    ],
                }
            ],
        )
        raw = response.content[0].text.strip()
    except Exception as e:  # boundary: one bad page must not abort the batch
        print(f" API error on page {page_num}: {e}", file=sys.stderr)
        return _placeholder_page(page_num, f"(api error: {e})", "(erro de API)")

    # Strip markdown code fences if present
    raw = _strip_code_fences(raw)

    parse_error = None
    try:
        data = json.loads(raw)
    except (ValueError, RecursionError) as e:
        print(f" JSON parse error on page {page_num}: {e}", file=sys.stderr)
        parse_error = e
        # The model sometimes wraps the object in prose; try the {...} span.
        data = _extract_json_object(raw)

    if isinstance(data, dict):
        data["page_number"] = page_num
        return data

    if parse_error is None:
        # Valid JSON, but not an object (e.g. a bare list): there is no place
        # to store page_number, so treat it as a parse failure.
        parse_error = f"expected a JSON object, got {type(data).__name__}"
        print(f" JSON parse error on page {page_num}: {parse_error}", file=sys.stderr)

    return _placeholder_page(page_num, f"(parse error: {parse_error})", "(erro de análise)")
|
|
|
|
|
|
def process_pages_batch(page_nums: list, max_workers: int = 4) -> list:
    """Run analyze_page() over *page_nums* on a thread pool.

    Each page is reported on stdout as it finishes. A page whose future
    raises is reported on stderr and left out of the result.

    Returns:
        The collected page records, ordered by ascending page number.
    """
    completed = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map every submitted future back to the page it is working on.
        pending = {}
        for number in page_nums:
            pending[pool.submit(analyze_page, number)] = number

        for finished in as_completed(pending):
            number = pending[finished]
            try:
                outcome = finished.result()
                completed[number] = outcome
                chunk_count = len(outcome.get('chunks', []))
                print(f" Page {number} done: {chunk_count} chunks")
            except Exception as exc:
                print(f" Page {number} failed: {exc}", file=sys.stderr)

    ordered = []
    for number in sorted(completed):
        ordered.append(completed[number])
    return ordered
|
|
|
|
|
|
def _is_error_placeholder(page: dict) -> bool:
    """Return True if *page* is the stand-in record saved for a failed API call or parse."""
    chunks = page.get("chunks") or []
    if len(chunks) != 1 or not isinstance(chunks[0], dict):
        return False
    text = chunks[0].get("content_en")
    return (
        chunks[0].get("type") == "blank"
        and isinstance(text, str)
        and text.startswith(("(api error:", "(parse error:"))
    )


def main(batch_size: int = 5, max_workers: int = 4) -> dict:
    """Analyze every page image in PNG_DIR and save the results, resumably.

    Page numbers are taken from the ``p-NNN.png`` file names. Results are
    kept in ``OUT_DIR / "pages_raw.json"`` (a list of page records sorted by
    page number), which is rewritten after every batch so an interrupted run
    can pick up where it stopped. Pages already present in that file are
    skipped, except pages that were saved as "(api error: ...)" or
    "(parse error: ...)" placeholders, which are tried again.

    Args:
        batch_size: number of pages handed to process_pages_batch() at once.
        max_workers: thread count passed through to process_pages_batch().

    Returns:
        Dict mapping page number to page record for all known pages; empty
        when PNG_DIR contains no page images.

    Raises:
        ValueError: if *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Determine pages to process
    png_files = sorted(PNG_DIR.glob("p-*.png"))
    page_nums = sorted(int(f.stem.split("-")[1]) for f in png_files)
    total_pages = len(page_nums)

    print(f"Processing {total_pages} pages for {DOC_ID}")
    if not page_nums:
        # min()/max() on an empty list would raise; there is nothing to do.
        print(f"No page images found in {PNG_DIR}", file=sys.stderr)
        return {}
    print(f"Pages: {page_nums[0]} to {page_nums[-1]}")

    # Check for already processed pages
    already_done = set()
    out_json = OUT_DIR / "pages_raw.json"
    all_page_data = {}

    if out_json.exists():
        # Read with the same encoding the file is written with below.
        with open(out_json, encoding="utf-8") as f:
            existing = json.load(f)
        for page in existing:
            number = page["page_number"]
            all_page_data[number] = page
            # Keep failed pages in the data (so they are still saved if the
            # retry is lost) but do not count them as done.
            if not _is_error_placeholder(page):
                already_done.add(number)
        print(f"Already processed: {len(already_done)} pages")

    remaining = [p for p in page_nums if p not in already_done]
    print(f"Remaining: {len(remaining)} pages")

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    tmp_json = out_json.with_name(out_json.name + ".tmp")

    # Process in batches
    for i in range(0, len(remaining), batch_size):
        batch = remaining[i:i + batch_size]
        print(f"\nBatch {i//batch_size + 1}: pages {batch}")
        results = process_pages_batch(batch, max_workers=max_workers)
        for r in results:
            all_page_data[r["page_number"]] = r

        # Save progress: write to a sibling temp file, then swap it in, so an
        # interruption mid-write cannot leave a truncated pages_raw.json.
        pages_list = [all_page_data[p] for p in sorted(all_page_data)]
        with open(tmp_json, "w", encoding="utf-8") as f:
            json.dump(pages_list, f, ensure_ascii=False, indent=2)
        tmp_json.replace(out_json)
        print(f" Saved progress: {len(all_page_data)} pages done")

    print(f"\nAll pages processed. Total: {len(all_page_data)}")
    return all_page_data
|
|
|
|
|
|
# Run the rebuild only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|