#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild script for doc-65-hs1-834228961-62-hq-83894-section-8
Processes all 218 pages (p-000 to p-217) using Anthropic vision API.
"""

import base64
import json
import os
import re
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path

import anthropic

DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-8"
DOC_TITLE = "FBI Flying Saucers Investigation — 62-HQ-83894 Section 8"
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")
OUT_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

client = anthropic.Anthropic()

# NOTE(review): several placeholder values in the JSON template below are
# blank (e.g. `"page_number": ,` and `"type": ""`). They look like they were
# meant to carry type/enum hints for the model -- confirm the intended values.
PAGE_PROMPT = """You are an expert document archivist analyzing a page from a declassified FBI document about flying saucer investigations (62-HQ-83894 Section 8).

Analyze this page image carefully and return a JSON object with the following structure:

{
  "page_number": ,
  "chunks": [
    {
      "order_in_page": ,
      "type": "",
      "content_en": "",
      "content_pt_br": "",
      "bbox": {"x": <0-1 float>, "y": <0-1 float>, "w": <0-1 float>, "h": <0-1 float>},
      "classification": "",
      "formatting": [""],
      "cross_page_hint": "",
      "ocr_confidence": <0.0-1.0>,
      "ocr_source_lines": [],
      "redaction_code": "",
      "redaction_inferred_content_type": "",
      "image_type": "",
      "ufo_anomaly_detected": ,
      "ufo_anomaly_type": "",
      "ufo_anomaly_rationale": "",
      "cryptid_anomaly_detected": ,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }
  ]
}

Rules:
- Extract ALL text verbatim from the document including stamps, handwriting, headers, footers
- For redacted/blacked out areas, type="redaction" and estimate what was redacted
- For stamps (RECORDED, INDEXED, FOIPA, etc.), type="stamp"
- For handwritten annotations, type="handwriting"
- For the cover page (folder cover), type="cover"
- The bbox coordinates are normalized (0-1) relative to page dimensions: x=left, y=top, w=width, h=height
- If page is blank or nearly blank, one chunk of type="blank"
- Mark ufo_anomaly_detected=true for chunks describing UAP/UFO sightings, objects, or unusual aerial phenomena
- Always include content_pt_br as Brazilian Portuguese translation
- For document headers/letterheads, include all visible text

Return ONLY the JSON object, no other text."""

# `content_en` prefixes that identify a placeholder page written after a
# failure; main() uses them to decide which saved pages to retry on resume.
_FAILURE_PREFIXES = ("(page not found)", "(parse error:", "(api error:")

# Strict page-file stem: "p-" followed only by digits (e.g. "p-017").
_PAGE_STEM_RE = re.compile(r"p-(\d+)")


def load_image_b64(path: Path) -> str:
    """Return the contents of *path* as a base64-encoded ASCII string."""
    return base64.standard_b64encode(path.read_bytes()).decode("utf-8")


def _placeholder_page(page_num: int, content_en: str, content_pt_br: str) -> dict:
    """Build a single-chunk "blank" page record used when a page cannot be analyzed.

    The chunk has the same keys the prompt asks the model to produce, so
    downstream consumers can treat placeholder and real pages uniformly.
    """
    return {
        "page_number": page_num,
        "chunks": [
            {
                "order_in_page": 1,
                "type": "blank",
                "content_en": content_en,
                "content_pt_br": content_pt_br,
                "bbox": {"x": 0, "y": 0, "w": 1, "h": 1},
                "classification": None,
                "formatting": [],
                "cross_page_hint": "self_contained",
                "ocr_confidence": 0.0,
                "ocr_source_lines": [],
                "redaction_code": None,
                "redaction_inferred_content_type": None,
                "image_type": None,
                "ufo_anomaly_detected": False,
                "ufo_anomaly_type": None,
                "ufo_anomaly_rationale": None,
                "cryptid_anomaly_detected": False,
                "cryptid_anomaly_type": None,
                "cryptid_anomaly_rationale": None,
            }
        ],
    }


def _is_failed_page(page: dict) -> bool:
    """Return True if *page* is a failure placeholder rather than a real result."""
    chunks = page.get("chunks")
    if not isinstance(chunks, list) or len(chunks) != 1:
        return False
    chunk = chunks[0]
    if not isinstance(chunk, dict) or chunk.get("type") != "blank":
        return False
    content = chunk.get("content_en")
    return isinstance(content, str) and content.startswith(_FAILURE_PREFIXES)


def _parse_page_json(raw: str, page_num: int) -> dict:
    """Parse the model's reply into a dict.

    Strips a surrounding markdown code fence if present, then tries a direct
    parse; if that fails, falls back to the outermost ``{...}`` span in the
    text. Every parse problem is logged to stderr.

    Raises:
        ValueError: if no JSON object can be recovered
            (``json.JSONDecodeError`` from the first attempt when the text
            is not valid JSON).
    """
    if raw.startswith("```"):
        raw = re.sub(r'^```[a-z]*\n?', '', raw)
        raw = re.sub(r'\n?```$', '', raw)
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as first_error:
        print(f"  JSON parse error on page {page_num}: {first_error}", file=sys.stderr)
        # The model sometimes wraps the object in prose; try to salvage it.
        match = re.search(r'\{.*\}', raw, re.DOTALL)
        if match is None:
            raise
        try:
            data = json.loads(match.group())
        except json.JSONDecodeError:
            # Report the original error, which describes the full reply.
            raise first_error from None
    if not isinstance(data, dict):
        problem = f"expected a JSON object, got {type(data).__name__}"
        print(f"  JSON parse error on page {page_num}: {problem}", file=sys.stderr)
        raise ValueError(problem)
    return data


def analyze_page(page_num: int) -> dict:
    """Analyze a single page via vision API.

    Args:
        page_num: 0-indexed page number; the image is read from
            ``PNG_DIR / "p-NNN.png"``.

    Returns:
        A dict with ``page_number`` (always forced to *page_num*) and the
        model's ``chunks``. If the PNG is missing, the API call fails, or the
        reply cannot be parsed, a one-chunk placeholder page describing the
        failure is returned instead of raising.

    Raises:
        OSError: if the PNG exists but cannot be read (the read happens
            outside the API error boundary).
    """
    # PNG pages are 0-indexed (p-000 through p-217)
    png_path = PNG_DIR / f"p-{page_num:03d}.png"
    if not png_path.exists():
        return _placeholder_page(page_num, "(page not found)", "(página não encontrada)")

    img_b64 = load_image_b64(png_path)

    try:
        response = client.messages.create(
            model="claude-haiku-4-5",
            max_tokens=4000,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64,
                            },
                        },
                        {
                            "type": "text",
                            "text": PAGE_PROMPT + f"\n\nThis is page {page_num} (0-indexed) of the document.",
                        },
                    ],
                }
            ],
        )
        raw = response.content[0].text.strip()
    except Exception as e:
        # Per-page boundary: one failed request must not abort the whole run.
        print(f"  API error on page {page_num}: {e}", file=sys.stderr)
        return _placeholder_page(page_num, f"(api error: {e})", "(erro de API)")

    try:
        data = _parse_page_json(raw, page_num)
    except ValueError as e:
        return _placeholder_page(page_num, f"(parse error: {e})", "(erro de análise)")

    # Trust our own numbering over whatever the model echoed back.
    data["page_number"] = page_num
    return data


def process_pages_batch(page_nums: list, max_workers: int = 4) -> list:
    """Process a batch of pages in parallel.

    Args:
        page_nums: page numbers to analyze.
        max_workers: size of the thread pool.

    Returns:
        Page dicts sorted by page number. A page whose worker raised is
        logged to stderr and omitted from the result.
    """
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {executor.submit(analyze_page, p): p for p in page_nums}
        for future in as_completed(future_to_page):
            page_num = future_to_page[future]
            try:
                result = future.result()
            except Exception as e:
                print(f"  Page {page_num} failed: {e}", file=sys.stderr)
            else:
                results[page_num] = result
                print(f"  Page {page_num} done: {len(result.get('chunks', []))} chunks")
    return [results[p] for p in sorted(results)]


def _discover_page_numbers() -> list:
    """Return the sorted page numbers of all ``p-<digits>.png`` files in PNG_DIR."""
    numbers = set()
    for png in PNG_DIR.glob("p-*.png"):
        match = _PAGE_STEM_RE.fullmatch(png.stem)
        if match is not None:  # ignore stray names such as "p-001-thumb.png"
            numbers.add(int(match.group(1)))
    return sorted(numbers)


def _save_progress(out_json: Path, all_page_data: dict) -> None:
    """Write all pages (sorted by page number) to *out_json* atomically."""
    pages_list = [all_page_data[p] for p in sorted(all_page_data)]
    # Write to a sibling temp file and swap it in, so an interrupted run can
    # never leave a truncated progress file behind.
    tmp_path = out_json.with_name(out_json.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(pages_list, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, out_json)


def main(batch_size: int = 5, max_workers: int = 4) -> dict:
    """Analyze every page PNG, resuming from ``pages_raw.json`` if present.

    Pages already saved with a real result are skipped; pages saved as
    failure placeholders are retried. Progress is saved after every batch.

    Args:
        batch_size: number of pages submitted between progress saves.
        max_workers: thread-pool size used for each batch.

    Returns:
        Mapping of page number to page dict for every page on record
        (empty if no PNG files were found).
    """
    # Determine pages to process
    page_nums = _discover_page_numbers()
    if not page_nums:
        print(f"No PNG pages found in {PNG_DIR}", file=sys.stderr)
        return {}

    total_pages = len(page_nums)
    print(f"Processing {total_pages} pages for {DOC_ID}")
    print(f"Pages: {min(page_nums)} to {max(page_nums)}")

    # Check for already processed pages
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    out_json = OUT_DIR / "pages_raw.json"
    already_done = set()
    all_page_data = {}
    if out_json.exists():
        with open(out_json, encoding="utf-8") as f:
            existing = json.load(f)
        for page in existing:
            number = page["page_number"]
            # Keep the old record either way, so a page still has an entry
            # if its retry fails again; only real results count as done.
            all_page_data[number] = page
            if not _is_failed_page(page):
                already_done.add(number)
        print(f"Already processed: {len(already_done)} pages")
        retry_count = len(all_page_data) - len(already_done)
        if retry_count:
            print(f"Retrying previously failed: {retry_count} pages")

    remaining = [p for p in page_nums if p not in already_done]
    print(f"Remaining: {len(remaining)} pages")

    # Process in batches, saving after each one
    for batch_index, start in enumerate(range(0, len(remaining), batch_size), start=1):
        batch = remaining[start:start + batch_size]
        print(f"\nBatch {batch_index}: pages {batch}")
        for result in process_pages_batch(batch, max_workers=max_workers):
            all_page_data[result["page_number"]] = result

        # Save progress
        _save_progress(out_json, all_page_data)
        print(f"  Saved progress: {len(all_page_data)} pages done")

    print(f"\nAll pages processed. Total: {len(all_page_data)}")
    still_failed = sorted(n for n, page in all_page_data.items() if _is_failed_page(page))
    if still_failed:
        print(f"Pages still failed (re-run to retry): {still_failed}", file=sys.stderr)
    return all_page_data


if __name__ == "__main__":
    main()