#!/usr/bin/env python3
"""
Page rebuilder for doc-65-hs1-834228961-62-hq-83894-section-1

Processes pages 1-150 using vision (PNGs at p-001.png .. p-150.png).
Each page is sent to the Anthropic Messages API together with an OCR text
hint; the model's JSON reply (a ``chunks`` list per page) is collected and
all pages are written to ``_pages_raw.json`` in OUTPUT_DIR.
"""

import anthropic
import base64
import json
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-1"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 1 (FBI UFO/UAP Investigative File)"
PNG_DIR = Path("/Users/guto/ufo/processing/png/doc-65-hs1-834228961-62-hq-83894-section-1")
OCR_DIR = Path("/Users/guto/ufo/processing/ocr/doc-65-hs1-834228961-62-hq-83894-section-1")
OUTPUT_DIR = Path("/Users/guto/ufo/raw/doc-65-hs1-834228961-62-hq-83894-section-1")
TOTAL_PAGES = 150

# Shared API client; created at import time, used by process_page().
client = anthropic.Anthropic()

# Closed set of chunk types. The same list is spelled out verbatim in the
# prompt text below.
CHUNK_TYPES = [
    "letterhead", "classification_banner", "header", "subheader",
    "paragraph", "list_item", "caption", "footnote", "page_number",
    "signature_block", "stamp", "redaction_block", "image",
    "table_marker", "form_field", "watermark", "separator", "blank"
]

# Prompt template filled in with str.format(); literal JSON braces are
# therefore doubled ({{ }}) and only doc_title, page_number, total_pages and
# ocr_text are substituted.
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder for a declassified UAP/UFO government document.

Document: {doc_title}
Page: {page_number} of {total_pages}

Analyze this page image carefully and extract ALL content as structured chunks.

Return a JSON object with this exact structure:
{{
  "page_number": {page_number},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "",
      "content_en": "",
      "content_pt_br": "",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null,
      "image_description_en": null,
      "image_description_pt_br": null,
      "extracted_text": null
    }}
  ]
}}

CHUNK TYPES (use only these):
letterhead, classification_banner, header, subheader, paragraph, list_item, caption, footnote, page_number, signature_block, stamp, redaction_block, image, table_marker, form_field, watermark, separator, blank

RULES:
1. Extract EVERY element on the page — nothing is skipped
2. bbox: normalized coordinates (x=left, y=top, w=width, h=height) relative to page size (0.0 to 1.0)
3. content_en: verbatim OCR text for text chunks; for images describe what you see
4. content_pt_br: Brazilian Portuguese translation of content_en (NOT European Portuguese)
5. Preserve UTF-8 accents: ç, ã, á, é, í, ó, ú, â, ê, ô, à
6. For redacted sections: type="redaction_block", content_en="[REDACTED]", content_pt_br="[REDACTADO]"
7. For images/photos: type="image", describe the visual content in image_description_en and image_description_pt_br
8. For stamps: type="stamp"
9. classification: extract classification markings if visible (e.g. "SECRET", "CONFIDENTIAL")
10. formatting: array of applicable ["bold", "italic", "underline", "all_caps", "handwritten", "typewritten"]
11. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
12. ufo_anomaly_detected: true if chunk contains UAP/UFO sighting data, coordinates, witness accounts, or anomalous phenomena
13. If page is blank: return one chunk with type="blank"
14. Order chunks top-to-bottom, left-to-right as they appear on the page
15. Return ONLY valid JSON, no markdown code blocks, no extra text

OCR text hint (may be empty or garbled):
{ocr_text}
"""


def load_image_b64(png_path: Path) -> str:
    """Return the contents of *png_path* as a base64-encoded string."""
    with open(png_path, "rb") as f:
        return base64.standard_b64encode(f.read()).decode("utf-8")


def load_ocr(page_num: int) -> str:
    """Return the OCR text for *page_num*, or a short marker string.

    Reads ``p-NNN.txt`` from OCR_DIR. Never raises for a missing or
    unreadable file; instead returns one of the markers ``"(not found)"``,
    ``"(unreadable)"`` or ``"(empty)"`` so the prompt always has a value.
    """
    txt_path = OCR_DIR / f"p-{page_num:03d}.txt"
    if not txt_path.exists():
        return "(not found)"
    try:
        content = txt_path.read_text(encoding="utf-8").strip()
    except (OSError, UnicodeError):
        # Covers I/O failures and files that are not valid UTF-8.
        return "(unreadable)"
    return content if content else "(empty)"


def _placeholder_page(page_num: int, content_en: str, content_pt_br: str) -> dict:
    """Build a one-chunk ``blank`` page used when a page cannot be processed.

    The chunk carries the same keys as the schema requested in the prompt,
    with the given English / Brazilian Portuguese marker text and a full-page
    bounding box.
    """
    return {
        "page_number": page_num,
        "chunks": [{
            "order_in_page": 1,
            "type": "blank",
            "content_en": content_en,
            "content_pt_br": content_pt_br,
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None
        }]
    }


def _parse_page_json(raw: str) -> dict:
    """Parse the model's reply text into a page dict.

    A surrounding markdown code fence is removed first, since the reply may
    be fenced despite the prompt asking for bare JSON.

    Raises:
        json.JSONDecodeError: if the text is not valid JSON, or if the
            top-level JSON value is not an object.
    """
    text = raw.strip()
    if text.startswith("```"):
        lines = text.split("\n")
        # Drop the opening fence line, and the closing one when present.
        text = "\n".join(lines[1:-1] if lines[-1].strip() == "```" else lines[1:])
    parsed = json.loads(text)
    if not isinstance(parsed, dict):
        # Valid JSON of the wrong shape is reported as a parse failure
        # rather than surfacing later as a TypeError.
        raise json.JSONDecodeError("top-level JSON value is not an object", text, 0)
    return parsed


def process_page(page_num: int, retries: int = 3) -> dict:
    """Extract the structured chunks of one page via the vision model.

    Args:
        page_num: 1-based page number; selects ``p-NNN.png`` in PNG_DIR and
            the matching OCR text file.
        retries: maximum number of API attempts. Between failed attempts the
            function sleeps ``2 ** attempt`` seconds.

    Returns:
        A dict with ``page_number`` and ``chunks``. If the PNG is missing, or
        every attempt fails, a single-chunk placeholder page describing the
        problem is returned instead of raising.

    Raises:
        OSError: if the PNG exists but cannot be read (the image is loaded
            before the retry loop).
    """
    png_path = PNG_DIR / f"p-{page_num:03d}.png"
    if not png_path.exists():
        print(f"  WARNING: PNG not found for page {page_num}: {png_path}", file=sys.stderr)
        return _placeholder_page(page_num, "[PAGE NOT FOUND]", "[PÁGINA NÃO ENCONTRADA]")

    ocr_text = load_ocr(page_num)
    img_b64 = load_image_b64(png_path)

    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_num,
        total_pages=TOTAL_PAGES,
        ocr_text=ocr_text[:2000]  # cap at 2000 chars
    )

    # Marker text for the placeholder returned if no attempt succeeds;
    # overwritten with the most recent failure inside the loop. The initial
    # value is only used when retries <= 0 and the loop body never runs.
    failure_en = "[API ERROR: no attempts made]"
    failure_pt = "[ERRO DE API: nenhuma tentativa realizada]"

    for attempt in range(retries):
        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64
                            }
                        },
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }]
            )
            result = _parse_page_json(response.content[0].text)
            result["page_number"] = page_num  # ensure correct
            print(f"  Page {page_num:3d} done — {len(result.get('chunks', []))} chunks", flush=True)
            return result
        except json.JSONDecodeError as e:
            print(f"  Page {page_num} JSON error (attempt {attempt+1}): {e}", file=sys.stderr)
            failure_en = f"[PARSE ERROR: {str(e)[:100]}]"
            failure_pt = f"[ERRO DE ANÁLISE: {str(e)[:100]}]"
        except Exception as e:
            # Boundary handler: any client/network failure is retried and,
            # on exhaustion, recorded in the placeholder rather than raised.
            print(f"  Page {page_num} API error (attempt {attempt+1}): {e}", file=sys.stderr)
            failure_en = f"[API ERROR: {str(e)[:100]}]"
            failure_pt = f"[ERRO DE API: {str(e)[:100]}]"

        # Exponential backoff, skipped after the final attempt.
        if attempt < retries - 1:
            time.sleep(2 ** attempt)

    return _placeholder_page(page_num, failure_en, failure_pt)


def _write_results(results: dict) -> list:
    """Write *results* (page number -> page dict) to ``_pages_raw.json``.

    Pages are written as a list sorted by page number. The file is written
    to a temporary sibling and then moved into place with os.replace, so an
    interrupted run does not leave a truncated JSON file behind.

    Returns:
        The sorted list of page dicts that was written.
    """
    ordered = [results[p] for p in sorted(results)]
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    out_path = OUTPUT_DIR / "_pages_raw.json"
    tmp_path = OUTPUT_DIR / "_pages_raw.json.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(ordered, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, out_path)
    return ordered


def main():
    """Process every page in batches and save the combined result.

    Pages 1..TOTAL_PAGES are processed five at a time on a thread pool, with
    a one-second pause between batches. Results are checkpointed to disk
    after each batch and a summary is printed at the end.
    """
    pages = list(range(1, TOTAL_PAGES + 1))
    results = {}
    batch_size = 5

    print(f"Processing {len(pages)} pages in parallel batches of {batch_size}...")

    for batch_start in range(0, len(pages), batch_size):
        batch = pages[batch_start:batch_start + batch_size]
        print(f"Batch {batch_start//batch_size + 1}: pages {batch[0]}-{batch[-1]}")

        with ThreadPoolExecutor(max_workers=batch_size) as executor:
            future_to_page = {executor.submit(process_page, p): p for p in batch}
            for future in as_completed(future_to_page):
                page_num = future_to_page[future]
                try:
                    results[page_num] = future.result()
                except Exception as e:
                    # process_page handles API/parse failures itself; this
                    # catches anything else (e.g. an unreadable PNG).
                    print(f"  Page {page_num} FATAL: {e}", file=sys.stderr)

        # Checkpoint so completed pages survive a later crash.
        _write_results(results)

        # Small pause between batches to avoid rate limits
        if batch_start + batch_size < len(pages):
            time.sleep(1)

    # Final save and summary
    sorted_results = _write_results(results)
    out_path = OUTPUT_DIR / "_pages_raw.json"

    print(f"\nSaved {len(sorted_results)} pages to {out_path}")
    total_chunks = sum(len(r.get("chunks", [])) for r in sorted_results)
    print(f"Total chunks: {total_chunks}")

    missing = [p for p in pages if p not in results]
    if missing:
        print(f"WARNING: {len(missing)} page(s) missing from output: {missing}", file=sys.stderr)


if __name__ == "__main__":
    main()