#!/usr/bin/env python3 """ Rebuild doc-65-hs1-834228961-62-hq-83894-section-1 Uses claude CLI (OAuth, Max plan) via subprocess — no direct API key needed. Processes pages 1-150 in parallel batches of 5. """ from __future__ import annotations import base64 import json import os import re import subprocess import sys import threading import time from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime, timezone from pathlib import Path DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-1" DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 1 (FBI UAP/UFO Investigative File)" PNG_DIR = Path("/Users/guto/ufo/processing/png/doc-65-hs1-834228961-62-hq-83894-section-1") OCR_DIR = Path("/Users/guto/ufo/processing/ocr/doc-65-hs1-834228961-62-hq-83894-section-1") OUTPUT_DIR = Path("/Users/guto/ufo/raw/doc-65-hs1-834228961-62-hq-83894-section-1") TOTAL_PAGES = 150 MAX_WORKERS = 4 TIMEOUT = 180 RETRIES = 3 _lock = threading.Lock() def safe_print(*args, **kwargs): with _lock: print(*args, **kwargs, flush=True) def load_ocr(page_num: int) -> str: txt_path = OCR_DIR / f"p-{page_num:03d}.txt" if txt_path.exists(): try: content = txt_path.read_text(encoding="utf-8").strip() return content[:3000] if content else "(empty)" except Exception: return "(unreadable)" return "(not found)" def extract_json(text: str) -> dict: """Extract JSON object from text, stripping markdown fences.""" text = text.strip() if text.startswith("```"): text = re.sub(r"^```(?:json)?\s*", "", text) text = re.sub(r"\s*```\s*$", "", text) start = text.find("{") if start == -1: raise ValueError("No JSON object found") depth = 0 for i, c in enumerate(text[start:], start): if c == "{": depth += 1 elif c == "}": depth -= 1 if depth == 0: return json.loads(text[start:i + 1]) raise ValueError("Unclosed JSON object") PAGE_PROMPT_TEMPLATE = """You are a page-rebuilder for a declassified UAP/UFO government document. Document: {doc_title} Page: {page_number} of {total_pages} STEP 1: Use the Read tool to view this image: {png_path} STEP 2: Analyze the page carefully and extract ALL content as structured chunks. STEP 3: Output ONLY a valid JSON object (no markdown, no code fences, no preamble): {{ "page_number": {page_number}, "chunks": [ {{ "order_in_page": 1, "type": "paragraph", "content_en": "verbatim text or description in English", "content_pt_br": "tradução em português brasileiro", "bbox": {{"x": 0.05, "y": 0.10, "w": 0.90, "h": 0.05}}, "classification": null, "formatting": [], "cross_page_hint": "self_contained", "ocr_confidence": 0.85, "ocr_source_lines": [], "redaction_code": null, "redaction_inferred_content_type": null, "image_type": null, "ufo_anomaly_detected": false, "ufo_anomaly_type": null, "ufo_anomaly_rationale": null, "cryptid_anomaly_detected": false, "cryptid_anomaly_type": null, "cryptid_anomaly_rationale": null, "image_description_en": null, "image_description_pt_br": null, "extracted_text": null }} ] }} ALLOWED chunk types (use only these exact strings): letterhead, classification_banner, header, subheader, paragraph, list_item, caption, footnote, page_number, signature_block, stamp, redaction_block, image, table_marker, form_field, watermark, separator, blank RULES: 1. Extract EVERY visible element — no skipping 2. bbox: normalized 0.0–1.0 (x=left, y=top, w=width, h=height) 3. content_en: verbatim OCR text for text elements; description for images 4. content_pt_br: Brazilian Portuguese (NOT European) translation 5. Preserve UTF-8 accents: ç, ã, á, é, í, ó, ú, â, ê, ô, à 6. Redacted sections: type="redaction_block", content_en="[REDACTED]", content_pt_br="[REDACTADO]" 7. Images/photos: type="image", fill image_description_en and image_description_pt_br 8. classification: visible marking text (e.g. "SECRET", "UNCLASSIFIED") or null 9. formatting: subset of ["bold","italic","underline","all_caps","handwritten","typewritten","strikethrough"] 10. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev" 11. ufo_anomaly_detected: true if chunk contains UAP sighting data, coordinates, witness accounts 12. Blank page: one chunk type="blank" 13. Order chunks top-to-bottom, left-to-right 14. Return ONLY the JSON — no text before or after OCR hint (may be empty): {ocr_text} """ def process_page(page_num: int) -> dict: png_path = PNG_DIR / f"p-{page_num:03d}.png" if not png_path.exists(): safe_print(f" WARNING p{page_num:03d}: PNG missing") return _error_page(page_num, "[PAGE NOT FOUND]", "[PÁGINA NÃO ENCONTRADA]") ocr_text = load_ocr(page_num) prompt = PAGE_PROMPT_TEMPLATE.format( doc_title=DOC_TITLE, page_number=page_num, total_pages=TOTAL_PAGES, png_path=str(png_path), ocr_text=ocr_text, ) for attempt in range(1, RETRIES + 1): try: cmd = [ "claude", "-p", "--model", "haiku", "--output-format", "json", "--max-turns", "3", "--allowedTools", "Read", "--add-dir", str(PNG_DIR), "--", prompt, ] res = subprocess.run( cmd, capture_output=True, text=True, timeout=TIMEOUT, check=False, ) if res.returncode != 0: raise RuntimeError(f"claude CLI rc={res.returncode}: {res.stderr[-1000:]}") cli_out = json.loads(res.stdout) if cli_out.get("is_error"): raise RuntimeError(f"claude error: {cli_out.get('result','')[:500]}") result_text = cli_out.get("result", "") data = extract_json(result_text) data["page_number"] = page_num n_chunks = len(data.get("chunks", [])) safe_print(f" p{page_num:03d} OK — {n_chunks} chunks") return data except subprocess.TimeoutExpired: safe_print(f" p{page_num:03d} TIMEOUT (attempt {attempt})") if attempt == RETRIES: return _error_page(page_num, "[TIMEOUT]", "[TIMEOUT]") time.sleep(5 * attempt) except (RuntimeError, json.JSONDecodeError, ValueError) as e: safe_print(f" p{page_num:03d} ERROR (attempt {attempt}): {str(e)[:200]}") if attempt == RETRIES: return _error_page(page_num, f"[ERROR: {str(e)[:80]}]", f"[ERRO: {str(e)[:80]}]") time.sleep(5 * attempt) return _error_page(page_num, "[UNKNOWN ERROR]", "[ERRO DESCONHECIDO]") def _error_page(page_num: int, msg_en: str, msg_pt: str) -> dict: return { "page_number": page_num, "chunks": [{ "order_in_page": 1, "type": "blank", "content_en": msg_en, "content_pt_br": msg_pt, "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}, "classification": None, "formatting": [], "cross_page_hint": "self_contained", "ocr_confidence": 0.0, "ocr_source_lines": [], "redaction_code": None, "redaction_inferred_content_type": None, "image_type": None, "ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None, "cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None, "image_description_en": None, "image_description_pt_br": None, "extracted_text": None, }] } def main(): pages = list(range(1, TOTAL_PAGES + 1)) results: dict[int, dict] = {} start_time = time.time() print(f"Processing {len(pages)} pages, {MAX_WORKERS} workers, batches of 5...") batch_size = 5 for b_start in range(0, len(pages), batch_size): batch = pages[b_start:b_start + batch_size] print(f"\nBatch {b_start//batch_size + 1}/{(len(pages)+batch_size-1)//batch_size}: pages {batch[0]}-{batch[-1]}") with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex: futures = {ex.submit(process_page, p): p for p in batch} for fut in as_completed(futures): p = futures[fut] try: results[p] = fut.result() except Exception as e: safe_print(f" p{p:03d} FATAL: {e}") results[p] = _error_page(p, f"[FATAL: {str(e)[:80]}]", f"[FATAL: {str(e)[:80]}]") # Pause between batches if b_start + batch_size < len(pages): time.sleep(2) elapsed = time.time() - start_time sorted_results = [results[p] for p in sorted(results.keys())] total_chunks = sum(len(r.get("chunks", [])) for r in sorted_results) out_path = OUTPUT_DIR / "_pages_raw.json" with open(out_path, "w", encoding="utf-8") as f: json.dump(sorted_results, f, ensure_ascii=False, indent=2) print(f"\nDone in {elapsed:.0f}s — {len(sorted_results)} pages, {total_chunks} chunks") print(f"Saved: {out_path}") if __name__ == "__main__": main()