#!/usr/bin/env python3 """ 02-vision-page.py — Fase 3 — Vision Haiku via Claude Code CLI (OAuth) Usa o `claude` CLI (plano Max 20x do usuário) — NÃO usa ANTHROPIC_API_KEY direta. Invoca `claude -p --model haiku` por subprocess para cada PNG. Para cada PNG em processing/png//p-NNN.png: 1. Lê OCR raw (processing/ocr//p-NNN.txt) 2. Chama claude CLI com prompt estruturado pedindo que use Read no PNG 3. Recebe JSON com page_type, content_classification, entities_extracted, etc. 4. Salva JSON em processing/vision//p-NNN.json 5. Escreve wiki/pages//p.md (frontmatter + corpo) — idioma ORIGINAL Idempotente: pula se vision JSON + page.md já existem (use --force para refazer). Uso: ./02-vision-page.py --doc-id dow-uap-d54-mission-report-mediterranean-sea-na [--force] [--max-pages N] ./02-vision-page.py --all """ from __future__ import annotations import argparse import hashlib import json import os import random import re import subprocess import sys import threading import time import unicodedata from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime, timezone from pathlib import Path try: import yaml except ImportError: sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n") sys.exit(1) try: from PIL import Image except ImportError: sys.stderr.write("Missing pillow. Run: pip3 install pillow\n") sys.exit(1) UFO_ROOT = Path("/Users/guto/ufo") RAW_DIR = UFO_ROOT / "raw" PNG_BASE = UFO_ROOT / "processing" / "png" OCR_BASE = UFO_ROOT / "processing" / "ocr" VISION_BASE = UFO_ROOT / "processing" / "vision" PAGES_BASE = UFO_ROOT / "wiki" / "pages" LOG_PATH = UFO_ROOT / "wiki" / "log.md" MODEL = "haiku" # claude-haiku-4-5 alias VISION_MODEL_FULL = "claude-haiku-4-5" WIKI_VERSION = "0.1.0" SCHEMA_VERSION = "0.1.0" MAX_TURNS = 3 DEFAULT_WORKERS = 4 DEFAULT_RETRIES = 3 DEFAULT_TIMEOUT = 180 _print_lock = threading.Lock() def safe_print(*args, **kwargs): """Thread-safe print.""" with _print_lock: print(*args, **kwargs, flush=True) VISION_JSON_SCHEMA = { "type": "object", "properties": { "page_type": {"type": "string"}, "content_classification": {"type": "array", "items": {"type": "string"}}, "language_detected": {"type": "string"}, "classification_markings": {"type": "array"}, "redactions": {"type": "array"}, "signatures_observed": {"type": "array"}, "tables_detected": {"type": "array"}, "images_detected": {"type": "array"}, "entities_extracted": {"type": "object"}, "uap_observation_fields": {}, "vision_description": {"type": "string"}, "ocr_quality_score": {"type": "number"}, "vision_quality_score": {"type": "number"}, "flags": {"type": "array"}, }, "required": [ "page_type", "content_classification", "language_detected", "vision_description", "entities_extracted", "redactions", "classification_markings", ], } def build_prompt(png_path: Path, ocr_text: str) -> str: """Build the prompt sent to claude CLI.""" return f"""You are an evidence officer in the Investigation Bureau, analyzing one page of a US Department of War UAP/UFO document released at war.gov/ufo. STEP 1: Use the Read tool to view this PNG of the page: {png_path} STEP 2: Combine what you SEE in the image with the raw pdftotext OCR below. OCR raw (pdftotext -layout): ``` {ocr_text} ``` STEP 3: Output ONE JSON object (no markdown fence, no commentary, no preamble) matching this exact schema: {{ "page_type": "cover|toc|body|signature|photo|sketch|map|stamp|blank|appendix|redaction-heavy|table-page|mixed", "content_classification": ["text-only"|"contains-photos"|"contains-sketches"|"contains-diagrams"|"contains-maps"|"contains-tables"|"contains-signatures"|"contains-stamps"|"redaction-heavy"|"mixed"|"blank"], "language_detected": "en|pt|es|fr|de|ru|unknown", "classification_markings": [ {{"level":"UNCLASSIFIED|CUI|CONFIDENTIAL|SECRET|TOP SECRET","caveats":["NOFORN"],"location":"header|footer|banner|stamp","bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}}}} ], "redactions": [ {{"code":"(b)(1) 1.4(a)|(b)(3)|(b)(6)|other","description":"...","bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}},"text_inferred":null}} ], "signatures_observed": [ {{"signer_inferred":null,"confidence_band":"low|medium|high","bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}},"notes":"..."}} ], "tables_detected": [ {{"local_table_index":1,"bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}},"spans_multi_page":false,"continues_from_prev_page":false,"likely_continues_next_page":false,"row_count_estimate":0,"col_count_estimate":0,"headers_summary":"..."}} ], "images_detected": [ {{"local_image_index":1,"image_type":"photo|sketch|map|chart|stamp|signature|redaction|logo|seal|diagram|other","bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}},"caption_ocr":"..."}} ], "vision_description": "Rich English description (2-5 sentences) of the page layout, visible elements, redaction extent, stamps, sketches, etc. PRESERVE ORIGINAL LANGUAGE of any quoted text from the document.", "vision_description_pt_br": "Same content as vision_description, translated to Brazilian Portuguese (pt-br). Use Brazilian spelling and idioms (NOT European Portuguese). Preserve UTF-8 accents (ã, é, ç, etc.). KEEP verbatim English quotes from the document in English (do not translate quoted text from the page itself); only the narrative description is translated.", "entities_extracted": {{ "people": [{{"name":"As written","role_in_page":"subject|witness|author|signer|mentioned"}}], "organizations": [{{"name":"As written","aliases":[]}}], "locations": [{{"name":"As written","type":"city|region|country|sea|strait|airbase|naval-base|mountain|desert|other"}}], "events": [{{"label":"Short label","date":"YYYY-MM-DD|YYYY|NA"}}], "uap_objects": [{{"shape":"sphere|disc|triangle|cylinder|cube|elongated-ellipsoid|cigar|irregular|unknown","color":"...","size_estimate":"..."}}], "vehicles": [{{"name":"...","class":"aircraft|ship|submarine|spacecraft|satellite|ground|other"}}], "operations": [{{"name":"...","type":"military-operation|reporting-protocol|research-program|task-force|foia-disclosure|other"}}], "concepts": [{{"name":"...","class":"legal-instrument|phenomenon-type|doctrine|scientific-term|jargon|program-name|other"}}] }}, "uap_observation_fields": {{ "date_time_utc":"...","duration_seconds":null,"shape":"...","color":"...","size_estimate":"...","altitude_ft":null,"speed_kts":null,"bearing_deg":null,"distance_nm":null,"coordinates":{{"lat":null,"lon":null,"raw_text":"..."}} }}, "ocr_quality_score": 0.0, "vision_quality_score": 0.0, "flags": ["low-ocr"|"heavy-redaction"|"rotated"|"scanned-twice"|"missing-page-number"] }} Rules: - Empty arrays for not-applicable fields. Do not omit keys. - bbox is normalized 0..1 (x,y,w,h) relative to the page image. - Entity NAMES, OCR-extracted strings, verbatim quotes, classification markings, redaction codes: ALWAYS in ORIGINAL source language (do NOT translate). Preserve original spelling, including any typos (e.g., "TRIANGLUAR" must stay as written). - ONLY `vision_description_pt_br` is the translation. Everything else stays in source language. - `vision_description_pt_br` must be Brazilian Portuguese (pt-br), NOT European Portuguese (pt-pt). Use Brazilian vocabulary and spelling. Preserve UTF-8 accentuation correctly (ç, ã, á, é, í, ó, ú, â, ê, ô, à). - uap_observation_fields = null when page has no UAP encounter block. - Output ONLY the JSON. No preamble, no fence, no commentary. """ def utc_now_iso(): return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") def filename_to_doc_id(filename: str) -> str: base = filename.rsplit(".", 1)[0] nfkd = unicodedata.normalize("NFKD", base) ascii_str = "".join(c for c in nfkd if not unicodedata.combining(c)) lower = ascii_str.lower() replaced = re.sub(r"[^a-z0-9-]", "-", lower) collapsed = re.sub(r"-+", "-", replaced).strip("-") if collapsed and collapsed[0].isdigit(): collapsed = "doc-" + collapsed return collapsed def sha256_file(p: Path) -> str: h = hashlib.sha256() with open(p, "rb") as fh: for chunk in iter(lambda: fh.read(65536), b""): h.update(chunk) return h.hexdigest() def extract_json(text: str) -> dict: """Extract JSON object from claude CLI output (may have markdown fences).""" text = text.strip() if text.startswith("```"): text = re.sub(r"^```(?:json)?\s*", "", text) text = re.sub(r"\s*```$", "", text) # Find first { and matching last } start = text.find("{") if start == -1: raise ValueError("No JSON object in response") # Track depth to find matching close depth = 0 for i, c in enumerate(text[start:], start): if c == "{": depth += 1 elif c == "}": depth -= 1 if depth == 0: return json.loads(text[start : i + 1]) raise ValueError("Unclosed JSON object") def call_claude_vision(png_path: Path, ocr_text: str, timeout: int = DEFAULT_TIMEOUT) -> tuple[dict, dict]: """Invoke `claude -p --model haiku` and return (vision_data, metadata). Single attempt.""" prompt = build_prompt(png_path, ocr_text) cmd = [ "claude", "-p", "--model", MODEL, "--output-format", "json", "--max-turns", str(MAX_TURNS), "--allowedTools", "Read", "--add-dir", str(png_path.parent), "--", prompt, ] res = subprocess.run( cmd, capture_output=True, text=True, timeout=timeout, check=False, ) if res.returncode != 0: raise RuntimeError(f"claude CLI failed (rc={res.returncode}): {res.stderr[-2000:]}") cli_output = json.loads(res.stdout) if cli_output.get("is_error"): raise RuntimeError(f"claude reported error: {cli_output.get('result', '')[:500]}") result_text = cli_output.get("result", "") vision_data = extract_json(result_text) metadata = { "duration_ms": cli_output.get("duration_ms"), "duration_api_ms": cli_output.get("duration_api_ms"), "total_cost_usd": cli_output.get("total_cost_usd"), "num_turns": cli_output.get("num_turns"), "session_id": cli_output.get("session_id"), "usage": cli_output.get("usage"), } return vision_data, metadata def call_with_retry( png_path: Path, ocr_text: str, retries: int = DEFAULT_RETRIES, base_backoff: float = 5.0, timeout: int = DEFAULT_TIMEOUT, ) -> tuple[dict, dict]: """Call vision with exponential backoff + jitter. Raises on final failure.""" last_err: Exception | None = None for attempt in range(1, retries + 1): try: return call_claude_vision(png_path, ocr_text, timeout=timeout) except subprocess.TimeoutExpired as e: last_err = e backoff = base_backoff * (2 ** (attempt - 1)) + random.uniform(0, 2) safe_print(f" timeout (attempt {attempt}/{retries}); sleeping {backoff:.1f}s") time.sleep(backoff) except RuntimeError as e: last_err = e msg = str(e).lower() transient = any(s in msg for s in ("overloaded", "rate", "429", "500", "502", "503", "504", "timeout", "connection")) if not transient or attempt == retries: raise backoff = base_backoff * (2 ** (attempt - 1)) + random.uniform(0, 2) safe_print(f" transient error (attempt {attempt}/{retries}); sleeping {backoff:.1f}s") time.sleep(backoff) except json.JSONDecodeError as e: last_err = e if attempt == retries: raise backoff = base_backoff * (2 ** (attempt - 1)) + random.uniform(0, 2) safe_print(f" JSON parse error (attempt {attempt}/{retries}); sleeping {backoff:.1f}s") time.sleep(backoff) if last_err: raise last_err raise RuntimeError("unreachable") def render_page_md( *, doc_id: str, page_num: int, total_pages: int, png_path: Path, ocr_path: Path, vision_path: Path, vision_data: dict, png_dimensions: tuple[int, int], now_iso: str, ) -> str: padded = f"{page_num:03d}" page_id = f"{doc_id}/p{padded}" frontmatter = { "schema_version": SCHEMA_VERSION, "type": "page", "page_id": page_id, "doc_id": doc_id, "page_number": page_num, "total_pages": total_pages, "png_path": f"../../../processing/png/{doc_id}/{png_path.name}", "png_sha256": sha256_file(png_path), "png_dpi": 200, "png_width": png_dimensions[0], "png_height": png_dimensions[1], "ocr_raw_path": f"../../../processing/ocr/{doc_id}/{ocr_path.name}", "vision_raw_path": f"../../../processing/vision/{doc_id}/{vision_path.name}", "vision_model": VISION_MODEL_FULL, "vision_run_at": now_iso, "page_type": vision_data.get("page_type", "body"), "content_classification": vision_data.get("content_classification", []), "language_detected": vision_data.get("language_detected", "unknown"), "classification_markings": vision_data.get("classification_markings", []), "redactions": vision_data.get("redactions", []), "signatures_observed": vision_data.get("signatures_observed", []), "tables_detected": vision_data.get("tables_detected", []), "images_detected": vision_data.get("images_detected", []), "entities_extracted": vision_data.get("entities_extracted", {}), "uap_observation_fields": vision_data.get("uap_observation_fields"), "vision_description": vision_data.get("vision_description", ""), "vision_description_pt_br": vision_data.get("vision_description_pt_br", ""), "ocr_quality_score": vision_data.get("ocr_quality_score", 0.0), "vision_quality_score": vision_data.get("vision_quality_score", 0.0), "flags": vision_data.get("flags", []), "last_ingest": now_iso, "last_lint": None, "wiki_version": WIKI_VERSION, } yaml_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False) ocr_text = ocr_path.read_text(encoding="utf-8", errors="replace").strip() pt_desc = vision_data.get("vision_description_pt_br", "").strip() en_desc = vision_data.get("vision_description", "").strip() body = f"""# [[{doc_id}]] — Page {page_num} of {total_pages} ![Page {page_num}](../../../processing/png/{doc_id}/{png_path.name}) ## OCR Text (raw, original language) ``` {ocr_text} ``` ## Vision Description (EN) {en_desc} ## Descrição Vision (PT-BR) {pt_desc} ## Investigation Notes - `page_type`: `{vision_data.get("page_type", "unknown")}` - `content_classification`: {', '.join(f"`{c}`" for c in vision_data.get("content_classification", [])) or "_n/a_"} - `language_detected`: `{vision_data.get("language_detected", "unknown")}` - `flags`: {', '.join(f"`{f}`" for f in vision_data.get("flags", [])) or "_none_"} """ return f"---\n{yaml_str}---\n\n{body}" def _process_page( *, doc_id: str, png_path: Path, ocr_path: Path, vision_json_path: Path, page_md_path: Path, page_num: int, total_pages: int, retries: int, timeout: int, ) -> tuple[str, float, float, str | None]: """Process a single page. Returns (label, elapsed_seconds, cost_usd, error_or_none).""" padded = f"{page_num:03d}" t0 = time.time() try: with Image.open(png_path) as im: png_dimensions = im.size except Exception: png_dimensions = (0, 0) ocr_text = ocr_path.read_text(encoding="utf-8", errors="replace").strip() try: vision_data, meta = call_with_retry(png_path, ocr_text, retries=retries, timeout=timeout) except Exception as e: return (f"p{padded}", time.time() - t0, 0.0, str(e)[:300]) vision_json_path.write_text( json.dumps({"vision_data": vision_data, "meta": meta}, indent=2, ensure_ascii=False), encoding="utf-8", ) md = render_page_md( doc_id=doc_id, page_num=page_num, total_pages=total_pages, png_path=png_path, ocr_path=ocr_path, vision_path=vision_json_path, vision_data=vision_data, png_dimensions=png_dimensions, now_iso=utc_now_iso(), ) page_md_path.write_text(md, encoding="utf-8") elapsed = time.time() - t0 cost = meta.get("total_cost_usd", 0.0) or 0.0 return (f"p{padded}", elapsed, cost, None) def find_pdf_filename_for_doc_id(doc_id: str) -> str | None: """Reverse-canonicalize: scan raw/ for a PDF whose canonical doc_id matches.""" def _canon(fname: str) -> str: base = fname.rsplit(".", 1)[0] nfkd = unicodedata.normalize("NFKD", base) ascii_str = "".join(c for c in nfkd if not unicodedata.combining(c)) lower = ascii_str.lower() replaced = re.sub(r"[^a-z0-9-]", "-", lower) collapsed = re.sub(r"-+", "-", replaced).strip("-") if collapsed and collapsed[0].isdigit(): collapsed = "doc-" + collapsed return collapsed raw_dir = UFO_ROOT / "raw" for p in raw_dir.glob("*.pdf"): if _canon(p.name) == doc_id: return p.name return None def try_reconvert_from_raw(doc_id: str) -> bool: """Attempt to regenerate PNGs/OCR via scripts/01-convert-pdfs.sh. Returns True if reconvert succeeded (PNGs now exist), False otherwise.""" fname = find_pdf_filename_for_doc_id(doc_id) if not fname: safe_print(f" ⚠ PDF for {doc_id} not in raw/ — manual download required from https://www.war.gov/ufo/.pdf") return False script = UFO_ROOT / "scripts" / "01-convert-pdfs.sh" safe_print(f" ↻ re-converting from raw/{fname} ...") res = subprocess.run( [str(script), "--filename", fname], capture_output=True, text=True, timeout=300, check=False, ) if res.returncode != 0: safe_print(f" ✗ re-conversion failed: {res.stderr[-500:]}") return False return True def process_doc( doc_id: str, force: bool = False, max_pages: int | None = None, workers: int = DEFAULT_WORKERS, retries: int = DEFAULT_RETRIES, timeout: int = DEFAULT_TIMEOUT, ): png_dir = PNG_BASE / doc_id ocr_dir = OCR_BASE / doc_id vision_dir = VISION_BASE / doc_id pages_dir = PAGES_BASE / doc_id vision_dir.mkdir(parents=True, exist_ok=True) pages_dir.mkdir(parents=True, exist_ok=True) pngs = sorted(png_dir.glob("p-*.png")) if not pngs: # Fallback: try to re-convert from raw/.pdf safe_print(f"No PNGs for doc_id={doc_id} in {png_dir} — attempting re-conversion from raw/") if try_reconvert_from_raw(doc_id): pngs = sorted(png_dir.glob("p-*.png")) if not pngs: sys.stderr.write( f"FATAL: no PNGs for doc_id={doc_id} after re-conversion attempt.\n" f" Expected at: {png_dir}\n" f" Manual recovery: download the PDF from https://www.war.gov/ufo/.pdf\n" f" and place it in /Users/guto/ufo/raw/, then re-run this script.\n" ) return total_pages = len(pngs) if max_pages: pngs = pngs[:max_pages] # Build worklist (after skip filter) worklist = [] for png_path in pngs: m = re.match(r"p-(\d+)\.png$", png_path.name) if not m: continue page_num = int(m.group(1)) padded = f"{page_num:03d}" vision_json_path = vision_dir / f"p-{padded}.json" page_md_path = pages_dir / f"p{padded}.md" ocr_path = ocr_dir / f"p-{padded}.txt" if not ocr_path.exists(): safe_print(f" p{padded}: skip (missing OCR)") continue if not force and vision_json_path.exists() and page_md_path.exists(): continue # silently skip already-processed worklist.append((png_path, ocr_path, vision_json_path, page_md_path, page_num)) skipped = len(pngs) - len(worklist) safe_print(f"\n=== {doc_id} ({total_pages} total, {len(worklist)} to process, {skipped} skipped, {workers} workers) ===") if not worklist: return log_entries: list[str] = [] total_cost = 0.0 done = 0 started_at = time.time() def _job(args): png_path, ocr_path, vision_json_path, page_md_path, page_num = args return _process_page( doc_id=doc_id, png_path=png_path, ocr_path=ocr_path, vision_json_path=vision_json_path, page_md_path=page_md_path, page_num=page_num, total_pages=total_pages, retries=retries, timeout=timeout, ) with ThreadPoolExecutor(max_workers=workers) as pool: futures = {pool.submit(_job, item): item for item in worklist} for fut in as_completed(futures): label, elapsed, cost, err = fut.result() done += 1 total_cost += cost wall = time.time() - started_at if err: safe_print(f" [{done}/{len(worklist)}] {label}: FAILED ({elapsed:.1f}s) — {err}") log_entries.append(f" - {label}: vision error: {err}") else: rate = done / wall if wall > 0 else 0 eta = (len(worklist) - done) / rate if rate > 0 else 0 safe_print(f" [{done}/{len(worklist)}] {label}: ok ({elapsed:.1f}s, ${cost:.4f}) — wall {wall:.0f}s eta {eta:.0f}s") log_entries.append(f" - {label}: ok ({elapsed:.1f}s, ${cost:.4f})") wall = time.time() - started_at safe_print(f" Total: {done} pages in {wall:.0f}s ({wall / max(done,1):.1f}s/page avg), ${total_cost:.4f}") # Append to log if log_entries: with open(LOG_PATH, "a", encoding="utf-8") as fh: fh.write(f"\n## {utc_now_iso()} — VISION INGEST\n") fh.write( f"- operator: archivist (via claude CLI OAuth)\n" f"- doc_id: {doc_id}\n" f"- model: {VISION_MODEL_FULL}\n" f"- workers: {workers}\n" f"- pages_processed: {len(log_entries)}\n" f"- wall_seconds: {wall:.0f}\n" f"- total_cost_usd: {total_cost:.4f}\n" f"- results:\n" ) for entry in sorted(log_entries): fh.write(entry + "\n") def main(): ap = argparse.ArgumentParser(description="Vision-process each PNG of a UFO doc via claude CLI (OAuth).") g = ap.add_mutually_exclusive_group(required=True) g.add_argument("--doc-id", help="single doc_id (kebab-case)") g.add_argument("--all", action="store_true", help="process all docs in processing/png/") ap.add_argument("--force", action="store_true", help="reprocess existing pages") ap.add_argument("--max-pages", type=int, default=None, help="cap pages per doc (for smoke test)") ap.add_argument("--workers", type=int, default=DEFAULT_WORKERS, help=f"parallel workers per doc (default {DEFAULT_WORKERS})") ap.add_argument("--retries", type=int, default=DEFAULT_RETRIES, help=f"retries on transient errors (default {DEFAULT_RETRIES})") ap.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help=f"per-call timeout seconds (default {DEFAULT_TIMEOUT})") args = ap.parse_args() # Verify claude CLI is available try: subprocess.run(["claude", "--version"], capture_output=True, check=True, timeout=10) except (FileNotFoundError, subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: sys.stderr.write(f"claude CLI not found or not working: {e}\n") sys.exit(2) common = dict( force=args.force, max_pages=args.max_pages, workers=args.workers, retries=args.retries, timeout=args.timeout, ) if args.doc_id: process_doc(args.doc_id, **common) else: for doc_dir in sorted(PNG_BASE.iterdir()): if doc_dir.is_dir(): process_doc(doc_dir.name, **common) if __name__ == "__main__": main()