627 lines
24 KiB
Python
Executable file
627 lines
24 KiB
Python
Executable file
#!/usr/bin/env python3
"""
02-vision-page.py — Phase 3 — Vision Haiku via the Claude Code CLI (OAuth)

Uses the `claude` CLI (the user's Max 20x plan) — does NOT use ANTHROPIC_API_KEY directly.
Invokes `claude -p --model haiku` through subprocess for each PNG.

For each PNG in processing/png/<doc-id>/p-NNN.png:
1. Reads the raw OCR (processing/ocr/<doc-id>/p-NNN.txt)
2. Calls the claude CLI with a structured prompt asking it to use Read on the PNG
3. Receives JSON with page_type, content_classification, entities_extracted, etc.
4. Saves the JSON to processing/vision/<doc-id>/p-NNN.json
5. Writes wiki/pages/<doc-id>/p<NNN>.md (frontmatter + body) — ORIGINAL language

Idempotent: skips a page when both the vision JSON and the page .md already exist (use --force to redo).

Usage:
./02-vision-page.py --doc-id dow-uap-d54-mission-report-mediterranean-sea-na [--force] [--max-pages N]
./02-vision-page.py --all
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import random
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import threading
|
|
import time
|
|
import unicodedata
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n")
|
|
sys.exit(1)
|
|
|
|
try:
|
|
from PIL import Image
|
|
except ImportError:
|
|
sys.stderr.write("Missing pillow. Run: pip3 install pillow\n")
|
|
sys.exit(1)
|
|
|
|
|
|
# Absolute project root; every other path below is derived from it.
UFO_ROOT = Path("/Users/guto/ufo")
# Directory scanned for source PDFs (raw/*.pdf).
RAW_DIR = UFO_ROOT / "raw"
# Per-document page images: processing/png/<doc-id>/p-NNN.png
PNG_BASE = UFO_ROOT / "processing" / "png"
# Per-document OCR text: processing/ocr/<doc-id>/p-NNN.txt
OCR_BASE = UFO_ROOT / "processing" / "ocr"
# Per-document vision results: processing/vision/<doc-id>/p-NNN.json
VISION_BASE = UFO_ROOT / "processing" / "vision"
# Rendered wiki pages: wiki/pages/<doc-id>/pNNN.md
PAGES_BASE = UFO_ROOT / "wiki" / "pages"
# Markdown log that process_doc() appends one section to per run.
LOG_PATH = UFO_ROOT / "wiki" / "log.md"

# Model alias passed to `claude --model`.
MODEL = "haiku"  # claude-haiku-4-5 alias
# Model name recorded in page frontmatter and in the ingest log.
VISION_MODEL_FULL = "claude-haiku-4-5"
# Version strings written into every page's frontmatter.
WIKI_VERSION = "0.1.0"
SCHEMA_VERSION = "0.1.0"
# Value passed to `claude --max-turns`.
MAX_TURNS = 3
# Defaults for the --workers / --retries / --timeout command-line options.
DEFAULT_WORKERS = 4
DEFAULT_RETRIES = 3
DEFAULT_TIMEOUT = 180
|
|
|
|
# Serializes console output so lines from concurrent workers do not interleave.
_print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print while holding a module-wide lock, flushing by default.

    Accepts exactly the arguments of the built-in ``print``. Output is
    flushed unless the caller passes ``flush`` explicitly; previously an
    explicit ``flush=`` argument raised ``TypeError`` because the keyword
    was supplied twice.
    """
    kwargs.setdefault("flush", True)
    with _print_lock:
        print(*args, **kwargs)
|
|
|
|
|
|
# JSON-Schema-style description of the object the model is asked to return.
# NOTE(review): nothing in the visible part of this file validates against
# this schema, and it has no entry for `vision_description_pt_br` even though
# build_prompt() requests that key — confirm whether it is consumed elsewhere.
VISION_JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "page_type": {"type": "string"},
        "content_classification": {"type": "array", "items": {"type": "string"}},
        "language_detected": {"type": "string"},
        "classification_markings": {"type": "array"},
        "redactions": {"type": "array"},
        "signatures_observed": {"type": "array"},
        "tables_detected": {"type": "array"},
        "images_detected": {"type": "array"},
        "entities_extracted": {"type": "object"},
        # Left unconstrained: the prompt allows either an object or null here.
        "uap_observation_fields": {},
        "vision_description": {"type": "string"},
        "ocr_quality_score": {"type": "number"},
        "vision_quality_score": {"type": "number"},
        "flags": {"type": "array"},
    },
    "required": [
        "page_type",
        "content_classification",
        "language_detected",
        "vision_description",
        "entities_extracted",
        "redactions",
        "classification_markings",
    ],
}
|
|
|
|
|
|
def build_prompt(png_path: Path, ocr_text: str) -> str:
    """Build the prompt sent to the claude CLI for one page.

    Args:
        png_path: Path of the page image; interpolated verbatim into the
            prompt so the model can open it with its Read tool.
        ocr_text: Raw OCR text for the same page; embedded verbatim inside a
            fenced block.

    Returns:
        The full prompt text: instructions, the OCR block, the JSON template
        the model must fill in, and the output rules.
    """
    # Literal braces of the JSON template are doubled ({{ / }}) because this
    # is an f-string; only {png_path} and {ocr_text} are substituted.
    return f"""You are an evidence officer in the Investigation Bureau, analyzing one page of a US Department of War UAP/UFO document released at war.gov/ufo.

STEP 1: Use the Read tool to view this PNG of the page:
{png_path}

STEP 2: Combine what you SEE in the image with the raw pdftotext OCR below.

OCR raw (pdftotext -layout):
```
{ocr_text}
```

STEP 3: Output ONE JSON object (no markdown fence, no commentary, no preamble) matching this exact schema:

{{
"page_type": "cover|toc|body|signature|photo|sketch|map|stamp|blank|appendix|redaction-heavy|table-page|mixed",
"content_classification": ["text-only"|"contains-photos"|"contains-sketches"|"contains-diagrams"|"contains-maps"|"contains-tables"|"contains-signatures"|"contains-stamps"|"redaction-heavy"|"mixed"|"blank"],
"language_detected": "en|pt|es|fr|de|ru|unknown",
"classification_markings": [
{{"level":"UNCLASSIFIED|CUI|CONFIDENTIAL|SECRET|TOP SECRET","caveats":["NOFORN"],"location":"header|footer|banner|stamp","bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}}}}
],
"redactions": [
{{"code":"(b)(1) 1.4(a)|(b)(3)|(b)(6)|other","description":"...","bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}},"text_inferred":null}}
],
"signatures_observed": [
{{"signer_inferred":null,"confidence_band":"low|medium|high","bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}},"notes":"..."}}
],
"tables_detected": [
{{"local_table_index":1,"bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}},"spans_multi_page":false,"continues_from_prev_page":false,"likely_continues_next_page":false,"row_count_estimate":0,"col_count_estimate":0,"headers_summary":"..."}}
],
"images_detected": [
{{"local_image_index":1,"image_type":"photo|sketch|map|chart|stamp|signature|redaction|logo|seal|diagram|other","bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}},"caption_ocr":"..."}}
],
"vision_description": "Rich English description (2-5 sentences) of the page layout, visible elements, redaction extent, stamps, sketches, etc. PRESERVE ORIGINAL LANGUAGE of any quoted text from the document.",
"vision_description_pt_br": "Same content as vision_description, translated to Brazilian Portuguese (pt-br). Use Brazilian spelling and idioms (NOT European Portuguese). Preserve UTF-8 accents (ã, é, ç, etc.). KEEP verbatim English quotes from the document in English (do not translate quoted text from the page itself); only the narrative description is translated.",
"entities_extracted": {{
"people": [{{"name":"As written","role_in_page":"subject|witness|author|signer|mentioned"}}],
"organizations": [{{"name":"As written","aliases":[]}}],
"locations": [{{"name":"As written","type":"city|region|country|sea|strait|airbase|naval-base|mountain|desert|other"}}],
"events": [{{"label":"Short label","date":"YYYY-MM-DD|YYYY|NA"}}],
"uap_objects": [{{"shape":"sphere|disc|triangle|cylinder|cube|elongated-ellipsoid|cigar|irregular|unknown","color":"...","size_estimate":"..."}}],
"vehicles": [{{"name":"...","class":"aircraft|ship|submarine|spacecraft|satellite|ground|other"}}],
"operations": [{{"name":"...","type":"military-operation|reporting-protocol|research-program|task-force|foia-disclosure|other"}}],
"concepts": [{{"name":"...","class":"legal-instrument|phenomenon-type|doctrine|scientific-term|jargon|program-name|other"}}]
}},
"uap_observation_fields": {{
"date_time_utc":"...","duration_seconds":null,"shape":"...","color":"...","size_estimate":"...","altitude_ft":null,"speed_kts":null,"bearing_deg":null,"distance_nm":null,"coordinates":{{"lat":null,"lon":null,"raw_text":"..."}}
}},
"ocr_quality_score": 0.0,
"vision_quality_score": 0.0,
"flags": ["low-ocr"|"heavy-redaction"|"rotated"|"scanned-twice"|"missing-page-number"]
}}

Rules:
- Empty arrays for not-applicable fields. Do not omit keys.
- bbox is normalized 0..1 (x,y,w,h) relative to the page image.
- Entity NAMES, OCR-extracted strings, verbatim quotes, classification markings, redaction codes: ALWAYS in ORIGINAL source language (do NOT translate). Preserve original spelling, including any typos (e.g., "TRIANGLUAR" must stay as written).
- ONLY `vision_description_pt_br` is the translation. Everything else stays in source language.
- `vision_description_pt_br` must be Brazilian Portuguese (pt-br), NOT European Portuguese (pt-pt). Use Brazilian vocabulary and spelling. Preserve UTF-8 accentuation correctly (ç, ã, á, é, í, ó, ú, â, ê, ô, à).
- uap_observation_fields = null when page has no UAP encounter block.
- Output ONLY the JSON. No preamble, no fence, no commentary.
"""
|
|
|
|
|
|
def utc_now_iso():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    moment = datetime.now(tz=timezone.utc).replace(microsecond=0)
    # isoformat() renders the UTC offset as "+00:00"; the file's convention is "Z".
    return moment.isoformat().replace("+00:00", "Z")
|
|
|
|
|
|
def filename_to_doc_id(filename: str) -> str:
    """Canonicalize a file name into a lowercase kebab-case document id.

    The text after the last dot is dropped, accents are stripped via NFKD
    decomposition, every character outside ``[a-z0-9-]`` becomes a hyphen,
    hyphen runs are collapsed and trimmed, and ids that would start with a
    digit are prefixed with ``doc-``.
    """
    stem, dot, _extension = filename.rpartition(".")
    if not dot:
        # No dot at all: the whole name is the stem.
        stem = filename

    decomposed = unicodedata.normalize("NFKD", stem)
    unaccented = "".join(ch for ch in decomposed if unicodedata.combining(ch) == 0)

    slug = re.sub(r"[^a-z0-9-]", "-", unaccented.lower())
    slug = re.sub(r"-+", "-", slug).strip("-")

    if slug[:1].isdigit():
        return "doc-" + slug
    return slug
|
|
|
|
|
|
def sha256_file(p: Path) -> str:
    """Return the hex SHA-256 digest of the file at *p*, read in 64 KiB chunks."""
    digest = hashlib.sha256()
    with open(p, "rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
|
|
|
|
|
def extract_json(text: str) -> dict:
    """Extract the first JSON object from claude CLI output.

    Parsing starts at the first ``{`` and is done by the JSON decoder itself,
    so a markdown fence or other text around the object is ignored, and
    braces that appear inside JSON string values no longer confuse the
    extraction (the previous brace-counting scan ended the object early on a
    ``}`` inside a string).

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If *text* contains no ``{`` at all.
        json.JSONDecodeError: If the text starting at the first ``{`` is not
            a complete, valid JSON object. This is a ``ValueError`` subclass,
            so callers catching ``ValueError`` keep working; truncated
            output now surfaces as this type as well.
    """
    text = text.strip()
    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON object in response")
    # raw_decode parses exactly one JSON value beginning at `start` and
    # tolerates trailing content (closing fence, commentary).
    obj, _end = json.JSONDecoder().raw_decode(text, start)
    return obj
|
|
|
|
|
|
def call_claude_vision(png_path: Path, ocr_text: str, timeout: int = DEFAULT_TIMEOUT) -> tuple[dict, dict]:
    """Run `claude -p --model haiku` once for a page and return (vision_data, metadata).

    Args:
        png_path: Page image; its parent directory is passed via ``--add-dir``.
        ocr_text: Raw OCR text embedded in the prompt.
        timeout: Seconds before ``subprocess.run`` gives up.

    Returns:
        ``(vision_data, metadata)`` where ``vision_data`` is the JSON object
        extracted from the CLI's ``result`` field and ``metadata`` holds the
        timing, cost, turn, session and usage fields of the CLI's JSON
        output (``None`` for any field that is absent).

    Raises:
        RuntimeError: Non-zero exit code, or the CLI output sets ``is_error``.
        subprocess.TimeoutExpired: The CLI did not finish within *timeout*.
        json.JSONDecodeError: stdout is not valid JSON.
        ValueError: Propagated from ``extract_json``.
    """
    argv = ["claude", "-p"]
    argv += ["--model", MODEL]
    argv += ["--output-format", "json"]
    argv += ["--max-turns", str(MAX_TURNS)]
    argv += ["--allowedTools", "Read"]
    argv += ["--add-dir", str(png_path.parent)]
    # "--" ends option parsing so the prompt text is never read as a flag.
    argv += ["--", build_prompt(png_path, ocr_text)]

    proc = subprocess.run(argv, capture_output=True, text=True, timeout=timeout, check=False)
    if proc.returncode != 0:
        raise RuntimeError(f"claude CLI failed (rc={proc.returncode}): {proc.stderr[-2000:]}")

    envelope = json.loads(proc.stdout)
    if envelope.get("is_error"):
        raise RuntimeError(f"claude reported error: {envelope.get('result', '')[:500]}")

    vision_data = extract_json(envelope.get("result", ""))

    meta_fields = (
        "duration_ms",
        "duration_api_ms",
        "total_cost_usd",
        "num_turns",
        "session_id",
        "usage",
    )
    metadata = {field: envelope.get(field) for field in meta_fields}
    return vision_data, metadata
|
|
|
|
|
|
def call_with_retry(
    png_path: Path,
    ocr_text: str,
    retries: int = DEFAULT_RETRIES,
    base_backoff: float = 5.0,
    timeout: int = DEFAULT_TIMEOUT,
) -> tuple[dict, dict]:
    """Call the vision CLI with exponential backoff plus jitter.

    Args:
        png_path: Page image handed to ``call_claude_vision``.
        ocr_text: Raw OCR text handed to ``call_claude_vision``.
        retries: Total number of attempts. Values below 1 are treated as 1,
            so the call is always tried at least once.
        base_backoff: Sleep before the second attempt, in seconds; doubled
            for each further attempt, plus 0-2 s of random jitter.
        timeout: Per-attempt timeout forwarded to ``call_claude_vision``.

    Returns:
        The ``(vision_data, metadata)`` tuple of the first successful attempt.

    Raises:
        subprocess.TimeoutExpired: The last attempt timed out.
        RuntimeError: The CLI failed with a non-transient error, or with a
            transient one on the last attempt.
        json.JSONDecodeError: The last attempt produced unparsable JSON.
        Exception: Anything else raised by ``call_claude_vision`` propagates
            immediately without a retry.
    """
    # Substrings of a lower-cased RuntimeError message that mark it retryable.
    transient_markers = ("overloaded", "rate", "429", "500", "502", "503", "504", "timeout", "connection")
    attempts = max(1, retries)

    for attempt in range(1, attempts + 1):
        final = attempt == attempts
        try:
            return call_claude_vision(png_path, ocr_text, timeout=timeout)
        except subprocess.TimeoutExpired:
            # Raise at once on the final attempt instead of sleeping first.
            if final:
                raise
            reason = "timeout"
        except RuntimeError as e:
            msg = str(e).lower()
            if final or not any(marker in msg for marker in transient_markers):
                raise
            reason = "transient error"
        except json.JSONDecodeError:
            if final:
                raise
            reason = "JSON parse error"

        backoff = base_backoff * (2 ** (attempt - 1)) + random.uniform(0, 2)
        safe_print(f" {reason} (attempt {attempt}/{attempts}); sleeping {backoff:.1f}s")
        time.sleep(backoff)

    # Every iteration either returns or raises on the final attempt.
    raise RuntimeError("unreachable")
|
|
|
|
|
|
def render_page_md(
    *,
    doc_id: str,
    page_num: int,
    total_pages: int,
    png_path: Path,
    ocr_path: Path,
    vision_path: Path,
    vision_data: dict,
    png_dimensions: tuple[int, int],
    now_iso: str,
) -> str:
    """Render the wiki markdown (YAML frontmatter + body) for one page.

    Args:
        doc_id: Document id; used in ``page_id``, links and relative paths.
        page_num: Page number of this page.
        total_pages: Page count of the whole document.
        png_path: Page image; hashed for ``png_sha256`` and linked by name.
        ocr_path: Raw OCR text file; read and embedded in the body.
        vision_path: Vision JSON file; only its name is used, for a link.
        vision_data: Object returned by the model for this page.
        png_dimensions: ``(width, height)`` of the PNG.
        now_iso: Timestamp recorded as ``vision_run_at`` and ``last_ingest``.

    Returns:
        The complete page text: ``---`` fenced YAML frontmatter followed by
        the markdown body.

    Model values are copied into the frontmatter unchanged. Only the body
    rendering is defensive: a ``null`` description, ``content_classification``
    or ``flags`` is rendered as empty instead of raising.
    """
    padded = f"{page_num:03d}"
    page_id = f"{doc_id}/p{padded}"
    png_rel = f"../../../processing/png/{doc_id}/{png_path.name}"

    frontmatter = {
        "schema_version": SCHEMA_VERSION,
        "type": "page",
        "page_id": page_id,
        "doc_id": doc_id,
        "page_number": page_num,
        "total_pages": total_pages,
        "png_path": png_rel,
        "png_sha256": sha256_file(png_path),
        # Hard-coded here; not measured from the image.
        "png_dpi": 200,
        "png_width": png_dimensions[0],
        "png_height": png_dimensions[1],
        "ocr_raw_path": f"../../../processing/ocr/{doc_id}/{ocr_path.name}",
        "vision_raw_path": f"../../../processing/vision/{doc_id}/{vision_path.name}",
        "vision_model": VISION_MODEL_FULL,
        "vision_run_at": now_iso,
        "page_type": vision_data.get("page_type", "body"),
        "content_classification": vision_data.get("content_classification", []),
        "language_detected": vision_data.get("language_detected", "unknown"),
        "classification_markings": vision_data.get("classification_markings", []),
        "redactions": vision_data.get("redactions", []),
        "signatures_observed": vision_data.get("signatures_observed", []),
        "tables_detected": vision_data.get("tables_detected", []),
        "images_detected": vision_data.get("images_detected", []),
        "entities_extracted": vision_data.get("entities_extracted", {}),
        "uap_observation_fields": vision_data.get("uap_observation_fields"),
        "vision_description": vision_data.get("vision_description", ""),
        "vision_description_pt_br": vision_data.get("vision_description_pt_br", ""),
        "ocr_quality_score": vision_data.get("ocr_quality_score", 0.0),
        "vision_quality_score": vision_data.get("vision_quality_score", 0.0),
        "flags": vision_data.get("flags", []),
        "last_ingest": now_iso,
        "last_lint": None,
        "wiki_version": WIKI_VERSION,
    }

    yaml_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False)
    ocr_text = ocr_path.read_text(encoding="utf-8", errors="replace").strip()

    # `or` guards: the key may be present with a null value, in which case
    # dict.get's default is not used and .strip() / iteration would raise.
    pt_desc = str(vision_data.get("vision_description_pt_br") or "").strip()
    en_desc = str(vision_data.get("vision_description") or "").strip()
    classification_md = ", ".join(f"`{c}`" for c in (vision_data.get("content_classification") or [])) or "_n/a_"
    flags_md = ", ".join(f"`{f}`" for f in (vision_data.get("flags") or [])) or "_none_"

    body = f"""# [[{doc_id}]] — Page {page_num} of {total_pages}



## OCR Text (raw, original language)

```
{ocr_text}
```

## Vision Description (EN)

{en_desc}

## Descrição Vision (PT-BR)

{pt_desc}

## Investigation Notes

- `page_type`: `{vision_data.get("page_type", "unknown")}`
- `content_classification`: {classification_md}
- `language_detected`: `{vision_data.get("language_detected", "unknown")}`
- `flags`: {flags_md}
"""
    return f"---\n{yaml_str}---\n\n{body}"
|
|
|
|
|
|
def _process_page(
    *,
    doc_id: str,
    png_path: Path,
    ocr_path: Path,
    vision_json_path: Path,
    page_md_path: Path,
    page_num: int,
    total_pages: int,
    retries: int,
    timeout: int,
) -> tuple[str, float, float, str | None]:
    """Process a single page and report the outcome instead of raising.

    Reads the PNG size and OCR text, runs the vision call (with retries),
    writes the vision JSON, then renders and writes the page markdown.

    Args:
        doc_id: Document id the page belongs to.
        png_path: Page image.
        ocr_path: Raw OCR text for the page.
        vision_json_path: Destination of the vision JSON.
        page_md_path: Destination of the rendered page markdown.
        page_num: Page number of this page.
        total_pages: Page count of the whole document.
        retries: Attempt count forwarded to ``call_with_retry``.
        timeout: Per-attempt timeout forwarded to ``call_with_retry``.

    Returns:
        ``(label, elapsed_seconds, cost_usd, error_or_none)``. The error is a
        non-empty string (at most 300 characters) on failure and ``None`` on
        success. ``cost_usd`` is the CLI-reported cost whenever the vision
        call itself succeeded, even if a later write or render step failed.
    """
    label = f"p{page_num:03d}"
    t0 = time.time()
    cost = 0.0

    try:
        # Image size is best-effort metadata: an unreadable PNG is recorded
        # as 0x0 rather than failing the page.
        try:
            with Image.open(png_path) as im:
                png_dimensions = im.size
        except Exception:
            png_dimensions = (0, 0)

        ocr_text = ocr_path.read_text(encoding="utf-8", errors="replace").strip()
        vision_data, meta = call_with_retry(png_path, ocr_text, retries=retries, timeout=timeout)
        cost = meta.get("total_cost_usd", 0.0) or 0.0

        vision_json_path.write_text(
            json.dumps({"vision_data": vision_data, "meta": meta}, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

        md = render_page_md(
            doc_id=doc_id,
            page_num=page_num,
            total_pages=total_pages,
            png_path=png_path,
            ocr_path=ocr_path,
            vision_path=vision_json_path,
            vision_data=vision_data,
            png_dimensions=png_dimensions,
            now_iso=utc_now_iso(),
        )
        page_md_path.write_text(md, encoding="utf-8")
    except Exception as e:
        # Worker boundary: every failure becomes a result tuple so one bad
        # page cannot abort the whole document. Fall back to the exception
        # class name, because an empty message would read as "no error".
        return (label, time.time() - t0, cost, str(e)[:300] or type(e).__name__)

    return (label, time.time() - t0, cost, None)
|
|
|
|
|
|
def find_pdf_filename_for_doc_id(doc_id: str) -> str | None:
    """Find the PDF in raw/ whose canonical doc id equals *doc_id*.

    Canonicalization is delegated to ``filename_to_doc_id`` so lookup and id
    generation cannot drift apart (this function used to carry its own copy
    of the same rules).

    Args:
        doc_id: Canonical document id to look up.

    Returns:
        The matching file name (not the full path), or ``None`` when no
        ``*.pdf`` in ``RAW_DIR`` matches. Candidates are checked in sorted
        order, so the result is deterministic if several files map to the
        same id.
    """
    for pdf in sorted(RAW_DIR.glob("*.pdf")):
        if filename_to_doc_id(pdf.name) == doc_id:
            return pdf.name
    return None
|
|
|
|
|
|
def try_reconvert_from_raw(doc_id: str) -> bool:
    """Attempt to regenerate PNGs/OCR by running scripts/01-convert-pdfs.sh.

    Args:
        doc_id: Document whose source PDF should be converted again.

    Returns:
        True when the conversion script exited with status 0. False when no
        matching PDF exists in raw/, the script could not be started, it
        timed out after 300 s, or it exited non-zero. This function does not
        check that PNGs now exist; the caller re-scans the directory.
    """
    fname = find_pdf_filename_for_doc_id(doc_id)
    if not fname:
        safe_print(f" ⚠ PDF for {doc_id} not in raw/ — manual download required from https://www.war.gov/ufo/<filename>.pdf")
        return False

    script = UFO_ROOT / "scripts" / "01-convert-pdfs.sh"
    safe_print(f" ↻ re-converting from raw/{fname} ...")
    try:
        res = subprocess.run(
            [str(script), "--filename", fname],
            capture_output=True,
            text=True,
            timeout=300,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired) as e:
        # A missing or non-executable script, or a hung conversion, is a
        # failed re-conversion, not a reason to crash the whole run.
        safe_print(f" ✗ re-conversion failed: {e}")
        return False

    if res.returncode != 0:
        safe_print(f" ✗ re-conversion failed: {res.stderr[-500:]}")
        return False
    return True
|
|
|
|
|
|
def process_doc(
    doc_id: str,
    force: bool = False,
    max_pages: int | None = None,
    workers: int = DEFAULT_WORKERS,
    retries: int = DEFAULT_RETRIES,
    timeout: int = DEFAULT_TIMEOUT,
):
    """Vision-process every pending page of one document and log the run.

    Args:
        doc_id: Document id; selects the png/ocr/vision/pages subdirectories.
        force: Reprocess pages whose vision JSON and page .md already exist.
        max_pages: Only consider the first N PNGs (falsy means no cap).
        workers: Number of pages processed in parallel; values below 1 are
            treated as 1.
        retries: Attempts per page, forwarded to the worker.
        timeout: Per-attempt timeout in seconds, forwarded to the worker.

    Returns:
        None. Results are written to the vision/pages directories, progress
        is printed, and a summary section is appended to ``LOG_PATH``.
    """
    png_dir = PNG_BASE / doc_id
    ocr_dir = OCR_BASE / doc_id
    vision_dir = VISION_BASE / doc_id
    pages_dir = PAGES_BASE / doc_id
    vision_dir.mkdir(parents=True, exist_ok=True)
    pages_dir.mkdir(parents=True, exist_ok=True)

    # ThreadPoolExecutor rejects max_workers <= 0.
    workers = max(1, workers)

    pngs = sorted(png_dir.glob("p-*.png"))
    if not pngs:
        # Fallback: try to re-convert from raw/<file>.pdf
        safe_print(f"No PNGs for doc_id={doc_id} in {png_dir} — attempting re-conversion from raw/")
        if try_reconvert_from_raw(doc_id):
            pngs = sorted(png_dir.glob("p-*.png"))
        if not pngs:
            sys.stderr.write(
                f"FATAL: no PNGs for doc_id={doc_id} after re-conversion attempt.\n"
                f" Expected at: {png_dir}\n"
                f" Manual recovery: download the PDF from https://www.war.gov/ufo/<filename>.pdf\n"
                f" and place it in /Users/guto/ufo/raw/, then re-run this script.\n"
            )
            return

    # total_pages is the document's full page count, taken before the cap.
    total_pages = len(pngs)
    if max_pages:
        pngs = pngs[:max_pages]

    # Build worklist (after skip filter)
    worklist = []
    for png_path in pngs:
        m = re.match(r"p-(\d+)\.png$", png_path.name)
        if not m:
            continue
        page_num = int(m.group(1))
        padded = f"{page_num:03d}"
        vision_json_path = vision_dir / f"p-{padded}.json"
        page_md_path = pages_dir / f"p{padded}.md"
        ocr_path = ocr_dir / f"p-{padded}.txt"
        if not ocr_path.exists():
            safe_print(f" p{padded}: skip (missing OCR)")
            continue
        if not force and vision_json_path.exists() and page_md_path.exists():
            continue  # silently skip already-processed
        worklist.append((png_path, ocr_path, vision_json_path, page_md_path, page_num))

    skipped = len(pngs) - len(worklist)
    safe_print(f"\n=== {doc_id} ({total_pages} total, {len(worklist)} to process, {skipped} skipped, {workers} workers) ===")

    if not worklist:
        return

    log_entries: list[str] = []
    total_cost = 0.0
    done = 0
    started_at = time.time()

    def _job(args):
        png_path, ocr_path, vision_json_path, page_md_path, page_num = args
        return _process_page(
            doc_id=doc_id,
            png_path=png_path,
            ocr_path=ocr_path,
            vision_json_path=vision_json_path,
            page_md_path=page_md_path,
            page_num=page_num,
            total_pages=total_pages,
            retries=retries,
            timeout=timeout,
        )

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(_job, item): item for item in worklist}
        for fut in as_completed(futures):
            try:
                label, elapsed, cost, err = fut.result()
            except Exception as exc:
                # A worker that raised must not abort the document or lose
                # the log of pages already processed; record it as a failed
                # page. The page number is the last field of the work item.
                label = f"p{futures[fut][-1]:03d}"
                elapsed, cost = 0.0, 0.0
                err = str(exc)[:300] or type(exc).__name__
            done += 1
            total_cost += cost
            wall = time.time() - started_at
            if err:
                safe_print(f" [{done}/{len(worklist)}] {label}: FAILED ({elapsed:.1f}s) — {err}")
                log_entries.append(f" - {label}: vision error: {err}")
            else:
                rate = done / wall if wall > 0 else 0
                eta = (len(worklist) - done) / rate if rate > 0 else 0
                safe_print(f" [{done}/{len(worklist)}] {label}: ok ({elapsed:.1f}s, ${cost:.4f}) — wall {wall:.0f}s eta {eta:.0f}s")
                log_entries.append(f" - {label}: ok ({elapsed:.1f}s, ${cost:.4f})")

    wall = time.time() - started_at
    safe_print(f" Total: {done} pages in {wall:.0f}s ({wall / max(done,1):.1f}s/page avg), ${total_cost:.4f}")

    # Append to log
    if log_entries:
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(f"\n## {utc_now_iso()} — VISION INGEST\n")
            fh.write(
                f"- operator: archivist (via claude CLI OAuth)\n"
                f"- doc_id: {doc_id}\n"
                f"- model: {VISION_MODEL_FULL}\n"
                f"- workers: {workers}\n"
                f"- pages_processed: {len(log_entries)}\n"
                f"- wall_seconds: {wall:.0f}\n"
                f"- total_cost_usd: {total_cost:.4f}\n"
                f"- results:\n"
            )
            # Completion order is arbitrary; sort so the log reads in page order.
            for entry in sorted(log_entries):
                fh.write(entry + "\n")
|
|
|
|
|
|
def main():
    """Parse command-line options, check the claude CLI, and process the requested document(s)."""
    parser = argparse.ArgumentParser(description="Vision-process each PNG of a UFO doc via claude CLI (OAuth).")

    # Exactly one of --doc-id / --all must be given.
    target = parser.add_mutually_exclusive_group(required=True)
    target.add_argument("--doc-id", help="single doc_id (kebab-case)")
    target.add_argument("--all", action="store_true", help="process all docs in processing/png/")

    parser.add_argument("--force", action="store_true", help="reprocess existing pages")
    parser.add_argument("--max-pages", type=int, default=None, help="cap pages per doc (for smoke test)")
    parser.add_argument("--workers", type=int, default=DEFAULT_WORKERS, help=f"parallel workers per doc (default {DEFAULT_WORKERS})")
    parser.add_argument("--retries", type=int, default=DEFAULT_RETRIES, help=f"retries on transient errors (default {DEFAULT_RETRIES})")
    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help=f"per-call timeout seconds (default {DEFAULT_TIMEOUT})")
    opts = parser.parse_args()

    # Verify claude CLI is available
    try:
        subprocess.run(["claude", "--version"], capture_output=True, check=True, timeout=10)
    except (FileNotFoundError, subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
        sys.stderr.write(f"claude CLI not found or not working: {e}\n")
        sys.exit(2)

    common = {
        "force": opts.force,
        "max_pages": opts.max_pages,
        "workers": opts.workers,
        "retries": opts.retries,
        "timeout": opts.timeout,
    }

    if opts.doc_id:
        doc_ids = (opts.doc_id,)
    else:
        # Generator: each entry's is_dir() check runs just before that doc is processed.
        doc_ids = (entry.name for entry in sorted(PNG_BASE.iterdir()) if entry.is_dir())

    for doc_id in doc_ids:
        process_doc(doc_id, **common)


if __name__ == "__main__":
    main()
|