disclosure-bureau/scripts/02-vision-page.py

#!/usr/bin/env python3
"""
02-vision-page.py — Fase 3 — Vision Haiku via Claude Code CLI (OAuth)

Usa o `claude` CLI (plano Max 20x do usuário) — NÃO usa ANTHROPIC_API_KEY direta.
Invoca `claude -p --model haiku` por subprocess para cada PNG.

Para cada PNG em processing/png/<doc-id>/p-NNN.png:
  1. Lê OCR raw (processing/ocr/<doc-id>/p-NNN.txt)
  2. Chama claude CLI com prompt estruturado pedindo que use Read no PNG
  3. Recebe JSON com page_type, content_classification, entities_extracted, etc.
  4. Salva JSON em processing/vision/<doc-id>/p-NNN.json
  5. Escreve wiki/pages/<doc-id>/p<NNN>.md (frontmatter + corpo) — idioma ORIGINAL

Idempotente: pula se vision JSON + page.md já existem (use --force para refazer).

Uso:
  ./02-vision-page.py --doc-id dow-uap-d54-mission-report-mediterranean-sea-na [--force] [--max-pages N]
  ./02-vision-page.py --all
"""
from __future__ import annotations

import argparse
import hashlib
import json
import os
import random
import re
import subprocess
import sys
import threading
import time
import unicodedata
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n")
    sys.exit(1)

try:
    from PIL import Image
except ImportError:
    sys.stderr.write("Missing pillow. Run: pip3 install pillow\n")
    sys.exit(1)


# Filesystem layout. Every path below is derived from UFO_ROOT, which is a
# hard-coded absolute location.
UFO_ROOT = Path("/Users/guto/ufo")
RAW_DIR = UFO_ROOT / "raw"  # source PDFs
PNG_BASE = UFO_ROOT / "processing" / "png"  # page images: <doc-id>/p-NNN.png
OCR_BASE = UFO_ROOT / "processing" / "ocr"  # raw OCR text: <doc-id>/p-NNN.txt
VISION_BASE = UFO_ROOT / "processing" / "vision"  # model output: <doc-id>/p-NNN.json
PAGES_BASE = UFO_ROOT / "wiki" / "pages"  # rendered pages: <doc-id>/pNNN.md
LOG_PATH = UFO_ROOT / "wiki" / "log.md"  # run log; entries are appended per document

MODEL = "haiku"   # claude-haiku-4-5 alias
VISION_MODEL_FULL = "claude-haiku-4-5"  # recorded in page frontmatter and the run log
WIKI_VERSION = "0.1.0"  # written to each page's `wiki_version` frontmatter key
SCHEMA_VERSION = "0.1.0"  # written to each page's `schema_version` frontmatter key
MAX_TURNS = 3  # passed to the CLI as --max-turns
DEFAULT_WORKERS = 4  # default for --workers (thread pool size per document)
DEFAULT_RETRIES = 3  # default for --retries
DEFAULT_TIMEOUT = 180  # default for --timeout, in seconds per CLI call

# Serializes console output so lines printed by worker threads do not interleave.
_print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print while holding a module-wide lock, flushing by default.

    Accepts the same positional and keyword arguments as the built-in
    ``print``. ``flush`` defaults to True, but an explicit ``flush=...`` from
    the caller is honored instead of raising a duplicate-keyword ``TypeError``.
    """
    kwargs.setdefault("flush", True)
    with _print_lock:
        print(*args, **kwargs)


# JSON-Schema-style description of the object the model is asked to return.
# NOTE(review): no function in this file reads this constant, so replies are
# not validated against it here — confirm whether another module uses it.
# It also does not list `vision_description_pt_br`, which the prompt built by
# build_prompt() requests and render_page_md() reads.
VISION_JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "page_type": {"type": "string"},
        "content_classification": {"type": "array", "items": {"type": "string"}},
        "language_detected": {"type": "string"},
        "classification_markings": {"type": "array"},
        "redactions": {"type": "array"},
        "signatures_observed": {"type": "array"},
        "tables_detected": {"type": "array"},
        "images_detected": {"type": "array"},
        "entities_extracted": {"type": "object"},
        "uap_observation_fields": {},
        "vision_description": {"type": "string"},
        "ocr_quality_score": {"type": "number"},
        "vision_quality_score": {"type": "number"},
        "flags": {"type": "array"},
    },
    "required": [
        "page_type",
        "content_classification",
        "language_detected",
        "vision_description",
        "entities_extracted",
        "redactions",
        "classification_markings",
    ],
}


def build_prompt(png_path: Path, ocr_text: str) -> str:
    """Build the prompt sent to claude CLI.

    The prompt instructs the model to open ``png_path`` with its Read tool,
    combine what it sees with ``ocr_text`` (embedded verbatim inside a fenced
    block), and reply with a single JSON object following the inline template.

    Only ``png_path`` and ``ocr_text`` are interpolated; the doubled braces in
    the template are f-string escapes that render as literal braces.

    Args:
        png_path: Path of the page image the model is told to read.
        ocr_text: Raw OCR text for the same page.

    Returns:
        The complete prompt string.
    """
    return f"""You are an evidence officer in the Investigation Bureau, analyzing one page of a US Department of War UAP/UFO document released at war.gov/ufo.

STEP 1: Use the Read tool to view this PNG of the page:
{png_path}

STEP 2: Combine what you SEE in the image with the raw pdftotext OCR below.

OCR raw (pdftotext -layout):
```
{ocr_text}
```

STEP 3: Output ONE JSON object (no markdown fence, no commentary, no preamble) matching this exact schema:

{{
  "page_type": "cover|toc|body|signature|photo|sketch|map|stamp|blank|appendix|redaction-heavy|table-page|mixed",
  "content_classification": ["text-only"|"contains-photos"|"contains-sketches"|"contains-diagrams"|"contains-maps"|"contains-tables"|"contains-signatures"|"contains-stamps"|"redaction-heavy"|"mixed"|"blank"],
  "language_detected": "en|pt|es|fr|de|ru|unknown",
  "classification_markings": [
    {{"level":"UNCLASSIFIED|CUI|CONFIDENTIAL|SECRET|TOP SECRET","caveats":["NOFORN"],"location":"header|footer|banner|stamp","bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}}}}
  ],
  "redactions": [
    {{"code":"(b)(1) 1.4(a)|(b)(3)|(b)(6)|other","description":"...","bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}},"text_inferred":null}}
  ],
  "signatures_observed": [
    {{"signer_inferred":null,"confidence_band":"low|medium|high","bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}},"notes":"..."}}
  ],
  "tables_detected": [
    {{"local_table_index":1,"bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}},"spans_multi_page":false,"continues_from_prev_page":false,"likely_continues_next_page":false,"row_count_estimate":0,"col_count_estimate":0,"headers_summary":"..."}}
  ],
  "images_detected": [
    {{"local_image_index":1,"image_type":"photo|sketch|map|chart|stamp|signature|redaction|logo|seal|diagram|other","bbox":{{"x":0.0,"y":0.0,"w":0.0,"h":0.0}},"caption_ocr":"..."}}
  ],
  "vision_description": "Rich English description (2-5 sentences) of the page layout, visible elements, redaction extent, stamps, sketches, etc. PRESERVE ORIGINAL LANGUAGE of any quoted text from the document.",
  "vision_description_pt_br": "Same content as vision_description, translated to Brazilian Portuguese (pt-br). Use Brazilian spelling and idioms (NOT European Portuguese). Preserve UTF-8 accents (ã, é, ç, etc.). KEEP verbatim English quotes from the document in English (do not translate quoted text from the page itself); only the narrative description is translated.",
  "entities_extracted": {{
    "people": [{{"name":"As written","role_in_page":"subject|witness|author|signer|mentioned"}}],
    "organizations": [{{"name":"As written","aliases":[]}}],
    "locations": [{{"name":"As written","type":"city|region|country|sea|strait|airbase|naval-base|mountain|desert|other"}}],
    "events": [{{"label":"Short label","date":"YYYY-MM-DD|YYYY|NA"}}],
    "uap_objects": [{{"shape":"sphere|disc|triangle|cylinder|cube|elongated-ellipsoid|cigar|irregular|unknown","color":"...","size_estimate":"..."}}],
    "vehicles": [{{"name":"...","class":"aircraft|ship|submarine|spacecraft|satellite|ground|other"}}],
    "operations": [{{"name":"...","type":"military-operation|reporting-protocol|research-program|task-force|foia-disclosure|other"}}],
    "concepts": [{{"name":"...","class":"legal-instrument|phenomenon-type|doctrine|scientific-term|jargon|program-name|other"}}]
  }},
  "uap_observation_fields": {{
    "date_time_utc":"...","duration_seconds":null,"shape":"...","color":"...","size_estimate":"...","altitude_ft":null,"speed_kts":null,"bearing_deg":null,"distance_nm":null,"coordinates":{{"lat":null,"lon":null,"raw_text":"..."}}
  }},
  "ocr_quality_score": 0.0,
  "vision_quality_score": 0.0,
  "flags": ["low-ocr"|"heavy-redaction"|"rotated"|"scanned-twice"|"missing-page-number"]
}}

Rules:
- Empty arrays for not-applicable fields. Do not omit keys.
- bbox is normalized 0..1 (x,y,w,h) relative to the page image.
- Entity NAMES, OCR-extracted strings, verbatim quotes, classification markings, redaction codes: ALWAYS in ORIGINAL source language (do NOT translate). Preserve original spelling, including any typos (e.g., "TRIANGLUAR" must stay as written).
- ONLY `vision_description_pt_br` is the translation. Everything else stays in source language.
- `vision_description_pt_br` must be Brazilian Portuguese (pt-br), NOT European Portuguese (pt-pt). Use Brazilian vocabulary and spelling. Preserve UTF-8 accentuation correctly (ç, ã, á, é, í, ó, ú, â, ê, ô, à).
- uap_observation_fields = null when page has no UAP encounter block.
- Output ONLY the JSON. No preamble, no fence, no commentary.
"""


def utc_now_iso():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    current = datetime.now(tz=timezone.utc)
    stamp = current.strftime("%Y-%m-%dT%H:%M:%SZ")
    return stamp


def filename_to_doc_id(filename: str) -> str:
    """Turn a file name into a kebab-case document id.

    Steps: drop the last extension, strip combining marks after NFKD
    decomposition, lowercase, replace every character outside ``[a-z0-9-]``
    with a hyphen, collapse hyphen runs, trim edge hyphens, and prefix
    ``doc-`` when the result starts with a digit.
    """
    stem, dot, _extension = filename.rpartition(".")
    if not dot:
        # No "." at all: the whole name is the stem.
        stem = filename

    decomposed = unicodedata.normalize("NFKD", stem)
    unaccented = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    slug = re.sub(r"[^a-z0-9-]", "-", unaccented.lower())
    slug = re.sub(r"-+", "-", slug).strip("-")

    if slug[:1].isdigit():
        return "doc-" + slug
    return slug


def sha256_file(p: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``p``, read in 64 KiB chunks."""
    digest = hashlib.sha256()
    with open(p, "rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()


def extract_json(text: str) -> dict:
    """Extract the first JSON object from claude CLI output.

    A leading/trailing markdown fence is stripped, then the object starting at
    the first ``{`` is decoded. Decoding is done by the JSON parser itself, so
    braces that appear inside string values do not confuse the object
    boundary. Any text after the object is ignored.

    Args:
        text: Raw model reply, possibly wrapped in a fence or surrounded by
            commentary.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the text contains no ``{`` at all.
        json.JSONDecodeError: If the text from the first ``{`` onward is not a
            complete, valid JSON object (a subclass of ``ValueError``).
    """
    text = text.strip()
    if text.startswith("```"):
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```$", "", text)

    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON object in response")

    # raw_decode parses exactly one value starting at `start` and tolerates
    # trailing text, which is what the reply format needs.
    obj, _end = json.JSONDecoder().raw_decode(text, start)
    return obj


def call_claude_vision(png_path: Path, ocr_text: str, timeout: int = DEFAULT_TIMEOUT) -> tuple[dict, dict]:
    """Invoke `claude -p --model haiku` once and return (vision_data, metadata).

    The CLI is run with JSON output, the Read tool allowed, and the PNG's
    directory added so the model can open the image. No retry is performed
    here.

    Args:
        png_path: Page image the prompt points the model at.
        ocr_text: Raw OCR text embedded in the prompt.
        timeout: Seconds to wait for the CLI process.

    Returns:
        ``(vision_data, metadata)`` where ``vision_data`` is the JSON object
        parsed from the model reply and ``metadata`` holds the timing, cost,
        turn count, session id and usage fields copied from the CLI envelope.

    Raises:
        subprocess.TimeoutExpired: The CLI did not finish within ``timeout``.
        RuntimeError: The CLI exited non-zero or flagged ``is_error``.
        json.JSONDecodeError: CLI stdout or the model reply is not valid JSON.
        ValueError: The model reply contains no JSON object.
    """
    prompt = build_prompt(png_path, ocr_text)
    cmd = [
        "claude",
        "-p",
        "--model", MODEL,
        "--output-format", "json",
        "--max-turns", str(MAX_TURNS),
        "--allowedTools", "Read",
        "--add-dir", str(png_path.parent),
        "--",
        prompt,
    ]
    res = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=timeout,
        check=False,
    )
    if res.returncode != 0:
        # Fall back to stdout when stderr is blank so the message (which the
        # retry logic inspects) is never empty while diagnostics exist.
        detail = res.stderr if res.stderr.strip() else res.stdout
        raise RuntimeError(f"claude CLI failed (rc={res.returncode}): {detail[-2000:]}")

    cli_output = json.loads(res.stdout)

    # `result` may be absent or null in the envelope; treat anything that is
    # not a string as an empty reply instead of crashing on slicing/strip.
    raw_result = cli_output.get("result")
    result_text = raw_result if isinstance(raw_result, str) else ""

    if cli_output.get("is_error"):
        raise RuntimeError(f"claude reported error: {result_text[:500]}")

    vision_data = extract_json(result_text)

    metadata = {
        "duration_ms": cli_output.get("duration_ms"),
        "duration_api_ms": cli_output.get("duration_api_ms"),
        "total_cost_usd": cli_output.get("total_cost_usd"),
        "num_turns": cli_output.get("num_turns"),
        "session_id": cli_output.get("session_id"),
        "usage": cli_output.get("usage"),
    }
    return vision_data, metadata


def call_with_retry(
    png_path: Path,
    ocr_text: str,
    retries: int = DEFAULT_RETRIES,
    base_backoff: float = 5.0,
    timeout: int = DEFAULT_TIMEOUT,
) -> tuple[dict, dict]:
    """Call vision with exponential backoff + jitter. Raises on final failure.

    At least one attempt is always made, even when ``retries`` is zero or
    negative. Between attempts the wait is
    ``base_backoff * 2**(attempt - 1)`` seconds plus up to 2 s of random
    jitter. No sleep happens after the last attempt.

    Retried failures:
      * ``subprocess.TimeoutExpired``;
      * ``RuntimeError`` whose message contains one of the transient markers;
      * ``ValueError`` (including ``json.JSONDecodeError``), i.e. a reply that
        could not be parsed into a JSON object.

    Args:
        png_path: Page image passed through to ``call_claude_vision``.
        ocr_text: OCR text passed through to ``call_claude_vision``.
        retries: Maximum number of attempts.
        base_backoff: Base delay in seconds for the exponential backoff.
        timeout: Per-attempt timeout in seconds.

    Returns:
        The ``(vision_data, metadata)`` tuple of the first successful attempt.

    Raises:
        The exception of the last attempt, or immediately a ``RuntimeError``
        that is not classified as transient.
    """
    # Substring match on the lowercased error text. This is deliberately
    # loose (e.g. "rate" also matches unrelated words containing it).
    transient_markers = ("overloaded", "rate", "429", "500", "502", "503", "504", "timeout", "connection")
    attempts = max(1, retries)

    for attempt in range(1, attempts + 1):
        is_last = attempt == attempts
        try:
            return call_claude_vision(png_path, ocr_text, timeout=timeout)
        except subprocess.TimeoutExpired:
            if is_last:
                raise
            reason = "timeout"
        except RuntimeError as e:
            msg = str(e).lower()
            if is_last or not any(marker in msg for marker in transient_markers):
                raise
            reason = "transient error"
        except ValueError:
            # Covers json.JSONDecodeError and the "no JSON object" case.
            if is_last:
                raise
            reason = "JSON parse error"

        backoff = base_backoff * (2 ** (attempt - 1)) + random.uniform(0, 2)
        safe_print(f"    {reason} (attempt {attempt}/{attempts}); sleeping {backoff:.1f}s")
        time.sleep(backoff)

    # Every loop iteration either returns, raises, or continues to the next
    # attempt, and the last attempt always returns or raises.
    raise RuntimeError("unreachable")


def render_page_md(
    *,
    doc_id: str,
    page_num: int,
    total_pages: int,
    png_path: Path,
    ocr_path: Path,
    vision_path: Path,
    vision_data: dict,
    png_dimensions: tuple[int, int],
    now_iso: str,
) -> str:
    """Render the wiki page (YAML frontmatter + markdown body) for one page.

    The frontmatter copies the model's fields from ``vision_data`` as-is,
    alongside file references, the PNG's SHA-256 and dimensions. The body
    embeds the page image, the raw OCR text, both descriptions and a short
    notes list.

    Fields used for the body (both descriptions, ``content_classification``
    and ``flags``) tolerate a missing key *and* an explicit JSON ``null``, so
    a reply with null values renders as empty text / ``_n/a_`` / ``_none_``
    instead of raising.

    Args:
        doc_id: Document id; used in the page id and relative links.
        page_num: 1-based page number parsed from the PNG name.
        total_pages: Number of pages in the document.
        png_path: Page image; hashed and linked.
        ocr_path: OCR text file; read and embedded.
        vision_path: Vision JSON file; only its name is used for the link.
        vision_data: JSON object returned by the model.
        png_dimensions: ``(width, height)`` of the PNG.
        now_iso: Timestamp recorded as ``vision_run_at`` and ``last_ingest``.

    Returns:
        The full markdown document as a string.
    """
    padded = f"{page_num:03d}"
    page_id = f"{doc_id}/p{padded}"

    frontmatter = {
        "schema_version": SCHEMA_VERSION,
        "type": "page",
        "page_id": page_id,
        "doc_id": doc_id,
        "page_number": page_num,
        "total_pages": total_pages,
        "png_path": f"../../../processing/png/{doc_id}/{png_path.name}",
        "png_sha256": sha256_file(png_path),
        "png_dpi": 200,
        "png_width": png_dimensions[0],
        "png_height": png_dimensions[1],
        "ocr_raw_path": f"../../../processing/ocr/{doc_id}/{ocr_path.name}",
        "vision_raw_path": f"../../../processing/vision/{doc_id}/{vision_path.name}",
        "vision_model": VISION_MODEL_FULL,
        "vision_run_at": now_iso,
        "page_type": vision_data.get("page_type", "body"),
        "content_classification": vision_data.get("content_classification", []),
        "language_detected": vision_data.get("language_detected", "unknown"),
        "classification_markings": vision_data.get("classification_markings", []),
        "redactions": vision_data.get("redactions", []),
        "signatures_observed": vision_data.get("signatures_observed", []),
        "tables_detected": vision_data.get("tables_detected", []),
        "images_detected": vision_data.get("images_detected", []),
        "entities_extracted": vision_data.get("entities_extracted", {}),
        "uap_observation_fields": vision_data.get("uap_observation_fields"),
        "vision_description": vision_data.get("vision_description", ""),
        "vision_description_pt_br": vision_data.get("vision_description_pt_br", ""),
        "ocr_quality_score": vision_data.get("ocr_quality_score", 0.0),
        "vision_quality_score": vision_data.get("vision_quality_score", 0.0),
        "flags": vision_data.get("flags", []),
        "last_ingest": now_iso,
        "last_lint": None,
        "wiki_version": WIKI_VERSION,
    }

    yaml_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False)
    ocr_text = ocr_path.read_text(encoding="utf-8", errors="replace").strip()

    # `or` also covers keys that are present but null in the model's JSON.
    pt_desc = (vision_data.get("vision_description_pt_br") or "").strip()
    en_desc = (vision_data.get("vision_description") or "").strip()
    classifications = vision_data.get("content_classification") or []
    flags = vision_data.get("flags") or []

    classification_md = ", ".join(f"`{c}`" for c in classifications) or "_n/a_"
    flags_md = ", ".join(f"`{f}`" for f in flags) or "_none_"

    body = f"""# [[{doc_id}]] — Page {page_num} of {total_pages}

![Page {page_num}](../../../processing/png/{doc_id}/{png_path.name})

## OCR Text (raw, original language)

```
{ocr_text}
```

## Vision Description (EN)

{en_desc}

## Descrição Vision (PT-BR)

{pt_desc}

## Investigation Notes

- `page_type`: `{vision_data.get("page_type", "unknown")}`
- `content_classification`: {classification_md}
- `language_detected`: `{vision_data.get("language_detected", "unknown")}`
- `flags`: {flags_md}
"""
    return f"---\n{yaml_str}---\n\n{body}"


def _process_page(
    *,
    doc_id: str,
    png_path: Path,
    ocr_path: Path,
    vision_json_path: Path,
    page_md_path: Path,
    page_num: int,
    total_pages: int,
    retries: int,
    timeout: int,
) -> tuple[str, float, float, str | None]:
    """Process a single page. Returns (label, elapsed_seconds, cost_usd, error_or_none).

    Steps: read the PNG size (best effort), read the OCR text, call the model
    with retries, write the vision JSON, render and write the page markdown.

    Every failure — not only a failed model call — is reported through the
    returned tuple instead of being raised, so one bad page cannot abort the
    caller's loop. When the model call succeeded but a later step failed, the
    cost already incurred is still reported.

    Returns:
        ``(label, elapsed_seconds, cost_usd, error)`` where ``label`` is
        ``pNNN`` and ``error`` is ``None`` on success or a non-empty message
        (at most 300 characters) on failure.
    """
    label = f"p{page_num:03d}"
    t0 = time.time()
    cost = 0.0

    try:
        # Dimensions are informational only; an unreadable image falls back
        # to (0, 0) rather than failing the page.
        try:
            with Image.open(png_path) as im:
                png_dimensions = im.size
        except Exception:
            png_dimensions = (0, 0)

        ocr_text = ocr_path.read_text(encoding="utf-8", errors="replace").strip()

        vision_data, meta = call_with_retry(png_path, ocr_text, retries=retries, timeout=timeout)
        cost = meta.get("total_cost_usd", 0.0) or 0.0

        vision_json_path.write_text(
            json.dumps({"vision_data": vision_data, "meta": meta}, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

        md = render_page_md(
            doc_id=doc_id,
            page_num=page_num,
            total_pages=total_pages,
            png_path=png_path,
            ocr_path=ocr_path,
            vision_path=vision_json_path,
            vision_data=vision_data,
            png_dimensions=png_dimensions,
            now_iso=utc_now_iso(),
        )
        page_md_path.write_text(md, encoding="utf-8")
    except Exception as e:
        # Fall back to repr() so the message is never empty: the caller
        # treats a falsy error as success.
        return (label, time.time() - t0, cost, (str(e) or repr(e))[:300])

    return (label, time.time() - t0, cost, None)


def find_pdf_filename_for_doc_id(doc_id: str) -> str | None:
    """Reverse-canonicalize: scan raw/ for a PDF whose canonical doc_id matches.

    Each ``*.pdf`` name in ``RAW_DIR`` is canonicalized with
    ``filename_to_doc_id`` and compared to ``doc_id``. Candidates are visited
    in sorted order so the result is deterministic when several file names
    map to the same id.

    Returns:
        The matching file name (not the full path), or ``None`` if no PDF in
        ``RAW_DIR`` maps to ``doc_id``.
    """
    for pdf in sorted(RAW_DIR.glob("*.pdf")):
        if filename_to_doc_id(pdf.name) == doc_id:
            return pdf.name
    return None


def try_reconvert_from_raw(doc_id: str) -> bool:
    """Attempt to regenerate PNGs/OCR via scripts/01-convert-pdfs.sh.

    Looks up the source PDF for ``doc_id`` in raw/ and runs the conversion
    script on it with a 300 s timeout.

    Returns:
        True if the script exited with status 0; False if the PDF is not in
        raw/, the script could not be started, it timed out, or it exited
        non-zero. A True result does not by itself verify that PNGs exist —
        the caller re-globs the PNG directory afterwards.
    """
    fname = find_pdf_filename_for_doc_id(doc_id)
    if not fname:
        safe_print(f"  ⚠ PDF for {doc_id} not in raw/ — manual download required from https://www.war.gov/ufo/<filename>.pdf")
        return False
    script = UFO_ROOT / "scripts" / "01-convert-pdfs.sh"
    safe_print(f"  ↻ re-converting from raw/{fname} ...")
    try:
        res = subprocess.run(
            [str(script), "--filename", fname],
            capture_output=True,
            text=True,
            timeout=300,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired) as e:
        # OSError: script missing or not executable; TimeoutExpired: it hung.
        safe_print(f"  ✗ re-conversion failed: {e}")
        return False
    if res.returncode != 0:
        safe_print(f"  ✗ re-conversion failed: {res.stderr[-500:]}")
        return False
    return True


def process_doc(
    doc_id: str,
    force: bool = False,
    max_pages: int | None = None,
    workers: int = DEFAULT_WORKERS,
    retries: int = DEFAULT_RETRIES,
    timeout: int = DEFAULT_TIMEOUT,
):
    """Vision-process every pending page of one document and log the run.

    Pages are discovered from ``p-*.png`` files in the document's PNG
    directory. If there are none, a re-conversion from the raw PDF is
    attempted first. A page is skipped when its OCR file is missing, or when
    both its vision JSON and page markdown already exist and ``force`` is
    false. Remaining pages are processed in a thread pool, progress is
    printed as each page finishes, and a summary is appended to the wiki log.

    Args:
        doc_id: Document id (directory name under the processing folders).
        force: Reprocess pages whose outputs already exist.
        max_pages: If truthy, only the first ``max_pages`` PNGs are considered.
        workers: Thread pool size; values below 1 are treated as 1.
        retries: Attempts per page, forwarded to the page worker.
        timeout: Per-call timeout in seconds, forwarded to the page worker.
    """
    png_dir = PNG_BASE / doc_id
    ocr_dir = OCR_BASE / doc_id
    vision_dir = VISION_BASE / doc_id
    pages_dir = PAGES_BASE / doc_id

    pngs = sorted(png_dir.glob("p-*.png"))
    if not pngs:
        # Fallback: try to re-convert from raw/<file>.pdf
        safe_print(f"No PNGs for doc_id={doc_id} in {png_dir} — attempting re-conversion from raw/")
        if try_reconvert_from_raw(doc_id):
            pngs = sorted(png_dir.glob("p-*.png"))
        if not pngs:
            sys.stderr.write(
                f"FATAL: no PNGs for doc_id={doc_id} after re-conversion attempt.\n"
                f"  Expected at: {png_dir}\n"
                f"  Manual recovery: download the PDF from https://www.war.gov/ufo/<filename>.pdf\n"
                f"  and place it in {RAW_DIR}/, then re-run this script.\n"
            )
            return

    # Create output directories only once the document is known to have
    # pages, so a mistyped doc_id does not leave empty directories behind.
    vision_dir.mkdir(parents=True, exist_ok=True)
    pages_dir.mkdir(parents=True, exist_ok=True)

    # total_pages reflects the whole document even when max_pages caps the run.
    total_pages = len(pngs)
    if max_pages:
        pngs = pngs[:max_pages]

    # Build worklist (after skip filter)
    worklist = []
    for png_path in pngs:
        m = re.match(r"p-(\d+)\.png$", png_path.name)
        if not m:
            continue
        page_num = int(m.group(1))
        padded = f"{page_num:03d}"
        vision_json_path = vision_dir / f"p-{padded}.json"
        page_md_path = pages_dir / f"p{padded}.md"
        ocr_path = ocr_dir / f"p-{padded}.txt"
        if not ocr_path.exists():
            safe_print(f"  p{padded}: skip (missing OCR)")
            continue
        if not force and vision_json_path.exists() and page_md_path.exists():
            continue  # silently skip already-processed
        worklist.append((png_path, ocr_path, vision_json_path, page_md_path, page_num))

    workers = max(1, workers)  # ThreadPoolExecutor rejects max_workers <= 0
    skipped = len(pngs) - len(worklist)
    safe_print(f"\n=== {doc_id} ({total_pages} total, {len(worklist)} to process, {skipped} skipped, {workers} workers) ===")

    if not worklist:
        return

    log_entries: list[str] = []
    total_cost = 0.0
    done = 0
    started_at = time.time()

    def _job(args):
        png_path, ocr_path, vision_json_path, page_md_path, page_num = args
        return _process_page(
            doc_id=doc_id,
            png_path=png_path,
            ocr_path=ocr_path,
            vision_json_path=vision_json_path,
            page_md_path=page_md_path,
            page_num=page_num,
            total_pages=total_pages,
            retries=retries,
            timeout=timeout,
        )

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(_job, item): item for item in worklist}
        for fut in as_completed(futures):
            try:
                label, elapsed, cost, err = fut.result()
            except Exception as e:
                # A worker raised instead of returning an error tuple. Record
                # it as a failed page so the remaining pages and the log
                # entry for this run are not lost.
                failed_page_num = futures[fut][4]
                label = f"p{failed_page_num:03d}"
                elapsed, cost = 0.0, 0.0
                err = f"{type(e).__name__}: {e}"[:300]
            done += 1
            total_cost += cost
            wall = time.time() - started_at
            if err:
                safe_print(f"  [{done}/{len(worklist)}] {label}: FAILED ({elapsed:.1f}s) — {err}")
                log_entries.append(f"  - {label}: vision error: {err}")
            else:
                rate = done / wall if wall > 0 else 0
                eta = (len(worklist) - done) / rate if rate > 0 else 0
                safe_print(f"  [{done}/{len(worklist)}] {label}: ok ({elapsed:.1f}s, ${cost:.4f}) — wall {wall:.0f}s eta {eta:.0f}s")
                log_entries.append(f"  - {label}: ok ({elapsed:.1f}s, ${cost:.4f})")

    wall = time.time() - started_at
    safe_print(f"  Total: {done} pages in {wall:.0f}s ({wall / max(done,1):.1f}s/page avg), ${total_cost:.4f}")

    # Append to log
    if log_entries:
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(f"\n## {utc_now_iso()} — VISION INGEST\n")
            fh.write(
                f"- operator: archivist (via claude CLI OAuth)\n"
                f"- doc_id: {doc_id}\n"
                f"- model: {VISION_MODEL_FULL}\n"
                f"- workers: {workers}\n"
                f"- pages_processed: {len(log_entries)}\n"
                f"- wall_seconds: {wall:.0f}\n"
                f"- total_cost_usd: {total_cost:.4f}\n"
                f"- results:\n"
            )
            # Entries arrive in completion order; sort so the log reads by page.
            for entry in sorted(log_entries):
                fh.write(entry + "\n")


def main():
    """Parse command-line arguments and process one document or all of them.

    Exactly one of ``--doc-id`` or ``--all`` is required. Non-positive values
    for ``--workers``, ``--retries`` and ``--timeout`` are rejected by the
    argument parser. The script exits with status 2 when the ``claude`` CLI
    is unavailable and with status 1 when ``--all`` is requested but the PNG
    base directory does not exist.
    """
    ap = argparse.ArgumentParser(description="Vision-process each PNG of a UFO doc via claude CLI (OAuth).")
    g = ap.add_mutually_exclusive_group(required=True)
    g.add_argument("--doc-id", help="single doc_id (kebab-case)")
    g.add_argument("--all", action="store_true", help="process all docs in processing/png/")
    ap.add_argument("--force", action="store_true", help="reprocess existing pages")
    ap.add_argument("--max-pages", type=int, default=None, help="cap pages per doc (for smoke test)")
    ap.add_argument("--workers", type=int, default=DEFAULT_WORKERS, help=f"parallel workers per doc (default {DEFAULT_WORKERS})")
    ap.add_argument("--retries", type=int, default=DEFAULT_RETRIES, help=f"retries on transient errors (default {DEFAULT_RETRIES})")
    ap.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help=f"per-call timeout seconds (default {DEFAULT_TIMEOUT})")
    args = ap.parse_args()

    # Reject values that cannot work instead of failing later in a worker.
    for flag, value in (("--workers", args.workers), ("--retries", args.retries), ("--timeout", args.timeout)):
        if value < 1:
            ap.error(f"{flag} must be >= 1 (got {value})")

    # Verify claude CLI is available
    try:
        subprocess.run(["claude", "--version"], capture_output=True, check=True, timeout=10)
    except (FileNotFoundError, subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
        sys.stderr.write(f"claude CLI not found or not working: {e}\n")
        sys.exit(2)

    common = dict(
        force=args.force,
        max_pages=args.max_pages,
        workers=args.workers,
        retries=args.retries,
        timeout=args.timeout,
    )
    if args.doc_id:
        process_doc(args.doc_id, **common)
    else:
        if not PNG_BASE.is_dir():
            sys.stderr.write(f"PNG base directory not found: {PNG_BASE}\n")
            sys.exit(1)
        for doc_dir in sorted(PNG_BASE.iterdir()):
            if doc_dir.is_dir():
                process_doc(doc_dir.name, **common)


if __name__ == "__main__":
    main()