disclosure-bureau/scripts/synthesize/32_reprocess_missing_pages.py

#!/usr/bin/env python3
"""
32_reprocess_missing_pages.py — Reprocess pages that the doc-rebuilder agent
silently dropped due to context-window overflow.

For each doc:
  1. Read raw/<doc>--subagent/_index.json (current chunk inventory)
  2. Find missing pages: PNGs that exist but have no chunks
  3. For each missing page, call `claude -p --model sonnet` with the page PNG
     and ask for a chunks JSON (matching the page-rebuilder schema)
  4. Append new chunks to _index.json with continued global IDs (chunk_id, order_global)
  5. Write new chunks/c<NNNN>.md files

Idempotent — re-running skips pages already processed.
Uses WORKERS=2 to avoid hammering OAuth rate limits.

Usage:
  python3 scripts/synthesize/32_reprocess_missing_pages.py --dry-run
  python3 scripts/synthesize/32_reprocess_missing_pages.py --doc-id <id>
  WORKERS=2 python3 scripts/synthesize/32_reprocess_missing_pages.py
"""
from __future__ import annotations
import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path

# Project root and the two directory trees this script reads and writes.
UFO = Path("/Users/guto/ufo")
RAW = UFO.joinpath("raw")                     # source PDFs and <doc>--subagent/ dirs
PNG_BASE = UFO.joinpath("processing", "png")  # <doc>/p-NNN.png page renders

# UTC timestamp taken once at import time, second resolution, "Z" suffix.
NOW = f"{datetime.now(tz=timezone.utc):%Y-%m-%dT%H:%M:%SZ}"
# Model alias handed to `claude --model`.
SONNET_MODEL = "sonnet"
# Thread-pool size; override with the WORKERS environment variable.
WORKERS = int(os.getenv("WORKERS", "2"))

# Instruction text sent to the model for one page.  It is filled in with
# str.format() using the doc_id, page_num and png_path placeholders, so the
# literal JSON braces in the example object are doubled ({{ and }}).  The fields
# of the example chunk are the ones integrate_page_chunks() reads back from
# each returned chunk.
PROMPT_TEMPLATE = """You are a page-rebuilder for a declassified UAP/UFO government document.

You will receive the path of ONE page image. Read it with the Read tool, then analyze it carefully and extract ALL content as structured chunks.

DOCUMENT_ID: {doc_id}
PAGE_NUMBER: {page_num}
PNG_PATH: {png_path}

Return ONE JSON object with this exact structure (no markdown fence, no preamble, no postscript):

{{
  "page_number": {page_num},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "<chunk_type>",
      "content_en": "<English verbatim text or visual description>",
      "content_pt_br": "<Brazilian Portuguese translation>",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null,
      "image_description_en": null,
      "image_description_pt_br": null,
      "extracted_text": null
    }}
  ]
}}

CHUNK TYPES (use only these): letterhead, classification_banner, header, subheader, paragraph, list_item, caption, footnote, page_number, signature_block, stamp, redaction_block, image, table_marker, form_field, watermark, separator, blank, annotation, transcript_block.

RULES:
1. Extract EVERY element on the page — nothing is skipped.
2. bbox is normalized coords (0.0..1.0) relative to the page image.
3. content_en is verbatim OCR text for text chunks; for images, describe what you see.
4. content_pt_br is Brazilian Portuguese (NOT European Portuguese). Preserve UTF-8 accents.
5. For redactions: type="redaction_block", content_en="[REDACTED]", content_pt_br="[REDACTADO]".
6. For images/photos/diagrams/sketches/maps: type="image", describe in image_description_en/pt_br.
7. For stamps: type="stamp".
8. classification: extract markings if visible ("SECRET", "CONFIDENTIAL", "UNCLASSIFIED", etc.).
9. formatting: any of ["bold", "italic", "underline", "all_caps", "handwritten", "typewritten"].
10. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev".
11. ufo_anomaly_detected: true if chunk contains UAP/UFO sighting data, coordinates, witness accounts, anomalous phenomena.
12. If page is truly blank: return one chunk with type="blank".
13. Order chunks top-to-bottom, left-to-right.

Return ONLY the JSON. No markdown. No commentary.
"""

# Comma-separated tool names passed to `claude --disallowed-tools`.
# NOTE: Read is deliberately absent from this list (we need vision).
DISALLOWED = ",".join((
    "AskUserQuestion", "Bash", "Edit", "Write", "Task", "Glob", "Grep",
    "TaskCreate", "TaskUpdate", "TaskList", "TaskGet", "TaskStop", "TaskOutput",
    "Skill", "ScheduleWakeup", "Monitor", "WebSearch", "WebFetch", "NotebookEdit",
    "EnterPlanMode", "ExitPlanMode", "EnterWorktree", "ExitWorktree",
    "CronCreate", "CronDelete", "CronList", "RemoteTrigger", "ToolSearch",
    "PushNotification", "ListMcpResourcesTool", "ReadMcpResourceTool",
    "ShareOnboardingGuide",
))


def extract_json_block(s: str) -> str:
    """Return the outermost ``{...}`` span found in *s*.

    The text is stripped first.  If it then starts with a Markdown code fence,
    every line beginning with ``` is dropped.  The result is the slice from the
    first ``{`` to the last ``}`` inclusive; when no such span exists the
    cleaned text is returned unchanged.
    """
    text = s.strip()
    if text.startswith("```"):
        kept = [ln for ln in text.splitlines() if not ln.startswith("```")]
        text = "\n".join(kept).strip()

    first = text.find("{")
    last = text.rfind("}")
    if first < 0 or last <= first:
        return text
    return text[first:last + 1]


def call_sonnet_vision(doc_id: str, page_num: int) -> dict | None:
    """Ask the `claude` CLI to rebuild one page and return the parsed JSON object.

    The prompt (PROMPT_TEMPLATE) carries the PNG path; the CLI is started with
    the DISALLOWED tool list, from which Read is absent.

    Args:
        doc_id: Document slug; the PNG is looked up under PNG_BASE/<doc_id>/.
        page_num: Page number, rendered zero-padded to 3 digits in the file name.

    Returns:
        The decoded JSON object, or None when the PNG does not exist, the CLI
        times out or exits non-zero, or its output is not a JSON object.
        Every failure except the missing PNG is logged to stderr.
    """
    png_path = PNG_BASE / doc_id / f"p-{page_num:03d}.png"
    if not png_path.is_file():
        return None
    prompt = PROMPT_TEMPLATE.format(doc_id=doc_id, page_num=page_num, png_path=str(png_path))
    env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"}
    timeout_s = 300

    # stdout is captured in a named temp file (kept from the original design);
    # delete=False so the path survives the `with` and is removed in `finally`.
    tmp = tempfile.NamedTemporaryFile(mode="wb", suffix=".txt", delete=False)
    tmp_path = tmp.name
    try:
        with tmp as out_f:
            try:
                r = subprocess.run(
                    ["claude", "-p", "--model", SONNET_MODEL, "--output-format", "text",
                     "--disallowed-tools", DISALLOWED],
                    input=prompt.encode("utf-8"),
                    stdout=out_f, stderr=subprocess.PIPE, env=env,
                    timeout=timeout_s,
                )
            except subprocess.TimeoutExpired:
                # An explicit timeout is set, so expiry is an expected outcome:
                # report it like the other failures instead of raising.
                print(f"  [TIME] {doc_id} p{page_num:03d} — claude timed out after {timeout_s}s", file=sys.stderr)
                return None
        if r.returncode != 0:
            print(f"  [FAIL] {doc_id} p{page_num:03d} — claude rc={r.returncode}: {r.stderr.decode('utf-8', errors='replace')[:300]}", file=sys.stderr)
            return None
        with open(tmp_path, "r", encoding="utf-8") as f:
            raw = f.read()
        js = extract_json_block(raw)
        try:
            parsed = json.loads(js)
        except json.JSONDecodeError as e:
            print(f"  [JSON] {doc_id} p{page_num:03d} — {e} | raw_len={len(raw)}", file=sys.stderr)
            return None
        if not isinstance(parsed, dict):
            # e.g. the model answered with a bare scalar; callers expect .get().
            print(f"  [JSON] {doc_id} p{page_num:03d} — expected a JSON object, got {type(parsed).__name__}", file=sys.stderr)
            return None
        return parsed
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            pass


def _doc_id_from_pdf_stem(stem: str) -> str:
    """Turn a PDF file stem into the doc_id slug used for its directories.

    Accents are stripped via NFD decomposition, the text is lower-cased, every
    character outside ``[a-z0-9-]`` becomes ``-``, runs of ``-`` collapse and
    leading/trailing ``-`` are trimmed.  A slug that starts with a digit gets a
    ``doc-`` prefix.  (The original inline comment states this mirrors the
    page-rebuilder's algorithm.)
    """
    import unicodedata  # function-scope: not among the file-level imports

    decomposed = unicodedata.normalize("NFD", stem)
    unaccented = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    slug = re.sub(r"[^a-z0-9-]", "-", unaccented.lower())
    slug = re.sub(r"-+", "-", slug).strip("-")
    if slug and slug[0].isdigit():
        slug = "doc-" + slug
    return slug


def find_missing_pages_per_doc() -> dict[str, list[int]]:
    """Map each doc_id to the sorted pages that have a PNG but no chunks.

    A document is considered only when PNG_BASE/<doc_id>/ holds ``p-<N>.png``
    files and RAW/<doc_id>--subagent/_index.json is readable.  When `pdfinfo`
    reports a page count for the matching RAW/*.pdf, PNG pages numbered above
    that count are ignored (Poppler phantom pages); otherwise every PNG page is
    a candidate.  Documents with nothing missing are omitted from the result.
    """
    # Best-effort page counts: a PDF that cannot be inspected simply gets no
    # upper bound, it never aborts the scan.
    pdf_pages_map: dict[str, int] = {}
    for pdf in RAW.glob("*.pdf"):
        try:
            out = subprocess.run(
                ["pdfinfo", str(pdf)], capture_output=True, text=True,
                errors="replace",  # non-UTF-8 metadata must not hide "Pages:"
                timeout=30,
            ).stdout
        except (OSError, subprocess.SubprocessError):
            continue
        m = re.search(r"Pages:\s+(\d+)", out)
        if m:
            pdf_pages_map[_doc_id_from_pdf_stem(pdf.stem)] = int(m.group(1))

    page_re = re.compile(r"p-(\d+)\.png")
    result: dict[str, list[int]] = {}
    for png_dir in PNG_BASE.glob("*/"):
        doc_id = png_dir.name
        matches = (page_re.match(p.name) for p in png_dir.glob("p-*.png"))
        pngs = sorted(int(m.group(1)) for m in matches if m)
        if not pngs:
            continue
        idx_path = RAW / f"{doc_id}--subagent" / "_index.json"
        if not idx_path.is_file():
            continue
        try:
            idx = json.loads(idx_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            # ValueError covers both JSON and UTF-8 decoding errors.  Say so:
            # a corrupt index would otherwise make the doc silently vanish.
            print(f"  [WARN] {doc_id} — unreadable _index.json: {e}", file=sys.stderr)
            continue
        pages_in_chunks = {c.get("page") for c in idx.get("chunks") or [] if c.get("page")}

        # Only pages 1..pdf_pages when the count is known (avoids Poppler phantom).
        pdf_pages = pdf_pages_map.get(doc_id)
        upper_bound = pdf_pages if pdf_pages else pngs[-1]
        missing = [p for p in pngs if p <= upper_bound and p not in pages_in_chunks]
        if missing:
            result[doc_id] = missing
    return result


def render_chunk_md(chunk: dict) -> str:
    """Serialise one chunk as Markdown with a YAML front-matter header.

    The ``_body_en`` / ``_body_pt`` entries are popped from *chunk* (the dict
    passed in is modified) and become the ``**EN:**`` / ``**PT-BR:**`` body,
    which is emitted only when at least one of them is non-empty.  The front
    matter contains the known keys that are present in *chunk*, in fixed order.
    """
    import yaml

    english = chunk.pop("_body_en", "")
    portuguese = chunk.pop("_body_pt", "")

    # Front-matter keys, in the order they are written out.
    ordered_keys = (
        "chunk_id", "type", "page", "order_in_page", "order_global", "bbox",
        "classification", "formatting", "cross_page_hint", "prev_chunk", "next_chunk",
        "related_image", "related_table", "ocr_confidence", "ocr_source_lines",
        "redaction_code", "redaction_inferred_content_type", "image_type",
        "ufo_anomaly_detected", "cryptid_anomaly_detected",
        "ufo_anomaly_type", "ufo_anomaly_rationale",
        "cryptid_anomaly_type", "cryptid_anomaly_rationale",
        "image_description_en", "image_description_pt_br", "extracted_text",
        "source_png",
    )
    front_matter = {}
    for key in ordered_keys:
        if key in chunk:
            front_matter[key] = chunk[key]

    dumped = yaml.safe_dump(
        front_matter,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
        width=10_000,
    )
    yaml_block = dumped.rstrip()

    if english or portuguese:
        body = f"**EN:** {english}\n\n**PT-BR:** {portuguese}\n"
    else:
        body = ""
    return f"---\n{yaml_block}\n---\n\n{body}"


def integrate_page_chunks(doc_id: str, page_num: int, page_result: dict, idx: dict) -> int:
    """Write the page's chunks as .md files and append them to *idx*.

    New chunks continue the global numbering: the first one gets
    ``max(order_global) + 1`` and that number is used for both ``order_global``
    and the ``cNNNN`` chunk id.  ``prev_chunk`` points at the preceding id;
    ``next_chunk`` points at the following id for every chunk except the last
    one of this batch, whose successor is not known yet.  The .md file of the
    chunk that was last before this call is not rewritten.

    Args:
        doc_id: Document slug; files go to RAW/<doc_id>--subagent/chunks/.
        page_num: Page the chunks belong to.
        page_result: Parsed model output; only its ``"chunks"`` list is read.
        idx: In-memory _index.json content; its ``"chunks"`` list is extended
            in place only after every file has been written.

    Returns:
        Number of chunks added (0 when *page_result* has no chunks).
    """
    chunks = page_result.get("chunks") or []
    if not chunks:
        return 0
    sub = RAW / f"{doc_id}--subagent"
    chunks_dir = sub / "chunks"
    chunks_dir.mkdir(exist_ok=True)

    # `or 0` also tolerates an existing entry whose order_global is null.
    existing = idx.get("chunks") or []
    first_num = max((c.get("order_global") or 0 for c in existing), default=0) + 1
    last_num = first_num + len(chunks) - 1
    rel_png = f"../../processing/png/{doc_id}/p-{page_num:03d}.png"

    new_index_entries = []
    for offset, c in enumerate(chunks):
        num = first_num + offset
        cid = f"c{num:04d}"
        ctype = c.get("type") or "paragraph"
        en = c.get("content_en") or ""
        pt = c.get("content_pt_br") or ""
        # Only a missing value falls back to 0.85; a reported 0.0 is kept.
        confidence = c.get("ocr_confidence")
        entry = {
            "chunk_id": cid,
            "type": ctype,
            "page": page_num,
            "order_in_page": c.get("order_in_page") or (offset + 1),
            "order_global": num,
            "file": f"chunks/{cid}.md",
            "bbox": c.get("bbox") or {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "preview": (en or pt or "")[:120],
        }
        new_index_entries.append(entry)

        chunk_dict = {
            "chunk_id": cid,
            "type": ctype,
            "page": page_num,
            "order_in_page": entry["order_in_page"],
            "order_global": num,
            "bbox": entry["bbox"],
            "classification": c.get("classification"),
            "formatting": c.get("formatting") or [],
            "cross_page_hint": c.get("cross_page_hint") or "self_contained",
            "prev_chunk": f"c{num - 1:04d}" if num > 1 else None,
            "next_chunk": f"c{num + 1:04d}" if num < last_num else None,
            "related_image": None,
            "related_table": None,
            "ocr_confidence": 0.85 if confidence is None else confidence,
            "ocr_source_lines": [],
            "redaction_code": c.get("redaction_code"),
            "redaction_inferred_content_type": c.get("redaction_inferred_content_type"),
            "image_type": c.get("image_type"),
            "ufo_anomaly_detected": bool(c.get("ufo_anomaly_detected")),
            "cryptid_anomaly_detected": bool(c.get("cryptid_anomaly_detected")),
            "ufo_anomaly_type": c.get("ufo_anomaly_type"),
            "ufo_anomaly_rationale": c.get("ufo_anomaly_rationale"),
            "cryptid_anomaly_type": c.get("cryptid_anomaly_type"),
            "cryptid_anomaly_rationale": c.get("cryptid_anomaly_rationale"),
            "image_description_en": c.get("image_description_en"),
            "image_description_pt_br": c.get("image_description_pt_br"),
            "extracted_text": c.get("extracted_text"),
            "source_png": rel_png,
            "_body_en": en, "_body_pt": pt,
        }
        (chunks_dir / f"{cid}.md").write_text(render_chunk_md(chunk_dict), encoding="utf-8")

    # Extend the index last, so a failure above leaves *idx* untouched.
    idx.setdefault("chunks", []).extend(new_index_entries)
    return len(new_index_entries)


import threading

# Registry of per-document locks, guarded by its own mutex.  A document's lock
# is only contended when 2+ workers handle pages of that same document.
_doc_locks: dict[str, threading.Lock] = {}
_locks_mutex = threading.Lock()


def _doc_lock(doc_id: str) -> threading.Lock:
    """Return the lock belonging to *doc_id*, creating it on first request."""
    with _locks_mutex:
        try:
            return _doc_locks[doc_id]
        except KeyError:
            created = _doc_locks[doc_id] = threading.Lock()
            return created


def process_one_page(doc_id: str, page_num: int) -> tuple[bool, int]:
    """Rebuild one page and persist its chunks to the document's _index.json.

    The model call runs outside the lock; reading, updating and rewriting
    _index.json happens while holding the per-document lock.

    Returns:
        ``(ok, chunks_added)``.  ``ok`` is False when the model gave no result,
        the page is already present in the index, or integration raised.
    """
    result = call_sonnet_vision(doc_id, page_num)
    if not result:
        print(f"  [SKIP] {doc_id} p{page_num:03d} — no result", flush=True)
        return (False, 0)
    sub = RAW / f"{doc_id}--subagent"
    idx_path = sub / "_index.json"
    with _doc_lock(doc_id):
        idx = json.loads(idx_path.read_text(encoding="utf-8"))
        # Idempotent: if page already integrated meanwhile, skip
        if any(c.get("page") == page_num for c in idx.get("chunks") or []):
            print(f"  [SKIP] {doc_id} p{page_num:03d} — already present", flush=True)
            return (False, 0)
        try:
            n = integrate_page_chunks(doc_id, page_num, result, idx)
        except Exception as e:
            print(f"  [ERR ] {doc_id} p{page_num:03d} — integrate: {e}", flush=True)
            return (False, 0)
        # Write to a sibling temp file and swap it in: an interrupted run then
        # leaves the previous index intact instead of a truncated file.
        tmp_path = idx_path.with_name(idx_path.name + ".tmp")
        tmp_path.write_text(json.dumps(idx, indent=2, ensure_ascii=False), encoding="utf-8")
        os.replace(tmp_path, idx_path)
    print(f"  [OK  ] {doc_id} p{page_num:03d} — {n} chunks", flush=True)
    return (True, n)


def main() -> int:
    """Command-line entry point; returns the process exit code (always 0).

    Options:
        --doc-id   restrict the run to one document
        --page     process exactly this page of --doc-id (for testing)
        --dry-run  list the missing pages per document and stop

    Invalid option combinations terminate through ``ArgumentParser.error``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id", default=None)
    ap.add_argument("--page", type=int, default=None, help="single page for testing")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    if args.page is not None:
        # Without a document the page used to be ignored silently and the
        # whole backlog was processed; refuse that instead.
        if not args.doc_id:
            ap.error("--page requires --doc-id")
        if args.page < 1:
            ap.error("--page must be >= 1")

    if args.doc_id and args.page is not None:
        # Explicit single page: the PDF/PNG scan would be discarded anyway.
        missing = {args.doc_id: [args.page]}
    else:
        missing = find_missing_pages_per_doc()
        if args.doc_id:
            missing = {args.doc_id: missing.get(args.doc_id, [])}

    # Flatten (doc, page) job list — page-level parallelism
    jobs: list[tuple[str, int]] = [(d, p) for d, ps in missing.items() for p in ps]
    total = len(jobs)
    print(f"[1/2] {len(missing)} docs · {total} page-jobs")
    if args.dry_run:
        for d, ps in sorted(missing.items(), key=lambda kv: -len(kv[1])):
            if ps:
                print(f"  {d}: {len(ps)} pages → {ps[:5]}{'...' if len(ps)>5 else ''}")
        return 0
    if total == 0:
        print("Nothing to do.")
        return 0

    print(f"\n[2/2] Processing with WORKERS={WORKERS} (page-level parallel) ...")
    pages_done = chunks_added = 0
    completed = 0
    with ThreadPoolExecutor(max_workers=WORKERS) as pool:
        futs = {pool.submit(process_one_page, d, p): (d, p) for d, p in jobs}
        for fut in as_completed(futs):
            d, p = futs[fut]
            completed += 1
            try:
                ok, n = fut.result()
                if ok:
                    pages_done += 1
                    chunks_added += n
            except Exception as e:
                # Boundary handler: one failing page must not stop the batch.
                print(f"  [ERR ] {d} p{p:03d}: {e}", flush=True)
            if completed % 25 == 0:
                print(f"  ... [progress] {completed}/{total}  pages_done={pages_done}  chunks={chunks_added}", flush=True)

    print(f"\n✓ {pages_done}/{total} pages processed, {chunks_added} new chunks.")
    return 0


# Run only when executed as a script; main()'s return value is the exit status.
if __name__ == "__main__":
    raise SystemExit(main())