#!/usr/bin/env python3 """ 32_reprocess_missing_pages.py — Reprocess pages that the doc-rebuilder agent silently dropped due to context-window overflow. For each doc: 1. Read raw/--subagent/_index.json (current chunk inventory) 2. Find missing pages: PNGs that exist but have no chunks 3. For each missing page, call `claude -p --model sonnet` with the page PNG and ask for a chunks JSON (matching the page-rebuilder schema) 4. Append new chunks to _index.json with continued global IDs (chunk_id, order_global) 5. Write new chunks/c.md files Idempotent — re-running skips pages already processed. Uses WORKERS=2 to avoid hammering OAuth rate limits. Usage: python3 scripts/synthesize/32_reprocess_missing_pages.py --dry-run python3 scripts/synthesize/32_reprocess_missing_pages.py --doc-id WORKERS=2 python3 scripts/synthesize/32_reprocess_missing_pages.py """ from __future__ import annotations import argparse import json import os import re import subprocess import sys import tempfile from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime, timezone from pathlib import Path UFO = Path("/Users/guto/ufo") RAW = UFO / "raw" PNG_BASE = UFO / "processing" / "png" NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") SONNET_MODEL = "sonnet" WORKERS = int(os.environ.get("WORKERS", "2")) PROMPT_TEMPLATE = """You are a page-rebuilder for a declassified UAP/UFO government document. You will receive the path of ONE page image. Read it with the Read tool, then analyze it carefully and extract ALL content as structured chunks. DOCUMENT_ID: {doc_id} PAGE_NUMBER: {page_num} PNG_PATH: {png_path} Return ONE JSON object with this exact structure (no markdown fence, no preamble, no postscript): {{ "page_number": {page_num}, "chunks": [ {{ "order_in_page": 1, "type": "", "content_en": "", "content_pt_br": "", "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}}, "classification": null, "formatting": [], "cross_page_hint": "self_contained", "ocr_confidence": 0.85, "redaction_code": null, "redaction_inferred_content_type": null, "image_type": null, "ufo_anomaly_detected": false, "ufo_anomaly_type": null, "ufo_anomaly_rationale": null, "cryptid_anomaly_detected": false, "cryptid_anomaly_type": null, "cryptid_anomaly_rationale": null, "image_description_en": null, "image_description_pt_br": null, "extracted_text": null }} ] }} CHUNK TYPES (use only these): letterhead, classification_banner, header, subheader, paragraph, list_item, caption, footnote, page_number, signature_block, stamp, redaction_block, image, table_marker, form_field, watermark, separator, blank, annotation, transcript_block. RULES: 1. Extract EVERY element on the page — nothing is skipped. 2. bbox is normalized coords (0.0..1.0) relative to the page image. 3. content_en is verbatim OCR text for text chunks; for images, describe what you see. 4. content_pt_br is Brazilian Portuguese (NOT European Portuguese). Preserve UTF-8 accents. 5. For redactions: type="redaction_block", content_en="[REDACTED]", content_pt_br="[REDACTADO]". 6. For images/photos/diagrams/sketches/maps: type="image", describe in image_description_en/pt_br. 7. For stamps: type="stamp". 8. classification: extract markings if visible ("SECRET", "CONFIDENTIAL", "UNCLASSIFIED", etc.). 9. formatting: any of ["bold", "italic", "underline", "all_caps", "handwritten", "typewritten"]. 10. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev". 11. ufo_anomaly_detected: true if chunk contains UAP/UFO sighting data, coordinates, witness accounts, anomalous phenomena. 12. If page is truly blank: return one chunk with type="blank". 13. Order chunks top-to-bottom, left-to-right. Return ONLY the JSON. No markdown. No commentary. """ DISALLOWED = ( "AskUserQuestion,Bash,Edit,Write,Task,Glob,Grep," "TaskCreate,TaskUpdate,TaskList,TaskGet,TaskStop,TaskOutput," "Skill,ScheduleWakeup,Monitor,WebSearch,WebFetch,NotebookEdit," "EnterPlanMode,ExitPlanMode,EnterWorktree,ExitWorktree," "CronCreate,CronDelete,CronList,RemoteTrigger,ToolSearch," "PushNotification,ListMcpResourcesTool,ReadMcpResourceTool," "ShareOnboardingGuide" ) # NOTE: Read is allowed (we need vision) def extract_json_block(s: str) -> str: s = s.strip() if s.startswith("```"): s = "\n".join(line for line in s.splitlines() if not line.startswith("```")) s = s.strip() start = s.find("{") end = s.rfind("}") if start >= 0 and end > start: return s[start:end + 1] return s def call_sonnet_vision(doc_id: str, page_num: int) -> dict | None: png_path = PNG_BASE / doc_id / f"p-{page_num:03d}.png" if not png_path.is_file(): return None prompt = PROMPT_TEMPLATE.format(doc_id=doc_id, page_num=page_num, png_path=str(png_path)) env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"} with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding="utf-8") as tmp: tmp_path = tmp.name try: with open(tmp_path, "wb") as out_f: r = subprocess.run( ["claude", "-p", "--model", SONNET_MODEL, "--output-format", "text", "--disallowed-tools", DISALLOWED], input=prompt.encode("utf-8"), stdout=out_f, stderr=subprocess.PIPE, env=env, timeout=300, ) if r.returncode != 0: print(f" [FAIL] {doc_id} p{page_num:03d} — claude rc={r.returncode}: {r.stderr.decode('utf-8', errors='replace')[:300]}", file=sys.stderr) return None with open(tmp_path, "r", encoding="utf-8") as f: raw = f.read() js = extract_json_block(raw) try: return json.loads(js) except json.JSONDecodeError as e: print(f" [JSON] {doc_id} p{page_num:03d} — {e} | raw_len={len(raw)}", file=sys.stderr) return None finally: try: os.unlink(tmp_path) except OSError: pass def find_missing_pages_per_doc() -> dict[str, list[int]]: """For each doc, find pages that have a PNG but no chunks in _index.json. Excludes the Poppler-phantom (last) page only if pdf_pages is known and PNG == pdf+1.""" result: dict[str, list[int]] = {} import subprocess as sp # Try to map pdf_pages by exact filename matching pdf_pages_map: dict[str, int] = {} for p in RAW.glob("*.pdf"): try: out = sp.run(["pdfinfo", str(p)], capture_output=True, text=True, timeout=30).stdout m = re.search(r"Pages:\s+(\d+)", out) if m: # filename → doc_id (same algorithm as page-rebuilder did) import unicodedata nfd = unicodedata.normalize("NFD", p.stem) ascii_str = "".join(c for c in nfd if not unicodedata.combining(c)) slug = re.sub(r"-+", "-", re.sub(r"[^a-z0-9-]", "-", ascii_str.lower())).strip("-") if slug and slug[0].isdigit(): slug = "doc-" + slug pdf_pages_map[slug] = int(m.group(1)) except Exception: pass for png_dir in PNG_BASE.glob("*/"): doc_id = png_dir.name pngs = sorted( int(re.match(r"p-(\d+)\.png", p.name).group(1)) for p in png_dir.glob("p-*.png") if re.match(r"p-\d+\.png", p.name) ) if not pngs: continue idx_path = RAW / f"{doc_id}--subagent" / "_index.json" if not idx_path.is_file(): continue try: idx = json.loads(idx_path.read_text(encoding="utf-8")) except Exception: continue pages_in_chunks = {c.get("page") for c in idx.get("chunks", []) if c.get("page")} # Filter: only pages 1..pdf_pages (avoid Poppler phantom) pdf_pages = pdf_pages_map.get(doc_id) upper_bound = pdf_pages if pdf_pages else pngs[-1] missing = [p for p in pngs if p <= upper_bound and p not in pages_in_chunks] if missing: result[doc_id] = missing return result def render_chunk_md(chunk: dict) -> str: """Render a chunk dict to the chunk.md format.""" import yaml body_en = chunk.pop("_body_en", "") body_pt = chunk.pop("_body_pt", "") # YAML keys in stable order fm_keys = [ "chunk_id", "type", "page", "order_in_page", "order_global", "bbox", "classification", "formatting", "cross_page_hint", "prev_chunk", "next_chunk", "related_image", "related_table", "ocr_confidence", "ocr_source_lines", "redaction_code", "redaction_inferred_content_type", "image_type", "ufo_anomaly_detected", "cryptid_anomaly_detected", "ufo_anomaly_type", "ufo_anomaly_rationale", "cryptid_anomaly_type", "cryptid_anomaly_rationale", "image_description_en", "image_description_pt_br", "extracted_text", "source_png", ] fm = {k: chunk.get(k) for k in fm_keys if k in chunk} yaml_block = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, default_flow_style=False, width=10_000).rstrip() body = f"**EN:** {body_en}\n\n**PT-BR:** {body_pt}\n" if (body_en or body_pt) else "" return f"---\n{yaml_block}\n---\n\n{body}" def integrate_page_chunks(doc_id: str, page_num: int, page_result: dict, idx: dict) -> int: """Add new page chunks to idx + write chunk .md files. Returns chunks added.""" chunks = page_result.get("chunks") or [] if not chunks: return 0 sub = RAW / f"{doc_id}--subagent" chunks_dir = sub / "chunks" chunks_dir.mkdir(exist_ok=True) # Determine next global order next_global = max((c.get("order_global", 0) for c in idx.get("chunks") or []), default=0) + 1 # Determine next chunk_id numeric next_id_num = next_global rel_png = f"../../processing/png/{doc_id}/p-{page_num:03d}.png" added = 0 new_index_entries = [] for i, c in enumerate(chunks, 1): cid = f"c{next_id_num:04d}" ctype = c.get("type") or "paragraph" en = c.get("content_en") or "" pt = c.get("content_pt_br") or "" entry = { "chunk_id": cid, "type": ctype, "page": page_num, "order_in_page": c.get("order_in_page") or i, "order_global": next_id_num, "file": f"chunks/{cid}.md", "bbox": c.get("bbox") or {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}, "preview": (en or pt or "")[:120], } new_index_entries.append(entry) chunk_dict = { "chunk_id": cid, "type": ctype, "page": page_num, "order_in_page": entry["order_in_page"], "order_global": next_id_num, "bbox": entry["bbox"], "classification": c.get("classification"), "formatting": c.get("formatting") or [], "cross_page_hint": c.get("cross_page_hint") or "self_contained", "prev_chunk": f"c{next_id_num-1:04d}" if next_id_num > 1 else None, "next_chunk": None, # patched after all known "related_image": None, "related_table": None, "ocr_confidence": c.get("ocr_confidence") or 0.85, "ocr_source_lines": [], "redaction_code": c.get("redaction_code"), "redaction_inferred_content_type": c.get("redaction_inferred_content_type"), "image_type": c.get("image_type"), "ufo_anomaly_detected": bool(c.get("ufo_anomaly_detected")), "cryptid_anomaly_detected": bool(c.get("cryptid_anomaly_detected")), "ufo_anomaly_type": c.get("ufo_anomaly_type"), "ufo_anomaly_rationale": c.get("ufo_anomaly_rationale"), "cryptid_anomaly_type": c.get("cryptid_anomaly_type"), "cryptid_anomaly_rationale": c.get("cryptid_anomaly_rationale"), "image_description_en": c.get("image_description_en"), "image_description_pt_br": c.get("image_description_pt_br"), "extracted_text": c.get("extracted_text"), "source_png": rel_png, "_body_en": en, "_body_pt": pt, } (chunks_dir / f"{cid}.md").write_text(render_chunk_md(chunk_dict), encoding="utf-8") next_id_num += 1 added += 1 idx.setdefault("chunks", []).extend(new_index_entries) return added import threading # One lock per doc_id (only contended when 2+ workers process pages of same doc) _doc_locks: dict[str, threading.Lock] = {} _locks_mutex = threading.Lock() def _doc_lock(doc_id: str) -> threading.Lock: with _locks_mutex: if doc_id not in _doc_locks: _doc_locks[doc_id] = threading.Lock() return _doc_locks[doc_id] def process_one_page(doc_id: str, page_num: int) -> tuple[bool, int]: """Process a single page and persist to _index.json under doc lock. Returns (ok, chunks_added).""" result = call_sonnet_vision(doc_id, page_num) if not result: print(f" [SKIP] {doc_id} p{page_num:03d} — no result", flush=True) return (False, 0) sub = RAW / f"{doc_id}--subagent" idx_path = sub / "_index.json" with _doc_lock(doc_id): idx = json.loads(idx_path.read_text(encoding="utf-8")) # Idempotent: if page already integrated meanwhile, skip if any(c.get("page") == page_num for c in idx.get("chunks") or []): print(f" [SKIP] {doc_id} p{page_num:03d} — already present", flush=True) return (False, 0) try: n = integrate_page_chunks(doc_id, page_num, result, idx) except Exception as e: print(f" [ERR ] {doc_id} p{page_num:03d} — integrate: {e}", flush=True) return (False, 0) # Keep total_pages in sync with the real max page (recovered pages extend it) max_page = max((c.get("page", 0) for c in idx.get("chunks") or []), default=0) if max_page > idx.get("total_pages", 0): idx["total_pages"] = max_page idx_path.write_text(json.dumps(idx, indent=2, ensure_ascii=False), encoding="utf-8") print(f" [OK ] {doc_id} p{page_num:03d} — {n} chunks", flush=True) return (True, n) def main(): ap = argparse.ArgumentParser() ap.add_argument("--doc-id", default=None) ap.add_argument("--page", type=int, default=None, help="single page for testing") ap.add_argument("--dry-run", action="store_true") args = ap.parse_args() missing = find_missing_pages_per_doc() if args.doc_id: missing = {args.doc_id: missing.get(args.doc_id, [])} if args.page and args.doc_id: missing = {args.doc_id: [args.page]} # Flatten (doc, page) job list — page-level parallelism jobs: list[tuple[str, int]] = [] for d, ps in missing.items(): for p in ps: jobs.append((d, p)) total = len(jobs) print(f"[1/2] {len(missing)} docs · {total} page-jobs") if args.dry_run: for d, ps in sorted(missing.items(), key=lambda kv: -len(kv[1])): if ps: print(f" {d}: {len(ps)} pages → {ps[:5]}{'...' if len(ps)>5 else ''}") return 0 if total == 0: print("Nothing to do."); return 0 print(f"\n[2/2] Processing with WORKERS={WORKERS} (page-level parallel) ...") pages_done = chunks_added = 0 completed = 0 with ThreadPoolExecutor(max_workers=WORKERS) as pool: futs = {pool.submit(process_one_page, d, p): (d, p) for d, p in jobs} for fut in as_completed(futs): d, p = futs[fut] completed += 1 try: ok, n = fut.result() if ok: pages_done += 1; chunks_added += n except Exception as e: print(f" [ERR ] {d} p{p:03d}: {e}", flush=True) if completed % 25 == 0: print(f" ... [progress] {completed}/{total} pages_done={pages_done} chunks={chunks_added}", flush=True) print(f"\n✓ {pages_done}/{total} pages processed, {chunks_added} new chunks.") return 0 if __name__ == "__main__": sys.exit(main())