#!/usr/bin/env python3
"""
Rebuild script v2 for doc-65-hs1-834228961-62-hq-83894-section-2

Uses claude CLI for vision processing (no direct API key needed).
Processes 159 pages in batches of 5.

Pipeline (see ``main``):
  1. ask the claude CLI to split every page PNG into typed chunks (JSON),
  2. number the chunks globally and link prev/next,
  3. crop every ``image`` chunk out of its page PNG,
  4. ask the claude CLI to describe each cropped image,
  5. report table markers (no stitching is performed),
  6. write one markdown file per chunk,
  7. write ``_index.json``,
  8. assemble ``document.md``.
"""

import concurrent.futures
import json
import math
import os
import subprocess
import sys
import textwrap
import time
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

try:
    from PIL import Image
except ImportError:
    # Keep the module importable (e.g. to unit-test the pure helpers) when
    # Pillow is missing; main() refuses to start without it.
    Image = None

# ── Config ──────────────────────────────────────────────────────────────────
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-2"
DOC_TITLE = "FBI HQ-83894 Section 2 — Flying Discs Investigation (Serials 53-100)"
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"
CLAUDE_BIN = "/Users/guto/.local/bin/claude"
CLAUDE_MODEL = "claude-haiku-4-5"  # single source of truth for CLI call and build metadata
BATCH_SIZE = 5
CLAUDE_TIMEOUT = 120  # seconds per page call
IMAGE_TIMEOUT = 60  # seconds per cropped-image call

# Defaults used whenever the model omits (or garbles) a bbox component.
_DEFAULT_BBOX = {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}

# Keys the image analyst is allowed to contribute to a chunk.
_IMAGE_META_KEYS = (
    "image_type",
    "image_description_en",
    "image_description_pt_br",
    "extracted_text",
    "ufo_anomaly_detected",
    "ufo_anomaly_type",
    "ufo_anomaly_rationale",
    "cryptid_anomaly_detected",
    "cryptid_anomaly_type",
    "cryptid_anomaly_rationale",
)

# Characters that force a YAML scalar to be quoted.
_YAML_SPECIAL_CHARS = frozenset(":{}[],\n\r\t#&*?|-<>=!%@`\"'\\")
# Plain scalars YAML would read as something other than a string.
_YAML_RESERVED = frozenset({"", "~", "null", "true", "false", "yes", "no", "on", "off"})


def build_page_map():
    """Map sequential page numbers (1-based) to the numbers in the PNG file names.

    Scans ``PNG_DIR`` for ``p-<digits>.png`` files; files whose stem is not
    purely numeric after the ``p-`` prefix are ignored instead of aborting
    the import. Returns an empty dict when the directory has no matching
    files.
    """
    numbers = sorted(
        int(p.stem[2:]) for p in PNG_DIR.glob("p-*.png") if p.stem[2:].isdigit()
    )
    return {seq: num for seq, num in enumerate(numbers, start=1)}


PAGE_MAP = build_page_map()
TOTAL_PAGES = len(PAGE_MAP)


def load_ocr(actual_num: int, max_chars: int = 2000) -> str:
    """Return the stripped OCR text for a page, truncated to ``max_chars``.

    Returns ``""`` when the OCR file does not exist or is empty. Undecodable
    bytes are replaced rather than raising.
    """
    ocr_path = OCR_DIR / f"p-{actual_num:03d}.txt"
    if not ocr_path.exists():
        return ""
    text = ocr_path.read_text(encoding="utf-8", errors="replace").strip()
    return text[:max_chars]


# NOTE(review): the line breaks inside the two prompt templates were
# reconstructed from a whitespace-collapsed copy of this file — confirm
# against the original before relying on exact prompt bytes.
PAGE_REBUILDER_PROMPT_TEMPLATE = """You are a page-rebuilder agent analyzing a page from a declassified FBI document about Flying Discs / UAP investigations.

Document: {doc_title}
Actual page file: p-{actual_num:03d}.png
Sequential page number: {page_seq} of {total_pages}

OCR text (may be empty or poor quality):
{ocr_text}

Use the Read tool to read this image: {png_path}

Then analyze ALL visible content and return a JSON object with this exact structure (return ONLY the JSON, no markdown fences, no explanation):

{{
  "page_number": {page_seq},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "cover",
      "content_en": "exact transcription or description in English",
      "content_pt_br": "descrição ou transcrição em português brasileiro",
      "bbox": {{"x": 0.00, "y": 0.00, "w": 1.00, "h": 0.10}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }}
  ]
}}

RULES:
- type must be ONE of: cover | letterhead | stamp | header | subheader | paragraph | redaction | signature | image | table_marker | footer | page_number | classification_marking | separator | handwriting | form_field | caption | list_item | annotation | blank
- bbox: x,y = top-left corner fraction (0.0-1.0), w,h = width/height fractions (0.0-1.0)
- Split the page into logical chunks (letterhead separate from body text, stamps separate, etc.)
- For redacted blocks: type=redaction, include redaction_code if visible e.g. "(b)(1)", "(b)(3)", "(b)(6)"
- For stamps (RECEIVED, RECORDED, etc.): type=stamp
- For photos, sketches, diagrams: type=image, set image_type to photo|diagram|sketch|map|chart|signature_block|stamp|seal|other
- For tables: type=table_marker
- cross_page_hint: self_contained | continues_to_next | continues_from_prev
- content_en: verbatim transcription when legible; describe otherwise e.g. "[Stamp: RECEIVED OCT 6 1947]"
- content_pt_br: Brazilian Portuguese translation/description
- formatting: array of applicable: bold | italic | all_caps | underline | typewritten | handwritten
- ufo_anomaly_detected: true ONLY if page has image/sketch/photo of an anomalous aerial object
- Blank pages: one chunk with type=blank
- Return ONLY valid JSON, nothing else"""

IMAGE_ANALYST_PROMPT_TEMPLATE = """You are an image analyst for declassified FBI UFO/UAP investigation documents.

Read this cropped image region: {img_path}

Analyze it and return ONLY this JSON (no markdown fences):

{{
  "image_type": "photo",
  "image_description_en": "detailed description in English",
  "image_description_pt_br": "descrição detalhada em português brasileiro",
  "extracted_text": "any text visible verbatim or null",
  "ufo_anomaly_detected": false,
  "ufo_anomaly_type": null,
  "ufo_anomaly_rationale": null,
  "cryptid_anomaly_detected": false,
  "cryptid_anomaly_type": null,
  "cryptid_anomaly_rationale": null
}}

image_type: photo | diagram | sketch | map | chart | signature_block | stamp | seal | other
Return ONLY valid JSON."""


def run_claude(prompt: str, timeout: int = CLAUDE_TIMEOUT) -> str:
    """Run the claude CLI with ``prompt`` and return its stripped stdout.

    Never raises. Returns ``""`` on timeout and a string starting with
    ``"ERROR:"`` when the process cannot be started or exits non-zero
    without producing any stdout; callers treat both as a failed call.
    """
    # SECURITY NOTE(review): the page prompt embeds OCR text taken from the
    # scanned document while the CLI runs with --dangerously-skip-permissions,
    # so text on a page could steer tool use. Consider a restricted tool set.
    command = [
        CLAUDE_BIN,
        "-p",
        "--dangerously-skip-permissions",
        "--model",
        CLAUDE_MODEL,
        "--no-session-persistence",
        prompt,
    ]
    try:
        # The child inherits os.environ by default; no explicit env needed.
        result = subprocess.run(
            command, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return ""
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        return f"ERROR: {e}"

    stdout = result.stdout.strip()
    if result.returncode != 0 and not stdout:
        # Surface why the CLI failed instead of an indistinguishable "".
        detail = result.stderr.strip()[:200] or f"exit code {result.returncode}"
        return f"ERROR: {detail}"
    return stdout


def parse_json_response(raw: str):
    """Extract the first JSON object embedded in ``raw``.

    Markdown fences and any prose before or after the object are ignored.
    Parsing is delegated to ``json.JSONDecoder.raw_decode`` so braces that
    appear inside JSON strings cannot confuse the object boundaries. If the
    text at one ``{`` is not valid JSON the scan continues at the next ``{``.

    Returns the decoded dict, or ``None`` when no JSON object is found.
    """
    if not raw:
        return None
    decoder = json.JSONDecoder()
    start = raw.find("{")
    while start != -1:
        try:
            value, _end = decoder.raw_decode(raw, start)
        except json.JSONDecodeError:
            start = raw.find("{", start + 1)
        else:
            return value
    return None


def _safe_bbox(bbox) -> dict:
    """Return ``bbox`` as a dict of finite floats, defaulting bad components."""
    source = bbox if isinstance(bbox, dict) else {}
    clean = {}
    for key, default in _DEFAULT_BBOX.items():
        try:
            value = float(source.get(key, default))
        except (TypeError, ValueError):
            value = default
        clean[key] = value if math.isfinite(value) else default
    return clean


def _failed_page(page_seq: int, actual_num: int) -> dict:
    """Build the placeholder result recorded for a page that could not be processed."""
    return {
        "page_number": page_seq,
        "actual_num": actual_num,
        "chunks": [{
            "order_in_page": 1,
            "type": "blank",
            "page": page_seq,
            "content_en": "[Page processing failed - manual review required]",
            "content_pt_br": "[Falha no processamento da página - revisão manual necessária]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
        }],
    }


def rebuild_page(page_seq: int) -> dict:
    """Process one page via the claude CLI.

    Makes up to three attempts. On success returns the model's JSON with
    ``page_number``/``actual_num`` set and every chunk renumbered
    (``order_in_page``) and tagged with ``page``. If every attempt fails,
    returns a single-chunk placeholder page flagged for manual review.

    Raises ``KeyError`` if ``page_seq`` is not in ``PAGE_MAP``.
    """
    actual_num = PAGE_MAP[page_seq]
    ocr_text = load_ocr(actual_num)
    prompt = PAGE_REBUILDER_PROMPT_TEMPLATE.format(
        doc_title=DOC_TITLE,
        actual_num=actual_num,
        page_seq=page_seq,
        total_pages=TOTAL_PAGES,
        ocr_text=ocr_text if ocr_text else "(no OCR available)",
        png_path=PNG_DIR / f"p-{actual_num:03d}.png",
    )

    retries = 3
    for attempt in range(1, retries + 1):
        is_last = attempt == retries
        raw = run_claude(prompt, timeout=CLAUDE_TIMEOUT)

        if not raw or raw.startswith("ERROR:"):
            if is_last:
                print(f"  [FAIL] page {page_seq}: empty/error response. Raw: {raw[:200]}", flush=True)
                break
            wait = 5 * attempt  # linear back-off between CLI failures
            print(f"  [RETRY {attempt}] page {page_seq}: empty/error, waiting {wait}s", flush=True)
            time.sleep(wait)
            continue

        data = parse_json_response(raw)
        raw_chunks = data.get("chunks") if isinstance(data, dict) else None
        if isinstance(raw_chunks, list):
            # The model occasionally emits non-object entries; drop them.
            chunks = [ch for ch in raw_chunks if isinstance(ch, dict)]
            if chunks or not raw_chunks:
                data["chunks"] = chunks
                data["page_number"] = page_seq
                data["actual_num"] = actual_num
                for order, ch in enumerate(chunks, start=1):
                    ch["order_in_page"] = order
                    ch["page"] = page_seq
                print(f"  [OK] page {page_seq:03d} (p-{actual_num:03d}) → {len(chunks)} chunks", flush=True)
                return data

        if is_last:
            print(f"  [FAIL] page {page_seq}: could not parse JSON. Raw: {raw[:200]}", flush=True)
        else:
            print(f"  [RETRY {attempt}] page {page_seq}: bad JSON, retrying", flush=True)
            time.sleep(3)

    return _failed_page(page_seq, actual_num)


def crop_image(page_seq: int, actual_num: int, chunk_id: str, bbox: dict) -> Path:
    """Crop the ``bbox`` region (page fractions) from the page PNG.

    The box is clamped to the page, given a minimum size of 1% per side and
    padded by 0.5%. The crop is written to ``IMAGES_DIR/IMG-<chunk_id>.png``.
    Failures are logged, not raised; the destination path is returned either
    way, so callers must check that the file exists.
    """
    src = PNG_DIR / f"p-{actual_num:03d}.png"
    dst = IMAGES_DIR / f"IMG-{chunk_id}.png"
    if Image is None:
        print(f"  [CROP FAIL] {chunk_id}: Pillow is not installed", flush=True)
        return dst
    try:
        box = _safe_bbox(bbox)
        x = max(0.0, min(1.0, box["x"]))
        y = max(0.0, min(1.0, box["y"]))
        w = max(0.01, min(1.0 - x, box["w"]))
        h = max(0.01, min(1.0 - y, box["h"]))
        pad = 0.005
        # The context manager closes the source file handle promptly.
        with Image.open(src) as im:
            W, H = im.size
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            im.crop((left, top, right, bottom)).save(str(dst))
    except Exception as e:  # deliberate best-effort: one bad crop must not stop the run
        print(f"  [CROP FAIL] {chunk_id}: {e}", flush=True)
    return dst


def _image_fallback(description_en: str, description_pt_br: str) -> dict:
    """Build the neutral metadata used when an image cannot be analyzed."""
    return {
        "image_type": "other",
        "image_description_en": description_en,
        "image_description_pt_br": description_pt_br,
        "extracted_text": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
    }


def analyze_image(chunk_id: str, img_path: Path) -> dict:
    """Analyze a cropped image via the claude CLI.

    Makes up to two attempts. Returns only the recognised image-metadata
    keys from the model's answer, so a stray key can never overwrite chunk
    bookkeeping fields when the result is merged into a chunk. Returns a
    neutral fallback dict when the image is missing or analysis fails.
    """
    if not img_path.exists():
        return _image_fallback("Image not available", "Imagem não disponível")

    prompt = IMAGE_ANALYST_PROMPT_TEMPLATE.format(img_path=str(img_path))
    retries = 2
    for attempt in range(retries):
        raw = run_claude(prompt, timeout=IMAGE_TIMEOUT)
        # Do not try to mine JSON out of an error message.
        data = None if raw.startswith("ERROR:") else parse_json_response(raw)
        if isinstance(data, dict):
            meta = {key: data[key] for key in _IMAGE_META_KEYS if key in data}
            if meta:
                print(f"  [IMG OK] {chunk_id}", flush=True)
                return meta
        if attempt < retries - 1:
            time.sleep(3)

    print(f"  [IMG FAIL] {chunk_id}", flush=True)
    return _image_fallback("Analysis failed", "Análise falhou")


def _yaml_scalar(value) -> str:
    """Render ``value`` as a YAML scalar for the chunk front matter.

    Strings are left plain when that is unambiguous; otherwise they are
    emitted as JSON strings, which are valid YAML double-quoted scalars with
    quotes, backslashes and newlines properly escaped.
    """
    if value is None:
        return "null"
    if isinstance(value, bool):
        return str(value).lower()
    if isinstance(value, (int, float)):
        return str(value)
    if isinstance(value, (dict, list)):
        return json.dumps(value, ensure_ascii=False)

    text = str(value)
    needs_quotes = (
        text != text.strip()
        or text.lower() in _YAML_RESERVED
        or any(c in _YAML_SPECIAL_CHARS for c in text)
    )
    if not needs_quotes:
        try:
            float(text)
        except ValueError:
            pass
        else:
            needs_quotes = True  # keep numeric-looking strings as strings
    return json.dumps(text, ensure_ascii=False) if needs_quotes else text


def write_chunk_file(chunk: dict):
    """Write one chunk as ``CHUNKS_DIR/<chunk_id>.md``.

    The file is YAML front matter with the chunk metadata followed by the
    English and Brazilian-Portuguese content. ``chunk`` must contain
    ``chunk_id``; every other field falls back to a default.
    """
    chunk_id = chunk["chunk_id"]
    box = _safe_bbox(chunk.get("bbox"))
    page = chunk.get("page", 1)
    actual_num = chunk.get("actual_num", PAGE_MAP.get(page, page))
    ctype = chunk.get("type", "paragraph")
    related_image = f"IMG-{chunk_id}.png" if ctype == "image" else "null"

    formatting = chunk.get("formatting") or []
    if not isinstance(formatting, list):
        formatting = [formatting]
    ocr_lines = chunk.get("ocr_source_lines") or []
    if not isinstance(ocr_lines, list):
        ocr_lines = [ocr_lines]

    def field(key, default=None):
        return _yaml_scalar(chunk.get(key, default))

    content_en = chunk.get("content_en") or ""
    content_pt_br = chunk.get("content_pt_br") or ""

    content = f"""---
chunk_id: {chunk_id}
type: {_yaml_scalar(ctype)}
page: {page}
order_in_page: {chunk.get("order_in_page", 1)}
order_global: {chunk.get("order_global", 1)}
bbox: {{x: {box["x"]:.3f}, y: {box["y"]:.3f}, w: {box["w"]:.3f}, h: {box["h"]:.3f}}}
classification: {field("classification")}
formatting: {json.dumps(formatting, ensure_ascii=False)}
cross_page_hint: {field("cross_page_hint", "self_contained")}
prev_chunk: {_yaml_scalar(chunk.get("prev_chunk") or None)}
next_chunk: {_yaml_scalar(chunk.get("next_chunk") or None)}
related_image: {related_image}
related_table: {_yaml_scalar(chunk.get("related_table") or None)}
ocr_confidence: {field("ocr_confidence", 0.85)}
ocr_source_lines: {json.dumps(ocr_lines, ensure_ascii=False)}
redaction_code: {field("redaction_code")}
redaction_inferred_content_type: {field("redaction_inferred_content_type")}
image_type: {field("image_type")}
ufo_anomaly_detected: {str(bool(chunk.get("ufo_anomaly_detected", False))).lower()}
cryptid_anomaly_detected: {str(bool(chunk.get("cryptid_anomaly_detected", False))).lower()}
ufo_anomaly_type: {field("ufo_anomaly_type")}
ufo_anomaly_rationale: {field("ufo_anomaly_rationale")}
cryptid_anomaly_type: {field("cryptid_anomaly_type")}
cryptid_anomaly_rationale: {field("cryptid_anomaly_rationale")}
image_description_en: {field("image_description_en")}
image_description_pt_br: {field("image_description_pt_br")}
extracted_text: {field("extracted_text")}
source_png: ../../processing/png/{DOC_ID}/p-{actual_num:03d}.png
---

**EN:** {content_en}

**PT-BR:** {content_pt_br}
"""
    (CHUNKS_DIR / f"{chunk_id}.md").write_text(content, encoding="utf-8")


def _rebuild_all_pages(state_path: Path) -> dict:
    """Phase 1: rebuild every page in parallel batches, checkpointing after each batch."""
    all_page_results = {}
    page_seqs = list(range(1, TOTAL_PAGES + 1))
    for batch_start in range(0, len(page_seqs), BATCH_SIZE):
        batch = page_seqs[batch_start:batch_start + BATCH_SIZE]
        print(f"  Batch pages {batch[0]}-{batch[-1]}...", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
            futures = [executor.submit(rebuild_page, p) for p in batch]
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                all_page_results[result["page_number"]] = result
        # Save intermediate state after each batch
        state_path.write_text(
            json.dumps({str(k): v for k, v in all_page_results.items()}, ensure_ascii=False),
            encoding="utf-8",
        )
    return all_page_results


def _number_chunks(all_page_results: dict) -> list:
    """Phase 2: flatten pages into one ordered chunk list with ids and prev/next links."""
    all_chunks = []
    for page_seq in sorted(all_page_results):
        page = all_page_results[page_seq]
        actual_num = page.get("actual_num", PAGE_MAP.get(page_seq, page_seq))
        for chunk in sorted(page.get("chunks", []), key=lambda c: c.get("order_in_page", 0)):
            order_global = len(all_chunks) + 1
            chunk["chunk_id"] = f"c{order_global:04d}"
            chunk["order_global"] = order_global
            chunk["actual_num"] = actual_num
            all_chunks.append(chunk)

    last = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last else None
    return all_chunks


def _analyze_images(image_chunks: list):
    """Phase 4: describe cropped images in parallel batches and merge results into the chunks."""
    for batch_start in range(0, len(image_chunks), BATCH_SIZE):
        batch = image_chunks[batch_start:batch_start + BATCH_SIZE]
        print(f"  Image batch {batch_start + 1}-{batch_start + len(batch)}...", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
            futures = {
                executor.submit(
                    analyze_image,
                    chunk["chunk_id"],
                    IMAGES_DIR / f"IMG-{chunk['chunk_id']}.png",
                ): chunk
                for chunk in batch
            }
            for future in concurrent.futures.as_completed(futures):
                img_meta = future.result()
                # None means "analyst had nothing"; keep the page-level value.
                futures[future].update(
                    {k: v for k, v in img_meta.items() if v is not None}
                )


def _write_index(all_chunks: list, build_at: str):
    """Phase 7: write ``_index.json`` with one summary entry per chunk."""
    index_chunks = []
    for chunk in all_chunks:
        content_en = str(chunk.get("content_en") or "")
        preview = content_en[:80] + ("..." if len(content_en) > 80 else "")
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": _safe_bbox(chunk.get("bbox")),
            "preview": preview,
        })

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": CLAUDE_MODEL,
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, indent=2, ensure_ascii=False), encoding="utf-8"
    )


def _yaml_block_list(items) -> str:
    """Render a YAML list value: inline ``[]`` when empty, else an indented block."""
    if not items:
        return " []"
    return "".join(f"\n  - {item}" for item in items)


def _assemble_document(all_chunks: list, build_at: str, ufo_flagged: list, cryptid_flagged: list) -> str:
    """Phase 8: build the text of ``document.md`` (front matter plus every chunk by page)."""
    type_histogram = Counter(chunk.get("type", "paragraph") for chunk in all_chunks)
    if type_histogram:
        histogram_yaml = "".join(
            f"\n  {_yaml_scalar(k)}: {v}" for k, v in sorted(type_histogram.items(), key=lambda kv: str(kv[0]))
        )
    else:
        histogram_yaml = " {}"

    doc_parts = [f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: {json.dumps(DOC_TITLE, ensure_ascii=False)}
total_pages: {TOTAL_PAGES}
total_chunks: {len(all_chunks)}
chunk_types_histogram:{histogram_yaml}
multi_page_tables: []
ufo_anomalies_flagged:{_yaml_block_list(ufo_flagged)}
cryptid_anomalies_flagged:{_yaml_block_list(cryptid_flagged)}
build_approach: "subagents"
build_model: "{CLAUDE_MODEL}"
build_at: "{build_at}"
---

"""]

    chunks_by_page = {}
    for chunk in all_chunks:
        chunks_by_page.setdefault(chunk.get("page", 1), []).append(chunk)

    for page_seq in sorted(chunks_by_page):
        doc_parts.append(f"\n## Page {page_seq}\n")
        for chunk in sorted(chunks_by_page[page_seq], key=lambda c: c.get("order_in_page", 1)):
            chunk_id = chunk["chunk_id"]
            ctype = chunk.get("type", "paragraph")
            bbox = _safe_bbox(chunk.get("bbox"))
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

            # NOTE(review): in the reviewed copy these two literals were
            # placeholder-free f-strings containing only "\n", which looks
            # like stripped HTML. The comment marker and anchor below are a
            # reconstruction — confirm the exact markup against the original.
            doc_parts.append(f"<!-- chunk:{chunk_id} -->\n")
            doc_parts.append(f'<a id="{chunk_id}"></a>\n')
            doc_parts.append(f"### Chunk {chunk_id} — {ctype} · p{page_seq} · bbox: {bbox_str}\n\n")
            doc_parts.append(f"**EN:** {chunk.get('content_en') or ''}\n\n")
            doc_parts.append(f"**PT-BR:** {chunk.get('content_pt_br') or ''}\n\n")

            if ctype == "image":
                doc_parts.append(f"![{chunk_id} image](./images/IMG-{chunk_id}.png)\n\n")
                if chunk.get("image_description_en"):
                    doc_parts.append(f"**Image Description (EN):** {chunk['image_description_en']}\n\n")
                if chunk.get("image_description_pt_br"):
                    doc_parts.append(f"**Descrição da Imagem (PT-BR):** {chunk['image_description_pt_br']}\n\n")

            # Metadata block
            meta = {
                "chunk_id": chunk_id,
                "type": ctype,
                "page": chunk.get("page"),
                "order_in_page": chunk.get("order_in_page"),
                "order_global": chunk.get("order_global"),
                "bbox": bbox,
                "classification": chunk.get("classification"),
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": chunk.get("cross_page_hint"),
                "prev_chunk": chunk.get("prev_chunk"),
                "next_chunk": chunk.get("next_chunk"),
                "ocr_confidence": chunk.get("ocr_confidence"),
                "redaction_code": chunk.get("redaction_code"),
                "image_type": chunk.get("image_type"),
                "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
                "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
                "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
                "ufo_anomaly_rationale": chunk.get("ufo_anomaly_rationale"),
            }
            meta_json = json.dumps(meta, indent=2, ensure_ascii=False)
            # NOTE(review): the <details>/<summary> wrapper is reconstructed;
            # the reviewed copy showed this literal with its tags stripped.
            doc_parts.append(
                f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n"
            )

    return "".join(doc_parts)


def main():
    """Run the full rebuild pipeline for ``DOC_ID`` and print summary statistics.

    Exits with a non-zero status before doing any work when Pillow is not
    installed or when no page PNGs were found.
    """
    if Image is None:
        sys.exit("Pillow (PIL) is required to crop images: pip install Pillow")
    if not PAGE_MAP:
        sys.exit(f"No page PNGs (p-*.png) found in {PNG_DIR}")

    t_start = time.monotonic()
    print(f"Starting rebuild: {DOC_ID}", flush=True)
    print(f"Total pages: {TOTAL_PAGES}", flush=True)

    CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    TABLES_DIR.mkdir(parents=True, exist_ok=True)
    state_path = OUT_DIR / "_rebuild_state.json"

    # Phase 1: Rebuild pages in parallel batches of BATCH_SIZE
    print("\n=== Phase 1: Page rebuilding ===", flush=True)
    all_page_results = _rebuild_all_pages(state_path)

    # Phase 2: Global chunk numbering
    print("\n=== Phase 2: Global chunk numbering ===", flush=True)
    all_chunks = _number_chunks(all_page_results)
    print(f"  Total chunks: {len(all_chunks)}", flush=True)

    # Phase 3: Crop all images
    print("\n=== Phase 3: Cropping images ===", flush=True)
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"  Found {len(image_chunks)} image chunks", flush=True)
    for chunk in image_chunks:
        crop_image(
            chunk["page"],
            chunk.get("actual_num", PAGE_MAP.get(chunk["page"], chunk["page"])),
            chunk["chunk_id"],
            chunk.get("bbox"),
        )

    # Phase 4: Analyze images in parallel batches of BATCH_SIZE
    print("\n=== Phase 4: Image analysis ===", flush=True)
    _analyze_images(image_chunks)

    # Phase 5: Table stitching check (markers are only counted, never stitched)
    print("\n=== Phase 5: Table stitching ===", flush=True)
    tables_stitched = 0
    table_markers = [c for c in all_chunks if c.get("type") == "table_marker"]
    print(f"  Found {len(table_markers)} table markers (no cross-page stitching needed)", flush=True)

    # Phase 6: Write chunk files
    print("\n=== Phase 6: Writing chunk files ===", flush=True)
    for chunk in all_chunks:
        write_chunk_file(chunk)
    print(f"  Wrote {len(all_chunks)} chunk files", flush=True)

    # Phase 7: Write _index.json
    print("\n=== Phase 7: Writing _index.json ===", flush=True)
    build_at = datetime.now(timezone.utc).isoformat()
    _write_index(all_chunks, build_at)

    # Phase 8: Assemble document.md
    print("\n=== Phase 8: Assembling document.md ===", flush=True)
    ufo_flagged = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_flagged = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]
    images_extracted = len(image_chunks)
    doc_md = _assemble_document(all_chunks, build_at, ufo_flagged, cryptid_flagged)
    (OUT_DIR / "document.md").write_text(doc_md, encoding="utf-8")
    doc_md_bytes = len(doc_md.encode("utf-8"))

    # Cleanup intermediate state
    if state_path.exists():
        state_path.unlink()

    wall_seconds = int(time.monotonic() - t_start)
    print("\n=== DONE ===", flush=True)
    final_line = (
        f"pages_done={TOTAL_PAGES}, chunks_total={len(all_chunks)}, "
        f"images_extracted={images_extracted}, tables_stitched={tables_stitched}, "
        f"ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, "
        f"wall_seconds={wall_seconds}"
    )
    print(final_line, flush=True)
    print(
        f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={images_extracted} "
        f"tables={tables_stitched} ufo={len(ufo_flagged)} cryptid={len(cryptid_flagged)} "
        f"doc_md_bytes={doc_md_bytes}",
        flush=True,
    )


if __name__ == "__main__":
    main()