#!/usr/bin/env python3
"""
Rebuilds doc-65-hs1-834228961-62-hq-83894-section-7 into the raw/ layout.
Uses claude CLI (OAuth via Max plan) to process each page PNG via vision.

Pipeline (see main()):
  1. every page PNG is sent to the claude CLI, which returns the page split
     into typed chunks (JSON);
  2. chunks are numbered globally across the document;
  3. chunks of type "image" are cropped from the page PNG and analyzed by a
     second claude call;
  4. one .md file per chunk, _index.json and document.md are written.
"""

import os
import sys
import json
import base64
import re
import time
import subprocess
import concurrent.futures
import threading
from datetime import datetime, timezone
from pathlib import Path

DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-7"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 7"

PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

# Serializes console output coming from the worker threads.
_print_lock = threading.Lock()

# Markdown code-fence wrappers the model sometimes puts around its JSON.
_FENCE_OPEN_RE = re.compile(r"^```(?:json)?\s*")
_FENCE_CLOSE_RE = re.compile(r"\s*```\s*$")

# Page files are named p-<number>.png.
_PAGE_STEM_RE = re.compile(r"p-(\d+)")


def _ensure_output_dirs():
    """Create the chunks/, images/ and tables/ output directories if missing."""
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)


def safe_print(*args, **kwargs):
    """print() under a lock, flushing by default so thread output is not interleaved."""
    # setdefault (rather than passing flush=True directly) lets a caller
    # override flush without triggering a duplicate-keyword TypeError.
    kwargs.setdefault("flush", True)
    with _print_lock:
        print(*args, **kwargs)


PAGE_REBUILDER_PROMPT = """You are a page-rebuilder for a declassified UAP/UFO document reconstruction system.

STEP 1: Use the Read tool to view this page image: {page_png_path}

STEP 2: Analyze the page carefully. The page is from document: {doc_title}
Doc ID: {doc_id}
Page number (1-indexed in document): {page_number}
Total pages: {total_pages}
OCR text (may be empty): {page_ocr_text}

STEP 3: Return a JSON object with ALL content from the page split into chunks.

Return ONLY this JSON structure (no markdown fences, no commentary):
{{
  "page_number": {page_number},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "",
      "content_en": "English content or description",
      "content_pt_br": "Conteúdo em português brasileiro",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.90,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null,
      "image_description_en": null,
      "image_description_pt_br": null,
      "extracted_text": null
    }}
  ]
}}

Chunk type enum (use ONLY these):
- letterhead: agency/org header at top
- classification_banner: TOP SECRET/SECRET/CONFIDENTIAL/UNCLASSIFIED banners
- date_line: date of document
- to_from_line: TO:/FROM:/VIA: address lines
- subject_line: RE:/SUBJECT: lines
- paragraph: body text paragraph
- section_header: bold/underlined section title
- list_item: numbered or bulleted item
- redaction_block: blacked-out or whited-out region
- signature_block: signature/name/title at bottom
- image: photograph, diagram, sketch, stamp, seal
- table_marker: table content
- page_number: page number indicator
- footnote: footnote or endnote
- handwriting: handwritten annotation
- form_field: form label+value pairs
- blank: empty/whitespace page or region

Rules:
1. bbox values are NORMALIZED [0..1] (x=left, y=top, w=width, h=height)
2. Every visible region must be a chunk
3. For redaction_block: estimate redacted content type in redaction_inferred_content_type
4. For image chunks: provide detailed image_description_en AND image_description_pt_br
5. classification: extract from banners (e.g. "TOP SECRET") or null
6. formatting: array from: ["bold","italic","underline","all_caps","centered","right_aligned"]
7. cross_page_hint: "self_contained"|"continues_to_next"|"continues_from_prev"|"continues_both"
8. If blank page: one chunk of type "blank"
9. content_en: verbatim text (EN) or description; content_pt_br: PT-BR translation
10. ufo_anomaly_detected: true ONLY if page shows unidentified aerial phenomenon evidence
11. Output ONLY valid JSON, nothing else
"""

IMAGE_ANALYST_PROMPT = """You are an image analyst for declassified UAP/UFO document reconstruction.

STEP 1: Use the Read tool to view this cropped image: {image_path}

STEP 2: Analyze the image carefully.

STEP 3: Return ONLY this JSON (no fences, no commentary):
{{
  "image_description_en": "Detailed description in English",
  "image_description_pt_br": "Descrição detalhada em português brasileiro",
  "image_type": "",
  "extracted_text": "Any text visible in image verbatim, or null",
  "ufo_anomaly_detected": false,
  "ufo_anomaly_type": null,
  "ufo_anomaly_rationale": null,
  "cryptid_anomaly_detected": false,
  "cryptid_anomaly_type": null,
  "cryptid_anomaly_rationale": null
}}

image_type enum: photograph|diagram|sketch|map|chart|seal|stamp|signature|redacted_region|form|other
ufo_anomaly_detected: true ONLY if image shows craft/object/phenomenon that appears to be UAP
cryptid_anomaly_detected: true ONLY if image shows anomalous/non-human entity

Return ONLY valid JSON.
"""


def extract_json(text: str) -> dict:
    """Extract the first JSON object from claude CLI output.

    A leading/trailing Markdown code fence is stripped, then the object that
    starts at the first "{" is decoded; anything after it is ignored.

    Raises:
        ValueError: if there is no "{" in the text, or the text starting
            there is not a complete, valid JSON object.
    """
    text = text.strip()
    if text.startswith("```"):
        text = _FENCE_OPEN_RE.sub("", text)
        text = _FENCE_CLOSE_RE.sub("", text)

    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON object found")

    # raw_decode understands string literals, so braces inside values (common
    # in verbatim OCR text) cannot end the object early the way a plain
    # brace-depth counter would.
    try:
        obj, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError as err:
        raise ValueError(f"Invalid or unclosed JSON: {err}") from err
    return obj


def call_claude(prompt: str, png_dir: Path, timeout: int = 180) -> dict:
    """Call claude CLI and return parsed JSON.

    Args:
        prompt: full prompt text, passed as the final CLI argument.
        png_dir: directory handed to the CLI via --add-dir so that its Read
            tool may open the image named in the prompt.
        timeout: seconds before the subprocess is abandoned.

    Raises:
        RuntimeError: the CLI exited non-zero or reported is_error.
        subprocess.TimeoutExpired: the CLI exceeded *timeout*.
        ValueError: the CLI output or the model's answer was not valid JSON.
    """
    cmd = [
        "claude", "-p",
        "--model", "haiku",
        "--output-format", "json",
        "--max-turns", "3",
        "--allowedTools", "Read",
        "--add-dir", str(png_dir),
        "--", prompt,
    ]
    res = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False)
    if res.returncode != 0:
        raise RuntimeError(f"claude CLI failed rc={res.returncode}: {res.stderr[-1000:]}")

    cli_output = json.loads(res.stdout)
    # "result" may be present but null; normalize so slicing/stripping is safe.
    result_text = cli_output.get("result") or ""
    if cli_output.get("is_error"):
        raise RuntimeError(f"claude error: {str(result_text)[:500]}")
    return extract_json(result_text)


def _page_sort_key(path: Path):
    """Sort key ordering p-<n>.png files by numeric n, then by name."""
    match = _PAGE_STEM_RE.match(path.stem)
    number = int(match.group(1)) if match else float("inf")
    return (number, path.name)


def get_page_list():
    """Returns list of (page_number, png_path) tuples sorted by page_number.

    Page numbers are assigned 1..N by position after sorting the p-*.png
    files numerically, so unpadded names (p-2 / p-10) keep document order.
    """
    files = sorted(PNG_DIR.glob("p-*.png"), key=_page_sort_key)
    return [(i + 1, f) for i, f in enumerate(files)]


def load_ocr(png_path: Path) -> str:
    """Return the OCR text stored next to *png_path*'s stem, or "" if unusable.

    Text of two characters or fewer is treated as empty.
    """
    stem = png_path.stem  # p-NNN
    ocr_path = OCR_DIR / f"{stem}.txt"
    if not ocr_path.exists():
        return ""
    # OCR is only a hint for the model; do not lose the page over a bad byte.
    text = ocr_path.read_text(encoding="utf-8", errors="replace").strip()
    return text if len(text) > 2 else ""


def _error_page(page_number: int, note_en: str, note_pt_br: str) -> dict:
    """Build the single-'blank'-chunk placeholder used when a page cannot be processed."""
    return {
        "page_number": page_number,
        "chunks": [{
            "order_in_page": 1,
            "type": "blank",
            "content_en": f"[Page {page_number} — {note_en}]",
            "content_pt_br": f"[Página {page_number} — {note_pt_br}]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None,
        }],
    }


def process_page(page_number: int, png_path: Path, total_pages: int) -> dict:
    """Process a single page via claude vision.

    Tries up to three times with exponential back-off (1s, 2s). Returns the
    model's page dict ({"page_number": ..., "chunks": [...]}); if every
    attempt fails, returns a placeholder page holding one "blank" chunk.
    """
    ocr_text = load_ocr(png_path)
    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        doc_id=DOC_ID,
        page_number=page_number,
        total_pages=total_pages,
        page_png_path=str(png_path),
        page_ocr_text=ocr_text if ocr_text else "(no OCR available)",
    )

    retries = 3
    for attempt in range(retries):
        try:
            result = call_claude(prompt, png_path.parent, timeout=180)
            # A reply without a chunk list is a failed attempt, not an empty
            # page: the prompt requires at least one chunk even when blank.
            if not isinstance(result.get("chunks"), list):
                raise ValueError("response has no 'chunks' list")
            safe_print(f" [OK] p{page_number:03d}: {len(result['chunks'])} chunks")
            return result
        except Exception as e:
            safe_print(f" [ERR] p{page_number:03d} attempt {attempt+1}: {str(e)[:200]}")
            if attempt < retries - 1:
                time.sleep(2 ** attempt)

    # Fallback
    return _error_page(page_number, "processing error", "erro de processamento")


def _as_float(value, default) -> float:
    """float(value), or float(default) when value is None or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return float(default)


def _bbox_floats(chunk: dict, default_h: float = 0.0):
    """Return the chunk's bbox as (x, y, w, h) floats, tolerating bad model output.

    Missing or non-numeric entries fall back to x=0, y=0, w=1, h=default_h.
    """
    bbox = chunk.get("bbox")
    if not isinstance(bbox, dict):
        bbox = {}
    return (
        _as_float(bbox.get("x"), 0),
        _as_float(bbox.get("y"), 0),
        _as_float(bbox.get("w"), 1),
        _as_float(bbox.get("h"), default_h),
    )


def _text(chunk: dict, key: str) -> str:
    """Chunk field as text; a missing or null value becomes ''."""
    value = chunk.get(key)
    return "" if value is None else str(value)


def _order_key(chunk: dict) -> float:
    """In-page sort key; missing or non-numeric order_in_page sorts as 0."""
    return _as_float(chunk.get("order_in_page", 0), 0)


def global_number_chunks(all_page_results: dict) -> list:
    """Assign global chunk IDs across all pages.

    Pages are visited in ascending page number and each page's chunks by
    order_in_page. Every chunk dict is mutated in place to gain "page",
    "chunk_id" (c0001, c0002, ...), "order_global", "prev_chunk" and
    "next_chunk"; the flat, ordered list is returned. Entries that are not
    dicts are dropped.
    """
    chunks_flat = []
    for page_num in sorted(all_page_results.keys()):
        page_data = all_page_results[page_num]
        raw_chunks = page_data.get("chunks") or []
        page_chunks = sorted(
            (c for c in raw_chunks if isinstance(c, dict)),
            key=_order_key,
        )
        for chunk in page_chunks:
            chunk["page"] = page_num
            chunks_flat.append(chunk)

    last_index = len(chunks_flat) - 1
    for i, chunk in enumerate(chunks_flat):
        chunk["chunk_id"] = f"c{i+1:04d}"
        chunk["order_global"] = i + 1
        chunk["prev_chunk"] = f"c{i:04d}" if i > 0 else None
        chunk["next_chunk"] = f"c{i+2:04d}" if i < last_index else None
    return chunks_flat


def crop_image(chunk: dict, png_path: Path):
    """Crop image chunk bbox from page PNG.

    The normalized bbox is padded by 0.005 on every side, scaled to pixels
    and clamped to the page. Returns the path of the saved crop
    (IMAGES_DIR/IMG-<chunk_id>.png), or None if cropping failed.
    """
    from PIL import Image

    chunk_id = chunk["chunk_id"]
    out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        x, y, w, h = _bbox_floats(chunk, default_h=1)
        # Context manager closes the page file even when crop/save raises.
        with Image.open(png_path) as im:
            W, H = im.size
            pad = 0.005
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                # Degenerate box: fall back to a small 10px region.
                right = min(W, left + 10)
                bottom = min(H, top + 10)
            cropped = im.crop((left, top, right, bottom))
            out_path.parent.mkdir(parents=True, exist_ok=True)
            cropped.save(str(out_path))
        return out_path
    except Exception as e:
        safe_print(f" [WARN] Crop failed {chunk_id}: {e}")
        return None


def analyze_image(chunk: dict, png_path: Path) -> dict:
    """Crop and analyze an image chunk.

    On a successful crop the chunk gets "related_image"; on a successful
    analysis (up to two attempts) the description/type/anomaly fields
    returned by the model overwrite the chunk's own. The chunk is mutated in
    place and also returned.
    """
    cropped_path = crop_image(chunk, png_path)
    if not cropped_path or not cropped_path.exists():
        return chunk

    # Link the crop as soon as it exists, so the image is still referenced
    # from the outputs when the analysis call below fails.
    chunk["related_image"] = f"IMG-{chunk['chunk_id']}.png"

    prompt = IMAGE_ANALYST_PROMPT.format(image_path=str(cropped_path))
    analysis_keys = [
        "image_description_en", "image_description_pt_br", "image_type",
        "extracted_text", "ufo_anomaly_detected", "ufo_anomaly_type",
        "ufo_anomaly_rationale", "cryptid_anomaly_detected",
        "cryptid_anomaly_type", "cryptid_anomaly_rationale",
    ]
    retries = 2
    for attempt in range(retries):
        try:
            analysis = call_claude(prompt, cropped_path.parent, timeout=120)
            for key in analysis_keys:
                if key in analysis:
                    chunk[key] = analysis[key]
            safe_print(f" [IMG] {chunk['chunk_id']}: analyzed")
            return chunk
        except Exception as e:
            safe_print(f" [WARN] Image analysis {chunk['chunk_id']} attempt {attempt+1}: {str(e)[:150]}")
            if attempt < retries - 1:
                time.sleep(1)
    return chunk


def write_chunk_file(chunk: dict, page_png_map: dict):
    """Write individual chunk .md file.

    The file is CHUNKS_DIR/<chunk_id>.md: a YAML front-matter block with the
    chunk metadata followed by the EN and PT-BR content.

    Args:
        chunk: numbered chunk dict (must contain "chunk_id").
        page_png_map: page number -> page PNG path, used for source_png.
    """
    chunk_id = chunk["chunk_id"]
    page = chunk.get("page", 0)
    bx, by, bw, bh = _bbox_floats(chunk)
    png_path = page_png_map.get(page)
    source_png = f"../../processing/png/{DOC_ID}/{png_path.name}" if png_path else "unknown"

    def jv(v):
        # JSON scalars/arrays are valid YAML flow values (None -> null).
        return json.dumps(v, ensure_ascii=False)

    yaml_lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk.get('type', 'paragraph')}",
        f"page: {page}",
        f"order_in_page: {chunk.get('order_in_page', 1)}",
        f"order_global: {chunk.get('order_global', 1)}",
        f"bbox: {{x: {bx:.3f}, y: {by:.3f}, w: {bw:.3f}, h: {bh:.3f}}}",
        f"classification: {jv(chunk.get('classification'))}",
        f"formatting: {jv(chunk.get('formatting', []))}",
        f"cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {jv(chunk.get('prev_chunk'))}",
        f"next_chunk: {jv(chunk.get('next_chunk'))}",
        f"related_image: {jv(chunk.get('related_image'))}",
        f"related_table: {jv(chunk.get('related_table'))}",
        f"ocr_confidence: {jv(chunk.get('ocr_confidence', 0.85))}",
        f"ocr_source_lines: {jv(chunk.get('ocr_source_lines', []))}",
        f"redaction_code: {jv(chunk.get('redaction_code'))}",
        f"redaction_inferred_content_type: {jv(chunk.get('redaction_inferred_content_type'))}",
        f"image_type: {jv(chunk.get('image_type'))}",
        # bool() so a null flag is written as false rather than "none".
        f"ufo_anomaly_detected: {jv(bool(chunk.get('ufo_anomaly_detected', False)))}",
        f"cryptid_anomaly_detected: {jv(bool(chunk.get('cryptid_anomaly_detected', False)))}",
        f"ufo_anomaly_type: {jv(chunk.get('ufo_anomaly_type'))}",
        f"ufo_anomaly_rationale: {jv(chunk.get('ufo_anomaly_rationale'))}",
        f"cryptid_anomaly_type: {jv(chunk.get('cryptid_anomaly_type'))}",
        f"cryptid_anomaly_rationale: {jv(chunk.get('cryptid_anomaly_rationale'))}",
        f"image_description_en: {jv(chunk.get('image_description_en'))}",
        f"image_description_pt_br: {jv(chunk.get('image_description_pt_br'))}",
        f"extracted_text: {jv(chunk.get('extracted_text'))}",
        f"source_png: {source_png}",
        "---",
        "",
        f"**EN:** {_text(chunk, 'content_en')}",
        "",
        f"**PT-BR:** {_text(chunk, 'content_pt_br')}",
        "",
    ]
    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text("\n".join(yaml_lines), encoding="utf-8")


def write_index(chunks_flat: list, total_pages: int):
    """Write _index.json.

    The index holds document-level build metadata plus one entry per chunk
    (id, type, page, ordering, file path, bbox and an 80-character preview
    of the English content).
    """
    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": len(chunks_flat),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": datetime.now(timezone.utc).isoformat(),
        "chunks": [],
    }
    for chunk in chunks_flat:
        chunk_id = chunk["chunk_id"]
        index["chunks"].append({
            "chunk_id": chunk_id,
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk_id}.md",
            "bbox": chunk.get("bbox", {}),
            "preview": _text(chunk, "content_en")[:80],
        })

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, indent=2, ensure_ascii=False), encoding="utf-8"
    )


def write_document_md(chunks_flat: list, total_pages: int) -> int:
    """Assemble the master document.md.

    Writes OUT_DIR/document.md: YAML front matter (counts, chunk-type
    histogram, ids of chunks flagged as UFO/cryptid anomalies) followed by
    every chunk grouped under "## Page N" headings.

    Returns:
        Size of the written document in UTF-8 bytes.
    """
    type_histogram = {}
    ufo_flagged = []
    cryptid_flagged = []
    for chunk in chunks_flat:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_flagged.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_flagged.append(chunk["chunk_id"])

    now_iso = datetime.now(timezone.utc).isoformat()
    lines = [
        "---",
        "schema_version: \"0.2.0\"",
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f"canonical_title: \"{DOC_TITLE}\"",
        f"total_pages: {total_pages}",
        f"total_chunks: {len(chunks_flat)}",
        f"chunk_types_histogram: {json.dumps(type_histogram, ensure_ascii=False)}",
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_flagged)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_flagged)}",
        "build_approach: \"subagents\"",
        "build_model: claude-haiku-4-5",
        f"build_at: {now_iso}",
        "---",
        "",
    ]

    current_page = None
    for chunk in chunks_flat:
        page = chunk.get("page", 1)
        if page != current_page:
            current_page = page
            lines.append(f"\n## Page {page}\n")

        chunk_id = chunk["chunk_id"]
        ctype = chunk.get("type", "paragraph")
        bx, by, bw, bh = _bbox_floats(chunk)
        bbox_str = f"{bx:.2f}/{by:.2f}/{bw:.2f}/{bh:.2f}"

        # NOTE(review): these two entries are empty strings in the reviewed
        # source; they may originally have carried markup — confirm.
        lines.append("")
        lines.append("")
        lines.append(f"### Chunk {chunk_id} — {ctype} · p{page} · bbox: {bbox_str}")
        lines.append("")
        lines.append(f"**EN:** {_text(chunk, 'content_en')}")
        lines.append("")
        lines.append(f"**PT-BR:** {_text(chunk, 'content_pt_br')}")
        lines.append("")

        if ctype == "image" and chunk.get("related_image"):
            lines.append(f"![{chunk_id} image](./images/{chunk.get('related_image')})")
            lines.append("")
        if chunk.get("image_description_en"):
            lines.append(f"**Image Description (EN):** {chunk['image_description_en']}")
            lines.append("")
        if chunk.get("image_description_pt_br"):
            lines.append(f"**Descrição da Imagem (PT-BR):** {chunk['image_description_pt_br']}")
            lines.append("")

        meta = {k: v for k, v in chunk.items() if k not in ("content_en", "content_pt_br")}
        # NOTE(review): in the reviewed source the wrapper literals around the
        # metadata block were broken by raw newlines (only the word
        # "metadata" survived), which is not valid Python. A collapsible
        # <details>/<summary> wrapper is the presumed original — confirm
        # against a pristine copy.
        lines.append("<details>")
        lines.append("<summary>metadata</summary>")
        lines.append("")
        lines.append("```json")
        lines.append(json.dumps(meta, indent=2, ensure_ascii=False))
        lines.append("```")
        lines.append("")
        lines.append("</details>")
        lines.append("")
        lines.append("---")
        lines.append("")

    content = "\n".join(lines)
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    (OUT_DIR / "document.md").write_text(content, encoding="utf-8")
    return len(content.encode("utf-8"))


def main():
    """Run the full rebuild: pages -> chunks -> image analysis -> output files."""
    start_time = time.time()
    _ensure_output_dirs()

    pages = get_page_list()
    total_pages = len(pages)
    if not pages:
        # Nothing to rebuild; do not emit an empty document as if it succeeded.
        raise SystemExit(f"No page PNGs (p-*.png) found in {PNG_DIR}")
    page_png_map = {pnum: ppath for pnum, ppath in pages}
    safe_print(f"Processing {total_pages} pages for {DOC_ID}")

    # Process pages in batches of 5
    batch_size = 5
    all_page_results = {}
    batches = [pages[i:i+batch_size] for i in range(0, len(pages), batch_size)]
    for batch_idx, batch in enumerate(batches):
        page_nums = [p[0] for p in batch]
        safe_print(f"Batch {batch_idx+1}/{len(batches)}: pages {page_nums}")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = {
                executor.submit(process_page, pnum, ppath, total_pages): pnum
                for pnum, ppath in batch
            }
            for future in concurrent.futures.as_completed(futures):
                pnum = futures[future]
                try:
                    all_page_results[pnum] = future.result()
                except Exception as e:
                    safe_print(f" [FATAL] Page {pnum}: {e}")
                    all_page_results[pnum] = _error_page(pnum, "fatal error", "erro fatal")

    safe_print(f"\nAll pages processed. Numbering chunks globally...")
    chunks_flat = global_number_chunks(all_page_results)
    total_chunks = len(chunks_flat)
    safe_print(f"Total chunks: {total_chunks}")

    # Analyze image chunks in batches of 5
    image_chunks = [c for c in chunks_flat if c.get("type") == "image"]
    safe_print(f"\nProcessing {len(image_chunks)} image chunks...")
    img_batches = [image_chunks[i:i+5] for i in range(0, len(image_chunks), 5)]
    for img_batch_idx, img_batch in enumerate(img_batches):
        safe_print(f"Image batch {img_batch_idx+1}/{len(img_batches)}")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = {}
            for chunk in img_batch:
                page = chunk.get("page", 1)
                png_path = page_png_map.get(page)
                if png_path:
                    f = executor.submit(analyze_image, chunk, png_path)
                    futures[f] = chunk["chunk_id"]
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    cid = futures[future]
                    safe_print(f" [ERR] Image {cid}: {e}")

    safe_print(f"\nWriting chunk files...")
    for chunk in chunks_flat:
        write_chunk_file(chunk, page_png_map)

    safe_print(f"Writing _index.json...")
    write_index(chunks_flat, total_pages)

    safe_print(f"Writing document.md...")
    doc_bytes = write_document_md(chunks_flat, total_pages)

    images_count = len([c for c in chunks_flat if c.get("type") == "image"])
    ufo_count = len([c for c in chunks_flat if c.get("ufo_anomaly_detected")])
    cryptid_count = len([c for c in chunks_flat if c.get("cryptid_anomaly_detected")])
    wall_seconds = int(time.time() - start_time)

    safe_print(f"\nSTATS pages={total_pages} chunks={total_chunks} images={images_count} tables=0 ufo={ufo_count} cryptid={cryptid_count} doc_md_bytes={doc_bytes}")
    safe_print(f"pages_done={total_pages}, chunks_total={total_chunks}, images_extracted={images_count}, tables_stitched=0, ufo_anomalies={ufo_count}, cryptid_anomalies={cryptid_count}, wall_seconds={wall_seconds}")


if __name__ == "__main__":
    main()