#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild doc-255-413270-ufo-s-and-defense-what-should-we-prepare-for into
structured chunk files, _index.json, and document.md.

Uses `claude -p --model haiku` subprocess calls (OAuth via Max plan).

Pipeline (see ``main``):
  1. Send every page PNG to the CLI and parse the chunk JSON it returns.
  2. Number the chunks globally and link prev/next neighbours.
  3. Crop every ``image`` chunk out of its page PNG and analyse the crop.
  4. Write one Markdown file per chunk, then ``_index.json`` and
     ``document.md``.
"""

import json
import os
import random
import re
import subprocess
import sys
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

DOC_ID = "doc-255-413270-ufo-s-and-defense-what-should-we-prepare-for"
DOC_TITLE = "UFO's and Defense: What Should We Prepare For?"
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OUT_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

MODEL = "haiku"
TOTAL_PAGES = 93
WORKERS = 4
TIMEOUT = 240  # seconds per page call

# Value written to the chunk front matter when a chunk carries no
# ocr_confidence at all (an explicit 0.0 is preserved).
_DEFAULT_OCR_CONFIDENCE = 0.85

# Fields copied from an image-analysis response onto its image chunk.
_IMAGE_ANALYSIS_FIELDS = (
    "image_description_en",
    "image_description_pt_br",
    "image_type",
    "extracted_text",
    "ufo_anomaly_detected",
    "ufo_anomaly_type",
    "ufo_anomaly_rationale",
    "cryptid_anomaly_detected",
    "cryptid_anomaly_type",
    "cryptid_anomaly_rationale",
)

_print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print under a lock so output from worker threads does not interleave."""
    with _print_lock:
        print(*args, **kwargs, flush=True)


PAGE_REBUILDER_PROMPT = '''You are a page-rebuilder agent. Analyze the scanned document page image and extract all content into structured chunks.

Document: {doc_title}
Page: {page_number} of {total_pages}
Doc ID: {doc_id}

STEP 1: Use the Read tool to view this PNG image: {png_path}

STEP 2: Analyze every element on the page carefully.

STEP 3: Return ONE JSON object only (no markdown fence, no commentary):

{{
  "page_number": {page_number},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "paragraph",
      "content_en": "verbatim English text from page",
      "content_pt_br": "tradução em português brasileiro",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null,
      "image_description_en": null,
      "image_description_pt_br": null,
      "extracted_text": null
    }}
  ]
}}

CHUNK TYPES (use exactly one):
- letterhead: document header/letterhead
- classification_marking: classification marking (TOP SECRET, CUI, etc.)
- date_line: date field
- address_block: TO:/FROM:/distribution fields
- heading: section/chapter/subject heading
- paragraph: body text paragraph
- numbered_item: numbered list item
- bulleted_item: bullet list item
- table_marker: table content
- image: photograph, diagram, chart, sketch, map, graph
- caption: figure/image caption
- footer: page footer
- page_number: standalone page number
- signature: signature/signatory block
- redaction: blacked-out/redacted area
- stamp: official stamp or seal
- handwriting: handwritten annotation
- blank_area: empty area
- form_field: form field with label and value
- unknown: unidentifiable element

RULES:
1. Split content into logical chunks (one concept per chunk). A typical page has 3-15 chunks.
2. For image chunks: describe what you see in content_en and set image_type.
3. image_type options: photograph, diagram, chart, sketch, map, graph, seal, signature_image, other
4. bbox: normalized coordinates 0.0-1.0 (x=left, y=top, w=width, h=height)
5. content_en: verbatim text if text chunk; visual description if image chunk
6. content_pt_br: Brazilian Portuguese translation (NOT European Portuguese)
7. classification: null or the marking text (e.g. "CUI", "UNCLASSIFIED")
8. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
9. formatting: array from ["bold", "italic", "all_caps", "underline"]
10. If page is completely blank: ONE chunk of type "blank_area"
11. Preserve French text verbatim (document may contain French)
12. For redaction chunks: set redaction_code if visible (e.g. "(b)(1)")
13. ufo_anomaly_detected: true ONLY for image chunks showing actual UAP/anomalous phenomena

Output ONLY the JSON object. No preamble. No fence. No commentary.'''


IMAGE_ANALYST_PROMPT = '''You are an image analyst for a UAP/UFO declassified document.

STEP 1: Use the Read tool to view this cropped image: {image_path}

STEP 2: Analyze it carefully.

STEP 3: Return ONE JSON object only (no markdown fence):

{{
  "image_description_en": "detailed English description",
  "image_description_pt_br": "descrição detalhada em português brasileiro",
  "image_type": "photograph",
  "extracted_text": null,
  "ufo_anomaly_detected": false,
  "ufo_anomaly_type": null,
  "ufo_anomaly_rationale": null,
  "cryptid_anomaly_detected": false,
  "cryptid_anomaly_type": null,
  "cryptid_anomaly_rationale": null
}}

image_type options: photograph, diagram, chart, sketch, map, graph, seal, signature_image, other
Set ufo_anomaly_detected=true only if the image shows an actual UAP/UFO or anomalous aerial phenomenon.
Set cryptid_anomaly_detected=true only if the image shows a cryptid or unknown creature.
extracted_text: any text visible inside the image (verbatim), or null.

Output ONLY the JSON object.'''


def _placeholder_chunk(chunk_type: str, content_en: str, content_pt_br: str) -> dict:
    """Build the single full-page chunk used when a page has no real content.

    Every field of the chunk schema is present so downstream writers never
    have to special-case placeholders. ``ocr_confidence`` is 0.0 on purpose.
    """
    return {
        "order_in_page": 1,
        "type": chunk_type,
        "content_en": content_en,
        "content_pt_br": content_pt_br,
        "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
        "classification": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "ocr_confidence": 0.0,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
        "image_description_en": None,
        "image_description_pt_br": None,
        "extracted_text": None,
    }


def _as_float(value, default: float) -> float:
    """Return ``float(value)``; use *default* for missing, zero or non-numeric input."""
    try:
        return float(value or default)
    except (TypeError, ValueError):
        return float(default)


def _bbox_numbers(bbox) -> tuple:
    """Return ``(x, y, w, h)`` floats for *bbox*, defaulting to 0 / 0 / 1 / 0.1."""
    box = bbox if isinstance(bbox, dict) else {}
    return (
        _as_float(box.get("x"), 0),
        _as_float(box.get("y"), 0),
        _as_float(box.get("w"), 1),
        _as_float(box.get("h"), 0.1),
    )


def _text(chunk: dict, key: str) -> str:
    """Return ``chunk[key]`` as a string, treating a missing or null value as ''."""
    value = chunk.get(key)
    return "" if value is None else str(value)


def extract_json(text: str) -> dict:
    """Extract the first JSON object from claude CLI output.

    A leading/trailing Markdown fence is stripped when the text starts with
    one, then the object beginning at the first ``{`` is decoded. Decoding
    is delegated to the JSON parser itself, so braces that appear inside
    string values (common in OCR text) do not confuse the extraction, and
    any text after the object is ignored.

    Raises:
        ValueError: if the text contains no ``{``.
        json.JSONDecodeError: (a ``ValueError`` subclass) if the text from
            the first ``{`` onwards is not a valid JSON object.
    """
    text = text.strip()
    # Strip markdown fences if present
    if text.startswith("```"):
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```\s*$", "", text.rstrip())

    start = text.find("{")
    if start == -1:
        raise ValueError(f"No JSON found in: {text[:200]}")

    # raw_decode parses exactly one JSON value starting at `start` and
    # reports where it ended, which is what manual brace counting tried to do.
    obj, _end = json.JSONDecoder().raw_decode(text, start)
    return obj


def call_claude(prompt: str, allowed_tools: str = "Read", timeout: int = TIMEOUT) -> str:
    """Call the ``claude -p`` CLI and return the ``result`` text of its JSON envelope.

    Args:
        prompt: Prompt passed as the final positional argument.
        allowed_tools: Value for ``--allowedTools``.
        timeout: Seconds before the subprocess is killed.

    Raises:
        subprocess.TimeoutExpired: if the CLI exceeds *timeout*.
        RuntimeError: on a non-zero exit code, an envelope that is not a JSON
            object, or an envelope flagged ``is_error``.
        json.JSONDecodeError: if stdout is not valid JSON.
    """
    cmd = [
        "claude", "-p",
        "--model", MODEL,
        "--output-format", "json",
        "--max-turns", "5",
        "--allowedTools", allowed_tools,
        "--add-dir", str(PNG_DIR),
        "--add-dir", str(IMAGES_DIR),
        "--", prompt,
    ]
    # The child inherits the current environment by default.
    res = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=timeout,
        check=False,
    )
    if res.returncode != 0:
        raise RuntimeError(f"claude CLI rc={res.returncode}: {res.stderr[-1000:]}")

    cli = json.loads(res.stdout)
    if not isinstance(cli, dict):
        raise RuntimeError(f"claude error: unexpected CLI output: {res.stdout[:500]}")

    # A null `result` must not reach callers as None: they call str methods on it.
    result = cli.get("result") or ""
    if cli.get("is_error"):
        raise RuntimeError(f"claude error: {str(result)[:500]}")
    return result


def process_page(page_num: int) -> dict:
    """Process a single page using the claude -p CLI.

    Returns a dict ``{"page_number": page_num, "chunks": [...]}``. When the
    page PNG is missing, or every attempt fails, the page holds a single
    placeholder chunk instead, so the caller always gets one entry per page.
    """
    png_path = PNG_DIR / f"p-{page_num:03d}.png"
    if not png_path.exists():
        safe_print(f" Page {page_num}: PNG missing — placeholder")
        return {
            "page_number": page_num,
            "chunks": [_placeholder_chunk(
                "blank_area",
                f"[Page {page_num} — PNG not available]",
                f"[Página {page_num} — PNG não disponível]",
            )],
        }

    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_num,
        total_pages=TOTAL_PAGES,
        doc_id=DOC_ID,
        png_path=str(png_path),
    )

    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            result_text = call_claude(prompt, allowed_tools="Read")
            data = extract_json(result_text)
            chunks = data.get("chunks")
            # Validate chunks exist and are objects; anything else is retried.
            if not isinstance(chunks, list) or not chunks:
                raise ValueError("No chunks in response")
            if not all(isinstance(c, dict) for c in chunks):
                raise ValueError("Non-object chunk in response")
            # Trust our own page number over whatever the model echoed back.
            data["page_number"] = page_num
            safe_print(f" Page {page_num}: {len(chunks)} chunks")
            return data
        except subprocess.TimeoutExpired:
            safe_print(f" Page {page_num}: timeout attempt {attempt}/{max_retries}")
            delay = 10 * attempt
        except (RuntimeError, ValueError, json.JSONDecodeError) as e:
            safe_print(f" Page {page_num}: error attempt {attempt}/{max_retries}: {str(e)[:100]}")
            # Jitter keeps parallel workers from retrying in lock-step.
            delay = 5 * attempt + random.uniform(0, 3)
        if attempt < max_retries:
            time.sleep(delay)

    # Return fallback
    safe_print(f" Page {page_num}: FALLBACK after {max_retries} attempts")
    return {
        "page_number": page_num,
        "chunks": [_placeholder_chunk(
            "unknown",
            f"[Page {page_num} — content extraction failed after {max_retries} attempts]",
            f"[Página {page_num} — extração de conteúdo falhou após {max_retries} tentativas]",
        )],
    }


def crop_image(chunk_id: str, png_path: Path, bbox: dict) -> Optional[Path]:
    """Crop the *bbox* region out of the page PNG and save it under IMAGES_DIR.

    *bbox* holds normalized ``x``/``y``/``w``/``h`` values (0.0-1.0); a small
    padding is added on every side and the box is clamped to the page.

    Returns the path of the saved crop, or ``None`` when the box is
    degenerate or anything goes wrong (the error is logged, not raised).
    """
    from PIL import Image

    cropped_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        x = max(0.0, float(bbox.get("x", 0)))
        y = max(0.0, float(bbox.get("y", 0)))
        w = max(0.01, float(bbox.get("w", 1)))
        h = max(0.01, float(bbox.get("h", 0.1)))
        pad = 0.005

        # Context manager closes the underlying file handle.
        with Image.open(png_path) as im:
            W, H = im.size
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))

            if right <= left or bottom <= top:
                safe_print(f" Crop {chunk_id}: degenerate bbox {bbox}")
                return None

            im.crop((left, top, right, bottom)).save(str(cropped_path))

        safe_print(f" Cropped {chunk_id}: {left},{top},{right},{bottom} from {W}x{H}")
        return cropped_path
    except Exception as e:
        # Best effort: one bad crop must not abort the whole rebuild.
        safe_print(f" Crop {chunk_id}: error: {e}")
        return None


def analyze_image(chunk_id: str, cropped_path: Optional[Path]) -> dict:
    """Analyze a cropped image using the claude -p CLI.

    Returns the parsed analysis dict, or ``{}`` when there is no crop or
    both attempts fail.
    """
    if not cropped_path or not cropped_path.exists():
        return {}

    prompt = IMAGE_ANALYST_PROMPT.format(image_path=str(cropped_path))

    max_retries = 2
    for attempt in range(1, max_retries + 1):
        try:
            result_text = call_claude(prompt, allowed_tools="Read", timeout=120)
            data = extract_json(result_text)
            safe_print(f" Image {chunk_id}: analyzed (ufo={data.get('ufo_anomaly_detected', False)})")
            return data
        except Exception as e:
            # Best effort: image analysis is optional enrichment.
            safe_print(f" Image {chunk_id}: error attempt {attempt}: {str(e)[:80]}")
            if attempt < max_retries:
                time.sleep(5)
    return {}


def _render_chunk_file(chunk: dict) -> str:
    """Render the front matter and bilingual body of one chunk file."""
    page_num = chunk.get("page", 1)
    x, y, w, h = _bbox_numbers(chunk.get("bbox"))

    # Only a missing/unusable confidence falls back to the default; a real
    # 0.0 (used by failure placeholders) must be written as 0.0.
    raw_conf = chunk.get("ocr_confidence")
    try:
        confidence = _DEFAULT_OCR_CONFIDENCE if raw_conf is None else float(raw_conf)
    except (TypeError, ValueError):
        confidence = _DEFAULT_OCR_CONFIDENCE

    def as_json(key, default=None):
        return json.dumps(chunk.get(key, default))

    def as_flag(key):
        return str(bool(chunk.get(key, False))).lower()

    lines = [
        "---",
        f"chunk_id: {chunk['chunk_id']}",
        f"type: {chunk.get('type', 'paragraph')}",
        f"page: {page_num}",
        f"order_in_page: {chunk.get('order_in_page', 1)}",
        f"order_global: {chunk.get('order_global', 1)}",
        f"bbox: {{x: {x:.2f}, y: {y:.2f}, w: {w:.2f}, h: {h:.2f}}}",
        f"classification: {as_json('classification')}",
        f"formatting: {as_json('formatting', [])}",
        f"cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {as_json('prev_chunk')}",
        f"next_chunk: {as_json('next_chunk')}",
        f"related_image: {as_json('related_image')}",
        "related_table: null",
        f"ocr_confidence: {confidence}",
        f"ocr_source_lines: {as_json('ocr_source_lines', [])}",
        f"redaction_code: {as_json('redaction_code')}",
        f"redaction_inferred_content_type: {as_json('redaction_inferred_content_type')}",
        f"image_type: {as_json('image_type')}",
        f"ufo_anomaly_detected: {as_flag('ufo_anomaly_detected')}",
        f"cryptid_anomaly_detected: {as_flag('cryptid_anomaly_detected')}",
        f"ufo_anomaly_type: {as_json('ufo_anomaly_type')}",
        f"ufo_anomaly_rationale: {as_json('ufo_anomaly_rationale')}",
        f"cryptid_anomaly_type: {as_json('cryptid_anomaly_type')}",
        f"cryptid_anomaly_rationale: {as_json('cryptid_anomaly_rationale')}",
        f"image_description_en: {as_json('image_description_en')}",
        f"image_description_pt_br: {as_json('image_description_pt_br')}",
        f"extracted_text: {as_json('extracted_text')}",
        f"source_png: ../../processing/png/{DOC_ID}/p-{page_num:03d}.png",
        "---",
        "",
        f"**EN:** {_text(chunk, 'content_en')}",
        "",
        f"**PT-BR:** {_text(chunk, 'content_pt_br')}",
        "",
    ]
    return "\n".join(lines)


def write_chunk_file(chunk: dict) -> None:
    """Write the chunk's .md file to ``CHUNKS_DIR/<chunk_id>.md``.

    *chunk* must already carry ``chunk_id``; every other field is optional
    and falls back to the schema default.
    """
    chunk_path = CHUNKS_DIR / f"{chunk['chunk_id']}.md"
    chunk_path.write_text(_render_chunk_file(chunk), encoding="utf-8")


def _process_all_pages() -> dict:
    """Run ``process_page`` for pages 1..TOTAL_PAGES, WORKERS pages per batch."""
    page_numbers = list(range(1, TOTAL_PAGES + 1))
    total_batches = (len(page_numbers) + WORKERS - 1) // WORKERS
    all_page_data = {}

    for batch_start in range(0, len(page_numbers), WORKERS):
        batch = page_numbers[batch_start:batch_start + WORKERS]
        batch_num = batch_start // WORKERS + 1
        safe_print(f"\nBatch {batch_num}/{total_batches}: pages {batch}")

        with ThreadPoolExecutor(max_workers=WORKERS) as executor:
            futures = {executor.submit(process_page, p): p for p in batch}
            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    all_page_data[page_num] = future.result()
                except Exception as e:
                    # process_page handles its expected failures itself;
                    # anything reaching here is unexpected but must not
                    # lose the page slot.
                    safe_print(f" Page {page_num}: CRITICAL FAILURE: {e}")
                    all_page_data[page_num] = {
                        "page_number": page_num,
                        "chunks": [_placeholder_chunk(
                            "unknown",
                            f"[Page {page_num} — critical failure]",
                            f"[Página {page_num} — falha crítica]",
                        )],
                    }
    return all_page_data


def _number_chunks(all_page_data: dict) -> list:
    """Flatten pages into one list, assigning chunk_id, page, order and prev/next."""
    def order_key(chunk):
        # Model output may hold null or strings here; never let that break the sort.
        return _as_float(chunk.get("order_in_page", 1), 1)

    all_chunks = []
    for page_num in sorted(all_page_data):
        page_chunks = all_page_data[page_num].get("chunks", [])
        for chunk in sorted(page_chunks, key=order_key):
            position = len(all_chunks) + 1
            chunk["chunk_id"] = f"c{position:04d}"
            chunk["page"] = page_num
            chunk["order_global"] = position
            all_chunks.append(chunk)

    last = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last else None
    return all_chunks


def _crop_and_analyze_images(all_chunks: list) -> None:
    """Crop every image chunk, analyse the crops, and merge results into the chunks."""
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"\nCropping {len(image_chunks)} images...")

    # Crop all images first
    crop_results = {}
    for chunk in image_chunks:
        chunk_id = chunk["chunk_id"]
        png_path = PNG_DIR / f"p-{chunk['page']:03d}.png"
        if png_path.exists():
            crop_results[chunk_id] = crop_image(chunk_id, png_path, chunk.get("bbox") or {})
        else:
            crop_results[chunk_id] = None

    # Analyze images in batches
    image_items = [(cid, cp) for cid, cp in crop_results.items() if cp]
    print(f"\nAnalyzing {len(image_items)} cropped images...")

    image_analysis = {}
    for batch_start in range(0, len(image_items), WORKERS):
        batch = image_items[batch_start:batch_start + WORKERS]
        with ThreadPoolExecutor(max_workers=WORKERS) as executor:
            futures = {executor.submit(analyze_image, cid, cp): cid for cid, cp in batch}
            for future in as_completed(futures):
                chunk_id = futures[future]
                try:
                    image_analysis[chunk_id] = future.result()
                except Exception as e:
                    safe_print(f" Image analysis {chunk_id}: {e}")
                    image_analysis[chunk_id] = {}

    # Merge image analysis into chunks
    for chunk in image_chunks:
        chunk_id = chunk["chunk_id"]
        chunk["related_image"] = f"IMG-{chunk_id}.png"
        analysis = image_analysis.get(chunk_id, {})
        for field in _IMAGE_ANALYSIS_FIELDS:
            if field in analysis:
                chunk[field] = analysis[field]


def _write_index(all_chunks: list, now_iso: str) -> None:
    """Write ``_index.json`` with one summary entry per chunk."""
    default_bbox = {"x": 0, "y": 0, "w": 1, "h": 0.1}
    index_chunks = [
        {
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": chunk.get("bbox") or dict(default_bbox),
            # content_en may be null in model output; slicing None would crash.
            "preview": _text(chunk, "content_en")[:80],
        }
        for chunk in all_chunks
    ]
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": now_iso,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index_data, ensure_ascii=False, indent=2), encoding="utf-8"
    )


def _assemble_document(all_chunks: list, chunk_types: dict, ufo_anomalies: list,
                       cryptid_anomalies: list, now_iso: str) -> str:
    """Build the text of document.md: YAML front matter plus one section per chunk."""
    # Frontmatter
    parts = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {len(all_chunks)}",
        "chunk_types_histogram:",
    ]
    parts.extend(
        f"  {t}: {count}"
        for t, count in sorted(chunk_types.items(), key=lambda kv: str(kv[0]))
    )
    parts.extend([
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_anomalies)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomalies)}",
        'build_approach: "subagents"',
        "build_model: claude-haiku-4-5",
        f"build_at: {now_iso}",
        "---",
        "",
    ])

    current_page = None
    for chunk in all_chunks:
        page = chunk.get("page", 1)
        if page != current_page:
            current_page = page
            parts.append(f"\n## Page {page}\n")

        chunk_id = chunk["chunk_id"]
        x, y, w, h = _bbox_numbers(chunk.get("bbox"))
        bbox_str = f"{x:.2f}/{y:.2f}/{w:.2f}/{h:.2f}"
        ctype = chunk.get("type", "paragraph")

        # NOTE(review): these two entries are empty strings in the reviewed
        # copy; they look like per-chunk marker/anchor lines whose markup was
        # lost. Kept empty on purpose - restore from the original if so.
        parts.append("")
        parts.append("")
        parts.append(f"### Chunk {chunk_id} — {ctype} · p{page} · bbox: {bbox_str}")
        parts.append("")
        parts.append(f"**EN:** {_text(chunk, 'content_en')}")
        parts.append("")
        parts.append(f"**PT-BR:** {_text(chunk, 'content_pt_br')}")
        parts.append("")

        if ctype == "image":
            img_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
            if img_path.exists():
                parts.append(f"![chunk image](./images/IMG-{chunk_id}.png)")
                parts.append("")
            if chunk.get("image_description_en"):
                parts.append(f"*{chunk['image_description_en']}*")
                parts.append("")

        # Metadata block
        meta = {k: v for k, v in chunk.items() if k not in ("content_en", "content_pt_br")}
        # NOTE(review): the opening/closing literals of this block were
        # damaged in the reviewed copy (only the word "metadata" survived).
        # Reconstructed as a collapsible <details> section - confirm.
        parts.append("<details><summary>metadata</summary>")
        parts.append("")
        parts.append("```json")
        parts.append(json.dumps(meta, ensure_ascii=False, indent=2))
        parts.append("```")
        parts.append("")
        parts.append("</details>")
        parts.append("")
        parts.append("---")
        parts.append("")

    return "\n".join(parts)


def main():
    """Rebuild the document end to end and print a one-line summary."""
    start_time = time.time()

    CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    TABLES_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Rebuilding {DOC_ID}")
    print(f"Processing {TOTAL_PAGES} pages with {WORKERS} workers...")
    print("=" * 70)

    all_page_data = _process_all_pages()

    print("\nAll pages processed. Assigning global chunk IDs...")
    all_chunks = _number_chunks(all_page_data)
    total_chunks = len(all_chunks)
    print(f"Total chunks: {total_chunks}")

    _crop_and_analyze_images(all_chunks)

    # Write chunk files
    print(f"\nWriting {total_chunks} chunk files...")
    for chunk in all_chunks:
        write_chunk_file(chunk)
    print("Chunk files written.")

    # Build _index.json
    now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    _write_index(all_chunks, now_iso)
    print("_index.json written.")

    # Compute stats
    chunk_types = {}
    ufo_anomalies = []
    cryptid_anomalies = []
    images_count = 0
    for chunk in all_chunks:
        t = chunk.get("type", "paragraph")
        chunk_types[t] = chunk_types.get(t, 0) + 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_anomalies.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_anomalies.append(chunk["chunk_id"])
        if t == "image":
            images_count += 1

    # Assemble document.md
    print("\nAssembling document.md...")
    document_md = _assemble_document(
        all_chunks, chunk_types, ufo_anomalies, cryptid_anomalies, now_iso
    )
    (OUT_DIR / "document.md").write_text(document_md, encoding="utf-8")
    doc_md_bytes = len(document_md.encode("utf-8"))
    print(f"document.md written ({doc_md_bytes:,} bytes)")

    wall_seconds = int(time.time() - start_time)
    print(f"\n{'='*70}")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={images_count}, tables_stitched=0, ufo_anomalies={len(ufo_anomalies)}, cryptid_anomalies={len(cryptid_anomalies)}, wall_seconds={wall_seconds}")


if __name__ == "__main__":
    main()