#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild doc-65-hs1-834228961-62-hq-83894-serial-130
Processes all 91 pages via Claude vision, produces chunks/_index.json/document.md
"""

import os
import re
import sys
import json
import base64
import time
import concurrent.futures
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

import anthropic

DOC_ID = "doc-65-hs1-834228961-62-hq-83894-serial-130"
DOC_TITLE = "HQ Air Defense Command - Unidentified Flying Objects Reports (65-HS1-834228961 / 62-HQ-83894 Serial 130)"

PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")
RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
CHUNKS_DIR = RAW_DIR / "chunks"
IMAGES_DIR = RAW_DIR / "images"
TABLES_DIR = RAW_DIR / "tables"

# Single source of truth for values that are both sent to the API and
# recorded in the build metadata of _index.json / document.md.
BUILD_MODEL = "claude-sonnet-4-6"
SCHEMA_VERSION = "0.2.0"
MAX_OUTPUT_TOKENS = 4096

# Pages are sent to the API in groups of this size, one worker per page.
_BATCH_SIZE = 5

# Whole-page bounding box used whenever the model omits a bbox field or
# returns something that is not a number for it.
_DEFAULT_BBOX = {"x": 0, "y": 0, "w": 1, "h": 1}

for d in [CHUNKS_DIR, IMAGES_DIR, TABLES_DIR]:
    d.mkdir(parents=True, exist_ok=True)

client = anthropic.Anthropic()


def encode_image(path: Path) -> str:
    """Return the file at *path* as a base64-encoded ASCII string."""
    with open(path, "rb") as f:
        return base64.standard_b64encode(f.read()).decode("utf-8")


# Template filled with str.format(); literal JSON braces are doubled.
# NOTE(review): the example value for "type" is an empty string here; it looks
# like an angle-bracket placeholder was lost when this file was extracted --
# confirm against the original before relying on it.
PAGE_REBUILDER_PROMPT = '''You are a page-rebuilder subagent. Analyze this document page image and extract ALL content as structured chunks.

Document: {doc_title}
Doc ID: {doc_id}
Page number (in sequence): {page_number} of {total_pages}
Source PNG filename: {png_filename}

Return a JSON object with this exact structure:
{{
  "page_number": {page_number},
  "png_filename": "{png_filename}",
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "",
      "content_en": "...",
      "content_pt_br": "...",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.9,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null,
      "image_description_en": null,
      "image_description_pt_br": null,
      "extracted_text": null
    }}
  ]
}}

Allowed chunk types: letterhead, classification_banner, subject_line, body_paragraph, list_item, signature_block, date_line, address_block, header, footer, redaction_block, table_marker, image, stamp, handwritten_note, page_number_marker, blank

Rules:
1. Create ONE chunk per distinct visual/logical unit. Do not merge unrelated blocks.
2. For classification banners (TOP SECRET, SECRET, CONFIDENTIAL, etc.) at top/bottom of page: type=classification_banner, fill classification field.
3. For any image/photo/diagram/map/sketch: type=image, fill image_type, image_description_en, image_description_pt_br, ufo_anomaly_detected, cryptid_anomaly_detected.
4. For redacted/blacked-out areas: type=redaction_block, fill redaction_code if visible.
5. content_en = exact English transcription of text, verbatim. content_pt_br = Brazilian Portuguese translation of content_en (NOT translation of classification banners/stamps/codes — keep those verbatim in both fields).
6. bbox: normalized coordinates (0.0-1.0): x=left, y=top, w=width, h=height relative to page.
7. formatting: array of applicable: bold, italic, underline, all_caps, strikethrough, handwritten.
8. For cross_page_hint: "continues_to_next" if text clearly continues on next page, "continues_from_prev" if it continues from previous page, "self_contained" otherwise.
9. ocr_confidence: your confidence in the transcription (0.0-1.0).
10. If page is blank: return single chunk type=blank.
11. ufo_anomaly_detected: true if the chunk contains or depicts a UAP/UFO, unidentified aerial phenomenon, unknown object in sky, or anomalous craft. Set ufo_anomaly_type and ufo_anomaly_rationale.
12. IMPORTANT: Return ONLY valid JSON, no markdown code blocks, no explanation.'''


def _is_number(value) -> bool:
    """Return True for int/float values, excluding bool."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _bbox_of(chunk: dict) -> dict:
    """Return the chunk's bbox as a dict with numeric x, y, w, h.

    The model's JSON may carry ``"bbox": null``, a non-dict, or non-numeric
    members; ``dict.get(key, default)`` does not protect against any of
    those. Each unusable member falls back to the whole-page default.
    """
    raw = chunk.get("bbox")
    if not isinstance(raw, dict):
        raw = {}
    return {
        key: raw[key] if _is_number(raw.get(key)) else default
        for key, default in _DEFAULT_BBOX.items()
    }


def _text_of(chunk: dict, key: str) -> str:
    """Return chunk[key] as text, mapping a missing or null value to ''."""
    value = chunk.get(key)
    return "" if value is None else str(value)


def _order_in_page(chunk: dict):
    """Sort key for chunks within a page; unusable values sort as 0."""
    value = chunk.get("order_in_page")
    return value if _is_number(value) else 0


def _page_sort_key(png_filename: str):
    """Order page files by their first embedded integer, then by name.

    Identical to a plain string sort when every name is zero-padded to the
    same width, but also keeps p-2 ahead of p-10 when they are not.
    """
    match = re.search(r"\d+", png_filename)
    return (int(match.group()) if match else float("inf"), png_filename)


def _strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence from *text*, if present."""
    raw_text = text.strip()
    if raw_text.startswith("```"):
        lines = raw_text.split("\n")[1:]  # drop the opening fence line
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        raw_text = "\n".join(lines)
    return raw_text


def _error_page(page_index: int, png_filename: str, error: Exception) -> dict:
    """Build the placeholder result recorded for a page that failed."""
    reason = str(error)[:100]
    return {
        "page_number": page_index,
        "page_index": page_index,
        "png_filename": png_filename,
        "chunks": [
            {
                "order_in_page": 1,
                "type": "blank",
                "content_en": f"[Page processing error: {reason}]",
                "content_pt_br": f"[Erro de processamento: {reason}]",
                "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
                "classification": None,
                "formatting": [],
                "cross_page_hint": "self_contained",
                "ocr_confidence": 0.0,
                "ocr_source_lines": [],
                "redaction_code": None,
                "redaction_inferred_content_type": None,
                "image_type": None,
                "ufo_anomaly_detected": False,
                "ufo_anomaly_type": None,
                "ufo_anomaly_rationale": None,
                "cryptid_anomaly_detected": False,
                "cryptid_anomaly_type": None,
                "cryptid_anomaly_rationale": None,
                "image_description_en": None,
                "image_description_pt_br": None,
                "extracted_text": None,
            }
        ],
    }


def process_page(page_index: int, png_filename: str, total_pages: int) -> dict:
    """Process a single page and return its chunks.

    Sends the page PNG plus the rebuilder prompt to the model and parses the
    JSON reply.

    Args:
        page_index: 1-based position of the page in the document.
        png_filename: File name of the page image inside PNG_DIR.
        total_pages: Number of pages in the document (prompt context only).

    Returns:
        A dict with "page_index", "png_filename" and a "chunks" list of
        dicts. Never raises: on any failure the error is printed to stderr
        and a single placeholder chunk describing the error is returned.
    """
    png_path = PNG_DIR / png_filename
    try:
        img_data = encode_image(png_path)
        prompt = PAGE_REBUILDER_PROMPT.format(
            doc_title=DOC_TITLE,
            doc_id=DOC_ID,
            page_number=page_index,
            total_pages=total_pages,
            png_filename=png_filename,
        )
        response = client.messages.create(
            model=BUILD_MODEL,
            max_tokens=MAX_OUTPUT_TOKENS,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_data,
                            },
                        },
                        {"type": "text", "text": prompt},
                    ],
                }
            ],
        )
        # A reply cut off at the token limit is almost never valid JSON; say
        # so explicitly instead of surfacing an opaque JSONDecodeError.
        if getattr(response, "stop_reason", None) == "max_tokens":
            raise ValueError(
                f"model output truncated at max_tokens={MAX_OUTPUT_TOKENS}"
            )

        result = json.loads(_strip_code_fence(response.content[0].text))
        if not isinstance(result, dict):
            raise ValueError(
                f"expected a JSON object, got {type(result).__name__}"
            )
        chunks = result.get("chunks")
        if not isinstance(chunks, list):
            raise ValueError("response has no 'chunks' list")
        # Later stages index into every chunk as a dict.
        result["chunks"] = [c for c in chunks if isinstance(c, dict)]
        result["page_index"] = page_index
        result["png_filename"] = png_filename
        return result
    except Exception as e:
        print(f"  ERROR page {page_index} ({png_filename}): {e}", file=sys.stderr)
        # Return minimal fallback
        return _error_page(page_index, png_filename, e)


def _process_all_pages(png_files: list, total_pages: int) -> dict:
    """Run process_page over every file and return {page index: result}."""
    results = {}
    for batch_start in range(0, total_pages, _BATCH_SIZE):
        batch = png_files[batch_start:batch_start + _BATCH_SIZE]
        first_page = batch_start + 1
        batch_indices = list(range(first_page, first_page + len(batch)))
        print(
            f"  Batch {batch_start // _BATCH_SIZE + 1}: "
            f"pages {batch_indices[0]}-{batch_indices[-1]} ({batch})"
        )
        with concurrent.futures.ThreadPoolExecutor(max_workers=_BATCH_SIZE) as executor:
            futures = {
                executor.submit(process_page, idx, fname, total_pages): (idx, fname)
                for idx, fname in zip(batch_indices, batch)
            }
            for future in concurrent.futures.as_completed(futures):
                idx, fname = futures[future]
                try:
                    # as_completed only yields finished futures, so no
                    # timeout is needed here.
                    result = future.result()
                except Exception as e:
                    print(f"    FAILED page {idx} ({fname}): {e}", file=sys.stderr)
                    # Keep the page visible in the output instead of
                    # silently dropping it.
                    result = _error_page(idx, fname, e)
                results[idx] = result
                chunk_count = len(result.get("chunks") or [])
                print(f"    Page {idx} ({fname}): {chunk_count} chunks")
    return results


def _number_chunks(all_page_results: dict) -> list:
    """Flatten page results into one list and assign ids, order and links."""
    all_chunks = []
    for page_idx in sorted(all_page_results):
        page_data = all_page_results[page_idx]
        png_filename = page_data.get("png_filename", f"p-{page_idx:03d}.png")
        page_chunks = [
            c for c in (page_data.get("chunks") or []) if isinstance(c, dict)
        ]
        page_chunks.sort(key=_order_in_page)
        for chunk in page_chunks:
            global_order = len(all_chunks) + 1
            chunk["chunk_id"] = f"c{global_order:04d}"
            chunk["page"] = page_idx
            chunk["order_global"] = global_order
            chunk["source_png"] = f"../../processing/png/{DOC_ID}/{png_filename}"
            all_chunks.append(chunk)

    # Set prev/next pointers
    chunk_ids = [c["chunk_id"] for c in all_chunks]
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = chunk_ids[i - 1] if i > 0 else None
        chunk["next_chunk"] = chunk_ids[i + 1] if i + 1 < len(chunk_ids) else None
    return all_chunks


def _crop_images(image_chunks: list, all_page_results: dict) -> None:
    """Crop each image chunk's bbox out of its page PNG into IMAGES_DIR.

    Sets chunk["related_image"] to the written file name, or to None when
    the crop fails; failures are reported on stderr and do not stop the run.
    """
    pad = 0.005  # small margin so the crop does not clip the region's edge
    for chunk in image_chunks:
        chunk_id = chunk["chunk_id"]
        png_filename = all_page_results[chunk["page"]]["png_filename"]
        out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
        bbox = _bbox_of(chunk)
        x, y, w, h = bbox["x"], bbox["y"], bbox["w"], bbox["h"]
        try:
            # Imported here so a missing Pillow only disables cropping.
            from PIL import Image

            # The context manager releases the file handle of every page.
            with Image.open(PNG_DIR / png_filename) as im:
                W, H = im.size
                box = (
                    max(0, int((x - pad) * W)),
                    max(0, int((y - pad) * H)),
                    min(W, int((x + w + pad) * W)),
                    min(H, int((y + h + pad) * H)),
                )
                if box[2] <= box[0] or box[3] <= box[1]:
                    raise ValueError(f"empty crop region {box}")
                im.crop(box).save(str(out_path))
            chunk["related_image"] = out_path.name
            print(f"  Cropped {chunk_id} from {png_filename}")
        except Exception as e:
            print(f"  CROP ERROR {chunk_id}: {e}", file=sys.stderr)
            chunk["related_image"] = None


def _render_chunk_file(chunk: dict) -> str:
    """Return the front-matter + bilingual body text of one chunk file."""
    bbox = _bbox_of(chunk)
    return f"""---
chunk_id: {chunk['chunk_id']}
type: {chunk.get('type', 'body_paragraph')}
page: {chunk.get('page', 1)}
order_in_page: {chunk.get('order_in_page', 1)}
order_global: {chunk.get('order_global', 1)}
bbox: {{x: {bbox['x']:.3f}, y: {bbox['y']:.3f}, w: {bbox['w']:.3f}, h: {bbox['h']:.3f}}}
classification: {json.dumps(chunk.get('classification'))}
formatting: {json.dumps(chunk.get('formatting', []))}
cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}
prev_chunk: {json.dumps(chunk.get('prev_chunk'))}
next_chunk: {json.dumps(chunk.get('next_chunk'))}
related_image: {json.dumps(chunk.get('related_image'))}
related_table: {json.dumps(chunk.get('related_table'))}
ocr_confidence: {chunk.get('ocr_confidence', 0.9)}
ocr_source_lines: {json.dumps(chunk.get('ocr_source_lines', []))}
redaction_code: {json.dumps(chunk.get('redaction_code'))}
redaction_inferred_content_type: {json.dumps(chunk.get('redaction_inferred_content_type'))}
image_type: {json.dumps(chunk.get('image_type'))}
ufo_anomaly_detected: {str(chunk.get('ufo_anomaly_detected', False)).lower()}
cryptid_anomaly_detected: {str(chunk.get('cryptid_anomaly_detected', False)).lower()}
ufo_anomaly_type: {json.dumps(chunk.get('ufo_anomaly_type'))}
ufo_anomaly_rationale: {json.dumps(chunk.get('ufo_anomaly_rationale'))}
cryptid_anomaly_type: {json.dumps(chunk.get('cryptid_anomaly_type'))}
cryptid_anomaly_rationale: {json.dumps(chunk.get('cryptid_anomaly_rationale'))}
image_description_en: {json.dumps(chunk.get('image_description_en'))}
image_description_pt_br: {json.dumps(chunk.get('image_description_pt_br'))}
extracted_text: {json.dumps(chunk.get('extracted_text'))}
source_png: {chunk.get('source_png', '')}
---

**EN:** {_text_of(chunk, 'content_en')}

**PT-BR:** {_text_of(chunk, 'content_pt_br')}
"""


def _write_chunk_files(all_chunks: list) -> None:
    """Write one <chunk_id>.md file per chunk into CHUNKS_DIR."""
    for chunk in all_chunks:
        chunk_path = CHUNKS_DIR / f"{chunk['chunk_id']}.md"
        chunk_path.write_text(_render_chunk_file(chunk), encoding="utf-8")


def _write_index(all_chunks: list, total_pages: int, build_at: str) -> None:
    """Write RAW_DIR/_index.json summarising every chunk."""
    index_chunks = []
    for chunk in all_chunks:
        bbox = _bbox_of(chunk)
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "body_paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": {key: round(bbox[key], 3) for key in ("x", "y", "w", "h")},
            # content_en may be null for image/blank chunks.
            "preview": _text_of(chunk, "content_en")[:80],
        })

    index_data = {
        "doc_id": DOC_ID,
        "schema_version": SCHEMA_VERSION,
        "total_pages": total_pages,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": BUILD_MODEL,
        "build_at": build_at,
        "chunks": index_chunks,
    }
    index_path = RAW_DIR / "_index.json"
    index_path.write_text(
        json.dumps(index_data, ensure_ascii=False, indent=2), encoding="utf-8"
    )


def _render_document(
    all_chunks: list,
    all_page_results: dict,
    total_pages: int,
    build_at: str,
    type_hist: dict,
    ufo_anomaly_chunks: list,
    cryptid_anomaly_chunks: list,
) -> str:
    """Return the full text of document.md (front matter plus all chunks)."""
    doc_lines = []
    doc_lines.append(f"""---
schema_version: "{SCHEMA_VERSION}"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {total_pages}
total_chunks: {len(all_chunks)}
chunk_types_histogram: {json.dumps(type_hist)}
multi_page_tables: []
ufo_anomalies_flagged: {json.dumps(ufo_anomaly_chunks)}
cryptid_anomalies_flagged: {json.dumps(cryptid_anomaly_chunks)}
build_approach: "subagents"
build_model: "{BUILD_MODEL}"
build_at: "{build_at}"
---
""")

    current_page = None
    for chunk in all_chunks:
        page = chunk.get("page")
        if page != current_page:
            current_page = page
            png_fn = all_page_results.get(page, {}).get(
                "png_filename", f"p-{page:03d}.png"
            )
            doc_lines.append(f"\n## Page {page} (source: {png_fn})\n")

        chunk_id = chunk["chunk_id"]
        ctype = chunk.get("type", "body_paragraph")
        bbox = _bbox_of(chunk)
        bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

        # NOTE(review): these two appends are empty strings in the file as
        # received; they look like per-chunk markup (e.g. a comment marker
        # and an anchor) that was stripped in extraction -- confirm and
        # restore from the original if so.
        doc_lines.append("")
        doc_lines.append("")
        doc_lines.append(
            f"### Chunk {chunk_id} — {ctype} · p{page} · bbox: {bbox_str}\n"
        )
        doc_lines.append(f"**EN:** {_text_of(chunk, 'content_en')}\n")
        doc_lines.append(f"**PT-BR:** {_text_of(chunk, 'content_pt_br')}\n")

        # Embed image if applicable
        if ctype == "image" and chunk.get("related_image"):
            img_file = chunk["related_image"]
            doc_lines.append(f"![{chunk_id} image](./images/{img_file})\n")
            if chunk.get("image_description_en"):
                doc_lines.append(
                    f"*Image description: {chunk['image_description_en']}*\n"
                )

        # Metadata details block
        meta = {
            "chunk_id": chunk_id,
            "type": ctype,
            "page": page,
            "order_in_page": chunk.get("order_in_page"),
            "order_global": chunk.get("order_global"),
            "bbox": chunk.get("bbox"),
            "classification": chunk.get("classification"),
            "formatting": chunk.get("formatting", []),
            "cross_page_hint": chunk.get("cross_page_hint"),
            "prev_chunk": chunk.get("prev_chunk"),
            "next_chunk": chunk.get("next_chunk"),
            "related_image": chunk.get("related_image"),
            "related_table": chunk.get("related_table"),
            "ocr_confidence": chunk.get("ocr_confidence"),
            "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
            "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
            "ufo_anomaly_rationale": chunk.get("ufo_anomaly_rationale"),
            "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
            "cryptid_anomaly_type": chunk.get("cryptid_anomaly_type"),
            "cryptid_anomaly_rationale": chunk.get("cryptid_anomaly_rationale"),
            "image_description_en": chunk.get("image_description_en"),
            "image_description_pt_br": chunk.get("image_description_pt_br"),
            "source_png": chunk.get("source_png"),
        }
        # NOTE(review): the strings around the JSON block appear to have lost
        # wrapper tags (likely a collapsible details/summary pair) in
        # extraction; reproduced as received -- confirm against the original.
        doc_lines.append("\nmetadata\n")
        doc_lines.append("```json")
        doc_lines.append(json.dumps(meta, ensure_ascii=False, indent=2))
        doc_lines.append("```\n")
        doc_lines.append("\n\n")
        doc_lines.append("---\n")

    return "\n".join(doc_lines)


def main():
    """Rebuild the document: OCR every page, then write all output files.

    Writes chunks/<id>.md, images/IMG-<id>.png, _index.json and document.md
    under RAW_DIR and prints progress plus a final STATS line. Exits with a
    non-zero status if PNG_DIR holds no page images, so that existing
    outputs are not overwritten with an empty build.
    """
    start_time = time.time()

    # Get all PNG files in page order
    png_files = sorted((f.name for f in PNG_DIR.glob("p-*.png")), key=_page_sort_key)
    total_pages = len(png_files)
    if not png_files:
        sys.exit(f"No page images matching p-*.png found in {PNG_DIR}")
    print(f"Processing {total_pages} pages for {DOC_ID}")

    # Process in parallel batches
    all_page_results = _process_all_pages(png_files, total_pages)

    # Globally number chunks
    print("\nNumbering chunks globally...")
    all_chunks = _number_chunks(all_page_results)

    # Detect image chunks for cropping
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"\nFound {len(image_chunks)} image chunks")

    # Crop images using PIL
    print("Cropping image regions...")
    _crop_images(image_chunks, all_page_results)

    # Every chunk carries both relation keys, null when not applicable
    for chunk in all_chunks:
        chunk.setdefault("related_image", None)
        chunk.setdefault("related_table", None)

    # Write individual chunk files
    print("\nWriting chunk files...")
    _write_chunk_files(all_chunks)
    print(f"  Wrote {len(all_chunks)} chunk files")

    # Build _index.json
    print("\nBuilding _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    _write_index(all_chunks, total_pages, build_at)
    print(f"  Wrote _index.json with {len(all_chunks)} chunks")

    # Compute histogram
    type_hist = dict(Counter(c.get("type", "unknown") for c in all_chunks))

    # Collect anomaly lists
    ufo_anomaly_chunks = [
        c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")
    ]
    cryptid_anomaly_chunks = [
        c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")
    ]

    # Assemble document.md
    print("\nAssembling document.md...")
    doc_content = _render_document(
        all_chunks,
        all_page_results,
        total_pages,
        build_at,
        type_hist,
        ufo_anomaly_chunks,
        cryptid_anomaly_chunks,
    )
    doc_path = RAW_DIR / "document.md"
    doc_path.write_text(doc_content, encoding="utf-8")

    wall_seconds = int(time.time() - start_time)
    doc_bytes = len(doc_content.encode("utf-8"))

    print("\nDone!")
    print(f"  Chunks: {len(all_chunks)}")
    print(f"  Images: {len(image_chunks)}")
    print(f"  UFO anomalies: {len(ufo_anomaly_chunks)}")
    print(f"  Cryptid anomalies: {len(cryptid_anomaly_chunks)}")
    print(f"  document.md: {doc_bytes} bytes")
    print(f"  Wall time: {wall_seconds}s")
    print(
        f"\nSTATS pages={total_pages} chunks={len(all_chunks)} "
        f"images={len(image_chunks)} tables=0 ufo={len(ufo_anomaly_chunks)} "
        f"cryptid={len(cryptid_anomaly_chunks)} doc_md_bytes={doc_bytes}"
    )


if __name__ == "__main__":
    main()