#!/usr/bin/env python3
"""
Rebuild script for doc-65-hs1-834228961-62-hq-83894-section-3

Processes all 155 pages in parallel batches, generates chunks, images, and index.
"""

import base64
import concurrent.futures
import json
import os
import time
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import anthropic

DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-3"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 3 — FBI Flying Discs Investigation File"
TOTAL_PAGES = 155

PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

# Values that used to be repeated as literals throughout the script.
BUILD_MODEL = "claude-haiku-4-5"
SCHEMA_VERSION = "0.2.0"
MAX_WORKERS = 4
MAX_RETRIES = 3

client = anthropic.Anthropic()

# Rule 2 of PAGE_REBUILDER_PROMPT spells out the same list; keep both in sync.
CHUNK_TYPES = [
    "letterhead", "header", "classification_banner", "subject_line",
    "salutation", "body_paragraph", "signature_block", "handwritten_note",
    "stamp", "redaction_block", "image", "table_marker", "footer",
    "page_number", "attachment_label", "routing_slip", "blank", "caption",
    "list_item", "address_block",
]

# NOTE(review): the line breaks of this prompt were restored while reformatting
# the file; the wording is unchanged. The empty template values ("classification",
# "page_type", "type", ...) may originally have carried placeholder hints --
# confirm against the authoritative copy of the script.
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder agent for a declassified FBI UAP/UFO document archive.

Your task: Analyze the provided page image and extract ALL content into structured chunks.

Document: {doc_title}
Page: {page_number} of {total_pages}
Page PNG path: {page_png_path}

Return a JSON object with this exact structure:
{{
  "page_number": {page_number},
  "classification": "",
  "page_type": "",
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "",
      "content_en": "",
      "content_pt_br": "",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": "",
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.9,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }}
  ]
}}

RULES:
1. Extract ALL content — no chunk can be skipped.
2. Use ONLY these chunk types: letterhead, header, classification_banner, subject_line, salutation, body_paragraph, signature_block, handwritten_note, stamp, redaction_block, image, table_marker, footer, page_number, attachment_label, routing_slip, blank, caption, list_item, address_block
3. bbox values are normalized 0.0-1.0 (x=left, y=top, w=width, h=height of the page).
4. content_en: verbatim transcription for text, description for images.
5. content_pt_br: Brazilian Portuguese translation of content_en (NOT European Portuguese). For verbatim text blocks, provide both the original (verbatim) and a translation note.
6. For redacted blocks: set type="redaction_block", content_en="[REDACTED]", set redaction_code if visible (e.g., "(b)(1)", "(b)(6)"), redaction_inferred_content_type with your best inference.
7. For images/photos: type="image", image_type = one of: photograph|sketch|diagram|map|chart|logo|signature|stamp|other
8. For tables: type="table_marker"
9. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
10. UAP/cryptid anomaly detection: flag any anomalous visual content (UFO shapes, unusual aerial phenomena, cryptid-related imagery).
11. If page is blank or nearly blank: create ONE chunk type="blank".
12. classification_banner chunks at top/bottom of page for classification markings.
13. stamps: type="stamp" for rubber stamps, file numbers, dates stamped on documents.
14. Return ONLY valid JSON, no other text.

IMPORTANT: Be thorough. A typical text page has 5-15 chunks. A photo page may have 2-3 chunks. Cover/envelope pages have 4-8 chunks.
"""

# (key, default) pairs for a bbox that covers the whole page.
_BBOX_DEFAULTS = (("x", 0), ("y", 0), ("w", 1), ("h", 1))


def _page_png_path(page_num: int) -> Path:
    """Return the PNG path for a 1-based page number (page 1 is p-000.png)."""
    return PNG_DIR / f"p-{page_num - 1:03d}.png"


def _chunk_type(chunk: dict) -> str:
    """Return the chunk's type, falling back to body_paragraph when unusable."""
    value = chunk.get("type")
    return value if isinstance(value, str) and value else "body_paragraph"


def _chunk_bbox(chunk: dict) -> dict:
    """Return the chunk's bbox dict, or a full-page bbox when it is not a dict."""
    bbox = chunk.get("bbox")
    return bbox if isinstance(bbox, dict) else dict(_BBOX_DEFAULTS)


def _bbox_number(bbox: dict, key: str, default: float) -> float:
    """Return bbox[key] as a float, using *default* for missing or non-numeric values."""
    value = bbox.get(key, default)
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return float(default)
    return float(value)


def _order_key(chunk: dict):
    """Sort key for order_in_page that tolerates null or non-numeric values."""
    value = chunk.get("order_in_page", 0)
    return value if isinstance(value, (int, float)) else 0


def _placeholder_page(page_num: int, content_en: str, content_pt_br: str, *,
                      page_type: str = "text",
                      chunk_type: str = "body_paragraph",
                      bbox: Optional[dict] = None) -> dict:
    """Build a one-chunk page record used when a page cannot be extracted."""
    if bbox is None:
        bbox = {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90}
    return {
        "page_number": page_num,
        "classification": None,
        "page_type": page_type,
        "chunks": [{
            "order_in_page": 1,
            "type": chunk_type,
            "content_en": content_en,
            "content_pt_br": content_pt_br,
            "bbox": bbox,
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
        }],
    }


def _strip_code_fence(text: str) -> str:
    """Drop a leading ``` line (and a trailing one, if present) from *text*."""
    if not text.startswith("```"):
        return text
    lines = text.split("\n")
    return "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])


def _parse_page_response(raw_text: str) -> dict:
    """Parse the model reply into a page dict.

    Raises:
        ValueError: if the reply is not valid JSON (json.JSONDecodeError is a
            ValueError) or is not an object whose "chunks", when present, is
            a list of objects. Later steps call dict/list methods on these
            values, so a wrong shape is rejected here where it can be retried.
    """
    data = json.loads(_strip_code_fence(raw_text.strip()))
    if not isinstance(data, dict):
        raise ValueError(f"expected a JSON object, got {type(data).__name__}")
    chunks = data.get("chunks", [])
    if not isinstance(chunks, list) or not all(isinstance(c, dict) for c in chunks):
        raise ValueError("'chunks' must be a list of JSON objects")
    return data


def encode_image_b64(path: Path) -> str:
    """Return the contents of *path* as a base64-encoded string."""
    return base64.standard_b64encode(Path(path).read_bytes()).decode("utf-8")


def process_page(page_num: int) -> dict:
    """Process a single page and return its chunks as a dict.

    Sends the page PNG and the rebuilder prompt to the model, retrying up to
    MAX_RETRIES times. Unparseable replies are retried immediately; API
    errors are retried after an exponential back-off. When the PNG is
    missing, or every attempt fails, a one-chunk placeholder page is
    returned instead of raising.
    """
    png_path = _page_png_path(page_num)
    if not png_path.exists():
        print(f" WARNING: PNG not found for page {page_num}: {png_path}")
        return _placeholder_page(
            page_num,
            "[Page image not found]",
            "[Imagem da página não encontrada]",
            page_type="blank",
            chunk_type="blank",
            bbox={"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
        )

    img_b64 = encode_image_b64(png_path)
    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_num,
        total_pages=TOTAL_PAGES,
        page_png_path=str(png_path),
    )
    message = {
        "role": "user",
        "content": [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": img_b64,
                },
            },
            {"type": "text", "text": prompt},
        ],
    }

    for attempt in range(MAX_RETRIES):
        is_last_attempt = attempt == MAX_RETRIES - 1
        try:
            response = client.messages.create(
                model=BUILD_MODEL,
                max_tokens=4096,
                messages=[message],
            )
            data = _parse_page_response(response.content[0].text)
        except anthropic.APIError as e:
            print(f" Page {page_num} attempt {attempt+1}: API error: {e}")
            if is_last_attempt:
                return _placeholder_page(
                    page_num,
                    f"[Page {page_num} — API error]",
                    f"[Página {page_num} — erro de API]",
                )
            time.sleep(2 ** attempt)
        except ValueError as e:
            print(f" Page {page_num} attempt {attempt+1}: JSON parse error: {e}")
            if is_last_attempt:
                return _placeholder_page(
                    page_num,
                    f"[Page {page_num} — parse error, content not extracted]",
                    f"[Página {page_num} — erro de análise, conteúdo não extraído]",
                )
        else:
            data["page_number"] = page_num  # the model's own numbering is not trusted
            return data


def crop_image(chunk_id: str, png_path: Path, bbox: dict) -> Optional[Path]:
    """Crop a region from the page PNG and save to images dir.

    *bbox* holds x/y/w/h as fractions of the page; the region is padded by
    0.5% of the page on each side and clamped to the page bounds.

    Returns:
        The path of the written image, or None when cropping failed. This is
        deliberately best-effort: a bad page image must not abort the build.
    """
    try:
        from PIL import Image

        x = _bbox_number(bbox, "x", 0)
        y = _bbox_number(bbox, "y", 0)
        w = _bbox_number(bbox, "w", 1)
        h = _bbox_number(bbox, "h", 1)
        pad = 0.005

        # The context manager closes the underlying file handle.
        with Image.open(png_path) as im:
            W, H = im.size
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                print(f" Crop error for {chunk_id}: empty crop region "
                      f"{(left, top, right, bottom)}")
                return None
            out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
            im.crop((left, top, right, bottom)).save(out_path)
        return out_path
    except Exception as e:
        print(f" Crop error for {chunk_id}: {e}")
        return None


def write_chunk_file(chunk_data: dict, chunk_id: str, page_num: int,
                     order_global: int, prev_chunk, next_chunk,
                     has_image: bool) -> None:
    """Write a single chunk markdown file.

    The file is CHUNKS_DIR/<chunk_id>.md: a frontmatter block of chunk
    metadata followed by the English and Brazilian Portuguese content.
    """
    bbox = _chunk_bbox(chunk_data)
    x, y, w, h = (_bbox_number(bbox, key, default) for key, default in _BBOX_DEFAULTS)
    source_png = f"../../processing/png/{DOC_ID}/p-{page_num - 1:03d}.png"
    related_image = f"IMG-{chunk_id}.png" if has_image else None

    def dumped(key, default=None):
        # JSON scalars are also valid YAML scalars; None becomes "null".
        return json.dumps(chunk_data.get(key, default))

    # NOTE(review): the blank lines around the EN / PT-BR paragraphs were
    # reconstructed while reformatting -- confirm against a previously built chunk.
    lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {_chunk_type(chunk_data)}",
        f"page: {page_num}",
        f"order_in_page: {chunk_data.get('order_in_page', 1)}",
        f"order_global: {order_global}",
        f"bbox: {{x: {x:.3f}, y: {y:.3f}, w: {w:.3f}, h: {h:.3f}}}",
        f"classification: {dumped('classification')}",
        f"formatting: {dumped('formatting', [])}",
        f"cross_page_hint: {chunk_data.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {json.dumps(prev_chunk)}",
        f"next_chunk: {json.dumps(next_chunk)}",
        f"related_image: {json.dumps(related_image)}",
        f"related_table: {dumped('related_table')}",
        f"ocr_confidence: {dumped('ocr_confidence', 0.9)}",
        f"ocr_source_lines: {dumped('ocr_source_lines', [])}",
        f"redaction_code: {dumped('redaction_code')}",
        f"redaction_inferred_content_type: {dumped('redaction_inferred_content_type')}",
        f"image_type: {dumped('image_type')}",
        # bool() keeps a null flag from being written as the string "none".
        f"ufo_anomaly_detected: {json.dumps(bool(chunk_data.get('ufo_anomaly_detected', False)))}",
        f"ufo_anomaly_type: {dumped('ufo_anomaly_type')}",
        f"ufo_anomaly_rationale: {dumped('ufo_anomaly_rationale')}",
        f"cryptid_anomaly_detected: {json.dumps(bool(chunk_data.get('cryptid_anomaly_detected', False)))}",
        f"cryptid_anomaly_type: {dumped('cryptid_anomaly_type')}",
        f"cryptid_anomaly_rationale: {dumped('cryptid_anomaly_rationale')}",
        f"image_description_en: {dumped('image_description_en')}",
        f"image_description_pt_br: {dumped('image_description_pt_br')}",
        f"extracted_text: {dumped('extracted_text')}",
        f"source_png: {source_png}",
        "---",
        "",
        f"**EN:** {chunk_data.get('content_en', '')}",
        "",
        f"**PT-BR:** {chunk_data.get('content_pt_br', '')}",
        "",
    ]
    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    out_path.write_text("\n".join(lines), encoding="utf-8")


def _process_all_pages() -> dict:
    """Run process_page for every page on a thread pool; return {page_num: page_data}."""
    all_pages = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_page = {
            executor.submit(process_page, page_num): page_num
            for page_num in range(1, TOTAL_PAGES + 1)
        }
        finished = concurrent.futures.as_completed(future_to_page)
        # Failed pages count as completed too, so the progress total stays accurate.
        for completed, future in enumerate(finished, start=1):
            page_num = future_to_page[future]
            try:
                all_pages[page_num] = future.result()
            except Exception as e:
                # Top-level boundary: one bad page must not abort the whole build.
                print(f" Page {page_num} failed: {e}")
                all_pages[page_num] = _placeholder_page(
                    page_num,
                    f"[Page {page_num} — processing failed: {e}]",
                    f"[Página {page_num} — processamento falhou: {e}]",
                )
            if completed % 10 == 0:
                print(f" Completed {completed}/{TOTAL_PAGES} pages...")
    return all_pages


def _collect_chunks(all_pages: dict) -> list:
    """Flatten pages into (chunk_id, page_num, chunk) tuples in reading order.

    Also stores _chunk_id, _prev, _next and _order_global on each chunk dict.
    """
    all_chunks = []
    for page_num in range(1, TOTAL_PAGES + 1):
        page_chunks = all_pages[page_num].get("chunks", [])
        for chunk in sorted(page_chunks, key=_order_key):
            all_chunks.append((f"c{len(all_chunks) + 1:04d}", page_num, chunk))

    last_index = len(all_chunks) - 1
    for i, (chunk_id, _page_num, chunk) in enumerate(all_chunks):
        chunk["_chunk_id"] = chunk_id
        chunk["_prev"] = all_chunks[i - 1][0] if i > 0 else None
        chunk["_next"] = all_chunks[i + 1][0] if i < last_index else None
        chunk["_order_global"] = i + 1
    return all_chunks


def _build_index(all_chunks: list, build_at: str) -> dict:
    """Build the _index.json payload: document metadata plus one entry per chunk."""
    index_chunks = []
    for chunk_id, page_num, chunk in all_chunks:
        content_en = chunk.get("content_en", "")
        index_chunks.append({
            "chunk_id": chunk_id,
            "type": _chunk_type(chunk),
            "page": page_num,
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["_order_global"],
            "file": f"chunks/{chunk_id}.md",
            "bbox": _chunk_bbox(chunk),
            # str() guards against a non-string content_en from the model.
            "preview": str(content_en)[:80] if content_en else "",
        })
    return {
        "doc_id": DOC_ID,
        "schema_version": SCHEMA_VERSION,
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": BUILD_MODEL,
        "build_at": build_at,
        "chunks": index_chunks,
    }


def _render_document_frontmatter(total_chunks: int, type_histogram, ufo_anomalies: list,
                                 cryptid_anomalies: list, build_at: str) -> list:
    """Return the frontmatter lines of document.md."""
    lines = [
        "---",
        f'schema_version: "{SCHEMA_VERSION}"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        # json.dumps quotes the title and escapes any embedded quote characters.
        f"canonical_title: {json.dumps(DOC_TITLE, ensure_ascii=False)}",
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {total_chunks}",
        "chunk_types_histogram:",
    ]
    lines.extend(f" {name}: {count}" for name, count in sorted(type_histogram.items()))
    lines.extend([
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_anomalies)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomalies)}",
        'build_approach: "subagents"',
        f"build_model: {BUILD_MODEL}",
        f"build_at: {build_at}",
        "---",
        "",
    ])
    return lines


def _render_chunk_section(chunk_id: str, page_num: int, chunk: dict,
                          image_written: bool) -> list:
    """Return the document.md lines for one chunk, including its metadata block."""
    chunk_type = _chunk_type(chunk)
    bbox = _chunk_bbox(chunk)
    bbox_str = "/".join(
        f"{_bbox_number(bbox, key, default):.2f}" for key, default in _BBOX_DEFAULTS
    )

    lines = [
        # NOTE(review): these two entries were empty f-strings in the file as
        # received; they look like markup that was lost (an anchor or comment
        # line?). Kept as blank lines until the original content is confirmed.
        "",
        "",
        f"### Chunk {chunk_id} — {chunk_type} · p{page_num} · bbox: {bbox_str}",
        "",
        f"**EN:** {chunk.get('content_en', '')}",
        "",
        f"**PT-BR:** {chunk.get('content_pt_br', '')}",
        "",
    ]

    # Embed the crop only when the image file was actually written.
    if chunk_type == "image" and image_written:
        lines.append(f"![chunk image](./images/IMG-{chunk_id}.png)")
        desc_en = chunk.get("image_description_en", "")
        desc_pt = chunk.get("image_description_pt_br", "")
        if desc_en:
            lines.append(f"*{desc_en}*")
        if desc_pt:
            lines.append(f"*{desc_pt}*")
        lines.append("")

    # Build metadata JSON for details block
    meta = {
        "chunk_id": chunk_id,
        "type": chunk_type,
        "page": page_num,
        "order_in_page": chunk.get("order_in_page", 1),
        "order_global": chunk["_order_global"],
        "bbox": bbox,
        "classification": chunk.get("classification"),
        "formatting": chunk.get("formatting", []),
        "cross_page_hint": chunk.get("cross_page_hint", "self_contained"),
        "prev_chunk": chunk["_prev"],
        "next_chunk": chunk["_next"],
        "ocr_confidence": chunk.get("ocr_confidence", 0.9),
        "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
        "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
        "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
        "cryptid_anomaly_type": chunk.get("cryptid_anomaly_type"),
        "redaction_code": chunk.get("redaction_code"),
        "image_type": chunk.get("image_type"),
    }
    lines.extend([
        # NOTE(review): the opening/closing tags were stripped from the file as
        # received (only "metadata" survived); reconstructed as a <details>
        # block -- confirm the exact markup.
        "<details><summary>metadata</summary>",
        "",
        "```json",
        json.dumps(meta, ensure_ascii=False, indent=2),
        "```",
        "",
        "</details>",
        "",
        "---",
        "",
    ])
    return lines


def main():
    """Rebuild the document: extract pages, crop images, write chunks, index and document.md.

    Returns:
        A dict of build statistics: pages, chunks, images, tables, ufo,
        cryptid, wall_seconds and doc_md_bytes.
    """
    start_time = time.monotonic()
    print(f"Starting rebuild of {DOC_ID}")
    print(f"Processing {TOTAL_PAGES} pages with {MAX_WORKERS} parallel workers...")

    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # Step 1: Process all pages in parallel
    all_pages = _process_all_pages()
    print("All pages processed. Assigning global chunk IDs...")

    # Step 2: Assign global chunk IDs and prev/next pointers
    all_chunks = _collect_chunks(all_pages)
    total_chunks = len(all_chunks)
    print(f"Total chunks: {total_chunks}")

    # Step 3: Crop images for image-type chunks
    print("Cropping images for image chunks...")
    image_chunks = [
        (chunk_id, page_num, chunk)
        for chunk_id, page_num, chunk in all_chunks
        if _chunk_type(chunk) == "image"
    ]
    print(f" Found {len(image_chunks)} image chunks")
    # Remember which crops succeeded so nothing links to a file that was never written.
    cropped_ids = set()
    for chunk_id, page_num, chunk in image_chunks:
        cropped = crop_image(chunk_id, _page_png_path(page_num), _chunk_bbox(chunk))
        if cropped is not None:
            cropped_ids.add(chunk_id)

    # Step 4: Write chunk files
    print("Writing chunk files...")
    for chunk_id, page_num, chunk in all_chunks:
        write_chunk_file(
            chunk, chunk_id, page_num,
            chunk["_order_global"],
            chunk["_prev"], chunk["_next"],
            chunk_id in cropped_ids,
        )

    # Step 5: Write _index.json
    print("Writing _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    index = _build_index(all_chunks, build_at)
    index_path = OUT_DIR / "_index.json"
    index_path.write_text(json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8")

    # Step 6: Compute stats
    type_histogram = Counter(_chunk_type(chunk) for _, _, chunk in all_chunks)
    ufo_anomalies = [
        chunk_id for chunk_id, _, chunk in all_chunks
        if chunk.get("ufo_anomaly_detected")
    ]
    cryptid_anomalies = [
        chunk_id for chunk_id, _, chunk in all_chunks
        if chunk.get("cryptid_anomaly_detected")
    ]
    images_count = type_histogram["image"]

    # Step 7: Write document.md
    print("Writing document.md...")
    doc_lines = _render_document_frontmatter(
        total_chunks, type_histogram, ufo_anomalies, cryptid_anomalies, build_at
    )
    current_page = 0
    for chunk_id, page_num, chunk in all_chunks:
        if page_num != current_page:
            current_page = page_num
            doc_lines.append(f"## Page {page_num}")
            doc_lines.append("")
        doc_lines.extend(
            _render_chunk_section(chunk_id, page_num, chunk, chunk_id in cropped_ids)
        )

    doc_content = "\n".join(doc_lines)
    doc_path = OUT_DIR / "document.md"
    doc_path.write_text(doc_content, encoding="utf-8")

    wall_seconds = int(time.monotonic() - start_time)
    doc_md_bytes = len(doc_content.encode("utf-8"))

    print("\nDone!")
    print(f"STATS pages={TOTAL_PAGES} chunks={total_chunks} images={images_count} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_md_bytes}")
    print(f"Wall time: {wall_seconds}s")

    return {
        "pages": TOTAL_PAGES,
        "chunks": total_chunks,
        "images": images_count,
        "tables": 0,
        "ufo": len(ufo_anomalies),
        "cryptid": len(cryptid_anomalies),
        "wall_seconds": wall_seconds,
        "doc_md_bytes": doc_md_bytes,
    }


if __name__ == "__main__":
    main()