#!/usr/bin/env python3
"""
Rebuild script for doc-65-hs1-834228961-62-hq-83894-section-2
Processes all 159 pages in parallel batches of 5, generates chunks, images, index, document.md
"""

import os
import sys
import json
import math
import base64
import time
import concurrent.futures
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

import anthropic
from PIL import Image

# NOTE(review): the reviewed copy of this file had its line breaks, indentation
# and inline HTML-like markup stripped. Layout, the "<...>" placeholders inside
# the prompts, the indentation inside string literals, and the HTML tags emitted
# into document.md were reconstructed -- confirm them against the original.

# ── Config ──────────────────────────────────────────────────────────────────
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-2"
DOC_TITLE = "FBI HQ-83894 Section 2 — Flying Discs Investigation (Serials 53-100)"

PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

# Single source of truth for the model used in API calls and build metadata.
MODEL = "claude-haiku-4-5"

client = anthropic.Anthropic()

CHUNK_TYPES = [
    "cover", "letterhead", "stamp", "header", "subheader", "paragraph",
    "redaction", "signature", "image", "table_marker", "footer",
    "page_number", "classification_marking", "separator", "handwriting",
    "form_field", "caption", "list_item", "annotation", "blank"
]


# Build page mapping: sequential 1..159 -> actual file number
def build_page_map() -> dict:
    """Map sequential page numbers (1..N) to the numeric suffix of each PNG.

    Files matching ``p-*.png`` whose suffix is not purely numeric are skipped
    so that a stray file cannot abort the script at import time.
    """
    numbers = sorted(
        int(p.stem[2:])
        for p in PNG_DIR.glob("p-*.png")
        if p.stem[2:].isdigit()
    )
    return {seq: num for seq, num in enumerate(numbers, start=1)}


PAGE_MAP = build_page_map()
TOTAL_PAGES = len(PAGE_MAP)


def load_image_b64(path: Path) -> str:
    """Return the file at *path* encoded as a base64 ASCII string."""
    with open(path, "rb") as f:
        return base64.standard_b64encode(f.read()).decode("utf-8")


def load_ocr(actual_num: int) -> str:
    """Return the stripped OCR text for a page, or "" when no OCR file exists."""
    ocr_path = OCR_DIR / f"p-{actual_num:03d}.txt"
    if ocr_path.exists():
        return ocr_path.read_text(encoding="utf-8", errors="replace").strip()
    return ""


# This template is passed through str.format(): literal braces are doubled.
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder subagent. Your job is to analyze a declassified FBI document page and extract ALL content as structured chunks.

Document: {doc_title}
Page: {page_number} of {total_pages}
Actual file: p-{actual_num:03d}.png

OCR text (may be empty/poor quality):
{ocr_text}

Analyze the image carefully. Extract ALL visible content into chunks.

Return a JSON object:
{{
  "page_number": {page_number},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "<one of: {chunk_types}>",
      "content_en": "<English transcription or description>",
      "content_pt_br": "<Brazilian Portuguese equivalent>",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }}
  ]
}}

Rules:
- bbox: x,y = top-left corner (0.0-1.0 fraction of page), w,h = width/height fractions
- classification: string like "SECRET" or null
- formatting: array of ["bold","italic","all_caps","underline","strikethrough"] as applicable
- cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
- For redaction blocks: type="redaction", include redaction_code if visible (e.g. "(b)(1)", "(b)(3)", "(b)(6)")
- For stamps: type="stamp", describe the stamp text
- For images/diagrams/photos: type="image", set image_type to "photo"|"diagram"|"sketch"|"map"|"chart"|"signature_block"
- For tables: type="table_marker"
- ufo_anomaly_detected: true only if the page contains an image/sketch/photo of an anomalous aerial phenomenon
- cryptid_anomaly_detected: true only if the page contains imagery of cryptids/unknown creatures
- content_en: transcribe verbatim when legible; describe when not (e.g., "[Redacted block]", "[Stamp: RECEIVED]")
- content_pt_br: Brazilian Portuguese equivalent
- Return ONLY valid JSON, no markdown fences, no explanation
- Do NOT skip any visible content area
- Minimum 1 chunk per page (even blank pages get type="blank")
"""


def _empty_anomaly_fields() -> dict:
    """Return the anomaly fields in their "nothing detected" state."""
    return {
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
    }


def _as_bool(value) -> bool:
    """Coerce a model-supplied flag to bool; the string "false" is False."""
    if isinstance(value, str):
        return value.strip().lower() in ("true", "yes", "1")
    return bool(value)


def _normalize_bbox(bbox, default_h: float = 0.1) -> dict:
    """Return a bbox dict with finite float x/y/w/h clamped to [0.0, 1.0].

    Missing, null or non-numeric entries fall back to the full-width defaults
    (x=0, y=0, w=1, h=*default_h*), so later formatting and cropping cannot
    fail on malformed model output.
    """
    defaults = {"x": 0.0, "y": 0.0, "w": 1.0, "h": default_h}
    source = bbox if isinstance(bbox, dict) else {}
    clean = {}
    for key, fallback in defaults.items():
        try:
            value = float(source.get(key, fallback))
        except (TypeError, ValueError):
            value = fallback
        if not math.isfinite(value):
            value = fallback
        clean[key] = min(1.0, max(0.0, value))
    return clean


def _strip_code_fences(raw: str) -> str:
    """Remove a surrounding Markdown code fence from a model reply, if any."""
    text = raw.strip()
    if text.startswith("```"):
        # Drop the whole opening fence line (it may carry a language tag).
        # partition() yields "" instead of raising when there is no newline.
        text = text.partition("\n")[2]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _request_json(img_b64: str, prompt: str, max_tokens: int) -> dict:
    """Send one PNG plus a text prompt to the model and parse a JSON object.

    Raises whatever the API client raises, json.JSONDecodeError for
    unparseable text, and ValueError when the reply is not a JSON object.
    """
    response = client.messages.create(
        model=MODEL,
        max_tokens=max_tokens,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": img_b64
                    }
                },
                {
                    "type": "text",
                    "text": prompt
                }
            ]
        }]
    )
    data = json.loads(_strip_code_fences(response.content[0].text))
    if not isinstance(data, dict):
        raise ValueError(f"expected a JSON object, got {type(data).__name__}")
    return data


def _sanitize_chunks(raw_chunks, page_seq: int) -> list:
    """Keep only dict chunks and normalise the fields later phases rely on."""
    candidates = raw_chunks if isinstance(raw_chunks, list) else []
    chunks = [ch for ch in candidates if isinstance(ch, dict)]
    for order, ch in enumerate(chunks, start=1):
        ch["order_in_page"] = order
        ch["page"] = page_seq
        # A null type would otherwise break sorting of the type histogram.
        ch["type"] = str(ch.get("type") or "paragraph")
        ch["bbox"] = _normalize_bbox(ch.get("bbox"))
        for key in ("content_en", "content_pt_br"):
            value = ch.get(key)
            ch[key] = "" if value is None else str(value)
        for key in ("ufo_anomaly_detected", "cryptid_anomaly_detected"):
            ch[key] = _as_bool(ch.get(key))
    return chunks


def _failed_page_result(page_seq: int, actual_num: int) -> dict:
    """Return the one-chunk placeholder recorded for an unprocessable page."""
    placeholder = {
        "order_in_page": 1,
        "type": "blank",
        "page": page_seq,
        "content_en": "[Page processing failed]",
        "content_pt_br": "[Falha no processamento da página]",
        "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
        "classification": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "ocr_confidence": 0.0,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
    }
    placeholder.update(_empty_anomaly_fields())
    return {
        "page_number": page_seq,
        "actual_num": actual_num,
        "chunks": [placeholder],
    }


def rebuild_page(page_seq: int) -> dict:
    """Process one page, return {page_number, actual_num, chunks:[...]}.

    The page PNG and its OCR text are sent to the model up to three times
    with exponential backoff. A reply with no usable chunks counts as a
    failure, because a page without chunks would vanish from document.md.
    After the final failure a single "blank" placeholder chunk is returned.
    Reading the PNG or OCR file happens before the retry loop, so errors
    from those reads propagate to the caller.
    """
    actual_num = PAGE_MAP[page_seq]
    png_path = PNG_DIR / f"p-{actual_num:03d}.png"
    ocr_text = load_ocr(actual_num)
    img_b64 = load_image_b64(png_path)

    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_seq,
        total_pages=TOTAL_PAGES,
        actual_num=actual_num,
        chunk_types="|".join(CHUNK_TYPES),
        ocr_text=ocr_text[:2000] if ocr_text else "(no OCR available)"
    )

    retries = 3
    for attempt in range(retries):
        try:
            data = _request_json(img_b64, prompt, max_tokens=4096)
            chunks = _sanitize_chunks(data.get("chunks"), page_seq)
            if not chunks:
                raise ValueError("model returned no chunks")
            data["page_number"] = page_seq
            data["actual_num"] = actual_num
            data["chunks"] = chunks
            print(f" [OK] page {page_seq:03d} (p-{actual_num:03d}) → {len(chunks)} chunks", flush=True)
            return data
        except Exception as e:
            # Deliberately broad: any API, parsing or validation error is
            # retried, then degraded to a placeholder page.
            if attempt < retries - 1:
                wait = 2 ** attempt * 5
                print(f" [RETRY {attempt+1}] page {page_seq}: {e}, waiting {wait}s", flush=True)
                time.sleep(wait)
            else:
                print(f" [FAIL] page {page_seq}: {e}", flush=True)
    return _failed_page_result(page_seq, actual_num)


# This prompt is sent verbatim (it is NOT passed through str.format()), so its
# braces are single; doubled braces would reach the model literally.
IMAGE_ANALYST_PROMPT = """You are an image analyst examining a cropped region from a declassified FBI document about flying discs / UAP investigations.

Analyze this image region and return a JSON object:
{
  "image_type": "<photo|diagram|sketch|map|chart|signature_block|other>",
  "image_description_en": "<description in English>",
  "image_description_pt_br": "<description in Brazilian Portuguese>",
  "extracted_text": "<text visible in the image, or null>",
  "ufo_anomaly_detected": <true|false>,
  "ufo_anomaly_type": "<type or null>",
  "ufo_anomaly_rationale": "<rationale or null>",
  "cryptid_anomaly_detected": <true|false>,
  "cryptid_anomaly_type": "<type or null>",
  "cryptid_anomaly_rationale": "<rationale or null>"
}

Return ONLY valid JSON, no markdown fences.
"""


def _image_meta_fallback(description_en: str, description_pt_br: str) -> dict:
    """Return neutral image metadata carrying the given status description."""
    meta = {
        "image_type": "other",
        "image_description_en": description_en,
        "image_description_pt_br": description_pt_br,
        "extracted_text": None,
    }
    meta.update(_empty_anomaly_fields())
    return meta


def analyze_image(chunk_id: str, img_path: Path) -> dict:
    """Analyze a cropped image, return metadata dict.

    Always returns a dict: fallback metadata is returned when the crop file
    does not exist or when all three attempts fail.
    """
    if not img_path.exists():
        return _image_meta_fallback("Image not available", "Imagem não disponível")

    img_b64 = load_image_b64(img_path)
    retries = 3
    for attempt in range(retries):
        try:
            return _request_json(img_b64, IMAGE_ANALYST_PROMPT, max_tokens=1024)
        except Exception as e:
            # Deliberately broad, mirroring rebuild_page: retry, then degrade.
            if attempt < retries - 1:
                time.sleep(2 ** attempt * 3)
            else:
                print(f" [IMAGE FAIL] {chunk_id}: {e}", flush=True)
    return _image_meta_fallback("Analysis failed", "Análise falhou")


def crop_image(page_seq: int, actual_num: int, chunk_id: str, bbox: dict) -> Path:
    """Crop bbox region from page PNG and save to images dir.

    *bbox* holds page fractions; a 0.5% padding is added on every side.
    Cropping is best effort: on failure the error is logged and the intended
    destination path is still returned, so callers must check that it exists.
    *page_seq* is accepted for interface compatibility but is not used.
    """
    src = PNG_DIR / f"p-{actual_num:03d}.png"
    dst = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        box = _normalize_bbox(bbox, default_h=1.0)
        pad = 0.005
        # The context manager closes the source file handle promptly.
        with Image.open(src) as im:
            width, height = im.size
            left = max(0, int((box["x"] - pad) * width))
            top = max(0, int((box["y"] - pad) * height))
            right = min(width, int((box["x"] + box["w"] + pad) * width))
            bottom = min(height, int((box["y"] + box["h"] + pad) * height))
            # Keep the crop at least one pixel in each direction.
            right = max(right, min(width, left + 1))
            bottom = max(bottom, min(height, top + 1))
            im.crop((left, top, right, bottom)).save(str(dst))
    except Exception as e:
        print(f" [CROP FAIL] {chunk_id}: {e}", flush=True)
    return dst


def _yaml_scalar(value) -> str:
    """Render *value* as a YAML scalar that is safe inside front matter.

    None and the literal string "null" become ``null``, booleans become
    ``true``/``false``, and numbers are written as-is. Every other value is
    written as a JSON string, which is a valid YAML double-quoted scalar, so
    colons, quotes and newlines in free text cannot corrupt the front matter.
    """
    if value is None or value == "null":
        return "null"
    if isinstance(value, bool):
        return "true" if value else "false"
    if isinstance(value, (int, float)):
        return str(value)
    return json.dumps(str(value), ensure_ascii=False)


def _yaml_flow_list(value) -> str:
    """Render a list as a YAML flow sequence (JSON array syntax)."""
    if value is None:
        items = []
    elif isinstance(value, (list, tuple)):
        items = list(value)
    else:
        items = [value]
    return json.dumps(items, ensure_ascii=False, default=str)


def write_chunk_file(chunk: dict, chunk_id: str):
    """Write individual chunk markdown file.

    The file is ``CHUNKS_DIR/<chunk_id>.md``: YAML front matter with the
    chunk metadata, followed by the English and Portuguese content.
    """
    bbox = _normalize_bbox(chunk.get("bbox"))
    page = chunk.get("page", 1)
    actual_num = PAGE_MAP.get(page, page)

    related_image = f"IMG-{chunk_id}.png" if chunk.get("type") == "image" else "null"
    bbox_yaml = (
        f"{{x: {bbox['x']:.3f}, y: {bbox['y']:.3f}, "
        f"w: {bbox['w']:.3f}, h: {bbox['h']:.3f}}}"
    )

    fields = [
        ("chunk_id", chunk_id),
        ("type", chunk.get("type") or "paragraph"),
        ("page", page),
        ("order_in_page", chunk.get("order_in_page", 1)),
        ("order_global", chunk.get("order_global", 1)),
        ("bbox", bbox_yaml),
        ("classification", _yaml_scalar(chunk.get("classification"))),
        ("formatting", _yaml_flow_list(chunk.get("formatting"))),
        ("cross_page_hint", chunk.get("cross_page_hint") or "self_contained"),
        ("prev_chunk", chunk.get("prev_chunk") or "null"),
        ("next_chunk", chunk.get("next_chunk") or "null"),
        ("related_image", related_image),
        ("related_table", chunk.get("related_table") or "null"),
        ("ocr_confidence", _yaml_scalar(chunk.get("ocr_confidence", 0.85))),
        ("ocr_source_lines", _yaml_flow_list(chunk.get("ocr_source_lines"))),
        ("redaction_code", _yaml_scalar(chunk.get("redaction_code"))),
        ("redaction_inferred_content_type",
         _yaml_scalar(chunk.get("redaction_inferred_content_type"))),
        ("image_type", _yaml_scalar(chunk.get("image_type"))),
        ("ufo_anomaly_detected",
         _yaml_scalar(_as_bool(chunk.get("ufo_anomaly_detected")))),
        ("cryptid_anomaly_detected",
         _yaml_scalar(_as_bool(chunk.get("cryptid_anomaly_detected")))),
        ("ufo_anomaly_type", _yaml_scalar(chunk.get("ufo_anomaly_type"))),
        ("ufo_anomaly_rationale", _yaml_scalar(chunk.get("ufo_anomaly_rationale"))),
        ("cryptid_anomaly_type", _yaml_scalar(chunk.get("cryptid_anomaly_type"))),
        ("cryptid_anomaly_rationale",
         _yaml_scalar(chunk.get("cryptid_anomaly_rationale"))),
        ("image_description_en", _yaml_scalar(chunk.get("image_description_en"))),
        ("image_description_pt_br",
         _yaml_scalar(chunk.get("image_description_pt_br"))),
        ("extracted_text", _yaml_scalar(chunk.get("extracted_text"))),
        ("source_png", f"../../processing/png/{DOC_ID}/p-{actual_num:03d}.png"),
    ]

    lines = ["---"]
    lines.extend(f"{key}: {value}" for key, value in fields)
    lines.extend([
        "---",
        "",
        f"**EN:** {chunk.get('content_en') or ''}",
        "",
        f"**PT-BR:** {chunk.get('content_pt_br') or ''}",
        "",
    ])

    chunk_path = CHUNKS_DIR / f"{chunk_id}.md"
    chunk_path.write_text("\n".join(lines), encoding="utf-8")


def _rebuild_all_pages(batch_size: int) -> dict:
    """Run rebuild_page over every page in batches; return {page_seq: result}."""
    results = {}
    page_seqs = list(range(1, TOTAL_PAGES + 1))
    for batch_start in range(0, len(page_seqs), batch_size):
        batch = page_seqs[batch_start:batch_start + batch_size]
        print(f" Batch pages {batch[0]}-{batch[-1]}...", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = {executor.submit(rebuild_page, p): p for p in batch}
            for future in concurrent.futures.as_completed(futures):
                page_seq = futures[future]
                try:
                    result = future.result()
                except Exception as e:
                    # e.g. an unreadable PNG: record a placeholder instead of
                    # aborting the run and discarding finished pages.
                    print(f" [FAIL] page {page_seq}: {e}", flush=True)
                    result = _failed_page_result(page_seq, PAGE_MAP[page_seq])
                results[page_seq] = result
        # Small delay between batches to avoid rate limits
        if batch_start + batch_size < len(page_seqs):
            time.sleep(1)
    return results


def _number_chunks(all_page_results: dict) -> list:
    """Flatten page results into one list with global ids and prev/next links."""
    all_chunks = []
    for page_seq in sorted(all_page_results):
        page_data = all_page_results[page_seq]
        page_chunks = sorted(
            page_data.get("chunks", []),
            key=lambda c: c.get("order_in_page", 0),
        )
        for chunk in page_chunks:
            order_global = len(all_chunks) + 1
            chunk["chunk_id"] = f"c{order_global:04d}"
            chunk["order_global"] = order_global
            chunk["actual_num"] = page_data.get("actual_num", page_seq)
            all_chunks.append(chunk)

    last = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last else None
    return all_chunks


def _analyze_images(image_chunks: list, batch_size: int) -> None:
    """Analyze cropped images in batches and merge the metadata into chunks."""
    for batch_start in range(0, len(image_chunks), batch_size):
        batch = image_chunks[batch_start:batch_start + batch_size]
        print(f" Image batch {batch_start+1}-{batch_start+len(batch)}...", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            # Map each future to its chunk dict so no lookup scan is needed.
            futures = {
                executor.submit(
                    analyze_image,
                    chunk["chunk_id"],
                    IMAGES_DIR / f"IMG-{chunk['chunk_id']}.png",
                ): chunk
                for chunk in batch
            }
            for future in concurrent.futures.as_completed(futures):
                chunk = futures[future]
                img_meta = future.result()
                chunk.update({
                    "image_type": img_meta.get("image_type", chunk.get("image_type")),
                    "image_description_en": img_meta.get("image_description_en"),
                    "image_description_pt_br": img_meta.get("image_description_pt_br"),
                    "extracted_text": img_meta.get("extracted_text"),
                    "ufo_anomaly_detected": _as_bool(img_meta.get("ufo_anomaly_detected", False)),
                    "ufo_anomaly_type": img_meta.get("ufo_anomaly_type"),
                    "ufo_anomaly_rationale": img_meta.get("ufo_anomaly_rationale"),
                    "cryptid_anomaly_detected": _as_bool(img_meta.get("cryptid_anomaly_detected", False)),
                    "cryptid_anomaly_type": img_meta.get("cryptid_anomaly_type"),
                    "cryptid_anomaly_rationale": img_meta.get("cryptid_anomaly_rationale"),
                })
                print(f" [IMG OK] {chunk['chunk_id']}", flush=True)
        if batch_start + batch_size < len(image_chunks):
            time.sleep(1)


def _write_index(all_chunks: list, build_at: str) -> Path:
    """Write _index.json (one summary entry per chunk) and return its path."""
    index_chunks = []
    for chunk in all_chunks:
        content_en = chunk.get("content_en") or ""
        preview = (content_en[:80] + "...") if len(content_en) > 80 else content_en
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": _normalize_bbox(chunk.get("bbox")),
            "preview": preview
        })

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": MODEL,
        "build_at": build_at,
        "chunks": index_chunks
    }
    index_path = OUT_DIR / "_index.json"
    index_path.write_text(json.dumps(index, indent=2, ensure_ascii=False), encoding="utf-8")
    return index_path


def _yaml_id_list(chunk_ids: list) -> str:
    """Render chunk ids as an indented YAML block list, or "[]" when empty."""
    if not chunk_ids:
        return "  []"
    return "\n".join(f"  - {c}" for c in chunk_ids)


def _render_chunk_section(chunk: dict, page_seq: int) -> list:
    """Return the document.md fragments for one chunk."""
    chunk_id = chunk["chunk_id"]
    ctype = chunk.get("type", "paragraph")
    bbox = _normalize_bbox(chunk.get("bbox"))
    bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

    # NOTE(review): the two anchor lines below were reconstructed; only their
    # trailing newlines survived in the reviewed text. Confirm exact markup.
    parts = [
        f"<!-- chunk:{chunk_id} -->\n",
        f'<a id="{chunk_id}"></a>\n',
        f"### Chunk {chunk_id} — {ctype} · p{page_seq} · bbox: {bbox_str}\n\n",
        f"**EN:** {chunk.get('content_en') or ''}\n\n",
        f"**PT-BR:** {chunk.get('content_pt_br') or ''}\n\n",
    ]

    # Image embed
    if ctype == "image":
        parts.append(f"![{chunk_id} image](./images/IMG-{chunk_id}.png)\n\n")
        desc_en = chunk.get("image_description_en", "")
        desc_pt = chunk.get("image_description_pt_br", "")
        if desc_en:
            parts.append(f"**Image Description (EN):** {desc_en}\n\n")
        if desc_pt:
            parts.append(f"**Descrição da Imagem (PT-BR):** {desc_pt}\n\n")

    # Table render
    # NOTE(review): the table tags were reconstructed -- confirm. Nothing in
    # this script sets "stitched_table", so this branch is currently inert.
    rows = chunk.get("stitched_table") if ctype == "table_marker" else None
    if rows:
        parts.append("<table>\n")
        for row in rows:
            parts.append("<tr>" + "".join(f"<td>{cell}</td>" for cell in row) + "</tr>\n")
        parts.append("</table>\n\n")

    # Metadata details
    meta = {
        "chunk_id": chunk_id,
        "type": ctype,
        "page": chunk.get("page"),
        "order_in_page": chunk.get("order_in_page"),
        "order_global": chunk.get("order_global"),
        "bbox": bbox,
        "classification": chunk.get("classification"),
        "formatting": chunk.get("formatting", []),
        "cross_page_hint": chunk.get("cross_page_hint"),
        "prev_chunk": chunk.get("prev_chunk"),
        "next_chunk": chunk.get("next_chunk"),
        "ocr_confidence": chunk.get("ocr_confidence"),
        "redaction_code": chunk.get("redaction_code"),
        "image_type": chunk.get("image_type"),
        "ufo_anomaly_detected": _as_bool(chunk.get("ufo_anomaly_detected", False)),
        "cryptid_anomaly_detected": _as_bool(chunk.get("cryptid_anomaly_detected", False)),
    }
    meta_json = json.dumps(meta, indent=2, ensure_ascii=False)
    # NOTE(review): <details>/<summary> tags reconstructed -- confirm.
    parts.append(
        f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n"
    )
    return parts


def _assemble_document(all_chunks: list, ufo_flagged: list,
                       cryptid_flagged: list, build_at: str) -> str:
    """Build the full document.md text: YAML front matter plus page sections."""
    type_histogram = Counter(c.get("type", "paragraph") for c in all_chunks)
    histogram_yaml = "\n".join(
        f"  {k}: {v}" for k, v in sorted(type_histogram.items())
    ) or "  {}"

    doc_parts = [f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: {json.dumps(DOC_TITLE, ensure_ascii=False)}
total_pages: {TOTAL_PAGES}
total_chunks: {len(all_chunks)}
chunk_types_histogram:
{histogram_yaml}
multi_page_tables: []
ufo_anomalies_flagged:
{_yaml_id_list(ufo_flagged)}
cryptid_anomalies_flagged:
{_yaml_id_list(cryptid_flagged)}
build_approach: "subagents"
build_model: "{MODEL}"
build_at: "{build_at}"
---

"""]

    # Group chunks by page
    chunks_by_page = {}
    for chunk in all_chunks:
        chunks_by_page.setdefault(chunk.get("page", 1), []).append(chunk)

    for page_seq in sorted(chunks_by_page):
        doc_parts.append(f"\n## Page {page_seq}\n")
        ordered = sorted(chunks_by_page[page_seq], key=lambda c: c.get("order_in_page", 1))
        for chunk in ordered:
            doc_parts.extend(_render_chunk_section(chunk, page_seq))

    return "".join(doc_parts)


def main():
    """Run the rebuild: pages → chunks → images → chunk files, index, document.md."""
    t_start = time.time()
    print(f"Starting rebuild of {DOC_ID}", flush=True)
    print(f"Total pages: {TOTAL_PAGES}", flush=True)

    # Ensure output dirs
    CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    TABLES_DIR.mkdir(parents=True, exist_ok=True)

    batch_size = 5

    # Step 1: Process all pages in parallel batches of 5
    print("\n=== Phase 1: Page rebuilding ===", flush=True)
    all_page_results = _rebuild_all_pages(batch_size)

    # Step 2: Globally number chunks
    print("\n=== Phase 2: Global chunk numbering ===", flush=True)
    all_chunks = _number_chunks(all_page_results)
    print(f" Total chunks: {len(all_chunks)}", flush=True)

    # Step 3: Crop images (all first, then analyze)
    print("\n=== Phase 3: Cropping images ===", flush=True)
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f" Found {len(image_chunks)} image chunks", flush=True)
    for chunk in image_chunks:
        page = chunk["page"]
        actual_num = chunk.get("actual_num", PAGE_MAP.get(page, page))
        crop_image(page, actual_num, chunk["chunk_id"], _normalize_bbox(chunk.get("bbox")))

    # Step 4: Analyze images in parallel batches of 5
    print("\n=== Phase 4: Image analysis ===", flush=True)
    _analyze_images(image_chunks, batch_size)

    # Step 5: Check for cross-page table stitching
    print("\n=== Phase 5: Table stitching check ===", flush=True)
    tables_stitched = 0
    # (Simple check - full stitching would require more complex logic)
    table_markers = [c for c in all_chunks if c.get("type") == "table_marker"]
    print(f" Found {len(table_markers)} table markers", flush=True)
    # No cross-page stitching needed for this pass - all tables are self-contained

    # Step 6: Write individual chunk files
    print("\n=== Phase 6: Writing chunk files ===", flush=True)
    for chunk in all_chunks:
        write_chunk_file(chunk, chunk["chunk_id"])
    print(f" Wrote {len(all_chunks)} chunk files", flush=True)

    # Step 7: Write _index.json
    print("\n=== Phase 7: Writing _index.json ===", flush=True)
    build_at = datetime.now(timezone.utc).isoformat()
    index_path = _write_index(all_chunks, build_at)
    print(f" Written: {index_path}", flush=True)

    # Step 8: Assemble document.md
    print("\n=== Phase 8: Assembling document.md ===", flush=True)
    ufo_flagged = [
        c["chunk_id"] for c in all_chunks if _as_bool(c.get("ufo_anomaly_detected"))
    ]
    cryptid_flagged = [
        c["chunk_id"] for c in all_chunks if _as_bool(c.get("cryptid_anomaly_detected"))
    ]
    images_extracted = len(image_chunks)

    doc_md = _assemble_document(all_chunks, ufo_flagged, cryptid_flagged, build_at)
    doc_path = OUT_DIR / "document.md"
    doc_path.write_text(doc_md, encoding="utf-8")
    doc_md_bytes = len(doc_md.encode("utf-8"))
    print(f" Written: {doc_path} ({doc_md_bytes} bytes)", flush=True)

    t_end = time.time()
    wall_seconds = int(t_end - t_start)
    print(f"\n=== DONE ===", flush=True)
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={images_extracted} tables={tables_stitched} ufo={len(ufo_flagged)} cryptid={len(cryptid_flagged)} doc_md_bytes={doc_md_bytes}", flush=True)
    print(f"\npages_done={TOTAL_PAGES}, chunks_total={len(all_chunks)}, images_extracted={images_extracted}, tables_stitched={tables_stitched}, ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, wall_seconds={wall_seconds}", flush=True)


if __name__ == "__main__":
    main()