#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_suba_final.py
Full rebuild of doc-65-hs1-834228961-62-hq-83894-sub-a
89 pages (p-000 to p-063, p-100 to p-124 PNGs)
Uses Anthropic claude-haiku-4-5 for vision processing.

Pipeline (see ``main``):
  1. send every page PNG (plus its OCR text, if any) to the model and parse
     the returned JSON into per-page chunk lists;
  2. number the chunks globally and link prev/next;
  3. crop every ``image`` chunk out of its page and have the model describe it;
  4. write one Markdown file per chunk;
  5. write ``_index.json``;
  6. assemble ``document.md``.
"""

import os
import sys
import json
import base64
import time
import re
import threading
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

from PIL import Image as PILImage
import anthropic

DOC_ID = "doc-65-hs1-834228961-62-hq-83894-sub-a"
DOC_TITLE = "FBI HQ 62-HQ-83894 Sub A — Flying Saucers / UAP Investigation File"
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")
OUT_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

BATCH_SIZE = 4
MAX_WORKERS = 4

# Single place for the model id used by both API calls and the build metadata.
MODEL_NAME = "claude-haiku-4-5"
# Number of API attempts per page before a placeholder page is emitted.
MAX_ATTEMPTS = 3
# OCR text is truncated to this many characters before going into the prompt.
OCR_MAX_CHARS = 3000

_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print under a lock so lines from worker threads do not interleave.

    Output is flushed by default; an explicit ``flush=`` from the caller is
    honoured instead of raising a duplicate-keyword ``TypeError``.
    """
    kwargs.setdefault("flush", True)
    with _lock:
        print(*args, **kwargs)


# Ensure dirs
for d in [CHUNKS_DIR, IMAGES_DIR, TABLES_DIR]:
    d.mkdir(parents=True, exist_ok=True)

client = anthropic.Anthropic()

# Build ordered list of PNG files
png_files = sorted(PNG_DIR.glob("p-*.png"))
TOTAL_PAGES = len(png_files)
safe_print(f"Found {TOTAL_PAGES} PNG pages")


def load_image_b64(path: Path) -> str:
    """Return the contents of *path* as a base64-encoded ASCII string."""
    with open(path, "rb") as f:
        return base64.standard_b64encode(f.read()).decode("utf-8")


def load_ocr(png_name: str) -> str:
    """Return the OCR text that accompanies *png_name*, truncated for the prompt.

    The OCR file is looked up in ``OCR_DIR`` under the PNG's name with a
    ``.txt`` suffix. A missing or empty file yields a fixed placeholder string
    telling the model to rely on vision only.
    """
    # with_suffix swaps only the final extension; str.replace would also
    # rewrite a ".png" occurring earlier in the name.
    ocr_path = OCR_DIR / Path(png_name).with_suffix(".txt").name
    if ocr_path.exists():
        txt = ocr_path.read_text(encoding="utf-8").strip()
        if txt:
            return txt[:OCR_MAX_CHARS]
    return "(no OCR text available — use vision only)"


def extract_json(text: str) -> dict:
    """Parse the first top-level JSON object found in *text*.

    Leading/trailing Markdown code fences are stripped first. The object's
    extent is found by brace matching that skips over JSON string literals,
    so braces inside string values do not end the object early.

    Raises:
        ValueError: if *text* contains no ``{`` ("No JSON found") or the
            object never closes ("Unclosed JSON").
        json.JSONDecodeError: if the delimited span is not valid JSON.
    """
    text = text.strip()
    text = re.sub(r"^```(?:json)?\s*", "", text)
    text = re.sub(r"\s*```\s*$", "", text)
    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON found")

    depth = 0
    in_string = False
    escaped = False
    for i in range(start, len(text)):
        c = text[i]
        if in_string:
            # Inside a string only an unescaped quote is significant.
            if escaped:
                escaped = False
            elif c == "\\":
                escaped = True
            elif c == '"':
                in_string = False
            continue
        if c == '"':
            in_string = True
        elif c == "{":
            depth += 1
        elif c == "}":
            depth -= 1
            if depth == 0:
                return json.loads(text[start:i + 1])
    raise ValueError("Unclosed JSON")


# Template for the per-page request; filled in with str.format, hence the
# doubled braces around the literal JSON skeleton.
# NOTE(review): the line breaks inside this prompt were reconstructed from its
# evident structure — confirm the exact blank-line layout if it matters.
PAGE_PROMPT = """You are a page-rebuilder for a declassified UAP/UFO FBI document archive.

Document: {doc_title}
Doc ID: {doc_id}
Page: {page_number} of {total_pages}
PNG: {png_filename}

OCR text:
---
{ocr_text}
---

Analyze this page image carefully. Extract ALL content as ordered semantic chunks.

Return ONLY valid JSON (no markdown, no fences):
{{
  "page_number": {page_number},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "cover|letterhead|classification_banner|subject_line|salutation|body_paragraph|signature_block|date_line|reference_line|redaction_block|table_marker|image|caption|footer|header|list_item|handwritten_note|stamp|page_number|section_heading|blank",
      "content_en": "verbatim text or description in English",
      "content_pt_br": "tradução em português brasileiro",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }}
  ]
}}

Rules:
1. Every visible region = its own chunk. Do not skip content.
2. For images: set image_type to photograph|diagram|map|sketch|stamp|chart|handwriting|newspaper_clipping|other
3. For redaction_block: set redaction_code to visible FOIA code if shown.
4. For classification banners/stamps: set classification field to exact text.
5. ufo_anomaly_detected=true if content has UAP/UFO sighting details, craft descriptions, anomalous phenomena.
6. cross_page_hint: "self_contained"|"continues_to_next"|"continues_from_prev"
7. bbox: normalized 0.0-1.0 (x=left, y=top, w=width, h=height).
8. formatting: ["bold","italic","all_caps","underline","strikethrough"]
9. Newspaper clippings = type "image", image_type="newspaper_clipping", ufo_anomaly_detected=true if about UFOs.
10. Return ONLY the JSON object, nothing else."""


def fallback_chunk(page_number: int, reason: str, png_path=None) -> dict:
    """Build a placeholder page result for a page that could not be processed.

    Args:
        page_number: 1-based page number.
        reason: failure description; only its first 80 characters are kept.
        png_path: optional ``Path`` of the page image. The result always
            carries ``png_path``/``png_filename`` (empty strings when this is
            omitted) because ``main`` copies both keys onto every chunk.

    Returns:
        A dict shaped like a successful ``process_page`` result, holding one
        full-page ``body_paragraph`` chunk that records the failure.
    """
    short_reason = reason[:80]
    return {
        "page_number": page_number,
        "png_path": str(png_path) if png_path is not None else "",
        "png_filename": Path(png_path).name if png_path is not None else "",
        "chunks": [{
            "order_in_page": 1,
            "type": "body_paragraph",
            "content_en": f"[Page {page_number} - processing failed: {short_reason}]",
            "content_pt_br": f"[Página {page_number} - falha no processamento: {short_reason}]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
        }],
    }


def _vision_json(img_b64: str, prompt: str, max_tokens: int) -> dict:
    """Send one PNG (base64) plus *prompt* to the model and parse its JSON reply."""
    response = client.messages.create(
        model=MODEL_NAME,
        max_tokens=max_tokens,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img_b64}},
                {"type": "text", "text": prompt},
            ],
        }],
    )
    return extract_json(response.content[0].text)


def process_page(page_idx: int, png_path: Path) -> dict:
    """Extract the semantic chunks of one page image.

    Args:
        page_idx: 0-based index of the page in ``png_files``; the reported
            page number is ``page_idx + 1``.
        png_path: path of the page PNG.

    Returns:
        The parsed model response with ``page_number``, ``png_path`` and
        ``png_filename`` set, or a ``fallback_chunk`` result when the inputs
        cannot be read or all ``MAX_ATTEMPTS`` attempts fail. Never ``None``.
    """
    page_number = page_idx + 1
    png_filename = png_path.name

    try:
        ocr_text = load_ocr(png_filename)
        img_b64 = load_image_b64(png_path)
    except (OSError, UnicodeError) as e:
        # An unreadable input cannot be fixed by retrying the API call.
        safe_print(f" p{page_number} read error: {e}")
        return fallback_chunk(page_number, f"read error: {e}", png_path)

    prompt = PAGE_PROMPT.format(
        doc_title=DOC_TITLE,
        doc_id=DOC_ID,
        page_number=page_number,
        total_pages=TOTAL_PAGES,
        png_filename=png_filename,
        ocr_text=ocr_text,
    )

    last_error = "unknown error"
    for attempt in range(MAX_ATTEMPTS):
        try:
            data = _vision_json(img_b64, prompt, max_tokens=4096)
            chunks = data.get("chunks")
            if not isinstance(chunks, list):
                raise ValueError("response JSON has no 'chunks' list")
            # Later phases assign keys on every chunk, so keep only objects.
            data["chunks"] = [c for c in chunks if isinstance(c, dict)]
            data["page_number"] = page_number
            data["png_path"] = str(png_path)
            data["png_filename"] = png_filename
            safe_print(f" p{page_number} ({png_filename}): {len(data['chunks'])} chunks")
            return data
        except json.JSONDecodeError as e:
            # Malformed JSON: retry immediately, no back-off.
            safe_print(f" p{page_number} JSON error attempt {attempt+1}: {e}")
            last_error = f"JSON parse: {e}"
        except Exception as e:  # best-effort boundary: API/network/shape errors
            safe_print(f" p{page_number} error attempt {attempt+1}: {e}")
            last_error = str(e)
            if attempt < MAX_ATTEMPTS - 1:
                time.sleep(2 ** attempt)  # 1s, 2s back-off between attempts
    return fallback_chunk(page_number, last_error, png_path)


# Sent verbatim (not through str.format), so the JSON skeleton uses single
# braces; doubled braces here would be shown to the model literally.
IMAGE_ANALYST_PROMPT = """You are an image analyst for a declassified FBI UAP/UFO document archive.
Analyze this cropped image from FBI file 62-HQ-83894 about Flying Saucers/UAP.

Return ONLY valid JSON (no markdown, no fences):
{
  "image_description_en": "detailed English description",
  "image_description_pt_br": "descrição detalhada em português brasileiro",
  "image_type": "photograph|diagram|map|sketch|stamp|chart|handwriting|newspaper_clipping|other",
  "extracted_text": "visible text verbatim or null",
  "ufo_anomaly_detected": false,
  "ufo_anomaly_type": null,
  "ufo_anomaly_rationale": null,
  "cryptid_anomaly_detected": false,
  "cryptid_anomaly_type": null,
  "cryptid_anomaly_rationale": null
}"""

# Keys copied from the image-analysis response onto the chunk.
_IMAGE_ANALYSIS_KEYS = (
    "image_description_en", "image_description_pt_br", "image_type", "extracted_text",
    "ufo_anomaly_detected", "ufo_anomaly_type", "ufo_anomaly_rationale",
    "cryptid_anomaly_detected", "cryptid_anomaly_type", "cryptid_anomaly_rationale",
)

# Full-page box used for any bbox component that is missing or unusable.
_BBOX_DEFAULTS = {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}


def _bbox_floats(chunk: dict) -> dict:
    """Return the chunk's bbox as four floats, defaulting bad/missing parts."""
    raw = chunk.get("bbox")
    if not isinstance(raw, dict):
        raw = {}
    box = {}
    for key, default in _BBOX_DEFAULTS.items():
        try:
            box[key] = float(raw.get(key, default))
        except (TypeError, ValueError):
            box[key] = default
    return box


def crop_and_analyze_image(chunk: dict) -> dict:
    """Crop an image chunk out of its page PNG and attach the model's analysis.

    The crop (bbox plus 0.5% padding, clamped to the page) is saved as
    ``IMAGES_DIR/IMG-<chunk_id>.png``. The analysis keys listed in
    ``_IMAGE_ANALYSIS_KEYS`` are copied onto *chunk* in place.

    Both steps are best-effort: a failure is logged and the chunk is returned
    without the fields that step would have added.

    Returns:
        The same *chunk* dict.
    """
    chunk_id = chunk["chunk_id"]
    png_path = chunk["png_path"]
    box = _bbox_floats(chunk)

    # Crop
    try:
        # Context manager closes the page image's file handle.
        with PILImage.open(png_path) as im:
            W, H = im.size
            pad = 0.005
            left = max(0, int((box["x"] - pad) * W))
            top = max(0, int((box["y"] - pad) * H))
            right = min(W, int((box["x"] + box["w"] + pad) * W))
            bottom = min(H, int((box["y"] + box["h"] + pad) * H))
            if right <= left or bottom <= top:
                raise ValueError(f"empty crop region {(left, top, right, bottom)}")
            out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
            im.crop((left, top, right, bottom)).save(str(out_path))
        img_b64 = load_image_b64(out_path)
    except Exception as e:  # best-effort: keep the chunk, skip the analysis
        safe_print(f" Crop error {chunk_id}: {e}")
        return chunk

    # Analyze
    try:
        analysis = _vision_json(img_b64, IMAGE_ANALYST_PROMPT, max_tokens=1024)
        for key in _IMAGE_ANALYSIS_KEYS:
            if key in analysis:
                chunk[key] = analysis[key]
        safe_print(f" image analyzed: {chunk_id} ufo={chunk.get('ufo_anomaly_detected',False)}")
    except Exception as e:  # best-effort: the crop is already on disk
        safe_print(f" Image analysis error {chunk_id}: {e}")
    return chunk


def yaml_val(v):
    """Render *v* as an inline YAML scalar or flow sequence.

    ``None`` -> ``null``; bools -> ``true``/``false``; lists -> ``[a, b]``
    with JSON-encoded items; everything else is JSON-encoded (strings come
    out double-quoted).
    """
    if v is None:
        return "null"
    # bool is checked explicitly so True/False render lower-case.
    if isinstance(v, bool):
        return "true" if v else "false"
    if isinstance(v, list):
        if not v:
            return "[]"
        return "[" + ", ".join(json.dumps(i, ensure_ascii=False) for i in v) + "]"
    return json.dumps(v, ensure_ascii=False)


def write_chunk_file(chunk: dict):
    """Write ``CHUNKS_DIR/<chunk_id>.md``: YAML front matter plus EN/PT-BR text.

    Expects the keys assigned in ``main`` (``chunk_id``, ``page``,
    ``order_global``); all model-supplied fields are optional. A missing or
    malformed ``bbox`` falls back to the full page and null text fields are
    written as empty strings.
    """
    chunk_id = chunk["chunk_id"]
    box = _bbox_floats(chunk)
    chunk_type = chunk.get("type", "body_paragraph")
    related_image = f"IMG-{chunk_id}.png" if chunk_type == "image" else None
    png_filename = chunk.get("png_filename", "")
    content_en = chunk.get("content_en") or ""
    content_pt_br = chunk.get("content_pt_br") or ""

    # NOTE(review): blank-line layout after the closing '---' was
    # reconstructed — confirm.
    lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk_type}",
        f"page: {chunk['page']}",
        f"order_in_page: {chunk.get('order_in_page', 1)}",
        f"order_global: {chunk['order_global']}",
        f"bbox: {{x: {box['x']:.2f}, y: {box['y']:.2f}, w: {box['w']:.2f}, h: {box['h']:.2f}}}",
        f"classification: {yaml_val(chunk.get('classification'))}",
        f"formatting: {yaml_val(chunk.get('formatting', []))}",
        f"cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {yaml_val(chunk.get('prev_chunk'))}",
        f"next_chunk: {yaml_val(chunk.get('next_chunk'))}",
        f"related_image: {yaml_val(related_image)}",
        f"related_table: {yaml_val(chunk.get('related_table'))}",
        # yaml_val so a null confidence is written as 'null', not 'None'.
        f"ocr_confidence: {yaml_val(chunk.get('ocr_confidence', 0.8))}",
        f"ocr_source_lines: {yaml_val(chunk.get('ocr_source_lines', []))}",
        f"redaction_code: {yaml_val(chunk.get('redaction_code'))}",
        f"redaction_inferred_content_type: {yaml_val(chunk.get('redaction_inferred_content_type'))}",
        f"image_type: {yaml_val(chunk.get('image_type'))}",
        f"ufo_anomaly_detected: {yaml_val(chunk.get('ufo_anomaly_detected', False))}",
        f"cryptid_anomaly_detected: {yaml_val(chunk.get('cryptid_anomaly_detected', False))}",
        f"ufo_anomaly_type: {yaml_val(chunk.get('ufo_anomaly_type'))}",
        f"ufo_anomaly_rationale: {yaml_val(chunk.get('ufo_anomaly_rationale'))}",
        f"cryptid_anomaly_type: {yaml_val(chunk.get('cryptid_anomaly_type'))}",
        f"cryptid_anomaly_rationale: {yaml_val(chunk.get('cryptid_anomaly_rationale'))}",
        f"image_description_en: {yaml_val(chunk.get('image_description_en'))}",
        f"image_description_pt_br: {yaml_val(chunk.get('image_description_pt_br'))}",
        f"extracted_text: {yaml_val(chunk.get('extracted_text'))}",
        f"source_png: ../../processing/png/{DOC_ID}/{png_filename}",
        "---",
        "",
        f"**EN:** {content_en}",
        "",
        f"**PT-BR:** {content_pt_br}",
        "",
    ]
    (CHUNKS_DIR / f"{chunk_id}.md").write_text("\n".join(lines), encoding="utf-8")


def _order_key(chunk: dict):
    """Sort key for chunks within a page; non-numeric orders sort as 1."""
    order = chunk.get("order_in_page", 1)
    return order if isinstance(order, (int, float)) else 1


def _process_all_pages() -> list:
    """Phase 1: run ``process_page`` over every PNG, BATCH_SIZE at a time."""
    all_pages = []
    page_items = list(enumerate(png_files))  # (idx, path)
    for batch_start in range(0, TOTAL_PAGES, BATCH_SIZE):
        batch = page_items[batch_start: batch_start + BATCH_SIZE]
        safe_print(f"Batch pages {[idx + 1 for idx, _ in batch]}...")
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
            futs = [ex.submit(process_page, idx, pth) for idx, pth in batch]
            for fut in as_completed(futs):
                all_pages.append(fut.result())
    # Completion order is arbitrary; restore page order.
    all_pages.sort(key=lambda p: p["page_number"])
    return all_pages


def _number_chunks(all_pages: list) -> list:
    """Phase 2: flatten pages into one list with global ids and prev/next links."""
    global_chunks = []
    for page_data in all_pages:
        for chunk in sorted(page_data.get("chunks", []), key=_order_key):
            counter = len(global_chunks) + 1
            chunk["chunk_id"] = f"c{counter:04d}"
            chunk["page"] = page_data["page_number"]
            chunk["png_path"] = page_data.get("png_path", "")
            chunk["png_filename"] = page_data.get("png_filename", "")
            chunk["order_global"] = counter
            global_chunks.append(chunk)

    last = len(global_chunks) - 1
    for i, chunk in enumerate(global_chunks):
        chunk["prev_chunk"] = global_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = global_chunks[i + 1]["chunk_id"] if i < last else None
    return global_chunks


def _analyze_images(image_chunks: list) -> None:
    """Phase 3: crop and analyze image chunks (mutated in place), in batches."""
    for batch_start in range(0, len(image_chunks), BATCH_SIZE):
        batch = image_chunks[batch_start: batch_start + BATCH_SIZE]
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
            futs = [ex.submit(crop_and_analyze_image, chunk) for chunk in batch]
            for fut in as_completed(futs):
                fut.result()  # side-effects already applied


def _write_index(global_chunks: list, build_at: str) -> None:
    """Phase 5: write ``OUT_DIR/_index.json`` summarising every chunk."""
    index_chunks = [
        {
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "body_paragraph"),
            "page": chunk["page"],
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": chunk.get("bbox", {"x": 0, "y": 0, "w": 1, "h": 1}),
            # str(... or "") guards against a null/non-string content_en.
            "preview": str(chunk.get("content_en") or "")[:80].replace("\n", " "),
        }
        for chunk in global_chunks
    ]
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(global_chunks),
        "build_approach": "subagents",
        "build_model": MODEL_NAME,
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index_data, ensure_ascii=False, indent=2), encoding="utf-8"
    )


def _chunk_section(chunk: dict) -> list:
    """Return the ``document.md`` fragments for one chunk."""
    cid = chunk["chunk_id"]
    page_num = chunk["page"]
    ctype = chunk.get("type", "body_paragraph")
    box = _bbox_floats(chunk)
    bbox_str = f"{box['x']:.2f}/{box['y']:.2f}/{box['w']:.2f}/{box['h']:.2f}"

    parts = [
        # NOTE(review): these two literals emit only a newline each; they look
        # like lines whose content was lost in extraction — confirm against
        # the original file.
        "\n",
        "\n",
        f"### Chunk {cid} — {ctype} · p{page_num} · bbox: {bbox_str}\n\n",
    ]
    if ctype == "image":
        parts.append(f"![chunk image](./images/IMG-{cid}.png)\n\n")
        d_en = chunk.get("image_description_en")
        d_pt = chunk.get("image_description_pt_br")
        if d_en:
            parts.append(f"**Image Description (EN):** {d_en}\n\n")
        if d_pt:
            parts.append(f"**Descrição da Imagem (PT-BR):** {d_pt}\n\n")
    parts.append(f"**EN:** {chunk.get('content_en') or ''}\n\n")
    parts.append(f"**PT-BR:** {chunk.get('content_pt_br') or ''}\n\n")

    meta = {
        "chunk_id": cid,
        "type": ctype,
        "page": page_num,
        "order_in_page": chunk.get("order_in_page", 1),
        "order_global": chunk["order_global"],
        "bbox": chunk.get("bbox", {}),
        "classification": chunk.get("classification"),
        "formatting": chunk.get("formatting", []),
        "cross_page_hint": chunk.get("cross_page_hint", "self_contained"),
        "prev_chunk": chunk.get("prev_chunk"),
        "next_chunk": chunk.get("next_chunk"),
        "ocr_confidence": chunk.get("ocr_confidence", 0.8),
        "redaction_code": chunk.get("redaction_code"),
        "image_type": chunk.get("image_type"),
        "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
        "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
        "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
        "source_png": f"../../processing/png/{DOC_ID}/{chunk.get('png_filename','')}",
    }
    # NOTE(review): the wrapper around "metadata" and after the closing fence
    # appears to have lost surrounding markup in extraction — confirm against
    # the original file; only the visible text is reproduced here.
    parts.append("metadata\n\n```json\n")
    parts.append(json.dumps(meta, ensure_ascii=False, indent=2))
    parts.append("\n```\n\n\n\n---\n\n")
    return parts


def _assemble_document(global_chunks: list, build_at: str,
                       ufo_flagged: list, cryptid_flagged: list) -> str:
    """Phase 6: build the text of ``document.md`` (front matter + all chunks)."""
    type_hist = {}
    for chunk in global_chunks:
        t = chunk.get("type", "body_paragraph")
        type_hist[t] = type_hist.get(t, 0) + 1
    hist_yaml = "\n".join(f" {k}: {v}" for k, v in sorted(type_hist.items()))

    front_matter = "\n".join([
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {len(global_chunks)}",
        "chunk_types_histogram:",
        hist_yaml,
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_flagged, ensure_ascii=False)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_flagged, ensure_ascii=False)}",
        'build_approach: "subagents"',
        f'build_model: "{MODEL_NAME}"',
        f'build_at: "{build_at}"',
        "---",
        "",
    ])
    doc_parts = [front_matter]

    chunks_by_page = {}
    for chunk in global_chunks:
        chunks_by_page.setdefault(chunk["page"], []).append(chunk)

    for page_num in sorted(chunks_by_page):
        doc_parts.append(f"\n## Page {page_num}\n\n")
        for chunk in chunks_by_page[page_num]:
            doc_parts.extend(_chunk_section(chunk))
    return "".join(doc_parts)


def main():
    """Run the full rebuild and print summary statistics."""
    start = time.time()
    safe_print(f"=== Rebuild {DOC_ID} ===")
    safe_print(f"Total pages: {TOTAL_PAGES}")

    # Phase 1: Process pages in batches
    all_pages = _process_all_pages()

    # Phase 2: Global chunk numbering + prev/next links
    global_chunks = _number_chunks(all_pages)
    total_chunks = len(global_chunks)
    safe_print(f"Total chunks: {total_chunks}")

    # Phase 3: Crop & analyze images
    image_chunks = [c for c in global_chunks if c.get("type") == "image"]
    safe_print(f"Image chunks: {len(image_chunks)}")
    _analyze_images(image_chunks)

    # Phase 4: Write chunk files
    safe_print("Writing chunk files...")
    for chunk in global_chunks:
        write_chunk_file(chunk)

    # Phase 5: Write _index.json
    safe_print("Writing _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    _write_index(global_chunks, build_at)

    # Phase 6: Assemble document.md
    safe_print("Assembling document.md...")
    ufo_flagged = [c["chunk_id"] for c in global_chunks if c.get("ufo_anomaly_detected")]
    cryptid_flagged = [c["chunk_id"] for c in global_chunks if c.get("cryptid_anomaly_detected")]
    doc_content = _assemble_document(global_chunks, build_at, ufo_flagged, cryptid_flagged)
    (OUT_DIR / "document.md").write_text(doc_content, encoding="utf-8")

    doc_md_bytes = len(doc_content.encode("utf-8"))
    elapsed = int(time.time() - start)
    safe_print(f"\nSTATS pages={TOTAL_PAGES} chunks={total_chunks} images={len(image_chunks)} tables=0 ufo={len(ufo_flagged)} cryptid={len(cryptid_flagged)} doc_md_bytes={doc_md_bytes}")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={len(image_chunks)}, tables_stitched=0, ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, wall_seconds={elapsed}")


if __name__ == "__main__":
    main()