#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_gemini.py

Full pipeline to rebuild doc-65-hs1-834228961-62-hq-83894-section-4.
Uses Google Gemini flash for vision analysis of each page.

CRITICAL: Always wraps Gemini calls with thread timeout (known hang issue).

Pipeline stages (all driven by ``main``):
  1. Pair page PNGs with OCR text files.
  2. Send each page image to Gemini and parse the JSON it returns.
  3. Crop image chunks, write one markdown file per chunk, write
     ``_index.json`` and the assembled ``document.md``.
"""

import concurrent.futures
import datetime
import json
import math
import os
import re
import sys
import threading
import time
import warnings
from pathlib import Path

try:
    from PIL import Image as PILImage
except ImportError:
    # Pillow is only needed for cropping; main() refuses to start without it,
    # but the pure helpers in this module stay importable.
    PILImage = None

warnings.filterwarnings('ignore')

DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-4"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 4 — FBI Flying Discs / UAP Investigation Files"
RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")
CHUNKS_DIR = RAW_DIR / "chunks"
IMAGES_DIR = RAW_DIR / "images"
TABLES_DIR = RAW_DIR / "tables"

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = "gemini-2.5-flash"

BATCH_SIZE = 4
MAX_WORKERS = 4
GEMINI_TIMEOUT_SEC = 120
# Upper bound for one whole batch of pages (was an inline literal in main()).
BATCH_TIMEOUT_SEC = 600

# Bounding box used whenever the model gives none, or gives an unusable one.
_DEFAULT_BBOX = {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90}

# Plain strings that YAML would read as null / booleans instead of text.
_YAML_RESERVED_WORDS = frozenset(
    {'', '~', 'null', 'true', 'false', 'yes', 'no', 'on', 'off', 'y', 'n'}
)
# Characters that force double-quoting. ',' matters because lists and dicts
# are emitted in YAML flow style, where a bare comma splits the item.
_YAML_SPECIAL_CHARS = frozenset(':#[]{}|>*&!\'"\n\r,%@`')
_YAML_DATE_RE = re.compile(r'\d{4}-\d\d?-\d\d?')


def _ensure_output_dirs():
    """Create the chunk, image and table output directories if missing."""
    for d in [CHUNKS_DIR, IMAGES_DIR, TABLES_DIR]:
        d.mkdir(parents=True, exist_ok=True)


def build_page_map():
    """Pair page PNGs with OCR text files by sorted filename order.

    Returns:
        dict mapping 1-based page number to
        ``{'png': <path>, 'ocr': <path>, 'png_filename': <name>}``.

    Pairing is positional: the n-th PNG goes with the n-th OCR file. When the
    two directories hold different numbers of files the extra files are
    ignored and a warning is printed, because the pairing may be shifted.
    """
    pngs = sorted(f for f in os.listdir(PNG_DIR) if f.endswith('.png'))
    ocrs = sorted(f for f in os.listdir(OCR_DIR) if f.endswith('.txt'))
    if len(pngs) != len(ocrs):
        print(
            f"WARNING: {len(pngs)} PNG files but {len(ocrs)} OCR files; "
            f"only the first {min(len(pngs), len(ocrs))} are paired",
            flush=True,
        )
    page_map = {}
    for i, (png, ocr) in enumerate(zip(pngs, ocrs), 1):
        page_map[i] = {
            'png': str(PNG_DIR / png),
            'ocr': str(OCR_DIR / ocr),
            'png_filename': png,
        }
    return page_map


def read_ocr(path):
    """Return the stripped text of an OCR file, or "" if it cannot be read."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except (OSError, UnicodeDecodeError):
        # Best effort: a missing or undecodable OCR file must not stop the run.
        return ""


def now_iso():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
    # datetime.utcnow() is deprecated; the aware form gives the same string.
    return datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')


# Compact prompt that minimizes token usage in the response.
# Formatted with ``% (page_num, page_num)``, so it must contain exactly two
# %d fields and no other '%'.
# NOTE(review): the <type>, <int> and <bool> placeholders were empty in the
# copy of this file that was reviewed (angle-bracket tokens lost); they are
# reconstructions - confirm against the original prompt.
PAGE_ANALYSIS_PROMPT = """Analyze this FBI declassified document page. Return ONLY raw JSON (no markdown fences).

JSON format (keep content_en values SHORT — max 300 chars per chunk, truncate with "..." if needed):
{"page_number":%d,"chunks":[{"type":"<type>","order_in_page":<int>,"content_en":"","content_pt_br":"","bbox":{"x":<0-1>,"y":<0-1>,"w":<0-1>,"h":<0-1>},"classification":null,"formatting":[],"cross_page_hint":"self_contained","ocr_confidence":<0-1>,"ocr_source_lines":[],"redaction_code":null,"redaction_inferred_content_type":null,"image_type":null,"ufo_anomaly_detected":<bool>,"cryptid_anomaly_detected":false,"ufo_anomaly_type":null,"ufo_anomaly_rationale":null,"cryptid_anomaly_type":null,"cryptid_anomaly_rationale":null,"image_description_en":null,"image_description_pt_br":null,"extracted_text":null}]}

Rules:
- Each paragraph/section = separate chunk
- Redacted: type=redaction, content_en="[REDACTED]"
- Blank page: one chunk type=blank
- Flying disc/UAP reports: ufo_anomaly_detected=true
- bbox: x=left, y=top, w=width, h=height, all 0.0-1.0
- Page %d of 179, FBI flying discs 1947"""


def fallback_chunk(page_num):
    """Return a placeholder page result for a page whose analysis failed.

    The result has the same shape as a successful analysis (``page_number``
    plus a one-element ``chunks`` list) and additionally carries
    ``"analysis_failed": True`` so the caller can avoid caching it and can
    report it at the end of the run.
    """
    return {
        "page_number": page_num,
        "analysis_failed": True,
        "chunks": [{
            "type": "body_text",
            "order_in_page": 1,
            "content_en": f"[Page {page_num} — vision analysis failed]",
            "content_pt_br": f"[Página {page_num} — análise visual falhou]",
            "bbox": dict(_DEFAULT_BBOX),
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "cryptid_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None
        }]
    }


def _is_fallback(page_result):
    """Return True if *page_result* is a placeholder from fallback_chunk()."""
    return isinstance(page_result, dict) and bool(page_result.get('analysis_failed'))


def _gemini_call_inner(page_num, png_path, prompt_text):
    """Single Gemini API call — run inside thread for timeout.

    Sends the page image bytes plus the prompt to GEMINI_MODEL and returns
    ``response.text``. ``page_num`` is accepted for symmetry with the caller
    but is not used here.
    """
    # Imported lazily so the rest of the module works without the SDK.
    import google.genai as genai
    import google.genai.types as gTypes

    client = genai.Client(api_key=GEMINI_API_KEY)
    with open(png_path, 'rb') as f:
        img_bytes = f.read()
    response = client.models.generate_content(
        model=GEMINI_MODEL,
        contents=[
            gTypes.Part.from_bytes(data=img_bytes, mime_type='image/png'),
            prompt_text
        ],
        config=gTypes.GenerateContentConfig(
            temperature=0.1,
            max_output_tokens=16384,
        )
    )
    return response.text


def _call_with_timeout(func, timeout_sec, *args):
    """Run ``func(*args)`` in a daemon thread and wait at most *timeout_sec*.

    Returns the function's result, re-raises any exception it raised, or
    raises ``concurrent.futures.TimeoutError`` if it is still running when
    the timeout expires.

    A daemon thread is used instead of a ThreadPoolExecutor on purpose:
    leaving a ``with ThreadPoolExecutor()`` block waits for its workers, and
    executor threads are also joined at interpreter exit, so a hung call
    would block despite the timeout. An abandoned daemon thread blocks
    neither.
    """
    outcome = {}

    def runner():
        try:
            outcome['value'] = func(*args)
        except Exception as exc:  # re-raised below in the caller's thread
            outcome['error'] = exc

    worker = threading.Thread(target=runner, daemon=True)
    worker.start()
    worker.join(timeout_sec)
    if worker.is_alive():
        raise concurrent.futures.TimeoutError(f"call exceeded {timeout_sec}s")
    if 'error' in outcome:
        raise outcome['error']
    return outcome.get('value')


def clean_json_text(text):
    """Try to clean and extract JSON from potentially truncated response.

    Strips markdown code fences, then tries a direct parse. If that fails,
    walks backwards over every '}' in the text and tries to parse the prefix
    ending there, optionally closing the chunks array and root object.

    Returns the parsed value, or None when *text* is None or nothing parses.
    """
    if text is None:
        return None
    text = text.strip()
    # Remove markdown fences
    text = re.sub(r'^```(?:json)?\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\s*```\s*$', '', text, flags=re.MULTILINE)
    text = text.strip()

    # Try direct parse first
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Try to find the JSON object boundaries
    start = text.find('{')
    if start == -1:
        return None

    # Repair truncated JSON: find the last complete chunk object and close
    # the array and root object after it.
    text_from_start = text[start:]
    last_bracket = text_from_start.rfind('}')
    while last_bracket > 0:
        candidate = text_from_start[:last_bracket + 1]
        for suffix in ['', ']}', ']}}']:
            try:
                return json.loads(candidate + suffix)
            except json.JSONDecodeError:
                pass
        last_bracket = text_from_start.rfind('}', 0, last_bracket)
    return None


def _normalize_page_result(data):
    """Return *data* if it is a usable page result, else None.

    Usable means: a dict whose 'chunks' value is a list holding at least one
    dict. Non-dict list items are dropped, because later stages call
    ``.get`` on every chunk.
    """
    if not isinstance(data, dict):
        return None
    chunks = data.get('chunks')
    if not isinstance(chunks, list):
        return None
    chunks = [c for c in chunks if isinstance(c, dict)]
    if not chunks:
        return None
    data['chunks'] = chunks
    return data


def analyze_page(page_num, png_path, ocr_text, retry=3):
    """Analyze one page image with Gemini, retrying on failure.

    Args:
        page_num: 1-based page number, interpolated into the prompt.
        png_path: path of the page image sent to the model.
        ocr_text: OCR text of the page. Currently not sent to the model.
        retry: maximum number of attempts.

    Returns:
        The parsed page result (dict with a non-empty 'chunks' list), or
        ``fallback_chunk(page_num)`` when every attempt fails.
    """
    prompt = PAGE_ANALYSIS_PROMPT % (page_num, page_num)
    for attempt in range(retry):
        is_last_attempt = attempt >= retry - 1
        try:
            text = _call_with_timeout(
                _gemini_call_inner, GEMINI_TIMEOUT_SEC, page_num, png_path, prompt
            )
            data = _normalize_page_result(clean_json_text(text))
            if data is not None:
                return data
            print(f" P{page_num} no valid JSON (attempt {attempt+1})", flush=True)
            delay = 2
        except concurrent.futures.TimeoutError:
            print(f" P{page_num} TIMEOUT (attempt {attempt+1}/{retry})", flush=True)
            delay = 5
        except Exception as e:
            err = str(e)
            print(f" P{page_num} error (attempt {attempt+1}): {err[:120]}", flush=True)
            if '429' in err or 'RESOURCE_EXHAUSTED' in err:
                # Back off longer on each consecutive rate-limit response.
                delay = 15 * (attempt + 1)
                if not is_last_attempt:
                    print(f" Rate limit hit, waiting {delay}s...", flush=True)
            else:
                delay = 3
        # Sleeping after the final attempt would only delay the fallback.
        if not is_last_attempt:
            time.sleep(delay)
    return fallback_chunk(page_num)


def process_page_task(args):
    """Thread-pool task: analyze one page.

    Args:
        args: tuple ``(page_num, png_path, ocr_path)``.

    Returns:
        ``(page_num, page_result)``.
    """
    page_num, png_path, ocr_path = args
    ocr_text = read_ocr(ocr_path)
    result = analyze_page(page_num, png_path, ocr_text)
    n = len(result.get('chunks', []))
    print(f" P{page_num:03d}: {n} chunks", flush=True)
    return page_num, result


def crop_image_chunk(page_png, bbox, out_path):
    """Crop the *bbox* region out of a page image and save it to *out_path*.

    *bbox* holds fractional x/y/w/h values (0-1) relative to the page size.
    A small padding is added around the box and the result is clamped to the
    image bounds.

    Returns True if a crop was written, False if the region is empty or any
    error occurred (the error is printed, not raised).
    """
    try:
        # Context manager so the image file handle is always released.
        with PILImage.open(page_png) as im:
            W, H = im.size
            x = max(0.0, float(bbox.get('x', 0)))
            y = max(0.0, float(bbox.get('y', 0)))
            w = max(0.01, float(bbox.get('w', 0.5)))
            h = max(0.01, float(bbox.get('h', 0.5)))
            pad = 0.005
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                return False
            Path(out_path).parent.mkdir(parents=True, exist_ok=True)
            im.crop((left, top, right, bottom)).save(str(out_path))
        return True
    except Exception as e:
        print(f" Crop error: {e}", flush=True)
        return False


def _needs_yaml_quotes(s):
    """Return True if plain (unquoted) *s* would not round-trip as a string."""
    if s.lower() in _YAML_RESERVED_WORDS:  # also covers the empty string
        return True
    if s != s.strip() or s[0] in '-?':
        return True
    if any(c in _YAML_SPECIAL_CHARS or ord(c) < 0x20 for c in s):
        return True
    if _YAML_DATE_RE.match(s):
        return True
    # Anything Python can read as a number, YAML will likely read as one too.
    for parse in (float, lambda text: int(text, 0)):
        try:
            parse(s)
        except ValueError:
            continue
        return True
    return False


def yaml_scalar(v):
    """Render a Python value as YAML flow-style text for the front matter.

    None -> ``null``; bool -> ``true``/``false``; numbers -> ``str(v)``
    (non-finite floats -> ``.nan``/``.inf``/``-.inf``); lists and dicts ->
    flow sequences/mappings; everything else -> ``str(v)``, double-quoted
    whenever the plain form would be ambiguous in YAML.
    """
    if v is None:
        return "null"
    if isinstance(v, bool):  # must come before the int check: bool is an int
        return "true" if v else "false"
    if isinstance(v, float) and not math.isfinite(v):
        if math.isnan(v):
            return ".nan"
        return ".inf" if v > 0 else "-.inf"
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, list):
        if not v:
            return "[]"
        return "[" + ", ".join(yaml_scalar(i) for i in v) + "]"
    if isinstance(v, dict):
        return "{" + ", ".join(f"{k}: {yaml_scalar(vv)}" for k, vv in v.items()) + "}"
    s = str(v)
    if _needs_yaml_quotes(s):
        # A JSON string is a valid YAML double-quoted scalar; carriage
        # returns are dropped rather than escaped.
        return json.dumps(s.replace('\r', ''), ensure_ascii=False)
    return s


def bbox_safe(bbox):
    """Return a bbox dict with float x/y/w/h, defaulting unusable values.

    A missing, empty or non-dict *bbox* yields the default box. Individual
    coordinates that are missing, None, non-numeric or non-finite fall back
    to their default instead of raising.
    """
    if not bbox or not isinstance(bbox, dict):
        return dict(_DEFAULT_BBOX)
    safe = {}
    for key, default in _DEFAULT_BBOX.items():
        try:
            value = float(bbox.get(key, default))
        except (TypeError, ValueError):
            value = default
        safe[key] = value if math.isfinite(value) else default
    return safe


def write_chunk_file(chunk_id, chunk_data, page_num, order_global,
                     prev_chunk, next_chunk, source_png_filename):
    """Write ``chunks/<chunk_id>.md`` (YAML front matter + EN/PT-BR text).

    Args:
        chunk_id: identifier such as ``c0001``; also the output file stem.
        chunk_data: one chunk dict as returned by the page analysis.
        page_num: page the chunk belongs to.
        order_global: 1-based position of the chunk in the whole document.
        prev_chunk / next_chunk: neighbouring chunk ids, or None at the ends.
        source_png_filename: file name of the page image.

    Returns:
        The metadata dict that was written as front matter.
    """
    bbox = bbox_safe(chunk_data.get('bbox'))
    ctype = chunk_data.get('type', 'body_text')
    related_image = f"IMG-{chunk_id}.png" if ctype == 'image' else None

    meta = {
        "chunk_id": chunk_id,
        "type": ctype,
        "page": page_num,
        "order_in_page": chunk_data.get('order_in_page', 1),
        "order_global": order_global,
        "bbox": bbox,
        "classification": chunk_data.get('classification'),
        "formatting": chunk_data.get('formatting', []),
        "cross_page_hint": chunk_data.get('cross_page_hint', 'self_contained'),
        "prev_chunk": prev_chunk,
        "next_chunk": next_chunk,
        "related_image": related_image,
        "related_table": None,
        "ocr_confidence": chunk_data.get('ocr_confidence', 0.8),
        "ocr_source_lines": chunk_data.get('ocr_source_lines', []),
        "redaction_code": chunk_data.get('redaction_code'),
        "redaction_inferred_content_type": chunk_data.get('redaction_inferred_content_type'),
        "image_type": chunk_data.get('image_type'),
        "ufo_anomaly_detected": bool(chunk_data.get('ufo_anomaly_detected', False)),
        "cryptid_anomaly_detected": bool(chunk_data.get('cryptid_anomaly_detected', False)),
        "ufo_anomaly_type": chunk_data.get('ufo_anomaly_type'),
        "ufo_anomaly_rationale": chunk_data.get('ufo_anomaly_rationale'),
        "cryptid_anomaly_type": chunk_data.get('cryptid_anomaly_type'),
        "cryptid_anomaly_rationale": chunk_data.get('cryptid_anomaly_rationale'),
        "image_description_en": chunk_data.get('image_description_en'),
        "image_description_pt_br": chunk_data.get('image_description_pt_br'),
        "extracted_text": chunk_data.get('extracted_text'),
        "source_png": f"../../processing/png/{DOC_ID}/{source_png_filename}",
    }

    lines = ["---"]
    # yaml_scalar already renders nested dicts (the bbox) in flow style.
    lines.extend(f"{k}: {yaml_scalar(v)}" for k, v in meta.items())
    lines.append("---")
    lines.append("")
    # "or ''" so a null content value is written as empty text, not "None".
    lines.append(f"**EN:** {chunk_data.get('content_en') or ''}")
    lines.append("")
    lines.append(f"**PT-BR:** {chunk_data.get('content_pt_br') or ''}")
    lines.append("")

    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(str(out_path), 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
    return meta


def _load_cache(cache_file, valid_pages):
    """Load cached page results, keeping only usable entries.

    Entries are dropped when the page is not in *valid_pages*, the value is
    not a dict, or the value is a fallback placeholder (so failed pages are
    retried). An unreadable or corrupt cache yields an empty dict.
    """
    try:
        with open(str(cache_file), 'r', encoding='utf-8') as f:
            cached = json.load(f)
    except (OSError, json.JSONDecodeError) as exc:
        print(f" Cache unreadable, ignoring it: {exc}", flush=True)
        return {}
    if not isinstance(cached, dict):
        return {}
    results = {}
    for key, value in cached.items():
        try:
            page = int(key)
        except (TypeError, ValueError):
            continue
        if page in valid_pages and isinstance(value, dict) and not _is_fallback(value):
            results[page] = value
    return results


def _save_cache(cache_file, page_results):
    """Atomically write the successfully analyzed pages to *cache_file*."""
    payload = {str(k): v for k, v in page_results.items() if not _is_fallback(v)}
    tmp_file = cache_file.with_name(cache_file.name + ".tmp")
    with open(str(tmp_file), 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False)
    # Rename is atomic, so an interrupted run never leaves a half-written cache.
    os.replace(str(tmp_file), str(cache_file))


def _process_pending_pages(page_map, pages_to_process, all_page_results, cache_file):
    """Analyze *pages_to_process* in batches, filling *all_page_results*.

    The cache file is rewritten after every batch so an interrupted run can
    resume. Pages whose task raises, or that are still unfinished when the
    batch timeout expires, get a fallback result.
    """
    total_batches = (len(pages_to_process) + BATCH_SIZE - 1) // BATCH_SIZE
    for batch_idx, batch_start in enumerate(range(0, len(pages_to_process), BATCH_SIZE)):
        batch = pages_to_process[batch_start:batch_start + BATCH_SIZE]
        batch_args = [(p, page_map[p]['png'], page_map[p]['ocr']) for p in batch]
        print(f"Batch {batch_idx+1}/{total_batches}: pages {batch[0]}-{batch[-1]}", flush=True)

        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {executor.submit(process_page_task, args): args[0] for args in batch_args}
            try:
                for future in concurrent.futures.as_completed(futures, timeout=BATCH_TIMEOUT_SEC):
                    page_num = futures[future]
                    try:
                        pn, result = future.result(timeout=5)
                        all_page_results[pn] = result
                    except Exception as e:
                        print(f" P{page_num} future error: {e}", flush=True)
                        all_page_results[page_num] = fallback_chunk(page_num)
            except concurrent.futures.TimeoutError:
                for future, page_num in futures.items():
                    if page_num not in all_page_results:
                        print(f" P{page_num} batch timeout", flush=True)
                        future.cancel()
                        all_page_results[page_num] = fallback_chunk(page_num)

        _save_cache(cache_file, all_page_results)
        print(f" Cache: {len(all_page_results)} pages", flush=True)

        if batch_start + BATCH_SIZE < len(pages_to_process):
            time.sleep(1)


def _order_key(chunk):
    """Sort key for chunks within a page; unusable order values sort last."""
    try:
        return float(chunk.get('order_in_page', 1))
    except (TypeError, ValueError):
        return float('inf')


def _write_index(index_entries, total_pages, total_chunks, build_at):
    """Write ``_index.json`` listing every chunk of the document."""
    # NOTE(review): build_approach / build_model do not match GEMINI_MODEL,
    # which is what this script actually calls. Left unchanged because
    # downstream consumers may expect these values - confirm.
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_entries
    }
    with open(str(RAW_DIR / "_index.json"), 'w', encoding='utf-8') as f:
        json.dump(index_data, f, indent=2, ensure_ascii=False)


def _assemble_document(all_chunks_ordered, chunk_id_list, all_chunk_meta, type_hist,
                       ufo_anomalies, cryptid_anomalies, total_pages, build_at):
    """Build and return the full text of ``document.md``."""
    total_chunks = len(all_chunks_ordered)
    doc_lines = []
    doc_lines.append("---")
    doc_lines.append('schema_version: "0.2.0"')
    doc_lines.append("type: master_document")
    doc_lines.append(f"doc_id: {DOC_ID}")
    doc_lines.append(f'canonical_title: "{DOC_TITLE}"')
    doc_lines.append(f"total_pages: {total_pages}")
    doc_lines.append(f"total_chunks: {total_chunks}")
    hist_str = ", ".join(f'"{k}": {v}' for k, v in sorted(type_hist.items()))
    doc_lines.append(f"chunk_types_histogram: {{{hist_str}}}")
    doc_lines.append("multi_page_tables: []")
    doc_lines.append(f"ufo_anomalies_flagged: [{', '.join(ufo_anomalies)}]")
    doc_lines.append(f"cryptid_anomalies_flagged: [{', '.join(cryptid_anomalies)}]")
    doc_lines.append('build_approach: "subagents"')
    doc_lines.append("build_model: claude-sonnet-4-6")
    doc_lines.append(f"build_at: {build_at}")
    doc_lines.append("---")
    doc_lines.append("")

    chunks_by_page = {}
    for i, (page_num, chunk_data, _source_png) in enumerate(all_chunks_ordered):
        chunks_by_page.setdefault(page_num, []).append(
            (chunk_id_list[i], chunk_data, all_chunk_meta[i])
        )

    for page_num in sorted(chunks_by_page):
        doc_lines.append(f"## Page {page_num}")
        doc_lines.append("")
        for chunk_id, chunk_data, meta in chunks_by_page[page_num]:
            ctype = chunk_data.get('type', 'body_text')
            bbox = bbox_safe(chunk_data.get('bbox'))
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"
            # NOTE(review): the next two literals were empty f-strings in the
            # copy of this file that was reviewed (markup lost); the comment
            # marker and anchor below are reconstructions - confirm.
            doc_lines.append(f"<!-- chunk:{chunk_id} -->")
            doc_lines.append(f'<a id="{chunk_id}"></a>')
            doc_lines.append(f"### Chunk {chunk_id} — {ctype} · p{page_num} · bbox: {bbox_str}")
            doc_lines.append("")
            doc_lines.append(f"**EN:** {chunk_data.get('content_en') or ''}")
            doc_lines.append("")
            doc_lines.append(f"**PT-BR:** {chunk_data.get('content_pt_br') or ''}")
            doc_lines.append("")
            if ctype == 'image':
                doc_lines.append(f"![{chunk_id} image](./images/IMG-{chunk_id}.png)")
                doc_lines.append("")
                desc = chunk_data.get('image_description_en', '')
                if desc:
                    doc_lines.append(f"*{desc}*")
                    doc_lines.append("")
            # NOTE(review): the <details>/<summary> wrapper is reconstructed
            # from a broken string literal ("...metadata...") - confirm.
            doc_lines.append("<details><summary>metadata</summary>")
            doc_lines.append("")
            doc_lines.append("```json")
            doc_lines.append(json.dumps(meta, indent=2, ensure_ascii=False))
            doc_lines.append("```")
            doc_lines.append("")
            doc_lines.append("</details>")
            doc_lines.append("")
            doc_lines.append("---")
            doc_lines.append("")

    return '\n'.join(doc_lines)


def main():
    """Run the whole rebuild: analyze pages, then write all output files.

    Successfully analyzed pages are cached after every batch, so a rerun
    after a crash only processes the missing pages. The cache is deleted at
    the end of a run in which every page succeeded; if any page fell back to
    a placeholder the cache is kept so the next run retries just those pages.
    """
    start_time = time.time()
    if PILImage is None:
        sys.exit("Pillow is required (pip install Pillow) to crop image chunks.")
    _ensure_output_dirs()

    page_map = build_page_map()
    total_pages = len(page_map)
    print(f"Pages: {total_pages}, Model: {GEMINI_MODEL}", flush=True)

    all_page_results = {}
    cache_file = RAW_DIR / "_page_results_cache.json"
    if cache_file.exists():
        print("Loading partial cache...", flush=True)
        all_page_results = _load_cache(cache_file, page_map)
        print(f" Loaded {len(all_page_results)} cached pages", flush=True)

    pages_to_process = [p for p in page_map if p not in all_page_results]
    print(f"Pages to process: {len(pages_to_process)}", flush=True)
    if pages_to_process and not GEMINI_API_KEY:
        # Without a key every page would burn all its retries and fall back.
        sys.exit("GEMINI_API_KEY is not set; cannot analyze the remaining pages.")

    _process_pending_pages(page_map, pages_to_process, all_page_results, cache_file)

    print("\nAll pages processed. Building output...", flush=True)

    all_chunks_ordered = []
    for page_num in sorted(all_page_results):
        result = all_page_results[page_num]
        page_chunks = [c for c in (result.get('chunks') or []) if isinstance(c, dict)]
        source_png = page_map[page_num]['png_filename']
        for chunk in sorted(page_chunks, key=_order_key):
            all_chunks_ordered.append((page_num, chunk, source_png))

    total_chunks = len(all_chunks_ordered)
    print(f"Total chunks: {total_chunks}", flush=True)

    chunk_id_list = [f"c{(i+1):04d}" for i in range(total_chunks)]

    print("Cropping image chunks...", flush=True)
    images_extracted = 0
    for i, (page_num, chunk_data, source_png) in enumerate(all_chunks_ordered):
        if chunk_data.get('type') == 'image':
            chunk_id = chunk_id_list[i]
            bbox = bbox_safe(chunk_data.get('bbox'))
            img_out = IMAGES_DIR / f"IMG-{chunk_id}.png"
            png_path = page_map[page_num]['png']
            if crop_image_chunk(png_path, bbox, img_out):
                images_extracted += 1

    print("Writing chunk files...", flush=True)
    index_entries = []
    all_chunk_meta = []
    ufo_anomalies = []
    cryptid_anomalies = []

    for i, (page_num, chunk_data, source_png) in enumerate(all_chunks_ordered):
        chunk_id = chunk_id_list[i]
        order_global = i + 1
        prev_chunk = chunk_id_list[i-1] if i > 0 else None
        next_chunk = chunk_id_list[i+1] if i < total_chunks - 1 else None

        meta = write_chunk_file(chunk_id, chunk_data, page_num, order_global,
                                prev_chunk, next_chunk, source_png)
        all_chunk_meta.append(meta)

        if chunk_data.get('ufo_anomaly_detected'):
            ufo_anomalies.append(chunk_id)
        if chunk_data.get('cryptid_anomaly_detected'):
            cryptid_anomalies.append(chunk_id)

        content_en = str(chunk_data.get('content_en') or '')
        preview = content_en[:80].replace('\n', ' ')
        index_entries.append({
            "chunk_id": chunk_id,
            "type": chunk_data.get('type', 'body_text'),
            "page": page_num,
            "order_in_page": chunk_data.get('order_in_page', 1),
            "order_global": order_global,
            "file": f"chunks/{chunk_id}.md",
            "bbox": bbox_safe(chunk_data.get('bbox')),
            "preview": preview
        })

    print("Writing _index.json...", flush=True)
    build_at = now_iso()
    type_hist = {}
    for entry in index_entries:
        t = entry['type']
        type_hist[t] = type_hist.get(t, 0) + 1
    _write_index(index_entries, total_pages, total_chunks, build_at)

    print("Assembling document.md...", flush=True)
    doc_content = _assemble_document(
        all_chunks_ordered, chunk_id_list, all_chunk_meta, type_hist,
        ufo_anomalies, cryptid_anomalies, total_pages, build_at,
    )
    with open(str(RAW_DIR / "document.md"), 'w', encoding='utf-8') as f:
        f.write(doc_content)

    doc_bytes = len(doc_content.encode('utf-8'))
    wall_seconds = int(time.time() - start_time)

    failed_pages = sorted(p for p, r in all_page_results.items() if _is_fallback(r))
    if failed_pages:
        print(
            f"WARNING: {len(failed_pages)} page(s) have placeholder content "
            f"(analysis failed): {failed_pages}. Cache kept; rerun to retry them.",
            flush=True,
        )
    elif cache_file.exists():
        os.remove(str(cache_file))

    print(f"\nSTATS pages={total_pages} chunks={total_chunks} images={images_extracted} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_bytes}")
    print(f"Wall time: {wall_seconds}s")


if __name__ == "__main__":
    main()