#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_full.py
Full pipeline to rebuild doc-65-hs1-834228961-62-hq-83894-section-4.
Uses Google Gemini flash for vision analysis of each page.
Generates chunks/, images/, _index.json, document.md
"""

import os
import sys
import json
import base64
import datetime
import time
import re
import concurrent.futures
from collections import Counter
from pathlib import Path

import google.generativeai as genai
from google.generativeai.types import HarmBlockThreshold, HarmCategory
from PIL import Image as PILImage

# ---- Config ----
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-4"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 4 — FBI Flying Discs / UAP Investigation Files"
RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")
CHUNKS_DIR = RAW_DIR / "chunks"
IMAGES_DIR = RAW_DIR / "images"
TABLES_DIR = RAW_DIR / "tables"
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
BATCH_SIZE = 4  # conservative for API limits
MAX_WORKERS = 4
MODEL_NAME = 'gemini-1.5-flash'
# Page count quoted in the prompt when the caller does not supply the real one.
DEFAULT_TOTAL_PAGES = 179
# Whole-page box used whenever a chunk has no usable bbox of its own.
DEFAULT_BBOX = {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90}

# ---- Ensure dirs ----
for d in [CHUNKS_DIR, IMAGES_DIR, TABLES_DIR]:
    d.mkdir(parents=True, exist_ok=True)


# ---- Page map ----
def build_page_map():
    """Pair page PNGs with OCR text files by sorted position.

    Returns:
        dict mapping 1-based page number to
        ``{'png': <path>, 'ocr': <path>, 'png_filename': <name>}``.

    The pairing is purely positional (both listings are sorted by file
    name), so a missing file on either side shifts every later page. A
    warning is printed when the two counts differ; unpaired extras are
    dropped.
    """
    pngs = sorted(f for f in os.listdir(PNG_DIR) if f.endswith('.png'))
    ocrs = sorted(f for f in os.listdir(OCR_DIR) if f.endswith('.txt'))
    if len(pngs) != len(ocrs):
        print(
            f"WARNING: {len(pngs)} PNG pages but {len(ocrs)} OCR files; "
            "pairing by sorted position, unpaired files are ignored",
            file=sys.stderr,
        )
    return {
        i: {
            'png': str(PNG_DIR / png),
            'ocr': str(OCR_DIR / ocr),
            'png_filename': png,
        }
        for i, (png, ocr) in enumerate(zip(pngs, ocrs), 1)
    }


def read_ocr(path):
    """Return the stripped text of the OCR file at *path*, or "" if unreadable.

    OCR text is only a hint for the vision model, so a missing or
    undecodable file is deliberately not fatal.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except (OSError, UnicodeDecodeError):
        return ""


def now_iso():
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    return datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')


# ---- Gemini vision call ----
genai.configure(api_key=GEMINI_API_KEY)

# Formatted with ``% (page_number, total_pages)``; must contain no other '%'.
# NOTE(review): the angle-bracket placeholders in the JSON skeleton were blank
# in the reviewed copy of this file (they look stripped by an HTML-unaware
# extraction). They are refilled here from the Rules section below; compare
# with the original prompt if it is available.
PAGE_ANALYSIS_PROMPT = """You are a document analyst rebuilding a declassified FBI UAP/flying saucer investigation file.
Analyze this page image carefully and return ONLY valid JSON (no markdown code fences, no explanation).

The JSON must have this exact structure:
{
  "page_number": <int>,
  "chunks": [
    {
      "type": "<block type, e.g. body_text, redaction, blank, stamp, signature, image>",
      "order_in_page": <int, starting at 1>,
      "content_en": "<verbatim English text, or [description] for non-text>",
      "content_pt_br": "<Brazilian Portuguese translation>",
      "bbox": {"x": <0.0-1.0>, "y": <0.0-1.0>, "w": <0.0-1.0>, "h": <0.0-1.0>},
      "classification": <classification marking string or null>,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": <0.0-1.0>,
      "ocr_source_lines": [],
      "redaction_code": <redaction code string or null>,
      "redaction_inferred_content_type": null,
      "image_type": <image type string or null>,
      "ufo_anomaly_detected": false,
      "cryptid_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null,
      "image_description_en": null,
      "image_description_pt_br": null,
      "extracted_text": null
    }
  ]
}

Rules:
- Identify ALL distinct content blocks (letterhead, classification markings, memo headers, body paragraphs, stamps, redactions, signatures, photos, etc.)
- For redacted areas: type="redaction", content_en="[REDACTED]", content_pt_br="[REDATADO]", include redaction_code if visible
- For blank pages: ONE chunk with type="blank"
- For stamps: type="stamp", include extracted_text with what the stamp says
- For signatures: type="signature"
- For photos/images: type="image", image_type appropriately, image_description_en with detailed description
- UAP/flying saucer content: set ufo_anomaly_detected=true and fill ufo_anomaly_type and ufo_anomaly_rationale
- bbox values are fractions of page dimensions (0.0 to 1.0)
- content_en must be verbatim OCR text where possible, or [description] for non-text
- content_pt_br must be Brazilian Portuguese translation
- This is page %d of %d total
- Document: FBI investigation files about flying discs/UAP reports, 1947-era
"""


def _fallback_page_result(page_num, content_en, content_pt_br):
    """Build the single-chunk placeholder result used when a page cannot be analyzed."""
    return {
        "page_number": page_num,
        "chunks": [{
            "type": "body_text",
            "order_in_page": 1,
            "content_en": content_en,
            "content_pt_br": content_pt_br,
            "bbox": dict(DEFAULT_BBOX),
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "cryptid_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None
        }]
    }


def analyze_page_with_gemini(page_num, png_path, ocr_text, retry=3,
                             total_pages=DEFAULT_TOTAL_PAGES):
    """Call Gemini flash to analyze a page image.

    Args:
        page_num: 1-based page number, quoted in the prompt.
        png_path: Path of the page image sent to the model.
        ocr_text: Optional OCR text; the first 2000 characters are appended
            to the prompt as a hint.
        retry: Maximum number of model calls.
        total_pages: Page count quoted in the prompt.

    Returns:
        The parsed JSON object returned by the model (a dict holding a
        ``chunks`` list), or a one-chunk placeholder when the image cannot
        be read or every attempt fails. Model and parse errors are printed,
        never raised.
    """
    prompt = PAGE_ANALYSIS_PROMPT % (page_num, total_pages)
    if ocr_text:
        prompt += f"\n\nOCR text available (may be incomplete):\n{ocr_text[:2000]}"

    failure = _fallback_page_result(
        page_num,
        f"[Page {page_num} — vision analysis failed]",
        f"[Página {page_num} — análise visual falhou]",
    )

    # An unreadable image will not become readable on retry: read it once
    # and give up immediately instead of sleeping through the retry loop.
    try:
        with open(png_path, 'rb') as f:
            img_data = f.read()
    except OSError as e:
        print(f"  Page {page_num}: cannot read image: {e}")
        return failure

    model = genai.GenerativeModel(MODEL_NAME)
    safety_settings = {
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    }

    for attempt in range(retry):
        try:
            response = model.generate_content(
                [
                    {"mime_type": "image/png", "data": img_data},
                    prompt
                ],
                generation_config={"temperature": 0.1, "max_output_tokens": 4096},
                safety_settings=safety_settings,
            )
            text = response.text.strip()
            # Remove markdown code fences if present
            if text.startswith('```'):
                text = re.sub(r'^```(?:json)?\s*', '', text)
                text = re.sub(r'\s*```$', '', text)
            data = json.loads(text)
            # Syntactically valid JSON can still have the wrong shape (a bare
            # list, a string, no "chunks"); treat that as a failed attempt.
            if not isinstance(data, dict) or not isinstance(data.get('chunks'), list):
                raise ValueError("response JSON has no 'chunks' list")
            return data
        except json.JSONDecodeError as e:
            print(f"  Page {page_num}: JSON parse error (attempt {attempt+1}): {e}")
            if attempt < retry - 1:
                time.sleep(2)
        except Exception as e:
            # Boundary with a remote service: any failure of one attempt is
            # reported and retried rather than aborting the whole rebuild.
            print(f"  Page {page_num}: Error (attempt {attempt+1}): {e}")
            if attempt < retry - 1:
                time.sleep(3)

    # Fallback: minimal chunk
    return failure


def process_page(args):
    """Analyze one page; worker entry point for the thread pool.

    Args:
        args: ``(page_num, png_path, ocr_path)``, optionally followed by the
            total page count to quote in the prompt.

    Returns:
        ``(page_num, result)`` where *result* is the dict returned by
        ``analyze_page_with_gemini``.
    """
    page_num, png_path, ocr_path, *extra = args
    total_pages = extra[0] if extra else DEFAULT_TOTAL_PAGES
    ocr_text = read_ocr(ocr_path)
    print(f"  Processing page {page_num:03d}...", flush=True)
    result = analyze_page_with_gemini(page_num, png_path, ocr_text, total_pages=total_pages)
    print(f"  Done page {page_num:03d}: {len(result.get('chunks', []))} chunks", flush=True)
    return page_num, result


def crop_image_for_chunk(page_png, bbox, out_path):
    """Crop image region for an image-type chunk.

    Args:
        page_png: Path of the full-page PNG.
        bbox: Dict with fractional ``x``, ``y``, ``w``, ``h``; missing keys
            default to the full page, and a non-dict value is treated as
            empty.
        out_path: Destination path of the cropped image.

    Returns:
        True if a crop was written; False if the region is empty or the
        image could not be read, cropped, or saved.
    """
    if not isinstance(bbox, dict):
        bbox = {}
    try:
        # The context manager closes the file handle PIL keeps open.
        with PILImage.open(page_png) as im:
            W, H = im.size
            x = bbox.get('x', 0)
            y = bbox.get('y', 0)
            w = bbox.get('w', 1)
            h = bbox.get('h', 1)
            pad = 0.005  # small margin so the crop does not clip the region's edge
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                return False
            im.crop((left, top, right, bottom)).save(out_path)
        return True
    except (OSError, ValueError, TypeError) as e:
        print(f"  Crop error: {e}")
        return False


def _normalize_bbox(bbox):
    """Return a bbox dict with numeric x/y/w/h, filling gaps from DEFAULT_BBOX."""
    if not isinstance(bbox, dict):
        return dict(DEFAULT_BBOX)
    clean = {}
    for key, default in DEFAULT_BBOX.items():
        value = bbox.get(key)
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            clean[key] = value
            continue
        try:
            clean[key] = float(value)
        except (TypeError, ValueError):
            clean[key] = default
    return clean


def _normalize_chunks(result):
    """Return a page result's chunks as well-formed dicts, sorted by order_in_page.

    The chunks come straight from model output, so entries that are not
    dicts are dropped and the fields the rest of the pipeline indexes,
    sorts, or formats (type, order_in_page, bbox, content_*) are coerced
    to usable values. All other keys pass through untouched.
    """
    raw = result.get('chunks') if isinstance(result, dict) else None
    chunks = []
    for position, chunk in enumerate(raw if isinstance(raw, list) else [], 1):
        if not isinstance(chunk, dict):
            continue
        chunk = dict(chunk)
        if not isinstance(chunk.get('type'), str) or not chunk['type']:
            chunk['type'] = 'body_text'
        order = chunk.get('order_in_page')
        if isinstance(order, bool) or not isinstance(order, (int, float)):
            chunk['order_in_page'] = position
        chunk['bbox'] = _normalize_bbox(chunk.get('bbox'))
        for key in ('content_en', 'content_pt_br'):
            value = chunk.get(key)
            if not isinstance(value, str):
                chunk[key] = '' if value is None else str(value)
        chunks.append(chunk)
    chunks.sort(key=lambda c: c['order_in_page'])
    return chunks


# Strings matching this pattern (and not reserved words) are emitted unquoted.
_YAML_PLAIN_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_ ./-]*")
_YAML_RESERVED = frozenset({"null", "true", "false", "yes", "no", "on", "off"})


def _yaml_val(v):
    """Render *v* as a YAML flow-style value for chunk front matter."""
    if v is None:
        return "null"
    if isinstance(v, bool):
        return str(v).lower()
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, (list, tuple)):
        return "[" + ", ".join(_yaml_val(item) for item in v) + "]"
    if isinstance(v, dict):
        return "{" + ", ".join(f"{k}: {_yaml_val(vv)}" for k, vv in v.items()) + "}"
    s = str(v)
    if _YAML_PLAIN_RE.fullmatch(s) and s == s.strip() and s.lower() not in _YAML_RESERVED:
        return s
    # A JSON string literal is a valid YAML double-quoted scalar and escapes
    # quotes, backslashes, and newlines, none of which may appear raw here.
    return json.dumps(s, ensure_ascii=False)


def write_chunk_file(chunk_id, chunk_data, page_num, order_global, prev_chunk, next_chunk, source_png_filename):
    """Write a single chunk .md file.

    The file is YAML front matter followed by ``**EN:**`` and ``**PT-BR:**``
    paragraphs, written to ``CHUNKS_DIR/<chunk_id>.md``.

    Args:
        chunk_id: Identifier used for the file name and the ``chunk_id`` field.
        chunk_data: Chunk dict from the page analysis; absent keys fall back
            to defaults.
        page_num: Page the chunk belongs to.
        order_global: 1-based position of the chunk in the whole document.
        prev_chunk: Id of the preceding chunk, or None.
        next_chunk: Id of the following chunk, or None.
        source_png_filename: File name of the page PNG, recorded as a
            relative ``source_png`` path.

    Returns:
        The metadata dict that was serialized into the front matter.
    """
    bbox = _normalize_bbox(chunk_data.get('bbox'))

    # Determine related_image
    related_image = None
    if chunk_data.get('type') == 'image':
        related_image = f"IMG-{chunk_id}.png"

    meta = {
        "chunk_id": chunk_id,
        "type": chunk_data.get('type', 'body_text'),
        "page": page_num,
        "order_in_page": chunk_data.get('order_in_page', 1),
        "order_global": order_global,
        "bbox": bbox,
        "classification": chunk_data.get('classification'),
        "formatting": chunk_data.get('formatting', []),
        "cross_page_hint": chunk_data.get('cross_page_hint', 'self_contained'),
        "prev_chunk": prev_chunk,
        "next_chunk": next_chunk,
        "related_image": related_image,
        "related_table": None,
        "ocr_confidence": chunk_data.get('ocr_confidence', 0.8),
        "ocr_source_lines": chunk_data.get('ocr_source_lines', []),
        "redaction_code": chunk_data.get('redaction_code'),
        "redaction_inferred_content_type": chunk_data.get('redaction_inferred_content_type'),
        "image_type": chunk_data.get('image_type'),
        "ufo_anomaly_detected": chunk_data.get('ufo_anomaly_detected', False),
        "cryptid_anomaly_detected": chunk_data.get('cryptid_anomaly_detected', False),
        "ufo_anomaly_type": chunk_data.get('ufo_anomaly_type'),
        "ufo_anomaly_rationale": chunk_data.get('ufo_anomaly_rationale'),
        "cryptid_anomaly_type": chunk_data.get('cryptid_anomaly_type'),
        "cryptid_anomaly_rationale": chunk_data.get('cryptid_anomaly_rationale'),
        "image_description_en": chunk_data.get('image_description_en'),
        "image_description_pt_br": chunk_data.get('image_description_pt_br'),
        "extracted_text": chunk_data.get('extracted_text'),
        "source_png": f"../../processing/png/{DOC_ID}/{source_png_filename}",
    }

    # A key present with a null value must render as empty text, not "None".
    content_en = chunk_data.get('content_en') or ''
    content_pt_br = chunk_data.get('content_pt_br') or ''

    # Build YAML frontmatter
    lines = ["---"]
    lines.extend(f"{k}: {_yaml_val(v)}" for k, v in meta.items())
    lines.append("---")
    lines.append("")
    lines.append(f"**EN:** {content_en}")
    lines.append("")
    lines.append(f"**PT-BR:** {content_pt_br}")
    lines.append("")

    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))

    return meta


def main():
    """Run the rebuild: analyze pages, write chunks, _index.json, and document.md."""
    start_time = time.time()

    # Without a key every call fails, and the run would spend minutes in
    # retry sleeps only to overwrite existing output with placeholders.
    if not GEMINI_API_KEY:
        sys.exit("GEMINI_API_KEY is not set; aborting before any output is overwritten.")

    page_map = build_page_map()
    total_pages = len(page_map)
    print(f"Starting rebuild: {total_pages} pages")

    # Process all pages in batches of BATCH_SIZE
    all_page_results = {}  # page_num -> result dict
    page_nums = list(page_map)

    for batch_start in range(0, total_pages, BATCH_SIZE):
        batch = page_nums[batch_start:batch_start + BATCH_SIZE]
        batch_args = [
            (p, page_map[p]['png'], page_map[p]['ocr'], total_pages) for p in batch
        ]

        print(f"\nBatch {batch_start//BATCH_SIZE + 1}: pages {batch[0]}-{batch[-1]}", flush=True)

        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {executor.submit(process_page, args): args[0] for args in batch_args}
            for future in concurrent.futures.as_completed(futures):
                page_num = futures[future]
                try:
                    # as_completed only yields finished futures, so result()
                    # never blocks and needs no timeout.
                    pn, result = future.result()
                    all_page_results[pn] = result
                except Exception as e:
                    print(f"  Page {page_num} failed: {e}")
                    all_page_results[page_num] = _fallback_page_result(
                        page_num,
                        f"[Page {page_num} — processing error]",
                        f"[Página {page_num} — erro de processamento]",
                    )

        # Small pause between batches to be respectful of rate limits
        if batch_start + BATCH_SIZE < total_pages:
            time.sleep(1)

    print(f"\nAll pages analyzed. Assigning global chunk IDs...")

    # --- Global chunk numbering ---
    all_chunks_ordered = []  # list of (page_num, chunk_data, source_png_filename)
    for page_num in sorted(all_page_results):
        source_png = page_map[page_num]['png_filename']
        for chunk in _normalize_chunks(all_page_results[page_num]):
            all_chunks_ordered.append((page_num, chunk, source_png))

    total_chunks = len(all_chunks_ordered)
    print(f"Total chunks: {total_chunks}")

    # Assign chunk_ids and write chunk files
    chunk_id_list = [f"c{(i+1):04d}" for i in range(total_chunks)]

    index_entries = []
    all_chunk_meta = []
    images_extracted = 0
    ufo_anomalies = []
    cryptid_anomalies = []

    print("Writing chunk files...")
    for i, (page_num, chunk_data, source_png) in enumerate(all_chunks_ordered):
        chunk_id = chunk_id_list[i]
        order_global = i + 1
        prev_chunk = chunk_id_list[i-1] if i > 0 else None
        next_chunk = chunk_id_list[i+1] if i < total_chunks - 1 else None
        bbox = chunk_data['bbox']

        # Crop image if needed
        if chunk_data.get('type') == 'image':
            img_out = IMAGES_DIR / f"IMG-{chunk_id}.png"
            if crop_image_for_chunk(page_map[page_num]['png'], bbox, img_out):
                images_extracted += 1

        # Write chunk file
        meta = write_chunk_file(
            chunk_id, chunk_data, page_num, order_global,
            prev_chunk, next_chunk, source_png
        )
        all_chunk_meta.append(meta)

        # Track anomalies
        if chunk_data.get('ufo_anomaly_detected'):
            ufo_anomalies.append(chunk_id)
        if chunk_data.get('cryptid_anomaly_detected'):
            cryptid_anomalies.append(chunk_id)

        # Index entry
        preview = chunk_data['content_en'][:80].replace('\n', ' ')
        index_entries.append({
            "chunk_id": chunk_id,
            "type": chunk_data.get('type', 'body_text'),
            "page": page_num,
            "order_in_page": chunk_data.get('order_in_page', 1),
            "order_global": order_global,
            "file": f"chunks/{chunk_id}.md",
            "bbox": bbox,
            "preview": preview
        })

    # --- Write _index.json ---
    print("Writing _index.json...")
    build_at = now_iso()

    # Compute chunk type histogram
    type_hist = Counter(entry['type'] for entry in index_entries)

    # NOTE(review): build_approach/build_model do not describe what this
    # script does (it calls MODEL_NAME through the Gemini API). The values are
    # kept as found because downstream consumers may match on them; confirm
    # and correct.
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_entries
    }

    with open(RAW_DIR / "_index.json", 'w', encoding='utf-8') as f:
        json.dump(index_data, f, indent=2, ensure_ascii=False)

    # --- Assemble document.md ---
    print("Assembling document.md...")

    doc_lines = []
    doc_lines.append("---")
    doc_lines.append('schema_version: "0.2.0"')
    doc_lines.append("type: master_document")
    doc_lines.append(f"doc_id: {DOC_ID}")
    doc_lines.append(f'canonical_title: "{DOC_TITLE}"')
    doc_lines.append(f"total_pages: {total_pages}")
    doc_lines.append(f"total_chunks: {total_chunks}")
    hist_str = ", ".join(f'"{k}": {v}' for k, v in sorted(type_hist.items()))
    doc_lines.append(f"chunk_types_histogram: {{{hist_str}}}")
    doc_lines.append("multi_page_tables: []")
    ufo_str = "[" + ", ".join(ufo_anomalies) + "]"
    cryptid_str = "[" + ", ".join(cryptid_anomalies) + "]"
    doc_lines.append(f"ufo_anomalies_flagged: {ufo_str}")
    doc_lines.append(f"cryptid_anomalies_flagged: {cryptid_str}")
    doc_lines.append('build_approach: "subagents"')
    doc_lines.append("build_model: claude-sonnet-4-6")
    doc_lines.append(f"build_at: {build_at}")
    doc_lines.append("---")
    doc_lines.append("")

    # Chunks are already in page order, so a page heading is emitted whenever
    # the page number changes; each chunk's metadata travels with it instead
    # of being looked up again from the numeric part of its id.
    current_page = None
    for chunk_id, meta_dict, (page_num, chunk_data, source_png) in zip(
            chunk_id_list, all_chunk_meta, all_chunks_ordered):
        if page_num != current_page:
            current_page = page_num
            doc_lines.append(f"## Page {page_num}")
            doc_lines.append("")

        ctype = chunk_data.get('type', 'body_text')
        bbox = chunk_data['bbox']
        bbox_str = f"{bbox.get('x',0):.2f}/{bbox.get('y',0):.2f}/{bbox.get('w',0):.2f}/{bbox.get('h',0):.2f}"

        # NOTE(review): in the reviewed copy these two appends were f-strings
        # with empty bodies, which suggests markup (e.g. an HTML comment or
        # anchor per chunk) was stripped in extraction. They are kept as the
        # blank lines that copy produces; restore from the original if known.
        doc_lines.append("")
        doc_lines.append("")
        doc_lines.append(f"### Chunk {chunk_id} — {ctype} · p{page_num} · bbox: {bbox_str}")
        doc_lines.append("")
        doc_lines.append(f"**EN:** {chunk_data.get('content_en', '')}")
        doc_lines.append("")
        doc_lines.append(f"**PT-BR:** {chunk_data.get('content_pt_br', '')}")
        doc_lines.append("")

        if ctype == 'image':
            img_path = f"./images/IMG-{chunk_id}.png"
            doc_lines.append(f"![{chunk_id} image]({img_path})")
            doc_lines.append("")
            desc = chunk_data.get('image_description_en', '')
            if desc:
                doc_lines.append(f"*{desc}*")
                doc_lines.append("")

        # Metadata details
        # NOTE(review): the <details>/<summary> tags are reconstructed; the
        # reviewed copy had only the word "metadata" inside a string literal
        # broken across lines, consistent with stripped HTML tags. Confirm.
        doc_lines.append("<details><summary>metadata</summary>")
        doc_lines.append("")
        doc_lines.append("```json")
        doc_lines.append(json.dumps(meta_dict, indent=2, ensure_ascii=False))
        doc_lines.append("```")
        doc_lines.append("")
        doc_lines.append("</details>")
        doc_lines.append("")
        doc_lines.append("---")
        doc_lines.append("")

    doc_content = '\n'.join(doc_lines)
    with open(RAW_DIR / "document.md", 'w', encoding='utf-8') as f:
        f.write(doc_content)

    doc_bytes = len(doc_content.encode('utf-8'))
    wall_seconds = int(time.time() - start_time)

    print(f"\nSTATS pages={total_pages} chunks={total_chunks} images={images_extracted} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_bytes}")
    print(f"Wall time: {wall_seconds}s")


if __name__ == "__main__":
    main()