#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_section8.py
Direct Gemini-powered rebuild of doc-65-hs1-834228961-62-hq-83894-section-8.
Produces: chunks/, images/, tables/, _index.json, document.md
"""

import os
import sys
import json
import re
import time
import base64
import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FuturesTimeout

from PIL import Image
import google.genai as genai
from google.genai import types

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-8"
DOC_TITLE = "65 HS1-834228961/62-HQ-83894 Section 8"
HIGHEST_CLASS = "TOP SECRET"

RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")
CHUNKS_DIR = RAW_DIR / "chunks"
IMAGES_DIR = RAW_DIR / "images"
TABLES_DIR = RAW_DIR / "tables"
PAGES_RAW = RAW_DIR / "pages_raw.json"

MODEL = "models/gemini-3.1-flash-lite"
MAX_WORKERS = 4
PAGE_TIMEOUT = 150  # seconds per page
OCR_PROMPT_CHAR_LIMIT = 5000  # OCR text beyond this many characters is not sent to the model

VALID_TYPES = {
    "letterhead", "address_block", "classification_marking", "heading",
    "paragraph", "form_field", "bulleted_item", "numbered_item",
    "quote_block", "caption", "table_marker", "image", "stamp",
    "signature", "marginalia", "redaction", "footer", "blank_area",
    "unknown",
}

# ---------------------------------------------------------------------------
# Gemini client
# ---------------------------------------------------------------------------
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY"))

# ---------------------------------------------------------------------------
# Page-rebuilder prompt
# ---------------------------------------------------------------------------
# NOTE(review): rule 6 reads '[REDACTED — ]'; a placeholder between the dash and
# the bracket may have been lost before this file reached review. The wording is
# left untouched here — confirm against the upstream copy.
PAGE_PROMPT = """\
You are a forensic document reconstruction agent for The Disclosure Bureau.
Given a single page image (PNG) and its raw OCR text from a US Department of War declassified UAP/UFO document, decompose it into LOSSLESS agentic chunks.

## Chunk types — STRICT enum (use EXACTLY one of these 19 strings):
letterhead, address_block, classification_marking, heading, paragraph, form_field, bulleted_item, numbered_item, quote_block, caption, table_marker, image, stamp, signature, marginalia, redaction, footer, blank_area, unknown

## Output: ONE JSON object — NO markdown fences, NO prose before/after.
{{
  "page_number": {page_number},
  "page_summary_en": "1-2 sentences describing this page",
  "page_summary_pt_br": "1-2 frases em português brasileiro",
  "page_layout": {{
    "columns": 1,
    "orientation": "portrait",
    "page_dimensions_approx": "letter"
  }},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "paragraph",
      "bbox": {{"x": 0.05, "y": 0.10, "w": 0.90, "h": 0.08}},
      "content_en": "verbatim English text of this chunk",
      "content_pt_br": "Texto em português brasileiro",
      "metadata": {{
        "ocr_confidence": 0.95,
        "ocr_source_lines": [1, 2, 3],
        "classification": null,
        "redaction_code": null,
        "redaction_inferred_content_type": null,
        "image_type": null,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "prev_chunk_hint": null,
        "next_chunk_hint": null,
        "language_in_source": "en"
      }}
    }}
  ]
}}

## Rules:
1. Order by reading order (top→bottom, left→right). order_in_page is 1-indexed.
2. One semantic unit per chunk (one paragraph, one address block, one image, etc.).
3. ALL content accounted for — never skip anything, even blank areas if significant.
4. content_en: verbatim/near-verbatim. No paraphrasing.
5. content_pt_br: Brazilian Portuguese (pt-BR). Preserve UTF-8 accents: ç ã á é í ó ú â ê ô à. Proper nouns and verbatim quoted passages stay in source language inside pt-br.
6. Redacted blocks: content_en = "[REDACTED — ]". Never fabricate hidden content.
7. bbox: normalized 0..1 relative to page PNG size. Tight around the chunk.
8. cross_page_hint: self_contained | continues_from_prev | continues_to_next
9. image chunks: content_en = brief 1-sentence placeholder description (will be analyzed separately).
10. classification field: exact string as it appears (e.g. "TOP SECRET", "SECRET//NOFORN") or null.

Document context:
doc_id: {doc_id}
page_number: {page_number} of {total_pages}
doc_title: {doc_title}

OCR text (layout-preserved, may have errors — trust the image when they disagree):
---
{ocr_text}
---

Now analyze the image + OCR and output the JSON:"""


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _as_dict(value):
    """Return *value* if it is a dict, otherwise an empty dict.

    Model output is untrusted: fields such as ``bbox`` or ``metadata`` can come
    back as ``null`` or a list, and callers only ever use ``.get`` on them.
    """
    return value if isinstance(value, dict) else {}


def _num(mapping, key, default=0.0):
    """Read ``mapping[key]`` as a float, falling back to *default* when the
    value is missing or not convertible (``None``, arbitrary strings, ...)."""
    try:
        return float(mapping.get(key, default))
    except (TypeError, ValueError):
        return float(default)


def _order_key(chunk):
    """Sort key for chunks within a page: numeric ``order_in_page``, or 0 when
    it is missing or not a number, so one malformed chunk cannot abort the sort."""
    return _num(chunk, "order_in_page", 0)


def _html_comment(label, text):
    """Render ``text`` as a single-line HTML comment ``<!-- label: text -->``.

    Whitespace is collapsed and ``--`` is replaced so the text cannot terminate
    the comment early.
    """
    flat = " ".join(str(text).split()).replace("--", "—")
    return f"<!-- {label}: {flat} -->\n"


def get_page_files():
    """Collect the page PNGs of the document with their OCR text files.

    Returns:
        A list of ``(page_number, png_path, ocr_path_or_None)`` tuples sorted
        by page number. PNGs are matched by the pattern ``p-<digits>.png``;
        the OCR file is looked up in ``OCR_DIR`` first under the zero-padded
        3-digit name and then under the bare number.
    """
    pages = []
    for png in PNG_DIR.glob("p-*.png"):
        m = re.match(r"p-0*(\d+)\.png", png.name)
        if not m:
            continue
        pn = int(m.group(1))
        # OCR: try zero-padded 3-digit, then bare number
        ocr = None
        for name in (f"p-{pn:03d}.txt", f"p-{pn}.txt"):
            candidate = OCR_DIR / name
            if candidate.exists():
                ocr = candidate
                break
        pages.append((pn, png, ocr))
    # Sort numerically: a plain filename sort would place p-10.png before
    # p-2.png when the names are not zero-padded.
    pages.sort(key=lambda page: (page[0], page[1].name))
    return pages


def encode_png(path):
    """Return the base64 text encoding of the file at *path*."""
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode()


def call_gemini(png_path, ocr_text, page_num, total_pages):
    """Send one page image plus its OCR text to the model and return raw text.

    Args:
        png_path: Path of the page PNG.
        ocr_text: OCR text for the page; only the first
            ``OCR_PROMPT_CHAR_LIMIT`` characters are put into the prompt.
        page_num: Page number inserted into the prompt.
        total_pages: Total page count inserted into the prompt.

    Returns:
        The response text, or ``None`` when the response carries no text and
        none can be recovered from the first candidate's parts.

    Raises:
        concurrent.futures.TimeoutError: If no result arrives within
            ``PAGE_TIMEOUT`` seconds.
        Exception: Whatever the client call raises is propagated.
    """
    prompt = PAGE_PROMPT.format(
        doc_id=DOC_ID,
        page_number=page_num,
        total_pages=total_pages,
        doc_title=DOC_TITLE,
        ocr_text=ocr_text[:OCR_PROMPT_CHAR_LIMIT],
    )
    with open(png_path, "rb") as f:
        img_bytes = f.read()

    contents = [
        types.Part(
            inline_data=types.Blob(mime_type="image/png", data=img_bytes)
        ),
        types.Part(text=prompt),
    ]
    config = types.GenerateContentConfig(
        temperature=0.1,
        max_output_tokens=8192,
    )

    def _call():
        resp = client.models.generate_content(
            model=MODEL, contents=contents, config=config
        )
        if resp.text is None:
            # Safety block or empty response — extract any available text from parts
            try:
                parts = resp.candidates[0].content.parts
                return "\n".join(
                    p.text for p in parts if getattr(p, "text", None)
                )
            except (AttributeError, IndexError, TypeError):
                return None
        return resp.text

    # Do not use the executor as a context manager: leaving a ``with`` block
    # calls shutdown(wait=True), which would block until the request finishes
    # and turn the timeout below into a no-op.
    executor = ThreadPoolExecutor(max_workers=1)
    future = executor.submit(_call)
    try:
        return future.result(timeout=PAGE_TIMEOUT)
    finally:
        # A request that is already running cannot be interrupted; it is left
        # to finish in the background while the caller moves on.
        executor.shutdown(wait=False, cancel_futures=True)


def parse_page_json(raw_text, page_num):
    """Parse the model's reply for one page into a page dict.

    Markdown code fences are stripped first; if the remainder is not valid
    JSON, the widest ``{...}`` span is tried instead.

    Args:
        raw_text: Raw text returned by the model.
        page_num: Page number; always written into the result.

    Returns:
        The parsed page dict with ``page_number`` forced to *page_num*,
        ``chunks`` reduced to its dict entries, and every chunk ``type`` not in
        ``VALID_TYPES`` replaced by ``"unknown"``. On failure, a dict with an
        ``error`` key (``json_parse_failed``, ``no_json_found`` or
        ``json_not_object``), an empty ``chunks`` list and a 300-character
        ``raw`` excerpt.
    """
    text = raw_text.strip()
    text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE)
    text = re.sub(r"\s*```\s*$", "", text, flags=re.MULTILINE)
    text = text.strip()

    def _failure(code):
        return {"page_number": page_num, "error": code, "chunks": [], "raw": text[:300]}

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Try to extract the largest {...} block
        m = re.search(r"\{[\s\S]*\}", text)
        if not m:
            return _failure("no_json_found")
        try:
            data = json.loads(m.group(0))
        except json.JSONDecodeError:
            return _failure("json_parse_failed")

    # Valid JSON that is not an object (a bare list, string, number, ...)
    # cannot be a page record.
    if not isinstance(data, dict):
        return _failure("json_not_object")

    data["page_number"] = page_num

    # Validate and normalize chunks: keep only dict entries, coerce bad types.
    raw_chunks = data.get("chunks")
    chunks = [c for c in raw_chunks if isinstance(c, dict)] if isinstance(raw_chunks, list) else []
    for c in chunks:
        ctype = c.get("type")
        if not (isinstance(ctype, str) and ctype in VALID_TYPES):
            c["type"] = "unknown"
    data["chunks"] = chunks
    return data


def fallback_chunk(page_num, ocr_text):
    """Minimal unknown chunk when Gemini fails persistently.

    Returns a page dict holding a single full-page ``unknown`` chunk whose
    content embeds the first 200 characters of *ocr_text* (or a placeholder
    when the OCR text is empty).
    """
    preview = ocr_text[:200].strip() if ocr_text and ocr_text.strip() else "(page content unavailable)"
    return {
        "page_number": page_num,
        "page_summary_en": f"Page {page_num} — content could not be parsed by vision model.",
        "page_summary_pt_br": f"Página {page_num} — conteúdo não pôde ser analisado pelo modelo de visão.",
        "page_layout": {"columns": 1, "orientation": "portrait", "page_dimensions_approx": "letter"},
        "chunks": [{
            "order_in_page": 1,
            "type": "unknown",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "content_en": f"[Vision analysis failed — OCR excerpt: {preview}]",
            "content_pt_br": f"[Análise de visão falhou — trecho OCR: {preview}]",
            "metadata": {
                "ocr_confidence": 0.0,
                "ocr_source_lines": [],
                "classification": None,
                "redaction_code": None,
                "redaction_inferred_content_type": None,
                "image_type": None,
                "formatting": [],
                "cross_page_hint": "self_contained",
                "prev_chunk_hint": None,
                "next_chunk_hint": None,
                "language_in_source": "en",
            },
        }],
    }


def process_page(page_num, png_path, ocr_path, total_pages, use_fallback=False):
    """Produce the page dict for one page; never raises.

    Args:
        page_num: Page number.
        png_path: Path of the page PNG.
        ocr_path: Path of the OCR text file, or ``None`` when there is none.
        total_pages: Total page count (prompt context only).
        use_fallback: When true, skip the model and return ``fallback_chunk``.

    Returns:
        A parsed page dict, or a dict with an ``error`` key and empty
        ``chunks`` when the model call timed out, returned nothing, or raised.
    """
    ocr_text = (
        ocr_path.read_text(encoding="utf-8", errors="replace")
        if ocr_path
        else "(OCR not available)"
    )

    if use_fallback:
        return fallback_chunk(page_num, ocr_text)

    try:
        raw = call_gemini(png_path, ocr_text, page_num, total_pages)
        if raw is None:
            return {"page_number": page_num, "error": "gemini_none_response", "chunks": []}
        return parse_page_json(raw, page_num)
    except FuturesTimeout:
        return {"page_number": page_num, "error": "timeout", "chunks": []}
    except Exception as exc:
        # Worker boundary: any failure becomes an error record so one bad page
        # cannot abort the batch.
        return {"page_number": page_num, "error": str(exc)[:200], "chunks": []}


def is_valid_page(p):
    """A page is valid when it has at least one chunk and no ``error`` value."""
    return bool(p.get("chunks")) and not p.get("error")


# ---------------------------------------------------------------------------
# Phase 1: process all pages
# ---------------------------------------------------------------------------
def _load_checkpoint():
    """Read ``PAGES_RAW`` and split its entries into valid and failed pages.

    Returns:
        ``(existing_map, failed_pages)``: valid page dicts keyed by page
        number, and the set of page numbers whose entry carries an error.
        Both are empty when the checkpoint is absent or unreadable; malformed
        entries are skipped.
    """
    existing_map = {}
    failed_pages = set()
    if not PAGES_RAW.exists():
        return existing_map, failed_pages
    try:
        existing = json.loads(PAGES_RAW.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        print(f"  WARN: ignoring unreadable checkpoint {PAGES_RAW.name}: {exc}")
        return existing_map, failed_pages
    if not isinstance(existing, list):
        print(f"  WARN: ignoring checkpoint {PAGES_RAW.name}: expected a JSON list")
        return existing_map, failed_pages

    for p in existing:
        if not isinstance(p, dict) or not isinstance(p.get("page_number"), int):
            continue
        if is_valid_page(p):
            existing_map[p["page_number"]] = p
        elif p.get("error"):
            failed_pages.add(p["page_number"])
    print(f"  Checkpoint: {len(existing_map)} valid pages loaded, {len(failed_pages)} previously failed")
    return existing_map, failed_pages


def _save_checkpoint(pages, results_map):
    """Write all known results, in page order, to ``PAGES_RAW``.

    The data goes to a sibling temp file that is then renamed over the target,
    so an interrupted run does not leave a truncated checkpoint. Returns the
    number of pages written.
    """
    ordered = [results_map[p[0]] for p in pages if p[0] in results_map]
    tmp_path = PAGES_RAW.with_name(PAGES_RAW.name + ".tmp")
    tmp_path.write_text(
        json.dumps(ordered, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    tmp_path.replace(PAGES_RAW)
    return len(ordered)


def phase_process_pages(pages):
    """Run the model over every page that has no valid checkpoint entry.

    Pages recorded with an error in the checkpoint are not sent to the model
    again; they receive the OCR-excerpt fallback chunk instead. Progress is
    checkpointed every 10 completed pages and once more at the end.

    Args:
        pages: ``(page_number, png_path, ocr_path)`` tuples.

    Returns:
        Dict mapping page number to page dict (valid, fallback, or error).
    """
    total = len(pages)
    print(f"[Phase 1] Processing {total} pages with {MODEL} ...")

    # Load existing checkpoint
    existing_map, failed_pages = _load_checkpoint()

    to_process = [(pn, pp, op) for pn, pp, op in pages if pn not in existing_map]
    print(f"  Remaining: {len(to_process)} pages")

    results_map = dict(existing_map)

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {
            executor.submit(process_page, pn, pp, op, total, pn in failed_pages): pn
            for pn, pp, op in to_process
        }
        done = 0
        for future in as_completed(futures):
            pn = futures[future]
            done += 1
            try:
                # as_completed only yields finished futures, so this never waits.
                result = future.result()
            except Exception as exc:
                result = {"page_number": pn, "error": str(exc)[:200], "chunks": []}
            results_map[pn] = result
            nchunks = len(result.get("chunks", []))
            if is_valid_page(result):
                status = "OK"
            else:
                status = f"ERR({str(result.get('error', '?'))[:40]})"
            print(f"  [{done}/{len(to_process)}] p-{pn:03d}: {status} chunks={nchunks}")

            # Checkpoint every 10 pages
            if done % 10 == 0:
                _save_checkpoint(pages, results_map)

    # Final save
    saved = _save_checkpoint(pages, results_map)
    print(f"  Saved {saved} pages to pages_raw.json")
    return results_map


# ---------------------------------------------------------------------------
# Phase 2: globally number chunks
# ---------------------------------------------------------------------------
def phase_number_chunks(pages, results_map):
    """Assign document-wide ids and neighbour links to every chunk.

    Chunks are taken page by page (in the order of *pages*), sorted within a
    page by ``order_in_page``, and mutated in place with ``chunk_id``
    (``c0001``, ...), ``order_global``, ``page``, ``prev_chunk`` and
    ``next_chunk``.

    Returns:
        List of ``(page_number, chunk_dict)`` in global order.
    """
    print("[Phase 2] Globally numbering chunks ...")
    all_chunks = []  # list of (page_num, chunk_dict)

    for pn, _, _ in pages:
        pg = _as_dict(results_map.get(pn))
        # Checkpointed pages were not necessarily normalized by the current
        # parser, so non-dict entries are dropped here as well.
        page_chunks = [c for c in (pg.get("chunks") or []) if isinstance(c, dict)]
        for c in sorted(page_chunks, key=_order_key):
            all_chunks.append((pn, c))

    total_chunks = len(all_chunks)
    for i, (pn, c) in enumerate(all_chunks, 1):
        c["chunk_id"] = f"c{i:04d}"
        c["order_global"] = i
        c["page"] = pn
        c["prev_chunk"] = f"c{i-1:04d}" if i > 1 else None
        c["next_chunk"] = f"c{i+1:04d}" if i < total_chunks else None

    print(f"  Total chunks: {total_chunks}")
    return all_chunks


# ---------------------------------------------------------------------------
# Phase 3: crop image chunks
# ---------------------------------------------------------------------------
def phase_crop_images(all_chunks, pages):
    """Crop each ``image`` chunk out of its page PNG into ``IMAGES_DIR``.

    The normalized bbox is padded by 0.5% of the page size on every side and
    clamped to the page. On success, and also when the crop file already
    exists from an earlier run, the chunk gets ``related_image`` set to the
    crop's file name. Failures are reported as warnings and skipped.
    """
    png_map = {pn: pp for pn, pp, _ in pages}
    image_chunks = [(pn, c) for pn, c in all_chunks if c.get("type") == "image"]
    print(f"[Phase 3] Cropping {len(image_chunks)} image chunks ...")

    pad = 0.005
    for pn, c in image_chunks:
        cid = c["chunk_id"]
        image_name = f"IMG-{cid}.png"
        out_path = IMAGES_DIR / image_name
        if out_path.exists():
            # Keep the link on reruns; without it every image reference
            # disappears from the chunk files and document.md.
            # NOTE(review): the file is reused purely by chunk id; if chunk
            # numbering changed since it was written, it may be stale.
            c["related_image"] = image_name
            continue
        png_path = png_map.get(pn)
        if not png_path:
            continue
        bbox = _as_dict(c.get("bbox"))
        if not bbox:
            continue
        try:
            with Image.open(png_path) as im:
                W, H = im.size
                x = _num(bbox, "x", 0)
                y = _num(bbox, "y", 0)
                w = _num(bbox, "w", 1)
                h = _num(bbox, "h", 1)
                left = max(0, int((x - pad) * W))
                top = max(0, int((y - pad) * H))
                right = min(W, int((x + w + pad) * W))
                bottom = min(H, int((y + h + pad) * H))
                if right > left and bottom > top:
                    im.crop((left, top, right, bottom)).save(out_path)
                    c["related_image"] = image_name
        except Exception as exc:
            # Best effort: a bad page image or bbox must not stop the build.
            print(f"  WARN crop {cid}: {exc}")


# ---------------------------------------------------------------------------
# Phase 4: write chunk files
# ---------------------------------------------------------------------------
def phase_write_chunks(all_chunks, pages):
    """Write one ``<chunk_id>.md`` per chunk into ``CHUNKS_DIR``.

    Each file is a ``---``-delimited front-matter block of chunk fields
    followed by the English and Brazilian-Portuguese content. Missing or
    malformed bbox values are written as 0.
    """
    png_map = {pn: pp for pn, pp, _ in pages}
    print(f"[Phase 4] Writing {len(all_chunks)} chunk files ...")

    for pn, c in all_chunks:
        cid = c["chunk_id"]
        chunk_path = CHUNKS_DIR / f"{cid}.md"
        meta = _as_dict(c.get("metadata"))
        bbox = _as_dict(c.get("bbox"))
        png_path = png_map.get(pn, "")
        rel_png = f"../../processing/png/{DOC_ID}/{Path(str(png_path)).name}" if png_path else "null"
        bbox_line = (
            f"bbox: {{x: {_num(bbox, 'x'):.4f}, y: {_num(bbox, 'y'):.4f}, "
            f"w: {_num(bbox, 'w'):.4f}, h: {_num(bbox, 'h'):.4f}}}"
        )

        yaml_lines = [
            "---",
            f"chunk_id: {cid}",
            f"type: {c.get('type', 'unknown')}",
            f"page: {pn}",
            f"order_in_page: {c.get('order_in_page', 0)}",
            f"order_global: {c.get('order_global', 0)}",
            bbox_line,
            f"classification: {json.dumps(meta.get('classification'))}",
            f"formatting: {json.dumps(meta.get('formatting', []))}",
            f"cross_page_hint: {meta.get('cross_page_hint', 'self_contained')}",
            f"prev_chunk: {json.dumps(c.get('prev_chunk'))}",
            f"next_chunk: {json.dumps(c.get('next_chunk'))}",
            f"related_image: {json.dumps(c.get('related_image'))}",
            f"related_table: {json.dumps(c.get('related_table'))}",
            f"ocr_confidence: {meta.get('ocr_confidence', 0.0)}",
            f"ocr_source_lines: {json.dumps(meta.get('ocr_source_lines', []))}",
            f"redaction_code: {json.dumps(meta.get('redaction_code'))}",
            f"redaction_inferred_content_type: {json.dumps(meta.get('redaction_inferred_content_type'))}",
            f"image_type: {json.dumps(meta.get('image_type'))}",
            f"ufo_anomaly_detected: {str(c.get('ufo_anomaly_detected', False)).lower()}",
            f"cryptid_anomaly_detected: {str(c.get('cryptid_anomaly_detected', False)).lower()}",
            f"ufo_anomaly_type: {json.dumps(c.get('ufo_anomaly_type'))}",
            f"ufo_anomaly_rationale: {json.dumps(c.get('ufo_anomaly_rationale'))}",
            f"cryptid_anomaly_type: {json.dumps(c.get('cryptid_anomaly_type'))}",
            f"cryptid_anomaly_rationale: {json.dumps(c.get('cryptid_anomaly_rationale'))}",
            f"image_description_en: {json.dumps(c.get('image_description_en'))}",
            f"image_description_pt_br: {json.dumps(c.get('image_description_pt_br'))}",
            f"extracted_text: {json.dumps(c.get('extracted_text'))}",
            f"source_png: {rel_png}",
            "---",
        ]

        # ``or ''`` keeps a JSON null from being rendered as the text "None".
        body = "\n".join(yaml_lines) + "\n\n"
        body += f"**EN:** {c.get('content_en') or ''}\n\n"
        body += f"**PT-BR:** {c.get('content_pt_br') or ''}\n"

        chunk_path.write_text(body, encoding="utf-8")


# ---------------------------------------------------------------------------
# Phase 5: write _index.json
# ---------------------------------------------------------------------------
def phase_write_index(all_chunks, pages):
    """Write ``_index.json`` (one entry per chunk) into ``RAW_DIR``.

    Returns:
        The build timestamp written to the index, formatted as
        ``YYYY-MM-DDTHH:MM:SSZ`` in UTC.
    """
    total_pages = len(pages)
    total_chunks = len(all_chunks)
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    build_at = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": MODEL,
        "build_at": build_at,
        "chunks": [],
    }

    for pn, c in all_chunks:
        cid = c["chunk_id"]
        preview = str(c.get("content_en") or "")[:80]
        index["chunks"].append({
            "chunk_id": cid,
            "type": c.get("type", "unknown"),
            "page": pn,
            "order_in_page": c.get("order_in_page", 0),
            "order_global": c.get("order_global", 0),
            "file": f"chunks/{cid}.md",
            "bbox": c.get("bbox", {}),
            "preview": preview,
        })

    index_path = RAW_DIR / "_index.json"
    index_path.write_text(json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[Phase 5] Written _index.json ({total_chunks} entries)")
    return build_at


# ---------------------------------------------------------------------------
# Phase 6: assemble document.md
# ---------------------------------------------------------------------------
def phase_assemble_document(all_chunks, pages, results_map, build_at):
    """Assemble ``document.md``: front matter, then every page and its chunks.

    Args:
        all_chunks: ``(page_number, chunk_dict)`` pairs in global order.
        pages: ``(page_number, png_path, ocr_path)`` tuples.
        results_map: Page dicts keyed by page number (source of the summaries).
        build_at: Timestamp string written into the front matter.

    Returns:
        ``(doc_bytes, ufo_flagged, cryptid_flagged)``: UTF-8 size of the
        written document and the chunk ids flagged by each anomaly field.
    """
    total_pages = len(pages)
    total_chunks = len(all_chunks)

    # Histograms + anomaly lists
    type_hist = {}
    ufo_flagged = []
    cryptid_flagged = []
    for pn, c in all_chunks:
        ctype = c.get("type", "unknown")
        type_hist[ctype] = type_hist.get(ctype, 0) + 1
        if c.get("ufo_anomaly_detected"):
            ufo_flagged.append(c["chunk_id"])
        if c.get("cryptid_anomaly_detected"):
            cryptid_flagged.append(c["chunk_id"])

    build_at_str = build_at

    frontmatter = f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {total_pages}
total_chunks: {total_chunks}
chunk_types_histogram: {json.dumps(type_hist, ensure_ascii=False)}
multi_page_tables: []
ufo_anomalies_flagged: {json.dumps(ufo_flagged)}
cryptid_anomalies_flagged: {json.dumps(cryptid_flagged)}
build_approach: "subagents"
build_model: "{MODEL}"
build_at: "{build_at_str}"
---

"""

    # Group chunks by page
    chunks_by_page = {}
    for pn, c in all_chunks:
        chunks_by_page.setdefault(pn, []).append(c)

    # NOTE(review): in the copy under review the markup inside several string
    # literals of this function was missing (summary and per-chunk lines
    # reduced to "\n", metadata wrapper split by raw newlines). The HTML
    # comments, anchor, and <details> wrapper below are a best-effort
    # reconstruction — confirm the exact markup against the upstream file.
    body_parts = []
    for pn, _, _ in pages:
        pg = _as_dict(results_map.get(pn))
        summary_en = pg.get("page_summary_en", "")
        summary_pt = pg.get("page_summary_pt_br", "")
        body_parts.append(f"\n## Page {pn}\n")
        if summary_en:
            body_parts.append(_html_comment("page_summary_en", summary_en))
        if summary_pt:
            body_parts.append(_html_comment("page_summary_pt_br", summary_pt))
        body_parts.append("\n")

        for c in chunks_by_page.get(pn, []):
            cid = c["chunk_id"]
            ctype = c.get("type", "unknown")
            bbox = _as_dict(c.get("bbox"))
            meta = _as_dict(c.get("metadata"))
            bbox_str = (
                f"{_num(bbox, 'x'):.2f}/{_num(bbox, 'y'):.2f}/"
                f"{_num(bbox, 'w'):.2f}/{_num(bbox, 'h'):.2f}"
            )

            body_parts.append(f"<!-- chunk:{cid} -->\n")
            body_parts.append(f'<a id="{cid}"></a>\n')
            body_parts.append(f"### Chunk {cid} — {ctype} · p{pn} · bbox: {bbox_str}\n\n")
            body_parts.append(f"**EN:** {c.get('content_en') or ''}\n\n")
            body_parts.append(f"**PT-BR:** {c.get('content_pt_br') or ''}\n\n")

            if ctype == "image" and c.get("related_image"):
                body_parts.append(f"![{cid}](./images/{c['related_image']})\n\n")
                if c.get("image_description_en"):
                    body_parts.append(f"*Image (EN): {c['image_description_en']}*\n\n")
                if c.get("image_description_pt_br"):
                    body_parts.append(f"*Imagem (PT-BR): {c['image_description_pt_br']}*\n\n")

            # Metadata details block
            meta_json = {
                "chunk_id": cid,
                "type": ctype,
                "page": pn,
                "order_global": c.get("order_global"),
                "bbox": bbox,
                "classification": meta.get("classification"),
                "formatting": meta.get("formatting", []),
                "cross_page_hint": meta.get("cross_page_hint"),
                "ocr_confidence": meta.get("ocr_confidence"),
                "ufo_anomaly_detected": c.get("ufo_anomaly_detected", False),
                "cryptid_anomaly_detected": c.get("cryptid_anomaly_detected", False),
            }
            body_parts.append("<details>\n<summary>metadata</summary>\n\n")
            body_parts.append("```json\n")
            body_parts.append(json.dumps(meta_json, ensure_ascii=False, indent=2))
            body_parts.append("\n```\n\n</details>\n\n---\n\n")

    doc_content = frontmatter + "".join(body_parts)
    doc_path = RAW_DIR / "document.md"
    doc_path.write_text(doc_content, encoding="utf-8")
    doc_bytes = len(doc_content.encode("utf-8"))
    print(f"[Phase 6] Written document.md ({doc_bytes:,} bytes)")
    return doc_bytes, ufo_flagged, cryptid_flagged


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Run all six phases for ``DOC_ID`` and print a one-line STATS summary.

    Exits with status 1 when no page PNGs are found.
    """
    start = time.monotonic()

    # Ensure output dirs exist
    for d in [CHUNKS_DIR, IMAGES_DIR, TABLES_DIR]:
        d.mkdir(parents=True, exist_ok=True)

    pages = get_page_files()
    if not pages:
        print("ERROR: no PNG pages found", file=sys.stderr)
        sys.exit(1)

    total_pages = len(pages)
    print(f"Document: {DOC_ID}")
    print(f"Pages found: {total_pages}")

    # Phase 1: vision + OCR per page
    results_map = phase_process_pages(pages)

    # Phase 2: global chunk numbering
    all_chunks = phase_number_chunks(pages, results_map)

    # Phase 3: crop image chunks
    phase_crop_images(all_chunks, pages)

    # Phase 4: write chunk .md files
    phase_write_chunks(all_chunks, pages)

    # Phase 5: write _index.json
    build_at = phase_write_index(all_chunks, pages)

    # Phase 6: assemble document.md
    doc_bytes, ufo_flagged, cryptid_flagged = phase_assemble_document(
        all_chunks, pages, results_map, build_at
    )

    wall = int(time.monotonic() - start)
    images_count = len(list(IMAGES_DIR.glob("IMG-*.png")))
    tables_count = len(list(TABLES_DIR.glob("TBL-*.csv")))

    print(f"\nSTATS pages_done={total_pages} chunks_total={len(all_chunks)} "
          f"images_extracted={images_count} tables_stitched={tables_count} "
          f"ufo_anomalies={len(ufo_flagged)} cryptid_anomalies={len(cryptid_flagged)} "
          f"wall_seconds={wall}")


if __name__ == "__main__":
    main()