disclosure-bureau/scripts/rebuild_doc65_gemini.py

514 lines
20 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_gemini.py
Full pipeline to rebuild doc-65-hs1-834228961-62-hq-83894-section-4.
Uses Google Gemini flash for vision analysis of each page.
CRITICAL: Always wraps Gemini calls with thread timeout (known hang issue).
"""
import os
import sys
import json
import datetime
import time
import re
import concurrent.futures
from pathlib import Path
from PIL import Image as PILImage
import warnings
warnings.filterwarnings('ignore')
# --- Identity of the document this script rebuilds --------------------------
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-4"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 4 — FBI Flying Discs / UAP Investigation Files"

# --- Locations (absolute paths; every output lives under RAW_DIR) -----------
RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")  # one PNG per page
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")  # one .txt per page
CHUNKS_DIR = RAW_DIR / "chunks"
IMAGES_DIR = RAW_DIR / "images"
TABLES_DIR = RAW_DIR / "tables"

# --- Gemini settings --------------------------------------------------------
# The key is read once at import time; it is an empty string when unset.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
GEMINI_MODEL = "gemini-2.5-flash"
BATCH_SIZE = 4            # pages submitted per batch in main()
MAX_WORKERS = 4           # thread-pool size used for each batch
GEMINI_TIMEOUT_SEC = 120  # how long analyze_page() waits for one API call

# Import-time side effect: make sure the output folders exist.
for d in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    d.mkdir(parents=True, exist_ok=True)
def build_page_map(png_dir=None, ocr_dir=None):
    """Pair page PNGs with OCR text files and number the pages from 1.

    Both directories are listed, filtered by extension (``.png`` / ``.txt``),
    sorted by filename, and paired positionally: the n-th PNG goes with the
    n-th OCR file.

    Args:
        png_dir: Directory holding the page images. Defaults to ``PNG_DIR``.
        ocr_dir: Directory holding the OCR text files. Defaults to ``OCR_DIR``.

    Returns:
        dict mapping the 1-based page number to
        ``{'png': <path str>, 'ocr': <path str>, 'png_filename': <name>}``.

    Raises:
        OSError: if either directory cannot be listed.
    """
    png_dir = Path(PNG_DIR if png_dir is None else png_dir)
    ocr_dir = Path(OCR_DIR if ocr_dir is None else ocr_dir)

    pngs = sorted(name for name in os.listdir(png_dir) if name.endswith('.png'))
    ocrs = sorted(name for name in os.listdir(ocr_dir) if name.endswith('.txt'))

    # zip() stops at the shorter list, so a count mismatch would otherwise
    # drop pages without any trace. Pairing is unchanged; we only report it.
    if len(pngs) != len(ocrs):
        print(
            f"WARNING: {len(pngs)} PNG files but {len(ocrs)} OCR files; "
            f"only the first {min(len(pngs), len(ocrs))} are paired (by sorted filename)",
            flush=True,
        )

    return {
        page_num: {
            'png': str(png_dir / png),
            'ocr': str(ocr_dir / ocr),
            'png_filename': png,
        }
        for page_num, (png, ocr) in enumerate(zip(pngs, ocrs), 1)
    }
def read_ocr(path):
    """Return the whitespace-stripped UTF-8 text of an OCR file.

    Reading is best-effort: a missing or unreadable file, or one that is not
    valid UTF-8, yields an empty string instead of raising.

    Args:
        path: Path of the OCR text file.

    Returns:
        The file content with surrounding whitespace removed, or ``""``.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    # OSError: missing/unreadable file. ValueError: covers UnicodeDecodeError
    # and invalid path strings. Anything else (e.g. KeyboardInterrupt) must
    # propagate rather than be swallowed.
    except (OSError, ValueError):
        return ""
def now_iso():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # formats to exactly the same string.
    return datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
# Compact prompt that minimizes token usage in the response.
#
# This is a %-format template with exactly two ``%d`` placeholders;
# analyze_page() fills both with the page number. The JSON skeleton spells out
# every per-chunk field, i.e. the same keys fallback_chunk() produces and
# write_chunk_file() reads back.
# NOTE(review): the total page count ("of 179") is hard-coded in the last
# line of the prompt — confirm it still matches the document being rebuilt.
PAGE_ANALYSIS_PROMPT = """Analyze this FBI declassified document page. Return ONLY raw JSON (no markdown fences).
JSON format (keep content_en values SHORT — max 300 chars per chunk, truncate with "..." if needed):
{"page_number":%d,"chunks":[{"type":"<cover|letterhead|heading|subheading|body_text|caption|signature|stamp|redaction|table_marker|image|handwriting|footer|header|separator|blank>","order_in_page":<int>,"content_en":"<text>","content_pt_br":"<pt-br>","bbox":{"x":<0-1>,"y":<0-1>,"w":<0-1>,"h":<0-1>},"classification":null,"formatting":[],"cross_page_hint":"self_contained","ocr_confidence":<0-1>,"ocr_source_lines":[],"redaction_code":null,"redaction_inferred_content_type":null,"image_type":null,"ufo_anomaly_detected":<true|false>,"cryptid_anomaly_detected":false,"ufo_anomaly_type":null,"ufo_anomaly_rationale":null,"cryptid_anomaly_type":null,"cryptid_anomaly_rationale":null,"image_description_en":null,"image_description_pt_br":null,"extracted_text":null}]}
Rules:
- Each paragraph/section = separate chunk
- Redacted: type=redaction, content_en="[REDACTED]"
- Blank page: one chunk type=blank
- Flying disc/UAP reports: ufo_anomaly_detected=true
- bbox: x=left, y=top, w=width, h=height, all 0.0-1.0
- Page %d of 179, FBI flying discs 1947"""
def fallback_chunk(page_num):
    """Build the placeholder result used when a page could not be analyzed.

    The result has the same shape as a successful analysis: a page number
    plus a list holding one ``body_text`` chunk that covers most of the page,
    says (in English and Portuguese) that vision analysis failed, and carries
    neutral values for every other field.

    Args:
        page_num: 1-based page number the placeholder stands in for.

    Returns:
        dict with keys ``page_number`` and ``chunks``.
    """
    placeholder = {
        "type": "body_text",
        "order_in_page": 1,
        "content_en": f"[Page {page_num} — vision analysis failed]",
        "content_pt_br": f"[Página {page_num} — análise visual falhou]",
        "bbox": {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90},
        "classification": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "ocr_confidence": 0.0,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "cryptid_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
        "image_description_en": None,
        "image_description_pt_br": None,
        "extracted_text": None,
    }
    return {"page_number": page_num, "chunks": [placeholder]}
def _gemini_call_inner(page_num, png_path, prompt_text):
    """Send one page image plus the prompt to Gemini and return the reply text.

    Performs a single blocking request with no timeout of its own; callers
    run it in a worker thread so they can stop waiting for it.

    Args:
        page_num: Page number (not used by the request itself).
        png_path: Path of the page PNG to upload.
        prompt_text: Fully formatted prompt sent alongside the image.

    Returns:
        ``response.text`` of the generate_content call.
    """
    # The SDK is imported lazily, inside the call.
    import google.genai as genai
    import google.genai.types as gTypes

    client = genai.Client(api_key=GEMINI_API_KEY)

    with open(png_path, 'rb') as png_file:
        image_part = gTypes.Part.from_bytes(data=png_file.read(), mime_type='image/png')

    generation_config = gTypes.GenerateContentConfig(
        temperature=0.1,
        max_output_tokens=16384,
    )
    response = client.models.generate_content(
        model=GEMINI_MODEL,
        contents=[image_part, prompt_text],
        config=generation_config,
    )
    return response.text
def clean_json_text(text):
    """Extract a JSON object from a model reply that may be fenced or truncated.

    Steps:
      1. Strip Markdown code fences and surrounding whitespace.
      2. Parse the whole text; if it is a JSON object, return it.
      3. Otherwise take the text from the first ``{`` and try ever shorter
         prefixes ending on a ``}``, optionally appending a closing suffix,
         until one parses. This recovers the leading complete chunks of a
         reply that was cut off mid-chunk.

    Args:
        text: Raw reply text, or ``None``.

    Returns:
        The parsed ``dict``, or ``None`` when nothing could be recovered.
    """
    if text is None:
        return None
    text = text.strip()
    # Remove markdown fences
    text = re.sub(r'^```(?:json)?\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\s*```\s*$', '', text, flags=re.MULTILINE)
    text = text.strip()

    # Try direct parse first. Only an object is useful to callers (they index
    # it by key), so a top-level list/number/string falls through to the
    # object search below instead of being returned as-is.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    start = text.find('{')
    if start == -1:
        return None
    text_from_start = text[start:]

    # Walk backwards over every '}' and test the prefix ending there. Each
    # candidate starts with '{', so any successful parse is a dict.
    last_bracket = text_from_start.rfind('}')
    while last_bracket > 0:
        candidate = text_from_start[:last_bracket + 1]
        # '' = already complete; the others close the chunks array and root.
        for suffix in ('', ']}', ']}}'):
            try:
                return json.loads(candidate + suffix)
            except json.JSONDecodeError:
                continue
        last_bracket = text_from_start.rfind('}', 0, last_bracket)
    return None
def analyze_page(page_num, png_path, ocr_text, retry=3):
    """Run Gemini vision analysis on one page, with timeout and retries.

    Each attempt runs ``_gemini_call_inner`` in a worker thread and waits at
    most ``GEMINI_TIMEOUT_SEC`` for it. A reply counts as valid when it parses
    to an object with a non-empty ``chunks`` entry.

    Args:
        page_num: 1-based page number; also substituted into the prompt.
        png_path: Path of the page PNG.
        ocr_text: OCR text of the page. Accepted for the callers' sake but
            not used in the request.
        retry: Maximum number of attempts.

    Returns:
        The parsed analysis dict, or ``fallback_chunk(page_num)`` when every
        attempt failed. Never raises for API or parse errors.
    """
    prompt = PAGE_ANALYSIS_PROMPT % (page_num, page_num)
    for attempt in range(retry):
        more_attempts_left = attempt < retry - 1
        # Deliberately NOT a `with` block: leaving one calls
        # shutdown(wait=True), which would block on the very call that just
        # timed out and make the timeout useless.
        pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        try:
            future = pool.submit(_gemini_call_inner, page_num, png_path, prompt)
            text = future.result(timeout=GEMINI_TIMEOUT_SEC)
            data = clean_json_text(text)
            if isinstance(data, dict) and data.get('chunks'):
                return data
            print(f" P{page_num} no valid JSON (attempt {attempt+1})", flush=True)
            if more_attempts_left:
                time.sleep(2)
        except concurrent.futures.TimeoutError:
            print(f" P{page_num} TIMEOUT (attempt {attempt+1}/{retry})", flush=True)
            if more_attempts_left:
                time.sleep(5)
        except Exception as e:
            err = str(e)
            print(f" P{page_num} error (attempt {attempt+1}): {err[:120]}", flush=True)
            # Back off only when another attempt will actually follow.
            if more_attempts_left:
                if '429' in err or 'RESOURCE_EXHAUSTED' in err:
                    wait = 15 * (attempt + 1)
                    print(f" Rate limit hit, waiting {wait}s...", flush=True)
                    time.sleep(wait)
                else:
                    time.sleep(3)
        finally:
            # Abandon the worker instead of joining it. A call that is still
            # running keeps its thread until it returns on its own.
            pool.shutdown(wait=False)
    return fallback_chunk(page_num)
def process_page_task(args):
    """Thread-pool entry point: analyze a single page.

    Args:
        args: ``(page_num, png_path, ocr_path)`` tuple.

    Returns:
        ``(page_num, result)`` where ``result`` is what ``analyze_page``
        returned for the page.
    """
    page_num, png_path, ocr_path = args
    result = analyze_page(page_num, png_path, read_ocr(ocr_path))
    chunk_count = len(result.get('chunks', []))
    print(f" P{page_num:03d}: {chunk_count} chunks", flush=True)
    return page_num, result
def crop_image_chunk(page_png, bbox, out_path):
    """Crop a chunk's bounding box out of a page image and save it.

    Args:
        page_png: Path of the full-page PNG.
        bbox: dict with fractional ``x``, ``y``, ``w``, ``h`` (fractions of
            the page width/height). Missing ``x``/``y`` default to 0, missing
            ``w``/``h`` to 0.5; width and height are at least 0.01.
        out_path: Where the cropped image is written.

    Returns:
        True when the crop was saved. False when the box is empty after
        clamping to the page, or when anything fails (the error is printed,
        never raised).
    """
    try:
        # Context manager so the image's file handle is released even when
        # cropping or saving fails.
        with PILImage.open(page_png) as im:
            W, H = im.size
            x = max(0.0, float(bbox.get('x', 0)))
            y = max(0.0, float(bbox.get('y', 0)))
            w = max(0.01, float(bbox.get('w', 0.5)))
            h = max(0.01, float(bbox.get('h', 0.5)))
            pad = 0.005  # small margin around the box, as a page fraction
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                return False
            im.crop((left, top, right, bottom)).save(str(out_path))
        return True
    except Exception as e:
        # Best effort: one bad image must not stop the whole build.
        print(f" Crop error: {e}", flush=True)
        return False
# Plain words a YAML parser resolves to null/bool instead of a string.
_YAML_RESERVED_WORDS = frozenset({'null', '~', 'true', 'false', 'yes', 'no', 'on', 'off'})
# Characters that are unsafe in an unquoted scalar. ',' matters because lists
# and dicts are emitted in flow style, where it separates items.
_YAML_SPECIAL_CHARS = frozenset(':#[]{}|>*&!\'"\n\r,%@`')


def _yaml_needs_quotes(s):
    """Return True when string *s* cannot be written as a plain YAML scalar."""
    if not s or s != s.strip():
        return True  # empty, or whitespace that a parser would trim
    if s[0] in '-?':
        return True  # could be read as a sequence/mapping-key indicator
    if any(c in _YAML_SPECIAL_CHARS for c in s):
        return True
    if s.lower() in _YAML_RESERVED_WORDS:
        return True
    if re.match(r'\d{4}-\d{1,2}-\d{1,2}', s):
        return True  # would be read as a date/timestamp
    for parse in (float, lambda text: int(text, 0)):
        try:
            parse(s)
            return True  # would be read as a number
        except ValueError:
            pass
    return False


def yaml_scalar(v):
    """Render a Python value as inline (flow-style) YAML text.

    Args:
        v: ``None``, bool, int, float, list, dict, or anything else (which is
            converted with ``str``).

    Returns:
        ``null`` / ``true`` / ``false`` / the number / ``[a, b]`` /
        ``{k: v}``; strings are emitted plain when that is unambiguous and
        double-quoted (with ``\\``, ``"`` and newlines escaped, carriage
        returns dropped) otherwise.
    """
    if v is None:
        return "null"
    if isinstance(v, bool):  # must precede the int check: bool is an int
        return "true" if v else "false"
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, list):
        if not v:
            return "[]"
        return "[" + ", ".join(yaml_scalar(i) for i in v) + "]"
    if isinstance(v, dict):
        return "{" + ", ".join(f"{k}: {yaml_scalar(vv)}" for k, vv in v.items()) + "}"
    s = str(v)
    if _yaml_needs_quotes(s):
        s = s.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n').replace('\r', '')
        return f'"{s}"'
    return s
def bbox_safe(bbox):
    """Normalize a model-supplied bounding box to four finite floats.

    Args:
        bbox: Expected to be a dict with ``x``, ``y``, ``w``, ``h``; may be
            anything, since it comes from model output.

    Returns:
        dict with float ``x``, ``y``, ``w``, ``h``. A missing or non-dict
        bbox yields the near-full-page default (0.05, 0.05, 0.90, 0.90); a
        single missing, null, non-numeric or non-finite entry falls back to
        its own default.
    """
    import math

    defaults = {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90}
    if not bbox or not isinstance(bbox, dict):
        return defaults

    safe = {}
    for key, default in defaults.items():
        try:
            value = float(bbox.get(key, default))
        except (TypeError, ValueError):
            # e.g. JSON null or a string like "left": keep the build going.
            value = default
        safe[key] = value if math.isfinite(value) else default
    return safe
def write_chunk_file(chunk_id, chunk_data, page_num, order_global, prev_chunk, next_chunk, source_png_filename):
    """Write one chunk as ``<CHUNKS_DIR>/<chunk_id>.md`` and return its metadata.

    The file consists of a YAML front-matter block with the chunk metadata,
    followed by the English and Portuguese content lines.

    Args:
        chunk_id: Identifier of the chunk; also the file stem.
        chunk_data: Chunk dict as produced by the page analysis.
        page_num: Page the chunk belongs to.
        order_global: Position of the chunk in the whole document.
        prev_chunk: Id of the preceding chunk, or ``None``.
        next_chunk: Id of the following chunk, or ``None``.
        source_png_filename: File name of the page PNG the chunk came from.

    Returns:
        The metadata dict that was written to the front matter.
    """
    bbox = bbox_safe(chunk_data.get('bbox'))
    ctype = chunk_data.get('type', 'body_text')
    # Image chunks point at their crop; the name is set whether or not the
    # crop file was actually produced.
    related_image = f"IMG-{chunk_id}.png" if ctype == 'image' else None
    meta = {
        "chunk_id": chunk_id,
        "type": ctype,
        "page": page_num,
        "order_in_page": chunk_data.get('order_in_page', 1),
        "order_global": order_global,
        "bbox": bbox,
        "classification": chunk_data.get('classification'),
        "formatting": chunk_data.get('formatting', []),
        "cross_page_hint": chunk_data.get('cross_page_hint', 'self_contained'),
        "prev_chunk": prev_chunk,
        "next_chunk": next_chunk,
        "related_image": related_image,
        "related_table": None,
        "ocr_confidence": chunk_data.get('ocr_confidence', 0.8),
        "ocr_source_lines": chunk_data.get('ocr_source_lines', []),
        "redaction_code": chunk_data.get('redaction_code'),
        "redaction_inferred_content_type": chunk_data.get('redaction_inferred_content_type'),
        "image_type": chunk_data.get('image_type'),
        "ufo_anomaly_detected": bool(chunk_data.get('ufo_anomaly_detected', False)),
        "cryptid_anomaly_detected": bool(chunk_data.get('cryptid_anomaly_detected', False)),
        "ufo_anomaly_type": chunk_data.get('ufo_anomaly_type'),
        "ufo_anomaly_rationale": chunk_data.get('ufo_anomaly_rationale'),
        "cryptid_anomaly_type": chunk_data.get('cryptid_anomaly_type'),
        "cryptid_anomaly_rationale": chunk_data.get('cryptid_anomaly_rationale'),
        "image_description_en": chunk_data.get('image_description_en'),
        "image_description_pt_br": chunk_data.get('image_description_pt_br'),
        "extracted_text": chunk_data.get('extracted_text'),
        "source_png": f"../../processing/png/{DOC_ID}/{source_png_filename}",
    }

    # A JSON null in a content field must not be written as the text "None".
    content_en = chunk_data.get('content_en')
    content_pt_br = chunk_data.get('content_pt_br')
    content_en = "" if content_en is None else content_en
    content_pt_br = "" if content_pt_br is None else content_pt_br

    lines = ["---"]
    # yaml_scalar renders dict values (the bbox) as an inline {k: v} mapping.
    lines.extend(f"{k}: {yaml_scalar(v)}" for k, v in meta.items())
    lines.extend([
        "---",
        "",
        f"**EN:** {content_en}",
        "",
        f"**PT-BR:** {content_pt_br}",
        "",
    ])

    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    with open(str(out_path), 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
    return meta
def _display_text(value):
    """Return *value* as text for output, rendering a JSON null as ''."""
    return "" if value is None else str(value)


def _chunk_order_key(chunk):
    """Sort key for a chunk within its page; malformed values count as 1."""
    try:
        return float(chunk.get('order_in_page', 1))
    except (TypeError, ValueError):
        return 1.0


def _load_page_cache(cache_file, page_map):
    """Load previously analyzed pages; unreadable or stale data is ignored."""
    if not cache_file.exists():
        return {}
    print("Loading partial cache...", flush=True)
    try:
        with open(str(cache_file), 'r', encoding='utf-8') as f:
            cached = json.load(f)
        if not isinstance(cached, dict):
            raise ValueError("top-level JSON value is not an object")
        results = {int(k): v for k, v in cached.items()}
    except (OSError, ValueError) as e:
        print(f" Cache unreadable, starting without it: {e}", flush=True)
        return {}
    # Entries for pages that no longer exist would fail the page_map lookup
    # during the build phase.
    stale = [p for p in results if p not in page_map]
    for p in stale:
        del results[p]
    if stale:
        print(f" Ignored {len(stale)} cached pages that are not in the page map", flush=True)
    print(f" Loaded {len(results)} cached pages", flush=True)
    return results


def _save_page_cache(cache_file, results):
    """Write the cache via a temp file so an interrupted run cannot corrupt it."""
    tmp_file = cache_file.with_name(cache_file.name + ".tmp")
    with open(str(tmp_file), 'w', encoding='utf-8') as f:
        json.dump({str(k): v for k, v in results.items()}, f, ensure_ascii=False)
    os.replace(str(tmp_file), str(cache_file))


def _analyze_batch(batch_args):
    """Analyze one batch of pages concurrently and return {page_num: result}.

    Pages whose task raised, or that did not finish within the batch
    deadline, get ``fallback_chunk``.
    """
    results = {}
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    try:
        futures = {executor.submit(process_page_task, args): args[0] for args in batch_args}
        try:
            for future in concurrent.futures.as_completed(futures, timeout=600):
                page_num = futures[future]
                try:
                    pn, result = future.result()
                    results[pn] = result
                except Exception as e:
                    print(f" P{page_num} future error: {e}", flush=True)
                    results[page_num] = fallback_chunk(page_num)
        except concurrent.futures.TimeoutError:
            # The deadline is raised by the as_completed iterator itself.
            for future, page_num in futures.items():
                if page_num not in results:
                    future.cancel()
                    print(f" P{page_num} batch timeout, using fallback", flush=True)
                    results[page_num] = fallback_chunk(page_num)
    finally:
        # Every future is finished on the normal path; after a deadline, do
        # not block on tasks that are still stuck.
        executor.shutdown(wait=False)
    return results


def main():
    """Rebuild the document: analyze pages, then write chunks, index and document.md.

    Phases:
      1. Analyze every page not already in the on-disk cache, in batches of
         ``BATCH_SIZE``; the cache is rewritten after each batch so an
         interrupted run can resume.
      2. Order all chunks, assign ids ``c0001``..., crop image chunks.
      3. Write one Markdown file per chunk, ``_index.json`` and
         ``document.md`` under ``RAW_DIR``.
      4. Delete the cache and print run statistics.
    """
    start_time = time.time()
    page_map = build_page_map()
    total_pages = len(page_map)
    print(f"Pages: {total_pages}, Model: {GEMINI_MODEL}", flush=True)

    # ---- Phase 1: vision analysis (resumable) ----
    cache_file = RAW_DIR / "_page_results_cache.json"
    all_page_results = _load_page_cache(cache_file, page_map)
    pages_to_process = [p for p in page_map if p not in all_page_results]
    print(f"Pages to process: {len(pages_to_process)}", flush=True)
    total_batches = (len(pages_to_process) + BATCH_SIZE - 1) // BATCH_SIZE
    for batch_idx, batch_start in enumerate(range(0, len(pages_to_process), BATCH_SIZE)):
        batch = pages_to_process[batch_start:batch_start + BATCH_SIZE]
        batch_args = [(p, page_map[p]['png'], page_map[p]['ocr']) for p in batch]
        print(f"Batch {batch_idx+1}/{total_batches}: pages {batch[0]}-{batch[-1]}", flush=True)
        all_page_results.update(_analyze_batch(batch_args))
        _save_page_cache(cache_file, all_page_results)
        print(f" Cache: {len(all_page_results)} pages", flush=True)
        if batch_start + BATCH_SIZE < len(pages_to_process):
            time.sleep(1)  # short pause between batches
    print(f"\nAll pages processed. Building output...", flush=True)

    # ---- Phase 2: global ordering and chunk ids ----
    all_chunks_ordered = []
    for page_num in sorted(all_page_results):
        raw_chunks = all_page_results[page_num].get('chunks') or []
        # Chunks come from model output: skip non-objects, tolerate bad order.
        chunks = sorted((c for c in raw_chunks if isinstance(c, dict)), key=_chunk_order_key)
        source_png = page_map[page_num]['png_filename']
        for chunk in chunks:
            all_chunks_ordered.append((page_num, chunk, source_png))
    total_chunks = len(all_chunks_ordered)
    print(f"Total chunks: {total_chunks}", flush=True)
    chunk_id_list = [f"c{(i+1):04d}" for i in range(total_chunks)]

    print("Cropping image chunks...", flush=True)
    images_extracted = 0
    for chunk_id, (page_num, chunk_data, source_png) in zip(chunk_id_list, all_chunks_ordered):
        if chunk_data.get('type') != 'image':
            continue
        bbox = bbox_safe(chunk_data.get('bbox'))
        img_out = IMAGES_DIR / f"IMG-{chunk_id}.png"
        if crop_image_chunk(page_map[page_num]['png'], bbox, img_out):
            images_extracted += 1

    # ---- Phase 3a: per-chunk files and index entries ----
    print("Writing chunk files...", flush=True)
    index_entries = []
    all_chunk_meta = []
    ufo_anomalies = []
    cryptid_anomalies = []
    for i, (page_num, chunk_data, source_png) in enumerate(all_chunks_ordered):
        chunk_id = chunk_id_list[i]
        order_global = i + 1
        prev_chunk = chunk_id_list[i-1] if i > 0 else None
        next_chunk = chunk_id_list[i+1] if i < total_chunks - 1 else None
        meta = write_chunk_file(chunk_id, chunk_data, page_num, order_global,
                                prev_chunk, next_chunk, source_png)
        all_chunk_meta.append(meta)
        if chunk_data.get('ufo_anomaly_detected'):
            ufo_anomalies.append(chunk_id)
        if chunk_data.get('cryptid_anomaly_detected'):
            cryptid_anomalies.append(chunk_id)
        preview = _display_text(chunk_data.get('content_en'))[:80].replace('\n', ' ')
        index_entries.append({
            "chunk_id": chunk_id,
            "type": chunk_data.get('type', 'body_text'),
            "page": page_num,
            "order_in_page": chunk_data.get('order_in_page', 1),
            "order_global": order_global,
            "file": f"chunks/{chunk_id}.md",
            "bbox": bbox_safe(chunk_data.get('bbox')),
            "preview": preview
        })

    # ---- Phase 3b: _index.json ----
    print("Writing _index.json...", flush=True)
    build_at = now_iso()
    type_hist = {}
    for entry in index_entries:
        t = entry['type']
        type_hist[t] = type_hist.get(t, 0) + 1
    # NOTE(review): build_approach/build_model below (and in the document.md
    # front matter) name a Claude/subagent build although this script calls
    # GEMINI_MODEL. Left unchanged in case consumers expect these values —
    # confirm and correct if this is a copy-paste leftover.
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_entries
    }
    with open(str(RAW_DIR / "_index.json"), 'w', encoding='utf-8') as f:
        json.dump(index_data, f, indent=2, ensure_ascii=False)

    # ---- Phase 3c: document.md ----
    print("Assembling document.md...", flush=True)
    hist_str = ", ".join(f'"{k}": {v}' for k, v in sorted(type_hist.items()))
    doc_lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {total_pages}",
        f"total_chunks: {total_chunks}",
        f"chunk_types_histogram: {{{hist_str}}}",
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: [{', '.join(ufo_anomalies)}]",
        f"cryptid_anomalies_flagged: [{', '.join(cryptid_anomalies)}]",
        'build_approach: "subagents"',
        "build_model: claude-sonnet-4-6",
        f"build_at: {build_at}",
        "---",
        "",
    ]
    chunks_by_page = {}
    for i, (page_num, chunk_data, source_png) in enumerate(all_chunks_ordered):
        chunks_by_page.setdefault(page_num, []).append(
            (chunk_id_list[i], chunk_data, all_chunk_meta[i]))
    for page_num in sorted(chunks_by_page):
        doc_lines.append(f"## Page {page_num}")
        doc_lines.append("")
        for chunk_id, chunk_data, meta in chunks_by_page[page_num]:
            ctype = chunk_data.get('type', 'body_text')
            bbox = bbox_safe(chunk_data.get('bbox'))
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"
            doc_lines.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
            doc_lines.append(f'<a id="{chunk_id}"></a>')
            # Separator added between id and type; the heading previously
            # read e.g. "Chunk c0001body_text".
            doc_lines.append(f"### Chunk {chunk_id} · {ctype} · p{page_num} · bbox: {bbox_str}")
            doc_lines.append("")
            doc_lines.append(f"**EN:** {_display_text(chunk_data.get('content_en'))}")
            doc_lines.append("")
            doc_lines.append(f"**PT-BR:** {_display_text(chunk_data.get('content_pt_br'))}")
            doc_lines.append("")
            if ctype == 'image':
                doc_lines.append(f"![{chunk_id} image](./images/IMG-{chunk_id}.png)")
                doc_lines.append("")
                desc = chunk_data.get('image_description_en', '')
                if desc:
                    doc_lines.append(f"*{desc}*")
                    doc_lines.append("")
            doc_lines.append("<details><summary>metadata</summary>")
            doc_lines.append("")
            doc_lines.append("```json")
            doc_lines.append(json.dumps(meta, indent=2, ensure_ascii=False))
            doc_lines.append("```")
            doc_lines.append("")
            doc_lines.append("</details>")
            doc_lines.append("")
        doc_lines.append("---")
        doc_lines.append("")
    doc_content = '\n'.join(doc_lines)
    with open(str(RAW_DIR / "document.md"), 'w', encoding='utf-8') as f:
        f.write(doc_content)

    # ---- Phase 4: cleanup and stats ----
    doc_bytes = len(doc_content.encode('utf-8'))
    wall_seconds = int(time.time() - start_time)
    # The cache is only a resume aid; drop it once all outputs are written.
    if cache_file.exists():
        os.remove(str(cache_file))
    print(f"\nSTATS pages={total_pages} chunks={total_chunks} images={images_extracted} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_bytes}")
    print(f"Wall time: {wall_seconds}s")


if __name__ == "__main__":
    main()