disclosure-bureau/scripts/rebuild_doc65_full.py

534 lines
21 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_full.py
Full pipeline to rebuild doc-65-hs1-834228961-62-hq-83894-section-4.
Uses Google Gemini flash for vision analysis of each page.
Generates chunks/, images/, _index.json, document.md
"""
import os
import sys
import json
import base64
import datetime
import time
import re
import concurrent.futures
from pathlib import Path
from PIL import Image as PILImage
# ---- Config ----
# Identifier of the document being rebuilt; it is also the name of the
# per-document sub-directory inside each of the raw / png / ocr trees below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-4"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 4 — FBI Flying Discs / UAP Investigation Files"

# Input/output roots (absolute paths on the author's machine).
RAW_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID

# Output sub-directories written under RAW_DIR.
CHUNKS_DIR = RAW_DIR.joinpath("chunks")
IMAGES_DIR = RAW_DIR.joinpath("images")
TABLES_DIR = RAW_DIR.joinpath("tables")

# API key comes from the environment; empty string when the variable is unset.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
BATCH_SIZE = 4  # conservative for API limits
MAX_WORKERS = 4

# ---- Ensure dirs ----
# Runs at import time: the output directories are created if missing.
for d in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    d.mkdir(parents=True, exist_ok=True)
# ---- Page map ----
def build_page_map():
    """Pair page PNGs with OCR text files, keyed by 1-based page number.

    Files are taken from PNG_DIR (``*.png``) and OCR_DIR (``*.txt``), each
    sorted by name, and paired positionally: the n-th PNG goes with the n-th
    OCR file.

    Returns:
        dict mapping page number (int, starting at 1) to a dict with keys
        ``png`` (full path as str), ``ocr`` (full path as str) and
        ``png_filename`` (bare PNG file name).
    """
    pngs = sorted(name for name in os.listdir(PNG_DIR) if name.endswith('.png'))
    ocrs = sorted(name for name in os.listdir(OCR_DIR) if name.endswith('.txt'))

    # zip() stops at the shorter list, so a count mismatch would otherwise
    # drop (and possibly mis-pair) pages without any indication.
    if len(pngs) != len(ocrs):
        print(
            f"WARNING: {len(pngs)} PNG files but {len(ocrs)} OCR files; "
            f"only the first {min(len(pngs), len(ocrs))} sorted pairs are used"
        )

    return {
        page_num: {
            'png': str(PNG_DIR / png),
            'ocr': str(OCR_DIR / ocr),
            'png_filename': png,
        }
        for page_num, (png, ocr) in enumerate(zip(pngs, ocrs), 1)
    }
def read_ocr(path):
    """Return the stripped UTF-8 text of an OCR file, or "" if it is unreadable.

    Args:
        path: location of the OCR text file.

    Returns:
        The file content with leading/trailing whitespace removed, or an
        empty string when the file is missing, cannot be opened, or is not
        valid UTF-8.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers
        # UnicodeDecodeError. Anything else (e.g. KeyboardInterrupt) must
        # propagate instead of being silently swallowed.
        return ""
def now_iso():
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    # datetime.utcnow() is deprecated; an aware UTC "now" formats identically.
    return datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
# ---- Gemini vision call ----
# NOTE: this import sits mid-file rather than with the imports at the top.
import google.generativeai as genai
# Configures the SDK at import time with the key read from the GEMINI_API_KEY
# environment variable (an empty string when that variable is unset).
genai.configure(api_key=GEMINI_API_KEY)
# Prompt template sent along with every page image. It contains exactly one
# %-placeholder (`%d`, the page number) and is filled in with
# `PAGE_ANALYSIS_PROMPT % page_num`; any literal percent sign added to this
# text would therefore have to be written as `%%`.
# NOTE(review): the total page count ("of 179 total") is hard-coded in the
# text and is not derived from the page map.
PAGE_ANALYSIS_PROMPT = """You are a document analyst rebuilding a declassified FBI UAP/flying saucer investigation file.
Analyze this page image carefully and return ONLY valid JSON (no markdown code fences, no explanation).
The JSON must have this exact structure:
{
"page_number": <int>,
"chunks": [
{
"type": "<one of: cover|letterhead|heading|subheading|body_text|caption|signature|stamp|redaction|table_marker|image|handwriting|footer|header|separator|blank>",
"order_in_page": <int starting at 1>,
"content_en": "<English text or description>",
"content_pt_br": "<Brazilian Portuguese translation/description>",
"bbox": {"x": <0.0-1.0>, "y": <0.0-1.0>, "w": <0.0-1.0>, "h": <0.0-1.0>},
"classification": <null or "SECRET" or "TOP SECRET" etc>,
"formatting": [],
"cross_page_hint": "self_contained",
"ocr_confidence": <0.0-1.0>,
"ocr_source_lines": [],
"redaction_code": <null or "(b)(1)" etc>,
"redaction_inferred_content_type": null,
"image_type": <null or "photograph" or "diagram" or "sketch" or "stamp" or "logo">,
"ufo_anomaly_detected": false,
"cryptid_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null,
"image_description_en": null,
"image_description_pt_br": null,
"extracted_text": null
}
]
}
Rules:
- Identify ALL distinct content blocks (letterhead, classification markings, memo headers, body paragraphs, stamps, redactions, signatures, photos, etc.)
- For redacted areas: type="redaction", content_en="[REDACTED]", content_pt_br="[REDATADO]", include redaction_code if visible
- For blank pages: ONE chunk with type="blank"
- For stamps: type="stamp", include extracted_text with what the stamp says
- For signatures: type="signature"
- For photos/images: type="image", image_type appropriately, image_description_en with detailed description
- UAP/flying saucer content: set ufo_anomaly_detected=true and fill ufo_anomaly_type and ufo_anomaly_rationale
- bbox values are fractions of page dimensions (0.0 to 1.0)
- content_en must be verbatim OCR text where possible, or [description] for non-text
- content_pt_br must be Brazilian Portuguese translation
- This is page %d of 179 total
- Document: FBI investigation files about flying discs/UAP reports, 1947-era
"""
def _strip_code_fences(text):
    """Remove a wrapping Markdown code fence (``` or ```json) from a model reply."""
    text = text.strip()
    if text.startswith('```'):
        text = re.sub(r'^```(?:json)?\s*', '', text)
        text = re.sub(r'\s*```$', '', text)
    return text


def _vision_failure_result(page_num):
    """Build the single-chunk placeholder returned when every attempt failed."""
    return {
        "page_number": page_num,
        "chunks": [{
            "type": "body_text",
            "order_in_page": 1,
            "content_en": f"[Page {page_num} — vision analysis failed]",
            "content_pt_br": f"[Página {page_num} — análise visual falhou]",
            "bbox": {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "cryptid_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None
        }]
    }


def analyze_page_with_gemini(page_num, png_path, ocr_text, retry=3):
    """Call Gemini flash to analyze a page image.

    Args:
        page_num: page number substituted into the prompt and used in log
            lines and in the failure placeholder.
        png_path: path of the page PNG sent to the model.
        ocr_text: OCR text for the page; when non-empty, its first 2000
            characters are appended to the prompt.
        retry: maximum number of attempts.

    Returns:
        The parsed JSON reply (a dict whose ``chunks`` value is a list of
        dicts), or a one-chunk placeholder dict when no attempt produced a
        usable reply.
    """
    # Imported once per call instead of on every retry iteration.
    from google.generativeai.types import HarmCategory, HarmBlockThreshold

    prompt = PAGE_ANALYSIS_PROMPT % page_num
    if ocr_text:
        prompt += f"\n\nOCR text available (may be incomplete):\n{ocr_text[:2000]}"

    generation_config = {"temperature": 0.1, "max_output_tokens": 4096}
    safety_settings = {
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    }

    for attempt in range(retry):
        try:
            model = genai.GenerativeModel('gemini-1.5-flash')
            with open(png_path, 'rb') as f:
                img_data = f.read()
            response = model.generate_content(
                [
                    {"mime_type": "image/png", "data": img_data},
                    prompt
                ],
                generation_config=generation_config,
                safety_settings=safety_settings,
            )
            data = json.loads(_strip_code_fences(response.text))
            # Valid JSON of the wrong shape (a bare list, a dict without
            # "chunks", ...) would only fail later in the callers; treat it
            # as a failed attempt so it gets retried here.
            if not isinstance(data, dict) or not isinstance(data.get('chunks'), list):
                raise ValueError("response JSON is not an object with a 'chunks' list")
            # Callers access every chunk with .get(); drop non-object entries.
            data['chunks'] = [c for c in data['chunks'] if isinstance(c, dict)]
            return data
        except json.JSONDecodeError as e:
            print(f" Page {page_num}: JSON parse error (attempt {attempt+1}): {e}")
            if attempt < retry - 1:
                time.sleep(2)
        except Exception as e:
            # Best-effort by design: any API, I/O or shape error is logged
            # and retried, and the page falls back to a placeholder below.
            print(f" Page {page_num}: Error (attempt {attempt+1}): {e}")
            if attempt < retry - 1:
                time.sleep(3)

    # Fallback: minimal chunk
    return _vision_failure_result(page_num)
def process_page(args):
    """Worker for one page: load its OCR text and run the Gemini analysis.

    Args:
        args: a ``(page_num, png_path, ocr_path)`` tuple.

    Returns:
        ``(page_num, result)`` where ``result`` is the dict returned by
        ``analyze_page_with_gemini``.
    """
    page_num, image_path, text_path = args
    text = read_ocr(text_path)
    print(f" Processing page {page_num:03d}...", flush=True)
    analysis = analyze_page_with_gemini(page_num, image_path, text)
    chunk_count = len(analysis.get('chunks', []))
    print(f" Done page {page_num:03d}: {chunk_count} chunks", flush=True)
    return page_num, analysis
def crop_image_for_chunk(page_png, bbox, out_path):
    """Crop image region for an image-type chunk.

    Args:
        page_png: path of the full-page PNG.
        bbox: dict with fractional ``x``, ``y``, ``w``, ``h`` values relative
            to the page width/height; missing keys default to the full page
            (x=0, y=0, w=1, h=1).
        out_path: where the cropped image is saved.

    Returns:
        True when the crop was saved; False when the resulting pixel box is
        empty or when any error occurs (the error is printed, not raised).
    """
    pad = 0.005  # grow the box by 0.5% of the page size on every side
    try:
        x = bbox.get('x', 0)
        y = bbox.get('y', 0)
        w = bbox.get('w', 1)
        h = bbox.get('h', 1)
        # Context manager so the page image's file handle is always released;
        # this runs once per image chunk across the whole document.
        with PILImage.open(page_png) as im:
            W, H = im.size
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                return False
            im.crop((left, top, right, bottom)).save(out_path)
        return True
    except Exception as e:
        # Deliberately best-effort: a failed crop must not abort the rebuild.
        print(f" Crop error: {e}")
        return False
# Characters that force a string to be emitted as a quoted YAML scalar.
_YAML_QUOTE_TRIGGER_CHARS = frozenset(':#[]{}*&!|>\'"\\,%@`')
# Plain words a YAML parser may read as null / boolean / float, not as text.
_YAML_AMBIGUOUS_PLAIN = frozenset({
    'null', '~', 'true', 'false', 'yes', 'no', 'on', 'off', 'y', 'n',
    '.inf', '+.inf', '.nan',
})


def _yaml_needs_quotes(s):
    """Return True when string *s* is not safe as a plain (unquoted) YAML scalar."""
    if not s or s != s.strip():
        return True
    first = s[0]
    # Leading '-' / '?' are YAML indicators; a leading digit (or sign/dot
    # followed by a digit) may be parsed as a number or a date.
    if first in '-?' or first.isdigit():
        return True
    if first in '+.' and len(s) > 1 and s[1].isdigit():
        return True
    if s.lower() in _YAML_AMBIGUOUS_PLAIN:
        return True
    return any(c in _YAML_QUOTE_TRIGGER_CHARS or ord(c) < 0x20 for c in s)


def _yaml_scalar(v):
    """Render a JSON-like Python value as single-line (flow style) YAML."""
    if v is None:
        return "null"
    if isinstance(v, bool):  # must precede the int check: bool is an int
        return str(v).lower()
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, list):
        return "[" + ", ".join(_yaml_scalar(item) for item in v) + "]"
    if isinstance(v, dict):
        return "{" + ", ".join(f"{k}: {_yaml_scalar(vv)}" for k, vv in v.items()) + "}"
    s = str(v)
    if _yaml_needs_quotes(s):
        # A JSON string literal is a valid YAML double-quoted scalar, and
        # json.dumps escapes quotes, backslashes, newlines and control chars.
        return json.dumps(s, ensure_ascii=False)
    return s


def write_chunk_file(chunk_id, chunk_data, page_num, order_global, prev_chunk, next_chunk, source_png_filename):
    """Write a single chunk .md file.

    The file is ``CHUNKS_DIR/<chunk_id>.md``: YAML front matter holding the
    chunk metadata, followed by the English and Brazilian-Portuguese content.

    Args:
        chunk_id: identifier used for the file name and the ``chunk_id`` field.
        chunk_data: chunk dict as produced by the page analysis.
        page_num: page the chunk belongs to.
        order_global: 1-based position of the chunk in the whole document.
        prev_chunk: id of the preceding chunk, or None.
        next_chunk: id of the following chunk, or None.
        source_png_filename: file name of the page PNG the chunk came from.

    Returns:
        The metadata dict that was written to the front matter.
    """
    # A missing or non-object bbox (e.g. JSON null) falls back to the default.
    bbox = chunk_data.get('bbox')
    if not isinstance(bbox, dict):
        bbox = {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90}

    # Image chunks point at the crop that is stored under images/.
    related_image = None
    if chunk_data.get('type') == 'image':
        related_image = f"IMG-{chunk_id}.png"

    meta = {
        "chunk_id": chunk_id,
        "type": chunk_data.get('type', 'body_text'),
        "page": page_num,
        "order_in_page": chunk_data.get('order_in_page', 1),
        "order_global": order_global,
        "bbox": bbox,
        "classification": chunk_data.get('classification'),
        "formatting": chunk_data.get('formatting', []),
        "cross_page_hint": chunk_data.get('cross_page_hint', 'self_contained'),
        "prev_chunk": prev_chunk,
        "next_chunk": next_chunk,
        "related_image": related_image,
        "related_table": None,
        "ocr_confidence": chunk_data.get('ocr_confidence', 0.8),
        "ocr_source_lines": chunk_data.get('ocr_source_lines', []),
        "redaction_code": chunk_data.get('redaction_code'),
        "redaction_inferred_content_type": chunk_data.get('redaction_inferred_content_type'),
        "image_type": chunk_data.get('image_type'),
        "ufo_anomaly_detected": chunk_data.get('ufo_anomaly_detected', False),
        "cryptid_anomaly_detected": chunk_data.get('cryptid_anomaly_detected', False),
        "ufo_anomaly_type": chunk_data.get('ufo_anomaly_type'),
        "ufo_anomaly_rationale": chunk_data.get('ufo_anomaly_rationale'),
        "cryptid_anomaly_type": chunk_data.get('cryptid_anomaly_type'),
        "cryptid_anomaly_rationale": chunk_data.get('cryptid_anomaly_rationale'),
        "image_description_en": chunk_data.get('image_description_en'),
        "image_description_pt_br": chunk_data.get('image_description_pt_br'),
        "extracted_text": chunk_data.get('extracted_text'),
        "source_png": f"../../processing/png/{DOC_ID}/{source_png_filename}",
    }

    # JSON null content would otherwise be written as the text "None".
    content_en = chunk_data.get('content_en') or ''
    content_pt_br = chunk_data.get('content_pt_br') or ''

    # Build YAML frontmatter followed by the bilingual body.
    lines = ["---"]
    lines.extend(f"{key}: {_yaml_scalar(value)}" for key, value in meta.items())
    lines.extend([
        "---",
        "",
        f"**EN:** {content_en}",
        "",
        f"**PT-BR:** {content_pt_br}",
        "",
    ])

    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
    return meta
def _failed_page_result(page_num):
    """Return the one-chunk placeholder recorded when a page's worker raised."""
    return {
        "page_number": page_num,
        "chunks": [{
            "type": "body_text",
            "order_in_page": 1,
            "content_en": f"[Page {page_num} — processing error]",
            "content_pt_br": f"[Página {page_num} — erro de processamento]",
            "bbox": {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "cryptid_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None,
        }],
    }


def _is_number(value):
    """True for int/float values, excluding bool."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _chunk_order(chunk):
    """Sort key within a page; a missing or non-numeric order counts as 1."""
    order = chunk.get('order_in_page', 1)
    return order if _is_number(order) else 1


def _chunk_bbox(chunk):
    """Return the chunk's bbox dict, or the default box when absent/not a dict."""
    bbox = chunk.get('bbox')
    if isinstance(bbox, dict):
        return bbox
    return {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90}


def _chunk_text(chunk, key):
    """Return chunk[key] as text; missing or JSON-null values become ''."""
    value = chunk.get(key)
    return '' if value is None else str(value)


def _analyze_pages(page_map):
    """Analyze every page in batches of BATCH_SIZE; return {page_num: result}."""
    results = {}
    page_nums = list(page_map)
    total_pages = len(page_nums)
    for batch_start in range(0, total_pages, BATCH_SIZE):
        batch = page_nums[batch_start:batch_start + BATCH_SIZE]
        batch_args = [(p, page_map[p]['png'], page_map[p]['ocr']) for p in batch]
        print(f"\nBatch {batch_start//BATCH_SIZE + 1}: pages {batch[0]}-{batch[-1]}", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {executor.submit(process_page, args): args[0] for args in batch_args}
            for future in concurrent.futures.as_completed(futures):
                page_num = futures[future]
                try:
                    # as_completed() only yields finished futures, so no
                    # timeout is needed (or effective) on result().
                    pn, result = future.result()
                    results[pn] = result
                except Exception as e:
                    print(f" Page {page_num} failed: {e}")
                    results[page_num] = _failed_page_result(page_num)
        # Small pause between batches to be respectful of rate limits
        if batch_start + BATCH_SIZE < total_pages:
            time.sleep(1)
    return results


def main():
    """Run the full rebuild for DOC_ID.

    Steps: build the page map, analyze every page with the vision model,
    number the chunks globally, write one file per chunk (cropping image
    chunks), then write ``_index.json`` and ``document.md`` under RAW_DIR and
    print a STATS line.
    """
    start_time = time.time()
    page_map = build_page_map()
    total_pages = len(page_map)
    print(f"Starting rebuild: {total_pages} pages")

    # Process all pages in batches of BATCH_SIZE
    all_page_results = _analyze_pages(page_map)
    print("\nAll pages analyzed. Assigning global chunk IDs...")

    # --- Global chunk numbering ---
    # Ordered list of (page_num, chunk_data, source_png_filename).
    all_chunks_ordered = []
    for page_num in sorted(all_page_results):
        raw_chunks = all_page_results[page_num].get('chunks') or []
        # The model output is untrusted: keep only object-shaped chunks and
        # use a sort key that tolerates null / non-numeric order values.
        chunks = [c for c in raw_chunks if isinstance(c, dict)]
        chunks.sort(key=_chunk_order)
        source_png = page_map[page_num]['png_filename']
        all_chunks_ordered.extend((page_num, chunk, source_png) for chunk in chunks)

    total_chunks = len(all_chunks_ordered)
    print(f"Total chunks: {total_chunks}")

    # Assign chunk_ids and write chunk files
    chunk_id_list = [f"c{n:04d}" for n in range(1, total_chunks + 1)]
    index_entries = []
    all_chunk_meta = []
    images_extracted = 0
    ufo_anomalies = []
    cryptid_anomalies = []
    print("Writing chunk files...")
    for i, (page_num, chunk_data, source_png) in enumerate(all_chunks_ordered):
        chunk_id = chunk_id_list[i]
        order_global = i + 1
        prev_chunk = chunk_id_list[i - 1] if i > 0 else None
        next_chunk = chunk_id_list[i + 1] if i < total_chunks - 1 else None

        # Crop image if needed
        if chunk_data.get('type') == 'image':
            img_out = IMAGES_DIR / f"IMG-{chunk_id}.png"
            png_path = page_map[page_num]['png']
            if crop_image_for_chunk(png_path, chunk_data.get('bbox', {}), img_out):
                images_extracted += 1

        # Write chunk file
        meta = write_chunk_file(
            chunk_id, chunk_data, page_num, order_global,
            prev_chunk, next_chunk, source_png
        )
        all_chunk_meta.append(meta)

        # Track anomalies
        if chunk_data.get('ufo_anomaly_detected'):
            ufo_anomalies.append(chunk_id)
        if chunk_data.get('cryptid_anomaly_detected'):
            cryptid_anomalies.append(chunk_id)

        # Index entry; `or 'body_text'` also covers an explicit JSON null type.
        preview = _chunk_text(chunk_data, 'content_en')[:80].replace('\n', ' ')
        index_entries.append({
            "chunk_id": chunk_id,
            "type": chunk_data.get('type') or 'body_text',
            "page": page_num,
            "order_in_page": chunk_data.get('order_in_page', 1),
            "order_global": order_global,
            "file": f"chunks/{chunk_id}.md",
            "bbox": _chunk_bbox(chunk_data),
            "preview": preview
        })

    # --- Write _index.json ---
    print("Writing _index.json...")
    build_at = now_iso()
    # Compute chunk type histogram
    type_hist = {}
    for entry in index_entries:
        t = entry['type']
        type_hist[t] = type_hist.get(t, 0) + 1
    # NOTE(review): build_approach / build_model are fixed labels and do not
    # name the Gemini model this script actually calls — confirm intended.
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_entries
    }
    with open(RAW_DIR / "_index.json", 'w', encoding='utf-8') as f:
        json.dump(index_data, f, indent=2, ensure_ascii=False)

    # --- Assemble document.md ---
    print("Assembling document.md...")
    hist_str = ", ".join(f'"{k}": {v}' for k, v in sorted(type_hist.items()))
    ufo_str = "[" + ", ".join(ufo_anomalies) + "]"
    cryptid_str = "[" + ", ".join(cryptid_anomalies) + "]"
    doc_lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {total_pages}",
        f"total_chunks: {total_chunks}",
        f"chunk_types_histogram: {{{hist_str}}}",
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {ufo_str}",
        f"cryptid_anomalies_flagged: {cryptid_str}",
        'build_approach: "subagents"',
        "build_model: claude-sonnet-4-6",
        f"build_at: {build_at}",
        "---",
        "",
    ]

    # Chunks are already ordered by page, so a page heading is emitted
    # whenever the page number changes; each chunk's metadata is paired
    # positionally instead of being looked up by re-parsing the chunk id.
    current_page = None
    for (page_num, chunk_data, _source_png), chunk_id, meta_dict in zip(
            all_chunks_ordered, chunk_id_list, all_chunk_meta):
        if page_num != current_page:
            doc_lines.append(f"## Page {page_num}")
            doc_lines.append("")
            current_page = page_num

        ctype = chunk_data.get('type') or 'body_text'
        bbox = _chunk_bbox(chunk_data)
        x, y, w, h = (
            bbox.get(k, 0) if _is_number(bbox.get(k, 0)) else 0
            for k in ('x', 'y', 'w', 'h')
        )
        bbox_str = f"{x:.2f}/{y:.2f}/{w:.2f}/{h:.2f}"
        doc_lines.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
        doc_lines.append(f'<a id="{chunk_id}"></a>')
        # NOTE(review): there is no separator between the chunk id and the
        # chunk type in this heading (e.g. "c0001body_text"); kept as-is in
        # case consumers depend on the exact format — confirm.
        doc_lines.append(f"### Chunk {chunk_id}{ctype} · p{page_num} · bbox: {bbox_str}")
        doc_lines.append("")
        doc_lines.append(f"**EN:** {_chunk_text(chunk_data, 'content_en')}")
        doc_lines.append("")
        doc_lines.append(f"**PT-BR:** {_chunk_text(chunk_data, 'content_pt_br')}")
        doc_lines.append("")
        if ctype == 'image':
            img_path = f"./images/IMG-{chunk_id}.png"
            doc_lines.append(f"![{chunk_id} image]({img_path})")
            doc_lines.append("")
            desc = chunk_data.get('image_description_en', '')
            if desc:
                doc_lines.append(f"*{desc}*")
                doc_lines.append("")
        # Metadata details
        doc_lines.append("<details><summary>metadata</summary>")
        doc_lines.append("")
        doc_lines.append("```json")
        doc_lines.append(json.dumps(meta_dict, indent=2, ensure_ascii=False))
        doc_lines.append("```")
        doc_lines.append("")
        doc_lines.append("</details>")
        doc_lines.append("")
        doc_lines.append("---")
        doc_lines.append("")

    doc_content = '\n'.join(doc_lines)
    with open(RAW_DIR / "document.md", 'w', encoding='utf-8') as f:
        f.write(doc_content)

    doc_bytes = len(doc_content.encode('utf-8'))
    wall_seconds = int(time.time() - start_time)
    print(f"\nSTATS pages={total_pages} chunks={total_chunks} images={images_extracted} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_bytes}")
    print(f"Wall time: {wall_seconds}s")


if __name__ == "__main__":
    main()