disclosure-bureau/scripts/rebuild_doc65_section8.py

645 lines
24 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
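"""
rebuild_doc65_section8.py
Direct Gemini-powered rebuild of doc-65-hs1-834228961-62-hq-83894-section-8.
Reads the page PNGs and per-page OCR text, and takes the API key from
GEMINI_API_KEY (falling back to GOOGLE_API_KEY).
Produces: pages_raw.json (resumable checkpoint), chunks/, images/,
_index.json, document.md. The tables/ directory is created, but this
script does not write any table files into it.
"""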
import os
import sys
import json
import re
import time
import base64
import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FuturesTimeout
from PIL import Image
import google.genai as genai
from google.genai import types
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Identity of the single document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-8"
DOC_TITLE = "65 HS1-834228961/62-HQ-83894 Section 8"
HIGHEST_CLASS = "TOP SECRET"

# Inputs: rendered page PNGs and per-page OCR text for this document.
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID

# Outputs: everything is written beneath the document's raw directory.
RAW_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = RAW_DIR.joinpath("chunks")
IMAGES_DIR = RAW_DIR.joinpath("images")
TABLES_DIR = RAW_DIR.joinpath("tables")
PAGES_RAW = RAW_DIR.joinpath("pages_raw.json")  # per-page checkpoint file

# Model and concurrency settings.
MODEL = "models/gemini-3.1-flash-lite"
MAX_WORKERS = 4
PAGE_TIMEOUT = 150  # seconds per page

# Closed set of chunk types; it mirrors the 19-string enum listed in the
# page prompt. Any other value returned by the model is rewritten to "unknown".
VALID_TYPES = {
    "letterhead",
    "address_block",
    "classification_marking",
    "heading",
    "paragraph",
    "form_field",
    "bulleted_item",
    "numbered_item",
    "quote_block",
    "caption",
    "table_marker",
    "image",
    "stamp",
    "signature",
    "marginalia",
    "redaction",
    "footer",
    "blank_area",
    "unknown",
}
# ---------------------------------------------------------------------------
# Gemini client
# ---------------------------------------------------------------------------
# Shared Gemini client, created once at import time. GEMINI_API_KEY wins;
# GOOGLE_API_KEY is used when the former is unset or empty.
_api_key = os.environ.get("GEMINI_API_KEY")
if not _api_key:
    _api_key = os.environ.get("GOOGLE_API_KEY")
client = genai.Client(api_key=_api_key)
# ---------------------------------------------------------------------------
# Page-rebuilder prompt
# ---------------------------------------------------------------------------
# Prompt template for one page; call_gemini() fills it in with str.format().
# Placeholders: {page_number}, {doc_id}, {total_pages}, {doc_title}, {ocr_text}.
# The braces of the JSON example are doubled ({{ and }}) so that format()
# emits them as literal single braces instead of treating them as fields.
# The chunk-type list in the prompt must stay in sync with VALID_TYPES.
PAGE_PROMPT = """\
You are a forensic document reconstruction agent for The Disclosure Bureau.
Given a single page image (PNG) and its raw OCR text from a US Department of War
declassified UAP/UFO document, decompose it into LOSSLESS agentic chunks.
## Chunk types — STRICT enum (use EXACTLY one of these 19 strings):
letterhead, address_block, classification_marking, heading, paragraph,
form_field, bulleted_item, numbered_item, quote_block, caption, table_marker,
image, stamp, signature, marginalia, redaction, footer, blank_area, unknown
## Output: ONE JSON object — NO markdown fences, NO prose before/after.
{{
"page_number": {page_number},
"page_summary_en": "1-2 sentences describing this page",
"page_summary_pt_br": "1-2 frases em português brasileiro",
"page_layout": {{
"columns": 1,
"orientation": "portrait",
"page_dimensions_approx": "letter"
}},
"chunks": [
{{
"order_in_page": 1,
"type": "paragraph",
"bbox": {{"x": 0.05, "y": 0.10, "w": 0.90, "h": 0.08}},
"content_en": "verbatim English text of this chunk",
"content_pt_br": "Texto em português brasileiro",
"metadata": {{
"ocr_confidence": 0.95,
"ocr_source_lines": [1, 2, 3],
"classification": null,
"redaction_code": null,
"redaction_inferred_content_type": null,
"image_type": null,
"formatting": [],
"cross_page_hint": "self_contained",
"prev_chunk_hint": null,
"next_chunk_hint": null,
"language_in_source": "en"
}}
}}
]
}}
## Rules:
1. Order by reading order (top→bottom, left→right). order_in_page is 1-indexed.
2. One semantic unit per chunk (one paragraph, one address block, one image, etc.).
3. ALL content accounted for — never skip anything, even blank areas if significant.
4. content_en: verbatim/near-verbatim. No paraphrasing.
5. content_pt_br: Brazilian Portuguese (pt-BR). Preserve UTF-8 accents: ç ã á é í ó ú â ê ô à.
Proper nouns and verbatim quoted passages stay in source language inside pt-br.
6. Redacted blocks: content_en = "[REDACTED — <code>]". Never fabricate hidden content.
7. bbox: normalized 0..1 relative to page PNG size. Tight around the chunk.
8. cross_page_hint: self_contained | continues_from_prev | continues_to_next
9. image chunks: content_en = brief 1-sentence placeholder description (will be analyzed separately).
10. classification field: exact string as it appears (e.g. "TOP SECRET", "SECRET//NOFORN") or null.
Document context:
doc_id: {doc_id}
page_number: {page_number} of {total_pages}
doc_title: {doc_title}
OCR text (layout-preserved, may have errors — trust the image when they disagree):
---
{ocr_text}
---
Now analyze the image + OCR and output the JSON:"""
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def get_page_files(png_dir=None, ocr_dir=None):
    """Discover page images and pair each one with its OCR text file.

    Args:
        png_dir: Directory holding ``p-<n>.png`` page images. Defaults to
            the module-level ``PNG_DIR``.
        ocr_dir: Directory holding the matching ``p-<n>.txt`` OCR files.
            Defaults to the module-level ``OCR_DIR``.

    Returns:
        A list of ``(page_number, png_path, ocr_path_or_None)`` tuples sorted
        by page number. ``ocr_path`` is ``None`` when no OCR file exists.
    """
    png_dir = PNG_DIR if png_dir is None else Path(png_dir)
    ocr_dir = OCR_DIR if ocr_dir is None else Path(ocr_dir)

    pages = []
    for png in png_dir.glob("p-*.png"):
        # Leading zeros are optional, so both p-007.png and p-7.png are page 7.
        m = re.fullmatch(r"p-0*(\d+)\.png", png.name)
        if not m:
            continue
        pn = int(m.group(1))
        # OCR: try zero-padded 3-digit, then bare number.
        candidates = (ocr_dir / f"p-{pn:03d}.txt", ocr_dir / f"p-{pn}.txt")
        ocr = next((path for path in candidates if path.exists()), None)
        pages.append((pn, png, ocr))

    # Sort numerically, not by filename: a lexicographic sort puts p-10.png
    # before p-2.png, and every later phase relies on this list being in
    # reading order. The name is a tie-breaker to keep the result stable.
    pages.sort(key=lambda entry: (entry[0], entry[1].name))
    return pages
def encode_png(path):
    """Return the contents of the file at *path* as a base64 text string."""
    with open(path, "rb") as handle:
        raw = handle.read()
    encoded = base64.b64encode(raw)
    return encoded.decode()
def call_gemini(png_path, ocr_text, page_num, total_pages):
    """Send one page image plus its OCR text to Gemini and return the raw reply.

    Args:
        png_path: Path of the page PNG to attach to the request.
        ocr_text: OCR text for the page; only the first 5000 characters are
            placed in the prompt.
        page_num: Page number, interpolated into the prompt.
        total_pages: Total page count, interpolated into the prompt.

    Returns:
        The response text, or ``None`` when the model returned no text at all.

    Raises:
        concurrent.futures.TimeoutError: No answer within ``PAGE_TIMEOUT``
            seconds.
        Exception: Whatever the Gemini client raises for a failed request.
    """
    prompt = PAGE_PROMPT.format(
        doc_id=DOC_ID,
        page_number=page_num,
        total_pages=total_pages,
        doc_title=DOC_TITLE,
        ocr_text=ocr_text[:5000],
    )
    with open(png_path, "rb") as f:
        img_bytes = f.read()
    contents = [
        types.Part(
            inline_data=types.Blob(mime_type="image/png", data=img_bytes)
        ),
        types.Part(text=prompt),
    ]
    config = types.GenerateContentConfig(
        temperature=0.1,
        max_output_tokens=8192,
    )

    def _call():
        resp = client.models.generate_content(
            model=MODEL, contents=contents, config=config
        )
        if resp.text is not None:
            return resp.text
        # Safety block or empty response: salvage any text found in the parts.
        try:
            parts = resp.candidates[0].content.parts
            salvaged = "\n".join(
                p.text for p in parts if getattr(p, "text", None)
            )
        except Exception:
            return None
        # No text in any part is the same as no response; report it as None
        # so the caller does not try to parse an empty string as JSON.
        return salvaged or None

    # Do NOT use ``with ThreadPoolExecutor(...)`` here: leaving the with-block
    # calls shutdown(wait=True), which blocks until the worker finishes and
    # therefore defeats the timeout. shutdown(wait=False) lets the
    # TimeoutError reach the caller immediately; the abandoned worker thread
    # keeps running in the background until the request itself returns.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        return executor.submit(_call).result(timeout=PAGE_TIMEOUT)
    finally:
        executor.shutdown(wait=False)
def parse_page_json(raw_text, page_num, valid_types=None):
    """Parse the model's reply for one page into a normalized page dict.

    Markdown code fences are stripped first. If the remaining text is not
    valid JSON, the span from the first ``{`` to the last ``}`` is tried.

    Args:
        raw_text: Raw text returned by the model.
        page_num: Page number to stamp on the result; it overrides whatever
            the model reported.
        valid_types: Allowed chunk ``type`` strings. Defaults to the
            module-level ``VALID_TYPES``.

    Returns:
        On success, the parsed dict with ``page_number`` set and ``chunks``
        reduced to a list of dicts whose ``type`` is a valid type (anything
        else becomes ``"unknown"``). On failure, a dict with ``error`` set to
        ``"no_json_found"``, ``"json_parse_failed"`` or ``"json_not_object"``,
        an empty ``chunks`` list and the first 300 characters in ``raw``.
    """
    if valid_types is None:
        valid_types = VALID_TYPES

    text = raw_text.strip()
    text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE)
    text = re.sub(r"\s*```\s*$", "", text, flags=re.MULTILINE)
    text = text.strip()

    def _failure(code):
        return {"page_number": page_num, "error": code,
                "chunks": [], "raw": text[:300]}

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Try to extract the largest {...} block
        m = re.search(r"\{[\s\S]*\}", text)
        if not m:
            return _failure("no_json_found")
        try:
            data = json.loads(m.group(0))
        except json.JSONDecodeError:
            return _failure("json_parse_failed")

    # The model sometimes wraps the page object in a one-element array.
    if isinstance(data, list) and len(data) == 1:
        data = data[0]
    # Valid JSON that is not an object (list, string, number) cannot carry
    # page fields; report it instead of failing on the item assignment below.
    if not isinstance(data, dict):
        return _failure("json_not_object")

    data["page_number"] = page_num

    # Validate and normalize chunks: "chunks" may be missing or null, and
    # individual entries may not be objects.
    raw_chunks = data.get("chunks")
    if not isinstance(raw_chunks, list):
        raw_chunks = []
    chunks = [c for c in raw_chunks if isinstance(c, dict)]
    for c in chunks:
        ctype = c.get("type")
        # The isinstance check guards the set lookup against unhashable values.
        if not isinstance(ctype, str) or ctype not in valid_types:
            c["type"] = "unknown"
    data["chunks"] = chunks
    return data
def fallback_chunk(page_num, ocr_text):
    """Build a minimal one-chunk page for use when Gemini fails persistently.

    Args:
        page_num: Page number of the failed page.
        ocr_text: OCR text for the page; may be empty or ``None``.

    Returns:
        A page dict with bilingual summaries and a single full-page
        ``"unknown"`` chunk that quotes up to 200 characters of the OCR text.
    """
    # Strip BEFORE truncating: layout-preserved OCR often starts with blank
    # lines or indentation, and slicing first could leave an empty excerpt
    # even though the page has text.
    stripped = ocr_text.strip() if ocr_text else ""
    preview = stripped[:200].rstrip() if stripped else "(page content unavailable)"
    return {
        "page_number": page_num,
        "page_summary_en": f"Page {page_num} — content could not be parsed by vision model.",
        "page_summary_pt_br": f"Página {page_num} — conteúdo não pôde ser analisado pelo modelo de visão.",
        "page_layout": {"columns": 1, "orientation": "portrait", "page_dimensions_approx": "letter"},
        "chunks": [{
            "order_in_page": 1,
            "type": "unknown",
            # The chunk stands for the whole page.
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "content_en": f"[Vision analysis failed — OCR excerpt: {preview}]",
            "content_pt_br": f"[Análise de visão falhou — trecho OCR: {preview}]",
            "metadata": {
                "ocr_confidence": 0.0,
                "ocr_source_lines": [],
                "classification": None,
                "redaction_code": None,
                "redaction_inferred_content_type": None,
                "image_type": None,
                "formatting": [],
                "cross_page_hint": "self_contained",
                "prev_chunk_hint": None,
                "next_chunk_hint": None,
                "language_in_source": "en",
            },
        }],
    }
def process_page(page_num, png_path, ocr_path, total_pages, use_fallback=False):
    """Produce the page dict for one page; request failures never raise.

    Args:
        page_num: Page number.
        png_path: Path of the page PNG.
        ocr_path: Path of the OCR text file, or ``None`` when there is none.
        total_pages: Total number of pages in the document.
        use_fallback: When true, skip the model entirely and return the
            OCR-only fallback page.

    Returns:
        A parsed page dict, the fallback page, or an error dict of the form
        ``{"page_number": ..., "error": ..., "chunks": []}``.
    """
    ocr_text = (
        ocr_path.read_text(encoding="utf-8", errors="replace")
        if ocr_path
        else "(OCR not available)"
    )
    if use_fallback:
        return fallback_chunk(page_num, ocr_text)
    try:
        raw = call_gemini(png_path, ocr_text, page_num, total_pages)
        if raw is None:
            return {"page_number": page_num, "error": "gemini_none_response", "chunks": []}
        return parse_page_json(raw, page_num)
    except FuturesTimeout:
        return {"page_number": page_num, "error": "timeout", "chunks": []}
    except Exception as exc:
        # Some exceptions carry no message. An empty "error" string is falsy,
        # so the checkpoint logic would not record the page as failed; fall
        # back to the exception class name to keep the marker non-empty.
        message = str(exc)[:200] or type(exc).__name__
        return {"page_number": page_num, "error": message, "chunks": []}
def is_valid_page(p):
    """Return True when page dict *p* has at least one chunk and no error."""
    if p.get("error"):
        return False
    return bool(p.get("chunks"))
# ---------------------------------------------------------------------------
# Phase 1: process all pages
# ---------------------------------------------------------------------------
def phase_process_pages(pages):
    """Run the vision model over every page, resuming from the checkpoint.

    Valid pages found in ``PAGES_RAW`` are reused as-is. Pages recorded there
    with an error are not sent to the model again; they are rebuilt with the
    OCR-only fallback. All remaining pages are processed by ``MAX_WORKERS``
    threads, and the checkpoint is rewritten after every 10 finished pages
    and once more at the end.

    Args:
        pages: ``(page_number, png_path, ocr_path)`` tuples in reading order.

    Returns:
        Dict mapping page number to its page dict (reused or freshly built).
    """
    total = len(pages)
    print(f"[Phase 1] Processing {total} pages with {MODEL} ...")

    # Load existing checkpoint
    existing_map = {}
    failed_pages = set()
    if PAGES_RAW.exists():
        try:
            existing = json.loads(PAGES_RAW.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # A corrupt checkpoint means every page is reprocessed; say so
            # rather than silently starting over.
            print(f" WARN: ignoring unreadable checkpoint {PAGES_RAW.name}: {exc}")
        else:
            if not isinstance(existing, list):
                existing = []
            for p in existing:
                # Skip malformed entries instead of discarding the whole file.
                if not isinstance(p, dict) or "page_number" not in p:
                    continue
                if is_valid_page(p):
                    existing_map[p["page_number"]] = p
                elif p.get("error"):
                    failed_pages.add(p["page_number"])
            print(f" Checkpoint: {len(existing_map)} valid pages loaded, {len(failed_pages)} previously failed")

    to_process = [(pn, pp, op) for pn, pp, op in pages if pn not in existing_map]
    print(f" Remaining: {len(to_process)} pages")
    results_map = dict(existing_map)

    def _save_checkpoint():
        """Write all known results in page order; return how many were saved."""
        ordered = [results_map[p[0]] for p in pages if p[0] in results_map]
        # Write to a sibling temp file and rename it over the checkpoint so
        # an interrupted run cannot leave a truncated pages_raw.json behind.
        tmp_path = PAGES_RAW.with_name(PAGES_RAW.name + ".tmp")
        tmp_path.write_text(
            json.dumps(ordered, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        os.replace(tmp_path, PAGES_RAW)
        return len(ordered)

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {
            executor.submit(process_page, pn, pp, op, total, pn in failed_pages): pn
            for pn, pp, op in to_process
        }
        done = 0
        for future in as_completed(futures):
            pn = futures[future]
            done += 1
            try:
                result = future.result(timeout=PAGE_TIMEOUT + 30)
            except Exception as exc:
                result = {"page_number": pn, "error": str(exc)[:200], "chunks": []}
            results_map[pn] = result
            # "chunks" and "error" come from model output and may be null.
            nchunks = len(result.get("chunks") or [])
            if is_valid_page(result):
                status = "OK"
            else:
                status = f"ERR({str(result.get('error') or '?')[:40]})"
            print(f" [{done}/{len(to_process)}] p-{pn:03d}: {status} chunks={nchunks}")
            # Checkpoint every 10 pages
            if done % 10 == 0:
                _save_checkpoint()

    # Final save
    saved = _save_checkpoint()
    print(f" Saved {saved} pages to pages_raw.json")
    return results_map
# ---------------------------------------------------------------------------
# Phase 2: globally number chunks
# ---------------------------------------------------------------------------
def phase_number_chunks(pages, results_map):
    """Flatten all page chunks into reading order and number them globally.

    Every chunk dict is updated in place with ``chunk_id`` (``c0001``,
    ``c0002``, ...), ``order_global``, ``page`` and the ids of its
    ``prev_chunk`` / ``next_chunk`` neighbours (``None`` at either end).

    Args:
        pages: ``(page_number, png_path, ocr_path)`` tuples in reading order.
        results_map: Dict mapping page number to its page dict.

    Returns:
        List of ``(page_number, chunk_dict)`` tuples in global order.
    """
    def _order_key(chunk):
        # order_in_page comes from model output, so it may be missing, null,
        # a numeric string or some other type. Anything unusable sorts as 0,
        # the same position a missing key has always had, so the sort cannot
        # raise TypeError on mixed types.
        value = chunk.get("order_in_page", 0)
        if isinstance(value, str):
            try:
                value = float(value)
            except ValueError:
                return 0
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return 0
        if value != value:  # NaN would make the ordering undefined
            return 0
        return value

    print("[Phase 2] Globally numbering chunks ...")
    all_chunks = []  # list of (page_num, chunk_dict)
    for pn, _, _ in pages:
        page_chunks = results_map.get(pn, {}).get("chunks") or []
        # Only dict chunks can carry the ids assigned below.
        page_chunks = [c for c in page_chunks if isinstance(c, dict)]
        all_chunks.extend((pn, c) for c in sorted(page_chunks, key=_order_key))

    total_chunks = len(all_chunks)
    for i, (pn, c) in enumerate(all_chunks, 1):
        c["chunk_id"] = f"c{i:04d}"
        c["order_global"] = i
        c["page"] = pn
        c["prev_chunk"] = f"c{i-1:04d}" if i > 1 else None
        c["next_chunk"] = f"c{i+1:04d}" if i < total_chunks else None
    print(f" Total chunks: {total_chunks}")
    return all_chunks
# ---------------------------------------------------------------------------
# Phase 3: crop image chunks
# ---------------------------------------------------------------------------
def phase_crop_images(all_chunks, pages):
    """Crop every ``image`` chunk out of its page PNG into ``IMAGES_DIR``.

    The crop is saved as ``IMG-<chunk_id>.png`` and the file name is recorded
    on the chunk as ``related_image``. Failures are reported as warnings and
    do not stop the run.

    Args:
        all_chunks: ``(page_number, chunk_dict)`` tuples; chunks must already
            carry ``chunk_id``.
        pages: ``(page_number, png_path, ocr_path)`` tuples.
    """
    png_map = {pn: pp for pn, pp, _ in pages}
    image_chunks = [(pn, c) for pn, c in all_chunks if c.get("type") == "image"]
    print(f"[Phase 3] Cropping {len(image_chunks)} image chunks ...")
    pad = 0.005  # margin around the bbox, as a fraction of the page size
    for pn, c in image_chunks:
        cid = c["chunk_id"]
        image_name = f"IMG-{cid}.png"
        out_path = IMAGES_DIR / image_name
        if out_path.exists():
            # Still link the chunk to the existing crop; otherwise a resumed
            # run loses the image reference in the chunk file and document.md.
            # NOTE(review): chunk ids are positional, so a crop left over from
            # an earlier run may belong to a different chunk if the page
            # results changed - consider clearing images/ before a rebuild.
            c["related_image"] = image_name
            continue
        png_path = png_map.get(pn)
        if not png_path:
            continue
        bbox = c.get("bbox")
        # bbox comes from model output; anything but a non-empty dict is unusable.
        if not isinstance(bbox, dict) or not bbox:
            continue
        try:
            # Context manager so the page file handle is released per chunk.
            with Image.open(png_path) as im:
                W, H = im.size
                x = bbox.get("x", 0)
                y = bbox.get("y", 0)
                w = bbox.get("w", 1)
                h = bbox.get("h", 1)
                # bbox is normalized to 0..1; scale to pixels and clamp.
                left = max(0, int((x - pad) * W))
                top = max(0, int((y - pad) * H))
                right = min(W, int((x + w + pad) * W))
                bottom = min(H, int((y + h + pad) * H))
                if right > left and bottom > top:
                    im.crop((left, top, right, bottom)).save(out_path)
                    c["related_image"] = image_name
        except Exception as exc:
            print(f" WARN crop {cid}: {exc}")
# ---------------------------------------------------------------------------
# Phase 4: write chunk files
# ---------------------------------------------------------------------------
def phase_write_chunks(all_chunks, pages):
    """Write one Markdown file per chunk into ``CHUNKS_DIR``.

    Each file is ``<chunk_id>.md``: a front-matter block with the chunk's
    fields, followed by the English and Brazilian Portuguese content.

    Args:
        all_chunks: ``(page_number, chunk_dict)`` tuples; chunks must already
            carry ``chunk_id``.
        pages: ``(page_number, png_path, ocr_path)`` tuples.
    """
    def _num(value):
        # bbox values come from model output; treat anything non-numeric as 0.
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    png_map = {pn: pp for pn, pp, _ in pages}
    print(f"[Phase 4] Writing {len(all_chunks)} chunk files ...")
    for pn, c in all_chunks:
        cid = c["chunk_id"]
        chunk_path = CHUNKS_DIR / f"{cid}.md"
        # "metadata" and "bbox" may be null or of the wrong type in model
        # output; fall back to empty dicts so every field gets its default.
        meta = c.get("metadata")
        if not isinstance(meta, dict):
            meta = {}
        bbox = c.get("bbox")
        if not isinstance(bbox, dict):
            bbox = {}
        png_path = png_map.get(pn)
        # NOTE(review): with the configured directories this path resolves
        # from the document directory (RAW_DIR), not from chunks/, where the
        # file is written - presumably intentional; confirm with consumers.
        rel_png = f"../../processing/png/{DOC_ID}/{Path(str(png_path)).name}" if png_path else "null"
        yaml_lines = [
            "---",
            f"chunk_id: {cid}",
            f"type: {c.get('type', 'unknown')}",
            f"page: {pn}",
            f"order_in_page: {c.get('order_in_page', 0)}",
            f"order_global: {c.get('order_global', 0)}",
            f"bbox: {{x: {_num(bbox.get('x')):.4f}, y: {_num(bbox.get('y')):.4f}, w: {_num(bbox.get('w')):.4f}, h: {_num(bbox.get('h')):.4f}}}",
            f"classification: {json.dumps(meta.get('classification'))}",
            f"formatting: {json.dumps(meta.get('formatting', []))}",
            f"cross_page_hint: {meta.get('cross_page_hint', 'self_contained')}",
            f"prev_chunk: {json.dumps(c.get('prev_chunk'))}",
            f"next_chunk: {json.dumps(c.get('next_chunk'))}",
            f"related_image: {json.dumps(c.get('related_image'))}",
            f"related_table: {json.dumps(c.get('related_table'))}",
            # json.dumps so a null confidence is written as null, not "None".
            f"ocr_confidence: {json.dumps(meta.get('ocr_confidence', 0.0))}",
            f"ocr_source_lines: {json.dumps(meta.get('ocr_source_lines', []))}",
            f"redaction_code: {json.dumps(meta.get('redaction_code'))}",
            f"redaction_inferred_content_type: {json.dumps(meta.get('redaction_inferred_content_type'))}",
            f"image_type: {json.dumps(meta.get('image_type'))}",
            f"ufo_anomaly_detected: {str(c.get('ufo_anomaly_detected', False)).lower()}",
            f"cryptid_anomaly_detected: {str(c.get('cryptid_anomaly_detected', False)).lower()}",
            f"ufo_anomaly_type: {json.dumps(c.get('ufo_anomaly_type'))}",
            f"ufo_anomaly_rationale: {json.dumps(c.get('ufo_anomaly_rationale'))}",
            f"cryptid_anomaly_type: {json.dumps(c.get('cryptid_anomaly_type'))}",
            f"cryptid_anomaly_rationale: {json.dumps(c.get('cryptid_anomaly_rationale'))}",
            f"image_description_en: {json.dumps(c.get('image_description_en'))}",
            f"image_description_pt_br: {json.dumps(c.get('image_description_pt_br'))}",
            f"extracted_text: {json.dumps(c.get('extracted_text'))}",
            f"source_png: {rel_png}",
            "---",
        ]
        body = "\n".join(yaml_lines) + "\n\n"
        # "or ''" keeps a null content field from being written as "None".
        body += f"**EN:** {c.get('content_en') or ''}\n\n"
        body += f"**PT-BR:** {c.get('content_pt_br') or ''}\n"
        chunk_path.write_text(body, encoding="utf-8")
# ---------------------------------------------------------------------------
# Phase 5: write _index.json
# ---------------------------------------------------------------------------
def phase_write_index(all_chunks, pages):
    """Write ``_index.json``, the flat catalogue of all chunks, into ``RAW_DIR``.

    Args:
        all_chunks: ``(page_number, chunk_dict)`` tuples in global order;
            chunks must already carry ``chunk_id``.
        pages: ``(page_number, png_path, ocr_path)`` tuples; only the count
            is used.

    Returns:
        The UTC build timestamp written to the index, formatted as
        ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    total_pages = len(pages)
    total_chunks = len(all_chunks)
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # produces the same string with this format.
    build_at = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": MODEL,
        "build_at": build_at,
        "chunks": [],
    }
    for pn, c in all_chunks:
        cid = c["chunk_id"]
        index["chunks"].append({
            "chunk_id": cid,
            "type": c.get("type", "unknown"),
            "page": pn,
            "order_in_page": c.get("order_in_page", 0),
            "order_global": c.get("order_global", 0),
            "file": f"chunks/{cid}.md",
            # A null bbox in model output is written as an empty object.
            "bbox": c.get("bbox") or {},
            # First 80 characters of the English content.
            "preview": (c.get("content_en") or "")[:80],
        })
    index_path = RAW_DIR / "_index.json"
    index_path.write_text(json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[Phase 5] Written _index.json ({total_chunks} entries)")
    return build_at
# ---------------------------------------------------------------------------
# Phase 6: assemble document.md
# ---------------------------------------------------------------------------
def phase_assemble_document(all_chunks, pages, results_map, build_at):
    """Assemble ``document.md``, the single-file view of the whole document.

    The file starts with a front-matter block (counts, chunk-type histogram,
    flagged anomaly chunk ids, build info), followed by one ``## Page N``
    section per page holding every chunk's bilingual content, optional image
    link and descriptions, and a collapsible JSON metadata block.

    Args:
        all_chunks: ``(page_number, chunk_dict)`` tuples in global order;
            chunks must already carry ``chunk_id``.
        pages: ``(page_number, png_path, ocr_path)`` tuples in reading order.
        results_map: Dict mapping page number to its page dict (used for the
            page summaries).
        build_at: Build timestamp string to record in the front matter.

    Returns:
        ``(doc_bytes, ufo_flagged, cryptid_flagged)``: the UTF-8 size of the
        written file and the chunk ids flagged for each anomaly kind.
    """
    def _num(value):
        # bbox values come from model output; treat anything non-numeric as 0.
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    def _comment_safe(text):
        # A literal "-->" inside the text would end the HTML comment early.
        return str(text).replace("-->", "--&gt;")

    total_pages = len(pages)
    total_chunks = len(all_chunks)

    # Histograms + anomaly lists
    type_hist = {}
    ufo_flagged = []
    cryptid_flagged = []
    for _, c in all_chunks:
        ctype = c.get("type", "unknown")
        type_hist[ctype] = type_hist.get(ctype, 0) + 1
        if c.get("ufo_anomaly_detected"):
            ufo_flagged.append(c["chunk_id"])
        if c.get("cryptid_anomaly_detected"):
            cryptid_flagged.append(c["chunk_id"])

    frontmatter = "\n".join([
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {total_pages}",
        f"total_chunks: {total_chunks}",
        f"chunk_types_histogram: {json.dumps(type_hist, ensure_ascii=False)}",
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_flagged)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_flagged)}",
        'build_approach: "subagents"',
        f'build_model: "{MODEL}"',
        f'build_at: "{build_at}"',
        "---",
    ]) + "\n"

    # Group chunks by page
    chunks_by_page = {}
    for pn, c in all_chunks:
        chunks_by_page.setdefault(pn, []).append(c)

    body_parts = []
    for pn, _, _ in pages:
        pg = results_map.get(pn, {})
        summary_en = pg.get("page_summary_en", "")
        summary_pt = pg.get("page_summary_pt_br", "")
        body_parts.append(f"\n## Page {pn}\n")
        if summary_en:
            body_parts.append(f"<!-- page_summary_en: {_comment_safe(summary_en)} -->\n")
        if summary_pt:
            body_parts.append(f"<!-- page_summary_pt_br: {_comment_safe(summary_pt)} -->\n")
        body_parts.append("\n")
        for c in chunks_by_page.get(pn, []):
            cid = c["chunk_id"]
            ctype = c.get("type", "unknown")
            # "bbox" and "metadata" may be null or of the wrong type in
            # model output; use empty dicts so the defaults apply.
            bbox = c.get("bbox")
            if not isinstance(bbox, dict):
                bbox = {}
            meta = c.get("metadata")
            if not isinstance(meta, dict):
                meta = {}
            bbox_str = "/".join(f"{_num(bbox.get(k)):.2f}" for k in ("x", "y", "w", "h"))
            body_parts.append(f"<!-- chunk:{cid} src:./chunks/{cid}.md -->\n")
            body_parts.append(f'<a id="{cid}"></a>\n')
            # The id and the type used to be fused ("c0001paragraph"); they
            # are now separated like the other heading fields.
            body_parts.append(f"### Chunk {cid} · {ctype} · p{pn} · bbox: {bbox_str}\n\n")
            # "or ''" keeps a null content field from being written as "None".
            body_parts.append(f"**EN:** {c.get('content_en') or ''}\n\n")
            body_parts.append(f"**PT-BR:** {c.get('content_pt_br') or ''}\n\n")
            if ctype == "image" and c.get("related_image"):
                body_parts.append(f"![{cid}](./images/{c['related_image']})\n\n")
            if c.get("image_description_en"):
                body_parts.append(f"*Image (EN): {c['image_description_en']}*\n\n")
            if c.get("image_description_pt_br"):
                body_parts.append(f"*Imagem (PT-BR): {c['image_description_pt_br']}*\n\n")
            # Metadata details block
            meta_json = {
                "chunk_id": cid,
                "type": ctype,
                "page": pn,
                "order_global": c.get("order_global"),
                "bbox": bbox,
                "classification": meta.get("classification"),
                "formatting": meta.get("formatting", []),
                "cross_page_hint": meta.get("cross_page_hint"),
                "ocr_confidence": meta.get("ocr_confidence"),
                "ufo_anomaly_detected": c.get("ufo_anomaly_detected", False),
                "cryptid_anomaly_detected": c.get("cryptid_anomaly_detected", False),
            }
            body_parts.append("<details><summary>metadata</summary>\n\n")
            body_parts.append("```json\n")
            body_parts.append(json.dumps(meta_json, ensure_ascii=False, indent=2))
            body_parts.append("\n```\n\n</details>\n\n---\n\n")

    doc_content = frontmatter + "".join(body_parts)
    doc_path = RAW_DIR / "document.md"
    doc_path.write_text(doc_content, encoding="utf-8")
    doc_bytes = len(doc_content.encode("utf-8"))
    print(f"[Phase 6] Written document.md ({doc_bytes:,} bytes)")
    return doc_bytes, ufo_flagged, cryptid_flagged
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Run the six rebuild phases in order and print a one-line STATS summary.

    Exits with status 1 when no page PNGs are found.
    """
    started = time.time()

    # Make sure every output directory exists before any phase writes to it.
    for out_dir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        out_dir.mkdir(parents=True, exist_ok=True)

    pages = get_page_files()
    if not pages:
        print("ERROR: no PNG pages found", file=sys.stderr)
        sys.exit(1)
    total_pages = len(pages)
    print(f"Document: {DOC_ID}")
    print(f"Pages found: {total_pages}")

    results_map = phase_process_pages(pages)                # 1: vision + OCR per page
    all_chunks = phase_number_chunks(pages, results_map)    # 2: global chunk numbering
    phase_crop_images(all_chunks, pages)                    # 3: crop image chunks
    phase_write_chunks(all_chunks, pages)                   # 4: chunk .md files
    build_at = phase_write_index(all_chunks, pages)         # 5: _index.json
    assembled = phase_assemble_document(                    # 6: document.md
        all_chunks, pages, results_map, build_at
    )
    doc_bytes, ufo_flagged, cryptid_flagged = assembled

    wall = int(time.time() - started)
    # Counts are taken from the files present on disk after the run.
    images_count = len(list(IMAGES_DIR.glob("IMG-*.png")))
    tables_count = len(list(TABLES_DIR.glob("TBL-*.csv")))
    fields = [
        f"pages_done={total_pages}",
        f"chunks_total={len(all_chunks)}",
        f"images_extracted={images_count}",
        f"tables_stitched={tables_count}",
        f"ufo_anomalies={len(ufo_flagged)}",
        f"cryptid_anomalies={len(cryptid_flagged)}",
        f"wall_seconds={wall}",
    ]
    print("\nSTATS " + " ".join(fields))


if __name__ == "__main__":
    main()