disclosure-bureau/scripts/rebuild_doc65_gemini.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_gemini.py
Full pipeline to rebuild doc-65-hs1-834228961-62-hq-83894-section-4.
Uses Google Gemini flash for vision analysis of each page.
CRITICAL: Always wraps Gemini calls with thread timeout (known hang issue).
"""

import os
import sys
import json
import datetime
import time
import re
import concurrent.futures
from pathlib import Path
from PIL import Image as PILImage

import warnings
warnings.filterwarnings('ignore')

# Identifier of the document being rebuilt; it is also the directory name used
# under each of the raw / png / ocr roots below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-4"
# Human-readable title written into the front matter of document.md.
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 4 — FBI Flying Discs / UAP Investigation Files"
# NOTE(review): absolute, machine-specific paths — the script only runs as-is
# on a host that has this /Users/guto/ufo layout.
RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")          # output root for this document
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")  # input: one .png per page
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")  # input: one .txt per page
CHUNKS_DIR = RAW_DIR / "chunks"   # per-chunk markdown files
IMAGES_DIR = RAW_DIR / "images"   # crops of chunks whose type is "image"
TABLES_DIR = RAW_DIR / "tables"   # created below; nothing in this file writes into it

# Falls back to an empty string when the environment variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = "gemini-2.5-flash"
BATCH_SIZE = 4              # pages submitted per batch in main()
MAX_WORKERS = 4             # worker threads per batch in main()
GEMINI_TIMEOUT_SEC = 120    # per-call wait limit used by analyze_page()

# Import-time side effect: make sure the output directories exist.
for d in [CHUNKS_DIR, IMAGES_DIR, TABLES_DIR]:
    d.mkdir(parents=True, exist_ok=True)

def build_page_map():
    """Pair page PNGs with OCR text files and number them from 1.

    Both directories are listed, filtered by extension and sorted by name;
    the two sorted lists are then zipped positionally, so the n-th PNG is
    matched with the n-th OCR file. If the lists differ in length the extra
    entries of the longer one are ignored (``zip`` semantics).

    Returns:
        dict mapping 1-based page number to a dict with the keys ``png``
        (full path as str), ``ocr`` (full path as str) and ``png_filename``.
    """
    png_names = sorted(name for name in os.listdir(PNG_DIR) if name.endswith('.png'))
    ocr_names = sorted(name for name in os.listdir(OCR_DIR) if name.endswith('.txt'))
    return {
        page_no: {
            'png': str(PNG_DIR / png_name),
            'ocr': str(OCR_DIR / ocr_name),
            'png_filename': png_name,
        }
        for page_no, (png_name, ocr_name) in enumerate(zip(png_names, ocr_names), start=1)
    }

def read_ocr(path):
    """Return the stripped UTF-8 text of an OCR file, or "" if it is unreadable.

    Args:
        path: Filesystem path of the OCR ``.txt`` file.

    Returns:
        The file content with leading/trailing whitespace removed. An empty
        string is returned when the file is missing, is not a regular file,
        cannot be opened, or is not valid UTF-8.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except (OSError, ValueError):
        # OSError: missing / unreadable / directory.
        # ValueError: UnicodeDecodeError and malformed path strings.
        # Deliberately narrower than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return ""

def now_iso():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

    Uses a timezone-aware ``datetime.now(timezone.utc)`` instead of the
    deprecated naive ``datetime.utcnow()``; the formatted string is the same.
    """
    return datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

# Compact prompt that minimizes token usage in the response.
# The template contains exactly two %d placeholders; analyze_page() fills both
# with the page number (the "page_number" JSON field and the "Page %d of 179"
# line).
# NOTE(review): the total page count "179" is hard-coded in the prompt text and
# is not derived from the page map — confirm it matches the actual document.
PAGE_ANALYSIS_PROMPT = """Analyze this FBI declassified document page. Return ONLY raw JSON (no markdown fences).

JSON format (keep content_en values SHORT — max 300 chars per chunk, truncate with "..." if needed):
{"page_number":%d,"chunks":[{"type":"<cover|letterhead|heading|subheading|body_text|caption|signature|stamp|redaction|table_marker|image|handwriting|footer|header|separator|blank>","order_in_page":<int>,"content_en":"<text>","content_pt_br":"<pt-br>","bbox":{"x":<0-1>,"y":<0-1>,"w":<0-1>,"h":<0-1>},"classification":null,"formatting":[],"cross_page_hint":"self_contained","ocr_confidence":<0-1>,"ocr_source_lines":[],"redaction_code":null,"redaction_inferred_content_type":null,"image_type":null,"ufo_anomaly_detected":<true|false>,"cryptid_anomaly_detected":false,"ufo_anomaly_type":null,"ufo_anomaly_rationale":null,"cryptid_anomaly_type":null,"cryptid_anomaly_rationale":null,"image_description_en":null,"image_description_pt_br":null,"extracted_text":null}]}

Rules:
- Each paragraph/section = separate chunk
- Redacted: type=redaction, content_en="[REDACTED]"
- Blank page: one chunk type=blank
- Flying disc/UAP reports: ufo_anomaly_detected=true
- bbox: x=left, y=top, w=width, h=height, all 0.0-1.0
- Page %d of 179, FBI flying discs 1947"""

def fallback_chunk(page_num):
    """Build the placeholder page result used when vision analysis fails.

    Args:
        page_num: Page number to embed in the result and in the messages.

    Returns:
        A fresh dict shaped like a successful page analysis: ``page_number``
        plus a ``chunks`` list holding one full-page ``body_text`` chunk with
        zero OCR confidence and no anomaly flags.
    """
    placeholder = {
        "type": "body_text",
        "order_in_page": 1,
        "content_en": f"[Page {page_num} — vision analysis failed]",
        "content_pt_br": f"[Página {page_num} — análise visual falhou]",
        # Nearly the whole page, with a 5% margin on every side.
        "bbox": {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90},
        "classification": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "ocr_confidence": 0.0,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "cryptid_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
        "image_description_en": None,
        "image_description_pt_br": None,
        "extracted_text": None,
    }
    return {"page_number": page_num, "chunks": [placeholder]}

def _gemini_call_inner(page_num, png_path, prompt_text):
    """Send one page image plus prompt to Gemini and return the reply text.

    Intended to run inside a worker thread so the caller can bound the wait.

    Args:
        page_num: Page number; not used inside this function.
        png_path: Path of the page PNG to upload.
        prompt_text: Fully formatted prompt to send with the image.

    Returns:
        The ``text`` attribute of the ``generate_content`` response.
    """
    import google.genai as genai
    import google.genai.types as gTypes

    # A new client is created for every call.
    client = genai.Client(api_key=GEMINI_API_KEY)

    with open(png_path, 'rb') as png_file:
        png_payload = png_file.read()

    request_parts = [
        gTypes.Part.from_bytes(data=png_payload, mime_type='image/png'),
        prompt_text
    ]
    generation_config = gTypes.GenerateContentConfig(
        temperature=0.1,
        max_output_tokens=16384,
    )
    reply = client.models.generate_content(
        model=GEMINI_MODEL,
        contents=request_parts,
        config=generation_config,
    )
    return reply.text

def clean_json_text(text):
    """Extract a JSON object from a model reply, repairing truncation if possible.

    Steps:
      1. Strip whitespace and markdown code fences.
      2. Try to parse the whole text; accept it only if it is a JSON object.
      3. Otherwise start at the first ``{`` and walk backwards over every
         ``}``: for each prefix ending at that brace, try it as-is and with
         the closers ``]}`` / ``]}}`` appended (closing a cut-off ``chunks``
         array). The first prefix that parses to an object wins.

    Args:
        text: Raw reply text, or None.

    Returns:
        The parsed dict, or None when ``text`` is None or no JSON object can
        be recovered. Non-object JSON (lists, numbers, strings) is never
        returned because the caller indexes the result by key.
    """
    if text is None:
        return None
    text = text.strip()
    # Remove markdown fences
    text = re.sub(r'^```(?:json)?\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\s*```\s*$', '', text, flags=re.MULTILINE)
    text = text.strip()

    # Try direct parse first. A non-dict result (e.g. the object wrapped in
    # an array) is not returned; it falls through to object extraction below.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Locate the start of the JSON object
    start = text.find('{')
    if start == -1:
        return None
    text_from_start = text[start:]

    # Walk backwards over closing braces, trying progressively shorter
    # prefixes until one parses (optionally with the array/root closed).
    last_bracket = text_from_start.rfind('}')
    while last_bracket > 0:
        candidate = text_from_start[:last_bracket + 1]
        for suffix in ('', ']}', ']}}'):
            try:
                result = json.loads(candidate + suffix)
            except (ValueError, RecursionError):
                # json.JSONDecodeError is a ValueError; anything else
                # (KeyboardInterrupt, ...) is allowed to propagate.
                continue
            if isinstance(result, dict):
                return result
        last_bracket = text_from_start.rfind('}', 0, last_bracket)

    return None

def analyze_page(page_num, png_path, ocr_text, retry=3):
    """Run vision analysis for one page with a hard per-attempt timeout.

    Each attempt runs ``_gemini_call_inner`` in a single-worker thread pool
    and waits at most ``GEMINI_TIMEOUT_SEC`` seconds for the reply.

    Args:
        page_num: Page number, substituted into both prompt placeholders.
        png_path: Path of the page PNG.
        ocr_text: OCR text of the page. Accepted for interface compatibility;
            it is not used by this function.
        retry: Maximum number of attempts.

    Returns:
        The parsed page dict (with a non-empty ``chunks`` list) on success,
        otherwise ``fallback_chunk(page_num)`` after all attempts fail.
    """
    prompt = PAGE_ANALYSIS_PROMPT % (page_num, page_num)

    for attempt in range(retry):
        is_last_attempt = attempt >= retry - 1
        # The pool is deliberately NOT used as a context manager: leaving a
        # ``with ThreadPoolExecutor`` block calls shutdown(wait=True), which
        # would block on a hung Gemini call and defeat the timeout below.
        pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        try:
            future = pool.submit(_gemini_call_inner, page_num, png_path, prompt)
            text = future.result(timeout=GEMINI_TIMEOUT_SEC)

            data = clean_json_text(text)
            if isinstance(data, dict) and data.get('chunks'):
                return data

            print(f"  P{page_num} no valid JSON (attempt {attempt+1})", flush=True)
            if not is_last_attempt:
                time.sleep(2)

        except concurrent.futures.TimeoutError:
            print(f"  P{page_num} TIMEOUT (attempt {attempt+1}/{retry})", flush=True)
            if not is_last_attempt:
                time.sleep(5)
        except Exception as e:
            err = str(e)
            print(f"  P{page_num} error (attempt {attempt+1}): {err[:120]}", flush=True)
            is_rate_limited = '429' in err or 'RESOURCE_EXHAUSTED' in err
            if is_last_attempt:
                # Nothing left to retry, so backing off would only add delay.
                pass
            elif is_rate_limited:
                wait = 15 * (attempt + 1)
                print(f"  Rate limit hit, waiting {wait}s...", flush=True)
                time.sleep(wait)
            else:
                time.sleep(3)
        finally:
            # Abandon the worker instead of joining it. A call that really is
            # hung keeps its thread alive in the background.
            # NOTE(review): executor worker threads are still joined at
            # interpreter exit, so a permanently hung call may delay process
            # shutdown — confirm whether that matters for this script.
            pool.shutdown(wait=False)

    return fallback_chunk(page_num)

def process_page_task(args):
    """Worker entry point: analyze one page and report its chunk count.

    Args:
        args: Tuple ``(page_num, png_path, ocr_path)``.

    Returns:
        Tuple ``(page_num, result)`` where ``result`` is whatever
        ``analyze_page`` returned for the page.
    """
    page_num, png_path, ocr_path = args
    page_ocr = read_ocr(ocr_path)
    result = analyze_page(page_num, png_path, page_ocr)
    chunk_count = len(result.get('chunks', []))
    print(f"  P{page_num:03d}: {chunk_count} chunks", flush=True)
    return page_num, result

def crop_image_chunk(page_png, bbox, out_path):
    """Crop a relative bounding box out of a page image and save it.

    Args:
        page_png: Path of the full-page PNG.
        bbox: Dict with relative ``x``, ``y``, ``w``, ``h`` values (fractions
            of page width/height). Missing keys default to x=0, y=0, w=0.5,
            h=0.5; negative x/y are clamped to 0 and w/h to at least 0.01.
        out_path: Destination path for the cropped image.

    Returns:
        True if a crop was written; False if the resulting box is empty or
        any error occurred (the error is printed, not raised).
    """
    try:
        # Context manager so the underlying file handle is closed after each
        # crop instead of being left open.
        with PILImage.open(page_png) as im:
            W, H = im.size
            x = max(0.0, float(bbox.get('x', 0)))
            y = max(0.0, float(bbox.get('y', 0)))
            w = max(0.01, float(bbox.get('w', 0.5)))
            h = max(0.01, float(bbox.get('h', 0.5)))
            pad = 0.005  # small relative margin added on every side
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                return False
            # Save while the source image is still open.
            crop = im.crop((left, top, right, bottom))
            crop.save(str(out_path))
        return True
    except Exception as e:
        # Best-effort: one bad crop must not abort the whole build.
        print(f"  Crop error: {e}", flush=True)
        return False

# Characters that force double-quoting wherever they appear in a string.
_YAML_ALWAYS_QUOTE_CHARS = (':', '#', '[', ']', '{', '}', '|', '>', '*', '&', '!', "'", '"', '\n', '\r')
# Plain scalars a YAML loader would read as null / bool / special float
# instead of a string (compared case-insensitively). "" would read as null.
_YAML_RESERVED_WORDS = frozenset({
    '', '~', 'null', 'true', 'false', 'yes', 'no', 'on', 'off', 'y', 'n',
    '.inf', '.nan',
})
# First characters that start a number/date or a YAML indicator.
_YAML_RISKY_FIRST_CHARS = frozenset("0123456789+-?@`%~,")


def _yaml_string(s, in_flow):
    """Render a string as a plain scalar, or double-quoted when plain is unsafe."""
    needs_quote = (
        s.lower() in _YAML_RESERVED_WORDS
        or s != s.strip()
        or s[0] in _YAML_RISKY_FIRST_CHARS
        or (s[0] == '.' and s[1:2].isdigit())
        or any(c in s for c in _YAML_ALWAYS_QUOTE_CHARS)
        # A comma would split the item inside a flow list/dict.
        or (in_flow and ',' in s)
    )
    if needs_quote:
        s = s.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n').replace('\r', '')
        return f'"{s}"'
    return s


def _yaml_render(v, in_flow):
    """Render ``v`` as inline YAML; ``in_flow`` is True inside a list or dict."""
    if v is None:
        return "null"
    if isinstance(v, bool):  # must precede the int check: bool is an int subclass
        return "true" if v else "false"
    if isinstance(v, float):
        # str() would give "nan"/"inf", which YAML reads back as strings.
        if v != v:
            return ".nan"
        if v in (float('inf'), float('-inf')):
            return ".inf" if v > 0 else "-.inf"
        return str(v)
    if isinstance(v, int):
        return str(v)
    if isinstance(v, list):
        if not v:
            return "[]"
        return "[" + ", ".join(_yaml_render(i, True) for i in v) + "]"
    if isinstance(v, dict):
        return "{" + ", ".join(f"{k}: {_yaml_render(vv, True)}" for k, vv in v.items()) + "}"
    return _yaml_string(str(v), in_flow)


def yaml_scalar(v):
    """Render a Python value as a single-line YAML value.

    None, bools, numbers, lists (flow sequence) and dicts (flow mapping) are
    rendered structurally; anything else is converted with ``str``. Strings
    are left plain when that is unambiguous and double-quoted otherwise:
    when empty, when they would be read back as null/bool/number/date, when
    they have leading or trailing whitespace, when they contain a YAML
    indicator character, or when they contain a comma inside a list/dict.
    In quoted strings backslashes and double quotes are escaped, newlines
    become ``\\n`` and carriage returns are dropped.

    Args:
        v: Value to render.

    Returns:
        The YAML text for ``v`` (no trailing newline).
    """
    return _yaml_render(v, in_flow=False)

# Fallback box: nearly the full page with a 5% margin on each side.
_BBOX_DEFAULTS = {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90}


def bbox_safe(bbox):
    """Return a bbox dict with float ``x``, ``y``, ``w``, ``h`` that never raises.

    Args:
        bbox: Value taken from model output; expected to be a dict but may be
            None, empty, of another type, or contain null / non-numeric
            entries.

    Returns:
        A new dict with exactly the keys ``x``, ``y``, ``w``, ``h``. When
        ``bbox`` is falsy or not a dict, all four defaults are returned. A
        key that is missing or whose value cannot be converted with
        ``float`` gets its default (previously such a value raised).
    """
    if not bbox or not isinstance(bbox, dict):
        return dict(_BBOX_DEFAULTS)
    safe = {}
    for key, default in _BBOX_DEFAULTS.items():
        try:
            safe[key] = float(bbox.get(key, default))
        except (TypeError, ValueError):
            # e.g. JSON null or a non-numeric string in the model output
            safe[key] = default
    return safe

def write_chunk_file(chunk_id, chunk_data, page_num, order_global, prev_chunk, next_chunk, source_png_filename):
    """Write ``<chunk_id>.md`` into CHUNKS_DIR and return its metadata dict.

    The file consists of a ``---``-delimited front-matter block (one
    ``key: value`` line per metadata field, rendered with ``yaml_scalar``)
    followed by the English and Brazilian-Portuguese content lines.

    Args:
        chunk_id: Identifier used for the file name and the ``chunk_id`` field.
        chunk_data: Chunk dict from the page analysis; missing keys fall back
            to the defaults visible below.
        page_num: Page number the chunk belongs to.
        order_global: 1-based position of the chunk in the whole document.
        prev_chunk: Id of the preceding chunk, or None.
        next_chunk: Id of the following chunk, or None.
        source_png_filename: File name of the page PNG, used to build the
            relative ``source_png`` path.

    Returns:
        The metadata dict that was serialized into the front matter.
    """
    read = chunk_data.get
    chunk_type = read('type', 'body_text')
    is_image = chunk_type == 'image'

    meta = {
        "chunk_id": chunk_id,
        "type": chunk_type,
        "page": page_num,
        "order_in_page": read('order_in_page', 1),
        "order_global": order_global,
        "bbox": bbox_safe(read('bbox')),
        "classification": read('classification'),
        "formatting": read('formatting', []),
        "cross_page_hint": read('cross_page_hint', 'self_contained'),
        "prev_chunk": prev_chunk,
        "next_chunk": next_chunk,
        # Only image chunks point at a cropped image file.
        "related_image": f"IMG-{chunk_id}.png" if is_image else None,
        "related_table": None,
        "ocr_confidence": read('ocr_confidence', 0.8),
        "ocr_source_lines": read('ocr_source_lines', []),
        "redaction_code": read('redaction_code'),
        "redaction_inferred_content_type": read('redaction_inferred_content_type'),
        "image_type": read('image_type'),
        "ufo_anomaly_detected": bool(read('ufo_anomaly_detected', False)),
        "cryptid_anomaly_detected": bool(read('cryptid_anomaly_detected', False)),
        "ufo_anomaly_type": read('ufo_anomaly_type'),
        "ufo_anomaly_rationale": read('ufo_anomaly_rationale'),
        "cryptid_anomaly_type": read('cryptid_anomaly_type'),
        "cryptid_anomaly_rationale": read('cryptid_anomaly_rationale'),
        "image_description_en": read('image_description_en'),
        "image_description_pt_br": read('image_description_pt_br'),
        "extracted_text": read('extracted_text'),
        "source_png": f"../../processing/png/{DOC_ID}/{source_png_filename}",
    }

    def front_matter_line(key, value):
        # Dict values are written as an inline {k: v, ...} mapping.
        if isinstance(value, dict):
            pairs = ", ".join(f"{kk}: {yaml_scalar(vv)}" for kk, vv in value.items())
            return f"{key}: {{{pairs}}}"
        return f"{key}: {yaml_scalar(value)}"

    file_lines = [
        "---",
        *(front_matter_line(key, value) for key, value in meta.items()),
        "---",
        "",
        f"**EN:** {read('content_en', '')}",
        "",
        f"**PT-BR:** {read('content_pt_br', '')}",
        "",
    ]

    target = CHUNKS_DIR / f"{chunk_id}.md"
    target.write_text('\n'.join(file_lines), encoding='utf-8')

    return meta

def main():
    """Run the full rebuild: analyze pages, then write chunks, index and document.

    Stages, in order:
      1. Build the page map and load any partial results cache from RAW_DIR.
      2. Analyze the remaining pages in batches of BATCH_SIZE using a thread
         pool, rewriting the cache file after every batch.
      3. Flatten all page results into one globally ordered chunk list and
         assign ids ``c0001``, ``c0002``, ...
      4. Crop the page image for every chunk whose type is ``image``.
      5. Write one markdown file per chunk, then ``_index.json`` and
         ``document.md`` under RAW_DIR.
      6. Delete the cache file and print summary statistics.
    """
    start_time = time.time()
    page_map = build_page_map()
    total_pages = len(page_map)
    print(f"Pages: {total_pages}, Model: {GEMINI_MODEL}", flush=True)

    all_page_results = {}
    page_nums = list(page_map.keys())

    # Resume support: results of earlier (partial) runs are kept in this file.
    cache_file = RAW_DIR / "_page_results_cache.json"
    if cache_file.exists():
        print("Loading partial cache...", flush=True)
        with open(str(cache_file), 'r', encoding='utf-8') as f:
            cached = json.load(f)
        # JSON object keys are strings; convert back to int page numbers.
        all_page_results = {int(k): v for k, v in cached.items()}
        print(f"  Loaded {len(all_page_results)} cached pages", flush=True)

    pages_to_process = [p for p in page_nums if p not in all_page_results]
    print(f"Pages to process: {len(pages_to_process)}", flush=True)

    # Ceiling division: number of batches needed for the remaining pages.
    total_batches = (len(pages_to_process) + BATCH_SIZE - 1) // BATCH_SIZE

    for batch_idx, batch_start in enumerate(range(0, len(pages_to_process), BATCH_SIZE)):
        batch = pages_to_process[batch_start:batch_start + BATCH_SIZE]
        batch_args = [(p, page_map[p]['png'], page_map[p]['ocr']) for p in batch]

        print(f"Batch {batch_idx+1}/{total_batches}: pages {batch[0]}-{batch[-1]}", flush=True)

        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {executor.submit(process_page_task, args): args[0] for args in batch_args}
            # NOTE(review): as_completed() raises TimeoutError when the batch
            # does not finish within 600s; that exception is not caught here,
            # and leaving the `with` block waits for the workers to finish.
            for future in concurrent.futures.as_completed(futures, timeout=600):
                page_num = futures[future]
                try:
                    pn, result = future.result(timeout=5)
                    all_page_results[pn] = result
                except Exception as e:
                    # A failed task still yields a placeholder page result.
                    print(f"  P{page_num} future error: {e}", flush=True)
                    all_page_results[page_num] = fallback_chunk(page_num)

        # Persist everything collected so far after each batch.
        # NOTE(review): placeholder (fallback) results are cached as well, so a
        # resumed run will not retry those pages — confirm that is intended.
        with open(str(cache_file), 'w', encoding='utf-8') as f:
            json.dump({str(k): v for k, v in all_page_results.items()}, f, ensure_ascii=False)
        print(f"  Cache: {len(all_page_results)} pages", flush=True)

        # Short pause between batches (skipped after the last one).
        if batch_start + BATCH_SIZE < len(pages_to_process):
            time.sleep(1)

    print(f"\nAll pages processed. Building output...", flush=True)

    # Flatten to (page_num, chunk, png filename) in page order, then by the
    # chunk's order_in_page within each page.
    all_chunks_ordered = []
    for page_num in sorted(all_page_results.keys()):
        result = all_page_results[page_num]
        chunks = sorted(result.get('chunks', []), key=lambda c: c.get('order_in_page', 1))
        # Assumes every cached page number is present in the current page map;
        # a stale cache entry would raise KeyError here.
        source_png = page_map[page_num]['png_filename']
        for chunk in chunks:
            all_chunks_ordered.append((page_num, chunk, source_png))

    total_chunks = len(all_chunks_ordered)
    print(f"Total chunks: {total_chunks}", flush=True)

    # Global ids: c0001, c0002, ... in document order.
    chunk_id_list = [f"c{(i+1):04d}" for i in range(total_chunks)]

    print("Cropping image chunks...", flush=True)
    images_extracted = 0
    for i, (page_num, chunk_data, source_png) in enumerate(all_chunks_ordered):
        if chunk_data.get('type') == 'image':
            chunk_id = chunk_id_list[i]
            bbox = bbox_safe(chunk_data.get('bbox'))
            img_out = IMAGES_DIR / f"IMG-{chunk_id}.png"
            png_path = page_map[page_num]['png']
            if crop_image_chunk(png_path, bbox, img_out):
                images_extracted += 1

    print("Writing chunk files...", flush=True)
    index_entries = []
    all_chunk_meta = []
    ufo_anomalies = []
    cryptid_anomalies = []

    for i, (page_num, chunk_data, source_png) in enumerate(all_chunks_ordered):
        chunk_id = chunk_id_list[i]
        order_global = i + 1
        # Neighbour links; None at either end of the document.
        prev_chunk = chunk_id_list[i-1] if i > 0 else None
        next_chunk = chunk_id_list[i+1] if i < total_chunks - 1 else None

        meta = write_chunk_file(chunk_id, chunk_data, page_num, order_global,
                                prev_chunk, next_chunk, source_png)
        all_chunk_meta.append(meta)

        if chunk_data.get('ufo_anomaly_detected'):
            ufo_anomalies.append(chunk_id)
        if chunk_data.get('cryptid_anomaly_detected'):
            cryptid_anomalies.append(chunk_id)

        # Index preview: first 80 characters of the English content, one line.
        content_en = str(chunk_data.get('content_en', ''))
        preview = content_en[:80].replace('\n', ' ')
        index_entries.append({
            "chunk_id": chunk_id,
            "type": chunk_data.get('type', 'body_text'),
            "page": page_num,
            "order_in_page": chunk_data.get('order_in_page', 1),
            "order_global": order_global,
            "file": f"chunks/{chunk_id}.md",
            "bbox": bbox_safe(chunk_data.get('bbox')),
            "preview": preview
        })

    print("Writing _index.json...", flush=True)
    build_at = now_iso()
    # Count of chunks per chunk type.
    type_hist = {}
    for entry in index_entries:
        t = entry['type']
        type_hist[t] = type_hist.get(t, 0) + 1

    # NOTE(review): build_approach / build_model are fixed literals here (and
    # again in the document.md front matter below), while page analysis uses
    # GEMINI_MODEL — confirm these provenance values are intentional.
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_entries
    }
    with open(str(RAW_DIR / "_index.json"), 'w', encoding='utf-8') as f:
        json.dump(index_data, f, indent=2, ensure_ascii=False)

    print("Assembling document.md...", flush=True)
    # Front matter of the master document.
    doc_lines = []
    doc_lines.append("---")
    doc_lines.append('schema_version: "0.2.0"')
    doc_lines.append("type: master_document")
    doc_lines.append(f"doc_id: {DOC_ID}")
    doc_lines.append(f'canonical_title: "{DOC_TITLE}"')
    doc_lines.append(f"total_pages: {total_pages}")
    doc_lines.append(f"total_chunks: {total_chunks}")
    hist_str = ", ".join(f'"{k}": {v}' for k, v in sorted(type_hist.items()))
    doc_lines.append(f"chunk_types_histogram: {{{hist_str}}}")
    doc_lines.append("multi_page_tables: []")
    doc_lines.append(f"ufo_anomalies_flagged: [{', '.join(ufo_anomalies)}]")
    doc_lines.append(f"cryptid_anomalies_flagged: [{', '.join(cryptid_anomalies)}]")
    doc_lines.append('build_approach: "subagents"')
    doc_lines.append("build_model: claude-sonnet-4-6")
    doc_lines.append(f"build_at: {build_at}")
    doc_lines.append("---")
    doc_lines.append("")

    # Group (chunk_id, chunk, meta) triples by page, keeping global order.
    chunks_by_page = {}
    for i, (page_num, chunk_data, source_png) in enumerate(all_chunks_ordered):
        if page_num not in chunks_by_page:
            chunks_by_page[page_num] = []
        chunks_by_page[page_num].append((chunk_id_list[i], chunk_data, all_chunk_meta[i]))

    # Body: one "## Page N" section per page, one subsection per chunk.
    for page_num in sorted(chunks_by_page.keys()):
        doc_lines.append(f"## Page {page_num}")
        doc_lines.append("")
        for chunk_id, chunk_data, meta in chunks_by_page[page_num]:
            ctype = chunk_data.get('type', 'body_text')
            bbox = bbox_safe(chunk_data.get('bbox'))
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

            doc_lines.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
            doc_lines.append(f'<a id="{chunk_id}"></a>')
            doc_lines.append(f"### Chunk {chunk_id} — {ctype} · p{page_num} · bbox: {bbox_str}")
            doc_lines.append("")
            doc_lines.append(f"**EN:** {chunk_data.get('content_en', '')}")
            doc_lines.append("")
            doc_lines.append(f"**PT-BR:** {chunk_data.get('content_pt_br', '')}")
            doc_lines.append("")

            # Image chunks get an image link (written even if the crop
            # failed earlier) and, when present, the description in italics.
            if ctype == 'image':
                doc_lines.append(f"![{chunk_id} image](./images/IMG-{chunk_id}.png)")
                doc_lines.append("")
                desc = chunk_data.get('image_description_en', '')
                if desc:
                    doc_lines.append(f"*{desc}*")
                    doc_lines.append("")

            # Full chunk metadata as a collapsible JSON block.
            doc_lines.append("<details><summary>metadata</summary>")
            doc_lines.append("")
            doc_lines.append("```json")
            doc_lines.append(json.dumps(meta, indent=2, ensure_ascii=False))
            doc_lines.append("```")
            doc_lines.append("")
            doc_lines.append("</details>")
            doc_lines.append("")
            doc_lines.append("---")
            doc_lines.append("")

    doc_content = '\n'.join(doc_lines)
    with open(str(RAW_DIR / "document.md"), 'w', encoding='utf-8') as f:
        f.write(doc_content)

    doc_bytes = len(doc_content.encode('utf-8'))
    wall_seconds = int(time.time() - start_time)

    # The build reached the end, so the resume cache is removed.
    if cache_file.exists():
        os.remove(str(cache_file))

    # "tables=0" is a literal: this function never counts or writes tables.
    print(f"\nSTATS pages={total_pages} chunks={total_chunks} images={images_extracted} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_bytes}")
    print(f"Wall time: {wall_seconds}s")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()