disclosure-bureau/scripts/rebuild_doc65_full.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_full.py
Full pipeline to rebuild doc-65-hs1-834228961-62-hq-83894-section-4.
Uses Google Gemini flash for vision analysis of each page.
Generates chunks/, images/, _index.json, document.md
"""

import os
import sys
import json
import base64
import datetime
import time
import re
import concurrent.futures
from pathlib import Path
from PIL import Image as PILImage

# ---- Config ----
# Identity of the document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-4"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 4 — FBI Flying Discs / UAP Investigation Files"

# Input and output locations; every path is keyed by DOC_ID.
RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")
CHUNKS_DIR = RAW_DIR.joinpath("chunks")
IMAGES_DIR = RAW_DIR.joinpath("images")
TABLES_DIR = RAW_DIR.joinpath("tables")

# API credentials and concurrency limits.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")  # empty string when unset
BATCH_SIZE = 4  # pages per batch; kept small to stay within API limits
MAX_WORKERS = 4

# ---- Ensure dirs ----
# Output directories are created as soon as the module is loaded.
for d in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    d.mkdir(parents=True, exist_ok=True)

# ---- Page map ----
def build_page_map():
    """Pair page PNGs with OCR text files and number the pairs from 1.

    Both directories are listed, filtered by extension (``.png`` in
    PNG_DIR, ``.txt`` in OCR_DIR) and sorted by filename; the i-th PNG is
    paired with the i-th OCR file.

    Returns:
        dict mapping 1-based page number to a dict with keys ``png`` (full
        path as str), ``ocr`` (full path as str) and ``png_filename``.

    Pairing is purely positional, so if the two directories hold different
    numbers of files the surplus is dropped and pairs may be misaligned.
    A warning is written to stderr in that case; the returned map is the
    same one the positional pairing always produced.
    """
    pngs = sorted(name for name in os.listdir(PNG_DIR) if name.endswith('.png'))
    ocrs = sorted(name for name in os.listdir(OCR_DIR) if name.endswith('.txt'))

    if len(pngs) != len(ocrs):
        # zip() below stops at the shorter list; make that visible instead
        # of silently losing pages.
        print(
            f"WARNING: {len(pngs)} PNG files but {len(ocrs)} OCR files; "
            f"only the first {min(len(pngs), len(ocrs))} positional pairs are used",
            file=sys.stderr,
        )

    return {
        page_num: {
            'png': str(PNG_DIR / png),
            'ocr': str(OCR_DIR / ocr),
            'png_filename': png,
        }
        for page_num, (png, ocr) in enumerate(zip(pngs, ocrs), 1)
    }

def read_ocr(path):
    """Return the stripped UTF-8 text of an OCR file, or "" if unreadable.

    Args:
        path: filesystem path of the OCR text file.

    Returns:
        The file content with leading/trailing whitespace removed, or the
        empty string when the file is missing, unreadable, or not valid
        UTF-8.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: covers
        # UnicodeDecodeError and malformed path strings. Unlike a bare
        # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        return ""

def now_iso():
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    # datetime.utcnow() is deprecated (Python 3.12+); an aware "now" in UTC
    # renders to exactly the same string with this format.
    return datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

# ---- Gemini vision call ----
import google.generativeai as genai

# Configured once at import time. GEMINI_API_KEY is the empty string when the
# environment variable is not set.
genai.configure(api_key=GEMINI_API_KEY)

# Prompt template sent with every page image. It is filled in with the `%`
# operator and has a single %d slot (the page number), so the literal JSON
# braces in the text need no escaping.
# NOTE: the total page count (179) is hard-coded in the prompt text; it is not
# derived from the page map.
PAGE_ANALYSIS_PROMPT = """You are a document analyst rebuilding a declassified FBI UAP/flying saucer investigation file.

Analyze this page image carefully and return ONLY valid JSON (no markdown code fences, no explanation).

The JSON must have this exact structure:
{
  "page_number": <int>,
  "chunks": [
    {
      "type": "<one of: cover|letterhead|heading|subheading|body_text|caption|signature|stamp|redaction|table_marker|image|handwriting|footer|header|separator|blank>",
      "order_in_page": <int starting at 1>,
      "content_en": "<English text or description>",
      "content_pt_br": "<Brazilian Portuguese translation/description>",
      "bbox": {"x": <0.0-1.0>, "y": <0.0-1.0>, "w": <0.0-1.0>, "h": <0.0-1.0>},
      "classification": <null or "SECRET" or "TOP SECRET" etc>,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": <0.0-1.0>,
      "ocr_source_lines": [],
      "redaction_code": <null or "(b)(1)" etc>,
      "redaction_inferred_content_type": null,
      "image_type": <null or "photograph" or "diagram" or "sketch" or "stamp" or "logo">,
      "ufo_anomaly_detected": false,
      "cryptid_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null,
      "image_description_en": null,
      "image_description_pt_br": null,
      "extracted_text": null
    }
  ]
}

Rules:
- Identify ALL distinct content blocks (letterhead, classification markings, memo headers, body paragraphs, stamps, redactions, signatures, photos, etc.)
- For redacted areas: type="redaction", content_en="[REDACTED]", content_pt_br="[REDATADO]", include redaction_code if visible
- For blank pages: ONE chunk with type="blank"
- For stamps: type="stamp", include extracted_text with what the stamp says
- For signatures: type="signature"
- For photos/images: type="image", image_type appropriately, image_description_en with detailed description
- UAP/flying saucer content: set ufo_anomaly_detected=true and fill ufo_anomaly_type and ufo_anomaly_rationale
- bbox values are fractions of page dimensions (0.0 to 1.0)
- content_en must be verbatim OCR text where possible, or [description] for non-text
- content_pt_br must be Brazilian Portuguese translation
- This is page %d of 179 total
- Document: FBI investigation files about flying discs/UAP reports, 1947-era
"""

def _vision_failure_result(page_num):
    """Return the single-chunk placeholder used when vision analysis fails."""
    return {
        "page_number": page_num,
        "chunks": [{
            "type": "body_text",
            "order_in_page": 1,
            "content_en": f"[Page {page_num} — vision analysis failed]",
            "content_pt_br": f"[Página {page_num} — análise visual falhou]",
            "bbox": {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "cryptid_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None
        }]
    }


def _strip_code_fences(text):
    """Strip surrounding whitespace and a markdown code fence from a reply."""
    text = text.strip()
    if text.startswith('```'):
        text = re.sub(r'^```(?:json)?\s*', '', text)
        text = re.sub(r'\s*```$', '', text)
    return text


def _is_page_result(data):
    """Return True if *data* is a dict with a non-empty ``chunks`` list of dicts."""
    if not isinstance(data, dict):
        return False
    chunks = data.get('chunks')
    return (
        isinstance(chunks, list)
        and bool(chunks)
        and all(isinstance(chunk, dict) for chunk in chunks)
    )


def analyze_page_with_gemini(page_num, png_path, ocr_text, retry=3):
    """Ask Gemini flash to split one page image into structured chunks.

    Args:
        page_num: 1-based page number, substituted into the prompt.
        png_path: path of the page image to send.
        ocr_text: OCR text for the page; when non-empty its first 2000
            characters are appended to the prompt as a hint.
        retry: maximum number of API attempts.

    Returns:
        The parsed JSON reply (a dict with a non-empty ``chunks`` list of
        dicts). If the image cannot be read, or every attempt fails, a
        placeholder dict with one "vision analysis failed" chunk is
        returned instead. Errors are printed, not raised.
    """
    prompt = PAGE_ANALYSIS_PROMPT % page_num
    if ocr_text:
        prompt += f"\n\nOCR text available (may be incomplete):\n{ocr_text[:2000]}"

    # The image bytes do not change between attempts, so read them once.
    # An unreadable file will not become readable by retrying the API call.
    try:
        with open(png_path, 'rb') as f:
            img_data = f.read()
    except OSError as e:
        print(f"  Page {page_num}: Error (cannot read image): {e}")
        return _vision_failure_result(page_num)

    for attempt in range(retry):
        try:
            # Imported inside the try so that an import failure degrades to
            # the placeholder result like any other error.
            from google.generativeai.types import HarmCategory, HarmBlockThreshold

            model = genai.GenerativeModel('gemini-1.5-flash')
            response = model.generate_content(
                [
                    {"mime_type": "image/png", "data": img_data},
                    prompt
                ],
                generation_config={"temperature": 0.1, "max_output_tokens": 4096},
                safety_settings={
                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                }
            )

            data = json.loads(_strip_code_fences(response.text))
            # Syntactically valid JSON is not enough: the prompt requires at
            # least one chunk object per page, and callers rely on that
            # shape. Treat anything else as a failed attempt.
            if not _is_page_result(data):
                raise ValueError("response JSON lacks a non-empty 'chunks' list of objects")
            return data

        except json.JSONDecodeError as e:
            print(f"  Page {page_num}: JSON parse error (attempt {attempt+1}): {e}")
            if attempt < retry - 1:
                time.sleep(2)
        except Exception as e:
            print(f"  Page {page_num}: Error (attempt {attempt+1}): {e}")
            if attempt < retry - 1:
                time.sleep(3)

    # Fallback: minimal chunk
    return _vision_failure_result(page_num)

def process_page(args):
    """Analyze a single page and log progress to stdout.

    ``args`` is a ``(page_num, png_path, ocr_path)`` tuple. The OCR file is
    read, the page is sent for vision analysis, and ``(page_num, result)``
    is returned.
    """
    page_num, png_path, ocr_path = args
    page_text = read_ocr(ocr_path)
    print(f"  Processing page {page_num:03d}...", flush=True)
    analysis = analyze_page_with_gemini(page_num, png_path, page_text)
    chunk_count = len(analysis.get('chunks', []))
    print(f"  Done page {page_num:03d}: {chunk_count} chunks", flush=True)
    return page_num, analysis

def crop_image_for_chunk(page_png, bbox, out_path):
    """Crop the bbox region out of a page image and save it to out_path.

    Args:
        page_png: path of the full-page image.
        bbox: dict with fractional ``x``, ``y``, ``w``, ``h`` (0.0-1.0 of
            the page size); missing keys default to the whole page
            (x=0, y=0, w=1, h=1).
        out_path: destination path for the cropped image.

    Returns:
        True if a crop was written; False if the region is empty or any
        error occurred (the error is printed, not raised).
    """
    try:
        x = bbox.get('x', 0)
        y = bbox.get('y', 0)
        w = bbox.get('w', 1)
        h = bbox.get('h', 1)
        pad = 0.005  # small margin (fraction of page size) around the region
        # The context manager closes the underlying file handle, which a bare
        # PILImage.open() leaves open.
        with PILImage.open(page_png) as im:
            W, H = im.size
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                return False
            im.crop((left, top, right, bottom)).save(out_path)
        return True
    except Exception as e:
        # Best effort: a failed crop must not abort the whole rebuild.
        print(f"  Crop error: {e}")
        return False

# Strings equal to one of these (case-insensitively) would be read back from
# YAML as null/bool/float rather than as text if written unquoted.
_YAML_BARE_WORDS = frozenset({
    "", "~", "null", "true", "false", "yes", "no", "on", "off", ".inf", ".nan",
})
# Characters that make a plain (unquoted) YAML scalar ambiguous or invalid,
# including "," which would split an item inside a flow list.
_YAML_UNSAFE_CHARS = frozenset(":#[]{}*&!|>'\"\n,\\\t\r%@`")


def _yaml_needs_quotes(s):
    """Return True if string *s* is not safe to emit as a plain YAML scalar."""
    if s.lower() in _YAML_BARE_WORDS:
        return True
    if s != s.strip():
        return True
    # Leading digit/sign/indicator: could parse as a number, date, list item
    # or complex key.
    if s[0].isdigit() or s[0] in "-+?" or (s[0] == "." and s[1:2].isdigit()):
        return True
    return any(c in _YAML_UNSAFE_CHARS for c in s)


def _yaml_inline(v):
    """Render *v* as a single-line YAML value (flow style for containers)."""
    if v is None:
        return "null"
    if isinstance(v, bool):
        return str(v).lower()
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, list):
        return "[" + ", ".join(_yaml_inline(item) for item in v) + "]"
    if isinstance(v, dict):
        return "{" + ", ".join(f"{k}: {_yaml_inline(vv)}" for k, vv in v.items()) + "}"
    s = str(v)
    if _yaml_needs_quotes(s):
        # A JSON string literal is also a valid YAML double-quoted scalar and
        # escapes quotes, backslashes, newlines and control characters.
        return json.dumps(s, ensure_ascii=False)
    return s


def write_chunk_file(chunk_id, chunk_data, page_num, order_global, prev_chunk, next_chunk, source_png_filename):
    """Write one chunk as ``<chunk_id>.md`` in CHUNKS_DIR and return its metadata.

    The file consists of YAML front matter (the metadata dict, one key per
    line) followed by the English and Brazilian Portuguese content.

    Args:
        chunk_id: identifier used for the file name and ``chunk_id`` field.
        chunk_data: chunk dict as produced by the page analysis; missing
            keys fall back to defaults.
        page_num: page the chunk belongs to.
        order_global: 1-based position of the chunk in the whole document.
        prev_chunk: id of the preceding chunk, or None.
        next_chunk: id of the following chunk, or None.
        source_png_filename: file name of the page image, recorded as a
            relative ``source_png`` path.

    Returns:
        The metadata dict that was written to the front matter.
    """
    bbox = chunk_data.get('bbox')
    if not isinstance(bbox, dict):
        # Missing or null bbox from the model: fall back to the default region.
        bbox = {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90}

    # Image chunks point at the crop file name used for extracted images.
    related_image = None
    if chunk_data.get('type') == 'image':
        related_image = f"IMG-{chunk_id}.png"

    meta = {
        "chunk_id": chunk_id,
        "type": chunk_data.get('type', 'body_text'),
        "page": page_num,
        "order_in_page": chunk_data.get('order_in_page', 1),
        "order_global": order_global,
        "bbox": bbox,
        "classification": chunk_data.get('classification'),
        "formatting": chunk_data.get('formatting', []),
        "cross_page_hint": chunk_data.get('cross_page_hint', 'self_contained'),
        "prev_chunk": prev_chunk,
        "next_chunk": next_chunk,
        "related_image": related_image,
        "related_table": None,
        "ocr_confidence": chunk_data.get('ocr_confidence', 0.8),
        "ocr_source_lines": chunk_data.get('ocr_source_lines', []),
        "redaction_code": chunk_data.get('redaction_code'),
        "redaction_inferred_content_type": chunk_data.get('redaction_inferred_content_type'),
        "image_type": chunk_data.get('image_type'),
        "ufo_anomaly_detected": chunk_data.get('ufo_anomaly_detected', False),
        "cryptid_anomaly_detected": chunk_data.get('cryptid_anomaly_detected', False),
        "ufo_anomaly_type": chunk_data.get('ufo_anomaly_type'),
        "ufo_anomaly_rationale": chunk_data.get('ufo_anomaly_rationale'),
        "cryptid_anomaly_type": chunk_data.get('cryptid_anomaly_type'),
        "cryptid_anomaly_rationale": chunk_data.get('cryptid_anomaly_rationale'),
        "image_description_en": chunk_data.get('image_description_en'),
        "image_description_pt_br": chunk_data.get('image_description_pt_br'),
        "extracted_text": chunk_data.get('extracted_text'),
        "source_png": f"../../processing/png/{DOC_ID}/{source_png_filename}",
    }

    # A null content value would otherwise be rendered as the text "None".
    content_en = chunk_data.get('content_en')
    content_en = '' if content_en is None else content_en
    content_pt_br = chunk_data.get('content_pt_br')
    content_pt_br = '' if content_pt_br is None else content_pt_br

    lines = ["---"]
    lines.extend(f"{key}: {_yaml_inline(value)}" for key, value in meta.items())
    lines.extend([
        "---",
        "",
        f"**EN:** {content_en}",
        "",
        f"**PT-BR:** {content_pt_br}",
        "",
    ])

    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))

    return meta

def _processing_error_result(page_num):
    """Return the single-chunk placeholder used when a page worker raised."""
    return {
        "page_number": page_num,
        "chunks": [{
            "type": "body_text",
            "order_in_page": 1,
            "content_en": f"[Page {page_num} — processing error]",
            "content_pt_br": f"[Página {page_num} — erro de processamento]",
            "bbox": {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90},
            "classification": None, "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0, "ocr_source_lines": [],
            "redaction_code": None, "redaction_inferred_content_type": None,
            "image_type": None, "ufo_anomaly_detected": False,
            "cryptid_anomaly_detected": False,
            "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
            "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
            "image_description_en": None, "image_description_pt_br": None,
            "extracted_text": None
        }]
    }


def _analyze_pages_in_batches(page_map):
    """Run process_page over every page, BATCH_SIZE pages at a time.

    Returns a dict mapping page number to that page's result dict; a page
    whose worker raised gets the processing-error placeholder.
    """
    results = {}
    page_nums = list(page_map)
    total_pages = len(page_nums)

    for batch_start in range(0, total_pages, BATCH_SIZE):
        batch = page_nums[batch_start:batch_start + BATCH_SIZE]
        print(f"\nBatch {batch_start//BATCH_SIZE + 1}: pages {batch[0]}-{batch[-1]}", flush=True)

        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {
                executor.submit(process_page, (p, page_map[p]['png'], page_map[p]['ocr'])): p
                for p in batch
            }
            for future in concurrent.futures.as_completed(futures):
                page_num = futures[future]
                try:
                    # as_completed() only yields finished futures, so no
                    # timeout is needed (or would have any effect) here.
                    _, result = future.result()
                except Exception as e:
                    print(f"  Page {page_num} failed: {e}")
                    result = _processing_error_result(page_num)
                results[page_num] = result

        # Small pause between batches to be respectful of rate limits
        if batch_start + BATCH_SIZE < total_pages:
            time.sleep(1)

    return results


def _is_real_number(value):
    """Return True for int/float values, excluding bool."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _sanitize_chunk(chunk):
    """Return a copy of a model-produced chunk that is safe to sort and format.

    Fields whose value has an unusable type (for example JSON null) are
    removed so the ``.get(key, default)`` fallbacks used downstream apply,
    exactly as if the model had omitted the key. Well-formed chunks come
    back unchanged.
    """
    clean = dict(chunk)

    if 'type' in clean and not (isinstance(clean['type'], str) and clean['type']):
        del clean['type']

    if 'order_in_page' in clean and not _is_real_number(clean['order_in_page']):
        del clean['order_in_page']

    if 'bbox' in clean:
        bbox = clean['bbox']
        usable = isinstance(bbox, dict) and all(
            _is_real_number(bbox[k]) for k in ('x', 'y', 'w', 'h') if k in bbox
        )
        if not usable:
            del clean['bbox']

    for key in ('content_en', 'content_pt_br'):
        if key in clean:
            value = clean[key]
            if value is None:
                del clean[key]
            elif not isinstance(value, str):
                clean[key] = str(value)

    return clean


def _collect_ordered_chunks(all_page_results, page_map):
    """Flatten page results into document order.

    Returns a list of ``(page_num, chunk_data, source_png_filename)``
    tuples, ordered by page number and then by ``order_in_page``.
    """
    ordered = []
    for page_num in sorted(all_page_results):
        result = all_page_results[page_num]
        raw_chunks = result.get('chunks', []) if isinstance(result, dict) else []
        if not isinstance(raw_chunks, list):
            raw_chunks = []
        chunks = [_sanitize_chunk(c) for c in raw_chunks if isinstance(c, dict)]
        if not chunks:
            # Make a page that contributes nothing visible in the log.
            print(f"  Page {page_num}: no usable chunks returned")
        chunks.sort(key=lambda c: c.get('order_in_page', 1))
        source_png = page_map[page_num]['png_filename']
        ordered.extend((page_num, chunk, source_png) for chunk in chunks)
    return ordered


def main():
    """Rebuild the document: analyze pages, then write all output files.

    Steps, in order:
      1. Build the page map and analyze every page in batches.
      2. Order all chunks globally and assign ids ``c0001``, ``c0002``, ...
      3. For each chunk: crop its image (image chunks only), write its
         chunk file, and record an index entry.
      4. Write ``_index.json`` and ``document.md`` under RAW_DIR.
      5. Print a STATS line and the wall time.
    """
    from collections import Counter

    start_time = time.time()

    page_map = build_page_map()
    total_pages = len(page_map)
    print(f"Starting rebuild: {total_pages} pages")

    # Process all pages in batches of BATCH_SIZE
    all_page_results = _analyze_pages_in_batches(page_map)

    print("\nAll pages analyzed. Assigning global chunk IDs...")

    # --- Global chunk numbering ---
    all_chunks_ordered = _collect_ordered_chunks(all_page_results, page_map)
    total_chunks = len(all_chunks_ordered)
    print(f"Total chunks: {total_chunks}")

    chunk_id_list = [f"c{n:04d}" for n in range(1, total_chunks + 1)]

    index_entries = []
    chunks_by_page = {}  # page_num -> [(chunk_id, chunk_data, meta), ...]
    images_extracted = 0
    ufo_anomalies = []
    cryptid_anomalies = []

    print("Writing chunk files...")
    for i, (page_num, chunk_data, source_png) in enumerate(all_chunks_ordered):
        chunk_id = chunk_id_list[i]
        order_global = i + 1
        prev_chunk = chunk_id_list[i - 1] if i > 0 else None
        next_chunk = chunk_id_list[i + 1] if i < total_chunks - 1 else None

        # Crop image if needed
        if chunk_data.get('type') == 'image':
            img_out = IMAGES_DIR / f"IMG-{chunk_id}.png"
            if crop_image_for_chunk(page_map[page_num]['png'], chunk_data.get('bbox', {}), img_out):
                images_extracted += 1

        # Write chunk file
        meta = write_chunk_file(
            chunk_id, chunk_data, page_num, order_global,
            prev_chunk, next_chunk, source_png
        )
        # Keep the metadata next to its chunk so the document assembly below
        # does not have to recover it by parsing the chunk id.
        chunks_by_page.setdefault(page_num, []).append((chunk_id, chunk_data, meta))

        # Track anomalies
        if chunk_data.get('ufo_anomaly_detected'):
            ufo_anomalies.append(chunk_id)
        if chunk_data.get('cryptid_anomaly_detected'):
            cryptid_anomalies.append(chunk_id)

        # Index entry
        preview = chunk_data.get('content_en', '')[:80].replace('\n', ' ')
        index_entries.append({
            "chunk_id": chunk_id,
            "type": chunk_data.get('type', 'body_text'),
            "page": page_num,
            "order_in_page": chunk_data.get('order_in_page', 1),
            "order_global": order_global,
            "file": f"chunks/{chunk_id}.md",
            "bbox": chunk_data.get('bbox', {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90}),
            "preview": preview
        })

    # --- Write _index.json ---
    print("Writing _index.json...")
    build_at = now_iso()

    # Compute chunk type histogram
    type_hist = Counter(entry['type'] for entry in index_entries)

    # NOTE(review): build_approach/build_model below say "subagents" and
    # "claude-sonnet-4-6", while this script analyzes pages through the
    # google.generativeai client. Values are left as they were — confirm
    # what downstream consumers expect before changing them.
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_entries
    }

    with open(RAW_DIR / "_index.json", 'w', encoding='utf-8') as f:
        json.dump(index_data, f, indent=2, ensure_ascii=False)

    # --- Assemble document.md ---
    print("Assembling document.md...")

    hist_str = ", ".join(f'"{k}": {v}' for k, v in sorted(type_hist.items()))
    ufo_str = "[" + ", ".join(ufo_anomalies) + "]"
    cryptid_str = "[" + ", ".join(cryptid_anomalies) + "]"

    doc_lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {total_pages}",
        f"total_chunks: {total_chunks}",
        f"chunk_types_histogram: {{{hist_str}}}",
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {ufo_str}",
        f"cryptid_anomalies_flagged: {cryptid_str}",
        'build_approach: "subagents"',
        "build_model: claude-sonnet-4-6",
        f"build_at: {build_at}",
        "---",
        "",
    ]

    for page_num in sorted(chunks_by_page):
        doc_lines.append(f"## Page {page_num}")
        doc_lines.append("")

        for chunk_id, chunk_data, meta_dict in chunks_by_page[page_num]:
            ctype = chunk_data.get('type', 'body_text')
            bbox = chunk_data.get('bbox', {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90})
            bbox_str = f"{bbox.get('x',0):.2f}/{bbox.get('y',0):.2f}/{bbox.get('w',0):.2f}/{bbox.get('h',0):.2f}"

            doc_lines.extend([
                f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->",
                f'<a id="{chunk_id}"></a>',
                f"### Chunk {chunk_id} — {ctype} · p{page_num} · bbox: {bbox_str}",
                "",
                f"**EN:** {chunk_data.get('content_en', '')}",
                "",
                f"**PT-BR:** {chunk_data.get('content_pt_br', '')}",
                "",
            ])

            if ctype == 'image':
                img_path = f"./images/IMG-{chunk_id}.png"
                doc_lines.append(f"![{chunk_id} image]({img_path})")
                doc_lines.append("")
                desc = chunk_data.get('image_description_en', '')
                if desc:
                    doc_lines.append(f"*{desc}*")
                    doc_lines.append("")

            # Metadata details
            doc_lines.extend([
                "<details><summary>metadata</summary>",
                "",
                "```json",
                json.dumps(meta_dict, indent=2, ensure_ascii=False),
                "```",
                "",
                "</details>",
                "",
                "---",
                "",
            ])

    doc_content = '\n'.join(doc_lines)
    with open(RAW_DIR / "document.md", 'w', encoding='utf-8') as f:
        f.write(doc_content)

    doc_bytes = len(doc_content.encode('utf-8'))
    wall_seconds = int(time.time() - start_time)

    print(f"\nSTATS pages={total_pages} chunks={total_chunks} images={images_extracted} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_bytes}")
    print(f"Wall time: {wall_seconds}s")

# Run the rebuild pipeline only when this file is executed directly.
if __name__ == "__main__":
    main()