# disclosure-bureau/scripts/rebuild_d49.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild DOW-UAP-D49 launch summary into harness-assemblable structure.
Processes all 78 PNG pages, writes chunks, _index.json, and document.md.
"""

import os
import sys
import json
import re
import base64
import datetime
import time
from pathlib import Path
from PIL import Image
import anthropic

# Identity of the document this script rebuilds.
DOC_ID = "dow-uap-d49-launch-summary-february-2000"
DOC_TITLE = "Vandenberg AFB Launch Summary 1958–2000"

# Inputs: per-page PNG renders and their OCR text files, both keyed by DOC_ID.
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID

# Outputs: everything is written under OUT_DIR.
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

# PNG file names found in PNG_DIR, in lexicographic order. This listing runs
# at import time, so PNG_DIR must exist before the module is loaded.
PNG_PAGES = sorted(name for name in os.listdir(PNG_DIR) if name.endswith(".png"))
TOTAL_PAGES = len(PNG_PAGES)

# API client created at import time.
# NOTE(review): no function visible in this file references `client` — confirm
# whether it is still needed.
client = anthropic.Anthropic()

def read_ocr(page_stem: str) -> str:
    """Return the OCR text stored for ``page_stem`` (e.g. ``p-001``).

    The text is read from ``OCR_DIR/<page_stem>.txt`` as UTF-8, with
    undecodable bytes replaced. An empty string is returned when that file
    does not exist.
    """
    candidate = OCR_DIR / f"{page_stem}.txt"
    if not candidate.exists():
        return ""
    return candidate.read_text(encoding="utf-8", errors="replace")

def encode_image_b64(path: str) -> str:
    """Return the contents of the file at ``path`` as a base64 text string."""
    raw_bytes = Path(path).read_bytes()
    return base64.b64encode(raw_bytes).decode("utf-8")

def classify_page(ocr_text: str, page_num: int) -> str:
    """Classify a page by keyword heuristics over its OCR text.

    Args:
        ocr_text: OCR text of the page (matched case-insensitively).
        page_num: 1-based page number; page 1 is always the cover.

    Returns:
        One of ``"cover"``, ``"toc"``, ``"distribution"``, ``"foreword"``,
        ``"glossary"``, ``"summary_table"``, ``"facility_guide"``,
        ``"chronology"`` or ``"body"``. Checks run in that order and the
        first match wins.
    """
    text = ocr_text.strip().lower()
    if page_num == 1:
        return "cover"

    # The table of contents must be tested before the section keywords:
    # a contents page lists the other sections by name ("Foreword",
    # "Glossary", "Distribution List", ...), so any of the substring checks
    # below would otherwise claim it first.
    first_line = text.split("\n", 1)[0]
    if "table of contents" in text or "contents" in first_line:
        return "toc"

    if "distribution list" in text:
        return "distribution"
    if "foreword" in text or "preface" in text:
        return "foreword"
    # Length cap keeps long pages that merely mention "glossary" out.
    if "glossary" in text and len(text) < 2000:
        return "glossary"
    if "annual launch summary" in text and (
        "chart" in text or "launch vehicle" in text or "launch agency" in text
    ):
        return "summary_table"
    if "launch facility guide" in text:
        return "facility_guide"

    # Chronology pages: either the column header row (seq ... date ...
    # nickname on one line) or an entry shaped like "12. 3 Mar 61".
    has_header_row = re.search(r'\bseq\b.*\bdate\b.*\bnickname\b', text, re.IGNORECASE)
    has_entry_row = re.search(r'\d+\.\s+\d{1,2}\s+\w+\s+\d{2}', text)
    if has_header_row or has_entry_row:
        return "chronology"

    return "body"

def determine_chunk_type(content: str, page_type: str) -> str:
    """Map a page's content and page type to a chunk type.

    Args:
        content: Text of the chunk.
        page_type: Page classification as produced by ``classify_page``.

    Returns:
        ``"letterhead"`` for cover pages, ``"table_marker"`` for summary
        tables and chronologies, ``"section_header"`` for short all-uppercase
        content on any other unlisted page type, and ``"body_text"``
        otherwise.
    """
    if page_type == "cover":
        return "letterhead"
    if page_type in ("summary_table", "chronology"):
        return "table_marker"
    if page_type in ("glossary", "foreword", "distribution", "facility_guide", "toc"):
        return "body_text"

    # Remaining page types: at most three lines, all uppercase, is treated
    # as a standalone heading.
    stripped = content.strip()
    if len(stripped.split("\n")) <= 3 and stripped.isupper():
        return "section_header"
    return "body_text"

def _split_heading_and_body(lines: list) -> tuple:
    """Split page lines into ``(heading_lines, body_lines)``.

    The heading is the run of leading lines that are non-empty, shorter than
    80 characters, and either all-uppercase or made only of capitals, digits,
    whitespace and ``- . / : ,``. Heading lines are returned stripped; body
    lines are returned unmodified. The first line that fails the heading test
    ends the heading, and is dropped if it is blank.
    """
    heading_lines = []
    body_lines = []
    in_heading = True

    for line in lines:
        if not in_heading:
            body_lines.append(line)
            continue

        stripped = line.strip()
        # Ignore blank lines that precede any heading text.
        if not stripped and not heading_lines:
            continue

        looks_like_heading = (
            stripped
            and len(stripped) < 80
            and (stripped.isupper() or re.match(r'^[A-Z\s\-\./:,0-9]+$', stripped))
        )
        if looks_like_heading:
            heading_lines.append(stripped)
        else:
            in_heading = False
            if stripped:
                body_lines.append(line)

    return heading_lines, body_lines


def build_page_chunks(page_num: int, page_stem: str, ocr_text: str) -> list:
    """Build the chunk dicts for a single page from its OCR text.

    Args:
        page_num: 1-based page number (passed to ``classify_page``).
        page_stem: Page file stem such as ``p-001``; used to build the
            relative ``source_png`` path.
        ocr_text: OCR text of the page; may be empty.

    Returns:
        A list of chunk dicts in page order:
        * one ``image`` chunk when there is no OCR text,
        * one ``letterhead`` chunk for the cover page,
        * a ``section_header`` chunk followed by a body chunk when the page
          has both leading heading lines and body lines,
        * otherwise a single chunk typed by ``determine_chunk_type``.

        Bounding boxes and OCR confidences are fixed per-layout constants,
        not measured values.
    """
    page_type = classify_page(ocr_text, page_num)
    source_png = f"../../processing/png/{DOC_ID}/{page_stem}.png"
    text = ocr_text.strip()

    def make_chunk(chunk_type, raw, en, pt_br, order, bbox, confidence, source_lines):
        # Single place that fixes the chunk schema shared by every branch.
        return {
            "type": chunk_type,
            "page_type": page_type,
            "content_raw": raw,
            "content_en": en,
            "content_pt_br": pt_br,
            "order_in_page": order,
            "bbox": bbox,
            "source_png": source_png,
            "ocr_confidence": confidence,
            "ocr_source_lines": source_lines,
        }

    if not text:
        # Image-only page. Only the cover may be described as the cover
        # image; any other page lacking OCR text gets a neutral placeholder
        # instead of being mislabelled as the cover.
        if page_type == "cover":
            en = "[Cover image — Vandenberg AFB Launch Summary 1958–2000]"
            pt_br = "[Imagem da capa — Resumo de Lançamentos da Base Aérea Vandenberg 1958–2000]"
        else:
            en = "[Image-only page — no OCR text available]"
            pt_br = "[Página somente com imagem — sem texto OCR disponível]"
        return [
            make_chunk(
                "image", "", en, pt_br, 1,
                {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
                0.0, [],
            )
        ]

    lines = text.split("\n")
    all_line_numbers = list(range(1, len(lines) + 1))

    if page_type == "cover":
        # The cover is emitted as one chunk with blank lines removed.
        content = "\n".join(
            stripped for stripped in (line.strip() for line in lines) if stripped
        )
        return [
            make_chunk(
                "letterhead", content, content,
                translate_to_ptbr_simple(content, page_type), 1,
                {"x": 0.05, "y": 0.1, "w": 0.9, "h": 0.8},
                0.92, all_line_numbers,
            )
        ]

    heading_lines, body_lines = _split_heading_and_body(lines)

    if heading_lines and body_lines:
        heading_content = "\n".join(heading_lines)
        heading_chunk = make_chunk(
            "section_header", heading_content, heading_content,
            translate_to_ptbr_simple(heading_content, "section_header"), 1,
            {"x": 0.05, "y": 0.02, "w": 0.9, "h": 0.12},
            0.93, list(range(1, len(heading_lines) + 1)),
        )
        heading_chunk["formatting"] = ["bold", "all_caps"]

        body_content = "\n".join(body_lines)
        body_type = (
            "table_marker" if page_type in ("summary_table", "chronology") else "body_text"
        )
        body_chunk = make_chunk(
            body_type, body_content, body_content,
            translate_to_ptbr_simple(body_content, page_type), 2,
            {"x": 0.02, "y": 0.14, "w": 0.98, "h": 0.84},
            0.88, list(range(len(heading_lines) + 1, len(lines) + 1)),
        )
        return [heading_chunk, body_chunk]

    # No heading/body split: the whole page becomes one chunk.
    # determine_chunk_type already yields "table_marker" for summary-table
    # and chronology pages, so no extra override is needed here.
    content = "\n".join(lines)
    return [
        make_chunk(
            determine_chunk_type(content, page_type), content, content,
            translate_to_ptbr_simple(content, page_type), 1,
            {"x": 0.02, "y": 0.02, "w": 0.96, "h": 0.96},
            0.88, all_line_numbers,
        )
    ]


# English phrase -> PT-BR gloss for labels that recur in this document.
_PT_BR_GLOSSES = {
    "FOREWORD": "PREFÁCIO",
    "GLOSSARY": "GLOSSÁRIO",
    "DISTRIBUTION LIST": "LISTA DE DISTRIBUIÇÃO",
    "TABLE OF CONTENTS": "SUMÁRIO",
    "ANNUAL LAUNCH SUMMARY BY BOOSTER": "RESUMO ANUAL DE LANÇAMENTOS POR VEÍCULO",
    "ANNUAL LAUNCH SUMMARY BY COMMAND": "RESUMO ANUAL DE LANÇAMENTOS POR COMANDO",
    "LAUNCH FACILITY GUIDE": "GUIA DE INSTALAÇÕES DE LANÇAMENTO",
    "Office of History": "Escritório de História",
    "As of": "Em",
    "LAUNCH": "LANÇAMENTO",
    "VEHICLE": "VEÍCULO",
    "COMMAND": "COMANDO",
    "PROGRAM": "PROGRAMA",
    "SPACE": "ESPAÇO",
    "TOTAL": "TOTAL",
    "SUBTOTAL": "SUBTOTAL",
    "Grand Total": "Total Geral",
    "GRAND TOTAL": "TOTAL GERAL",
}

# One alternation over all phrases, longest first, so that a full phrase such
# as "LAUNCH FACILITY GUIDE" wins over the shorter "LAUNCH" it contains.
# Lookarounds restrict matches to whole words ("LAUNCHES" and "AEROSPACE"
# are left alone). Entries whose gloss equals the source are skipped because
# "TOTAL / TOTAL" carries no information.
_PT_BR_PATTERN = re.compile(
    r"(?<!\w)(?:"
    + "|".join(
        re.escape(phrase)
        for phrase in sorted(_PT_BR_GLOSSES, key=len, reverse=True)
        if _PT_BR_GLOSSES[phrase] != phrase
    )
    + r")(?!\w)"
)


def translate_to_ptbr_simple(text: str, context: str) -> str:
    """Annotate known English labels in ``text`` with a PT-BR gloss.

    This is a phrase lookup, not a translation: each known phrase is
    rewritten as ``"<english> / <portuguese>"`` and everything else is left
    untouched.

    Args:
        text: Source text.
        context: Page or chunk type. For ``"summary_table"``,
            ``"chronology"`` and ``"table_marker"`` the text is returned
            unchanged so tabular data stays verbatim.

    Returns:
        The annotated text. Substitution happens in a single pass, so a gloss
        that has just been inserted is never itself re-annotated.
    """
    if context in ("summary_table", "chronology", "table_marker"):
        return text  # Data stays verbatim

    return _PT_BR_PATTERN.sub(
        lambda match: f"{match.group(0)} / {_PT_BR_GLOSSES[match.group(0)]}",
        text,
    )


def fmt_chunk_id(n: int) -> str:
    """Return the chunk id for sequence number ``n``: ``c`` + ``n`` zero-padded to four digits."""
    padded = format(n, "04d")
    return "c" + padded


def write_chunk_file(chunk_id: str, chunk: dict, page_num: int) -> None:
    """Write one chunk as ``CHUNKS_DIR/<chunk_id>.md``.

    The file consists of a YAML front-matter block followed by the English
    and PT-BR content.

    Args:
        chunk_id: Identifier used for the file name and the ``chunk_id``
            field.
        chunk: Chunk dict. ``type``, ``order_in_page``, ``order_global``,
            ``source_png``, ``content_en`` and ``content_pt_br`` are
            required; ``bbox``, ``formatting``, ``ocr_confidence``,
            ``ocr_source_lines``, ``prev_chunk``, ``next_chunk``,
            ``related_image``, ``related_table`` and ``image_type`` are
            optional.
        page_num: Page number recorded in the front matter.

    Raises:
        KeyError: If a required chunk key is missing.
    """
    path = CHUNKS_DIR / (chunk_id + ".md")

    def yaml_val(v):
        """Render a scalar for the front matter; missing values become null."""
        if v is None or v == "null":
            return "null"
        if isinstance(v, bool):
            return str(v).lower()
        if isinstance(v, (int, float)):
            return str(v)
        # A JSON string literal is also a valid YAML double-quoted scalar,
        # and json.dumps escapes embedded quotes and backslashes.
        return json.dumps(str(v), ensure_ascii=False)

    def link(v):
        """Render a prev/next pointer: the bare chunk id, or null if absent."""
        return "null" if v is None else v

    prev_chunk = link(chunk.get("prev_chunk", "null"))
    next_chunk = link(chunk.get("next_chunk", "null"))

    bbox = chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0})
    bbox_str = (
        f"{{x: {bbox['x']:.2f}, y: {bbox['y']:.2f}, "
        f"w: {bbox['w']:.2f}, h: {bbox['h']:.2f}}}"
    )

    formatting = chunk.get("formatting", [])
    fmt_str = "[" + ", ".join(f'"{item}"' for item in formatting) + "]"

    # Long line lists are abbreviated to first, second, "...", last.
    ocr_lines = chunk.get("ocr_source_lines", [])
    if len(ocr_lines) > 10:
        ocr_lines_str = f"[{ocr_lines[0]}, {ocr_lines[1]}, \"...\", {ocr_lines[-1]}]"
    else:
        ocr_lines_str = "[" + ", ".join(str(number) for number in ocr_lines) + "]"

    related_image = yaml_val(chunk.get("related_image", "null"))
    related_table = yaml_val(chunk.get("related_table", "null"))
    # Any falsy image_type (None, empty string) is written as null.
    image_type_raw = chunk.get("image_type", "null")
    image_type = yaml_val(image_type_raw) if image_type_raw else "null"

    chunk_type = chunk["type"]
    order_in_page = chunk["order_in_page"]
    order_global = chunk["order_global"]
    ocr_confidence = chunk.get("ocr_confidence", 0.88)
    source_png = chunk["source_png"]
    content_en = chunk["content_en"]
    content_pt_br = chunk["content_pt_br"]

    content = f"""---
chunk_id: {chunk_id}
type: {chunk_type}
page: {page_num}
order_in_page: {order_in_page}
order_global: {order_global}
bbox: {bbox_str}
classification: null
formatting: {fmt_str}
cross_page_hint: self_contained
prev_chunk: {prev_chunk}
next_chunk: {next_chunk}
related_image: {related_image}
related_table: {related_table}
ocr_confidence: {ocr_confidence:.2f}
ocr_source_lines: {ocr_lines_str}
redaction_code: null
redaction_inferred_content_type: null
image_type: {image_type}
ufo_anomaly_detected: false
cryptid_anomaly_detected: false
ufo_anomaly_type: null
ufo_anomaly_rationale: null
cryptid_anomaly_type: null
cryptid_anomaly_rationale: null
image_description_en: null
image_description_pt_br: null
extracted_text: null
source_png: {source_png}
---

**EN:** {content_en}

**PT-BR:** {content_pt_br}
"""
    path.write_text(content, encoding="utf-8")


def main():
    """Rebuild the document: chunk every page, then write all outputs.

    Steps: create the output directories, build chunks for each PNG page
    from its OCR text, number the chunks globally and link prev/next, write
    one Markdown file per chunk, write ``_index.json``, write
    ``document.md``, and print a one-line run summary.
    """
    start_time = time.time()

    # Ensure output dirs exist
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    print(f"Processing {TOTAL_PAGES} pages...")

    # Per-page results: (page_num, page_stem, [chunks])
    all_pages_chunks = []

    for page_num, png_file in enumerate(PNG_PAGES, start=1):
        # Strip only the trailing extension; str.replace would also remove
        # any ".png" occurring earlier in the file name.
        page_stem = png_file[: -len(".png")]

        ocr_text = read_ocr(page_stem)
        chunks = build_page_chunks(page_num, page_stem, ocr_text)
        all_pages_chunks.append((page_num, page_stem, chunks))
        print(f"  Page {page_num:3d}/{TOTAL_PAGES} ({page_stem}): {len(chunks)} chunk(s)")

    # Number chunks globally, in page order: (chunk_id, page_num, chunk_dict)
    all_chunks_flat = []
    for page_num, page_stem, chunks in all_pages_chunks:
        for chunk in chunks:
            global_order = len(all_chunks_flat) + 1
            chunk_id = fmt_chunk_id(global_order)
            chunk["chunk_id"] = chunk_id
            chunk["order_global"] = global_order
            chunk["page_num"] = page_num
            chunk["page_stem"] = page_stem
            all_chunks_flat.append((chunk_id, page_num, chunk))

    total_chunks = len(all_chunks_flat)
    print(f"Total chunks: {total_chunks}")

    # Link neighbours; the string "null" marks the ends of the chain.
    for i, (_, _, chunk) in enumerate(all_chunks_flat):
        chunk["prev_chunk"] = all_chunks_flat[i - 1][0] if i > 0 else "null"
        chunk["next_chunk"] = all_chunks_flat[i + 1][0] if i < total_chunks - 1 else "null"

    print("Writing chunk files...")
    for chunk_id, page_num, chunk in all_chunks_flat:
        write_chunk_file(chunk_id, chunk, page_num)

    print("Writing _index.json...")
    index_chunks = [
        {
            "chunk_id": chunk_id,
            "type": chunk["type"],
            "page": page_num,
            "order_in_page": chunk["order_in_page"],
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk_id}.md",
            "bbox": chunk["bbox"],
            # First 80 characters of the English content, on one line.
            "preview": chunk["content_en"][:80].replace("\n", " ").strip(),
        }
        for chunk_id, page_num, chunk in all_chunks_flat
    ]

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop tzinfo so the emitted format ("...Z") stays the same.
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    build_at = now_utc.replace(tzinfo=None).isoformat() + "Z"

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # Chunk-type frequencies for the document.md header.
    type_histogram = {}
    for _, _, chunk in all_chunks_flat:
        chunk_type = chunk["type"]
        type_histogram[chunk_type] = type_histogram.get(chunk_type, 0) + 1

    n_images = sum(1 for _, _, chunk in all_chunks_flat if chunk["type"] == "image")

    print("Writing document.md...")
    build_document_md(all_chunks_flat, all_pages_chunks, type_histogram, build_at, n_images)

    elapsed = int(time.time() - start_time)
    print(f"\nDone in {elapsed}s")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={n_images}, tables_stitched=0, ufo_anomalies=0, cryptid_anomalies=0, wall_seconds={elapsed}")


def build_document_md(all_chunks_flat, all_pages_chunks, type_histogram, build_at, n_images):
    """Write ``OUT_DIR/document.md``, the master document assembled from all chunks.

    Args:
        all_chunks_flat: List of ``(chunk_id, page_num, chunk_dict)`` in
            global order.
        all_pages_chunks: Unused; kept so existing callers keep working.
        type_histogram: Mapping of chunk type to count, written to the header.
        build_at: Build timestamp string written to the header.
        n_images: Unused; kept so existing callers keep working.

    The document has a YAML header, then one ``## Page N`` section per page
    containing each chunk's EN/PT-BR content and a collapsible JSON metadata
    block.
    """
    total_chunks = len(all_chunks_flat)

    histogram_yaml = "\n".join(f"  {k}: {v}" for k, v in sorted(type_histogram.items()))

    header = f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {TOTAL_PAGES}
total_chunks: {total_chunks}
chunk_types_histogram:
{histogram_yaml}
multi_page_tables: []
ufo_anomalies_flagged: []
cryptid_anomalies_flagged: []
build_approach: "subagents"
build_model: claude-sonnet-4-6
build_at: {build_at}
---

"""

    def pointer(value):
        # Upstream marks "no neighbour" with the string "null"; emit a real
        # JSON null so the metadata block does not carry a literal "null" id.
        return None if value == "null" else value

    # Group chunks by page, preserving global order within each page.
    pages_dict = {}
    for chunk_id, page_num, chunk in all_chunks_flat:
        pages_dict.setdefault(page_num, []).append((chunk_id, chunk))

    body_parts = [header]

    for page_num in sorted(pages_dict):
        body_parts.append(f"## Page {page_num}\n\n")

        for chunk_id, chunk in pages_dict[page_num]:
            bbox = chunk["bbox"]
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

            body_parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->\n")
            body_parts.append(f'<a id="{chunk_id}"></a>\n')
            body_parts.append(f"### Chunk {chunk_id} — {chunk['type']} · p{page_num} · bbox: {bbox_str}\n\n")

            content_en = chunk["content_en"]
            content_pt = chunk["content_pt_br"]

            if chunk["type"] == "table_marker":
                # Tables/chronologies go in code fences to keep column alignment.
                body_parts.append(f"**EN:**\n\n```\n{content_en}\n```\n\n")
                body_parts.append(f"**PT-BR:**\n\n```\n{content_pt}\n```\n\n")
            else:
                body_parts.append(f"**EN:** {content_en}\n\n")
                body_parts.append(f"**PT-BR:** {content_pt}\n\n")
                if chunk["type"] == "image":
                    related_img = chunk.get("related_image")
                    if related_img and related_img != "null":
                        body_parts.append(f"![chunk image](./images/{related_img})\n\n")

            # Metadata details block
            meta = {
                "chunk_id": chunk_id,
                "type": chunk["type"],
                "page": page_num,
                "order_in_page": chunk["order_in_page"],
                "order_global": chunk["order_global"],
                "bbox": chunk["bbox"],
                "classification": None,
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": "self_contained",
                "prev_chunk": pointer(chunk.get("prev_chunk")),
                "next_chunk": pointer(chunk.get("next_chunk")),
                "ocr_confidence": chunk.get("ocr_confidence", 0.88),
                "ufo_anomaly_detected": False,
                "cryptid_anomaly_detected": False,
            }
            meta_json = json.dumps(meta, ensure_ascii=False, indent=2)
            body_parts.append(f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n")

    doc_content = "".join(body_parts)
    (OUT_DIR / "document.md").write_text(doc_content, encoding="utf-8")
    # Report the UTF-8 encoded size: the text contains non-ASCII characters,
    # so len(doc_content) would count characters, not bytes.
    byte_count = len(doc_content.encode("utf-8"))
    print(f"document.md written ({byte_count:,} bytes)")


# Run the rebuild only when this file is executed as a script; importing the
# module does not call main().
if __name__ == "__main__":
    main()