# disclosure-bureau/scripts/rebuild_doc_section3.py

#!/usr/bin/env python3
"""
Rebuild script for doc-65-hs1-834228961-62-hq-83894-section-3
Processes all 155 pages in parallel batches, generates chunks, images, and index.
"""

import os
import json
import base64
import time
import concurrent.futures
from datetime import datetime, timezone
from pathlib import Path
import anthropic

# Identity of the single document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-3"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 3 — FBI Flying Discs Investigation File"
TOTAL_PAGES = 155

# Per-document input directories (page PNGs and the OCR directory).
PNG_DIR = Path("/Users/guto/ufo/processing/png", DOC_ID)
OCR_DIR = Path("/Users/guto/ufo/processing/ocr", DOC_ID)

# Per-document output tree: chunk markdown, cropped images and tables.
OUT_DIR = Path("/Users/guto/ufo/raw", DOC_ID)
CHUNKS_DIR = OUT_DIR.joinpath("chunks")
IMAGES_DIR = OUT_DIR.joinpath("images")
TABLES_DIR = OUT_DIR.joinpath("tables")

# Module-level Anthropic API client, created at import time and used by
# process_page() for every request. It is constructed with no explicit
# arguments, so credentials come from the SDK's defaults (presumably the
# ANTHROPIC_API_KEY environment variable — confirm).
client = anthropic.Anthropic()

# Chunk type vocabulary, in the same order as rule 2 of PAGE_REBUILDER_PROMPT.
# NOTE(review): nothing in the visible part of this file reads this list.
CHUNK_TYPES = [
    "letterhead",
    "header",
    "classification_banner",
    "subject_line",
    "salutation",
    "body_paragraph",
    "signature_block",
    "handwritten_note",
    "stamp",
    "redaction_block",
    "image",
    "table_marker",
    "footer",
    "page_number",
    "attachment_label",
    "routing_slip",
    "blank",
    "caption",
    "list_item",
    "address_block",
]

# Instruction template sent to the model together with each page image.
# process_page() fills it with str.format(), supplying doc_title, page_number,
# total_pages and page_png_path. Literal JSON braces are doubled ({{ }}) so
# that format() emits them as single braces instead of treating them as fields.
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder agent for a declassified FBI UAP/UFO document archive.

Your task: Analyze the provided page image and extract ALL content into structured chunks.

Document: {doc_title}
Page: {page_number} of {total_pages}
Page PNG path: {page_png_path}

Return a JSON object with this exact structure:
{{
  "page_number": {page_number},
  "classification": "<classification string found on page or null>",
  "page_type": "<blank|text|image|mixed|cover>",
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "<chunk_type>",
      "content_en": "<English text content or description>",
      "content_pt_br": "<Brazilian Portuguese translation/description>",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": "<classification string or null>",
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.9,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }}
  ]
}}

RULES:
1. Extract ALL content — no chunk can be skipped.
2. Use ONLY these chunk types: letterhead, header, classification_banner, subject_line, salutation, body_paragraph, signature_block, handwritten_note, stamp, redaction_block, image, table_marker, footer, page_number, attachment_label, routing_slip, blank, caption, list_item, address_block
3. bbox values are normalized 0.0-1.0 (x=left, y=top, w=width, h=height of the page).
4. content_en: verbatim transcription for text, description for images.
5. content_pt_br: Brazilian Portuguese translation of content_en (NOT European Portuguese). For verbatim text blocks, provide both the original (verbatim) and a translation note.
6. For redacted blocks: set type="redaction_block", content_en="[REDACTED]", set redaction_code if visible (e.g., "(b)(1)", "(b)(6)"), redaction_inferred_content_type with your best inference.
7. For images/photos: type="image", image_type = one of: photograph|sketch|diagram|map|chart|logo|signature|stamp|other
8. For tables: type="table_marker"
9. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
10. UAP/cryptid anomaly detection: flag any anomalous visual content (UFO shapes, unusual aerial phenomena, cryptid-related imagery).
11. If page is blank or nearly blank: create ONE chunk type="blank".
12. classification_banner chunks at top/bottom of page for classification markings.
13. stamps: type="stamp" for rubber stamps, file numbers, dates stamped on documents.
14. Return ONLY valid JSON, no other text.

IMPORTANT: Be thorough. A typical text page has 5-15 chunks. A photo page may have 2-3 chunks. Cover/envelope pages have 4-8 chunks.
"""


def encode_image_b64(path: Path) -> str:
    """Read the file at *path* in binary mode and return it as base64 text."""
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def _placeholder_page(page_num: int, page_type: str, chunk_type: str,
                      content_en: str, content_pt_br: str, bbox: dict) -> dict:
    """Build a one-chunk page record standing in for a page that was not extracted."""
    return {
        "page_number": page_num,
        "classification": None,
        "page_type": page_type,
        "chunks": [{
            "order_in_page": 1,
            "type": chunk_type,
            "content_en": content_en,
            "content_pt_br": content_pt_br,
            "bbox": bbox,
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None
        }]
    }


def _parse_page_json(raw_text: str) -> dict:
    """Parse the model's reply into a page dict, raising ValueError if unusable.

    json.JSONDecodeError is a ValueError subclass, so callers can catch
    ValueError for both malformed JSON and JSON of the wrong shape.
    """
    text = raw_text.strip()
    # Strip markdown code fences if present
    if text.startswith("```"):
        lines = text.split("\n")
        has_closing_fence = lines[-1].strip() == "```"
        text = "\n".join(lines[1:-1] if has_closing_fence else lines[1:])

    data = json.loads(text)
    if not isinstance(data, dict):
        raise ValueError(f"expected a JSON object, got {type(data).__name__}")
    chunks = data.get("chunks")
    if not isinstance(chunks, list) or not all(isinstance(c, dict) for c in chunks):
        raise ValueError('"chunks" must be a list of JSON objects')
    return data


def process_page(page_num: int) -> dict:
    """Process a single page and return its chunks as a dict.

    Sends the page PNG plus PAGE_REBUILDER_PROMPT to the model and parses the
    JSON reply. The request is attempted up to three times; API errors back
    off exponentially (1s, 2s) between attempts, parse errors retry at once.

    Args:
        page_num: 1-based page number; page N is read from p-{N-1:03d}.png.

    Returns:
        The parsed page dict with "page_number" forced to ``page_num`` and a
        "chunks" list of dicts. If the PNG is missing, or every attempt ends
        in an API error or an unusable reply, a one-chunk placeholder page
        describing the failure is returned instead.
    """
    # PNG files are p-000.png through p-154.png (zero-indexed)
    png_path = PNG_DIR / f"p-{page_num - 1:03d}.png"

    if not png_path.exists():
        print(f"  WARNING: PNG not found for page {page_num}: {png_path}")
        return _placeholder_page(
            page_num, "blank", "blank",
            "[Page image not found]",
            "[Imagem da página não encontrada]",
            {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
        )

    img_b64 = encode_image_b64(png_path)

    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_num,
        total_pages=TOTAL_PAGES,
        page_png_path=str(png_path)
    )

    max_retries = 3
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1

        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64
                            }
                        },
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }]
            )
        except anthropic.APIError as e:
            print(f"  Page {page_num} attempt {attempt+1}: API error: {e}")
            if is_last_attempt:
                return _placeholder_page(
                    page_num, "text", "body_paragraph",
                    f"[Page {page_num} — API error]",
                    f"[Página {page_num} — erro de API]",
                    {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90},
                )
            time.sleep(2 ** attempt)
            continue

        # Join the text blocks instead of indexing content[0], so an empty
        # reply becomes "" (a parse failure below) rather than an IndexError.
        reply_text = "".join(
            block.text for block in response.content
            if getattr(block, "type", None) == "text"
        )

        try:
            data = _parse_page_json(reply_text)
        except ValueError as e:
            print(f"  Page {page_num} attempt {attempt+1}: JSON parse error: {e}")
            if is_last_attempt:
                return _placeholder_page(
                    page_num, "text", "body_paragraph",
                    f"[Page {page_num} — parse error, content not extracted]",
                    f"[Página {page_num} — erro de análise, conteúdo não extraído]",
                    {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90},
                )
            continue

        data["page_number"] = page_num  # ensure correct
        return data


def crop_image(chunk_id: str, png_path: Path, bbox: dict):
    """Crop a region from the page PNG and save to images dir.

    Args:
        chunk_id: Chunk identifier; the crop is saved as IMG-{chunk_id}.png
            inside IMAGES_DIR.
        png_path: Path of the full-page PNG to crop from.
        bbox: Mapping with "x", "y", "w", "h" as fractions of the page size
            (missing keys default to the full page). A 0.5% margin is added
            on every side before clamping to the page bounds.

    Returns:
        The path of the written image, or None if anything went wrong
        (Pillow missing, unreadable PNG, malformed or empty bbox, ...).
        Failures are printed rather than raised so one bad chunk does not
        stop the run.
    """
    try:
        from PIL import Image

        x = bbox.get("x", 0)
        y = bbox.get("y", 0)
        w = bbox.get("w", 1)
        h = bbox.get("h", 1)
        pad = 0.005

        # The context manager closes the underlying file even when cropping
        # or saving fails; the crop is saved while the source is still open.
        with Image.open(png_path) as page_image:
            width, height = page_image.size
            # Clamp both ends of each edge so out-of-range boxes stay on the page.
            left = min(width, max(0, int((x - pad) * width)))
            top = min(height, max(0, int((y - pad) * height)))
            right = max(0, min(width, int((x + w + pad) * width)))
            bottom = max(0, min(height, int((y + h + pad) * height)))
            if right <= left or bottom <= top:
                raise ValueError(
                    f"empty crop region ({left}, {top}, {right}, {bottom})"
                )
            out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
            page_image.crop((left, top, right, bottom)).save(out_path)
        return out_path
    except Exception as e:
        print(f"  Crop error for {chunk_id}: {e}")
        return None


def write_chunk_file(chunk_data: dict, chunk_id: str, page_num: int,
                     order_global: int, prev_chunk, next_chunk,
                     has_image: bool) -> None:
    """Write a single chunk markdown file.

    The file is CHUNKS_DIR/{chunk_id}.md: a ``---``-delimited frontmatter
    block with the chunk's metadata, followed by the English and Brazilian
    Portuguese content.

    Args:
        chunk_data: Chunk dict as produced by the page model; missing keys
            fall back to the defaults used below.
        chunk_id: Global chunk identifier, also used as the file name.
        page_num: 1-based page number the chunk belongs to.
        order_global: 1-based position of the chunk in the whole document.
        prev_chunk: Identifier of the preceding chunk, or None.
        next_chunk: Identifier of the following chunk, or None.
        has_image: When true, ``related_image`` points at IMG-{chunk_id}.png;
            otherwise it is written as null.
    """
    def as_json(value) -> str:
        # ensure_ascii=False keeps accented text readable, matching how
        # _index.json and document.md are written elsewhere in this file.
        return json.dumps(value, ensure_ascii=False)

    raw_bbox = chunk_data.get("bbox")

    def bbox_number(key: str, fallback: float) -> float:
        # Tolerates a null/non-dict bbox and non-numeric coordinates, which
        # would otherwise crash the ":.3f" formatting below.
        try:
            return float(raw_bbox.get(key, fallback))
        except (AttributeError, TypeError, ValueError):
            return fallback

    bbox_yaml = (
        f"{{x: {bbox_number('x', 0):.3f}, y: {bbox_number('y', 0):.3f}, "
        f"w: {bbox_number('w', 1):.3f}, h: {bbox_number('h', 1):.3f}}}"
    )

    # NOTE(review): this relative path resolves to PNG_DIR when taken from
    # OUT_DIR, not from the chunk file's own directory — confirm which base
    # directory consumers use.
    source_png = f"../../processing/png/{DOC_ID}/p-{page_num - 1:03d}.png"

    related_image = f"IMG-{chunk_id}.png" if has_image else None

    # A JSON null confidence must be written as YAML null, not Python's "None".
    ocr_confidence = chunk_data.get("ocr_confidence", 0.9)
    if ocr_confidence is None:
        ocr_confidence = "null"

    ufo = chunk_data.get("ufo_anomaly_detected", False)
    cryptid = chunk_data.get("cryptid_anomaly_detected", False)

    lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk_data.get('type', 'body_paragraph')}",
        f"page: {page_num}",
        f"order_in_page: {chunk_data.get('order_in_page', 1)}",
        f"order_global: {order_global}",
        f"bbox: {bbox_yaml}",
        f"classification: {as_json(chunk_data.get('classification'))}",
        f"formatting: {as_json(chunk_data.get('formatting', []))}",
        f"cross_page_hint: {chunk_data.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {as_json(prev_chunk)}",
        f"next_chunk: {as_json(next_chunk)}",
        f"related_image: {as_json(related_image)}",
        f"related_table: {as_json(chunk_data.get('related_table'))}",
        f"ocr_confidence: {ocr_confidence}",
        f"ocr_source_lines: {as_json(chunk_data.get('ocr_source_lines', []))}",
        f"redaction_code: {as_json(chunk_data.get('redaction_code'))}",
        "redaction_inferred_content_type: "
        f"{as_json(chunk_data.get('redaction_inferred_content_type'))}",
        f"image_type: {as_json(chunk_data.get('image_type'))}",
        f"ufo_anomaly_detected: {str(ufo).lower()}",
        f"ufo_anomaly_type: {as_json(chunk_data.get('ufo_anomaly_type'))}",
        f"ufo_anomaly_rationale: {as_json(chunk_data.get('ufo_anomaly_rationale'))}",
        f"cryptid_anomaly_detected: {str(cryptid).lower()}",
        f"cryptid_anomaly_type: {as_json(chunk_data.get('cryptid_anomaly_type'))}",
        f"cryptid_anomaly_rationale: {as_json(chunk_data.get('cryptid_anomaly_rationale'))}",
        f"image_description_en: {as_json(chunk_data.get('image_description_en'))}",
        f"image_description_pt_br: {as_json(chunk_data.get('image_description_pt_br'))}",
        f"extracted_text: {as_json(chunk_data.get('extracted_text'))}",
        f"source_png: {source_png}",
        "---",
        "",
        f"**EN:** {chunk_data.get('content_en', '')}",
        "",
        f"**PT-BR:** {chunk_data.get('content_pt_br', '')}",
    ]

    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")


_DEFAULT_BBOX = {"x": 0, "y": 0, "w": 1, "h": 1}


def _failed_page_record(page_num: int, error: Exception) -> dict:
    """Build the one-chunk placeholder page stored when processing a page raised."""
    return {
        "page_number": page_num,
        "classification": None,
        "page_type": "text",
        "chunks": [{
            "order_in_page": 1,
            "type": "body_paragraph",
            "content_en": f"[Page {page_num} — processing failed: {error}]",
            "content_pt_br": f"[Página {page_num} — processamento falhou: {error}]",
            "bbox": {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None
        }]
    }


def _normalized_bbox(raw_bbox) -> dict:
    """Return a bbox dict that has numeric x/y/w/h, using full-page defaults."""
    if not isinstance(raw_bbox, dict):
        return dict(_DEFAULT_BBOX)
    box = dict(raw_bbox)
    for key, fallback in _DEFAULT_BBOX.items():
        value = box.get(key, fallback)
        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
        box[key] = value if is_number else fallback
    return box


def _order_in_page_key(chunk: dict):
    """Sort key for a chunk: its numeric order_in_page, or 0 when unusable."""
    value = chunk.get("order_in_page", 0)
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return value
    return 0


def _page_chunks(page_data: dict) -> list:
    """Return a page's chunk dicts, sanitized in place and sorted by order_in_page."""
    raw_chunks = page_data.get("chunks")
    if not isinstance(raw_chunks, list):
        return []
    chunks = [c for c in raw_chunks if isinstance(c, dict)]
    for chunk in chunks:
        # Model output is untrusted: fix the fields later steps format or
        # hash, so one malformed chunk cannot abort the whole rebuild.
        chunk["bbox"] = _normalized_bbox(chunk.get("bbox"))
        if not isinstance(chunk.get("type"), str):
            chunk["type"] = "body_paragraph"
    chunks.sort(key=_order_in_page_key)
    return chunks


def main():
    """Rebuild the document: extract pages, then write chunks, index and document.md.

    Steps: (1) run process_page() for every page on a 4-thread pool,
    (2) number chunks globally and link prev/next, (3) crop image chunks,
    (4) write one markdown file per chunk, (5) write _index.json,
    (6) collect statistics, (7) write document.md.

    Returns:
        A dict of run statistics with keys pages, chunks, images, tables,
        ufo, cryptid, wall_seconds and doc_md_bytes. "images" counts
        image-type chunks; "tables" is always 0.
    """
    start_time = time.time()
    print(f"Starting rebuild of {DOC_ID}")
    print(f"Processing {TOTAL_PAGES} pages with 4 parallel workers...")

    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # Step 1: Process all pages in parallel batches of 4
    all_pages = {}  # page_num -> page_data

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        future_to_page = {
            executor.submit(process_page, page_num): page_num
            for page_num in range(1, TOTAL_PAGES + 1)
        }
        completed = 0
        for future in concurrent.futures.as_completed(future_to_page):
            page_num = future_to_page[future]
            try:
                result = future.result()
                if not isinstance(result, dict):
                    raise TypeError(
                        f"process_page returned {type(result).__name__}, expected dict"
                    )
            except Exception as e:
                # Top-level boundary: one failing page must not lose the rest.
                print(f"  Page {page_num} failed: {e}")
                all_pages[page_num] = _failed_page_record(page_num, e)
                continue
            all_pages[page_num] = result
            completed += 1
            if completed % 10 == 0:
                print(f"  Completed {completed}/{TOTAL_PAGES} pages...")

    print("All pages processed. Assigning global chunk IDs...")

    # Step 2: Assign global chunk IDs
    all_chunks = []  # list of (chunk_id, page_num, chunk_data)
    for page_num in range(1, TOTAL_PAGES + 1):
        for chunk in _page_chunks(all_pages[page_num]):
            chunk_id = f"c{len(all_chunks) + 1:04d}"
            all_chunks.append((chunk_id, page_num, chunk))

    total_chunks = len(all_chunks)
    print(f"Total chunks: {total_chunks}")

    # Set prev/next pointers
    for i, (chunk_id, _page_num, chunk) in enumerate(all_chunks):
        chunk["_chunk_id"] = chunk_id
        chunk["_prev"] = all_chunks[i - 1][0] if i > 0 else None
        chunk["_next"] = all_chunks[i + 1][0] if i < total_chunks - 1 else None
        chunk["_order_global"] = i + 1

    # Step 3: Crop images for image-type chunks
    print("Cropping images for image chunks...")
    image_chunks = [(cid, pnum, c) for cid, pnum, c in all_chunks if c.get("type") == "image"]
    print(f"  Found {len(image_chunks)} image chunks")

    # Remember which crops were actually written, so chunk files and
    # document.md never reference an image file that does not exist.
    cropped_ids = set()
    for chunk_id, page_num, chunk in image_chunks:
        png_path = PNG_DIR / f"p-{page_num - 1:03d}.png"
        if crop_image(chunk_id, png_path, chunk["bbox"]) is not None:
            cropped_ids.add(chunk_id)
    if len(cropped_ids) != len(image_chunks):
        print(f"  WARNING: only {len(cropped_ids)} of {len(image_chunks)} image crops were written")

    # Step 4: Write chunk files
    print("Writing chunk files...")
    for chunk_id, page_num, chunk in all_chunks:
        write_chunk_file(
            chunk, chunk_id, page_num,
            chunk["_order_global"],
            chunk["_prev"],
            chunk["_next"],
            chunk_id in cropped_ids
        )

    # Step 5: Write _index.json
    print("Writing _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()

    index_chunks = []
    for chunk_id, page_num, chunk in all_chunks:
        content_en = chunk.get("content_en", "")
        # str() guards the slice against a non-string content_en.
        preview = str(content_en)[:80] if content_en else ""
        index_chunks.append({
            "chunk_id": chunk_id,
            "type": chunk.get("type", "body_paragraph"),
            "page": page_num,
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["_order_global"],
            "file": f"chunks/{chunk_id}.md",
            "bbox": chunk["bbox"],
            "preview": preview
        })

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": index_chunks
    }

    index_path = OUT_DIR / "_index.json"
    index_path.write_text(json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8")

    # Step 6: Compute stats
    chunk_types = {}
    ufo_anomalies = []
    cryptid_anomalies = []
    images_count = 0

    for chunk_id, _page_num, chunk in all_chunks:
        t = chunk.get("type", "body_paragraph")
        chunk_types[t] = chunk_types.get(t, 0) + 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_anomalies.append(chunk_id)
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_anomalies.append(chunk_id)
        if t == "image":
            images_count += 1

    # Step 7: Write document.md
    print("Writing document.md...")

    doc_lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {total_chunks}",
        "chunk_types_histogram:",
    ]
    doc_lines.extend(f"  {t}: {count}" for t, count in sorted(chunk_types.items()))
    doc_lines.extend([
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_anomalies)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomalies)}",
        'build_approach: "subagents"',
        "build_model: claude-haiku-4-5",
        f"build_at: {build_at}",
        "---",
        "",
    ])

    current_page = 0
    for chunk_id, page_num, chunk in all_chunks:
        if page_num != current_page:
            current_page = page_num
            doc_lines.append(f"## Page {page_num}")
            doc_lines.append("")

        chunk_type = chunk.get("type", "body_paragraph")
        bbox = chunk["bbox"]
        bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

        doc_lines.extend([
            f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->",
            f'<a id="{chunk_id}"></a>',
            f"### Chunk {chunk_id} — {chunk_type} · p{page_num} · bbox: {bbox_str}",
            "",
            f"**EN:** {chunk.get('content_en', '')}",
            "",
            f"**PT-BR:** {chunk.get('content_pt_br', '')}",
            "",
        ])

        if chunk_type == "image":
            # Only link the crop when it was really written in step 3.
            if chunk_id in cropped_ids:
                doc_lines.append(f"![chunk image](./images/IMG-{chunk_id}.png)")
            desc_en = chunk.get("image_description_en", "")
            desc_pt = chunk.get("image_description_pt_br", "")
            if desc_en:
                doc_lines.append(f"*{desc_en}*")
            if desc_pt:
                doc_lines.append(f"*{desc_pt}*")
            doc_lines.append("")

        # Build metadata JSON for details block
        meta = {
            "chunk_id": chunk_id,
            "type": chunk_type,
            "page": page_num,
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["_order_global"],
            "bbox": bbox,
            "classification": chunk.get("classification"),
            "formatting": chunk.get("formatting", []),
            "cross_page_hint": chunk.get("cross_page_hint", "self_contained"),
            "prev_chunk": chunk["_prev"],
            "next_chunk": chunk["_next"],
            "ocr_confidence": chunk.get("ocr_confidence", 0.9),
            "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
            "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
            "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
            "cryptid_anomaly_type": chunk.get("cryptid_anomaly_type"),
            "redaction_code": chunk.get("redaction_code"),
            "image_type": chunk.get("image_type"),
        }

        doc_lines.extend([
            "<details><summary>metadata</summary>",
            "",
            "```json",
            json.dumps(meta, ensure_ascii=False, indent=2),
            "```",
            "",
            "</details>",
            "",
            "---",
            "",
        ])

    doc_content = "\n".join(doc_lines)
    doc_path = OUT_DIR / "document.md"
    doc_path.write_text(doc_content, encoding="utf-8")

    wall_seconds = int(time.time() - start_time)
    doc_md_bytes = len(doc_content.encode("utf-8"))

    print("\nDone!")
    print(f"STATS pages={TOTAL_PAGES} chunks={total_chunks} images={images_count} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_md_bytes}")
    print(f"Wall time: {wall_seconds}s")

    return {
        "pages": TOTAL_PAGES,
        "chunks": total_chunks,
        "images": images_count,
        "tables": 0,
        "ufo": len(ufo_anomalies),
        "cryptid": len(cryptid_anomalies),
        "wall_seconds": wall_seconds,
        "doc_md_bytes": doc_md_bytes
    }


if __name__ == "__main__":
    main()