disclosure-bureau/scripts/rebuild_doc_65.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild doc-65-hs1-834228961-62-hq-83894-serial-130
Processes every page PNG (91 for this document) via Claude vision and writes chunks/*.md, images/, _index.json and document.md
"""

import os
import sys
import json
import base64
import time
import concurrent.futures
from datetime import datetime, timezone
from pathlib import Path
import anthropic

# Identity of the single document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-serial-130"
DOC_TITLE = (
    "HQ Air Defense Command - Unidentified Flying Objects Reports "
    "(65-HS1-834228961 / 62-HQ-83894 Serial 130)"
)

# Input locations (page PNGs, OCR output) and the output tree for this
# document, all rooted under the same working directory.
_UFO_ROOT = Path("/Users/guto/ufo")
PNG_DIR = _UFO_ROOT / "processing" / "png" / DOC_ID
OCR_DIR = _UFO_ROOT / "processing" / "ocr" / DOC_ID
RAW_DIR = _UFO_ROOT / "raw" / DOC_ID
CHUNKS_DIR = RAW_DIR.joinpath("chunks")
IMAGES_DIR = RAW_DIR.joinpath("images")
TABLES_DIR = RAW_DIR.joinpath("tables")

# Create the output directories up front. This runs at import time; missing
# parents are created and already-existing directories are left alone.
for d in [CHUNKS_DIR, IMAGES_DIR, TABLES_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Module-level Anthropic client used by process_page (which main() calls from
# several worker threads). No API key is passed here, so credentials
# presumably come from the environment — confirm.
client = anthropic.Anthropic()

def encode_image(path: Path) -> str:
    """Read the file at *path* and return its bytes as base64 text.

    The standard base64 alphabet is used and the result is a ``str``, ready
    to be placed in the ``data`` field of an image content block.
    """
    with open(path, "rb") as image_file:
        payload = image_file.read()
    encoded = base64.b64encode(payload)
    return encoded.decode("utf-8")

# Prompt template sent alongside each page image. process_page fills it with
# str.format(doc_title=..., doc_id=..., page_number=..., total_pages=...,
# png_filename=...), which is why the literal JSON braces are doubled
# ("{{" / "}}"). The text is part of the program's behavior: edit with care.
PAGE_REBUILDER_PROMPT = '''You are a page-rebuilder subagent. Analyze this document page image and extract ALL content as structured chunks.

Document: {doc_title}
Doc ID: {doc_id}
Page number (in sequence): {page_number} of {total_pages}
Source PNG filename: {png_filename}

Return a JSON object with this exact structure:
{{
  "page_number": {page_number},
  "png_filename": "{png_filename}",
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "<type>",
      "content_en": "...",
      "content_pt_br": "...",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.9,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null,
      "image_description_en": null,
      "image_description_pt_br": null,
      "extracted_text": null
    }}
  ]
}}

Allowed chunk types: letterhead, classification_banner, subject_line, body_paragraph, list_item, signature_block, date_line, address_block, header, footer, redaction_block, table_marker, image, stamp, handwritten_note, page_number_marker, blank

Rules:
1. Create ONE chunk per distinct visual/logical unit. Do not merge unrelated blocks.
2. For classification banners (TOP SECRET, SECRET, CONFIDENTIAL, etc.) at top/bottom of page: type=classification_banner, fill classification field.
3. For any image/photo/diagram/map/sketch: type=image, fill image_type, image_description_en, image_description_pt_br, ufo_anomaly_detected, cryptid_anomaly_detected.
4. For redacted/blacked-out areas: type=redaction_block, fill redaction_code if visible.
5. content_en = exact English transcription of text, verbatim. content_pt_br = Brazilian Portuguese translation of content_en (NOT translation of classification banners/stamps/codes — keep those verbatim in both fields).
6. bbox: normalized coordinates (0.0-1.0): x=left, y=top, w=width, h=height relative to page.
7. formatting: array of applicable: bold, italic, underline, all_caps, strikethrough, handwritten.
8. For cross_page_hint: "continues_to_next" if text clearly continues on next page, "continues_from_prev" if it continues from previous page, "self_contained" otherwise.
9. ocr_confidence: your confidence in the transcription (0.0-1.0).
10. If page is blank: return single chunk type=blank.
11. ufo_anomaly_detected: true if the chunk contains or depicts a UAP/UFO, unidentified aerial phenomenon, unknown object in sky, or anomalous craft. Set ufo_anomaly_type and ufo_anomaly_rationale.
12. IMPORTANT: Return ONLY valid JSON, no markdown code blocks, no explanation.'''

def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if present."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    lines = text.split("\n")[1:]  # drop the opening fence (may carry a language tag)
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines)


def _parse_page_json(raw_text: str) -> dict:
    """Parse the model's reply into a page dict with a clean ``chunks`` list.

    Raises:
        ValueError: if no JSON object can be decoded (``json.JSONDecodeError``
            is a ``ValueError``), if the top level is not an object, or if
            ``chunks`` is missing or not a list.
    """
    text = _strip_code_fence(raw_text)
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # The model sometimes wraps the JSON in prose despite the prompt;
        # retry on the outermost {...} span before giving up.
        start, end = text.find("{"), text.rfind("}")
        if start < 0 or end <= start:
            raise
        parsed = json.loads(text[start:end + 1])

    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    chunks = parsed.get("chunks")
    if not isinstance(chunks, list):
        raise ValueError("model response has no 'chunks' list")

    # Downstream code calls dict methods on every chunk, so drop anything else.
    valid_chunks = [c for c in chunks if isinstance(c, dict)]
    if len(valid_chunks) != len(chunks):
        print(
            f"  WARNING: dropped {len(chunks) - len(valid_chunks)} non-object chunk entries",
            file=sys.stderr,
        )
    parsed["chunks"] = valid_chunks
    return parsed


def _error_page(page_index: int, png_filename: str, error: Exception) -> dict:
    """Build the one-chunk placeholder page returned when processing fails."""
    reason = str(error)[:100]
    return {
        "page_number": page_index,
        "page_index": page_index,
        "png_filename": png_filename,
        "chunks": [
            {
                "order_in_page": 1,
                "type": "blank",
                "content_en": f"[Page processing error: {reason}]",
                "content_pt_br": f"[Erro de processamento: {reason}]",
                "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
                "classification": None,
                "formatting": [],
                "cross_page_hint": "self_contained",
                "ocr_confidence": 0.0,
                "ocr_source_lines": [],
                "redaction_code": None,
                "redaction_inferred_content_type": None,
                "image_type": None,
                "ufo_anomaly_detected": False,
                "ufo_anomaly_type": None,
                "ufo_anomaly_rationale": None,
                "cryptid_anomaly_detected": False,
                "cryptid_anomaly_type": None,
                "cryptid_anomaly_rationale": None,
                "image_description_en": None,
                "image_description_pt_br": None,
                "extracted_text": None
            }
        ]
    }


def process_page(page_index: int, png_filename: str, total_pages: int) -> dict:
    """Send one page image to the model and return its parsed chunks.

    Args:
        page_index: 1-based position of the page in the document.
        png_filename: File name of the page image inside ``PNG_DIR``.
        total_pages: Total page count, passed through to the prompt.

    Returns:
        The page dict decoded from the model's JSON reply, with ``page_index``
        and ``png_filename`` overwritten by the arguments and ``chunks``
        guaranteed to be a list of dicts. This function never raises an
        ``Exception``: on any failure (unreadable image, API error, malformed
        or truncated JSON) the error is logged to stderr and a placeholder
        page with a single ``blank`` chunk describing the error is returned.
    """
    try:
        png_path = PNG_DIR / png_filename
        img_data = encode_image(png_path)

        prompt = PAGE_REBUILDER_PROMPT.format(
            doc_title=DOC_TITLE,
            doc_id=DOC_ID,
            page_number=page_index,
            total_pages=total_pages,
            png_filename=png_filename
        )

        response = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=4096,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_data
                            }
                        },
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ]
        )

        if not response.content:
            raise ValueError("model returned no content blocks")
        raw_text = response.content[0].text

        try:
            result = _parse_page_json(raw_text)
        except ValueError as err:
            # A reply cut off at the token limit is the likeliest cause of
            # broken JSON on dense pages; say so instead of a parser offset.
            if getattr(response, "stop_reason", None) == "max_tokens":
                raise ValueError(
                    "model output truncated at max_tokens; JSON is incomplete"
                ) from err
            raise

        result["page_index"] = page_index
        result["png_filename"] = png_filename
        return result

    except Exception as e:
        # Boundary handler: one bad page must not abort the whole rebuild.
        print(f"  ERROR page {page_index} ({png_filename}): {e}", file=sys.stderr)
        return _error_page(page_index, png_filename, e)

def _bbox_of(chunk: dict) -> dict:
    """Return the chunk's bbox with numeric x/y/w/h, defaulting to the full page.

    The model may emit ``"bbox": null``, omit keys, or use non-numeric values;
    each unusable component falls back to x=0, y=0, w=1, h=1.
    """
    raw = chunk.get("bbox")
    if not isinstance(raw, dict):
        raw = {}
    box = {}
    for key, default in (("x", 0), ("y", 0), ("w", 1), ("h", 1)):
        value = raw.get(key, default)
        usable = isinstance(value, (int, float)) and not isinstance(value, bool)
        box[key] = value if usable else default
    return box


def _text_of(chunk: dict, key: str) -> str:
    """Return ``chunk[key]`` as text, mapping a missing or null value to ''."""
    value = chunk.get(key)
    return "" if value is None else str(value)


def _order_in_page(chunk: dict):
    """Sort key for chunks within a page; non-numeric orders sort as 0."""
    value = chunk.get("order_in_page", 0)
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return value
    return 0


def _page_chunks(page_data: dict) -> list:
    """Return the page's chunk dicts, ignoring a missing/invalid ``chunks``."""
    chunks = page_data.get("chunks")
    if not isinstance(chunks, list):
        return []
    return [c for c in chunks if isinstance(c, dict)]


def _collect_page_results(png_files: list, batch_size: int = 5) -> dict:
    """Run process_page over all pages in parallel batches; map page -> result."""
    total_pages = len(png_files)
    results = {}

    for batch_start in range(0, total_pages, batch_size):
        batch = png_files[batch_start:batch_start + batch_size]
        batch_indices = list(range(batch_start + 1, batch_start + len(batch) + 1))

        print(f"  Batch {batch_start//batch_size + 1}: pages {batch_indices[0]}-{batch_indices[-1]} ({batch})")

        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = {
                executor.submit(process_page, idx, fname, total_pages): (idx, fname)
                for idx, fname in zip(batch_indices, batch)
            }
            for future in concurrent.futures.as_completed(futures):
                idx, fname = futures[future]
                try:
                    # as_completed only yields finished futures, so result()
                    # returns immediately and needs no timeout.
                    result = future.result()
                except Exception as e:
                    print(f"    FAILED page {idx} ({fname}): {e}", file=sys.stderr)
                    continue
                if not isinstance(result, dict):
                    print(f"    FAILED page {idx} ({fname}): unexpected result type", file=sys.stderr)
                    continue
                results[idx] = result
                print(f"    Page {idx} ({fname}): {len(_page_chunks(result))} chunks")

    return results


def _number_chunks(all_page_results: dict) -> list:
    """Flatten pages into one ordered chunk list with ids and prev/next links."""
    all_chunks = []
    for page_idx in sorted(all_page_results):
        page_data = all_page_results[page_idx]
        png_filename = page_data.get("png_filename", f"p-{page_idx:03d}.png")

        # sorted() is stable, so chunks with equal order keep the model's order.
        for chunk in sorted(_page_chunks(page_data), key=_order_in_page):
            global_order = len(all_chunks) + 1
            chunk["chunk_id"] = f"c{global_order:04d}"
            chunk["page"] = page_idx
            chunk["order_global"] = global_order
            chunk["source_png"] = f"../../processing/png/{DOC_ID}/{png_filename}"
            all_chunks.append(chunk)

    last = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last else None
    return all_chunks


def _crop_images(image_chunks: list, all_page_results: dict) -> None:
    """Crop each image chunk's bbox out of its page PNG into IMAGES_DIR."""
    pad = 0.005  # small margin around the reported bbox, as a page fraction
    for chunk in image_chunks:
        chunk_id = chunk["chunk_id"]
        page_idx = chunk["page"]
        png_filename = all_page_results[page_idx].get("png_filename", f"p-{page_idx:03d}.png")
        png_path = PNG_DIR / png_filename
        bbox = _bbox_of(chunk)
        out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"

        try:
            # Imported here so a missing Pillow only disables cropping.
            from PIL import Image
            x, y, w, h = bbox["x"], bbox["y"], bbox["w"], bbox["h"]
            # Context manager closes the page file; Image.open keeps it open.
            with Image.open(png_path) as im:
                W, H = im.size
                crop = im.crop((
                    max(0, int((x - pad) * W)),
                    max(0, int((y - pad) * H)),
                    min(W, int((x + w + pad) * W)),
                    min(H, int((y + h + pad) * H))
                ))
                crop.save(str(out_path))
            chunk["related_image"] = f"IMG-{chunk_id}.png"
            print(f"  Cropped {chunk_id} from {png_filename}")
        except Exception as e:
            print(f"  CROP ERROR {chunk_id}: {e}", file=sys.stderr)
            chunk["related_image"] = None


def _render_chunk_file(chunk: dict) -> str:
    """Render one chunk as Markdown with a YAML front-matter header."""
    chunk_id = chunk["chunk_id"]
    bbox = _bbox_of(chunk)
    # Scalars go through json.dumps so a null value renders as YAML `null`
    # rather than the Python repr `None`.
    return f"""---
chunk_id: {chunk_id}
type: {chunk.get('type', 'body_paragraph')}
page: {chunk.get('page', 1)}
order_in_page: {chunk.get('order_in_page', 1)}
order_global: {chunk.get('order_global', 1)}
bbox: {{x: {bbox['x']:.3f}, y: {bbox['y']:.3f}, w: {bbox['w']:.3f}, h: {bbox['h']:.3f}}}
classification: {json.dumps(chunk.get('classification'))}
formatting: {json.dumps(chunk.get('formatting', []))}
cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}
prev_chunk: {json.dumps(chunk.get('prev_chunk'))}
next_chunk: {json.dumps(chunk.get('next_chunk'))}
related_image: {json.dumps(chunk.get('related_image'))}
related_table: {json.dumps(chunk.get('related_table'))}
ocr_confidence: {json.dumps(chunk.get('ocr_confidence', 0.9))}
ocr_source_lines: {json.dumps(chunk.get('ocr_source_lines', []))}
redaction_code: {json.dumps(chunk.get('redaction_code'))}
redaction_inferred_content_type: {json.dumps(chunk.get('redaction_inferred_content_type'))}
image_type: {json.dumps(chunk.get('image_type'))}
ufo_anomaly_detected: {json.dumps(bool(chunk.get('ufo_anomaly_detected', False)))}
cryptid_anomaly_detected: {json.dumps(bool(chunk.get('cryptid_anomaly_detected', False)))}
ufo_anomaly_type: {json.dumps(chunk.get('ufo_anomaly_type'))}
ufo_anomaly_rationale: {json.dumps(chunk.get('ufo_anomaly_rationale'))}
cryptid_anomaly_type: {json.dumps(chunk.get('cryptid_anomaly_type'))}
cryptid_anomaly_rationale: {json.dumps(chunk.get('cryptid_anomaly_rationale'))}
image_description_en: {json.dumps(chunk.get('image_description_en'))}
image_description_pt_br: {json.dumps(chunk.get('image_description_pt_br'))}
extracted_text: {json.dumps(chunk.get('extracted_text'))}
source_png: {chunk.get('source_png', '')}
---

**EN:** {_text_of(chunk, 'content_en')}

**PT-BR:** {_text_of(chunk, 'content_pt_br')}
"""


def _write_index(all_chunks: list, total_pages: int, build_at: str) -> None:
    """Write RAW_DIR/_index.json summarising every chunk."""
    index_chunks = []
    for chunk in all_chunks:
        bbox = _bbox_of(chunk)
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "body_paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": {key: round(bbox[key], 3) for key in ("x", "y", "w", "h")},
            # content_en may be null (e.g. image chunks); never slice None.
            "preview": _text_of(chunk, "content_en")[:80]
        })

    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_chunks
    }

    index_path = RAW_DIR / "_index.json"
    index_path.write_text(json.dumps(index_data, ensure_ascii=False, indent=2), encoding="utf-8")


def _build_document(all_chunks: list, all_page_results: dict, total_pages: int,
                    build_at: str, ufo_anomaly_chunks: list,
                    cryptid_anomaly_chunks: list) -> str:
    """Assemble the full text of document.md from the numbered chunks."""
    type_hist = {}
    for chunk in all_chunks:
        t = chunk.get("type", "unknown")
        type_hist[t] = type_hist.get(t, 0) + 1

    doc_lines = []
    doc_lines.append(f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {total_pages}
total_chunks: {len(all_chunks)}
chunk_types_histogram: {json.dumps(type_hist)}
multi_page_tables: []
ufo_anomalies_flagged: {json.dumps(ufo_anomaly_chunks)}
cryptid_anomalies_flagged: {json.dumps(cryptid_anomaly_chunks)}
build_approach: "subagents"
build_model: "claude-sonnet-4-6"
build_at: "{build_at}"
---
""")

    current_page = None
    for chunk in all_chunks:
        page = chunk.get("page")
        if page != current_page:
            # First chunk of a new page: emit the page heading once.
            current_page = page
            png_fn = all_page_results.get(page, {}).get("png_filename", f"p-{page:03d}.png")
            doc_lines.append(f"\n## Page {page} (source: {png_fn})\n")

        chunk_id = chunk["chunk_id"]
        ctype = chunk.get("type", "body_paragraph")
        bbox = _bbox_of(chunk)
        bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

        doc_lines.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
        doc_lines.append(f'<a id="{chunk_id}"></a>')
        doc_lines.append(f"### Chunk {chunk_id} — {ctype} · p{page} · bbox: {bbox_str}\n")

        doc_lines.append(f"**EN:** {_text_of(chunk, 'content_en')}\n")
        doc_lines.append(f"**PT-BR:** {_text_of(chunk, 'content_pt_br')}\n")

        # Embed the cropped image when one was produced for this chunk.
        if ctype == "image" and chunk.get("related_image"):
            img_file = chunk["related_image"]
            doc_lines.append(f"![{chunk_id} image](./images/{img_file})\n")
            if chunk.get("image_description_en"):
                doc_lines.append(f"*Image description: {chunk['image_description_en']}*\n")

        # Collapsible metadata block carrying the raw values from the model.
        meta = {
            "chunk_id": chunk_id,
            "type": ctype,
            "page": page,
            "order_in_page": chunk.get("order_in_page"),
            "order_global": chunk.get("order_global"),
            "bbox": chunk.get("bbox"),
            "classification": chunk.get("classification"),
            "formatting": chunk.get("formatting", []),
            "cross_page_hint": chunk.get("cross_page_hint"),
            "prev_chunk": chunk.get("prev_chunk"),
            "next_chunk": chunk.get("next_chunk"),
            "related_image": chunk.get("related_image"),
            "related_table": chunk.get("related_table"),
            "ocr_confidence": chunk.get("ocr_confidence"),
            "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
            "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
            "ufo_anomaly_rationale": chunk.get("ufo_anomaly_rationale"),
            "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
            "cryptid_anomaly_type": chunk.get("cryptid_anomaly_type"),
            "cryptid_anomaly_rationale": chunk.get("cryptid_anomaly_rationale"),
            "image_description_en": chunk.get("image_description_en"),
            "image_description_pt_br": chunk.get("image_description_pt_br"),
            "source_png": chunk.get("source_png")
        }

        doc_lines.append("<details><summary>metadata</summary>\n")
        doc_lines.append("```json")
        doc_lines.append(json.dumps(meta, ensure_ascii=False, indent=2))
        doc_lines.append("```\n")
        doc_lines.append("</details>\n")
        doc_lines.append("---\n")

    return "\n".join(doc_lines)


def main():
    """Rebuild the document: process pages, then write chunks, index and document.md.

    Steps: run every ``p-*.png`` in ``PNG_DIR`` through process_page in
    parallel batches of 5, number the resulting chunks globally, crop image
    chunks into ``IMAGES_DIR``, write one Markdown file per chunk into
    ``CHUNKS_DIR``, then write ``_index.json`` and ``document.md`` into
    ``RAW_DIR`` and print summary statistics.

    Null or malformed ``content_en``, ``content_pt_br``, ``bbox`` and
    ``order_in_page`` values coming back from the model are tolerated instead
    of aborting the run part-way through the output files.
    """
    start_time = time.time()

    # Page order is the lexicographic order of the PNG file names.
    png_files = sorted(f.name for f in PNG_DIR.glob("p-*.png"))
    total_pages = len(png_files)
    print(f"Processing {total_pages} pages for {DOC_ID}")

    all_page_results = _collect_page_results(png_files, batch_size=5)

    print("\nNumbering chunks globally...")
    all_chunks = _number_chunks(all_page_results)

    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"\nFound {len(image_chunks)} image chunks")

    print("Cropping image regions...")
    _crop_images(image_chunks, all_page_results)

    # Every chunk carries both link fields, null when not applicable.
    for chunk in all_chunks:
        chunk.setdefault("related_image", None)
        chunk.setdefault("related_table", None)

    print("\nWriting chunk files...")
    for chunk in all_chunks:
        chunk_path = CHUNKS_DIR / f"{chunk['chunk_id']}.md"
        chunk_path.write_text(_render_chunk_file(chunk), encoding="utf-8")
    print(f"  Wrote {len(all_chunks)} chunk files")

    print("\nBuilding _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    _write_index(all_chunks, total_pages, build_at)
    print(f"  Wrote _index.json with {len(all_chunks)} chunks")

    ufo_anomaly_chunks = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_anomaly_chunks = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]

    print("\nAssembling document.md...")
    doc_content = _build_document(
        all_chunks, all_page_results, total_pages, build_at,
        ufo_anomaly_chunks, cryptid_anomaly_chunks,
    )
    doc_path = RAW_DIR / "document.md"
    doc_path.write_text(doc_content, encoding="utf-8")

    wall_seconds = int(time.time() - start_time)
    doc_bytes = len(doc_content.encode("utf-8"))

    print("\nDone!")
    print(f"  Chunks: {len(all_chunks)}")
    print(f"  Images: {len(image_chunks)}")
    print(f"  UFO anomalies: {len(ufo_anomaly_chunks)}")
    print(f"  Cryptid anomalies: {len(cryptid_anomaly_chunks)}")
    print(f"  document.md: {doc_bytes} bytes")
    print(f"  Wall time: {wall_seconds}s")
    print(f"\nSTATS pages={total_pages} chunks={len(all_chunks)} images={len(image_chunks)} tables=0 ufo={len(ufo_anomaly_chunks)} cryptid={len(cryptid_anomaly_chunks)} doc_md_bytes={doc_bytes}")

if __name__ == "__main__":
    main()