disclosure-bureau/scripts/rebuild_doc_65.py

478 lines
19 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild doc-65-hs1-834228961-62-hq-83894-serial-130
Processes all 91 pages via Claude vision, produces chunks/_index.json/document.md
"""
import os
import sys
import json
import base64
import time
import concurrent.futures
from datetime import datetime, timezone
from pathlib import Path
import anthropic
# --- Document identity -------------------------------------------------------
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-serial-130"
DOC_TITLE = "HQ Air Defense Command - Unidentified Flying Objects Reports (65-HS1-834228961 / 62-HQ-83894 Serial 130)"

# --- Input locations ---------------------------------------------------------
# PNG_DIR holds the page images that main() globs as "p-*.png".
# OCR_DIR is defined here but is not read by the code visible in this script.
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")

# --- Output locations --------------------------------------------------------
# Everything this script writes lives under RAW_DIR.
RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
CHUNKS_DIR = RAW_DIR / "chunks"
IMAGES_DIR = RAW_DIR / "images"
TABLES_DIR = RAW_DIR / "tables"

# Create the output tree up front (this runs at import time, not only in main()).
for _output_dir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

# Shared API client used by process_page() for every request.
client = anthropic.Anthropic()
def encode_image(path: Path) -> str:
    """Return the contents of the file at ``path`` as base64 text.

    The file is read in binary mode, encoded with the standard base64
    alphabet, and the encoded bytes are decoded to ``str``.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    with open(path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.standard_b64encode(raw_bytes)
    return encoded.decode("utf-8")
# Prompt template sent alongside each page image by process_page().
# It is filled in with str.format(), so the single-brace fields {doc_title},
# {doc_id}, {page_number}, {total_pages} and {png_filename} are substituted,
# while every doubled brace ({{ / }}) comes out as a literal brace of the
# example JSON object the model is asked to return.
PAGE_REBUILDER_PROMPT = '''You are a page-rebuilder subagent. Analyze this document page image and extract ALL content as structured chunks.
Document: {doc_title}
Doc ID: {doc_id}
Page number (in sequence): {page_number} of {total_pages}
Source PNG filename: {png_filename}
Return a JSON object with this exact structure:
{{
"page_number": {page_number},
"png_filename": "{png_filename}",
"chunks": [
{{
"order_in_page": 1,
"type": "<type>",
"content_en": "...",
"content_pt_br": "...",
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
"classification": null,
"formatting": [],
"cross_page_hint": "self_contained",
"ocr_confidence": 0.9,
"ocr_source_lines": [],
"redaction_code": null,
"redaction_inferred_content_type": null,
"image_type": null,
"ufo_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_detected": false,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null,
"image_description_en": null,
"image_description_pt_br": null,
"extracted_text": null
}}
]
}}
Allowed chunk types: letterhead, classification_banner, subject_line, body_paragraph, list_item, signature_block, date_line, address_block, header, footer, redaction_block, table_marker, image, stamp, handwritten_note, page_number_marker, blank
Rules:
1. Create ONE chunk per distinct visual/logical unit. Do not merge unrelated blocks.
2. For classification banners (TOP SECRET, SECRET, CONFIDENTIAL, etc.) at top/bottom of page: type=classification_banner, fill classification field.
3. For any image/photo/diagram/map/sketch: type=image, fill image_type, image_description_en, image_description_pt_br, ufo_anomaly_detected, cryptid_anomaly_detected.
4. For redacted/blacked-out areas: type=redaction_block, fill redaction_code if visible.
5. content_en = exact English transcription of text, verbatim. content_pt_br = Brazilian Portuguese translation of content_en (NOT translation of classification banners/stamps/codes — keep those verbatim in both fields).
6. bbox: normalized coordinates (0.0-1.0): x=left, y=top, w=width, h=height relative to page.
7. formatting: array of applicable: bold, italic, underline, all_caps, strikethrough, handwritten.
8. For cross_page_hint: "continues_to_next" if text clearly continues on next page, "continues_from_prev" if it continues from previous page, "self_contained" otherwise.
9. ocr_confidence: your confidence in the transcription (0.0-1.0).
10. If page is blank: return single chunk type=blank.
11. ufo_anomaly_detected: true if the chunk contains or depicts a UAP/UFO, unidentified aerial phenomenon, unknown object in sky, or anomalous craft. Set ufo_anomaly_type and ufo_anomaly_rationale.
12. IMPORTANT: Return ONLY valid JSON, no markdown code blocks, no explanation.'''
def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    if not text.startswith("```"):
        return text
    lines = text.split("\n")
    # The first line is the opening fence (possibly with a language tag).
    lines = lines[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines)


def _error_page(page_index: int, png_filename: str, error: Exception) -> dict:
    """Build the placeholder page result used when a page cannot be processed."""
    reason = str(error)[:100]
    return {
        "page_number": page_index,
        "page_index": page_index,
        "png_filename": png_filename,
        "chunks": [
            {
                "order_in_page": 1,
                "type": "blank",
                "content_en": f"[Page processing error: {reason}]",
                "content_pt_br": f"[Erro de processamento: {reason}]",
                "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
                "classification": None,
                "formatting": [],
                "cross_page_hint": "self_contained",
                "ocr_confidence": 0.0,
                "ocr_source_lines": [],
                "redaction_code": None,
                "redaction_inferred_content_type": None,
                "image_type": None,
                "ufo_anomaly_detected": False,
                "ufo_anomaly_type": None,
                "ufo_anomaly_rationale": None,
                "cryptid_anomaly_detected": False,
                "cryptid_anomaly_type": None,
                "cryptid_anomaly_rationale": None,
                "image_description_en": None,
                "image_description_pt_br": None,
                "extracted_text": None,
            }
        ],
    }


def process_page(page_index: int, png_filename: str, total_pages: int) -> dict:
    """Process a single page image and return its chunks.

    The PNG is base64-encoded and sent with PAGE_REBUILDER_PROMPT to the model;
    the reply is parsed as JSON and tagged with ``page_index`` and
    ``png_filename``.

    Args:
        page_index: 1-based position of the page in the document sequence.
        png_filename: file name of the page image inside PNG_DIR.
        total_pages: total number of pages, interpolated into the prompt.

    Returns:
        A dict with at least ``page_index``, ``png_filename`` and a ``chunks``
        list whose entries are all dicts. This function never raises: on any
        failure (unreadable image, API error, truncated or malformed reply)
        the error is printed to stderr and a placeholder page containing one
        ``blank`` chunk that carries the error text is returned instead.
    """
    png_path = PNG_DIR / png_filename
    max_tokens = 4096
    try:
        img_data = encode_image(png_path)
        prompt = PAGE_REBUILDER_PROMPT.format(
            doc_title=DOC_TITLE,
            doc_id=DOC_ID,
            page_number=page_index,
            total_pages=total_pages,
            png_filename=png_filename,
        )
        response = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=max_tokens,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_data,
                            },
                        },
                        {
                            "type": "text",
                            "text": prompt,
                        },
                    ],
                }
            ],
        )
        # The prompt asks for bare JSON, but tolerate a fenced reply.
        raw_text = _strip_code_fence(response.content[0].text.strip())
        try:
            result = json.loads(raw_text)
        except json.JSONDecodeError as err:
            # A reply cut off at the token limit is incomplete JSON; say so
            # instead of surfacing an opaque decoder message.
            if getattr(response, "stop_reason", None) == "max_tokens":
                raise ValueError(
                    f"response truncated at max_tokens={max_tokens}; JSON is incomplete"
                ) from err
            raise

        # Validate the shape here so callers can rely on it.
        if not isinstance(result, dict):
            raise ValueError(f"expected a JSON object, got {type(result).__name__}")
        chunks = result.get("chunks")
        if not isinstance(chunks, list):
            raise ValueError('response JSON has no "chunks" list')
        valid_chunks = [c for c in chunks if isinstance(c, dict)]
        if len(valid_chunks) != len(chunks):
            print(
                f" WARNING page {page_index} ({png_filename}): dropped "
                f"{len(chunks) - len(valid_chunks)} non-object chunk entries",
                file=sys.stderr,
            )
        result["chunks"] = valid_chunks

        result["page_index"] = page_index
        result["png_filename"] = png_filename
        return result
    except Exception as e:
        # Deliberate best-effort: one bad page must not abort the whole run.
        print(f" ERROR page {page_index} ({png_filename}): {e}", file=sys.stderr)
        return _error_page(page_index, png_filename, e)
def _safe_text(chunk: dict, key: str) -> str:
    """Return ``chunk[key]`` as a string, treating a missing or null value as ""."""
    value = chunk.get(key)
    return "" if value is None else str(value)


def _sanitized_bbox(chunk: dict) -> dict:
    """Return the chunk's bbox with numeric x/y/w/h, defaulting to the full page.

    The bbox comes from model output, so it may be null, not a dict, or hold
    non-numeric members; any such value falls back to x=0, y=0, w=1, h=1.
    """
    raw = chunk.get("bbox")
    if not isinstance(raw, dict):
        raw = {}
    clean = {}
    for key, fallback in (("x", 0), ("y", 0), ("w", 1), ("h", 1)):
        value = raw.get(key, fallback)
        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
        clean[key] = value if is_number else fallback
    return clean


def _order_in_page_key(chunk: dict):
    """Sort key for chunks within a page; non-numeric order values sort as 0."""
    order = chunk.get("order_in_page", 0)
    return order if isinstance(order, (int, float)) else 0


def _process_all_pages(png_files: list, batch_size: int = 5) -> dict:
    """Run process_page() over all pages in parallel batches; map page index -> result."""
    total_pages = len(png_files)
    all_page_results = {}
    for batch_start in range(0, total_pages, batch_size):
        batch = png_files[batch_start:batch_start + batch_size]
        batch_indices = list(range(batch_start + 1, batch_start + len(batch) + 1))
        print(f" Batch {batch_start//batch_size + 1}: pages {batch_indices[0]}-{batch_indices[-1]} ({batch})")
        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = {
                executor.submit(process_page, idx, fname, total_pages): (idx, fname)
                for idx, fname in zip(batch_indices, batch)
            }
            for future in concurrent.futures.as_completed(futures):
                idx, fname = futures[future]
                try:
                    # as_completed() only yields finished futures, so this
                    # call returns (or raises) immediately.
                    result = future.result()
                    all_page_results[idx] = result
                    chunk_count = len(result.get("chunks") or [])
                    print(f" Page {idx} ({fname}): {chunk_count} chunks")
                except Exception as e:
                    print(f" FAILED page {idx} ({fname}): {e}", file=sys.stderr)
    return all_page_results


def _number_chunks(all_page_results: dict) -> list:
    """Flatten page results into one list with global ids, bbox and prev/next links."""
    all_chunks = []
    for page_idx in sorted(all_page_results):
        page_data = all_page_results[page_idx]
        png_filename = page_data.get("png_filename", f"p-{page_idx:03d}.png")
        # Model output is not trusted: tolerate a null chunk list and skip
        # entries that are not JSON objects.
        page_chunks = [c for c in (page_data.get("chunks") or []) if isinstance(c, dict)]
        page_chunks.sort(key=_order_in_page_key)
        for chunk in page_chunks:
            order_global = len(all_chunks) + 1
            chunk["chunk_id"] = f"c{order_global:04d}"
            chunk["page"] = page_idx
            chunk["order_global"] = order_global
            chunk["source_png"] = f"../../processing/png/{DOC_ID}/{png_filename}"
            # Normalise once so every later consumer can format it safely.
            chunk["bbox"] = _sanitized_bbox(chunk)
            all_chunks.append(chunk)
    last = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last else None
    return all_chunks


def _crop_image_chunks(image_chunks: list, all_page_results: dict) -> None:
    """Crop each image chunk's bbox out of its page PNG into IMAGES_DIR (best effort)."""
    pad = 0.005  # small margin around the reported bbox, as a page fraction
    for chunk in image_chunks:
        chunk_id = chunk["chunk_id"]
        page_idx = chunk["page"]
        png_filename = all_page_results[page_idx].get("png_filename", f"p-{page_idx:03d}.png")
        png_path = PNG_DIR / png_filename
        bbox = chunk["bbox"]
        out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
        try:
            # Imported here so a missing Pillow only disables cropping.
            from PIL import Image
            # Context manager closes the page image's file handle after use.
            with Image.open(png_path) as im:
                W, H = im.size
                x, y, w, h = bbox["x"], bbox["y"], bbox["w"], bbox["h"]
                crop = im.crop((
                    max(0, int((x - pad) * W)),
                    max(0, int((y - pad) * H)),
                    min(W, int((x + w + pad) * W)),
                    min(H, int((y + h + pad) * H)),
                ))
                crop.save(str(out_path))
            chunk["related_image"] = f"IMG-{chunk_id}.png"
            print(f" Cropped {chunk_id} from {png_filename}")
        except Exception as e:
            print(f" CROP ERROR {chunk_id}: {e}", file=sys.stderr)
            chunk["related_image"] = None


def _chunk_file_text(chunk: dict) -> str:
    """Render one chunk as a front-matter block followed by its EN / PT-BR text."""
    bbox = chunk["bbox"]
    lines = [
        "---",
        f"chunk_id: {chunk['chunk_id']}",
        f"type: {chunk.get('type', 'body_paragraph')}",
        f"page: {chunk.get('page', 1)}",
        f"order_in_page: {chunk.get('order_in_page', 1)}",
        f"order_global: {chunk.get('order_global', 1)}",
        f"bbox: {{x: {bbox['x']:.3f}, y: {bbox['y']:.3f}, w: {bbox['w']:.3f}, h: {bbox['h']:.3f}}}",
        f"classification: {json.dumps(chunk.get('classification'))}",
        f"formatting: {json.dumps(chunk.get('formatting', []))}",
        f"cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {json.dumps(chunk.get('prev_chunk'))}",
        f"next_chunk: {json.dumps(chunk.get('next_chunk'))}",
        f"related_image: {json.dumps(chunk.get('related_image'))}",
        f"related_table: {json.dumps(chunk.get('related_table'))}",
        # json.dumps so a null confidence is written as `null`, not `None`.
        f"ocr_confidence: {json.dumps(chunk.get('ocr_confidence', 0.9))}",
        f"ocr_source_lines: {json.dumps(chunk.get('ocr_source_lines', []))}",
        f"redaction_code: {json.dumps(chunk.get('redaction_code'))}",
        f"redaction_inferred_content_type: {json.dumps(chunk.get('redaction_inferred_content_type'))}",
        f"image_type: {json.dumps(chunk.get('image_type'))}",
        f"ufo_anomaly_detected: {str(chunk.get('ufo_anomaly_detected', False)).lower()}",
        f"cryptid_anomaly_detected: {str(chunk.get('cryptid_anomaly_detected', False)).lower()}",
        f"ufo_anomaly_type: {json.dumps(chunk.get('ufo_anomaly_type'))}",
        f"ufo_anomaly_rationale: {json.dumps(chunk.get('ufo_anomaly_rationale'))}",
        f"cryptid_anomaly_type: {json.dumps(chunk.get('cryptid_anomaly_type'))}",
        f"cryptid_anomaly_rationale: {json.dumps(chunk.get('cryptid_anomaly_rationale'))}",
        f"image_description_en: {json.dumps(chunk.get('image_description_en'))}",
        f"image_description_pt_br: {json.dumps(chunk.get('image_description_pt_br'))}",
        f"extracted_text: {json.dumps(chunk.get('extracted_text'))}",
        f"source_png: {chunk.get('source_png', '')}",
        "---",
        f"**EN:** {_safe_text(chunk, 'content_en')}",
        f"**PT-BR:** {_safe_text(chunk, 'content_pt_br')}",
    ]
    return "\n".join(lines) + "\n"


def _index_entry(chunk: dict) -> dict:
    """Build the compact _index.json record for one chunk."""
    bbox = chunk["bbox"]
    return {
        "chunk_id": chunk["chunk_id"],
        "type": chunk.get("type", "body_paragraph"),
        "page": chunk.get("page", 1),
        "order_in_page": chunk.get("order_in_page", 1),
        "order_global": chunk.get("order_global", 1),
        "file": f"chunks/{chunk['chunk_id']}.md",
        "bbox": {key: round(bbox[key], 3) for key in ("x", "y", "w", "h")},
        # Null content (e.g. on image chunks) must not break the slice.
        "preview": _safe_text(chunk, "content_en")[:80],
    }


def _document_markdown(all_chunks: list, all_page_results: dict, total_pages: int,
                       build_at: str, type_hist: dict,
                       ufo_anomaly_chunks: list, cryptid_anomaly_chunks: list) -> str:
    """Assemble the full text of document.md: front matter plus one section per chunk."""
    header = "\n".join([
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {total_pages}",
        f"total_chunks: {len(all_chunks)}",
        f"chunk_types_histogram: {json.dumps(type_hist)}",
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_anomaly_chunks)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomaly_chunks)}",
        'build_approach: "subagents"',
        'build_model: "claude-sonnet-4-6"',
        f'build_at: "{build_at}"',
        "---",
        "",
    ])
    doc_lines = [header]
    current_page = None
    for chunk in all_chunks:
        page = chunk.get("page")
        if page != current_page:
            current_page = page
            png_fn = all_page_results.get(page, {}).get("png_filename", f"p-{page:03d}.png")
            doc_lines.append(f"\n## Page {page} (source: {png_fn})\n")
        chunk_id = chunk["chunk_id"]
        ctype = chunk.get("type", "body_paragraph")
        bbox = chunk["bbox"]
        bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"
        doc_lines.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
        doc_lines.append(f'<a id="{chunk_id}"></a>')
        # Separator between id and type was missing, so headings ran together.
        doc_lines.append(f"### Chunk {chunk_id} · {ctype} · p{page} · bbox: {bbox_str}\n")
        doc_lines.append(f"**EN:** {_safe_text(chunk, 'content_en')}\n")
        doc_lines.append(f"**PT-BR:** {_safe_text(chunk, 'content_pt_br')}\n")
        # Embed the cropped image if one was produced for this chunk.
        if ctype == "image" and chunk.get("related_image"):
            img_file = chunk["related_image"]
            doc_lines.append(f"![{chunk_id} image](./images/{img_file})\n")
            if chunk.get("image_description_en"):
                doc_lines.append(f"*Image description: {chunk['image_description_en']}*\n")
        meta = {
            "chunk_id": chunk_id,
            "type": ctype,
            "page": page,
            "order_in_page": chunk.get("order_in_page"),
            "order_global": chunk.get("order_global"),
            "bbox": chunk.get("bbox"),
            "classification": chunk.get("classification"),
            "formatting": chunk.get("formatting", []),
            "cross_page_hint": chunk.get("cross_page_hint"),
            "prev_chunk": chunk.get("prev_chunk"),
            "next_chunk": chunk.get("next_chunk"),
            "related_image": chunk.get("related_image"),
            "related_table": chunk.get("related_table"),
            "ocr_confidence": chunk.get("ocr_confidence"),
            "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
            "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
            "ufo_anomaly_rationale": chunk.get("ufo_anomaly_rationale"),
            "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
            "cryptid_anomaly_type": chunk.get("cryptid_anomaly_type"),
            "cryptid_anomaly_rationale": chunk.get("cryptid_anomaly_rationale"),
            "image_description_en": chunk.get("image_description_en"),
            "image_description_pt_br": chunk.get("image_description_pt_br"),
            "source_png": chunk.get("source_png"),
        }
        doc_lines.append("<details><summary>metadata</summary>\n")
        doc_lines.append("```json")
        doc_lines.append(json.dumps(meta, ensure_ascii=False, indent=2))
        doc_lines.append("```\n")
        doc_lines.append("</details>\n")
        doc_lines.append("---\n")
    return "\n".join(doc_lines)


def main():
    """Rebuild the document: process every page PNG, then write all outputs.

    Steps: list ``p-*.png`` in PNG_DIR, run process_page() on each in parallel
    batches, number the chunks globally, crop image chunks, then write one
    ``chunks/<id>.md`` per chunk, ``_index.json`` and ``document.md`` under
    RAW_DIR, and print summary statistics.

    Exits with status 1, before writing anything, when no page PNG is found,
    so an earlier build is not overwritten by an empty one.
    """
    start_time = time.time()
    png_files = sorted(f.name for f in PNG_DIR.glob("p-*.png"))
    total_pages = len(png_files)
    print(f"Processing {total_pages} pages for {DOC_ID}")
    if not png_files:
        print(f"No page images matching p-*.png in {PNG_DIR}; nothing to rebuild.", file=sys.stderr)
        sys.exit(1)

    all_page_results = _process_all_pages(png_files, batch_size=5)
    missing_pages = [idx for idx in range(1, total_pages + 1) if idx not in all_page_results]
    if missing_pages:
        print(f" WARNING: no result for pages {missing_pages}; they will be absent from the output", file=sys.stderr)

    print("\nNumbering chunks globally...")
    all_chunks = _number_chunks(all_page_results)

    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"\nFound {len(image_chunks)} image chunks")
    print("Cropping image regions...")
    _crop_image_chunks(image_chunks, all_page_results)
    # Every chunk carries both relation fields, null when not applicable.
    for chunk in all_chunks:
        chunk.setdefault("related_image", None)
        chunk.setdefault("related_table", None)

    print("\nWriting chunk files...")
    for chunk in all_chunks:
        chunk_path = CHUNKS_DIR / f"{chunk['chunk_id']}.md"
        chunk_path.write_text(_chunk_file_text(chunk), encoding="utf-8")
    print(f" Wrote {len(all_chunks)} chunk files")

    print("\nBuilding _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": [_index_entry(chunk) for chunk in all_chunks],
    }
    index_path = RAW_DIR / "_index.json"
    index_path.write_text(json.dumps(index_data, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f" Wrote _index.json with {len(all_chunks)} chunks")

    # Histogram of chunk types, in first-seen order.
    type_hist = {}
    for chunk in all_chunks:
        t = chunk.get("type", "unknown")
        type_hist[t] = type_hist.get(t, 0) + 1
    ufo_anomaly_chunks = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_anomaly_chunks = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]

    print("\nAssembling document.md...")
    doc_content = _document_markdown(
        all_chunks, all_page_results, total_pages, build_at,
        type_hist, ufo_anomaly_chunks, cryptid_anomaly_chunks,
    )
    doc_path = RAW_DIR / "document.md"
    doc_path.write_text(doc_content, encoding="utf-8")

    wall_seconds = int(time.time() - start_time)
    doc_bytes = len(doc_content.encode("utf-8"))
    print("\nDone!")
    print(f" Chunks: {len(all_chunks)}")
    print(f" Images: {len(image_chunks)}")
    print(f" UFO anomalies: {len(ufo_anomaly_chunks)}")
    print(f" Cryptid anomalies: {len(cryptid_anomaly_chunks)}")
    print(f" document.md: {doc_bytes} bytes")
    print(f" Wall time: {wall_seconds}s")
    print(f"\nSTATS pages={total_pages} chunks={len(all_chunks)} images={len(image_chunks)} tables=0 ufo={len(ufo_anomaly_chunks)} cryptid={len(cryptid_anomaly_chunks)} doc_md_bytes={doc_bytes}")
# Entry point: run the rebuild only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()