disclosure-bureau/scripts/rebuild_doc_section3.py

592 lines
24 KiB
Python

#!/usr/bin/env python3
"""
Rebuild script for doc-65-hs1-834228961-62-hq-83894-section-3
Processes all 155 pages in parallel batches, generates chunks, images, and index.
"""
import os
import json
import base64
import time
import concurrent.futures
from datetime import datetime, timezone
from pathlib import Path
import anthropic
# Identifier of the document being rebuilt; it is also the per-document
# directory name appended to each of the PNG, OCR and output roots below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-3"
# Human-readable title; interpolated into the page prompt and into the
# `canonical_title` field of document.md.
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 3 — FBI Flying Discs Investigation File"
# Number of pages to process; pages are numbered 1..TOTAL_PAGES.
TOTAL_PAGES = 155
# Input page images, named p-000.png, p-001.png, ... (zero-indexed).
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
# OCR directory for this document. Not referenced anywhere else in the
# visible part of this file.
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
# Output root: receives _index.json, document.md and the sub-directories below.
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"  # one <chunk_id>.md file per chunk
IMAGES_DIR = OUT_DIR / "images"  # IMG-<chunk_id>.png crops of image chunks
TABLES_DIR = OUT_DIR / "tables"  # created by main(); nothing visible here writes into it
# API client constructed at import time and shared by every process_page call,
# including the ones running on worker threads.
# NOTE(review): no credentials are passed here — presumably the SDK picks them
# up from the environment; confirm before running on a new machine.
client = anthropic.Anthropic()
# Chunk types the model is allowed to emit. This list is not referenced by the
# visible code; rule 2 of PAGE_REBUILDER_PROMPT repeats the same names verbatim,
# so keep the two in sync when editing either.
CHUNK_TYPES = [
"letterhead", "header", "classification_banner", "subject_line",
"salutation", "body_paragraph", "signature_block", "handwritten_note",
"stamp", "redaction_block", "image", "table_marker", "footer",
"page_number", "attachment_label", "routing_slip", "blank",
"caption", "list_item", "address_block"
]
# Prompt template sent alongside each page image by process_page().
# It is filled with str.format() using the fields doc_title, page_number,
# total_pages and page_png_path; the braces of the JSON example are doubled
# ({{ }}) so that format() emits them literally.
# The chunk-type list in rule 2 duplicates CHUNK_TYPES — keep them in sync.
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder agent for a declassified FBI UAP/UFO document archive.
Your task: Analyze the provided page image and extract ALL content into structured chunks.
Document: {doc_title}
Page: {page_number} of {total_pages}
Page PNG path: {page_png_path}
Return a JSON object with this exact structure:
{{
"page_number": {page_number},
"classification": "<classification string found on page or null>",
"page_type": "<blank|text|image|mixed|cover>",
"chunks": [
{{
"order_in_page": 1,
"type": "<chunk_type>",
"content_en": "<English text content or description>",
"content_pt_br": "<Brazilian Portuguese translation/description>",
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
"classification": "<classification string or null>",
"formatting": [],
"cross_page_hint": "self_contained",
"ocr_confidence": 0.9,
"ocr_source_lines": [],
"redaction_code": null,
"redaction_inferred_content_type": null,
"image_type": null,
"ufo_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_detected": false,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null
}}
]
}}
RULES:
1. Extract ALL content — no chunk can be skipped.
2. Use ONLY these chunk types: letterhead, header, classification_banner, subject_line, salutation, body_paragraph, signature_block, handwritten_note, stamp, redaction_block, image, table_marker, footer, page_number, attachment_label, routing_slip, blank, caption, list_item, address_block
3. bbox values are normalized 0.0-1.0 (x=left, y=top, w=width, h=height of the page).
4. content_en: verbatim transcription for text, description for images.
5. content_pt_br: Brazilian Portuguese translation of content_en (NOT European Portuguese). For verbatim text blocks, provide both the original (verbatim) and a translation note.
6. For redacted blocks: set type="redaction_block", content_en="[REDACTED]", set redaction_code if visible (e.g., "(b)(1)", "(b)(6)"), redaction_inferred_content_type with your best inference.
7. For images/photos: type="image", image_type = one of: photograph|sketch|diagram|map|chart|logo|signature|stamp|other
8. For tables: type="table_marker"
9. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
10. UAP/cryptid anomaly detection: flag any anomalous visual content (UFO shapes, unusual aerial phenomena, cryptid-related imagery).
11. If page is blank or nearly blank: create ONE chunk type="blank".
12. classification_banner chunks at top/bottom of page for classification markings.
13. stamps: type="stamp" for rubber stamps, file numbers, dates stamped on documents.
14. Return ONLY valid JSON, no other text.
IMPORTANT: Be thorough. A typical text page has 5-15 chunks. A photo page may have 2-3 chunks. Cover/envelope pages have 4-8 chunks.
"""
def encode_image_b64(path: Path) -> str:
    """Return the contents of the file at *path* as standard base64 text."""
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def _placeholder_page(page_num: int, page_type: str, chunk_type: str,
                      content_en: str, content_pt_br: str, bbox: dict) -> dict:
    """Return a one-chunk page record standing in for a page that could not be extracted.

    The single chunk carries every key of the prompt's chunk schema with
    neutral values and an ``ocr_confidence`` of 0.0.
    """
    return {
        "page_number": page_num,
        "classification": None,
        "page_type": page_type,
        "chunks": [{
            "order_in_page": 1,
            "type": chunk_type,
            "content_en": content_en,
            "content_pt_br": content_pt_br,
            "bbox": bbox,
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
        }],
    }


def _parse_page_reply(response) -> dict:
    """Decode a model reply into a page dict, raising ValueError if it is unusable."""
    # Concatenate every text block instead of assuming content[0] exists and is text.
    text = "".join(
        block.text
        for block in response.content
        if getattr(block, "type", None) == "text"
    ).strip()
    # Keep only the outermost {...}: this drops markdown code fences as well as
    # any commentary the model put before or after the JSON object.
    first, last = text.find("{"), text.rfind("}")
    if first != -1 and last > first:
        text = text[first:last + 1]
    data = json.loads(text)  # json.JSONDecodeError is a ValueError subclass
    if not isinstance(data, dict) or not isinstance(data.get("chunks"), list):
        raise ValueError("reply is not a JSON object with a 'chunks' list")
    return data


def process_page(page_num: int) -> dict:
    """Extract the chunks of one page by sending its PNG to the model.

    Args:
        page_num: 1-based page number; page N is read from ``p-<N-1>.png``
            (zero-padded to three digits) under ``PNG_DIR``.

    Returns:
        A page dict with ``page_number``, ``classification``, ``page_type``
        and ``chunks``. When the PNG is missing, or when every attempt ends
        in an API error or an unusable reply, a one-chunk placeholder page
        describing the failure is returned instead; API and parse failures
        are not raised to the caller.
    """
    # PNG files are p-000.png through p-154.png (zero-indexed)
    png_index = page_num - 1  # page 1 = p-000.png
    png_path = PNG_DIR / f"p-{png_index:03d}.png"
    if not png_path.exists():
        print(f" WARNING: PNG not found for page {page_num}: {png_path}")
        return _placeholder_page(
            page_num, "blank", "blank",
            "[Page image not found]",
            "[Imagem da página não encontrada]",
            {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
        )

    img_b64 = encode_image_b64(png_path)
    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_num,
        total_pages=TOTAL_PAGES,
        page_png_path=str(png_path),
    )
    error_bbox = {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90}
    max_retries = 3
    last_failure = "parse"
    for attempt in range(max_retries):
        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64,
                            },
                        },
                        {"type": "text", "text": prompt},
                    ],
                }],
            )
        except anthropic.APIError as e:
            print(f" Page {page_num} attempt {attempt+1}: API error: {e}")
            last_failure = "api"
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # exponential backoff: 1s, then 2s
            continue

        try:
            data = _parse_page_reply(response)
        except ValueError as e:
            # Malformed replies are retried immediately; a fresh sample may parse.
            print(f" Page {page_num} attempt {attempt+1}: JSON parse error: {e}")
            last_failure = "parse"
            continue

        data["page_number"] = page_num  # ensure correct
        return data

    # Every attempt failed: the kind of the final failure picks the placeholder text.
    if last_failure == "api":
        return _placeholder_page(
            page_num, "text", "body_paragraph",
            f"[Page {page_num} — API error]",
            f"[Página {page_num} — erro de API]",
            error_bbox,
        )
    return _placeholder_page(
        page_num, "text", "body_paragraph",
        f"[Page {page_num} — parse error, content not extracted]",
        f"[Página {page_num} — erro de análise, conteúdo não extraído]",
        error_bbox,
    )
def crop_image(chunk_id: str, png_path: Path, bbox: dict):
    """Crop a region from the page PNG and save it into the images dir.

    Args:
        chunk_id: Chunk identifier; the crop is saved as ``IMG-<chunk_id>.png``
            under ``IMAGES_DIR``.
        png_path: Path of the full-page PNG to crop from.
        bbox: Mapping with ``x``, ``y``, ``w``, ``h`` as fractions of the page
            size. Missing keys, or a ``None`` bbox, fall back to the full page.

    Returns:
        The path of the written crop, or ``None`` when cropping failed. The
        failure is printed and never raised: cropping is best-effort so that
        one bad bbox cannot abort the whole rebuild.
    """
    try:
        from PIL import Image

        box = bbox or {}
        x = box.get("x", 0)
        y = box.get("y", 0)
        w = box.get("w", 1)
        h = box.get("h", 1)
        pad = 0.005  # small margin so the crop does not clip the region's edge

        # The context manager closes the underlying file; without it every
        # image chunk leaves a file handle open until garbage collection.
        with Image.open(png_path) as im:
            width, height = im.size
            left = max(0, int((x - pad) * width))
            top = max(0, int((y - pad) * height))
            right = min(width, int((x + w + pad) * width))
            bottom = min(height, int((y + h + pad) * height))
            if right <= left or bottom <= top:
                # Report a degenerate box explicitly instead of relying on
                # whatever error PIL produces for an empty crop.
                raise ValueError(f"empty crop region for bbox {bbox}")
            out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
            # Save while the source file is still open.
            im.crop((left, top, right, bottom)).save(out_path)
        return out_path
    except Exception as e:
        print(f" Crop error for {chunk_id}: {e}")
        return None
def write_chunk_file(chunk_data: dict, chunk_id: str, page_num: int,
                     order_global: int, prev_chunk, next_chunk,
                     has_image: bool) -> None:
    """Write a single chunk markdown file to ``CHUNKS_DIR/<chunk_id>.md``.

    The file is a ``---``-delimited front-matter block followed by the
    English and Brazilian-Portuguese content lines.

    Args:
        chunk_data: Chunk dict as returned by the model (prompt schema).
        chunk_id: Global chunk identifier; also the output file stem.
        page_num: 1-based page number the chunk belongs to.
        order_global: 1-based position of the chunk in the whole document.
        prev_chunk: Id of the preceding chunk, or ``None`` for the first.
        next_chunk: Id of the following chunk, or ``None`` for the last.
        has_image: When true, ``related_image`` is ``IMG-<chunk_id>.png``;
            otherwise it is written as ``null``.
    """
    # The model may return "bbox": null or leave fields out / non-numeric;
    # fall back to the full-page value per field rather than crashing on :.3f.
    raw_bbox = chunk_data.get("bbox")
    if not isinstance(raw_bbox, dict):
        raw_bbox = {}

    def coord(key: str, default: float) -> float:
        try:
            return float(raw_bbox.get(key, default))
        except (TypeError, ValueError):
            return float(default)

    bx, by, bw, bh = coord("x", 0), coord("y", 0), coord("w", 1), coord("h", 1)

    png_index = page_num - 1
    source_png = f"../../processing/png/{DOC_ID}/p-{png_index:03d}.png"
    related_image = f"IMG-{chunk_id}.png" if has_image else None
    ufo = chunk_data.get("ufo_anomaly_detected", False)
    cryptid = chunk_data.get("cryptid_anomaly_detected", False)
    # A JSON null would otherwise be rendered as the literal text "None".
    content_en = chunk_data.get("content_en") or ""
    content_pt_br = chunk_data.get("content_pt_br") or ""
    frontmatter = f"""---
chunk_id: {chunk_id}
type: {chunk_data.get("type", "body_paragraph")}
page: {page_num}
order_in_page: {chunk_data.get("order_in_page", 1)}
order_global: {order_global}
bbox: {{x: {bx:.3f}, y: {by:.3f}, w: {bw:.3f}, h: {bh:.3f}}}
classification: {json.dumps(chunk_data.get("classification"))}
formatting: {json.dumps(chunk_data.get("formatting", []))}
cross_page_hint: {chunk_data.get("cross_page_hint", "self_contained")}
prev_chunk: {json.dumps(prev_chunk)}
next_chunk: {json.dumps(next_chunk)}
related_image: {json.dumps(related_image)}
related_table: {json.dumps(chunk_data.get("related_table"))}
ocr_confidence: {chunk_data.get("ocr_confidence", 0.9)}
ocr_source_lines: {json.dumps(chunk_data.get("ocr_source_lines", []))}
redaction_code: {json.dumps(chunk_data.get("redaction_code"))}
redaction_inferred_content_type: {json.dumps(chunk_data.get("redaction_inferred_content_type"))}
image_type: {json.dumps(chunk_data.get("image_type"))}
ufo_anomaly_detected: {str(ufo).lower()}
ufo_anomaly_type: {json.dumps(chunk_data.get("ufo_anomaly_type"))}
ufo_anomaly_rationale: {json.dumps(chunk_data.get("ufo_anomaly_rationale"))}
cryptid_anomaly_detected: {str(cryptid).lower()}
cryptid_anomaly_type: {json.dumps(chunk_data.get("cryptid_anomaly_type"))}
cryptid_anomaly_rationale: {json.dumps(chunk_data.get("cryptid_anomaly_rationale"))}
image_description_en: {json.dumps(chunk_data.get("image_description_en"))}
image_description_pt_br: {json.dumps(chunk_data.get("image_description_pt_br"))}
extracted_text: {json.dumps(chunk_data.get("extracted_text"))}
source_png: {source_png}
---
**EN:** {content_en}
**PT-BR:** {content_pt_br}
"""
    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    out_path.write_text(frontmatter, encoding="utf-8")
def _failed_page(page_num: int, error: Exception) -> dict:
    """Return the one-chunk placeholder page recorded when a page worker raised."""
    return {
        "page_number": page_num,
        "classification": None,
        "page_type": "text",
        "chunks": [{
            "order_in_page": 1,
            "type": "body_paragraph",
            "content_en": f"[Page {page_num} — processing failed: {error}]",
            "content_pt_br": f"[Página {page_num} — processamento falhou: {error}]",
            "bbox": {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
        }],
    }


def _extract_all_pages() -> dict:
    """Run process_page for every page on a 4-thread pool; return page_num -> page dict."""
    all_pages = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        future_to_page = {
            executor.submit(process_page, page_num): page_num
            for page_num in range(1, TOTAL_PAGES + 1)
        }
        completed = 0
        for future in concurrent.futures.as_completed(future_to_page):
            page_num = future_to_page[future]
            try:
                all_pages[page_num] = future.result()
            except Exception as e:
                # Boundary catch: one failing page must not abort the whole run.
                print(f" Page {page_num} failed: {e}")
                all_pages[page_num] = _failed_page(page_num, e)
                continue
            completed += 1
            if completed % 10 == 0:
                print(f" Completed {completed}/{TOTAL_PAGES} pages...")
    return all_pages


def _bbox_of(chunk: dict) -> dict:
    """Return the chunk's bbox with missing or non-numeric fields set to the full-page default."""
    raw = chunk.get("bbox")
    if not isinstance(raw, dict):
        raw = {}
    bbox = {}
    for key, default in (("x", 0), ("y", 0), ("w", 1), ("h", 1)):
        value = raw.get(key)
        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
        bbox[key] = value if is_number else default
    return bbox


def _order_key(chunk: dict) -> float:
    """Sort key on order_in_page; missing or non-numeric values sort as 0."""
    try:
        return float(chunk.get("order_in_page", 0))
    except (TypeError, ValueError):
        return 0.0


def _collect_chunks(all_pages: dict) -> list:
    """Flatten pages into (chunk_id, page_num, chunk) tuples in reading order.

    Each chunk dict also receives a sanitised ``bbox`` and the private keys
    ``_chunk_id``, ``_prev``, ``_next`` and ``_order_global``.
    """
    all_chunks = []
    for page_num in range(1, TOTAL_PAGES + 1):
        page_data = all_pages.get(page_num)
        raw_chunks = page_data.get("chunks") if isinstance(page_data, dict) else None
        # Model output is untrusted: tolerate a null/non-list "chunks" and
        # non-dict entries instead of crashing after all API work is done.
        chunks = [c for c in raw_chunks if isinstance(c, dict)] if isinstance(raw_chunks, list) else []
        chunks.sort(key=_order_key)
        for chunk in chunks:
            chunk["bbox"] = _bbox_of(chunk)
            all_chunks.append((f"c{len(all_chunks) + 1:04d}", page_num, chunk))

    last_index = len(all_chunks) - 1
    for i, (chunk_id, _page_num, chunk) in enumerate(all_chunks):
        chunk["_chunk_id"] = chunk_id
        chunk["_prev"] = all_chunks[i - 1][0] if i > 0 else None
        chunk["_next"] = all_chunks[i + 1][0] if i < last_index else None
        chunk["_order_global"] = i + 1
    return all_chunks


def _write_index(all_chunks: list, build_at: str) -> None:
    """Write _index.json: document-level metadata plus one summary entry per chunk."""
    index_chunks = []
    for chunk_id, page_num, chunk in all_chunks:
        content_en = chunk.get("content_en", "")
        index_chunks.append({
            "chunk_id": chunk_id,
            "type": chunk.get("type", "body_paragraph"),
            "page": page_num,
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["_order_global"],
            "file": f"chunks/{chunk_id}.md",
            "bbox": chunk["bbox"],
            "preview": content_en[:80] if content_en else "",
        })
    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": index_chunks,
    }
    index_path = OUT_DIR / "_index.json"
    index_path.write_text(json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8")


def _render_document(all_chunks: list, chunk_types: dict, ufo_anomalies: list,
                     cryptid_anomalies: list, build_at: str, cropped_ids: set) -> str:
    """Return the full text of document.md: front matter, then one section per chunk."""
    doc_lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {len(all_chunks)}",
        "chunk_types_histogram:",
    ]
    for t, count in sorted(chunk_types.items()):
        doc_lines.append(f" {t}: {count}")
    doc_lines.append("multi_page_tables: []")
    doc_lines.append(f"ufo_anomalies_flagged: {json.dumps(ufo_anomalies)}")
    doc_lines.append(f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomalies)}")
    doc_lines.append('build_approach: "subagents"')
    doc_lines.append("build_model: claude-haiku-4-5")
    doc_lines.append(f"build_at: {build_at}")
    doc_lines.append("---")
    doc_lines.append("")

    current_page = 0
    for chunk_id, page_num, chunk in all_chunks:
        if page_num != current_page:
            current_page = page_num
            doc_lines.append(f"## Page {page_num}")
            doc_lines.append("")
        chunk_type = chunk.get("type", "body_paragraph")
        bbox = chunk["bbox"]
        bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"
        doc_lines.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
        doc_lines.append(f'<a id="{chunk_id}"></a>')
        # Id and type are separated like the other heading fields; they were
        # previously fused together ("Chunk c0001body_paragraph").
        doc_lines.append(f"### Chunk {chunk_id} · {chunk_type} · p{page_num} · bbox: {bbox_str}")
        doc_lines.append("")
        # A JSON null would otherwise be rendered as the literal text "None".
        doc_lines.append(f"**EN:** {chunk.get('content_en') or ''}")
        doc_lines.append("")
        doc_lines.append(f"**PT-BR:** {chunk.get('content_pt_br') or ''}")
        doc_lines.append("")
        if chunk_type == "image":
            # Link the crop only when it was actually written to disk.
            if chunk_id in cropped_ids:
                doc_lines.append(f"![chunk image](./images/IMG-{chunk_id}.png)")
            desc_en = chunk.get("image_description_en", "")
            desc_pt = chunk.get("image_description_pt_br", "")
            if desc_en:
                doc_lines.append(f"*{desc_en}*")
            if desc_pt:
                doc_lines.append(f"*{desc_pt}*")
            doc_lines.append("")
        # Build metadata JSON for details block
        meta = {
            "chunk_id": chunk_id,
            "type": chunk_type,
            "page": page_num,
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["_order_global"],
            "bbox": bbox,
            "classification": chunk.get("classification"),
            "formatting": chunk.get("formatting", []),
            "cross_page_hint": chunk.get("cross_page_hint", "self_contained"),
            "prev_chunk": chunk["_prev"],
            "next_chunk": chunk["_next"],
            "ocr_confidence": chunk.get("ocr_confidence", 0.9),
            "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
            "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
            "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
            "cryptid_anomaly_type": chunk.get("cryptid_anomaly_type"),
            "redaction_code": chunk.get("redaction_code"),
            "image_type": chunk.get("image_type"),
        }
        doc_lines.append("<details><summary>metadata</summary>")
        doc_lines.append("")
        doc_lines.append("```json")
        doc_lines.append(json.dumps(meta, ensure_ascii=False, indent=2))
        doc_lines.append("```")
        doc_lines.append("")
        doc_lines.append("</details>")
        doc_lines.append("")
        doc_lines.append("---")
        doc_lines.append("")
    return "\n".join(doc_lines)


def main():
    """Rebuild the whole document and return a dict of run statistics.

    Steps: extract every page in parallel, assign global chunk ids and
    prev/next links, crop image chunks, write one markdown file per chunk,
    write ``_index.json``, then write ``document.md``.

    Returns:
        Dict with ``pages``, ``chunks``, ``images``, ``tables`` (always 0),
        ``ufo``, ``cryptid``, ``wall_seconds`` and ``doc_md_bytes``.
    """
    start_time = time.time()
    print(f"Starting rebuild of {DOC_ID}")
    print(f"Processing {TOTAL_PAGES} pages with 4 parallel workers...")
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # Step 1: Process all pages in parallel
    all_pages = _extract_all_pages()
    print("All pages processed. Assigning global chunk IDs...")

    # Step 2: Assign global chunk IDs and prev/next pointers
    all_chunks = _collect_chunks(all_pages)
    total_chunks = len(all_chunks)
    print(f"Total chunks: {total_chunks}")

    # Step 3: Crop images for image-type chunks, remembering which crops succeeded
    print("Cropping images for image chunks...")
    image_chunks = [(cid, pnum, c) for cid, pnum, c in all_chunks if c.get("type") == "image"]
    print(f" Found {len(image_chunks)} image chunks")
    cropped_ids = set()
    for chunk_id, page_num, chunk in image_chunks:
        png_path = PNG_DIR / f"p-{page_num - 1:03d}.png"
        if crop_image(chunk_id, png_path, chunk["bbox"]) is not None:
            cropped_ids.add(chunk_id)

    # Step 4: Write chunk files; related_image is set only for crops that exist
    print("Writing chunk files...")
    for chunk_id, page_num, chunk in all_chunks:
        write_chunk_file(
            chunk, chunk_id, page_num,
            chunk["_order_global"],
            chunk["_prev"],
            chunk["_next"],
            chunk_id in cropped_ids,
        )

    # Step 5: Write _index.json
    print("Writing _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    _write_index(all_chunks, build_at)

    # Step 6: Compute stats
    chunk_types = {}
    ufo_anomalies = []
    cryptid_anomalies = []
    images_count = 0
    for chunk_id, _page_num, chunk in all_chunks:
        t = chunk.get("type", "body_paragraph")
        chunk_types[t] = chunk_types.get(t, 0) + 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_anomalies.append(chunk_id)
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_anomalies.append(chunk_id)
        if t == "image":
            images_count += 1

    # Step 7: Write document.md
    print("Writing document.md...")
    doc_content = _render_document(
        all_chunks, chunk_types, ufo_anomalies, cryptid_anomalies, build_at, cropped_ids
    )
    doc_path = OUT_DIR / "document.md"
    doc_path.write_text(doc_content, encoding="utf-8")

    wall_seconds = int(time.time() - start_time)
    doc_md_bytes = len(doc_content.encode("utf-8"))
    print("\nDone!")
    print(f"STATS pages={TOTAL_PAGES} chunks={total_chunks} images={images_count} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_md_bytes}")
    print(f"Wall time: {wall_seconds}s")
    return {
        "pages": TOTAL_PAGES,
        "chunks": total_chunks,
        "images": images_count,
        "tables": 0,
        "ufo": len(ufo_anomalies),
        "cryptid": len(cryptid_anomalies),
        "wall_seconds": wall_seconds,
        "doc_md_bytes": doc_md_bytes,
    }
if __name__ == "__main__":
main()