disclosure-bureau/scripts/rebuild_doc_d48.py

597 lines
24 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuilds dow-uap-d48-report-september-1996 into harness-assemblable structure.
Processes all 146 pages with vision + OCR, generates chunks, images, index, and document.md
"""
import os
import json
import base64
import re
import csv
import time
import concurrent.futures
from datetime import datetime, timezone
from pathlib import Path
from PIL import Image
import anthropic
# Identifier (also the directory name) and title of the document being rebuilt.
DOC_ID = "dow-uap-d48-report-september-1996"
DOC_TITLE = "Modeling Unlikely Space-Booster Failures in Risk Calculations"

# Inputs: per-page PNG renders and per-page OCR text files.
BASE_PNG = f"/Users/guto/ufo/processing/png/{DOC_ID}"
BASE_OCR = f"/Users/guto/ufo/processing/ocr/{DOC_ID}"

# Outputs: everything this script writes lives under OUT_DIR.
OUT_DIR = f"/Users/guto/ufo/raw/{DOC_ID}"
CHUNKS_DIR = f"{OUT_DIR}/chunks"
IMAGES_DIR = f"{OUT_DIR}/images"
TABLES_DIR = f"{OUT_DIR}/tables"

# Create the output sub-directories up front (this also creates OUT_DIR).
for _out_subdir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    os.makedirs(_out_subdir, exist_ok=True)
del _out_subdir

# Shared API client used by process_page.
client = anthropic.Anthropic()
# All page numbers that have PNGs.
# The file numbering is not contiguous: pages 0-63, then 100-181.
PNG_PAGES = [*range(0, 64), *range(100, 182)]
TOTAL_PAGES = len(PNG_PAGES)
# Prompt template sent to the model together with each page image.
# process_page fills it with str.format(), supplying doc_title, page_num,
# seq_pos, total_pages and ocr_text. Literal JSON braces are doubled
# ({{ / }}) so that format() emits them as single braces.
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder agent. Analyze the provided page image and OCR text from a declassified technical document and extract all content as structured chunks.
Document: "{doc_title}"
Page number (file): {page_num} (sequential position {seq_pos} of {total_pages})
OCR text:
```
{ocr_text}
```
Return a JSON object with this exact structure:
{{
"page_number": {page_num},
"seq_position": {seq_pos},
"chunks": [
{{
"order_in_page": 1,
"type": "<type>",
"content_en": "<english content>",
"content_pt_br": "<portuguese BR translation>",
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
"classification": null,
"formatting": [],
"cross_page_hint": "self_contained",
"ocr_confidence": 0.9,
"ocr_source_lines": [],
"redaction_code": null,
"redaction_inferred_content_type": null,
"image_type": null,
"ufo_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_detected": false,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null,
"image_description_en": null,
"image_description_pt_br": null,
"extracted_text": null
}}
]
}}
Chunk types (use ONLY these):
- letterhead: institution/org header at top of page
- classification_banner: classification marking (SECRET, TOP SECRET, UNCLASSIFIED, etc.)
- title: document or section title
- subtitle: subtitle or sub-heading
- heading: section heading (numbered or unnumbered)
- subheading: subsection heading
- paragraph: body text paragraph
- list_item: bullet or numbered list item
- table_marker: a table (include table data in content_en as pipe-delimited markdown table)
- figure_caption: caption for a figure or chart
- image: a photograph, diagram, chart, graph, or illustration
- footer: footer text (page numbers, dates, etc.)
- header: running header
- signature_block: signature area
- redaction: redacted/blacked-out area
- page_number: standalone page number
- toc_entry: table of contents entry
- abstract: abstract section
- reference: bibliography/reference entry
- form_field: form field label and value
- metadata_block: document metadata block (e.g., Report Documentation Page)
- appendix_marker: appendix label/header
- blank: intentionally blank area
Rules:
1. Every visible content region becomes a chunk — do not skip anything.
2. For tables: include the full table as markdown pipe-delimited format in content_en.
3. For images/figures: set type=image, describe what you see in image_description_en and image_description_pt_br. Set extracted_text if the image contains text.
4. bbox coordinates: x,y = top-left corner (0-1 normalized), w,h = width/height (0-1 normalized).
5. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
6. content_pt_br: full Brazilian Portuguese translation of content_en (NOT European Portuguese).
7. formatting: array of applicable: ["bold", "italic", "all_caps", "underline", "centered", "right_aligned"]
8. classification: null for unclassified content, or the exact marking string if present.
9. ocr_confidence: estimate 0.0-1.0 based on OCR quality.
10. For the abstract, use type=abstract.
11. For TOC entries, each line is a separate toc_entry chunk.
12. For figure captions, use type=figure_caption.
13. ufo_anomaly_detected: true only if content describes UAP/UFO phenomenon (this is a space booster report, very unlikely).
14. Return ONLY valid JSON, no markdown fences, no explanation text.
"""
def read_ocr(page_num):
    """Return the OCR text saved for ``page_num``, or ``""`` if there is no OCR file."""
    ocr_file = f"{BASE_OCR}/p-{page_num:03d}.txt"
    if not os.path.exists(ocr_file):
        return ""
    # errors="replace" keeps undecodable bytes from aborting the read.
    with open(ocr_file, "r", encoding="utf-8", errors="replace") as handle:
        text = handle.read()
    return text
def read_png_b64(page_num):
    """Load the page's PNG file and return its bytes as a base64 string."""
    with open(f"{BASE_PNG}/p-{page_num:03d}.png", "rb") as handle:
        png_bytes = handle.read()
    encoded = base64.standard_b64encode(png_bytes)
    return encoded.decode("utf-8")
def _fallback_page(page_num, seq_pos, content_en, content_pt_br, ocr_confidence):
    """Build the single-paragraph page result returned when extraction fails."""
    return {
        "page_number": page_num,
        "seq_position": seq_pos,
        "chunks": [
            {
                "order_in_page": 1,
                "type": "paragraph",
                "content_en": content_en,
                "content_pt_br": content_pt_br,
                "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
                "classification": None,
                "formatting": [],
                "cross_page_hint": "self_contained",
                "ocr_confidence": ocr_confidence,
                "ocr_source_lines": [],
                "redaction_code": None,
                "redaction_inferred_content_type": None,
                "image_type": None,
                "ufo_anomaly_detected": False,
                "ufo_anomaly_type": None,
                "ufo_anomaly_rationale": None,
                "cryptid_anomaly_detected": False,
                "cryptid_anomaly_type": None,
                "cryptid_anomaly_rationale": None,
                "image_description_en": None,
                "image_description_pt_br": None,
                "extracted_text": None,
            }
        ],
    }


def process_page(page_num, seq_pos):
    """Extract structured chunks for one page from its PNG render and OCR text.

    The page image and up to 4000 characters of OCR text are sent to the
    model, whose reply is parsed as JSON. The call is attempted up to three
    times; non-JSON failures back off exponentially between attempts.

    Args:
        page_num: Page number as used in the ``p-NNN`` file names.
        seq_pos: 1-based position of the page in the processing order.

    Returns:
        A dict with ``page_number``, ``seq_position`` and ``chunks`` (a list
        of chunk dicts). When every attempt fails, a placeholder result with
        a single full-page paragraph chunk is returned instead of raising.

    Raises:
        Whatever ``read_png_b64`` raises when the PNG cannot be read; that
        happens before the retry loop and is not caught here.
    """
    ocr_text = read_ocr(page_num)
    img_b64 = read_png_b64(page_num)
    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_num=page_num,
        seq_pos=seq_pos,
        total_pages=TOTAL_PAGES,
        ocr_text=ocr_text[:4000] if ocr_text else "(no OCR available)",
    )
    max_retries = 3
    for attempt in range(max_retries):
        final_attempt = attempt == max_retries - 1
        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=4096,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/png",
                                    "data": img_b64,
                                },
                            },
                            {
                                "type": "text",
                                "text": prompt,
                            },
                        ],
                    }
                ],
            )
            raw = response.content[0].text.strip()
            # Remove markdown fences if present
            raw = re.sub(r'^```json\s*', '', raw)
            raw = re.sub(r'^```\s*', '', raw)
            raw = re.sub(r'\s*```$', '', raw)
            data = json.loads(raw)
            if not isinstance(data, dict):
                # Handled by the generic branch below, i.e. retried with backoff.
                raise TypeError(f"expected a JSON object, got {type(data).__name__}")
            # Callers index these keys directly, so never rely on the model
            # having echoed them back (or having echoed them correctly).
            data["page_number"] = page_num
            data["seq_position"] = seq_pos
            # Guarantee a list of dicts so downstream sorting cannot crash on
            # a null/absent "chunks" value or on stray non-object entries.
            chunks = data.get("chunks")
            if not isinstance(chunks, list):
                chunks = []
            data["chunks"] = [c for c in chunks if isinstance(c, dict)]
            print(f" [OK] page {page_num:03d} (seq {seq_pos}) -> {len(data.get('chunks', []))} chunks")
            return data
        except json.JSONDecodeError as e:
            print(f" [WARN] page {page_num:03d} JSON parse error (attempt {attempt+1}): {e}")
            if final_attempt:
                # Return minimal fallback: keep the raw OCR text so the page is not lost.
                return _fallback_page(
                    page_num,
                    seq_pos,
                    ocr_text[:2000] if ocr_text else f"[Page {page_num} - content extraction failed]",
                    f"[Página {page_num} - extração de conteúdo falhou]",
                    0.5,
                )
        except Exception as e:
            print(f" [ERROR] page {page_num:03d} (attempt {attempt+1}): {e}")
            if final_attempt:
                return _fallback_page(
                    page_num,
                    seq_pos,
                    f"[Page {page_num} - processing error: {str(e)[:100]}]",
                    f"[Página {page_num} - erro de processamento]",
                    0.0,
                )
            # Exponential backoff (1s, 2s, ...) before the next attempt.
            time.sleep(2 ** attempt)
def process_pages_batch(pages_with_seq):
    """Run ``process_page`` for each ``(page_num, seq_pos)`` pair on a 5-worker thread pool.

    Returns a dict mapping ``seq_pos`` to that page's result. If a worker
    raises, the error is printed and the page is recorded with an empty
    ``chunks`` list.
    """
    outcomes = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        # Remember which page each future belongs to; as_completed yields
        # futures in completion order, not submission order.
        pending = {}
        for page_num, seq_pos in pages_with_seq:
            job = pool.submit(process_page, page_num, seq_pos)
            pending[job] = (page_num, seq_pos)
        for finished in concurrent.futures.as_completed(pending):
            page_num, seq_pos = pending[finished]
            try:
                outcomes[seq_pos] = finished.result()
            except Exception as e:
                print(f" [FATAL] page {page_num}: {e}")
                outcomes[seq_pos] = {
                    "page_number": page_num,
                    "seq_position": seq_pos,
                    "chunks": [],
                }
    return outcomes
def crop_image_chunk(chunk_id, page_num, bbox):
    """Crop the ``bbox`` region out of a page PNG and save it as ``IMG-<chunk_id>.png``.

    Args:
        chunk_id: Chunk identifier used in the output file name.
        page_num: Page number as used in the ``p-NNN`` file names.
        bbox: Dict with ``x``, ``y``, ``w``, ``h`` given as fractions of the
            page size. Missing or null entries default to the full page
            (x=0, y=0, w=1, h=1).

    Returns:
        The path of the written image, or ``None`` if cropping failed. The
        function is best-effort: failures are printed, never raised.
    """
    png_path = f"{BASE_PNG}/p-{page_num:03d}.png"
    out_path = f"{IMAGES_DIR}/IMG-{chunk_id}.png"

    def fraction(key, default):
        # Model output may carry an explicit null; treat it like a missing key.
        value = bbox.get(key)
        return default if value is None else float(value)

    try:
        # The context manager releases the source file handle, which would
        # otherwise stay open for every cropped image.
        with Image.open(png_path) as im:
            W, H = im.size
            x, y = fraction("x", 0), fraction("y", 0)
            w, h = fraction("w", 1), fraction("h", 1)
            # Small margin so the crop does not clip the edge of the region.
            pad = 0.005
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                # Fail with a clear message instead of an opaque imaging error.
                raise ValueError(f"empty crop box ({left}, {top}, {right}, {bottom})")
            im.crop((left, top, right, bottom)).save(out_path)
        return out_path
    except Exception as e:
        # Deliberately broad: a bad bbox or unreadable PNG must not abort the run.
        print(f" [WARN] crop failed for {chunk_id}: {e}")
        return None
def write_chunk_file(chunk_data, page_num):
    """Write one chunk as ``<chunk_id>.md``: YAML front matter plus EN / PT-BR content.

    Args:
        chunk_data: Chunk dict. ``chunk_id`` is required; every other field
            falls back to a default when absent.
        page_num: Page number recorded in the front matter and used to build
            the ``source_png`` reference.

    Raises:
        KeyError: if ``chunk_data`` has no ``chunk_id``.
    """
    chunk_id = chunk_data["chunk_id"]
    chunk_type = chunk_data.get("type", "paragraph")
    order_in_page = chunk_data.get("order_in_page", 1)
    order_global = chunk_data.get("order_global", 1)

    # The bbox comes from model output: it may be absent, null, or hold
    # null/non-numeric fields. None of those may crash the ":.2f" formatting.
    bbox = chunk_data.get("bbox")
    if not isinstance(bbox, dict):
        bbox = {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}

    def bbox_field(key, default):
        try:
            return float(bbox.get(key, default))
        except (TypeError, ValueError):
            return default

    # Determine related_image and related_table
    related_image = f"IMG-{chunk_id}.png" if chunk_type == "image" else None
    related_table = chunk_data.get("related_table", None)
    prev_chunk = chunk_data.get("prev_chunk", None)
    next_chunk = chunk_data.get("next_chunk", None)
    # A null content field would otherwise be written as the literal "None".
    content_en = chunk_data.get("content_en") or ""
    content_pt_br = chunk_data.get("content_pt_br") or ""

    yaml_lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk_type}",
        f"page: {page_num}",
        f"order_in_page: {order_in_page}",
        f"order_global: {order_global}",
        f"bbox: {{x: {bbox_field('x', 0):.2f}, y: {bbox_field('y', 0):.2f}, w: {bbox_field('w', 1):.2f}, h: {bbox_field('h', 0.1):.2f}}}",
        # json.dumps renders None as null and quotes/escapes strings.
        f"classification: {json.dumps(chunk_data.get('classification', None))}",
        f"formatting: {json.dumps(chunk_data.get('formatting', []))}",
        f"cross_page_hint: {chunk_data.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {json.dumps(prev_chunk)}",
        f"next_chunk: {json.dumps(next_chunk)}",
        f"related_image: {json.dumps(related_image)}",
        f"related_table: {json.dumps(related_table)}",
        f"ocr_confidence: {chunk_data.get('ocr_confidence', 0.9)}",
        f"ocr_source_lines: {json.dumps(chunk_data.get('ocr_source_lines', []))}",
        f"redaction_code: {json.dumps(chunk_data.get('redaction_code', None))}",
        f"redaction_inferred_content_type: {json.dumps(chunk_data.get('redaction_inferred_content_type', None))}",
        f"image_type: {json.dumps(chunk_data.get('image_type', None))}",
        f"ufo_anomaly_detected: {str(chunk_data.get('ufo_anomaly_detected', False)).lower()}",
        f"ufo_anomaly_type: {json.dumps(chunk_data.get('ufo_anomaly_type', None))}",
        f"ufo_anomaly_rationale: {json.dumps(chunk_data.get('ufo_anomaly_rationale', None))}",
        f"cryptid_anomaly_detected: {str(chunk_data.get('cryptid_anomaly_detected', False)).lower()}",
        f"cryptid_anomaly_type: {json.dumps(chunk_data.get('cryptid_anomaly_type', None))}",
        f"cryptid_anomaly_rationale: {json.dumps(chunk_data.get('cryptid_anomaly_rationale', None))}",
        f"image_description_en: {json.dumps(chunk_data.get('image_description_en', None))}",
        f"image_description_pt_br: {json.dumps(chunk_data.get('image_description_pt_br', None))}",
        f"extracted_text: {json.dumps(chunk_data.get('extracted_text', None))}",
        f"source_png: ../../processing/png/{DOC_ID}/p-{page_num:03d}.png",
        "---",
    ]
    content = "\n".join(yaml_lines) + "\n\n"
    content += f"**EN:** {content_en}\n\n"
    content += f"**PT-BR:** {content_pt_br}\n"
    out_path = f"{CHUNKS_DIR}/{chunk_id}.md"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(content)
def _numeric_or(value, default):
    """Return ``value`` as a float, or ``default`` when it is null or non-numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _collect_chunks(all_page_results, page_for_seq):
    """Flatten per-page results into one reading-order list and assign global ids."""
    all_chunks = []
    for seq_pos in sorted(all_page_results):
        page_result = all_page_results[seq_pos]
        # Use the known page number for this position rather than the value
        # echoed by the model, which may be missing or wrong.
        page_num = page_for_seq[seq_pos]
        raw_chunks = page_result.get("chunks")
        if not isinstance(raw_chunks, list):
            raw_chunks = []
        page_chunks = [c for c in raw_chunks if isinstance(c, dict)]
        # Sort chunks by order_in_page; a null/non-numeric order sorts first
        # instead of raising.
        page_chunks.sort(key=lambda c: _numeric_or(c.get("order_in_page"), 0))
        for chunk in page_chunks:
            all_chunks.append({
                **chunk,
                "page_number": page_num,
                "seq_position": seq_pos,
            })
    # Assign global chunk IDs and link each chunk to its neighbours.
    last_idx = len(all_chunks) - 1
    for global_idx, chunk in enumerate(all_chunks):
        chunk["chunk_id"] = f"c{global_idx+1:04d}"
        chunk["order_global"] = global_idx + 1
        chunk["prev_chunk"] = f"c{global_idx:04d}" if global_idx > 0 else None
        chunk["next_chunk"] = f"c{global_idx+2:04d}" if global_idx < last_idx else None
    return all_chunks


def _build_index(all_chunks, build_at):
    """Build the dict that is serialized to ``_index.json``."""
    index_chunks = []
    for chunk in all_chunks:
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk["page_number"],
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}),
            # content_en may be an explicit null (e.g. on image chunks);
            # slicing None would abort the whole run.
            "preview": str(chunk.get("content_en") or "")[:80],
        })
    return {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": index_chunks,
    }


def _build_document_md(all_chunks, type_histogram, ufo_anomalies, cryptid_anomalies, build_at):
    """Render the master ``document.md`` (front matter plus one section per page)."""
    doc_lines = []
    doc_lines.append("---")
    doc_lines.append('schema_version: "0.2.0"')
    doc_lines.append("type: master_document")
    doc_lines.append(f"doc_id: {DOC_ID}")
    doc_lines.append(f'canonical_title: "{DOC_TITLE}"')
    doc_lines.append(f"total_pages: {TOTAL_PAGES}")
    doc_lines.append(f"total_chunks: {len(all_chunks)}")
    doc_lines.append("chunk_types_histogram:")
    for t, count in sorted(type_histogram.items()):
        doc_lines.append(f" {t}: {count}")
    doc_lines.append("multi_page_tables: []")
    doc_lines.append(f"ufo_anomalies_flagged: {json.dumps(ufo_anomalies)}")
    doc_lines.append(f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomalies)}")
    doc_lines.append('build_approach: "subagents"')
    doc_lines.append("build_model: claude-haiku-4-5")
    doc_lines.append(f"build_at: {build_at}")
    doc_lines.append("---")
    doc_lines.append("")
    # Group chunks by page
    chunks_by_page = {}
    for chunk in all_chunks:
        chunks_by_page.setdefault(chunk["page_number"], []).append(chunk)
    for page_num in sorted(chunks_by_page):
        doc_lines.append(f"## Page {page_num}")
        doc_lines.append("")
        for chunk in chunks_by_page[page_num]:
            chunk_id = chunk["chunk_id"]
            chunk_type = chunk.get("type", "paragraph")
            bbox = chunk.get("bbox", {})
            # Tolerate a null bbox or null/non-numeric fields in the heading.
            box = bbox if isinstance(bbox, dict) else {}
            bx = _numeric_or(box.get("x", 0), 0)
            by = _numeric_or(box.get("y", 0), 0)
            bw = _numeric_or(box.get("w", 1), 1)
            bh = _numeric_or(box.get("h", 0.1), 0.1)
            doc_lines.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
            doc_lines.append(f'<a id="{chunk_id}"></a>')
            # NOTE(review): the id and type used to be fused ("c0001paragraph").
            # " · " is assumed to be the intended separator because the rest of
            # the heading uses it; confirm against whatever parses this heading.
            doc_lines.append(f"### Chunk {chunk_id} · {chunk_type} · p{page_num} · bbox: {bx:.2f}/{by:.2f}/{bw:.2f}/{bh:.2f}")
            doc_lines.append("")
            # A null content field would otherwise be rendered as "None".
            content_en = chunk.get("content_en") or ""
            content_pt_br = chunk.get("content_pt_br") or ""
            doc_lines.append(f"**EN:** {content_en}")
            doc_lines.append("")
            doc_lines.append(f"**PT-BR:** {content_pt_br}")
            doc_lines.append("")
            # Image embed
            if chunk_type == "image":
                img_path = f"./images/IMG-{chunk_id}.png"
                doc_lines.append(f"![{chunk_id} image]({img_path})")
                doc_lines.append("")
                if chunk.get("image_description_en"):
                    doc_lines.append(f"*Image description:* {chunk['image_description_en']}")
                    doc_lines.append("")
            # Metadata collapsible
            meta = {
                "chunk_id": chunk_id,
                "type": chunk_type,
                "page": page_num,
                "order_in_page": chunk.get("order_in_page", 1),
                "order_global": chunk.get("order_global", 1),
                "bbox": bbox,
                "classification": chunk.get("classification"),
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": chunk.get("cross_page_hint", "self_contained"),
                "prev_chunk": chunk.get("prev_chunk"),
                "next_chunk": chunk.get("next_chunk"),
                "ocr_confidence": chunk.get("ocr_confidence", 0.9),
                "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
                "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
                "image_type": chunk.get("image_type"),
                "image_description_en": chunk.get("image_description_en"),
                "image_description_pt_br": chunk.get("image_description_pt_br"),
            }
            doc_lines.append("<details><summary>metadata</summary>")
            doc_lines.append("")
            doc_lines.append("```json")
            doc_lines.append(json.dumps(meta, ensure_ascii=False, indent=2))
            doc_lines.append("```")
            doc_lines.append("")
            doc_lines.append("</details>")
            doc_lines.append("")
            doc_lines.append("---")
            doc_lines.append("")
    return "\n".join(doc_lines)


def main():
    """Rebuild the document end to end.

    Processes every page in ``PNG_PAGES`` in batches of five, assigns global
    chunk ids, crops image chunks, then writes the per-chunk files,
    ``_index.json`` and ``document.md`` under ``OUT_DIR``.

    Returns:
        A dict of run statistics: ``pages``, ``chunks``, ``images``,
        ``tables``, ``ufo``, ``cryptid``, ``wall_seconds``, ``doc_md_bytes``.
    """
    start_time = time.time()
    print(f"=== Rebuilding {DOC_ID} ===")
    print(f"Total pages to process: {TOTAL_PAGES}")
    # Create page batches (5 at a time); seq_pos is the 1-based processing order.
    pages_with_seq = [(page_num, seq_pos + 1) for seq_pos, page_num in enumerate(PNG_PAGES)]
    page_for_seq = {seq_pos: page_num for page_num, seq_pos in pages_with_seq}
    all_page_results = {}
    batch_size = 5
    for batch_start in range(0, len(pages_with_seq), batch_size):
        batch = pages_with_seq[batch_start:batch_start + batch_size]
        batch_nums = [p[0] for p in batch]
        print(f"\nProcessing batch {batch_start//batch_size + 1}: pages {batch_nums}")
        batch_results = process_pages_batch(batch)
        all_page_results.update(batch_results)
        # Small pause between batches to avoid rate limiting
        if batch_start + batch_size < len(pages_with_seq):
            time.sleep(0.5)
    print(f"\n=== All {TOTAL_PAGES} pages processed ===")

    # Global chunk numbering
    all_chunks = _collect_chunks(all_page_results, page_for_seq)
    print(f"Total chunks: {len(all_chunks)}")

    # Crop images and collect image chunks
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"Image chunks found: {len(image_chunks)}")
    for img_chunk in image_chunks:
        chunk_id = img_chunk["chunk_id"]
        page_num = img_chunk["page_number"]
        bbox = img_chunk.get("bbox") or {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}
        # crop_image_chunk returns None (and prints its own warning) on
        # failure, so only report success when a file was actually written.
        if crop_image_chunk(chunk_id, page_num, bbox) is not None:
            print(f" Cropped image: {chunk_id} from page {page_num}")

    # Write individual chunk files
    print("\nWriting chunk files...")
    for chunk in all_chunks:
        write_chunk_file(chunk, chunk["page_number"])

    # Build _index.json
    print("Writing _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    index_data = _build_index(all_chunks, build_at)
    with open(f"{OUT_DIR}/_index.json", "w", encoding="utf-8") as f:
        json.dump(index_data, f, ensure_ascii=False, indent=2)

    # Build document.md
    print("Building document.md...")
    # Compute histograms and stats
    type_histogram = {}
    ufo_anomalies = []
    cryptid_anomalies = []
    for chunk in all_chunks:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1
        if chunk.get("ufo_anomaly_detected", False):
            ufo_anomalies.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected", False):
            cryptid_anomalies.append(chunk["chunk_id"])
    document_md = _build_document_md(
        all_chunks, type_histogram, ufo_anomalies, cryptid_anomalies, build_at
    )
    with open(f"{OUT_DIR}/document.md", "w", encoding="utf-8") as f:
        f.write(document_md)

    wall_seconds = int(time.time() - start_time)
    doc_md_bytes = len(document_md.encode("utf-8"))
    print("\n=== DONE ===")
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={len(image_chunks)} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_md_bytes}")
    print(f"Wall time: {wall_seconds}s")
    return {
        "pages": TOTAL_PAGES,
        "chunks": len(all_chunks),
        "images": len(image_chunks),
        "tables": 0,
        "ufo": len(ufo_anomalies),
        "cryptid": len(cryptid_anomalies),
        "wall_seconds": wall_seconds,
        "doc_md_bytes": doc_md_bytes,
    }
# Script entry point: main() runs only when this file is executed directly.
if __name__ == "__main__":
    main()