disclosure-bureau/scripts/rebuild_d49.py

512 lines
18 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild DOW-UAP-D49 launch summary into harness-assemblable structure.
Processes all 78 PNG pages, writes chunks, _index.json, and document.md.
"""
import os
import sys
import json
import re
import base64
import datetime
import time
from pathlib import Path
from PIL import Image
import anthropic
# Identifier of the document being rebuilt; it is also the directory name
# used under each of the processing/output roots below.
DOC_ID = "dow-uap-d49-launch-summary-february-2000"
# Human-readable title emitted as `canonical_title` in document.md.
# The year range needs its separator: "19582000" read as a single number.
DOC_TITLE = "Vandenberg AFB Launch Summary 1958–2000"

# NOTE(review): absolute, machine-specific paths — adjust before running
# this script on another machine.
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

# All PNG pages, sorted by file name. The directory is listed at import
# time, so importing this module fails if PNG_DIR does not exist.
PNG_PAGES = sorted(f for f in os.listdir(PNG_DIR) if f.endswith(".png"))
TOTAL_PAGES = len(PNG_PAGES)

# NOTE(review): no code visible in this file uses `client`; confirm that
# nothing else relies on it before removing.
client = anthropic.Anthropic()
def read_ocr(page_stem: str) -> str:
    """Return the OCR text for a page stem such as ``p-001``.

    The text is read from ``<page_stem>.txt`` under ``OCR_DIR`` as UTF-8,
    with undecodable bytes replaced. An empty string is returned when the
    file does not exist.
    """
    candidate = OCR_DIR / f"{page_stem}.txt"
    if not candidate.exists():
        return ""
    return candidate.read_text(encoding="utf-8", errors="replace")
def encode_image_b64(path: str) -> str:
    """Return the contents of the file at *path* as a base64 text string."""
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.standard_b64encode(raw_bytes)
    return encoded.decode("utf-8")
def classify_page(ocr_text: str, page_num: int) -> str:
    """Guess the page type from its OCR text and 1-based page number.

    The checks run in a fixed priority order and the first match wins:
    cover (page 1), distribution, foreword, glossary, summary_table,
    facility_guide, chronology, toc; anything else is "body". Matching is
    done on the stripped, lower-cased text.
    """
    text = ocr_text.strip().lower()

    def mentions(*needles):
        # True when at least one of the needles occurs in the page text.
        return any(needle in text for needle in needles)

    if page_num == 1:
        label = "cover"
    elif mentions("distribution list"):
        label = "distribution"
    elif mentions("foreword", "preface"):
        label = "foreword"
    elif mentions("glossary") and len(text) < 2000:
        label = "glossary"
    elif mentions("annual launch summary") and mentions(
        "chart", "launch vehicle", "launch agency"
    ):
        label = "summary_table"
    elif mentions("launch facility guide"):
        label = "facility_guide"
    elif (
        # Column header row of a chronology table ...
        re.search(r'\bseq\b.*\bdate\b.*\bnickname\b', text, re.IGNORECASE)
        # ... or a numbered entry followed by a "<day> <word> <2 digits>" date.
        or re.search(r'\d+\.\s+\d{1,2}\s+\w+\s+\d{2}', text)
    ):
        label = "chronology"
    elif mentions("table of contents") or "contents" in text.split("\n")[0]:
        label = "toc"
    else:
        label = "body"
    return label
def determine_chunk_type(content: str, page_type: str) -> str:
    """Map a page type (and, for generic pages, its content) to a chunk type.

    Returns "letterhead" for cover pages, "table_marker" for summary-table
    and chronology pages, and "body_text" for the other recognised page
    types. For any other page type the content decides: at most three
    lines, all upper-case, is a "section_header"; otherwise "body_text".
    """
    if page_type == "cover":
        return "letterhead"
    if page_type in ("summary_table", "chronology"):
        return "table_marker"
    if page_type in ("glossary", "foreword", "distribution", "facility_guide", "toc"):
        return "body_text"

    # Generic page: a very short, all-caps block is treated as a heading.
    stripped = content.strip()
    if len(stripped.split("\n")) <= 3 and stripped.isupper():
        return "section_header"
    return "body_text"
def _make_chunk(chunk_type, page_type, raw, english, portuguese, order,
                bbox, source_png, confidence, source_lines, formatting=None):
    """Assemble one chunk dict; *bbox* is an (x, y, w, h) tuple."""
    x, y, w, h = bbox
    chunk = {
        "type": chunk_type,
        "page_type": page_type,
        "content_raw": raw,
        "content_en": english,
        "content_pt_br": portuguese,
        "order_in_page": order,
        "bbox": {"x": x, "y": y, "w": w, "h": h},
        "source_png": source_png,
        "ocr_confidence": confidence,
        "ocr_source_lines": source_lines,
    }
    if formatting is not None:
        chunk["formatting"] = formatting
    return chunk


def _split_heading_body(lines: list) -> tuple:
    """Split page lines into (heading_lines, body_lines).

    Leading lines that are short (< 80 chars) and upper-case, or made only
    of capitals/digits/basic punctuation, form the heading (stored
    stripped). The first line that does not qualify ends the heading;
    from there on every line goes to the body unchanged. A blank line that
    ends the heading is dropped.
    """
    heading_lines = []
    body_lines = []
    in_heading = True
    for line in lines:
        if not in_heading:
            body_lines.append(line)
            continue
        stripped = line.strip()
        if not stripped and not heading_lines:
            continue  # blank lines before the first heading line
        looks_like_heading = (
            stripped
            and len(stripped) < 80
            and (stripped.isupper() or re.match(r'^[A-Z\s\-\./:,0-9]+$', stripped))
        )
        if looks_like_heading:
            heading_lines.append(stripped)
        else:
            in_heading = False
            if stripped:
                body_lines.append(line)
    return heading_lines, body_lines


def build_page_chunks(page_num: int, page_stem: str, ocr_text: str) -> list:
    """Build the list of chunk dicts for a single page from its OCR text.

    Args:
        page_num: 1-based page number, passed to ``classify_page``.
        page_stem: PNG/OCR file stem, used to build the ``source_png`` path.
        ocr_text: OCR text of the page; may be empty.

    Returns:
        One "image" chunk when there is no OCR text, one "letterhead" chunk
        for a cover page, a "section_header" chunk plus a body chunk when a
        heading and a body can both be identified, or otherwise a single
        chunk covering the whole page. Bounding boxes and OCR confidences
        are fixed constants, not measured values.
    """
    page_type = classify_page(ocr_text, page_num)
    source_png = f"../../processing/png/{DOC_ID}/{page_stem}.png"
    text = ocr_text.strip()

    if not text:
        # Image-only page. Only a page classified as the cover may be
        # described as the cover image; any other page with no OCR text
        # gets a neutral placeholder instead of a wrong label.
        if page_type == "cover":
            english = "[Cover image — Vandenberg AFB Launch Summary 1958–2000]"
            portuguese = "[Imagem da capa — Resumo de Lançamentos da Base Aérea Vandenberg 1958–2000]"
        else:
            english = "[Image-only page — no OCR text available]"
            portuguese = "[Página somente com imagem — sem texto OCR disponível]"
        return [_make_chunk("image", page_type, "", english, portuguese, 1,
                            (0.0, 0.0, 1.0, 1.0), source_png, 0.0, [])]

    lines = text.split("\n")
    all_line_numbers = list(range(1, len(lines) + 1))

    if page_type == "cover":
        # The cover is a single chunk made of all non-blank lines.
        content = "\n".join(line.strip() for line in lines if line.strip())
        return [_make_chunk("letterhead", page_type, content, content,
                            translate_to_ptbr_simple(content, page_type), 1,
                            (0.05, 0.1, 0.9, 0.8), source_png, 0.92,
                            all_line_numbers)]

    heading_lines, body_lines = _split_heading_body(lines)

    if heading_lines and body_lines:
        heading_content = "\n".join(heading_lines)
        body_content = "\n".join(body_lines)
        is_table = page_type in ("summary_table", "chronology")
        body_type = "table_marker" if is_table else "body_text"
        return [
            _make_chunk("section_header", page_type, heading_content,
                        heading_content,
                        translate_to_ptbr_simple(heading_content, "section_header"),
                        1, (0.05, 0.02, 0.9, 0.12), source_png, 0.93,
                        list(range(1, len(heading_lines) + 1)),
                        formatting=["bold", "all_caps"]),
            _make_chunk(body_type, page_type, body_content, body_content,
                        translate_to_ptbr_simple(body_content, page_type),
                        2, (0.02, 0.14, 0.98, 0.84), source_png, 0.88,
                        list(range(len(heading_lines) + 1, len(lines) + 1))),
        ]

    # No separable heading/body: the whole page becomes one chunk.
    # determine_chunk_type already yields "table_marker" for table pages.
    content = "\n".join(lines)
    return [_make_chunk(determine_chunk_type(content, page_type), page_type,
                        content, content,
                        translate_to_ptbr_simple(content, page_type), 1,
                        (0.02, 0.02, 0.96, 0.96), source_png, 0.88,
                        all_line_numbers)]
# English phrase -> PT-BR rendering for the headers/labels this script
# knows how to annotate. Matching is case-sensitive.
_PTBR_PHRASES = {
    "FOREWORD": "PREFÁCIO",
    "GLOSSARY": "GLOSSÁRIO",
    "DISTRIBUTION LIST": "LISTA DE DISTRIBUIÇÃO",
    "TABLE OF CONTENTS": "SUMÁRIO",
    "ANNUAL LAUNCH SUMMARY BY BOOSTER": "RESUMO ANUAL DE LANÇAMENTOS POR VEÍCULO",
    "ANNUAL LAUNCH SUMMARY BY COMMAND": "RESUMO ANUAL DE LANÇAMENTOS POR COMANDO",
    "LAUNCH FACILITY GUIDE": "GUIA DE INSTALAÇÕES DE LANÇAMENTO",
    "Office of History": "Escritório de História",
    "As of": "Em",
    "LAUNCH": "LANÇAMENTO",
    "VEHICLE": "VEÍCULO",
    "COMMAND": "COMANDO",
    "PROGRAM": "PROGRAMA",
    "SPACE": "ESPAÇO",
    "TOTAL": "TOTAL",
    "SUBTOTAL": "SUBTOTAL",
    "Grand Total": "Total Geral",
    "GRAND TOTAL": "TOTAL GERAL",
}

# A single alternation, longest phrase first, so that a full phrase such as
# "GRAND TOTAL" wins over the shorter "TOTAL", and every stretch of input is
# matched at most once. The word boundaries stop "SPACE" from matching
# inside "AEROSPACE" and "TOTAL" inside "SUBTOTAL".
_PTBR_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(
        re.escape(phrase)
        for phrase in sorted(_PTBR_PHRASES, key=len, reverse=True)
    )
    + r")\b"
)


def translate_to_ptbr_simple(text: str, context: str) -> str:
    """Annotate known English headers/labels in *text* with PT-BR renderings.

    Text whose *context* is "summary_table", "chronology" or "table_marker"
    is returned unchanged, so tabular data stays verbatim. Otherwise each
    known phrase found as a whole word is rewritten as "<EN> / <PT-BR>".
    Phrases whose rendering equals the English are left as they are.

    The substitution is a single pass over the original text, so inserted
    translations are never themselves re-translated.
    """
    if context in ("summary_table", "chronology", "table_marker"):
        return text

    def annotate(match):
        english = match.group(0)
        portuguese = _PTBR_PHRASES[english]
        if portuguese == english:
            return english  # "TOTAL / TOTAL" would only add noise
        return f"{english} / {portuguese}"

    return _PTBR_PATTERN.sub(annotate, text)
def fmt_chunk_id(n: int) -> str:
    """Return the chunk id for sequence number *n*: "c" + n zero-padded to 4 digits."""
    return "c{:04d}".format(n)
def write_chunk_file(chunk_id: str, chunk: dict, page_num: int) -> None:
    """Write one chunk as ``<chunk_id>.md`` under ``CHUNKS_DIR``.

    The file consists of a YAML-style front matter block followed by the
    EN and PT-BR content.

    Args:
        chunk_id: Id used for the file name and the ``chunk_id`` field.
        chunk: Chunk dict. "type", "order_in_page", "order_global",
            "source_png", "content_en" and "content_pt_br" are required;
            the remaining keys are optional and fall back to defaults.
        page_num: Page number written to the ``page`` field.
    """

    def quote(value) -> str:
        # Double-quoted scalar; backslashes and quotes must be escaped or
        # the front matter stops being parseable.
        escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    def yaml_val(v):
        if v is None or v == "null":
            return "null"
        if isinstance(v, bool):
            return str(v).lower()
        if isinstance(v, (int, float)):
            return str(v)
        return quote(v)

    def chunk_ref(v):
        # Neighbour ids are written unquoted; a missing neighbour (None or
        # the "null" marker) becomes a YAML null rather than "None".
        return "null" if v is None or v == "null" else v

    path = CHUNKS_DIR / (chunk_id + ".md")
    prev_chunk = chunk_ref(chunk.get("prev_chunk", "null"))
    next_chunk = chunk_ref(chunk.get("next_chunk", "null"))
    related_image = yaml_val(chunk.get("related_image", "null"))
    related_table = yaml_val(chunk.get("related_table", "null"))
    # Any falsy image_type (missing, None, "") is written as null.
    image_type = chunk.get("image_type", "null")
    image_type_str = yaml_val(image_type) if image_type else "null"

    bbox = chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0})
    fmt_str = "[" + ", ".join(quote(item) for item in chunk.get("formatting", [])) + "]"

    ocr_lines = chunk.get("ocr_source_lines", [])
    if len(ocr_lines) > 10:
        # Long runs are abbreviated to first two entries, an ellipsis, and the last.
        ocr_lines_str = f'[{ocr_lines[0]}, {ocr_lines[1]}, "...", {ocr_lines[-1]}]'
    else:
        ocr_lines_str = "[" + ", ".join(str(line_no) for line_no in ocr_lines) + "]"

    content = f"""---
chunk_id: {chunk_id}
type: {chunk["type"]}
page: {page_num}
order_in_page: {chunk["order_in_page"]}
order_global: {chunk["order_global"]}
bbox: {{x: {bbox["x"]:.2f}, y: {bbox["y"]:.2f}, w: {bbox["w"]:.2f}, h: {bbox["h"]:.2f}}}
classification: null
formatting: {fmt_str}
cross_page_hint: self_contained
prev_chunk: {prev_chunk}
next_chunk: {next_chunk}
related_image: {related_image}
related_table: {related_table}
ocr_confidence: {chunk.get("ocr_confidence", 0.88):.2f}
ocr_source_lines: {ocr_lines_str}
redaction_code: null
redaction_inferred_content_type: null
image_type: {image_type_str}
ufo_anomaly_detected: false
cryptid_anomaly_detected: false
ufo_anomaly_type: null
ufo_anomaly_rationale: null
cryptid_anomaly_type: null
cryptid_anomaly_rationale: null
image_description_en: null
image_description_pt_br: null
extracted_text: null
source_png: {chunk["source_png"]}
---
**EN:** {chunk["content_en"]}
**PT-BR:** {chunk["content_pt_br"]}
"""
    path.write_text(content, encoding="utf-8")
def main():
    """Rebuild the document: chunk files, ``_index.json`` and ``document.md``.

    Steps: create the output directories, build chunks for every PNG page
    from its OCR text, number the chunks globally and link each to its
    neighbours, write one file per chunk, write the index, then write the
    master document and print a summary line.
    """
    start_time = time.time()

    # Ensure output dirs exist
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    print(f"Processing {TOTAL_PAGES} pages...")
    all_pages_chunks = []  # list of (page_num, page_stem, [chunks])
    for page_num, png_file in enumerate(PNG_PAGES, start=1):
        # PNG_PAGES only holds names ending in ".png"; strip exactly that
        # suffix (str.replace would also remove ".png" elsewhere in the name).
        page_stem = png_file[: -len(".png")]
        ocr_text = read_ocr(page_stem)
        chunks = build_page_chunks(page_num, page_stem, ocr_text)
        all_pages_chunks.append((page_num, page_stem, chunks))
        print(f" Page {page_num:3d}/{TOTAL_PAGES} ({page_stem}): {len(chunks)} chunk(s)")

    # Globally number chunks
    all_chunks_flat = []  # list of (chunk_id, page_num, chunk_dict)
    for page_num, page_stem, chunks in all_pages_chunks:
        for chunk in chunks:
            global_order = len(all_chunks_flat) + 1
            chunk_id = fmt_chunk_id(global_order)
            chunk["chunk_id"] = chunk_id
            chunk["order_global"] = global_order
            chunk["page_num"] = page_num
            chunk["page_stem"] = page_stem
            all_chunks_flat.append((chunk_id, page_num, chunk))
    total_chunks = len(all_chunks_flat)
    print(f"Total chunks: {total_chunks}")

    # Set prev/next pointers ("null" marks the ends of the chain)
    for i, (_, _, chunk) in enumerate(all_chunks_flat):
        chunk["prev_chunk"] = all_chunks_flat[i - 1][0] if i > 0 else "null"
        chunk["next_chunk"] = all_chunks_flat[i + 1][0] if i < total_chunks - 1 else "null"

    print("Writing chunk files...")
    for chunk_id, page_num, chunk in all_chunks_flat:
        write_chunk_file(chunk_id, chunk, page_num)

    print("Writing _index.json...")
    index_chunks = [
        {
            "chunk_id": chunk_id,
            "type": chunk["type"],
            "page": page_num,
            "order_in_page": chunk["order_in_page"],
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk_id}.md",
            "bbox": chunk["bbox"],
            # First 80 characters of the English content, on one line.
            "preview": chunk["content_en"][:80].replace("\n", " ").strip(),
        }
        for chunk_id, page_num, chunk in all_chunks_flat
    ]

    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so the text stays "<naive ISO timestamp>Z" exactly as before.
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    build_at = now_utc.replace(tzinfo=None).isoformat() + "Z"

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # Count chunk types
    type_histogram = {}
    for _, _, chunk in all_chunks_flat:
        chunk_type = chunk["type"]
        type_histogram[chunk_type] = type_histogram.get(chunk_type, 0) + 1

    # Count image chunks
    n_images = sum(1 for _, _, chunk in all_chunks_flat if chunk["type"] == "image")

    print("Writing document.md...")
    build_document_md(all_chunks_flat, all_pages_chunks, type_histogram, build_at, n_images)

    elapsed = int(time.time() - start_time)
    print(f"\nDone in {elapsed}s")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={n_images}, tables_stitched=0, ufo_anomalies=0, cryptid_anomalies=0, wall_seconds={elapsed}")
def build_document_md(all_chunks_flat, all_pages_chunks, type_histogram, build_at, n_images):
    """Build the master ``document.md`` and write it under ``OUT_DIR``.

    Args:
        all_chunks_flat: List of ``(chunk_id, page_num, chunk_dict)`` tuples.
        all_pages_chunks: Accepted for the caller's signature; not used here.
        type_histogram: Mapping of chunk type to count, emitted (sorted by
            type) in the front matter.
        build_at: Build timestamp string emitted in the front matter.
        n_images: Accepted for the caller's signature; not used here.
    """
    total_chunks = len(all_chunks_flat)
    histogram_yaml = "\n".join(f" {k}: {v}" for k, v in sorted(type_histogram.items()))
    header = f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {TOTAL_PAGES}
total_chunks: {total_chunks}
chunk_types_histogram:
{histogram_yaml}
multi_page_tables: []
ufo_anomalies_flagged: []
cryptid_anomalies_flagged: []
build_approach: "subagents"
build_model: claude-sonnet-4-6
build_at: {build_at}
---
"""

    # Group chunks by page
    pages_dict = {}
    for chunk_id, page_num, chunk in all_chunks_flat:
        pages_dict.setdefault(page_num, []).append((chunk_id, chunk))

    body_parts = [header]
    for page_num in sorted(pages_dict):
        body_parts.append(f"## Page {page_num}\n\n")
        for chunk_id, chunk in pages_dict[page_num]:
            chunk_type = chunk["type"]
            bbox = chunk["bbox"]
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"
            body_parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->\n")
            body_parts.append(f'<a id="{chunk_id}"></a>\n')
            # The chunk id and type need a separator like the other heading
            # fields; without it the heading read e.g. "c0001body_text".
            body_parts.append(f"### Chunk {chunk_id} · {chunk_type} · p{page_num} · bbox: {bbox_str}\n\n")

            content_en = chunk["content_en"]
            content_pt = chunk["content_pt_br"]
            if chunk_type == "table_marker":
                # For table/chronology, wrap in code block for readability
                body_parts.append(f"**EN:**\n\n```\n{content_en}\n```\n\n")
                body_parts.append(f"**PT-BR:**\n\n```\n{content_pt}\n```\n\n")
            else:
                body_parts.append(f"**EN:** {content_en}\n\n")
                body_parts.append(f"**PT-BR:** {content_pt}\n\n")
                if chunk_type == "image":
                    related_img = chunk.get("related_image")
                    if related_img and related_img != "null":
                        body_parts.append(f"![chunk image](./images/{related_img})\n\n")

            # Metadata details block
            meta = {
                "chunk_id": chunk_id,
                "type": chunk_type,
                "page": page_num,
                "order_in_page": chunk["order_in_page"],
                "order_global": chunk["order_global"],
                "bbox": chunk["bbox"],
                "classification": None,
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": "self_contained",
                "prev_chunk": chunk.get("prev_chunk"),
                "next_chunk": chunk.get("next_chunk"),
                "ocr_confidence": chunk.get("ocr_confidence", 0.88),
                "ufo_anomaly_detected": False,
                "cryptid_anomaly_detected": False,
            }
            meta_json = json.dumps(meta, ensure_ascii=False, indent=2)
            body_parts.append(f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n")

    doc_content = "".join(body_parts)
    encoded = doc_content.encode("utf-8")
    (OUT_DIR / "document.md").write_bytes(encoded)
    # Report the encoded size: the text contains non-ASCII characters, so
    # the character count is not the byte count.
    print(f"document.md written ({len(encoded):,} bytes)")
# Run the rebuild only when this file is executed as a script.
if __name__ == "__main__":
    main()