disclosure-bureau/scripts/rebuild_d49.py

511 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild DOW-UAP-D49 launch summary into harness-assemblable structure.
Processes all 78 PNG pages, writes chunks, _index.json, and document.md.
"""
import os
import sys
import json
import re
import base64
import datetime
import time
from pathlib import Path
from PIL import Image
import anthropic
# Identifier of the document being rebuilt; also the name of its per-document
# folder under each processing/output root below.
DOC_ID = "dow-uap-d49-launch-summary-february-2000"
# Human-readable title written into document.md. The year range uses an
# en dash; without a separator the two years run together as "19582000".
DOC_TITLE = "Vandenberg AFB Launch Summary 1958–2000"

# Inputs: rendered page images and the matching per-page OCR text files.
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID

# Outputs: document root plus its chunk, image and table sub-folders.
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

# All PNG pages, sorted by file name. The directory is listed at import time,
# so importing this module requires PNG_DIR to exist.
PNG_PAGES = sorted(f for f in os.listdir(PNG_DIR) if f.endswith(".png"))
TOTAL_PAGES = len(PNG_PAGES)

# Anthropic API client, constructed at import time.
# NOTE(review): no code visible in this section uses it - confirm it is needed.
client = anthropic.Anthropic()
def read_ocr(page_stem: str) -> str:
    """Return the OCR text stored for *page_stem* (for example ``p-001``).

    The text is read from ``<page_stem>.txt`` inside ``OCR_DIR`` as UTF-8,
    with undecodable bytes substituted instead of raising. When that file
    does not exist an empty string is returned.
    """
    candidate = OCR_DIR / (page_stem + ".txt")
    if not candidate.exists():
        return ""
    return candidate.read_text(encoding="utf-8", errors="replace")
def encode_image_b64(path: str) -> str:
    """Return the bytes of the file at *path* encoded as standard Base64 text."""
    with open(path, "rb") as handle:
        raw = handle.read()
    encoded = base64.b64encode(raw)
    return encoded.decode("utf-8")
def classify_page(ocr_text: str, page_num: int) -> str:
    """Guess the kind of page from its OCR text using keyword heuristics.

    The text is stripped and lower-cased, then tested against a fixed
    sequence of rules; the first rule that matches decides the result, so the
    order of the checks below matters.

    Args:
        ocr_text: OCR text of the page.
        page_num: Page number; page 1 is always reported as the cover.

    Returns:
        One of ``"cover"``, ``"distribution"``, ``"foreword"``,
        ``"glossary"``, ``"summary_table"``, ``"facility_guide"``,
        ``"chronology"``, ``"toc"`` or ``"body"``.
    """
    normalized = ocr_text.strip().lower()

    def mentions(*needles: str) -> bool:
        # True when at least one needle occurs anywhere in the page text.
        return any(needle in normalized for needle in needles)

    if page_num == 1:
        return "cover"
    if mentions("distribution list"):
        return "distribution"
    if mentions("foreword", "preface"):
        return "foreword"
    # Only short pages count as the glossary itself.
    if len(normalized) < 2000 and mentions("glossary"):
        return "glossary"
    if mentions("annual launch summary") and mentions(
        "chart", "launch vehicle", "launch agency"
    ):
        return "summary_table"
    if mentions("launch facility guide"):
        return "facility_guide"

    # Either a "seq ... date ... nickname" header on one line, or a numbered
    # entry of the form "<n>. <day> <word> <two digits>".
    has_header_row = re.search(
        r'\bseq\b.*\bdate\b.*\bnickname\b', normalized, re.IGNORECASE
    )
    if has_header_row or re.search(r'\d+\.\s+\d{1,2}\s+\w+\s+\d{2}', normalized):
        return "chronology"

    first_line = normalized.split("\n")[0]
    if mentions("table of contents") or "contents" in first_line:
        return "toc"
    return "body"
def determine_chunk_type(content: str, page_type: str) -> str:
    """Map a page's content and page type to a chunk type.

    Args:
        content: Text of the chunk; only inspected when *page_type* does not
            decide the result on its own.
        page_type: Page classification as produced by ``classify_page``.

    Returns:
        ``"letterhead"`` for cover pages, ``"table_marker"`` for summary
        tables and chronologies, ``"section_header"`` for all-uppercase
        content of at most three lines on otherwise unclassified pages, and
        ``"body_text"`` for everything else.
    """
    if page_type == "cover":
        return "letterhead"
    if page_type in ("summary_table", "chronology"):
        return "table_marker"
    if page_type in ("glossary", "foreword", "distribution", "facility_guide", "toc"):
        return "body_text"

    # Heading heuristic: a short block written entirely in capitals.
    stripped = content.strip()
    if stripped.isupper() and len(stripped.split("\n")) <= 3:
        return "section_header"
    return "body_text"
def build_page_chunks(page_num: int, page_stem: str, ocr_text: str) -> list:
    """Build the chunk dictionaries for one page from its OCR text.

    Three layouts are produced:

    * empty OCR text -> a single ``image`` chunk carrying a fixed cover-image
      placeholder caption;
    * a page classified as ``cover`` -> a single ``letterhead`` chunk holding
      every non-blank line;
    * any other page -> a ``section_header`` chunk followed by a body chunk
      when the page opens with heading-like lines and has body text after
      them, otherwise one chunk spanning the whole page.

    The ``bbox`` and ``ocr_confidence`` values are fixed constants chosen per
    layout; nothing in this function measures them from the page image.

    Args:
        page_num: Page number, forwarded to ``classify_page``.
        page_stem: File stem of the page (e.g. ``p-001``), used to build the
            ``source_png`` reference.
        ocr_text: Raw OCR text of the page; may be empty.

    Returns:
        A list of chunk dicts in reading order (one or two entries).
    """
    page_type = classify_page(ocr_text, page_num)
    source_png = f"../../processing/png/{DOC_ID}/{page_stem}.png"
    stripped_text = ocr_text.strip()

    if not stripped_text:
        # Image-only page (p-000).
        # NOTE(review): every page without OCR text receives this cover
        # caption, not only the first one - confirm that is intended.
        return [{
            "type": "image",
            "page_type": page_type,
            "content_raw": "",
            "content_en": "[Cover image — Vandenberg AFB Launch Summary 1958–2000]",
            "content_pt_br": "[Imagem da capa — Resumo de Lançamentos da Base Aérea Vandenberg 1958–2000]",
            "order_in_page": 1,
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "source_png": source_png,
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
        }]

    lines = stripped_text.split("\n")

    # The cover is emitted as one chunk made of all its non-blank lines.
    if page_type == "cover":
        content = "\n".join(line.strip() for line in lines if line.strip())
        return [{
            "type": "letterhead",
            "page_type": page_type,
            "content_raw": content,
            "content_en": content,
            "content_pt_br": translate_to_ptbr_simple(content, page_type),
            "order_in_page": 1,
            "bbox": {"x": 0.05, "y": 0.1, "w": 0.9, "h": 0.8},
            "source_png": source_png,
            "ocr_confidence": 0.92,
            "ocr_source_lines": list(range(1, len(lines) + 1)),
        }]

    # Split the page into a leading run of heading-like lines and the rest.
    # A heading line is non-empty, shorter than 80 characters, and either
    # all-uppercase or made only of capitals, digits, whitespace and "-./:,".
    heading_lines = []
    body_lines = []
    in_heading = True
    for line in lines:
        stripped = line.strip()
        if not in_heading:
            body_lines.append(line)
            continue
        # Skip blank lines that precede the first heading line.
        if not stripped and not heading_lines:
            continue
        if stripped and len(stripped) < 80 and (
            stripped.isupper() or re.match(r'^[A-Z\s\-\./:,0-9]+$', stripped)
        ):
            heading_lines.append(stripped)
        else:
            # First non-heading line ends the heading; a blank separator is
            # dropped, anything else starts the body.
            in_heading = False
            if stripped:
                body_lines.append(line)

    chunks = []
    if heading_lines and body_lines:
        # Distinct heading followed by body text: emit two chunks.
        heading_content = "\n".join(heading_lines)
        chunks.append({
            "type": "section_header",
            "page_type": page_type,
            "content_raw": heading_content,
            "content_en": heading_content,
            "content_pt_br": translate_to_ptbr_simple(heading_content, "section_header"),
            "order_in_page": 1,
            "bbox": {"x": 0.05, "y": 0.02, "w": 0.9, "h": 0.12},
            "source_png": source_png,
            "ocr_confidence": 0.93,
            "ocr_source_lines": list(range(1, len(heading_lines) + 1)),
            "formatting": ["bold", "all_caps"],
        })
        body_content = "\n".join(body_lines)
        body_type = "table_marker" if page_type in ("summary_table", "chronology") else "body_text"
        chunks.append({
            "type": body_type,
            "page_type": page_type,
            "content_raw": body_content,
            "content_en": body_content,
            "content_pt_br": translate_to_ptbr_simple(body_content, page_type),
            "order_in_page": 2,
            "bbox": {"x": 0.02, "y": 0.14, "w": 0.98, "h": 0.84},
            "source_png": source_png,
            "ocr_confidence": 0.88,
            "ocr_source_lines": list(range(len(heading_lines) + 1, len(lines) + 1)),
        })
    else:
        # Heading-only or body-only page: one chunk for the entire page,
        # preserving every line. determine_chunk_type already yields
        # "table_marker" for summary-table and chronology pages.
        content = "\n".join(lines)
        chunks.append({
            "type": determine_chunk_type(content, page_type),
            "page_type": page_type,
            "content_raw": content,
            "content_en": content,
            "content_pt_br": translate_to_ptbr_simple(content, page_type),
            "order_in_page": 1,
            "bbox": {"x": 0.02, "y": 0.02, "w": 0.96, "h": 0.96},
            "source_png": source_png,
            "ocr_confidence": 0.88,
            "ocr_source_lines": list(range(1, len(lines) + 1)),
        })
    return chunks
# Known English labels and their PT-BR glosses. Matching is case-sensitive.
# Entries whose gloss equals the English text (TOTAL, SUBTOTAL) are kept for
# reference but never annotated, since "TOTAL / TOTAL" adds nothing.
_PTBR_GLOSSARY = {
    "FOREWORD": "PREFÁCIO",
    "GLOSSARY": "GLOSSÁRIO",
    "DISTRIBUTION LIST": "LISTA DE DISTRIBUIÇÃO",
    "TABLE OF CONTENTS": "SUMÁRIO",
    "ANNUAL LAUNCH SUMMARY BY BOOSTER": "RESUMO ANUAL DE LANÇAMENTOS POR VEÍCULO",
    "ANNUAL LAUNCH SUMMARY BY COMMAND": "RESUMO ANUAL DE LANÇAMENTOS POR COMANDO",
    "LAUNCH FACILITY GUIDE": "GUIA DE INSTALAÇÕES DE LANÇAMENTO",
    "Office of History": "Escritório de História",
    "As of": "Em",
    "LAUNCH": "LANÇAMENTO",
    "VEHICLE": "VEÍCULO",
    "COMMAND": "COMANDO",
    "PROGRAM": "PROGRAMA",
    "SPACE": "ESPAÇO",
    "TOTAL": "TOTAL",
    "SUBTOTAL": "SUBTOTAL",
    "Grand Total": "Total Geral",
    "GRAND TOTAL": "TOTAL GERAL",
}

# One alternation over all phrases, longest first, bounded by word breaks:
# - longest-first lets "GRAND TOTAL" / "LAUNCH FACILITY GUIDE" win over the
#   shorter "TOTAL" / "LAUNCH" that they contain;
# - word boundaries stop matches inside longer words ("SPACECRAFT",
#   "LAUNCHES", "COMMANDER");
# - a single pass means inserted glosses are never re-scanned.
_PTBR_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(
        re.escape(phrase)
        for phrase in sorted(_PTBR_GLOSSARY, key=len, reverse=True)
        if _PTBR_GLOSSARY[phrase] != phrase
    )
    + r")\b"
)


def translate_to_ptbr_simple(text: str, context: str) -> str:
    """Annotate known English labels in *text* with their PT-BR gloss.

    This is not a translation: every whole-word occurrence of a phrase from
    the built-in glossary is rewritten as ``"<english> / <portuguese>"`` and
    all other text is returned untouched.

    Args:
        text: Text to annotate.
        context: Page or chunk type. For ``"summary_table"``,
            ``"chronology"`` and ``"table_marker"`` the text is returned
            verbatim so that tabular data (codes, dates, numbers) is not
            altered.

    Returns:
        The annotated text, or *text* unchanged for table contexts.
    """
    if context in ("summary_table", "chronology", "table_marker"):
        return text  # Data stays verbatim

    def annotate(match):
        phrase = match.group(0)
        return f"{phrase} / {_PTBR_GLOSSARY[phrase]}"

    return _PTBR_PATTERN.sub(annotate, text)
def fmt_chunk_id(n: int) -> str:
    """Return the chunk id for sequence number *n*: ``c`` plus *n* zero-padded to four digits."""
    return "c" + format(n, "04d")
def write_chunk_file(chunk_id: str, chunk: dict, page_num: int) -> None:
    """Write one chunk as ``<chunk_id>.md`` inside ``CHUNKS_DIR``.

    The file is a YAML-style front-matter block followed by the English and
    PT-BR content lines. Several front-matter fields (classification,
    redaction, anomaly and image-description fields) are always written with
    fixed ``null`` / ``false`` values.

    Args:
        chunk_id: Identifier used for the file name and the ``chunk_id`` field.
        chunk: Chunk dict. ``type``, ``order_in_page``, ``order_global``,
            ``source_png``, ``content_en`` and ``content_pt_br`` are required;
            the remaining keys read below fall back to defaults.
        page_num: Page number recorded in the ``page`` field.
    """
    def as_yaml_scalar(value):
        # None and the "null" placeholder both render as a bare null; booleans
        # and numbers render bare; anything else is wrapped in double quotes
        # (without escaping).
        if value is None or value == "null":
            return "null"
        if isinstance(value, bool):
            return str(value).lower()
        if isinstance(value, (int, float)):
            return str(value)
        return f'"{value}"'

    box = chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0})

    styles = chunk.get("formatting", [])
    styles_text = "[" + ", ".join(f'"{s}"' for s in styles) + "]" if styles else "[]"

    # Long line lists are abbreviated to their first two and last entries.
    source_lines = chunk.get("ocr_source_lines", [])
    if len(source_lines) > 10:
        source_lines_text = f'[{source_lines[0]}, {source_lines[1]}, "...", {source_lines[-1]}]'
    else:
        source_lines_text = "[" + ", ".join(map(str, source_lines)) + "]"

    # An empty or missing image type is written as null rather than "".
    image_type = chunk.get("image_type", "null")
    image_type_text = as_yaml_scalar(image_type) if image_type else "null"

    rendered = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk['type']}",
        f"page: {page_num}",
        f"order_in_page: {chunk['order_in_page']}",
        f"order_global: {chunk['order_global']}",
        f"bbox: {{x: {box['x']:.2f}, y: {box['y']:.2f}, w: {box['w']:.2f}, h: {box['h']:.2f}}}",
        "classification: null",
        f"formatting: {styles_text}",
        "cross_page_hint: self_contained",
        f"prev_chunk: {chunk.get('prev_chunk', 'null')}",
        f"next_chunk: {chunk.get('next_chunk', 'null')}",
        f"related_image: {as_yaml_scalar(chunk.get('related_image', 'null'))}",
        f"related_table: {as_yaml_scalar(chunk.get('related_table', 'null'))}",
        f"ocr_confidence: {chunk.get('ocr_confidence', 0.88):.2f}",
        f"ocr_source_lines: {source_lines_text}",
        "redaction_code: null",
        "redaction_inferred_content_type: null",
        f"image_type: {image_type_text}",
        "ufo_anomaly_detected: false",
        "cryptid_anomaly_detected: false",
        "ufo_anomaly_type: null",
        "ufo_anomaly_rationale: null",
        "cryptid_anomaly_type: null",
        "cryptid_anomaly_rationale: null",
        "image_description_en: null",
        "image_description_pt_br: null",
        "extracted_text: null",
        f"source_png: {chunk['source_png']}",
        "---",
        f"**EN:** {chunk['content_en']}",
        f"**PT-BR:** {chunk['content_pt_br']}",
        "",  # keeps the trailing newline at end of file
    ]
    target = CHUNKS_DIR / (chunk_id + ".md")
    target.write_text("\n".join(rendered), encoding="utf-8")
def main():
    """Run the full rebuild: chunk every page, then write all outputs.

    Steps:
      1. Create the chunk, image and table output folders.
      2. Build chunks for every entry of ``PNG_PAGES`` from its OCR text.
      3. Number the chunks globally and link each one to its neighbours.
      4. Write one ``.md`` file per chunk, then ``_index.json``.
      5. Write ``document.md`` and print a one-line run summary.
    """
    start_time = time.time()

    # Ensure output dirs exist
    for out_dir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        out_dir.mkdir(parents=True, exist_ok=True)

    print(f"Processing {TOTAL_PAGES} pages...")

    # Process all pages; page numbers are 1-based positions in PNG_PAGES.
    all_pages_chunks = []  # list of (page_num, page_stem, [chunks])
    for page_num, png_file in enumerate(PNG_PAGES, start=1):
        # Drop only the trailing ".png" (PNG_PAGES is filtered on that
        # suffix); str.replace would also remove it from inside the name.
        page_stem = png_file[: -len(".png")]
        ocr_text = read_ocr(page_stem)
        chunks = build_page_chunks(page_num, page_stem, ocr_text)
        all_pages_chunks.append((page_num, page_stem, chunks))
        print(f" Page {page_num:3d}/{TOTAL_PAGES} ({page_stem}): {len(chunks)} chunk(s)")

    # Globally number chunks
    all_chunks_flat = []  # list of (chunk_id, page_num, chunk_dict)
    for page_num, page_stem, chunks in all_pages_chunks:
        for chunk in chunks:
            global_order = len(all_chunks_flat) + 1
            chunk_id = fmt_chunk_id(global_order)
            chunk["chunk_id"] = chunk_id
            chunk["order_global"] = global_order
            chunk["page_num"] = page_num
            chunk["page_stem"] = page_stem
            all_chunks_flat.append((chunk_id, page_num, chunk))
    total_chunks = len(all_chunks_flat)
    print(f"Total chunks: {total_chunks}")

    # Set prev/next pointers; the string "null" marks either end of the chain.
    for i, (_, _, chunk) in enumerate(all_chunks_flat):
        chunk["prev_chunk"] = all_chunks_flat[i - 1][0] if i > 0 else "null"
        chunk["next_chunk"] = all_chunks_flat[i + 1][0] if i < total_chunks - 1 else "null"

    # Write chunk files
    print("Writing chunk files...")
    for chunk_id, page_num, chunk in all_chunks_flat:
        write_chunk_file(chunk_id, chunk, page_num)

    # Build _index.json
    print("Writing _index.json...")
    index_chunks = [
        {
            "chunk_id": chunk_id,
            "type": chunk["type"],
            "page": page_num,
            "order_in_page": chunk["order_in_page"],
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk_id}.md",
            "bbox": chunk["bbox"],
            # First 80 characters of the English text, flattened to one line.
            "preview": chunk["content_en"][:80].replace("\n", " ").strip(),
        }
        for chunk_id, page_num, chunk in all_chunks_flat
    ]

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop the tzinfo so the text keeps its "<naive ISO>Z" shape.
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    build_at = now_utc.replace(tzinfo=None).isoformat() + "Z"

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # Count chunk types
    type_histogram = {}
    for _, _, chunk in all_chunks_flat:
        chunk_type = chunk["type"]
        type_histogram[chunk_type] = type_histogram.get(chunk_type, 0) + 1

    # Count image chunks
    n_images = sum(1 for _, _, chunk in all_chunks_flat if chunk["type"] == "image")

    # Build document.md
    print("Writing document.md...")
    build_document_md(all_chunks_flat, all_pages_chunks, type_histogram, build_at, n_images)

    elapsed = int(time.time() - start_time)
    print(f"\nDone in {elapsed}s")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={n_images}, tables_stitched=0, ufo_anomalies=0, cryptid_anomalies=0, wall_seconds={elapsed}")
def build_document_md(all_chunks_flat, all_pages_chunks, type_histogram, build_at, n_images):
    """Build the master ``document.md`` and write it to ``OUT_DIR``.

    The file starts with a YAML-style header (document id, title, totals and
    the chunk-type histogram), followed by one ``## Page N`` section per page.
    Each chunk contributes an HTML comment pointing at its chunk file, an
    anchor, a heading, its EN / PT-BR content and a collapsible JSON metadata
    block.

    Args:
        all_chunks_flat: List of ``(chunk_id, page_num, chunk_dict)`` tuples
            in global order.
        all_pages_chunks: Accepted for call compatibility; not used here.
        type_histogram: Mapping of chunk type to count, written to the header
            sorted by type name.
        build_at: Build timestamp string written to the header.
        n_images: Accepted for call compatibility; not used here.
    """
    total_chunks = len(all_chunks_flat)
    histogram_yaml = "\n".join(f" {k}: {v}" for k, v in sorted(type_histogram.items()))
    header = f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {TOTAL_PAGES}
total_chunks: {total_chunks}
chunk_types_histogram:
{histogram_yaml}
multi_page_tables: []
ufo_anomalies_flagged: []
cryptid_anomalies_flagged: []
build_approach: "subagents"
build_model: claude-sonnet-4-6
build_at: {build_at}
---
"""

    # Group chunks by page, keeping their global order within each page.
    pages_dict = {}
    for chunk_id, page_num, chunk in all_chunks_flat:
        pages_dict.setdefault(page_num, []).append((chunk_id, chunk))

    body_parts = [header]
    for page_num in sorted(pages_dict):
        body_parts.append(f"## Page {page_num}\n\n")
        for chunk_id, chunk in pages_dict[page_num]:
            bbox = chunk["bbox"]
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"
            body_parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->\n")
            body_parts.append(f'<a id="{chunk_id}"></a>\n')
            # The dash keeps the id and the type apart; without it the heading
            # reads e.g. "Chunk c0001body_text".
            body_parts.append(f"### Chunk {chunk_id} – {chunk['type']} · p{page_num} · bbox: {bbox_str}\n\n")

            content_en = chunk["content_en"]
            content_pt = chunk["content_pt_br"]
            if chunk["type"] == "table_marker":
                # For table/chronology, wrap in code block for readability
                body_parts.append(f"**EN:**\n\n```\n{content_en}\n```\n\n")
                body_parts.append(f"**PT-BR:**\n\n```\n{content_pt}\n```\n\n")
            else:
                body_parts.append(f"**EN:** {content_en}\n\n")
                body_parts.append(f"**PT-BR:** {content_pt}\n\n")
                if chunk["type"] == "image":
                    # Embed the extracted image when the chunk references one.
                    related_img = chunk.get("related_image")
                    if related_img and related_img != "null":
                        body_parts.append(f"![chunk image](./images/{related_img})\n\n")

            # Metadata details block
            meta = {
                "chunk_id": chunk_id,
                "type": chunk["type"],
                "page": page_num,
                "order_in_page": chunk["order_in_page"],
                "order_global": chunk["order_global"],
                "bbox": chunk["bbox"],
                "classification": None,
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": "self_contained",
                "prev_chunk": chunk.get("prev_chunk"),
                "next_chunk": chunk.get("next_chunk"),
                "ocr_confidence": chunk.get("ocr_confidence", 0.88),
                "ufo_anomaly_detected": False,
                "cryptid_anomaly_detected": False,
            }
            meta_json = json.dumps(meta, ensure_ascii=False, indent=2)
            body_parts.append(f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n")

    doc_content = "".join(body_parts)
    (OUT_DIR / "document.md").write_text(doc_content, encoding="utf-8")
    print(f"document.md written ({len(doc_content):,} bytes)")
# Run the rebuild only when executed as a script, not when imported.
if __name__ == "__main__":
    main()