511 lines
18 KiB
Python
511 lines
18 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Rebuild DOW-UAP-D49 launch summary into harness-assemblable structure.
|
||
Processes all 78 PNG pages, writes chunks, _index.json, and document.md.
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import re
|
||
import base64
|
||
import datetime
|
||
import time
|
||
from pathlib import Path
|
||
from PIL import Image
|
||
import anthropic
|
||
|
||
# Identity of the document this script rebuilds.
DOC_ID = "dow-uap-d49-launch-summary-february-2000"
DOC_TITLE = "Vandenberg AFB Launch Summary 1958–2000"

# Input directories: page PNGs and the per-page OCR text files.
PNG_DIR = Path("/Users/guto/ufo/processing/png", DOC_ID)
OCR_DIR = Path("/Users/guto/ufo/processing/ocr", DOC_ID)

# Output directory and its sub-directories (created by main()).
OUT_DIR = Path("/Users/guto/ufo/raw", DOC_ID)
CHUNKS_DIR = OUT_DIR.joinpath("chunks")
IMAGES_DIR = OUT_DIR.joinpath("images")
TABLES_DIR = OUT_DIR.joinpath("tables")

# All PNG page file names in sorted order.  NOTE: this lists PNG_DIR at import
# time, so importing the module fails if that directory does not exist.
PNG_PAGES = sorted(name for name in os.listdir(PNG_DIR) if name.endswith(".png"))
TOTAL_PAGES = len(PNG_PAGES)

# API client, created at import time.
# NOTE(review): none of the functions in this file reference it - confirm it is still needed.
client = anthropic.Anthropic()
|
||
|
||
def read_ocr(page_stem: str) -> str:
    """Return the OCR text stored for *page_stem* (for example ``p-001``).

    The text is read from ``OCR_DIR/<page_stem>.txt`` as UTF-8, with
    undecodable bytes replaced.  An empty string is returned when that path
    does not exist.
    """
    candidate = OCR_DIR.joinpath(page_stem + ".txt")
    if not candidate.exists():
        return ""
    return candidate.read_text(encoding="utf-8", errors="replace")
|
||
|
||
def encode_image_b64(path: str) -> str:
    """Return the bytes of the file at *path* as a base64-encoded text string."""
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
||
|
||
# Column-header row of a chronology table ("seq ... date ... nickname" on one line).
_CHRONOLOGY_HEADER_RE = re.compile(r"\bseq\b.*\bdate\b.*\bnickname\b")

# Chronology data row: sequence number, day, month word, two-digit year.
_CHRONOLOGY_ROW_RE = re.compile(r"\d+\.\s+\d{1,2}\s+\w+\s+\d{2}")


def classify_page(ocr_text: str, page_num: int) -> str:
    """Classify a page heuristically from its OCR text.

    Args:
        ocr_text: Raw OCR text of the page (may be empty).
        page_num: 1-based page number; page 1 is always the cover.

    Returns:
        One of ``"cover"``, ``"toc"``, ``"distribution"``, ``"foreword"``,
        ``"glossary"``, ``"summary_table"``, ``"facility_guide"``,
        ``"chronology"`` or ``"body"``.
    """
    # All matching is done on lowercased text, so the patterns are lowercase
    # and need no case-insensitive flag.
    text = ocr_text.strip().lower()

    if page_num == 1:
        return "cover"

    # An explicit "table of contents" title is checked before the section
    # keywords: a contents page lists entries such as "Foreword", "Glossary"
    # and "Distribution List" and would otherwise be taken for one of them.
    if "table of contents" in text:
        return "toc"
    if "distribution list" in text:
        return "distribution"
    if "foreword" in text or "preface" in text:
        return "foreword"
    if "glossary" in text and len(text) < 2000:
        return "glossary"
    if "annual launch summary" in text and (
        "chart" in text or "launch vehicle" in text or "launch agency" in text
    ):
        return "summary_table"
    if "launch facility guide" in text:
        return "facility_guide"
    if _CHRONOLOGY_HEADER_RE.search(text) or _CHRONOLOGY_ROW_RE.search(text):
        return "chronology"

    # Weaker contents signal: the bare word "contents" on the first line.
    # Kept last so it cannot override the more specific checks above.
    if "contents" in text.split("\n", 1)[0]:
        return "toc"
    return "body"
|
||
|
||
def determine_chunk_type(content: str, page_type: str) -> str:
    """Map a page's content and page type to a chunk type.

    Args:
        content: Text of the chunk.
        page_type: Page classification, as produced by ``classify_page``.

    Returns:
        ``"letterhead"`` for cover pages, ``"table_marker"`` for summary-table
        and chronology pages, ``"section_header"`` for a short (at most three
        lines) all-uppercase block on any other unclassified page, and
        ``"body_text"`` for everything else.
    """
    if page_type == "cover":
        return "letterhead"
    if page_type in ("summary_table", "chronology"):
        return "table_marker"
    if page_type in ("glossary", "foreword", "distribution", "facility_guide", "toc"):
        return "body_text"

    # Remaining page types: a short all-caps block is treated as a heading.
    stripped = content.strip()
    if len(stripped.split("\n")) <= 3 and stripped.isupper():
        return "section_header"
    return "body_text"
|
||
|
||
# Lines made only of capitals, digits, whitespace and - . / : , count as heading lines.
_HEADING_CHARS_RE = re.compile(r'^[A-Z\s\-\./:,0-9]+$')

# Page types whose text is tabular data and is emitted as a table_marker chunk.
_TABLE_PAGE_TYPES = ("summary_table", "chronology")


def _new_chunk(chunk_type, page_type, page_stem, *, raw, en, pt_br, order,
               bbox, confidence, source_lines, formatting=None):
    """Build one chunk dict; ``formatting`` is only stored when provided."""
    chunk = {
        "type": chunk_type,
        "page_type": page_type,
        "content_raw": raw,
        "content_en": en,
        "content_pt_br": pt_br,
        "order_in_page": order,
        "bbox": bbox,
        "source_png": f"../../processing/png/{DOC_ID}/{page_stem}.png",
        "ocr_confidence": confidence,
        "ocr_source_lines": source_lines,
    }
    if formatting is not None:
        chunk["formatting"] = formatting
    return chunk


def _split_heading_and_body(lines):
    """Split page lines into a leading run of heading lines and the remaining body.

    A heading line is non-empty, shorter than 80 characters and either all
    uppercase or made only of the characters in ``_HEADING_CHARS_RE``.  The
    heading run ends at the first line that fails that test; if that line is
    blank it is dropped, otherwise it starts the body.
    """
    heading_lines = []
    body_lines = []
    in_heading = True
    for line in lines:
        if not in_heading:
            body_lines.append(line)
            continue
        stripped = line.strip()
        if not stripped and not heading_lines:
            # Blank line before any heading text: ignore.
            continue
        is_heading = bool(stripped) and len(stripped) < 80 and (
            stripped.isupper() or _HEADING_CHARS_RE.match(stripped) is not None
        )
        if is_heading:
            heading_lines.append(stripped)
        else:
            in_heading = False
            if stripped:
                body_lines.append(line)
    return heading_lines, body_lines


def build_page_chunks(page_num: int, page_stem: str, ocr_text: str) -> list:
    """Build the chunk dicts for a single page from its OCR text.

    Args:
        page_num: 1-based page number (passed to ``classify_page``).
        page_stem: PNG/OCR file stem of the page, used for ``source_png``.
        ocr_text: OCR text of the page; empty or whitespace-only text yields a
            single ``image`` chunk.

    Returns:
        A list of one or two chunk dicts.  Two chunks (a ``section_header``
        followed by the body) are produced only when the page starts with
        heading lines AND has body text after them.  The bbox and
        ``ocr_confidence`` values are fixed constants, not measurements.
    """
    page_type = classify_page(ocr_text, page_num)
    text = ocr_text.strip()

    if not text:
        # Page without OCR text.  Only the cover gets the cover label; other
        # OCR-less pages were previously mislabelled as the cover image too.
        if page_type == "cover":
            label_en = "[Cover image — Vandenberg AFB Launch Summary 1958–2000]"
            label_pt = "[Imagem da capa — Resumo de Lançamentos da Base Aérea Vandenberg 1958–2000]"
        else:
            label_en = "[Image-only page — no OCR text available]"
            label_pt = "[Página somente com imagem — sem texto de OCR disponível]"
        return [_new_chunk(
            "image", page_type, page_stem,
            raw="", en=label_en, pt_br=label_pt, order=1,
            bbox={"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            confidence=0.0, source_lines=[],
        )]

    lines = text.split("\n")
    all_line_numbers = list(range(1, len(lines) + 1))

    if page_type == "cover":
        # The cover is a single chunk made of its non-blank, stripped lines.
        content = "\n".join(line.strip() for line in lines if line.strip())
        return [_new_chunk(
            "letterhead", page_type, page_stem,
            raw=content, en=content,
            pt_br=translate_to_ptbr_simple(content, page_type), order=1,
            bbox={"x": 0.05, "y": 0.1, "w": 0.9, "h": 0.8},
            confidence=0.92, source_lines=all_line_numbers,
        )]

    heading_lines, body_lines = _split_heading_and_body(lines)
    is_table_page = page_type in _TABLE_PAGE_TYPES

    if heading_lines and body_lines:
        # Heading chunk followed by the body chunk.
        heading_content = "\n".join(heading_lines)
        body_content = "\n".join(body_lines)
        heading_count = len(heading_lines)
        return [
            _new_chunk(
                "section_header", page_type, page_stem,
                raw=heading_content, en=heading_content,
                pt_br=translate_to_ptbr_simple(heading_content, "section_header"),
                order=1,
                bbox={"x": 0.05, "y": 0.02, "w": 0.9, "h": 0.12},
                confidence=0.93,
                source_lines=list(range(1, heading_count + 1)),
                formatting=["bold", "all_caps"],
            ),
            _new_chunk(
                "table_marker" if is_table_page else "body_text",
                page_type, page_stem,
                raw=body_content, en=body_content,
                pt_br=translate_to_ptbr_simple(body_content, page_type),
                order=2,
                bbox={"x": 0.02, "y": 0.14, "w": 0.98, "h": 0.84},
                confidence=0.88,
                source_lines=list(range(heading_count + 1, len(lines) + 1)),
            ),
        ]

    # No heading/body split possible: the whole page becomes one chunk.
    content = "\n".join(lines)
    chunk_type = "table_marker" if is_table_page else determine_chunk_type(content, page_type)
    return [_new_chunk(
        chunk_type, page_type, page_stem,
        raw=content, en=content,
        pt_br=translate_to_ptbr_simple(content, page_type), order=1,
        bbox={"x": 0.02, "y": 0.02, "w": 0.96, "h": 0.96},
        confidence=0.88, source_lines=all_line_numbers,
    )]
|
||
|
||
|
||
# Contexts whose text is tabular data (numbers, codes, acronyms) and stays verbatim.
_PTBR_VERBATIM_CONTEXTS = ("summary_table", "chronology", "table_marker")

# Known English phrases and their PT-BR equivalents.
_PTBR_GLOSSARY = {
    "FOREWORD": "PREFÁCIO",
    "GLOSSARY": "GLOSSÁRIO",
    "DISTRIBUTION LIST": "LISTA DE DISTRIBUIÇÃO",
    "TABLE OF CONTENTS": "SUMÁRIO",
    "ANNUAL LAUNCH SUMMARY BY BOOSTER": "RESUMO ANUAL DE LANÇAMENTOS POR VEÍCULO",
    "ANNUAL LAUNCH SUMMARY BY COMMAND": "RESUMO ANUAL DE LANÇAMENTOS POR COMANDO",
    "LAUNCH FACILITY GUIDE": "GUIA DE INSTALAÇÕES DE LANÇAMENTO",
    "Office of History": "Escritório de História",
    "As of": "Em",
    "LAUNCH": "LANÇAMENTO",
    "VEHICLE": "VEÍCULO",
    "COMMAND": "COMANDO",
    "PROGRAM": "PROGRAMA",
    "SPACE": "ESPAÇO",
    "TOTAL": "TOTAL",
    "SUBTOTAL": "SUBTOTAL",
    "Grand Total": "Total Geral",
    "GRAND TOTAL": "TOTAL GERAL",
}

# One alternation over all phrases, longest first, so "GRAND TOTAL" is tried
# before "TOTAL" and "LAUNCH FACILITY GUIDE" before "LAUNCH".  Word boundaries
# keep "SPACE" from matching inside "AEROSPACE" or "LAUNCH" inside "LAUNCHES".
_PTBR_PHRASE_RE = re.compile(
    r"\b(?:"
    + "|".join(re.escape(phrase) for phrase in sorted(_PTBR_GLOSSARY, key=len, reverse=True))
    + r")\b"
)


def _annotate_with_ptbr(match):
    """Return the matched phrase followed by `` / <PT-BR>``, or unchanged if both are equal."""
    english = match.group(0)
    portuguese = _PTBR_GLOSSARY[english]
    if portuguese == english:
        return english
    return f"{english} / {portuguese}"


def translate_to_ptbr_simple(text: str, context: str) -> str:
    """Annotate known English headers/labels in *text* with their PT-BR equivalent.

    This is a glossary-based heuristic, not a real translation: each known
    phrase ``X`` becomes ``X / <PT-BR>`` and all other text is left as is.
    Matching is case-sensitive.

    The text is scanned in a single pass, so an already annotated phrase is
    never annotated a second time.  Sequential ``str.replace`` calls used to
    turn "GRAND TOTAL" into "GRAND TOTAL / TOTAL GERAL / TOTAL".

    Args:
        text: Text to annotate.
        context: Page or chunk type of the text.  For ``"summary_table"``,
            ``"chronology"`` and ``"table_marker"`` the text is returned
            unchanged because it is verbatim data.

    Returns:
        The annotated text.
    """
    if context in _PTBR_VERBATIM_CONTEXTS:
        return text
    return _PTBR_PHRASE_RE.sub(_annotate_with_ptbr, text)
|
||
|
||
|
||
def fmt_chunk_id(n: int) -> str:
    """Return the chunk id for sequence number *n*: ``c`` plus *n* zero-padded to 4 digits."""
    return "c{:04d}".format(n)
|
||
|
||
|
||
def _yaml_scalar(value) -> str:
    """Render *value* as an inline YAML scalar.

    ``None`` and the ``"null"`` sentinel become ``null``, booleans become
    ``true``/``false``, numbers are written bare, and anything else becomes a
    double-quoted string with backslashes and double quotes escaped.
    """
    if value is None or value == "null":
        return "null"
    if isinstance(value, bool):
        return str(value).lower()
    if isinstance(value, (int, float)):
        return str(value)
    escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def _yaml_chunk_ref(value) -> str:
    """Render a prev/next chunk pointer: the bare chunk id, or ``null`` when absent."""
    if value is None or value == "null":
        return "null"
    return str(value)


def _render_chunk_markdown(chunk_id: str, chunk: dict, page_num: int) -> str:
    """Return the full text of a chunk file: YAML front matter plus EN/PT-BR body."""
    bbox = chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0})
    formatting = chunk.get("formatting", [])
    fmt_str = "[" + ", ".join(f'"{item}"' for item in formatting) + "]"

    ocr_lines = chunk.get("ocr_source_lines", [])
    if len(ocr_lines) > 10:
        # Long line lists are abbreviated to first, second, "...", last.
        ocr_lines_str = f'[{ocr_lines[0]}, {ocr_lines[1]}, "...", {ocr_lines[-1]}]'
    else:
        ocr_lines_str = "[" + ", ".join(str(number) for number in ocr_lines) + "]"

    image_type = chunk.get("image_type", "null")
    # Any falsy image_type (including an empty string) is written as null.
    image_type_str = _yaml_scalar(image_type) if image_type else "null"

    rendered_lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk['type']}",
        f"page: {page_num}",
        f"order_in_page: {chunk['order_in_page']}",
        f"order_global: {chunk['order_global']}",
        f"bbox: {{x: {bbox['x']:.2f}, y: {bbox['y']:.2f}, w: {bbox['w']:.2f}, h: {bbox['h']:.2f}}}",
        "classification: null",
        f"formatting: {fmt_str}",
        "cross_page_hint: self_contained",
        f"prev_chunk: {_yaml_chunk_ref(chunk.get('prev_chunk', 'null'))}",
        f"next_chunk: {_yaml_chunk_ref(chunk.get('next_chunk', 'null'))}",
        f"related_image: {_yaml_scalar(chunk.get('related_image', 'null'))}",
        f"related_table: {_yaml_scalar(chunk.get('related_table', 'null'))}",
        f"ocr_confidence: {chunk.get('ocr_confidence', 0.88):.2f}",
        f"ocr_source_lines: {ocr_lines_str}",
        "redaction_code: null",
        "redaction_inferred_content_type: null",
        f"image_type: {image_type_str}",
        "ufo_anomaly_detected: false",
        "cryptid_anomaly_detected: false",
        "ufo_anomaly_type: null",
        "ufo_anomaly_rationale: null",
        "cryptid_anomaly_type: null",
        "cryptid_anomaly_rationale: null",
        "image_description_en: null",
        "image_description_pt_br: null",
        "extracted_text: null",
        f"source_png: {chunk['source_png']}",
        "---",
        "",
        f"**EN:** {chunk['content_en']}",
        "",
        f"**PT-BR:** {chunk['content_pt_br']}",
        "",  # makes the file end with a newline
    ]
    return "\n".join(rendered_lines)


def write_chunk_file(chunk_id: str, chunk: dict, page_num: int) -> None:
    """Write a single chunk to ``CHUNKS_DIR/<chunk_id>.md`` as UTF-8.

    Args:
        chunk_id: Chunk identifier; also the file stem.
        chunk: Chunk dict.  ``type``, ``order_in_page``, ``order_global``,
            ``source_png``, ``content_en`` and ``content_pt_br`` are required;
            the other keys read here fall back to defaults.
        page_num: Page number written to the front matter.

    Raises:
        KeyError: If a required key is missing from *chunk*.
    """
    path = CHUNKS_DIR / (chunk_id + ".md")
    path.write_text(_render_chunk_markdown(chunk_id, chunk, page_num), encoding="utf-8")
|
||
|
||
|
||
def main():
    """Run the full rebuild: chunk every page, then write all output files.

    Steps: create the output directories, build chunks for each PNG page from
    its OCR text, number the chunks globally and link them with prev/next
    pointers, write one file per chunk, write ``_index.json``, and finally
    write ``document.md``.  Progress and a summary line go to stdout.
    """
    start_time = time.time()

    # Ensure output dirs exist
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    print(f"Processing {TOTAL_PAGES} pages...")

    # Process all pages
    all_pages_chunks = []  # list of (page_num, page_stem, [chunks])
    for page_num, png_file in enumerate(PNG_PAGES, start=1):
        # Strip only the file suffix; str.replace would also remove a ".png"
        # occurring elsewhere in the name.
        page_stem = Path(png_file).stem
        ocr_text = read_ocr(page_stem)
        chunks = build_page_chunks(page_num, page_stem, ocr_text)
        all_pages_chunks.append((page_num, page_stem, chunks))
        print(f" Page {page_num:3d}/{TOTAL_PAGES} ({page_stem}): {len(chunks)} chunk(s)")

    # Globally number chunks
    all_chunks_flat = []  # list of (chunk_id, page_num, chunk_dict)
    for page_num, page_stem, chunks in all_pages_chunks:
        for chunk in chunks:
            global_order = len(all_chunks_flat) + 1
            chunk_id = fmt_chunk_id(global_order)
            chunk["chunk_id"] = chunk_id
            chunk["order_global"] = global_order
            chunk["page_num"] = page_num
            chunk["page_stem"] = page_stem
            all_chunks_flat.append((chunk_id, page_num, chunk))

    total_chunks = len(all_chunks_flat)
    print(f"Total chunks: {total_chunks}")

    # Set prev/next pointers; the string "null" marks a missing neighbour.
    for i, (_, _, chunk) in enumerate(all_chunks_flat):
        chunk["prev_chunk"] = all_chunks_flat[i - 1][0] if i > 0 else "null"
        chunk["next_chunk"] = all_chunks_flat[i + 1][0] if i < total_chunks - 1 else "null"

    # Write chunk files
    print("Writing chunk files...")
    for chunk_id, page_num, chunk in all_chunks_flat:
        write_chunk_file(chunk_id, chunk, page_num)

    # Build _index.json
    print("Writing _index.json...")
    index_chunks = [
        {
            "chunk_id": chunk_id,
            "type": chunk["type"],
            "page": page_num,
            "order_in_page": chunk["order_in_page"],
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk_id}.md",
            "bbox": chunk["bbox"],
            # First 80 characters of the English content, flattened to one line.
            "preview": chunk["content_en"][:80].replace("\n", " ").strip(),
        }
        for chunk_id, page_num, chunk in all_chunks_flat
    ]

    # UTC timestamp in the same "...Z" format as before, without the
    # deprecated datetime.utcnow().
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    build_at = now_utc.replace(tzinfo=None).isoformat() + "Z"

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # Count chunk types
    type_histogram = {}
    for _, _, chunk in all_chunks_flat:
        chunk_type = chunk["type"]
        type_histogram[chunk_type] = type_histogram.get(chunk_type, 0) + 1

    # Count image chunks
    n_images = type_histogram.get("image", 0)

    # Build document.md
    print("Writing document.md...")
    build_document_md(all_chunks_flat, all_pages_chunks, type_histogram, build_at, n_images)

    elapsed = int(time.time() - start_time)
    print(f"\nDone in {elapsed}s")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={n_images}, tables_stitched=0, ufo_anomalies=0, cryptid_anomalies=0, wall_seconds={elapsed}")
|
||
|
||
|
||
def build_document_md(all_chunks_flat, all_pages_chunks, type_histogram, build_at, n_images):
    """Build the master ``document.md`` and write it to ``OUT_DIR``.

    Args:
        all_chunks_flat: List of ``(chunk_id, page_num, chunk_dict)`` tuples in
            global order.
        all_pages_chunks: Not used by this function; kept for the existing
            call signature.
        type_histogram: Mapping of chunk type to count, written into the YAML
            header sorted by type name.
        build_at: Build timestamp string written into the YAML header.
        n_images: Not used by this function; kept for the existing call
            signature.
    """
    total_chunks = len(all_chunks_flat)
    histogram_yaml = "\n".join(f" {k}: {v}" for k, v in sorted(type_histogram.items()))

    header = "\n".join([
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {total_chunks}",
        "chunk_types_histogram:",
        histogram_yaml,
        "multi_page_tables: []",
        "ufo_anomalies_flagged: []",
        "cryptid_anomalies_flagged: []",
        'build_approach: "subagents"',
        "build_model: claude-sonnet-4-6",
        f"build_at: {build_at}",
        "---",
        "",
        "",  # header ends with a blank line
    ])

    # Group chunks by page
    pages_dict = {}
    for chunk_id, page_num, chunk in all_chunks_flat:
        pages_dict.setdefault(page_num, []).append((chunk_id, chunk))

    def _null_to_none(pointer):
        # main() stores the string "null" for a missing neighbour; emit a real
        # JSON null in the metadata block instead of the string "null".
        return None if pointer == "null" else pointer

    body_parts = [header]
    for page_num in sorted(pages_dict):
        body_parts.append(f"## Page {page_num}\n\n")

        for chunk_id, chunk in pages_dict[page_num]:
            bbox = chunk["bbox"]
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

            body_parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->\n")
            body_parts.append(f'<a id="{chunk_id}"></a>\n')
            body_parts.append(f"### Chunk {chunk_id} — {chunk['type']} · p{page_num} · bbox: {bbox_str}\n\n")

            content_en = chunk["content_en"]
            content_pt = chunk["content_pt_br"]

            if chunk["type"] == "table_marker":
                # For table/chronology, wrap in code block for readability
                body_parts.append(f"**EN:**\n\n```\n{content_en}\n```\n\n")
                body_parts.append(f"**PT-BR:**\n\n```\n{content_pt}\n```\n\n")
            else:
                body_parts.append(f"**EN:** {content_en}\n\n")
                body_parts.append(f"**PT-BR:** {content_pt}\n\n")
                if chunk["type"] == "image":
                    related_img = chunk.get("related_image")
                    if related_img and related_img != "null":
                        # NOTE(review): only blank lines are emitted for a
                        # related image; if an image link was intended here it
                        # is missing - confirm against the harness spec.
                        body_parts.append("\n\n")

            # Metadata details block
            meta = {
                "chunk_id": chunk_id,
                "type": chunk["type"],
                "page": page_num,
                "order_in_page": chunk["order_in_page"],
                "order_global": chunk["order_global"],
                "bbox": chunk["bbox"],
                "classification": None,
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": "self_contained",
                "prev_chunk": _null_to_none(chunk.get("prev_chunk")),
                "next_chunk": _null_to_none(chunk.get("next_chunk")),
                "ocr_confidence": chunk.get("ocr_confidence", 0.88),
                "ufo_anomaly_detected": False,
                "cryptid_anomaly_detected": False,
            }
            meta_json = json.dumps(meta, ensure_ascii=False, indent=2)
            body_parts.append(f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n")

    doc_content = "".join(body_parts)
    (OUT_DIR / "document.md").write_text(doc_content, encoding="utf-8")
    print(f"document.md written ({len(doc_content):,} bytes)")
|
||
|
||
|
||
# Script entry point: run the rebuild only when executed directly, not on import.
if __name__ == "__main__":
    main()
|