#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild DOW-UAP-D49 launch summary into harness-assemblable structure.

Processes all 78 PNG pages, writes chunks, _index.json, and document.md.
"""

import base64
import datetime
import json
import os
import re
import sys
import time
from collections import Counter, defaultdict
from pathlib import Path

import anthropic
from PIL import Image

DOC_ID = "dow-uap-d49-launch-summary-february-2000"
DOC_TITLE = "Vandenberg AFB Launch Summary 1958–2000"

PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

# All PNG pages sorted
PNG_PAGES = sorted(f for f in os.listdir(PNG_DIR) if f.endswith(".png"))
TOTAL_PAGES = len(PNG_PAGES)

client = anthropic.Anthropic()

# Page types whose content is kept verbatim and emitted as "table_marker".
_TABLE_PAGE_TYPES = ("summary_table", "chronology")

# Patterns used by classify_page() to recognise chronology pages.
_CHRONOLOGY_HEADER_RE = re.compile(
    r'\bseq\b.*\bdate\b.*\bnickname\b', re.IGNORECASE
)
_CHRONOLOGY_ROW_RE = re.compile(r'\d+\.\s+\d{1,2}\s+\w+\s+\d{2}')

# A line made only of capitals, digits, whitespace and basic punctuation.
_HEADING_RE = re.compile(r'^[A-Z\s\-\./:,0-9]+$')

# Known English phrases and their PT-BR renderings.
_PTBR_REPLACEMENTS = {
    "FOREWORD": "PREFÁCIO",
    "GLOSSARY": "GLOSSÁRIO",
    "DISTRIBUTION LIST": "LISTA DE DISTRIBUIÇÃO",
    "TABLE OF CONTENTS": "SUMÁRIO",
    "ANNUAL LAUNCH SUMMARY BY BOOSTER": "RESUMO ANUAL DE LANÇAMENTOS POR VEÍCULO",
    "ANNUAL LAUNCH SUMMARY BY COMMAND": "RESUMO ANUAL DE LANÇAMENTOS POR COMANDO",
    "LAUNCH FACILITY GUIDE": "GUIA DE INSTALAÇÕES DE LANÇAMENTO",
    "Office of History": "Escritório de História",
    "As of": "Em",
    "LAUNCH": "LANÇAMENTO",
    "VEHICLE": "VEÍCULO",
    "COMMAND": "COMANDO",
    "PROGRAM": "PROGRAMA",
    "SPACE": "ESPAÇO",
    "TOTAL": "TOTAL",
    "SUBTOTAL": "SUBTOTAL",
    "Grand Total": "Total Geral",
    "GRAND TOTAL": "TOTAL GERAL",
}

# One alternation, longest phrase first, so that a long phrase wins over any
# shorter phrase it contains; the look-arounds restrict matches to whole words.
_PTBR_PATTERN = re.compile(
    r"(?<!\w)(?:"
    + "|".join(
        re.escape(phrase)
        for phrase in sorted(_PTBR_REPLACEMENTS, key=len, reverse=True)
    )
    + r")(?!\w)"
)


def read_ocr(page_stem: str) -> str:
    """Read OCR text for a page stem like p-001.

    Returns the contents of ``OCR_DIR/<page_stem>.txt`` decoded as UTF-8
    (undecodable bytes are replaced), or an empty string when the file does
    not exist.
    """
    ocr_path = OCR_DIR / (page_stem + ".txt")
    if ocr_path.exists():
        return ocr_path.read_text(encoding="utf-8", errors="replace")
    return ""


def encode_image_b64(path: str) -> str:
    """Return the file at *path* as a standard base64 string."""
    with open(path, "rb") as f:
        return base64.standard_b64encode(f.read()).decode("utf-8")


def classify_page(ocr_text: str, page_num: int) -> str:
    """Heuristic page type classification.

    The checks run in a fixed order and the first match wins; a page that
    matches nothing is classified as ``"body"``.
    """
    text = ocr_text.strip().lower()
    if page_num == 1:
        return "cover"
    if "distribution list" in text:
        return "distribution"
    if "foreword" in text or "preface" in text:
        return "foreword"
    if "glossary" in text and len(text) < 2000:
        return "glossary"
    if "annual launch summary" in text and (
        "chart" in text or "launch vehicle" in text or "launch agency" in text
    ):
        return "summary_table"
    if "launch facility guide" in text:
        return "facility_guide"
    if _CHRONOLOGY_HEADER_RE.search(text) or _CHRONOLOGY_ROW_RE.search(text):
        return "chronology"
    if "table of contents" in text or "contents" in text.split("\n")[0]:
        return "toc"
    return "body"


def determine_chunk_type(content: str, page_type: str) -> str:
    """Map page content to chunk type.

    Cover pages become ``"letterhead"``, table-like pages become
    ``"table_marker"``, and the other recognised page types become
    ``"body_text"``. For any remaining page type, content of at most three
    lines that is entirely upper case is a ``"section_header"``; everything
    else is ``"body_text"``.
    """
    if page_type == "cover":
        return "letterhead"
    if page_type in _TABLE_PAGE_TYPES:
        return "table_marker"
    if page_type in ("glossary", "foreword", "distribution", "facility_guide", "toc"):
        return "body_text"

    # Check for headings
    stripped = content.strip()
    if len(stripped.split("\n")) <= 3 and stripped.isupper():
        return "section_header"
    return "body_text"


def _make_chunk(chunk_type, page_type, page_stem, content_raw, content_en,
                content_pt_br, order, bbox, confidence, source_lines,
                formatting=None):
    """Assemble one chunk dict; *bbox* is an ``(x, y, w, h)`` tuple."""
    x, y, w, h = bbox
    chunk = {
        "type": chunk_type,
        "page_type": page_type,
        "content_raw": content_raw,
        "content_en": content_en,
        "content_pt_br": content_pt_br,
        "order_in_page": order,
        "bbox": {"x": x, "y": y, "w": w, "h": h},
        "source_png": f"../../processing/png/{DOC_ID}/{page_stem}.png",
        "ocr_confidence": confidence,
        "ocr_source_lines": source_lines,
    }
    if formatting is not None:
        chunk["formatting"] = formatting
    return chunk


def _split_heading(lines):
    """Split *lines* into (heading_lines, body_lines).

    The heading is the leading run of non-empty lines shorter than 80
    characters that are upper case or match ``_HEADING_RE``; heading lines are
    stored stripped. The first line that breaks the run starts the body (it is
    dropped if blank), and every later line is kept as-is.
    """
    heading_lines = []
    body_lines = []
    in_heading = True
    for line in lines:
        stripped = line.strip()
        if not in_heading:
            body_lines.append(line)
            continue
        # Skip blank lines that precede any heading text.
        if not stripped and not heading_lines:
            continue
        if stripped and len(stripped) < 80 and (
            stripped.isupper() or _HEADING_RE.match(stripped)
        ):
            heading_lines.append(stripped)
            continue
        in_heading = False
        if stripped:
            body_lines.append(line)
    return heading_lines, body_lines


def build_page_chunks(page_num: int, page_stem: str, ocr_text: str) -> list:
    """Build chunks for a single page from OCR text.

    Returns a list of chunk dicts:

    * a single ``"image"`` chunk when the page has no OCR text;
    * a single ``"letterhead"`` chunk for the cover page;
    * a ``"section_header"`` chunk followed by a body chunk when the page
      splits into a heading and a body;
    * otherwise one chunk covering the whole page.
    """
    page_type = classify_page(ocr_text, page_num)

    if not ocr_text.strip():
        # Image-only page (p-000)
        return [_make_chunk(
            "image", page_type, page_stem,
            content_raw="",
            content_en="[Cover image — Vandenberg AFB Launch Summary 1958–2000]",
            content_pt_br="[Imagem da capa — Resumo de Lançamentos da Base Aérea Vandenberg 1958–2000]",
            order=1,
            bbox=(0.0, 0.0, 1.0, 1.0),
            confidence=0.0,
            source_lines=[],
        )]

    lines = ocr_text.strip().split("\n")

    # For cover, use all non-blank lines as a single chunk
    if page_type == "cover":
        content = "\n".join(line.strip() for line in lines if line.strip())
        return [_make_chunk(
            "letterhead", page_type, page_stem,
            content_raw=content,
            content_en=content,
            content_pt_br=translate_to_ptbr_simple(content, page_type),
            order=1,
            bbox=(0.05, 0.1, 0.9, 0.8),
            confidence=0.92,
            source_lines=list(range(1, len(lines) + 1)),
        )]

    # For this document, most pages are single logical blocks; the special
    # case is a page with a heading followed by a content body.
    heading_lines, body_lines = _split_heading(lines)

    if heading_lines and body_lines:
        heading_content = "\n".join(heading_lines)
        body_content = "\n".join(body_lines)
        body_type = "table_marker" if page_type in _TABLE_PAGE_TYPES else "body_text"
        return [
            _make_chunk(
                "section_header", page_type, page_stem,
                content_raw=heading_content,
                content_en=heading_content,
                content_pt_br=translate_to_ptbr_simple(heading_content, "section_header"),
                order=1,
                bbox=(0.05, 0.02, 0.9, 0.12),
                confidence=0.93,
                source_lines=list(range(1, len(heading_lines) + 1)),
                formatting=["bold", "all_caps"],
            ),
            _make_chunk(
                body_type, page_type, page_stem,
                content_raw=body_content,
                content_en=body_content,
                content_pt_br=translate_to_ptbr_simple(body_content, page_type),
                order=2,
                bbox=(0.02, 0.14, 0.98, 0.84),
                confidence=0.88,
                source_lines=list(range(len(heading_lines) + 1, len(lines) + 1)),
            ),
        ]

    # Single chunk for entire page, preserving all lines.
    # determine_chunk_type() already returns "table_marker" for table pages.
    content = "\n".join(lines)
    return [_make_chunk(
        determine_chunk_type(content, page_type), page_type, page_stem,
        content_raw=content,
        content_en=content,
        content_pt_br=translate_to_ptbr_simple(content, page_type),
        order=1,
        bbox=(0.02, 0.02, 0.96, 0.96),
        confidence=0.88,
        source_lines=list(range(1, len(lines) + 1)),
    )]


def translate_to_ptbr_simple(text: str, context: str) -> str:
    """Simple heuristic PT-BR translation for common document patterns.

    For verbatim data (tables, codes, dates, numbers) returns text unchanged.
    For known headers/labels the translation is appended as ``"EN / PT"``.

    All phrases are substituted in a single pass, longest phrase first and on
    whole words only, so text that was already annotated is never matched
    again (e.g. ``GRAND TOTAL`` is not additionally hit by ``TOTAL``) and
    phrases embedded in longer words are left alone. Phrases whose translation
    is identical to the English are left unannotated.
    """
    # For table/chronology data, return as-is (numeric data, codes, acronyms)
    if context in ("summary_table", "chronology", "table_marker"):
        return text

    def annotate(match):
        en = match.group(0)
        pt = _PTBR_REPLACEMENTS[en]
        return en if pt == en else f"{en} / {pt}"

    return _PTBR_PATTERN.sub(annotate, text)


def fmt_chunk_id(n: int) -> str:
    """Format a global chunk number as an id such as ``c0001``."""
    return f"c{n:04d}"


def _yaml_scalar(value):
    """Render *value* as an inline YAML scalar.

    ``None`` and the sentinel string ``"null"`` become ``null``; booleans are
    lower-cased; numbers are written bare; anything else is emitted as a
    double-quoted string with quotes and backslashes escaped.
    """
    if value is None or value == "null":
        return "null"
    if isinstance(value, bool):
        return str(value).lower()
    if isinstance(value, (int, float)):
        return str(value)
    return json.dumps(str(value), ensure_ascii=False)


def write_chunk_file(chunk_id: str, chunk: dict, page_num: int) -> None:
    """Write a single chunk .md file.

    The file is ``CHUNKS_DIR/<chunk_id>.md`` and consists of a YAML front
    matter block followed by the EN and PT-BR content. *chunk* must contain
    ``type``, ``order_in_page``, ``order_global``, ``source_png``,
    ``content_en`` and ``content_pt_br``; the other keys are optional.
    """
    path = CHUNKS_DIR / (chunk_id + ".md")

    bbox = chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0})
    formatting = chunk.get("formatting", [])
    fmt_str = "[" + ", ".join(f'"{name}"' for name in formatting) + "]"

    # Long line lists are abbreviated to first, second, "...", last.
    ocr_lines = chunk.get("ocr_source_lines", [])
    if len(ocr_lines) > 10:
        ocr_lines_str = f"[{ocr_lines[0]}, {ocr_lines[1]}, \"...\", {ocr_lines[-1]}]"
    else:
        ocr_lines_str = "[" + ", ".join(str(num) for num in ocr_lines) + "]"

    image_type = chunk.get("image_type", "null")

    front_matter = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk['type']}",
        f"page: {page_num}",
        f"order_in_page: {chunk['order_in_page']}",
        f"order_global: {chunk['order_global']}",
        f"bbox: {{x: {bbox['x']:.2f}, y: {bbox['y']:.2f}, w: {bbox['w']:.2f}, h: {bbox['h']:.2f}}}",
        "classification: null",
        f"formatting: {fmt_str}",
        "cross_page_hint: self_contained",
        f"prev_chunk: {chunk.get('prev_chunk', 'null')}",
        f"next_chunk: {chunk.get('next_chunk', 'null')}",
        f"related_image: {_yaml_scalar(chunk.get('related_image', 'null'))}",
        f"related_table: {_yaml_scalar(chunk.get('related_table', 'null'))}",
        f"ocr_confidence: {chunk.get('ocr_confidence', 0.88):.2f}",
        f"ocr_source_lines: {ocr_lines_str}",
        "redaction_code: null",
        "redaction_inferred_content_type: null",
        # Any falsy image_type (including "") is written as null.
        f"image_type: {_yaml_scalar(image_type) if image_type else 'null'}",
        "ufo_anomaly_detected: false",
        "cryptid_anomaly_detected: false",
        "ufo_anomaly_type: null",
        "ufo_anomaly_rationale: null",
        "cryptid_anomaly_type: null",
        "cryptid_anomaly_rationale: null",
        "image_description_en: null",
        "image_description_pt_br: null",
        "extracted_text: null",
        f"source_png: {chunk['source_png']}",
        "---",
    ]
    content = (
        "\n".join(front_matter)
        + f"\n\n**EN:** {chunk['content_en']}\n\n**PT-BR:** {chunk['content_pt_br']}\n"
    )
    path.write_text(content, encoding="utf-8")


def main():
    """Build all chunks from the OCR text and write every output artefact.

    Writes one file per chunk under ``CHUNKS_DIR``, then ``_index.json`` and
    ``document.md`` under ``OUT_DIR``, and prints a one-line run summary.
    """
    start_time = time.time()

    # Ensure output dirs exist
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    print(f"Processing {TOTAL_PAGES} pages...")

    # Process all pages
    all_pages_chunks = []  # list of (page_num, page_stem, [chunks])
    for page_num, png_file in enumerate(PNG_PAGES, start=1):
        # Drop only the final ".png" so the stem maps back to the same file.
        page_stem = Path(png_file).stem
        ocr_text = read_ocr(page_stem)
        chunks = build_page_chunks(page_num, page_stem, ocr_text)
        all_pages_chunks.append((page_num, page_stem, chunks))
        print(f" Page {page_num:3d}/{TOTAL_PAGES} ({page_stem}): {len(chunks)} chunk(s)")

    # Globally number chunks
    all_chunks_flat = []  # list of (chunk_id, page_num, chunk_dict)
    for page_num, page_stem, chunks in all_pages_chunks:
        for chunk in chunks:
            order_global = len(all_chunks_flat) + 1
            chunk_id = fmt_chunk_id(order_global)
            chunk["chunk_id"] = chunk_id
            chunk["order_global"] = order_global
            chunk["page_num"] = page_num
            chunk["page_stem"] = page_stem
            all_chunks_flat.append((chunk_id, page_num, chunk))

    total_chunks = len(all_chunks_flat)
    print(f"Total chunks: {total_chunks}")

    # Set prev/next pointers; the string "null" marks either end of the chain.
    for i, (_, _, chunk) in enumerate(all_chunks_flat):
        chunk["prev_chunk"] = all_chunks_flat[i - 1][0] if i > 0 else "null"
        chunk["next_chunk"] = (
            all_chunks_flat[i + 1][0] if i < total_chunks - 1 else "null"
        )

    # Write chunk files
    print("Writing chunk files...")
    for chunk_id, page_num, chunk in all_chunks_flat:
        write_chunk_file(chunk_id, chunk, page_num)

    # Build _index.json
    print("Writing _index.json...")
    index_chunks = [
        {
            "chunk_id": chunk_id,
            "type": chunk["type"],
            "page": page_num,
            "order_in_page": chunk["order_in_page"],
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk_id}.md",
            "bbox": chunk["bbox"],
            "preview": chunk["content_en"][:80].replace("\n", " ").strip(),
        }
        for chunk_id, page_num, chunk in all_chunks_flat
    ]

    # Aware "now" instead of the deprecated utcnow(); tzinfo is dropped so the
    # timestamp keeps its "<naive ISO>Z" form.
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    build_at = now_utc.replace(tzinfo=None).isoformat() + "Z"

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # Count chunk types
    type_histogram = dict(Counter(chunk["type"] for _, _, chunk in all_chunks_flat))

    # Count image chunks
    n_images = type_histogram.get("image", 0)

    # Build document.md
    print("Writing document.md...")
    build_document_md(all_chunks_flat, all_pages_chunks, type_histogram, build_at, n_images)

    elapsed = int(time.time() - start_time)
    print(f"\nDone in {elapsed}s")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={n_images}, tables_stitched=0, ufo_anomalies=0, cryptid_anomalies=0, wall_seconds={elapsed}")


def _null_to_none(value):
    """Map the ``"null"`` sentinel string to ``None`` so JSON emits null."""
    return None if value == "null" else value


def build_document_md(all_chunks_flat, all_pages_chunks, type_histogram, build_at, n_images):
    """Build the master document.md.

    Writes ``OUT_DIR/document.md``: a YAML header followed by one section per
    page, each listing its chunks (EN and PT-BR content plus a JSON metadata
    block). ``all_pages_chunks`` and ``n_images`` are accepted but not used
    here.
    """
    total_chunks = len(all_chunks_flat)

    histogram_yaml = "\n".join(
        f" {name}: {count}" for name, count in sorted(type_histogram.items())
    )

    header = "\n".join([
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {total_chunks}",
        "chunk_types_histogram:",
        histogram_yaml,
        "multi_page_tables: []",
        "ufo_anomalies_flagged: []",
        "cryptid_anomalies_flagged: []",
        'build_approach: "subagents"',
        "build_model: claude-sonnet-4-6",
        f"build_at: {build_at}",
        "---",
        "",
        "",
    ])

    # Group chunks by page
    pages_dict = defaultdict(list)
    for chunk_id, page_num, chunk in all_chunks_flat:
        pages_dict[page_num].append((chunk_id, chunk))

    body_parts = [header]

    for page_num in sorted(pages_dict):
        body_parts.append(f"## Page {page_num}\n\n")

        for chunk_id, chunk in pages_dict[page_num]:
            bbox = chunk["bbox"]
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

            # NOTE(review): these two appends emit bare newlines; they may
            # once have carried per-chunk markup (e.g. an anchor) — confirm.
            body_parts.append("\n")
            body_parts.append("\n")
            body_parts.append(f"### Chunk {chunk_id} — {chunk['type']} · p{page_num} · bbox: {bbox_str}\n\n")

            content_en = chunk["content_en"]
            content_pt = chunk["content_pt_br"]

            if chunk["type"] == "table_marker":
                # For table/chronology, wrap in code block for readability
                body_parts.append(f"**EN:**\n\n```\n{content_en}\n```\n\n")
                body_parts.append(f"**PT-BR:**\n\n```\n{content_pt}\n```\n\n")
            else:
                body_parts.append(f"**EN:** {content_en}\n\n")
                body_parts.append(f"**PT-BR:** {content_pt}\n\n")
                if chunk["type"] == "image":
                    related_img = chunk.get("related_image")
                    if related_img and related_img != "null":
                        body_parts.append(f"![chunk image](./images/{related_img})\n\n")

            # Metadata details block
            meta = {
                "chunk_id": chunk_id,
                "type": chunk["type"],
                "page": page_num,
                "order_in_page": chunk["order_in_page"],
                "order_global": chunk["order_global"],
                "bbox": chunk["bbox"],
                "classification": None,
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": "self_contained",
                "prev_chunk": _null_to_none(chunk.get("prev_chunk")),
                "next_chunk": _null_to_none(chunk.get("next_chunk")),
                "ocr_confidence": chunk.get("ocr_confidence", 0.88),
                "ufo_anomaly_detected": False,
                "cryptid_anomaly_detected": False,
            }
            meta_json = json.dumps(meta, ensure_ascii=False, indent=2)
            # NOTE(review): the <details>/<summary> wrapper is reconstructed
            # from the "details block" comment above — confirm the markup.
            body_parts.append(
                "<details><summary>metadata</summary>\n\n"
                f"```json\n{meta_json}\n```\n\n"
                "</details>\n\n---\n\n"
            )

    doc_content = "".join(body_parts)
    (OUT_DIR / "document.md").write_text(doc_content, encoding="utf-8")
    print(f"document.md written ({len(doc_content):,} bytes)")


if __name__ == "__main__":
    main()