disclosure-bureau/scripts/rebuild_d48_ocr_only.py

1373 lines
67 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuilds dow-uap-d48-report-september-1996 using OCR text only.
No API calls needed — all content from OCR + structural analysis.
"""
import os
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from PIL import Image
# Identifier of the document being rebuilt; it is also the per-document
# folder name appended to every base path below.
DOC_ID = "dow-uap-d48-report-september-1996"
# Human-readable title of the report.
DOC_TITLE = "Modeling Unlikely Space-Booster Failures in Risk Calculations"
# Input folders: page images and per-page OCR text (files named p-NNN.png / p-NNN.txt).
# NOTE(review): these are absolute, machine-specific paths; the script only
# runs unmodified on a host with this exact directory layout.
BASE_PNG = f"/Users/guto/ufo/processing/png/{DOC_ID}"
BASE_OCR = f"/Users/guto/ufo/processing/ocr/{DOC_ID}"
# Output root for this document and its sub-folders.
OUT_DIR = f"/Users/guto/ufo/raw/{DOC_ID}"
CHUNKS_DIR = f"{OUT_DIR}/chunks"
IMAGES_DIR = f"{OUT_DIR}/images"
TABLES_DIR = f"{OUT_DIR}/tables"
# Create the output folders up front; exist_ok makes re-runs harmless.
for _out_dir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    os.makedirs(_out_dir, exist_ok=True)
# Page numbers that have PNGs. The numbering is non-sequential: 0-63 followed
# by 100-181 (the list skips 64-99).
PNG_PAGES = [*range(0, 64), *range(100, 182)]
# Number of page images listed above.
TOTAL_PAGES = len(PNG_PAGES)
# English -> Brazilian Portuguese phrase table used by translate_simple().
# Order is significant: entries are applied one after another, so multi-word
# phrases ("Table of Contents") must stay ahead of the shorter words they
# contain ("Table", "Figure").
PT_TRANSLATIONS = {
    "Introduction": "Introdução",
    "Abstract": "Resumo",
    "Table of Contents": "Sumário",
    "Table of Figures": "Lista de Figuras",
    "Table of Tables": "Lista de Tabelas",
    "References": "Referências",
    "Summary": "Resumo Executivo",
    "Appendix": "Apêndice",
    "Final Report": "Relatório Final",
    "Prepared for": "Preparado para",
    "Prepared by": "Preparado por",
    "Department of the Air Force": "Departamento da Força Aérea",
    "Safety Office": "Escritório de Segurança",
    "Distribution": "Distribuição",
    "Figure": "Figura",
    "Table": "Tabela",
    "Page": "Página",
}


def translate_simple(text):
    """Return *text* with every known English phrase replaced by its PT-BR form.

    The substitution is a plain, case-sensitive substring replacement, applied
    phrase by phrase in the insertion order of ``PT_TRANSLATIONS``. Text that
    contains none of the phrases is returned unchanged.
    """
    translated = text
    for english in PT_TRANSLATIONS:
        translated = translated.replace(english, PT_TRANSLATIONS[english])
    return translated
def read_ocr(page_num):
    """Return the OCR text stored for *page_num*, or "" if there is no file.

    The file is looked up as ``p-NNN.txt`` (zero-padded to three digits) under
    ``BASE_OCR`` and decoded as UTF-8; undecodable bytes are replaced rather
    than raising.
    """
    ocr_file = f"{BASE_OCR}/p-{page_num:03d}.txt"
    if not os.path.exists(ocr_file):
        return ""
    with open(ocr_file, "r", encoding="utf-8", errors="replace") as handle:
        return handle.read()
def get_png_dimensions(page_num):
    """Return the ``(width, height)`` of the PNG for *page_num*.

    The image is looked up as ``p-NNN.png`` (zero-padded to three digits)
    under ``BASE_PNG``. When it cannot be opened or read, the fixed default
    ``(850, 1100)`` is returned instead of raising.
    """
    png_path = f"{BASE_PNG}/p-{page_num:03d}.png"
    try:
        with Image.open(png_path) as im:
            return im.size
    except Exception:
        # Best-effort lookup: any ordinary failure (missing file, unreadable
        # image, ...) falls back to the default size. Unlike a bare
        # ``except:``, this does not swallow KeyboardInterrupt or SystemExit,
        # so the script can still be interrupted while scanning pages.
        return (850, 1100)  # default
def detect_page_type(ocr_text, page_num):
    """Classify a page from its OCR text.

    The checks run in a fixed priority order and the first match wins, so a
    page that mentions both "Table of Contents" and "Table of Figures" is
    reported as "toc".

    Args:
        ocr_text: Raw OCR text of the page.
        page_num: Page number; only used for the cover test (page 1).

    Returns:
        One of "blank", "cover", "form", "toc", "toc_figures", "toc_tables",
        "abstract", "references", "appendix", "table_heavy" or "text".
    """
    stripped_text = ocr_text.strip()
    if not stripped_text:
        return "blank"

    upper = ocr_text.upper()
    # Non-empty, stripped lines. At least one exists because the text is not blank.
    lines = [l.strip() for l in ocr_text.split('\n') if l.strip()]
    first_line = lines[0]

    # Cover page detection.
    # NOTE(review): "RTI" is a plain substring test, so on page 1 it also
    # matches words such as "ARTICLE".
    if page_num == 1 and ("RESEARCH TRIANGLE INSTITUTE" in upper or "RTI" in upper):
        return "cover"
    # Report documentation page
    if "REPORT DOCUMENTATION PAGE" in upper or "OMB NO." in upper:
        return "form"
    # Table of contents. The case-insensitive check on the upper-cased text
    # already covers every spelling, so no separate lower-case test is needed.
    if "TABLE OF CONTENTS" in upper:
        return "toc"
    # Table of figures
    if "TABLE OF FIGURES" in upper:
        return "toc_figures"
    # Table of tables ("TABLE OF FIGURES" cannot be present here: it returned above)
    if "TABLE OF TABLES" in upper:
        return "toc_tables"
    # Abstract: the page text starts with the word (case-sensitive)
    if stripped_text.startswith("Abstract"):
        return "abstract"
    # References: the first non-empty line is exactly the heading
    if first_line in ("References", "REFERENCES"):
        return "references"
    # Appendix pages: some line starts with e.g. "Appendix A."
    if re.search(r'^Appendix\s+[A-Z]\.', ocr_text, re.MULTILINE):
        return "appendix"
    # Data table page: many pipe characters, or more than five lines that each
    # contain more than three runs of 3+ whitespace characters (aligned columns).
    if ocr_text.count('|') > 5 or sum(
        1 for l in lines if len(re.findall(r'\s{3,}', l)) > 3
    ) > 5:
        return "table_heavy"
    return "text"
def parse_page_chunks(page_num, ocr_text):
    """Split one page's OCR text into an ordered list of chunk dicts.

    Blank text yields a single "blank" chunk. Pages 1, 3, 4 and 5-9 are handed
    to the dedicated cover / report-documentation / abstract / TOC parsers.
    Every other page goes through a line-by-line pass that emits heading,
    caption and paragraph chunks, followed by a "footer" chunk when footer
    text was detected at the bottom of the page.

    Args:
        page_num: Page number; selects the special-case parsers.
        ocr_text: Raw OCR text of the page.

    Returns:
        A list of chunk dicts. Every chunk carries the same fixed key set
        ("order_in_page", "type", "content_en", "content_pt_br", "bbox", ...).
        Bounding boxes are estimated from line positions within the text, not
        measured on the page image.
    """
    chunks = []
    if not ocr_text.strip():
        # Blank page
        chunks.append({
            "order_in_page": 1,
            "type": "blank",
            "content_en": "[Blank page]",
            "content_pt_br": "[Página em branco]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 1.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None,
        })
        return chunks
    lines = ocr_text.split('\n')
    total_lines = len(lines)
    # Detect footer pattern (date + page number + "RTI"). The loop walks
    # upwards over at most the last three lines of the page.
    # NOTE(review): footer_lines is only ever written to, never read.
    footer_lines = []
    footer_start = total_lines
    for i in range(total_lines - 1, max(total_lines - 4, -1), -1):
        line = lines[i].strip()
        if re.match(r'^\d{1,3}$', line):  # page number only
            footer_lines.insert(0, i)
        elif re.match(r'^9/10/96', line) or "RTI" in line or re.match(r'^\d+$', line):
            footer_lines.insert(0, i)
            # NOTE(review): nesting reconstructed. As placed here, only
            # date / "RTI" / long-number lines move footer_start; a bare
            # 1-3 digit page number on its own does not. Confirm whether this
            # update was meant to apply to the first branch as well.
            if i < footer_start:
                footer_start = i
    # Collect footer text: every non-empty line from footer_start to the end,
    # joined with single spaces.
    footer_text = ""
    if footer_start < total_lines:
        footer_parts = [lines[i].strip() for i in range(footer_start, total_lines) if lines[i].strip()]
        if footer_parts:
            footer_text = " ".join(footer_parts)
    # Parse content lines (excluding footer)
    content_lines = lines[:footer_start]
    order = 1  # NOTE(review): unused; chunk_order is the counter actually used below
    # The special-case parsers receive the full line list (footer included),
    # not content_lines.
    # Handle cover page (page 1)
    if page_num == 1:
        return parse_cover_page(page_num, lines, footer_text)
    # Handle Report Documentation Page (page 3)
    if page_num == 3:
        return parse_report_doc_page(page_num, lines, footer_text)
    # Handle abstract page (page 4)
    if page_num == 4:
        return parse_abstract_page(page_num, lines, footer_text)
    # Handle TOC pages (pages 5, 6, 7, 8, 9)
    if page_num in [5, 6, 7, 8, 9]:
        return parse_toc_page(page_num, lines, footer_text)
    # General page parsing
    chunks = []
    order = 1
    # Check for page header (running header at top)
    # NOTE(review): this whole scan is a no-op. Its innermost branch is `pass`
    # and header_lines / content_start are never read afterwards.
    header_lines = []
    content_start = 0
    for i, line in enumerate(content_lines[:3]):
        stripped = line.strip()
        if stripped and i < 2:
            # Could be header
            if re.match(r'^[A-Z][a-z]', stripped) and len(stripped) < 60 and i == 0:
                if not stripped[0].isdigit() and "Introduction" not in stripped:
                    # Check if it looks like a running header
                    pass
    # Identify sections
    # NOTE(review): current_section, current_type and i are not used below.
    current_section = []
    current_type = "paragraph"
    i = 0
    # Try to identify the first heading
    # NOTE(review): first_content_line is computed but never read.
    first_content_line = None
    for line in content_lines:
        stripped = line.strip()
        if stripped and not re.match(r'^[-=\s]*$', stripped):
            first_content_line = stripped
            break
    # Check for section heading patterns
    # NOTE(review): this compiled pattern is never used; the loop below applies
    # its own inline regexes instead.
    section_heading_pattern = re.compile(
        r'^(\d+\.?\d*\.?\d*)\s+([A-Z][^\n]+)$|'  # numbered sections like "1. Introduction"
        r'^(Appendix\s+[A-Z]\.?\s*.+)$|'  # Appendix headers
        r'^([A-Z][A-Z\s]{4,})$'  # ALL CAPS headings
    )
    # Parse line by line, grouping into logical chunks
    current_block = []
    current_block_type = "paragraph"
    chunk_order = 1

    def flush_block(block_lines, block_type, y_frac_start, y_frac_end):
        """Build a chunk dict from the non-blank lines of a block.

        Returns None when the block holds no non-blank line. The returned dict
        has no "order_in_page" key; the caller assigns it before appending.
        y_frac_start / y_frac_end are fractions of the page's content lines
        and become the bbox "y" and "h" (height is at least 0.02).
        """
        if not any(l.strip() for l in block_lines):
            return None
        text = "\n".join(l.strip() for l in block_lines if l.strip())
        if not text:
            return None
        # Determine formatting
        formatting = []
        # "all_caps" only when every non-blank line is upper-case
        if all(l.isupper() for l in [l.strip() for l in block_lines if l.strip()]):
            formatting.append("all_caps")
        if block_type in ["heading", "subheading", "title"]:
            formatting.append("bold")
        pt_text = make_pt_translation(text, block_type)
        return {
            "type": block_type,
            "content_en": text,
            "content_pt_br": pt_text,
            # x and width are fixed; only the vertical extent is estimated
            "bbox": {"x": 0.05, "y": y_frac_start, "w": 0.9, "h": max(0.02, y_frac_end - y_frac_start)},
            "classification": None,
            "formatting": formatting,
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.88,  # fixed value, not measured per block
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None,
        }

    n_content = len(content_lines)
    prev_blank = False
    figure_caption_next = False  # NOTE(review): never read
    for line_idx, line in enumerate(content_lines):
        stripped = line.strip()
        # Vertical position of this line as a fraction of the content lines
        y_frac = line_idx / max(n_content, 1)
        # Detect section headings
        is_heading = False
        heading_type = None
        # Numbered section heading: "1. Introduction" or "6.1.2 Slow-Turn Failures"
        m = re.match(r'^(\d+\.?\d*\.?\d*)\s+([A-Z][^\n]{2,60})$', stripped)
        if m:
            # Counts every '.' in the line, including any inside the title text.
            depth = stripped.count('.')
            if depth == 0:
                heading_type = "heading"
            elif depth == 1:
                heading_type = "subheading"
            else:
                # Same result as depth == 1: deeper levels are not distinguished
                heading_type = "subheading"
            is_heading = True
        # Appendix heading (overrides a numbered-heading match on the same line)
        if re.match(r'^Appendix\s+[A-Z]\.', stripped):
            heading_type = "appendix_marker"
            is_heading = True
        # Standalone bold heading (centered, no period)
        # The trailing `if stripped else False` makes the whole condition False
        # for empty lines, so stripped[0] is never evaluated on an empty string.
        if not is_heading and stripped and len(stripped) < 60 and not stripped.endswith('.') and not stripped[0].isdigit() if stripped else False:
            if stripped in ["Introduction", "Abstract", "Summary", "References",
                            "Table of Contents", "Table of Figures", "Table of Tables"]:
                heading_type = "heading"
                is_heading = True
        # Figure caption detection: close the running block and start a
        # caption block, which is flushed at the next blank line.
        if re.match(r'^Figure\s+\d+\.', stripped):
            if current_block:
                chunk = flush_block(current_block, current_block_type,
                                    (line_idx - len(current_block)) / max(n_content, 1),
                                    y_frac)
                if chunk:
                    chunk["order_in_page"] = chunk_order
                    chunks.append(chunk)
                    chunk_order += 1
                current_block = []
            current_block = [line]
            current_block_type = "figure_caption"
            continue
        # Table caption detection ("Table N."); handled exactly like a figure caption
        if stripped.startswith("Table ") and re.match(r'^Table\s+\d+\.', stripped):
            if current_block:
                chunk = flush_block(current_block, current_block_type,
                                    (line_idx - len(current_block)) / max(n_content, 1),
                                    y_frac)
                if chunk:
                    chunk["order_in_page"] = chunk_order
                    chunks.append(chunk)
                    chunk_order += 1
                current_block = []
            current_block = [line]
            current_block_type = "figure_caption"  # table caption
            continue
        if is_heading:
            # Flush current block
            if current_block:
                chunk = flush_block(current_block, current_block_type,
                                    (line_idx - len(current_block)) / max(n_content, 1),
                                    y_frac)
                if chunk:
                    chunk["order_in_page"] = chunk_order
                    chunks.append(chunk)
                    chunk_order += 1
                current_block = []
            current_block = [line]
            current_block_type = heading_type
            # For headings, flush immediately (a heading is always a one-line
            # chunk with a nominal height of 0.04)
            chunk = flush_block(current_block, current_block_type, y_frac, y_frac + 0.04)
            if chunk:
                chunk["order_in_page"] = chunk_order
                chunks.append(chunk)
                chunk_order += 1
            current_block = []
            current_block_type = "paragraph"
            prev_blank = False
            continue
        # Blank line — paragraph boundary
        if not stripped:
            if current_block and any(l.strip() for l in current_block):
                # Could be end of paragraph or figure caption
                if current_block_type == "figure_caption" and len(current_block) > 0:
                    # A caption ends at the first blank line
                    chunk = flush_block(current_block, current_block_type,
                                        (line_idx - len(current_block)) / max(n_content, 1),
                                        y_frac)
                    if chunk:
                        chunk["order_in_page"] = chunk_order
                        chunks.append(chunk)
                        chunk_order += 1
                    current_block = []
                    current_block_type = "paragraph"
                elif current_block_type == "paragraph" and prev_blank:
                    # Double blank = strong paragraph break
                    chunk = flush_block(current_block, current_block_type,
                                        (line_idx - len(current_block)) / max(n_content, 1),
                                        y_frac)
                    if chunk:
                        chunk["order_in_page"] = chunk_order
                        chunks.append(chunk)
                        chunk_order += 1
                    current_block = []
                else:
                    # NOTE(review): nesting reconstructed. This `else` is read
                    # as belonging to the if/elif just above, i.e. a single
                    # blank line is kept inside the running paragraph (it is
                    # dropped from the text by flush_block but still counts
                    # towards the block length used for the bbox). Confirm.
                    current_block.append(line)
            prev_blank = True
            continue
        prev_blank = False
        current_block.append(line)
    # Flush remaining block
    if current_block:
        chunk = flush_block(current_block, current_block_type,
                            (n_content - len(current_block)) / max(n_content, 1),
                            1.0)
        if chunk:
            chunk["order_in_page"] = chunk_order
            chunks.append(chunk)
            chunk_order += 1
    # Add footer chunk if present (the Portuguese content is the footer text as-is)
    if footer_text:
        chunks.append({
            "order_in_page": chunk_order,
            "type": "footer",
            "content_en": footer_text,
            "content_pt_br": footer_text,
            "bbox": {"x": 0.05, "y": 0.93, "w": 0.9, "h": 0.05},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.95,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None,
        })
    if not chunks:
        # Fallback: single paragraph chunk for entire page (only reached when
        # neither a content chunk nor a footer chunk was produced)
        full_text = "\n".join(l.strip() for l in content_lines if l.strip())
        if full_text:
            chunks.append({
                "order_in_page": 1,
                "type": "paragraph",
                "content_en": full_text,
                "content_pt_br": make_pt_translation(full_text, "paragraph"),
                "bbox": {"x": 0.05, "y": 0.05, "w": 0.9, "h": 0.9},
                "classification": None,
                "formatting": [],
                "cross_page_hint": "self_contained",
                "ocr_confidence": 0.85,
                "ocr_source_lines": [],
                "redaction_code": None,
                "redaction_inferred_content_type": None,
                "image_type": None,
                "ufo_anomaly_detected": False,
                "ufo_anomaly_type": None,
                "ufo_anomaly_rationale": None,
                "cryptid_anomaly_detected": False,
                "cryptid_anomaly_type": None,
                "cryptid_anomaly_rationale": None,
                "image_description_en": None,
                "image_description_pt_br": None,
                "extracted_text": None,
            })
    return chunks
def make_pt_translation(text, chunk_type):
    """Produce the PT-BR rendering of *text* by phrase substitution.

    Known English phrases are replaced, one after another and case-sensitively,
    with their Portuguese counterparts. A "paragraph" longer than 200
    characters in which nothing was replaced is not returned verbatim:
    it becomes a marker followed by the first 100 characters of the English
    text and an ellipsis.

    Args:
        text: English chunk text.
        chunk_type: Chunk type; only "paragraph" triggers the marker fallback.

    Returns:
        The substituted text, or the marker string described above.
    """
    # Applied in this order; "Table of Contents" must precede "Table".
    phrase_pairs = (
        ("Introduction", "Introdução"),
        ("Abstract", "Resumo"),
        ("Table of Contents", "Sumário"),
        ("Final Report", "Relatório Final"),
        ("Prepared for", "Preparado para"),
        ("Prepared by", "Preparado por"),
        ("Department of the Air Force", "Departamento da Força Aérea"),
        ("Safety Office", "Escritório de Segurança"),
        ("Space Wing", "Asa Espacial"),
        ("References", "Referências"),
        ("Summary", "Resumo"),
        ("Appendix", "Apêndice"),
        ("Figure", "Figura"),
        ("Table", "Tabela"),
        ("Failure Response", "Modo de Falha"),
        ("failure probability", "probabilidade de falha"),
        ("launch vehicle", "veículo de lançamento"),
        ("flight line", "linha de voo"),
        ("shaping constants", "constantes de forma"),
        ("impact density", "densidade de impacto"),
        ("Modeling", "Modelagem"),
        ("Unlikely", "Improváveis"),
        ("Space-Booster", "Propulsores Espaciais"),
        ("Failures", "Falhas"),
        ("Risk Calculations", "Cálculos de Risco"),
        ("Research Triangle Institute", "Instituto de Triângulo de Pesquisa"),
        ("Blank page", "Página em branco"),
        ("booster failure probabilities", "probabilidades de falha de propulsor"),
        ("launch risk", "risco de lançamento"),
        ("unlikely failure modeling", "modelagem de falhas improváveis"),
    )
    translated = text
    for english, portuguese in phrase_pairs:
        translated = translated.replace(english, portuguese)

    nothing_replaced = translated == text
    if nothing_replaced and chunk_type == "paragraph" and len(text) > 200:
        # Long technical paragraph with no known phrase: flag it as English
        # content and keep only a 100-character preview.
        return f"[Conteúdo técnico em inglês] {text[:100]}..."
    return translated
def parse_cover_page(page_num, lines, footer_text):
    """Return the chunk list for the cover page (page 1).

    The chunks are fixed: their text, translations, bounding boxes,
    confidences and source-line indices are hard-coded for this document.
    ``page_num`` and ``lines`` are accepted but not consulted. ``footer_text``
    only decides whether the trailing address footer chunk is emitted (any
    non-empty value adds it).

    Returns:
        Eight chunks, or nine when ``footer_text`` is non-empty, numbered
        from 1 in page order.
    """
    def build(position, spec):
        # Expand one compact spec tuple into the full chunk dict.
        kind, english, portuguese, box, styles, confidence, source_lines = spec
        left, top, width, height = box
        return {
            "order_in_page": position,
            "type": kind,
            "content_en": english,
            "content_pt_br": portuguese,
            "bbox": {"x": left, "y": top, "w": width, "h": height},
            "classification": None,
            "formatting": list(styles),
            "cross_page_hint": "self_contained",
            "ocr_confidence": confidence,
            "ocr_source_lines": list(source_lines),
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None,
        }

    # (type, English, Portuguese, (x, y, w, h), formatting, confidence, OCR lines)
    specs = [
        # Letterhead
        ("letterhead",
         "RESEARCH TRIANGLE INSTITUTE",
         "INSTITUTO DE TRIÂNGULO DE PESQUISA (Research Triangle Institute)",
         (0.05, 0.02, 0.5, 0.07), ("bold", "all_caps"), 0.97, (6,)),
        # Contract/Report info block
        ("metadata_block",
         "Contract No. FO4703-91-C-0112\nRTI Report No. RTI/5180/77-43F\nSeptember 10, 1996",
         "Contrato Nº FO4703-91-C-0112\nRelatório RTI Nº RTI/5180/77-43F\n10 de setembro de 1996",
         (0.5, 0.08, 0.45, 0.08), (), 0.95, (9, 10, 11)),
        # Title
        ("title",
         "Modeling Unlikely Space-Booster Failures in Risk Calculations",
         "Modelagem de Falhas Improváveis de Propulsores Espaciais em Cálculos de Risco",
         (0.1, 0.2, 0.8, 0.12), ("bold",), 0.98, (13, 14)),
        # "Final Report"
        ("subtitle",
         "Final Report",
         "Relatório Final",
         (0.3, 0.34, 0.4, 0.04), (), 0.98, (15,)),
        # Sponsor block
        ("metadata_block",
         "Prepared for\n\nDepartment of the Air Force\n45th Space Wing (AFSPC)\nSafety Office - 45 SW/SE\nPatrick AFB, FL 32925\n\nand\n\nDepartment of the Air Force\n30th Space Wing (AFSPC)\nSafety Office - 30 SW/SE\nVandenberg AFB, CA 93437",
         "Preparado para\n\nDepartamento da Força Aérea dos EUA\n45ª Asa Espacial (AFSPC)\nEscritório de Segurança - 45 SW/SE\nPatrick AFB, FL 32925\n\ne\n\nDepartamento da Força Aérea dos EUA\n30ª Asa Espacial (AFSPC)\nEscritório de Segurança - 30 SW/SE\nVandenberg AFB, CA 93437",
         (0.3, 0.4, 0.65, 0.35), ("centered",), 0.95, range(18, 34)),
        # DTIC stamp / accession number
        ("metadata_block",
         "19961025 122",
         "19961025 122 [Número de acesso DTIC]",
         (0.0, 0.74, 0.25, 0.06), ("bold",), 0.92, (31,)),
        # Distribution statement
        ("paragraph",
         "Distribution authorized to US Government agencies and their contractors to protect administrative/operational use data, 10 September 96. Other requests for this document shall be referred to the 30th Space Wing (AFSPC) Safety Office (30 SW/SE), Vandenberg AFB, CA 93437, or 45th Space Wing (AFSPC) Safety Office (45 SW/SE), Patrick AFB, FL 32925.",
         "Distribuição autorizada a agências do governo dos EUA e seus contratados para proteger dados de uso administrativo/operacional, 10 de setembro de 1996. Outras solicitações para este documento deverão ser encaminhadas ao Escritório de Segurança da 30ª Asa Espacial (AFSPC) (30 SW/SE), Vandenberg AFB, CA 93437, ou ao Escritório de Segurança da 45ª Asa Espacial (AFSPC) (45 SW/SE), Patrick AFB, FL 32925.",
         (0.05, 0.8, 0.9, 0.08), (), 0.93, (34, 35, 36, 37)),
        # Quality inspection stamp
        ("metadata_block",
         "DTIC QUALITY INSPECTED",
         "DTIC INSPECIONADO DE QUALIDADE",
         (0.5, 0.88, 0.45, 0.04), ("all_caps",), 0.85, (39,)),
    ]
    # Footer address: emitted only when footer text was detected on the page.
    # The chunk content is the fixed address, not the detected text.
    if footer_text:
        specs.append(
            ("footer",
             "3000 N. Atlantic Avenue • Cocoa Beach, Florida 32931-5029 USA",
             "3000 N. Atlantic Avenue • Cocoa Beach, Flórida 32931-5029 EUA",
             (0.1, 0.94, 0.8, 0.04), ("centered",), 0.92, (43,))
        )
    return [build(position, spec) for position, spec in enumerate(specs, start=1)]
def parse_report_doc_page(page_num, lines, footer_text):
    """Return the chunk list for the Report Documentation Page (page 3).

    The nine chunks are fixed: form-field text, translations, bounding boxes,
    confidences and source-line indices are hard-coded for this document.
    ``page_num``, ``lines`` and ``footer_text`` are accepted for signature
    parity with the other page parsers but are not consulted.

    The Portuguese abstract covers the complete English abstract, including
    its last two sentences (trial-and-error fitting of the shaping constants
    and the launch-history appendix).

    Returns:
        A list of nine chunk dicts numbered 1-9: a heading, six form fields,
        the abstract, and a closing form field.
    """
    def build(position, spec):
        # Expand one compact spec tuple into the full chunk dict.
        kind, english, portuguese, box, styles, confidence, source_lines = spec
        left, top, width, height = box
        return {
            "order_in_page": position,
            "type": kind,
            "content_en": english,
            "content_pt_br": portuguese,
            "bbox": {"x": left, "y": top, "w": width, "h": height},
            "classification": None,
            "formatting": list(styles),
            "cross_page_hint": "self_contained",
            "ocr_confidence": confidence,
            "ocr_source_lines": list(source_lines),
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None,
        }

    # (type, English, Portuguese, (x, y, w, h), formatting, confidence, OCR lines)
    specs = [
        ("heading",
         "REPORT DOCUMENTATION PAGE",
         "PÁGINA DE DOCUMENTAÇÃO DO RELATÓRIO",
         (0.15, 0.02, 0.7, 0.05), ("bold", "all_caps", "centered"), 0.96, (2,)),
        ("form_field",
         "Report Date: September 10, 1996 | Report Type: Final | OMB No. 0704-0188",
         "Data do Relatório: 10 de setembro de 1996 | Tipo de Relatório: Final | OMB Nº 0704-0188",
         (0.05, 0.07, 0.9, 0.05), (), 0.93, (8, 9)),
        ("form_field",
         "Title: Modeling Unlikely Space-Booster Failures in Risk Calculations | Contract: FO4703-91-C-0112 | Task: 10/95-77",
         "Título: Modelagem de Falhas Improváveis de Propulsores Espaciais em Cálculos de Risco | Contrato: FO4703-91-C-0112 | Tarefa: 10/95-77",
         (0.05, 0.12, 0.9, 0.06), (), 0.92, (10, 11, 12)),
        ("form_field",
         "Authors: James A. Ward, Jr.; Robert M. Montgomery",
         "Autores: James A. Ward, Jr.; Robert M. Montgomery",
         (0.05, 0.18, 0.5, 0.04), (), 0.95, (14, 15)),
        ("form_field",
         "Performing Organizations: Research Triangle Institute (Subcontractor), 3000 N. Atlantic Avenue, Cocoa Beach, FL 32931; ACTA, Inc. (Prime Contractor), Skypark 3, 23430 Hawthorne Blvd., Suite 300, Torrance, CA 90505 | Report Number: RTI/5180/77-43F",
         "Organizações Executoras: Research Triangle Institute (Subcontratado), 3000 N. Atlantic Avenue, Cocoa Beach, FL 32931; ACTA, Inc. (Contratado Principal), Skypark 3, 23430 Hawthorne Blvd., Suite 300, Torrance, CA 90505 | Número do Relatório: RTI/5180/77-43F",
         (0.05, 0.22, 0.9, 0.1), (), 0.90, (17, 18, 19, 20, 21)),
        ("form_field",
         "Sponsoring/Monitoring Agencies: Department of the Air Force (AFSPC) - 30th Space Wing, Vandenberg AFB, CA 93437; 45th Space Wing, Patrick AFB, FL 32925. Monitors: Mr. Martin Kinna (30 SW/SEY); Louis J. Ullian, Jr. (45 SW/SED)",
         "Agências Patrocinadoras/Monitoras: Departamento da Força Aérea dos EUA (AFSPC) - 30ª Asa Espacial, Vandenberg AFB, CA 93437; 45ª Asa Espacial, Patrick AFB, FL 32925. Monitores: Sr. Martin Kinna (30 SW/SEY); Louis J. Ullian, Jr. (45 SW/SED)",
         (0.05, 0.32, 0.9, 0.1), (), 0.90, (22, 23, 24, 25, 26, 27, 28)),
        ("form_field",
         "Distribution/Availability Statement: Distribution authorized to US Government agencies and their contractors to protect administrative/operational use data; 10 September 96. Other requests shall be referred to the 30th Space Wing (AFSPC) Safety Office (30 SW/SE), Vandenberg AFB, CA 93437, or 45th Space Wing (AFSPC) Safety Office (45 SW/SE), Patrick AFB, FL 32925.",
         "Declaração de Distribuição/Disponibilidade: Distribuição autorizada a agências do governo dos EUA e seus contratados para proteger dados de uso administrativo/operacional; 10 de setembro de 1996. Outras solicitações deverão ser encaminhadas ao Escritório de Segurança da 30ª Asa Espacial (AFSPC) (30 SW/SE), Vandenberg AFB, CA 93437, ou ao Escritório de Segurança da 45ª Asa Espacial (AFSPC) (45 SW/SE), Patrick AFB, FL 32925.",
         (0.05, 0.42, 0.9, 0.1), (), 0.92, (32, 33, 34, 35, 36)),
        # Abstract. The Portuguese text previously stopped after the sentence
        # on the shaping constants; the final two sentences are now included
        # so both languages carry the same content.
        ("abstract",
         "Missile and space-vehicle performance histories contain many examples of failures that cause, or have the potential to cause, significant vehicle deviations from the intended flight line. In RTI's risk-analysis program, DAMP, such failures are referred to as Mode-5 failure responses. Although Mode-5 failure responses are much less likely to occur than those that result in impacts near the flight line, risk-analysis studies are incomplete without them. This report shows how impacts from Mode-5 failures are modeled in program DAMP. The impact density function used for this purpose contains two shaping constants that control the rate at which the density function drops in value as the angular deviation from the flight line and the impact range increase. Certain Mode-5 malfunctions are simulated, and the two shaping constants then chosen by trial and error so that impacts from the simulated malfunctions and the theoretical density function are in close agreement. An appendix to the report contains a listing and brief narrative failure history of the Atlas, Delta, and Titan missile and space-vehicle launches from the Eastern and Western Ranges from the beginning of each program through August 1996.",
         "Históricos de desempenho de mísseis e veículos espaciais contêm muitos exemplos de falhas que causam, ou têm o potencial de causar, desvios significativos do veículo em relação à linha de voo pretendida. No programa de análise de risco da RTI, DAMP, tais falhas são referidas como respostas de falha Modo-5. Embora as respostas de falha Modo-5 sejam muito menos prováveis de ocorrer do que aquelas que resultam em impactos próximos à linha de voo, os estudos de análise de risco são incompletos sem elas. Este relatório mostra como os impactos de falhas Modo-5 são modelados no programa DAMP. A função de densidade de impacto usada para esse fim contém duas constantes de forma que controlam a taxa na qual a função de densidade diminui à medida que o desvio angular da linha de voo e o alcance do impacto aumentam. Certas falhas Modo-5 são simuladas, e as duas constantes de forma são então escolhidas por tentativa e erro, de modo que os impactos das falhas simuladas e a função de densidade teórica fiquem em estreita concordância. Um apêndice do relatório contém uma listagem e um breve histórico narrativo de falhas dos lançamentos de mísseis e veículos espaciais Atlas, Delta e Titan a partir das Eastern e Western Ranges, desde o início de cada programa até agosto de 1996.",
         (0.05, 0.52, 0.9, 0.25), (), 0.91, range(37, 52)),
        ("form_field",
         "Subject Terms: launch risk, unlikely failure modeling, booster failure probabilities | Number of Pages: 180 | Security Classification: Unclassified | Limitation of Abstract: SAR",
         "Termos do Assunto: risco de lançamento, modelagem de falhas improváveis, probabilidades de falha de propulsor | Número de Páginas: 180 | Classificação de Segurança: Não Classificado | Limitação do Resumo: SAR",
         (0.05, 0.78, 0.9, 0.12), (), 0.89, range(51, 60)),
    ]
    return [build(position, spec) for position, spec in enumerate(specs, start=1)]
def _abstract_page_chunk(order_in_page, chunk_type, content_en, content_pt_br,
                         bbox, formatting, ocr_confidence, ocr_source_lines):
    """Build one abstract-page chunk dict; all redaction/anomaly/image fields are empty."""
    # Key order is kept stable because chunk dicts are later dumped as JSON.
    return {
        "order_in_page": order_in_page,
        "type": chunk_type,
        "content_en": content_en,
        "content_pt_br": content_pt_br,
        "bbox": bbox,
        "classification": None, "formatting": formatting,
        "cross_page_hint": "self_contained", "ocr_confidence": ocr_confidence,
        "ocr_source_lines": ocr_source_lines, "redaction_code": None,
        "redaction_inferred_content_type": None, "image_type": None,
        "ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
        "image_description_en": None, "image_description_pt_br": None, "extracted_text": None,
    }


def parse_abstract_page(page_num, lines, footer_text):
    """Return the three chunks (heading, abstract body, footer) of the abstract page.

    The page content is hard-coded rather than derived from the OCR input, so
    the arguments are accepted only to match the signature of the other page
    parsers and are not read.

    Args:
        page_num: Page number (unused).
        lines: OCR lines of the page (unused).
        footer_text: Detected footer text (unused; the footer is fixed).

    Returns:
        A list of three chunk dicts ordered as heading, abstract, footer.
    """
    abstract_text_en = ("Missile and space-vehicle performance histories contain many examples of failures that "
                        "cause, or have the potential to cause, significant vehicle deviations from the intended "
                        "flight line. In RTI's risk-analysis program, DAMP, such failures are referred to as "
                        "Mode-5 failure responses. Although Mode-5 failure responses are much less likely to "
                        "occur than those that result in impacts near the flight line, risk-analysis studies are "
                        "incomplete without them. This report shows how impacts from Mode-5 failures are "
                        "modeled in program DAMP. The impact density function used for this purpose "
                        "contains two shaping constants that control the rate at which the density function drops "
                        "in value as the angular deviation from the flight line and the impact range increase. "
                        "Certain Mode-5 malfunctions are simulated, and the two shaping constants then chosen "
                        "by trial and error so that impacts from the simulated malfunctions and the theoretical "
                        "density function are in close agreement.\n\n"
                        "An appendix to the report contains a listing and brief narrative failure history of the "
                        "Atlas, Delta, and Titan missile and space-vehicle launches from the Eastern and "
                        "Western Ranges from the beginning of each program through August 1996. Each entry "
                        "gives the vehicle configuration, whether the flight was a success, the flight phase in "
                        "which any anomalous behavior occurred, and a classification of vehicle behavior in "
                        "accordance with defined failure-response modes. Various filtering or data weighting "
                        "techniques are described. The empirical data are then filtered to estimate (1) failure "
                        "probabilities for Atlas, Delta, and Titan, and (2) percentages of future failures that will "
                        "result in Mode-5 (and other Mode) responses.")
    abstract_text_pt = ("Históricos de desempenho de mísseis e veículos espaciais contêm muitos exemplos de falhas que "
                        "causam, ou têm o potencial de causar, desvios significativos do veículo em relação à linha de voo pretendida. "
                        "No programa de análise de risco da RTI, DAMP, tais falhas são referidas como respostas de falha Modo-5. "
                        "Embora as respostas de falha Modo-5 sejam muito menos prováveis de ocorrer do que aquelas que resultam em "
                        "impactos próximos à linha de voo, os estudos de análise de risco são incompletos sem elas. Este relatório "
                        "mostra como os impactos de falhas Modo-5 são modelados no programa DAMP. A função de densidade de impacto "
                        "usada para esse fim contém duas constantes de forma que controlam a taxa na qual a função de densidade "
                        "diminui em valor à medida que o desvio angular da linha de voo e o alcance do impacto aumentam. "
                        "Certas falhas Modo-5 são simuladas, e as duas constantes de forma são então escolhidas por tentativa e "
                        "erro de modo que os impactos das falhas simuladas e a função de densidade teórica estejam em estreita concordância.\n\n"
                        "Um apêndice do relatório contém um levantamento e breve histórico narrativo de falhas dos lançamentos de "
                        "mísseis e veículos espaciais Atlas, Delta e Titan das Faixas Leste e Oeste desde o início de cada programa "
                        "até agosto de 1996. Cada entrada fornece a configuração do veículo, se o voo foi bem-sucedido, a fase de "
                        "voo em que ocorreu qualquer comportamento anômalo e uma classificação do comportamento do veículo de "
                        "acordo com os modos de resposta a falhas definidos. "
                        # The two closing sentences below were missing from the
                        # translation although present in the English text.
                        "Várias técnicas de filtragem ou ponderação de dados são descritas. "
                        "Os dados empíricos são então filtrados para estimar (1) as probabilidades de falha para "
                        "Atlas, Delta e Titan, e (2) as porcentagens de falhas futuras que resultarão em respostas "
                        "Modo-5 (e de outros Modos).")

    return [
        _abstract_page_chunk(
            1, "heading", "Abstract", "Resumo",
            {"x": 0.3, "y": 0.03, "w": 0.4, "h": 0.05},
            ["bold", "centered"], 0.98, [1],
        ),
        _abstract_page_chunk(
            2, "abstract", abstract_text_en, abstract_text_pt,
            {"x": 0.05, "y": 0.1, "w": 0.9, "h": 0.75},
            [], 0.94, list(range(2, 27)),
        ),
        # The footer is language-neutral, so both content fields carry the same text.
        _abstract_page_chunk(
            3, "footer", "9/10/96 i RTI", "9/10/96 i RTI",
            {"x": 0.05, "y": 0.93, "w": 0.9, "h": 0.05},
            [], 0.97, [27],
        ),
    ]
# Recognized page titles for TOC-style pages, in match priority order.
_TOC_HEADINGS = (
    ("Table of Contents", "Sumário"),
    ("Table of Figures", "Lista de Figuras"),
    ("Table of Tables", "Lista de Tabelas"),
)

# English -> Portuguese phrases used to localize TOC entry text.
_TOC_GLOSSARY = (
    ("Introduction", "Introdução"), ("Abstract", "Resumo"),
    ("Examples Showing Need for Mode", "Exemplos Mostrando a Necessidade do Modo"),
    ("Understanding", "Entendendo"), ("Methodology", "Metodologia"),
    ("Assessing Failure Probabilities", "Avaliação de Probabilidades de Falha"),
    ("Computation", "Cálculo"), ("Shaping Constants Through Simulation", "Constantes de Forma por Simulação"),
    ("Potential Future Investigations", "Investigações Futuras Potenciais"),
    ("Summary", "Resumo"), ("Appendix", "Apêndice"), ("References", "Referências"),
    ("Figure", "Figura"), ("Table", "Tabela"),
    ("Launch and Performance History", "Histórico de Lançamento e Desempenho"),
    ("Failure Narratives", "Narrativas de Falhas"), ("Basic Data", "Dados Básicos"),
    ("Filter Characteristics", "Características do Filtro"),
    ("Shaping-Constant Effects", "Efeitos das Constantes de Forma"),
    ("Failure Response Modes", "Modos de Resposta a Falhas"),
    ("Malfunction Turn Simulations", "Simulações de Desvio por Mau Funcionamento"),
    ("Effects of Mode-5 Shaping Constant", "Efeitos da Constante de Forma Modo-5"),
    ("Relative Probability of Tumble", "Probabilidade Relativa de Rotação"),
    ("Overall Failure Probability", "Probabilidade Geral de Falha"),
    ("Relative and Absolute Probabilities", "Probabilidades Relativas e Absolutas"),
    ("Random-Attitude Failures", "Falhas de Atitude Aleatória"),
    ("Slow-Turn Failures", "Falhas de Giro Lento"),
    ("Factors Affecting Malfunction-Turn Results", "Fatores que Afetam os Resultados de Desvio"),
    ("Malfunction-Turn Results for Atlas IIAS", "Resultados de Desvio para Atlas IIAS"),
    ("Shaping Constants for Atlas IIAS", "Constantes de Forma para Atlas IIAS"),
    ("Optimum Mode-5 Shaping Constants", "Constantes de Forma Modo-5 Ótimas"),
    ("Launch-Area Mode-5 Risks", "Riscos Modo-5 na Área de Lançamento"),
    ("Effects of Mode-5 Constants on Ship-Hit Contours", "Efeitos das Constantes Modo-5 nos Contornos de Acerto de Nave"),
    ("Range Distributions", "Distribuições de Alcance"),
    ("Shaping Constants for Delta-GEM", "Constantes de Forma para Delta-GEM"),
    ("Shaping Constants for Titan IV", "Constantes de Forma para Titan IV"),
    ("Shaping Constants for LLV1", "Constantes de Forma para LLV1"),
    ("Shaping Constants for Other Launch Vehicles", "Constantes de Forma para Outros Veículos de Lançamento"),
    ("Parts-Analysis Approach", "Abordagem de Análise de Componentes"),
    ("Empirical Approach", "Abordagem Empírica"),
    ("Response Mode", "Modo de Resposta"),
    ("Data Sources", "Fontes de Dados"),
    ("Assignment of Failure-Response Modes", "Atribuição de Modos de Resposta a Falhas"),
    ("Assignment of Flight Phase", "Atribuição de Fase de Voo"),
    ("Representative Configurations", "Configurações Representativas"),
    ("Thor", "Thor"), ("Delta", "Delta"), ("Atlas", "Atlas"), ("Titan", "Titan"),
)
_TOC_GLOSSARY_MAP = dict(_TOC_GLOSSARY)

# One alternation, longest phrase first. Applying the glossary as sequential
# str.replace calls let a short phrase ("Response Mode") rewrite part of a
# longer one ("Assignment of Failure-Response Modes") before the longer entry
# was tried, so the longer translation could never apply. A single pass also
# guarantees already-translated text is never re-translated.
_TOC_GLOSSARY_RE = re.compile(
    "|".join(
        re.escape(phrase)
        for phrase in sorted(_TOC_GLOSSARY_MAP, key=len, reverse=True)
    )
)

# A TOC entry ends in dot leaders plus a page number, or whitespace plus digits.
_TOC_ENTRY_RE = re.compile(r'\.{2,}\s*\d+|\s+\d+$')


def _toc_page_chunk(order_in_page, chunk_type, content_en, content_pt_br,
                    bbox, formatting, ocr_confidence, ocr_source_lines):
    """Build one TOC-page chunk dict; all redaction/anomaly/image fields are empty."""
    # Key order is kept stable because chunk dicts are later dumped as JSON.
    return {
        "order_in_page": order_in_page,
        "type": chunk_type,
        "content_en": content_en,
        "content_pt_br": content_pt_br,
        "bbox": bbox,
        "classification": None, "formatting": formatting,
        "cross_page_hint": "self_contained", "ocr_confidence": ocr_confidence,
        "ocr_source_lines": ocr_source_lines,
        "redaction_code": None, "redaction_inferred_content_type": None, "image_type": None,
        "ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
        "image_description_en": None, "image_description_pt_br": None, "extracted_text": None,
    }


def parse_toc_page(page_num, lines, footer_text):
    """Parse a table-of-contents, table-of-figures or table-of-tables page.

    Emits up to three chunks, in order: a heading (when one of the known
    titles appears in the first five lines), one ``toc_entry`` chunk holding
    every detected entry joined by newlines, and a footer (when
    ``footer_text`` is non-empty).

    Args:
        page_num: Page number (unused).
        lines: OCR lines of the page. The first line is never treated as an
            entry.
        footer_text: Footer text to emit verbatim, or a falsy value for none.

    Returns:
        List of chunk dicts with consecutive ``order_in_page`` starting at 1.
    """
    chunks = []
    order = 1

    # Heading: first of the leading five lines that contains a known title.
    for raw in lines[:5]:
        text = raw.strip()
        title = next(((en, pt) for en, pt in _TOC_HEADINGS if en in text), None)
        if title is None:
            continue
        chunks.append(_toc_page_chunk(
            order, "heading", title[0], title[1],
            {"x": 0.3, "y": 0.02, "w": 0.4, "h": 0.05},
            ["bold", "centered"], 0.98, [1],
        ))
        order += 1
        break

    # Entries: 1-based line numbers, starting at line 2.
    toc_entries = []
    for line_no, raw in enumerate(lines[1:], start=2):
        text = raw.strip()
        if text and _TOC_ENTRY_RE.search(text):
            toc_entries.append((line_no, text))

    if toc_entries:
        # All entries are grouped into a single block.
        entry_text = "\n".join(text for _, text in toc_entries)
        pt_text = _TOC_GLOSSARY_RE.sub(
            lambda m: _TOC_GLOSSARY_MAP[m.group(0)], entry_text
        )
        # Rough vertical layout: start below the heading (if any) and allot
        # a fixed height per entry, capped above the footer band.
        y_start = (len(chunks)) * 0.05 + 0.08
        y_end = min(0.92, y_start + len(toc_entries) * 0.025)
        chunks.append(_toc_page_chunk(
            order, "toc_entry", entry_text, pt_text,
            {"x": 0.05, "y": y_start, "w": 0.9, "h": y_end - y_start},
            [], 0.93,
            # NOTE(review): only the first 10 source lines are recorded even
            # though content covers every entry - confirm this cap is intended.
            [line_no for line_no, _ in toc_entries[:10]],
        ))
        order += 1

    if footer_text:
        chunks.append(_toc_page_chunk(
            order, "footer", footer_text, footer_text,
            {"x": 0.05, "y": 0.93, "w": 0.9, "h": 0.05},
            [], 0.95, [],
        ))

    return chunks
def process_all_pages():
    """Parse every page listed in PNG_PAGES into chunks.

    Each page's OCR text is read with ``read_ocr`` and handed to
    ``parse_page_chunks``. When the parser yields nothing, a single fallback
    chunk is produced instead: a ``paragraph`` holding the start of the raw
    OCR text, or a ``blank`` placeholder when the page has no text.

    Returns:
        List of ``(page_num, chunks)`` tuples in PNG_PAGES order; ``chunks``
        is never empty.
    """

    def _fallback_chunk(chunk_type, content_en, content_pt_br, bbox, ocr_confidence):
        """Build the single whole-page chunk used when parsing found nothing."""
        # Key order is kept stable because chunk dicts are later dumped as JSON.
        return {
            "order_in_page": 1,
            "type": chunk_type,
            "content_en": content_en,
            "content_pt_br": content_pt_br,
            "bbox": bbox,
            "classification": None, "formatting": [],
            "cross_page_hint": "self_contained", "ocr_confidence": ocr_confidence,
            "ocr_source_lines": [], "redaction_code": None,
            "redaction_inferred_content_type": None, "image_type": None,
            "ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
            "image_description_en": None, "image_description_pt_br": None, "extracted_text": None,
        }

    all_page_data = []
    for seq_pos, page_num in enumerate(PNG_PAGES):
        print(f" Processing page {page_num:03d} (seq {seq_pos+1}/{TOTAL_PAGES})...")
        ocr_text = read_ocr(page_num)

        chunks = parse_page_chunks(page_num, ocr_text)
        if not chunks:
            # Tolerate a missing OCR result (None) as well as an empty string.
            full_text = (ocr_text or "").strip()
            if full_text:
                # NOTE(review): the English text keeps 3000 characters but only
                # the first 1000 are translated - confirm the mismatch is intended.
                chunks = [_fallback_chunk(
                    "paragraph",
                    full_text[:3000],
                    make_pt_translation(full_text[:1000], "paragraph"),
                    {"x": 0.05, "y": 0.05, "w": 0.9, "h": 0.9},
                    0.85,
                )]
            else:
                chunks = [_fallback_chunk(
                    "blank",
                    "[Blank page]",
                    "[Página em branco]",
                    {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
                    1.0,
                )]
        all_page_data.append((page_num, chunks))
    return all_page_data
def write_chunk_file(chunk_data, page_num):
    """Serialize one chunk to ``<CHUNKS_DIR>/<chunk_id>.md``.

    The file consists of a ``---``-delimited front-matter block followed by
    the English and Portuguese content as ``**EN:**`` / ``**PT-BR:**``
    paragraphs.

    Args:
        chunk_data: Chunk dict; ``chunk_id`` is required, every other key
            falls back to a default.
        page_num: Page number written to the front matter and used to build
            the ``source_png`` path.
    """

    def quoted(value):
        # JSON string syntax doubles as a safely-escaped scalar; None -> null.
        return "null" if value is None else json.dumps(value, ensure_ascii=False)

    def field(key):
        return quoted(chunk_data.get(key, None))

    def flag(key):
        return str(chunk_data.get(key, False)).lower()

    chunk_id = chunk_data["chunk_id"]
    kind = chunk_data.get("type", "paragraph")
    box = chunk_data.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1})
    bx, by = box.get('x', 0), box.get('y', 0)
    bw, bh = box.get('w', 1), box.get('h', 0.1)
    # Only image chunks reference a companion image file.
    image_ref = f"IMG-{chunk_id}.png" if kind == "image" else None

    front_matter = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {kind}",
        f"page: {page_num}",
        f"order_in_page: {chunk_data.get('order_in_page', 1)}",
        f"order_global: {chunk_data.get('order_global', 1)}",
        f"bbox: {{x: {bx:.2f}, y: {by:.2f}, w: {bw:.2f}, h: {bh:.2f}}}",
        f"classification: {field('classification')}",
        f"formatting: {json.dumps(chunk_data.get('formatting', []))}",
        f"cross_page_hint: {chunk_data.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {field('prev_chunk')}",
        f"next_chunk: {field('next_chunk')}",
        f"related_image: {quoted(image_ref)}",
        f"related_table: {field('related_table')}",
        f"ocr_confidence: {chunk_data.get('ocr_confidence', 0.9)}",
        f"ocr_source_lines: {json.dumps(chunk_data.get('ocr_source_lines', []))}",
        f"redaction_code: {field('redaction_code')}",
        f"redaction_inferred_content_type: {field('redaction_inferred_content_type')}",
        f"image_type: {field('image_type')}",
        f"ufo_anomaly_detected: {flag('ufo_anomaly_detected')}",
        f"ufo_anomaly_type: {field('ufo_anomaly_type')}",
        f"ufo_anomaly_rationale: {field('ufo_anomaly_rationale')}",
        f"cryptid_anomaly_detected: {flag('cryptid_anomaly_detected')}",
        f"cryptid_anomaly_type: {field('cryptid_anomaly_type')}",
        f"cryptid_anomaly_rationale: {field('cryptid_anomaly_rationale')}",
        f"image_description_en: {field('image_description_en')}",
        f"image_description_pt_br: {field('image_description_pt_br')}",
        f"extracted_text: {field('extracted_text')}",
        f"source_png: ../../processing/png/{DOC_ID}/p-{page_num:03d}.png",
        "---",
    ]

    text_en = chunk_data.get("content_en", "")
    text_pt = chunk_data.get("content_pt_br", "")
    document = "\n".join(front_matter) + f"\n\n**EN:** {text_en}\n\n**PT-BR:** {text_pt}\n"

    Path(f"{CHUNKS_DIR}/{chunk_id}.md").write_text(document, encoding="utf-8")
def main():
    """Rebuild all output artifacts for DOC_ID and print summary statistics.

    Parses every page, assigns document-wide chunk ids with prev/next links,
    then writes one markdown file per chunk, ``_index.json`` and
    ``document.md`` under OUT_DIR.

    Returns:
        Tuple ``(total_pages, total_chunks, image_chunks, tables,
        ufo_anomalies, cryptid_anomalies, wall_seconds)``; ``tables`` is
        always 0.
    """
    start_time = time.time()
    print(f"=== Rebuilding {DOC_ID} ===")
    print(f"Total pages: {TOTAL_PAGES}")

    print("\nProcessing pages...")
    all_page_data = process_all_pages()

    # Flatten to one document-ordered list, tagging each chunk with its page.
    all_chunks = [
        {**chunk, "page_number": page_num}
        for page_num, chunks in all_page_data
        for chunk in chunks
    ]

    # Assign 1-based global ids and link each chunk to its neighbours.
    last_index = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["chunk_id"] = f"c{i+1:04d}"
        chunk["order_global"] = i + 1
        chunk["prev_chunk"] = f"c{i:04d}" if i > 0 else None
        chunk["next_chunk"] = f"c{i+2:04d}" if i < last_index else None
    print(f"Total chunks: {len(all_chunks)}")

    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"Image chunks: {len(image_chunks)}")

    print("Writing chunk files...")
    for chunk in all_chunks:
        write_chunk_file(chunk, chunk["page_number"])

    print("Writing _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    index_chunks = [
        {
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk["page_number"],
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}),
            "preview": chunk.get("content_en", "")[:80]
        }
        for chunk in all_chunks
    ]
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_chunks
    }
    with open(f"{OUT_DIR}/_index.json", "w", encoding="utf-8") as f:
        json.dump(index_data, f, ensure_ascii=False, indent=2)

    print("Building document.md...")
    type_histogram = {}
    ufo_anomalies = []
    cryptid_anomalies = []
    for chunk in all_chunks:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1
        if chunk.get("ufo_anomaly_detected", False):
            ufo_anomalies.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected", False):
            cryptid_anomalies.append(chunk["chunk_id"])

    # Front matter of the master document.
    doc_lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {len(all_chunks)}",
        "chunk_types_histogram:",
    ]
    for t, count in sorted(type_histogram.items()):
        doc_lines.append(f" {t}: {count}")
    doc_lines.extend([
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_anomalies)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomalies)}",
        'build_approach: "subagents"',
        "build_model: claude-sonnet-4-6",
        f"build_at: {build_at}",
        "---",
        "",
    ])

    # Group by page, preserving chunk order within each page.
    chunks_by_page = {}
    for chunk in all_chunks:
        chunks_by_page.setdefault(chunk["page_number"], []).append(chunk)

    for page_num in sorted(chunks_by_page):
        doc_lines.append(f"## Page {page_num}")
        doc_lines.append("")
        for chunk in chunks_by_page[page_num]:
            chunk_id = chunk["chunk_id"]
            chunk_type = chunk.get("type", "paragraph")
            bbox = chunk.get("bbox", {})
            bx, by, bw, bh = bbox.get("x", 0), bbox.get("y", 0), bbox.get("w", 1), bbox.get("h", 0.1)
            doc_lines.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
            doc_lines.append(f'<a id="{chunk_id}"></a>')
            # Id and type need a separator; they were previously emitted fused
            # together (e.g. "c0001heading").
            doc_lines.append(f"### Chunk {chunk_id} — {chunk_type} · p{page_num} · bbox: {bx:.2f}/{by:.2f}/{bw:.2f}/{bh:.2f}")
            doc_lines.append("")
            doc_lines.append(f"**EN:** {chunk.get('content_en', '')}")
            doc_lines.append("")
            doc_lines.append(f"**PT-BR:** {chunk.get('content_pt_br', '')}")
            doc_lines.append("")
            if chunk_type == "image":
                doc_lines.append(f"![{chunk_id} image](./images/IMG-{chunk_id}.png)")
                doc_lines.append("")
                if chunk.get("image_description_en"):
                    doc_lines.append(f"*Image description:* {chunk['image_description_en']}")
                    doc_lines.append("")
            # Everything except the already-rendered content goes into a
            # collapsible metadata block.
            meta = {k: v for k, v in chunk.items()
                    if k not in ("content_en", "content_pt_br", "page_number")}
            doc_lines.append("<details><summary>metadata</summary>")
            doc_lines.append("")
            doc_lines.append("```json")
            doc_lines.append(json.dumps(meta, ensure_ascii=False, indent=2))
            doc_lines.append("```")
            doc_lines.append("")
            doc_lines.append("</details>")
            doc_lines.append("")
            doc_lines.append("---")
            doc_lines.append("")

    document_md = "\n".join(doc_lines)
    with open(f"{OUT_DIR}/document.md", "w", encoding="utf-8") as f:
        f.write(document_md)

    wall_seconds = int(time.time() - start_time)
    doc_md_bytes = len(document_md.encode("utf-8"))
    print("\n=== DONE ===")
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={len(image_chunks)} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_md_bytes}")
    print(f"Wall time: {wall_seconds}s")
    return TOTAL_PAGES, len(all_chunks), len(image_chunks), 0, len(ufo_anomalies), len(cryptid_anomalies), wall_seconds
# Run the rebuild only when executed as a script, not when imported.
if __name__ == "__main__":
    main()