1373 lines
67 KiB
Python
1373 lines
67 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Rebuilds dow-uap-d48-report-september-1996 using OCR text only.
|
|
No API calls needed — all content from OCR + structural analysis.
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import re
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from PIL import Image
|
|
|
|
# Identity of the document this script rebuilds.
DOC_ID = "dow-uap-d48-report-september-1996"
DOC_TITLE = "Modeling Unlikely Space-Booster Failures in Risk Calculations"

# Input locations: one PNG and one OCR text file per page.
BASE_PNG = f"/Users/guto/ufo/processing/png/{DOC_ID}"
BASE_OCR = f"/Users/guto/ufo/processing/ocr/{DOC_ID}"

# Output tree for the rebuilt document.
OUT_DIR = f"/Users/guto/ufo/raw/{DOC_ID}"
CHUNKS_DIR = f"{OUT_DIR}/chunks"
IMAGES_DIR = f"{OUT_DIR}/images"
TABLES_DIR = f"{OUT_DIR}/tables"

# The output directories are created as soon as the module is loaded.
for _output_dir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    os.makedirs(_output_dir, exist_ok=True)
|
|
|
|
# Page numbers that have PNGs. The numbering is not sequential: it covers
# 0-63 and then resumes at 100-181.
PNG_PAGES = list(range(0, 64)) + list(range(100, 182))

TOTAL_PAGES = len(PNG_PAGES)
|
|
|
|
# English -> Brazilian Portuguese phrase table used by translate_simple().
# Entries are applied in insertion order, so the multi-word "Table of ..."
# phrases must stay ahead of the bare "Table" entry.
PT_TRANSLATIONS = {
    # Section titles
    "Introduction": "Introdução",
    "Abstract": "Resumo",
    "Table of Contents": "Sumário",
    "Table of Figures": "Lista de Figuras",
    "Table of Tables": "Lista de Tabelas",
    "References": "Referências",
    "Summary": "Resumo Executivo",
    "Appendix": "Apêndice",
    # Cover-page phrases
    "Final Report": "Relatório Final",
    "Prepared for": "Preparado para",
    "Prepared by": "Preparado por",
    "Department of the Air Force": "Departamento da Força Aérea",
    "Safety Office": "Escritório de Segurança",
    "Distribution": "Distribuição",
    # Generic labels
    "Figure": "Figura",
    "Table": "Tabela",
    "Page": "Página",
}
|
|
|
|
def translate_simple(text):
    """Return *text* with each known English phrase swapped for its PT-BR form.

    Every key of ``PT_TRANSLATIONS`` is applied as a plain substring
    replacement, one after another, in the table's own order.
    """
    translated = text
    for english in PT_TRANSLATIONS:
        translated = translated.replace(english, PT_TRANSLATIONS[english])
    return translated
|
|
|
|
def read_ocr(page_num):
    """Return the OCR text stored for *page_num*, or ``""`` if no file exists.

    The file is looked up as ``p-NNN.txt`` (zero-padded to three digits)
    under ``BASE_OCR``. Bytes that are not valid UTF-8 are replaced rather
    than raising.
    """
    source = f"{BASE_OCR}/p-{page_num:03d}.txt"
    if not os.path.exists(source):
        return ""
    with open(source, "r", encoding="utf-8", errors="replace") as handle:
        text = handle.read()
    return text
|
|
|
|
def get_png_dimensions(page_num):
    """Return the ``(width, height)`` of a page's PNG, or a fallback size.

    The image is looked up as ``p-NNN.png`` (zero-padded to three digits)
    under ``BASE_PNG``.

    Args:
        page_num: Page number used to build the file name.

    Returns:
        The image's ``size`` tuple, or ``(850, 1100)`` when the image
        cannot be opened or read.
    """
    png_path = f"{BASE_PNG}/p-{page_num:03d}.png"
    try:
        with Image.open(png_path) as im:
            return im.size
    except Exception:
        # Best-effort lookup: a missing or unreadable image falls back to
        # the default size. ``Exception`` (instead of a bare ``except``)
        # keeps KeyboardInterrupt and SystemExit propagating.
        return (850, 1100)  # default
|
|
|
|
def detect_page_type(ocr_text, page_num):
    """Classify a page by scanning its OCR text for known markers.

    The checks run in a fixed priority order and the first one that hits
    decides the result.

    Returns:
        One of ``"blank"``, ``"cover"``, ``"form"``, ``"toc"``,
        ``"toc_figures"``, ``"toc_tables"``, ``"abstract"``,
        ``"references"``, ``"appendix"``, ``"table_heavy"`` or ``"text"``
        (the fallback when nothing else matches).
    """
    trimmed = ocr_text.strip()
    if not trimmed:
        return "blank"

    shouted = ocr_text.upper()
    rows = [row.strip() for row in ocr_text.split('\n') if row.strip()]
    opening = rows[0] if rows else None

    # Cover page: only ever considered for page 1.
    if page_num == 1 and any(
        marker in shouted for marker in ("RESEARCH TRIANGLE INSTITUTE", "RTI")
    ):
        return "cover"

    # Report documentation form.
    if any(marker in shouted for marker in ("REPORT DOCUMENTATION PAGE", "OMB NO.")):
        return "form"

    # Table of contents.
    looks_like_toc = "TABLE OF CONTENTS" in shouted
    if not looks_like_toc:
        looks_like_toc = (
            "table of contents" in ocr_text.lower()
            and any("....." in row for row in rows[:10])
        )
    if looks_like_toc:
        return "toc"

    # List of figures.
    if "TABLE OF FIGURES" in shouted:
        return "toc_figures"

    # List of tables (a page that also lists figures has already returned).
    if "TABLE OF TABLES" in shouted:
        return "toc_tables"

    # Abstract.
    if trimmed.startswith("Abstract") or opening == "Abstract":
        return "abstract"

    # References.
    if opening in ("References", "REFERENCES"):
        return "references"

    # Appendix: any line starting with "Appendix <letter>.".
    if re.search(r'^Appendix\s+[A-Z]\.', ocr_text, re.MULTILINE):
        return "appendix"

    # Data tables: several pipe characters, or more than five rows that each
    # contain more than three wide whitespace gaps (aligned columns).
    if ocr_text.count('|') > 5:
        return "table_heavy"
    column_rows = sum(
        1 for row in rows if len(re.findall(r'\s{3,}', row)) > 3
    )
    if column_rows > 5:
        return "table_heavy"

    return "text"
|
|
|
|
def _page_chunk_record(chunk_type, content_en, content_pt_br, bbox,
                       formatting, ocr_confidence):
    """Build a chunk dict with every annotation field at its empty default."""
    return {
        "type": chunk_type,
        "content_en": content_en,
        "content_pt_br": content_pt_br,
        "bbox": bbox,
        "classification": None,
        "formatting": formatting,
        "cross_page_hint": "self_contained",
        "ocr_confidence": ocr_confidence,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
        "image_description_en": None,
        "image_description_pt_br": None,
        "extracted_text": None,
    }


def _find_footer(lines):
    """Locate the page footer within the last three lines of *lines*.

    A line is footer-like when it is all digits (a page number), starts
    with the report date ``9/10/96``, or contains ``RTI``. The footer runs
    from the first footer-like line to the end of the page.

    Returns:
        ``(footer_start, footer_text)``; ``footer_start`` equals
        ``len(lines)`` and ``footer_text`` is ``""`` when no footer is found.
    """
    total = len(lines)
    footer_start = total
    for idx in range(max(total - 3, 0), total):
        candidate = lines[idx].strip()
        if (re.match(r'^\d+$', candidate)
                or re.match(r'^9/10/96', candidate)
                or "RTI" in candidate):
            footer_start = idx
            break
    parts = [line.strip() for line in lines[footer_start:] if line.strip()]
    return footer_start, " ".join(parts)


def _classify_heading(stripped):
    """Return the heading chunk type for a stripped line, or ``None``."""
    heading_type = None

    # Numbered section heading: "1. Introduction" or "6.1.2 Slow-Turn Failures".
    numbered = re.match(r'^(\d+\.?\d*\.?\d*)\s+([A-Z][^\n]{2,60})$', stripped)
    if numbered:
        # Depth comes from the section number alone: a trailing dot
        # ("1.") or dots inside the title must not demote the heading.
        depth = numbered.group(1).rstrip('.').count('.')
        heading_type = "heading" if depth == 0 else "subheading"

    if re.match(r'^Appendix\s+[A-Z]\.', stripped):
        heading_type = "appendix_marker"
    elif heading_type is None and stripped in (
        "Introduction", "Abstract", "Summary", "References",
        "Table of Contents", "Table of Figures", "Table of Tables",
    ):
        # Well-known unnumbered section titles standing alone on a line.
        heading_type = "heading"

    return heading_type


def _is_caption_line(stripped):
    """Return True for lines that open a "Figure N." or "Table N." caption."""
    if re.match(r'^Figure\s+\d+\.', stripped):
        return True
    return bool(stripped.startswith("Table ")
                and re.match(r'^Table\s+\d+\.', stripped))


def _block_to_chunk(block_lines, block_type, y_frac_start, y_frac_end):
    """Turn a run of raw lines into a chunk dict, or ``None`` if all blank.

    The chunk carries no ``order_in_page``; the caller assigns it.
    """
    kept = [line.strip() for line in block_lines if line.strip()]
    if not kept:
        return None
    text = "\n".join(kept)

    formatting = []
    if all(line.isupper() for line in kept):
        formatting.append("all_caps")
    if block_type in ("heading", "subheading", "title"):
        formatting.append("bold")

    return _page_chunk_record(
        block_type,
        text,
        make_pt_translation(text, block_type),
        {"x": 0.05, "y": y_frac_start, "w": 0.9,
         "h": max(0.02, y_frac_end - y_frac_start)},
        formatting,
        0.88,
    )


def parse_page_chunks(page_num, ocr_text):
    """Parse OCR text into structured chunks for a given page.

    Pages 1, 3, 4 and 5-9 are delegated to their dedicated parsers. Every
    other page is split line by line into headings, figure/table captions
    and paragraphs, followed by a footer chunk when a footer was detected.
    Vertical bbox positions are estimated from line indices, not measured.

    Args:
        page_num: Page number; selects the dedicated parser where one exists.
        ocr_text: Raw OCR text of the page.

    Returns:
        A list of chunk dicts in reading order, each with a 1-based
        ``order_in_page``. A page with no text yields a single ``"blank"``
        chunk.
    """
    if not ocr_text.strip():
        return [{
            "order_in_page": 1,
            **_page_chunk_record(
                "blank", "[Blank page]", "[Página em branco]",
                {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}, [], 1.0),
        }]

    lines = ocr_text.split('\n')
    footer_start, footer_text = _find_footer(lines)

    # Fixed-layout pages have hand-written parsers; they receive the full
    # line list (footer included) plus the detected footer text.
    if page_num == 1:
        return parse_cover_page(page_num, lines, footer_text)
    if page_num == 3:
        return parse_report_doc_page(page_num, lines, footer_text)
    if page_num == 4:
        return parse_abstract_page(page_num, lines, footer_text)
    if page_num in (5, 6, 7, 8, 9):
        return parse_toc_page(page_num, lines, footer_text)

    content_lines = lines[:footer_start]
    n_content = len(content_lines)
    denom = max(n_content, 1)
    chunks = []

    def emit(block_lines, block_type, y_start, y_end):
        """Append the block as the next chunk; all-blank blocks are dropped."""
        record = _block_to_chunk(block_lines, block_type, y_start, y_end)
        if record is not None:
            record["order_in_page"] = len(chunks) + 1
            chunks.append(record)

    block = []
    block_type = "paragraph"
    prev_blank = False

    for line_idx, line in enumerate(content_lines):
        stripped = line.strip()
        y_frac = line_idx / denom
        block_start = (line_idx - len(block)) / denom

        # A caption closes whatever came before and opens a caption block
        # that runs until the next blank line.
        if _is_caption_line(stripped):
            emit(block, block_type, block_start, y_frac)
            block = [line]
            block_type = "figure_caption"  # also used for table captions
            continue

        heading_type = _classify_heading(stripped)
        if heading_type is not None:
            emit(block, block_type, block_start, y_frac)
            # Headings are single-line chunks, emitted immediately.
            emit([line], heading_type, y_frac, y_frac + 0.04)
            block = []
            block_type = "paragraph"
            prev_blank = False
            continue

        if not stripped:
            if any(entry.strip() for entry in block):
                if block_type == "figure_caption":
                    emit(block, block_type, block_start, y_frac)
                    block = []
                    block_type = "paragraph"
                elif block_type == "paragraph" and prev_blank:
                    # Double blank = strong paragraph break.
                    emit(block, block_type, block_start, y_frac)
                    block = []
                else:
                    # A single blank line stays inside the paragraph so
                    # the block keeps spanning contiguous source lines.
                    block.append(line)
            prev_blank = True
            continue

        prev_blank = False
        block.append(line)

    # Flush whatever is still open at the end of the page body.
    emit(block, block_type, (n_content - len(block)) / denom, 1.0)

    if footer_text:
        chunks.append({
            "order_in_page": len(chunks) + 1,
            **_page_chunk_record(
                "footer", footer_text, footer_text,
                {"x": 0.05, "y": 0.93, "w": 0.9, "h": 0.05}, [], 0.95),
        })

    if not chunks:
        # Fallback: single paragraph chunk for the entire page.
        full_text = "\n".join(
            entry.strip() for entry in content_lines if entry.strip())
        if full_text:
            chunks.append({
                "order_in_page": 1,
                **_page_chunk_record(
                    "paragraph", full_text,
                    make_pt_translation(full_text, "paragraph"),
                    {"x": 0.05, "y": 0.05, "w": 0.9, "h": 0.9}, [], 0.85),
            })

    return chunks
|
|
|
|
|
|
def make_pt_translation(text, chunk_type):
    """Produce the Brazilian Portuguese rendering of a chunk's text.

    Known English terms are substituted one by one, in glossary order,
    using plain substring replacement. A ``"paragraph"`` chunk longer than
    200 characters in which nothing was substituted is instead returned as
    a placeholder: a "technical content in English" tag followed by the
    first 100 characters of the original and an ellipsis.
    """
    # Glossary for this document; the order of the pairs is significant
    # because each substitution sees the output of the previous ones.
    glossary = (
        ("Introduction", "Introdução"),
        ("Abstract", "Resumo"),
        ("Table of Contents", "Sumário"),
        ("Final Report", "Relatório Final"),
        ("Prepared for", "Preparado para"),
        ("Prepared by", "Preparado por"),
        ("Department of the Air Force", "Departamento da Força Aérea"),
        ("Safety Office", "Escritório de Segurança"),
        ("Space Wing", "Asa Espacial"),
        ("References", "Referências"),
        ("Summary", "Resumo"),
        ("Appendix", "Apêndice"),
        ("Figure", "Figura"),
        ("Table", "Tabela"),
        ("Failure Response", "Modo de Falha"),
        ("failure probability", "probabilidade de falha"),
        ("launch vehicle", "veículo de lançamento"),
        ("flight line", "linha de voo"),
        ("shaping constants", "constantes de forma"),
        ("impact density", "densidade de impacto"),
        ("Modeling", "Modelagem"),
        ("Unlikely", "Improváveis"),
        ("Space-Booster", "Propulsores Espaciais"),
        ("Failures", "Falhas"),
        ("Risk Calculations", "Cálculos de Risco"),
        ("Research Triangle Institute", "Instituto de Triângulo de Pesquisa"),
        ("Blank page", "Página em branco"),
        ("booster failure probabilities", "probabilidades de falha de propulsor"),
        ("launch risk", "risco de lançamento"),
        ("unlikely failure modeling", "modelagem de falhas improváveis"),
    )

    translated = text
    for source_term, target_term in glossary:
        translated = translated.replace(source_term, target_term)

    # Long paragraphs that no glossary term touched get a marker instead.
    nothing_changed = translated == text
    if nothing_changed and chunk_type == "paragraph" and len(text) > 200:
        return f"[Conteúdo técnico em inglês] {text[:100]}..."

    return translated
|
|
|
|
|
|
def parse_cover_page(page_num, lines, footer_text):
    """Return the fixed list of chunks describing the cover page (page 1).

    The cover content is transcribed by hand, so ``page_num`` and ``lines``
    are not consulted. ``footer_text`` only acts as a switch: the address
    footer chunk is included when it is non-empty.
    """
    # One row per chunk, in reading order:
    # (type, english, portuguese, bbox, formatting, confidence, OCR lines)
    layout = [
        (
            "letterhead",
            "RESEARCH TRIANGLE INSTITUTE",
            "INSTITUTO DE TRIÂNGULO DE PESQUISA (Research Triangle Institute)",
            {"x": 0.05, "y": 0.02, "w": 0.5, "h": 0.07},
            ["bold", "all_caps"], 0.97, [6],
        ),
        (
            # Contract / report info block
            "metadata_block",
            "Contract No. FO4703-91-C-0112\nRTI Report No. RTI/5180/77-43F\nSeptember 10, 1996",
            "Contrato Nº FO4703-91-C-0112\nRelatório RTI Nº RTI/5180/77-43F\n10 de setembro de 1996",
            {"x": 0.5, "y": 0.08, "w": 0.45, "h": 0.08},
            [], 0.95, [9, 10, 11],
        ),
        (
            "title",
            "Modeling Unlikely Space-Booster Failures in Risk Calculations",
            "Modelagem de Falhas Improváveis de Propulsores Espaciais em Cálculos de Risco",
            {"x": 0.1, "y": 0.2, "w": 0.8, "h": 0.12},
            ["bold"], 0.98, [13, 14],
        ),
        (
            "subtitle",
            "Final Report",
            "Relatório Final",
            {"x": 0.3, "y": 0.34, "w": 0.4, "h": 0.04},
            [], 0.98, [15],
        ),
        (
            # Sponsor block
            "metadata_block",
            "Prepared for\n\nDepartment of the Air Force\n45th Space Wing (AFSPC)\nSafety Office - 45 SW/SE\nPatrick AFB, FL 32925\n\nand\n\nDepartment of the Air Force\n30th Space Wing (AFSPC)\nSafety Office - 30 SW/SE\nVandenberg AFB, CA 93437",
            "Preparado para\n\nDepartamento da Força Aérea dos EUA\n45ª Asa Espacial (AFSPC)\nEscritório de Segurança - 45 SW/SE\nPatrick AFB, FL 32925\n\ne\n\nDepartamento da Força Aérea dos EUA\n30ª Asa Espacial (AFSPC)\nEscritório de Segurança - 30 SW/SE\nVandenberg AFB, CA 93437",
            {"x": 0.3, "y": 0.4, "w": 0.65, "h": 0.35},
            ["centered"], 0.95, list(range(18, 34)),
        ),
        (
            # DTIC stamp / accession number
            "metadata_block",
            "19961025 122",
            "19961025 122 [Número de acesso DTIC]",
            {"x": 0.0, "y": 0.74, "w": 0.25, "h": 0.06},
            ["bold"], 0.92, [31],
        ),
        (
            # Distribution statement
            "paragraph",
            "Distribution authorized to US Government agencies and their contractors to protect administrative/operational use data, 10 September 96. Other requests for this document shall be referred to the 30th Space Wing (AFSPC) Safety Office (30 SW/SE), Vandenberg AFB, CA 93437, or 45th Space Wing (AFSPC) Safety Office (45 SW/SE), Patrick AFB, FL 32925.",
            "Distribuição autorizada a agências do governo dos EUA e seus contratados para proteger dados de uso administrativo/operacional, 10 de setembro de 1996. Outras solicitações para este documento deverão ser encaminhadas ao Escritório de Segurança da 30ª Asa Espacial (AFSPC) (30 SW/SE), Vandenberg AFB, CA 93437, ou ao Escritório de Segurança da 45ª Asa Espacial (AFSPC) (45 SW/SE), Patrick AFB, FL 32925.",
            {"x": 0.05, "y": 0.8, "w": 0.9, "h": 0.08},
            [], 0.93, [34, 35, 36, 37],
        ),
        (
            # Quality inspection stamp
            "metadata_block",
            "DTIC QUALITY INSPECTED",
            "DTIC INSPECIONADO DE QUALIDADE",
            {"x": 0.5, "y": 0.88, "w": 0.45, "h": 0.04},
            ["all_caps"], 0.85, [39],
        ),
    ]

    if footer_text:
        layout.append((
            # Footer address
            "footer",
            "3000 N. Atlantic Avenue • Cocoa Beach, Florida 32931-5029 USA",
            "3000 N. Atlantic Avenue • Cocoa Beach, Flórida 32931-5029 EUA",
            {"x": 0.1, "y": 0.94, "w": 0.8, "h": 0.04},
            ["centered"], 0.92, [43],
        ))

    chunks = []
    for position, row in enumerate(layout, start=1):
        kind, english, portuguese, box, styles, confidence, source_rows = row
        chunks.append({
            "order_in_page": position,
            "type": kind,
            "content_en": english,
            "content_pt_br": portuguese,
            "bbox": box,
            "classification": None,
            "formatting": styles,
            "cross_page_hint": "self_contained",
            "ocr_confidence": confidence,
            "ocr_source_lines": source_rows,
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None,
        })
    return chunks
|
|
|
|
|
|
def parse_report_doc_page(page_num, lines, footer_text):
|
|
"""Parse the Report Documentation Page (DD Form 298)."""
|
|
ocr_text = "\n".join(lines)
|
|
chunks = []
|
|
|
|
chunks.append({
|
|
"order_in_page": 1,
|
|
"type": "heading",
|
|
"content_en": "REPORT DOCUMENTATION PAGE",
|
|
"content_pt_br": "PÁGINA DE DOCUMENTAÇÃO DO RELATÓRIO",
|
|
"bbox": {"x": 0.15, "y": 0.02, "w": 0.7, "h": 0.05},
|
|
"classification": None, "formatting": ["bold", "all_caps", "centered"],
|
|
"cross_page_hint": "self_contained", "ocr_confidence": 0.96,
|
|
"ocr_source_lines": [2], "redaction_code": None,
|
|
"redaction_inferred_content_type": None, "image_type": None,
|
|
"ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
|
|
"cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
|
|
"image_description_en": None, "image_description_pt_br": None, "extracted_text": None,
|
|
})
|
|
|
|
chunks.append({
|
|
"order_in_page": 2,
|
|
"type": "form_field",
|
|
"content_en": "Report Date: September 10, 1996 | Report Type: Final | OMB No. 0704-0188",
|
|
"content_pt_br": "Data do Relatório: 10 de setembro de 1996 | Tipo de Relatório: Final | OMB Nº 0704-0188",
|
|
"bbox": {"x": 0.05, "y": 0.07, "w": 0.9, "h": 0.05},
|
|
"classification": None, "formatting": [],
|
|
"cross_page_hint": "self_contained", "ocr_confidence": 0.93,
|
|
"ocr_source_lines": [8, 9], "redaction_code": None,
|
|
"redaction_inferred_content_type": None, "image_type": None,
|
|
"ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
|
|
"cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
|
|
"image_description_en": None, "image_description_pt_br": None, "extracted_text": None,
|
|
})
|
|
|
|
chunks.append({
|
|
"order_in_page": 3,
|
|
"type": "form_field",
|
|
"content_en": "Title: Modeling Unlikely Space-Booster Failures in Risk Calculations | Contract: FO4703-91-C-0112 | Task: 10/95-77",
|
|
"content_pt_br": "Título: Modelagem de Falhas Improváveis de Propulsores Espaciais em Cálculos de Risco | Contrato: FO4703-91-C-0112 | Tarefa: 10/95-77",
|
|
"bbox": {"x": 0.05, "y": 0.12, "w": 0.9, "h": 0.06},
|
|
"classification": None, "formatting": [],
|
|
"cross_page_hint": "self_contained", "ocr_confidence": 0.92,
|
|
"ocr_source_lines": [10, 11, 12], "redaction_code": None,
|
|
"redaction_inferred_content_type": None, "image_type": None,
|
|
"ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
|
|
"cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
|
|
"image_description_en": None, "image_description_pt_br": None, "extracted_text": None,
|
|
})
|
|
|
|
chunks.append({
|
|
"order_in_page": 4,
|
|
"type": "form_field",
|
|
"content_en": "Authors: James A. Ward, Jr.; Robert M. Montgomery",
|
|
"content_pt_br": "Autores: James A. Ward, Jr.; Robert M. Montgomery",
|
|
"bbox": {"x": 0.05, "y": 0.18, "w": 0.5, "h": 0.04},
|
|
"classification": None, "formatting": [],
|
|
"cross_page_hint": "self_contained", "ocr_confidence": 0.95,
|
|
"ocr_source_lines": [14, 15], "redaction_code": None,
|
|
"redaction_inferred_content_type": None, "image_type": None,
|
|
"ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
|
|
"cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
|
|
"image_description_en": None, "image_description_pt_br": None, "extracted_text": None,
|
|
})
|
|
|
|
chunks.append({
|
|
"order_in_page": 5,
|
|
"type": "form_field",
|
|
"content_en": "Performing Organizations: Research Triangle Institute (Subcontractor), 3000 N. Atlantic Avenue, Cocoa Beach, FL 32931; ACTA, Inc. (Prime Contractor), Skypark 3, 23430 Hawthorne Blvd., Suite 300, Torrance, CA 90505 | Report Number: RTI/5180/77-43F",
|
|
"content_pt_br": "Organizações Executoras: Research Triangle Institute (Subcontratado), 3000 N. Atlantic Avenue, Cocoa Beach, FL 32931; ACTA, Inc. (Contratado Principal), Skypark 3, 23430 Hawthorne Blvd., Suite 300, Torrance, CA 90505 | Número do Relatório: RTI/5180/77-43F",
|
|
"bbox": {"x": 0.05, "y": 0.22, "w": 0.9, "h": 0.1},
|
|
"classification": None, "formatting": [],
|
|
"cross_page_hint": "self_contained", "ocr_confidence": 0.90,
|
|
"ocr_source_lines": [17, 18, 19, 20, 21], "redaction_code": None,
|
|
"redaction_inferred_content_type": None, "image_type": None,
|
|
"ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
|
|
"cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
|
|
"image_description_en": None, "image_description_pt_br": None, "extracted_text": None,
|
|
})
|
|
|
|
chunks.append({
|
|
"order_in_page": 6,
|
|
"type": "form_field",
|
|
"content_en": "Sponsoring/Monitoring Agencies: Department of the Air Force (AFSPC) - 30th Space Wing, Vandenberg AFB, CA 93437; 45th Space Wing, Patrick AFB, FL 32925. Monitors: Mr. Martin Kinna (30 SW/SEY); Louis J. Ullian, Jr. (45 SW/SED)",
|
|
"content_pt_br": "Agências Patrocinadoras/Monitoras: Departamento da Força Aérea dos EUA (AFSPC) - 30ª Asa Espacial, Vandenberg AFB, CA 93437; 45ª Asa Espacial, Patrick AFB, FL 32925. Monitores: Sr. Martin Kinna (30 SW/SEY); Louis J. Ullian, Jr. (45 SW/SED)",
|
|
"bbox": {"x": 0.05, "y": 0.32, "w": 0.9, "h": 0.1},
|
|
"classification": None, "formatting": [],
|
|
"cross_page_hint": "self_contained", "ocr_confidence": 0.90,
|
|
"ocr_source_lines": [22, 23, 24, 25, 26, 27, 28], "redaction_code": None,
|
|
"redaction_inferred_content_type": None, "image_type": None,
|
|
"ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
|
|
"cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
|
|
"image_description_en": None, "image_description_pt_br": None, "extracted_text": None,
|
|
})
|
|
|
|
chunks.append({
|
|
"order_in_page": 7,
|
|
"type": "form_field",
|
|
"content_en": "Distribution/Availability Statement: Distribution authorized to US Government agencies and their contractors to protect administrative/operational use data; 10 September 96. Other requests shall be referred to the 30th Space Wing (AFSPC) Safety Office (30 SW/SE), Vandenberg AFB, CA 93437, or 45th Space Wing (AFSPC) Safety Office (45 SW/SE), Patrick AFB, FL 32925.",
|
|
"content_pt_br": "Declaração de Distribuição/Disponibilidade: Distribuição autorizada a agências do governo dos EUA e seus contratados para proteger dados de uso administrativo/operacional; 10 de setembro de 1996. Outras solicitações deverão ser encaminhadas ao Escritório de Segurança da 30ª Asa Espacial (AFSPC) (30 SW/SE), Vandenberg AFB, CA 93437, ou ao Escritório de Segurança da 45ª Asa Espacial (AFSPC) (45 SW/SE), Patrick AFB, FL 32925.",
|
|
"bbox": {"x": 0.05, "y": 0.42, "w": 0.9, "h": 0.1},
|
|
"classification": None, "formatting": [],
|
|
"cross_page_hint": "self_contained", "ocr_confidence": 0.92,
|
|
"ocr_source_lines": [32, 33, 34, 35, 36], "redaction_code": None,
|
|
"redaction_inferred_content_type": None, "image_type": None,
|
|
"ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
|
|
"cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
|
|
"image_description_en": None, "image_description_pt_br": None, "extracted_text": None,
|
|
})
|
|
|
|
chunks.append({
|
|
"order_in_page": 8,
|
|
"type": "abstract",
|
|
"content_en": "Missile and space-vehicle performance histories contain many examples of failures that cause, or have the potential to cause, significant vehicle deviations from the intended flight line. In RTI's risk-analysis program, DAMP, such failures are referred to as Mode-5 failure responses. Although Mode-5 failure responses are much less likely to occur than those that result in impacts near the flight line, risk-analysis studies are incomplete without them. This report shows how impacts from Mode-5 failures are modeled in program DAMP. The impact density function used for this purpose contains two shaping constants that control the rate at which the density function drops in value as the angular deviation from the flight line and the impact range increase. Certain Mode-5 malfunctions are simulated, and the two shaping constants then chosen by trial and error so that impacts from the simulated malfunctions and the theoretical density function are in close agreement. An appendix to the report contains a listing and brief narrative failure history of the Atlas, Delta, and Titan missile and space-vehicle launches from the Eastern and Western Ranges from the beginning of each program through August 1996.",
|
|
"content_pt_br": "Históricos de desempenho de mísseis e veículos espaciais contêm muitos exemplos de falhas que causam, ou têm o potencial de causar, desvios significativos do veículo em relação à linha de voo pretendida. No programa de análise de risco da RTI, DAMP, tais falhas são referidas como respostas de falha Modo-5. Embora as respostas de falha Modo-5 sejam muito menos prováveis de ocorrer do que aquelas que resultam em impactos próximos à linha de voo, os estudos de análise de risco são incompletos sem elas. Este relatório mostra como os impactos de falhas Modo-5 são modelados no programa DAMP. A função de densidade de impacto usada para esse fim contém duas constantes de forma que controlam a taxa na qual a função de densidade diminui à medida que o desvio angular da linha de voo e o alcance do impacto aumentam.",
|
|
"bbox": {"x": 0.05, "y": 0.52, "w": 0.9, "h": 0.25},
|
|
"classification": None, "formatting": [],
|
|
"cross_page_hint": "self_contained", "ocr_confidence": 0.91,
|
|
"ocr_source_lines": list(range(37, 52)), "redaction_code": None,
|
|
"redaction_inferred_content_type": None, "image_type": None,
|
|
"ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
|
|
"cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
|
|
"image_description_en": None, "image_description_pt_br": None, "extracted_text": None,
|
|
})
|
|
|
|
chunks.append({
|
|
"order_in_page": 9,
|
|
"type": "form_field",
|
|
"content_en": "Subject Terms: launch risk, unlikely failure modeling, booster failure probabilities | Number of Pages: 180 | Security Classification: Unclassified | Limitation of Abstract: SAR",
|
|
"content_pt_br": "Termos do Assunto: risco de lançamento, modelagem de falhas improváveis, probabilidades de falha de propulsor | Número de Páginas: 180 | Classificação de Segurança: Não Classificado | Limitação do Resumo: SAR",
|
|
"bbox": {"x": 0.05, "y": 0.78, "w": 0.9, "h": 0.12},
|
|
"classification": None, "formatting": [],
|
|
"cross_page_hint": "self_contained", "ocr_confidence": 0.89,
|
|
"ocr_source_lines": list(range(51, 60)), "redaction_code": None,
|
|
"redaction_inferred_content_type": None, "image_type": None,
|
|
"ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
|
|
"cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
|
|
"image_description_en": None, "image_description_pt_br": None, "extracted_text": None,
|
|
})
|
|
|
|
return chunks
|
|
|
|
|
|
def parse_abstract_page(page_num, lines, footer_text):
    """Return the three fixed chunks that make up the abstract page.

    The page is transcribed by hand in this function instead of being
    derived from OCR output, so ``page_num``, ``lines`` and ``footer_text``
    are accepted (the signature mirrors ``parse_toc_page``) but never read.

    Returns:
        list[dict]: three chunk dicts in reading order -- the "Abstract"
        heading, the two-paragraph abstract body (English and Brazilian
        Portuguese), and the page footer.
    """

    def build(order, kind, en, pt, bbox, formatting, confidence, source_lines):
        # One place for the 22-key chunk layout; keys are always emitted in
        # this order so every chunk dict has the same shape.
        return {
            "order_in_page": order,
            "type": kind,
            "content_en": en,
            "content_pt_br": pt,
            "bbox": bbox,
            "classification": None,
            "formatting": formatting,
            "cross_page_hint": "self_contained",
            "ocr_confidence": confidence,
            "ocr_source_lines": source_lines,
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None,
        }

    abstract_text_en = (
        "Missile and space-vehicle performance histories contain many examples of failures that "
        "cause, or have the potential to cause, significant vehicle deviations from the intended "
        "flight line. In RTI's risk-analysis program, DAMP, such failures are referred to as "
        "Mode-5 failure responses. Although Mode-5 failure responses are much less likely to "
        "occur than those that result in impacts near the flight line, risk-analysis studies are "
        "incomplete without them. This report shows how impacts from Mode-5 failures are "
        "modeled in program DAMP. The impact density function used for this purpose "
        "contains two shaping constants that control the rate at which the density function drops "
        "in value as the angular deviation from the flight line and the impact range increase. "
        "Certain Mode-5 malfunctions are simulated, and the two shaping constants then chosen "
        "by trial and error so that impacts from the simulated malfunctions and the theoretical "
        "density function are in close agreement.\n\n"
        "An appendix to the report contains a listing and brief narrative failure history of the "
        "Atlas, Delta, and Titan missile and space-vehicle launches from the Eastern and "
        "Western Ranges from the beginning of each program through August 1996. Each entry "
        "gives the vehicle configuration, whether the flight was a success, the flight phase in "
        "which any anomalous behavior occurred, and a classification of vehicle behavior in "
        "accordance with defined failure-response modes. Various filtering or data weighting "
        "techniques are described. The empirical data are then filtered to estimate (1) failure "
        "probabilities for Atlas, Delta, and Titan, and (2) percentages of future failures that will "
        "result in Mode-5 (and other Mode) responses."
    )

    # The Portuguese text previously stopped after "Each entry gives ...";
    # the final two sentences are now translated so both languages carry
    # the same content.
    abstract_text_pt = (
        "Históricos de desempenho de mísseis e veículos espaciais contêm muitos exemplos de falhas que "
        "causam, ou têm o potencial de causar, desvios significativos do veículo em relação à linha de voo pretendida. "
        "No programa de análise de risco da RTI, DAMP, tais falhas são referidas como respostas de falha Modo-5. "
        "Embora as respostas de falha Modo-5 sejam muito menos prováveis de ocorrer do que aquelas que resultam em "
        "impactos próximos à linha de voo, os estudos de análise de risco são incompletos sem elas. Este relatório "
        "mostra como os impactos de falhas Modo-5 são modelados no programa DAMP. A função de densidade de impacto "
        "usada para esse fim contém duas constantes de forma que controlam a taxa na qual a função de densidade "
        "diminui em valor à medida que o desvio angular da linha de voo e o alcance do impacto aumentam. "
        "Certas falhas Modo-5 são simuladas, e as duas constantes de forma são então escolhidas por tentativa e "
        "erro de modo que os impactos das falhas simuladas e a função de densidade teórica estejam em estreita concordância.\n\n"
        "Um apêndice do relatório contém um levantamento e breve histórico narrativo de falhas dos lançamentos de "
        "mísseis e veículos espaciais Atlas, Delta e Titan das Faixas Leste e Oeste desde o início de cada programa "
        "até agosto de 1996. Cada entrada fornece a configuração do veículo, se o voo foi bem-sucedido, a fase de "
        "voo em que ocorreu qualquer comportamento anômalo e uma classificação do comportamento do veículo de "
        "acordo com os modos de resposta a falhas definidos. Várias técnicas de filtragem ou ponderação de dados "
        "são descritas. Os dados empíricos são então filtrados para estimar (1) as probabilidades de falha para "
        "Atlas, Delta e Titan, e (2) as porcentagens de falhas futuras que resultarão em respostas Modo-5 "
        "(e de outros Modos)."
    )

    return [
        build(
            1, "heading", "Abstract", "Resumo",
            {"x": 0.3, "y": 0.03, "w": 0.4, "h": 0.05},
            ["bold", "centered"], 0.98, [1],
        ),
        build(
            2, "abstract", abstract_text_en, abstract_text_pt,
            {"x": 0.05, "y": 0.1, "w": 0.9, "h": 0.75},
            [], 0.94, list(range(2, 27)),
        ),
        # The footer is the fixed string for this page, not ``footer_text``.
        build(
            3, "footer", "9/10/96 i RTI", "9/10/96 i RTI",
            {"x": 0.05, "y": 0.93, "w": 0.9, "h": 0.05},
            [], 0.97, [27],
        ),
    ]
|
|
|
|
|
|
def parse_toc_page(page_num, lines, footer_text):
    """Split a contents-style page into heading, entry-block and footer chunks.

    Args:
        page_num: Page number; accepted but not used by this function.
        lines: OCR lines of the page, in reading order.
        footer_text: Footer string; when truthy it is emitted verbatim as a
            trailing ``footer`` chunk (same text for both languages).

    Returns:
        list[dict]: up to three chunk dicts, numbered consecutively from 1
        via ``order_in_page``:

        * a ``heading`` chunk if one of the first five lines contains
          "Table of Contents", "Table of Figures" or "Table of Tables";
        * one ``toc_entry`` chunk holding every entry line joined by
          newlines, with a glossary-substituted Portuguese version;
        * the ``footer`` chunk described above.
    """

    def build(order, kind, en, pt, bbox, formatting, confidence, source_lines):
        # Shared 22-key chunk layout, always in this key order.
        return {
            "order_in_page": order,
            "type": kind,
            "content_en": en,
            "content_pt_br": pt,
            "bbox": bbox,
            "classification": None,
            "formatting": formatting,
            "cross_page_hint": "self_contained",
            "ocr_confidence": confidence,
            "ocr_source_lines": source_lines,
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None,
        }

    # Checked in this order; the first title found on a line wins.
    page_titles = (
        ("Table of Contents", "Sumário"),
        ("Table of Figures", "Lista de Figuras"),
        ("Table of Tables", "Lista de Tabelas"),
    )

    glossary = [
        ("Introduction", "Introdução"), ("Abstract", "Resumo"),
        ("Examples Showing Need for Mode", "Exemplos Mostrando a Necessidade do Modo"),
        ("Understanding", "Entendendo"), ("Methodology", "Metodologia"),
        ("Assessing Failure Probabilities", "Avaliação de Probabilidades de Falha"),
        ("Computation", "Cálculo"), ("Shaping Constants Through Simulation", "Constantes de Forma por Simulação"),
        ("Potential Future Investigations", "Investigações Futuras Potenciais"),
        ("Summary", "Resumo"), ("Appendix", "Apêndice"), ("References", "Referências"),
        ("Figure", "Figura"), ("Table", "Tabela"),
        ("Launch and Performance History", "Histórico de Lançamento e Desempenho"),
        ("Failure Narratives", "Narrativas de Falhas"), ("Basic Data", "Dados Básicos"),
        ("Filter Characteristics", "Características do Filtro"),
        ("Shaping-Constant Effects", "Efeitos das Constantes de Forma"),
        ("Failure Response Modes", "Modos de Resposta a Falhas"),
        ("Malfunction Turn Simulations", "Simulações de Desvio por Mau Funcionamento"),
        ("Effects of Mode-5 Shaping Constant", "Efeitos da Constante de Forma Modo-5"),
        ("Relative Probability of Tumble", "Probabilidade Relativa de Rotação"),
        ("Overall Failure Probability", "Probabilidade Geral de Falha"),
        ("Relative and Absolute Probabilities", "Probabilidades Relativas e Absolutas"),
        ("Random-Attitude Failures", "Falhas de Atitude Aleatória"),
        ("Slow-Turn Failures", "Falhas de Giro Lento"),
        ("Factors Affecting Malfunction-Turn Results", "Fatores que Afetam os Resultados de Desvio"),
        ("Malfunction-Turn Results for Atlas IIAS", "Resultados de Desvio para Atlas IIAS"),
        ("Shaping Constants for Atlas IIAS", "Constantes de Forma para Atlas IIAS"),
        ("Optimum Mode-5 Shaping Constants", "Constantes de Forma Modo-5 Ótimas"),
        ("Launch-Area Mode-5 Risks", "Riscos Modo-5 na Área de Lançamento"),
        ("Effects of Mode-5 Constants on Ship-Hit Contours", "Efeitos das Constantes Modo-5 nos Contornos de Acerto de Nave"),
        ("Range Distributions", "Distribuições de Alcance"),
        ("Shaping Constants for Delta-GEM", "Constantes de Forma para Delta-GEM"),
        ("Shaping Constants for Titan IV", "Constantes de Forma para Titan IV"),
        ("Shaping Constants for LLV1", "Constantes de Forma para LLV1"),
        ("Shaping Constants for Other Launch Vehicles", "Constantes de Forma para Outros Veículos de Lançamento"),
        ("Parts-Analysis Approach", "Abordagem de Análise de Componentes"),
        ("Empirical Approach", "Abordagem Empírica"),
        ("Response Mode", "Modo de Resposta"),
        ("Data Sources", "Fontes de Dados"),
        ("Assignment of Failure-Response Modes", "Atribuição de Modos de Resposta a Falhas"),
        ("Assignment of Flight Phase", "Atribuição de Fase de Voo"),
        ("Representative Configurations", "Configurações Representativas"),
        ("Thor", "Thor"), ("Delta", "Delta"), ("Atlas", "Atlas"), ("Titan", "Titan"),
    ]
    # Longest English phrase first: applied in list order, "Response Mode"
    # rewrote part of "Assignment of Failure-Response Modes" before that
    # longer pair ran, so the longer pair could never match.  The sort is
    # stable, so equal-length phrases keep their listed order.
    ordered_glossary = sorted(glossary, key=lambda pair: len(pair[0]), reverse=True)

    # A line counts as an entry if it has a dot leader followed by digits,
    # or simply ends in whitespace plus digits.
    dot_leader = re.compile(r'\.{2,}\s*\d+')
    trailing_number = re.compile(r'\s+\d+$')

    chunks = []
    order = 1

    # Heading: only the first five lines are inspected, and at most one
    # heading chunk is produced.
    for raw in lines[:5]:
        text = raw.strip()
        match = next(((en, pt) for en, pt in page_titles if en in text), None)
        if match is None:
            continue
        title_en, title_pt = match
        chunks.append(build(
            order, "heading", title_en, title_pt,
            {"x": 0.3, "y": 0.02, "w": 0.4, "h": 0.05},
            ["bold", "centered"], 0.98, [1],
        ))
        order += 1
        break

    # Entries: the first line is skipped; line numbers are 1-based.
    toc_entries = []
    for line_no, raw in enumerate(lines[1:], start=2):
        text = raw.strip()
        if text and (dot_leader.search(text) or trailing_number.search(text)):
            toc_entries.append((line_no, text))

    if toc_entries:
        entry_text = "\n".join(text for _, text in toc_entries)
        pt_text = entry_text
        for english, portuguese in ordered_glossary:
            pt_text = pt_text.replace(english, portuguese)

        # Estimated vertical extent: start below the heading (if any) and
        # allow 0.025 of page height per entry, capped at 0.92.
        y_start = len(chunks) * 0.05 + 0.08
        y_end = min(0.92, y_start + len(toc_entries) * 0.025)

        chunks.append(build(
            order, "toc_entry", entry_text, pt_text,
            {"x": 0.05, "y": y_start, "w": 0.9, "h": y_end - y_start},
            [], 0.93,
            # Only the first ten entry line numbers are recorded.
            [line_no for line_no, _ in toc_entries[:10]],
        ))
        order += 1

    if footer_text:
        chunks.append(build(
            order, "footer", footer_text, footer_text,
            {"x": 0.05, "y": 0.93, "w": 0.9, "h": 0.05},
            [], 0.95, [],
        ))

    return chunks
|
|
|
|
|
|
def process_all_pages():
    """Parse every page in ``PNG_PAGES`` and collect its chunks.

    For each page number the OCR text is loaded with ``read_ocr`` and passed
    to ``parse_page_chunks``.  If the parser returns nothing, one fallback
    chunk is substituted:

    * non-blank OCR text becomes a single ``paragraph`` chunk whose English
      content is the first 3000 characters of the stripped text and whose
      Portuguese content is ``make_pt_translation`` applied to the first
      1000 characters;
    * blank or missing OCR text becomes a single ``blank`` chunk.

    Returns:
        list[tuple]: ``(page_num, chunks)`` pairs in ``PNG_PAGES`` order.
    """
    # Fields that are identical in both fallback chunks; kept last so the
    # resulting dicts have the same key order as parser-produced chunks.
    empty_annotations = {
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
        "image_description_en": None,
        "image_description_pt_br": None,
        "extracted_text": None,
    }

    all_page_data = []

    for seq_pos, page_num in enumerate(PNG_PAGES, start=1):
        print(f" Processing page {page_num:03d} (seq {seq_pos}/{TOTAL_PAGES})...")
        ocr_text = read_ocr(page_num)

        chunks = parse_page_chunks(page_num, ocr_text)

        if not chunks:
            # ``ocr_text`` may be falsy (the original guarded for that when
            # splitting lines), so do not call .strip() on it directly.
            full_text = (ocr_text or "").strip()
            if full_text:
                chunks = [{
                    "order_in_page": 1,
                    "type": "paragraph",
                    "content_en": full_text[:3000],
                    "content_pt_br": make_pt_translation(full_text[:1000], "paragraph"),
                    "bbox": {"x": 0.05, "y": 0.05, "w": 0.9, "h": 0.9},
                    "classification": None,
                    "formatting": [],
                    "cross_page_hint": "self_contained",
                    "ocr_confidence": 0.85,
                    "ocr_source_lines": [],
                    **empty_annotations,
                }]
            else:
                chunks = [{
                    "order_in_page": 1,
                    "type": "blank",
                    "content_en": "[Blank page]",
                    "content_pt_br": "[Página em branco]",
                    "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
                    "classification": None,
                    "formatting": [],
                    "cross_page_hint": "self_contained",
                    "ocr_confidence": 1.0,
                    "ocr_source_lines": [],
                    **empty_annotations,
                }]

        all_page_data.append((page_num, chunks))

    return all_page_data
|
|
|
|
|
|
def write_chunk_file(chunk_data, page_num):
    """Serialise one chunk to ``CHUNKS_DIR/<chunk_id>.md``.

    The file starts with a ``---`` delimited front-matter block listing the
    chunk's metadata, followed by the English and Portuguese content under
    ``**EN:**`` and ``**PT-BR:**`` labels.

    Args:
        chunk_data: Chunk dict; ``chunk_id`` is required, every other key
            falls back to a default when absent.
        page_num: Page number written to ``page`` and used to build the
            ``source_png`` path.
    """

    def yaml_str(value):
        # None becomes the bare word null; anything else is JSON-encoded,
        # which quotes and escapes strings.
        return "null" if value is None else json.dumps(value, ensure_ascii=False)

    field = chunk_data.get

    chunk_id = chunk_data["chunk_id"]
    chunk_type = field("type", "paragraph")
    box = field("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1})
    bx, by, bw, bh = box.get('x', 0), box.get('y', 0), box.get('w', 1), box.get('h', 0.1)

    # Only image chunks point at an image file.
    related_image = f"IMG-{chunk_id}.png" if chunk_type == "image" else None

    front_matter = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk_type}",
        f"page: {page_num}",
        f"order_in_page: {field('order_in_page', 1)}",
        f"order_global: {field('order_global', 1)}",
        f"bbox: {{x: {bx:.2f}, y: {by:.2f}, w: {bw:.2f}, h: {bh:.2f}}}",
        f"classification: {yaml_str(field('classification', None))}",
        f"formatting: {json.dumps(field('formatting', []))}",
        f"cross_page_hint: {field('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {yaml_str(field('prev_chunk', None))}",
        f"next_chunk: {yaml_str(field('next_chunk', None))}",
        f"related_image: {yaml_str(related_image)}",
        f"related_table: {yaml_str(field('related_table', None))}",
        f"ocr_confidence: {field('ocr_confidence', 0.9)}",
        f"ocr_source_lines: {json.dumps(field('ocr_source_lines', []))}",
    ]

    for key in ("redaction_code", "redaction_inferred_content_type", "image_type"):
        front_matter.append(f"{key}: {yaml_str(field(key, None))}")

    # Each anomaly family has a lower-cased boolean flag followed by two
    # nullable text fields.
    anomaly_groups = (
        ("ufo_anomaly_detected", "ufo_anomaly_type", "ufo_anomaly_rationale"),
        ("cryptid_anomaly_detected", "cryptid_anomaly_type", "cryptid_anomaly_rationale"),
    )
    for flag_key, type_key, rationale_key in anomaly_groups:
        front_matter.append(f"{flag_key}: {str(field(flag_key, False)).lower()}")
        front_matter.append(f"{type_key}: {yaml_str(field(type_key, None))}")
        front_matter.append(f"{rationale_key}: {yaml_str(field(rationale_key, None))}")

    for key in ("image_description_en", "image_description_pt_br", "extracted_text"):
        front_matter.append(f"{key}: {yaml_str(field(key, None))}")

    front_matter.append(f"source_png: ../../processing/png/{DOC_ID}/p-{page_num:03d}.png")
    front_matter.append("---")

    body_en = field("content_en", "")
    body_pt = field("content_pt_br", "")
    document = "".join([
        "\n".join(front_matter),
        "\n\n",
        f"**EN:** {body_en}\n\n",
        f"**PT-BR:** {body_pt}\n",
    ])

    out_path = f"{CHUNKS_DIR}/{chunk_id}.md"
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.write(document)
|
|
|
|
|
|
def _document_chunk_lines(chunk, page_num):
    """Return the document.md lines that render one chunk under its page."""
    chunk_id = chunk["chunk_id"]
    chunk_type = chunk.get("type", "paragraph")
    bbox = chunk.get("bbox", {})
    bx, by, bw, bh = bbox.get("x", 0), bbox.get("y", 0), bbox.get("w", 1), bbox.get("h", 0.1)

    rendered = [
        f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->",
        f'<a id="{chunk_id}"></a>',
        f"### Chunk {chunk_id} — {chunk_type} · p{page_num} · bbox: {bx:.2f}/{by:.2f}/{bw:.2f}/{bh:.2f}",
        "",
        f"**EN:** {chunk.get('content_en', '')}",
        "",
        f"**PT-BR:** {chunk.get('content_pt_br', '')}",
        "",
    ]

    if chunk_type == "image":
        # This line used to be an empty f-string, so image chunks rendered
        # nothing.  The link targets the IMG-<chunk_id>.png name recorded as
        # related_image in the chunk file, inside the images/ directory.
        # NOTE(review): nothing in main() writes that PNG -- confirm the
        # image files are produced elsewhere.
        rendered.append(f"![{chunk_id}](./images/IMG-{chunk_id}.png)")
        rendered.append("")
        if chunk.get("image_description_en"):
            rendered.append(f"*Image description:* {chunk['image_description_en']}")
            rendered.append("")

    # Everything except the two content fields and the page number goes
    # into a collapsible JSON metadata block.
    meta = {k: v for k, v in chunk.items()
            if k not in ("content_en", "content_pt_br", "page_number")}
    rendered.extend([
        "<details><summary>metadata</summary>",
        "",
        "```json",
        json.dumps(meta, ensure_ascii=False, indent=2),
        "```",
        "",
        "</details>",
        "",
        "---",
        "",
    ])
    return rendered


def main():
    """Rebuild every output artefact for the document.

    Steps, in order: parse all pages; flatten the chunks and assign global
    ids (``c0001``, ``c0002``, ...) with prev/next links; write one Markdown
    file per chunk; write ``_index.json``; write ``document.md``; print a
    summary.

    Returns:
        tuple: ``(total pages, chunk count, image chunk count, 0,
        ufo-flagged count, cryptid-flagged count, wall-clock seconds)``.
        The fourth element (tables) is always 0.
    """
    start_time = time.time()
    print(f"=== Rebuilding {DOC_ID} ===")
    print(f"Total pages: {TOTAL_PAGES}")

    # Process all pages
    print("\nProcessing pages...")
    all_page_data = process_all_pages()

    # Flatten to one global list; each chunk is copied and tagged with its page.
    all_chunks = [
        {**chunk, "page_number": page_num}
        for page_num, chunks in all_page_data
        for chunk in chunks
    ]

    # Assign global ids and neighbour links (1-based, zero-padded to 4 digits).
    total = len(all_chunks)
    for position, chunk in enumerate(all_chunks, start=1):
        chunk["chunk_id"] = f"c{position:04d}"
        chunk["order_global"] = position
        chunk["prev_chunk"] = f"c{position - 1:04d}" if position > 1 else None
        chunk["next_chunk"] = f"c{position + 1:04d}" if position < total else None

    print(f"Total chunks: {len(all_chunks)}")

    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"Image chunks: {len(image_chunks)}")

    # Write individual chunk files
    print("Writing chunk files...")
    for chunk in all_chunks:
        write_chunk_file(chunk, chunk["page_number"])

    # Build _index.json
    print("Writing _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()

    index_chunks = [
        {
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk["page_number"],
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}),
            "preview": chunk.get("content_en", "")[:80]
        }
        for chunk in all_chunks
    ]

    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_chunks
    }

    with open(f"{OUT_DIR}/_index.json", "w", encoding="utf-8") as f:
        json.dump(index_data, f, ensure_ascii=False, indent=2)

    # Build document.md
    print("Building document.md...")

    type_histogram = {}
    ufo_anomalies = []
    cryptid_anomalies = []
    chunks_by_page = {}

    for chunk in all_chunks:
        kind = chunk.get("type", "paragraph")
        type_histogram[kind] = type_histogram.get(kind, 0) + 1
        if chunk.get("ufo_anomaly_detected", False):
            ufo_anomalies.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected", False):
            cryptid_anomalies.append(chunk["chunk_id"])
        chunks_by_page.setdefault(chunk["page_number"], []).append(chunk)

    doc_lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {len(all_chunks)}",
        "chunk_types_histogram:",
    ]
    for kind, count in sorted(type_histogram.items()):
        doc_lines.append(f" {kind}: {count}")
    doc_lines.extend([
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_anomalies)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomalies)}",
        'build_approach: "subagents"',
        "build_model: claude-sonnet-4-6",
        f"build_at: {build_at}",
        "---",
        "",
    ])

    # One "## Page N" section per page, pages in ascending numeric order.
    for page_num in sorted(chunks_by_page):
        doc_lines.append(f"## Page {page_num}")
        doc_lines.append("")
        for chunk in chunks_by_page[page_num]:
            doc_lines.extend(_document_chunk_lines(chunk, page_num))

    document_md = "\n".join(doc_lines)
    with open(f"{OUT_DIR}/document.md", "w", encoding="utf-8") as f:
        f.write(document_md)

    wall_seconds = int(time.time() - start_time)
    doc_md_bytes = len(document_md.encode("utf-8"))

    print("\n=== DONE ===")
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={len(image_chunks)} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_md_bytes}")
    print(f"Wall time: {wall_seconds}s")

    return TOTAL_PAGES, len(all_chunks), len(image_chunks), 0, len(ufo_anomalies), len(cryptid_anomalies), wall_seconds
|
|
|
|
|
|
# Run the rebuild only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|