512 lines
18 KiB
Python
512 lines
18 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
Rebuild DOW-UAP-D49 launch summary into harness-assemblable structure.
|
|||
|
|
Processes all 78 PNG pages, writes chunks, _index.json, and document.md.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import json
|
|||
|
|
import re
|
|||
|
|
import base64
|
|||
|
|
import datetime
|
|||
|
|
import time
|
|||
|
|
from pathlib import Path
|
|||
|
|
from PIL import Image
|
|||
|
|
import anthropic
|
|||
|
|
|
|||
|
|
# Identity of the document this script rebuilds.
DOC_ID = "dow-uap-d49-launch-summary-february-2000"
DOC_TITLE = "Vandenberg AFB Launch Summary 1958–2000"

# Inputs: one PNG per page and one OCR .txt per page, both keyed by DOC_ID.
PNG_DIR = Path("/Users/guto/ufo/processing/png", DOC_ID)
OCR_DIR = Path("/Users/guto/ufo/processing/ocr", DOC_ID)

# Outputs: everything is written below OUT_DIR.
OUT_DIR = Path("/Users/guto/ufo/raw", DOC_ID)
CHUNKS_DIR = OUT_DIR.joinpath("chunks")
IMAGES_DIR = OUT_DIR.joinpath("images")
TABLES_DIR = OUT_DIR.joinpath("tables")

# All PNG pages, sorted by file name.  NOTE: this lists PNG_DIR at import
# time, so importing the module fails if the directory does not exist.
PNG_PAGES = sorted(name for name in os.listdir(PNG_DIR) if name.endswith(".png"))
TOTAL_PAGES = len(PNG_PAGES)

# API client created at import time; not referenced by the functions
# visible in this file.
client = anthropic.Anthropic()
|
|||
|
|
|
|||
|
|
def read_ocr(page_stem: str) -> str:
    """Return the OCR text stored for *page_stem* (a stem such as ``p-001``).

    The text is read from ``<OCR_DIR>/<page_stem>.txt`` as UTF-8 with
    undecodable bytes replaced.  An empty string is returned when that file
    does not exist.
    """
    candidate = OCR_DIR.joinpath(page_stem + ".txt")
    if not candidate.exists():
        return ""
    return candidate.read_text(encoding="utf-8", errors="replace")
|
|||
|
|
|
|||
|
|
def encode_image_b64(path: str) -> str:
    """Return the bytes of the file at *path* as a standard Base64 string."""
    with open(path, "rb") as handle:
        raw = handle.read()
    encoded = base64.b64encode(raw)
    return encoded.decode("utf-8")
|
|||
|
|
|
|||
|
|
def classify_page(ocr_text: str, page_num: int) -> str:
    """Guess the page type from its OCR text using keyword heuristics.

    Page 1 is always ``"cover"``.  Otherwise the lower-cased, stripped text
    is tested against an ordered list of rules and the first rule that
    matches decides the label; ``"body"`` is returned when none match.
    The order matters: earlier rules shadow later ones.
    """
    normalized = ocr_text.strip().lower()

    if page_num == 1:
        return "cover"

    def mentions(phrase):
        return phrase in normalized

    def is_summary_table():
        if not mentions("annual launch summary"):
            return False
        return any(mentions(key) for key in ("chart", "launch vehicle", "launch agency"))

    def is_chronology():
        # Either a "seq ... date ... nickname" column header on one line,
        # or a row shaped like "<n>. <day> <word> <2 digits>".
        if re.search(r'\bseq\b.*\bdate\b.*\bnickname\b', normalized, re.IGNORECASE):
            return True
        return bool(re.search(r'\d+\.\s+\d{1,2}\s+\w+\s+\d{2}', normalized))

    def is_toc():
        return mentions("table of contents") or "contents" in normalized.split("\n")[0]

    ordered_rules = (
        ("distribution", lambda: mentions("distribution list")),
        ("foreword", lambda: mentions("foreword") or mentions("preface")),
        ("glossary", lambda: mentions("glossary") and len(normalized) < 2000),
        ("summary_table", is_summary_table),
        ("facility_guide", lambda: mentions("launch facility guide")),
        ("chronology", is_chronology),
        ("toc", is_toc),
    )
    for label, matches in ordered_rules:
        if matches():
            return label
    return "body"
|
|||
|
|
|
|||
|
|
def determine_chunk_type(content: str, page_type: str) -> str:
    """Map a page's content and page type to a chunk type.

    Args:
        content: Text of the chunk.
        page_type: Label produced by ``classify_page``.

    Returns:
        ``"letterhead"`` for cover pages, ``"table_marker"`` for summary
        tables and chronologies, ``"section_header"`` for a short (three
        lines or fewer) all-uppercase block on any other unlisted page type,
        and ``"body_text"`` for everything else.
    """
    if page_type == "cover":
        return "letterhead"
    if page_type in ("summary_table", "chronology"):
        return "table_marker"
    # These page types are always plain body text, whatever they contain.
    if page_type in ("glossary", "foreword", "distribution", "facility_guide", "toc"):
        return "body_text"

    # Remaining page types: a short, all-caps block is treated as a heading.
    stripped = content.strip()
    if len(stripped.split("\n")) <= 3 and stripped.isupper():
        return "section_header"
    return "body_text"
|
|||
|
|
|
|||
|
|
# A heading line may contain only capitals, digits, whitespace and a few
# punctuation marks.
_HEADING_LINE_RE = re.compile(r'^[A-Z\s\-\./:,0-9]+$')


def _split_heading_and_body(lines):
    """Split OCR lines into a leading run of heading lines and the rest.

    A line counts as heading while we are still at the top of the page and
    it is non-empty, shorter than 80 characters and either all-uppercase or
    matches ``_HEADING_LINE_RE``.  The first line that fails ends the
    heading; if that line is blank it is dropped, otherwise it starts the
    body.  Heading lines are returned stripped, body lines untouched.
    """
    heading_lines = []
    body_lines = []
    in_heading = True

    for line in lines:
        stripped = line.strip()
        if not in_heading:
            body_lines.append(line)
            continue
        if not stripped and not heading_lines:
            # Blank lines before any heading text carry no information.
            continue
        if stripped and len(stripped) < 80 and (
            stripped.isupper() or _HEADING_LINE_RE.match(stripped)
        ):
            heading_lines.append(stripped)
            continue
        in_heading = False
        if stripped:
            body_lines.append(line)

    return heading_lines, body_lines


def _make_chunk(chunk_type, page_type, content_en, content_pt_br, *, order,
                bbox, source_png, confidence, source_lines,
                content_raw=None, formatting=None):
    """Assemble one chunk dict; ``content_raw`` defaults to ``content_en``."""
    chunk = {
        "type": chunk_type,
        "page_type": page_type,
        "content_raw": content_en if content_raw is None else content_raw,
        "content_en": content_en,
        "content_pt_br": content_pt_br,
        "order_in_page": order,
        "bbox": bbox,
        "source_png": source_png,
        "ocr_confidence": confidence,
        "ocr_source_lines": source_lines,
    }
    if formatting is not None:
        chunk["formatting"] = formatting
    return chunk


def build_page_chunks(page_num: int, page_stem: str, ocr_text: str) -> list:
    """Build the chunk dicts for a single page from its OCR text.

    Args:
        page_num: 1-based page number (passed to ``classify_page``).
        page_stem: File stem of the page, used to build ``source_png``.
        ocr_text: OCR text of the page; may be empty.

    Returns:
        A list of chunk dicts in reading order:

        * no OCR text  -> one ``image`` chunk with a placeholder caption;
        * cover page   -> one ``letterhead`` chunk with all non-blank lines;
        * heading+body -> a ``section_header`` chunk followed by a body chunk;
        * otherwise    -> one chunk covering the whole page.

        Bounding boxes and OCR confidences are fixed estimates, not measured
        values.
    """
    page_type = classify_page(ocr_text, page_num)
    source_png = f"../../processing/png/{DOC_ID}/{page_stem}.png"
    text = ocr_text.strip()
    lines = text.split("\n") if text else []
    all_line_numbers = list(range(1, len(lines) + 1))

    if not lines:
        # Image-only page.  Only the cover gets the cover caption; any other
        # page without OCR text gets a neutral placeholder instead of being
        # mislabelled as the cover.
        if page_type == "cover":
            caption_en = "[Cover image — Vandenberg AFB Launch Summary 1958–2000]"
            caption_pt = "[Imagem da capa — Resumo de Lançamentos da Base Aérea Vandenberg 1958–2000]"
        else:
            caption_en = "[Image-only page — no OCR text available]"
            caption_pt = "[Página somente com imagem — sem texto de OCR disponível]"
        return [_make_chunk(
            "image", page_type, caption_en, caption_pt,
            content_raw="",
            order=1,
            bbox={"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            source_png=source_png,
            confidence=0.0,
            source_lines=[],
        )]

    if page_type == "cover":
        # For cover, use all non-blank lines as a single chunk.
        content = "\n".join(ln.strip() for ln in lines if ln.strip())
        return [_make_chunk(
            "letterhead", page_type, content,
            translate_to_ptbr_simple(content, page_type),
            order=1,
            bbox={"x": 0.05, "y": 0.1, "w": 0.9, "h": 0.8},
            source_png=source_png,
            confidence=0.92,
            source_lines=all_line_numbers,
        )]

    heading_lines, body_lines = _split_heading_and_body(lines)

    if not (heading_lines and body_lines):
        # No distinct heading (or nothing but heading): one chunk per page.
        # determine_chunk_type already yields "table_marker" for table pages.
        content = "\n".join(lines)
        return [_make_chunk(
            determine_chunk_type(content, page_type), page_type, content,
            translate_to_ptbr_simple(content, page_type),
            order=1,
            bbox={"x": 0.02, "y": 0.02, "w": 0.96, "h": 0.96},
            source_png=source_png,
            confidence=0.88,
            source_lines=all_line_numbers,
        )]

    heading_content = "\n".join(heading_lines)
    body_content = "\n".join(body_lines)
    is_table = page_type in ("summary_table", "chronology")

    heading_chunk = _make_chunk(
        "section_header", page_type, heading_content,
        translate_to_ptbr_simple(heading_content, "section_header"),
        order=1,
        bbox={"x": 0.05, "y": 0.02, "w": 0.9, "h": 0.12},
        source_png=source_png,
        confidence=0.93,
        source_lines=list(range(1, len(heading_lines) + 1)),
        formatting=["bold", "all_caps"],
    )
    body_chunk = _make_chunk(
        "table_marker" if is_table else "body_text", page_type, body_content,
        translate_to_ptbr_simple(body_content, page_type),
        order=2,
        bbox={"x": 0.02, "y": 0.14, "w": 0.98, "h": 0.84},
        source_png=source_png,
        confidence=0.88,
        source_lines=list(range(len(heading_lines) + 1, len(lines) + 1)),
    )
    return [heading_chunk, body_chunk]
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Contexts whose text is data (numbers, codes, acronyms) and stays verbatim.
_VERBATIM_CONTEXTS = ("summary_table", "chronology", "table_marker")

# Known English phrases and their PT-BR equivalents (matching is
# case-sensitive).
_PT_BR_PHRASES = {
    "FOREWORD": "PREFÁCIO",
    "GLOSSARY": "GLOSSÁRIO",
    "DISTRIBUTION LIST": "LISTA DE DISTRIBUIÇÃO",
    "TABLE OF CONTENTS": "SUMÁRIO",
    "ANNUAL LAUNCH SUMMARY BY BOOSTER": "RESUMO ANUAL DE LANÇAMENTOS POR VEÍCULO",
    "ANNUAL LAUNCH SUMMARY BY COMMAND": "RESUMO ANUAL DE LANÇAMENTOS POR COMANDO",
    "LAUNCH FACILITY GUIDE": "GUIA DE INSTALAÇÕES DE LANÇAMENTO",
    "Office of History": "Escritório de História",
    "As of": "Em",
    "LAUNCH": "LANÇAMENTO",
    "VEHICLE": "VEÍCULO",
    "COMMAND": "COMANDO",
    "PROGRAM": "PROGRAMA",
    "SPACE": "ESPAÇO",
    "TOTAL": "TOTAL",
    "SUBTOTAL": "SUBTOTAL",
    "Grand Total": "Total Geral",
    "GRAND TOTAL": "TOTAL GERAL",
}

# One alternation, longest phrase first, so "GRAND TOTAL" wins over "TOTAL"
# and a full heading wins over the single words inside it.  The lookarounds
# restrict matches to whole words ("AEROSPACE" must not match "SPACE").
_PT_BR_PHRASE_RE = re.compile(
    r"(?<!\w)(?:"
    + "|".join(
        re.escape(phrase)
        for phrase in sorted(_PT_BR_PHRASES, key=len, reverse=True)
    )
    + r")(?!\w)"
)


def translate_to_ptbr_simple(text: str, context: str) -> str:
    """Annotate known English headers/labels in *text* with PT-BR equivalents.

    For table-like contexts (``summary_table``, ``chronology``,
    ``table_marker``) the text is returned unchanged.  Otherwise each known
    phrase is rewritten as ``"<english> / <portuguese>"``; all other text is
    left as is.

    The substitution is done in a single pass so that an inserted
    translation, or a word inside an already-translated longer phrase, is
    never annotated a second time.  Phrases whose translation is identical
    to the English (e.g. ``TOTAL``) are left untouched rather than doubled.

    Args:
        text: Source text (English OCR output).
        context: Page type or chunk type the text belongs to.

    Returns:
        The annotated text.
    """
    if context in _VERBATIM_CONTEXTS:
        return text  # Data stays verbatim

    def annotate(match):
        phrase = match.group(0)
        translation = _PT_BR_PHRASES[phrase]
        if translation == phrase:
            return phrase
        return f"{phrase} / {translation}"

    return _PT_BR_PHRASE_RE.sub(annotate, text)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def fmt_chunk_id(n: int) -> str:
    """Return the chunk id for sequence number *n*: ``c`` plus *n* zero-padded to at least four digits."""
    padded = format(n, "04d")
    return "c" + padded
|
|||
|
|
|
|||
|
|
|
|||
|
|
def write_chunk_file(chunk_id: str, chunk: dict, page_num: int) -> None:
    """Write one chunk as ``<CHUNKS_DIR>/<chunk_id>.md``.

    The file is a YAML front-matter block followed by the English and PT-BR
    content.

    Args:
        chunk_id: Identifier of the chunk; also the output file stem.
        chunk: Chunk dict.  ``type``, ``order_in_page``, ``order_global``,
            ``source_png``, ``content_en`` and ``content_pt_br`` are
            required; the remaining keys are optional and fall back to
            defaults.
        page_num: Page number recorded in the front matter.

    Raises:
        KeyError: If a required key is missing from *chunk*.
    """

    def is_null(value):
        # Callers use both None and the string "null" to mean "no value".
        return value is None or value == "null"

    def yaml_val(v):
        if is_null(v):
            return "null"
        if isinstance(v, bool):
            return str(v).lower()
        if isinstance(v, (int, float)):
            return str(v)
        # A JSON string literal is a valid YAML double-quoted scalar, so
        # embedded quotes, backslashes and newlines are escaped correctly.
        return json.dumps(str(v), ensure_ascii=False)

    def bare(v):
        # Chunk pointers are written unquoted; a missing one becomes null.
        return "null" if is_null(v) else str(v)

    bbox = chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0})

    formatting = chunk.get("formatting") or []
    fmt_str = "[" + ", ".join(
        json.dumps(str(item), ensure_ascii=False) for item in formatting
    ) + "]"

    ocr_lines = chunk.get("ocr_source_lines") or []
    if len(ocr_lines) > 10:
        # Long runs are abbreviated to first, second, "...", last.
        ocr_lines_str = f"[{ocr_lines[0]}, {ocr_lines[1]}, \"...\", {ocr_lines[-1]}]"
    else:
        ocr_lines_str = "[" + ", ".join(str(number) for number in ocr_lines) + "]"

    image_type = chunk.get("image_type", "null")

    front_matter = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk['type']}",
        f"page: {page_num}",
        f"order_in_page: {chunk['order_in_page']}",
        f"order_global: {chunk['order_global']}",
        f"bbox: {{x: {bbox['x']:.2f}, y: {bbox['y']:.2f}, w: {bbox['w']:.2f}, h: {bbox['h']:.2f}}}",
        "classification: null",
        f"formatting: {fmt_str}",
        "cross_page_hint: self_contained",
        f"prev_chunk: {bare(chunk.get('prev_chunk'))}",
        f"next_chunk: {bare(chunk.get('next_chunk'))}",
        f"related_image: {yaml_val(chunk.get('related_image'))}",
        f"related_table: {yaml_val(chunk.get('related_table'))}",
        f"ocr_confidence: {chunk.get('ocr_confidence', 0.88):.2f}",
        f"ocr_source_lines: {ocr_lines_str}",
        "redaction_code: null",
        "redaction_inferred_content_type: null",
        # Any falsy image_type (missing, empty) is written as null.
        f"image_type: {yaml_val(image_type) if image_type else 'null'}",
        "ufo_anomaly_detected: false",
        "cryptid_anomaly_detected: false",
        "ufo_anomaly_type: null",
        "ufo_anomaly_rationale: null",
        "cryptid_anomaly_type: null",
        "cryptid_anomaly_rationale: null",
        "image_description_en: null",
        "image_description_pt_br: null",
        "extracted_text: null",
        f"source_png: {chunk['source_png']}",
        "---",
    ]
    body = [
        "",
        f"**EN:** {chunk['content_en']}",
        "",
        f"**PT-BR:** {chunk['content_pt_br']}",
    ]
    content = "\n".join(front_matter + body) + "\n"

    path = CHUNKS_DIR / (chunk_id + ".md")
    path.write_text(content, encoding="utf-8")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Rebuild the document: chunk every page, then write all outputs.

    Steps:
      1. Create the output directories.
      2. Build chunks for every PNG page from its OCR text.
      3. Number the chunks globally and link each to its neighbours.
      4. Write one ``.md`` file per chunk, ``_index.json`` and
         ``document.md``.
      5. Print a one-line summary of the run.
    """
    start_time = time.time()

    # Ensure output dirs exist
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    print(f"Processing {TOTAL_PAGES} pages...")

    # Process all pages
    all_pages_chunks = []  # list of (page_num, page_stem, [chunks])
    for page_num, png_file in enumerate(PNG_PAGES, start=1):
        # Strip only the file suffix; str.replace would also remove a
        # ".png" occurring in the middle of the name.
        page_stem = Path(png_file).stem
        ocr_text = read_ocr(page_stem)
        chunks = build_page_chunks(page_num, page_stem, ocr_text)
        all_pages_chunks.append((page_num, page_stem, chunks))
        print(f" Page {page_num:3d}/{TOTAL_PAGES} ({page_stem}): {len(chunks)} chunk(s)")

    # Globally number chunks
    all_chunks_flat = []  # list of (chunk_id, page_num, chunk_dict)
    for page_num, page_stem, chunks in all_pages_chunks:
        for chunk in chunks:
            global_order = len(all_chunks_flat) + 1
            chunk_id = fmt_chunk_id(global_order)
            chunk["chunk_id"] = chunk_id
            chunk["order_global"] = global_order
            chunk["page_num"] = page_num
            chunk["page_stem"] = page_stem
            all_chunks_flat.append((chunk_id, page_num, chunk))

    total_chunks = len(all_chunks_flat)
    print(f"Total chunks: {total_chunks}")

    # Set prev/next pointers.  The string "null" marks a missing neighbour.
    for i, (_, _, chunk) in enumerate(all_chunks_flat):
        chunk["prev_chunk"] = all_chunks_flat[i - 1][0] if i > 0 else "null"
        chunk["next_chunk"] = all_chunks_flat[i + 1][0] if i < total_chunks - 1 else "null"

    # Write chunk files
    print("Writing chunk files...")
    for chunk_id, page_num, chunk in all_chunks_flat:
        write_chunk_file(chunk_id, chunk, page_num)

    # Build _index.json
    print("Writing _index.json...")
    index_chunks = [
        {
            "chunk_id": chunk_id,
            "type": chunk["type"],
            "page": page_num,
            "order_in_page": chunk["order_in_page"],
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk_id}.md",
            "bbox": chunk["bbox"],
            # First 80 characters of the English text, flattened to one line.
            "preview": chunk["content_en"][:80].replace("\n", " ").strip(),
        }
        for chunk_id, page_num, chunk in all_chunks_flat
    ]

    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so the timestamp keeps its "<naive ISO time>Z" format.
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    build_at = now_utc.replace(tzinfo=None).isoformat() + "Z"

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # Count chunk types
    type_histogram = {}
    for _, _, chunk in all_chunks_flat:
        chunk_type = chunk["type"]
        type_histogram[chunk_type] = type_histogram.get(chunk_type, 0) + 1

    # Count image chunks
    n_images = type_histogram.get("image", 0)

    # Build document.md
    print("Writing document.md...")
    build_document_md(all_chunks_flat, all_pages_chunks, type_histogram, build_at, n_images)

    elapsed = int(time.time() - start_time)
    print(f"\nDone in {elapsed}s")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={n_images}, tables_stitched=0, ufo_anomalies=0, cryptid_anomalies=0, wall_seconds={elapsed}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def build_document_md(all_chunks_flat, all_pages_chunks, type_histogram, build_at, n_images):
    """Build the master ``document.md`` and write it to ``OUT_DIR``.

    The file starts with a YAML front-matter header and then lists every
    chunk grouped under a ``## Page N`` heading.  Each chunk is rendered
    with an HTML anchor, its EN / PT-BR text (fenced as code for
    ``table_marker`` chunks) and a collapsible JSON metadata block.

    Args:
        all_chunks_flat: Iterable of ``(chunk_id, page_num, chunk_dict)``.
        all_pages_chunks: Unused here; kept for interface compatibility.
        type_histogram: Mapping of chunk type to count, for the header.
        build_at: Build timestamp string written to the header.
        n_images: Unused here; kept for interface compatibility.
    """
    total_chunks = len(all_chunks_flat)

    def none_if_null(value):
        # Chunk pointers use the string "null" as a sentinel; turn it into a
        # real None so the JSON metadata shows null rather than "null".
        return None if value is None or value == "null" else value

    histogram_yaml = "\n".join(f" {k}: {v}" for k, v in sorted(type_histogram.items()))

    header = "\n".join([
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {total_chunks}",
        "chunk_types_histogram:",
        histogram_yaml,
        "multi_page_tables: []",
        "ufo_anomalies_flagged: []",
        "cryptid_anomalies_flagged: []",
        'build_approach: "subagents"',
        "build_model: claude-sonnet-4-6",
        f"build_at: {build_at}",
        "---",
        "",
        "",
    ])

    # Group chunks by page
    pages_dict = {}
    for chunk_id, page_num, chunk in all_chunks_flat:
        pages_dict.setdefault(page_num, []).append((chunk_id, chunk))

    body_parts = [header]

    for page_num in sorted(pages_dict):
        body_parts.append(f"## Page {page_num}\n\n")

        for chunk_id, chunk in pages_dict[page_num]:
            bbox = chunk["bbox"]
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

            body_parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->\n")
            body_parts.append(f'<a id="{chunk_id}"></a>\n')
            body_parts.append(f"### Chunk {chunk_id} — {chunk['type']} · p{page_num} · bbox: {bbox_str}\n\n")

            content_en = chunk["content_en"]
            content_pt = chunk["content_pt_br"]

            if chunk["type"] == "table_marker":
                # For table/chronology, wrap in code block for readability
                body_parts.append(f"**EN:**\n\n```\n{content_en}\n```\n\n")
                body_parts.append(f"**PT-BR:**\n\n```\n{content_pt}\n```\n\n")
            else:
                body_parts.append(f"**EN:** {content_en}\n\n")
                body_parts.append(f"**PT-BR:** {content_pt}\n\n")
                if chunk["type"] == "image":
                    related_img = none_if_null(chunk.get("related_image"))
                    if related_img:
                        # NOTE(review): this branch previously appended only a
                        # blank line, so the related image was never shown.
                        # Embedding it is the presumed intent; confirm the
                        # desired alt text.
                        body_parts.append(f"![{chunk_id}]({related_img})\n\n")

            # Metadata details block
            meta = {
                "chunk_id": chunk_id,
                "type": chunk["type"],
                "page": page_num,
                "order_in_page": chunk["order_in_page"],
                "order_global": chunk["order_global"],
                "bbox": chunk["bbox"],
                "classification": None,
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": "self_contained",
                "prev_chunk": none_if_null(chunk.get("prev_chunk")),
                "next_chunk": none_if_null(chunk.get("next_chunk")),
                "ocr_confidence": chunk.get("ocr_confidence", 0.88),
                "ufo_anomaly_detected": False,
                "cryptid_anomaly_detected": False,
            }
            meta_json = json.dumps(meta, ensure_ascii=False, indent=2)
            body_parts.append(f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n")

    doc_content = "".join(body_parts)
    (OUT_DIR / "document.md").write_text(doc_content, encoding="utf-8")
    print(f"document.md written ({len(doc_content):,} bytes)")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the rebuild only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|