597 lines
24 KiB
Python
597 lines
24 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Rebuilds dow-uap-d48-report-september-1996 into harness-assemblable structure.
|
|
Processes all 146 pages with vision + OCR, generates chunks, images, index, and document.md
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import base64
|
|
import re
|
|
import csv
|
|
import time
|
|
import concurrent.futures
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from PIL import Image
|
|
import anthropic
|
|
|
|
# Identity of the document being rebuilt.
DOC_ID = "dow-uap-d48-report-september-1996"
DOC_TITLE = "Modeling Unlikely Space-Booster Failures in Risk Calculations"

# All inputs and outputs live under one project root.
_PROJECT_ROOT = "/Users/guto/ufo"

# Inputs: per-page PNG renders and per-page OCR text files.
BASE_PNG = _PROJECT_ROOT + "/processing/png/" + DOC_ID
BASE_OCR = _PROJECT_ROOT + "/processing/ocr/" + DOC_ID

# Outputs: everything generated for this document goes under OUT_DIR.
OUT_DIR = _PROJECT_ROOT + "/raw/" + DOC_ID
CHUNKS_DIR = OUT_DIR + "/chunks"
IMAGES_DIR = OUT_DIR + "/images"
TABLES_DIR = OUT_DIR + "/tables"
|
|
|
|
# Make sure every output directory exists before any page is processed.
for _out_dir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Shared API client, constructed with the SDK's default configuration.
client = anthropic.Anthropic()
|
|
|
|
# All page numbers that have PNGs. The file numbering is not contiguous:
# pages 0-63 are followed directly by pages 100-181 (64-99 are not listed).
PNG_PAGES = [*range(0, 64), *range(100, 182)]

# Number of pages that will be processed.
TOTAL_PAGES = len(PNG_PAGES)
|
|
|
|
# Prompt template sent with each page image. It is filled with str.format(),
# so the named placeholders {doc_title}, {page_num}, {seq_pos}, {total_pages}
# and {ocr_text} are substituted, while the doubled braces ({{ and }}) are
# literal braces of the JSON example shown to the model.
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder agent. Analyze the provided page image and OCR text from a declassified technical document and extract all content as structured chunks.

Document: "{doc_title}"
Page number (file): {page_num} (sequential position {seq_pos} of {total_pages})
OCR text:
```
{ocr_text}
```

Return a JSON object with this exact structure:
{{
"page_number": {page_num},
"seq_position": {seq_pos},
"chunks": [
{{
"order_in_page": 1,
"type": "<type>",
"content_en": "<english content>",
"content_pt_br": "<portuguese BR translation>",
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
"classification": null,
"formatting": [],
"cross_page_hint": "self_contained",
"ocr_confidence": 0.9,
"ocr_source_lines": [],
"redaction_code": null,
"redaction_inferred_content_type": null,
"image_type": null,
"ufo_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_detected": false,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null,
"image_description_en": null,
"image_description_pt_br": null,
"extracted_text": null
}}
]
}}

Chunk types (use ONLY these):
- letterhead: institution/org header at top of page
- classification_banner: classification marking (SECRET, TOP SECRET, UNCLASSIFIED, etc.)
- title: document or section title
- subtitle: subtitle or sub-heading
- heading: section heading (numbered or unnumbered)
- subheading: subsection heading
- paragraph: body text paragraph
- list_item: bullet or numbered list item
- table_marker: a table (include table data in content_en as pipe-delimited markdown table)
- figure_caption: caption for a figure or chart
- image: a photograph, diagram, chart, graph, or illustration
- footer: footer text (page numbers, dates, etc.)
- header: running header
- signature_block: signature area
- redaction: redacted/blacked-out area
- page_number: standalone page number
- toc_entry: table of contents entry
- abstract: abstract section
- reference: bibliography/reference entry
- form_field: form field label and value
- metadata_block: document metadata block (e.g., Report Documentation Page)
- appendix_marker: appendix label/header
- blank: intentionally blank area

Rules:
1. Every visible content region becomes a chunk — do not skip anything.
2. For tables: include the full table as markdown pipe-delimited format in content_en.
3. For images/figures: set type=image, describe what you see in image_description_en and image_description_pt_br. Set extracted_text if the image contains text.
4. bbox coordinates: x,y = top-left corner (0-1 normalized), w,h = width/height (0-1 normalized).
5. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
6. content_pt_br: full Brazilian Portuguese translation of content_en (NOT European Portuguese).
7. formatting: array of applicable: ["bold", "italic", "all_caps", "underline", "centered", "right_aligned"]
8. classification: null for unclassified content, or the exact marking string if present.
9. ocr_confidence: estimate 0.0-1.0 based on OCR quality.
10. For the abstract, use type=abstract.
11. For TOC entries, each line is a separate toc_entry chunk.
12. For figure captions, use type=figure_caption.
13. ufo_anomaly_detected: true only if content describes UAP/UFO phenomenon (this is a space booster report, very unlikely).
14. Return ONLY valid JSON, no markdown fences, no explanation text.
"""
|
|
|
|
def read_ocr(page_num):
    """Read OCR text for a page number, return empty string if not found.

    The file is looked up at ``{BASE_OCR}/p-NNN.txt`` where NNN is the page
    number zero-padded to three digits. Bytes that are not valid UTF-8 are
    replaced instead of raising.
    """
    ocr_path = f"{BASE_OCR}/p-{page_num:03d}.txt"
    # Open directly rather than checking os.path.exists() first: the check
    # costs a second filesystem lookup and the file can vanish between the
    # check and the open.
    try:
        with open(ocr_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except (FileNotFoundError, NotADirectoryError):
        return ""
|
|
|
|
def read_png_b64(page_num):
    """Return the page's PNG file, base64-encoded, as a text string.

    The image is read from ``{BASE_PNG}/p-NNN.png`` (page number zero-padded
    to three digits). A missing file raises the usual OSError from the read.
    """
    png_file = Path(f"{BASE_PNG}/p-{page_num:03d}.png")
    encoded_bytes = base64.b64encode(png_file.read_bytes())
    return encoded_bytes.decode("utf-8")
|
|
|
|
def _fallback_page_result(page_num, seq_pos, content_en, content_pt_br, ocr_confidence):
    """Build the one-chunk placeholder page result used when extraction fails."""
    return {
        "page_number": page_num,
        "seq_position": seq_pos,
        "chunks": [
            {
                "order_in_page": 1,
                "type": "paragraph",
                "content_en": content_en,
                "content_pt_br": content_pt_br,
                "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
                "classification": None,
                "formatting": [],
                "cross_page_hint": "self_contained",
                "ocr_confidence": ocr_confidence,
                "ocr_source_lines": [],
                "redaction_code": None,
                "redaction_inferred_content_type": None,
                "image_type": None,
                "ufo_anomaly_detected": False,
                "ufo_anomaly_type": None,
                "ufo_anomaly_rationale": None,
                "cryptid_anomaly_detected": False,
                "cryptid_anomaly_type": None,
                "cryptid_anomaly_rationale": None,
                "image_description_en": None,
                "image_description_pt_br": None,
                "extracted_text": None,
            }
        ],
    }


def process_page(page_num, seq_pos):
    """Process a single page using vision + OCR, return page chunk data.

    Sends the page PNG plus (at most 4000 characters of) its OCR text to the
    model and parses the JSON reply. Up to three attempts are made; other
    errors back off exponentially between attempts (1s, 2s), while a JSON
    parse error is retried immediately.

    Args:
        page_num: Page number as used in the PNG/OCR file names.
        seq_pos: 1-based sequential position of the page in the document.

    Returns:
        A dict with "page_number", "seq_position" and "chunks" (a list of
        chunk dicts). When every attempt fails, a single placeholder
        paragraph chunk is returned instead: the raw OCR text (first 2000
        characters) after JSON parse failures, or an error note otherwise.

    Raises:
        OSError: if the page PNG cannot be read (raised before any attempt).
    """
    ocr_text = read_ocr(page_num)
    img_b64 = read_png_b64(page_num)

    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_num=page_num,
        seq_pos=seq_pos,
        total_pages=TOTAL_PAGES,
        ocr_text=ocr_text[:4000] if ocr_text else "(no OCR available)",
    )

    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=4096,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/png",
                                    "data": img_b64,
                                },
                            },
                            {
                                "type": "text",
                                "text": prompt,
                            },
                        ],
                    }
                ],
            )

            # Assumes the first content block of the reply is the text block.
            raw = response.content[0].text.strip()
            # Remove markdown fences if present
            raw = re.sub(r'^```json\s*', '', raw)
            raw = re.sub(r'^```\s*', '', raw)
            raw = re.sub(r'\s*```$', '', raw)

            data = json.loads(raw)

            # The reply is model output, so check its shape before handing it
            # on: downstream code indexes "page_number" and iterates "chunks".
            if not isinstance(data, dict) or not isinstance(data.get("chunks"), list):
                raise ValueError("model response is not a JSON object with a 'chunks' list")
            data["chunks"] = [c for c in data["chunks"] if isinstance(c, dict)]
            # Never trust the model's echo of the identifiers: a wrong page
            # number would make later steps read the wrong source PNG.
            data["page_number"] = page_num
            data["seq_position"] = seq_pos

            print(f" [OK] page {page_num:03d} (seq {seq_pos}) -> {len(data.get('chunks', []))} chunks")
            return data

        except json.JSONDecodeError as e:
            print(f" [WARN] page {page_num:03d} JSON parse error (attempt {attempt+1}): {e}")
            if attempt == max_retries - 1:
                # Return minimal fallback
                return _fallback_page_result(
                    page_num,
                    seq_pos,
                    ocr_text[:2000] if ocr_text else f"[Page {page_num} - content extraction failed]",
                    f"[Página {page_num} - extração de conteúdo falhou]",
                    0.5,
                )
        except Exception as e:
            print(f" [ERROR] page {page_num:03d} (attempt {attempt+1}): {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
            else:
                return _fallback_page_result(
                    page_num,
                    seq_pos,
                    f"[Page {page_num} - processing error: {str(e)[:100]}]",
                    f"[Página {page_num} - erro de processamento]",
                    0.0,
                )
|
|
|
|
def process_pages_batch(pages_with_seq):
    """Run process_page for each (page_num, seq_pos) pair on a 5-thread pool.

    Returns a dict mapping seq_pos to that page's result. If a worker raises,
    the page is reported and recorded with an empty chunk list so the batch
    as a whole still completes.
    """
    outcome = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        pending = {}
        for page_num, seq_pos in pages_with_seq:
            job = pool.submit(process_page, page_num, seq_pos)
            pending[job] = (page_num, seq_pos)

        # Collect results in completion order; keying by seq_pos restores order later.
        for job in concurrent.futures.as_completed(pending):
            page_num, seq_pos = pending[job]
            try:
                outcome[seq_pos] = job.result()
            except Exception as e:
                print(f" [FATAL] page {page_num}: {e}")
                outcome[seq_pos] = {
                    "page_number": page_num,
                    "seq_position": seq_pos,
                    "chunks": [],
                }
    return outcome
|
|
|
|
def crop_image_chunk(chunk_id, page_num, bbox):
    """Crop image region from page PNG and save.

    Args:
        chunk_id: Chunk identifier; the crop is written to
            ``{IMAGES_DIR}/IMG-{chunk_id}.png``.
        page_num: Page number used to locate the source PNG.
        bbox: Dict with normalized (0-1) "x", "y", "w", "h"; missing keys
            default to the full page.

    Returns:
        The output path on success, or None if anything failed (the failure
        is printed as a warning rather than raised).
    """
    png_path = f"{BASE_PNG}/p-{page_num:03d}.png"
    out_path = f"{IMAGES_DIR}/IMG-{chunk_id}.png"

    try:
        # Context manager so the source file handle is closed even when the
        # crop or save fails.
        with Image.open(png_path) as im:
            W, H = im.size
            x, y, w, h = bbox.get("x", 0), bbox.get("y", 0), bbox.get("w", 1), bbox.get("h", 1)
            pad = 0.005  # small margin so the crop does not clip the region's edge
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                # A degenerate box (zero size or entirely off-page) cannot be
                # saved as an image; report it clearly instead.
                raise ValueError(f"empty crop region for bbox {bbox}")
            cropped = im.crop((left, top, right, bottom))
            cropped.save(out_path)
        return out_path
    except Exception as e:
        print(f" [WARN] crop failed for {chunk_id}: {e}")
        return None
|
|
|
|
def _bbox_number(bbox, key, default):
    """Return bbox[key] as a float, or ``default`` if it is missing or not numeric."""
    try:
        return float(bbox[key])
    except (KeyError, TypeError, ValueError):
        return default


def write_chunk_file(chunk_data, page_num):
    """Write individual chunk markdown file.

    The file is ``{CHUNKS_DIR}/{chunk_id}.md``: a YAML-style front matter
    block with the chunk's metadata, followed by the English and Brazilian
    Portuguese content.

    Args:
        chunk_data: Chunk dict; must contain "chunk_id". Every other field
            falls back to a default when absent.
        page_num: Page number the chunk belongs to.

    Raises:
        KeyError: if ``chunk_data`` has no "chunk_id".
        OSError: if the file cannot be written.
    """
    chunk_id = chunk_data["chunk_id"]
    chunk_type = chunk_data.get("type", "paragraph")
    order_in_page = chunk_data.get("order_in_page", 1)
    order_global = chunk_data.get("order_global", 1)

    # bbox comes from model output and may be null or hold non-numeric values;
    # coerce it here so one bad chunk cannot abort the whole write phase.
    full_page = {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}
    bbox = chunk_data.get("bbox", full_page)
    if not isinstance(bbox, dict):
        bbox = full_page
    bx = _bbox_number(bbox, "x", 0)
    by = _bbox_number(bbox, "y", 0)
    bw = _bbox_number(bbox, "w", 1)
    bh = _bbox_number(bbox, "h", 0.1)

    # Determine related_image and related_table
    related_image = f"IMG-{chunk_id}.png" if chunk_type == "image" else None
    related_table = chunk_data.get("related_table", None)

    prev_chunk = chunk_data.get("prev_chunk", None)
    next_chunk = chunk_data.get("next_chunk", None)

    # A null content field would otherwise be written as the text "None".
    content_en = chunk_data.get("content_en") or ""
    content_pt_br = chunk_data.get("content_pt_br") or ""

    yaml_lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk_type}",
        f"page: {page_num}",
        f"order_in_page: {order_in_page}",
        f"order_global: {order_global}",
        f"bbox: {{x: {bx:.2f}, y: {by:.2f}, w: {bw:.2f}, h: {bh:.2f}}}",
        f"classification: {json.dumps(chunk_data.get('classification', None))}",
        f"formatting: {json.dumps(chunk_data.get('formatting', []))}",
        f"cross_page_hint: {chunk_data.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {json.dumps(prev_chunk)}",
        f"next_chunk: {json.dumps(next_chunk)}",
        f"related_image: {json.dumps(related_image)}",
        f"related_table: {json.dumps(related_table)}",
        f"ocr_confidence: {chunk_data.get('ocr_confidence', 0.9)}",
        f"ocr_source_lines: {json.dumps(chunk_data.get('ocr_source_lines', []))}",
        f"redaction_code: {json.dumps(chunk_data.get('redaction_code', None))}",
        f"redaction_inferred_content_type: {json.dumps(chunk_data.get('redaction_inferred_content_type', None))}",
        f"image_type: {json.dumps(chunk_data.get('image_type', None))}",
        f"ufo_anomaly_detected: {str(chunk_data.get('ufo_anomaly_detected', False)).lower()}",
        f"ufo_anomaly_type: {json.dumps(chunk_data.get('ufo_anomaly_type', None))}",
        f"ufo_anomaly_rationale: {json.dumps(chunk_data.get('ufo_anomaly_rationale', None))}",
        f"cryptid_anomaly_detected: {str(chunk_data.get('cryptid_anomaly_detected', False)).lower()}",
        f"cryptid_anomaly_type: {json.dumps(chunk_data.get('cryptid_anomaly_type', None))}",
        f"cryptid_anomaly_rationale: {json.dumps(chunk_data.get('cryptid_anomaly_rationale', None))}",
        f"image_description_en: {json.dumps(chunk_data.get('image_description_en', None))}",
        f"image_description_pt_br: {json.dumps(chunk_data.get('image_description_pt_br', None))}",
        f"extracted_text: {json.dumps(chunk_data.get('extracted_text', None))}",
        f"source_png: ../../processing/png/{DOC_ID}/p-{page_num:03d}.png",
        "---",
    ]

    content = "\n".join(yaml_lines) + "\n\n"
    content += f"**EN:** {content_en}\n\n"
    content += f"**PT-BR:** {content_pt_br}\n"

    out_path = f"{CHUNKS_DIR}/{chunk_id}.md"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(content)
|
|
|
|
def _page_order_key(chunk):
    """Sort key for chunks within a page; a non-numeric order sorts as 0."""
    order = chunk.get("order_in_page", 0)
    return order if isinstance(order, (int, float)) else 0


def _doc_bbox_value(bbox, key, default):
    """Return bbox[key] as a float, or ``default`` if it is missing or not numeric."""
    try:
        return float(bbox[key])
    except (KeyError, TypeError, ValueError):
        return default


def _collect_chunks(all_page_results, page_for_seq):
    """Flatten page results into one list in document order and assign global IDs."""
    all_chunks = []
    for seq_pos in sorted(all_page_results):
        page_result = all_page_results[seq_pos]
        # Use our own seq -> page mapping rather than the "page_number" echoed
        # back by the model, which may be missing or wrong.
        page_num = page_for_seq[seq_pos]
        chunks = page_result.get("chunks") or []
        for chunk in sorted(chunks, key=_page_order_key):
            all_chunks.append({
                **chunk,
                "page_number": page_num,
                "seq_position": seq_pos,
            })

    # Assign global chunk IDs (c0001, c0002, ...) and link neighbours.
    last_idx = len(all_chunks) - 1
    for global_idx, chunk in enumerate(all_chunks):
        chunk["chunk_id"] = f"c{global_idx+1:04d}"
        chunk["order_global"] = global_idx + 1
        chunk["prev_chunk"] = f"c{global_idx:04d}" if global_idx > 0 else None
        chunk["next_chunk"] = f"c{global_idx+2:04d}" if global_idx < last_idx else None
    return all_chunks


def _build_index_data(all_chunks, build_at):
    """Build the dict that is serialized to _index.json."""
    index_chunks = [
        {
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk["page_number"],
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}),
            # content_en may be null in model output; slicing None would raise.
            "preview": (chunk.get("content_en") or "")[:80],
        }
        for chunk in all_chunks
    ]
    return {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": index_chunks,
    }


def _render_document_md(all_chunks, type_histogram, ufo_anomalies, cryptid_anomalies, build_at):
    """Render document.md: front matter followed by every chunk grouped by page."""
    doc_lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {len(all_chunks)}",
        "chunk_types_histogram:",
    ]
    for t, count in sorted(type_histogram.items()):
        doc_lines.append(f" {t}: {count}")
    doc_lines.extend([
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_anomalies)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomalies)}",
        'build_approach: "subagents"',
        "build_model: claude-haiku-4-5",
        f"build_at: {build_at}",
        "---",
        "",
    ])

    # Group chunks by page
    chunks_by_page = {}
    for chunk in all_chunks:
        chunks_by_page.setdefault(chunk["page_number"], []).append(chunk)

    for page_num in sorted(chunks_by_page):
        doc_lines.append(f"## Page {page_num}")
        doc_lines.append("")

        for chunk in chunks_by_page[page_num]:
            chunk_id = chunk["chunk_id"]
            chunk_type = chunk.get("type", "paragraph")
            bbox = chunk.get("bbox", {})
            # Coerce defensively: bbox values come from model output.
            bx = _doc_bbox_value(bbox, "x", 0)
            by = _doc_bbox_value(bbox, "y", 0)
            bw = _doc_bbox_value(bbox, "w", 1)
            bh = _doc_bbox_value(bbox, "h", 0.1)

            doc_lines.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
            doc_lines.append(f'<a id="{chunk_id}"></a>')
            doc_lines.append(f"### Chunk {chunk_id} — {chunk_type} · p{page_num} · bbox: {bx:.2f}/{by:.2f}/{bw:.2f}/{bh:.2f}")
            doc_lines.append("")

            content_en = chunk.get("content_en") or ""
            content_pt_br = chunk.get("content_pt_br") or ""

            doc_lines.append(f"**EN:** {content_en}")
            doc_lines.append("")
            doc_lines.append(f"**PT-BR:** {content_pt_br}")
            doc_lines.append("")

            # Image embed
            if chunk_type == "image":
                img_path = f"./images/IMG-{chunk_id}.png"
                # Markdown image link to the cropped file written for this chunk.
                doc_lines.append(f"![IMG-{chunk_id}]({img_path})")
                doc_lines.append("")
                if chunk.get("image_description_en"):
                    doc_lines.append(f"*Image description:* {chunk['image_description_en']}")
                    doc_lines.append("")

            # Metadata collapsible
            meta = {
                "chunk_id": chunk_id,
                "type": chunk_type,
                "page": page_num,
                "order_in_page": chunk.get("order_in_page", 1),
                "order_global": chunk.get("order_global", 1),
                "bbox": bbox,
                "classification": chunk.get("classification"),
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": chunk.get("cross_page_hint", "self_contained"),
                "prev_chunk": chunk.get("prev_chunk"),
                "next_chunk": chunk.get("next_chunk"),
                "ocr_confidence": chunk.get("ocr_confidence", 0.9),
                "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
                "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
                "image_type": chunk.get("image_type"),
                "image_description_en": chunk.get("image_description_en"),
                "image_description_pt_br": chunk.get("image_description_pt_br"),
            }
            doc_lines.append("<details><summary>metadata</summary>")
            doc_lines.append("")
            doc_lines.append("```json")
            doc_lines.append(json.dumps(meta, ensure_ascii=False, indent=2))
            doc_lines.append("```")
            doc_lines.append("")
            doc_lines.append("</details>")
            doc_lines.append("")
            doc_lines.append("---")
            doc_lines.append("")

    return "\n".join(doc_lines)


def main():
    """Rebuild the whole document and return a dict of run statistics.

    Steps: process every page in PNG_PAGES in batches of five, number the
    resulting chunks globally, crop image chunks, write one markdown file per
    chunk, then write ``_index.json`` and ``document.md`` under OUT_DIR.

    Returns:
        Dict with "pages", "chunks", "images", "tables", "ufo", "cryptid",
        "wall_seconds" and "doc_md_bytes". "tables" is always 0: this
        function does not export table files.
    """
    start_time = time.time()
    print(f"=== Rebuilding {DOC_ID} ===")
    print(f"Total pages to process: {TOTAL_PAGES}")

    # Create page batches (5 at a time); seq_pos is the 1-based document position.
    pages_with_seq = [(page_num, seq_pos + 1) for seq_pos, page_num in enumerate(PNG_PAGES)]
    page_for_seq = {seq_pos: page_num for page_num, seq_pos in pages_with_seq}

    all_page_results = {}
    batch_size = 5

    for batch_start in range(0, len(pages_with_seq), batch_size):
        batch = pages_with_seq[batch_start:batch_start + batch_size]
        batch_nums = [p[0] for p in batch]
        print(f"\nProcessing batch {batch_start//batch_size + 1}: pages {batch_nums}")

        batch_results = process_pages_batch(batch)
        all_page_results.update(batch_results)

        # Small pause between batches to avoid rate limiting
        if batch_start + batch_size < len(pages_with_seq):
            time.sleep(0.5)

    print(f"\n=== All {TOTAL_PAGES} pages processed ===")

    # Global chunk numbering, in seq_position then order_in_page order
    all_chunks = _collect_chunks(all_page_results, page_for_seq)
    print(f"Total chunks: {len(all_chunks)}")

    # Crop images and collect image chunks
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"Image chunks found: {len(image_chunks)}")

    for img_chunk in image_chunks:
        chunk_id = img_chunk["chunk_id"]
        page_num = img_chunk["page_number"]
        bbox = img_chunk.get("bbox") or {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}
        # crop_image_chunk prints its own warning and returns None on failure,
        # so only announce success when a file was actually written.
        if crop_image_chunk(chunk_id, page_num, bbox):
            print(f" Cropped image: {chunk_id} from page {page_num}")

    # Write individual chunk files
    print("\nWriting chunk files...")
    for chunk in all_chunks:
        write_chunk_file(chunk, chunk["page_number"])

    # Build _index.json
    print("Writing _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    index_data = _build_index_data(all_chunks, build_at)
    with open(f"{OUT_DIR}/_index.json", "w", encoding="utf-8") as f:
        json.dump(index_data, f, ensure_ascii=False, indent=2)

    # Build document.md
    print("Building document.md...")

    # Compute histograms and stats
    type_histogram = {}
    ufo_anomalies = []
    cryptid_anomalies = []
    for chunk in all_chunks:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1
        if chunk.get("ufo_anomaly_detected", False):
            ufo_anomalies.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected", False):
            cryptid_anomalies.append(chunk["chunk_id"])

    document_md = _render_document_md(
        all_chunks, type_histogram, ufo_anomalies, cryptid_anomalies, build_at
    )
    with open(f"{OUT_DIR}/document.md", "w", encoding="utf-8") as f:
        f.write(document_md)

    wall_seconds = int(time.time() - start_time)
    doc_md_bytes = len(document_md.encode("utf-8"))

    print("\n=== DONE ===")
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={len(image_chunks)} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_md_bytes}")
    print(f"Wall time: {wall_seconds}s")

    return {
        "pages": TOTAL_PAGES,
        "chunks": len(all_chunks),
        "images": len(image_chunks),
        "tables": 0,
        "ufo": len(ufo_anomalies),
        "cryptid": len(cryptid_anomalies),
        "wall_seconds": wall_seconds,
        "doc_md_bytes": doc_md_bytes,
    }
|
|
|
|
# Run the rebuild only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|