#!/usr/bin/env python3
"""
Rebuild script for doc-65-hs1-834228961-62-hq-83894-section-3.

Processes all 155 pages with a pool of parallel workers, then generates the
chunk files, cropped images, the index, and the master document.
"""
|
|
|
|
import os
|
|
import json
|
|
import base64
|
|
import time
|
|
import concurrent.futures
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
import anthropic
|
|
|
|
# Identifier of the document being rebuilt; also used as the per-document
# directory name under each of the paths below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-3"
# Human-readable title: substituted into the page prompt and written to the
# front matter of document.md.
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 3 — FBI Flying Discs Investigation File"
# Pages are numbered 1..TOTAL_PAGES; page N is read from p-{N-1:03d}.png.
TOTAL_PAGES = 155
# Input directory holding one PNG render per page.
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
# NOTE(review): OCR_DIR is not referenced by any code visible in this file —
# confirm whether it is still needed.
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
# Output root: receives _index.json and document.md.
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
# One markdown file per chunk (c0001.md, c0002.md, ...).
CHUNKS_DIR = OUT_DIR / "chunks"
# Cropped regions for chunks of type "image" (IMG-<chunk_id>.png).
IMAGES_DIR = OUT_DIR / "images"
# Created by main(); nothing in the visible code writes into it.
TABLES_DIR = OUT_DIR / "tables"

# Shared API client, constructed at import time with no explicit arguments.
# NOTE(review): presumably picks up credentials from the environment — confirm.
client = anthropic.Anthropic()

# Chunk type vocabulary. The same names are spelled out in rule 2 of
# PAGE_REBUILDER_PROMPT; this list is not referenced by the visible code.
CHUNK_TYPES = [
    "letterhead", "header", "classification_banner", "subject_line",
    "salutation", "body_paragraph", "signature_block", "handwritten_note",
    "stamp", "redaction_block", "image", "table_marker", "footer",
    "page_number", "attachment_label", "routing_slip", "blank",
    "caption", "list_item", "address_block"
]
|
|
|
|
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder agent for a declassified FBI UAP/UFO document archive.
|
|
|
|
Your task: Analyze the provided page image and extract ALL content into structured chunks.
|
|
|
|
Document: {doc_title}
|
|
Page: {page_number} of {total_pages}
|
|
Page PNG path: {page_png_path}
|
|
|
|
Return a JSON object with this exact structure:
|
|
{{
|
|
"page_number": {page_number},
|
|
"classification": "<classification string found on page or null>",
|
|
"page_type": "<blank|text|image|mixed|cover>",
|
|
"chunks": [
|
|
{{
|
|
"order_in_page": 1,
|
|
"type": "<chunk_type>",
|
|
"content_en": "<English text content or description>",
|
|
"content_pt_br": "<Brazilian Portuguese translation/description>",
|
|
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
|
|
"classification": "<classification string or null>",
|
|
"formatting": [],
|
|
"cross_page_hint": "self_contained",
|
|
"ocr_confidence": 0.9,
|
|
"ocr_source_lines": [],
|
|
"redaction_code": null,
|
|
"redaction_inferred_content_type": null,
|
|
"image_type": null,
|
|
"ufo_anomaly_detected": false,
|
|
"ufo_anomaly_type": null,
|
|
"ufo_anomaly_rationale": null,
|
|
"cryptid_anomaly_detected": false,
|
|
"cryptid_anomaly_type": null,
|
|
"cryptid_anomaly_rationale": null
|
|
}}
|
|
]
|
|
}}
|
|
|
|
RULES:
|
|
1. Extract ALL content — no chunk can be skipped.
|
|
2. Use ONLY these chunk types: letterhead, header, classification_banner, subject_line, salutation, body_paragraph, signature_block, handwritten_note, stamp, redaction_block, image, table_marker, footer, page_number, attachment_label, routing_slip, blank, caption, list_item, address_block
|
|
3. bbox values are normalized 0.0-1.0 (x=left, y=top, w=width, h=height of the page).
|
|
4. content_en: verbatim transcription for text, description for images.
|
|
5. content_pt_br: Brazilian Portuguese translation of content_en (NOT European Portuguese). For verbatim text blocks, provide both the original (verbatim) and a translation note.
|
|
6. For redacted blocks: set type="redaction_block", content_en="[REDACTED]", set redaction_code if visible (e.g., "(b)(1)", "(b)(6)"), redaction_inferred_content_type with your best inference.
|
|
7. For images/photos: type="image", image_type = one of: photograph|sketch|diagram|map|chart|logo|signature|stamp|other
|
|
8. For tables: type="table_marker"
|
|
9. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
|
|
10. UAP/cryptid anomaly detection: flag any anomalous visual content (UFO shapes, unusual aerial phenomena, cryptid-related imagery).
|
|
11. If page is blank or nearly blank: create ONE chunk type="blank".
|
|
12. classification_banner chunks at top/bottom of page for classification markings.
|
|
13. stamps: type="stamp" for rubber stamps, file numbers, dates stamped on documents.
|
|
14. Return ONLY valid JSON, no other text.
|
|
|
|
IMPORTANT: Be thorough. A typical text page has 5-15 chunks. A photo page may have 2-3 chunks. Cover/envelope pages have 4-8 chunks.
|
|
"""
|
|
|
|
|
|
def encode_image_b64(path: Path) -> str:
    """Return the contents of the file at *path* as standard base64 text.

    Args:
        path: Location of the file to read. Any value accepted by
            ``pathlib.Path`` works (``Path``, ``str`` or other path-like).

    Returns:
        The file's bytes encoded with the standard base64 alphabet.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    raw = Path(path).read_bytes()
    # Base64 output is pure ASCII, so this decode cannot fail.
    return base64.standard_b64encode(raw).decode("ascii")
|
|
|
|
|
|
def process_page(page_num: int) -> dict:
    """Rebuild one page by sending its PNG to the model and parsing the reply.

    Args:
        page_num: 1-based page number; page 1 is read from ``p-000.png``.

    Returns:
        A page record with the keys ``page_number``, ``classification``,
        ``page_type`` and ``chunks``. A single-chunk placeholder record is
        returned instead when the PNG does not exist, or when the last of
        the three attempts ends in an API error or an unusable reply.
    """
    # PNG files are zero-indexed: p-000.png through p-154.png.
    png_path = PNG_DIR / f"p-{page_num - 1:03d}.png"

    if not png_path.exists():
        print(f" WARNING: PNG not found for page {page_num}: {png_path}")
        return _placeholder_page(
            page_num, "blank", "blank",
            "[Page image not found]",
            "[Imagem da página não encontrada]",
            {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
        )

    img_b64 = encode_image_b64(png_path)
    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_num,
        total_pages=TOTAL_PAGES,
        page_png_path=str(png_path),
    )
    error_bbox = {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90}

    max_retries = 3
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1

        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64,
                            },
                        },
                        {"type": "text", "text": prompt},
                    ],
                }],
            )
        except anthropic.APIError as e:
            print(f" Page {page_num} attempt {attempt+1}: API error: {e}")
            if is_last_attempt:
                return _placeholder_page(
                    page_num, "text", "body_paragraph",
                    f"[Page {page_num} — API error]",
                    f"[Página {page_num} — erro de API]",
                    error_bbox,
                )
            # Exponential backoff before the next attempt: 1s, then 2s.
            time.sleep(2 ** attempt)
            continue

        try:
            data = _parse_page_reply(response)
        except ValueError as e:
            # json.JSONDecodeError is a ValueError, so malformed JSON and
            # well-formed JSON of the wrong shape are both retried here.
            # NOTE(review): a reply cut off at max_tokens would also land
            # here and be retried with the same limit — confirm 4096 is
            # enough for dense pages.
            print(f" Page {page_num} attempt {attempt+1}: JSON parse error: {e}")
            if is_last_attempt:
                return _placeholder_page(
                    page_num, "text", "body_paragraph",
                    f"[Page {page_num} — parse error, content not extracted]",
                    f"[Página {page_num} — erro de análise, conteúdo não extraído]",
                    error_bbox,
                )
            continue

        data["page_number"] = page_num  # trust our numbering, not the model's
        return data


def _parse_page_reply(response) -> dict:
    """Extract the page record from a model response or raise ValueError."""
    # Take the first content block that carries text; an empty reply becomes
    # "" and fails in json.loads below, which makes it retryable.
    text = next(
        (block.text for block in response.content
         if isinstance(getattr(block, "text", None), str)),
        "",
    ).strip()

    # Strip markdown code fences if present.
    if text.startswith("```"):
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])

    data = json.loads(text)
    chunks = data.get("chunks") if isinstance(data, dict) else None
    if not isinstance(chunks, list) or not all(isinstance(c, dict) for c in chunks):
        raise ValueError("reply is not an object with a list of chunk objects")
    return data


def _placeholder_page(page_num: int, page_type: str, chunk_type: str,
                      content_en: str, content_pt_br: str, bbox: dict) -> dict:
    """Build the one-chunk stand-in record used when a page cannot be rebuilt."""
    return {
        "page_number": page_num,
        "classification": None,
        "page_type": page_type,
        "chunks": [{
            "order_in_page": 1,
            "type": chunk_type,
            "content_en": content_en,
            "content_pt_br": content_pt_br,
            "bbox": dict(bbox),
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
        }],
    }
|
|
|
|
|
|
def crop_image(chunk_id: str, png_path: Path, bbox: dict):
    """Crop a region from the page PNG and save it to the images directory.

    Args:
        chunk_id: Chunk identifier; the crop is saved as ``IMG-<chunk_id>.png``.
        png_path: Path of the full-page PNG.
        bbox: Region as fractions of the page, with keys ``x``, ``y``, ``w``
            and ``h``. Missing keys default to the full page.

    Returns:
        The path of the written file, or ``None`` if anything went wrong.
        This is best-effort: every failure is printed and swallowed so one
        bad crop does not abort the run.
    """
    pad = 0.005  # small margin, as a fraction of the page, around the region
    try:
        from PIL import Image

        x = bbox.get("x", 0)
        y = bbox.get("y", 0)
        w = bbox.get("w", 1)
        h = bbox.get("h", 1)

        # The context manager closes the underlying file even when cropping
        # or saving fails.
        with Image.open(png_path) as im:
            W, H = im.size
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                # Out-of-page or inverted boxes would otherwise surface as
                # an obscure imaging error.
                raise ValueError(f"empty crop region for bbox {bbox!r}")

            out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
            im.crop((left, top, right, bottom)).save(out_path)
        return out_path
    except Exception as e:
        print(f" Crop error for {chunk_id}: {e}")
        return None
|
|
|
|
|
|
def write_chunk_file(chunk_data: dict, chunk_id: str, page_num: int,
                     order_global: int, prev_chunk, next_chunk,
                     has_image: bool) -> None:
    """Write one chunk as a markdown file with YAML-style front matter.

    Args:
        chunk_data: Chunk record as produced for a page (model output or a
            placeholder). Missing or null fields fall back to defaults.
        chunk_id: Global chunk identifier; also the output file stem.
        page_num: 1-based page number the chunk belongs to.
        order_global: 1-based position of the chunk in the whole document.
        prev_chunk: Identifier of the preceding chunk, or ``None``.
        next_chunk: Identifier of the following chunk, or ``None``.
        has_image: Whether ``IMG-<chunk_id>.png`` should be referenced.

    Raises:
        OSError: If the file cannot be written.
    """
    get = chunk_data.get

    bbox = get("bbox")
    if not isinstance(bbox, dict):
        bbox = {}

    def coord(key: str, default: float) -> float:
        # Model output is untrusted: fall back when the value is not numeric.
        value = bbox.get(key, default)
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return default
        return value

    # NOTE(review): this relative path resolves from the document directory,
    # not from chunks/ where the file is written — confirm the convention.
    source_png = f"../../processing/png/{DOC_ID}/p-{page_num - 1:03d}.png"
    related_image = f"IMG-{chunk_id}.png" if has_image else None

    lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {_plain_scalar(get('type') or 'body_paragraph')}",
        f"page: {page_num}",
        f"order_in_page: {json.dumps(get('order_in_page', 1))}",
        f"order_global: {order_global}",
        f"bbox: {{x: {coord('x', 0):.3f}, y: {coord('y', 0):.3f}, "
        f"w: {coord('w', 1):.3f}, h: {coord('h', 1):.3f}}}",
        f"classification: {json.dumps(get('classification'))}",
        f"formatting: {json.dumps(get('formatting', []))}",
        f"cross_page_hint: {_plain_scalar(get('cross_page_hint') or 'self_contained')}",
        f"prev_chunk: {json.dumps(prev_chunk)}",
        f"next_chunk: {json.dumps(next_chunk)}",
        f"related_image: {json.dumps(related_image)}",
        f"related_table: {json.dumps(get('related_table'))}",
        f"ocr_confidence: {json.dumps(get('ocr_confidence', 0.9))}",
        f"ocr_source_lines: {json.dumps(get('ocr_source_lines', []))}",
        f"redaction_code: {json.dumps(get('redaction_code'))}",
        f"redaction_inferred_content_type: {json.dumps(get('redaction_inferred_content_type'))}",
        f"image_type: {json.dumps(get('image_type'))}",
        f"ufo_anomaly_detected: {_flag(get('ufo_anomaly_detected', False))}",
        f"ufo_anomaly_type: {json.dumps(get('ufo_anomaly_type'))}",
        f"ufo_anomaly_rationale: {json.dumps(get('ufo_anomaly_rationale'))}",
        f"cryptid_anomaly_detected: {_flag(get('cryptid_anomaly_detected', False))}",
        f"cryptid_anomaly_type: {json.dumps(get('cryptid_anomaly_type'))}",
        f"cryptid_anomaly_rationale: {json.dumps(get('cryptid_anomaly_rationale'))}",
        f"image_description_en: {json.dumps(get('image_description_en'))}",
        f"image_description_pt_br: {json.dumps(get('image_description_pt_br'))}",
        f"extracted_text: {json.dumps(get('extracted_text'))}",
        f"source_png: {source_png}",
        "---",
        "",
        f"**EN:** {_text(get('content_en'))}",
        "",
        f"**PT-BR:** {_text(get('content_pt_br'))}",
    ]

    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")


def _plain_scalar(value) -> str:
    """Return *value* bare when it is a simple token, JSON-quoted otherwise."""
    if isinstance(value, str) and value and all(
        ch.isalnum() or ch in "_-." for ch in value
    ):
        return value
    # Anything with spaces, colons, newlines etc. is quoted so it cannot
    # break the front matter.
    return json.dumps(value)


def _flag(value) -> str:
    """Render a possibly-null or string-typed flag as ``true`` or ``false``."""
    if isinstance(value, str):
        return "true" if value.strip().lower() in ("true", "yes", "1") else "false"
    return "true" if value else "false"


def _text(value) -> str:
    """Return *value* for output in the body, with ``None`` shown as empty."""
    return "" if value is None else value
|
|
|
|
|
|
def main():
    """Rebuild the whole document and write every output artifact.

    Steps: process all pages in a thread pool, assign global chunk ids and
    prev/next links, crop image chunks, write one file per chunk, then write
    ``_index.json`` and ``document.md`` and print summary statistics.

    Returns:
        A dict with the keys ``pages``, ``chunks``, ``images``, ``tables``,
        ``ufo``, ``cryptid``, ``wall_seconds`` and ``doc_md_bytes``.
    """
    start_time = time.time()
    max_workers = 4
    build_model = "claude-haiku-4-5"

    print(f"Starting rebuild of {DOC_ID}")
    print(f"Processing {TOTAL_PAGES} pages with {max_workers} parallel workers...")

    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # Step 1: process all pages with a pool of parallel workers.
    all_pages = {}  # page_num -> page_data
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(process_page, page_num): page_num
            for page_num in range(1, TOTAL_PAGES + 1)
        }
        finished = concurrent.futures.as_completed(future_to_page)
        for completed, future in enumerate(finished, start=1):
            page_num = future_to_page[future]
            try:
                all_pages[page_num] = future.result()
            except Exception as e:
                # Boundary handler: one broken page must not abort the run.
                print(f" Page {page_num} failed: {e}")
                all_pages[page_num] = _failed_page(page_num, e)
            # Failed pages count as finished so the progress total adds up.
            if completed % 10 == 0:
                print(f" Completed {completed}/{TOTAL_PAGES} pages...")

    print("All pages processed. Assigning global chunk IDs...")

    # Step 2: assign global chunk ids and prev/next pointers.
    all_chunks = _number_chunks(all_pages, TOTAL_PAGES)
    total_chunks = len(all_chunks)
    print(f"Total chunks: {total_chunks}")

    # Step 3: crop images for image-type chunks.
    print("Cropping images for image chunks...")
    image_chunks = [entry for entry in all_chunks if entry[2]["type"] == "image"]
    print(f" Found {len(image_chunks)} image chunks")

    cropped_ids = set()  # only chunks whose crop was actually written
    for chunk_id, page_num, chunk in image_chunks:
        png_path = PNG_DIR / f"p-{page_num - 1:03d}.png"
        if crop_image(chunk_id, png_path, chunk["bbox"]) is not None:
            cropped_ids.add(chunk_id)

    # Step 4: write chunk files.
    print("Writing chunk files...")
    for chunk_id, page_num, chunk in all_chunks:
        write_chunk_file(
            chunk, chunk_id, page_num,
            chunk["_order_global"],
            chunk["_prev"],
            chunk["_next"],
            # Reference an image only if the crop exists on disk.
            chunk_id in cropped_ids,
        )

    # Step 5: write _index.json.
    print("Writing _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": build_model,
        "build_at": build_at,
        "chunks": [
            {
                "chunk_id": chunk_id,
                "type": chunk["type"],
                "page": page_num,
                "order_in_page": chunk.get("order_in_page", 1),
                "order_global": chunk["_order_global"],
                "file": f"chunks/{chunk_id}.md",
                "bbox": chunk["bbox"],
                "preview": str(chunk.get("content_en") or "")[:80],
            }
            for chunk_id, page_num, chunk in all_chunks
        ],
    }
    index_path = OUT_DIR / "_index.json"
    index_path.write_text(json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8")

    # Step 6: compute stats.
    chunk_types = {}
    ufo_anomalies = []
    cryptid_anomalies = []
    for chunk_id, _page_num, chunk in all_chunks:
        chunk_types[chunk["type"]] = chunk_types.get(chunk["type"], 0) + 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_anomalies.append(chunk_id)
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_anomalies.append(chunk_id)
    images_count = chunk_types.get("image", 0)

    # Step 7: write document.md.
    print("Writing document.md...")
    doc_lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {total_chunks}",
        "chunk_types_histogram:",
    ]
    doc_lines.extend(f" {t}: {count}" for t, count in sorted(chunk_types.items()))
    doc_lines.extend([
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_anomalies)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomalies)}",
        'build_approach: "subagents"',
        f"build_model: {build_model}",
        f"build_at: {build_at}",
        "---",
        "",
    ])
    doc_lines.extend(_render_chunk_sections(all_chunks, cropped_ids))

    doc_content = "\n".join(doc_lines)
    doc_path = OUT_DIR / "document.md"
    doc_path.write_text(doc_content, encoding="utf-8")

    wall_seconds = int(time.time() - start_time)
    doc_md_bytes = len(doc_content.encode("utf-8"))

    print("\nDone!")
    print(f"STATS pages={TOTAL_PAGES} chunks={total_chunks} images={images_count} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_md_bytes}")
    print(f"Wall time: {wall_seconds}s")

    return {
        "pages": TOTAL_PAGES,
        "chunks": total_chunks,
        "images": images_count,
        "tables": 0,
        "ufo": len(ufo_anomalies),
        "cryptid": len(cryptid_anomalies),
        "wall_seconds": wall_seconds,
        "doc_md_bytes": doc_md_bytes,
    }


def _failed_page(page_num: int, error: Exception) -> dict:
    """Build the one-chunk placeholder record for a page whose worker raised."""
    return {
        "page_number": page_num,
        "classification": None,
        "page_type": "text",
        "chunks": [{
            "order_in_page": 1,
            "type": "body_paragraph",
            "content_en": f"[Page {page_num} — processing failed: {error}]",
            "content_pt_br": f"[Página {page_num} — processamento falhou: {error}]",
            "bbox": {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
        }],
    }


def _sanitize_chunk(chunk: dict) -> dict:
    """Repair, in place, the fields later steps format or sort on."""
    if not isinstance(chunk.get("type"), str) or not chunk["type"]:
        chunk["type"] = "body_paragraph"

    defaults = {"x": 0, "y": 0, "w": 1, "h": 1}
    bbox = chunk.get("bbox")
    if not isinstance(bbox, dict):
        chunk["bbox"] = defaults
    else:
        for key, fallback in defaults.items():
            value = bbox.get(key, fallback)
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                bbox[key] = fallback
    return chunk


def _order_key(chunk: dict):
    """Sort key for chunks within a page; unusable orders sort as 0."""
    order = chunk.get("order_in_page", 0)
    if isinstance(order, bool) or not isinstance(order, (int, float)):
        return 0
    return order


def _number_chunks(all_pages: dict, total_pages: int) -> list:
    """Return ``(chunk_id, page_num, chunk)`` for every chunk in reading order.

    Each chunk dict also receives the bookkeeping keys ``_chunk_id``,
    ``_prev``, ``_next`` and ``_order_global``.
    """
    numbered = []
    for page_num in range(1, total_pages + 1):
        page_data = all_pages[page_num]
        raw = page_data.get("chunks") if isinstance(page_data, dict) else None
        # Model output is untrusted: ignore a null/non-list "chunks" and any
        # entries that are not objects instead of crashing after the fact.
        chunks = [
            _sanitize_chunk(c)
            for c in (raw if isinstance(raw, list) else [])
            if isinstance(c, dict)
        ]
        chunks.sort(key=_order_key)
        for chunk in chunks:
            numbered.append((f"c{len(numbered) + 1:04d}", page_num, chunk))

    last = len(numbered) - 1
    for i, (chunk_id, _page_num, chunk) in enumerate(numbered):
        chunk["_chunk_id"] = chunk_id
        chunk["_prev"] = numbered[i - 1][0] if i > 0 else None
        chunk["_next"] = numbered[i + 1][0] if i < last else None
        chunk["_order_global"] = i + 1
    return numbered


def _render_chunk_sections(all_chunks: list, cropped_ids: set) -> list:
    """Return the markdown lines of document.md that follow the front matter."""
    lines = []
    current_page = 0
    for chunk_id, page_num, chunk in all_chunks:
        if page_num != current_page:
            current_page = page_num
            lines.extend([f"## Page {page_num}", ""])

        chunk_type = chunk["type"]
        bbox = chunk["bbox"]
        bbox_str = (
            f"{bbox.get('x', 0):.2f}/{bbox.get('y', 0):.2f}/"
            f"{bbox.get('w', 1):.2f}/{bbox.get('h', 1):.2f}"
        )
        content_en = chunk.get("content_en")
        content_pt = chunk.get("content_pt_br")

        lines.extend([
            f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->",
            f'<a id="{chunk_id}"></a>',
            f"### Chunk {chunk_id} — {chunk_type} · p{page_num} · bbox: {bbox_str}",
            "",
            f"**EN:** {'' if content_en is None else content_en}",
            "",
            f"**PT-BR:** {'' if content_pt is None else content_pt}",
            "",
        ])

        if chunk_type == "image":
            # Embed the cropped image; fall back to a blank line when no
            # crop was written for this chunk.
            if chunk_id in cropped_ids:
                lines.append(f"![{chunk_id}](./images/IMG-{chunk_id}.png)")
            else:
                lines.append("")
            desc_en = chunk.get("image_description_en", "")
            desc_pt = chunk.get("image_description_pt_br", "")
            if desc_en:
                lines.append(f"*{desc_en}*")
            if desc_pt:
                lines.append(f"*{desc_pt}*")
            lines.append("")

        # Metadata shown in the collapsible details block.
        meta = {
            "chunk_id": chunk_id,
            "type": chunk_type,
            "page": page_num,
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["_order_global"],
            "bbox": bbox,
            "classification": chunk.get("classification"),
            "formatting": chunk.get("formatting", []),
            "cross_page_hint": chunk.get("cross_page_hint", "self_contained"),
            "prev_chunk": chunk["_prev"],
            "next_chunk": chunk["_next"],
            "ocr_confidence": chunk.get("ocr_confidence", 0.9),
            "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
            "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
            "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
            "cryptid_anomaly_type": chunk.get("cryptid_anomaly_type"),
            "redaction_code": chunk.get("redaction_code"),
            "image_type": chunk.get("image_type"),
        }
        lines.extend([
            "<details><summary>metadata</summary>",
            "",
            "```json",
            json.dumps(meta, ensure_ascii=False, indent=2),
            "```",
            "",
            "</details>",
            "",
            "---",
            "",
        ])
    return lines
|
|
|
|
|
|
# Run the rebuild only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|