disclosure-bureau/scripts/rebuild_doc65_section2.py

660 lines
27 KiB
Python

#!/usr/bin/env python3
"""
Rebuild script for doc-65-hs1-834228961-62-hq-83894-section-2
Processes all 159 pages in parallel batches of 5, generates chunks, images, index, document.md
"""
import os
import sys
import json
import base64
import time
import concurrent.futures
from datetime import datetime, timezone
from pathlib import Path
import anthropic
from PIL import Image
# ── Config ──────────────────────────────────────────────────────────────────
# Identifier and display title of the document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-2"
DOC_TITLE = "FBI HQ-83894 Section 2 — Flying Discs Investigation (Serials 53-100)"

# Workspace layout: per-document inputs (page PNGs, OCR text) and outputs.
_UFO_ROOT = Path("/Users/guto/ufo")
_PROCESSING_DIR = _UFO_ROOT / "processing"
PNG_DIR = _PROCESSING_DIR / "png" / DOC_ID
OCR_DIR = _PROCESSING_DIR / "ocr" / DOC_ID
OUT_DIR = _UFO_ROOT / "raw" / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

# Single API client shared by every worker thread. It is built with no
# arguments, so credentials come from the SDK defaults (presumably the
# ANTHROPIC_API_KEY environment variable; confirm against the SDK docs).
client = anthropic.Anthropic()

# Chunk type vocabulary. The page prompt spells the same list inline; this
# constant is not referenced elsewhere in the visible code.
CHUNK_TYPES = [
    "cover",
    "letterhead",
    "stamp",
    "header",
    "subheader",
    "paragraph",
    "redaction",
    "signature",
    "image",
    "table_marker",
    "footer",
    "page_number",
    "classification_marking",
    "separator",
    "handwriting",
    "form_field",
    "caption",
    "list_item",
    "annotation",
    "blank",
]
# Build page mapping: sequential 1..159 -> actual file number
def build_page_map():
    """Map sequential page numbers (1..N) to the number embedded in each PNG name.

    Scans ``PNG_DIR`` for ``p-*.png``, parses the integer that follows ``p-``
    and sorts numerically, so gaps in the file numbering collapse into a dense
    1-based sequence.

    A file whose stem is not numeric (for example ``p-cover.png``) is skipped
    with a warning on stderr; previously a single such file raised
    ``ValueError`` while the module was being imported.

    Note: callers rebuild the file name as ``p-{num:03d}.png``, so file names
    are assumed to be zero-padded to at least three digits.

    Returns:
        dict[int, int]: sequential page number -> number taken from the file name.
    """
    numbers = []
    for png in PNG_DIR.glob("p-*.png"):
        try:
            numbers.append(int(png.stem.replace("p-", "")))
        except ValueError:
            print(
                f" [WARN] ignoring PNG with non-numeric page name: {png.name}",
                file=sys.stderr,
                flush=True,
            )
    return {seq: num for seq, num in enumerate(sorted(numbers), start=1)}


# Computed once at import time; both values depend on the contents of PNG_DIR.
PAGE_MAP = build_page_map()
TOTAL_PAGES = len(PAGE_MAP)
def load_image_b64(path: Path) -> str:
    """Return the contents of the file at *path* as a base64 text string."""
    raw_bytes = Path(path).read_bytes()
    encoded = base64.standard_b64encode(raw_bytes)
    return encoded.decode("utf-8")
def load_ocr(actual_num: int) -> str:
    """Return the stripped OCR text for file number *actual_num*.

    Reads ``OCR_DIR / p-NNN.txt`` as UTF-8 (undecodable bytes are replaced).
    Returns an empty string when the file does not exist or holds only
    whitespace.
    """
    candidate = OCR_DIR / f"p-{actual_num:03d}.txt"
    if not candidate.exists():
        return ""
    return candidate.read_text(encoding="utf-8", errors="replace").strip()
# Prompt template for the per-page extraction call. rebuild_page() fills it
# with str.format(doc_title, page_number, total_pages, actual_num, ocr_text),
# which is why every literal JSON brace below is doubled ({{ and }}): format()
# turns them back into single braces. The chunk-type list in the schema lists
# the same values as CHUNK_TYPES.
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder subagent. Your job is to analyze a declassified FBI document page and extract ALL content as structured chunks.
Document: {doc_title}
Page: {page_number} of {total_pages}
Actual file: p-{actual_num:03d}.png
OCR text (may be empty/poor quality):
{ocr_text}
Analyze the image carefully. Extract ALL visible content into chunks. Return a JSON object:
{{
"page_number": {page_number},
"chunks": [
{{
"order_in_page": 1,
"type": "<one of: cover|letterhead|stamp|header|subheader|paragraph|redaction|signature|image|table_marker|footer|page_number|classification_marking|separator|handwriting|form_field|caption|list_item|annotation|blank>",
"content_en": "<exact transcription or English description>",
"content_pt_br": "<Brazilian Portuguese translation/description>",
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
"classification": null,
"formatting": [],
"cross_page_hint": "self_contained",
"ocr_confidence": 0.85,
"ocr_source_lines": [],
"redaction_code": null,
"redaction_inferred_content_type": null,
"image_type": null,
"ufo_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_detected": false,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null
}}
]
}}
Rules:
- bbox: x,y = top-left corner (0.0-1.0 fraction of page), w,h = width/height fractions
- classification: string like "SECRET" or null
- formatting: array of ["bold","italic","all_caps","underline","strikethrough"] as applicable
- cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
- For redaction blocks: type="redaction", include redaction_code if visible (e.g. "(b)(1)", "(b)(3)", "(b)(6)")
- For stamps: type="stamp", describe the stamp text
- For images/diagrams/photos: type="image", set image_type to "photo"|"diagram"|"sketch"|"map"|"chart"|"signature_block"
- For tables: type="table_marker"
- ufo_anomaly_detected: true only if the page contains an image/sketch/photo of an anomalous aerial phenomenon
- cryptid_anomaly_detected: true only if the page contains imagery of cryptids/unknown creatures
- content_en: transcribe verbatim when legible; describe when not (e.g., "[Redacted block]", "[Stamp: RECEIVED]")
- content_pt_br: Brazilian Portuguese equivalent
- Return ONLY valid JSON, no markdown fences, no explanation
- Do NOT skip any visible content area
- Minimum 1 chunk per page (even blank pages get type="blank")
"""
def _strip_code_fences(raw: str) -> str:
    """Return *raw* without a surrounding markdown code fence, if it has one."""
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json). partition() copes with
        # a one-line reply that has no newline after the fence.
        _, _, text = text.partition("\n")
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _sanitize_chunks(chunks: list, page_seq: int) -> list:
    """Normalise model-produced chunks so later phases cannot crash on them.

    - Entries that are not JSON objects are dropped.
    - bbox fields that are not finite numbers are removed, and a non-object
      bbox is removed entirely, so each consumer applies its own default.
    - ``content_en`` / ``content_pt_br`` are forced to strings (null -> "").
    - ``order_in_page`` is renumbered 1..n and ``page`` is set to *page_seq*.
    """
    infinity = float("inf")
    cleaned = []
    for chunk in chunks:
        if not isinstance(chunk, dict):
            continue
        bbox = chunk.get("bbox")
        if isinstance(bbox, dict):
            for key in ("x", "y", "w", "h"):
                if key not in bbox:
                    continue
                value = bbox[key]
                try:
                    number = float(value)
                except (TypeError, ValueError):
                    del bbox[key]
                    continue
                if number != number or abs(number) == infinity:
                    # NaN / Infinity would poison the crop maths and the JSON index.
                    del bbox[key]
                elif not isinstance(value, (int, float)):
                    # Numeric strings such as "0.25" become real numbers.
                    bbox[key] = number
        else:
            chunk.pop("bbox", None)
        for key in ("content_en", "content_pt_br"):
            value = chunk.get(key)
            if value is None:
                chunk[key] = ""
            elif not isinstance(value, str):
                chunk[key] = str(value)
        chunk["order_in_page"] = len(cleaned) + 1
        chunk["page"] = page_seq
        cleaned.append(chunk)
    return cleaned


def _failed_page(page_seq: int, actual_num: int) -> dict:
    """Return the single-chunk placeholder result used when a page cannot be processed."""
    return {
        "page_number": page_seq,
        "actual_num": actual_num,
        "chunks": [{
            "order_in_page": 1,
            "type": "blank",
            "page": page_seq,
            "content_en": "[Page processing failed]",
            "content_pt_br": "[Falha no processamento da página]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
        }],
    }


def rebuild_page(page_seq: int) -> dict:
    """Process one page and return ``{page_number, actual_num, chunks: [...]}``.

    Sends the page PNG plus up to 2000 characters of OCR text to the model and
    parses the JSON reply. The call is attempted up to 3 times, waiting 5s and
    then 10s between attempts. If every attempt fails, or the page's input
    files cannot be read, a one-chunk "blank" placeholder is returned, so a
    single bad page never aborts the run.

    Args:
        page_seq: 1-based sequential page number (a key of ``PAGE_MAP``).

    Raises:
        KeyError: if *page_seq* is not in ``PAGE_MAP``.
    """
    actual_num = PAGE_MAP[page_seq]
    png_path = PNG_DIR / f"p-{actual_num:03d}.png"

    # Reading local files is not worth retrying; fail this page immediately.
    try:
        ocr_text = load_ocr(actual_num)
        img_b64 = load_image_b64(png_path)
    except OSError as e:
        print(f" [FAIL] page {page_seq}: {e}", flush=True)
        return _failed_page(page_seq, actual_num)

    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_seq,
        total_pages=TOTAL_PAGES,
        actual_num=actual_num,
        ocr_text=ocr_text[:2000] if ocr_text else "(no OCR available)",
    )

    retries = 3
    for attempt in range(retries):
        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64,
                            },
                        },
                        {
                            "type": "text",
                            "text": prompt,
                        },
                    ],
                }],
            )
            data = json.loads(_strip_code_fences(response.content[0].text))
            if not isinstance(data, dict):
                raise ValueError("model reply is not a JSON object")
            chunks = data.get("chunks", [])
            if not isinstance(chunks, list):
                raise ValueError("'chunks' is not a list")
            data["page_number"] = page_seq
            data["actual_num"] = actual_num
            data["chunks"] = _sanitize_chunks(chunks, page_seq)
            print(f" [OK] page {page_seq:03d} (p-{actual_num:03d}) → {len(data['chunks'])} chunks", flush=True)
            return data
        except Exception as e:
            # Deliberately broad: API errors, malformed JSON and unexpected
            # reply shapes are all handled by retrying, then by the placeholder.
            if attempt < retries - 1:
                wait = 2 ** attempt * 5
                print(f" [RETRY {attempt+1}] page {page_seq}: {e}, waiting {wait}s", flush=True)
                time.sleep(wait)
            else:
                print(f" [FAIL] page {page_seq}: {e}", flush=True)
    return _failed_page(page_seq, actual_num)
# Prompt for the cropped-image analysis call. analyze_image() sends this text
# verbatim (it is never passed through str.format), so the JSON example uses
# single braces; doubled braces would reach the model literally.
IMAGE_ANALYST_PROMPT = """You are an image analyst examining a cropped region from a declassified FBI document about flying discs / UAP investigations.
Analyze this image region and return a JSON object:
{
"image_type": "<photo|diagram|sketch|map|chart|signature_block|stamp|seal|other>",
"image_description_en": "<detailed description in English>",
"image_description_pt_br": "<descrição detalhada em português brasileiro>",
"extracted_text": "<any text visible in the image, verbatim>",
"ufo_anomaly_detected": <true|false>,
"ufo_anomaly_type": "<type or null>",
"ufo_anomaly_rationale": "<rationale or null>",
"cryptid_anomaly_detected": <true|false>,
"cryptid_anomaly_type": "<type or null>",
"cryptid_anomaly_rationale": "<rationale or null>"
}
Return ONLY valid JSON, no markdown fences.
"""
def _image_meta_fallback(description_en: str, description_pt_br: str) -> dict:
    """Return neutral image metadata carrying only the given descriptions."""
    return {
        "image_type": "other",
        "image_description_en": description_en,
        "image_description_pt_br": description_pt_br,
        "extracted_text": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
    }


def analyze_image(chunk_id: str, img_path: Path) -> dict:
    """Analyze a cropped image and return its metadata dict.

    Returns an "Image not available" record when *img_path* does not exist.
    Otherwise the image is sent to the model with ``IMAGE_ANALYST_PROMPT``;
    the call is attempted up to 3 times (waiting 3s, then 6s) and an
    "Analysis failed" record is returned if every attempt fails.

    The result is always a dict: a reply that parses as JSON but is not an
    object is treated as a failed attempt rather than handed to the caller.

    Args:
        chunk_id: chunk identifier, used only in the failure log line.
        img_path: location of the cropped PNG.
    """
    if not img_path.exists():
        return _image_meta_fallback("Image not available", "Imagem não disponível")

    img_b64 = load_image_b64(img_path)
    retries = 3
    for attempt in range(retries):
        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=1024,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64,
                            },
                        },
                        {
                            "type": "text",
                            "text": IMAGE_ANALYST_PROMPT,
                        },
                    ],
                }],
            )
            raw = response.content[0].text.strip()
            if raw.startswith("```"):
                # Drop the opening fence line; safe even without a newline.
                _, _, raw = raw.partition("\n")
            if raw.endswith("```"):
                raw = raw[:-3]
            meta = json.loads(raw.strip())
            if not isinstance(meta, dict):
                raise ValueError("model reply is not a JSON object")
            return meta
        except Exception as e:
            # Deliberately broad: any API or parsing problem leads to a retry
            # and, after the last attempt, to the fallback record below.
            if attempt < retries - 1:
                time.sleep(2 ** attempt * 3)
            else:
                print(f" [IMAGE FAIL] {chunk_id}: {e}", flush=True)
    return _image_meta_fallback("Analysis failed", "Análise falhou")
def crop_image(page_seq: int, actual_num: int, chunk_id: str, bbox: dict) -> Path:
    """Crop the bbox region from a page PNG and save it to the images dir.

    The box is given as page fractions (x, y = top-left; w, h = size). Missing
    keys default to the full page. The box is padded by 0.5% of the page on
    each side and clamped to the image bounds.

    Failures are logged, not raised. On failure any existing file at the
    destination is removed, so a crop left over from an earlier run is never
    mistaken for this chunk's image.

    Args:
        page_seq: sequential page number (unused; kept for the call signature).
        actual_num: number used in the source file name ``p-NNN.png``.
        chunk_id: chunk identifier; output is ``IMG-<chunk_id>.png``.
        bbox: dict with optional ``x``, ``y``, ``w``, ``h`` fractions.

    Returns:
        The destination path, whether or not the crop succeeded.
    """
    src = PNG_DIR / f"p-{actual_num:03d}.png"
    dst = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        # Context manager guarantees the source file handle is released.
        with Image.open(src) as im:
            W, H = im.size
            x = bbox.get("x", 0.0)
            y = bbox.get("y", 0.0)
            w = bbox.get("w", 1.0)
            h = bbox.get("h", 1.0)
            pad = 0.005
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            im.crop((left, top, right, bottom)).save(str(dst))
    except Exception as e:
        print(f" [CROP FAIL] {chunk_id}: {e}", flush=True)
        # Chunk ids repeat between runs; a leftover file would be analysed as
        # if it were this chunk's crop.
        try:
            if dst.exists():
                dst.unlink()
        except OSError as cleanup_error:
            print(f" [CROP FAIL] {chunk_id}: could not remove stale image: {cleanup_error}", flush=True)
    return dst
# Characters that may appear in an unquoted YAML scalar without changing how
# the front matter parses.
_YAML_PLAIN_CHARS = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 _-./()"
)


def _yaml_scalar(value) -> str:
    """Render *value* as a single-line YAML scalar.

    ``None`` (and the string ``"null"``) become ``null``; booleans become
    ``true``/``false``; numbers are written as-is. Simple strings stay
    unquoted. Any other string (colons, ``#``, quotes, newlines, non-ASCII
    text, empty string) is emitted JSON-quoted, which is a valid YAML
    double-quoted scalar.
    """
    if value is None or value == "null":
        return "null"
    if isinstance(value, bool):
        return str(value).lower()
    if isinstance(value, (int, float)):
        return str(value)
    text = str(value)
    is_plain = (
        bool(text)
        and (text[0].isalnum() or text[0] == "(")
        and not text.endswith(" ")
        and all(ch in _YAML_PLAIN_CHARS for ch in text)
    )
    return text if is_plain else json.dumps(text, ensure_ascii=False)


def _yaml_bool(value) -> str:
    """Render a detection flag as ``true``/``false``, tolerating null and strings."""
    if isinstance(value, str):
        return "true" if value.strip().lower() == "true" else "false"
    return "true" if value else "false"


def _bbox_number(bbox: dict, key: str, default: float) -> float:
    """Return ``bbox[key]`` when it is a number, otherwise *default*."""
    value = bbox.get(key, default)
    return value if isinstance(value, (int, float)) else default


def _as_list(value) -> list:
    """Return *value* as a list (null -> [], scalar -> one-element list)."""
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def write_chunk_file(chunk: dict, chunk_id: str):
    """Write one chunk as ``CHUNKS_DIR/<chunk_id>.md``.

    The file is a YAML front matter block followed by the English and
    Brazilian Portuguese content lines. Chunk values come from model output,
    so free text is quoted where needed and null/malformed fields fall back to
    defaults instead of raising.

    Args:
        chunk: chunk dict as assembled by ``main``.
        chunk_id: global identifier, also used as the file stem.
    """
    bbox = chunk.get("bbox")
    if not isinstance(bbox, dict):
        bbox = {}
    bbox_x = _bbox_number(bbox, "x", 0.0)
    bbox_y = _bbox_number(bbox, "y", 0.0)
    bbox_w = _bbox_number(bbox, "w", 1.0)
    bbox_h = _bbox_number(bbox, "h", 0.1)

    page = chunk.get("page", 1)
    actual_num = PAGE_MAP.get(page, page)

    related_image = f"IMG-{chunk_id}.png" if chunk.get("type") == "image" else "null"
    related_table = chunk.get("related_table", "null") or "null"
    prev_chunk = chunk.get("prev_chunk", "null") or "null"
    next_chunk = chunk.get("next_chunk", "null") or "null"

    fmt_str = "[" + ", ".join(
        json.dumps(str(item), ensure_ascii=False)
        for item in _as_list(chunk.get("formatting"))
    ) + "]"
    ocr_lines_str = "[" + ", ".join(
        _yaml_scalar(item) for item in _as_list(chunk.get("ocr_source_lines"))
    ) + "]"

    lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {_yaml_scalar(chunk.get('type') or 'paragraph')}",
        f"page: {page}",
        f"order_in_page: {chunk.get('order_in_page', 1)}",
        f"order_global: {chunk.get('order_global', 1)}",
        f"bbox: {{x: {bbox_x:.3f}, y: {bbox_y:.3f}, w: {bbox_w:.3f}, h: {bbox_h:.3f}}}",
        f"classification: {_yaml_scalar(chunk.get('classification'))}",
        f"formatting: {fmt_str}",
        f"cross_page_hint: {_yaml_scalar(chunk.get('cross_page_hint') or 'self_contained')}",
        f"prev_chunk: {prev_chunk}",
        f"next_chunk: {next_chunk}",
        f"related_image: {related_image}",
        f"related_table: {related_table}",
        f"ocr_confidence: {_yaml_scalar(chunk.get('ocr_confidence', 0.85))}",
        f"ocr_source_lines: {ocr_lines_str}",
        f"redaction_code: {_yaml_scalar(chunk.get('redaction_code'))}",
        f"redaction_inferred_content_type: {_yaml_scalar(chunk.get('redaction_inferred_content_type'))}",
        f"image_type: {_yaml_scalar(chunk.get('image_type'))}",
        f"ufo_anomaly_detected: {_yaml_bool(chunk.get('ufo_anomaly_detected', False))}",
        f"cryptid_anomaly_detected: {_yaml_bool(chunk.get('cryptid_anomaly_detected', False))}",
        f"ufo_anomaly_type: {_yaml_scalar(chunk.get('ufo_anomaly_type'))}",
        f"ufo_anomaly_rationale: {_yaml_scalar(chunk.get('ufo_anomaly_rationale'))}",
        f"cryptid_anomaly_type: {_yaml_scalar(chunk.get('cryptid_anomaly_type'))}",
        f"cryptid_anomaly_rationale: {_yaml_scalar(chunk.get('cryptid_anomaly_rationale'))}",
        f"image_description_en: {_yaml_scalar(chunk.get('image_description_en'))}",
        f"image_description_pt_br: {_yaml_scalar(chunk.get('image_description_pt_br'))}",
        f"extracted_text: {_yaml_scalar(chunk.get('extracted_text'))}",
        f"source_png: ../../processing/png/{DOC_ID}/p-{actual_num:03d}.png",
        "---",
        # A null content value is written as empty text, not the word "None".
        f"**EN:** {chunk.get('content_en') or ''}",
        f"**PT-BR:** {chunk.get('content_pt_br') or ''}",
        "",
    ]
    chunk_path = CHUNKS_DIR / f"{chunk_id}.md"
    chunk_path.write_text("\n".join(lines), encoding="utf-8")
def _chunk_bbox(chunk: dict) -> dict:
    """Return the chunk's bbox dict, or the default box when it is missing or not a dict."""
    bbox = chunk.get("bbox")
    if isinstance(bbox, dict):
        return bbox
    return {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}


def _bbox_component(bbox: dict, key: str, default):
    """Return ``bbox[key]`` when it is a number, otherwise *default*."""
    value = bbox.get(key, default)
    return value if isinstance(value, (int, float)) else default


def _rebuild_all_pages(batch_size: int) -> dict:
    """Phase 1: run rebuild_page over every page, *batch_size* pages at a time.

    Returns a dict mapping sequential page number -> page result.
    """
    print("\n=== Phase 1: Page rebuilding ===", flush=True)
    results = {}
    page_seqs = list(range(1, TOTAL_PAGES + 1))
    for batch_start in range(0, len(page_seqs), batch_size):
        batch = page_seqs[batch_start:batch_start + batch_size]
        print(f" Batch pages {batch[0]}-{batch[-1]}...", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = [executor.submit(rebuild_page, p) for p in batch]
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                results[result["page_number"]] = result
        # Small delay between batches to avoid rate limits
        if batch_start + batch_size < len(page_seqs):
            time.sleep(1)
    return results


def _number_chunks(all_page_results: dict) -> list:
    """Phase 2: flatten pages into one ordered chunk list with ids and prev/next links."""
    print("\n=== Phase 2: Global chunk numbering ===", flush=True)
    all_chunks = []
    for page_seq in sorted(all_page_results):
        page_data = all_page_results[page_seq]
        page_chunks = sorted(
            page_data.get("chunks", []),
            key=lambda c: c.get("order_in_page", 0),
        )
        for chunk in page_chunks:
            order_global = len(all_chunks) + 1
            chunk["chunk_id"] = f"c{order_global:04d}"
            chunk["order_global"] = order_global
            chunk["actual_num"] = page_data.get("actual_num", page_seq)
            all_chunks.append(chunk)
    last_index = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last_index else None
    print(f" Total chunks: {len(all_chunks)}", flush=True)
    return all_chunks


def _crop_image_chunks(all_chunks: list) -> list:
    """Phase 3: crop every chunk of type "image"; returns those chunks."""
    print("\n=== Phase 3: Cropping images ===", flush=True)
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f" Found {len(image_chunks)} image chunks", flush=True)
    for chunk in image_chunks:
        page = chunk["page"]
        actual_num = chunk.get("actual_num", PAGE_MAP.get(page, page))
        crop_image(page, actual_num, chunk["chunk_id"], _chunk_bbox(chunk))
    return image_chunks


def _analyze_image_chunks(image_chunks: list, batch_size: int) -> None:
    """Phase 4: analyze cropped images in batches and merge the results into each chunk."""
    print("\n=== Phase 4: Image analysis ===", flush=True)
    for batch_start in range(0, len(image_chunks), batch_size):
        batch = image_chunks[batch_start:batch_start + batch_size]
        print(f" Image batch {batch_start+1}-{batch_start+len(batch)}...", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            # Map each future straight to its chunk dict. These are the same
            # objects held in the global chunk list, so no lookup by id is needed.
            futures = {}
            for chunk in batch:
                chunk_id = chunk["chunk_id"]
                img_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
                futures[executor.submit(analyze_image, chunk_id, img_path)] = chunk
            for future in concurrent.futures.as_completed(futures):
                chunk = futures[future]
                img_meta = future.result()
                if not isinstance(img_meta, dict):
                    # A non-object reply carries no usable metadata.
                    img_meta = {}
                chunk.update({
                    "image_type": img_meta.get("image_type", chunk.get("image_type")),
                    "image_description_en": img_meta.get("image_description_en"),
                    "image_description_pt_br": img_meta.get("image_description_pt_br"),
                    "extracted_text": img_meta.get("extracted_text"),
                    "ufo_anomaly_detected": img_meta.get("ufo_anomaly_detected", False),
                    "ufo_anomaly_type": img_meta.get("ufo_anomaly_type"),
                    "ufo_anomaly_rationale": img_meta.get("ufo_anomaly_rationale"),
                    "cryptid_anomaly_detected": img_meta.get("cryptid_anomaly_detected", False),
                    "cryptid_anomaly_type": img_meta.get("cryptid_anomaly_type"),
                    "cryptid_anomaly_rationale": img_meta.get("cryptid_anomaly_rationale"),
                })
                print(f" [IMG OK] {chunk['chunk_id']}", flush=True)
        if batch_start + batch_size < len(image_chunks):
            time.sleep(1)


def _index_entry(chunk: dict) -> dict:
    """Build the _index.json record for one chunk (preview = first 80 chars of content_en)."""
    content_en = chunk.get("content_en") or ""
    preview = (content_en[:80] + "...") if len(content_en) > 80 else content_en
    return {
        "chunk_id": chunk["chunk_id"],
        "type": chunk.get("type") or "paragraph",
        "page": chunk.get("page", 1),
        "order_in_page": chunk.get("order_in_page", 1),
        "order_global": chunk.get("order_global", 1),
        "file": f"chunks/{chunk['chunk_id']}.md",
        "bbox": _chunk_bbox(chunk),
        "preview": preview,
    }


def _render_front_matter(all_chunks: list, ufo_flagged: list, cryptid_flagged: list, build_at: str) -> str:
    """Return the YAML front matter block that opens document.md."""
    type_histogram = {}
    for chunk in all_chunks:
        t = chunk.get("type") or "paragraph"
        type_histogram[t] = type_histogram.get(t, 0) + 1
    histogram_yaml = "\n".join(f" {k}: {v}" for k, v in sorted(type_histogram.items()))
    ufo_yaml = "\n".join(f" - {c}" for c in ufo_flagged) if ufo_flagged else " []"
    cryptid_yaml = "\n".join(f" - {c}" for c in cryptid_flagged) if cryptid_flagged else " []"
    return "\n".join([
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {len(all_chunks)}",
        "chunk_types_histogram:",
        histogram_yaml,
        "multi_page_tables: []",
        "ufo_anomalies_flagged:",
        ufo_yaml,
        "cryptid_anomalies_flagged:",
        cryptid_yaml,
        'build_approach: "subagents"',
        'build_model: "claude-haiku-4-5"',
        f'build_at: "{build_at}"',
        "---",
        "",
    ])


def _render_chunk(chunk: dict, page_seq: int) -> list:
    """Return the document.md fragments for one chunk: heading, content, image/table, metadata."""
    chunk_id = chunk["chunk_id"]
    ctype = chunk.get("type") or "paragraph"
    bbox = _chunk_bbox(chunk)
    bbox_str = "/".join(
        f"{_bbox_component(bbox, key, default):.2f}"
        for key, default in (("x", 0), ("y", 0), ("w", 1), ("h", 0.1))
    )
    parts = [
        f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->\n",
        f'<a id="{chunk_id}"></a>\n',
        # Separator between id and type added: the heading used to render
        # as e.g. "c0001paragraph".
        f"### Chunk {chunk_id} · {ctype} · p{page_seq} · bbox: {bbox_str}\n\n",
        f"**EN:** {chunk.get('content_en') or ''}\n\n",
        f"**PT-BR:** {chunk.get('content_pt_br') or ''}\n\n",
    ]
    # Image embed
    if ctype == "image":
        img_rel = f"./images/IMG-{chunk_id}.png"
        parts.append(f"![{chunk_id} image]({img_rel})\n\n")
        desc_en = chunk.get("image_description_en", "")
        desc_pt = chunk.get("image_description_pt_br", "")
        if desc_en:
            parts.append(f"**Image Description (EN):** {desc_en}\n\n")
        if desc_pt:
            parts.append(f"**Descrição da Imagem (PT-BR):** {desc_pt}\n\n")
    # Table render (nothing in this script sets "stitched_table"; kept for
    # chunks that arrive with it already populated)
    if ctype == "table_marker" and chunk.get("stitched_table"):
        rows = chunk["stitched_table"]
        parts.append("<table>\n")
        for row in rows:
            parts.append("<tr>" + "".join(f"<td>{cell}</td>" for cell in row) + "</tr>\n")
        parts.append("</table>\n\n")
    # Metadata details
    meta = {
        "chunk_id": chunk_id,
        "type": ctype,
        "page": chunk.get("page"),
        "order_in_page": chunk.get("order_in_page"),
        "order_global": chunk.get("order_global"),
        "bbox": bbox,
        "classification": chunk.get("classification"),
        "formatting": chunk.get("formatting", []),
        "cross_page_hint": chunk.get("cross_page_hint"),
        "prev_chunk": chunk.get("prev_chunk"),
        "next_chunk": chunk.get("next_chunk"),
        "ocr_confidence": chunk.get("ocr_confidence"),
        "redaction_code": chunk.get("redaction_code"),
        "image_type": chunk.get("image_type"),
        "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
        "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
    }
    meta_json = json.dumps(meta, indent=2, ensure_ascii=False)
    parts.append(f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n")
    return parts


def main():
    """Run the full rebuild: pages -> chunks -> images -> chunk files -> index -> document.md.

    Phases, in order:
      1. rebuild every page in parallel batches of 5;
      2. number chunks globally and link prev/next;
      3. crop image chunks;
      4. analyze cropped images in parallel batches of 5;
      5. count table markers (no cross-page stitching is performed);
      6. write one markdown file per chunk;
      7. write ``_index.json``;
      8. assemble ``document.md`` and print summary statistics.
    """
    t_start = time.time()
    print(f"Starting rebuild of {DOC_ID}", flush=True)
    print(f"Total pages: {TOTAL_PAGES}", flush=True)

    # Ensure output dirs
    for out_dir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        out_dir.mkdir(parents=True, exist_ok=True)

    batch_size = 5
    all_page_results = _rebuild_all_pages(batch_size)
    all_chunks = _number_chunks(all_page_results)
    image_chunks = _crop_image_chunks(all_chunks)
    _analyze_image_chunks(image_chunks, batch_size)

    # Step 5: Check for cross-page table stitching
    print("\n=== Phase 5: Table stitching check ===", flush=True)
    # No stitching is done in this pass, so the reported count stays at zero.
    tables_stitched = 0
    table_markers = [c for c in all_chunks if c.get("type") == "table_marker"]
    print(f" Found {len(table_markers)} table markers", flush=True)

    # Step 6: Write individual chunk files
    print("\n=== Phase 6: Writing chunk files ===", flush=True)
    for chunk in all_chunks:
        write_chunk_file(chunk, chunk["chunk_id"])
    print(f" Wrote {len(all_chunks)} chunk files", flush=True)

    # Step 7: Write _index.json
    print("\n=== Phase 7: Writing _index.json ===", flush=True)
    build_at = datetime.now(timezone.utc).isoformat()
    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": [_index_entry(chunk) for chunk in all_chunks],
    }
    index_path = OUT_DIR / "_index.json"
    index_path.write_text(json.dumps(index, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f" Written: {index_path}", flush=True)

    # Step 8: Assemble document.md
    print("\n=== Phase 8: Assembling document.md ===", flush=True)
    ufo_flagged = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_flagged = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]
    images_extracted = len(image_chunks)

    doc_parts = [_render_front_matter(all_chunks, ufo_flagged, cryptid_flagged, build_at)]

    # Group chunks by page
    chunks_by_page = {}
    for chunk in all_chunks:
        chunks_by_page.setdefault(chunk.get("page", 1), []).append(chunk)
    for page_seq in sorted(chunks_by_page):
        doc_parts.append(f"\n## Page {page_seq}\n")
        for chunk in sorted(chunks_by_page[page_seq], key=lambda c: c.get("order_in_page", 1)):
            doc_parts.extend(_render_chunk(chunk, page_seq))

    doc_md = "".join(doc_parts)
    doc_path = OUT_DIR / "document.md"
    doc_path.write_text(doc_md, encoding="utf-8")
    doc_md_bytes = len(doc_md.encode("utf-8"))
    print(f" Written: {doc_path} ({doc_md_bytes} bytes)", flush=True)

    t_end = time.time()
    wall_seconds = int(t_end - t_start)
    print(f"\n=== DONE ===", flush=True)
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={images_extracted} tables={tables_stitched} ufo={len(ufo_flagged)} cryptid={len(cryptid_flagged)} doc_md_bytes={doc_md_bytes}", flush=True)
    print(f"\npages_done={TOTAL_PAGES}, chunks_total={len(all_chunks)}, images_extracted={images_extracted}, tables_stitched={tables_stitched}, ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, wall_seconds={wall_seconds}", flush=True)


if __name__ == "__main__":
    main()