disclosure-bureau/scripts/rebuild_doc65_s2_v2.py

629 lines
24 KiB
Python

#!/usr/bin/env python3
"""
Rebuild script v2 for doc-65-hs1-834228961-62-hq-83894-section-2
Uses claude CLI for vision processing (no direct API key needed).
Processes 159 pages in batches of 5.
"""
import os
import sys
import json
import time
import subprocess
import concurrent.futures
import textwrap
from datetime import datetime, timezone
from pathlib import Path
from PIL import Image
# ── Config ──────────────────────────────────────────────────────────────────
# Identity of the document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-2"
DOC_TITLE = "FBI HQ-83894 Section 2 — Flying Discs Investigation (Serials 53-100)"

# Inputs (page PNGs, OCR text) and outputs all live under one workspace root,
# each in a sub-directory named after DOC_ID.
_WORKSPACE_ROOT = Path("/Users/guto/ufo")
_PROCESSING_ROOT = _WORKSPACE_ROOT / "processing"
PNG_DIR = _PROCESSING_ROOT / "png" / DOC_ID
OCR_DIR = _PROCESSING_ROOT / "ocr" / DOC_ID
OUT_DIR = _WORKSPACE_ROOT / "raw" / DOC_ID
CHUNKS_DIR, IMAGES_DIR, TABLES_DIR = (
    OUT_DIR / subdir for subdir in ("chunks", "images", "tables")
)

# Executable handed to subprocess for every model call.
CLAUDE_BIN = "/Users/guto/.local/bin/claude"
# Number of pages (or images) submitted to the thread pool per batch.
BATCH_SIZE = 5
# Seconds allowed for one page call.
CLAUDE_TIMEOUT = 120
def build_page_map():
    """Map 1-based sequential page numbers to the numbers in the PNG file names.

    Each ``p-*.png`` file in PNG_DIR contributes the integer left in its stem
    once ``p-`` is removed. Those integers are sorted ascending and numbered
    from 1 in that order.
    """
    file_numbers = [
        int(png_path.stem.replace("p-", ""))
        for png_path in PNG_DIR.glob("p-*.png")
    ]
    file_numbers.sort()
    return dict(enumerate(file_numbers, start=1))


# Computed once at import time from whatever PNG_DIR holds at that moment.
PAGE_MAP = build_page_map()
TOTAL_PAGES = len(PAGE_MAP)
def load_ocr(actual_num: int) -> str:
    """Return up to 2000 characters of stripped OCR text for one page file.

    Reads ``p-NNN.txt`` (number zero-padded to three digits) from OCR_DIR,
    replacing undecodable bytes. Returns an empty string when the file does
    not exist.
    """
    candidate = OCR_DIR / f"p-{actual_num:03d}.txt"
    if not candidate.exists():
        return ""
    stripped = candidate.read_text(encoding="utf-8", errors="replace").strip()
    # Slicing an empty string is still empty, so no separate emptiness test.
    return stripped[:2000]
# Prompt for the per-page pass. rebuild_page() fills it with str.format() using
# the placeholders {doc_title}, {actual_num}, {page_seq}, {total_pages},
# {ocr_text} and {doc_id}. Doubled braces ({{ and }}) are str.format escapes
# that come out as single literal braces in the JSON example.
# NOTE(review): the image path below spells out the same directory as PNG_DIR
# as a literal; if PNG_DIR moves, this line has to be changed by hand.
PAGE_REBUILDER_PROMPT_TEMPLATE = """You are a page-rebuilder agent analyzing a page from a declassified FBI document about Flying Discs / UAP investigations.
Document: {doc_title}
Actual page file: p-{actual_num:03d}.png
Sequential page number: {page_seq} of {total_pages}
OCR text (may be empty or poor quality):
{ocr_text}
Use the Read tool to read this image:
/Users/guto/ufo/processing/png/{doc_id}/p-{actual_num:03d}.png
Then analyze ALL visible content and return a JSON object with this exact structure (return ONLY the JSON, no markdown fences, no explanation):
{{
"page_number": {page_seq},
"chunks": [
{{
"order_in_page": 1,
"type": "cover",
"content_en": "exact transcription or description in English",
"content_pt_br": "descrição ou transcrição em português brasileiro",
"bbox": {{"x": 0.00, "y": 0.00, "w": 1.00, "h": 0.10}},
"classification": null,
"formatting": [],
"cross_page_hint": "self_contained",
"ocr_confidence": 0.85,
"ocr_source_lines": [],
"redaction_code": null,
"redaction_inferred_content_type": null,
"image_type": null,
"ufo_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_detected": false,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null
}}
]
}}
RULES:
- type must be ONE of: cover | letterhead | stamp | header | subheader | paragraph | redaction | signature | image | table_marker | footer | page_number | classification_marking | separator | handwriting | form_field | caption | list_item | annotation | blank
- bbox: x,y = top-left corner fraction (0.0-1.0), w,h = width/height fractions (0.0-1.0)
- Split the page into logical chunks (letterhead separate from body text, stamps separate, etc.)
- For redacted blocks: type=redaction, include redaction_code if visible e.g. "(b)(1)", "(b)(3)", "(b)(6)"
- For stamps (RECEIVED, RECORDED, etc.): type=stamp
- For photos, sketches, diagrams: type=image, set image_type to photo|diagram|sketch|map|chart|signature_block|stamp|seal|other
- For tables: type=table_marker
- cross_page_hint: self_contained | continues_to_next | continues_from_prev
- content_en: verbatim transcription when legible; describe otherwise e.g. "[Stamp: RECEIVED OCT 6 1947]"
- content_pt_br: Brazilian Portuguese translation/description
- formatting: array of applicable: bold | italic | all_caps | underline | typewritten | handwritten
- ufo_anomaly_detected: true ONLY if page has image/sketch/photo of an anomalous aerial object
- Blank pages: one chunk with type=blank
- Return ONLY valid JSON, nothing else"""
# Prompt for the per-image pass. analyze_image() fills the single {img_path}
# placeholder with str.format(); doubled braces ({{ and }}) are str.format
# escapes that come out as single literal braces in the JSON example.
IMAGE_ANALYST_PROMPT_TEMPLATE = """You are an image analyst for declassified FBI UFO/UAP investigation documents.
Read this cropped image region:
{img_path}
Analyze it and return ONLY this JSON (no markdown fences):
{{
"image_type": "photo",
"image_description_en": "detailed description in English",
"image_description_pt_br": "descrição detalhada em português brasileiro",
"extracted_text": "any text visible verbatim or null",
"ufo_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_detected": false,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null
}}
image_type: photo | diagram | sketch | map | chart | signature_block | stamp | seal | other
Return ONLY valid JSON."""
def run_claude(prompt: str, timeout: int = CLAUDE_TIMEOUT) -> str:
    """Run the claude CLI once with ``prompt`` and return its stripped stdout.

    Args:
        prompt: Full prompt text, passed as the last command-line argument.
        timeout: Seconds to wait before the subprocess is abandoned.

    Returns:
        The stripped stdout text; ``""`` when the call timed out; or a string
        starting with ``"ERROR:"`` when the process could not be run at all.
        A non-zero exit status does not change the return value, but it is
        reported on stderr so the failure is visible in the run log.
    """
    command = [
        CLAUDE_BIN, "-p", "--dangerously-skip-permissions",
        "--model", "claude-haiku-4-5",
        "--no-session-persistence",
        prompt,
    ]
    try:
        # No explicit env: subprocess inherits the current environment by
        # default, which is what passing a copy of os.environ achieved.
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return ""
    except Exception as e:
        return f"ERROR: {e}"
    if result.returncode != 0:
        # Previously the exit status and stderr were dropped, so a failing CLI
        # was indistinguishable from an empty or malformed answer.
        detail = (result.stderr or "").strip()[:200]
        print(f"  [CLAUDE EXIT {result.returncode}] {detail}", file=sys.stderr, flush=True)
    return result.stdout.strip()
def parse_json_response(raw: str):
    """Extract the first JSON object from a model response.

    A leading markdown fence line (and a trailing one, if present) is removed,
    then decoding starts at the first ``{`` in the remaining text. Anything
    after the end of that object is ignored.

    Args:
        raw: Response text; may carry prose or fences around the JSON.

    Returns:
        The decoded object, or ``None`` when there is no ``{`` or the text
        starting there is not valid JSON.
    """
    text = (raw or "").strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and a closing fence.
        lines = text.split("\n")[1:]
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        text = "\n".join(lines).strip()
    start = text.find("{")
    if start == -1:
        return None
    try:
        # raw_decode understands string literals, so a brace inside a value
        # (e.g. "(b)(1) }") no longer cuts the object short the way manual
        # brace counting did. Only the first "{" is tried, so a truncated
        # response never yields one of its nested objects by accident.
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return None
    return parsed
# Box used when a chunk has no usable bbox; same defaults the writers use.
_DEFAULT_BBOX = {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}


def _sanitize_bbox(bbox) -> dict:
    """Return ``bbox`` with x/y/w/h guaranteed numeric, defaulting bad values."""
    if not isinstance(bbox, dict):
        return dict(_DEFAULT_BBOX)
    cleaned = dict(bbox)
    for key, fallback in _DEFAULT_BBOX.items():
        value = cleaned.get(key, fallback)
        # bool is an int subclass but is never a meaningful coordinate.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            value = fallback
        cleaned[key] = value
    return cleaned


def _failed_page_result(page_seq: int, actual_num: int) -> dict:
    """Build the one-chunk placeholder page used when rebuilding fails."""
    return {
        "page_number": page_seq,
        "actual_num": actual_num,
        "chunks": [{
            "order_in_page": 1,
            "type": "blank",
            "page": page_seq,
            "content_en": "[Page processing failed - manual review required]",
            "content_pt_br": "[Falha no processamento da página - revisão manual necessária]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None
        }]
    }


def rebuild_page(page_seq: int) -> dict:
    """Process one page via claude CLI, retrying up to three times.

    Args:
        page_seq: 1-based sequential page number; must be a key of PAGE_MAP.

    Returns:
        A dict with ``page_number``, ``actual_num`` and ``chunks``. Every
        chunk is a dict with ``order_in_page`` renumbered from 1, ``page`` set
        to ``page_seq`` and a numeric ``bbox``. When every attempt fails, a
        single ``blank`` chunk flagged for manual review is returned instead.
    """
    actual_num = PAGE_MAP[page_seq]
    ocr_text = load_ocr(actual_num)
    prompt = PAGE_REBUILDER_PROMPT_TEMPLATE.format(
        doc_title=DOC_TITLE,
        actual_num=actual_num,
        page_seq=page_seq,
        total_pages=TOTAL_PAGES,
        ocr_text=ocr_text if ocr_text else "(no OCR available)",
        doc_id=DOC_ID
    )
    retries = 3
    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        raw = run_claude(prompt, timeout=CLAUDE_TIMEOUT)
        if not raw or raw.startswith("ERROR:"):
            if is_last_attempt:
                # This path used to fall through to the placeholder silently.
                print(f"  [FAIL] page {page_seq}: empty/error response after {retries} attempts", flush=True)
                break
            wait = 5 * (attempt + 1)
            print(f"  [RETRY {attempt+1}] page {page_seq}: empty/error, waiting {wait}s", flush=True)
            time.sleep(wait)
            continue

        data = parse_json_response(raw)
        chunks = data.get("chunks") if isinstance(data, dict) else None
        # Anything but a list of dicts would raise below, and an exception in
        # this worker aborts the whole run once main() collects the future.
        # Treat it like unparseable JSON and retry instead.
        if isinstance(chunks, list) and all(isinstance(c, dict) for c in chunks):
            data["page_number"] = page_seq
            data["actual_num"] = actual_num
            for position, chunk in enumerate(chunks, start=1):
                chunk["order_in_page"] = position
                chunk["page"] = page_seq
                # A null or non-numeric bbox would crash later formatting.
                chunk["bbox"] = _sanitize_bbox(chunk.get("bbox"))
            print(f"  [OK] page {page_seq:03d} (p-{actual_num:03d}) → {len(chunks)} chunks", flush=True)
            return data

        if is_last_attempt:
            print(f"  [FAIL] page {page_seq}: could not parse JSON. Raw: {raw[:200]}", flush=True)
        else:
            print(f"  [RETRY {attempt+1}] page {page_seq}: bad JSON, retrying", flush=True)
            time.sleep(3)

    return _failed_page_result(page_seq, actual_num)
def crop_image(page_seq: int, actual_num: int, chunk_id: str, bbox: dict) -> Path:
    """Crop the bbox region out of a page PNG and save it as ``IMG-<chunk_id>.png``.

    Args:
        page_seq: Sequential page number; accepted for the caller's
            convenience but not used here.
        actual_num: Number in the source file name (``p-NNN.png`` in PNG_DIR).
        chunk_id: Chunk identifier used to name the output file.
        bbox: Fractional ``x``/``y``/``w``/``h`` box; values are clamped to the
            page and the crop is padded by 0.5% of the page on every side.

    Returns:
        The destination path in IMAGES_DIR. The path is returned even when
        cropping failed, so callers should check that the file exists.
    """
    src = PNG_DIR / f"p-{actual_num:03d}.png"
    dst = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        # Context manager closes the source file; without it every crop left
        # a file handle open until garbage collection.
        with Image.open(src) as im:
            W, H = im.size
            x = max(0.0, min(1.0, bbox.get("x", 0.0)))
            y = max(0.0, min(1.0, bbox.get("y", 0.0)))
            # Width/height are at least 1% and never extend past the page edge.
            w = max(0.01, min(1.0 - x, bbox.get("w", 1.0)))
            h = max(0.01, min(1.0 - y, bbox.get("h", 0.1)))
            pad = 0.005
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            im.crop((left, top, right, bottom)).save(str(dst))
    except Exception as e:
        # Best effort by design: one bad crop must not stop the run.
        print(f"  [CROP FAIL] {chunk_id}: {e}", flush=True)
    return dst
def analyze_image(chunk_id: str, img_path: Path) -> dict:
    """Describe one cropped image via the claude CLI.

    Returns the parsed JSON object from the model on success. When the image
    file is missing, or both attempts fail to produce a parseable non-empty
    object, a placeholder dict with ``image_type`` "other" is returned.
    """

    def placeholder(description_en: str, description_pt_br: str) -> dict:
        # Same keys, in the same order, as the JSON shape the prompt asks for.
        return {
            "image_type": "other",
            "image_description_en": description_en,
            "image_description_pt_br": description_pt_br,
            "extracted_text": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None
        }

    if not img_path.exists():
        return placeholder("Image not available", "Imagem não disponível")

    prompt = IMAGE_ANALYST_PROMPT_TEMPLATE.format(img_path=str(img_path))
    max_attempts = 2
    attempts_made = 0
    while attempts_made < max_attempts:
        parsed = parse_json_response(run_claude(prompt, timeout=60))
        if parsed:
            print(f"  [IMG OK] {chunk_id}", flush=True)
            return parsed
        attempts_made += 1
        # Pause only when another attempt is still to come.
        if attempts_made < max_attempts:
            time.sleep(3)

    print(f"  [IMG FAIL] {chunk_id}", flush=True)
    return placeholder("Analysis failed", "Análise falhou")
# Characters that force a front-matter string to be quoted.
_YAML_SPECIAL_CHARS = frozenset(':{}[],\n\r\t#&*?|-<>=!%@`"\'\\')
# Bare words a YAML parser would read as null/boolean rather than text.
_YAML_RESERVED_WORDS = frozenset({"null", "~", "true", "false", "yes", "no", "on", "off"})


def _yaml_scalar(value) -> str:
    """Render one front-matter value as a YAML scalar, quoting when needed."""
    if value is None:
        return "null"
    if isinstance(value, bool):
        return str(value).lower()
    if isinstance(value, (int, float)):
        return str(value)
    text = str(value)
    try:
        float(text)
        looks_numeric = True
    except ValueError:
        looks_numeric = False
    needs_quotes = (
        text == ""
        or text != text.strip()
        or looks_numeric
        or text.lower() in _YAML_RESERVED_WORDS
        or any(ch in _YAML_SPECIAL_CHARS for ch in text)
    )
    if not needs_quotes:
        return text
    # A JSON string literal is a valid YAML double-quoted scalar, and
    # json.dumps escapes embedded quotes, backslashes and newlines, which
    # plain f'"{s}"' wrapping did not.
    return json.dumps(text, ensure_ascii=False)


def write_chunk_file(chunk: dict):
    """Write one chunk as a markdown file with YAML front matter.

    The file is written to ``CHUNKS_DIR/<chunk_id>.md``. ``chunk`` must
    contain ``chunk_id``; every other field falls back to a default when it
    is missing. A missing, null or non-numeric ``bbox`` is replaced by the
    default box, and null content is written as an empty string.
    """
    chunk_id = chunk["chunk_id"]
    page = chunk.get("page", 1)
    actual_num = PAGE_MAP.get(page, page)
    ctype = chunk.get("type", "paragraph")

    raw_bbox = chunk.get("bbox")
    bbox = raw_bbox if isinstance(raw_bbox, dict) else {}

    def coord(key: str, fallback: float) -> float:
        value = bbox.get(key, fallback)
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return fallback
        return value

    related_image = f"IMG-{chunk_id}.png" if ctype == "image" else "null"
    related_table = chunk.get("related_table") or "null"
    prev_chunk = chunk.get("prev_chunk") or "null"
    next_chunk = chunk.get("next_chunk") or "null"

    # json.dumps renders lists exactly as before for simple values
    # (["bold", "italic"], [1, 2]) while escaping anything unusual.
    fmt_list = chunk.get("formatting") or []
    if not isinstance(fmt_list, list):
        fmt_list = [fmt_list]
    fmt_str = json.dumps([str(f) for f in fmt_list], ensure_ascii=False)
    ocr_lines = chunk.get("ocr_source_lines") or []
    if not isinstance(ocr_lines, list):
        ocr_lines = [ocr_lines]
    ocr_lines_str = json.dumps(ocr_lines, ensure_ascii=False)

    yv = _yaml_scalar
    cross_page_hint = yv(chunk.get("cross_page_hint") or "self_contained")
    content_en = chunk.get("content_en") or ""
    content_pt_br = chunk.get("content_pt_br") or ""

    content = f"""---
chunk_id: {chunk_id}
type: {ctype}
page: {page}
order_in_page: {chunk.get("order_in_page", 1)}
order_global: {chunk.get("order_global", 1)}
bbox: {{x: {coord("x", 0.0):.3f}, y: {coord("y", 0.0):.3f}, w: {coord("w", 1.0):.3f}, h: {coord("h", 0.1):.3f}}}
classification: {yv(chunk.get("classification"))}
formatting: {fmt_str}
cross_page_hint: {cross_page_hint}
prev_chunk: {prev_chunk}
next_chunk: {next_chunk}
related_image: {related_image}
related_table: {related_table}
ocr_confidence: {chunk.get("ocr_confidence", 0.85)}
ocr_source_lines: {ocr_lines_str}
redaction_code: {yv(chunk.get("redaction_code"))}
redaction_inferred_content_type: {yv(chunk.get("redaction_inferred_content_type"))}
image_type: {yv(chunk.get("image_type"))}
ufo_anomaly_detected: {str(chunk.get("ufo_anomaly_detected", False)).lower()}
cryptid_anomaly_detected: {str(chunk.get("cryptid_anomaly_detected", False)).lower()}
ufo_anomaly_type: {yv(chunk.get("ufo_anomaly_type"))}
ufo_anomaly_rationale: {yv(chunk.get("ufo_anomaly_rationale"))}
cryptid_anomaly_type: {yv(chunk.get("cryptid_anomaly_type"))}
cryptid_anomaly_rationale: {yv(chunk.get("cryptid_anomaly_rationale"))}
image_description_en: {yv(chunk.get("image_description_en"))}
image_description_pt_br: {yv(chunk.get("image_description_pt_br"))}
extracted_text: {yv(chunk.get("extracted_text"))}
source_png: ../../processing/png/{DOC_ID}/p-{actual_num:03d}.png
---
**EN:** {content_en}
**PT-BR:** {content_pt_br}
"""
    (CHUNKS_DIR / f"{chunk_id}.md").write_text(content, encoding="utf-8")
# Keys an image-analysis result may set on a chunk. Anything else the model
# returns is ignored so it cannot overwrite chunk_id, page, type, bbox, etc.
_IMAGE_META_KEYS = (
    "image_type",
    "image_description_en",
    "image_description_pt_br",
    "extracted_text",
    "ufo_anomaly_detected",
    "ufo_anomaly_type",
    "ufo_anomaly_rationale",
    "cryptid_anomaly_detected",
    "cryptid_anomaly_type",
    "cryptid_anomaly_rationale",
)
_FALLBACK_BBOX = {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}


def _chunk_bbox(chunk: dict) -> dict:
    """Return the chunk's bbox with numeric x/y/w/h, defaulting bad values."""
    raw = chunk.get("bbox")
    if not isinstance(raw, dict):
        return dict(_FALLBACK_BBOX)
    bbox = dict(raw)
    for key, fallback in _FALLBACK_BBOX.items():
        value = bbox.get(key, fallback)
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            value = fallback
        bbox[key] = value
    return bbox


def _render_document_md(all_chunks: list, build_at: str, ufo_flagged: list, cryptid_flagged: list) -> str:
    """Assemble the full text of document.md: front matter, then chunks by page."""
    type_histogram = {}
    for chunk in all_chunks:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1
    histogram_yaml = "\n".join(f" {k}: {v}" for k, v in sorted(type_histogram.items()))

    def list_yaml(items):
        if not items:
            return " []"
        return "\n".join(f" - {i}" for i in items)

    doc_parts = [f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {TOTAL_PAGES}
total_chunks: {len(all_chunks)}
chunk_types_histogram:
{histogram_yaml}
multi_page_tables: []
ufo_anomalies_flagged:
{list_yaml(ufo_flagged)}
cryptid_anomalies_flagged:
{list_yaml(cryptid_flagged)}
build_approach: "subagents"
build_model: "claude-haiku-4-5"
build_at: "{build_at}"
---
"""]

    chunks_by_page = {}
    for chunk in all_chunks:
        chunks_by_page.setdefault(chunk.get("page", 1), []).append(chunk)

    for page_seq in sorted(chunks_by_page):
        doc_parts.append(f"\n## Page {page_seq}\n")
        for chunk in sorted(chunks_by_page[page_seq], key=lambda c: c.get("order_in_page", 1)):
            chunk_id = chunk["chunk_id"]
            ctype = chunk.get("type", "paragraph")
            bbox = _chunk_bbox(chunk)
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"
            doc_parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->\n")
            doc_parts.append(f'<a id="{chunk_id}"></a>\n')
            # The id and type used to be printed with nothing between them
            # ("c0001paragraph"); separate them like the other heading fields.
            doc_parts.append(f"### Chunk {chunk_id} · {ctype} · p{page_seq} · bbox: {bbox_str}\n\n")
            doc_parts.append(f"**EN:** {chunk.get('content_en') or ''}\n\n")
            doc_parts.append(f"**PT-BR:** {chunk.get('content_pt_br') or ''}\n\n")
            if ctype == "image":
                doc_parts.append(f"![{chunk_id} image](./images/IMG-{chunk_id}.png)\n\n")
                if chunk.get("image_description_en"):
                    doc_parts.append(f"**Image Description (EN):** {chunk['image_description_en']}\n\n")
                if chunk.get("image_description_pt_br"):
                    doc_parts.append(f"**Descrição da Imagem (PT-BR):** {chunk['image_description_pt_br']}\n\n")
            # Metadata block
            meta = {
                "chunk_id": chunk_id,
                "type": ctype,
                "page": chunk.get("page"),
                "order_in_page": chunk.get("order_in_page"),
                "order_global": chunk.get("order_global"),
                "bbox": bbox,
                "classification": chunk.get("classification"),
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": chunk.get("cross_page_hint"),
                "prev_chunk": chunk.get("prev_chunk"),
                "next_chunk": chunk.get("next_chunk"),
                "ocr_confidence": chunk.get("ocr_confidence"),
                "redaction_code": chunk.get("redaction_code"),
                "image_type": chunk.get("image_type"),
                "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
                "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
                "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
                "ufo_anomaly_rationale": chunk.get("ufo_anomaly_rationale"),
            }
            meta_json = json.dumps(meta, indent=2, ensure_ascii=False)
            doc_parts.append(
                f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n"
            )
    return "".join(doc_parts)


def main():
    """Run the full rebuild: pages → chunks → image crops/analysis → output files.

    Writes one markdown file per chunk, ``_index.json`` and ``document.md``
    under OUT_DIR. ``_rebuild_state.json`` is rewritten after every page batch
    and deleted once the run completes. Progress and a final stats line are
    printed to stdout.
    """
    t_start = time.time()
    print(f"Starting rebuild: {DOC_ID}", flush=True)
    print(f"Total pages: {TOTAL_PAGES}", flush=True)
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)
    state_path = OUT_DIR / "_rebuild_state.json"

    # Phase 1: Rebuild pages in parallel batches of BATCH_SIZE
    print("\n=== Phase 1: Page rebuilding ===", flush=True)
    all_page_results = {}
    page_seqs = list(range(1, TOTAL_PAGES + 1))
    for batch_start in range(0, len(page_seqs), BATCH_SIZE):
        batch = page_seqs[batch_start:batch_start + BATCH_SIZE]
        print(f" Batch pages {batch[0]}-{batch[-1]}...", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
            futures = {executor.submit(rebuild_page, p): p for p in batch}
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                all_page_results[result["page_number"]] = result
        # Save intermediate state after each batch
        state_path.write_text(
            json.dumps({str(k): v for k, v in all_page_results.items()}, ensure_ascii=False),
            encoding="utf-8"
        )

    # Phase 2: Global chunk numbering
    print("\n=== Phase 2: Global chunk numbering ===", flush=True)
    all_chunks = []
    order_global = 0
    for page_seq in sorted(all_page_results.keys()):
        page_result = all_page_results[page_seq]
        actual_num = page_result.get("actual_num", PAGE_MAP.get(page_seq, page_seq))
        for chunk in sorted(page_result.get("chunks", []), key=lambda c: c.get("order_in_page", 0)):
            order_global += 1
            chunk["chunk_id"] = f"c{order_global:04d}"
            chunk["order_global"] = order_global
            chunk["actual_num"] = actual_num
            all_chunks.append(chunk)
    # Link each chunk to its neighbours in global order.
    last_index = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last_index else None
    print(f" Total chunks: {len(all_chunks)}", flush=True)

    # Phase 3: Crop all images
    print("\n=== Phase 3: Cropping images ===", flush=True)
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f" Found {len(image_chunks)} image chunks", flush=True)
    for chunk in image_chunks:
        crop_image(
            chunk["page"],
            chunk.get("actual_num", PAGE_MAP.get(chunk["page"], chunk["page"])),
            chunk["chunk_id"],
            _chunk_bbox(chunk)
        )

    # Phase 4: Analyze images in parallel batches of BATCH_SIZE
    print("\n=== Phase 4: Image analysis ===", flush=True)
    chunk_lookup = {c["chunk_id"]: c for c in all_chunks}
    for batch_start in range(0, len(image_chunks), BATCH_SIZE):
        batch = image_chunks[batch_start:batch_start + BATCH_SIZE]
        print(f" Image batch {batch_start+1}-{batch_start+len(batch)}...", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
            futures = {}
            for chunk in batch:
                chunk_id = chunk["chunk_id"]
                img_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
                futures[executor.submit(analyze_image, chunk_id, img_path)] = chunk_id
            for future in concurrent.futures.as_completed(futures):
                chunk_id = futures[future]
                img_meta = future.result()
                chunk = chunk_lookup.get(chunk_id)
                if chunk and isinstance(img_meta, dict):
                    # Merge only the known image fields, and only non-null
                    # values, so model output cannot clobber structural keys.
                    chunk.update({
                        k: img_meta[k]
                        for k in _IMAGE_META_KEYS
                        if img_meta.get(k) is not None
                    })

    # Phase 5: Table stitching check
    print("\n=== Phase 5: Table stitching ===", flush=True)
    tables_stitched = 0
    table_markers = [c for c in all_chunks if c.get("type") == "table_marker"]
    print(f" Found {len(table_markers)} table markers (no cross-page stitching needed)", flush=True)

    # Phase 6: Write chunk files
    print("\n=== Phase 6: Writing chunk files ===", flush=True)
    for chunk in all_chunks:
        write_chunk_file(chunk)
    print(f" Wrote {len(all_chunks)} chunk files", flush=True)

    # Phase 7: Write _index.json
    print("\n=== Phase 7: Writing _index.json ===", flush=True)
    build_at = datetime.now(timezone.utc).isoformat()
    index_chunks = []
    for chunk in all_chunks:
        # The model may return null content; slicing None raised TypeError.
        content_en = str(chunk.get("content_en") or "")
        preview = content_en[:80] + ("..." if len(content_en) > 80 else "")
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": _chunk_bbox(chunk),
            "preview": preview
        })
    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": index_chunks
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    # Phase 8: Assemble document.md
    print("\n=== Phase 8: Assembling document.md ===", flush=True)
    ufo_flagged = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_flagged = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]
    images_extracted = len(image_chunks)
    doc_md = _render_document_md(all_chunks, build_at, ufo_flagged, cryptid_flagged)
    (OUT_DIR / "document.md").write_text(doc_md, encoding="utf-8")
    doc_md_bytes = len(doc_md.encode("utf-8"))

    # Cleanup intermediate state
    if state_path.exists():
        state_path.unlink()

    wall_seconds = int(time.time() - t_start)
    print(f"\n=== DONE ===", flush=True)
    final_line = f"pages_done={TOTAL_PAGES}, chunks_total={len(all_chunks)}, images_extracted={images_extracted}, tables_stitched={tables_stitched}, ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, wall_seconds={wall_seconds}"
    print(final_line, flush=True)
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={images_extracted} tables={tables_stitched} ufo={len(ufo_flagged)} cryptid={len(cryptid_flagged)} doc_md_bytes={doc_md_bytes}", flush=True)


if __name__ == "__main__":
    main()