#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_suba_final.py
Full rebuild of doc-65-hs1-834228961-62-hq-83894-sub-a
89 pages (p-000 to p-063, p-100 to p-124 PNGs)
Uses Anthropic claude-haiku-4-5 for vision processing.
"""
|
|
|
|
import base64
import json
import os
import re
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path

import anthropic
from PIL import Image as PILImage
|
|
|
|
# Identity of the document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-sub-a"
DOC_TITLE = "FBI HQ 62-HQ-83894 Sub A — Flying Saucers / UAP Investigation File"

# Every input and output location hangs off one root and is keyed by DOC_ID.
_UFO_ROOT = Path("/Users/guto/ufo")
PNG_DIR = _UFO_ROOT / "processing" / "png" / DOC_ID   # page images (input)
OCR_DIR = _UFO_ROOT / "processing" / "ocr" / DOC_ID   # per-page OCR text (input)
OUT_DIR = _UFO_ROOT / "raw" / DOC_ID                  # rebuilt document (output)
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

# main() submits work BATCH_SIZE items at a time to a pool of MAX_WORKERS threads.
BATCH_SIZE = 4
MAX_WORKERS = 4
|
|
|
|
# Serializes console output so lines printed from worker threads do not interleave.
_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print while holding the module lock, flushing by default.

    Accepts exactly the arguments of the built-in ``print``. ``flush``
    defaults to ``True`` so progress lines appear immediately; a caller may
    still pass ``flush=`` explicitly (previously that raised ``TypeError``
    because the keyword was supplied twice).
    """
    kwargs.setdefault("flush", True)
    with _lock:
        print(*args, **kwargs)
|
|
|
|
# Make sure every output directory exists before any phase writes into it.
for out_dir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# One API client, used by process_page() and crop_and_analyze_image().
client = anthropic.Anthropic()

# Page images sorted by path; a file's position in this list is what
# process_page() turns into its 1-based page number.
png_files = sorted(PNG_DIR.glob("p-*.png"))
TOTAL_PAGES = len(png_files)
safe_print(f"Found {TOTAL_PAGES} PNG pages")
|
|
|
|
|
|
def load_image_b64(path: Path) -> str:
    """Read the file at *path* and return its bytes as a base64 text string."""
    raw_bytes = Path(path).read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
|
|
|
|
|
def load_ocr(png_name: str, max_chars: int = 3000) -> str:
    """Return the OCR text that accompanies a page image, truncated for the prompt.

    The OCR file is looked up in ``OCR_DIR`` under the PNG's name with its
    suffix swapped for ``.txt``.

    Args:
        png_name: File name of the page image, e.g. ``"p-000.png"``.
        max_chars: Maximum number of characters of OCR text to return.

    Returns:
        The stripped OCR text (at most *max_chars* characters), or a fixed
        placeholder string when the file is missing, unreadable, or empty.
    """
    # Swap only the suffix; str.replace would also rewrite ".png" mid-name.
    ocr_path = OCR_DIR / Path(png_name).with_suffix(".txt")
    try:
        # errors="replace": a stray undecodable byte should not abort the
        # page, since the OCR text is only a hint alongside the image.
        txt = ocr_path.read_text(encoding="utf-8", errors="replace").strip()
    except FileNotFoundError:
        txt = ""
    except OSError as e:
        safe_print(f" OCR read error {ocr_path.name}: {e}")
        txt = ""
    if txt:
        return txt[:max_chars]
    return "(no OCR text available — use vision only)"
|
|
|
|
|
|
def extract_json(text: str) -> dict:
    """Extract and parse the first JSON object embedded in *text*.

    Leading/trailing Markdown code fences are stripped, then the text is
    scanned from the first ``{`` to its matching ``}``. The scan tracks
    string literals and backslash escapes, so braces that appear inside JSON
    string values do not affect the nesting depth.

    Args:
        text: Raw model output, possibly fenced or surrounded by prose.

    Returns:
        The parsed JSON object.

    Raises:
        ValueError: "No JSON found" if *text* has no ``{``; "Unclosed JSON"
            if the first object is never closed.
        json.JSONDecodeError: If the delimited span is not valid JSON.
    """
    text = text.strip()
    text = re.sub(r"^```(?:json)?\s*", "", text)
    text = re.sub(r"\s*```\s*$", "", text)

    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON found")

    depth = 0
    in_string = False
    escaped = False
    for i in range(start, len(text)):
        ch = text[i]
        if in_string:
            # Inside a string only an unescaped quote is significant.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return json.loads(text[start:i + 1])
    raise ValueError("Unclosed JSON")
|
|
|
|
|
|
# Prompt template for the per-page vision request in process_page().
# It is rendered with str.format(), so literal JSON braces are doubled and
# these placeholders are required: doc_title, doc_id, page_number,
# total_pages, png_filename, ocr_text.
PAGE_PROMPT = """You are a page-rebuilder for a declassified UAP/UFO FBI document archive.

Document: {doc_title}
Doc ID: {doc_id}
Page: {page_number} of {total_pages}
PNG: {png_filename}

OCR text:
---
{ocr_text}
---

Analyze this page image carefully. Extract ALL content as ordered semantic chunks.

Return ONLY valid JSON (no markdown, no fences):

{{
  "page_number": {page_number},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "cover|letterhead|classification_banner|subject_line|salutation|body_paragraph|signature_block|date_line|reference_line|redaction_block|table_marker|image|caption|footer|header|list_item|handwritten_note|stamp|page_number|section_heading|blank",
      "content_en": "verbatim text or description in English",
      "content_pt_br": "tradução em português brasileiro",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }}
  ]
}}

Rules:
1. Every visible region = its own chunk. Do not skip content.
2. For images: set image_type to photograph|diagram|map|sketch|stamp|chart|handwriting|newspaper_clipping|other
3. For redaction_block: set redaction_code to visible FOIA code if shown.
4. For classification banners/stamps: set classification field to exact text.
5. ufo_anomaly_detected=true if content has UAP/UFO sighting details, craft descriptions, anomalous phenomena.
6. cross_page_hint: "self_contained"|"continues_to_next"|"continues_from_prev"
7. bbox: normalized 0.0-1.0 (x=left, y=top, w=width, h=height).
8. formatting: ["bold","italic","all_caps","underline","strikethrough"]
9. Newspaper clippings = type "image", image_type="newspaper_clipping", ufo_anomaly_detected=true if about UFOs.
10. Return ONLY the JSON object, nothing else."""
|
|
|
|
|
|
def fallback_chunk(page_number: int, reason: str) -> dict:
    """Build a stand-in page result for a page that could not be processed.

    The result holds one full-page ``body_paragraph`` chunk whose English and
    Portuguese text record the failure, with *reason* cut to 80 characters.
    All other chunk fields carry neutral values (no classification, zero OCR
    confidence, no anomaly flags).
    """
    short_reason = reason[:80]
    placeholder = {
        "order_in_page": 1,
        "type": "body_paragraph",
        "content_en": f"[Page {page_number} - processing failed: {short_reason}]",
        "content_pt_br": f"[Página {page_number} - falha no processamento: {short_reason}]",
        "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
        "classification": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "ocr_confidence": 0.0,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
    }
    return {"page_number": page_number, "chunks": [placeholder]}
|
|
|
|
|
|
def process_page(page_idx: int, png_path: Path) -> dict:
    """Send one page image to the vision model and return its parsed chunks.

    Args:
        page_idx: Zero-based position of the page; the reported page number
            is ``page_idx + 1``.
        png_path: Path of the page PNG.

    Returns:
        A dict with ``page_number``, ``chunks``, ``png_path`` and
        ``png_filename``. When the inputs cannot be loaded, or all three
        request attempts fail, the result is a ``fallback_chunk`` page that
        still carries ``png_path`` and ``png_filename``, because ``main()``
        reads both keys from every page result.
    """
    page_number = page_idx + 1
    png_filename = png_path.name
    max_attempts = 3

    def _fallback(reason: str) -> dict:
        # fallback_chunk() knows nothing about the source image; add the keys
        # the later phases index on.
        page = fallback_chunk(page_number, reason)
        page["png_path"] = str(png_path)
        page["png_filename"] = png_filename
        return page

    try:
        ocr_text = load_ocr(png_filename)
        img_b64 = load_image_b64(png_path)
    except (OSError, UnicodeError) as e:
        # An unreadable input should cost one page, not the whole run.
        safe_print(f" p{page_number} input error: {e}")
        return _fallback(str(e))

    prompt = PAGE_PROMPT.format(
        doc_title=DOC_TITLE, doc_id=DOC_ID,
        page_number=page_number, total_pages=TOTAL_PAGES,
        png_filename=png_filename, ocr_text=ocr_text,
    )

    for attempt in range(max_attempts):
        is_last = attempt == max_attempts - 1
        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img_b64}},
                        {"type": "text", "text": prompt},
                    ],
                }],
            )
            raw = response.content[0].text
            data = extract_json(raw)
            # A reply without a chunk list would silently drop the page from
            # the output; treat it as a failed attempt instead.
            if not isinstance(data.get("chunks"), list):
                raise ValueError("response has no 'chunks' list")
            data["page_number"] = page_number
            data["png_path"] = str(png_path)
            data["png_filename"] = png_filename
            safe_print(f" p{page_number} ({png_filename}): {len(data['chunks'])} chunks")
            return data
        except json.JSONDecodeError as e:
            # Malformed JSON: retry immediately, no back-off.
            safe_print(f" p{page_number} JSON error attempt {attempt+1}: {e}")
            if is_last:
                return _fallback(f"JSON parse: {e}")
        except Exception as e:
            safe_print(f" p{page_number} error attempt {attempt+1}: {e}")
            if is_last:
                return _fallback(str(e))
            time.sleep(2 ** attempt)  # back off 1s, then 2s

    # Not reachable with max_attempts >= 1; kept so the function never returns None.
    return _fallback("retries exhausted")
|
|
|
|
|
|
# Prompt for the per-image analysis request in crop_and_analyze_image().
# It is sent verbatim, NOT through str.format(), so the JSON template uses
# single braces; doubled braces would reach the model literally.
IMAGE_ANALYST_PROMPT = """You are an image analyst for a declassified FBI UAP/UFO document archive.

Analyze this cropped image from FBI file 62-HQ-83894 about Flying Saucers/UAP.

Return ONLY valid JSON (no markdown, no fences):

{
  "image_description_en": "detailed English description",
  "image_description_pt_br": "descrição detalhada em português brasileiro",
  "image_type": "photograph|diagram|map|sketch|stamp|chart|handwriting|newspaper_clipping|other",
  "extracted_text": "visible text verbatim or null",
  "ufo_anomaly_detected": false,
  "ufo_anomaly_type": null,
  "ufo_anomaly_rationale": null,
  "cryptid_anomaly_detected": false,
  "cryptid_anomaly_type": null,
  "cryptid_anomaly_rationale": null
}"""
|
|
|
|
|
|
def crop_and_analyze_image(chunk: dict) -> dict:
    """Crop an image chunk out of its page PNG and enrich it with model analysis.

    The chunk's normalized ``bbox`` (padded by 0.5% on each side and clamped
    to the page) is cropped from ``chunk["png_path"]`` and saved as
    ``IMAGES_DIR / "IMG-<chunk_id>.png"``. The crop is then sent to the
    model, and the description/anomaly fields of the reply are copied onto
    *chunk*.

    Args:
        chunk: Chunk dict with ``chunk_id`` and ``png_path`` and, optionally,
            ``bbox``. Mutated in place.

    Returns:
        The same *chunk* object. Crop or analysis failures are logged and
        leave the chunk without the analysis fields; they never raise.
    """
    chunk_id = chunk["chunk_id"]
    png_path = chunk["png_path"]
    # A null/absent bbox means "whole page".
    bbox = chunk.get("bbox") or {}

    def _frac(key: str, default: float) -> float:
        # Null coordinates fall back to the default; anything non-numeric
        # raises and is reported by the crop handler below.
        value = bbox.get(key)
        return default if value is None else float(value)

    # Crop
    try:
        x, y = _frac("x", 0.0), _frac("y", 0.0)
        w, h = _frac("w", 1.0), _frac("h", 1.0)
        pad = 0.005
        out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
        # Context manager so the page file handle is released per call.
        with PILImage.open(png_path) as im:
            W, H = im.size
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                raise ValueError(f"empty crop box {(left, top, right, bottom)}")
            im.crop((left, top, right, bottom)).save(str(out_path))
        img_b64 = load_image_b64(out_path)
    except Exception as e:
        safe_print(f" Crop error {chunk_id}: {e}")
        return chunk

    # Analyze
    analysis_keys = (
        "image_description_en", "image_description_pt_br", "image_type", "extracted_text",
        "ufo_anomaly_detected", "ufo_anomaly_type", "ufo_anomaly_rationale",
        "cryptid_anomaly_detected", "cryptid_anomaly_type", "cryptid_anomaly_rationale",
    )
    try:
        response = client.messages.create(
            model="claude-haiku-4-5",
            max_tokens=1024,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img_b64}},
                    {"type": "text", "text": IMAGE_ANALYST_PROMPT},
                ],
            }],
        )
        raw = response.content[0].text
        analysis = extract_json(raw)
        # Only keys present in the reply overwrite the page-level values.
        for key in analysis_keys:
            if key in analysis:
                chunk[key] = analysis[key]
        safe_print(f" image analyzed: {chunk_id} ufo={chunk.get('ufo_anomaly_detected',False)}")
    except Exception as e:
        # Best effort: the crop is already on disk; keep the chunk as is.
        safe_print(f" Image analysis error {chunk_id}: {e}")

    return chunk
|
|
|
|
|
|
def yaml_val(v):
    """Render *v* as an inline scalar or flow sequence for YAML front matter.

    ``None`` becomes ``null``, booleans become ``true``/``false``, lists
    become ``[a, b]`` with each item JSON-encoded, and everything else is
    JSON-encoded. Non-ASCII characters are written as is, not escaped.
    """
    # json.dumps with its default separators already emits exactly these
    # forms for None, bool, list and scalars, so one call covers every case.
    return json.dumps(v, ensure_ascii=False)
|
|
|
|
|
|
def write_chunk_file(chunk: dict):
    """Write one chunk as ``CHUNKS_DIR/<chunk_id>.md``.

    The file has YAML front matter followed by the English and Portuguese
    content. The chunk must provide ``chunk_id``, ``page`` and
    ``order_global``; every other field falls back to a default when absent.
    Model-supplied values that are null or malformed (bbox coordinates, OCR
    confidence, content) are normalized rather than raising, so one bad
    chunk cannot abort the write phase.
    """
    chunk_id = chunk["chunk_id"]
    chunk_type = chunk.get("type", "body_paragraph")
    related_image = f"IMG-{chunk_id}.png" if chunk_type == "image" else None
    png_filename = chunk.get("png_filename", "")

    raw_bbox = chunk.get("bbox")
    bbox = raw_bbox if isinstance(raw_bbox, dict) else {}

    def _coord(key: str, default: float) -> float:
        # "{:.2f}" cannot format None or text; fall back to the full-page value.
        try:
            return float(bbox.get(key, default))
        except (TypeError, ValueError):
            return default

    bx, by = _coord("x", 0.0), _coord("y", 0.0)
    bw, bh = _coord("w", 1.0), _coord("h", 1.0)

    # A JSON null would otherwise be written as the literal text "None".
    content_en = chunk.get("content_en") or ""
    content_pt_br = chunk.get("content_pt_br") or ""

    fm = f"""---
chunk_id: {chunk_id}
type: {chunk_type}
page: {chunk['page']}
order_in_page: {chunk.get('order_in_page', 1)}
order_global: {chunk['order_global']}
bbox: {{x: {bx:.2f}, y: {by:.2f}, w: {bw:.2f}, h: {bh:.2f}}}
classification: {yaml_val(chunk.get('classification'))}
formatting: {yaml_val(chunk.get('formatting', []))}
cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}
prev_chunk: {yaml_val(chunk.get('prev_chunk'))}
next_chunk: {yaml_val(chunk.get('next_chunk'))}
related_image: {yaml_val(related_image)}
related_table: {yaml_val(chunk.get('related_table'))}
ocr_confidence: {yaml_val(chunk.get('ocr_confidence', 0.8))}
ocr_source_lines: {yaml_val(chunk.get('ocr_source_lines', []))}
redaction_code: {yaml_val(chunk.get('redaction_code'))}
redaction_inferred_content_type: {yaml_val(chunk.get('redaction_inferred_content_type'))}
image_type: {yaml_val(chunk.get('image_type'))}
ufo_anomaly_detected: {yaml_val(chunk.get('ufo_anomaly_detected', False))}
cryptid_anomaly_detected: {yaml_val(chunk.get('cryptid_anomaly_detected', False))}
ufo_anomaly_type: {yaml_val(chunk.get('ufo_anomaly_type'))}
ufo_anomaly_rationale: {yaml_val(chunk.get('ufo_anomaly_rationale'))}
cryptid_anomaly_type: {yaml_val(chunk.get('cryptid_anomaly_type'))}
cryptid_anomaly_rationale: {yaml_val(chunk.get('cryptid_anomaly_rationale'))}
image_description_en: {yaml_val(chunk.get('image_description_en'))}
image_description_pt_br: {yaml_val(chunk.get('image_description_pt_br'))}
extracted_text: {yaml_val(chunk.get('extracted_text'))}
source_png: ../../processing/png/{DOC_ID}/{png_filename}
---

**EN:** {content_en}

**PT-BR:** {content_pt_br}
"""
    (CHUNKS_DIR / f"{chunk_id}.md").write_text(fm, encoding="utf-8")
|
|
|
|
|
|
def _bbox_floats(bbox) -> tuple:
    """Return (x, y, w, h) as floats, using full-page defaults for bad values."""
    box = bbox if isinstance(bbox, dict) else {}
    values = []
    for key, default in (("x", 0.0), ("y", 0.0), ("w", 1.0), ("h", 1.0)):
        try:
            values.append(float(box.get(key, default)))
        except (TypeError, ValueError):
            values.append(default)
    return tuple(values)


def _process_all_pages() -> list:
    """Phase 1: run process_page over every PNG in batches; return results by page number."""
    all_pages = []
    page_items = list(enumerate(png_files))  # (idx, path)

    for batch_start in range(0, TOTAL_PAGES, BATCH_SIZE):
        batch = page_items[batch_start: batch_start + BATCH_SIZE]
        safe_print(f"Batch pages {[b[0]+1 for b in batch]}...")
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
            futs = [ex.submit(process_page, idx, pth) for idx, pth in batch]
            for fut in as_completed(futs):
                all_pages.append(fut.result())

    # Futures complete out of order; restore page order.
    all_pages.sort(key=lambda p: p["page_number"])
    return all_pages


def _number_chunks(all_pages: list) -> list:
    """Phase 2: flatten pages into one list with global ids and prev/next links."""
    global_chunks = []
    for page_data in all_pages:
        page_number = page_data["page_number"]
        # Fallback pages may not carry the image keys; derive them from the
        # page's position in png_files instead of raising KeyError.
        png_path = page_data.get("png_path") or str(png_files[page_number - 1])
        png_filename = page_data.get("png_filename") or Path(png_path).name

        page_chunks = sorted(page_data.get("chunks", []), key=lambda c: c.get("order_in_page", 1))
        for chunk in page_chunks:
            order_global = len(global_chunks) + 1
            chunk["chunk_id"] = f"c{order_global:04d}"
            chunk["page"] = page_number
            chunk["png_path"] = png_path
            chunk["png_filename"] = png_filename
            chunk["order_global"] = order_global
            global_chunks.append(chunk)

    last = len(global_chunks) - 1
    for i, chunk in enumerate(global_chunks):
        chunk["prev_chunk"] = global_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = global_chunks[i + 1]["chunk_id"] if i < last else None
    return global_chunks


def _analyze_images(image_chunks: list) -> None:
    """Phase 3: crop and analyze image chunks in batches (chunks are mutated in place)."""
    for batch_start in range(0, len(image_chunks), BATCH_SIZE):
        batch = image_chunks[batch_start: batch_start + BATCH_SIZE]
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
            futs = [ex.submit(crop_and_analyze_image, chunk) for chunk in batch]
            for fut in as_completed(futs):
                fut.result()  # side-effects already applied; surfaces exceptions


def _write_index(global_chunks: list, build_at: str) -> None:
    """Phase 5: write OUT_DIR/_index.json with one summary entry per chunk."""
    index_chunks = [
        {
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "body_paragraph"),
            "page": chunk["page"],
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk["order_global"],
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": chunk.get("bbox", {"x": 0, "y": 0, "w": 1, "h": 1}),
            # content_en may be null in the model output; never slice None.
            "preview": str(chunk.get("content_en") or "")[:80].replace("\n", " "),
        }
        for chunk in global_chunks
    ]
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(global_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index_data, ensure_ascii=False, indent=2), encoding="utf-8"
    )


def _write_document(global_chunks: list, build_at: str, ufo_flagged: list, cryptid_flagged: list) -> int:
    """Phase 6: assemble OUT_DIR/document.md and return its size in UTF-8 bytes."""
    type_hist = {}
    for chunk in global_chunks:
        t = chunk.get("type", "body_paragraph")
        type_hist[t] = type_hist.get(t, 0) + 1
    hist_yaml = "\n".join(f" {k}: {v}" for k, v in sorted(type_hist.items()))

    doc_parts = [f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {TOTAL_PAGES}
total_chunks: {len(global_chunks)}
chunk_types_histogram:
{hist_yaml}
multi_page_tables: []
ufo_anomalies_flagged: {json.dumps(ufo_flagged, ensure_ascii=False)}
cryptid_anomalies_flagged: {json.dumps(cryptid_flagged, ensure_ascii=False)}
build_approach: "subagents"
build_model: "claude-haiku-4-5"
build_at: "{build_at}"
---
"""]

    chunks_by_page = {}
    for chunk in global_chunks:
        chunks_by_page.setdefault(chunk["page"], []).append(chunk)

    for page_num in sorted(chunks_by_page):
        doc_parts.append(f"\n## Page {page_num}\n\n")
        for chunk in chunks_by_page[page_num]:
            cid = chunk["chunk_id"]
            ctype = chunk.get("type", "body_paragraph")
            bx, by, bw, bh = _bbox_floats(chunk.get("bbox"))
            bbox_str = f"{bx:.2f}/{by:.2f}/{bw:.2f}/{bh:.2f}"

            doc_parts.append(f"<!-- chunk:{cid} src:./chunks/{cid}.md -->\n")
            doc_parts.append(f'<a id="{cid}"></a>\n')
            doc_parts.append(f"### Chunk {cid} — {ctype} · p{page_num} · bbox: {bbox_str}\n\n")

            if ctype == "image":
                # NOTE(review): this branch previously appended only "\n\n"
                # from an f-string with no placeholders, so the crop was never
                # embedded. Link the file crop_and_analyze_image() saves,
                # when it exists — confirm the intended alt text.
                if (IMAGES_DIR / f"IMG-{cid}.png").exists():
                    doc_parts.append(f"![{cid}](./images/IMG-{cid}.png)")
                doc_parts.append("\n\n")
                d_en = chunk.get("image_description_en")
                d_pt = chunk.get("image_description_pt_br")
                if d_en:
                    doc_parts.append(f"**Image Description (EN):** {d_en}\n\n")
                if d_pt:
                    doc_parts.append(f"**Descrição da Imagem (PT-BR):** {d_pt}\n\n")

            doc_parts.append(f"**EN:** {chunk.get('content_en') or ''}\n\n")
            doc_parts.append(f"**PT-BR:** {chunk.get('content_pt_br') or ''}\n\n")

            meta = {
                "chunk_id": cid, "type": ctype,
                "page": page_num, "order_in_page": chunk.get("order_in_page", 1),
                "order_global": chunk["order_global"],
                "bbox": chunk.get("bbox", {}),
                "classification": chunk.get("classification"),
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": chunk.get("cross_page_hint", "self_contained"),
                "prev_chunk": chunk.get("prev_chunk"),
                "next_chunk": chunk.get("next_chunk"),
                "ocr_confidence": chunk.get("ocr_confidence", 0.8),
                "redaction_code": chunk.get("redaction_code"),
                "image_type": chunk.get("image_type"),
                "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
                "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
                "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
                "source_png": f"../../processing/png/{DOC_ID}/{chunk.get('png_filename','')}",
            }
            doc_parts.append("<details><summary>metadata</summary>\n\n```json\n")
            doc_parts.append(json.dumps(meta, ensure_ascii=False, indent=2))
            doc_parts.append("\n```\n\n</details>\n\n---\n\n")

    doc_content = "".join(doc_parts)
    (OUT_DIR / "document.md").write_text(doc_content, encoding="utf-8")
    return len(doc_content.encode("utf-8"))


def main():
    """Rebuild the document end to end.

    Phases: (1) model pass over every page, (2) global chunk numbering and
    prev/next linking, (3) crop + analysis of image chunks, (4) one Markdown
    file per chunk, (5) ``_index.json``, (6) the assembled ``document.md``.
    Two summary lines are printed at the end.
    """
    start = time.time()
    safe_print(f"=== Rebuild {DOC_ID} ===")
    safe_print(f"Total pages: {TOTAL_PAGES}")

    # Phase 1: Process pages in batches
    all_pages = _process_all_pages()

    # Phase 2: Global chunk numbering
    global_chunks = _number_chunks(all_pages)
    total_chunks = len(global_chunks)
    safe_print(f"Total chunks: {total_chunks}")

    # Phase 3: Crop & analyze images
    image_chunks = [c for c in global_chunks if c.get("type") == "image"]
    safe_print(f"Image chunks: {len(image_chunks)}")
    _analyze_images(image_chunks)

    # Phase 4: Write chunk files
    safe_print("Writing chunk files...")
    for chunk in global_chunks:
        write_chunk_file(chunk)

    # Phase 5: Write _index.json
    safe_print("Writing _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    _write_index(global_chunks, build_at)

    # Phase 6: Assemble document.md
    safe_print("Assembling document.md...")
    ufo_flagged = [c["chunk_id"] for c in global_chunks if c.get("ufo_anomaly_detected")]
    cryptid_flagged = [c["chunk_id"] for c in global_chunks if c.get("cryptid_anomaly_detected")]
    doc_md_bytes = _write_document(global_chunks, build_at, ufo_flagged, cryptid_flagged)

    elapsed = int(time.time() - start)
    safe_print(f"\nSTATS pages={TOTAL_PAGES} chunks={total_chunks} images={len(image_chunks)} tables=0 ufo={len(ufo_flagged)} cryptid={len(cryptid_flagged)} doc_md_bytes={doc_md_bytes}")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={len(image_chunks)}, tables_stitched=0, ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, wall_seconds={elapsed}")


if __name__ == "__main__":
    main()
|