583 lines
23 KiB
Python
583 lines
23 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Rebuilds doc-65-hs1-834228961-62-hq-83894-section-7 into the raw/ layout.
|
|
Uses claude CLI (OAuth via Max plan) to process each page PNG via vision.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import base64
|
|
import time
|
|
import subprocess
|
|
import concurrent.futures
|
|
import threading
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Identity of the document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-7"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 7"

# Inputs: page PNGs and the matching per-page OCR text files.
PNG_DIR = Path("/Users/guto/ufo/processing/png").joinpath(DOC_ID)
OCR_DIR = Path("/Users/guto/ufo/processing/ocr").joinpath(DOC_ID)

# Outputs: everything is written below OUT_DIR.
OUT_DIR = Path("/Users/guto/ufo/raw").joinpath(DOC_ID)
CHUNKS_DIR = OUT_DIR.joinpath("chunks")
IMAGES_DIR = OUT_DIR.joinpath("images")
TABLES_DIR = OUT_DIR.joinpath("tables")

# The output tree is created as soon as the module is loaded.
for _out_subdir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    _out_subdir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Shared by every safe_print() call so concurrent callers take turns printing.
_print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print while holding the module-wide print lock.

    Accepts exactly the arguments of the built-in ``print``. Output is
    flushed by default; a caller may still pass ``flush=`` explicitly
    (previously that raised ``TypeError`` because ``flush`` was supplied
    twice).
    """
    kwargs.setdefault("flush", True)
    with _print_lock:
        print(*args, **kwargs)
|
|
|
|
|
|
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder for a declassified UAP/UFO document reconstruction system.
|
|
|
|
STEP 1: Use the Read tool to view this page image:
|
|
{page_png_path}
|
|
|
|
STEP 2: Analyze the page carefully. The page is from document: {doc_title}
|
|
Doc ID: {doc_id}
|
|
Page number (1-indexed in document): {page_number}
|
|
Total pages: {total_pages}
|
|
|
|
OCR text (may be empty):
|
|
{page_ocr_text}
|
|
|
|
STEP 3: Return a JSON object with ALL content from the page split into chunks.
|
|
|
|
Return ONLY this JSON structure (no markdown fences, no commentary):
|
|
{{
|
|
"page_number": {page_number},
|
|
"chunks": [
|
|
{{
|
|
"order_in_page": 1,
|
|
"type": "<type_enum>",
|
|
"content_en": "English content or description",
|
|
"content_pt_br": "Conteúdo em português brasileiro",
|
|
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
|
|
"classification": null,
|
|
"formatting": [],
|
|
"cross_page_hint": "self_contained",
|
|
"ocr_confidence": 0.90,
|
|
"ocr_source_lines": [],
|
|
"redaction_code": null,
|
|
"redaction_inferred_content_type": null,
|
|
"image_type": null,
|
|
"ufo_anomaly_detected": false,
|
|
"ufo_anomaly_type": null,
|
|
"ufo_anomaly_rationale": null,
|
|
"cryptid_anomaly_detected": false,
|
|
"cryptid_anomaly_type": null,
|
|
"cryptid_anomaly_rationale": null,
|
|
"image_description_en": null,
|
|
"image_description_pt_br": null,
|
|
"extracted_text": null
|
|
}}
|
|
]
|
|
}}
|
|
|
|
Chunk type enum (use ONLY these):
|
|
- letterhead: agency/org header at top
|
|
- classification_banner: TOP SECRET/SECRET/CONFIDENTIAL/UNCLASSIFIED banners
|
|
- date_line: date of document
|
|
- to_from_line: TO:/FROM:/VIA: address lines
|
|
- subject_line: RE:/SUBJECT: lines
|
|
- paragraph: body text paragraph
|
|
- section_header: bold/underlined section title
|
|
- list_item: numbered or bulleted item
|
|
- redaction_block: blacked-out or whited-out region
|
|
- signature_block: signature/name/title at bottom
|
|
- image: photograph, diagram, sketch, stamp, seal
|
|
- table_marker: table content
|
|
- page_number: page number indicator
|
|
- footnote: footnote or endnote
|
|
- handwriting: handwritten annotation
|
|
- form_field: form label+value pairs
|
|
- blank: empty/whitespace page or region
|
|
|
|
Rules:
|
|
1. bbox values are NORMALIZED [0..1] (x=left, y=top, w=width, h=height)
|
|
2. Every visible region must be a chunk
|
|
3. For redaction_block: estimate redacted content type in redaction_inferred_content_type
|
|
4. For image chunks: provide detailed image_description_en AND image_description_pt_br
|
|
5. classification: extract from banners (e.g. "TOP SECRET") or null
|
|
6. formatting: array from: ["bold","italic","underline","all_caps","centered","right_aligned"]
|
|
7. cross_page_hint: "self_contained"|"continues_to_next"|"continues_from_prev"|"continues_both"
|
|
8. If blank page: one chunk of type "blank"
|
|
9. content_en: verbatim text (EN) or description; content_pt_br: PT-BR translation
|
|
10. ufo_anomaly_detected: true ONLY if page shows unidentified aerial phenomenon evidence
|
|
11. Output ONLY valid JSON, nothing else
|
|
"""
|
|
|
|
IMAGE_ANALYST_PROMPT = """You are an image analyst for declassified UAP/UFO document reconstruction.
|
|
|
|
STEP 1: Use the Read tool to view this cropped image:
|
|
{image_path}
|
|
|
|
STEP 2: Analyze the image carefully.
|
|
|
|
STEP 3: Return ONLY this JSON (no fences, no commentary):
|
|
{{
|
|
"image_description_en": "Detailed description in English",
|
|
"image_description_pt_br": "Descrição detalhada em português brasileiro",
|
|
"image_type": "<type>",
|
|
"extracted_text": "Any text visible in image verbatim, or null",
|
|
"ufo_anomaly_detected": false,
|
|
"ufo_anomaly_type": null,
|
|
"ufo_anomaly_rationale": null,
|
|
"cryptid_anomaly_detected": false,
|
|
"cryptid_anomaly_type": null,
|
|
"cryptid_anomaly_rationale": null
|
|
}}
|
|
|
|
image_type enum: photograph|diagram|sketch|map|chart|seal|stamp|signature|redacted_region|form|other
|
|
ufo_anomaly_detected: true ONLY if image shows craft/object/phenomenon that appears to be UAP
|
|
cryptid_anomaly_detected: true ONLY if image shows anomalous/non-human entity
|
|
Return ONLY valid JSON.
|
|
"""
|
|
|
|
|
|
def extract_json(text: str) -> dict:
    """Extract the first JSON object embedded in claude CLI output.

    Decoding starts at the first ``{`` in ``text`` and is done by the JSON
    parser itself, so braces inside string values (for example
    ``{"a": "}"}``) no longer cut the object short. Anything before the
    first ``{`` (such as an opening Markdown fence) and anything after the
    end of the object (such as a closing fence) is ignored.

    Args:
        text: Raw text that is expected to contain one JSON object.

    Returns:
        The decoded object.

    Raises:
        ValueError: if ``text`` contains no ``{``, or if the text starting
            at the first ``{`` is not valid JSON (``json.JSONDecodeError``,
            which is a ``ValueError`` subclass).
    """
    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON object found")
    # Only the first "{" is tried: falling back to a later one could return
    # an inner fragment of a malformed reply instead of reporting the error.
    obj, _end = json.JSONDecoder().raw_decode(text, start)
    return obj
|
|
|
|
|
|
def call_claude(prompt: str, png_dir: Path, timeout: int = 180) -> dict:
    """Run the ``claude`` CLI once and return the JSON object from its reply.

    Args:
        prompt: Full prompt text, passed after ``--``.
        png_dir: Directory passed to the CLI through ``--add-dir``.
        timeout: Seconds handed to ``subprocess.run`` as its timeout.

    Returns:
        The object that ``extract_json`` finds in the ``result`` field of
        the CLI's JSON output.

    Raises:
        RuntimeError: on a non-zero exit code, or when the CLI output has a
            truthy ``is_error`` field.
    """
    argv = ["claude", "-p"]
    argv += ["--model", "haiku"]
    argv += ["--output-format", "json"]
    argv += ["--max-turns", "3"]
    argv += ["--allowedTools", "Read"]
    argv += ["--add-dir", str(png_dir)]
    argv += ["--", prompt]

    completed = subprocess.run(
        argv, capture_output=True, text=True, timeout=timeout, check=False
    )
    if completed.returncode != 0:
        stderr_tail = completed.stderr[-1000:]
        raise RuntimeError(f"claude CLI failed rc={completed.returncode}: {stderr_tail}")

    envelope = json.loads(completed.stdout)
    if envelope.get("is_error"):
        detail = envelope.get("result", "")[:500]
        raise RuntimeError(f"claude error: {detail}")

    return extract_json(envelope.get("result", ""))
|
|
|
|
|
|
def get_page_list():
    """Return ``(page_number, png_path)`` tuples for every page PNG.

    Files matching ``p-*.png`` in ``PNG_DIR`` are sorted by path; the page
    number is the 1-based position in that sorted list, not a value parsed
    from the file name.
    """
    png_files = sorted(PNG_DIR.glob("p-*.png"))
    return list(enumerate(png_files, start=1))
|
|
|
|
|
|
def load_ocr(png_path: Path) -> str:
    """Return the OCR text stored for a page PNG, or ``""`` if unusable.

    The text is read from ``OCR_DIR/<png stem>.txt``. A missing file, or a
    file holding two characters or fewer after stripping, yields ``""``.
    """
    ocr_file = OCR_DIR / f"{png_path.stem}.txt"
    if not ocr_file.exists():
        return ""
    stripped = ocr_file.read_text(encoding="utf-8").strip()
    if len(stripped) <= 2:
        return ""
    return stripped
|
|
|
|
|
|
def process_page(page_number: int, png_path: Path, total_pages: int) -> dict:
    """Rebuild one page into chunks through the claude CLI.

    The page prompt is sent up to three times, sleeping 1 s and then 2 s
    between failed attempts. Any exception counts as a failed attempt. If
    all attempts fail, a single ``blank`` placeholder chunk covering the
    whole page is returned instead of raising.

    Args:
        page_number: 1-based page number, used in the prompt and in logs.
        png_path: Path of the page image; its parent is given to the CLI.
        total_pages: Page count of the document, used in the prompt.

    Returns:
        The dict returned by ``call_claude``, or the placeholder page dict.
    """
    ocr_text = load_ocr(png_path)
    template_fields = {
        "doc_title": DOC_TITLE,
        "doc_id": DOC_ID,
        "page_number": page_number,
        "total_pages": total_pages,
        "page_png_path": str(png_path),
        "page_ocr_text": ocr_text if ocr_text else "(no OCR available)",
    }
    prompt = PAGE_REBUILDER_PROMPT.format(**template_fields)

    max_attempts = 3
    for attempt_no in range(1, max_attempts + 1):
        try:
            page_result = call_claude(prompt, png_path.parent, timeout=180)
            chunk_total = len(page_result.get("chunks", []))
            safe_print(f" [OK] p{page_number:03d}: {chunk_total} chunks")
            return page_result
        except Exception as exc:
            safe_print(f" [ERR] p{page_number:03d} attempt {attempt_no}: {str(exc)[:200]}")
            if attempt_no < max_attempts:
                # Exponential back-off: 1 s after the first failure, 2 s after the second.
                time.sleep(2 ** (attempt_no - 1))

    # Every attempt failed: stand in one placeholder chunk for the page.
    placeholder_chunk = {
        "order_in_page": 1,
        "type": "blank",
        "content_en": f"[Page {page_number} — processing error]",
        "content_pt_br": f"[Página {page_number} — erro de processamento]",
        "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
        "classification": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "ocr_confidence": 0.0,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
        "image_description_en": None,
        "image_description_pt_br": None,
        "extracted_text": None,
    }
    return {"page_number": page_number, "chunks": [placeholder_chunk]}
|
|
|
|
|
|
def global_number_chunks(all_page_results: dict) -> list:
    """Flatten per-page chunks and assign document-wide IDs and links.

    Pages are visited in ascending key order and, within a page, chunks are
    ordered by ``order_in_page``. Each chunk dict is mutated in place and
    gains ``page``, ``chunk_id`` (``c0001``, ``c0002``, ...),
    ``order_global``, ``prev_chunk`` and ``next_chunk``.

    Malformed page data is tolerated rather than raising: a page whose
    ``chunks`` value is missing or ``None`` contributes nothing, entries
    that are not dicts are skipped, and a missing, ``None`` or non-numeric
    ``order_in_page`` sorts as 0 (the sort is stable, so such chunks keep
    their relative order).

    Args:
        all_page_results: Mapping of page number to that page's result dict.

    Returns:
        The flat list of chunk dicts in global order.
    """

    def _order_key(chunk: dict) -> float:
        # Coerce to float so ints, floats and numeric strings compare safely.
        try:
            return float(chunk.get("order_in_page", 0))
        except (TypeError, ValueError):
            return 0.0

    chunks_flat = []
    for page_num in sorted(all_page_results):
        raw_chunks = all_page_results[page_num].get("chunks") or []
        page_chunks = [c for c in raw_chunks if isinstance(c, dict)]
        for chunk in sorted(page_chunks, key=_order_key):
            chunk["page"] = page_num
            chunks_flat.append(chunk)

    total = len(chunks_flat)
    for position, chunk in enumerate(chunks_flat, start=1):
        chunk["chunk_id"] = f"c{position:04d}"
        chunk["order_global"] = position
        chunk["prev_chunk"] = f"c{position - 1:04d}" if position > 1 else None
        chunk["next_chunk"] = f"c{position + 1:04d}" if position < total else None

    return chunks_flat
|
|
|
|
|
|
def crop_image(chunk: dict, png_path: Path):
    """Crop a chunk's bounding box out of its page PNG and save it.

    The bbox is read as fractions of the page size (``x``, ``y``, ``w``,
    ``h``; missing keys default to the full page) and padded by 0.005 of
    the page on every side before being clamped to the image bounds. The
    crop is written to ``IMAGES_DIR/IMG-<chunk_id>.png``.

    The page image is opened in a ``with`` block so its file handle is
    released even if cropping or saving fails, and a ``None`` or malformed
    ``bbox`` is reported through the same warning path as any other crop
    failure instead of escaping the function.

    Args:
        chunk: Chunk dict; must contain ``chunk_id``.
        png_path: Path of the page image to crop from.

    Returns:
        The output path on success, ``None`` if anything went wrong.
    """
    from PIL import Image

    chunk_id = chunk["chunk_id"]
    out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        bbox = chunk.get("bbox") or {}
        x = bbox.get("x", 0)
        y = bbox.get("y", 0)
        w = bbox.get("w", 1)
        h = bbox.get("h", 1)

        with Image.open(png_path) as im:
            W, H = im.size
            pad = 0.005
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                # Degenerate box: fall back to a small 10 px patch at its origin.
                right = min(W, left + 10)
                bottom = min(H, top + 10)
            im.crop((left, top, right, bottom)).save(str(out_path))
        return out_path
    except Exception as e:
        safe_print(f" [WARN] Crop failed {chunk_id}: {e}")
        return None
|
|
|
|
|
|
def analyze_image(chunk: dict, png_path: Path) -> dict:
    """Crop an image chunk and enrich it with a dedicated image analysis.

    The chunk is mutated in place and also returned. If the crop cannot be
    produced the chunk is returned unchanged. Otherwise the analysis prompt
    is tried up to twice (1 s pause in between); on success the analysis
    fields present in the reply overwrite the chunk's and ``related_image``
    is set to the crop's file name. If both attempts fail the chunk is
    returned without those updates.
    """
    cropped_path = crop_image(chunk, png_path)
    if not (cropped_path and cropped_path.exists()):
        return chunk

    analysis_keys = (
        "image_description_en",
        "image_description_pt_br",
        "image_type",
        "extracted_text",
        "ufo_anomaly_detected",
        "ufo_anomaly_type",
        "ufo_anomaly_rationale",
        "cryptid_anomaly_detected",
        "cryptid_anomaly_type",
        "cryptid_anomaly_rationale",
    )
    prompt = IMAGE_ANALYST_PROMPT.format(image_path=str(cropped_path))

    max_attempts = 2
    for attempt_no in range(1, max_attempts + 1):
        try:
            analysis = call_claude(prompt, cropped_path.parent, timeout=120)
            # Copy over only the fields the reply actually contains.
            chunk.update({k: analysis[k] for k in analysis_keys if k in analysis})
            chunk["related_image"] = f"IMG-{chunk['chunk_id']}.png"
            safe_print(f" [IMG] {chunk['chunk_id']}: analyzed")
            return chunk
        except Exception as exc:
            safe_print(f" [WARN] Image analysis {chunk['chunk_id']} attempt {attempt_no}: {str(exc)[:150]}")
            if attempt_no < max_attempts:
                time.sleep(1)
    return chunk
|
|
|
|
|
|
def write_chunk_file(chunk: dict, page_png_map: dict):
    """Write one chunk to ``CHUNKS_DIR/<chunk_id>.md``.

    The file consists of a ``---``-delimited front-matter block with the
    chunk's metadata, followed by the English and PT-BR content lines.

    Args:
        chunk: Chunk dict; must contain ``chunk_id``.
        page_png_map: Mapping of page number to page PNG path, used to fill
            ``source_png`` (``unknown`` when the page has no entry).
    """
    chunk_id = chunk["chunk_id"]
    page = chunk.get("page", 0)
    bbox = chunk.get("bbox", {})

    page_png = page_png_map.get(page)
    if page_png:
        source_png = f"../../processing/png/{DOC_ID}/{page_png.name}"
    else:
        source_png = "unknown"

    def json_line(key, default=None):
        # "<key>: <JSON encoding of the chunk's value>"
        encoded = json.dumps(chunk.get(key, default), ensure_ascii=False)
        return f"{key}: {encoded}"

    def flag_line(key):
        # "<key>: true|false" (lower-cased str() of the value)
        return f"{key}: {str(chunk.get(key, False)).lower()}"

    bx, by = bbox.get("x", 0), bbox.get("y", 0)
    bw, bh = bbox.get("w", 1), bbox.get("h", 0)

    front_matter = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk.get('type', 'paragraph')}",
        f"page: {page}",
        f"order_in_page: {chunk.get('order_in_page', 1)}",
        f"order_global: {chunk.get('order_global', 1)}",
        f"bbox: {{x: {bx:.3f}, y: {by:.3f}, w: {bw:.3f}, h: {bh:.3f}}}",
        json_line("classification"),
        json_line("formatting", []),
        f"cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}",
        json_line("prev_chunk"),
        json_line("next_chunk"),
        json_line("related_image"),
        json_line("related_table"),
        f"ocr_confidence: {chunk.get('ocr_confidence', 0.85)}",
        json_line("ocr_source_lines", []),
        json_line("redaction_code"),
        json_line("redaction_inferred_content_type"),
        json_line("image_type"),
        flag_line("ufo_anomaly_detected"),
        flag_line("cryptid_anomaly_detected"),
        json_line("ufo_anomaly_type"),
        json_line("ufo_anomaly_rationale"),
        json_line("cryptid_anomaly_type"),
        json_line("cryptid_anomaly_rationale"),
        json_line("image_description_en"),
        json_line("image_description_pt_br"),
        json_line("extracted_text"),
        f"source_png: {source_png}",
        "---",
    ]
    body = [
        "",
        f"**EN:** {chunk.get('content_en', '')}",
        "",
        f"**PT-BR:** {chunk.get('content_pt_br', '')}",
        "",
    ]

    target = CHUNKS_DIR / f"{chunk_id}.md"
    target.write_text("\n".join(front_matter + body), encoding="utf-8")
|
|
|
|
|
|
def write_index(chunks_flat: list, total_pages: int):
    """Write ``OUT_DIR/_index.json``: document summary plus one entry per chunk.

    Each entry carries the chunk's id, type, page, ordering, relative file
    path, bbox and the first 80 characters of its English content.
    """
    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": len(chunks_flat),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": datetime.now(timezone.utc).isoformat(),
        "chunks": [],
    }

    def to_entry(chunk):
        cid = chunk["chunk_id"]
        english = chunk.get("content_en", "") or ""
        return {
            "chunk_id": cid,
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{cid}.md",
            "bbox": chunk.get("bbox", {}),
            "preview": english[:80],
        }

    index["chunks"].extend(to_entry(chunk) for chunk in chunks_flat)

    serialized = json.dumps(index, indent=2, ensure_ascii=False)
    (OUT_DIR / "_index.json").write_text(serialized, encoding="utf-8")
|
|
|
|
|
|
def write_document_md(chunks_flat: list, total_pages: int) -> int:
    """Assemble ``OUT_DIR/document.md`` from all chunks.

    The file starts with a ``---``-delimited front-matter block (counts,
    per-type histogram, IDs of chunks flagged as UFO/cryptid anomalies,
    build info). Chunks follow in list order, with a ``## Page N`` heading
    whenever the page changes. Every chunk gets an anchor, a heading, its
    EN and PT-BR content, an embedded image plus descriptions for image
    chunks that have a ``related_image``, and a collapsible JSON dump of
    its remaining metadata.

    Args:
        chunks_flat: Chunk dicts in global order; each must have ``chunk_id``.
        total_pages: Page count recorded in the front matter.

    Returns:
        Size of the written document in UTF-8 bytes.
    """
    type_histogram = {}
    ufo_flagged = []
    cryptid_flagged = []
    for chunk in chunks_flat:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_flagged.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_flagged.append(chunk["chunk_id"])

    now_iso = datetime.now(timezone.utc).isoformat()
    lines = [
        "---",
        "schema_version: \"0.2.0\"",
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f"canonical_title: \"{DOC_TITLE}\"",
        f"total_pages: {total_pages}",
        f"total_chunks: {len(chunks_flat)}",
        f"chunk_types_histogram: {json.dumps(type_histogram, ensure_ascii=False)}",
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_flagged)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_flagged)}",
        "build_approach: \"subagents\"",
        "build_model: claude-haiku-4-5",
        f"build_at: {now_iso}",
        "---",
        "",
    ]

    current_page = None
    for chunk in chunks_flat:
        page = chunk.get("page", 1)
        if page != current_page:
            current_page = page
            lines.append(f"\n## Page {page}\n")

        chunk_id = chunk["chunk_id"]
        ctype = chunk.get("type", "paragraph")
        bbox = chunk.get("bbox", {})
        bbox_str = f"{bbox.get('x', 0):.2f}/{bbox.get('y', 0):.2f}/{bbox.get('w', 1):.2f}/{bbox.get('h', 0):.2f}"

        lines.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
        lines.append(f"<a id=\"{chunk_id}\"></a>")
        lines.append(f"### Chunk {chunk_id} — {ctype} · p{page} · bbox: {bbox_str}")
        lines.append("")
        lines.append(f"**EN:** {chunk.get('content_en', '')}")
        lines.append("")
        lines.append(f"**PT-BR:** {chunk.get('content_pt_br', '')}")
        lines.append("")

        if ctype == "image" and chunk.get("related_image"):
            # Embed the crop; document.md sits in OUT_DIR, next to images/.
            lines.append(f"![{chunk_id}](./images/{chunk['related_image']})")
            lines.append("")
            if chunk.get("image_description_en"):
                lines.append(f"**Image Description (EN):** {chunk['image_description_en']}")
                lines.append("")
            if chunk.get("image_description_pt_br"):
                lines.append(f"**Descrição da Imagem (PT-BR):** {chunk['image_description_pt_br']}")
                lines.append("")

        # Everything except the two content fields goes into the metadata dump.
        meta = {k: v for k, v in chunk.items() if k not in ("content_en", "content_pt_br")}
        lines.append("<details><summary>metadata</summary>")
        lines.append("")
        lines.append("```json")
        lines.append(json.dumps(meta, indent=2, ensure_ascii=False))
        lines.append("```")
        lines.append("")
        lines.append("</details>")
        lines.append("")
        lines.append("---")
        lines.append("")

    content = "\n".join(lines)
    (OUT_DIR / "document.md").write_text(content, encoding="utf-8")
    return len(content.encode("utf-8"))
|
|
|
|
|
|
def main():
    """Run the whole rebuild for ``DOC_ID`` and print summary statistics.

    Steps, in order: list the page PNGs; rebuild every page through
    ``process_page`` in batches of five concurrent workers; number all
    chunks globally; run ``analyze_image`` on every ``image`` chunk, again
    five at a time; write the per-chunk files, ``_index.json`` and
    ``document.md``; print two summary lines.
    """
    start_time = time.time()
    pages = get_page_list()
    total_pages = len(pages)
    page_png_map = {pnum: ppath for pnum, ppath in pages}
    safe_print(f"Processing {total_pages} pages for {DOC_ID}")

    # Process pages in batches of 5; each batch gets its own thread pool and
    # must finish completely before the next batch starts.
    batch_size = 5
    all_page_results = {}
    batches = [pages[i:i+batch_size] for i in range(0, len(pages), batch_size)]

    for batch_idx, batch in enumerate(batches):
        page_nums = [p[0] for p in batch]
        safe_print(f"Batch {batch_idx+1}/{len(batches)}: pages {page_nums}")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            # Map each future back to its page number for result bookkeeping.
            futures = {
                executor.submit(process_page, pnum, ppath, total_pages): pnum
                for pnum, ppath in batch
            }
            for future in concurrent.futures.as_completed(futures):
                pnum = futures[future]
                try:
                    result = future.result()
                    all_page_results[pnum] = result
                except Exception as e:
                    # Backstop: process_page already catches Exception and
                    # returns its own placeholder, so this only fires if
                    # something escapes that handling.
                    safe_print(f" [FATAL] Page {pnum}: {e}")
                    all_page_results[pnum] = {
                        "page_number": pnum,
                        "chunks": [{
                            "order_in_page": 1,
                            "type": "blank",
                            "content_en": f"[Page {pnum} — fatal error]",
                            "content_pt_br": f"[Página {pnum} — erro fatal]",
                            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
                            "classification": None, "formatting": [],
                            "cross_page_hint": "self_contained",
                            "ocr_confidence": 0.0, "ocr_source_lines": [],
                            "redaction_code": None,
                            "redaction_inferred_content_type": None,
                            "image_type": None, "ufo_anomaly_detected": False,
                            "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
                            "cryptid_anomaly_detected": False,
                            "cryptid_anomaly_type": None,
                            "cryptid_anomaly_rationale": None,
                            "image_description_en": None,
                            "image_description_pt_br": None,
                            "extracted_text": None
                        }]
                    }

    safe_print(f"\nAll pages processed. Numbering chunks globally...")
    chunks_flat = global_number_chunks(all_page_results)
    total_chunks = len(chunks_flat)
    safe_print(f"Total chunks: {total_chunks}")

    # Analyze image chunks in batches of 5. analyze_image mutates the chunk
    # dict it is given, so the futures' return values are not collected.
    image_chunks = [c for c in chunks_flat if c.get("type") == "image"]
    safe_print(f"\nProcessing {len(image_chunks)} image chunks...")
    img_batches = [image_chunks[i:i+5] for i in range(0, len(image_chunks), 5)]
    for img_batch_idx, img_batch in enumerate(img_batches):
        safe_print(f"Image batch {img_batch_idx+1}/{len(img_batches)}")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = {}
            for chunk in img_batch:
                page = chunk.get("page", 1)
                png_path = page_png_map.get(page)
                # Chunks whose page has no known PNG are skipped silently.
                if png_path:
                    f = executor.submit(analyze_image, chunk, png_path)
                    futures[f] = chunk["chunk_id"]
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    cid = futures[future]
                    safe_print(f" [ERR] Image {cid}: {e}")

    safe_print(f"\nWriting chunk files...")
    for chunk in chunks_flat:
        write_chunk_file(chunk, page_png_map)

    safe_print(f"Writing _index.json...")
    write_index(chunks_flat, total_pages)

    safe_print(f"Writing document.md...")
    doc_bytes = write_document_md(chunks_flat, total_pages)

    # images_count is the number of chunks typed "image", whether or not a
    # crop was actually written for them.
    images_count = len([c for c in chunks_flat if c.get("type") == "image"])
    ufo_count = len([c for c in chunks_flat if c.get("ufo_anomaly_detected")])
    cryptid_count = len([c for c in chunks_flat if c.get("cryptid_anomaly_detected")])
    wall_seconds = int(time.time() - start_time)

    # The table figures below are hard-coded to 0; nothing in this function
    # computes them.
    safe_print(f"\nSTATS pages={total_pages} chunks={total_chunks} images={images_count} tables=0 ufo={ufo_count} cryptid={cryptid_count} doc_md_bytes={doc_bytes}")
    safe_print(f"pages_done={total_pages}, chunks_total={total_chunks}, images_extracted={images_count}, tables_stitched=0, ufo_anomalies={ufo_count}, cryptid_anomalies={cryptid_count}, wall_seconds={wall_seconds}")
|
|
|
|
|
|
# Run the rebuild only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|