#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild doc-255-413270-ufo-s-and-defense-what-should-we-prepare-for
into structured chunk files, _index.json, and document.md.

Uses `claude -p --model haiku` subprocess calls (OAuth via Max plan).
"""
|
|
|
|
import json
import os
import random
import re
import subprocess
import sys
import threading
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
|
|
|
|
# --- Document identity ------------------------------------------------------
DOC_ID = "doc-255-413270-ufo-s-and-defense-what-should-we-prepare-for"
DOC_TITLE = "UFO's and Defense: What Should We Prepare For?"

# --- Filesystem layout ------------------------------------------------------
# PNG_DIR holds the per-page scans (p-001.png, ...) that are read as input;
# everything produced by this script is written below OUT_DIR.
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR.joinpath("chunks")
IMAGES_DIR = OUT_DIR.joinpath("images")
TABLES_DIR = OUT_DIR.joinpath("tables")

# --- Run parameters ---------------------------------------------------------
MODEL = "haiku"    # value passed to `claude --model`
TOTAL_PAGES = 93   # pages are numbered 1..TOTAL_PAGES
WORKERS = 4        # thread-pool size and page batch size
TIMEOUT = 240  # seconds per page call
|
|
|
|
# Serializes console output so lines printed by worker threads do not interleave.
_print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print under the module-wide lock, flushing by default.

    Accepts exactly the arguments of the built-in ``print``. ``flush`` defaults
    to ``True`` but may be overridden by the caller; forcing it unconditionally
    would raise ``TypeError`` (duplicate keyword) whenever a caller passed
    ``flush=`` explicitly.
    """
    kwargs.setdefault("flush", True)
    with _print_lock:
        print(*args, **kwargs)
|
|
|
|
|
|
# Prompt template for the per-page extraction call. It is filled in with
# str.format(doc_title=, page_number=, total_pages=, doc_id=, png_path=), so
# literal JSON braces are doubled ({{ }}).
PAGE_REBUILDER_PROMPT = '''You are a page-rebuilder agent. Analyze the scanned document page image and extract all content into structured chunks.

Document: {doc_title}
Page: {page_number} of {total_pages}
Doc ID: {doc_id}

STEP 1: Use the Read tool to view this PNG image:
{png_path}

STEP 2: Analyze every element on the page carefully.

STEP 3: Return ONE JSON object only (no markdown fence, no commentary):
{{
  "page_number": {page_number},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "paragraph",
      "content_en": "verbatim English text from page",
      "content_pt_br": "tradução em português brasileiro",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null,
      "image_description_en": null,
      "image_description_pt_br": null,
      "extracted_text": null
    }}
  ]
}}

CHUNK TYPES (use exactly one):
- letterhead: document header/letterhead
- classification_marking: classification marking (TOP SECRET, CUI, etc.)
- date_line: date field
- address_block: TO:/FROM:/distribution fields
- heading: section/chapter/subject heading
- paragraph: body text paragraph
- numbered_item: numbered list item
- bulleted_item: bullet list item
- table_marker: table content
- image: photograph, diagram, chart, sketch, map, graph
- caption: figure/image caption
- footer: page footer
- page_number: standalone page number
- signature: signature/signatory block
- redaction: blacked-out/redacted area
- stamp: official stamp or seal
- handwriting: handwritten annotation
- blank_area: empty area
- form_field: form field with label and value
- unknown: unidentifiable element

RULES:
1. Split content into logical chunks (one concept per chunk). A typical page has 3-15 chunks.
2. For image chunks: describe what you see in content_en and set image_type.
3. image_type options: photograph, diagram, chart, sketch, map, graph, seal, signature_image, other
4. bbox: normalized coordinates 0.0-1.0 (x=left, y=top, w=width, h=height)
5. content_en: verbatim text if text chunk; visual description if image chunk
6. content_pt_br: Brazilian Portuguese translation (NOT European Portuguese)
7. classification: null or the marking text (e.g. "CUI", "UNCLASSIFIED")
8. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
9. formatting: array from ["bold", "italic", "all_caps", "underline"]
10. If page is completely blank: ONE chunk of type "blank_area"
11. Preserve French text verbatim (document may contain French)
12. For redaction chunks: set redaction_code if visible (e.g. "(b)(1)")
13. ufo_anomaly_detected: true ONLY for image chunks showing actual UAP/anomalous phenomena

Output ONLY the JSON object. No preamble. No fence. No commentary.'''
|
|
|
|
|
|
# Prompt template for the per-image analysis call. It is filled in with
# str.format(image_path=), so literal JSON braces are doubled ({{ }}).
IMAGE_ANALYST_PROMPT = '''You are an image analyst for a UAP/UFO declassified document.

STEP 1: Use the Read tool to view this cropped image:
{image_path}

STEP 2: Analyze it carefully.

STEP 3: Return ONE JSON object only (no markdown fence):
{{
  "image_description_en": "detailed English description",
  "image_description_pt_br": "descrição detalhada em português brasileiro",
  "image_type": "photograph",
  "extracted_text": null,
  "ufo_anomaly_detected": false,
  "ufo_anomaly_type": null,
  "ufo_anomaly_rationale": null,
  "cryptid_anomaly_detected": false,
  "cryptid_anomaly_type": null,
  "cryptid_anomaly_rationale": null
}}

image_type options: photograph, diagram, chart, sketch, map, graph, seal, signature_image, other
Set ufo_anomaly_detected=true only if the image shows an actual UAP/UFO or anomalous aerial phenomenon.
Set cryptid_anomaly_detected=true only if the image shows a cryptid or unknown creature.
extracted_text: any text visible inside the image (verbatim), or null.

Output ONLY the JSON object.'''
|
|
|
|
|
|
def extract_json(text: str) -> dict:
    """Extract the first JSON object embedded in claude CLI output.

    A leading/trailing markdown code fence is stripped first; any text before
    the first ``{`` and after the end of the object is ignored.

    The object is decoded with ``json.JSONDecoder.raw_decode`` instead of by
    counting braces: a brace counter does not know about string context, so a
    ``{`` or ``}`` inside a string value (common in verbatim page text) would
    cut the object short or never close it.

    Args:
        text: Raw response text.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the text contains no ``{`` at all.
        json.JSONDecodeError: (a ``ValueError`` subclass) if the text starting
            at the first ``{`` is not a complete, valid JSON object.
    """
    text = text.strip()
    # Strip markdown fences if present
    if text.startswith("```"):
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```\s*$", "", text.rstrip())

    start = text.find("{")
    if start == -1:
        raise ValueError(f"No JSON found in: {text[:200]}")

    # raw_decode parses exactly one JSON value starting at `start` and reports
    # where it ended, so trailing commentary after the object is tolerated.
    obj, _end = json.JSONDecoder().raw_decode(text, start)
    return obj
|
|
|
|
|
|
def call_claude(prompt: str, allowed_tools: str = "Read", timeout: int = TIMEOUT) -> str:
    """Run one `claude -p` CLI call and return the text of its result.

    The CLI is invoked with ``--output-format json``; this function reads the
    ``is_error`` and ``result`` keys of the JSON object printed on stdout.

    Args:
        prompt: Prompt text, passed as the final positional argument.
        allowed_tools: Value for ``--allowedTools``.
        timeout: Seconds to wait for the subprocess.

    Returns:
        The ``result`` string, or ``""`` when it is missing or null.

    Raises:
        RuntimeError: Non-zero exit code, a stdout payload that is not a JSON
            object, or a payload flagged ``is_error``.
        subprocess.TimeoutExpired: The call exceeded ``timeout``.
        json.JSONDecodeError: stdout was not valid JSON.
    """
    cmd = [
        "claude", "-p",
        "--model", MODEL,
        "--output-format", "json",
        "--max-turns", "5",
        "--allowedTools", allowed_tools,
        "--add-dir", str(PNG_DIR),
        "--add-dir", str(IMAGES_DIR),
        "--",
        prompt,
    ]
    # The child inherits the current environment by default, so no explicit
    # copy of os.environ is needed.
    res = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=timeout,
        check=False,
    )
    if res.returncode != 0:
        raise RuntimeError(f"claude CLI rc={res.returncode}: {res.stderr[-1000:]}")

    cli = json.loads(res.stdout)
    if not isinstance(cli, dict):
        raise RuntimeError(f"claude CLI returned unexpected JSON payload: {res.stdout[:200]}")

    # A null/missing result must not reach the slice below or the caller as
    # None: both would surface as a TypeError/AttributeError instead of one of
    # the documented exceptions.
    result = cli.get("result") or ""
    if cli.get("is_error"):
        raise RuntimeError(f"claude error: {str(result)[:500]}")

    return result
|
|
|
|
|
|
def _placeholder_page(page_num: int, chunk_type: str, content_en: str, content_pt_br: str) -> dict:
    """Build a one-chunk page result used when a page yields no real content."""
    return {
        "page_number": page_num,
        "chunks": [{
            "order_in_page": 1,
            "type": chunk_type,
            "content_en": content_en,
            "content_pt_br": content_pt_br,
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None, "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0, "ocr_source_lines": [],
            "redaction_code": None, "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
            "image_description_en": None, "image_description_pt_br": None, "extracted_text": None
        }]
    }


def process_page(page_num: int) -> dict:
    """Process a single page using claude -p CLI.

    Looks for ``p-NNN.png`` in PNG_DIR, sends the page-rebuilder prompt and
    parses the JSON reply. Up to three attempts are made, sleeping between
    attempts (10s * attempt after a timeout, 5s * attempt plus jitter after any
    other handled error).

    Args:
        page_num: 1-based page number.

    Returns:
        A dict with ``page_number`` and a non-empty ``chunks`` list of dicts.
        When the PNG is missing, or every attempt fails, a single placeholder
        chunk (``blank_area`` / ``unknown`` respectively) is returned instead.
    """
    png_path = PNG_DIR / f"p-{page_num:03d}.png"

    if not png_path.exists():
        safe_print(f" Page {page_num}: PNG missing — placeholder")
        return _placeholder_page(
            page_num,
            "blank_area",
            f"[Page {page_num} — PNG not available]",
            f"[Página {page_num} — PNG não disponível]",
        )

    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_num,
        total_pages=TOTAL_PAGES,
        doc_id=DOC_ID,
        png_path=str(png_path),
    )

    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            result_text = call_claude(prompt, allowed_tools="Read")
            data = extract_json(result_text)
            chunks = data.get("chunks")
            if not isinstance(chunks, list):
                raise ValueError("No chunks in response")
            # Later stages call .get() on every chunk, so entries that are not
            # JSON objects are dropped here rather than crashing the whole run.
            chunks = [c for c in chunks if isinstance(c, dict)]
            if not chunks:
                raise ValueError("No chunks in response")
            data["chunks"] = chunks
            # Trust our own page number over whatever the model echoed back.
            data["page_number"] = page_num
        except subprocess.TimeoutExpired:
            safe_print(f" Page {page_num}: timeout attempt {attempt}/{max_retries}")
            delay = 10 * attempt
        except (RuntimeError, ValueError, json.JSONDecodeError) as e:
            safe_print(f" Page {page_num}: error attempt {attempt}/{max_retries}: {str(e)[:100]}")
            delay = 5 * attempt + random.uniform(0, 3)
        else:
            safe_print(f" Page {page_num}: {len(chunks)} chunks")
            return data

        # No point sleeping after the final attempt.
        if attempt < max_retries:
            time.sleep(delay)

    # Return fallback
    safe_print(f" Page {page_num}: FALLBACK after {max_retries} attempts")
    return _placeholder_page(
        page_num,
        "unknown",
        f"[Page {page_num} — content extraction failed after {max_retries} attempts]",
        f"[Página {page_num} — extração de conteúdo falhou após {max_retries} tentativas]",
    )
|
|
|
|
|
|
def crop_image(chunk_id: str, png_path: Path, bbox: dict) -> object:
    """Crop image region from page PNG.

    The crop is written to ``IMAGES_DIR / "IMG-<chunk_id>.png"``.

    Args:
        chunk_id: Chunk identifier, used in the output file name and in logs.
        png_path: Path of the full-page PNG.
        bbox: Mapping with normalized ``x``, ``y``, ``w``, ``h`` (fractions of
            the page width/height). Missing keys default to 0, 0, 1 and 0.1;
            ``w`` and ``h`` are floored at 0.01 and a 0.005 margin is added on
            every side before clamping to the page.

    Returns:
        The path of the saved crop, or ``None`` if the box is degenerate or
        anything fails while reading, cropping or saving (the error is logged,
        not raised).
    """
    from PIL import Image

    cropped_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        x = max(0.0, float(bbox.get("x", 0)))
        y = max(0.0, float(bbox.get("y", 0)))
        w = max(0.01, float(bbox.get("w", 1)))
        h = max(0.01, float(bbox.get("h", 0.1)))
        pad = 0.005

        # Use the image as a context manager so the underlying file handle is
        # released for every page instead of waiting for garbage collection.
        with Image.open(png_path) as im:
            W, H = im.size
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))

            if right <= left or bottom <= top:
                safe_print(f" Crop {chunk_id}: degenerate bbox {bbox}")
                return None

            # Crop and save while the source image is still open.
            im.crop((left, top, right, bottom)).save(str(cropped_path))

        safe_print(f" Cropped {chunk_id}: {left},{top},{right},{bottom} from {W}x{H}")
        return cropped_path
    except Exception as e:
        # Best effort: a failed crop must not abort the rest of the run.
        safe_print(f" Crop {chunk_id}: error: {e}")
        return None
|
|
|
|
|
|
def analyze_image(chunk_id: str, cropped_path: Path) -> dict:
    """Ask the claude CLI to describe one cropped image.

    Args:
        chunk_id: Chunk identifier, used only in log lines.
        cropped_path: Path of the cropped PNG.

    Returns:
        The JSON object parsed from the reply, or ``{}`` when there is no
        file to analyze or both attempts fail.
    """
    # Nothing to analyze: no path given, or the crop never made it to disk.
    if not (cropped_path and cropped_path.exists()):
        return {}

    request = IMAGE_ANALYST_PROMPT.format(image_path=str(cropped_path))

    attempts_allowed = 2
    attempt = 1
    while attempt <= attempts_allowed:
        try:
            reply = call_claude(request, allowed_tools="Read", timeout=120)
            analysis = extract_json(reply)
            safe_print(f" Image {chunk_id}: analyzed (ufo={analysis.get('ufo_anomaly_detected', False)})")
            return analysis
        except Exception as e:
            safe_print(f" Image {chunk_id}: error attempt {attempt}: {str(e)[:80]}")
            # Pause only when another attempt is still to come.
            if attempt < attempts_allowed:
                time.sleep(5)
        attempt += 1

    return {}
|
|
|
|
|
|
def write_chunk_file(chunk: dict) -> None:
    """Write chunk .md file.

    The file ``CHUNKS_DIR / "<chunk_id>.md"`` gets a ``---`` delimited
    front-matter block with the chunk metadata, followed by the English and
    Brazilian Portuguese content.

    Args:
        chunk: Chunk dict. ``chunk_id`` is required; ``page`` is expected to be
            an int (it is formatted as ``p-NNN``). Every other key is optional
            and falls back to the default visible in the template below.

    Raises:
        KeyError: If ``chunk_id`` is missing.
    """
    chunk_id = chunk["chunk_id"]
    chunk_path = CHUNKS_DIR / f"{chunk_id}.md"

    bbox = chunk.get("bbox") or {"x": 0, "y": 0, "w": 1, "h": 0.1}
    page_num = chunk.get("page", 1)
    source_png = f"../../processing/png/{DOC_ID}/p-{page_num:03d}.png"

    # Only a *missing* confidence falls back to 0.85. `value or 0.85` would
    # also rewrite a genuine 0.0 -- which is what every placeholder chunk
    # carries -- into 0.85.
    ocr_confidence = chunk.get("ocr_confidence")
    ocr_confidence = 0.85 if ocr_confidence is None else float(ocr_confidence)

    # A null content field is written as empty text, not the string "None".
    content_en = chunk.get("content_en") or ""
    content_pt_br = chunk.get("content_pt_br") or ""

    content = f"""---
chunk_id: {chunk_id}
type: {chunk.get("type", "paragraph")}
page: {page_num}
order_in_page: {chunk.get("order_in_page", 1)}
order_global: {chunk.get("order_global", 1)}
bbox: {{x: {float(bbox.get('x') or 0):.2f}, y: {float(bbox.get('y') or 0):.2f}, w: {float(bbox.get('w') or 1):.2f}, h: {float(bbox.get('h') or 0.1):.2f}}}
classification: {json.dumps(chunk.get("classification"))}
formatting: {json.dumps(chunk.get("formatting", []))}
cross_page_hint: {chunk.get("cross_page_hint", "self_contained")}
prev_chunk: {json.dumps(chunk.get("prev_chunk"))}
next_chunk: {json.dumps(chunk.get("next_chunk"))}
related_image: {json.dumps(chunk.get("related_image"))}
related_table: null
ocr_confidence: {ocr_confidence}
ocr_source_lines: {json.dumps(chunk.get("ocr_source_lines", []))}
redaction_code: {json.dumps(chunk.get("redaction_code"))}
redaction_inferred_content_type: {json.dumps(chunk.get("redaction_inferred_content_type"))}
image_type: {json.dumps(chunk.get("image_type"))}
ufo_anomaly_detected: {str(bool(chunk.get("ufo_anomaly_detected", False))).lower()}
cryptid_anomaly_detected: {str(bool(chunk.get("cryptid_anomaly_detected", False))).lower()}
ufo_anomaly_type: {json.dumps(chunk.get("ufo_anomaly_type"))}
ufo_anomaly_rationale: {json.dumps(chunk.get("ufo_anomaly_rationale"))}
cryptid_anomaly_type: {json.dumps(chunk.get("cryptid_anomaly_type"))}
cryptid_anomaly_rationale: {json.dumps(chunk.get("cryptid_anomaly_rationale"))}
image_description_en: {json.dumps(chunk.get("image_description_en"))}
image_description_pt_br: {json.dumps(chunk.get("image_description_pt_br"))}
extracted_text: {json.dumps(chunk.get("extracted_text"))}
source_png: {source_png}
---

**EN:** {content_en}

**PT-BR:** {content_pt_br}
"""
    chunk_path.write_text(content, encoding="utf-8")
|
|
|
|
|
|
# Keys copied from an image-analysis reply onto the corresponding image chunk.
_IMAGE_ANALYSIS_FIELDS = (
    "image_description_en", "image_description_pt_br", "image_type",
    "extracted_text", "ufo_anomaly_detected", "ufo_anomaly_type",
    "ufo_anomaly_rationale", "cryptid_anomaly_detected",
    "cryptid_anomaly_type", "cryptid_anomaly_rationale",
)

_DEFAULT_BBOX = {"x": 0, "y": 0, "w": 1, "h": 0.1}


def _critical_failure_page(page_num: int) -> dict:
    """Build the placeholder page stored when process_page itself raises."""
    return {
        "page_number": page_num,
        "chunks": [{
            "order_in_page": 1,
            "type": "unknown",
            "content_en": f"[Page {page_num} — critical failure]",
            "content_pt_br": f"[Página {page_num} — falha crítica]",
            "bbox": {"x": 0, "y": 0, "w": 1, "h": 1},
            "classification": None, "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0, "ocr_source_lines": [],
            "redaction_code": None, "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
            "image_description_en": None, "image_description_pt_br": None, "extracted_text": None
        }]
    }


def _chunk_type(chunk: dict) -> str:
    """Return the chunk's type as a string, defaulting to "paragraph"."""
    return str(chunk.get("type") or "paragraph")


def _order_key(chunk: dict) -> float:
    """Sort key for order_in_page; unusable values sort after valid ones."""
    try:
        return float(chunk.get("order_in_page", 1))
    except (TypeError, ValueError):
        return float("inf")


def _process_pages(page_numbers: list) -> dict:
    """Run process_page over all pages in batches of WORKERS; map page -> result."""
    all_page_data = {}
    total_batches = (len(page_numbers) + WORKERS - 1) // WORKERS
    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
        for batch_start in range(0, len(page_numbers), WORKERS):
            batch = page_numbers[batch_start:batch_start + WORKERS]
            batch_num = batch_start // WORKERS + 1
            safe_print(f"\nBatch {batch_num}/{total_batches}: pages {batch}")

            futures = {executor.submit(process_page, p): p for p in batch}
            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    all_page_data[page_num] = future.result()
                except Exception as e:
                    # Keep the page slot filled so numbering stays contiguous.
                    safe_print(f" Page {page_num}: CRITICAL FAILURE: {e}")
                    all_page_data[page_num] = _critical_failure_page(page_num)
    return all_page_data


def _assign_chunk_ids(all_page_data: dict) -> list:
    """Flatten pages into one ordered chunk list with ids and prev/next links."""
    all_chunks = []
    for page_num in sorted(all_page_data):
        page_chunks = [
            c for c in (all_page_data[page_num].get("chunks") or [])
            if isinstance(c, dict)
        ]
        page_chunks.sort(key=_order_key)
        for chunk in page_chunks:
            order_global = len(all_chunks) + 1
            chunk["chunk_id"] = f"c{order_global:04d}"
            chunk["page"] = page_num
            chunk["order_global"] = order_global
            all_chunks.append(chunk)

    last = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last else None
    return all_chunks


def _crop_and_analyze_images(all_chunks: list) -> int:
    """Crop every image chunk, analyze the crops, merge results; return crop count."""
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"\nCropping {len(image_chunks)} images...")

    # chunk_id -> path of the crop, only for crops that were actually written.
    cropped = {}
    for chunk in image_chunks:
        png_path = PNG_DIR / f"p-{chunk['page']:03d}.png"
        if not png_path.exists():
            continue
        crop_path = crop_image(chunk["chunk_id"], png_path, chunk.get("bbox", {}))
        if crop_path:
            cropped[chunk["chunk_id"]] = crop_path

    print(f"\nAnalyzing {len(cropped)} cropped images...")
    image_analysis = {}
    # The pool itself caps concurrency at WORKERS, so no manual batching.
    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
        futures = {
            executor.submit(analyze_image, cid, path): cid
            for cid, path in cropped.items()
        }
        for future in as_completed(futures):
            chunk_id = futures[future]
            try:
                image_analysis[chunk_id] = future.result()
            except Exception as e:
                safe_print(f" Image analysis {chunk_id}: {e}")
                image_analysis[chunk_id] = {}

    for chunk in image_chunks:
        chunk_id = chunk["chunk_id"]
        # Only reference a crop that exists; a failed crop leaves the field
        # unset (written as null) instead of pointing at a missing file.
        if chunk_id in cropped:
            chunk["related_image"] = f"IMG-{chunk_id}.png"
        analysis = image_analysis.get(chunk_id) or {}
        for field in _IMAGE_ANALYSIS_FIELDS:
            if field in analysis:
                chunk[field] = analysis[field]

    return len(cropped)


def _write_index(all_chunks: list, now_iso: str) -> None:
    """Write OUT_DIR/_index.json with one summary entry per chunk."""
    index_chunks = [
        {
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": chunk.get("bbox") or dict(_DEFAULT_BBOX),
            # content_en may be null in a model reply; never slice None.
            "preview": str(chunk.get("content_en") or "")[:80],
        }
        for chunk in all_chunks
    ]
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": now_iso,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index_data, ensure_ascii=False, indent=2), encoding="utf-8"
    )


def _assemble_document(all_chunks: list, chunk_types: dict, ufo_anomalies: list,
                       cryptid_anomalies: list, now_iso: str) -> str:
    """Render the text of document.md: front matter, then every chunk by page."""
    parts = []

    # Frontmatter
    parts.append("---")
    parts.append('schema_version: "0.2.0"')
    parts.append("type: master_document")
    parts.append(f"doc_id: {DOC_ID}")
    # json.dumps yields the same double-quoted text and also escapes any
    # quote or backslash a title might contain.
    parts.append(f"canonical_title: {json.dumps(DOC_TITLE, ensure_ascii=False)}")
    parts.append(f"total_pages: {TOTAL_PAGES}")
    parts.append(f"total_chunks: {len(all_chunks)}")
    parts.append("chunk_types_histogram:")
    for t, count in sorted(chunk_types.items()):
        parts.append(f" {t}: {count}")
    parts.append("multi_page_tables: []")
    parts.append(f"ufo_anomalies_flagged: {json.dumps(ufo_anomalies)}")
    parts.append(f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomalies)}")
    parts.append('build_approach: "subagents"')
    parts.append("build_model: claude-haiku-4-5")
    parts.append(f"build_at: {now_iso}")
    parts.append("---")
    parts.append("")

    current_page = None
    for chunk in all_chunks:
        page = chunk.get("page", 1)
        if page != current_page:
            current_page = page
            parts.append(f"\n## Page {page}\n")

        chunk_id = chunk["chunk_id"]
        bbox = chunk.get("bbox") or _DEFAULT_BBOX
        bbox_str = (
            f"{float(bbox.get('x') or 0):.2f}/{float(bbox.get('y') or 0):.2f}/"
            f"{float(bbox.get('w') or 1):.2f}/{float(bbox.get('h') or 0.1):.2f}"
        )
        ctype = _chunk_type(chunk)

        parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
        parts.append(f'<a id="{chunk_id}"></a>')
        parts.append(f"### Chunk {chunk_id} — {ctype} · p{page} · bbox: {bbox_str}")
        parts.append("")
        parts.append(f"**EN:** {chunk.get('content_en') or ''}")
        parts.append("")
        parts.append(f"**PT-BR:** {chunk.get('content_pt_br') or ''}")
        parts.append("")

        if ctype == "image":
            img_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
            if img_path.exists():
                # NOTE(review): this line was an empty f-string, i.e. the
                # Markdown image embed was evidently lost. Reconstructed
                # relative to document.md; confirm the desired alt text.
                parts.append(f"![{chunk_id}](./images/IMG-{chunk_id}.png)")
                parts.append("")
            if chunk.get("image_description_en"):
                parts.append(f"*{chunk['image_description_en']}*")
                parts.append("")

        # Metadata block
        meta = {k: v for k, v in chunk.items() if k not in ("content_en", "content_pt_br")}
        parts.append("<details><summary>metadata</summary>")
        parts.append("")
        parts.append("```json")
        parts.append(json.dumps(meta, ensure_ascii=False, indent=2))
        parts.append("```")
        parts.append("")
        parts.append("</details>")
        parts.append("")
        parts.append("---")
        parts.append("")

    return "\n".join(parts)


def main():
    """Rebuild the document: extract pages, crop/analyze images, write outputs.

    Steps, in order: create the output directories; run process_page for pages
    1..TOTAL_PAGES; assign global chunk ids and prev/next links; crop and
    analyze image chunks; write one .md file per chunk, then _index.json and
    document.md under OUT_DIR; print a one-line summary.
    """
    start_time = time.time()

    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    print(f"Rebuilding {DOC_ID}")
    print(f"Processing {TOTAL_PAGES} pages with {WORKERS} workers...")
    print("=" * 70)

    all_page_data = _process_pages(list(range(1, TOTAL_PAGES + 1)))  # 1..93

    print("\nAll pages processed. Assigning global chunk IDs...")
    all_chunks = _assign_chunk_ids(all_page_data)
    total_chunks = len(all_chunks)
    print(f"Total chunks: {total_chunks}")

    # Number of crops actually written, reported below as images_extracted.
    images_count = _crop_and_analyze_images(all_chunks)

    # Write chunk files
    print(f"\nWriting {total_chunks} chunk files...")
    for chunk in all_chunks:
        write_chunk_file(chunk)
    print("Chunk files written.")

    # Build _index.json
    now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    _write_index(all_chunks, now_iso)
    print("_index.json written.")

    # Compute stats
    chunk_types = Counter(_chunk_type(chunk) for chunk in all_chunks)
    ufo_anomalies = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_anomalies = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]

    # Assemble document.md
    print("\nAssembling document.md...")
    document_md = _assemble_document(
        all_chunks, chunk_types, ufo_anomalies, cryptid_anomalies, now_iso
    )
    doc_path = OUT_DIR / "document.md"
    doc_path.write_text(document_md, encoding="utf-8")
    doc_md_bytes = len(document_md.encode("utf-8"))
    print(f"document.md written ({doc_md_bytes:,} bytes)")

    wall_seconds = int(time.time() - start_time)
    print(f"\n{'='*70}")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={images_count}, tables_stitched=0, ufo_anomalies={len(ufo_anomalies)}, cryptid_anomalies={len(cryptid_anomalies)}, wall_seconds={wall_seconds}")


if __name__ == "__main__":
    main()
|