660 lines
27 KiB
Python
660 lines
27 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Rebuild script for doc-65-hs1-834228961-62-hq-83894-section-2
|
|
Processes all 159 pages in parallel batches of 5, generates chunks, images, index, document.md
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import base64
|
|
import time
|
|
import concurrent.futures
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
import anthropic
|
|
from PIL import Image
|
|
|
|
# ── Config ──────────────────────────────────────────────────────────────────
# Identifier of the document being rebuilt; also the name of its sub-directory
# under each of the input/output roots below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-2"
# Human-readable title, inserted into the page prompt and document.md front matter.
DOC_TITLE = "FBI HQ-83894 Section 2 — Flying Discs Investigation (Serials 53-100)"
# Input: page images named p-NNN.png.
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
# Input: optional per-page OCR text named p-NNN.txt.
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
# Output root: receives _index.json, document.md and the sub-directories below.
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

# Shared API client used by rebuild_page() and analyze_image(). It is created
# at import time with no explicit arguments.
# NOTE(review): presumably credentials come from the environment — confirm.
client = anthropic.Anthropic()

# Chunk type vocabulary; it lists the same values, in the same order, as the
# "type" enumeration embedded in PAGE_REBUILDER_PROMPT.
# NOTE(review): not referenced by any code visible in this file — confirm
# whether it is still needed or should be used to validate model output.
CHUNK_TYPES = [
    "cover", "letterhead", "stamp", "header", "subheader", "paragraph",
    "redaction", "signature", "image", "table_marker", "footer",
    "page_number", "classification_marking", "separator", "handwriting",
    "form_field", "caption", "list_item", "annotation", "blank"
]
|
|
|
|
# Build page mapping: sequential 1..159 -> actual file number
|
|
def build_page_map(png_dir=None):
    """Map sequential page numbers (1..N) to the numbers used in PNG file names.

    Scans a directory for ``p-<number>.png`` files, sorts the numbers
    numerically and assigns them consecutive sequence numbers starting at 1.

    Args:
        png_dir: Directory to scan. Defaults to the module-level ``PNG_DIR``.

    Returns:
        ``{sequence_number: file_number}``; empty when no page image is found.
    """
    directory = PNG_DIR if png_dir is None else Path(png_dir)
    prefix = "p-"

    file_numbers = []
    for png in directory.glob("p-*.png"):
        suffix = png.stem[len(prefix):]
        # Ignore stray files such as "p-012-copy.png" instead of letting
        # int() raise and abort the script at import time.
        if suffix.isascii() and suffix.isdigit():
            file_numbers.append(int(suffix))

    file_numbers.sort()
    return {seq: num for seq, num in enumerate(file_numbers, start=1)}
|
|
|
|
# Sequence-number -> file-number mapping, computed once at import time by
# scanning PNG_DIR.
PAGE_MAP = build_page_map()
# Number of page images found; used as the page count throughout the run.
TOTAL_PAGES = len(PAGE_MAP)
|
|
|
|
def load_image_b64(path: Path) -> str:
    """Read the file at *path* and return its bytes as base64 text."""
    with open(path, "rb") as handle:
        payload = handle.read()
    encoded = base64.standard_b64encode(payload)
    return encoded.decode("utf-8")
|
|
|
|
def load_ocr(actual_num: int) -> str:
    """Return the stripped OCR text for page file *actual_num*.

    The text is read from ``OCR_DIR / "p-NNN.txt"``; undecodable bytes are
    replaced. An empty string is returned when the file does not exist or
    holds only whitespace.
    """
    candidate = OCR_DIR / f"p-{actual_num:03d}.txt"
    if not candidate.exists():
        return ""
    return candidate.read_text(encoding="utf-8", errors="replace").strip()
|
|
|
|
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder subagent. Your job is to analyze a declassified FBI document page and extract ALL content as structured chunks.
|
|
|
|
Document: {doc_title}
|
|
Page: {page_number} of {total_pages}
|
|
Actual file: p-{actual_num:03d}.png
|
|
|
|
OCR text (may be empty/poor quality):
|
|
{ocr_text}
|
|
|
|
Analyze the image carefully. Extract ALL visible content into chunks. Return a JSON object:
|
|
{{
|
|
"page_number": {page_number},
|
|
"chunks": [
|
|
{{
|
|
"order_in_page": 1,
|
|
"type": "<one of: cover|letterhead|stamp|header|subheader|paragraph|redaction|signature|image|table_marker|footer|page_number|classification_marking|separator|handwriting|form_field|caption|list_item|annotation|blank>",
|
|
"content_en": "<exact transcription or English description>",
|
|
"content_pt_br": "<Brazilian Portuguese translation/description>",
|
|
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
|
|
"classification": null,
|
|
"formatting": [],
|
|
"cross_page_hint": "self_contained",
|
|
"ocr_confidence": 0.85,
|
|
"ocr_source_lines": [],
|
|
"redaction_code": null,
|
|
"redaction_inferred_content_type": null,
|
|
"image_type": null,
|
|
"ufo_anomaly_detected": false,
|
|
"ufo_anomaly_type": null,
|
|
"ufo_anomaly_rationale": null,
|
|
"cryptid_anomaly_detected": false,
|
|
"cryptid_anomaly_type": null,
|
|
"cryptid_anomaly_rationale": null
|
|
}}
|
|
]
|
|
}}
|
|
|
|
Rules:
|
|
- bbox: x,y = top-left corner (0.0-1.0 fraction of page), w,h = width/height fractions
|
|
- classification: string like "SECRET" or null
|
|
- formatting: array of ["bold","italic","all_caps","underline","strikethrough"] as applicable
|
|
- cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
|
|
- For redaction blocks: type="redaction", include redaction_code if visible (e.g. "(b)(1)", "(b)(3)", "(b)(6)")
|
|
- For stamps: type="stamp", describe the stamp text
|
|
- For images/diagrams/photos: type="image", set image_type to "photo"|"diagram"|"sketch"|"map"|"chart"|"signature_block"
|
|
- For tables: type="table_marker"
|
|
- ufo_anomaly_detected: true only if the page contains an image/sketch/photo of an anomalous aerial phenomenon
|
|
- cryptid_anomaly_detected: true only if the page contains imagery of cryptids/unknown creatures
|
|
- content_en: transcribe verbatim when legible; describe when not (e.g., "[Redacted block]", "[Stamp: RECEIVED]")
|
|
- content_pt_br: Brazilian Portuguese equivalent
|
|
- Return ONLY valid JSON, no markdown fences, no explanation
|
|
- Do NOT skip any visible content area
|
|
- Minimum 1 chunk per page (even blank pages get type="blank")
|
|
"""
|
|
|
|
def _unfence_page_json(raw: str) -> str:
    """Strip a surrounding Markdown code fence from a model reply, if any."""
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line (it may carry a tag such as ```json).
        # A reply without any newline only has the three backticks to drop;
        # indexing split("\n", 1)[1] there would raise IndexError.
        _, newline, remainder = text.partition("\n")
        text = remainder if newline else text[3:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _failed_page_result(page_seq: int, actual_num: int) -> dict:
    """Build the single-chunk placeholder result for a page that failed."""
    return {
        "page_number": page_seq,
        "actual_num": actual_num,
        "chunks": [{
            "order_in_page": 1,
            "type": "blank",
            "page": page_seq,
            "content_en": "[Page processing failed]",
            "content_pt_br": "[Falha no processamento da página]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None
        }]
    }


def rebuild_page(page_seq: int) -> dict:
    """Extract the chunks of one page with the vision model.

    Sends the page PNG plus up to 2000 characters of OCR text to the model
    and parses the JSON reply. The request is attempted up to three times,
    sleeping 5 s and then 10 s between attempts.

    Args:
        page_seq: Sequential page number (a key of ``PAGE_MAP``).

    Returns:
        ``{"page_number", "actual_num", "chunks": [...]}``. Each chunk has
        ``order_in_page`` renumbered from 1 and ``page`` set to *page_seq*.
        If the image cannot be read or every attempt fails, a placeholder
        result holding one "blank" chunk is returned instead of raising.

    Raises:
        KeyError: If *page_seq* is not in ``PAGE_MAP``.
    """
    actual_num = PAGE_MAP[page_seq]
    png_path = PNG_DIR / f"p-{actual_num:03d}.png"
    ocr_text = load_ocr(actual_num)

    # An unreadable image cannot be fixed by retrying, and letting the error
    # escape the worker thread would abort the whole run; degrade this page
    # to the placeholder instead.
    try:
        img_b64 = load_image_b64(png_path)
    except OSError as e:
        print(f" [FAIL] page {page_seq}: {e}", flush=True)
        return _failed_page_result(page_seq, actual_num)

    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_seq,
        total_pages=TOTAL_PAGES,
        actual_num=actual_num,
        ocr_text=ocr_text[:2000] if ocr_text else "(no OCR available)"
    )

    retries = 3
    for attempt in range(retries):
        # Deliberately broad: API errors and malformed replies are both
        # handled by retrying, then by the placeholder.
        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=4096,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64
                            }
                        },
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }]
            )

            data = json.loads(_unfence_page_json(response.content[0].text))
            if not isinstance(data, dict):
                raise ValueError("model reply is not a JSON object")

            # Trust our own bookkeeping over whatever the model echoed back.
            data["page_number"] = page_seq
            data["actual_num"] = actual_num
            chunks = data.setdefault("chunks", [])
            if not isinstance(chunks, list) or not all(isinstance(ch, dict) for ch in chunks):
                raise ValueError('"chunks" is not a list of JSON objects')

            for i, ch in enumerate(chunks):
                ch["order_in_page"] = i + 1
                ch["page"] = page_seq
            print(f" [OK] page {page_seq:03d} (p-{actual_num:03d}) → {len(data['chunks'])} chunks", flush=True)
            return data
        except Exception as e:
            if attempt < retries - 1:
                wait = 2 ** attempt * 5
                print(f" [RETRY {attempt+1}] page {page_seq}: {e}, waiting {wait}s", flush=True)
                time.sleep(wait)
            else:
                print(f" [FAIL] page {page_seq}: {e}", flush=True)

    return _failed_page_result(page_seq, actual_num)
|
|
|
|
# Prompt for the cropped-image analysis request. analyze_image() sends this
# string verbatim (it is NOT passed through str.format()), so the JSON example
# uses single braces; doubled "{{ }}" escapes would reach the model literally.
IMAGE_ANALYST_PROMPT = """You are an image analyst examining a cropped region from a declassified FBI document about flying discs / UAP investigations.

Analyze this image region and return a JSON object:
{
"image_type": "<photo|diagram|sketch|map|chart|signature_block|stamp|seal|other>",
"image_description_en": "<detailed description in English>",
"image_description_pt_br": "<descrição detalhada em português brasileiro>",
"extracted_text": "<any text visible in the image, verbatim>",
"ufo_anomaly_detected": <true|false>,
"ufo_anomaly_type": "<type or null>",
"ufo_anomaly_rationale": "<rationale or null>",
"cryptid_anomaly_detected": <true|false>,
"cryptid_anomaly_type": "<type or null>",
"cryptid_anomaly_rationale": "<rationale or null>"
}

Return ONLY valid JSON, no markdown fences.
"""
|
|
|
|
def _unfence_image_json(raw: str) -> str:
    """Strip a surrounding Markdown code fence from a model reply, if any."""
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line; a reply without any newline only has
        # the three backticks to drop (split("\n", 1)[1] would raise there).
        _, newline, remainder = text.partition("\n")
        text = remainder if newline else text[3:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _image_meta_placeholder(description_en: str, description_pt_br: str) -> dict:
    """Build the neutral metadata dict used when an image cannot be analyzed."""
    return {
        "image_type": "other",
        "image_description_en": description_en,
        "image_description_pt_br": description_pt_br,
        "extracted_text": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None
    }


def analyze_image(chunk_id: str, img_path: Path) -> dict:
    """Describe a cropped image region with the vision model.

    The request is attempted up to three times, sleeping 3 s and then 6 s
    between attempts.

    Args:
        chunk_id: Identifier of the owning chunk; used only in log output.
        img_path: Path of the cropped PNG.

    Returns:
        The JSON object returned by the model, or a placeholder dict
        ("Image not available" when *img_path* does not exist, "Analysis
        failed" when the file cannot be read or every attempt fails).
        Always a dict, so callers may use ``.get`` on it.
    """
    if not img_path.exists():
        return _image_meta_placeholder("Image not available", "Imagem não disponível")

    try:
        img_b64 = load_image_b64(img_path)
    except OSError as e:
        print(f" [IMAGE FAIL] {chunk_id}: {e}", flush=True)
        return _image_meta_placeholder("Analysis failed", "Análise falhou")

    retries = 3
    for attempt in range(retries):
        # Deliberately broad: API errors and malformed replies are both
        # handled by retrying, then by the placeholder.
        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=1024,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64
                            }
                        },
                        {
                            "type": "text",
                            "text": IMAGE_ANALYST_PROMPT
                        }
                    ]
                }]
            )
            meta = json.loads(_unfence_image_json(response.content[0].text))
            # A JSON array or scalar would break callers that call .get().
            if not isinstance(meta, dict):
                raise ValueError("model reply is not a JSON object")
            return meta
        except Exception as e:
            if attempt < retries - 1:
                time.sleep(2 ** attempt * 3)
            else:
                print(f" [IMAGE FAIL] {chunk_id}: {e}", flush=True)

    return _image_meta_placeholder("Analysis failed", "Análise falhou")
|
|
|
|
def crop_image(page_seq: int, actual_num: int, chunk_id: str, bbox: dict) -> Path:
    """Crop the *bbox* region out of a page PNG and save it to the images dir.

    Args:
        page_seq: Sequential page number. Unused; kept so the signature
            stays compatible with existing callers.
        actual_num: File number of the page image (``p-NNN.png`` in PNG_DIR).
        chunk_id: Chunk identifier; the crop is saved as ``IMG-<chunk_id>.png``.
        bbox: Mapping with fractional ``x``, ``y``, ``w``, ``h`` (0.0-1.0).
            Missing, null or non-numeric entries fall back to the full-page
            defaults (x=0, y=0, w=1, h=1).

    Returns:
        The destination path. It is returned even when cropping fails (the
        failure is logged), so callers must check that the file exists.
    """
    src = PNG_DIR / f"p-{actual_num:03d}.png"
    dst = IMAGES_DIR / f"IMG-{chunk_id}.png"

    def fraction(key: str, default: float) -> float:
        # Model output may carry null or string values for a coordinate.
        try:
            return float(bbox.get(key, default))
        except (TypeError, ValueError):
            return default

    try:
        # The context manager closes the source file handle, which the bare
        # Image.open() call left open.
        with Image.open(src) as im:
            W, H = im.size
            x = fraction("x", 0.0)
            y = fraction("y", 0.0)
            w = fraction("w", 1.0)
            h = fraction("h", 1.0)
            pad = 0.005  # small margin so glyph edges are not clipped
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                # Fail with a readable message rather than an opaque error
                # from saving an empty image.
                raise ValueError(f"empty crop box ({left}, {top}, {right}, {bottom})")
            im.crop((left, top, right, bottom)).save(str(dst))
        return dst
    except Exception as e:
        # Best effort: one bad crop must not stop the run.
        print(f" [CROP FAIL] {chunk_id}: {e}", flush=True)
        return dst
|
|
|
|
def write_chunk_file(chunk: dict, chunk_id: str):
    """Write one chunk as ``CHUNKS_DIR/<chunk_id>.md`` (YAML front matter + text).

    Free-text values coming from the model (classification, redaction
    fields, anomaly rationale, image descriptions, extracted text, ...) are
    written as double-quoted scalars so that colons, ``#`` or newlines inside
    them cannot corrupt the front matter. ``None`` is written as ``null``.

    Args:
        chunk: Chunk dict as produced by the page rebuild and enriched by main().
        chunk_id: Global chunk identifier; also the output file stem.
    """

    def yaml_val(v):
        """Render a scalar for the front matter."""
        if v is None or v == "null":
            return "null"
        if isinstance(v, bool):
            return "true" if v else "false"
        if isinstance(v, (int, float)):
            return str(v)
        # A JSON string literal is also a valid YAML double-quoted scalar.
        return json.dumps(str(v), ensure_ascii=False)

    def yaml_flag(v):
        """Render a boolean flag that may arrive as bool, str or None."""
        if isinstance(v, str):
            return "true" if v.strip().lower() == "true" else "false"
        return "true" if v else "false"

    bbox = chunk.get("bbox")
    if not isinstance(bbox, dict):
        bbox = {}

    def fraction(key, default):
        # Null or non-numeric coordinates fall back to the default instead of
        # crashing the ":.3f" formatting below.
        try:
            return float(bbox.get(key, default))
        except (TypeError, ValueError):
            return default

    page = chunk.get("page", 1)
    actual_num = PAGE_MAP.get(page, page)

    related_image = f"IMG-{chunk_id}.png" if chunk.get("type") == "image" else "null"
    related_table = chunk.get("related_table", "null") or "null"
    prev_chunk = chunk.get("prev_chunk", "null") or "null"
    next_chunk = chunk.get("next_chunk", "null") or "null"

    fmt_str = "[" + ", ".join(
        json.dumps(str(f), ensure_ascii=False) for f in (chunk.get("formatting") or [])
    ) + "]"
    ocr_lines_str = "[" + ", ".join(
        yaml_val(line) for line in (chunk.get("ocr_source_lines") or [])
    ) + "]"
    bbox_str = (
        f"{{x: {fraction('x', 0.0):.3f}, y: {fraction('y', 0.0):.3f}, "
        f"w: {fraction('w', 1.0):.3f}, h: {fraction('h', 0.1):.3f}}}"
    )

    lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk.get('type') or 'paragraph'}",
        f"page: {page}",
        f"order_in_page: {chunk.get('order_in_page', 1)}",
        f"order_global: {chunk.get('order_global', 1)}",
        f"bbox: {bbox_str}",
        f"classification: {yaml_val(chunk.get('classification'))}",
        f"formatting: {fmt_str}",
        f"cross_page_hint: {chunk.get('cross_page_hint') or 'self_contained'}",
        f"prev_chunk: {prev_chunk}",
        f"next_chunk: {next_chunk}",
        f"related_image: {related_image}",
        f"related_table: {related_table}",
        f"ocr_confidence: {yaml_val(chunk.get('ocr_confidence', 0.85))}",
        f"ocr_source_lines: {ocr_lines_str}",
        f"redaction_code: {yaml_val(chunk.get('redaction_code'))}",
        f"redaction_inferred_content_type: {yaml_val(chunk.get('redaction_inferred_content_type'))}",
        f"image_type: {yaml_val(chunk.get('image_type'))}",
        f"ufo_anomaly_detected: {yaml_flag(chunk.get('ufo_anomaly_detected', False))}",
        f"cryptid_anomaly_detected: {yaml_flag(chunk.get('cryptid_anomaly_detected', False))}",
        f"ufo_anomaly_type: {yaml_val(chunk.get('ufo_anomaly_type'))}",
        f"ufo_anomaly_rationale: {yaml_val(chunk.get('ufo_anomaly_rationale'))}",
        f"cryptid_anomaly_type: {yaml_val(chunk.get('cryptid_anomaly_type'))}",
        f"cryptid_anomaly_rationale: {yaml_val(chunk.get('cryptid_anomaly_rationale'))}",
        f"image_description_en: {yaml_val(chunk.get('image_description_en'))}",
        f"image_description_pt_br: {yaml_val(chunk.get('image_description_pt_br'))}",
        f"extracted_text: {yaml_val(chunk.get('extracted_text'))}",
        f"source_png: ../../processing/png/{DOC_ID}/p-{actual_num:03d}.png",
        "---",
        "",
        f"**EN:** {chunk.get('content_en') or ''}",
        "",
        f"**PT-BR:** {chunk.get('content_pt_br') or ''}",
        "",  # trailing newline
    ]

    chunk_path = CHUNKS_DIR / f"{chunk_id}.md"
    chunk_path.write_text("\n".join(lines), encoding="utf-8")
|
|
|
|
def _chunk_bbox(chunk: dict) -> dict:
    """Return the chunk's bbox dict, or the default box if missing/not a dict."""
    bbox = chunk.get("bbox")
    if isinstance(bbox, dict):
        return bbox
    return {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}


def _bbox_label(bbox: dict) -> str:
    """Format a bbox as "x/y/w/h" with two decimals, tolerating bad values."""
    parts = []
    for key, default in (("x", 0), ("y", 0), ("w", 1), ("h", 0.1)):
        value = bbox.get(key, default)
        if not isinstance(value, (int, float)):
            value = default
        parts.append(f"{value:.2f}")
    return "/".join(parts)


def _rebuild_all_pages(batch_size: int) -> dict:
    """Phase 1: run rebuild_page() for every page, *batch_size* at a time."""
    results = {}
    page_seqs = list(range(1, TOTAL_PAGES + 1))

    for batch_start in range(0, len(page_seqs), batch_size):
        batch = page_seqs[batch_start:batch_start + batch_size]
        print(f" Batch pages {batch[0]}-{batch[-1]}...", flush=True)

        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = [executor.submit(rebuild_page, p) for p in batch]
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                results[result["page_number"]] = result

        # Small delay between batches to avoid rate limits
        if batch_start + batch_size < len(page_seqs):
            time.sleep(1)

    return results


def _number_chunks(all_page_results: dict) -> list:
    """Phase 2: flatten pages into one list with global ids and prev/next links."""
    all_chunks = []
    for page_seq in sorted(all_page_results):
        page_data = all_page_results[page_seq]
        ordered = sorted(page_data.get("chunks", []), key=lambda c: c.get("order_in_page", 0))
        for chunk in ordered:
            order_global = len(all_chunks) + 1
            chunk["chunk_id"] = f"c{order_global:04d}"
            chunk["order_global"] = order_global
            chunk["actual_num"] = page_data.get("actual_num", page_seq)
            all_chunks.append(chunk)

    last = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last else None
    return all_chunks


def _crop_image_chunks(image_chunks: list) -> None:
    """Phase 3: crop every image chunk out of its page PNG."""
    for chunk in image_chunks:
        page = chunk["page"]
        actual_num = chunk.get("actual_num", PAGE_MAP.get(page, page))
        crop_image(page, actual_num, chunk["chunk_id"], _chunk_bbox(chunk))


def _analyze_image_chunks(image_chunks: list, batch_size: int) -> None:
    """Phase 4: analyze cropped images in batches and merge results in place."""
    for batch_start in range(0, len(image_chunks), batch_size):
        batch = image_chunks[batch_start:batch_start + batch_size]
        print(f" Image batch {batch_start+1}-{batch_start+len(batch)}...", flush=True)

        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            # Map each future straight to its chunk dict; this avoids a
            # linear search through every chunk for each finished image.
            futures = {}
            for chunk in batch:
                chunk_id = chunk["chunk_id"]
                img_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
                futures[executor.submit(analyze_image, chunk_id, img_path)] = chunk

            for future in concurrent.futures.as_completed(futures):
                chunk = futures[future]
                img_meta = future.result()
                chunk.update({
                    "image_type": img_meta.get("image_type", chunk.get("image_type")),
                    "image_description_en": img_meta.get("image_description_en"),
                    "image_description_pt_br": img_meta.get("image_description_pt_br"),
                    "extracted_text": img_meta.get("extracted_text"),
                    "ufo_anomaly_detected": img_meta.get("ufo_anomaly_detected", False),
                    "ufo_anomaly_type": img_meta.get("ufo_anomaly_type"),
                    "ufo_anomaly_rationale": img_meta.get("ufo_anomaly_rationale"),
                    "cryptid_anomaly_detected": img_meta.get("cryptid_anomaly_detected", False),
                    "cryptid_anomaly_type": img_meta.get("cryptid_anomaly_type"),
                    "cryptid_anomaly_rationale": img_meta.get("cryptid_anomaly_rationale"),
                })
                print(f" [IMG OK] {chunk['chunk_id']}", flush=True)

        if batch_start + batch_size < len(image_chunks):
            time.sleep(1)


def _write_index(all_chunks: list, build_at: str) -> None:
    """Phase 7: write OUT_DIR/_index.json with one summary entry per chunk."""
    index_chunks = []
    for chunk in all_chunks:
        # The model may return null (or a non-string) for content_en.
        content_en = chunk.get("content_en") or ""
        if not isinstance(content_en, str):
            content_en = str(content_en)
        preview = (content_en[:80] + "...") if len(content_en) > 80 else content_en
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": _chunk_bbox(chunk),
            "preview": preview
        })

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": index_chunks
    }

    index_path = OUT_DIR / "_index.json"
    index_path.write_text(json.dumps(index, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f" Written: {index_path}", flush=True)


def _render_chunk(chunk: dict, page_seq: int) -> list:
    """Return the document.md fragments for one chunk."""
    chunk_id = chunk["chunk_id"]
    ctype = chunk.get("type", "paragraph")
    bbox = _chunk_bbox(chunk)

    parts = [
        f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->\n",
        f'<a id="{chunk_id}"></a>\n',
        f"### Chunk {chunk_id} — {ctype} · p{page_seq} · bbox: {_bbox_label(bbox)}\n\n",
        f"**EN:** {chunk.get('content_en') or ''}\n\n",
        f"**PT-BR:** {chunk.get('content_pt_br') or ''}\n\n",
    ]

    # Image embed
    if ctype == "image":
        img_rel = f"./images/IMG-{chunk_id}.png"
        parts.append(f"\n\n")
        desc_en = chunk.get("image_description_en", "")
        desc_pt = chunk.get("image_description_pt_br", "")
        if desc_en:
            parts.append(f"**Image Description (EN):** {desc_en}\n\n")
        if desc_pt:
            parts.append(f"**Descrição da Imagem (PT-BR):** {desc_pt}\n\n")

    # Table render (only when a stitched table was attached to the marker)
    if ctype == "table_marker" and chunk.get("stitched_table"):
        parts.append("<table>\n")
        for row in chunk["stitched_table"]:
            parts.append("<tr>" + "".join(f"<td>{cell}</td>" for cell in row) + "</tr>\n")
        parts.append("</table>\n\n")

    # Metadata details
    meta = {
        "chunk_id": chunk_id,
        "type": ctype,
        "page": chunk.get("page"),
        "order_in_page": chunk.get("order_in_page"),
        "order_global": chunk.get("order_global"),
        "bbox": bbox,
        "classification": chunk.get("classification"),
        "formatting": chunk.get("formatting", []),
        "cross_page_hint": chunk.get("cross_page_hint"),
        "prev_chunk": chunk.get("prev_chunk"),
        "next_chunk": chunk.get("next_chunk"),
        "ocr_confidence": chunk.get("ocr_confidence"),
        "redaction_code": chunk.get("redaction_code"),
        "image_type": chunk.get("image_type"),
        "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
        "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
    }
    meta_json = json.dumps(meta, indent=2, ensure_ascii=False)
    parts.append(f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n")
    return parts


def _assemble_document(all_chunks: list, build_at: str, ufo_flagged: list, cryptid_flagged: list) -> int:
    """Phase 8: write OUT_DIR/document.md and return its size in bytes."""
    type_histogram = {}
    for chunk in all_chunks:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1

    # Build frontmatter
    histogram_yaml = "\n".join(f" {k}: {v}" for k, v in sorted(type_histogram.items()))
    ufo_yaml = "\n".join(f" - {c}" for c in ufo_flagged) if ufo_flagged else " []"
    cryptid_yaml = "\n".join(f" - {c}" for c in cryptid_flagged) if cryptid_flagged else " []"

    doc_parts = [f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {TOTAL_PAGES}
total_chunks: {len(all_chunks)}
chunk_types_histogram:
{histogram_yaml}
multi_page_tables: []
ufo_anomalies_flagged:
{ufo_yaml}
cryptid_anomalies_flagged:
{cryptid_yaml}
build_approach: "subagents"
build_model: "claude-haiku-4-5"
build_at: "{build_at}"
---
"""]

    # Group chunks by page
    chunks_by_page = {}
    for chunk in all_chunks:
        chunks_by_page.setdefault(chunk.get("page", 1), []).append(chunk)

    for page_seq in sorted(chunks_by_page):
        doc_parts.append(f"\n## Page {page_seq}\n")
        for chunk in sorted(chunks_by_page[page_seq], key=lambda c: c.get("order_in_page", 1)):
            doc_parts.extend(_render_chunk(chunk, page_seq))

    doc_md = "".join(doc_parts)
    doc_path = OUT_DIR / "document.md"
    doc_path.write_text(doc_md, encoding="utf-8")
    doc_md_bytes = len(doc_md.encode("utf-8"))
    print(f" Written: {doc_path} ({doc_md_bytes} bytes)", flush=True)
    return doc_md_bytes


def main():
    """Run the full rebuild of the document.

    Phases: (1) rebuild every page in parallel batches of 5, (2) number the
    chunks globally and link neighbours, (3) crop image chunks, (4) analyze
    the crops, (5) report table markers (no stitching is performed), (6)
    write one Markdown file per chunk, (7) write ``_index.json``, (8)
    assemble ``document.md``. Progress and final statistics go to stdout.
    """
    t_start = time.time()
    print(f"Starting rebuild of {DOC_ID}", flush=True)
    print(f"Total pages: {TOTAL_PAGES}", flush=True)

    # Ensure output dirs
    CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    TABLES_DIR.mkdir(parents=True, exist_ok=True)

    batch_size = 5

    # Step 1: Process all pages in parallel batches of 5
    print("\n=== Phase 1: Page rebuilding ===", flush=True)
    all_page_results = _rebuild_all_pages(batch_size)

    # Step 2: Globally number chunks
    print("\n=== Phase 2: Global chunk numbering ===", flush=True)
    all_chunks = _number_chunks(all_page_results)
    print(f" Total chunks: {len(all_chunks)}", flush=True)

    # Step 3: Crop images (all first, then analyze)
    print("\n=== Phase 3: Cropping images ===", flush=True)
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f" Found {len(image_chunks)} image chunks", flush=True)
    _crop_image_chunks(image_chunks)

    # Step 4: Analyze images in parallel batches of 5
    print("\n=== Phase 4: Image analysis ===", flush=True)
    _analyze_image_chunks(image_chunks, batch_size)

    # Step 5: Check for cross-page table stitching.
    # Only table markers are counted here; no stitching is performed, so
    # tables_stitched stays 0.
    print("\n=== Phase 5: Table stitching check ===", flush=True)
    tables_stitched = 0
    table_markers = [c for c in all_chunks if c.get("type") == "table_marker"]
    print(f" Found {len(table_markers)} table markers", flush=True)

    # Step 6: Write individual chunk files
    print("\n=== Phase 6: Writing chunk files ===", flush=True)
    for chunk in all_chunks:
        write_chunk_file(chunk, chunk["chunk_id"])
    print(f" Wrote {len(all_chunks)} chunk files", flush=True)

    # Step 7: Write _index.json
    print("\n=== Phase 7: Writing _index.json ===", flush=True)
    build_at = datetime.now(timezone.utc).isoformat()
    _write_index(all_chunks, build_at)

    # Step 8: Assemble document.md
    print("\n=== Phase 8: Assembling document.md ===", flush=True)
    ufo_flagged = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_flagged = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]
    images_extracted = len(image_chunks)
    doc_md_bytes = _assemble_document(all_chunks, build_at, ufo_flagged, cryptid_flagged)

    wall_seconds = int(time.time() - t_start)

    print("\n=== DONE ===", flush=True)
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={images_extracted} tables={tables_stitched} ufo={len(ufo_flagged)} cryptid={len(cryptid_flagged)} doc_md_bytes={doc_md_bytes}", flush=True)
    print(f"\npages_done={TOTAL_PAGES}, chunks_total={len(all_chunks)}, images_extracted={images_extracted}, tables_stitched={tables_stitched}, ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, wall_seconds={wall_seconds}", flush=True)
|
|
|
# Script entry point: run the full rebuild when this file is executed directly.
if __name__ == "__main__":
    main()
|