629 lines
24 KiB
Python
629 lines
24 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Rebuild script v2 for doc-65-hs1-834228961-62-hq-83894-section-2
|
|
Uses claude CLI for vision processing (no direct API key needed).
|
|
Processes 159 pages in batches of 5.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import time
|
|
import subprocess
|
|
import concurrent.futures
|
|
import textwrap
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from PIL import Image
|
|
|
|
# ── Config ──────────────────────────────────────────────────────────────────
# Identifier of the document being rebuilt; also used as the per-document
# directory name under each of the roots below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-2"
# Human-readable title; interpolated into the page prompt and document.md.
DOC_TITLE = "FBI HQ-83894 Section 2 — Flying Discs Investigation (Serials 53-100)"
# Input: page images, globbed as p-*.png by build_page_map().
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
# Input: optional per-page OCR text, read as p-NNN.txt by load_ocr().
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
# Output root: receives _index.json, document.md and the sub-directories below.
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"
# Executable invoked by run_claude() for every model call.
CLAUDE_BIN = "/Users/guto/.local/bin/claude"

# Number of pages (and later image crops) submitted to the thread pool at once.
BATCH_SIZE = 5
CLAUDE_TIMEOUT = 120  # seconds per page call
|
|
|
|
def build_page_map(png_dir=None):
    """Map sequential page numbers to the numbers found in PNG file names.

    Scans ``png_dir`` (``PNG_DIR`` when omitted) for files matching
    ``p-*.png``, parses the part of the stem after ``p-`` as an integer, and
    numbers the sorted results from 1.

    Files whose stem is not numeric (for example ``p-cover.png``) are skipped
    instead of raising ``ValueError``; this function runs at import time, so
    one stray file would otherwise prevent the script from starting.

    Args:
        png_dir: Optional directory (str or Path) to scan instead of PNG_DIR.

    Returns:
        dict mapping ``1..N`` to the page numbers in ascending order. Empty
        when the directory has no matching files or does not exist.
    """
    directory = PNG_DIR if png_dir is None else Path(png_dir)
    page_numbers = []
    for png in directory.glob("p-*.png"):
        # The glob pattern guarantees the two-character "p-" prefix.
        try:
            page_numbers.append(int(png.stem[2:]))
        except ValueError:
            continue
    page_numbers.sort()
    return {seq: num for seq, num in enumerate(page_numbers, start=1)}
|
|
|
|
# Built once at import time: sequential page number (1-based) -> number parsed
# from the corresponding PNG file name.
PAGE_MAP = build_page_map()
# Number of pages discovered on disk.
TOTAL_PAGES = len(PAGE_MAP)
|
|
|
|
def load_ocr(actual_num: int, *, ocr_dir=None, max_chars: int = 2000) -> str:
    """Return the stripped OCR text for a page, truncated to ``max_chars``.

    OCR text is optional context for the page prompt, so any failure to read
    the file (missing, unreadable, not a regular file) yields an empty string
    rather than an exception that would abort the page worker.

    Args:
        actual_num: Page number as it appears in the file name (``p-NNN.txt``).
        ocr_dir: Optional directory (str or Path) to read from instead of
            OCR_DIR.
        max_chars: Maximum number of characters returned.

    Returns:
        The OCR text, or ``""`` when none is available. Undecodable bytes are
        replaced rather than raising.
    """
    base = OCR_DIR if ocr_dir is None else Path(ocr_dir)
    ocr_path = base / f"p-{actual_num:03d}.txt"
    try:
        text = ocr_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""
    return text.strip()[:max_chars]
|
|
|
|
PAGE_REBUILDER_PROMPT_TEMPLATE = """You are a page-rebuilder agent analyzing a page from a declassified FBI document about Flying Discs / UAP investigations.
|
|
|
|
Document: {doc_title}
|
|
Actual page file: p-{actual_num:03d}.png
|
|
Sequential page number: {page_seq} of {total_pages}
|
|
|
|
OCR text (may be empty or poor quality):
|
|
{ocr_text}
|
|
|
|
Use the Read tool to read this image:
|
|
/Users/guto/ufo/processing/png/{doc_id}/p-{actual_num:03d}.png
|
|
|
|
Then analyze ALL visible content and return a JSON object with this exact structure (return ONLY the JSON, no markdown fences, no explanation):
|
|
{{
|
|
"page_number": {page_seq},
|
|
"chunks": [
|
|
{{
|
|
"order_in_page": 1,
|
|
"type": "cover",
|
|
"content_en": "exact transcription or description in English",
|
|
"content_pt_br": "descrição ou transcrição em português brasileiro",
|
|
"bbox": {{"x": 0.00, "y": 0.00, "w": 1.00, "h": 0.10}},
|
|
"classification": null,
|
|
"formatting": [],
|
|
"cross_page_hint": "self_contained",
|
|
"ocr_confidence": 0.85,
|
|
"ocr_source_lines": [],
|
|
"redaction_code": null,
|
|
"redaction_inferred_content_type": null,
|
|
"image_type": null,
|
|
"ufo_anomaly_detected": false,
|
|
"ufo_anomaly_type": null,
|
|
"ufo_anomaly_rationale": null,
|
|
"cryptid_anomaly_detected": false,
|
|
"cryptid_anomaly_type": null,
|
|
"cryptid_anomaly_rationale": null
|
|
}}
|
|
]
|
|
}}
|
|
|
|
RULES:
|
|
- type must be ONE of: cover | letterhead | stamp | header | subheader | paragraph | redaction | signature | image | table_marker | footer | page_number | classification_marking | separator | handwriting | form_field | caption | list_item | annotation | blank
|
|
- bbox: x,y = top-left corner fraction (0.0-1.0), w,h = width/height fractions (0.0-1.0)
|
|
- Split the page into logical chunks (letterhead separate from body text, stamps separate, etc.)
|
|
- For redacted blocks: type=redaction, include redaction_code if visible e.g. "(b)(1)", "(b)(3)", "(b)(6)"
|
|
- For stamps (RECEIVED, RECORDED, etc.): type=stamp
|
|
- For photos, sketches, diagrams: type=image, set image_type to photo|diagram|sketch|map|chart|signature_block|stamp|seal|other
|
|
- For tables: type=table_marker
|
|
- cross_page_hint: self_contained | continues_to_next | continues_from_prev
|
|
- content_en: verbatim transcription when legible; describe otherwise e.g. "[Stamp: RECEIVED OCT 6 1947]"
|
|
- content_pt_br: Brazilian Portuguese translation/description
|
|
- formatting: array of applicable: bold | italic | all_caps | underline | typewritten | handwritten
|
|
- ufo_anomaly_detected: true ONLY if page has image/sketch/photo of an anomalous aerial object
|
|
- Blank pages: one chunk with type=blank
|
|
- Return ONLY valid JSON, nothing else"""
|
|
|
|
IMAGE_ANALYST_PROMPT_TEMPLATE = """You are an image analyst for declassified FBI UFO/UAP investigation documents.
|
|
|
|
Read this cropped image region:
|
|
{img_path}
|
|
|
|
Analyze it and return ONLY this JSON (no markdown fences):
|
|
{{
|
|
"image_type": "photo",
|
|
"image_description_en": "detailed description in English",
|
|
"image_description_pt_br": "descrição detalhada em português brasileiro",
|
|
"extracted_text": "any text visible verbatim or null",
|
|
"ufo_anomaly_detected": false,
|
|
"ufo_anomaly_type": null,
|
|
"ufo_anomaly_rationale": null,
|
|
"cryptid_anomaly_detected": false,
|
|
"cryptid_anomaly_type": null,
|
|
"cryptid_anomaly_rationale": null
|
|
}}
|
|
|
|
image_type: photo | diagram | sketch | map | chart | signature_block | stamp | seal | other
|
|
Return ONLY valid JSON."""
|
|
|
|
def run_claude(prompt: str, timeout: int = CLAUDE_TIMEOUT) -> str:
    """Run the claude CLI once with ``prompt`` and return its stripped stdout.

    The return contract relied on by callers is unchanged:

    * ``""`` when the call timed out (or produced no output);
    * ``"ERROR: <message>"`` when the process could not be started or its
      output could not be decoded;
    * otherwise whatever the CLI wrote to stdout.

    Failures are additionally reported on stderr so that a run full of
    retries can be diagnosed; previously the exit code and stderr of the CLI
    were discarded.

    Args:
        prompt: Full prompt text, passed as the final command-line argument.
        timeout: Seconds to wait before the child process is killed.
    """
    command = [
        CLAUDE_BIN, "-p", "--dangerously-skip-permissions",
        "--model", "claude-haiku-4-5",
        "--no-session-persistence",
        prompt,
    ]
    try:
        # The child inherits this process's environment by default, so no
        # explicit env mapping is needed.
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        print(f" [CLAUDE TIMEOUT] no response after {timeout}s", file=sys.stderr, flush=True)
        return ""
    except (OSError, ValueError) as e:
        # OSError: binary missing / not executable. ValueError: invalid
        # argument (e.g. NUL byte in the prompt) or undecodable output.
        return f"ERROR: {e}"

    if result.returncode != 0:
        detail = (result.stderr or "").strip()[-500:]
        print(f" [CLAUDE EXIT {result.returncode}] {detail}", file=sys.stderr, flush=True)
    return result.stdout.strip()
|
|
|
|
def parse_json_response(raw: str):
    """Extract the first JSON object embedded in a model response.

    The response may wrap the object in markdown fences or surround it with
    prose. Every ``{`` is tried in order as a potential start of the object
    and decoded with ``json.JSONDecoder.raw_decode``, which parses exactly one
    JSON value and ignores whatever follows it. Unlike counting braces by
    hand, this handles ``{`` and ``}`` inside string values (common in
    transcribed text) and skips over stray braces in any preamble.

    Args:
        raw: Raw text returned by the model (may be empty or None).

    Returns:
        The decoded dict, or ``None`` when no valid JSON object is found.
    """
    if not raw:
        return None

    text = raw.strip()
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a valid object at this brace; try the next candidate.
            start = text.find("{", start + 1)
            continue
        return obj
    return None
|
|
|
|
def rebuild_page(page_seq: int) -> dict:
    """Process one page via the claude CLI and return its chunk data.

    Builds the page prompt (including any OCR text), calls ``run_claude`` up
    to three times, and accepts the first response that parses to a dict
    whose ``"chunks"`` value is a list of dicts. Responses with any other
    shape are treated like unparseable JSON and retried, instead of raising
    ``TypeError`` while the chunks are being annotated.

    Args:
        page_seq: 1-based sequential page number (a key of PAGE_MAP).

    Returns:
        dict with ``"page_number"``, ``"actual_num"`` and ``"chunks"``. Each
        chunk has ``"order_in_page"`` renumbered from 1 and ``"page"`` set to
        ``page_seq``. When every attempt fails, a single placeholder chunk of
        type ``"blank"`` flagged for manual review is returned.
    """
    actual_num = PAGE_MAP[page_seq]
    ocr_text = load_ocr(actual_num)

    prompt = PAGE_REBUILDER_PROMPT_TEMPLATE.format(
        doc_title=DOC_TITLE,
        actual_num=actual_num,
        page_seq=page_seq,
        total_pages=TOTAL_PAGES,
        ocr_text=ocr_text if ocr_text else "(no OCR available)",
        doc_id=DOC_ID,
    )

    retries = 3
    for attempt in range(retries):
        is_last = attempt == retries - 1
        raw = run_claude(prompt, timeout=CLAUDE_TIMEOUT)

        if not raw or raw.startswith("ERROR:"):
            if is_last:
                print(f" [FAIL] page {page_seq}: empty/error response. Raw: {raw[:200]}", flush=True)
                break
            # Linear back-off: 5s, then 10s.
            wait = 5 * (attempt + 1)
            print(f" [RETRY {attempt+1}] page {page_seq}: empty/error, waiting {wait}s", flush=True)
            time.sleep(wait)
            continue

        data = parse_json_response(raw)
        chunks = data.get("chunks") if isinstance(data, dict) else None
        if isinstance(chunks, list) and all(isinstance(c, dict) for c in chunks):
            # Trust our own numbering over whatever the model reported.
            data["page_number"] = page_seq
            data["actual_num"] = actual_num
            for position, chunk in enumerate(chunks, start=1):
                chunk["order_in_page"] = position
                chunk["page"] = page_seq
            print(f" [OK] page {page_seq:03d} (p-{actual_num:03d}) → {len(chunks)} chunks", flush=True)
            return data

        if is_last:
            print(f" [FAIL] page {page_seq}: could not parse JSON. Raw: {raw[:200]}", flush=True)
        else:
            print(f" [RETRY {attempt+1}] page {page_seq}: bad JSON, retrying", flush=True)
            time.sleep(3)

    # Fallback: a full-page placeholder so downstream numbering stays intact.
    return {
        "page_number": page_seq,
        "actual_num": actual_num,
        "chunks": [{
            "order_in_page": 1,
            "type": "blank",
            "page": page_seq,
            "content_en": "[Page processing failed - manual review required]",
            "content_pt_br": "[Falha no processamento da página - revisão manual necessária]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
        }],
    }
|
|
|
|
def crop_image(page_seq: int, actual_num: int, chunk_id: str, bbox: dict) -> Path:
    """Crop the bbox region out of a page PNG and save it under IMAGES_DIR.

    The bbox uses fractional coordinates (x, y = top-left, w, h = size). The
    values are clamped to the page, given a minimum extent of 1% and padded
    by 0.5% on every side before being converted to pixels.

    Cropping is best-effort: any failure (missing source file, malformed
    bbox, decode error) is logged and swallowed so one bad chunk cannot abort
    the run. Callers must check that the returned path exists.

    Args:
        page_seq: Sequential page number (unused here; kept for callers).
        actual_num: Page number used in the source file name ``p-NNN.png``.
        chunk_id: Chunk identifier; the output is ``IMG-<chunk_id>.png``.
        bbox: Mapping with fractional ``x``, ``y``, ``w``, ``h``.

    Returns:
        Destination path of the crop, whether or not it was written.
    """
    src = PNG_DIR / f"p-{actual_num:03d}.png"
    dst = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        # The context manager closes the underlying file handle; the original
        # left one open per image chunk.
        with Image.open(src) as im:
            W, H = im.size
            x = max(0.0, min(1.0, bbox.get("x", 0.0)))
            y = max(0.0, min(1.0, bbox.get("y", 0.0)))
            w = max(0.01, min(1.0 - x, bbox.get("w", 1.0)))
            h = max(0.01, min(1.0 - y, bbox.get("h", 0.1)))
            pad = 0.005
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            im.crop((left, top, right, bottom)).save(str(dst))
    except Exception as e:  # deliberate best-effort boundary, see docstring
        print(f" [CROP FAIL] {chunk_id}: {e}", flush=True)
    return dst
|
|
|
|
def _image_fallback(description_en: str, description_pt_br: str) -> dict:
    """Build the neutral image-metadata record used when analysis is unavailable."""
    return {
        "image_type": "other",
        "image_description_en": description_en,
        "image_description_pt_br": description_pt_br,
        "extracted_text": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
    }


def analyze_image(chunk_id: str, img_path: Path) -> dict:
    """Analyze a cropped image via the claude CLI and return its metadata.

    Makes up to two attempts. A response is accepted only when it parses to a
    dict that carries at least one of the expected description keys, so an
    unrelated JSON object (such as an error payload) is not mistaken for
    image metadata. Empty or ``ERROR:`` output is not parsed at all.

    Args:
        chunk_id: Chunk identifier, used only for log lines.
        img_path: Path of the cropped image to analyze.

    Returns:
        The metadata dict returned by the model, or a fallback record with
        ``image_type == "other"`` when the image file does not exist or every
        attempt fails.
    """
    if not img_path.exists():
        return _image_fallback("Image not available", "Imagem não disponível")

    prompt = IMAGE_ANALYST_PROMPT_TEMPLATE.format(img_path=str(img_path))
    retries = 2
    for attempt in range(retries):
        raw = run_claude(prompt, timeout=60)
        data = None
        if raw and not raw.startswith("ERROR:"):
            data = parse_json_response(raw)
        if isinstance(data, dict) and ("image_type" in data or "image_description_en" in data):
            print(f" [IMG OK] {chunk_id}", flush=True)
            return data
        if attempt < retries - 1:
            time.sleep(3)

    print(f" [IMG FAIL] {chunk_id}", flush=True)
    return _image_fallback("Analysis failed", "Análise falhou")
|
|
|
|
# Characters that force a string to be written as a quoted YAML scalar.
_YAML_SPECIAL_CHARS = frozenset(":{}[],\n\r\t#&*?|-<>=!%@`\"'\\")
# Plain scalars that a YAML parser would not read back as the same string.
_YAML_RESERVED_WORDS = frozenset({"", "null", "~", "true", "false", "yes", "no", "on", "off"})


def _yaml_scalar(v):
    """Render a value as a YAML scalar that parses back to the same value.

    Strings stay unquoted only when that is unambiguous; otherwise they are
    emitted as JSON strings, which are valid YAML double-quoted scalars with
    quotes, backslashes and newlines properly escaped.
    """
    if v is None:
        return "null"
    if isinstance(v, bool):
        return "true" if v else "false"
    if isinstance(v, (int, float)):
        return str(v)
    if not isinstance(v, str):
        # Lists/dicts from the model: JSON is valid YAML flow syntax.
        return json.dumps(v, ensure_ascii=False, default=str)

    needs_quotes = (
        v != v.strip()
        or v.lower() in _YAML_RESERVED_WORDS
        or any(c in _YAML_SPECIAL_CHARS for c in v)
    )
    if not needs_quotes:
        try:
            float(v)
        except ValueError:
            return v
        # Numeric-looking text must be quoted to remain a string.
    return json.dumps(v, ensure_ascii=False)


def _bbox_component(bbox, key, default):
    """Return ``bbox[key]`` as a float, or ``default`` when missing/invalid."""
    if not isinstance(bbox, dict):
        return default
    try:
        return float(bbox.get(key, default))
    except (TypeError, ValueError):
        return default


def write_chunk_file(chunk: dict):
    """Write one chunk as ``<chunk_id>.md`` (YAML front matter + text) in CHUNKS_DIR.

    Model-supplied values are written defensively: free-text fields are
    escaped so the front matter stays valid YAML, a missing/null/non-numeric
    bbox falls back to the default box, anomaly flags are coerced to real
    booleans, and null content is written as empty text.

    Args:
        chunk: Chunk dict; ``"chunk_id"`` is required, everything else falls
            back to a default.
    """
    chunk_id = chunk["chunk_id"]
    bbox = chunk.get("bbox")
    page = chunk.get("page", 1)
    actual_num = PAGE_MAP.get(page, page)
    ctype = chunk.get("type", "paragraph")

    related_image = f"IMG-{chunk_id}.png" if ctype == "image" else "null"
    related_table = chunk.get("related_table") or "null"
    prev_chunk = chunk.get("prev_chunk") or "null"
    next_chunk = chunk.get("next_chunk") or "null"

    fmt_list = chunk.get("formatting")
    if not isinstance(fmt_list, list):
        fmt_list = []
    fmt_str = "[" + ", ".join(json.dumps(str(f), ensure_ascii=False) for f in fmt_list) + "]"

    ocr_lines = chunk.get("ocr_source_lines")
    if not isinstance(ocr_lines, list):
        ocr_lines = []
    ocr_lines_str = "[" + ", ".join(_yaml_scalar(line) for line in ocr_lines) + "]"

    bx = _bbox_component(bbox, "x", 0.0)
    by = _bbox_component(bbox, "y", 0.0)
    bw = _bbox_component(bbox, "w", 1.0)
    bh = _bbox_component(bbox, "h", 0.1)

    content_en = chunk.get("content_en")
    content_pt_br = chunk.get("content_pt_br")

    lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {ctype}",
        f"page: {page}",
        f"order_in_page: {chunk.get('order_in_page', 1)}",
        f"order_global: {chunk.get('order_global', 1)}",
        f"bbox: {{x: {bx:.3f}, y: {by:.3f}, w: {bw:.3f}, h: {bh:.3f}}}",
        f"classification: {_yaml_scalar(chunk.get('classification'))}",
        f"formatting: {fmt_str}",
        f"cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {prev_chunk}",
        f"next_chunk: {next_chunk}",
        f"related_image: {related_image}",
        f"related_table: {related_table}",
        f"ocr_confidence: {_yaml_scalar(chunk.get('ocr_confidence', 0.85))}",
        f"ocr_source_lines: {ocr_lines_str}",
        f"redaction_code: {_yaml_scalar(chunk.get('redaction_code'))}",
        f"redaction_inferred_content_type: {_yaml_scalar(chunk.get('redaction_inferred_content_type'))}",
        f"image_type: {_yaml_scalar(chunk.get('image_type'))}",
        f"ufo_anomaly_detected: {_yaml_scalar(bool(chunk.get('ufo_anomaly_detected', False)))}",
        f"cryptid_anomaly_detected: {_yaml_scalar(bool(chunk.get('cryptid_anomaly_detected', False)))}",
        f"ufo_anomaly_type: {_yaml_scalar(chunk.get('ufo_anomaly_type'))}",
        f"ufo_anomaly_rationale: {_yaml_scalar(chunk.get('ufo_anomaly_rationale'))}",
        f"cryptid_anomaly_type: {_yaml_scalar(chunk.get('cryptid_anomaly_type'))}",
        f"cryptid_anomaly_rationale: {_yaml_scalar(chunk.get('cryptid_anomaly_rationale'))}",
        f"image_description_en: {_yaml_scalar(chunk.get('image_description_en'))}",
        f"image_description_pt_br: {_yaml_scalar(chunk.get('image_description_pt_br'))}",
        f"extracted_text: {_yaml_scalar(chunk.get('extracted_text'))}",
        f"source_png: ../../processing/png/{DOC_ID}/p-{actual_num:03d}.png",
        "---",
        "",
        f"**EN:** {'' if content_en is None else content_en}",
        "",
        f"**PT-BR:** {'' if content_pt_br is None else content_pt_br}",
        "",
    ]
    (CHUNKS_DIR / f"{chunk_id}.md").write_text("\n".join(lines), encoding="utf-8")
|
|
|
|
def main():
    """Run the full rebuild pipeline for DOC_ID and write all outputs under OUT_DIR.

    Phases:
      1. Rebuild every page with ``rebuild_page`` in thread-pool batches,
         saving ``_rebuild_state.json`` after each batch.
      2. Assign global chunk ids (``c0001``…) and prev/next links.
      3. Crop every image chunk with ``crop_image``.
      4. Analyze the crops with ``analyze_image`` and merge the results.
      5. Count table markers (no stitching is performed).
      6. Write one markdown file per chunk.
      7. Write ``_index.json``.
      8. Assemble ``document.md``.

    The intermediate state file is removed at the end and two summary lines
    are printed.

    Model output is handled defensively in the late phases: null text and
    malformed bboxes fall back to defaults instead of raising after all model
    calls have already been made, and image analysis may only set the
    image-related fields of a chunk, never structural ones.
    """
    t_start = time.time()
    print(f"Starting rebuild: {DOC_ID}", flush=True)
    print(f"Total pages: {TOTAL_PAGES}", flush=True)

    CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    TABLES_DIR.mkdir(parents=True, exist_ok=True)

    default_bbox = {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}

    def stored_bbox(chunk):
        """Return the chunk's bbox as recorded, or the default box when absent/null."""
        return chunk.get("bbox") or dict(default_bbox)

    def bbox_number(chunk, key):
        """Return one bbox component as a float, falling back to the default."""
        raw = chunk.get("bbox")
        if not isinstance(raw, dict):
            return default_bbox[key]
        try:
            return float(raw.get(key, default_bbox[key]))
        except (TypeError, ValueError):
            return default_bbox[key]

    def text_of(chunk, key):
        """Return a text field as a string; null/missing becomes ''."""
        value = chunk.get(key)
        return "" if value is None else str(value)

    # Phase 1: Rebuild pages in parallel batches of 5
    print("\n=== Phase 1: Page rebuilding ===", flush=True)
    all_page_results = {}
    page_seqs = list(range(1, TOTAL_PAGES + 1))
    state_path = OUT_DIR / "_rebuild_state.json"

    for batch_start in range(0, len(page_seqs), BATCH_SIZE):
        batch = page_seqs[batch_start:batch_start + BATCH_SIZE]
        print(f" Batch pages {batch[0]}-{batch[-1]}...", flush=True)

        with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
            futures = {executor.submit(rebuild_page, p): p for p in batch}
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                all_page_results[result["page_number"]] = result

        # Save intermediate state after each batch
        state_path.write_text(
            json.dumps({str(k): v for k, v in all_page_results.items()}, ensure_ascii=False),
            encoding="utf-8",
        )

    # Phase 2: Global chunk numbering
    print("\n=== Phase 2: Global chunk numbering ===", flush=True)
    all_chunks = []
    order_global = 0

    for page_seq in sorted(all_page_results):
        page_result = all_page_results[page_seq]
        chunks = page_result.get("chunks", [])
        actual_num = page_result.get("actual_num", PAGE_MAP.get(page_seq, page_seq))
        for chunk in sorted(chunks, key=lambda c: c.get("order_in_page", 0)):
            order_global += 1
            chunk["chunk_id"] = f"c{order_global:04d}"
            chunk["order_global"] = order_global
            chunk["actual_num"] = actual_num
            all_chunks.append(chunk)

    last_index = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last_index else None

    print(f" Total chunks: {len(all_chunks)}", flush=True)

    # Phase 3: Crop all images
    print("\n=== Phase 3: Cropping images ===", flush=True)
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f" Found {len(image_chunks)} image chunks", flush=True)

    for chunk in image_chunks:
        crop_image(
            chunk["page"],
            chunk.get("actual_num", PAGE_MAP.get(chunk["page"], chunk["page"])),
            chunk["chunk_id"],
            stored_bbox(chunk),
        )

    # Phase 4: Analyze images in parallel batches of 5
    print("\n=== Phase 4: Image analysis ===", flush=True)
    # Only these fields may be set by image analysis; anything else in the
    # model's answer (e.g. "type", "page", "bbox") must not clobber the chunk.
    image_meta_keys = (
        "image_type",
        "image_description_en",
        "image_description_pt_br",
        "extracted_text",
        "ufo_anomaly_detected",
        "ufo_anomaly_type",
        "ufo_anomaly_rationale",
        "cryptid_anomaly_detected",
        "cryptid_anomaly_type",
        "cryptid_anomaly_rationale",
    )

    for batch_start in range(0, len(image_chunks), BATCH_SIZE):
        batch = image_chunks[batch_start:batch_start + BATCH_SIZE]
        print(f" Image batch {batch_start+1}-{batch_start+len(batch)}...", flush=True)

        with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
            futures = {}
            for chunk in batch:
                chunk_id = chunk["chunk_id"]
                img_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
                futures[executor.submit(analyze_image, chunk_id, img_path)] = chunk

            for future in concurrent.futures.as_completed(futures):
                chunk = futures[future]
                img_meta = future.result()
                chunk.update({
                    k: img_meta[k]
                    for k in image_meta_keys
                    if img_meta.get(k) is not None
                })

    # Phase 5: Table stitching check
    print("\n=== Phase 5: Table stitching ===", flush=True)
    tables_stitched = 0
    table_markers = [c for c in all_chunks if c.get("type") == "table_marker"]
    print(f" Found {len(table_markers)} table markers (no cross-page stitching needed)", flush=True)

    # Phase 6: Write chunk files
    print("\n=== Phase 6: Writing chunk files ===", flush=True)
    for chunk in all_chunks:
        write_chunk_file(chunk)
    print(f" Wrote {len(all_chunks)} chunk files", flush=True)

    # Phase 7: Write _index.json
    print("\n=== Phase 7: Writing _index.json ===", flush=True)
    build_at = datetime.now(timezone.utc).isoformat()

    index_chunks = []
    for chunk in all_chunks:
        content_en = text_of(chunk, "content_en")
        preview = content_en[:80] + ("..." if len(content_en) > 80 else "")
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": stored_bbox(chunk),
            "preview": preview,
        })

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    # Phase 8: Assemble document.md
    print("\n=== Phase 8: Assembling document.md ===", flush=True)

    type_histogram = {}
    for chunk in all_chunks:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1

    ufo_flagged = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_flagged = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]
    images_extracted = len(image_chunks)

    # An empty mapping is written explicitly so the key does not parse as null.
    histogram_yaml = "\n".join(f" {k}: {v}" for k, v in sorted(type_histogram.items())) or " {}"

    def list_yaml(items):
        if not items:
            return " []"
        return "\n".join(f" - {i}" for i in items)

    doc_parts = [f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {TOTAL_PAGES}
total_chunks: {len(all_chunks)}
chunk_types_histogram:
{histogram_yaml}
multi_page_tables: []
ufo_anomalies_flagged:
{list_yaml(ufo_flagged)}
cryptid_anomalies_flagged:
{list_yaml(cryptid_flagged)}
build_approach: "subagents"
build_model: "claude-haiku-4-5"
build_at: "{build_at}"
---
"""]

    chunks_by_page = {}
    for chunk in all_chunks:
        chunks_by_page.setdefault(chunk.get("page", 1), []).append(chunk)

    for page_seq in sorted(chunks_by_page):
        page_chunks = chunks_by_page[page_seq]
        doc_parts.append(f"\n## Page {page_seq}\n")

        for chunk in sorted(page_chunks, key=lambda c: c.get("order_in_page", 1)):
            chunk_id = chunk["chunk_id"]
            ctype = chunk.get("type", "paragraph")
            bbox_str = "/".join(f"{bbox_number(chunk, key):.2f}" for key in ("x", "y", "w", "h"))

            doc_parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->\n")
            doc_parts.append(f'<a id="{chunk_id}"></a>\n')
            doc_parts.append(f"### Chunk {chunk_id} — {ctype} · p{page_seq} · bbox: {bbox_str}\n\n")

            doc_parts.append(f"**EN:** {text_of(chunk, 'content_en')}\n\n")
            doc_parts.append(f"**PT-BR:** {text_of(chunk, 'content_pt_br')}\n\n")

            if ctype == "image":
                # Reference the crop written in Phase 3 (document.md lives in
                # OUT_DIR, the crops in OUT_DIR/images). Skipped when the crop
                # failed, so the document never links to a missing file.
                image_name = f"IMG-{chunk_id}.png"
                if (IMAGES_DIR / image_name).exists():
                    doc_parts.append(f"![{image_name}](./images/{image_name})\n\n")
                else:
                    doc_parts.append("\n\n")
                if chunk.get("image_description_en"):
                    doc_parts.append(f"**Image Description (EN):** {chunk['image_description_en']}\n\n")
                if chunk.get("image_description_pt_br"):
                    doc_parts.append(f"**Descrição da Imagem (PT-BR):** {chunk['image_description_pt_br']}\n\n")

            # Metadata block
            meta = {
                "chunk_id": chunk_id,
                "type": ctype,
                "page": chunk.get("page"),
                "order_in_page": chunk.get("order_in_page"),
                "order_global": chunk.get("order_global"),
                "bbox": stored_bbox(chunk),
                "classification": chunk.get("classification"),
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": chunk.get("cross_page_hint"),
                "prev_chunk": chunk.get("prev_chunk"),
                "next_chunk": chunk.get("next_chunk"),
                "ocr_confidence": chunk.get("ocr_confidence"),
                "redaction_code": chunk.get("redaction_code"),
                "image_type": chunk.get("image_type"),
                "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
                "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
                "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
                "ufo_anomaly_rationale": chunk.get("ufo_anomaly_rationale"),
            }
            meta_json = json.dumps(meta, indent=2, ensure_ascii=False)
            doc_parts.append(
                f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n"
            )

    doc_md = "".join(doc_parts)
    (OUT_DIR / "document.md").write_text(doc_md, encoding="utf-8")
    doc_md_bytes = len(doc_md.encode("utf-8"))

    # Cleanup intermediate state
    if state_path.exists():
        state_path.unlink()

    t_end = time.time()
    wall_seconds = int(t_end - t_start)

    print(f"\n=== DONE ===", flush=True)
    final_line = f"pages_done={TOTAL_PAGES}, chunks_total={len(all_chunks)}, images_extracted={images_extracted}, tables_stitched={tables_stitched}, ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, wall_seconds={wall_seconds}"
    print(final_line, flush=True)
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={images_extracted} tables={tables_stitched} ufo={len(ufo_flagged)} cryptid={len(cryptid_flagged)} doc_md_bytes={doc_md_bytes}", flush=True)
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|