553 lines
21 KiB
Python
553 lines
21 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
rebuild_doc65_serial130_resume.py
|
||
Resume rebuild for doc-65-hs1-834228961-62-hq-83894-serial-130.
|
||
|
||
Pages 1-50 already processed (chunks c0001-c0204 exist).
|
||
This script:
|
||
Phase A: Process pages 51-91 via claude CLI → write c0205+
|
||
Phase B: Read ALL chunk files → rebuild _index.json + document.md
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import time
|
||
import subprocess
|
||
import concurrent.futures
|
||
import re
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
|
||
try:
|
||
from PIL import Image as PILImage
|
||
PILLOW_OK = True
|
||
except ImportError:
|
||
PILLOW_OK = False
|
||
|
||
# ── Config ──────────────────────────────────────────────────────────────────
# Identifier of the document being rebuilt; it is also the per-document
# directory name appended to each of the roots below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-serial-130"
# Human-readable title, inserted into the page prompt and into document.md.
DOC_TITLE = "HQ Air Defense Command – Unidentified Flying Objects Reports (65-HS1-834228961 / 62-HQ-83894 Serial 130)"
# Input locations: page PNGs and per-page OCR text files.
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
# Output locations: _index.json and document.md are written to OUT_DIR,
# chunk markdown files to CHUNKS_DIR and cropped images to IMAGES_DIR.
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
# Created by main() but not otherwise written to in this script.
TABLES_DIR = OUT_DIR / "tables"
# Absolute path of the claude CLI executable invoked for every page.
CLAUDE_BIN = "/Users/guto/.local/bin/claude"

# Pages are 1-based here; the PNG/OCR files on disk are 0-based (page - 1).
TOTAL_PAGES = 91
START_PAGE = 51  # first missing page
FIRST_CHUNK_NUM = 205  # c0205 onwards for new chunks
# Number of pages submitted to the thread pool at once.
BATCH_SIZE = 4
# Per-invocation timeout, in seconds, passed to subprocess.run.
CLAUDE_TIMEOUT = 150
|
||
|
||
# ── Helpers ──────────────────────────────────────────────────────────────────
|
||
def load_ocr(page_num: int, max_chars: int = 2000) -> str:
    """Return the stored OCR text for a page, truncated for use in a prompt.

    Args:
        page_num: 1-based page number. The OCR file on disk is 0-based, so
            page ``n`` is read from ``p-{n-1:03d}.txt`` inside ``OCR_DIR``.
        max_chars: Maximum number of characters to return (default 2000,
            the limit this script has always used).

    Returns:
        The stripped OCR text cut to ``max_chars`` characters, or an empty
        string when no OCR file exists for the page.
    """
    ocr_path = OCR_DIR / f"p-{page_num - 1:03d}.txt"
    try:
        # Undecodable bytes are replaced rather than raising.
        text = ocr_path.read_text(encoding="utf-8", errors="replace").strip()
    except FileNotFoundError:
        # A missing OCR file is expected for some pages; the caller
        # substitutes a placeholder.
        return ""
    return text[:max_chars]
|
||
|
||
|
||
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder agent analyzing a page from a declassified US government document about Unidentified Flying Objects (UFO/UAP) investigations.
|
||
|
||
Document: {doc_title}
|
||
Page: {page_num} of {total_pages}
|
||
PNG file: /Users/guto/ufo/processing/png/{doc_id}/p-{png_num:03d}.png
|
||
|
||
OCR text (may be incomplete):
|
||
{ocr_text}
|
||
|
||
Use the Read tool to read the image at:
|
||
/Users/guto/ufo/processing/png/{doc_id}/p-{png_num:03d}.png
|
||
|
||
Analyze ALL visible content and return ONLY a JSON object (no markdown fences, no extra text):
|
||
{{
|
||
"page_number": {page_num},
|
||
"chunks": [
|
||
{{
|
||
"order_in_page": 1,
|
||
"type": "letterhead",
|
||
"content_en": "exact transcription or description in English",
|
||
"content_pt_br": "transcrição ou descrição em português brasileiro",
|
||
"bbox": {{"x": 0.00, "y": 0.00, "w": 1.00, "h": 0.10}},
|
||
"classification": null,
|
||
"formatting": [],
|
||
"cross_page_hint": "self_contained",
|
||
"ocr_confidence": 0.85,
|
||
"ocr_source_lines": [],
|
||
"redaction_code": null,
|
||
"redaction_inferred_content_type": null,
|
||
"image_type": null,
|
||
"ufo_anomaly_detected": false,
|
||
"ufo_anomaly_type": null,
|
||
"ufo_anomaly_rationale": null,
|
||
"cryptid_anomaly_detected": false,
|
||
"cryptid_anomaly_type": null,
|
||
"cryptid_anomaly_rationale": null
|
||
}}
|
||
]
|
||
}}
|
||
|
||
RULES:
|
||
- type must be ONE of: cover | letterhead | stamp | header | subheader | paragraph | redaction | signature | image | table_marker | footer | page_number | classification_marking | separator | handwriting | form_field | caption | list_item | annotation | blank | classification_banner | signature_block | redaction_block
|
||
- bbox: x,y = top-left corner fraction (0.0-1.0), w,h = width/height fractions (0.0-1.0)
|
||
- Split page into logical chunks (letterhead separate from body, stamps separate, etc.)
|
||
- For redacted blocks: type=redaction, redaction_code e.g. "(b)(1)", "(b)(3)", "(b)(6)"
|
||
- For photos/sketches/diagrams: type=image, set image_type to photo|diagram|sketch|map|chart|signature_block|stamp|seal|other
|
||
- cross_page_hint: self_contained | continues_to_next | continues_from_prev
|
||
- content_en: verbatim transcription when legible; describe otherwise e.g. "[Stamp: RECEIVED OCT 6 1947]"
|
||
- content_pt_br: Brazilian Portuguese translation/description
|
||
- ufo_anomaly_detected: true ONLY if page has image/sketch of anomalous aerial object
|
||
- Blank pages: one chunk with type=blank
|
||
- Return ONLY valid JSON, nothing else"""
|
||
|
||
|
||
def run_claude(prompt: str, timeout: int = CLAUDE_TIMEOUT) -> str:
    """Run the claude CLI once in print mode and return its stripped stdout.

    The function never raises for the failure modes it handles; failures
    are encoded in the return value so the caller can decide to retry:

    * ``""`` when the process exceeded ``timeout`` seconds;
    * a string starting with ``"ERROR:"`` when the process could not be
      started, or exited non-zero without writing anything to stdout.

    Args:
        prompt: Full prompt text, passed as the final CLI argument.
        timeout: Seconds to wait before the child process is abandoned.

    Returns:
        The CLI's stdout with surrounding whitespace removed, or one of the
        failure markers described above.
    """
    # NOTE(review): --dangerously-skip-permissions lets the CLI use tools
    # without confirmation; this is acceptable only because the prompt is
    # built locally from trusted files.
    cmd = [
        CLAUDE_BIN, "-p", "--dangerously-skip-permissions",
        "--model", "claude-haiku-4-5",
        "--no-session-persistence",
        prompt,
    ]
    try:
        # The child inherits the current environment by default, so no
        # explicit env copy is needed.
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return ""
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        return f"ERROR: {e}"

    stdout = result.stdout.strip()
    if result.returncode != 0 and not stdout:
        # Previously this case returned "" and was indistinguishable from a
        # timeout; keep the exit code and a bit of stderr for diagnosis.
        detail = (result.stderr or "").strip()[:200]
        return f"ERROR: exit code {result.returncode}: {detail}"
    return stdout
|
||
|
||
|
||
def parse_json(raw: str):
    """Extract the first JSON object embedded in ``raw``.

    The model sometimes wraps its answer in markdown fences or adds text
    before/after the object. Instead of counting braces by hand (which
    miscounts ``{`` / ``}`` that appear inside JSON string values), this
    uses ``json.JSONDecoder.raw_decode`` starting at each ``{`` in turn and
    returns the first object that decodes. Anything after the object, such
    as a closing fence, is ignored.

    Args:
        raw: Raw text returned by the CLI.

    Returns:
        The decoded ``dict``, or ``None`` when no JSON object can be decoded.
    """
    text = raw.strip()
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a valid object at this brace; try the next candidate.
            start = text.find("{", start + 1)
            continue
        return obj
    return None
|
||
|
||
|
||
def rebuild_page(page_num: int) -> dict:
    """Ask the claude CLI to chunk one page, with retries and a fallback.

    Up to three attempts are made. An attempt fails when the CLI returns
    nothing or an ``ERROR:`` marker, or when its output does not contain a
    JSON object whose ``"chunks"`` value is a list of objects. On success
    every chunk is stamped with its 1-based ``order_in_page`` and ``page``.

    Args:
        page_num: 1-based page number; the PNG/OCR index is ``page_num - 1``.

    Returns:
        A dict with ``page_number``, ``png_num`` and ``chunks``. When all
        attempts fail, ``chunks`` holds a single ``blank`` placeholder chunk
        whose text says that processing failed.
    """
    png_num = page_num - 1  # 0-indexed
    ocr_text = load_ocr(page_num)
    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_num=page_num,
        total_pages=TOTAL_PAGES,
        doc_id=DOC_ID,
        png_num=png_num,
        ocr_text=ocr_text or "(no OCR available)",
    )

    max_attempts = 3
    for attempt in range(max_attempts):
        is_last = attempt == max_attempts - 1
        raw = run_claude(prompt, timeout=CLAUDE_TIMEOUT)

        if not raw or raw.startswith("ERROR:"):
            if is_last:
                # Log the reason instead of falling through silently.
                print(f" [FAIL] page {page_num}: {raw[:200] or 'no output'}", flush=True)
                break
            time.sleep(5 * (attempt + 1))  # linear back-off: 5s, 10s
            continue

        data = parse_json(raw)
        chunks = data.get("chunks") if isinstance(data, dict) else None
        # Reject malformed shapes here; otherwise the stamping loop below
        # would raise inside the worker thread and abort the whole run.
        if isinstance(chunks, list) and all(isinstance(c, dict) for c in chunks):
            data["page_number"] = page_num
            data["png_num"] = png_num
            for i, ch in enumerate(chunks):
                ch["order_in_page"] = i + 1
                ch["page"] = page_num
            print(f" [OK] page {page_num:03d} → {len(data['chunks'])} chunks", flush=True)
            return data

        if not is_last:
            print(f" [RETRY {attempt+1}] page {page_num}: bad JSON", flush=True)
            time.sleep(3)
        else:
            print(f" [FAIL] page {page_num}: {raw[:200]}", flush=True)

    # Fallback
    return {
        "page_number": page_num, "png_num": page_num - 1,
        "chunks": [{
            "order_in_page": 1, "type": "blank", "page": page_num,
            "content_en": "[Page processing failed]",
            "content_pt_br": "[Falha no processamento da página]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None, "formatting": [],
            "cross_page_hint": "self_contained", "ocr_confidence": 0.0,
            "ocr_source_lines": [], "redaction_code": None,
            "redaction_inferred_content_type": None, "image_type": None,
            "ufo_anomaly_detected": False, "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None, "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
        }],
    }
|
||
|
||
|
||
def yv(v):
    """Render a value as a single-line YAML scalar for chunk frontmatter.

    ``None`` becomes ``null`` and booleans become ``true`` / ``false``.
    Everything else is converted with ``str()`` and double-quoted when it
    would otherwise be ambiguous or invalid as a plain scalar. Quoted values
    have backslashes, double quotes, newlines, carriage returns and tabs
    escaped, so the result always stays on one line.

    Strings (only) are additionally quoted when they are empty, have
    surrounding whitespace, start with a quote character, or would read back
    as another type (``null``, ``true``, ``false``, ``~`` or a number).

    Args:
        v: Value to render.

    Returns:
        The YAML scalar text.
    """
    if v is None:
        return "null"
    if isinstance(v, bool):
        return str(v).lower()

    s = str(v)
    special = (':', '{', '}', '[', ']', ',', '\n', '\r', '#', '&', '*', '?',
               '|', '<', '>', '=', '!', '%', '@', '`')
    needs_quotes = any(c in s for c in special)

    if isinstance(v, str) and not needs_quotes:
        looks_numeric = s.lstrip("+-").replace(".", "", 1).isdigit()
        needs_quotes = (
            not s
            or s != s.strip()
            or s[0] in "\"'"
            or s.lower() in ("null", "true", "false", "~")
            or looks_numeric
        )

    if not needs_quotes:
        return s

    # Backslash must be escaped first so later escapes are not doubled.
    escaped = (
        s.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )
    return f'"{escaped}"'
|
||
|
||
|
||
def write_chunk_file(chunk: dict):
    """Write one chunk as ``<chunk_id>.md`` (YAML frontmatter + EN/PT-BR body).

    The file is written into ``CHUNKS_DIR``. Chunk dicts come from model
    output, so fields may be missing or ``null``; such values fall back to
    the same defaults used for missing keys instead of crashing or being
    written as the text ``None``.

    Args:
        chunk: Chunk dict. ``chunk_id`` is required; every other key is
            optional.
    """
    chunk_id = chunk["chunk_id"]
    page = chunk.get("page", 1)
    png_num = chunk.get("png_num", page - 1)
    ctype = chunk.get("type") or "paragraph"

    raw_bbox = chunk.get("bbox")
    if not isinstance(raw_bbox, dict):
        raw_bbox = {}

    def bbox_field(key: str, default: float) -> float:
        # Non-numeric or null coordinates fall back to the default.
        try:
            return float(raw_bbox.get(key, default))
        except (TypeError, ValueError):
            return default

    bx, by = bbox_field("x", 0.0), bbox_field("y", 0.0)
    bw, bh = bbox_field("w", 1.0), bbox_field("h", 0.1)

    def as_list(value) -> list:
        # A bare scalar is treated as a one-element list.
        if not value:
            return []
        return list(value) if isinstance(value, (list, tuple)) else [value]

    def as_flag(value) -> str:
        # The model may send booleans as strings; "false" must stay false.
        if isinstance(value, str):
            return "true" if value.strip().lower() == "true" else "false"
        return "true" if value else "false"

    # json.dumps gives the same '"item"' form as before, plus escaping.
    fmt_str = "[" + ", ".join(
        json.dumps(str(f), ensure_ascii=False) for f in as_list(chunk.get("formatting"))
    ) + "]"
    ocr_lines_str = "[" + ", ".join(
        str(l) for l in as_list(chunk.get("ocr_source_lines"))
    ) + "]"
    related_image = f"IMG-{chunk_id}.png" if ctype == "image" else "null"

    order_in_page = chunk.get("order_in_page", 1)
    order_global = chunk.get("order_global", 1)
    cross_page_hint = chunk.get("cross_page_hint") or "self_contained"
    prev_chunk = chunk.get("prev_chunk") or "null"
    next_chunk = chunk.get("next_chunk") or "null"
    ocr_confidence = chunk.get("ocr_confidence", 0.85)
    if ocr_confidence is None:
        ocr_confidence = "null"
    content_en = chunk.get("content_en") or ""
    content_pt_br = chunk.get("content_pt_br") or ""

    content = f"""---
chunk_id: {chunk_id}
type: {ctype}
page: {page}
order_in_page: {order_in_page}
order_global: {order_global}
bbox: {{x: {bx:.3f}, y: {by:.3f}, w: {bw:.3f}, h: {bh:.3f}}}
classification: {yv(chunk.get("classification"))}
formatting: {fmt_str}
cross_page_hint: {cross_page_hint}
prev_chunk: {prev_chunk}
next_chunk: {next_chunk}
related_image: {related_image}
related_table: null
ocr_confidence: {ocr_confidence}
ocr_source_lines: {ocr_lines_str}
redaction_code: {yv(chunk.get("redaction_code"))}
redaction_inferred_content_type: {yv(chunk.get("redaction_inferred_content_type"))}
image_type: {yv(chunk.get("image_type"))}
ufo_anomaly_detected: {as_flag(chunk.get("ufo_anomaly_detected", False))}
cryptid_anomaly_detected: {as_flag(chunk.get("cryptid_anomaly_detected", False))}
ufo_anomaly_type: {yv(chunk.get("ufo_anomaly_type"))}
ufo_anomaly_rationale: {yv(chunk.get("ufo_anomaly_rationale"))}
cryptid_anomaly_type: {yv(chunk.get("cryptid_anomaly_type"))}
cryptid_anomaly_rationale: {yv(chunk.get("cryptid_anomaly_rationale"))}
image_description_en: {yv(chunk.get("image_description_en"))}
image_description_pt_br: {yv(chunk.get("image_description_pt_br"))}
extracted_text: {yv(chunk.get("extracted_text"))}
source_png: ../../processing/png/{DOC_ID}/p-{png_num:03d}.png
---

**EN:** {content_en}

**PT-BR:** {content_pt_br}
"""
    (CHUNKS_DIR / f"{chunk_id}.md").write_text(content, encoding="utf-8")
|
||
|
||
|
||
def crop_image(chunk: dict):
    """Crop an image chunk's bounding box out of its page PNG.

    The crop is saved as ``IMG-<chunk_id>.png`` in ``IMAGES_DIR``. Cropping
    is best-effort: nothing happens when Pillow is not installed or the page
    PNG is missing, and any error while cropping is logged, not raised.

    Args:
        chunk: Chunk dict with ``chunk_id`` and, optionally, ``png_num`` /
            ``page`` and a fractional ``bbox`` (``x``, ``y``, ``w``, ``h``
            in the 0.0-1.0 range).
    """
    chunk_id = chunk["chunk_id"]
    png_num = chunk.get("png_num", chunk.get("page", 1) - 1)
    bbox = chunk.get("bbox")
    if not isinstance(bbox, dict):
        # Missing or null bbox: same default as a missing key.
        bbox = {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}
    src = PNG_DIR / f"p-{png_num:03d}.png"
    dst = IMAGES_DIR / f"IMG-{chunk_id}.png"
    if not PILLOW_OK or not src.exists():
        return
    try:
        # The context manager closes the source file even when cropping
        # fails; the original left the image open.
        with PILImage.open(src) as im:
            W, H = im.size
            # Clamp the box to the page and enforce a minimum 1% extent.
            x = max(0.0, min(1.0, bbox.get("x", 0.0)))
            y = max(0.0, min(1.0, bbox.get("y", 0.0)))
            w = max(0.01, min(1.0 - x, bbox.get("w", 1.0)))
            h = max(0.01, min(1.0 - y, bbox.get("h", 0.1)))
            pad = 0.005  # small margin around the box, as a page fraction
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            im.crop((left, top, right, bottom)).save(str(dst))
        print(f" [CROP] {chunk_id}", flush=True)
    except Exception as e:
        # Deliberately broad: a bad crop must not stop the rebuild.
        print(f" [CROP FAIL] {chunk_id}: {e}", flush=True)
|
||
|
||
|
||
def parse_frontmatter(path: Path) -> dict:
    """Read a chunk ``.md`` file back into a chunk dict.

    The frontmatter is parsed line by line (``key: value``), not with a YAML
    library. Values are converted by ``_parse_frontmatter_scalar``. The
    ``bbox`` line is parsed into a dict of floats, looking only inside the
    frontmatter so that text in the body cannot be mistaken for it. The body
    is split into ``content_en`` and ``content_pt_br`` using the ``**EN:**``
    and ``**PT-BR:**`` markers.

    Args:
        path: Path of the chunk markdown file.

    Returns:
        The parsed dict, or ``{}`` when the file has no frontmatter block.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    if not text.startswith("---"):
        return {}
    end = text.find("\n---\n", 3)
    if end == -1:
        return {}
    fm_text = text[3:end]

    data = {}
    for line in fm_text.split("\n"):
        m = re.match(r'^(\w+):\s*(.*)', line)
        if not m:
            continue
        data[m.group(1)] = _parse_frontmatter_scalar(m.group(2).strip())

    # Parse bbox specially (frontmatter only, never the body text)
    bbox_m = re.search(
        r'bbox:\s*\{x:\s*([\d.]+),\s*y:\s*([\d.]+),\s*w:\s*([\d.]+),\s*h:\s*([\d.]+)\}',
        fm_text,
    )
    if bbox_m:
        data["bbox"] = {
            "x": float(bbox_m.group(1)),
            "y": float(bbox_m.group(2)),
            "w": float(bbox_m.group(3)),
            "h": float(bbox_m.group(4)),
        }

    # Extract body content
    body = text[end + 5:].strip()
    en_m = re.search(r'\*\*EN:\*\*\s*(.*?)(?=\n\n\*\*PT-BR:|$)', body, re.DOTALL)
    ptbr_m = re.search(r'\*\*PT-BR:\*\*\s*(.*?)$', body, re.DOTALL)
    data["content_en"] = en_m.group(1).strip() if en_m else ""
    data["content_pt_br"] = ptbr_m.group(1).strip() if ptbr_m else ""
    return data


def _parse_frontmatter_scalar(val: str):
    """Convert one frontmatter value string to a Python value.

    Handles ``null``, ``true`` / ``false``, integers, plain decimals,
    inline lists such as ``["bold"]`` or ``[1, 2]``, and double-quoted
    strings. Anything else is returned unchanged as a string.
    """
    if val == "null":
        return None
    if val == "true":
        return True
    if val == "false":
        return False
    try:
        return int(val)
    except ValueError:
        pass
    # Plain decimals only, so words like "nan" or "inf" stay strings.
    if re.fullmatch(r'-?\d+\.\d+', val):
        return float(val)
    if val.startswith("[") and val.endswith("]"):
        try:
            parsed = json.loads(val)
        except ValueError:
            return val
        return parsed if isinstance(parsed, list) else val
    if val.startswith('"') and val.endswith('"'):
        # Decode escapes when the value is a well-formed quoted string;
        # otherwise (e.g. unescaped inner quotes) just strip the outer pair.
        try:
            decoded = json.loads(val)
        except ValueError:
            return val[1:-1]
        return decoded if isinstance(decoded, str) else val[1:-1]
    return val
|
||
|
||
|
||
def build_assembly(all_chunks: list, build_at: str):
    """Write ``_index.json`` and ``document.md`` from the full chunk list.

    Both files are written into ``OUT_DIR``. ``_index.json`` holds one
    summary entry per chunk (with an 80-character English preview).
    ``document.md`` holds a YAML header followed by the chunks grouped by
    page and ordered by ``order_in_page``. Image chunks embed their cropped
    image when the file exists under ``IMAGES_DIR``.

    Args:
        all_chunks: Chunk dicts in global order; each must have ``chunk_id``.
        build_at: Timestamp string recorded in both outputs.

    Returns:
        A tuple ``(images_extracted, ufo_flagged, cryptid_flagged)``: the
        number of chunks of type ``image`` and the chunk ids flagged as UFO
        and cryptid anomalies.
    """
    default_bbox = {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}

    def bbox_of(chunk: dict) -> dict:
        # Chunks read back from disk may carry a null or unparsed (string)
        # bbox; use the default box instead of failing on .get().
        bbox = chunk.get("bbox")
        return bbox if isinstance(bbox, dict) else dict(default_bbox)

    type_histogram = {}
    for chunk in all_chunks:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1

    ufo_flagged = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_flagged = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]
    images_extracted = sum(1 for c in all_chunks if c.get("type") == "image")

    # _index.json
    index_chunks = []
    for chunk in all_chunks:
        content_en = chunk.get("content_en") or ""
        preview = content_en[:80] + ("..." if len(content_en) > 80 else "")
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": bbox_of(chunk),
            "preview": preview,
        })

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    print(f" Wrote _index.json ({len(all_chunks)} chunks)", flush=True)

    # document.md
    histogram_yaml = "\n".join(f" {k}: {v}" for k, v in sorted(type_histogram.items()))

    def list_yaml(items):
        return " []" if not items else "\n".join(f" - {i}" for i in items)

    doc_parts = [f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {TOTAL_PAGES}
total_chunks: {len(all_chunks)}
chunk_types_histogram:
{histogram_yaml}
multi_page_tables: []
ufo_anomalies_flagged:
{list_yaml(ufo_flagged)}
cryptid_anomalies_flagged:
{list_yaml(cryptid_flagged)}
build_approach: "subagents"
build_model: "claude-haiku-4-5"
build_at: "{build_at}"
---
"""]

    chunks_by_page: dict = {}
    for chunk in all_chunks:
        p = chunk.get("page", 1)
        chunks_by_page.setdefault(p, []).append(chunk)

    meta_keys = [
        "chunk_id", "type", "page", "order_in_page", "order_global",
        "bbox", "classification", "formatting", "cross_page_hint",
        "prev_chunk", "next_chunk", "ocr_confidence", "redaction_code",
        "image_type", "ufo_anomaly_detected", "cryptid_anomaly_detected",
        "ufo_anomaly_type", "ufo_anomaly_rationale",
    ]

    for page_seq in sorted(chunks_by_page.keys()):
        png_num = page_seq - 1
        doc_parts.append(f"\n## Page {page_seq} (source: p-{png_num:03d}.png)\n")
        for chunk in sorted(chunks_by_page[page_seq], key=lambda c: c.get("order_in_page", 1)):
            chunk_id = chunk["chunk_id"]
            ctype = chunk.get("type", "paragraph")
            bbox = bbox_of(chunk)
            bbox_str = f"{bbox.get('x',0):.2f}/{bbox.get('y',0):.2f}/{bbox.get('w',1):.2f}/{bbox.get('h',0.1):.2f}"

            doc_parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->\n")
            doc_parts.append(f'<a id="{chunk_id}"></a>\n')
            doc_parts.append(f"### Chunk {chunk_id} — {ctype} · p{page_seq} · bbox: {bbox_str}\n\n")
            doc_parts.append(f"**EN:** {chunk.get('content_en', '')}\n\n")
            doc_parts.append(f"**PT-BR:** {chunk.get('content_pt_br', '')}\n\n")

            if ctype == "image":
                # Embed the crop when it exists; the original emitted only
                # blank lines here, so the image was never referenced.
                image_name = f"IMG-{chunk_id}.png"
                if (IMAGES_DIR / image_name).exists():
                    doc_parts.append(f"![{chunk_id}](./images/{image_name})\n\n")
                else:
                    doc_parts.append("\n\n")
                if chunk.get("image_description_en"):
                    doc_parts.append(f"**Image Description (EN):** {chunk['image_description_en']}\n\n")
                if chunk.get("image_description_pt_br"):
                    doc_parts.append(f"**Descrição da Imagem (PT-BR):** {chunk['image_description_pt_br']}\n\n")

            meta = {k: chunk.get(k) for k in meta_keys}
            meta_json = json.dumps(meta, indent=2, ensure_ascii=False)
            doc_parts.append(
                f"<details><summary>metadata</summary>\n\n```json\n{meta_json}\n```\n\n</details>\n\n---\n\n"
            )

    doc_md = "".join(doc_parts)
    (OUT_DIR / "document.md").write_text(doc_md, encoding="utf-8")
    print(f" Wrote document.md ({len(doc_md):,} chars)", flush=True)

    return images_extracted, ufo_flagged, cryptid_flagged
|
||
|
||
|
||
def main():
    """Run the resume rebuild: process the missing pages, then reassemble.

    Phase A sends pages ``START_PAGE``..``TOTAL_PAGES`` to the claude CLI in
    batches of ``BATCH_SIZE``, numbers the resulting chunks from
    ``FIRST_CHUNK_NUM``, crops image chunks and writes the chunk files.
    Phase B reads every chunk file in ``CHUNKS_DIR``, re-links prev/next in
    memory and rebuilds ``_index.json`` and ``document.md``. A one-line
    summary is printed at the end.
    """
    t_start = time.time()
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    print(f"=== Phase A: Process pages {START_PAGE}-{TOTAL_PAGES} via claude CLI ===", flush=True)

    pages_to_process = list(range(START_PAGE, TOTAL_PAGES + 1))
    new_page_results: dict = {}

    for batch_start in range(0, len(pages_to_process), BATCH_SIZE):
        batch = pages_to_process[batch_start:batch_start + BATCH_SIZE]
        print(f" Batch pages {batch[0]}-{batch[-1]}...", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as ex:
            futures = {ex.submit(rebuild_page, p): p for p in batch}
            for fut in concurrent.futures.as_completed(futures):
                page = futures[fut]
                try:
                    result = fut.result()
                except Exception as exc:
                    # Nothing is written to disk until Phase A4, so one
                    # failing worker must not discard every other page.
                    print(f" [ERROR] page {page}: {exc}", flush=True)
                    continue
                new_page_results[result["page_number"]] = result

    # Assign global chunk IDs (continuing from c0204)
    print(f"\n=== Phase A2: Numbering new chunks from c{FIRST_CHUNK_NUM:04d} ===", flush=True)
    new_chunks = []
    order_global = FIRST_CHUNK_NUM - 1
    for page_num in sorted(new_page_results.keys()):
        result = new_page_results[page_num]
        png_num = result.get("png_num", page_num - 1)
        for ch in sorted(result.get("chunks", []), key=lambda c: c.get("order_in_page", 0)):
            order_global += 1
            ch["chunk_id"] = f"c{order_global:04d}"
            ch["order_global"] = order_global
            ch["png_num"] = png_num
            new_chunks.append(ch)

    # prev/next links for the files written in Phase A4. The first new chunk
    # points back at the last pre-existing chunk when that file is present,
    # because Phase B re-links only in memory and never rewrites the files.
    previous_id = f"c{FIRST_CHUNK_NUM - 1:04d}"
    has_previous = FIRST_CHUNK_NUM > 1 and (CHUNKS_DIR / f"{previous_id}.md").exists()
    for i, ch in enumerate(new_chunks):
        if i > 0:
            ch["prev_chunk"] = new_chunks[i-1]["chunk_id"]
        else:
            ch["prev_chunk"] = previous_id if has_previous else None
        ch["next_chunk"] = new_chunks[i+1]["chunk_id"] if i < len(new_chunks)-1 else None

    print(f" {len(new_chunks)} new chunks generated", flush=True)

    # Crop images
    image_chunks = [c for c in new_chunks if c.get("type") == "image"]
    if image_chunks:
        print(f"\n=== Phase A3: Cropping {len(image_chunks)} images ===", flush=True)
        for ch in image_chunks:
            crop_image(ch)

    # Write new chunk files
    print(f"\n=== Phase A4: Writing {len(new_chunks)} new chunk files ===", flush=True)
    for ch in new_chunks:
        write_chunk_file(ch)

    # ── Phase B: Read ALL chunks and rebuild assembly ──────────────────────
    print("\n=== Phase B: Reading all chunk files for full assembly ===", flush=True)

    all_chunk_files = sorted(CHUNKS_DIR.glob("c*.md"))
    print(f" Found {len(all_chunk_files)} total chunk files", flush=True)

    new_ids = {ch["chunk_id"] for ch in new_chunks}

    def chunk_number(stem: str) -> int:
        # "c0205" -> 205; -1 for names that do not follow the pattern.
        digits = stem[1:]
        return int(digits) if digits.isdigit() else -1

    all_chunks = []
    for path in all_chunk_files:
        # A file numbered in the "new" range that this run did not write is
        # left over from an earlier run; merging it would duplicate pages.
        # It is skipped here, not deleted.
        if path.stem not in new_ids and chunk_number(path.stem) >= FIRST_CHUNK_NUM:
            print(f" [SKIP] stale chunk file {path.name}", flush=True)
            continue
        fm = parse_frontmatter(path)
        if not fm.get("chunk_id"):
            fm["chunk_id"] = path.stem
        all_chunks.append(fm)

    # Sort by order_global
    all_chunks.sort(key=lambda c: (c.get("order_global", 999999), c.get("page", 0), c.get("order_in_page", 0)))

    # Re-link prev/next globally
    for i, ch in enumerate(all_chunks):
        ch["prev_chunk"] = all_chunks[i-1]["chunk_id"] if i > 0 else None
        ch["next_chunk"] = all_chunks[i+1]["chunk_id"] if i < len(all_chunks)-1 else None

    print(f" Total chunks: {len(all_chunks)}", flush=True)

    print("\n=== Phase B2: Building _index.json and document.md ===", flush=True)
    build_at = datetime.now(timezone.utc).isoformat()
    images_extracted, ufo_flagged, cryptid_flagged = build_assembly(all_chunks, build_at)

    t_end = time.time()
    wall_seconds = int(t_end - t_start)

    # Count pages that actually have chunks instead of assuming all of them.
    pages_done = len({c.get("page") for c in all_chunks if c.get("page") is not None})
    chunks_total = len(all_chunks)
    tables_stitched = 0

    final = (
        f"pages_done={pages_done}, chunks_total={chunks_total}, "
        f"images_extracted={images_extracted}, tables_stitched={tables_stitched}, "
        f"ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, "
        f"wall_seconds={wall_seconds}"
    )
    print(f"\n=== DONE ===\n{final}", flush=True)


if __name__ == "__main__":
    main()
|