disclosure-bureau/scripts/rebuild_doc65_serial130_resume.py

553 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_serial130_resume.py
Resume rebuild for doc-65-hs1-834228961-62-hq-83894-serial-130.
Pages 1-50 already processed (chunks c0001-c0204 exist).
This script:
Phase A: Process pages 51-91 via claude CLI → write c0205+
Phase B: Read ALL chunk files → rebuild _index.json + document.md
"""
import os
import sys
import json
import time
import subprocess
import concurrent.futures
import re
from datetime import datetime, timezone
from pathlib import Path
try:
from PIL import Image as PILImage
PILLOW_OK = True
except ImportError:
PILLOW_OK = False
# ── Config ──────────────────────────────────────────────────────────────────
# Everything below is hard-coded for a single document and a single set of
# absolute paths; edit these constants to retarget the script.

# Identifier of the document; also the directory name under each root below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-serial-130"
# Human-readable title, interpolated into the prompt and into document.md.
DOC_TITLE = "HQ Air Defense Command Unidentified Flying Objects Reports (65-HS1-834228961 / 62-HQ-83894 Serial 130)"
# Page renders, named p-NNN.png with a 0-based NNN (page 1 -> p-000.png).
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
# Per-page OCR text, named p-NNN.txt with the same 0-based numbering.
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
# Output root: receives _index.json, document.md and the sub-directories below.
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"   # one <chunk_id>.md file per chunk
IMAGES_DIR = OUT_DIR / "images"   # IMG-<chunk_id>.png crops for image chunks
TABLES_DIR = OUT_DIR / "tables"   # created by main(); nothing in this script writes into it
# Absolute path of the claude CLI executable invoked by run_claude().
CLAUDE_BIN = "/Users/guto/.local/bin/claude"
# Number of pages in the document (1-based page numbers run 1..TOTAL_PAGES).
TOTAL_PAGES = 91
START_PAGE = 51 # first missing page
FIRST_CHUNK_NUM = 205 # c0205 onwards for new chunks
# Pages submitted per batch; main() also uses it as the thread-pool size.
BATCH_SIZE = 4
# Timeout passed to subprocess.run() for each claude CLI call (seconds).
CLAUDE_TIMEOUT = 150
# ── Helpers ──────────────────────────────────────────────────────────────────
def load_ocr(page_num: int) -> str:
    """Return the stored OCR text for a 1-based page, capped at 2000 chars.

    OCR files are numbered from zero, so page ``n`` maps to ``p-{n-1:03d}.txt``
    inside ``OCR_DIR``. Undecodable bytes are replaced rather than raising.
    A missing or whitespace-only file yields an empty string.
    """
    candidate = OCR_DIR / f"p-{page_num - 1:03d}.txt"
    if not candidate.exists():
        return ""
    stripped = candidate.read_text(encoding="utf-8", errors="replace").strip()
    # Slicing an empty string is still "", so no separate emptiness check.
    return stripped[:2000]
# Prompt template sent to the claude CLI for one page. It is filled in with
# str.format() by rebuild_page(), which supplies: doc_title, page_num,
# total_pages, doc_id, png_num and ocr_text. Literal JSON braces in the example
# are doubled ({{ and }}) so that str.format() leaves them as single braces.
# The per-chunk keys in the example are the ones write_chunk_file() reads.
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder agent analyzing a page from a declassified US government document about Unidentified Flying Objects (UFO/UAP) investigations.
Document: {doc_title}
Page: {page_num} of {total_pages}
PNG file: /Users/guto/ufo/processing/png/{doc_id}/p-{png_num:03d}.png
OCR text (may be incomplete):
{ocr_text}
Use the Read tool to read the image at:
/Users/guto/ufo/processing/png/{doc_id}/p-{png_num:03d}.png
Analyze ALL visible content and return ONLY a JSON object (no markdown fences, no extra text):
{{
"page_number": {page_num},
"chunks": [
{{
"order_in_page": 1,
"type": "letterhead",
"content_en": "exact transcription or description in English",
"content_pt_br": "transcrição ou descrição em português brasileiro",
"bbox": {{"x": 0.00, "y": 0.00, "w": 1.00, "h": 0.10}},
"classification": null,
"formatting": [],
"cross_page_hint": "self_contained",
"ocr_confidence": 0.85,
"ocr_source_lines": [],
"redaction_code": null,
"redaction_inferred_content_type": null,
"image_type": null,
"ufo_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_detected": false,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null
}}
]
}}
RULES:
- type must be ONE of: cover | letterhead | stamp | header | subheader | paragraph | redaction | signature | image | table_marker | footer | page_number | classification_marking | separator | handwriting | form_field | caption | list_item | annotation | blank | classification_banner | signature_block | redaction_block
- bbox: x,y = top-left corner fraction (0.0-1.0), w,h = width/height fractions (0.0-1.0)
- Split page into logical chunks (letterhead separate from body, stamps separate, etc.)
- For redacted blocks: type=redaction, redaction_code e.g. "(b)(1)", "(b)(3)", "(b)(6)"
- For photos/sketches/diagrams: type=image, set image_type to photo|diagram|sketch|map|chart|signature_block|stamp|seal|other
- cross_page_hint: self_contained | continues_to_next | continues_from_prev
- content_en: verbatim transcription when legible; describe otherwise e.g. "[Stamp: RECEIVED OCT 6 1947]"
- content_pt_br: Brazilian Portuguese translation/description
- ufo_anomaly_detected: true ONLY if page has image/sketch of anomalous aerial object
- Blank pages: one chunk with type=blank
- Return ONLY valid JSON, nothing else"""
def run_claude(prompt: str, timeout: int = CLAUDE_TIMEOUT) -> str:
    """Run the claude CLI once with ``prompt`` and return its trimmed stdout.

    Failures are reported through the return value, never raised:
    a timeout yields ``""`` and any other exception yields a string that
    starts with ``"ERROR: "``. The process exit status is not inspected.
    """
    command = [
        CLAUDE_BIN,
        "-p",
        "--dangerously-skip-permissions",
        "--model",
        "claude-haiku-4-5",
        "--no-session-persistence",
        prompt,
    ]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout,
            env=dict(os.environ),  # pass a copy of the current environment
        )
        output = completed.stdout.strip()
    except subprocess.TimeoutExpired:
        return ""
    except Exception as exc:
        return f"ERROR: {exc}"
    return output
def parse_json(raw: str):
    """Extract the first JSON object from a model response.

    A leading Markdown code fence (and its closing fence, when it is the last
    line) is removed first. Decoding then starts at the first ``{`` and stops
    at the end of that JSON value, so trailing commentary is ignored.

    A real JSON decoder is used rather than counting braces, because braces
    inside string values (e.g. transcribed text containing ``}``) would
    otherwise end the object early and make a valid response look malformed.

    Returns the decoded object, or ``None`` when there is no ``{`` or the text
    starting there is not valid JSON.
    """
    text = raw.strip()
    if text.startswith("```"):
        lines = text.split("\n")[1:]  # drop the opening fence line
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        text = "\n".join(lines).strip()

    start = text.find("{")
    if start == -1:
        return None
    try:
        # raw_decode parses exactly one JSON value beginning at `start` and
        # tolerates any text that follows it.
        obj, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return None
    return obj
def rebuild_page(page_num: int) -> dict:
    """Ask the claude CLI to chunk one page and return the normalised result.

    Builds the prompt from the page's OCR text, then makes up to three
    attempts. A response is accepted only when it decodes to a JSON object
    whose ``chunks`` value is a list with at least one object entry; anything
    else (empty output, ``ERROR:`` output, malformed or empty chunk list)
    counts as a failed attempt. Accepting only well-formed entries keeps a
    bad response from raising inside the worker thread.

    On success each chunk is renumbered (``order_in_page`` 1..n in response
    order) and tagged with ``page``. After three failures a single placeholder
    chunk of type ``blank`` is returned so the page is still represented.

    Returns a dict with ``page_number``, ``png_num`` (0-based) and ``chunks``.
    """
    max_attempts = 3
    png_num = page_num - 1  # PNG/OCR files are 0-indexed
    ocr_text = load_ocr(page_num)
    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_num=page_num,
        total_pages=TOTAL_PAGES,
        doc_id=DOC_ID,
        png_num=png_num,
        ocr_text=ocr_text or "(no OCR available)",
    )

    for attempt in range(max_attempts):
        is_last = attempt == max_attempts - 1
        raw = run_claude(prompt, timeout=CLAUDE_TIMEOUT)

        if not raw or raw.startswith("ERROR:"):
            if is_last:
                # Previously this path fell through to the fallback silently.
                reason = raw[:200] or "no output (timeout or empty response)"
                print(f" [FAIL] page {page_num}: {reason}", flush=True)
                break
            time.sleep(5 * (attempt + 1))  # linear back-off: 5s, then 10s
            continue

        data = parse_json(raw)
        candidates = data.get("chunks") if isinstance(data, dict) else None
        chunks = (
            [c for c in candidates if isinstance(c, dict)]
            if isinstance(candidates, list)
            else []
        )
        if chunks:
            for position, ch in enumerate(chunks, start=1):
                ch["order_in_page"] = position
                ch["page"] = page_num
            data["chunks"] = chunks
            data["page_number"] = page_num
            data["png_num"] = png_num
            print(f" [OK] page {page_num:03d}: {len(chunks)} chunks", flush=True)
            return data

        if is_last:
            print(f" [FAIL] page {page_num}: {raw[:200]}", flush=True)
        else:
            print(f" [RETRY {attempt+1}] page {page_num}: bad JSON", flush=True)
            time.sleep(3)

    # Fallback: one full-page placeholder chunk marking the failure.
    return {
        "page_number": page_num, "png_num": page_num - 1,
        "chunks": [{
            "order_in_page": 1, "type": "blank", "page": page_num,
            "content_en": "[Page processing failed]",
            "content_pt_br": "[Falha no processamento da página]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None, "formatting": [],
            "cross_page_hint": "self_contained", "ocr_confidence": 0.0,
            "ocr_source_lines": [], "redaction_code": None,
            "redaction_inferred_content_type": None, "image_type": None,
            "ufo_anomaly_detected": False, "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None, "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None
        }]
    }
def yv(v):
    """Render a Python value as a single-line YAML scalar for frontmatter.

    * ``None`` -> ``null``; booleans -> ``true`` / ``false``.
    * Anything else is converted with ``str()``. The text is emitted bare when
      that is unambiguous, otherwise as a double-quoted scalar.

    Quoting is applied when the text contains YAML-significant characters,
    quotes, backslashes or line breaks, when a string is empty or has
    surrounding whitespace, or when a string would otherwise read back as
    ``null`` / ``true`` / ``false``. Quoted text is escaped with
    ``json.dumps`` (JSON string escapes are valid inside YAML double quotes),
    so embedded ``"`` and newlines can no longer break the one-line-per-key
    frontmatter layout.
    """
    if v is None:
        return "null"
    if isinstance(v, bool):
        return str(v).lower()

    s = str(v)
    special_chars = (
        ':', '{', '}', '[', ']', ',', '\n', '#', '&', '*', '?', '|', '<', '>',
        '=', '!', '%', '@', '`', '"', "'", '\\', '\r', '\t',
    )
    reserved_words = ("null", "true", "false", "~")

    needs_quotes = any(c in s for c in special_chars)
    if isinstance(v, str):
        # Only genuine strings need protecting from being re-read as another type.
        needs_quotes = (
            needs_quotes
            or s == ""
            or s != s.strip()
            or s.lower() in reserved_words
        )

    if needs_quotes:
        return json.dumps(s, ensure_ascii=False)
    return s
def write_chunk_file(chunk: dict):
    """Write one chunk as ``CHUNKS_DIR/<chunk_id>.md`` (frontmatter + body).

    ``chunk`` must contain ``chunk_id``; every other field is optional and
    falls back to the defaults below. Because the dict usually comes straight
    from a model response, explicit ``null`` values and malformed fields are
    treated like missing ones instead of raising or being written as the
    literal text ``None``:

    * ``bbox`` that is not a dict, or has non-numeric members, uses the
      per-coordinate defaults (x=0, y=0, w=1, h=0.1);
    * ``formatting`` / ``ocr_source_lines`` that are not lists become ``[]``;
    * ``ocr_confidence`` of ``None`` becomes 0.85; anomaly flags of ``None``
      become ``false``; ``None`` content becomes an empty string.

    The body has a blank line between the EN and PT-BR paragraphs, which is
    the separator ``parse_frontmatter`` looks for when reading the file back.
    """
    chunk_id = chunk["chunk_id"]
    page = chunk.get("page", 1)
    png_num = chunk.get("png_num", page - 1)
    ctype = chunk.get("type", "paragraph")

    raw_bbox = chunk.get("bbox")
    if not isinstance(raw_bbox, dict):
        raw_bbox = {}

    def coord(key, default):
        # Coerce one bbox member to float, falling back on bad/missing data.
        try:
            return float(raw_bbox.get(key, default))
        except (TypeError, ValueError):
            return default

    bx, by = coord("x", 0.0), coord("y", 0.0)
    bw, bh = coord("w", 1.0), coord("h", 0.1)

    fmt_list = chunk.get("formatting")
    if not isinstance(fmt_list, list):
        fmt_list = []
    # json.dumps quotes and escapes each entry (plain words render as "word").
    fmt_str = "[" + ", ".join(json.dumps(str(f), ensure_ascii=False) for f in fmt_list) + "]"

    ocr_lines = chunk.get("ocr_source_lines")
    if not isinstance(ocr_lines, list):
        ocr_lines = []
    ocr_lines_str = "[" + ", ".join(str(line) for line in ocr_lines) + "]"

    related_image = f"IMG-{chunk_id}.png" if ctype == "image" else "null"

    ocr_confidence = chunk.get("ocr_confidence")
    if ocr_confidence is None:
        ocr_confidence = 0.85

    cross_page_hint = chunk.get("cross_page_hint") or "self_contained"
    ufo_flag = str(chunk.get("ufo_anomaly_detected") or False).lower()
    cryptid_flag = str(chunk.get("cryptid_anomaly_detected") or False).lower()
    content_en = chunk.get("content_en") or ""
    content_pt_br = chunk.get("content_pt_br") or ""

    content = f"""---
chunk_id: {chunk_id}
type: {ctype}
page: {page}
order_in_page: {chunk.get("order_in_page", 1)}
order_global: {chunk.get("order_global", 1)}
bbox: {{x: {bx:.3f}, y: {by:.3f}, w: {bw:.3f}, h: {bh:.3f}}}
classification: {yv(chunk.get("classification"))}
formatting: {fmt_str}
cross_page_hint: {cross_page_hint}
prev_chunk: {chunk.get("prev_chunk") or "null"}
next_chunk: {chunk.get("next_chunk") or "null"}
related_image: {related_image}
related_table: null
ocr_confidence: {ocr_confidence}
ocr_source_lines: {ocr_lines_str}
redaction_code: {yv(chunk.get("redaction_code"))}
redaction_inferred_content_type: {yv(chunk.get("redaction_inferred_content_type"))}
image_type: {yv(chunk.get("image_type"))}
ufo_anomaly_detected: {ufo_flag}
cryptid_anomaly_detected: {cryptid_flag}
ufo_anomaly_type: {yv(chunk.get("ufo_anomaly_type"))}
ufo_anomaly_rationale: {yv(chunk.get("ufo_anomaly_rationale"))}
cryptid_anomaly_type: {yv(chunk.get("cryptid_anomaly_type"))}
cryptid_anomaly_rationale: {yv(chunk.get("cryptid_anomaly_rationale"))}
image_description_en: {yv(chunk.get("image_description_en"))}
image_description_pt_br: {yv(chunk.get("image_description_pt_br"))}
extracted_text: {yv(chunk.get("extracted_text"))}
source_png: ../../processing/png/{DOC_ID}/p-{png_num:03d}.png
---
**EN:** {content_en}

**PT-BR:** {content_pt_br}
"""
    (CHUNKS_DIR / f"{chunk_id}.md").write_text(content, encoding="utf-8")
def crop_image(chunk: dict):
    """Crop an image chunk's bounding box out of its page PNG.

    Reads ``PNG_DIR/p-<png_num>.png`` and writes
    ``IMAGES_DIR/IMG-<chunk_id>.png``. The bbox is given as fractions of the
    page size; it is clamped to the page, forced to at least 1% in each
    dimension and padded by 0.5% on every side before converting to pixels.

    Best-effort: when Pillow is unavailable or the source PNG is missing the
    crop is skipped, and any error during cropping is caught; both cases are
    logged and nothing is raised. Returns ``None``.
    """
    chunk_id = chunk["chunk_id"]
    png_num = chunk.get("png_num", chunk.get("page", 1) - 1)
    bbox = chunk.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1})
    src = PNG_DIR / f"p-{png_num:03d}.png"
    dst = IMAGES_DIR / f"IMG-{chunk_id}.png"

    if not PILLOW_OK:
        print(f" [CROP SKIP] {chunk_id}: Pillow is not installed", flush=True)
        return
    if not src.exists():
        print(f" [CROP SKIP] {chunk_id}: missing source {src}", flush=True)
        return

    try:
        # The context manager closes the underlying file handle; without it
        # every cropped page stayed open until garbage collection.
        with PILImage.open(src) as im:
            W, H = im.size
            x = max(0.0, min(1.0, bbox.get("x", 0.0)))
            y = max(0.0, min(1.0, bbox.get("y", 0.0)))
            w = max(0.01, min(1.0 - x, bbox.get("w", 1.0)))
            h = max(0.01, min(1.0 - y, bbox.get("h", 0.1)))
            pad = 0.005
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            im.crop((left, top, right, bottom)).save(str(dst))
        print(f" [CROP] {chunk_id}", flush=True)
    except Exception as e:
        # Deliberately broad: a bad bbox or unreadable PNG must not stop the run.
        print(f" [CROP FAIL] {chunk_id}: {e}", flush=True)
def _coerce_frontmatter_value(val: str):
    """Convert one frontmatter value (text after ``key:``) to a Python value."""
    if val == "null":
        return None
    if val == "true":
        return True
    if val == "false":
        return False
    try:
        return int(val)
    except ValueError:
        pass
    # Plain decimals such as ocr_confidence; deliberately narrower than
    # float() so words like "nan" or "inf" stay strings.
    if re.fullmatch(r"-?\d+\.\d+", val):
        return float(val)
    # Inline lists (formatting, ocr_source_lines) are written in JSON syntax.
    if val.startswith("[") and val.endswith("]"):
        try:
            parsed = json.loads(val)
        except json.JSONDecodeError:
            return val
        return parsed if isinstance(parsed, list) else val
    if val.startswith('"') and val.endswith('"'):
        # Prefer real unescaping; fall back to stripping the outer quotes for
        # values that were quoted without escaping.
        try:
            parsed = json.loads(val)
        except json.JSONDecodeError:
            parsed = None
        return parsed if isinstance(parsed, str) else val[1:-1]
    return val


def parse_frontmatter(path: Path) -> dict:
    """Read a chunk ``.md`` file back into a dict.

    The file must start with ``---`` and contain a closing ``---`` line;
    otherwise ``{}`` is returned. Each ``key: value`` frontmatter line is
    converted as follows: ``null`` / ``true`` / ``false``, integers, plain
    decimals, JSON-style inline lists and double-quoted strings become the
    matching Python values; everything else stays a string.

    ``bbox`` is parsed into a dict of four floats. If the bbox line cannot be
    parsed the key is omitted, so callers fall back to their own default
    instead of receiving a raw string.

    ``content_en`` and ``content_pt_br`` are taken from the ``**EN:**`` and
    ``**PT-BR:**`` paragraphs of the body (empty string when absent). The two
    may be separated by one or more newlines.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    if not text.startswith("---"):
        return {}
    end = text.find("\n---\n", 3)
    if end == -1:
        return {}
    fm_text = text[3:end]

    data = {}
    for line in fm_text.split("\n"):
        m = re.match(r'^(\w+):\s*(.*)', line)
        if not m:
            continue
        data[m.group(1)] = _coerce_frontmatter_value(m.group(2).strip())

    # bbox is a flow mapping, so it needs its own parse. Search only the
    # frontmatter so body text can never be mistaken for it.
    bbox_m = re.search(
        r'bbox:\s*\{x:\s*(-?[\d.]+),\s*y:\s*(-?[\d.]+),\s*w:\s*(-?[\d.]+),\s*h:\s*(-?[\d.]+)\}',
        fm_text,
    )
    bbox = None
    if bbox_m:
        try:
            bbox = {
                "x": float(bbox_m.group(1)),
                "y": float(bbox_m.group(2)),
                "w": float(bbox_m.group(3)),
                "h": float(bbox_m.group(4)),
            }
        except ValueError:
            bbox = None
    if bbox is not None:
        data["bbox"] = bbox
    else:
        # Drop the raw string the generic loop stored for an unparsable bbox.
        data.pop("bbox", None)

    # Extract body content. len("\n---\n") == 5.
    body = text[end + 5:].strip()
    en_m = re.search(r'\*\*EN:\*\*\s*(.*?)(?=\n\s*\*\*PT-BR:|$)', body, re.DOTALL)
    ptbr_m = re.search(r'\*\*PT-BR:\*\*\s*(.*?)$', body, re.DOTALL)
    data["content_en"] = en_m.group(1).strip() if en_m else ""
    data["content_pt_br"] = ptbr_m.group(1).strip() if ptbr_m else ""
    return data
def build_assembly(all_chunks: list, build_at: str):
    """Regenerate ``_index.json`` and ``document.md`` inside ``OUT_DIR``.

    ``all_chunks`` is the full, already ordered list of chunk dicts and
    ``build_at`` is the timestamp string recorded in both outputs.

    Returns ``(images_extracted, ufo_flagged, cryptid_flagged)``: the number
    of chunks whose type is ``image`` and the chunk ids flagged for UFO and
    cryptid anomalies respectively.
    """
    # ── Summary statistics ────────────────────────────────────────────────
    kind_counts = {}
    for entry in all_chunks:
        kind = entry.get("type", "paragraph")
        kind_counts[kind] = 1 + kind_counts.get(kind, 0)

    ufo_flagged = []
    cryptid_flagged = []
    images_extracted = 0
    for entry in all_chunks:
        if entry.get("ufo_anomaly_detected"):
            ufo_flagged.append(entry["chunk_id"])
        if entry.get("cryptid_anomaly_detected"):
            cryptid_flagged.append(entry["chunk_id"])
        if entry.get("type") == "image":
            images_extracted += 1

    # ── _index.json ───────────────────────────────────────────────────────
    def index_entry(entry):
        # One manifest row; the preview is the first 80 chars of the EN text.
        region = entry.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1})
        text_en = entry.get("content_en", "")
        ellipsis = "..." if len(text_en) > 80 else ""
        ident = entry["chunk_id"]
        return {
            "chunk_id": ident,
            "type": entry.get("type", "paragraph"),
            "page": entry.get("page", 1),
            "order_in_page": entry.get("order_in_page", 1),
            "order_global": entry.get("order_global", 1),
            "file": f"chunks/{ident}.md",
            "bbox": region,
            "preview": text_en[:80] + ellipsis,
        }

    manifest = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": build_at,
        "chunks": [index_entry(entry) for entry in all_chunks],
    }
    manifest_text = json.dumps(manifest, indent=2, ensure_ascii=False)
    (OUT_DIR / "_index.json").write_text(manifest_text, encoding="utf-8")
    print(f" Wrote _index.json ({len(all_chunks)} chunks)", flush=True)

    # ── document.md ───────────────────────────────────────────────────────
    def as_yaml_list(values):
        # Empty lists are written inline; otherwise one "- item" per line.
        if not values:
            return " []"
        return "\n".join(f" - {item}" for item in values)

    histogram_block = "\n".join(
        f" {name}: {count}" for name, count in sorted(kind_counts.items())
    )
    ufo_block = as_yaml_list(ufo_flagged)
    cryptid_block = as_yaml_list(cryptid_flagged)

    header = f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {TOTAL_PAGES}
total_chunks: {len(all_chunks)}
chunk_types_histogram:
{histogram_block}
multi_page_tables: []
ufo_anomalies_flagged:
{ufo_block}
cryptid_anomalies_flagged:
{cryptid_block}
build_approach: "subagents"
build_model: "claude-haiku-4-5"
build_at: "{build_at}"
---
"""

    # Group chunks per page, preserving their incoming order within a page.
    pages: dict = {}
    for entry in all_chunks:
        pages.setdefault(entry.get("page", 1), []).append(entry)

    metadata_keys = (
        "chunk_id", "type", "page", "order_in_page", "order_global",
        "bbox", "classification", "formatting", "cross_page_hint",
        "prev_chunk", "next_chunk", "ocr_confidence", "redaction_code",
        "image_type", "ufo_anomaly_detected", "cryptid_anomaly_detected",
        "ufo_anomaly_type", "ufo_anomaly_rationale",
    )

    pieces = [header]
    for page_no in sorted(pages):
        # Source PNGs are numbered from zero.
        pieces.append(f"\n## Page {page_no} (source: p-{page_no - 1:03d}.png)\n")
        in_order = sorted(pages[page_no], key=lambda c: c.get("order_in_page", 1))
        for entry in in_order:
            ident = entry["chunk_id"]
            kind = entry.get("type", "paragraph")
            region = entry.get("bbox", {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1})
            region_text = "/".join((
                f"{region.get('x',0):.2f}",
                f"{region.get('y',0):.2f}",
                f"{region.get('w',1):.2f}",
                f"{region.get('h',0.1):.2f}",
            ))
            pieces.extend((
                f"<!-- chunk:{ident} src:./chunks/{ident}.md -->\n",
                f'<a id="{ident}"></a>\n',
                f"### Chunk {ident}{kind} · p{page_no} · bbox: {region_text}\n\n",
                f"**EN:** {entry.get('content_en', '')}\n\n",
                f"**PT-BR:** {entry.get('content_pt_br', '')}\n\n",
            ))
            if kind == "image":
                pieces.append(f"![{ident} image](./images/IMG-{ident}.png)\n\n")
                description_en = entry.get("image_description_en")
                if description_en:
                    pieces.append(f"**Image Description (EN):** {description_en}\n\n")
                description_pt = entry.get("image_description_pt_br")
                if description_pt:
                    pieces.append(f"**Descrição da Imagem (PT-BR):** {description_pt}\n\n")
            # Collapsible per-chunk metadata; absent keys are emitted as null.
            details = {key: entry.get(key) for key in metadata_keys}
            details_json = json.dumps(details, indent=2, ensure_ascii=False)
            pieces.append(
                f"<details><summary>metadata</summary>\n\n```json\n{details_json}\n```\n\n</details>\n\n---\n\n"
            )

    doc_md = "".join(pieces)
    (OUT_DIR / "document.md").write_text(doc_md, encoding="utf-8")
    print(f" Wrote document.md ({len(doc_md):,} chars)", flush=True)
    return images_extracted, ufo_flagged, cryptid_flagged
def main():
    """Run the resume rebuild end to end.

    Phase A processes pages ``START_PAGE..TOTAL_PAGES`` in batches of
    ``BATCH_SIZE`` concurrent claude CLI calls, numbers the resulting chunks
    from ``FIRST_CHUNK_NUM``, crops image chunks and writes one file per new
    chunk. Phase B re-reads every ``c*.md`` file in ``CHUNKS_DIR`` and
    rebuilds ``_index.json`` and ``document.md`` from them.

    A page whose worker raises is logged and skipped rather than aborting the
    run, since nothing is written to disk until all pages have been processed.
    """
    t_start = time.time()
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    print(f"=== Phase A: Process pages {START_PAGE}-{TOTAL_PAGES} via claude CLI ===", flush=True)
    pages_to_process = list(range(START_PAGE, TOTAL_PAGES + 1))
    new_page_results: dict = {}
    crashed_pages = []
    for batch_start in range(0, len(pages_to_process), BATCH_SIZE):
        batch = pages_to_process[batch_start:batch_start + BATCH_SIZE]
        print(f" Batch pages {batch[0]}-{batch[-1]}...", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as ex:
            futures = {ex.submit(rebuild_page, p): p for p in batch}
            for fut in concurrent.futures.as_completed(futures):
                page = futures[fut]
                try:
                    result = fut.result()
                except Exception as exc:
                    # Top-level boundary: one bad page must not discard the
                    # results of all the others.
                    print(f" [ERROR] page {page}: {exc!r}", flush=True)
                    crashed_pages.append(page)
                    continue
                new_page_results[result["page_number"]] = result

    # Pages that only produced rebuild_page's failure placeholder.
    placeholder_pages = sorted(
        p for p, r in new_page_results.items()
        if any(c.get("content_en") == "[Page processing failed]" for c in r.get("chunks", []))
    )
    if crashed_pages:
        print(f" [WARN] pages skipped after an unexpected error: {sorted(crashed_pages)}", flush=True)
    if placeholder_pages:
        print(f" [WARN] pages written as failure placeholders: {placeholder_pages}", flush=True)

    # Assign global chunk IDs (continuing from the last existing chunk)
    print(f"\n=== Phase A2: Numbering new chunks from c{FIRST_CHUNK_NUM:04d} ===", flush=True)
    new_chunks = []
    order_global = FIRST_CHUNK_NUM - 1
    for page_num in sorted(new_page_results.keys()):
        result = new_page_results[page_num]
        png_num = result.get("png_num", page_num - 1)
        for ch in sorted(result.get("chunks", []), key=lambda c: c.get("order_in_page", 0)):
            order_global += 1
            ch["chunk_id"] = f"c{order_global:04d}"
            ch["order_global"] = order_global
            ch["png_num"] = png_num
            new_chunks.append(ch)

    # prev/next links for the files written below. The first new chunk points
    # back at the last pre-existing chunk when that file is on disk, so the
    # chain is not broken at the resume boundary. The predecessor's own file
    # is not rewritten here.
    predecessor_id = f"c{FIRST_CHUNK_NUM - 1:04d}"
    has_predecessor = (CHUNKS_DIR / f"{predecessor_id}.md").exists()
    last_index = len(new_chunks) - 1
    for i, ch in enumerate(new_chunks):
        if i > 0:
            ch["prev_chunk"] = new_chunks[i - 1]["chunk_id"]
        else:
            ch["prev_chunk"] = predecessor_id if has_predecessor else None
        ch["next_chunk"] = new_chunks[i + 1]["chunk_id"] if i < last_index else None
    print(f" {len(new_chunks)} new chunks generated", flush=True)

    # Crop images
    image_chunks = [c for c in new_chunks if c.get("type") == "image"]
    if image_chunks:
        print(f"\n=== Phase A3: Cropping {len(image_chunks)} images ===", flush=True)
        for ch in image_chunks:
            crop_image(ch)

    # Write new chunk files
    print(f"\n=== Phase A4: Writing {len(new_chunks)} new chunk files ===", flush=True)
    for ch in new_chunks:
        write_chunk_file(ch)

    # ── Phase B: Read ALL chunks and rebuild assembly ──────────────────────
    print("\n=== Phase B: Reading all chunk files for full assembly ===", flush=True)
    all_chunk_files = sorted(CHUNKS_DIR.glob("c*.md"))
    print(f" Found {len(all_chunk_files)} total chunk files", flush=True)
    all_chunks = []
    for path in all_chunk_files:
        fm = parse_frontmatter(path)
        if not fm.get("chunk_id"):
            fm["chunk_id"] = path.stem
        all_chunks.append(fm)

    def global_order(c):
        # Non-integer values (e.g. null or text in a hand-edited file) sort as
        # if missing instead of raising on a mixed-type comparison.
        def as_int(value, default):
            return value if isinstance(value, int) else default
        return (
            as_int(c.get("order_global"), 999999),
            as_int(c.get("page"), 0),
            as_int(c.get("order_in_page"), 0),
        )

    all_chunks.sort(key=global_order)

    # Re-link prev/next globally (in memory, for document.md metadata)
    final_index = len(all_chunks) - 1
    for i, ch in enumerate(all_chunks):
        ch["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        ch["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < final_index else None
    print(f" Total chunks: {len(all_chunks)}", flush=True)

    print("\n=== Phase B2: Building _index.json and document.md ===", flush=True)
    build_at = datetime.now(timezone.utc).isoformat()
    images_extracted, ufo_flagged, cryptid_flagged = build_assembly(all_chunks, build_at)

    wall_seconds = int(time.time() - t_start)
    pages_done = TOTAL_PAGES
    chunks_total = len(all_chunks)
    tables_stitched = 0  # this script does no table stitching
    final = (
        f"pages_done={pages_done}, chunks_total={chunks_total}, "
        f"images_extracted={images_extracted}, tables_stitched={tables_stitched}, "
        f"ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, "
        f"wall_seconds={wall_seconds}"
    )
    print(f"\n=== DONE ===\n{final}", flush=True)
# Run the rebuild only when the file is executed as a script, so importing
# the module for its helpers does not start any processing.
if __name__ == "__main__":
    main()