disclosure-bureau/scripts/rebuild_doc255.py

633 lines
25 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rebuild doc-255-413270-ufo-s-and-defense-what-should-we-prepare-for
into structured chunk files, _index.json, and document.md.
Uses `claude -p --model haiku` subprocess calls (OAuth via Max plan).
"""
import json
import os
import random
import re
import subprocess
import sys
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
# Identifier of the document being rebuilt; also used as the name of its
# input (PNG) and output directories.
DOC_ID = "doc-255-413270-ufo-s-and-defense-what-should-we-prepare-for"
# Human-readable title; inserted into the page prompt and the document.md frontmatter.
DOC_TITLE = "UFO's and Defense: What Should We Prepare For?"
# Input directory: one PNG per page, named p-NNN.png (zero-padded page number).
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
# Output root: receives _index.json and document.md.
OUT_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
# Per-chunk markdown files (<chunk_id>.md).
CHUNKS_DIR = OUT_DIR / "chunks"
# Cropped image regions (IMG-<chunk_id>.png).
IMAGES_DIR = OUT_DIR / "images"
# Created by main(); nothing in the visible code writes into it.
TABLES_DIR = OUT_DIR / "tables"
# Model alias passed to `claude --model`.
MODEL = "haiku"
# Pages are numbered 1..TOTAL_PAGES.
TOTAL_PAGES = 93
# Thread-pool size, and also the number of pages/images submitted per batch.
WORKERS = 4
# Default subprocess timeout used by call_claude().
TIMEOUT = 240 # seconds per page call
# Serializes console output so lines printed by worker threads do not interleave.
_print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print while holding the module-wide print lock.

    Accepts the same positional and keyword arguments as the built-in
    ``print``. ``flush`` defaults to ``True`` so progress lines appear
    immediately, but a caller may override it explicitly.
    """
    # setdefault (rather than passing flush=True alongside **kwargs) avoids a
    # "multiple values for keyword argument 'flush'" TypeError.
    kwargs.setdefault("flush", True)
    with _print_lock:
        print(*args, **kwargs)
# Prompt template for the per-page extraction call. It is filled with
# str.format() using doc_title, page_number, total_pages, doc_id and png_path,
# which is why the literal braces of the JSON example are doubled ({{ }}).
PAGE_REBUILDER_PROMPT = '''You are a page-rebuilder agent. Analyze the scanned document page image and extract all content into structured chunks.
Document: {doc_title}
Page: {page_number} of {total_pages}
Doc ID: {doc_id}
STEP 1: Use the Read tool to view this PNG image:
{png_path}
STEP 2: Analyze every element on the page carefully.
STEP 3: Return ONE JSON object only (no markdown fence, no commentary):
{{
"page_number": {page_number},
"chunks": [
{{
"order_in_page": 1,
"type": "paragraph",
"content_en": "verbatim English text from page",
"content_pt_br": "tradução em português brasileiro",
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
"classification": null,
"formatting": [],
"cross_page_hint": "self_contained",
"ocr_confidence": 0.85,
"ocr_source_lines": [],
"redaction_code": null,
"redaction_inferred_content_type": null,
"image_type": null,
"ufo_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_detected": false,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null,
"image_description_en": null,
"image_description_pt_br": null,
"extracted_text": null
}}
]
}}
CHUNK TYPES (use exactly one):
- letterhead: document header/letterhead
- classification_marking: classification marking (TOP SECRET, CUI, etc.)
- date_line: date field
- address_block: TO:/FROM:/distribution fields
- heading: section/chapter/subject heading
- paragraph: body text paragraph
- numbered_item: numbered list item
- bulleted_item: bullet list item
- table_marker: table content
- image: photograph, diagram, chart, sketch, map, graph
- caption: figure/image caption
- footer: page footer
- page_number: standalone page number
- signature: signature/signatory block
- redaction: blacked-out/redacted area
- stamp: official stamp or seal
- handwriting: handwritten annotation
- blank_area: empty area
- form_field: form field with label and value
- unknown: unidentifiable element
RULES:
1. Split content into logical chunks (one concept per chunk). A typical page has 3-15 chunks.
2. For image chunks: describe what you see in content_en and set image_type.
3. image_type options: photograph, diagram, chart, sketch, map, graph, seal, signature_image, other
4. bbox: normalized coordinates 0.0-1.0 (x=left, y=top, w=width, h=height)
5. content_en: verbatim text if text chunk; visual description if image chunk
6. content_pt_br: Brazilian Portuguese translation (NOT European Portuguese)
7. classification: null or the marking text (e.g. "CUI", "UNCLASSIFIED")
8. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
9. formatting: array from ["bold", "italic", "all_caps", "underline"]
10. If page is completely blank: ONE chunk of type "blank_area"
11. Preserve French text verbatim (document may contain French)
12. For redaction chunks: set redaction_code if visible (e.g. "(b)(1)")
13. ufo_anomaly_detected: true ONLY for image chunks showing actual UAP/anomalous phenomena
Output ONLY the JSON object. No preamble. No fence. No commentary.'''
# Prompt template for the per-image analysis call. It is filled with
# str.format() using image_path; the literal braces of the JSON example are
# doubled ({{ }}) so they survive formatting.
IMAGE_ANALYST_PROMPT = '''You are an image analyst for a UAP/UFO declassified document.
STEP 1: Use the Read tool to view this cropped image:
{image_path}
STEP 2: Analyze it carefully.
STEP 3: Return ONE JSON object only (no markdown fence):
{{
"image_description_en": "detailed English description",
"image_description_pt_br": "descrição detalhada em português brasileiro",
"image_type": "photograph",
"extracted_text": null,
"ufo_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_detected": false,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null
}}
image_type options: photograph, diagram, chart, sketch, map, graph, seal, signature_image, other
Set ufo_anomaly_detected=true only if the image shows an actual UAP/UFO or anomalous aerial phenomenon.
Set cryptid_anomaly_detected=true only if the image shows a cryptid or unknown creature.
extracted_text: any text visible inside the image (verbatim), or null.
Output ONLY the JSON object.'''
def extract_json(text: str) -> dict:
    """Extract the first JSON object embedded in claude CLI output.

    A leading/trailing markdown code fence is stripped if present, then the
    object starting at the first ``{`` is decoded. Any text before that brace
    or after the end of the object is ignored.

    Args:
        text: Raw model output that should contain one JSON object.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the text contains no ``{``, or the text starting at the
            first ``{`` is not a complete, valid JSON object.
    """
    text = text.strip()
    # Strip markdown fences if present
    if text.startswith("```"):
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```\s*$", "", text.rstrip())
    start = text.find("{")
    if start == -1:
        raise ValueError(f"No JSON found in: {text[:200]}")
    # Let the JSON parser find the end of the object. Counting braces by hand
    # breaks as soon as a string value contains "{" or "}", which is common in
    # verbatim page text.
    try:
        obj, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError as err:
        raise ValueError(f"Invalid or unclosed JSON in response: {err}") from err
    return obj
def call_claude(prompt: str, allowed_tools: str = "Read", timeout: int = TIMEOUT) -> str:
    """Run one ``claude -p`` CLI call and return its result text.

    The CLI is invoked with ``--output-format json`` and its stdout is parsed
    as a JSON envelope; the envelope's ``result`` field is returned.

    Args:
        prompt: Full prompt text, passed after ``--`` as a single argument.
        allowed_tools: Value for ``--allowedTools``.
        timeout: Seconds before the subprocess is killed.

    Returns:
        The ``result`` field as a string; an empty string when it is missing
        or null, so callers always receive text.

    Raises:
        subprocess.TimeoutExpired: If the CLI does not finish within *timeout*.
        RuntimeError: If the CLI exits non-zero, prints an envelope that is
            not a JSON object, or reports ``is_error``.
        json.JSONDecodeError: If stdout is not valid JSON.
    """
    cmd = [
        "claude", "-p",
        "--model", MODEL,
        "--output-format", "json",
        "--max-turns", "5",
        "--allowedTools", allowed_tools,
        "--add-dir", str(PNG_DIR),
        "--add-dir", str(IMAGES_DIR),
        "--",
        prompt,
    ]
    # The child inherits this process's environment by default, so no explicit
    # env copy is needed.
    res = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=timeout,
        check=False,
    )
    if res.returncode != 0:
        raise RuntimeError(f"claude CLI rc={res.returncode}: {res.stderr[-1000:]}")
    cli = json.loads(res.stdout)
    if not isinstance(cli, dict):
        raise RuntimeError(f"claude CLI returned unexpected JSON: {res.stdout[:200]}")
    # Normalize a missing/null/non-string result so that slicing below and
    # text handling in callers cannot fail with a TypeError/AttributeError.
    result = cli.get("result")
    if result is None:
        result_text = ""
    elif isinstance(result, str):
        result_text = result
    else:
        result_text = str(result)
    if cli.get("is_error"):
        raise RuntimeError(f"claude error: {result_text[:500]}")
    return result_text
def _placeholder_page(page_num: int, chunk_type: str, content_en: str, content_pt_br: str) -> dict:
    """Build a one-chunk, full-page result for a page that could not be extracted."""
    return {
        "page_number": page_num,
        "chunks": [{
            "order_in_page": 1,
            "type": chunk_type,
            "content_en": content_en,
            "content_pt_br": content_pt_br,
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None, "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0, "ocr_source_lines": [],
            "redaction_code": None, "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
            "image_description_en": None, "image_description_pt_br": None, "extracted_text": None
        }]
    }


def process_page(page_num: int) -> dict:
    """Extract the chunks of one page via the claude CLI.

    Args:
        page_num: 1-based page number; the page image is ``p-NNN.png`` in
            ``PNG_DIR``.

    Returns:
        A dict with ``page_number`` and a non-empty ``chunks`` list of dicts.
        When the PNG is missing, a single ``blank_area`` placeholder chunk is
        returned; when every attempt fails, a single ``unknown`` placeholder
        chunk is returned. Both placeholders carry ``ocr_confidence`` 0.0.
    """
    png_path = PNG_DIR / f"p-{page_num:03d}.png"
    if not png_path.exists():
        safe_print(f" Page {page_num}: PNG missing — placeholder")
        return _placeholder_page(
            page_num,
            "blank_area",
            f"[Page {page_num} — PNG not available]",
            f"[Página {page_num} — PNG não disponível]",
        )
    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_num,
        total_pages=TOTAL_PAGES,
        doc_id=DOC_ID,
        png_path=str(png_path),
    )
    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            result_text = call_claude(prompt, allowed_tools="Read")
            data = extract_json(result_text)
            data["page_number"] = page_num
            # Validate chunks exist and are objects: later stages call
            # chunk.get(...) on every item, so a stray string or null here
            # would otherwise crash the run after all pages were processed.
            chunks = data.get("chunks")
            if not isinstance(chunks, list) or not chunks:
                raise ValueError("No chunks in response")
            if not all(isinstance(c, dict) for c in chunks):
                raise ValueError("Non-object chunk in response")
            safe_print(f" Page {page_num}: {len(chunks)} chunks")
            return data
        except subprocess.TimeoutExpired:
            safe_print(f" Page {page_num}: timeout attempt {attempt}/{max_retries}")
            delay = 10 * attempt
        except (RuntimeError, ValueError, json.JSONDecodeError, AttributeError, TypeError) as e:
            # AttributeError/TypeError cover malformed payloads (e.g. a null
            # result); they are retried like any other bad response.
            safe_print(f" Page {page_num}: error attempt {attempt}/{max_retries}: {str(e)[:100]}")
            delay = 5 * attempt + random.uniform(0, 3)
        # Back off only when another attempt follows.
        if attempt < max_retries:
            time.sleep(delay)
    # Return fallback
    safe_print(f" Page {page_num}: FALLBACK after {max_retries} attempts")
    return _placeholder_page(
        page_num,
        "unknown",
        f"[Page {page_num} — content extraction failed after {max_retries} attempts]",
        f"[Página {page_num} — extração de conteúdo falhou após {max_retries} tentativas]",
    )
def crop_image(chunk_id: str, png_path: Path, bbox: dict) -> "Path | None":
    """Crop a chunk's bounding box out of its page PNG.

    Args:
        chunk_id: Chunk identifier; the crop is saved as
            ``IMG-<chunk_id>.png`` in ``IMAGES_DIR``.
        png_path: Path of the full-page PNG.
        bbox: Mapping with ``x``, ``y``, ``w``, ``h`` as fractions of the page
            size. Missing keys default to x=0, y=0, w=1, h=0.1.

    Returns:
        The path of the saved crop, or ``None`` when the box is degenerate or
        any error occurs (the error is printed, not raised).
    """
    from PIL import Image
    cropped_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        # Context manager guarantees the page file handle is released on the
        # degenerate-bbox and error paths as well as on success.
        with Image.open(png_path) as im:
            W, H = im.size
            x = max(0.0, float(bbox.get("x", 0)))
            y = max(0.0, float(bbox.get("y", 0)))
            w = max(0.01, float(bbox.get("w", 1)))
            h = max(0.01, float(bbox.get("h", 0.1)))
            # Small margin around the box, clamped to the page edges.
            pad = 0.005
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                safe_print(f" Crop {chunk_id}: degenerate bbox {bbox}")
                return None
            cropped = im.crop((left, top, right, bottom))
            cropped.save(str(cropped_path))
        safe_print(f" Cropped {chunk_id}: {left},{top},{right},{bottom} from {W}x{H}")
        return cropped_path
    except Exception as e:
        # Best-effort: a failed crop must not abort the whole rebuild.
        safe_print(f" Crop {chunk_id}: error: {e}")
        return None
def analyze_image(chunk_id: str, cropped_path: Path) -> dict:
    """Ask the claude CLI to describe one cropped image.

    Args:
        chunk_id: Chunk identifier, used only in progress messages.
        cropped_path: Path of the cropped PNG; may be falsy or point to a
            file that does not exist.

    Returns:
        The decoded analysis object, or an empty dict when there is no image
        to analyze or both attempts fail.
    """
    if not (cropped_path and cropped_path.exists()):
        return {}
    attempts_allowed = 2
    request = IMAGE_ANALYST_PROMPT.format(image_path=str(cropped_path))
    attempt = 0
    while attempt < attempts_allowed:
        attempt += 1
        try:
            analysis = extract_json(call_claude(request, allowed_tools="Read", timeout=120))
            flagged = analysis.get("ufo_anomaly_detected", False)
            safe_print(f" Image {chunk_id}: analyzed (ufo={flagged})")
            return analysis
        except Exception as err:
            safe_print(f" Image {chunk_id}: error attempt {attempt}: {str(err)[:80]}")
            # Pause only if another attempt follows.
            if attempt != attempts_allowed:
                time.sleep(5)
    return {}
def write_chunk_file(chunk: dict) -> None:
    """Write one chunk as ``<chunk_id>.md`` in ``CHUNKS_DIR``.

    The file consists of a ``---``-delimited frontmatter block followed by the
    English and Brazilian-Portuguese content lines.

    Args:
        chunk: Chunk dict; ``chunk_id`` is required, every other field falls
            back to a default when missing.

    Raises:
        KeyError: If ``chunk`` has no ``chunk_id``.
    """
    chunk_id = chunk["chunk_id"]
    chunk_path = CHUNKS_DIR / f"{chunk_id}.md"
    bbox = chunk.get("bbox") or {"x": 0, "y": 0, "w": 1, "h": 0.1}
    page_num = chunk.get("page", 1)
    source_png = f"../../processing/png/{DOC_ID}/p-{page_num:03d}.png"
    # Default to 0.85 only when the value is absent. A plain `or 0.85` would
    # also replace the deliberate 0.0 carried by placeholder/failed pages.
    raw_confidence = chunk.get("ocr_confidence")
    ocr_confidence = 0.85 if raw_confidence is None else float(raw_confidence)
    # A null content field must render as empty text, not the string "None".
    content_en = chunk.get("content_en") or ""
    content_pt_br = chunk.get("content_pt_br") or ""
    content = f"""---
chunk_id: {chunk_id}
type: {chunk.get("type", "paragraph")}
page: {page_num}
order_in_page: {chunk.get("order_in_page", 1)}
order_global: {chunk.get("order_global", 1)}
bbox: {{x: {float(bbox.get('x') or 0):.2f}, y: {float(bbox.get('y') or 0):.2f}, w: {float(bbox.get('w') or 1):.2f}, h: {float(bbox.get('h') or 0.1):.2f}}}
classification: {json.dumps(chunk.get("classification"))}
formatting: {json.dumps(chunk.get("formatting", []))}
cross_page_hint: {chunk.get("cross_page_hint", "self_contained")}
prev_chunk: {json.dumps(chunk.get("prev_chunk"))}
next_chunk: {json.dumps(chunk.get("next_chunk"))}
related_image: {json.dumps(chunk.get("related_image"))}
related_table: null
ocr_confidence: {ocr_confidence}
ocr_source_lines: {json.dumps(chunk.get("ocr_source_lines", []))}
redaction_code: {json.dumps(chunk.get("redaction_code"))}
redaction_inferred_content_type: {json.dumps(chunk.get("redaction_inferred_content_type"))}
image_type: {json.dumps(chunk.get("image_type"))}
ufo_anomaly_detected: {str(bool(chunk.get("ufo_anomaly_detected", False))).lower()}
cryptid_anomaly_detected: {str(bool(chunk.get("cryptid_anomaly_detected", False))).lower()}
ufo_anomaly_type: {json.dumps(chunk.get("ufo_anomaly_type"))}
ufo_anomaly_rationale: {json.dumps(chunk.get("ufo_anomaly_rationale"))}
cryptid_anomaly_type: {json.dumps(chunk.get("cryptid_anomaly_type"))}
cryptid_anomaly_rationale: {json.dumps(chunk.get("cryptid_anomaly_rationale"))}
image_description_en: {json.dumps(chunk.get("image_description_en"))}
image_description_pt_br: {json.dumps(chunk.get("image_description_pt_br"))}
extracted_text: {json.dumps(chunk.get("extracted_text"))}
source_png: {source_png}
---
**EN:** {content_en}
**PT-BR:** {content_pt_br}
"""
    chunk_path.write_text(content, encoding="utf-8")
def _failed_page(page_num: int) -> dict:
    """Build the one-chunk placeholder recorded when a page worker raises."""
    return {
        "page_number": page_num,
        "chunks": [{
            "order_in_page": 1,
            "type": "unknown",
            "content_en": f"[Page {page_num} — critical failure]",
            "content_pt_br": f"[Página {page_num} — falha crítica]",
            "bbox": {"x": 0, "y": 0, "w": 1, "h": 1},
            "classification": None, "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0, "ocr_source_lines": [],
            "redaction_code": None, "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False, "ufo_anomaly_type": None, "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False, "cryptid_anomaly_type": None, "cryptid_anomaly_rationale": None,
            "image_description_en": None, "image_description_pt_br": None, "extracted_text": None
        }]
    }


def _batches(items: list, size: int):
    """Yield consecutive slices of *items* holding at most *size* elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def _order_key(chunk: dict) -> float:
    """Return a chunk's in-page order as a float; unusable values sort as 1."""
    try:
        return float(chunk.get("order_in_page", 1))
    except (TypeError, ValueError):
        return 1.0


def _chunk_type(chunk: dict) -> str:
    """Return the chunk's type as a string, defaulting to "paragraph"."""
    return str(chunk.get("type") or "paragraph")


def _process_all_pages(page_numbers: list) -> dict:
    """Run process_page over all pages, WORKERS at a time; return {page: data}."""
    all_page_data = {}
    total_batches = (len(page_numbers) + WORKERS - 1) // WORKERS
    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
        for batch_num, batch in enumerate(_batches(page_numbers, WORKERS), start=1):
            safe_print(f"\nBatch {batch_num}/{total_batches}: pages {batch}")
            futures = {executor.submit(process_page, p): p for p in batch}
            # Draining the batch before submitting the next keeps the
            # original batch-by-batch pacing.
            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    all_page_data[page_num] = future.result()
                except Exception as e:
                    safe_print(f" Page {page_num}: CRITICAL FAILURE: {e}")
                    all_page_data[page_num] = _failed_page(page_num)
    return all_page_data


def _assign_chunk_ids(all_page_data: dict) -> list:
    """Flatten pages into one ordered chunk list with ids and prev/next links."""
    all_chunks = []
    for page_num in sorted(all_page_data):
        # Drop anything that is not an object; every later stage calls .get().
        page_chunks = [c for c in (all_page_data[page_num].get("chunks") or [])
                       if isinstance(c, dict)]
        # A null or non-numeric order_in_page must not abort the sort.
        page_chunks.sort(key=_order_key)
        for chunk in page_chunks:
            order_global = len(all_chunks) + 1
            chunk["chunk_id"] = f"c{order_global:04d}"
            chunk["page"] = page_num
            chunk["order_global"] = order_global
            all_chunks.append(chunk)
    last = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last else None
    return all_chunks


def _crop_and_analyze_images(all_chunks: list) -> None:
    """Crop every image chunk, analyze the crops, and merge results in place."""
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"\nCropping {len(image_chunks)} images...")
    # Crop all images first
    crop_results = {}
    for chunk in image_chunks:
        chunk_id = chunk["chunk_id"]
        png_path = PNG_DIR / f"p-{chunk['page']:03d}.png"
        if png_path.exists():
            crop_results[chunk_id] = crop_image(chunk_id, png_path, chunk.get("bbox", {}))
        else:
            crop_results[chunk_id] = None
    # Analyze only the crops that were actually produced.
    image_items = [(cid, cp) for cid, cp in crop_results.items() if cp]
    print(f"\nAnalyzing {len(image_items)} cropped images...")
    image_analysis = {}
    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
        for batch in _batches(image_items, WORKERS):
            futures = {executor.submit(analyze_image, cid, cp): cid for cid, cp in batch}
            for future in as_completed(futures):
                chunk_id = futures[future]
                try:
                    image_analysis[chunk_id] = future.result()
                except Exception as e:
                    safe_print(f" Image analysis {chunk_id}: {e}")
                    image_analysis[chunk_id] = {}
    # Merge image analysis into chunks
    merged_fields = ["image_description_en", "image_description_pt_br", "image_type",
                     "extracted_text", "ufo_anomaly_detected", "ufo_anomaly_type",
                     "ufo_anomaly_rationale", "cryptid_anomaly_detected",
                     "cryptid_anomaly_type", "cryptid_anomaly_rationale"]
    for chunk in image_chunks:
        chunk_id = chunk["chunk_id"]
        # Reference the crop only when it was written, so chunk files never
        # point at an image that does not exist.
        if crop_results.get(chunk_id):
            chunk["related_image"] = f"IMG-{chunk_id}.png"
        analysis = image_analysis.get(chunk_id, {})
        for field in merged_fields:
            if field in analysis:
                chunk[field] = analysis[field]


def _build_index(all_chunks: list, now_iso: str) -> dict:
    """Build the _index.json payload: document metadata plus one entry per chunk."""
    index_chunks = []
    for chunk in all_chunks:
        bbox = chunk.get("bbox") or {"x": 0, "y": 0, "w": 1, "h": 0.1}
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": bbox,
            # `or ""` guards against a null content_en, which cannot be sliced.
            "preview": (chunk.get("content_en") or "")[:80]
        })
    return {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-haiku-4-5",
        "build_at": now_iso,
        "chunks": index_chunks
    }


def _collect_stats(all_chunks: list) -> tuple:
    """Return (type histogram, ufo-flagged ids, cryptid-flagged ids, image count)."""
    chunk_types = {}
    ufo_anomalies = []
    cryptid_anomalies = []
    images_count = 0
    for chunk in all_chunks:
        t = _chunk_type(chunk)
        chunk_types[t] = chunk_types.get(t, 0) + 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_anomalies.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_anomalies.append(chunk["chunk_id"])
        if t == "image":
            images_count += 1
    return chunk_types, ufo_anomalies, cryptid_anomalies, images_count


def _assemble_document(all_chunks: list, chunk_types: dict, ufo_anomalies: list,
                       cryptid_anomalies: list, now_iso: str) -> str:
    """Render document.md: frontmatter, then every chunk grouped under page headings."""
    parts = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        # json.dumps yields the same double-quoted text for the current title
        # and stays valid if a title ever contains quotes or backslashes.
        f"canonical_title: {json.dumps(DOC_TITLE, ensure_ascii=False)}",
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {len(all_chunks)}",
        "chunk_types_histogram:",
    ]
    parts.extend(f" {t}: {count}" for t, count in sorted(chunk_types.items()))
    parts.extend([
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_anomalies)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomalies)}",
        'build_approach: "subagents"',
        "build_model: claude-haiku-4-5",
        f"build_at: {now_iso}",
        "---",
        "",
    ])
    current_page = None
    for chunk in all_chunks:
        page = chunk.get("page", 1)
        if page != current_page:
            current_page = page
            parts.append(f"\n## Page {page}\n")
        chunk_id = chunk["chunk_id"]
        bbox = chunk.get("bbox") or {"x": 0, "y": 0, "w": 1, "h": 0.1}
        bbox_str = f"{float(bbox.get('x') or 0):.2f}/{float(bbox.get('y') or 0):.2f}/{float(bbox.get('w') or 1):.2f}/{float(bbox.get('h') or 0.1):.2f}"
        ctype = _chunk_type(chunk)
        parts.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
        parts.append(f'<a id="{chunk_id}"></a>')
        # Separator between id and type was missing ("c0001paragraph").
        parts.append(f"### Chunk {chunk_id} · {ctype} · p{page} · bbox: {bbox_str}")
        parts.append("")
        parts.append(f"**EN:** {chunk.get('content_en') or ''}")
        parts.append("")
        parts.append(f"**PT-BR:** {chunk.get('content_pt_br') or ''}")
        parts.append("")
        if ctype == "image":
            img_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
            if img_path.exists():
                parts.append(f"![chunk image](./images/IMG-{chunk_id}.png)")
                parts.append("")
            if chunk.get("image_description_en"):
                parts.append(f"*{chunk['image_description_en']}*")
                parts.append("")
        # Metadata block
        meta = {k: v for k, v in chunk.items() if k not in ["content_en", "content_pt_br"]}
        parts.append("<details><summary>metadata</summary>")
        parts.append("")
        parts.append("```json")
        parts.append(json.dumps(meta, ensure_ascii=False, indent=2))
        parts.append("```")
        parts.append("")
        parts.append("</details>")
        parts.append("")
        parts.append("---")
        parts.append("")
    return "\n".join(parts)


def main():
    """Rebuild the document end to end.

    Steps: extract chunks for every page, assign global chunk ids and
    prev/next links, crop and analyze image chunks, then write the per-chunk
    files, ``_index.json`` and ``document.md`` and print a summary line.
    """
    start_time = time.time()
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)
    print(f"Rebuilding {DOC_ID}")
    print(f"Processing {TOTAL_PAGES} pages with {WORKERS} workers...")
    print("=" * 70)
    page_numbers = list(range(1, TOTAL_PAGES + 1))
    all_page_data = _process_all_pages(page_numbers)
    print("\nAll pages processed. Assigning global chunk IDs...")
    all_chunks = _assign_chunk_ids(all_page_data)
    total_chunks = len(all_chunks)
    print(f"Total chunks: {total_chunks}")
    _crop_and_analyze_images(all_chunks)
    # Write chunk files
    print(f"\nWriting {total_chunks} chunk files...")
    for chunk in all_chunks:
        write_chunk_file(chunk)
    print("Chunk files written.")
    # Build _index.json
    now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    index_data = _build_index(all_chunks, now_iso)
    (OUT_DIR / "_index.json").write_text(json.dumps(index_data, ensure_ascii=False, indent=2), encoding="utf-8")
    print("_index.json written.")
    chunk_types, ufo_anomalies, cryptid_anomalies, images_count = _collect_stats(all_chunks)
    # Assemble document.md
    print("\nAssembling document.md...")
    document_md = _assemble_document(all_chunks, chunk_types, ufo_anomalies, cryptid_anomalies, now_iso)
    doc_path = OUT_DIR / "document.md"
    doc_path.write_text(document_md, encoding="utf-8")
    doc_md_bytes = len(document_md.encode("utf-8"))
    print(f"document.md written ({doc_md_bytes:,} bytes)")
    wall_seconds = int(time.time() - start_time)
    print(f"\n{'='*70}")
    print(f"pages_done={TOTAL_PAGES}, chunks_total={total_chunks}, images_extracted={images_count}, tables_stitched=0, ufo_anomalies={len(ufo_anomalies)}, cryptid_anomalies={len(cryptid_anomalies)}, wall_seconds={wall_seconds}")
# Run the rebuild only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()