disclosure-bureau/scripts/rebuild_doc65_v2.py

272 lines
9.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Rebuild doc-65-hs1-834228961-62-hq-83894-section-1
Uses claude CLI (OAuth, Max plan) via subprocess — no direct API key needed.
Processes pages 1-150 in sequential batches of 5, with up to 4 pages of a batch in flight at once.
"""
from __future__ import annotations
import base64
import json
import os
import re
import subprocess
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
# Identifier of the document being rebuilt; it also names the per-document
# working directories below, so they are derived from it instead of repeating
# the literal three times.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-1"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 1 (FBI UAP/UFO Investigative File)"

# Input page images (p-NNN.png), optional OCR hints (p-NNN.txt) and the
# directory that receives _pages_raw.json.
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
OUTPUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID

TOTAL_PAGES = 150  # pages are numbered 1..TOTAL_PAGES
MAX_WORKERS = 4    # thread-pool size, i.e. concurrent claude CLI processes
TIMEOUT = 180      # seconds allowed for a single claude CLI invocation
RETRIES = 3        # attempts per page before an error placeholder is emitted
# Serializes console output so lines printed by worker threads do not interleave.
_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print while holding the module-wide lock, flushing by default.

    Accepts exactly the arguments of the built-in ``print``. ``flush``
    defaults to True but an explicit ``flush=...`` from the caller is
    honoured (passing it used to raise ``TypeError`` because the keyword was
    supplied twice).
    """
    kwargs.setdefault("flush", True)
    with _lock:
        print(*args, **kwargs)
def load_ocr(page_num: int) -> str:
    """Return the OCR hint text for a page, or a parenthesised placeholder.

    Reads ``OCR_DIR / "p-NNN.txt"`` as UTF-8, strips surrounding whitespace
    and keeps at most the first 3000 characters.

    Returns:
        The (possibly truncated) text, ``"(empty)"`` when the file holds only
        whitespace, ``"(not found)"`` when the file does not exist, or
        ``"(unreadable)"`` when it exists but cannot be read or decoded.
    """
    txt_path = OCR_DIR / f"p-{page_num:03d}.txt"
    # Read directly instead of exists()-then-read: one filesystem access, and
    # only the failures a read can actually produce are treated as expected.
    try:
        content = txt_path.read_text(encoding="utf-8").strip()
    except (FileNotFoundError, NotADirectoryError):
        return "(not found)"
    except (OSError, UnicodeError):
        return "(unreadable)"
    return content[:3000] if content else "(empty)"
def extract_json(text: str) -> dict:
    """Extract the first JSON object embedded in *text*.

    Parsing starts at the first ``{`` and stops at the end of that object, so
    markdown code fences, a leading preamble and any trailing text are all
    ignored. The real JSON parser finds the end of the object, which means
    braces that appear inside string values (e.g. ``{"a": "}"}``) no longer
    confuse the extraction the way manual brace counting did.

    Raises:
        ValueError: if *text* contains no ``{`` at all.
        json.JSONDecodeError: (a ``ValueError`` subclass) if the text from the
            first ``{`` onwards is not a complete, valid JSON object.
    """
    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON object found")
    # raw_decode parses exactly one JSON value beginning at `start` and
    # tolerates whatever follows it.
    obj, _end = json.JSONDecoder().raw_decode(text, start)
    return obj
# Prompt sent to the claude CLI for every page. It is a str.format template:
# {doc_title}, {page_number}, {total_pages}, {png_path} and {ocr_text} are
# substituted, while the doubled braces ({{ }}) render as the literal braces
# of the JSON skeleton. Rule 2 previously read "0.01.0", a garbled range; it
# now states the intended normalized interval explicitly.
PAGE_PROMPT_TEMPLATE = """You are a page-rebuilder for a declassified UAP/UFO government document.
Document: {doc_title}
Page: {page_number} of {total_pages}
STEP 1: Use the Read tool to view this image: {png_path}
STEP 2: Analyze the page carefully and extract ALL content as structured chunks.
STEP 3: Output ONLY a valid JSON object (no markdown, no code fences, no preamble):
{{
"page_number": {page_number},
"chunks": [
{{
"order_in_page": 1,
"type": "paragraph",
"content_en": "verbatim text or description in English",
"content_pt_br": "tradução em português brasileiro",
"bbox": {{"x": 0.05, "y": 0.10, "w": 0.90, "h": 0.05}},
"classification": null,
"formatting": [],
"cross_page_hint": "self_contained",
"ocr_confidence": 0.85,
"ocr_source_lines": [],
"redaction_code": null,
"redaction_inferred_content_type": null,
"image_type": null,
"ufo_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_detected": false,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null,
"image_description_en": null,
"image_description_pt_br": null,
"extracted_text": null
}}
]
}}
ALLOWED chunk types (use only these exact strings):
letterhead, classification_banner, header, subheader, paragraph, list_item,
caption, footnote, page_number, signature_block, stamp, redaction_block,
image, table_marker, form_field, watermark, separator, blank
RULES:
1. Extract EVERY visible element — no skipping
2. bbox: normalized 0.0-1.0 (x=left, y=top, w=width, h=height)
3. content_en: verbatim OCR text for text elements; description for images
4. content_pt_br: Brazilian Portuguese (NOT European) translation
5. Preserve UTF-8 accents: ç, ã, á, é, í, ó, ú, â, ê, ô, à
6. Redacted sections: type="redaction_block", content_en="[REDACTED]", content_pt_br="[REDACTADO]"
7. Images/photos: type="image", fill image_description_en and image_description_pt_br
8. classification: visible marking text (e.g. "SECRET", "UNCLASSIFIED") or null
9. formatting: subset of ["bold","italic","underline","all_caps","handwritten","typewritten","strikethrough"]
10. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
11. ufo_anomaly_detected: true if chunk contains UAP sighting data, coordinates, witness accounts
12. Blank page: one chunk type="blank"
13. Order chunks top-to-bottom, left-to-right
14. Return ONLY the JSON — no text before or after
OCR hint (may be empty):
{ocr_text}
"""
def _run_claude(prompt: str) -> dict:
    """Run one ``claude -p`` invocation and return the page JSON it produced.

    Raises:
        subprocess.TimeoutExpired: the CLI ran longer than ``TIMEOUT`` seconds.
        RuntimeError: the CLI exited non-zero or reported ``is_error``.
        ValueError: stdout, or the model's answer inside it, is not the
            expected JSON shape (``json.JSONDecodeError`` is a subclass).
    """
    cmd = [
        "claude", "-p",
        "--model", "haiku",
        "--output-format", "json",
        "--max-turns", "3",
        "--allowedTools", "Read",
        "--add-dir", str(PNG_DIR),
        "--",
        prompt,
    ]
    res = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=TIMEOUT,
        check=False,
    )
    if res.returncode != 0:
        raise RuntimeError(f"claude CLI rc={res.returncode}: {res.stderr[-1000:]}")

    cli_out = json.loads(res.stdout)
    # Validate shapes explicitly: a wrong shape must surface as ValueError so
    # the caller retries, instead of as a stray TypeError/AttributeError.
    if not isinstance(cli_out, dict):
        raise ValueError("claude CLI output is not a JSON object")
    if cli_out.get("is_error"):
        raise RuntimeError(f"claude error: {str(cli_out.get('result') or '')[:500]}")
    result_text = cli_out.get("result", "")
    if not isinstance(result_text, str):
        raise ValueError("claude CLI output has no text result")

    data = extract_json(result_text)
    if not isinstance(data, dict) or not isinstance(data.get("chunks"), list):
        raise ValueError("page JSON has no 'chunks' list")
    return data


def process_page(page_num: int) -> dict:
    """Rebuild one page via the claude CLI, retrying on failure.

    Returns the parsed page dict (``page_number`` forced to *page_num*,
    ``chunks`` guaranteed to be a list). When the PNG is missing, or every
    one of the ``RETRIES`` attempts fails, a placeholder from ``_error_page``
    is returned instead; timeouts, CLI errors and malformed output are never
    raised to the caller.
    """
    png_path = PNG_DIR / f"p-{page_num:03d}.png"
    if not png_path.exists():
        safe_print(f" WARNING p{page_num:03d}: PNG missing")
        return _error_page(page_num, "[PAGE NOT FOUND]", "[PÁGINA NÃO ENCONTRADA]")

    prompt = PAGE_PROMPT_TEMPLATE.format(
        doc_title=DOC_TITLE,
        page_number=page_num,
        total_pages=TOTAL_PAGES,
        png_path=str(png_path),
        ocr_text=load_ocr(page_num),
    )

    for attempt in range(1, RETRIES + 1):
        try:
            data = _run_claude(prompt)
        except subprocess.TimeoutExpired:
            safe_print(f" p{page_num:03d} TIMEOUT (attempt {attempt})")
            failure = ("[TIMEOUT]", "[TIMEOUT]")
        except (RuntimeError, ValueError) as e:  # includes json.JSONDecodeError
            safe_print(f" p{page_num:03d} ERROR (attempt {attempt}): {str(e)[:200]}")
            failure = (f"[ERROR: {str(e)[:80]}]", f"[ERRO: {str(e)[:80]}]")
        else:
            # Trust our own page counter over whatever the model echoed back.
            data["page_number"] = page_num
            n_chunks = len(data["chunks"])
            safe_print(f" p{page_num:03d} OK — {n_chunks} chunks")
            return data

        if attempt == RETRIES:
            return _error_page(page_num, *failure)
        # Linear back-off between attempts: 5s, 10s, ...
        time.sleep(5 * attempt)

    # Only reachable when RETRIES < 1, i.e. no attempt was made at all.
    return _error_page(page_num, "[UNKNOWN ERROR]", "[ERRO DESCONHECIDO]")
def _error_page(page_num: int, msg_en: str, msg_pt: str) -> dict:
    """Build a placeholder page holding one full-page ``blank`` chunk.

    *msg_en* and *msg_pt* become the chunk's ``content_en`` and
    ``content_pt_br``. Every other chunk field gets a neutral value
    (None, False, an empty list, or a confidence of 0.0), and the bounding
    box covers the whole page.
    """
    full_page_box = dict(x=0.0, y=0.0, w=1.0, h=1.0)
    placeholder_chunk = dict(
        order_in_page=1,
        type="blank",
        content_en=msg_en,
        content_pt_br=msg_pt,
        bbox=full_page_box,
        classification=None,
        formatting=[],
        cross_page_hint="self_contained",
        ocr_confidence=0.0,
        ocr_source_lines=[],
        redaction_code=None,
        redaction_inferred_content_type=None,
        image_type=None,
        ufo_anomaly_detected=False,
        ufo_anomaly_type=None,
        ufo_anomaly_rationale=None,
        cryptid_anomaly_detected=False,
        cryptid_anomaly_type=None,
        cryptid_anomaly_rationale=None,
        image_description_en=None,
        image_description_pt_br=None,
        extracted_text=None,
    )
    return dict(page_number=page_num, chunks=[placeholder_chunk])
def main():
    """Process every page in batches and write the combined result as JSON.

    Pages 1..TOTAL_PAGES are submitted to a thread pool five at a time; each
    batch is fully collected before the next one starts, with a 2-second
    pause in between. A page whose worker raises is replaced by an
    ``_error_page`` placeholder. The per-page dicts, ordered by page number,
    are dumped to ``OUTPUT_DIR / "_pages_raw.json"``.
    """
    pages = list(range(1, TOTAL_PAGES + 1))
    results: dict[int, dict] = {}
    batch_size = 5
    n_batches = (len(pages) + batch_size - 1) // batch_size

    # Create the output directory up front: discovering that it is missing
    # only at the final write would throw away the whole run.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    out_path = OUTPUT_DIR / "_pages_raw.json"

    # Monotonic clock: the elapsed time is unaffected by wall-clock changes.
    start_time = time.monotonic()
    print(f"Processing {len(pages)} pages, {MAX_WORKERS} workers, batches of {batch_size}...")

    # One pool for the whole run; batching is still enforced because every
    # batch's futures are drained before the next batch is submitted.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        for batch_no, b_start in enumerate(range(0, len(pages), batch_size), 1):
            batch = pages[b_start:b_start + batch_size]
            print(f"\nBatch {batch_no}/{n_batches}: pages {batch[0]}-{batch[-1]}")
            futures = {ex.submit(process_page, p): p for p in batch}
            for fut in as_completed(futures):
                p = futures[fut]
                try:
                    results[p] = fut.result()
                except Exception as e:
                    # Last-resort boundary: one broken page must not abort the run.
                    safe_print(f" p{p:03d} FATAL: {e}")
                    results[p] = _error_page(p, f"[FATAL: {str(e)[:80]}]", f"[FATAL: {str(e)[:80]}]")
            # Pause between batches
            if b_start + batch_size < len(pages):
                time.sleep(2)

    elapsed = time.monotonic() - start_time
    sorted_results = [results[p] for p in sorted(results)]
    total_chunks = sum(len(r.get("chunks", [])) for r in sorted_results)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(sorted_results, f, ensure_ascii=False, indent=2)
    print(f"\nDone in {elapsed:.0f}s — {len(sorted_results)} pages, {total_chunks} chunks")
    print(f"Saved: {out_path}")


if __name__ == "__main__":
    main()