272 lines
9.3 KiB
Python
272 lines
9.3 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Rebuild doc-65-hs1-834228961-62-hq-83894-section-1.

Uses the claude CLI (OAuth, Max plan) via subprocess — no direct API key needed.
Processes pages 1-150 in batches of 5, with at most 4 pages in flight at once.
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import base64
|
||
import json
|
||
import os
|
||
import re
|
||
import subprocess
|
||
import sys
|
||
import threading
|
||
import time
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
|
||
# Identifier of the document being rebuilt. It is also the name of the
# per-document sub-directory under each of the roots below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-1"
# Human-readable title inserted into the per-page prompt.
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 1 (FBI UAP/UFO Investigative File)"

# Per-document directories are derived from DOC_ID so the four values cannot
# drift apart when the script is pointed at another document.
_UFO_ROOT = Path("/Users/guto/ufo")
PNG_DIR = _UFO_ROOT / "processing" / "png" / DOC_ID  # page images: p-NNN.png
OCR_DIR = _UFO_ROOT / "processing" / "ocr" / DOC_ID  # OCR hints: p-NNN.txt
OUTPUT_DIR = _UFO_ROOT / "raw" / DOC_ID  # receives _pages_raw.json

TOTAL_PAGES = 150  # pages are numbered 1..TOTAL_PAGES
MAX_WORKERS = 4  # size of the thread pool used for each batch
TIMEOUT = 180  # seconds allowed for one claude CLI invocation
RETRIES = 3  # total attempts per page before an error page is emitted

# Serialises print() calls made from worker threads (see safe_print).
_lock = threading.Lock()
|
||
|
||
|
||
def safe_print(*args, **kwargs):
    """Print while holding the module lock so worker output is not interleaved.

    Accepts the same positional and keyword arguments as the built-in
    ``print``. Output is flushed by default; a caller may still pass
    ``flush=`` explicitly.
    """
    # Passing flush=True alongside **kwargs raised TypeError (duplicate
    # keyword) whenever a caller supplied its own ``flush``; make it a default.
    kwargs.setdefault("flush", True)
    with _lock:
        print(*args, **kwargs)
|
||
|
||
|
||
def load_ocr(page_num: int) -> str:
    """Return the OCR hint text for a page, or a placeholder string.

    Reads ``p-NNN.txt`` (zero-padded page number) from ``OCR_DIR``.

    Returns:
        The stripped file text cut to its first 3000 characters, or one of
        the placeholders ``"(empty)"``, ``"(unreadable)"`` or
        ``"(not found)"``.
    """
    ocr_file = OCR_DIR / f"p-{page_num:03d}.txt"
    if not ocr_file.exists():
        return "(not found)"

    try:
        hint = ocr_file.read_text(encoding="utf-8").strip()
    except Exception:
        # Best effort: the OCR text is only a hint for the model, so any read
        # or decode failure degrades to a placeholder instead of aborting.
        return "(unreadable)"

    if not hint:
        return "(empty)"
    return hint[:3000]
|
||
|
||
|
||
def extract_json(text: str) -> dict:
    """Extract the first JSON object embedded in *text*.

    Parsing starts at the first ``{`` and stops at the end of that object, so
    whatever surrounds it (markdown code fences, a preamble, trailing
    commentary) is ignored without any explicit stripping.

    Args:
        text: Raw model reply that should contain one JSON object.

    Returns:
        The decoded object as a dict.

    Raises:
        ValueError: If *text* contains no ``{``.
        json.JSONDecodeError: A ``ValueError`` subclass, raised if the text
            from the first ``{`` onward is not a complete, valid JSON object.
    """
    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON object found")
    # raw_decode understands string literals and escapes, so a "{" or "}"
    # inside a value does not end the object early the way manual brace
    # counting does. It also tolerates trailing text after the object.
    obj, _consumed = json.JSONDecoder().raw_decode(text[start:])
    return obj
|
||
|
||
|
||
PAGE_PROMPT_TEMPLATE = """You are a page-rebuilder for a declassified UAP/UFO government document.
|
||
|
||
Document: {doc_title}
|
||
Page: {page_number} of {total_pages}
|
||
|
||
STEP 1: Use the Read tool to view this image: {png_path}
|
||
|
||
STEP 2: Analyze the page carefully and extract ALL content as structured chunks.
|
||
|
||
STEP 3: Output ONLY a valid JSON object (no markdown, no code fences, no preamble):
|
||
{{
|
||
"page_number": {page_number},
|
||
"chunks": [
|
||
{{
|
||
"order_in_page": 1,
|
||
"type": "paragraph",
|
||
"content_en": "verbatim text or description in English",
|
||
"content_pt_br": "tradução em português brasileiro",
|
||
"bbox": {{"x": 0.05, "y": 0.10, "w": 0.90, "h": 0.05}},
|
||
"classification": null,
|
||
"formatting": [],
|
||
"cross_page_hint": "self_contained",
|
||
"ocr_confidence": 0.85,
|
||
"ocr_source_lines": [],
|
||
"redaction_code": null,
|
||
"redaction_inferred_content_type": null,
|
||
"image_type": null,
|
||
"ufo_anomaly_detected": false,
|
||
"ufo_anomaly_type": null,
|
||
"ufo_anomaly_rationale": null,
|
||
"cryptid_anomaly_detected": false,
|
||
"cryptid_anomaly_type": null,
|
||
"cryptid_anomaly_rationale": null,
|
||
"image_description_en": null,
|
||
"image_description_pt_br": null,
|
||
"extracted_text": null
|
||
}}
|
||
]
|
||
}}
|
||
|
||
ALLOWED chunk types (use only these exact strings):
|
||
letterhead, classification_banner, header, subheader, paragraph, list_item,
|
||
caption, footnote, page_number, signature_block, stamp, redaction_block,
|
||
image, table_marker, form_field, watermark, separator, blank
|
||
|
||
RULES:
|
||
1. Extract EVERY visible element — no skipping
|
||
2. bbox: normalized 0.0–1.0 (x=left, y=top, w=width, h=height)
|
||
3. content_en: verbatim OCR text for text elements; description for images
|
||
4. content_pt_br: Brazilian Portuguese (NOT European) translation
|
||
5. Preserve UTF-8 accents: ç, ã, á, é, í, ó, ú, â, ê, ô, à
|
||
6. Redacted sections: type="redaction_block", content_en="[REDACTED]", content_pt_br="[REDACTADO]"
|
||
7. Images/photos: type="image", fill image_description_en and image_description_pt_br
|
||
8. classification: visible marking text (e.g. "SECRET", "UNCLASSIFIED") or null
|
||
9. formatting: subset of ["bold","italic","underline","all_caps","handwritten","typewritten","strikethrough"]
|
||
10. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
|
||
11. ufo_anomaly_detected: true if chunk contains UAP sighting data, coordinates, witness accounts
|
||
12. Blank page: one chunk type="blank"
|
||
13. Order chunks top-to-bottom, left-to-right
|
||
14. Return ONLY the JSON — no text before or after
|
||
|
||
OCR hint (may be empty):
|
||
{ocr_text}
|
||
"""
|
||
|
||
|
||
def process_page(page_num: int) -> dict:
    """Rebuild one page by having the claude CLI read its PNG image.

    The CLI is run up to ``RETRIES`` times, each limited to ``TIMEOUT``
    seconds, sleeping ``5 * attempt`` seconds between attempts. An attempt
    succeeds only when the CLI exits with status 0, reports no error, and its
    reply contains a JSON object with a ``chunks`` list.

    Args:
        page_num: 1-based page number; selects ``p-NNN.png`` in ``PNG_DIR``.

    Returns:
        The page dict produced by the model, with ``page_number`` forced to
        *page_num*, or an ``_error_page`` placeholder when the PNG is missing
        or every attempt failed.

    Raises:
        OSError: Failures to launch the CLI itself (for example the
            executable is missing) are not retried and propagate to the
            caller.
    """
    png_path = PNG_DIR / f"p-{page_num:03d}.png"

    if not png_path.exists():
        safe_print(f" WARNING p{page_num:03d}: PNG missing")
        return _error_page(page_num, "[PAGE NOT FOUND]", "[PÁGINA NÃO ENCONTRADA]")

    prompt = PAGE_PROMPT_TEMPLATE.format(
        doc_title=DOC_TITLE,
        page_number=page_num,
        total_pages=TOTAL_PAGES,
        png_path=str(png_path),
        ocr_text=load_ocr(page_num),
    )

    # The command line is identical for every attempt, so build it once.
    cmd = [
        "claude", "-p",
        "--model", "haiku",
        "--output-format", "json",
        "--max-turns", "3",
        "--allowedTools", "Read",
        "--add-dir", str(PNG_DIR),
        "--",
        prompt,
    ]

    for attempt in range(1, RETRIES + 1):
        try:
            res = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=TIMEOUT,
                check=False,
            )
            if res.returncode != 0:
                raise RuntimeError(f"claude CLI rc={res.returncode}: {res.stderr[-1000:]}")

            cli_out = json.loads(res.stdout)
            # Unexpected payload shapes are raised as ValueError so they go
            # through the retry path instead of escaping as AttributeError.
            if not isinstance(cli_out, dict):
                raise ValueError("claude CLI output is not a JSON object")
            if cli_out.get("is_error"):
                detail = str(cli_out.get("result", ""))[:500]
                raise RuntimeError(f"claude error: {detail}")

            result_text = cli_out.get("result", "")
            if not isinstance(result_text, str):
                raise ValueError("claude CLI output has no text result")

            data = extract_json(result_text)
            # A reply without a chunk list is a failed extraction, not a
            # page with zero content; retry rather than store it.
            if not isinstance(data.get("chunks"), list):
                raise ValueError("page JSON has no 'chunks' list")
            data["page_number"] = page_num

            safe_print(f" p{page_num:03d} OK — {len(data['chunks'])} chunks")
            return data

        except subprocess.TimeoutExpired:
            safe_print(f" p{page_num:03d} TIMEOUT (attempt {attempt})")
            failure = ("[TIMEOUT]", "[TIMEOUT]")

        # json.JSONDecodeError is a ValueError subclass and is covered here.
        except (RuntimeError, ValueError) as e:
            reason = str(e)
            safe_print(f" p{page_num:03d} ERROR (attempt {attempt}): {reason[:200]}")
            failure = (f"[ERROR: {reason[:80]}]", f"[ERRO: {reason[:80]}]")

        # Only failed attempts reach this point (success returns above).
        if attempt == RETRIES:
            return _error_page(page_num, *failure)
        time.sleep(5 * attempt)

    # Reached only when RETRIES < 1, i.e. the loop body never ran.
    return _error_page(page_num, "[UNKNOWN ERROR]", "[ERRO DESCONHECIDO]")
|
||
|
||
|
||
def _error_page(page_num: int, msg_en: str, msg_pt: str) -> dict:
|
||
return {
|
||
"page_number": page_num,
|
||
"chunks": [{
|
||
"order_in_page": 1,
|
||
"type": "blank",
|
||
"content_en": msg_en,
|
||
"content_pt_br": msg_pt,
|
||
"bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
|
||
"classification": None,
|
||
"formatting": [],
|
||
"cross_page_hint": "self_contained",
|
||
"ocr_confidence": 0.0,
|
||
"ocr_source_lines": [],
|
||
"redaction_code": None,
|
||
"redaction_inferred_content_type": None,
|
||
"image_type": None,
|
||
"ufo_anomaly_detected": False,
|
||
"ufo_anomaly_type": None,
|
||
"ufo_anomaly_rationale": None,
|
||
"cryptid_anomaly_detected": False,
|
||
"cryptid_anomaly_type": None,
|
||
"cryptid_anomaly_rationale": None,
|
||
"image_description_en": None,
|
||
"image_description_pt_br": None,
|
||
"extracted_text": None,
|
||
}]
|
||
}
|
||
|
||
|
||
def main():
    """Rebuild every page of the document and save the combined result.

    Pages 1..TOTAL_PAGES are processed in consecutive batches of five. Each
    batch runs on its own thread pool of ``MAX_WORKERS`` workers, with a
    two-second pause between batches. Any exception raised by a page worker
    is logged and replaced with an error placeholder page. The results,
    sorted by page number, are written to ``_pages_raw.json`` in
    ``OUTPUT_DIR``.
    """
    pages = list(range(1, TOTAL_PAGES + 1))
    results: dict[int, dict] = {}
    start_time = time.time()

    # Create the output directory before doing any work. Otherwise a missing
    # directory is discovered only at the final open(), after every page has
    # been processed, and the whole run is lost.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    out_path = OUTPUT_DIR / "_pages_raw.json"

    batch_size = 5
    n_batches = (len(pages) + batch_size - 1) // batch_size  # ceiling division

    print(f"Processing {len(pages)} pages, {MAX_WORKERS} workers, batches of {batch_size}...")

    for batch_no, b_start in enumerate(range(0, len(pages), batch_size), start=1):
        batch = pages[b_start:b_start + batch_size]
        print(f"\nBatch {batch_no}/{n_batches}: pages {batch[0]}-{batch[-1]}")

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
            futures = {ex.submit(process_page, p): p for p in batch}
            for fut in as_completed(futures):
                p = futures[fut]
                try:
                    results[p] = fut.result()
                except Exception as e:
                    # Top-level boundary: one failing page must not abort the
                    # run, so log it and store a placeholder instead.
                    safe_print(f" p{p:03d} FATAL: {e}")
                    results[p] = _error_page(p, f"[FATAL: {str(e)[:80]}]", f"[FATAL: {str(e)[:80]}]")

        # Pause between batches (not after the last one).
        if batch_no < n_batches:
            time.sleep(2)

    elapsed = time.time() - start_time
    sorted_results = [results[p] for p in sorted(results)]
    total_chunks = sum(len(r.get("chunks", [])) for r in sorted_results)

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(sorted_results, f, ensure_ascii=False, indent=2)

    print(f"\nDone in {elapsed:.0f}s — {len(sorted_results)} pages, {total_chunks} chunks")
    print(f"Saved: {out_path}")


if __name__ == "__main__":
    main()
|