287 lines
12 KiB
Python
287 lines
12 KiB
Python
#!/usr/bin/env python3
"""
Page rebuilder for doc-65-hs1-834228961-62-hq-83894-section-1.

Processes pages 1-150 with a vision model (page images p-001.png .. p-150.png,
plus the matching OCR text as a hint).
Writes a single JSON file (_pages_raw.json) containing one record per page,
each with a list of chunks.
"""
|
|
import anthropic
|
|
import base64
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from pathlib import Path
|
|
|
|
# Identifier of the document being processed. It doubles as the name of the
# per-document sub-directory inside every pipeline stage directory below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-1"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 1 (FBI UFO/UAP Investigative File)"

# Root of the local processing tree.
_PIPELINE_ROOT = Path("/Users/guto/ufo")

# Stage directories are derived from DOC_ID rather than repeating the id in
# three separate literals, so they cannot drift apart when the id changes.
PNG_DIR = _PIPELINE_ROOT / "processing" / "png" / DOC_ID  # input page images (p-NNN.png)
OCR_DIR = _PIPELINE_ROOT / "processing" / "ocr" / DOC_ID  # input OCR text (p-NNN.txt)
OUTPUT_DIR = _PIPELINE_ROOT / "raw" / DOC_ID              # output JSON

# Number of pages in the document; pages are numbered 1..TOTAL_PAGES.
TOTAL_PAGES = 150
|
|
|
|
# Module-level API client, created once at import time and used by
# process_page() for every request (including from the worker threads that
# main() starts).
# NOTE(review): no arguments are passed, so the SDK presumably takes the API
# key from the environment - confirm against the anthropic SDK docs.
client = anthropic.Anthropic()
|
|
|
|
# Closed vocabulary of chunk "type" values, in the same order in which the
# prompt text enumerates them.
CHUNK_TYPES = [
    "letterhead",
    "classification_banner",
    "header",
    "subheader",
    "paragraph",
    "list_item",
    "caption",
    "footnote",
    "page_number",
    "signature_block",
    "stamp",
    "redaction_block",
    "image",
    "table_marker",
    "form_field",
    "watermark",
    "separator",
    "blank",
]
|
|
|
|
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder for a declassified UAP/UFO government document.
|
|
|
|
Document: {doc_title}
|
|
Page: {page_number} of {total_pages}
|
|
|
|
Analyze this page image carefully and extract ALL content as structured chunks.
|
|
|
|
Return a JSON object with this exact structure:
|
|
{{
|
|
"page_number": {page_number},
|
|
"chunks": [
|
|
{{
|
|
"order_in_page": 1,
|
|
"type": "<chunk_type>",
|
|
"content_en": "<English text or description>",
|
|
"content_pt_br": "<Brazilian Portuguese translation>",
|
|
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
|
|
"classification": null,
|
|
"formatting": [],
|
|
"cross_page_hint": "self_contained",
|
|
"ocr_confidence": 0.85,
|
|
"ocr_source_lines": [],
|
|
"redaction_code": null,
|
|
"redaction_inferred_content_type": null,
|
|
"image_type": null,
|
|
"ufo_anomaly_detected": false,
|
|
"ufo_anomaly_type": null,
|
|
"ufo_anomaly_rationale": null,
|
|
"cryptid_anomaly_detected": false,
|
|
"cryptid_anomaly_type": null,
|
|
"cryptid_anomaly_rationale": null,
|
|
"image_description_en": null,
|
|
"image_description_pt_br": null,
|
|
"extracted_text": null
|
|
}}
|
|
]
|
|
}}
|
|
|
|
CHUNK TYPES (use only these):
|
|
letterhead, classification_banner, header, subheader, paragraph, list_item,
|
|
caption, footnote, page_number, signature_block, stamp, redaction_block,
|
|
image, table_marker, form_field, watermark, separator, blank
|
|
|
|
RULES:
|
|
1. Extract EVERY element on the page — nothing is skipped
|
|
2. bbox: normalized coordinates (x=left, y=top, w=width, h=height) relative to page size (0.0 to 1.0)
|
|
3. content_en: verbatim OCR text for text chunks; for images describe what you see
|
|
4. content_pt_br: Brazilian Portuguese translation of content_en (NOT European Portuguese)
|
|
5. Preserve UTF-8 accents: ç, ã, á, é, í, ó, ú, â, ê, ô, à
|
|
6. For redacted sections: type="redaction_block", content_en="[REDACTED]", content_pt_br="[REDACTADO]"
|
|
7. For images/photos: type="image", describe the visual content in image_description_en and image_description_pt_br
|
|
8. For stamps: type="stamp"
|
|
9. classification: extract classification markings if visible (e.g. "SECRET", "CONFIDENTIAL")
|
|
10. formatting: array of applicable ["bold", "italic", "underline", "all_caps", "handwritten", "typewritten"]
|
|
11. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev"
|
|
12. ufo_anomaly_detected: true if chunk contains UAP/UFO sighting data, coordinates, witness accounts, or anomalous phenomena
|
|
13. If page is blank: return one chunk with type="blank"
|
|
14. Order chunks top-to-bottom, left-to-right as they appear on the page
|
|
15. Return ONLY valid JSON, no markdown code blocks, no extra text
|
|
|
|
OCR text hint (may be empty or garbled):
|
|
{ocr_text}
|
|
"""
|
|
|
|
def load_image_b64(png_path: Path) -> str:
    """Read the file at ``png_path`` and return its bytes as base64 text.

    Args:
        png_path: Location of the image file to encode.

    Returns:
        The standard-alphabet base64 encoding of the file contents, as ``str``.
    """
    with open(png_path, mode="rb") as image_file:
        encoded = base64.b64encode(image_file.read())
    return encoded.decode("utf-8")
|
|
|
|
def load_ocr(page_num: int, ocr_dir: "Path | None" = None) -> str:
    """Return the OCR text for one page, or a parenthesised status marker.

    Args:
        page_num: 1-based page number; the text is read from ``p-NNN.txt``
            (zero-padded to three digits).
        ocr_dir: Directory holding the OCR text files. Defaults to the
            module-level ``OCR_DIR``.

    Returns:
        The stripped file contents, or one of the markers ``"(empty)"``
        (file has no non-whitespace text), ``"(unreadable)"`` (I/O or UTF-8
        decoding failure) or ``"(not found)"`` (no such file).
    """
    base_dir = OCR_DIR if ocr_dir is None else Path(ocr_dir)
    txt_path = base_dir / f"p-{page_num:03d}.txt"

    # Attempt the read directly instead of checking exists() first: the file
    # can disappear between the check and the read, and the read reports the
    # same condition anyway.
    try:
        content = txt_path.read_text(encoding="utf-8").strip()
    except (FileNotFoundError, NotADirectoryError):
        return "(not found)"
    except (OSError, UnicodeDecodeError):
        # Only genuine read/decoding problems are mapped to the marker;
        # programming errors are no longer swallowed.
        return "(unreadable)"
    return content or "(empty)"
|
|
|
|
def _placeholder_page(page_num: int, content_en: str, content_pt_br: str) -> dict:
    """Build a one-chunk page record standing in for a page with no real result."""
    return {
        "page_number": page_num,
        "chunks": [{
            "order_in_page": 1,
            "type": "blank",
            "content_en": content_en,
            "content_pt_br": content_pt_br,
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
            "image_description_en": None,
            "image_description_pt_br": None,
            "extracted_text": None,
        }],
    }


def _parse_page_json(raw: str) -> dict:
    """Decode a model reply into a page dict, tolerating a Markdown code fence.

    Raises:
        ValueError: The reply is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError``), or it is not an object carrying a ``chunks`` list.
    """
    raw = raw.strip()
    # Strip markdown code blocks if present.
    if raw.startswith("```"):
        lines = raw.split("\n")
        raw = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])

    result = json.loads(raw)
    if not isinstance(result, dict) or not isinstance(result.get("chunks"), list):
        raise ValueError("expected a JSON object with a 'chunks' list")
    return result


def process_page(page_num: int, retries: int = 3) -> dict:
    """Rebuild one page into a structured record using the vision model.

    Sends the page image ``p-NNN.png`` from ``PNG_DIR`` together with the
    formatted prompt (which embeds at most 2000 characters of OCR text) and
    parses the JSON reply.

    Args:
        page_num: 1-based page number.
        retries: Maximum number of request attempts. Values below 1 are
            treated as 1 so that the function always returns a page record.

    Returns:
        A dict with ``page_number`` (always forced to ``page_num``) and
        ``chunks``. When the image is missing, or every attempt fails, the
        record holds a single ``blank`` placeholder chunk describing the
        problem instead of raising.
    """
    png_path = PNG_DIR / f"p-{page_num:03d}.png"
    if not png_path.exists():
        print(f" WARNING: PNG not found for page {page_num}: {png_path}", file=sys.stderr)
        return _placeholder_page(page_num, "[PAGE NOT FOUND]", "[PÁGINA NÃO ENCONTRADA]")

    ocr_text = load_ocr(page_num)
    img_b64 = load_image_b64(png_path)

    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_number=page_num,
        total_pages=TOTAL_PAGES,
        ocr_text=ocr_text[:2000],  # cap at 2000 chars
    )
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": img_b64,
                },
            },
            {"type": "text", "text": prompt},
        ],
    }

    # Guarantee at least one attempt: with retries <= 0 the loop body would
    # never run and the function would fall through returning None.
    attempts = max(1, retries)

    for attempt in range(attempts):
        try:
            response = client.messages.create(
                model="claude-haiku-4-5",
                max_tokens=4096,
                messages=[user_message],
            )
            raw = response.content[0].text
        except Exception as e:
            # Per-page boundary: any request failure is logged and retried so
            # one bad page cannot abort the whole run.
            print(f" Page {page_num} API error (attempt {attempt+1}): {e}", file=sys.stderr)
            failure = ("API ERROR", "ERRO DE API", str(e)[:100])
        else:
            try:
                result = _parse_page_json(raw)
            except ValueError as e:
                print(f" Page {page_num} JSON error (attempt {attempt+1}): {e}", file=sys.stderr)
                failure = ("PARSE ERROR", "ERRO DE ANÁLISE", str(e)[:100])
            else:
                result["page_number"] = page_num  # ensure correct
                print(f" Page {page_num:3d} done — {len(result['chunks'])} chunks", flush=True)
                return result

        if attempt == attempts - 1:
            label_en, label_pt_br, detail = failure
            return _placeholder_page(
                page_num,
                f"[{label_en}: {detail}]",
                f"[{label_pt_br}: {detail}]",
            )
        # Exponential backoff between attempts: 1s, 2s, 4s, ...
        time.sleep(2 ** attempt)
|
|
|
|
def main():
    """Process every page of the document and write the combined JSON file.

    Pages 1..TOTAL_PAGES are handled in consecutive batches; the pages of a
    batch run concurrently in a thread pool and the next batch starts only
    after the current one has finished. All page records are then written,
    sorted by page number, to ``OUTPUT_DIR / "_pages_raw.json"``.
    """
    pages = list(range(1, TOTAL_PAGES + 1))
    results = {}
    batch_size = 5

    # Create the output directory before any API call is made: discovering
    # that it is missing only at save time would discard a whole run.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    out_path = OUTPUT_DIR / "_pages_raw.json"

    print(f"Processing {len(pages)} pages in parallel batches of {batch_size}...")

    for batch_start in range(0, len(pages), batch_size):
        batch = pages[batch_start:batch_start + batch_size]
        print(f"Batch {batch_start//batch_size + 1}: pages {batch[0]}-{batch[-1]}")

        # One pool per batch: leaving the "with" block waits for every page
        # of the batch, which keeps at most batch_size requests in flight.
        with ThreadPoolExecutor(max_workers=batch_size) as executor:
            future_to_page = {executor.submit(process_page, p): p for p in batch}
            for future in as_completed(future_to_page):
                page_num = future_to_page[future]
                try:
                    results[page_num] = future.result()
                except Exception as e:
                    # The page is left out of the output; it is reported again
                    # in the summary below.
                    print(f" Page {page_num} FATAL: {e}", file=sys.stderr)

        # Small pause between batches to avoid rate limits
        if batch_start + batch_size < len(pages):
            time.sleep(1)

    # Save the raw per-page results, ordered by page number.
    sorted_results = [results[p] for p in sorted(results)]
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(sorted_results, f, ensure_ascii=False, indent=2)

    missing = [p for p in pages if p not in results]
    if missing:
        print(f" WARNING: {len(missing)} page(s) produced no result: {missing}", file=sys.stderr)

    print(f"\nSaved {len(sorted_results)} pages to {out_path}")
    total_chunks = sum(len(r.get("chunks", [])) for r in sorted_results)
    print(f"Total chunks: {total_chunks}")
|
|
|
|
# Run the full batch only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|