478 lines
19 KiB
Python
478 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Rebuild doc-65-hs1-834228961-62-hq-83894-serial-130
|
|
Processes all 91 pages via Claude vision and produces chunks/*.md, _index.json, and document.md.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import base64
|
|
import time
|
|
import concurrent.futures
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
import anthropic
|
|
|
|
# Identifier and human-readable title of the document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-serial-130"
DOC_TITLE = "HQ Air Defense Command - Unidentified Flying Objects Reports (65-HS1-834228961 / 62-HQ-83894 Serial 130)"

# Per-document directories. PNG_DIR holds the page images that are read;
# RAW_DIR receives the generated output (chunks, cropped images, index, document).
# NOTE(review): OCR_DIR is defined but not referenced by the code visible in this file.
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")
RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
CHUNKS_DIR = RAW_DIR / "chunks"
IMAGES_DIR = RAW_DIR / "images"
TABLES_DIR = RAW_DIR / "tables"

# Create the output sub-directories at import time; existing ones are left alone.
for d in [CHUNKS_DIR, IMAGES_DIR, TABLES_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Shared Anthropic API client used by process_page().
client = anthropic.Anthropic()
|
|
|
|
def encode_image(path: Path) -> str:
    """Return the contents of the file at *path* as a base64 text string.

    The file is read in binary mode and encoded with the standard base64
    alphabet; the encoded bytes are decoded to ``str`` before returning.
    """
    with open(path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.standard_b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")
|
|
|
|
PAGE_REBUILDER_PROMPT = '''You are a page-rebuilder subagent. Analyze this document page image and extract ALL content as structured chunks.
|
|
|
|
Document: {doc_title}
|
|
Doc ID: {doc_id}
|
|
Page number (in sequence): {page_number} of {total_pages}
|
|
Source PNG filename: {png_filename}
|
|
|
|
Return a JSON object with this exact structure:
|
|
{{
|
|
"page_number": {page_number},
|
|
"png_filename": "{png_filename}",
|
|
"chunks": [
|
|
{{
|
|
"order_in_page": 1,
|
|
"type": "<type>",
|
|
"content_en": "...",
|
|
"content_pt_br": "...",
|
|
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
|
|
"classification": null,
|
|
"formatting": [],
|
|
"cross_page_hint": "self_contained",
|
|
"ocr_confidence": 0.9,
|
|
"ocr_source_lines": [],
|
|
"redaction_code": null,
|
|
"redaction_inferred_content_type": null,
|
|
"image_type": null,
|
|
"ufo_anomaly_detected": false,
|
|
"ufo_anomaly_type": null,
|
|
"ufo_anomaly_rationale": null,
|
|
"cryptid_anomaly_detected": false,
|
|
"cryptid_anomaly_type": null,
|
|
"cryptid_anomaly_rationale": null,
|
|
"image_description_en": null,
|
|
"image_description_pt_br": null,
|
|
"extracted_text": null
|
|
}}
|
|
]
|
|
}}
|
|
|
|
Allowed chunk types: letterhead, classification_banner, subject_line, body_paragraph, list_item, signature_block, date_line, address_block, header, footer, redaction_block, table_marker, image, stamp, handwritten_note, page_number_marker, blank
|
|
|
|
Rules:
|
|
1. Create ONE chunk per distinct visual/logical unit. Do not merge unrelated blocks.
|
|
2. For classification banners (TOP SECRET, SECRET, CONFIDENTIAL, etc.) at top/bottom of page: type=classification_banner, fill classification field.
|
|
3. For any image/photo/diagram/map/sketch: type=image, fill image_type, image_description_en, image_description_pt_br, ufo_anomaly_detected, cryptid_anomaly_detected.
|
|
4. For redacted/blacked-out areas: type=redaction_block, fill redaction_code if visible.
|
|
5. content_en = exact English transcription of text, verbatim. content_pt_br = Brazilian Portuguese translation of content_en (NOT translation of classification banners/stamps/codes — keep those verbatim in both fields).
|
|
6. bbox: normalized coordinates (0.0-1.0): x=left, y=top, w=width, h=height relative to page.
|
|
7. formatting: array of applicable: bold, italic, underline, all_caps, strikethrough, handwritten.
|
|
8. For cross_page_hint: "continues_to_next" if text clearly continues on next page, "continues_from_prev" if it continues from previous page, "self_contained" otherwise.
|
|
9. ocr_confidence: your confidence in the transcription (0.0-1.0).
|
|
10. If page is blank: return single chunk type=blank.
|
|
11. ufo_anomaly_detected: true if the chunk contains or depicts a UAP/UFO, unidentified aerial phenomenon, unknown object in sky, or anomalous craft. Set ufo_anomaly_type and ufo_anomaly_rationale.
|
|
12. IMPORTANT: Return ONLY valid JSON, no markdown code blocks, no explanation.'''
|
|
|
|
def _strip_code_fence(raw_text: str) -> str:
    """Return *raw_text* without a surrounding Markdown code fence, if present."""
    if not raw_text.startswith("```"):
        return raw_text
    # Drop the opening fence line (``` or ```json) and, if present, the closing one.
    lines = raw_text.split("\n")[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines)


def _parse_page_json(raw_text: str) -> dict:
    """Parse the model reply into a page dict whose ``chunks`` is a list of dicts.

    Raises:
        ValueError: if no JSON object can be decoded (``json.JSONDecodeError``
            is a ``ValueError``), if the payload is not an object, or if its
            ``chunks`` entry is missing or not a list.
    """
    cleaned = _strip_code_fence(raw_text)
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # The reply may wrap the JSON in prose or trailing text; retry on the
        # outermost {...} span before giving up.
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(cleaned[start:end + 1])

    if not isinstance(parsed, dict):
        raise ValueError("model response is not a JSON object")
    chunks = parsed.get("chunks")
    if not isinstance(chunks, list):
        raise ValueError("model response has no 'chunks' list")
    # Downstream code calls dict methods on every chunk; drop anything else.
    parsed["chunks"] = [chunk for chunk in chunks if isinstance(chunk, dict)]
    return parsed


def _error_page(page_index: int, png_filename: str, error: Exception) -> dict:
    """Build the single-chunk placeholder page returned when processing fails."""
    message = str(error)[:100]
    return {
        "page_number": page_index,
        "page_index": page_index,
        "png_filename": png_filename,
        "chunks": [
            {
                "order_in_page": 1,
                "type": "blank",
                "content_en": f"[Page processing error: {message}]",
                "content_pt_br": f"[Erro de processamento: {message}]",
                "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
                "classification": None,
                "formatting": [],
                "cross_page_hint": "self_contained",
                "ocr_confidence": 0.0,
                "ocr_source_lines": [],
                "redaction_code": None,
                "redaction_inferred_content_type": None,
                "image_type": None,
                "ufo_anomaly_detected": False,
                "ufo_anomaly_type": None,
                "ufo_anomaly_rationale": None,
                "cryptid_anomaly_detected": False,
                "cryptid_anomaly_type": None,
                "cryptid_anomaly_rationale": None,
                "image_description_en": None,
                "image_description_pt_br": None,
                "extracted_text": None,
            }
        ],
    }


def process_page(page_index: int, png_filename: str, total_pages: int) -> dict:
    """Process a single page image and return its chunks.

    Sends the PNG plus the formatted PAGE_REBUILDER_PROMPT to the model and
    parses the JSON reply.

    Args:
        page_index: 1-based position of the page in the document; also used
            as the ``page_number`` placeholder in the prompt.
        png_filename: File name of the page image inside PNG_DIR.
        total_pages: Total number of pages, used only in the prompt.

    Returns:
        The parsed page dict with ``page_index`` and ``png_filename`` set and
        ``chunks`` guaranteed to be a list of dicts. Any failure (I/O, API,
        invalid or truncated JSON) is logged to stderr and a one-chunk
        placeholder page describing the error is returned instead; this
        function does not raise for such errors.
    """
    png_path = PNG_DIR / png_filename

    try:
        img_data = encode_image(png_path)

        prompt = PAGE_REBUILDER_PROMPT.format(
            doc_title=DOC_TITLE,
            doc_id=DOC_ID,
            page_number=page_index,
            total_pages=total_pages,
            png_filename=png_filename,
        )

        response = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=4096,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_data,
                            },
                        },
                        {
                            "type": "text",
                            "text": prompt,
                        },
                    ],
                }
            ],
        )

        raw_text = response.content[0].text.strip()
        try:
            result = _parse_page_json(raw_text)
        except ValueError as parse_error:
            # A reply cut off by the token limit yields unparseable JSON; say
            # so explicitly instead of reporting an opaque decode position.
            if getattr(response, "stop_reason", None) == "max_tokens":
                raise ValueError(
                    "model response was truncated at max_tokens; JSON is incomplete"
                ) from parse_error
            raise

        # Trust our own bookkeeping over whatever the model echoed back.
        result["page_index"] = page_index
        result["png_filename"] = png_filename
        return result

    except Exception as e:
        # Best-effort pipeline: one bad page must not abort the whole run.
        print(f" ERROR page {page_index} ({png_filename}): {e}", file=sys.stderr)
        return _error_page(page_index, png_filename, e)
|
|
|
|
# Bounding box covering the whole page; used when a chunk has no usable bbox.
_FULL_PAGE_BBOX = {"x": 0, "y": 0, "w": 1, "h": 1}


def _is_number(value) -> bool:
    """Return True for int/float values (bool excluded)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _normalize_bbox(raw_bbox) -> dict:
    """Return a bbox dict with numeric x/y/w/h, defaulting to the full page."""
    source = raw_bbox if isinstance(raw_bbox, dict) else {}
    return {
        key: source[key] if _is_number(source.get(key)) else default
        for key, default in _FULL_PAGE_BBOX.items()
    }


def _as_text(value) -> str:
    """Return *value* as a string, mapping None (JSON null) to ''."""
    return "" if value is None else str(value)


def _order_in_page(chunk: dict):
    """Sort key for chunks within a page; non-numeric orders sort as 0."""
    order = chunk.get("order_in_page", 0)
    return order if _is_number(order) else 0


def _process_pages(png_files: list, batch_size: int = 5) -> dict:
    """Run process_page over all pages in parallel batches; return {page_index: result}."""
    total_pages = len(png_files)
    all_page_results = {}

    for batch_start in range(0, total_pages, batch_size):
        batch = png_files[batch_start:batch_start + batch_size]
        batch_indices = list(range(batch_start + 1, batch_start + len(batch) + 1))

        print(f" Batch {batch_start//batch_size + 1}: pages {batch_indices[0]}-{batch_indices[-1]} ({batch})")

        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = {
                executor.submit(process_page, idx, fname, total_pages): (idx, fname)
                for idx, fname in zip(batch_indices, batch)
            }
            for future in concurrent.futures.as_completed(futures):
                idx, fname = futures[future]
                try:
                    # as_completed only yields finished futures, so no timeout is needed.
                    result = future.result()
                    all_page_results[idx] = result
                    chunk_count = len(result.get("chunks") or [])
                    print(f" Page {idx} ({fname}): {chunk_count} chunks")
                except Exception as e:
                    print(f" FAILED page {idx} ({fname}): {e}", file=sys.stderr)

    return all_page_results


def _number_chunks(all_page_results: dict) -> list:
    """Flatten page results into one ordered chunk list with ids and prev/next links."""
    all_chunks = []
    global_order = 0

    for page_idx in sorted(all_page_results.keys()):
        page_data = all_page_results[page_idx]
        png_filename = page_data.get("png_filename", f"p-{page_idx:03d}.png")
        # The model may emit "chunks": null or non-object entries; tolerate both.
        page_chunks = [c for c in (page_data.get("chunks") or []) if isinstance(c, dict)]
        page_chunks.sort(key=_order_in_page)

        for chunk in page_chunks:
            global_order += 1
            chunk["chunk_id"] = f"c{global_order:04d}"
            chunk["page"] = page_idx
            chunk["order_global"] = global_order
            chunk["source_png"] = f"../../processing/png/{DOC_ID}/{png_filename}"
            # Normalize once so every later stage can index bbox safely.
            chunk["bbox"] = _normalize_bbox(chunk.get("bbox"))
            all_chunks.append(chunk)

    last = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last else None

    return all_chunks


def _crop_images(image_chunks: list, all_page_results: dict) -> None:
    """Crop each image chunk's bbox out of its page PNG into IMAGES_DIR."""
    for chunk in image_chunks:
        chunk_id = chunk["chunk_id"]
        png_filename = all_page_results[chunk["page"]]["png_filename"]
        png_path = PNG_DIR / png_filename
        bbox = _normalize_bbox(chunk.get("bbox"))
        out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"

        try:
            # Imported lazily so a missing Pillow only disables cropping.
            from PIL import Image

            # Context manager closes the page file handle after each crop.
            with Image.open(png_path) as im:
                W, H = im.size
                x, y, w, h = bbox["x"], bbox["y"], bbox["w"], bbox["h"]
                pad = 0.005  # small margin around the box, as a fraction of the page
                crop = im.crop((
                    max(0, int((x - pad) * W)),
                    max(0, int((y - pad) * H)),
                    min(W, int((x + w + pad) * W)),
                    min(H, int((y + h + pad) * H)),
                ))
                crop.save(str(out_path))
            chunk["related_image"] = f"IMG-{chunk_id}.png"
            print(f" Cropped {chunk_id} from {png_filename}")
        except Exception as e:
            print(f" CROP ERROR {chunk_id}: {e}", file=sys.stderr)
            chunk["related_image"] = None


def _write_chunk_files(all_chunks: list) -> None:
    """Write one front-matter + EN/PT-BR markdown file per chunk into CHUNKS_DIR."""
    for chunk in all_chunks:
        chunk_id = chunk["chunk_id"]
        bbox = _normalize_bbox(chunk.get("bbox"))

        lines = [
            "---",
            f"chunk_id: {chunk_id}",
            f"type: {chunk.get('type', 'body_paragraph')}",
            f"page: {chunk.get('page', 1)}",
            f"order_in_page: {chunk.get('order_in_page', 1)}",
            f"order_global: {chunk.get('order_global', 1)}",
            f"bbox: {{x: {bbox['x']:.3f}, y: {bbox['y']:.3f}, w: {bbox['w']:.3f}, h: {bbox['h']:.3f}}}",
            f"classification: {json.dumps(chunk.get('classification'))}",
            f"formatting: {json.dumps(chunk.get('formatting', []))}",
            f"cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}",
            f"prev_chunk: {json.dumps(chunk.get('prev_chunk'))}",
            f"next_chunk: {json.dumps(chunk.get('next_chunk'))}",
            f"related_image: {json.dumps(chunk.get('related_image'))}",
            f"related_table: {json.dumps(chunk.get('related_table'))}",
            f"ocr_confidence: {chunk.get('ocr_confidence', 0.9)}",
            f"ocr_source_lines: {json.dumps(chunk.get('ocr_source_lines', []))}",
            f"redaction_code: {json.dumps(chunk.get('redaction_code'))}",
            f"redaction_inferred_content_type: {json.dumps(chunk.get('redaction_inferred_content_type'))}",
            f"image_type: {json.dumps(chunk.get('image_type'))}",
            f"ufo_anomaly_detected: {str(chunk.get('ufo_anomaly_detected', False)).lower()}",
            f"cryptid_anomaly_detected: {str(chunk.get('cryptid_anomaly_detected', False)).lower()}",
            f"ufo_anomaly_type: {json.dumps(chunk.get('ufo_anomaly_type'))}",
            f"ufo_anomaly_rationale: {json.dumps(chunk.get('ufo_anomaly_rationale'))}",
            f"cryptid_anomaly_type: {json.dumps(chunk.get('cryptid_anomaly_type'))}",
            f"cryptid_anomaly_rationale: {json.dumps(chunk.get('cryptid_anomaly_rationale'))}",
            f"image_description_en: {json.dumps(chunk.get('image_description_en'))}",
            f"image_description_pt_br: {json.dumps(chunk.get('image_description_pt_br'))}",
            f"extracted_text: {json.dumps(chunk.get('extracted_text'))}",
            f"source_png: {chunk.get('source_png', '')}",
            "---",
            "",
            # Null content (common for image/redaction chunks) renders as empty text.
            f"**EN:** {_as_text(chunk.get('content_en'))}",
            "",
            f"**PT-BR:** {_as_text(chunk.get('content_pt_br'))}",
            "",
        ]
        (CHUNKS_DIR / f"{chunk_id}.md").write_text("\n".join(lines), encoding="utf-8")


def _write_index(all_chunks: list, total_pages: int, build_at: str) -> None:
    """Write RAW_DIR/_index.json summarizing every chunk."""
    index_chunks = []
    for chunk in all_chunks:
        bbox = _normalize_bbox(chunk.get("bbox"))
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "body_paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": {key: round(bbox[key], 3) for key in ("x", "y", "w", "h")},
            # content_en may be null; slicing None would raise TypeError.
            "preview": _as_text(chunk.get("content_en"))[:80],
        })

    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_chunks,
    }
    index_path = RAW_DIR / "_index.json"
    index_path.write_text(json.dumps(index_data, ensure_ascii=False, indent=2), encoding="utf-8")


def _build_document(all_chunks, all_page_results, total_pages, build_at,
                    type_hist, ufo_anomaly_chunks, cryptid_anomaly_chunks) -> str:
    """Return the full text of document.md (front matter plus every chunk)."""
    front_matter = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {total_pages}",
        f"total_chunks: {len(all_chunks)}",
        f"chunk_types_histogram: {json.dumps(type_hist)}",
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_anomaly_chunks)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_anomaly_chunks)}",
        'build_approach: "subagents"',
        'build_model: "claude-sonnet-4-6"',
        f'build_at: "{build_at}"',
        "---",
        "",
    ]
    doc_lines = ["\n".join(front_matter)]

    current_page = None
    for chunk in all_chunks:
        page = chunk.get("page")
        if page != current_page:
            current_page = page
            png_fn = all_page_results.get(page, {}).get("png_filename", f"p-{page:03d}.png")
            doc_lines.append(f"\n## Page {page} (source: {png_fn})\n")

        chunk_id = chunk["chunk_id"]
        ctype = chunk.get("type", "body_paragraph")
        bbox = _normalize_bbox(chunk.get("bbox"))
        bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

        doc_lines.append(f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->")
        doc_lines.append(f'<a id="{chunk_id}"></a>')
        doc_lines.append(f"### Chunk {chunk_id} — {ctype} · p{page} · bbox: {bbox_str}\n")

        doc_lines.append(f"**EN:** {_as_text(chunk.get('content_en'))}\n")
        doc_lines.append(f"**PT-BR:** {_as_text(chunk.get('content_pt_br'))}\n")

        # Embed the cropped image (saved next to document.md under images/).
        if ctype == "image" and chunk.get("related_image"):
            img_file = chunk["related_image"]
            doc_lines.append(f"![{chunk_id}](./images/{img_file})\n")
            if chunk.get("image_description_en"):
                doc_lines.append(f"*Image description: {chunk['image_description_en']}*\n")

        meta = {
            "chunk_id": chunk_id,
            "type": ctype,
            "page": page,
            "order_in_page": chunk.get("order_in_page"),
            "order_global": chunk.get("order_global"),
            "bbox": chunk.get("bbox"),
            "classification": chunk.get("classification"),
            "formatting": chunk.get("formatting", []),
            "cross_page_hint": chunk.get("cross_page_hint"),
            "prev_chunk": chunk.get("prev_chunk"),
            "next_chunk": chunk.get("next_chunk"),
            "related_image": chunk.get("related_image"),
            "related_table": chunk.get("related_table"),
            "ocr_confidence": chunk.get("ocr_confidence"),
            "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
            "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
            "ufo_anomaly_rationale": chunk.get("ufo_anomaly_rationale"),
            "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
            "cryptid_anomaly_type": chunk.get("cryptid_anomaly_type"),
            "cryptid_anomaly_rationale": chunk.get("cryptid_anomaly_rationale"),
            "image_description_en": chunk.get("image_description_en"),
            "image_description_pt_br": chunk.get("image_description_pt_br"),
            "source_png": chunk.get("source_png"),
        }

        doc_lines.append("<details><summary>metadata</summary>\n")
        doc_lines.append("```json")
        doc_lines.append(json.dumps(meta, ensure_ascii=False, indent=2))
        doc_lines.append("```\n")
        doc_lines.append("</details>\n")
        doc_lines.append("---\n")

    return "\n".join(doc_lines)


def main() -> None:
    """Rebuild the document: process pages, number chunks, crop images, write outputs.

    Reads every ``p-*.png`` in PNG_DIR (sorted by name), runs process_page on
    each in parallel batches of 5, then writes per-chunk markdown files,
    ``_index.json`` and ``document.md`` under RAW_DIR and prints a summary
    including a machine-readable ``STATS`` line.
    """
    start_time = time.time()

    # Page order is the lexicographic order of the PNG file names.
    png_files = sorted(f.name for f in PNG_DIR.glob("p-*.png"))
    total_pages = len(png_files)
    print(f"Processing {total_pages} pages for {DOC_ID}")

    all_page_results = _process_pages(png_files, batch_size=5)

    print("\nNumbering chunks globally...")
    all_chunks = _number_chunks(all_page_results)

    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"\nFound {len(image_chunks)} image chunks")

    print("Cropping image regions...")
    _crop_images(image_chunks, all_page_results)

    # Every chunk carries both keys, null when not applicable.
    for chunk in all_chunks:
        chunk.setdefault("related_image", None)
        chunk.setdefault("related_table", None)

    print("\nWriting chunk files...")
    _write_chunk_files(all_chunks)
    print(f" Wrote {len(all_chunks)} chunk files")

    print("\nBuilding _index.json...")
    build_at = datetime.now(timezone.utc).isoformat()
    _write_index(all_chunks, total_pages, build_at)
    print(f" Wrote _index.json with {len(all_chunks)} chunks")

    # Count chunks per type for the document front matter.
    type_hist = {}
    for chunk in all_chunks:
        t = chunk.get("type", "unknown")
        type_hist[t] = type_hist.get(t, 0) + 1

    ufo_anomaly_chunks = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_anomaly_chunks = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]

    print("\nAssembling document.md...")
    doc_content = _build_document(
        all_chunks,
        all_page_results,
        total_pages,
        build_at,
        type_hist,
        ufo_anomaly_chunks,
        cryptid_anomaly_chunks,
    )
    doc_path = RAW_DIR / "document.md"
    doc_path.write_text(doc_content, encoding="utf-8")

    wall_seconds = int(time.time() - start_time)
    doc_bytes = len(doc_content.encode("utf-8"))

    print("\nDone!")
    print(f" Chunks: {len(all_chunks)}")
    print(f" Images: {len(image_chunks)}")
    print(f" UFO anomalies: {len(ufo_anomaly_chunks)}")
    print(f" Cryptid anomalies: {len(cryptid_anomaly_chunks)}")
    print(f" document.md: {doc_bytes} bytes")
    print(f" Wall time: {wall_seconds}s")
    print(f"\nSTATS pages={total_pages} chunks={len(all_chunks)} images={len(image_chunks)} tables=0 ufo={len(ufo_anomaly_chunks)} cryptid={len(cryptid_anomaly_chunks)} doc_md_bytes={doc_bytes}")
|
|
|
|
# Run the rebuild only when this file is executed as a script.
if __name__ == "__main__":
    main()
|