#!/usr/bin/env python3
"""
Assemble chunks/, _index.json, and document.md from _pages_raw.json
for doc-65-hs1-834228961-62-hq-83894-section-1.

Also:
- Crops image chunks using PIL
- Detects multi-page table markers for stitching
- Writes all output files
"""
|
|
from __future__ import annotations

import json
import math
import os
import re
import subprocess
import sys
import time
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
|
|
|
|
# Identifier of the single document this script is pinned to. It is also the
# name of the per-document directories below, so every path derives from it.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-1"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 1 (FBI UAP/UFO Investigative File)"

# Input: page renders, read as p-NNN.png when cropping image chunks.
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
# Output root: holds _pages_raw.json (input), _index.json and document.md.
OUTPUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUTPUT_DIR / "chunks"
IMAGES_DIR = OUTPUT_DIR / "images"
TABLES_DIR = OUTPUT_DIR / "tables"

# Page count written to the index and the document front matter; it is a
# fixed value here, not derived from the pages actually loaded.
TOTAL_PAGES = 150
# UTC timestamp taken once at import time so every output shares one value.
BUILD_AT = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
BUILD_MODEL = "claude-haiku-4-5"
|
|
|
|
|
|
def load_pages() -> list[dict]:
    """Parse ``_pages_raw.json`` from the output directory and return its content.

    The file is read as UTF-8. The parsed JSON value is returned unchanged;
    callers in this file treat it as a list of page dicts.
    """
    raw_path = OUTPUT_DIR / "_pages_raw.json"
    return json.loads(raw_path.read_text(encoding="utf-8"))
|
|
|
|
|
|
def _coerce_float(value, fallback: float) -> float:
    """Return *value* as a finite float, or *fallback* if it cannot be one."""
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return fallback
    # NaN/inf would later break pixel arithmetic and produce invalid JSON.
    return number if math.isfinite(number) else fallback


def normalize_chunk(chunk: dict, page_num: int) -> dict:
    """Return a copy of *chunk* with all required fields present and well-typed.

    Args:
        chunk: Raw chunk dict as loaded from ``_pages_raw.json``. Not mutated.
        page_num: Page number to record under ``"page"``; it always replaces
            any ``"page"`` value already in *chunk*.

    Returns:
        A new dict holding every default key plus any extra keys of *chunk*.
        Missing or ``None`` values fall back to the default whenever the
        default itself is not ``None``; the two anomaly flags are real
        booleans; ``formatting`` and ``ocr_source_lines`` are lists; ``bbox``
        is a dict of four finite floats (``x``, ``y``, ``w``, ``h``).
    """
    defaults = {
        "order_in_page": 1,
        "type": "paragraph",
        "content_en": "",
        "content_pt_br": "",
        "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.05},
        "classification": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "ocr_confidence": 0.85,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
        "image_description_en": None,
        "image_description_pt_br": None,
        "extracted_text": None,
    }
    result = {**defaults, **chunk}

    # An explicit null in the input must not displace a non-null default,
    # otherwise values such as ``type: None`` leak into the written files.
    for key, fallback in defaults.items():
        if fallback is not None and result.get(key) is None:
            result[key] = fallback

    result["page"] = page_num

    # Flags may arrive as strings ("true"/"1"/"yes") or arbitrary truthy values.
    for bool_field in ("ufo_anomaly_detected", "cryptid_anomaly_detected"):
        val = result[bool_field]
        if isinstance(val, str):
            result[bool_field] = val.lower() in ("true", "1", "yes")
        else:
            result[bool_field] = bool(val)

    # Anything that is not already a list is discarded rather than wrapped.
    for list_field in ("formatting", "ocr_source_lines"):
        if not isinstance(result[list_field], list):
            result[list_field] = []

    # Rebuild bbox key by key so missing, null or malformed components fall
    # back individually instead of raising.
    raw_bbox = result["bbox"] if isinstance(result["bbox"], dict) else {}
    result["bbox"] = {
        axis: _coerce_float(raw_bbox.get(axis), default)
        for axis, default in defaults["bbox"].items()
    }

    return result
|
|
|
|
|
|
def assign_global_ids(pages: list[dict]) -> list[dict]:
    """Flatten all pages into one globally ordered, doubly linked chunk list.

    Pages are visited in the order given. Within a page, chunks are ordered
    by ``order_in_page`` (missing or null sorts as 0; ties keep input order).
    Each chunk is normalized and receives ``chunk_id`` (``c0001``, ...),
    ``order_global`` (1-based) and ``prev_chunk`` / ``next_chunk`` ids
    (``None`` at either end).

    Args:
        pages: Page dicts with ``page_number`` and ``chunks`` keys. The input
            is not modified; sorting works on a copy of each chunk list.

    Returns:
        Flat list of normalized chunk dicts in global order.
    """

    def page_order(chunk: dict):
        order = chunk.get("order_in_page")
        # A null order cannot be compared with ints; treat it like a missing one.
        return 0 if order is None else order

    all_chunks = []
    for page_data in pages:
        # ``or`` guards against explicit nulls as well as missing keys.
        page_num = page_data.get("page_number") or 0
        ordered = sorted(page_data.get("chunks") or [], key=page_order)

        for chunk in ordered:
            position = len(all_chunks) + 1
            normalized = normalize_chunk(chunk, page_num)
            normalized["chunk_id"] = f"c{position:04d}"
            normalized["order_global"] = position
            all_chunks.append(normalized)

    # Link neighbours only after every id exists.
    last = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last else None

    return all_chunks
|
|
|
|
|
|
def crop_image(chunk: dict) -> str | None:
    """Crop the chunk's bounding box out of its page PNG and save it.

    The bbox is read as fractions of the page width/height and padded by
    0.5% on every side before being clamped to the image bounds.

    Args:
        chunk: Normalized chunk providing ``page``, ``chunk_id`` and ``bbox``.

    Returns:
        Path of the written ``IMG-<chunk_id>.png`` as a string, or ``None``
        when the page PNG is missing, the crop box is empty, or cropping
        fails (the error is reported on stderr).
    """
    page_num = chunk["page"]
    chunk_id = chunk["chunk_id"]
    png_path = PNG_DIR / f"p-{page_num:03d}.png"
    if not png_path.exists():
        return None

    bbox = chunk["bbox"]
    out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"

    try:
        from PIL import Image

        # The context manager closes the page file even when we bail out early.
        with Image.open(png_path) as im:
            W, H = im.size
            pad = 0.005
            x, y, w, h = bbox["x"], bbox["y"], bbox["w"], bbox["h"]
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                return None
            im.crop((left, top, right, bottom)).save(str(out_path))
        return str(out_path)
    except Exception as e:
        # Deliberately broad: one bad image must not abort the whole build.
        print(f" Crop error for {chunk_id}: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def _yaml_val(v) -> str:
    """Render *v* as an inline YAML value for the chunk front matter.

    ``None`` becomes ``null``, booleans ``true``/``false``, numbers their
    ``str()``, lists a flow sequence of quoted strings, and everything else a
    double-quoted string. Quoting goes through ``json.dumps`` so quotes,
    backslashes, newlines and control characters are escaped.
    """
    if v is None:
        return "null"
    if isinstance(v, bool):
        return "true" if v else "false"
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, list):
        # Every item is emitted as a quoted string, whatever its type.
        items = ", ".join(json.dumps(str(x), ensure_ascii=False) for x in v)
        return f"[{items}]"
    return json.dumps(str(v), ensure_ascii=False)


def write_chunk_file(chunk: dict, source_png_relative: str) -> None:
    """Write ``chunks/<chunk_id>.md`` for one chunk.

    The file is a YAML front-matter block with the chunk's metadata followed
    by the English and Brazilian-Portuguese content.

    Args:
        chunk: Normalized chunk that already carries ``chunk_id`` and ``bbox``.
        source_png_relative: Value written verbatim to ``source_png``.
    """
    chunk_id = chunk["chunk_id"]
    chunk_type = chunk.get("type", "paragraph")
    bbox = chunk["bbox"]
    # Only image chunks have a cropped PNG named after them.
    related_image = f"IMG-{chunk_id}.png" if chunk_type == "image" else None

    lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk_type}",
        f"page: {chunk.get('page', 0)}",
        f"order_in_page: {chunk.get('order_in_page', 1)}",
        f"order_global: {chunk.get('order_global', 1)}",
        f"bbox: {{x: {bbox['x']:.2f}, y: {bbox['y']:.2f}, w: {bbox['w']:.2f}, h: {bbox['h']:.2f}}}",
        f"classification: {_yaml_val(chunk.get('classification'))}",
        f"formatting: {_yaml_val(chunk.get('formatting', []))}",
        f"cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {_yaml_val(chunk.get('prev_chunk'))}",
        f"next_chunk: {_yaml_val(chunk.get('next_chunk'))}",
        f"related_image: {_yaml_val(related_image)}",
        f"related_table: {_yaml_val(chunk.get('related_table'))}",
        f"ocr_confidence: {chunk.get('ocr_confidence', 0.85)}",
        f"ocr_source_lines: {_yaml_val(chunk.get('ocr_source_lines', []))}",
        f"redaction_code: {_yaml_val(chunk.get('redaction_code'))}",
        f"redaction_inferred_content_type: {_yaml_val(chunk.get('redaction_inferred_content_type'))}",
        f"image_type: {_yaml_val(chunk.get('image_type'))}",
        f"ufo_anomaly_detected: {_yaml_val(chunk.get('ufo_anomaly_detected', False))}",
        f"cryptid_anomaly_detected: {_yaml_val(chunk.get('cryptid_anomaly_detected', False))}",
        f"ufo_anomaly_type: {_yaml_val(chunk.get('ufo_anomaly_type'))}",
        f"ufo_anomaly_rationale: {_yaml_val(chunk.get('ufo_anomaly_rationale'))}",
        f"cryptid_anomaly_type: {_yaml_val(chunk.get('cryptid_anomaly_type'))}",
        f"cryptid_anomaly_rationale: {_yaml_val(chunk.get('cryptid_anomaly_rationale'))}",
        f"image_description_en: {_yaml_val(chunk.get('image_description_en'))}",
        f"image_description_pt_br: {_yaml_val(chunk.get('image_description_pt_br'))}",
        f"extracted_text: {_yaml_val(chunk.get('extracted_text'))}",
        f"source_png: {source_png_relative}",
        "---",
        "",
        f"**EN:** {chunk.get('content_en', '')}",
        "",
        f"**PT-BR:** {chunk.get('content_pt_br', '')}",
    ]

    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    out_path.write_text("\n".join(lines), encoding="utf-8")
|
|
|
|
|
|
def write_index(all_chunks: list[dict]) -> None:
    """Write ``_index.json``: build metadata plus one summary entry per chunk.

    Each entry records the chunk's id, type, page, ordering, relative file
    path, bbox and the first 80 characters of its English content.
    """
    entries = [
        {
            "chunk_id": item["chunk_id"],
            "type": item.get("type", "paragraph"),
            "page": item.get("page", 0),
            "order_in_page": item.get("order_in_page", 1),
            "order_global": item.get("order_global", 1),
            "file": f"chunks/{item['chunk_id']}.md",
            "bbox": item["bbox"],
            "preview": item.get("content_en", "")[:80],
        }
        for item in all_chunks
    ]

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": BUILD_MODEL,
        "build_at": BUILD_AT,
        "chunks": entries,
    }

    out_path = OUTPUT_DIR / "_index.json"
    with open(out_path, "w", encoding="utf-8") as handle:
        json.dump(index, handle, ensure_ascii=False, indent=2)
    print(f"Written: {out_path}")
|
|
|
|
|
|
def _chunk_metadata(chunk: dict) -> dict:
    """Collect the per-chunk metadata shown in the collapsible JSON block."""
    cid = chunk["chunk_id"]
    ctype = chunk.get("type", "paragraph")
    return {
        "chunk_id": cid,
        "type": ctype,
        "page": chunk.get("page"),
        "order_in_page": chunk.get("order_in_page"),
        "order_global": chunk.get("order_global"),
        "bbox": chunk["bbox"],
        "classification": chunk.get("classification"),
        "formatting": chunk.get("formatting", []),
        "cross_page_hint": chunk.get("cross_page_hint"),
        "prev_chunk": chunk.get("prev_chunk"),
        "next_chunk": chunk.get("next_chunk"),
        "related_image": f"IMG-{cid}.png" if ctype == "image" else None,
        "related_table": chunk.get("related_table"),
        "ocr_confidence": chunk.get("ocr_confidence"),
        "ocr_source_lines": chunk.get("ocr_source_lines", []),
        "redaction_code": chunk.get("redaction_code"),
        "redaction_inferred_content_type": chunk.get("redaction_inferred_content_type"),
        "image_type": chunk.get("image_type"),
        "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
        "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
        "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
        "ufo_anomaly_rationale": chunk.get("ufo_anomaly_rationale"),
        "cryptid_anomaly_type": chunk.get("cryptid_anomaly_type"),
        "cryptid_anomaly_rationale": chunk.get("cryptid_anomaly_rationale"),
        "image_description_en": chunk.get("image_description_en"),
        "image_description_pt_br": chunk.get("image_description_pt_br"),
        "extracted_text": chunk.get("extracted_text"),
    }


def write_document_md(all_chunks: list[dict], stats: dict) -> int:
    """Write ``document.md``, the single-file view of every chunk.

    The file starts with a YAML front-matter block (totals, per-type
    histogram, ids of flagged chunks, build info) followed by one section per
    page. Each chunk gets an anchor, a heading, its EN / PT-BR content, an
    embedded crop for image chunks whose PNG exists, and a collapsible JSON
    metadata block.

    Args:
        all_chunks: Normalized chunks in global order.
        stats: Accepted for interface compatibility; not read here.

    Returns:
        Size of the written document in UTF-8 bytes.
    """
    histogram: dict[str, int] = defaultdict(int)
    ufo_flagged = []
    cryptid_flagged = []
    for chunk in all_chunks:
        histogram[chunk.get("type", "paragraph")] += 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_flagged.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_flagged.append(chunk["chunk_id"])

    if histogram:
        histogram_lines = ["chunk_types_histogram:"]
        histogram_lines.extend(f" {k}: {v}" for k, v in sorted(histogram.items()))
    else:
        # A bare key would parse as null; keep the value a mapping.
        histogram_lines = ["chunk_types_histogram: {}"]

    # JSON arrays of plain ids are valid YAML flow sequences.
    ufo_yaml = json.dumps(ufo_flagged, ensure_ascii=False)
    cryptid_yaml = json.dumps(cryptid_flagged, ensure_ascii=False)

    lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {len(all_chunks)}",
        *histogram_lines,
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {ufo_yaml}",
        f"cryptid_anomalies_flagged: {cryptid_yaml}",
        'build_approach: "subagents"',
        f"build_model: {BUILD_MODEL}",
        f"build_at: {BUILD_AT}",
        "---",
        "",
    ]

    # Group by page, keeping the global order inside each page.
    pages_map: dict[int, list[dict]] = defaultdict(list)
    for chunk in all_chunks:
        pages_map[chunk["page"]].append(chunk)

    for page_num in sorted(pages_map):
        lines.append(f"## Page {page_num}")
        lines.append("")

        for chunk in pages_map[page_num]:
            cid = chunk["chunk_id"]
            ctype = chunk.get("type", "paragraph")
            bbox = chunk["bbox"]
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

            lines.append(f"<!-- chunk:{cid} src:./chunks/{cid}.md -->")
            lines.append(f'<a id="{cid}"></a>')
            lines.append(f"### Chunk {cid} — {ctype} · p{page_num} · bbox: {bbox_str}")
            lines.append("")
            lines.append(f"**EN:** {chunk.get('content_en', '')}")
            lines.append("")
            lines.append(f"**PT-BR:** {chunk.get('content_pt_br', '')}")
            lines.append("")

            if ctype == "image":
                # Embed only crops that were actually written to disk; the
                # link is relative to document.md, which sits next to images/.
                if (IMAGES_DIR / f"IMG-{cid}.png").exists():
                    lines.append(f"![IMG-{cid}](./images/IMG-{cid}.png)")
                    lines.append("")
                if chunk.get("image_description_en"):
                    lines.append(f"*Image description:* {chunk['image_description_en']}")
                    lines.append("")

            lines.append("<details><summary>metadata</summary>")
            lines.append("")
            lines.append("```json")
            lines.append(json.dumps(_chunk_metadata(chunk), ensure_ascii=False, indent=2))
            lines.append("```")
            lines.append("")
            lines.append("</details>")
            lines.append("")
            lines.append("---")
            lines.append("")

    document = "\n".join(lines)
    out_path = OUTPUT_DIR / "document.md"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(document)
    print(f"Written: {out_path}")
    return len(document.encode("utf-8"))
|
|
|
|
|
|
def main():
    """Run the full build: load pages, number chunks, crop images, write outputs.

    Progress is printed to stdout and ends with a single ``STATS`` summary line.
    """
    started = time.time()

    print("Loading pages...")
    pages = load_pages()
    print(f" {len(pages)} pages loaded")

    print("Assigning global IDs...")
    all_chunks = assign_global_ids(pages)
    print(f" {len(all_chunks)} chunks total")

    # Output directories must exist before anything is written into them.
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"Cropping {len(image_chunks)} images...")
    # crop_image returns None on failure, so only successful crops count.
    images_saved = sum(1 for c in image_chunks if crop_image(c))

    print("Writing chunk files...")
    for chunk in all_chunks:
        write_chunk_file(chunk, f"../../processing/png/{DOC_ID}/p-{chunk['page']:03d}.png")
    print(f" {len(all_chunks)} chunk files written")

    print("Writing _index.json...")
    write_index(all_chunks)

    print("Writing document.md...")
    doc_bytes = write_document_md(all_chunks, {})

    ufo_count = sum(1 for c in all_chunks if c.get("ufo_anomaly_detected"))
    cryptid_count = sum(1 for c in all_chunks if c.get("cryptid_anomaly_detected"))
    elapsed = int(time.time() - started)

    print(f"\nDone in {elapsed}s")
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={images_saved} tables=0 ufo={ufo_count} cryptid={cryptid_count} doc_md_bytes={doc_bytes}")


if __name__ == "__main__":
    main()
|