disclosure-bureau/scripts/rebuild_doc65_assemble.py

462 lines
17 KiB
Python

#!/usr/bin/env python3
"""
Assemble chunks/, _index.json, and document.md from _pages_raw.json
for doc-65-hs1-834228961-62-hq-83894-section-1.
Also:
- Crops image chunks using PIL
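- Detects multi-page table markers for stitching (NOTE: not implemented in the code below; multi_page_tables is always written as [] and the tables stat is always 0)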
- Writes all output files
"""
from __future__ import annotations
import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from collections import defaultdict
# Identifier of the document being rebuilt; it is also the last path component
# of PNG_DIR and OUTPUT_DIR below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-1"
# Human-readable title, written to document.md as canonical_title.
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 1 (FBI UAP/UFO Investigative File)"
# Directory of per-page PNGs; crop_image() reads p-NNN.png files from here.
PNG_DIR = Path("/Users/guto/ufo/processing/png/doc-65-hs1-834228961-62-hq-83894-section-1")
# Output root: _pages_raw.json is read from here, and _index.json and
# document.md are written here.
OUTPUT_DIR = Path("/Users/guto/ufo/raw/doc-65-hs1-834228961-62-hq-83894-section-1")
# Receives one <chunk_id>.md file per chunk.
CHUNKS_DIR = OUTPUT_DIR / "chunks"
# Receives the cropped IMG-<chunk_id>.png files.
IMAGES_DIR = OUTPUT_DIR / "images"
# Created by main(); nothing in the visible code writes into it.
TABLES_DIR = OUTPUT_DIR / "tables"
# Hard-coded page count reported in the outputs (not derived from the loaded pages).
TOTAL_PAGES = 150
# UTC timestamp captured once at import time, formatted as YYYY-MM-DDTHH:MM:SSZ.
BUILD_AT = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
# Model name recorded as build_model in _index.json and document.md.
BUILD_MODEL = "claude-haiku-4-5"
def load_pages() -> list[dict]:
    """Parse ``_pages_raw.json`` from the output directory and return the result."""
    source = OUTPUT_DIR / "_pages_raw.json"
    with source.open(encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed
def _bbox_component(raw: object, fallback: float) -> float:
    """Return *raw* converted to float, or *fallback* if it cannot be converted."""
    try:
        return float(raw)
    except (TypeError, ValueError):
        # None, non-numeric strings, containers, ...: use the default instead
        # of aborting the whole run because of one malformed chunk.
        return fallback


def normalize_chunk(chunk: dict, page_num: int) -> dict:
    """Return a new dict with every required chunk field present and coerced.

    The input dict is not modified. Fields missing from *chunk* receive the
    defaults listed below; fields present in *chunk* win over the defaults.

    Args:
        chunk: Raw chunk dict as loaded from the pages file.
        page_num: Page number stored under ``"page"`` (overrides any ``"page"``
            key already present in *chunk*).

    Returns:
        A dict in which:
        - ``content_en`` / ``content_pt_br`` are never ``None``;
        - ``ufo_anomaly_detected`` / ``cryptid_anomaly_detected`` are ``bool``;
        - ``formatting`` / ``ocr_source_lines`` are lists;
        - ``bbox`` is a dict of four floats ``x``, ``y``, ``w``, ``h``; any
          component that is missing, ``None`` or not convertible to float
          falls back to its default (0.0, 0.0, 1.0, 0.05).
    """
    defaults = {
        "order_in_page": 1,
        "type": "paragraph",
        "content_en": "",
        "content_pt_br": "",
        "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.05},
        "classification": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "ocr_confidence": 0.85,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
        "image_description_en": None,
        "image_description_pt_br": None,
        "extracted_text": None,
    }
    result = {**defaults, **chunk}

    # An explicit None for the text fields becomes an empty string.
    for text_field in ("content_en", "content_pt_br"):
        if result.get(text_field) is None:
            result[text_field] = ""

    result["page"] = page_num

    # Normalize booleans: strings are matched against a small truthy set,
    # None means False, anything else goes through bool().
    for bool_field in ("ufo_anomaly_detected", "cryptid_anomaly_detected"):
        val = result.get(bool_field)
        if isinstance(val, str):
            result[bool_field] = val.lower() in ("true", "1", "yes")
        elif val is None:
            result[bool_field] = False
        else:
            result[bool_field] = bool(val)

    # Anything that is not already a list is replaced by an empty list.
    for list_field in ("formatting", "ocr_source_lines"):
        if not isinstance(result.get(list_field), list):
            result[list_field] = []

    # Rebuild bbox from scratch so it always has exactly the four float keys.
    bbox = result.get("bbox", {})
    if not isinstance(bbox, dict):
        bbox = {}
    result["bbox"] = {
        "x": _bbox_component(bbox.get("x", 0.0), 0.0),
        "y": _bbox_component(bbox.get("y", 0.0), 0.0),
        "w": _bbox_component(bbox.get("w", 1.0), 1.0),
        "h": _bbox_component(bbox.get("h", 0.05), 0.05),
    }
    return result
def assign_global_ids(pages: list[dict]) -> list[dict]:
    """Flatten all pages into one globally numbered, doubly linked chunk list.

    Pages are visited in the order given. Within a page, chunks are ordered by
    ``order_in_page`` (missing or ``None`` sorts as 0; ties keep their input
    order). Each chunk is passed through ``normalize_chunk`` and then receives
    ``chunk_id`` (``c0001``, ``c0002``, ...), ``order_global``, ``prev_chunk``
    and ``next_chunk``.

    The input is not modified: the per-page lists are sorted into copies, so
    the caller's ``pages`` keep their original chunk order.

    Args:
        pages: Page dicts; ``page_number`` defaults to 0 and ``chunks`` may be
            missing or ``None`` (treated as no chunks).

    Returns:
        The normalized chunks in global order.
    """

    def page_order(raw_chunk: dict):
        # An explicit None cannot be compared with ints while sorting, so
        # treat it like a missing key.
        position = raw_chunk.get("order_in_page", 0)
        return 0 if position is None else position

    all_chunks = []
    counter = 1
    for page_data in pages:
        page_num = page_data.get("page_number", 0)
        # sorted() rather than list.sort(): do not reorder the caller's list.
        ordered = sorted(page_data.get("chunks") or [], key=page_order)
        for chunk in ordered:
            normalized = normalize_chunk(chunk, page_num)
            normalized["chunk_id"] = f"c{counter:04d}"
            normalized["order_global"] = counter
            all_chunks.append(normalized)
            counter += 1

    # Link neighbours; the first chunk has no prev and the last has no next.
    last_index = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last_index else None
    return all_chunks
def crop_image(chunk: dict) -> str | None:
    """Crop the chunk's bounding box out of its page PNG and save it.

    The source image is ``PNG_DIR/p-<page:03d>.png`` and the crop is written to
    ``IMAGES_DIR/IMG-<chunk_id>.png``. The bbox values are multiplied by the
    image width/height, i.e. they are used as fractions of the page size, and
    the box is grown by 0.5% of the page on every side before clamping to the
    image bounds.

    Args:
        chunk: Chunk dict providing ``page``, ``chunk_id`` and ``bbox``
            (``x``, ``y``, ``w``, ``h``).

    Returns:
        The saved file path as a string, or ``None`` when the page PNG does not
        exist, the resulting box is empty, or any error occurs (the error is
        reported on stderr).
    """
    page_num = chunk["page"]
    chunk_id = chunk["chunk_id"]
    png_path = PNG_DIR / f"p-{page_num:03d}.png"
    if not png_path.exists():
        return None
    bbox = chunk["bbox"]
    out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"
    try:
        # Imported lazily so a missing Pillow install is reported per chunk by
        # the handler below instead of breaking the module import.
        from PIL import Image

        # Context manager closes the underlying file even on early return or
        # error; without it every cropped page leaves a file handle open.
        with Image.open(png_path) as im:
            W, H = im.size
            pad = 0.005
            x, y, w, h = bbox["x"], bbox["y"], bbox["w"], bbox["h"]
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                # Degenerate box: nothing to crop.
                return None
            cropped = im.crop((left, top, right, bottom))
            cropped.save(str(out_path))
        return str(out_path)
    except Exception as e:
        # Deliberately best-effort: one unreadable page must not stop the run.
        print(f" Crop error for {chunk_id}: {e}", file=sys.stderr)
        return None
def write_chunk_file(chunk: dict, source_png_relative: str) -> None:
    """Write ``CHUNKS_DIR/<chunk_id>.md`` for one chunk.

    The file is a ``---``-delimited front-matter block with the chunk's
    metadata, followed by the ``**EN:**`` and ``**PT-BR:**`` content lines.

    String metadata values are emitted as JSON string literals so that
    backslashes, quotes, newlines and control characters are escaped. For
    strings without such characters the output is the same ``"text"`` form as
    before.

    Args:
        chunk: Chunk dict. ``chunk_id`` and ``bbox`` (with ``x``, ``y``, ``w``,
            ``h``) are required; all other fields fall back to defaults.
        source_png_relative: Text written verbatim as the ``source_png`` value.
    """
    chunk_id = chunk["chunk_id"]
    chunk_type = chunk.get("type", "paragraph")
    page = chunk.get("page", 0)
    order_in_page = chunk.get("order_in_page", 1)
    order_global = chunk.get("order_global", 1)
    bbox = chunk["bbox"]
    classification = chunk.get("classification")
    formatting = chunk.get("formatting", [])
    cross_page_hint = chunk.get("cross_page_hint", "self_contained")
    prev_chunk = chunk.get("prev_chunk")
    next_chunk = chunk.get("next_chunk")
    ocr_confidence = chunk.get("ocr_confidence", 0.85)
    ocr_source_lines = chunk.get("ocr_source_lines", [])
    redaction_code = chunk.get("redaction_code")
    redaction_inferred = chunk.get("redaction_inferred_content_type")
    image_type = chunk.get("image_type")
    ufo_anomaly = chunk.get("ufo_anomaly_detected", False)
    ufo_type = chunk.get("ufo_anomaly_type")
    ufo_rationale = chunk.get("ufo_anomaly_rationale")
    cryptid_anomaly = chunk.get("cryptid_anomaly_detected", False)
    cryptid_type = chunk.get("cryptid_anomaly_type")
    cryptid_rationale = chunk.get("cryptid_anomaly_rationale")
    image_desc_en = chunk.get("image_description_en")
    image_desc_pt = chunk.get("image_description_pt_br")
    extracted_text = chunk.get("extracted_text")
    content_en = chunk.get("content_en", "")
    content_pt_br = chunk.get("content_pt_br", "")

    # Only image chunks point at a cropped image file.
    related_image = f"IMG-{chunk_id}.png" if chunk_type == "image" else None
    related_table = chunk.get("related_table")

    def quoted(text: object) -> str:
        # A JSON string literal is also a valid YAML double-quoted scalar and
        # escapes backslashes, quotes and control characters in one step.
        return json.dumps(str(text), ensure_ascii=False)

    def yaml_val(v):
        if v is None:
            return "null"
        # bool must be tested before int: bool is a subclass of int.
        if isinstance(v, bool):
            return "true" if v else "false"
        if isinstance(v, (int, float)):
            return str(v)
        if isinstance(v, list):
            if not v:
                return "[]"
            # Every item is rendered as a quoted string, whatever its type.
            return "[" + ", ".join(quoted(x) for x in v) + "]"
        return quoted(v)

    # A None confidence must be YAML null, not the Python text "None"; any
    # other value is written exactly as before.
    confidence_text = "null" if ocr_confidence is None else f"{ocr_confidence}"

    lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk_type}",
        f"page: {page}",
        f"order_in_page: {order_in_page}",
        f"order_global: {order_global}",
        f"bbox: {{x: {bbox['x']:.2f}, y: {bbox['y']:.2f}, w: {bbox['w']:.2f}, h: {bbox['h']:.2f}}}",
        f"classification: {yaml_val(classification)}",
        f"formatting: {yaml_val(formatting)}",
        f"cross_page_hint: {cross_page_hint}",
        f"prev_chunk: {yaml_val(prev_chunk)}",
        f"next_chunk: {yaml_val(next_chunk)}",
        f"related_image: {yaml_val(related_image)}",
        f"related_table: {yaml_val(related_table)}",
        f"ocr_confidence: {confidence_text}",
        f"ocr_source_lines: {yaml_val(ocr_source_lines)}",
        f"redaction_code: {yaml_val(redaction_code)}",
        f"redaction_inferred_content_type: {yaml_val(redaction_inferred)}",
        f"image_type: {yaml_val(image_type)}",
        f"ufo_anomaly_detected: {yaml_val(ufo_anomaly)}",
        f"cryptid_anomaly_detected: {yaml_val(cryptid_anomaly)}",
        f"ufo_anomaly_type: {yaml_val(ufo_type)}",
        f"ufo_anomaly_rationale: {yaml_val(ufo_rationale)}",
        f"cryptid_anomaly_type: {yaml_val(cryptid_type)}",
        f"cryptid_anomaly_rationale: {yaml_val(cryptid_rationale)}",
        f"image_description_en: {yaml_val(image_desc_en)}",
        f"image_description_pt_br: {yaml_val(image_desc_pt)}",
        f"extracted_text: {yaml_val(extracted_text)}",
        f"source_png: {source_png_relative}",
        "---",
        "",
        f"**EN:** {content_en}",
        "",
        f"**PT-BR:** {content_pt_br}",
    ]
    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    out_path.write_text("\n".join(lines), encoding="utf-8")
def write_index(all_chunks: list[dict]) -> None:
    """Write ``_index.json``: build metadata plus one summary entry per chunk.

    Each entry carries the chunk's id, type, page, ordering fields, the
    relative path of its chunk file, its bbox, and the first 80 characters of
    ``content_en`` as a preview.
    """
    entries = [
        {
            "chunk_id": item["chunk_id"],
            "type": item.get("type", "paragraph"),
            "page": item.get("page", 0),
            "order_in_page": item.get("order_in_page", 1),
            "order_global": item.get("order_global", 1),
            "file": f"chunks/{item['chunk_id']}.md",
            "bbox": item["bbox"],
            "preview": item.get("content_en", "")[:80],
        }
        for item in all_chunks
    ]
    payload = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": BUILD_MODEL,
        "build_at": BUILD_AT,
        "chunks": entries,
    }
    out_path = OUTPUT_DIR / "_index.json"
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(serialized)
    print(f"Written: {out_path}")
def write_document_md(all_chunks: list[dict], stats: dict) -> int:
    """Write ``document.md``: a front-matter header plus every chunk, by page.

    The header records the document id/title, chunk counts, a per-type
    histogram, the ids of chunks flagged as UFO or cryptid anomalies, and the
    build metadata. The body has one ``## Page N`` section per page (ascending
    page number), and for each chunk an anchor, a heading, the EN / PT-BR
    content, an embedded image when the cropped file exists, and a collapsible
    JSON metadata block.

    Args:
        all_chunks: Chunks in global order; each needs ``chunk_id``, ``page``
            and ``bbox``.
        stats: Accepted for interface compatibility; not used.

    Returns:
        Size in bytes of the UTF-8 encoded document text.
    """
    # Count chunk types and collect the ids of flagged chunks.
    histogram: dict[str, int] = defaultdict(int)
    ufo_flagged = []
    cryptid_flagged = []
    for chunk in all_chunks:
        histogram[chunk.get("type", "paragraph")] += 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_flagged.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_flagged.append(chunk["chunk_id"])
    histogram_yaml = "\n".join(f" {k}: {v}" for k, v in sorted(histogram.items()))
    # A JSON array of strings is emitted as the front-matter list value.
    ufo_yaml = json.dumps(ufo_flagged, ensure_ascii=False)
    cryptid_yaml = json.dumps(cryptid_flagged, ensure_ascii=False)
    lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {len(all_chunks)}",
        "chunk_types_histogram:",
        histogram_yaml,
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {ufo_yaml}",
        f"cryptid_anomalies_flagged: {cryptid_yaml}",
        'build_approach: "subagents"',
        f"build_model: {BUILD_MODEL}",
        f"build_at: {BUILD_AT}",
        "---",
        "",
    ]

    # Group chunks by page, keeping global order within each page.
    pages_map: dict[int, list[dict]] = defaultdict(list)
    for chunk in all_chunks:
        pages_map[chunk["page"]].append(chunk)

    for page_num in sorted(pages_map):
        lines.append(f"## Page {page_num}")
        lines.append("")
        for chunk in pages_map[page_num]:
            cid = chunk["chunk_id"]
            ctype = chunk.get("type", "paragraph")
            bbox = chunk["bbox"]
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"
            content_en = chunk.get("content_en", "")
            content_pt_br = chunk.get("content_pt_br", "")
            lines.append(f"<!-- chunk:{cid} src:./chunks/{cid}.md -->")
            lines.append(f'<a id="{cid}"></a>')
            # The id and type were previously fused ("c0001paragraph"); use the
            # same " · " separator as the rest of the heading.
            lines.append(f"### Chunk {cid} · {ctype} · p{page_num} · bbox: {bbox_str}")
            lines.append("")
            lines.append(f"**EN:** {content_en}")
            lines.append("")
            lines.append(f"**PT-BR:** {content_pt_br}")
            lines.append("")

            # Embed the cropped image only if the file exists on disk.
            if ctype == "image":
                img_path = IMAGES_DIR / f"IMG-{cid}.png"
                if img_path.exists():
                    lines.append(f"![{cid} image](./images/IMG-{cid}.png)")
                    lines.append("")
                if chunk.get("image_description_en"):
                    lines.append(f"*Image description:* {chunk['image_description_en']}")
                    lines.append("")

            # Full metadata in a collapsible block.
            meta = {
                "chunk_id": cid,
                "type": ctype,
                "page": chunk.get("page"),
                "order_in_page": chunk.get("order_in_page"),
                "order_global": chunk.get("order_global"),
                "bbox": bbox,
                "classification": chunk.get("classification"),
                "formatting": chunk.get("formatting", []),
                "cross_page_hint": chunk.get("cross_page_hint"),
                "prev_chunk": chunk.get("prev_chunk"),
                "next_chunk": chunk.get("next_chunk"),
                "related_image": f"IMG-{cid}.png" if ctype == "image" else None,
                "related_table": chunk.get("related_table"),
                "ocr_confidence": chunk.get("ocr_confidence"),
                "ocr_source_lines": chunk.get("ocr_source_lines", []),
                "redaction_code": chunk.get("redaction_code"),
                "redaction_inferred_content_type": chunk.get("redaction_inferred_content_type"),
                "image_type": chunk.get("image_type"),
                "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
                "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
                "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
                "ufo_anomaly_rationale": chunk.get("ufo_anomaly_rationale"),
                "cryptid_anomaly_type": chunk.get("cryptid_anomaly_type"),
                "cryptid_anomaly_rationale": chunk.get("cryptid_anomaly_rationale"),
                "image_description_en": chunk.get("image_description_en"),
                "image_description_pt_br": chunk.get("image_description_pt_br"),
                "extracted_text": chunk.get("extracted_text"),
            }
            lines.append("<details><summary>metadata</summary>")
            lines.append("")
            lines.append("```json")
            lines.append(json.dumps(meta, ensure_ascii=False, indent=2))
            lines.append("```")
            lines.append("")
            lines.append("</details>")
            lines.append("")
            lines.append("---")
            lines.append("")

    # Join once and reuse the text for both the file and the byte count.
    document_text = "\n".join(lines)
    out_path = OUTPUT_DIR / "document.md"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(document_text)
    print(f"Written: {out_path}")
    return len(document_text.encode("utf-8"))
def main():
    """Run the rebuild: load pages, number chunks, crop images, write outputs.

    Progress and a final ``STATS`` summary line are printed to stdout.
    """
    started_at = time.time()

    print("Loading pages...")
    pages = load_pages()
    print(f" {len(pages)} pages loaded")

    print("Assigning global IDs...")
    all_chunks = assign_global_ids(pages)
    total_chunks = len(all_chunks)
    print(f" {total_chunks} chunks total")

    # Make sure every output directory exists before anything is written.
    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # Crop one image per image-type chunk, counting the successful saves.
    image_chunks = [item for item in all_chunks if item.get("type") == "image"]
    print(f"Cropping {len(image_chunks)} images...")
    images_saved = sum(1 for item in image_chunks if crop_image(item))

    # One markdown file per chunk, each pointing back at its page PNG.
    print("Writing chunk files...")
    for item in all_chunks:
        write_chunk_file(item, f"../../processing/png/{DOC_ID}/p-{item['page']:03d}.png")
    print(f" {total_chunks} chunk files written")

    print("Writing _index.json...")
    write_index(all_chunks)

    print("Writing document.md...")
    stats = {}
    doc_bytes = write_document_md(all_chunks, stats)

    # Final summary figures.
    ufo_count = len([item for item in all_chunks if item.get("ufo_anomaly_detected")])
    cryptid_count = len([item for item in all_chunks if item.get("cryptid_anomaly_detected")])
    elapsed = int(time.time() - started_at)
    print(f"\nDone in {elapsed}s")
    summary = (
        f"STATS pages={TOTAL_PAGES} chunks={total_chunks} images={images_saved} "
        f"tables=0 ufo={ufo_count} cryptid={cryptid_count} doc_md_bytes={doc_bytes}"
    )
    print(summary)
# Run the assembly pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()