#!/usr/bin/env python3
"""
Assemble chunks/, _index.json, and document.md from _pages_raw.json
for doc-65-hs1-834228961-62-hq-83894-section-1.

Also:
- Crops image chunks using PIL
- Writes all output files

Multi-page table stitching is not implemented in this script: document.md
always records ``multi_page_tables: []`` and the final STATS line reports
``tables=0``.
"""
from __future__ import annotations

import json
import os
import re
import subprocess
import sys
import time
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-1"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 1 (FBI UAP/UFO Investigative File)"
PNG_DIR = Path("/Users/guto/ufo/processing/png/doc-65-hs1-834228961-62-hq-83894-section-1")
OUTPUT_DIR = Path("/Users/guto/ufo/raw/doc-65-hs1-834228961-62-hq-83894-section-1")
CHUNKS_DIR = OUTPUT_DIR / "chunks"
IMAGES_DIR = OUTPUT_DIR / "images"
TABLES_DIR = OUTPUT_DIR / "tables"
TOTAL_PAGES = 150
BUILD_AT = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
BUILD_MODEL = "claude-haiku-4-5"

# Fallback bounding box used when a chunk has no (or an unusable) bbox.
_DEFAULT_BBOX = {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.05}


def load_pages() -> list[dict]:
    """Load and return the parsed contents of ``_pages_raw.json`` in OUTPUT_DIR."""
    raw_path = OUTPUT_DIR / "_pages_raw.json"
    with open(raw_path, encoding="utf-8") as f:
        return json.load(f)


def _coerce_float(value, default: float) -> float:
    """Return ``float(value)``, or *default* when the value cannot be converted."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def normalize_chunk(chunk: dict, page_num: int) -> dict:
    """Ensure all required fields exist with correct types.

    Returns a new dict (the input *chunk* is not modified) in which:

    - every field listed in the defaults below is present;
    - ``content_en`` / ``content_pt_br`` are ``''`` instead of ``None``;
    - ``page`` is set to *page_num*;
    - the two ``*_anomaly_detected`` flags are real booleans;
    - ``formatting`` and ``ocr_source_lines`` are lists;
    - ``bbox`` is a dict of four floats, with missing or non-numeric
      components replaced by their default.
    """
    defaults = {
        "order_in_page": 1,
        "type": "paragraph",
        "content_en": "",
        "content_pt_br": "",
        "bbox": dict(_DEFAULT_BBOX),
        "classification": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "ocr_confidence": 0.85,
        "ocr_source_lines": [],
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "ufo_anomaly_detected": False,
        "ufo_anomaly_type": None,
        "ufo_anomaly_rationale": None,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
        "image_description_en": None,
        "image_description_pt_br": None,
        "extracted_text": None,
    }
    result = {**defaults, **chunk}

    # Coerce None strings to empty
    for text_field in ("content_en", "content_pt_br"):
        if result.get(text_field) is None:
            result[text_field] = ""

    result["page"] = page_num

    # Normalize booleans
    for bool_field in ("ufo_anomaly_detected", "cryptid_anomaly_detected"):
        val = result.get(bool_field)
        if isinstance(val, str):
            result[bool_field] = val.lower() in ("true", "1", "yes")
        elif val is None:
            result[bool_field] = False
        else:
            result[bool_field] = bool(val)

    # Normalize formatting / ocr_source_lines to lists
    for list_field in ("formatting", "ocr_source_lines"):
        if not isinstance(result.get(list_field), list):
            result[list_field] = []

    # Normalize bbox. A null or non-numeric component falls back to its
    # default instead of raising, so one bad value cannot abort the build.
    bbox = result.get("bbox", {})
    if not isinstance(bbox, dict):
        bbox = {}
    result["bbox"] = {
        key: _coerce_float(bbox.get(key, default), default)
        for key, default in _DEFAULT_BBOX.items()
    }

    return result


def _order_in_page_key(chunk: dict):
    """Sort key for chunks within a page; missing or null order sorts as 0."""
    order = chunk.get("order_in_page")
    return 0 if order is None else order


def assign_global_ids(pages: list[dict]) -> list[dict]:
    """
    Assign chunk_id, order_global, prev_chunk, next_chunk to all chunks.

    Chunks are taken page by page (in the order of *pages*) and, within a
    page, sorted by ``order_in_page``. The input structures are left
    untouched: the per-page lists are not reordered and every returned chunk
    is a fresh normalized dict.

    Returns flat list of all chunks in global order.
    """
    all_chunks = []
    for page_data in pages:
        page_num = page_data.get("page_number", 0)
        # "chunks" may be absent or null for an empty page.
        page_chunks = sorted(page_data.get("chunks") or [], key=_order_in_page_key)
        for chunk in page_chunks:
            counter = len(all_chunks) + 1
            normalized = normalize_chunk(chunk, page_num)
            normalized["chunk_id"] = f"c{counter:04d}"
            normalized["order_global"] = counter
            all_chunks.append(normalized)

    # Set prev/next pointers
    last = len(all_chunks) - 1
    for i, chunk in enumerate(all_chunks):
        chunk["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        chunk["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last else None

    return all_chunks


def crop_image(chunk: dict) -> str | None:
    """Crop image region from page PNG. Returns saved path or None.

    The chunk's ``bbox`` is read as fractions of the page width/height, is
    padded by 0.005 on every side and clamped to the page. The crop is saved
    as ``IMAGES_DIR/IMG-<chunk_id>.png``. ``None`` is returned when the page
    PNG does not exist, when the clamped region is empty, or when cropping
    fails (the error is printed to stderr).
    """
    page_num = chunk["page"]
    chunk_id = chunk["chunk_id"]
    png_path = PNG_DIR / f"p-{page_num:03d}.png"
    if not png_path.exists():
        return None

    bbox = chunk["bbox"]
    out_path = IMAGES_DIR / f"IMG-{chunk_id}.png"

    try:
        from PIL import Image

        # The context manager closes the page file even on early return/error.
        with Image.open(png_path) as im:
            W, H = im.size
            pad = 0.005
            x, y, w, h = bbox["x"], bbox["y"], bbox["w"], bbox["h"]
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                return None
            # Save while the source file is still open.
            im.crop((left, top, right, bottom)).save(str(out_path))
        return str(out_path)
    except Exception as e:
        # Deliberately best-effort: a failed crop is reported and skipped so
        # the rest of the document is still assembled.
        print(f"  Crop error for {chunk_id}: {e}", file=sys.stderr)
        return None


def yaml_val(v) -> str:
    """Render a Python value as an inline YAML value for chunk front matter.

    ``None`` -> ``null``; booleans -> ``true``/``false``; numbers -> ``str``;
    lists -> a flow sequence whose items are each rendered as quoted strings;
    anything else -> a double-quoted string.

    Strings are escaped with ``json.dumps``, so quotes, backslashes and line
    breaks are all escaped and a multi-line value cannot break out of the
    front-matter block.
    """
    if v is None:
        return "null"
    if isinstance(v, bool):  # must precede the int check: bool is an int
        return "true" if v else "false"
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, list):
        if not v:
            return "[]"
        items = ", ".join(json.dumps(str(x), ensure_ascii=False) for x in v)
        return f"[{items}]"
    return json.dumps(str(v), ensure_ascii=False)


def write_chunk_file(chunk: dict, source_png_relative: str) -> None:
    """Write ``CHUNKS_DIR/<chunk_id>.md`` for one chunk.

    The file is a YAML front-matter block with the chunk's metadata followed
    by the English and Brazilian-Portuguese content. *source_png_relative* is
    written verbatim as the ``source_png`` value.
    """
    chunk_id = chunk["chunk_id"]
    chunk_type = chunk.get("type", "paragraph")
    bbox = chunk["bbox"]

    # Related fields
    related_image = f"IMG-{chunk_id}.png" if chunk_type == "image" else None

    lines = [
        "---",
        f"chunk_id: {chunk_id}",
        f"type: {chunk_type}",
        f"page: {chunk.get('page', 0)}",
        f"order_in_page: {chunk.get('order_in_page', 1)}",
        f"order_global: {chunk.get('order_global', 1)}",
        f"bbox: {{x: {bbox['x']:.2f}, y: {bbox['y']:.2f}, w: {bbox['w']:.2f}, h: {bbox['h']:.2f}}}",
        f"classification: {yaml_val(chunk.get('classification'))}",
        f"formatting: {yaml_val(chunk.get('formatting', []))}",
        f"cross_page_hint: {chunk.get('cross_page_hint', 'self_contained')}",
        f"prev_chunk: {yaml_val(chunk.get('prev_chunk'))}",
        f"next_chunk: {yaml_val(chunk.get('next_chunk'))}",
        f"related_image: {yaml_val(related_image)}",
        f"related_table: {yaml_val(chunk.get('related_table'))}",
        # Rendered through yaml_val so a null confidence becomes `null`
        # rather than the Python repr `None`.
        f"ocr_confidence: {yaml_val(chunk.get('ocr_confidence', 0.85))}",
        f"ocr_source_lines: {yaml_val(chunk.get('ocr_source_lines', []))}",
        f"redaction_code: {yaml_val(chunk.get('redaction_code'))}",
        f"redaction_inferred_content_type: {yaml_val(chunk.get('redaction_inferred_content_type'))}",
        f"image_type: {yaml_val(chunk.get('image_type'))}",
        f"ufo_anomaly_detected: {yaml_val(chunk.get('ufo_anomaly_detected', False))}",
        f"cryptid_anomaly_detected: {yaml_val(chunk.get('cryptid_anomaly_detected', False))}",
        f"ufo_anomaly_type: {yaml_val(chunk.get('ufo_anomaly_type'))}",
        f"ufo_anomaly_rationale: {yaml_val(chunk.get('ufo_anomaly_rationale'))}",
        f"cryptid_anomaly_type: {yaml_val(chunk.get('cryptid_anomaly_type'))}",
        f"cryptid_anomaly_rationale: {yaml_val(chunk.get('cryptid_anomaly_rationale'))}",
        f"image_description_en: {yaml_val(chunk.get('image_description_en'))}",
        f"image_description_pt_br: {yaml_val(chunk.get('image_description_pt_br'))}",
        f"extracted_text: {yaml_val(chunk.get('extracted_text'))}",
        f"source_png: {source_png_relative}",
        "---",
        "",
        f"**EN:** {chunk.get('content_en', '')}",
        "",
        f"**PT-BR:** {chunk.get('content_pt_br', '')}",
    ]

    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    out_path.write_text("\n".join(lines), encoding="utf-8")


def write_index(all_chunks: list[dict]) -> None:
    """Write ``OUTPUT_DIR/_index.json``: build info plus one entry per chunk.

    Each entry carries the chunk's id, type, position, bbox, relative file
    path and the first 80 characters of its English content as a preview.
    """
    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": BUILD_MODEL,
        "build_at": BUILD_AT,
        "chunks": [
            {
                "chunk_id": chunk["chunk_id"],
                "type": chunk.get("type", "paragraph"),
                "page": chunk.get("page", 0),
                "order_in_page": chunk.get("order_in_page", 1),
                "order_global": chunk.get("order_global", 1),
                "file": f"chunks/{chunk['chunk_id']}.md",
                "bbox": chunk["bbox"],
                "preview": chunk.get("content_en", "")[:80],
            }
            for chunk in all_chunks
        ],
    }

    out_path = OUTPUT_DIR / "_index.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(index, f, ensure_ascii=False, indent=2)
    print(f"Written: {out_path}")


def _chunk_metadata(chunk: dict) -> dict:
    """Build the per-chunk metadata dict embedded as JSON in document.md."""
    cid = chunk["chunk_id"]
    ctype = chunk.get("type", "paragraph")
    return {
        "chunk_id": cid,
        "type": ctype,
        "page": chunk.get("page"),
        "order_in_page": chunk.get("order_in_page"),
        "order_global": chunk.get("order_global"),
        "bbox": chunk["bbox"],
        "classification": chunk.get("classification"),
        "formatting": chunk.get("formatting", []),
        "cross_page_hint": chunk.get("cross_page_hint"),
        "prev_chunk": chunk.get("prev_chunk"),
        "next_chunk": chunk.get("next_chunk"),
        "related_image": f"IMG-{cid}.png" if ctype == "image" else None,
        "related_table": chunk.get("related_table"),
        "ocr_confidence": chunk.get("ocr_confidence"),
        "ocr_source_lines": chunk.get("ocr_source_lines", []),
        "redaction_code": chunk.get("redaction_code"),
        "redaction_inferred_content_type": chunk.get("redaction_inferred_content_type"),
        "image_type": chunk.get("image_type"),
        "ufo_anomaly_detected": chunk.get("ufo_anomaly_detected", False),
        "cryptid_anomaly_detected": chunk.get("cryptid_anomaly_detected", False),
        "ufo_anomaly_type": chunk.get("ufo_anomaly_type"),
        "ufo_anomaly_rationale": chunk.get("ufo_anomaly_rationale"),
        "cryptid_anomaly_type": chunk.get("cryptid_anomaly_type"),
        "cryptid_anomaly_rationale": chunk.get("cryptid_anomaly_rationale"),
        "image_description_en": chunk.get("image_description_en"),
        "image_description_pt_br": chunk.get("image_description_pt_br"),
        "extracted_text": chunk.get("extracted_text"),
    }


def write_document_md(all_chunks: list[dict], stats: dict) -> int:
    """Write ``OUTPUT_DIR/document.md`` and return its size in UTF-8 bytes.

    The document is a YAML front-matter header (totals, per-type histogram,
    ids of chunks flagged as UFO / cryptid anomalies) followed by one section
    per page with every chunk's content, optional cropped image and metadata.

    *stats* is not used by this function; it is kept so existing callers
    keep working.
    """
    # Compute histogram
    histogram: dict[str, int] = defaultdict(int)
    ufo_flagged = []
    cryptid_flagged = []
    for chunk in all_chunks:
        histogram[chunk.get("type", "paragraph")] += 1
        if chunk.get("ufo_anomaly_detected"):
            ufo_flagged.append(chunk["chunk_id"])
        if chunk.get("cryptid_anomaly_detected"):
            cryptid_flagged.append(chunk["chunk_id"])

    histogram_yaml = "\n".join(f"  {k}: {v}" for k, v in sorted(histogram.items()))
    ufo_yaml = json.dumps(ufo_flagged, ensure_ascii=False)
    cryptid_yaml = json.dumps(cryptid_flagged, ensure_ascii=False)

    lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {TOTAL_PAGES}",
        f"total_chunks: {len(all_chunks)}",
        "chunk_types_histogram:",
        histogram_yaml,
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {ufo_yaml}",
        f"cryptid_anomalies_flagged: {cryptid_yaml}",
        'build_approach: "subagents"',
        f"build_model: {BUILD_MODEL}",
        f"build_at: {BUILD_AT}",
        "---",
        "",
    ]

    # Group chunks by page
    pages_map: dict[int, list[dict]] = defaultdict(list)
    for chunk in all_chunks:
        pages_map[chunk["page"]].append(chunk)

    for page_num in sorted(pages_map):
        lines.append(f"## Page {page_num}")
        lines.append("")

        for chunk in pages_map[page_num]:
            cid = chunk["chunk_id"]
            ctype = chunk.get("type", "paragraph")
            bbox = chunk["bbox"]
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

            # NOTE(review): the source as received appends two empty strings
            # here. They may originally have held per-chunk markup that was
            # lost in transit - confirm against the original script.
            lines.append("")
            lines.append("")
            lines.append(f"### Chunk {cid} — {ctype} · p{page_num} · bbox: {bbox_str}")
            lines.append("")
            lines.append(f"**EN:** {chunk.get('content_en', '')}")
            lines.append("")
            lines.append(f"**PT-BR:** {chunk.get('content_pt_br', '')}")
            lines.append("")

            # Embed image if applicable
            if ctype == "image":
                img_path = IMAGES_DIR / f"IMG-{cid}.png"
                if img_path.exists():
                    lines.append(f"![{cid} image](./images/IMG-{cid}.png)")
                    lines.append("")
                if chunk.get("image_description_en"):
                    lines.append(f"*Image description:* {chunk['image_description_en']}")
                    lines.append("")

            # Metadata collapsible
            # NOTE(review): the opening/closing literals arrived with their
            # markup stripped and split across lines (a syntax error). The
            # <details>/<summary> tags are reconstructed from the
            # "collapsible" intent above - confirm against the original.
            lines.append("<details>")
            lines.append("<summary>metadata</summary>")
            lines.append("")
            lines.append("```json")
            lines.append(json.dumps(_chunk_metadata(chunk), ensure_ascii=False, indent=2))
            lines.append("```")
            lines.append("")
            lines.append("</details>")
            lines.append("")
            lines.append("---")
            lines.append("")

    out_path = OUTPUT_DIR / "document.md"
    text = "\n".join(lines)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"Written: {out_path}")
    return len(text.encode("utf-8"))


def main():
    """Run the full build: load pages, crop images, write all output files."""
    start = time.time()

    print("Loading pages...")
    pages = load_pages()
    print(f"  {len(pages)} pages loaded")

    print("Assigning global IDs...")
    all_chunks = assign_global_ids(pages)
    print(f"  {len(all_chunks)} chunks total")

    # Create dirs
    CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    TABLES_DIR.mkdir(parents=True, exist_ok=True)

    # Crop images
    image_chunks = [c for c in all_chunks if c.get("type") == "image"]
    print(f"Cropping {len(image_chunks)} images...")
    images_saved = 0
    for chunk in image_chunks:
        path = crop_image(chunk)
        if path:
            images_saved += 1

    # Write chunk files
    print("Writing chunk files...")
    for chunk in all_chunks:
        page_num = chunk["page"]
        source_png = f"../../processing/png/{DOC_ID}/p-{page_num:03d}.png"
        write_chunk_file(chunk, source_png)
    print(f"  {len(all_chunks)} chunk files written")

    # Write _index.json
    print("Writing _index.json...")
    write_index(all_chunks)

    # Write document.md
    print("Writing document.md...")
    stats = {}
    doc_bytes = write_document_md(all_chunks, stats)

    # Compute final stats
    ufo_count = sum(1 for c in all_chunks if c.get("ufo_anomaly_detected"))
    cryptid_count = sum(1 for c in all_chunks if c.get("cryptid_anomaly_detected"))

    elapsed = int(time.time() - start)
    print(f"\nDone in {elapsed}s")
    print(f"STATS pages={TOTAL_PAGES} chunks={len(all_chunks)} images={images_saved} tables=0 ufo={ufo_count} cryptid={cryptid_count} doc_md_bytes={doc_bytes}")


if __name__ == "__main__":
    main()