#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_serial130_resume.py

Resume rebuild for doc-65-hs1-834228961-62-hq-83894-serial-130.
Pages 1-50 already processed (chunks c0001-c0204 exist).

This script:
  Phase A: Process pages 51-91 via claude CLI → write c0205+
  Phase B: Read ALL chunk files → rebuild _index.json + document.md
"""

import os
import sys
import json
import time
import subprocess
import concurrent.futures
import re
from datetime import datetime, timezone
from pathlib import Path

try:
    from PIL import Image as PILImage
    PILLOW_OK = True
except ImportError:
    # Pillow is optional: without it image chunks are written but not cropped.
    PILLOW_OK = False

# ── Config ──────────────────────────────────────────────────────────────────
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-serial-130"
DOC_TITLE = "HQ Air Defense Command – Unidentified Flying Objects Reports (65-HS1-834228961 / 62-HQ-83894 Serial 130)"

PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
OUT_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
CHUNKS_DIR = OUT_DIR / "chunks"
IMAGES_DIR = OUT_DIR / "images"
TABLES_DIR = OUT_DIR / "tables"

CLAUDE_BIN = "/Users/guto/.local/bin/claude"
CLAUDE_MODEL = "claude-haiku-4-5"  # passed to the CLI and recorded in the build metadata

TOTAL_PAGES = 91
START_PAGE = 51          # first missing page
FIRST_CHUNK_NUM = 205    # c0205 onwards for new chunks
BATCH_SIZE = 4           # pages processed concurrently per batch
CLAUDE_TIMEOUT = 150     # seconds; forwarded to subprocess.run(timeout=...)
MAX_ATTEMPTS = 3         # CLI attempts per page before the placeholder is used
OCR_CHAR_LIMIT = 2000    # OCR text is truncated to this many characters in the prompt

# Fallback values used whenever a chunk's bbox is missing or malformed.
_BBOX_DEFAULTS = {"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}

# Characters that force a frontmatter value to be emitted as a quoted scalar.
_YAML_UNSAFE_CHARS = frozenset(':{}[],\n\r\t#&*?|<>=!%@`"\\')
# Plain strings that the frontmatter reader would otherwise turn into None/bool.
_YAML_RESERVED_WORDS = frozenset({"", "null", "true", "false", "~"})

_NUMBER_RE = re.compile(r"-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?")
_FRONTMATTER_LINE_RE = re.compile(r"^(\w+):\s*(.*)")
_BBOX_RE = re.compile(
    r"bbox:\s*\{x:\s*([\d.]+),\s*y:\s*([\d.]+),\s*w:\s*([\d.]+),\s*h:\s*([\d.]+)\}"
)


# ── Helpers ──────────────────────────────────────────────────────────────────
def load_ocr(page_num: int) -> str:
    """Return the OCR text for a 1-based page, truncated for prompt use.

    OCR files are named with a 0-based index (``p-000.txt`` is page 1).
    Returns an empty string when the file does not exist or is blank.
    """
    ocr_path = OCR_DIR / f"p-{page_num - 1:03d}.txt"
    try:
        text = ocr_path.read_text(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        return ""
    return text.strip()[:OCR_CHAR_LIMIT]


# NOTE(review): the line breaks inside this template were reconstructed from a
# whitespace-collapsed copy of the file; the wording is unchanged.
PAGE_REBUILDER_PROMPT = """You are a page-rebuilder agent analyzing a page from a declassified US government document about Unidentified Flying Objects (UFO/UAP) investigations.

Document: {doc_title}
Page: {page_num} of {total_pages}
PNG file: /Users/guto/ufo/processing/png/{doc_id}/p-{png_num:03d}.png

OCR text (may be incomplete):
{ocr_text}

Use the Read tool to read the image at: /Users/guto/ufo/processing/png/{doc_id}/p-{png_num:03d}.png

Analyze ALL visible content and return ONLY a JSON object (no markdown fences, no extra text):
{{
  "page_number": {page_num},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "letterhead",
      "content_en": "exact transcription or description in English",
      "content_pt_br": "transcrição ou descrição em português brasileiro",
      "bbox": {{"x": 0.00, "y": 0.00, "w": 1.00, "h": 0.10}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "ocr_source_lines": [],
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null
    }}
  ]
}}

RULES:
- type must be ONE of: cover | letterhead | stamp | header | subheader | paragraph | redaction | signature | image | table_marker | footer | page_number | classification_marking | separator | handwriting | form_field | caption | list_item | annotation | blank | classification_banner | signature_block | redaction_block
- bbox: x,y = top-left corner fraction (0.0-1.0), w,h = width/height fractions (0.0-1.0)
- Split page into logical chunks (letterhead separate from body, stamps separate, etc.)
- For redacted blocks: type=redaction, redaction_code e.g. "(b)(1)", "(b)(3)", "(b)(6)"
- For photos/sketches/diagrams: type=image, set image_type to photo|diagram|sketch|map|chart|signature_block|stamp|seal|other
- cross_page_hint: self_contained | continues_to_next | continues_from_prev
- content_en: verbatim transcription when legible; describe otherwise e.g. "[Stamp: RECEIVED OCT 6 1947]"
- content_pt_br: Brazilian Portuguese translation/description
- ufo_anomaly_detected: true ONLY if page has image/sketch of anomalous aerial object
- Blank pages: one chunk with type=blank
- Return ONLY valid JSON, nothing else"""


def run_claude(prompt: str, timeout: int = CLAUDE_TIMEOUT) -> str:
    """Run the claude CLI once with ``prompt`` and return its stripped stdout.

    Returns ``""`` when the call times out and ``"ERROR: ..."`` when the
    process could not be run at all; callers treat both as a failed attempt.
    The exit status is not inspected.
    """
    try:
        result = subprocess.run(
            [CLAUDE_BIN, "-p", "--dangerously-skip-permissions",
             "--model", CLAUDE_MODEL,
             "--no-session-persistence",
             prompt],
            capture_output=True,
            text=True,
            timeout=timeout,
            env={**os.environ},
        )
        return result.stdout.strip()
    except subprocess.TimeoutExpired:
        return ""
    except Exception as e:
        # Deliberate best-effort boundary: the failure is reported in-band so
        # the retry loop in rebuild_page() can decide what to do.
        return f"ERROR: {e}"


def parse_json(raw: str):
    """Extract the first JSON object embedded in ``raw``.

    Decoding starts at the first ``{`` and ignores anything after the object,
    so markdown fences or trailing chatter around the payload are tolerated.
    A real JSON decoder is used (rather than counting braces) so that braces
    inside string values do not end the object early.

    Returns the decoded dict, or ``None`` when there is no ``{`` or the text
    starting there is not valid JSON.
    """
    start = raw.find("{")
    if start == -1:
        return None
    try:
        obj, _end = json.JSONDecoder().raw_decode(raw, start)
    except json.JSONDecodeError:
        return None
    return obj


def _failed_page_result(page_num: int) -> dict:
    """Return the single-chunk placeholder recorded for a page that failed."""
    return {
        "page_number": page_num,
        "png_num": page_num - 1,
        "chunks": [{
            "order_in_page": 1,
            "type": "blank",
            "page": page_num,
            "content_en": "[Page processing failed]",
            "content_pt_br": "[Falha no processamento da página]",
            "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "classification": None,
            "formatting": [],
            "cross_page_hint": "self_contained",
            "ocr_confidence": 0.0,
            "ocr_source_lines": [],
            "redaction_code": None,
            "redaction_inferred_content_type": None,
            "image_type": None,
            "ufo_anomaly_detected": False,
            "ufo_anomaly_type": None,
            "ufo_anomaly_rationale": None,
            "cryptid_anomaly_detected": False,
            "cryptid_anomaly_type": None,
            "cryptid_anomaly_rationale": None,
        }],
    }


def rebuild_page(page_num: int) -> dict:
    """Ask the claude CLI to chunk one page, retrying on failure.

    Args:
        page_num: 1-based page number.

    Returns:
        A dict with ``page_number``, ``png_num`` and ``chunks``. Every chunk
        has ``order_in_page`` renumbered from 1 and ``page`` set. If all
        attempts fail, a one-chunk "processing failed" placeholder is
        returned instead of raising, so one bad page cannot abort the run.
    """
    png_num = page_num - 1  # PNG/OCR files are 0-indexed
    prompt = PAGE_REBUILDER_PROMPT.format(
        doc_title=DOC_TITLE,
        page_num=page_num,
        total_pages=TOTAL_PAGES,
        doc_id=DOC_ID,
        png_num=png_num,
        ocr_text=load_ocr(page_num) or "(no OCR available)",
    )

    for attempt in range(MAX_ATTEMPTS):
        is_last = attempt == MAX_ATTEMPTS - 1
        raw = run_claude(prompt, timeout=CLAUDE_TIMEOUT)

        if not raw or raw.startswith("ERROR:"):
            if is_last:
                print(f"  [FAIL] page {page_num}: no usable CLI output ({raw[:200]!r})", flush=True)
                break
            time.sleep(5 * (attempt + 1))  # linear back-off between CLI failures
            continue

        data = parse_json(raw)
        chunks = data.get("chunks") if isinstance(data, dict) else None
        if isinstance(chunks, list):
            # The model output is untrusted: drop anything that is not an object.
            chunks = [c for c in chunks if isinstance(c, dict)]
        if chunks:
            for position, ch in enumerate(chunks, start=1):
                ch["order_in_page"] = position
                ch["page"] = page_num
            data["chunks"] = chunks
            data["page_number"] = page_num
            data["png_num"] = png_num
            print(f"  [OK] page {page_num:03d} → {len(chunks)} chunks", flush=True)
            return data

        if is_last:
            print(f"  [FAIL] page {page_num}: {raw[:200]}", flush=True)
        else:
            print(f"  [RETRY {attempt+1}] page {page_num}: bad JSON", flush=True)
            time.sleep(3)

    return _failed_page_result(page_num)


def yv(v):
    """Format ``v`` as a single-line frontmatter value.

    ``None`` becomes ``null`` and booleans become ``true``/``false``. Other
    values are stringified; strings that contain structural characters,
    quotes, backslashes or line breaks, that have surrounding whitespace, or
    that would read back as null/bool are emitted as an escaped double-quoted
    scalar so that parse_frontmatter() can recover them exactly.
    """
    if v is None:
        return "null"
    if isinstance(v, bool):
        return "true" if v else "false"
    s = str(v)
    needs_quotes = any(c in _YAML_UNSAFE_CHARS for c in s)
    if isinstance(v, str) and not needs_quotes:
        needs_quotes = (
            s.lower() in _YAML_RESERVED_WORDS
            or s != s.strip()
            or s[0] in "-'"
        )
    if needs_quotes:
        # json.dumps escapes quotes, backslashes and newlines, keeping the
        # value on one line.
        return json.dumps(s, ensure_ascii=False)
    return s


def _bbox_of(chunk: dict) -> dict:
    """Return the chunk's bbox as four floats, defaulting bad or missing fields."""
    raw = chunk.get("bbox")
    if not isinstance(raw, dict):
        raw = {}
    box = {}
    for key, default in _BBOX_DEFAULTS.items():
        try:
            box[key] = float(raw.get(key, default))
        except (TypeError, ValueError):
            box[key] = default
    return box


def _as_list(value) -> list:
    """Coerce a possibly-missing or scalar field into a list."""
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def write_chunk_file(chunk: dict):
    """Write one chunk to ``CHUNKS_DIR/<chunk_id>.md``.

    The file is a frontmatter block (one ``key: value`` per line, which is
    what parse_frontmatter() relies on) followed by the English and
    Brazilian-Portuguese content. ``chunk`` must contain ``chunk_id``; every
    other field has a default.
    """
    chunk_id = chunk["chunk_id"]
    bbox = _bbox_of(chunk)
    page = chunk.get("page", 1)
    png_num = chunk.get("png_num", page - 1)
    ctype = chunk.get("type", "paragraph")

    fmt_str = json.dumps([str(f) for f in _as_list(chunk.get("formatting"))], ensure_ascii=False)
    ocr_lines_str = json.dumps(_as_list(chunk.get("ocr_source_lines")), ensure_ascii=False)

    related_image = f"IMG-{chunk_id}.png" if ctype == "image" else "null"

    content = f"""---
chunk_id: {chunk_id}
type: {ctype}
page: {page}
order_in_page: {chunk.get("order_in_page", 1)}
order_global: {chunk.get("order_global", 1)}
bbox: {{x: {bbox["x"]:.3f}, y: {bbox["y"]:.3f}, w: {bbox["w"]:.3f}, h: {bbox["h"]:.3f}}}
classification: {yv(chunk.get("classification"))}
formatting: {fmt_str}
cross_page_hint: {yv(chunk.get("cross_page_hint") or "self_contained")}
prev_chunk: {chunk.get("prev_chunk") or "null"}
next_chunk: {chunk.get("next_chunk") or "null"}
related_image: {related_image}
related_table: null
ocr_confidence: {yv(chunk.get("ocr_confidence", 0.85))}
ocr_source_lines: {ocr_lines_str}
redaction_code: {yv(chunk.get("redaction_code"))}
redaction_inferred_content_type: {yv(chunk.get("redaction_inferred_content_type"))}
image_type: {yv(chunk.get("image_type"))}
ufo_anomaly_detected: {yv(bool(chunk.get("ufo_anomaly_detected")))}
cryptid_anomaly_detected: {yv(bool(chunk.get("cryptid_anomaly_detected")))}
ufo_anomaly_type: {yv(chunk.get("ufo_anomaly_type"))}
ufo_anomaly_rationale: {yv(chunk.get("ufo_anomaly_rationale"))}
cryptid_anomaly_type: {yv(chunk.get("cryptid_anomaly_type"))}
cryptid_anomaly_rationale: {yv(chunk.get("cryptid_anomaly_rationale"))}
image_description_en: {yv(chunk.get("image_description_en"))}
image_description_pt_br: {yv(chunk.get("image_description_pt_br"))}
extracted_text: {yv(chunk.get("extracted_text"))}
source_png: ../../processing/png/{DOC_ID}/p-{png_num:03d}.png
---

**EN:** {chunk.get("content_en") or ""}

**PT-BR:** {chunk.get("content_pt_br") or ""}
"""
    (CHUNKS_DIR / f"{chunk_id}.md").write_text(content, encoding="utf-8")


def crop_image(chunk: dict):
    """Crop the chunk's bbox out of its page PNG into ``IMAGES_DIR``.

    Best-effort: does nothing when Pillow is unavailable or the page PNG is
    missing, and logs (rather than raises) any failure while cropping.
    """
    chunk_id = chunk["chunk_id"]
    png_num = chunk.get("png_num", chunk.get("page", 1) - 1)
    src = PNG_DIR / f"p-{png_num:03d}.png"
    dst = IMAGES_DIR / f"IMG-{chunk_id}.png"

    if not PILLOW_OK or not src.exists():
        print(f"  [CROP SKIP] {chunk_id}: Pillow or source PNG unavailable", flush=True)
        return

    bbox = _bbox_of(chunk)
    try:
        # The context manager closes the source file handle after cropping.
        with PILImage.open(src) as im:
            W, H = im.size
            x = max(0.0, min(1.0, bbox["x"]))
            y = max(0.0, min(1.0, bbox["y"]))
            w = max(0.01, min(1.0 - x, bbox["w"]))
            h = max(0.01, min(1.0 - y, bbox["h"]))
            pad = 0.005  # small margin (fraction of page) around the box
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            im.crop((left, top, right, bottom)).save(str(dst))
        print(f"  [CROP] {chunk_id}", flush=True)
    except Exception as e:
        # Cropping must never abort the rebuild; report and move on.
        print(f"  [CROP FAIL] {chunk_id}: {e}", flush=True)


def _parse_scalar(val: str):
    """Convert one frontmatter value string into a Python value.

    Handles null/true/false, integers, floats, double-quoted strings (escaped
    as written by yv(), with a plain quote-strip fallback for older files)
    and inline ``[...]`` lists. Anything else is returned unchanged.
    """
    if val == "null":
        return None
    if val == "true":
        return True
    if val == "false":
        return False
    try:
        return int(val)
    except ValueError:
        pass
    if _NUMBER_RE.fullmatch(val):
        return float(val)
    if len(val) >= 2 and val.startswith('"') and val.endswith('"'):
        try:
            decoded = json.loads(val)
        except json.JSONDecodeError:
            return val[1:-1]  # legacy value written without escaping
        return decoded if isinstance(decoded, str) else val[1:-1]
    if val.startswith("["):
        try:
            decoded = json.loads(val)
        except json.JSONDecodeError:
            return val
        return decoded if isinstance(decoded, list) else val
    return val


def parse_frontmatter(path: Path) -> dict:
    """Read YAML frontmatter from a chunk .md file.

    Returns a dict of the frontmatter fields plus ``content_en`` and
    ``content_pt_br`` taken from the body. Returns ``{}`` when the file does
    not start with a ``---`` block. ``bbox`` is only present when it could be
    parsed into four floats.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    if not text.startswith("---"):
        return {}
    end = text.find("\n---\n", 3)
    if end == -1:
        return {}

    data = {}
    for line in text[3:end].split("\n"):
        m = _FRONTMATTER_LINE_RE.match(line)
        if m:
            data[m.group(1)] = _parse_scalar(m.group(2).strip())

    # bbox is an inline mapping, so it is parsed separately from the scalars.
    bbox_m = _BBOX_RE.search(text)
    if bbox_m:
        data["bbox"] = {
            "x": float(bbox_m.group(1)),
            "y": float(bbox_m.group(2)),
            "w": float(bbox_m.group(3)),
            "h": float(bbox_m.group(4)),
        }
    elif not isinstance(data.get("bbox"), dict):
        # Never hand callers a raw string where they expect a mapping.
        data.pop("bbox", None)

    # Extract body content (skip the 5 characters of the closing "\n---\n").
    body = text[end + 5:].strip()
    en_m = re.search(r'\*\*EN:\*\*\s*(.*?)(?=\n\n\*\*PT-BR:|$)', body, re.DOTALL)
    ptbr_m = re.search(r'\*\*PT-BR:\*\*\s*(.*?)$', body, re.DOTALL)
    data["content_en"] = en_m.group(1).strip() if en_m else ""
    data["content_pt_br"] = ptbr_m.group(1).strip() if ptbr_m else ""

    return data


def build_assembly(all_chunks: list, build_at: str):
    """Write _index.json and document.md from all_chunks list.

    Args:
        all_chunks: chunk dicts in final document order; each needs
            ``chunk_id``.
        build_at: timestamp string recorded in both outputs.

    Returns:
        ``(images_extracted, ufo_flagged, cryptid_flagged)``: the number of
        image-type chunks and the chunk ids flagged for each anomaly kind.
    """
    type_histogram = {}
    for chunk in all_chunks:
        t = chunk.get("type", "paragraph")
        type_histogram[t] = type_histogram.get(t, 0) + 1

    ufo_flagged = [c["chunk_id"] for c in all_chunks if c.get("ufo_anomaly_detected")]
    cryptid_flagged = [c["chunk_id"] for c in all_chunks if c.get("cryptid_anomaly_detected")]
    images_extracted = sum(1 for c in all_chunks if c.get("type") == "image")

    # _index.json
    index_chunks = []
    for chunk in all_chunks:
        content_en = chunk.get("content_en") or ""
        preview = content_en[:80] + ("..." if len(content_en) > 80 else "")
        index_chunks.append({
            "chunk_id": chunk["chunk_id"],
            "type": chunk.get("type", "paragraph"),
            "page": chunk.get("page", 1),
            "order_in_page": chunk.get("order_in_page", 1),
            "order_global": chunk.get("order_global", 1),
            "file": f"chunks/{chunk['chunk_id']}.md",
            "bbox": _bbox_of(chunk),
            "preview": preview,
        })

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": TOTAL_PAGES,
        "total_chunks": len(all_chunks),
        "build_approach": "subagents",
        "build_model": CLAUDE_MODEL,
        "build_at": build_at,
        "chunks": index_chunks,
    }
    (OUT_DIR / "_index.json").write_text(
        json.dumps(index, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    print(f"  Wrote _index.json ({len(all_chunks)} chunks)", flush=True)

    # document.md
    histogram_yaml = "\n".join(f"  {k}: {v}" for k, v in sorted(type_histogram.items()))

    def list_yaml(items):
        # Rendered directly after "key:" so both shapes are valid:
        # "key: []" when empty, or a block list starting on the next line.
        if not items:
            return " []"
        return "\n" + "\n".join(f"  - {i}" for i in items)

    doc_parts = [f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: {json.dumps(DOC_TITLE, ensure_ascii=False)}
total_pages: {TOTAL_PAGES}
total_chunks: {len(all_chunks)}
chunk_types_histogram:
{histogram_yaml}
multi_page_tables: []
ufo_anomalies_flagged:{list_yaml(ufo_flagged)}
cryptid_anomalies_flagged:{list_yaml(cryptid_flagged)}
build_approach: "subagents"
build_model: "{CLAUDE_MODEL}"
build_at: "{build_at}"
---

"""]

    chunks_by_page: dict = {}
    for chunk in all_chunks:
        chunks_by_page.setdefault(chunk.get("page", 1), []).append(chunk)

    meta_keys = [
        "chunk_id", "type", "page", "order_in_page", "order_global",
        "bbox", "classification", "formatting", "cross_page_hint",
        "prev_chunk", "next_chunk", "ocr_confidence", "redaction_code",
        "image_type", "ufo_anomaly_detected", "cryptid_anomaly_detected",
        "ufo_anomaly_type", "ufo_anomaly_rationale",
    ]

    for page_seq in sorted(chunks_by_page.keys()):
        png_num = page_seq - 1
        doc_parts.append(f"\n## Page {page_seq} (source: p-{png_num:03d}.png)\n")

        for chunk in sorted(chunks_by_page[page_seq], key=lambda c: c.get("order_in_page", 1)):
            chunk_id = chunk["chunk_id"]
            ctype = chunk.get("type", "paragraph")
            bbox = _bbox_of(chunk)
            bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

            # NOTE(review): these two appends emit only a newline each. They
            # look like per-chunk markup whose tags were lost; confirm against
            # the document.md produced for pages 1-50.
            doc_parts.append("\n")
            doc_parts.append("\n")
            doc_parts.append(f"### Chunk {chunk_id} — {ctype} · p{page_seq} · bbox: {bbox_str}\n\n")
            doc_parts.append(f"**EN:** {chunk.get('content_en') or ''}\n\n")
            doc_parts.append(f"**PT-BR:** {chunk.get('content_pt_br') or ''}\n\n")

            if ctype == "image":
                doc_parts.append(f"![{chunk_id} image](./images/IMG-{chunk_id}.png)\n\n")
                if chunk.get("image_description_en"):
                    doc_parts.append(f"**Image Description (EN):** {chunk['image_description_en']}\n\n")
                if chunk.get("image_description_pt_br"):
                    doc_parts.append(f"**Descrição da Imagem (PT-BR):** {chunk['image_description_pt_br']}\n\n")

            meta = {k: chunk.get(k) for k in meta_keys}
            meta_json = json.dumps(meta, indent=2, ensure_ascii=False)
            # NOTE(review): the text around the JSON fence presumably sat
            # inside a collapsible HTML wrapper that is no longer visible;
            # the visible text and spacing are kept as-is. Confirm.
            doc_parts.append(
                f"\nmetadata\n\n```json\n{meta_json}\n```\n\n\n\n\n---\n\n"
            )

    doc_md = "".join(doc_parts)
    (OUT_DIR / "document.md").write_text(doc_md, encoding="utf-8")
    print(f"  Wrote document.md ({len(doc_md):,} chars)", flush=True)

    return images_extracted, ufo_flagged, cryptid_flagged


def _int_or(value, default: int) -> int:
    """Return ``value`` when it is a real int, else ``default`` (for sort keys)."""
    if isinstance(value, int) and not isinstance(value, bool):
        return value
    return default


def main():
    """Run Phase A (rebuild the missing pages) then Phase B (reassemble all)."""
    t_start = time.time()

    for directory in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    print(f"=== Phase A: Process pages {START_PAGE}-{TOTAL_PAGES} via claude CLI ===", flush=True)

    pages_to_process = list(range(START_PAGE, TOTAL_PAGES + 1))
    new_page_results: dict = {}

    for batch_start in range(0, len(pages_to_process), BATCH_SIZE):
        batch = pages_to_process[batch_start:batch_start + BATCH_SIZE]
        print(f"  Batch pages {batch[0]}-{batch[-1]}...", flush=True)
        with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as ex:
            futures = {ex.submit(rebuild_page, p): p for p in batch}
            for fut in concurrent.futures.as_completed(futures):
                page = futures[fut]
                try:
                    result = fut.result()
                except Exception as exc:
                    # One page must not abort the whole run: record the
                    # placeholder and keep going.
                    print(f"  [FAIL] page {page}: {exc}", flush=True)
                    result = _failed_page_result(page)
                new_page_results[page] = result

    # Assign global chunk IDs (continuing from the last existing chunk)
    print(f"\n=== Phase A2: Numbering new chunks from c{FIRST_CHUNK_NUM:04d} ===", flush=True)
    new_chunks = []
    order_global = FIRST_CHUNK_NUM - 1

    for page_num in sorted(new_page_results.keys()):
        result = new_page_results[page_num]
        png_num = result.get("png_num", page_num - 1)
        for ch in sorted(result.get("chunks", []), key=lambda c: c.get("order_in_page", 0)):
            order_global += 1
            ch["chunk_id"] = f"c{order_global:04d}"
            ch["order_global"] = order_global
            ch["png_num"] = png_num
            new_chunks.append(ch)

    # prev/next links as written to the new chunk files. The first new chunk
    # points back at the last pre-existing chunk when that file is present,
    # so the on-disk chain is not broken at the resume point. (Phase B
    # re-links everything in memory for document.md.)
    preceding_id = f"c{FIRST_CHUNK_NUM - 1:04d}"
    has_preceding = FIRST_CHUNK_NUM > 1 and (CHUNKS_DIR / f"{preceding_id}.md").exists()
    last_index = len(new_chunks) - 1
    for i, ch in enumerate(new_chunks):
        if i > 0:
            ch["prev_chunk"] = new_chunks[i - 1]["chunk_id"]
        else:
            ch["prev_chunk"] = preceding_id if has_preceding else None
        ch["next_chunk"] = new_chunks[i + 1]["chunk_id"] if i < last_index else None

    print(f"  {len(new_chunks)} new chunks generated", flush=True)

    # Crop images
    image_chunks = [c for c in new_chunks if c.get("type") == "image"]
    if image_chunks:
        print(f"\n=== Phase A3: Cropping {len(image_chunks)} images ===", flush=True)
        for ch in image_chunks:
            crop_image(ch)

    # Write new chunk files
    print(f"\n=== Phase A4: Writing {len(new_chunks)} new chunk files ===", flush=True)
    for ch in new_chunks:
        write_chunk_file(ch)

    # ── Phase B: Read ALL chunks and rebuild assembly ──────────────────────
    print("\n=== Phase B: Reading all chunk files for full assembly ===", flush=True)

    all_chunk_files = sorted(CHUNKS_DIR.glob("c*.md"))
    print(f"  Found {len(all_chunk_files)} total chunk files", flush=True)

    all_chunks = []
    for path in all_chunk_files:
        fm = parse_frontmatter(path)
        if not fm.get("chunk_id"):
            fm["chunk_id"] = path.stem
        all_chunks.append(fm)

    # Sort by order_global; unparseable keys sort last instead of raising.
    all_chunks.sort(key=lambda c: (
        _int_or(c.get("order_global"), 999999),
        _int_or(c.get("page"), 0),
        _int_or(c.get("order_in_page"), 0),
    ))

    # Re-link prev/next globally
    last_index = len(all_chunks) - 1
    for i, ch in enumerate(all_chunks):
        ch["prev_chunk"] = all_chunks[i - 1]["chunk_id"] if i > 0 else None
        ch["next_chunk"] = all_chunks[i + 1]["chunk_id"] if i < last_index else None

    print(f"  Total chunks: {len(all_chunks)}", flush=True)

    print("\n=== Phase B2: Building _index.json and document.md ===", flush=True)
    build_at = datetime.now(timezone.utc).isoformat()
    images_extracted, ufo_flagged, cryptid_flagged = build_assembly(all_chunks, build_at)

    wall_seconds = int(time.time() - t_start)
    pages_done = TOTAL_PAGES
    chunks_total = len(all_chunks)
    tables_stitched = 0

    final = (
        f"pages_done={pages_done}, chunks_total={chunks_total}, "
        f"images_extracted={images_extracted}, tables_stitched={tables_stitched}, "
        f"ufo_anomalies={len(ufo_flagged)}, cryptid_anomalies={len(cryptid_flagged)}, "
        f"wall_seconds={wall_seconds}"
    )
    print(f"\n=== DONE ===\n{final}", flush=True)


if __name__ == "__main__":
    main()