#!/usr/bin/env python3
"""
40_reading_version.py — Generate a clean, webdesigner-grade reading version of a
scanned document from its already-extracted chunks.

The scanned PDFs are messy: duplicate transcriptions (typed + handwritten), OCR
noise, repeated headers/banners, two classification variants of the same
narrative. This pass uses an LLM to produce ONE clean, deduplicated, well-
structured bilingual Markdown document for reading — faithful to the content
(invents nothing) but merging duplicate versions and dropping page furniture.

Output: raw/DOC_ID--subagent/reading.md  (frontmatter + EN + PT-BR sections)
The raw chunks and per-page scan view stay untouched ("ver scan original").

Run:
    python3 scripts/synthesize/40_reading_version.py DOC_ID
"""
from __future__ import annotations

import os
import subprocess
import sys
import tempfile
from pathlib import Path

# Project root and the locations derived from it.
UFO = Path("/Users/guto/ufo")
RAW = UFO / "raw"
# Helper script invoked as `python3 BUILD_DOC DOC_ID`; its stdout is used as the
# document text that gets embedded in the prompt.
BUILD_DOC = UFO / "scripts" / "reextract" / "build_doc_text.py"

# Seconds the `claude` CLI is allowed to run before the call is abandoned.
LLM_TIMEOUT_S = 600

# Prompt template; `{doc_id}` and `{doc_text}` are filled in with str.format(),
# so the template itself must not contain any other curly braces.
PROMPT = """You are a meticulous archivist-typographer for The Disclosure Bureau, an
investigative wiki of declassified UAP/UFO documents.

You receive the raw machine-extracted text of ONE scanned document (chunk by
chunk, with page markers). The scan is messy: it often contains the SAME content
twice (e.g. a typed transcript followed by a handwritten re-transcription, or a
SECRET//NOFORN narrative immediately followed by a near-identical SECRET//REL
version), plus OCR noise, repeated letterheads, classification banners, page
numbers and routing stamps.

Produce ONE clean, faithful, beautifully structured reading version in Markdown.

RULES (non-negotiable):
1. FAITHFUL — never invent facts, names, dates, codes, or quotes. Use only what
   is in the text. If something is redacted/illegible, keep a marker like
   [redacted] / [ilegível].
2. DEDUPLICATE — when the same content appears more than once (typed vs
   handwritten, NOFORN vs REL), MERGE into a single best version. Prefer the
   most complete/legible wording. Never drop unique details that appear in only
   one version (e.g. a line spoken by a different person).
3. DROP PAGE FURNITURE — remove repeated letterheads, classification banners,
   bare page numbers, routing stamps, "DISPATCHED" stamps, distribution lists,
   and OCR garbage. Keep ONE classification line at the top if present.
4. STRUCTURE — use clear Markdown: a top H1 title, short intro line, logical
   headings (## sections), and for transcripts use a clean dialogue format
   (**SPEAKER:** line). Preserve chronological/communication order.
5. BILINGUAL — output BOTH languages. First the full English reading version
   under "## Reading (EN)", then the full Brazilian-Portuguese version under
   "## Leitura (PT-BR)". PT-BR must be natural Brazilian Portuguese with correct
   accents.
6. PRESERVE INVESTIGATIVE SUBSTANCE — every sighting detail, coordinate, time,
   witness name, object description, and quote that matters to an investigation
   must survive the cleanup.

Return ONLY the Markdown body (no code fence, no preamble). Start directly with
the H1 title line.

DOCUMENT (doc_id: {doc_id}) — raw extracted chunks follow:

{doc_text}
"""

# Comma-separated tool names passed to `claude --disallowed-tools`.
DISALLOWED = (
    "AskUserQuestion,Bash,Edit,Write,Read,Task,Glob,Grep,"
    "TaskCreate,TaskUpdate,TaskList,TaskGet,TaskStop,TaskOutput,"
    "Skill,ScheduleWakeup,Monitor,WebSearch,WebFetch,NotebookEdit,"
    "EnterPlanMode,ExitPlanMode,EnterWorktree,ExitWorktree,"
    "CronCreate,CronDelete,CronList,RemoteTrigger,ToolSearch,"
    "PushNotification,ListMcpResourcesTool,ReadMcpResourceTool,"
    "ShareOnboardingGuide"
)


def build_doc_text(doc_id: str) -> str:
    """Return the text of *doc_id* as printed by the BUILD_DOC helper script.

    Runs ``python3 BUILD_DOC doc_id`` and returns its stdout decoded as UTF-8.
    Exits the process with the helper's stderr if it returns a non-zero code.
    """
    result = subprocess.run(
        ["python3", str(BUILD_DOC), doc_id],
        capture_output=True,
        text=True,
        encoding="utf-8",
    )
    if result.returncode != 0:
        sys.exit(f"build_doc_text failed: {result.stderr}")
    return result.stdout


def call_llm(prompt: str) -> str:
    """Send *prompt* to the ``claude`` CLI on stdin and return its text output.

    The CLI runs in print mode with the ``sonnet`` model, every tool listed in
    DISALLOWED turned off, and CLAUDE_CODE_MAX_OUTPUT_TOKENS set to 32000.
    Exits the process if the CLI times out (LLM_TIMEOUT_S) or returns a
    non-zero code; in the latter case the first 500 characters of stderr are
    included in the message.
    """
    env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"}
    cmd = [
        "claude", "-p",
        "--model", "sonnet",
        "--output-format", "text",
        "--disallowed-tools", DISALLOWED,
    ]
    # stdout goes to a file on disk rather than a pipe, as in the original
    # implementation; the scratch directory is removed on every exit path,
    # including sys.exit().
    with tempfile.TemporaryDirectory() as scratch:
        out_file = Path(scratch) / "claude_stdout.txt"
        with out_file.open("wb") as sink:
            try:
                proc = subprocess.run(
                    cmd,
                    input=prompt.encode("utf-8"),
                    stdout=sink,
                    stderr=subprocess.PIPE,
                    env=env,
                    timeout=LLM_TIMEOUT_S,
                )
            except subprocess.TimeoutExpired:
                # Report a timeout the same way as any other CLI failure
                # instead of letting a traceback escape.
                sys.exit(f"claude timed out after {LLM_TIMEOUT_S}s")
        if proc.returncode != 0:
            stderr_head = proc.stderr.decode("utf-8", "replace")[:500]
            sys.exit(f"claude failed rc={proc.returncode}: {stderr_head}")
        return out_file.read_text(encoding="utf-8")


def strip_outer_fence(md: str) -> str:
    """Return *md* stripped of whitespace and of one wrapping code fence.

    Only a fence that wraps the whole reply is removed: the opening fence line
    (with or without a language tag) and, if the last line consists solely of
    backticks, that closing line. Fenced blocks inside the document are left
    untouched. Text that does not start with a fence is returned stripped but
    otherwise unchanged.
    """
    text = md.strip()
    if not text.startswith("```"):
        return text
    lines = text.splitlines()[1:]  # drop the opening fence line
    if lines:
        closing = lines[-1].strip()
        # A closing fence is a line made of backticks only.
        if closing.startswith("```") and not closing.strip("`"):
            lines = lines[:-1]
    return "\n".join(lines).strip()


def main() -> int:
    """Build the reading version for the doc id given in ``sys.argv[1]``.

    Writes ``RAW/DOC_ID--subagent/reading.md`` (YAML frontmatter followed by
    the generated Markdown) and returns 0. Exits with a message when the
    argument is missing, when no document text is available, or when the model
    returns nothing.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: 40_reading_version.py DOC_ID")
    doc_id = sys.argv[1]
    out_path = RAW / f"{doc_id}--subagent" / "reading.md"

    print(f"[1/3] building doc text for {doc_id} ...")
    doc_text = build_doc_text(doc_id)
    if not doc_text.strip():
        # With nothing to transcribe the model could only invent content.
        sys.exit(f"build_doc_text returned no text for {doc_id}")
    # Rough size estimate at 4 characters per token.
    print(f" {len(doc_text)} chars (~{len(doc_text)//4} tokens)")

    print("[2/3] generating reading version (Sonnet) ...")
    md = strip_outer_fence(call_llm(PROMPT.format(doc_id=doc_id, doc_text=doc_text)))
    if not md:
        # Do not overwrite an existing reading.md with an empty document.
        sys.exit("claude returned an empty reading version; nothing written")

    front = (
        f"---\nschema_version: \"0.1.0\"\ntype: reading\ndoc_id: {doc_id}\n"
        f"generator: sonnet-reading-v1\n---\n\n"
    )
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(front + md + "\n", encoding="utf-8")
    print(f"[3/3] saved {out_path} ({len(md)} chars)")
    return 0


if __name__ == "__main__":
    sys.exit(main())