diff --git a/scripts/synthesize/40_reading_version.py b/scripts/synthesize/40_reading_version.py new file mode 100644 index 0000000..a7a6ede --- /dev/null +++ b/scripts/synthesize/40_reading_version.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +""" +40_reading_version.py — Generate a clean, webdesigner-grade reading version of a +scanned document from its already-extracted chunks. + +The scanned PDFs are messy: duplicate transcriptions (typed + handwritten), +OCR noise, repeated headers/banners, two classification variants of the same +narrative. This pass uses an LLM to produce ONE clean, deduplicated, well- +structured bilingual Markdown document for reading — faithful to the content +(invents nothing) but merging duplicate versions and dropping page furniture. + +Output: raw/--subagent/reading.md (frontmatter + EN + PT-BR sections) + +The raw chunks and per-page scan view stay untouched ("ver scan original"). + +Run: + python3 scripts/synthesize/40_reading_version.py +""" +from __future__ import annotations +import os +import subprocess +import sys +from pathlib import Path + +UFO = Path("/Users/guto/ufo") +RAW = UFO / "raw" +BUILD_DOC = UFO / "scripts" / "reextract" / "build_doc_text.py" + +PROMPT = """You are a meticulous archivist-typographer for The Disclosure Bureau, an +investigative wiki of declassified UAP/UFO documents. You receive the raw +machine-extracted text of ONE scanned document (chunk by chunk, with page +markers). The scan is messy: it often contains the SAME content twice (e.g. a +typed transcript followed by a handwritten re-transcription, or a SECRET//NOFORN +narrative immediately followed by a near-identical SECRET//REL version), plus +OCR noise, repeated letterheads, classification banners, page numbers and +routing stamps. + +Produce ONE clean, faithful, beautifully structured reading version in Markdown. + +RULES (non-negotiable): +1. FAITHFUL — never invent facts, names, dates, codes, or quotes. Use only what + is in the text. 
If something is redacted/illegible, keep a marker like + [redacted] / [ilegível]. +2. DEDUPLICATE — when the same content appears more than once (typed vs + handwritten, NOFORN vs REL), MERGE into a single best version. Prefer the + most complete/legible wording. Never drop unique details that appear in only + one version (e.g. a line spoken by a different person). +3. DROP PAGE FURNITURE — remove repeated letterheads, classification banners, + bare page numbers, routing stamps, "DISPATCHED" stamps, distribution lists, + and OCR garbage. Keep ONE classification line at the top if present. +4. STRUCTURE — use clear Markdown: a top H1 title, short intro line, logical + headings (## sections), and for transcripts use a clean dialogue format + (**SPEAKER:** line). Preserve chronological/communication order. +5. BILINGUAL — output BOTH languages. First the full English reading version + under "## Reading (EN)", then the full Brazilian-Portuguese version under + "## Leitura (PT-BR)". PT-BR must be natural Brazilian Portuguese with correct + accents. +6. PRESERVE INVESTIGATIVE SUBSTANCE — every sighting detail, coordinate, time, + witness name, object description, and quote that matters to an investigation + must survive the cleanup. + +Return ONLY the Markdown body (no code fence, no preamble). Start directly with +the H1 title line. 
+ +DOCUMENT (doc_id: {doc_id}) — raw extracted chunks follow: + +{doc_text} +""" + +DISALLOWED = ( + "AskUserQuestion,Bash,Edit,Write,Read,Task,Glob,Grep," + "TaskCreate,TaskUpdate,TaskList,TaskGet,TaskStop,TaskOutput," + "Skill,ScheduleWakeup,Monitor,WebSearch,WebFetch,NotebookEdit," + "EnterPlanMode,ExitPlanMode,EnterWorktree,ExitWorktree," + "CronCreate,CronDelete,CronList,RemoteTrigger,ToolSearch," + "PushNotification,ListMcpResourcesTool,ReadMcpResourceTool," + "ShareOnboardingGuide" +) + + +def build_doc_text(doc_id: str) -> str: + r = subprocess.run(["python3", str(BUILD_DOC), doc_id], + capture_output=True, text=True, encoding="utf-8") + if r.returncode != 0: + sys.exit(f"build_doc_text failed: {r.stderr}") + return r.stdout + + +def call_llm(prompt: str) -> str: + import tempfile + env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"} + with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding="utf-8") as t: + tmp = t.name + try: + with open(tmp, "wb") as out: + r = subprocess.run( + ["claude", "-p", "--model", "sonnet", "--output-format", "text", + "--disallowed-tools", DISALLOWED], + input=prompt.encode("utf-8"), stdout=out, stderr=subprocess.PIPE, env=env, + timeout=600, + ) + if r.returncode != 0: + sys.exit(f"claude failed rc={r.returncode}: {r.stderr.decode('utf-8','replace')[:500]}") + return Path(tmp).read_text(encoding="utf-8") + finally: + try: os.unlink(tmp) + except OSError: pass + + +def main() -> int: + if len(sys.argv) < 2: + sys.exit("usage: 40_reading_version.py ") + doc_id = sys.argv[1] + out_path = RAW / f"{doc_id}--subagent" / "reading.md" + + print(f"[1/3] building doc text for {doc_id} ...") + doc_text = build_doc_text(doc_id) + print(f" {len(doc_text)} chars (~{len(doc_text)//4} tokens)") + + print("[2/3] generating reading version (Sonnet) ...") + md = call_llm(PROMPT.format(doc_id=doc_id, doc_text=doc_text)).strip() + if md.startswith("```"): + md = "\n".join(l for l in md.splitlines() if not 
l.startswith("```")).strip() + + front = ( + f"---\nschema_version: \"0.1.0\"\ntype: reading\ndoc_id: {doc_id}\n" + f"generator: sonnet-reading-v1\n---\n\n" + ) + out_path.write_text(front + md + "\n", encoding="utf-8") + print(f"[3/3] saved {out_path} ({len(md)} chars)") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/web/app/d/[docId]/page.tsx b/web/app/d/[docId]/page.tsx index 4059bb4..a827769 100644 --- a/web/app/d/[docId]/page.tsx +++ b/web/app/d/[docId]/page.tsx @@ -6,13 +6,13 @@ */ import Link from "next/link"; import { notFound } from "next/navigation"; -import { readChunksByPage, readIndex, hasChunks } from "@/lib/chunks"; +import { readChunksByPage, readIndex, hasChunks, readReadingVersion } from "@/lib/chunks"; import { readDocument } from "@/lib/wiki"; import { pickPitch } from "@/lib/doc-summary"; import { getLocale } from "@/components/locale-toggle"; import { AuthBar } from "@/components/auth-bar"; import { ChatBubble } from "@/components/chat-bubble"; -import { DocRendererV2 } from "@/components/doc-renderer-v2"; +import { DocReadingView } from "@/components/doc-reading-view"; import { MarkdownBody } from "@/components/markdown-body"; export const dynamic = "force-dynamic"; @@ -45,10 +45,11 @@ export default async function DocPage({ ); } - const [idx, byPage, doc] = await Promise.all([ + const [idx, byPage, doc, reading] = await Promise.all([ readIndex(docId), readChunksByPage(docId), readDocument(docId), + readReadingVersion(docId), ]); if (!idx) notFound(); @@ -135,7 +136,7 @@ export default async function DocPage({ )} - + diff --git a/web/components/doc-reading-view.tsx b/web/components/doc-reading-view.tsx new file mode 100644 index 0000000..0ef4451 --- /dev/null +++ b/web/components/doc-reading-view.tsx @@ -0,0 +1,62 @@ +"use client"; + +/** + * DocReadingView — toggles between the clean LLM reading version (default) and + * the raw extracted chunks. 
The per-page "ver scan original" stays available + * inside the chunks renderer. When no reading version exists, only chunks show. + */ +import { useState } from "react"; +import { MarkdownBody } from "@/components/markdown-body"; +import { DocRendererV2 } from "@/components/doc-renderer-v2"; +import type { ParsedChunk } from "@/lib/chunks"; + +type View = "reading" | "chunks"; + +export function DocReadingView({ + docId, + reading, + chunksByPage, +}: { + docId: string; + reading: string | null; + chunksByPage: Array<[number, ParsedChunk[]]>; +}) { + const [view, setView] = useState(reading ? "reading" : "chunks"); + + return ( +
+ {reading && ( +
+ + +
+ )} + + {view === "reading" && reading ? ( +
+ {reading} +
+ ) : ( + + )} +
/** Clean LLM-generated reading version (raw/<doc_id>--subagent/reading.md), if it
 *  exists. Returns the Markdown body without frontmatter, or null when the
 *  file cannot be read. */
export async function readReadingVersion(docId: string): Promise<string | null> {
  let raw: string;
  try {
    raw = await fs.readFile(path.join(archivePath(docId), "reading.md"), "utf-8");
  } catch {
    // Missing or unreadable file: the page falls back to the chunk view.
    return null;
  }

  // No frontmatter, or an opening fence that is never closed: return it all.
  if (!raw.startsWith("---")) return raw.trim();
  const close = raw.indexOf("\n---", 3);
  if (close === -1) return raw.trim();

  // The body starts after the newline that ends the closing "---" line. When
  // the file stops right at that fence there is no such newline; the body is
  // then empty (previously indexOf's -1 made slice(0) return the frontmatter).
  const bodyStart = raw.indexOf("\n", close + 1);
  return bodyStart === -1 ? "" : raw.slice(bodyStart + 1).trim();
}