add clean LLM reading version of documents (the core goal)

Scanned docs are messy — duplicate transcriptions (typed + handwritten),
two classification variants of the same narrative, OCR noise, repeated
banners. The doc page showed raw chunks, so everything appeared twice.

40_reading_version.py generates ONE clean, deduplicated, well-structured
bilingual Markdown reading version per doc (Sonnet): merges duplicate versions
without losing unique lines, drops page furniture, formats transcripts as
dialogue. Faithful — invents nothing; redactions kept as markers.

/d/[docId] now defaults to a "📖 leitura" tab rendering this clean version,
with "🔍 trechos · scan original" preserving the faithful per-chunk + per-page
scan view. reading.md lives in raw/<doc>--subagent/ alongside the chunks.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Luiz Gustavo 2026-05-21 17:23:36 -03:00
parent 5b62d0a3fe
commit e75ca5eda2
4 changed files with 217 additions and 4 deletions

View file

@ -0,0 +1,135 @@
#!/usr/bin/env python3
"""
40_reading_version.py — Generate a clean, webdesigner-grade reading version of a
scanned document from its already-extracted chunks.
The scanned PDFs are messy: duplicate transcriptions (typed + handwritten),
OCR noise, repeated headers/banners, two classification variants of the same
narrative. This pass uses an LLM to produce ONE clean, deduplicated, well-
structured bilingual Markdown document for reading — faithful to the content
(invents nothing) but merging duplicate versions and dropping page furniture.
Output: raw/<doc>--subagent/reading.md (frontmatter + EN + PT-BR sections)
The raw chunks and per-page scan view stay untouched ("ver scan original").
Run:
python3 scripts/synthesize/40_reading_version.py <doc-id>
"""
from __future__ import annotations
import os
import subprocess
import sys
from pathlib import Path
# Project root. NOTE(review): hard-coded absolute path on one machine; the
# script will not find its inputs elsewhere unless this is edited.
UFO = Path("/Users/guto/ufo")
# Directory holding the per-document "<doc-id>--subagent" folders (see main()).
RAW = UFO / "raw"
# Helper script run as a subprocess by build_doc_text(); its stdout is the doc text.
BUILD_DOC = UFO / "scripts" / "reextract" / "build_doc_text.py"
# Prompt template for the reading-version pass. main() fills it with
# str.format(doc_id=..., doc_text=...), so the only placeholders are {doc_id}
# and {doc_text}; any literal brace added to this text must be doubled.
PROMPT = """You are a meticulous archivist-typographer for The Disclosure Bureau, an
investigative wiki of declassified UAP/UFO documents. You receive the raw
machine-extracted text of ONE scanned document (chunk by chunk, with page
markers). The scan is messy: it often contains the SAME content twice (e.g. a
typed transcript followed by a handwritten re-transcription, or a SECRET//NOFORN
narrative immediately followed by a near-identical SECRET//REL version), plus
OCR noise, repeated letterheads, classification banners, page numbers and
routing stamps.
Produce ONE clean, faithful, beautifully structured reading version in Markdown.
RULES (non-negotiable):
1. FAITHFUL never invent facts, names, dates, codes, or quotes. Use only what
is in the text. If something is redacted/illegible, keep a marker like
[redacted] / [ilegível].
2. DEDUPLICATE when the same content appears more than once (typed vs
handwritten, NOFORN vs REL), MERGE into a single best version. Prefer the
most complete/legible wording. Never drop unique details that appear in only
one version (e.g. a line spoken by a different person).
3. DROP PAGE FURNITURE remove repeated letterheads, classification banners,
bare page numbers, routing stamps, "DISPATCHED" stamps, distribution lists,
and OCR garbage. Keep ONE classification line at the top if present.
4. STRUCTURE use clear Markdown: a top H1 title, short intro line, logical
headings (## sections), and for transcripts use a clean dialogue format
(**SPEAKER:** line). Preserve chronological/communication order.
5. BILINGUAL output BOTH languages. First the full English reading version
under "## Reading (EN)", then the full Brazilian-Portuguese version under
"## Leitura (PT-BR)". PT-BR must be natural Brazilian Portuguese with correct
accents.
6. PRESERVE INVESTIGATIVE SUBSTANCE every sighting detail, coordinate, time,
witness name, object description, and quote that matters to an investigation
must survive the cleanup.
Return ONLY the Markdown body (no code fence, no preamble). Start directly with
the H1 title line.
DOCUMENT (doc_id: {doc_id}) raw extracted chunks follow:
{doc_text}
"""
# Comma-separated tool names handed to the `claude` CLI through
# --disallowed-tools in call_llm(). The adjacent string literals are
# concatenated into one value, so every line except the last must end with a comma.
DISALLOWED = (
"AskUserQuestion,Bash,Edit,Write,Read,Task,Glob,Grep,"
"TaskCreate,TaskUpdate,TaskList,TaskGet,TaskStop,TaskOutput,"
"Skill,ScheduleWakeup,Monitor,WebSearch,WebFetch,NotebookEdit,"
"EnterPlanMode,ExitPlanMode,EnterWorktree,ExitWorktree,"
"CronCreate,CronDelete,CronList,RemoteTrigger,ToolSearch,"
"PushNotification,ListMcpResourcesTool,ReadMcpResourceTool,"
"ShareOnboardingGuide"
)
def build_doc_text(doc_id: str) -> str:
    """Return the text printed by the BUILD_DOC script for ``doc_id``.

    The script is run in a child ``python3`` process with its output captured
    and decoded as UTF-8. If the child exits non-zero, this script terminates
    via ``sys.exit`` with a message that includes the child's stderr.
    """
    command = ["python3", str(BUILD_DOC), doc_id]
    proc = subprocess.run(command, capture_output=True, text=True, encoding="utf-8")
    if proc.returncode == 0:
        return proc.stdout
    sys.exit(f"build_doc_text failed: {proc.stderr}")
def call_llm(prompt: str) -> str:
    """Send ``prompt`` to the ``claude`` CLI and return its text output.

    The CLI is invoked as ``claude -p --model sonnet --output-format text`` with
    the tools listed in DISALLOWED disabled. The prompt is fed on stdin as
    UTF-8, and stdout is redirected into a temporary file that is read back and
    always removed afterwards.

    Args:
        prompt: Full prompt text to pass on stdin.

    Returns:
        Whatever the CLI wrote to stdout, decoded as UTF-8.

    Raises:
        SystemExit: if the CLI cannot be started, does not finish within the
            timeout, or exits with a non-zero status (the message carries the
            first 500 characters of its stderr).
    """
    import tempfile

    timeout_s = 600
    # Presumably raises the CLI's output cap so long documents are not cut
    # off -- TODO confirm against the CLI's documentation.
    env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"}
    # Reserve a unique path; the handle is closed immediately and the path is
    # reopened in binary mode so the child can write raw bytes to it.
    with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding="utf-8") as t:
        tmp = t.name
    try:
        with open(tmp, "wb") as out:
            try:
                r = subprocess.run(
                    ["claude", "-p", "--model", "sonnet", "--output-format", "text",
                     "--disallowed-tools", DISALLOWED],
                    input=prompt.encode("utf-8"), stdout=out, stderr=subprocess.PIPE, env=env,
                    timeout=timeout_s,
                )
            except subprocess.TimeoutExpired:
                # Report like every other failure in this script instead of a traceback.
                sys.exit(f"claude timed out after {timeout_s}s")
            except FileNotFoundError:
                sys.exit("claude failed: executable 'claude' not found on PATH")
        # The output file is closed at this point, so everything is on disk.
        if r.returncode != 0:
            sys.exit(f"claude failed rc={r.returncode}: {r.stderr.decode('utf-8','replace')[:500]}")
        return Path(tmp).read_text(encoding="utf-8")
    finally:
        # Best-effort cleanup; a missing temp file is not an error.
        try:
            os.unlink(tmp)
        except OSError:
            pass
def _strip_outer_fence(md: str) -> str:
    """Remove a single code fence wrapped around the whole text, if present.

    Only the opening fence line and a trailing bare fence line are dropped, so
    fenced blocks inside the document are left intact.
    """
    if not md.startswith("```"):
        return md
    lines = md.splitlines()[1:]
    if lines:
        last = lines[-1].strip()
        # A closing fence consists of backticks only.
        if last.startswith("```") and last.strip("`") == "":
            lines = lines[:-1]
    return "\n".join(lines).strip()


def main() -> int:
    """Generate ``reading.md`` for the doc id given as the first CLI argument.

    Steps: build the document text, ask the LLM for the reading version, strip
    an accidental wrapping code fence, then write frontmatter + Markdown to
    ``RAW/<doc-id>--subagent/reading.md``.

    Returns:
        0 on success. Any failure terminates the script through ``sys.exit``.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: 40_reading_version.py <doc-id>")
    doc_id = sys.argv[1]
    out_path = RAW / f"{doc_id}--subagent" / "reading.md"
    print(f"[1/3] building doc text for {doc_id} ...")
    doc_text = build_doc_text(doc_id)
    # Rough size estimate only: assumes about four characters per token.
    print(f" {len(doc_text)} chars (~{len(doc_text)//4} tokens)")
    print("[2/3] generating reading version (Sonnet) ...")
    md = call_llm(PROMPT.format(doc_id=doc_id, doc_text=doc_text)).strip()
    # The prompt asks for no code fence, but tolerate a reply wrapped in one.
    md = _strip_outer_fence(md)
    if not md:
        # Do not write a frontmatter-only file that would look like a valid result.
        sys.exit("claude returned an empty reading version; nothing written")
    front = (
        f"---\nschema_version: \"0.1.0\"\ntype: reading\ndoc_id: {doc_id}\n"
        f"generator: sonnet-reading-v1\n---\n\n"
    )
    out_path.write_text(front + md + "\n", encoding="utf-8")
    print(f"[3/3] saved {out_path} ({len(md)} chars)")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -6,13 +6,13 @@
*/
import Link from "next/link";
import { notFound } from "next/navigation";
import { readChunksByPage, readIndex, hasChunks } from "@/lib/chunks";
import { readChunksByPage, readIndex, hasChunks, readReadingVersion } from "@/lib/chunks";
import { readDocument } from "@/lib/wiki";
import { pickPitch } from "@/lib/doc-summary";
import { getLocale } from "@/components/locale-toggle";
import { AuthBar } from "@/components/auth-bar";
import { ChatBubble } from "@/components/chat-bubble";
import { DocRendererV2 } from "@/components/doc-renderer-v2";
import { DocReadingView } from "@/components/doc-reading-view";
import { MarkdownBody } from "@/components/markdown-body";
export const dynamic = "force-dynamic";
@ -45,10 +45,11 @@ export default async function DocPage({
);
}
const [idx, byPage, doc] = await Promise.all([
const [idx, byPage, doc, reading] = await Promise.all([
readIndex(docId),
readChunksByPage(docId),
readDocument(docId),
readReadingVersion(docId),
]);
if (!idx) notFound();
@ -135,7 +136,7 @@ export default async function DocPage({
)}
</header>
<DocRendererV2 docId={docId} chunksByPage={ordered} />
<DocReadingView docId={docId} reading={reading} chunksByPage={ordered} />
<ChatBubble context={{ doc_id: docId }} />
</main>

View file

@ -0,0 +1,62 @@
"use client";
/**
* DocReadingView — toggles between the clean LLM reading version (default) and
* the raw extracted chunks. The per-page "ver scan original" stays available
* inside the chunks renderer. When no reading version exists, only chunks show.
*/
import { useState } from "react";
import { MarkdownBody } from "@/components/markdown-body";
import { DocRendererV2 } from "@/components/doc-renderer-v2";
import type { ParsedChunk } from "@/lib/chunks";
type View = "reading" | "chunks";

/**
 * Tab switcher between the clean reading version and the raw chunk renderer.
 *
 * When `reading` is null or empty, no tabs are rendered and only the chunk
 * renderer is shown. Otherwise the reading tab is selected initially.
 *
 * @param docId - Document id, forwarded to the chunk renderer.
 * @param reading - Markdown of the reading version, or null when none exists.
 * @param chunksByPage - Page-number / chunk-list pairs, forwarded to the chunk renderer.
 */
export function DocReadingView({
  docId,
  reading,
  chunksByPage,
}: {
  docId: string;
  reading: string | null;
  chunksByPage: Array<[number, ParsedChunk[]]>;
}) {
  const [view, setView] = useState<View>(reading ? "reading" : "chunks");
  return (
    <div>
      {reading && (
        <div className="mb-6 flex items-center gap-2 font-mono text-xs">
          {/* type="button" keeps these from acting as submit buttons if ever nested in a form */}
          <button
            type="button"
            aria-pressed={view === "reading"}
            onClick={() => setView("reading")}
            className={`px-3 py-1.5 border rounded ${
              view === "reading"
                ? "border-[#00ff9c] text-[#00ff9c] bg-[rgba(0,255,156,0.08)]"
                : "border-[rgba(0,255,156,0.20)] text-[#8896aa]"
            }`}
          >
            📖 leitura
          </button>
          <button
            type="button"
            aria-pressed={view === "chunks"}
            onClick={() => setView("chunks")}
            className={`px-3 py-1.5 border rounded ${
              view === "chunks"
                ? "border-[#7fdbff] text-[#7fdbff] bg-[rgba(127,219,255,0.08)]"
                : "border-[rgba(127,219,255,0.20)] text-[#8896aa]"
            }`}
          >
            🔍 trechos · scan original
          </button>
        </div>
      )}
      {view === "reading" && reading ? (
        <article className="markdown-body max-w-3xl text-[#c8d4e6] leading-relaxed">
          <MarkdownBody>{reading}</MarkdownBody>
        </article>
      ) : (
        <DocRendererV2 docId={docId} chunksByPage={chunksByPage} />
      )}
    </div>
  );
}

View file

@ -82,6 +82,21 @@ export async function hasChunks(docId: string): Promise<boolean> {
}
}
/**
 * Clean LLM-generated reading version (raw/<doc>--subagent/reading.md), if it
 * exists.
 *
 * A leading frontmatter block delimited by `---` lines is removed. If the file
 * starts with `---` but has no closing delimiter, the whole file is returned.
 *
 * @param docId - Document id; the file is looked up under `archivePath(docId)`.
 * @returns The trimmed Markdown body without frontmatter (an empty string when
 *   the file contains frontmatter only), or null when the file cannot be read.
 */
export async function readReadingVersion(docId: string): Promise<string | null> {
  let raw: string;
  try {
    raw = await fs.readFile(path.join(archivePath(docId), "reading.md"), "utf-8");
  } catch {
    // Missing or unreadable file: the caller treats null as "no reading version".
    return null;
  }
  if (!raw.startsWith("---")) return raw.trim();

  const closing = raw.indexOf("\n---", 3);
  if (closing === -1) return raw.trim();

  // End of the closing delimiter line. -1 means the delimiter is the last line
  // of the file, so there is no body; slicing from 0 here would wrongly return
  // the frontmatter itself.
  const lineEnd = raw.indexOf("\n", closing + 1);
  return lineEnd === -1 ? "" : raw.slice(lineEnd + 1).trim();
}
export async function readIndex(docId: string): Promise<ChunkIndex | null> {
try {
const buf = await fs.readFile(path.join(archivePath(docId), "_index.json"), "utf-8");