add clean LLM reading version of documents (the core goal)

Scanned docs are messy — duplicate transcriptions (typed + handwritten), two classification variants of the same narrative, OCR noise, repeated banners. The doc page showed raw chunks, so everything appeared twice. 40_reading_version.py generates ONE clean, deduplicated, well-structured bilingual Markdown reading version per doc (Sonnet): merges duplicate versions without losing unique lines, drops page furniture, formats transcripts as dialogue. Faithful — invents nothing; redactions kept as markers. /d/[docId] now defaults to a "📖 leitura" tab rendering this clean version, with "🔍 trechos · scan original" preserving the faithful per-chunk + per-page scan view. reading.md lives in raw/<doc>--subagent/ alongside the chunks. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 17:23:36 -03:00 · 2026-05-21 17:23:36 -03:00 · e75ca5eda2
commit e75ca5eda2
parent 5b62d0a3fe
4 changed files with 217 additions and 4 deletions
--- a/scripts/synthesize/40_reading_version.py
+++ b/scripts/synthesize/40_reading_version.py
@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""
+40_reading_version.py — Generate a clean, webdesigner-grade reading version of a
+scanned document from its already-extracted chunks.
+
+The scanned PDFs are messy: duplicate transcriptions (typed + handwritten),
+OCR noise, repeated headers/banners, two classification variants of the same
+narrative. This pass uses an LLM to produce ONE clean, deduplicated, well-
+structured bilingual Markdown document for reading — faithful to the content
+(invents nothing) but merging duplicate versions and dropping page furniture.
+
+Output: raw/<doc>--subagent/reading.md  (frontmatter + EN + PT-BR sections)
+
+The raw chunks and per-page scan view stay untouched ("ver scan original").
+
+Run:
+  python3 scripts/synthesize/40_reading_version.py <doc-id>
+"""
+from __future__ import annotations
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+# Absolute root that every other path in this script is derived from.
+# NOTE(review): hard-coded to one machine's home directory — confirm before
+# running this script anywhere else.
+UFO = Path("/Users/guto/ufo")
+# Parent of the per-document "<doc>--subagent" folders (main() writes
+# reading.md into one of them).
+RAW = UFO / "raw"
+# Helper script that build_doc_text() runs to obtain the document text.
+BUILD_DOC = UFO / "scripts" / "reextract" / "build_doc_text.py"
+
+# Prompt template for the reading-version pass. main() fills it in with
+# str.format(), so {doc_id} and {doc_text} are the only placeholders; any other
+# literal brace added to this text must be doubled ("{{" / "}}").
+PROMPT = """You are a meticulous archivist-typographer for The Disclosure Bureau, an
+investigative wiki of declassified UAP/UFO documents. You receive the raw
+machine-extracted text of ONE scanned document (chunk by chunk, with page
+markers). The scan is messy: it often contains the SAME content twice (e.g. a
+typed transcript followed by a handwritten re-transcription, or a SECRET//NOFORN
+narrative immediately followed by a near-identical SECRET//REL version), plus
+OCR noise, repeated letterheads, classification banners, page numbers and
+routing stamps.
+
+Produce ONE clean, faithful, beautifully structured reading version in Markdown.
+
+RULES (non-negotiable):
+1. FAITHFUL — never invent facts, names, dates, codes, or quotes. Use only what
+   is in the text. If something is redacted/illegible, keep a marker like
+   [redacted] / [ilegível].
+2. DEDUPLICATE — when the same content appears more than once (typed vs
+   handwritten, NOFORN vs REL), MERGE into a single best version. Prefer the
+   most complete/legible wording. Never drop unique details that appear in only
+   one version (e.g. a line spoken by a different person).
+3. DROP PAGE FURNITURE — remove repeated letterheads, classification banners,
+   bare page numbers, routing stamps, "DISPATCHED" stamps, distribution lists,
+   and OCR garbage. Keep ONE classification line at the top if present.
+4. STRUCTURE — use clear Markdown: a top H1 title, short intro line, logical
+   headings (## sections), and for transcripts use a clean dialogue format
+   (**SPEAKER:** line). Preserve chronological/communication order.
+5. BILINGUAL — output BOTH languages. First the full English reading version
+   under "## Reading (EN)", then the full Brazilian-Portuguese version under
+   "## Leitura (PT-BR)". PT-BR must be natural Brazilian Portuguese with correct
+   accents.
+6. PRESERVE INVESTIGATIVE SUBSTANCE — every sighting detail, coordinate, time,
+   witness name, object description, and quote that matters to an investigation
+   must survive the cleanup.
+
+Return ONLY the Markdown body (no code fence, no preamble). Start directly with
+the H1 title line.
+
+DOCUMENT (doc_id: {doc_id}) — raw extracted chunks follow:
+
+{doc_text}
+"""
+
+# Comma-separated tool names handed to `claude --disallowed-tools` by
+# call_llm(). Presumably this keeps the run a pure text-in/text-out call with
+# no tool use — confirm against the CLI's current tool list when upgrading.
+DISALLOWED = (
+    "AskUserQuestion,Bash,Edit,Write,Read,Task,Glob,Grep,"
+    "TaskCreate,TaskUpdate,TaskList,TaskGet,TaskStop,TaskOutput,"
+    "Skill,ScheduleWakeup,Monitor,WebSearch,WebFetch,NotebookEdit,"
+    "EnterPlanMode,ExitPlanMode,EnterWorktree,ExitWorktree,"
+    "CronCreate,CronDelete,CronList,RemoteTrigger,ToolSearch,"
+    "PushNotification,ListMcpResourcesTool,ReadMcpResourceTool,"
+    "ShareOnboardingGuide"
+)
+
+
+def build_doc_text(doc_id: str) -> str:
+    """Return the stdout of the build_doc_text.py helper run for *doc_id*.
+
+    The helper is executed with ``python3`` and its output is decoded as
+    UTF-8. If it returns a non-zero status the process exits with the
+    helper's stderr in the message.
+    """
+    command = ["python3", str(BUILD_DOC), doc_id]
+    proc = subprocess.run(
+        command,
+        capture_output=True,
+        text=True,
+        encoding="utf-8",
+    )
+    if proc.returncode == 0:
+        return proc.stdout
+    sys.exit(f"build_doc_text failed: {proc.stderr}")
+
+
+def call_llm(prompt: str, *, timeout: float = 600) -> str:
+    """Send *prompt* to the ``claude`` CLI on stdin and return its text output.
+
+    The CLI's stdout is written to a temporary file, which is read back and
+    deleted before returning.
+
+    Args:
+        prompt: Full prompt text; passed to the CLI on stdin as UTF-8.
+        timeout: Seconds to wait for the CLI before giving up.
+
+    Returns:
+        The CLI output decoded as UTF-8.
+
+    Exits the process (``sys.exit``) with a short message when the CLI cannot
+    be started, does not finish within *timeout*, or returns a non-zero
+    status.
+    """
+    import tempfile
+
+    env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"}
+    cmd = ["claude", "-p", "--model", "sonnet", "--output-format", "text",
+           "--disallowed-tools", DISALLOWED]
+    tmp = None
+    try:
+        # delete=False: the file has to outlive this handle so it can be read
+        # back once the subprocess is done; the finally block removes it.
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=".txt", delete=False) as out:
+            tmp = out.name
+            try:
+                r = subprocess.run(
+                    cmd,
+                    input=prompt.encode("utf-8"),
+                    stdout=out,
+                    stderr=subprocess.PIPE,
+                    env=env,
+                    timeout=timeout,
+                )
+            except FileNotFoundError as exc:
+                # Same exit style as the non-zero-status path below, instead
+                # of a raw traceback.
+                sys.exit(f"claude failed: could not start the CLI ({exc})")
+            except subprocess.TimeoutExpired:
+                sys.exit(f"claude failed: no result after {timeout:g}s")
+        if r.returncode != 0:
+            sys.exit(f"claude failed rc={r.returncode}: {r.stderr.decode('utf-8','replace')[:500]}")
+        return Path(tmp).read_text(encoding="utf-8")
+    finally:
+        if tmp is not None:
+            try:
+                os.unlink(tmp)
+            except OSError:
+                pass  # best-effort cleanup of the temp file
+
+
+def _strip_wrapping_fence(md: str) -> str:
+    """Remove a code fence wrapped around the whole reply, keeping inner fences."""
+    lines = md.strip().splitlines()
+    if lines and lines[0].startswith("```"):
+        lines = lines[1:]
+        # Only a bare closing fence on the last line belongs to the wrapper.
+        if lines and lines[-1].strip() == "```":
+            lines = lines[:-1]
+    return "\n".join(lines).strip()
+
+
+def main() -> int:
+    """Generate ``reading.md`` for the doc id given as the first CLI argument.
+
+    Steps: build the document text, ask the model for the reading version,
+    then write frontmatter + Markdown to ``raw/<doc-id>--subagent/reading.md``.
+
+    Returns:
+        0 on success. Exits via ``sys.exit`` with a message when the argument
+        is missing, the extracted text is empty, or the model returns nothing.
+    """
+    if len(sys.argv) < 2:
+        sys.exit("usage: 40_reading_version.py <doc-id>")
+    doc_id = sys.argv[1]
+    out_path = RAW / f"{doc_id}--subagent" / "reading.md"
+
+    print(f"[1/3] building doc text for {doc_id} ...")
+    doc_text = build_doc_text(doc_id)
+    if not doc_text.strip():
+        # With no source text the model could only make content up.
+        sys.exit(f"no extracted text for {doc_id}; nothing to rewrite")
+    print(f"      {len(doc_text)} chars (~{len(doc_text)//4} tokens)")
+
+    print("[2/3] generating reading version (Sonnet) ...")
+    md = _strip_wrapping_fence(
+        call_llm(PROMPT.format(doc_id=doc_id, doc_text=doc_text))
+    )
+    if not md:
+        # Do not save a frontmatter-only file and report success.
+        sys.exit(f"empty reading version returned for {doc_id}; nothing written")
+
+    front = (
+        f"---\nschema_version: \"0.1.0\"\ntype: reading\ndoc_id: {doc_id}\n"
+        f"generator: sonnet-reading-v1\n---\n\n"
+    )
+    # Make sure the target folder exists so a finished model run is not lost
+    # to a FileNotFoundError at write time.
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(front + md + "\n", encoding="utf-8")
+    print(f"[3/3] saved {out_path} ({len(md)} chars)")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/web/app/d/[docId]/page.tsx
+++ b/web/app/d/[docId]/page.tsx
@ -6,13 +6,13 @@
 */
 import Link from "next/link";
 import { notFound } from "next/navigation";
-import { readChunksByPage, readIndex, hasChunks } from "@/lib/chunks";
+import { readChunksByPage, readIndex, hasChunks, readReadingVersion } from "@/lib/chunks";
 import { readDocument } from "@/lib/wiki";
 import { pickPitch } from "@/lib/doc-summary";
 import { getLocale } from "@/components/locale-toggle";
 import { AuthBar } from "@/components/auth-bar";
 import { ChatBubble } from "@/components/chat-bubble";
-import { DocRendererV2 } from "@/components/doc-renderer-v2";
+import { DocReadingView } from "@/components/doc-reading-view";
 import { MarkdownBody } from "@/components/markdown-body";

 export const dynamic = "force-dynamic";
@ -45,10 +45,11 @@ export default async function DocPage({
    );
  }

-  const [idx, byPage, doc] = await Promise.all([
+  const [idx, byPage, doc, reading] = await Promise.all([
    readIndex(docId),
    readChunksByPage(docId),
    readDocument(docId),
+    readReadingVersion(docId),
  ]);
  if (!idx) notFound();

@ -135,7 +136,7 @@ export default async function DocPage({
        )}
      </header>

-      <DocRendererV2 docId={docId} chunksByPage={ordered} />
+      <DocReadingView docId={docId} reading={reading} chunksByPage={ordered} />

      <ChatBubble context={{ doc_id: docId }} />
    </main>
--- a/web/components/doc-reading-view.tsx
+++ b/web/components/doc-reading-view.tsx
@ -0,0 +1,62 @@
+"use client";
+
+/**
+ * DocReadingView — toggles between the clean LLM reading version (default) and
+ * the raw extracted chunks. The per-page "ver scan original" stays available
+ * inside the chunks renderer. When no reading version exists, only chunks show.
+ */
+import { useState } from "react";
+import { MarkdownBody } from "@/components/markdown-body";
+import { DocRendererV2 } from "@/components/doc-renderer-v2";
+import type { ParsedChunk } from "@/lib/chunks";
+
+type View = "reading" | "chunks";
+
+/**
+ * Document body with an optional two-tab toggle.
+ *
+ * @param docId         Document id, forwarded to the chunks renderer.
+ * @param reading       Markdown of the reading version, or null when none exists.
+ * @param chunksByPage  Chunks grouped by page, forwarded to the chunks renderer.
+ *
+ * The toggle is rendered only when `reading` is truthy; otherwise the chunks
+ * renderer is shown on its own.
+ */
+export function DocReadingView({
+  docId,
+  reading,
+  chunksByPage,
+}: {
+  docId: string;
+  reading: string | null;
+  chunksByPage: Array<[number, ParsedChunk[]]>;
+}) {
+  // Start on the reading tab only when a reading version was supplied.
+  const [view, setView] = useState<View>(reading ? "reading" : "chunks");
+
+  return (
+    <div>
+      {reading && (
+        <div className="mb-6 flex items-center gap-2 font-mono text-xs">
+          <button
+            type="button"
+            aria-pressed={view === "reading"}
+            onClick={() => setView("reading")}
+            className={`px-3 py-1.5 border rounded ${
+              view === "reading"
+                ? "border-[#00ff9c] text-[#00ff9c] bg-[rgba(0,255,156,0.08)]"
+                : "border-[rgba(0,255,156,0.20)] text-[#8896aa]"
+            }`}
+          >
+            📖 leitura
+          </button>
+          <button
+            type="button"
+            aria-pressed={view === "chunks"}
+            onClick={() => setView("chunks")}
+            className={`px-3 py-1.5 border rounded ${
+              view === "chunks"
+                ? "border-[#7fdbff] text-[#7fdbff] bg-[rgba(127,219,255,0.08)]"
+                : "border-[rgba(127,219,255,0.20)] text-[#8896aa]"
+            }`}
+          >
+            🔍 trechos · scan original
+          </button>
+        </div>
+      )}
+
+      {view === "reading" && reading ? (
+        <article className="markdown-body max-w-3xl text-[#c8d4e6] leading-relaxed">
+          <MarkdownBody>{reading}</MarkdownBody>
+        </article>
+      ) : (
+        <DocRendererV2 docId={docId} chunksByPage={chunksByPage} />
+      )}
+    </div>
+  );
+}
--- a/web/lib/chunks.ts
+++ b/web/lib/chunks.ts
@ -82,6 +82,21 @@ export async function hasChunks(docId: string): Promise<boolean> {
  }
 }

+/** Clean LLM-generated reading version (raw/<doc>--subagent/reading.md), if it
+ *  exists. Returns the Markdown body without frontmatter, or null when the
+ *  file cannot be read. A file that holds only frontmatter yields "". */
+export async function readReadingVersion(docId: string): Promise<string | null> {
+  let raw: string;
+  try {
+    raw = await fs.readFile(path.join(archivePath(docId), "reading.md"), "utf-8");
+  } catch {
+    // Missing or unreadable file: the caller treats null as "no reading version".
+    return null;
+  }
+  if (raw.startsWith("---")) {
+    const end = raw.indexOf("\n---", 3);
+    if (end !== -1) {
+      // End of the closing "---" line. -1 means the file stops right after the
+      // frontmatter, so there is no body (slicing from 0 would leak the header).
+      const eol = raw.indexOf("\n", end + 1);
+      return eol === -1 ? "" : raw.slice(eol + 1).trim();
+    }
+  }
+  return raw.trim();
+}
+
 export async function readIndex(docId: string): Promise<ChunkIndex | null> {
  try {
    const buf = await fs.readFile(path.join(archivePath(docId), "_index.json"), "utf-8");