add clean LLM reading version of documents (the core goal)
Scanned docs are messy — duplicate transcriptions (typed + handwritten), two classification variants of the same narrative, OCR noise, repeated banners. The doc page showed raw chunks, so everything appeared twice. 40_reading_version.py generates ONE clean, deduplicated, well-structured bilingual Markdown reading version per doc (Sonnet): merges duplicate versions without losing unique lines, drops page furniture, formats transcripts as dialogue. Faithful — invents nothing; redactions kept as markers. /d/[docId] now defaults to a "📖 leitura" tab rendering this clean version, with "🔍 trechos · scan original" preserving the faithful per-chunk + per-page scan view. reading.md lives in raw/<doc>--subagent/ alongside the chunks. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
5b62d0a3fe
commit
e75ca5eda2
4 changed files with 217 additions and 4 deletions
135
scripts/synthesize/40_reading_version.py
Normal file
135
scripts/synthesize/40_reading_version.py
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
40_reading_version.py — Generate a clean, webdesigner-grade reading version of a
|
||||
scanned document from its already-extracted chunks.
|
||||
|
||||
The scanned PDFs are messy: duplicate transcriptions (typed + handwritten),
|
||||
OCR noise, repeated headers/banners, two classification variants of the same
|
||||
narrative. This pass uses an LLM to produce ONE clean, deduplicated, well-
|
||||
structured bilingual Markdown document for reading — faithful to the content
|
||||
(invents nothing) but merging duplicate versions and dropping page furniture.
|
||||
|
||||
Output: raw/<doc>--subagent/reading.md (frontmatter + EN + PT-BR sections)
|
||||
|
||||
The raw chunks and per-page scan view stay untouched ("ver scan original").
|
||||
|
||||
Run:
|
||||
python3 scripts/synthesize/40_reading_version.py <doc-id>
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Repository root. Defaults to the original checkout location; set the UFO_ROOT
# environment variable to run the script from a different machine or path.
UFO = Path(os.environ.get("UFO_ROOT", "/Users/guto/ufo"))
# Parent of the per-document "<doc-id>--subagent" folders where reading.md is written.
RAW = UFO / "raw"
# Script invoked with a doc id; its stdout is used as the document text.
BUILD_DOC = UFO / "scripts" / "reextract" / "build_doc_text.py"
|
||||
|
||||
PROMPT = """You are a meticulous archivist-typographer for The Disclosure Bureau, an
|
||||
investigative wiki of declassified UAP/UFO documents. You receive the raw
|
||||
machine-extracted text of ONE scanned document (chunk by chunk, with page
|
||||
markers). The scan is messy: it often contains the SAME content twice (e.g. a
|
||||
typed transcript followed by a handwritten re-transcription, or a SECRET//NOFORN
|
||||
narrative immediately followed by a near-identical SECRET//REL version), plus
|
||||
OCR noise, repeated letterheads, classification banners, page numbers and
|
||||
routing stamps.
|
||||
|
||||
Produce ONE clean, faithful, beautifully structured reading version in Markdown.
|
||||
|
||||
RULES (non-negotiable):
|
||||
1. FAITHFUL — never invent facts, names, dates, codes, or quotes. Use only what
|
||||
is in the text. If something is redacted/illegible, keep a marker like
|
||||
[redacted] / [ilegível].
|
||||
2. DEDUPLICATE — when the same content appears more than once (typed vs
|
||||
handwritten, NOFORN vs REL), MERGE into a single best version. Prefer the
|
||||
most complete/legible wording. Never drop unique details that appear in only
|
||||
one version (e.g. a line spoken by a different person).
|
||||
3. DROP PAGE FURNITURE — remove repeated letterheads, classification banners,
|
||||
bare page numbers, routing stamps, "DISPATCHED" stamps, distribution lists,
|
||||
and OCR garbage. Keep ONE classification line at the top if present.
|
||||
4. STRUCTURE — use clear Markdown: a top H1 title, short intro line, logical
|
||||
headings (## sections), and for transcripts use a clean dialogue format
|
||||
(**SPEAKER:** line). Preserve chronological/communication order.
|
||||
5. BILINGUAL — output BOTH languages. First the full English reading version
|
||||
under "## Reading (EN)", then the full Brazilian-Portuguese version under
|
||||
"## Leitura (PT-BR)". PT-BR must be natural Brazilian Portuguese with correct
|
||||
accents.
|
||||
6. PRESERVE INVESTIGATIVE SUBSTANCE — every sighting detail, coordinate, time,
|
||||
witness name, object description, and quote that matters to an investigation
|
||||
must survive the cleanup.
|
||||
|
||||
Return ONLY the Markdown body (no code fence, no preamble). Start directly with
|
||||
the H1 title line.
|
||||
|
||||
DOCUMENT (doc_id: {doc_id}) — raw extracted chunks follow:
|
||||
|
||||
{doc_text}
|
||||
"""
|
||||
|
||||
# Comma-separated tool names passed to `claude --disallowed-tools` by call_llm.
DISALLOWED = ",".join([
    "AskUserQuestion", "Bash", "Edit", "Write", "Read", "Task", "Glob", "Grep",
    "TaskCreate", "TaskUpdate", "TaskList", "TaskGet", "TaskStop", "TaskOutput",
    "Skill", "ScheduleWakeup", "Monitor", "WebSearch", "WebFetch", "NotebookEdit",
    "EnterPlanMode", "ExitPlanMode", "EnterWorktree", "ExitWorktree",
    "CronCreate", "CronDelete", "CronList", "RemoteTrigger", "ToolSearch",
    "PushNotification", "ListMcpResourcesTool", "ReadMcpResourceTool",
    "ShareOnboardingGuide",
])
|
||||
|
||||
|
||||
def build_doc_text(doc_id: str) -> str:
    """Return the text printed by the BUILD_DOC script for ``doc_id``.

    Runs ``python3 <BUILD_DOC> <doc_id>`` and returns its stdout decoded as
    UTF-8. If the script exits non-zero, the process terminates via
    ``sys.exit`` with a message that includes the script's stderr.
    """
    command = ["python3", str(BUILD_DOC), doc_id]
    proc = subprocess.run(command, capture_output=True, text=True, encoding="utf-8")
    if proc.returncode == 0:
        return proc.stdout
    sys.exit(f"build_doc_text failed: {proc.stderr}")
|
||||
|
||||
|
||||
def call_llm(prompt: str) -> str:
    """Send ``prompt`` to the ``claude`` CLI on stdin and return its text output.

    The CLI is run in print mode (``-p``) with the ``sonnet`` model, text output
    format, and the tools listed in ``DISALLOWED`` disabled. The environment is
    inherited, with ``CLAUDE_CODE_MAX_OUTPUT_TOKENS`` set to ``32000``.

    Args:
        prompt: Full prompt text; encoded as UTF-8 before being piped in.

    Returns:
        The CLI's stdout decoded as UTF-8.

    Exits:
        Terminates the process via ``sys.exit`` when the ``claude`` executable
        is missing, the call exceeds the 600 s timeout, or the CLI returns a
        non-zero exit code (the first 500 characters of stderr are reported).
    """
    env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"}
    cmd = ["claude", "-p", "--model", "sonnet", "--output-format", "text",
           "--disallowed-tools", DISALLOWED]
    try:
        # Both streams are piped and drained by subprocess.run itself, so no
        # temporary file is needed to hold the output.
        r = subprocess.run(
            cmd,
            input=prompt.encode("utf-8"),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=env,
            timeout=600,
        )
    except FileNotFoundError:
        # Report a missing CLI the same way as other failures, not as a traceback.
        sys.exit("claude failed: `claude` executable not found on PATH")
    except subprocess.TimeoutExpired:
        sys.exit("claude failed: timed out after 600s")
    if r.returncode != 0:
        sys.exit(f"claude failed rc={r.returncode}: {r.stderr.decode('utf-8','replace')[:500]}")
    return r.stdout.decode("utf-8")
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: build the doc text, ask the LLM for a reading version, save it.

    Expects the document id as the first command-line argument and writes
    ``RAW/<doc-id>--subagent/reading.md`` (YAML frontmatter followed by the
    Markdown returned by the LLM).

    Returns:
        0 on success. Usage errors, a missing output directory, and an empty
        LLM response terminate the process via ``sys.exit`` with a message.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: 40_reading_version.py <doc-id>")
    doc_id = sys.argv[1]
    out_path = RAW / f"{doc_id}--subagent" / "reading.md"
    # Fail fast: without this check a wrong doc id only surfaces when writing
    # the file, after the slow LLM call has already been made.
    if not out_path.parent.is_dir():
        sys.exit(f"output directory not found: {out_path.parent}")

    print(f"[1/3] building doc text for {doc_id} ...")
    doc_text = build_doc_text(doc_id)
    print(f"      {len(doc_text)} chars (~{len(doc_text)//4} tokens)")

    print("[2/3] generating reading version (Sonnet) ...")
    md = call_llm(PROMPT.format(doc_id=doc_id, doc_text=doc_text)).strip()
    if md.startswith("```"):
        # The model wrapped the whole answer in a code fence despite the
        # instructions. Remove only the wrapping fence lines; fences inside the
        # document body are content and must be kept.
        lines = md.splitlines()[1:]
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
        md = "\n".join(lines).strip()
    if not md:
        # Never save an empty reading version; the page would render a blank tab.
        sys.exit("claude returned an empty reading version; nothing written")

    front = (
        f"---\nschema_version: \"0.1.0\"\ntype: reading\ndoc_id: {doc_id}\n"
        f"generator: sonnet-reading-v1\n---\n\n"
    )
    out_path.write_text(front + md + "\n", encoding="utf-8")
    print(f"[3/3] saved {out_path} ({len(md)} chars)")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
|
@ -6,13 +6,13 @@
|
|||
*/
|
||||
import Link from "next/link";
|
||||
import { notFound } from "next/navigation";
|
||||
import { readChunksByPage, readIndex, hasChunks } from "@/lib/chunks";
|
||||
import { readChunksByPage, readIndex, hasChunks, readReadingVersion } from "@/lib/chunks";
|
||||
import { readDocument } from "@/lib/wiki";
|
||||
import { pickPitch } from "@/lib/doc-summary";
|
||||
import { getLocale } from "@/components/locale-toggle";
|
||||
import { AuthBar } from "@/components/auth-bar";
|
||||
import { ChatBubble } from "@/components/chat-bubble";
|
||||
import { DocRendererV2 } from "@/components/doc-renderer-v2";
|
||||
import { DocReadingView } from "@/components/doc-reading-view";
|
||||
import { MarkdownBody } from "@/components/markdown-body";
|
||||
|
||||
export const dynamic = "force-dynamic";
|
||||
|
|
@ -45,10 +45,11 @@ export default async function DocPage({
|
|||
);
|
||||
}
|
||||
|
||||
const [idx, byPage, doc] = await Promise.all([
|
||||
const [idx, byPage, doc, reading] = await Promise.all([
|
||||
readIndex(docId),
|
||||
readChunksByPage(docId),
|
||||
readDocument(docId),
|
||||
readReadingVersion(docId),
|
||||
]);
|
||||
if (!idx) notFound();
|
||||
|
||||
|
|
@ -135,7 +136,7 @@ export default async function DocPage({
|
|||
)}
|
||||
</header>
|
||||
|
||||
<DocRendererV2 docId={docId} chunksByPage={ordered} />
|
||||
<DocReadingView docId={docId} reading={reading} chunksByPage={ordered} />
|
||||
|
||||
<ChatBubble context={{ doc_id: docId }} />
|
||||
</main>
|
||||
|
|
|
|||
62
web/components/doc-reading-view.tsx
Normal file
62
web/components/doc-reading-view.tsx
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
"use client";
|
||||
|
||||
/**
|
||||
* DocReadingView — toggles between the clean LLM reading version (default) and
|
||||
* the raw extracted chunks. The per-page "ver scan original" stays available
|
||||
* inside the chunks renderer. When no reading version exists, only chunks show.
|
||||
*/
|
||||
import { useState } from "react";
|
||||
import { MarkdownBody } from "@/components/markdown-body";
|
||||
import { DocRendererV2 } from "@/components/doc-renderer-v2";
|
||||
import type { ParsedChunk } from "@/lib/chunks";
|
||||
|
||||
type View = "reading" | "chunks";
|
||||
|
||||
/**
 * Renders a document body with an optional two-way toggle.
 *
 * When `reading` is a non-empty string, two buttons are shown and the clean
 * reading version is displayed first; the second button switches to the chunk
 * renderer. When `reading` is null or empty, no toggle is shown and only the
 * chunk renderer is rendered.
 *
 * @param docId - Document id forwarded to the chunk renderer.
 * @param reading - Markdown of the reading version, or null when none exists.
 * @param chunksByPage - Page-number / chunk-list pairs forwarded to the chunk renderer.
 */
export function DocReadingView({
  docId,
  reading,
  chunksByPage,
}: {
  docId: string;
  reading: string | null;
  chunksByPage: Array<[number, ParsedChunk[]]>;
}) {
  const [view, setView] = useState<View>(reading ? "reading" : "chunks");

  return (
    <div>
      {reading && (
        <div className="mb-6 flex items-center gap-2 font-mono text-xs">
          {/* type="button" keeps the toggles from acting as submit buttons if
              this component is ever rendered inside a form; aria-pressed
              exposes the active view to assistive technology. */}
          <button
            type="button"
            aria-pressed={view === "reading"}
            onClick={() => setView("reading")}
            className={`px-3 py-1.5 border rounded ${
              view === "reading"
                ? "border-[#00ff9c] text-[#00ff9c] bg-[rgba(0,255,156,0.08)]"
                : "border-[rgba(0,255,156,0.20)] text-[#8896aa]"
            }`}
          >
            📖 leitura
          </button>
          <button
            type="button"
            aria-pressed={view === "chunks"}
            onClick={() => setView("chunks")}
            className={`px-3 py-1.5 border rounded ${
              view === "chunks"
                ? "border-[#7fdbff] text-[#7fdbff] bg-[rgba(127,219,255,0.08)]"
                : "border-[rgba(127,219,255,0.20)] text-[#8896aa]"
            }`}
          >
            🔍 trechos · scan original
          </button>
        </div>
      )}

      {view === "reading" && reading ? (
        <article className="markdown-body max-w-3xl text-[#c8d4e6] leading-relaxed">
          <MarkdownBody>{reading}</MarkdownBody>
        </article>
      ) : (
        <DocRendererV2 docId={docId} chunksByPage={chunksByPage} />
      )}
    </div>
  );
}
|
||||
|
|
@ -82,6 +82,21 @@ export async function hasChunks(docId: string): Promise<boolean> {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Reads the clean LLM-generated reading version
 * (`raw/<doc>--subagent/reading.md`) for a document, if it exists.
 *
 * A leading frontmatter block (a first line starting with `---` up to the next
 * line starting with `---`) is removed. If the file has no frontmatter, or the
 * frontmatter is never closed, the whole trimmed file is returned.
 *
 * @param docId - Document id used to locate the archive folder.
 * @returns The trimmed Markdown body without frontmatter (an empty string when
 *   the file holds frontmatter only), or `null` when the file cannot be read.
 */
export async function readReadingVersion(docId: string): Promise<string | null> {
  let raw: string;
  try {
    raw = await fs.readFile(path.join(archivePath(docId), "reading.md"), "utf-8");
  } catch {
    // Missing or unreadable file: the caller treats null as "no reading version".
    return null;
  }

  if (!raw.startsWith("---")) return raw.trim();

  // Start the search after the opening fence so it is not matched again.
  const closingFence = raw.indexOf("\n---", 3);
  if (closingFence === -1) return raw.trim();

  // The body begins on the line after the closing fence. When the fence is the
  // last line and has no trailing newline there is no body at all; slicing from
  // index 0 here would wrongly return the frontmatter as content.
  const fenceLineEnd = raw.indexOf("\n", closingFence + 1);
  if (fenceLineEnd === -1) return "";
  return raw.slice(fenceLineEnd + 1).trim();
}
|
||||
|
||||
export async function readIndex(docId: string): Promise<ChunkIndex | null> {
|
||||
try {
|
||||
const buf = await fs.readFile(path.join(archivePath(docId), "_index.json"), "utf-8");
|
||||
|
|
|
|||
Loading…
Reference in a new issue