disclosure-bureau/web/app/d/[docId]/[page]/page.tsx
Luiz Gustavo a7e9dce6d2 rebuild entity layer from Sonnet-vision reextract pipeline
Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity
JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page
extraction. Add synthesize scripts to regenerate wiki/entities from the 116
_reextract.json (30), aggregate missing page.md from chunks (31), and reprocess
805 pages the doc-rebuilder agent dropped on context overflow (32). Add
maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and
typed relation extraction.

Web: wire relations API + entity-relations component; entity/timeline/doc
pages consume the rebuilt layer.

Note: raw/, processing/, wiki/ remain gitignored (bulk data managed
separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on
disk only. The 27 curated anchor events under wiki/entities/events/ are
preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00

142 lines
5.3 KiB
TypeScript

/**
* /d/[docId]/[page] — single-page chunks view.
*
* Scoped to one page (e.g., p007). Shows the PNG of the page alongside
* the chunks for cross-reference.
*/
import Link from "next/link";
import Image from "next/image";
import { notFound } from "next/navigation";
import { readChunksByPage, readIndex, hasChunks } from "@/lib/chunks";
import { readDocument } from "@/lib/wiki";
import { AuthBar } from "@/components/auth-bar";
import { ChatBubble } from "@/components/chat-bubble";
import { DocRendererV2 } from "@/components/doc-renderer-v2";
// Next.js route segment config: "force-dynamic" opts this route out of static
// rendering so the page is rendered on each request. Must stay a plain string
// literal so the framework can read it statically.
export const dynamic = "force-dynamic";
/**
 * Server component for `/d/[docId]/[page]`: renders one page of a document as
 * its scanned PNG next to the chunks extracted for that page.
 *
 * The `page` segment may be the canonical `pNNN` stem (e.g. `p007`) or bare
 * digits (`7`), which are left-padded to three digits. Anything that does not
 * normalise to `p` + exactly three digits, or that denotes page 0, is a 404.
 *
 * Rendering paths:
 * - `hasChunks(docId)` is false: a "not indexed yet" placeholder.
 * - the index cannot be read: 404.
 * - page lies past `idx.total_pages` and has no chunks: 404.
 * - page has no chunks: the scan plus an explanatory empty state.
 * - otherwise: the scan plus the chunks via `DocRendererV2`.
 *
 * @param params Route params promise carrying `docId` and `page`.
 */
export default async function DocPageView({
  params,
}: {
  params: Promise<{ docId: string; page: string }>;
}) {
  const { docId, page } = await params;

  // Normalise the segment to the canonical "pNNN" stem, then validate it.
  const stem = /^p\d{3}$/.test(page) ? page : `p${page.padStart(3, "0")}`;
  const m = stem.match(/^p(\d{3})$/);
  if (!m) notFound();
  const pageNum = parseInt(m[1], 10);
  // Pages are 1-based (the "previous" link is disabled at page 1), so "p000"
  // never denotes a real page.
  if (pageNum < 1) notFound();

  if (!(await hasChunks(docId))) {
    return (
      <main className="min-h-screen p-6 md:p-10 max-w-4xl mx-auto">
        <div className="flex items-start justify-between gap-4 mb-6">
          <Link href={`/d/${docId}`} className="font-mono text-xs text-[#7fdbff] hover:text-[#00ff9c]">
            documento
          </Link>
          <AuthBar />
        </div>
        <div className="border border-[rgba(0,255,156,0.15)] bg-[#0a121e] rounded p-6">
          <h1 className="font-mono text-lg text-[#00ff9c] mb-2"> documento ainda não indexado</h1>
          <p className="text-[#c8d4e6] text-sm">
            Este documento ainda não foi processado.
          </p>
        </div>
      </main>
    );
  }

  // The three reads are independent of each other, so run them concurrently.
  const [idx, byPage, doc] = await Promise.all([
    readIndex(docId),
    readChunksByPage(docId),
    readDocument(docId),
  ]);
  if (!idx) notFound();

  const pageChunks = byPage.get(pageNum) ?? [];
  const totalPages = idx.total_pages;
  // A page beyond the indexed range with nothing extracted has neither a scan
  // nor chunks to show. Pages that do have chunks are still rendered, in case
  // the index under-counts.
  if (pageNum > totalPages && pageChunks.length === 0) notFound();

  const pngUrl = `/api/static/processing/png/${docId}/p-${m[1]}.png`;

  // Frontmatter is untyped here; only use the title when it really is a string.
  const rawTitle: unknown = doc?.fm.canonical_title;
  const title = typeof rawTitle === "string" ? rawTitle : docId;

  // Canonical URL of a sibling page, zero-padded to three digits.
  const pageHref = (n: number) => `/d/${docId}/p${String(n).padStart(3, "0")}`;
  const hasPrev = pageNum > 1;
  const hasNext = pageNum < totalPages;
  const enabledNavClass = "hover:text-[#00ff9c]";
  const disabledNavClass = "opacity-30 pointer-events-none";

  return (
    <main className="min-h-screen p-6 md:p-10 max-w-6xl mx-auto">
      <div className="flex items-start justify-between gap-4 mb-6">
        <div className="flex items-center gap-3 font-mono text-xs">
          <Link href={`/d/${docId}`} className="text-[#7fdbff] hover:text-[#00ff9c]">
            documento inteiro
          </Link>
        </div>
        <AuthBar />
      </div>
      <header className="mb-6 pb-4 border-b border-[rgba(0,255,156,0.32)]">
        <div className="font-mono text-[10px] text-[#5a6678] tracking-widest uppercase mb-2">
          página {pageNum} de {totalPages} · {pageChunks.length} trechos · doc:{" "}
          <span className="text-[#7fdbff]">{docId}</span>
        </div>
        <h1 className="font-mono text-xl text-[#00ff9c]">
          {title} · p{pageNum}
        </h1>
      </header>
      <div className="grid grid-cols-1 lg:grid-cols-[1fr_1fr] gap-8">
        <aside className="lg:sticky lg:top-6 lg:self-start lg:max-h-[85vh] lg:overflow-y-auto">
          <h2 className="font-mono text-xs uppercase tracking-widest text-[#7fdbff] mb-2">
            scanned PNG · 72 DPI
          </h2>
          <div className="border border-[rgba(0,255,156,0.18)] rounded overflow-hidden bg-[#0a121e]">
            <Image
              src={pngUrl}
              alt={`página ${pageNum}`}
              width={800}
              height={1100}
              sizes="(max-width: 1024px) 100vw, 50vw"
              className="block w-full h-auto"
            />
          </div>
          <div className="mt-2 flex items-center justify-between font-mono text-[10px] text-[#5a6678]">
            <Link
              href={hasPrev ? pageHref(pageNum - 1) : "#"}
              className={hasPrev ? enabledNavClass : disabledNavClass}
            >
              p{pageNum - 1}
            </Link>
            <span>
              {pageNum} / {totalPages}
            </span>
            <Link
              href={hasNext ? pageHref(pageNum + 1) : "#"}
              className={hasNext ? enabledNavClass : disabledNavClass}
            >
              p{pageNum + 1}
            </Link>
          </div>
        </aside>
        <article>
          <h2 className="font-mono text-xs uppercase tracking-widest text-[#7fdbff] mb-2">
            trechos (ordem de leitura)
          </h2>
          {pageChunks.length === 0 ? (
            <div className="border border-[rgba(0,255,156,0.15)] bg-[#0a121e] rounded p-6 text-sm text-[#c8d4e6]">
              <p className="font-mono text-[#7fdbff] mb-2"> página sem trechos extraídos</p>
              <p className="text-[#5a6678] text-xs">
                O scan existe (veja à esquerda) mas o processo de chunking não gerou trechos
                para esta página específica. Pode ser página em branco, divisor de seção
                ou conteúdo sem texto extraível. Próxima execução do chunker preencherá.
              </p>
            </div>
          ) : (
            <DocRendererV2 docId={docId} chunksByPage={[[pageNum, pageChunks]]} />
          )}
        </article>
      </div>
      <ChatBubble context={{ doc_id: docId, page_id: `${docId}/${stem}` }} />
    </main>
  );
}