Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
142 lines
5.3 KiB
TypeScript
142 lines
5.3 KiB
TypeScript
/**
 * /d/[docId]/[page] — single-page chunks view.
 *
 * Scoped to one page (e.g., p007). Shows the PNG of the page alongside
 * the chunks for cross-reference.
 */
|
|
import Link from "next/link";
|
|
import Image from "next/image";
|
|
import { notFound } from "next/navigation";
|
|
import { readChunksByPage, readIndex, hasChunks } from "@/lib/chunks";
|
|
import { readDocument } from "@/lib/wiki";
|
|
import { AuthBar } from "@/components/auth-bar";
|
|
import { ChatBubble } from "@/components/chat-bubble";
|
|
import { DocRendererV2 } from "@/components/doc-renderer-v2";
|
|
|
|
export const dynamic = "force-dynamic";
|
|
|
|
/**
 * Server component for `/d/[docId]/[page]`: renders one page of a document
 * as its PNG next to the chunks recorded for that page.
 *
 * Flow:
 *  1. Normalise the `page` segment to the `pNNN` form (a segment that is not
 *     already `pNNN` is left-padded with zeros and prefixed with `p`);
 *     anything that still does not match `p` + three digits is a 404.
 *  2. If `hasChunks(docId)` is false, render a "not indexed yet" placeholder.
 *  3. Otherwise load the index, the per-page chunk map and the document
 *     record in parallel; a missing index is a 404.
 *  4. Reject page numbers outside the document (see guard below).
 *  5. Render the PNG, prev/next navigation, the chunk list (or an
 *     "no chunks for this page" notice) and the chat bubble.
 *
 * @param params Route params promise carrying `docId` and `page`.
 */
export default async function DocPageView({
  params,
}: {
  params: Promise<{ docId: string; page: string }>;
}) {
  const { docId, page } = await params;

  // Accept both "p007" and bare "7"/"007"; everything else fails the match.
  const stem = /^p\d{3}$/.test(page) ? page : `p${page.padStart(3, "0")}`;
  const m = stem.match(/^p(\d{3})$/);
  if (!m) notFound();
  const pageDigits = m[1];
  const pageNum = Number.parseInt(pageDigits, 10);

  // Route for page `n` of this document, using the zero-padded segment.
  const pageHref = (n: number): string =>
    `/d/${docId}/p${String(n).padStart(3, "0")}`;

  if (!(await hasChunks(docId))) {
    return (
      <main className="min-h-screen p-6 md:p-10 max-w-4xl mx-auto">
        <div className="flex items-start justify-between gap-4 mb-6">
          <Link href={`/d/${docId}`} className="font-mono text-xs text-[#7fdbff] hover:text-[#00ff9c]">
            ← documento
          </Link>
          <AuthBar />
        </div>
        <div className="border border-[rgba(0,255,156,0.15)] bg-[#0a121e] rounded p-6">
          <h1 className="font-mono text-lg text-[#00ff9c] mb-2">▍ documento ainda não indexado</h1>
          <p className="text-[#c8d4e6] text-sm">
            Este documento ainda não foi processado.
          </p>
        </div>
      </main>
    );
  }

  const [idx, byPage, doc] = await Promise.all([
    readIndex(docId),
    readChunksByPage(docId),
    readDocument(docId),
  ]);
  if (!idx) notFound();

  const totalPages = idx.total_pages;
  const pageChunks = byPage.get(pageNum) ?? [];

  // Out-of-range pages (p000, or past the indexed page count) used to render
  // a broken image with a misleading "scan exists" notice. Treat them as 404.
  // A page beyond `total_pages` that nevertheless has chunks is still served,
  // so a stale index count cannot hide real content.
  if (pageNum < 1 || (pageNum > totalPages && pageChunks.length === 0)) {
    notFound();
  }

  const pngUrl = `/api/static/processing/png/${docId}/p-${pageDigits}.png`;
  const title = (doc?.fm.canonical_title as string) ?? docId;
  const hasPrev = pageNum > 1;
  const hasNext = pageNum < totalPages;
  const disabledNavClass = "opacity-30 pointer-events-none";
  const enabledNavClass = "hover:text-[#00ff9c]";

  return (
    <main className="min-h-screen p-6 md:p-10 max-w-6xl mx-auto">
      <div className="flex items-start justify-between gap-4 mb-6">
        <div className="flex items-center gap-3 font-mono text-xs">
          <Link href={`/d/${docId}`} className="text-[#7fdbff] hover:text-[#00ff9c]">
            ← documento inteiro
          </Link>
        </div>
        <AuthBar />
      </div>

      <header className="mb-6 pb-4 border-b border-[rgba(0,255,156,0.32)]">
        <div className="font-mono text-[10px] text-[#5a6678] tracking-widest uppercase mb-2">
          página {pageNum} de {totalPages} · {pageChunks.length} trechos · doc:{" "}
          <span className="text-[#7fdbff]">{docId}</span>
        </div>
        <h1 className="font-mono text-xl text-[#00ff9c]">
          ▍ {title} · p{pageNum}
        </h1>
      </header>

      <div className="grid grid-cols-1 lg:grid-cols-[1fr_1fr] gap-8">
        <aside className="lg:sticky lg:top-6 lg:self-start lg:max-h-[85vh] lg:overflow-y-auto">
          <h2 className="font-mono text-xs uppercase tracking-widest text-[#7fdbff] mb-2">
            scanned PNG · 72 DPI
          </h2>
          <div className="border border-[rgba(0,255,156,0.18)] rounded overflow-hidden bg-[#0a121e]">
            <Image
              src={pngUrl}
              alt={`página ${pageNum}`}
              width={800}
              height={1100}
              sizes="(max-width: 1024px) 100vw, 50vw"
              className="block w-full h-auto"
            />
          </div>
          <div className="mt-2 flex items-center justify-between font-mono text-[10px] text-[#5a6678]">
            {/* Disabled links keep their slot so the counter stays centred. */}
            <Link
              href={hasPrev ? pageHref(pageNum - 1) : "#"}
              className={hasPrev ? enabledNavClass : disabledNavClass}
            >
              ← p{pageNum - 1}
            </Link>
            <span>
              {pageNum} / {totalPages}
            </span>
            <Link
              href={hasNext ? pageHref(pageNum + 1) : "#"}
              className={hasNext ? enabledNavClass : disabledNavClass}
            >
              p{pageNum + 1} →
            </Link>
          </div>
        </aside>

        <article>
          <h2 className="font-mono text-xs uppercase tracking-widest text-[#7fdbff] mb-2">
            trechos (ordem de leitura)
          </h2>
          {pageChunks.length === 0 ? (
            <div className="border border-[rgba(0,255,156,0.15)] bg-[#0a121e] rounded p-6 text-sm text-[#c8d4e6]">
              <p className="font-mono text-[#7fdbff] mb-2">▍ página sem trechos extraídos</p>
              <p className="text-[#5a6678] text-xs">
                O scan existe (veja à esquerda) mas o processo de chunking não gerou trechos
                para esta página específica. Pode ser página em branco, divisor de seção
                ou conteúdo sem texto extraível. Próxima execução do chunker preencherá.
              </p>
            </div>
          ) : (
            <DocRendererV2 docId={docId} chunksByPage={[[pageNum, pageChunks]]} />
          )}
        </article>
      </div>

      <ChatBubble context={{ doc_id: docId, page_id: `${docId}/${stem}` }} />
    </main>
  );
}
|