disclosure-bureau/web/lib/chunks.ts
Luiz Gustavo e75ca5eda2 add clean LLM reading version of documents (the core goal)
Scanned docs are messy — duplicate transcriptions (typed + handwritten),
two classification variants of the same narrative, OCR noise, repeated
banners. The doc page showed raw chunks, so everything appeared twice.

40_reading_version.py generates ONE clean, deduplicated, well-structured
bilingual Markdown reading version per doc (Sonnet): merges duplicate versions
without losing unique lines, drops page furniture, formats transcripts as
dialogue. Faithful — invents nothing; redactions kept as markers.

/d/[docId] now defaults to a "📖 leitura" tab rendering this clean version,
with "🔍 trechos · scan original" preserving the faithful per-chunk + per-page
scan view. reading.md lives in raw/<doc>--subagent/ alongside the chunks.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 17:23:36 -03:00

155 lines
4.5 KiB
TypeScript

/**
* Read agentic chunks from raw/<doc-id>--subagent/ filesystem.
*
* The DB (Postgres + pgvector) is the retrieval layer, but the filesystem
* remains source-of-truth. Server components use these helpers when they
* need full chunk content (not just hits from retrieval).
*/
import fs from "node:fs/promises";
import path from "node:path";
import matter from "gray-matter";
import { UFO_ROOT } from "./wiki";
/** The `raw` directory directly under UFO_ROOT; archivePath() resolves each document's `<docId>--subagent` directory beneath it. */
export const RAW = path.join(UFO_ROOT, "raw");
/**
 * Shape that readChunk() assumes for the frontmatter of a chunk file.
 *
 * The parsed frontmatter is cast to this type, not validated, so a field may
 * be missing or mistyped at runtime if the file on disk disagrees.
 * Within this module only `page` and `order_in_page` are read (by
 * readChunksByPage); every other field is handed to callers untouched.
 */
export interface ChunkFrontmatter {
chunk_id: string;
type: string;
// Grouping key used by readChunksByPage.
page: number;
// Ascending sort key within a page in readChunksByPage.
order_in_page: number;
order_global: number;
// Rectangle as x/y/w/h, or null. Units and origin are not established in this file — TODO confirm.
bbox: { x: number; y: number; w: number; h: number } | null;
classification: string | null;
formatting?: string[];
cross_page_hint?: string;
// NOTE(review): presumably chunk_ids of the neighbouring chunks — confirm against the generator.
prev_chunk?: string | null;
next_chunk?: string | null;
related_image?: string | null;
related_table?: string | null;
// NOTE(review): scale of the confidence value is not shown here — confirm.
ocr_confidence?: number | null;
ocr_source_lines?: number[];
redaction_code?: string | null;
redaction_inferred_content_type?: string | null;
image_type?: string | null;
ufo_anomaly_detected?: boolean;
ufo_anomaly_type?: string | null;
ufo_anomaly_rationale?: string | null;
cryptid_anomaly_detected?: boolean;
cryptid_anomaly_type?: string | null;
cryptid_anomaly_rationale?: string | null;
image_description_en?: string | null;
image_description_pt_br?: string | null;
extracted_text?: string | null;
source_png?: string;
}
/** One chunk file after parsing, as produced by readChunk(). */
export interface ParsedChunk {
// Frontmatter of the chunk file, cast to ChunkFrontmatter without validation.
fm: ChunkFrontmatter;
// Text following the `**EN:**` marker on its line in the body; "" when no such line exists.
content_en: string;
// Text following the `**PT-BR:**` marker on its line in the body; "" when no such line exists.
content_pt: string;
}
/**
 * Shape that readIndex() assumes for `_index.json` in a document's archive
 * directory. The parsed JSON is cast to this type without validation.
 */
export interface ChunkIndex {
doc_id: string;
schema_version: string;
total_pages: number;
total_chunks: number;
build_approach: string;
build_model: string;
// NOTE(review): presumably a timestamp string; format is not shown here — confirm.
build_at: string;
// One entry per chunk. readAllChunks walks this array in order and loads each
// chunk by its chunk_id; the other entry fields are not read by this module.
chunks: Array<{
chunk_id: string;
type: string;
page: number;
order_in_page: number;
order_global: number;
file: string;
// Declared non-null here, whereas ChunkFrontmatter.bbox allows null.
bbox: { x: number; y: number; w: number; h: number };
preview: string;
}>;
}
/** Build the path of a document's archive directory: `<RAW>/<docId>--subagent`. */
function archivePath(docId: string): string {
  const dirName = `${docId}--subagent`;
  return path.join(RAW, dirName);
}
/**
 * Report whether `_index.json` is accessible inside the document's archive
 * directory. Any failure of the access check resolves to false.
 */
export async function hasChunks(docId: string): Promise<boolean> {
  let found = true;
  try {
    const indexFile = path.join(archivePath(docId), "_index.json");
    await fs.access(indexFile);
  } catch {
    found = false;
  }
  return found;
}
/**
 * Remove a leading `---` … `---` frontmatter block from Markdown text and
 * return the trimmed remainder.
 *
 * Text that does not start with `---`, or whose opening fence is never
 * closed, is returned trimmed but otherwise unchanged.
 */
function stripFrontmatter(raw: string): string {
  if (!raw.startsWith("---")) return raw.trim();

  // Search from offset 3 so the opening fence itself is never matched.
  const closingFence = raw.indexOf("\n---", 3);
  if (closingFence === -1) return raw.trim();

  // The body starts on the line after the closing fence.
  const fenceLineEnd = raw.indexOf("\n", closingFence + 1);
  // No newline after the closing fence means the file ends there: the body is
  // empty. (Slicing from -1 + 1 = 0 would hand back the frontmatter too.)
  if (fenceLineEnd === -1) return "";

  return raw.slice(fenceLineEnd + 1).trim();
}

/**
 * Clean LLM-generated reading version (raw/<doc>--subagent/reading.md), if it
 * exists.
 *
 * @param docId Document id used to locate the archive directory.
 * @returns The Markdown body without frontmatter (possibly ""), or null when
 *   the file cannot be read.
 */
export async function readReadingVersion(docId: string): Promise<string | null> {
  try {
    const raw = await fs.readFile(path.join(archivePath(docId), "reading.md"), "utf-8");
    return stripFrontmatter(raw);
  } catch {
    return null;
  }
}
/**
 * Load and parse `_index.json` from the document's archive directory.
 *
 * The parsed JSON is cast to ChunkIndex without validation. Resolves to null
 * when the file cannot be read or is not valid JSON.
 */
export async function readIndex(docId: string): Promise<ChunkIndex | null> {
  try {
    const indexFile = path.join(archivePath(docId), "_index.json");
    const text = await fs.readFile(indexFile, "utf-8");
    const parsed: unknown = JSON.parse(text);
    return parsed as ChunkIndex;
  } catch {
    return null;
  }
}
/**
 * Pull the English and Portuguese texts out of a chunk body.
 *
 * Each line is trimmed; a line starting with `**EN:**` or `**PT-BR:**`
 * supplies the text for that language (marker and following whitespace
 * removed). Only the marker line itself is captured, and when a marker occurs
 * more than once the last occurrence wins. A missing marker yields "".
 */
function splitBilingual(body: string): { en: string; pt: string } {
  const EN_MARKER = "**EN:**";
  const PT_MARKER = "**PT-BR:**";
  const texts = { en: "", pt: "" };

  body
    .split("\n")
    .map((rawLine) => rawLine.trim())
    .forEach((line) => {
      if (line.startsWith(EN_MARKER)) {
        texts.en = line.slice(EN_MARKER.length).trimStart();
      } else if (line.startsWith(PT_MARKER)) {
        texts.pt = line.slice(PT_MARKER.length).trimStart();
      }
    });

  return texts;
}
/**
 * Load a single chunk file (`chunks/<chunkId>.md`) from the document's
 * archive directory and parse it.
 *
 * The frontmatter is cast to ChunkFrontmatter without validation; the body is
 * split into its English and Portuguese texts. Resolves to null when the file
 * cannot be read or parsed.
 */
export async function readChunk(docId: string, chunkId: string): Promise<ParsedChunk | null> {
  try {
    const chunkFile = path.join(archivePath(docId), "chunks", `${chunkId}.md`);
    const { data, content } = matter(await fs.readFile(chunkFile, "utf-8"));
    const texts = splitBilingual(content);
    return {
      fm: data as ChunkFrontmatter,
      content_en: texts.en,
      content_pt: texts.pt,
    };
  } catch {
    return null;
  }
}
/**
 * Load every chunk listed in the document's index, in index order.
 *
 * Chunks are read one at a time; entries whose file cannot be loaded are
 * skipped. Resolves to an empty array when the index itself is unavailable.
 */
export async function readAllChunks(docId: string): Promise<ParsedChunk[]> {
  const index = await readIndex(docId);
  const loaded: ParsedChunk[] = [];
  if (index) {
    for (const { chunk_id: chunkId } of index.chunks) {
      const parsed = await readChunk(docId, chunkId);
      if (parsed) {
        loaded.push(parsed);
      }
    }
  }
  return loaded;
}
/**
 * Group a document's chunks by page number.
 *
 * Pages appear in the map in the order they are first encountered among the
 * loaded chunks; within each page, chunks are sorted ascending by
 * `order_in_page`.
 */
export async function readChunksByPage(docId: string): Promise<Map<number, ParsedChunk[]>> {
  const grouped = new Map<number, ParsedChunk[]>();

  for (const chunk of await readAllChunks(docId)) {
    const pageNo = chunk.fm.page;
    const bucket = grouped.get(pageNo);
    if (bucket) {
      bucket.push(chunk);
    } else {
      grouped.set(pageNo, [chunk]);
    }
  }

  grouped.forEach((bucket) => {
    bucket.sort((left, right) => left.fm.order_in_page - right.fm.order_in_page);
  });
  return grouped;
}