disclosure-bureau/web/lib/chunks.ts

141 lines
4 KiB
TypeScript
Raw Normal View History

/**
* Read agentic chunks from the raw/<doc-id>--subagent/ directories on the filesystem.
*
* The DB (Postgres + pgvector) is the retrieval layer, but the filesystem
* remains the source of truth. Server components use these helpers when they
* need full chunk content (not just hits from retrieval).
*/
import fs from "node:fs/promises";
import path from "node:path";
import matter from "gray-matter";
import { UFO_ROOT } from "./wiki";
/** Path of the `raw` directory directly under `UFO_ROOT`; chunk archives are resolved relative to it. */
export const RAW = path.join(UFO_ROOT, "raw");
/**
 * Typed view of the front matter of a single chunk file.
 *
 * `readChunk` casts the parsed front-matter data to this shape without any
 * runtime validation, so these types describe what the files are expected to
 * contain rather than a checked guarantee.
 */
export interface ChunkFrontmatter {
  // --- Identity and position -------------------------------------------
  chunk_id: string;
  type: string;
  /** Page the chunk belongs to; `readChunksByPage` groups on this value. */
  page: number;
  /** Ascending sort key used by `readChunksByPage` within one page. */
  order_in_page: number;
  /** Presumably the position across the whole document — confirm with the producer. */
  order_global: number;
  /** Bounding box, or `null`. Coordinate system and units are not defined in this file. */
  bbox: { x: number; y: number; w: number; h: number } | null;
  classification: string | null;

  // --- Optional layout hints and links to other chunks ------------------
  // (link fields presumably hold chunk ids — confirm with the producer)
  formatting?: string[];
  cross_page_hint?: string;
  prev_chunk?: string | null;
  next_chunk?: string | null;
  related_image?: string | null;
  related_table?: string | null;

  // --- Optional OCR metadata --------------------------------------------
  ocr_confidence?: number | null;
  ocr_source_lines?: number[];

  // --- Optional redaction metadata --------------------------------------
  redaction_code?: string | null;
  redaction_inferred_content_type?: string | null;

  // --- Optional image and anomaly annotations ---------------------------
  image_type?: string | null;
  ufo_anomaly_detected?: boolean;
  ufo_anomaly_type?: string | null;
  ufo_anomaly_rationale?: string | null;
  cryptid_anomaly_detected?: boolean;
  cryptid_anomaly_type?: string | null;
  cryptid_anomaly_rationale?: string | null;
  image_description_en?: string | null;
  image_description_pt_br?: string | null;
  extracted_text?: string | null;
  source_png?: string;
}
/**
 * One chunk as produced by `readChunk`: the file's front matter plus the two
 * language renderings extracted from its body.
 */
export interface ParsedChunk {
  /** Front-matter data of the chunk file (cast, not validated). */
  fm: ChunkFrontmatter;
  /** Text from the body's `**EN:**` line; empty string when no such line exists. */
  content_en: string;
  /** Text from the body's `**PT-BR:**` line; empty string when no such line exists. */
  content_pt: string;
}
/**
 * Shape of a document's `_index.json`, as returned by `readIndex`.
 *
 * `readAllChunks` walks `chunks` in array order and loads each entry by its
 * `chunk_id`.
 */
export interface ChunkIndex {
  doc_id: string;
  schema_version: string;
  total_pages: number;
  total_chunks: number;

  // Build provenance (free-form strings as far as this module is concerned).
  build_approach: string;
  build_model: string;
  build_at: string;

  /** One summary entry per chunk. */
  chunks: {
    /** Identifier used by `readChunk` to build the `<chunk_id>.md` file name. */
    chunk_id: string;
    type: string;
    page: number;
    order_in_page: number;
    order_global: number;
    /** Presumably the chunk's file path inside the archive — not read by this module. */
    file: string;
    bbox: { x: number; y: number; w: number; h: number };
    preview: string;
  }[];
}
/**
 * Resolve the directory holding one document's chunk archive:
 * `<RAW>/<docId>--subagent`.
 *
 * @param docId - Document identifier, used verbatim as the folder-name prefix.
 * @returns The joined path; nothing is checked on disk.
 */
function archivePath(docId: string): string {
  const folderName = `${docId}--subagent`;
  return path.join(RAW, folderName);
}
/**
 * Report whether a document has a chunk archive, judged by whether its
 * `_index.json` is accessible.
 *
 * @param docId - Document identifier.
 * @returns `true` when `fs.access` succeeds on the index file; `false` on any
 *   failure (missing file, permissions, or an error while building the path).
 */
export async function hasChunks(docId: string): Promise<boolean> {
  let found = false;
  try {
    const indexFile = path.join(archivePath(docId), "_index.json");
    await fs.access(indexFile);
    found = true;
  } catch {
    // Every failure is treated as "no chunks"; `found` stays false.
  }
  return found;
}
/**
 * Load and parse a document's `_index.json`.
 *
 * The parsed value is only accepted when it is an object whose `chunks`
 * property is an array. Without that check, a file holding valid JSON of the
 * wrong shape (for example `{}` or `[]`) would be handed back as a
 * `ChunkIndex` and make callers that iterate `chunks` throw.
 * Individual fields and entries are NOT validated beyond that.
 *
 * @param docId - Document identifier.
 * @returns The index, or `null` when the file cannot be read, is not valid
 *   JSON, or does not have the minimal expected shape.
 */
export async function readIndex(docId: string): Promise<ChunkIndex | null> {
  try {
    const indexFile = path.join(archivePath(docId), "_index.json");
    const parsed: unknown = JSON.parse(await fs.readFile(indexFile, "utf-8"));

    // Minimal structural guard: a non-null object carrying a `chunks` array.
    if (typeof parsed !== "object" || parsed === null) return null;
    if (!Array.isArray((parsed as { chunks?: unknown }).chunks)) return null;

    return parsed as ChunkIndex;
  } catch {
    // Missing/unreadable file or malformed JSON: deliberately reported as "no index".
    return null;
  }
}
/**
 * Pull the English and Brazilian-Portuguese text out of a chunk body.
 *
 * The body is scanned line by line (split on "\n", each line trimmed). A line
 * starting with `**EN:**` sets the English text and a line starting with
 * `**PT-BR:**` sets the Portuguese text; the marker and the whitespace after
 * it are removed.
 *
 * Only the text on the marker line itself is captured: following lines
 * without a marker are ignored, and a later marker line replaces an earlier
 * one for the same language.
 *
 * @param body - Markdown body of a chunk file (front matter already removed).
 * @returns `{ en, pt }`, each an empty string when its marker never appears.
 */
function splitBilingual(body: string): { en: string; pt: string } {
  // Checked in this order for every line; the first matching marker wins.
  const markers = [
    { key: "en", tag: "**EN:**", strip: /^\*\*EN:\*\*\s*/ },
    { key: "pt", tag: "**PT-BR:**", strip: /^\*\*PT-BR:\*\*\s*/ },
  ] as const;

  const out = { en: "", pt: "" };
  for (const rawLine of body.split("\n")) {
    const text = rawLine.trim();
    const hit = markers.find((marker) => text.startsWith(marker.tag));
    if (hit) out[hit.key] = text.replace(hit.strip, "");
  }
  return out;
}
/**
 * Read one chunk file, `<archive>/chunks/<chunkId>.md`, and split it into
 * front matter and bilingual body text.
 *
 * The front matter is parsed with gray-matter and cast to `ChunkFrontmatter`
 * without validation; the body goes through `splitBilingual`.
 *
 * @param docId - Document identifier.
 * @param chunkId - Chunk identifier, i.e. the file name without `.md`.
 * @returns The parsed chunk, or `null` if anything fails while reading or parsing.
 */
export async function readChunk(docId: string, chunkId: string): Promise<ParsedChunk | null> {
  try {
    const chunkFile = path.join(archivePath(docId), "chunks", `${chunkId}.md`);
    const { data, content } = matter(await fs.readFile(chunkFile, "utf-8"));
    const bilingual = splitBilingual(content);
    return {
      fm: data as ChunkFrontmatter,
      content_en: bilingual.en,
      content_pt: bilingual.pt,
    };
  } catch {
    // Best-effort read: every failure is reported as "no chunk".
    return null;
  }
}
/**
 * Load every chunk listed in a document's index, in index order.
 *
 * Chunk files are read in parallel batches rather than one `await` per file,
 * so a document's load time is no longer the sum of all individual reads.
 * Concurrency is capped because `readChunk` turns every error into `null`:
 * running out of file descriptors under unbounded parallelism would silently
 * drop chunks.
 *
 * @param docId - Document identifier.
 * @returns The chunks that could be read; empty when there is no index.
 *   Entries whose file cannot be read or parsed are skipped.
 */
export async function readAllChunks(docId: string): Promise<ParsedChunk[]> {
  const idx = await readIndex(docId);
  if (!idx) return [];

  // Upper bound on chunk files being read at the same time.
  const BATCH_SIZE = 32;

  const chunks: ParsedChunk[] = [];
  for (let start = 0; start < idx.chunks.length; start += BATCH_SIZE) {
    const batch = idx.chunks.slice(start, start + BATCH_SIZE);
    // Promise.all keeps input order, so the result still follows the index.
    const loaded = await Promise.all(
      batch.map((entry) => readChunk(docId, entry.chunk_id)),
    );
    for (const chunk of loaded) {
      if (chunk) chunks.push(chunk);
    }
  }
  return chunks;
}
/**
 * Load all chunks of a document and bucket them by `fm.page`.
 *
 * Map keys appear in the order their page is first seen among the chunks
 * returned by `readAllChunks`; each bucket is sorted ascending by
 * `fm.order_in_page`.
 *
 * @param docId - Document identifier.
 * @returns Page number → chunks of that page; an empty map when no chunks are available.
 */
export async function readChunksByPage(docId: string): Promise<Map<number, ParsedChunk[]>> {
  const grouped = new Map<number, ParsedChunk[]>();

  for (const chunk of await readAllChunks(docId)) {
    const bucket = grouped.get(chunk.fm.page);
    if (bucket === undefined) {
      grouped.set(chunk.fm.page, [chunk]);
    } else {
      bucket.push(chunk);
    }
  }

  grouped.forEach((bucket) => {
    bucket.sort((left, right) => left.fm.order_in_page - right.fm.order_in_page);
  });
  return grouped;
}