140 lines
4 KiB
TypeScript
140 lines
4 KiB
TypeScript
/**
 * Read agentic chunks from the raw/<doc-id>--subagent/ directory on the filesystem.
 *
 * The DB (Postgres + pgvector) is the retrieval layer, but the filesystem
 * remains the source of truth. Server components use these helpers when they
 * need full chunk content (not just hits from retrieval).
 */
|
|
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
import matter from "gray-matter";
|
|
import { UFO_ROOT } from "./wiki";
|
|
|
|
/** Path of the `raw` directory under `UFO_ROOT`; `archivePath` resolves each document's archive directory beneath it. */
export const RAW = path.join(UFO_ROOT, "raw");
|
|
|
|
/**
 * Shape that `readChunk` assigns to the frontmatter of a chunk file.
 *
 * The frontmatter is cast to this type, not validated at runtime, so a
 * malformed chunk file can produce values that do not match these
 * declarations. The section headings below only group related field names;
 * NOTE(review): field semantics are defined by whatever writes the chunk
 * files — confirm against that producer.
 */
export interface ChunkFrontmatter {
  // --- Identity and ordering ---
  chunk_id: string;
  type: string;
  /** Page the chunk belongs to; grouping key used by `readChunksByPage`. */
  page: number;
  /** Sort key used by `readChunksByPage` to order chunks within one page. */
  order_in_page: number;
  order_global: number;

  // --- Layout and classification ---
  bbox: {
    x: number;
    y: number;
    w: number;
    h: number;
  } | null;
  classification: string | null;
  formatting?: string[];
  cross_page_hint?: string;

  // --- Links to other chunks and assets ---
  prev_chunk?: string | null;
  next_chunk?: string | null;
  related_image?: string | null;
  related_table?: string | null;

  // --- OCR ---
  ocr_confidence?: number | null;
  ocr_source_lines?: number[];

  // --- Redaction ---
  redaction_code?: string | null;
  redaction_inferred_content_type?: string | null;

  // --- Image analysis ---
  image_type?: string | null;
  ufo_anomaly_detected?: boolean;
  ufo_anomaly_type?: string | null;
  ufo_anomaly_rationale?: string | null;
  cryptid_anomaly_detected?: boolean;
  cryptid_anomaly_type?: string | null;
  cryptid_anomaly_rationale?: string | null;
  image_description_en?: string | null;
  image_description_pt_br?: string | null;
  extracted_text?: string | null;
  source_png?: string;
}
|
|
|
|
/** A chunk file after parsing: its frontmatter plus the two language variants of its body. */
export interface ParsedChunk {
  /** Frontmatter of the chunk file (cast, not validated, by `readChunk`). */
  fm: ChunkFrontmatter;
  /** Text taken from the body line starting with `**EN:**`; empty string when there is none. */
  content_en: string;
  /** Text taken from the body line starting with `**PT-BR:**`; empty string when there is none. */
  content_pt: string;
}
|
|
|
|
/**
 * Shape of a document's `_index.json` manifest as returned by `readIndex`.
 *
 * NOTE(review): the meaning of the build_* and total_* fields is defined by
 * whatever produces the manifest — confirm against that producer.
 */
export interface ChunkIndex {
  doc_id: string;
  schema_version: string;
  total_pages: number;
  total_chunks: number;
  build_approach: string;
  build_model: string;
  build_at: string;
  /** One entry per chunk; `readAllChunks` walks this list in order and loads each chunk by `chunk_id`. */
  chunks: {
    chunk_id: string;
    type: string;
    page: number;
    order_in_page: number;
    order_global: number;
    /** Not read by the helpers in this module, which derive the chunk path from `chunk_id`. */
    file: string;
    bbox: {
      x: number;
      y: number;
      w: number;
      h: number;
    };
    preview: string;
  }[];
}
|
|
|
|
/**
 * Resolve the archive directory of a document: `<RAW>/<docId>--subagent`.
 *
 * @param docId - Document identifier; used verbatim as the directory-name prefix.
 * @returns The joined path of the document's archive directory.
 */
function archivePath(docId: string): string {
  const dirName = `${docId}--subagent`;
  return path.join(RAW, dirName);
}
|
|
|
|
/**
 * Report whether a document has a chunk archive, judged by whether its
 * `_index.json` can be accessed.
 *
 * @param docId - Document identifier used to locate the archive directory.
 * @returns `true` when the index file is accessible; `false` on any failure.
 */
export async function hasChunks(docId: string): Promise<boolean> {
  let found = true;
  try {
    // Path construction stays inside the try so that every failure maps to `false`.
    const indexFile = path.join(archivePath(docId), "_index.json");
    await fs.access(indexFile);
  } catch {
    found = false;
  }
  return found;
}
|
|
|
|
/**
 * Load and parse the `_index.json` manifest of a document's chunk archive.
 *
 * Best-effort: any failure (missing or unreadable file, malformed JSON)
 * yields `null` instead of throwing. The parsed value is also required to be
 * an object carrying a `chunks` array before it is returned: `JSON.parse`
 * can produce any JSON value, and `readAllChunks` iterates `chunks` without
 * further checks, so a wrong-shaped manifest would otherwise surface as a
 * TypeError far from its cause.
 *
 * @param docId - Document identifier used to locate the archive directory.
 * @returns The parsed index, or `null` when it is unavailable or malformed.
 */
export async function readIndex(docId: string): Promise<ChunkIndex | null> {
  try {
    const indexFile = path.join(archivePath(docId), "_index.json");
    const parsed: unknown = JSON.parse(await fs.readFile(indexFile, "utf-8"));
    // Minimal structural check only; individual fields are still trusted as declared.
    if (typeof parsed !== "object" || parsed === null) return null;
    if (!Array.isArray((parsed as { chunks?: unknown }).chunks)) return null;
    return parsed as ChunkIndex;
  } catch {
    return null;
  }
}
|
|
|
|
/**
 * Extract the English and Brazilian-Portuguese text from a chunk body.
 *
 * Each line is trimmed and checked for a leading `**EN:**` or `**PT-BR:**`
 * marker; the remainder of that single line (marker and following whitespace
 * removed) becomes the value. Lines without a marker are ignored, and when a
 * marker occurs more than once the last occurrence wins.
 *
 * @param body - Chunk body text (frontmatter already removed).
 * @returns The two extracted strings; each is `""` when its marker is absent.
 */
function splitBilingual(body: string): { en: string; pt: string } {
  const result = { en: "", pt: "" };
  body.split("\n").forEach((rawLine) => {
    const trimmed = rawLine.trim();
    if (trimmed.startsWith("**EN:**")) {
      result.en = trimmed.replace(/^\*\*EN:\*\*\s*/, "");
      return;
    }
    if (trimmed.startsWith("**PT-BR:**")) {
      result.pt = trimmed.replace(/^\*\*PT-BR:\*\*\s*/, "");
    }
  });
  return result;
}
|
|
|
|
/**
 * Read and parse one chunk file, `<archive>/chunks/<chunkId>.md`.
 *
 * The file's frontmatter is separated from its body with `matter`, and the
 * body is split into its EN / PT-BR variants. The frontmatter is cast to
 * `ChunkFrontmatter` without runtime validation.
 *
 * @param docId - Document identifier used to locate the archive directory.
 * @param chunkId - Chunk identifier; used verbatim as the file name stem.
 * @returns The parsed chunk, or `null` on any failure (e.g. the file cannot be read or parsed).
 */
export async function readChunk(docId: string, chunkId: string): Promise<ParsedChunk | null> {
  let result: ParsedChunk | null = null;
  try {
    const chunkFile = path.join(archivePath(docId), "chunks", `${chunkId}.md`);
    const fileText = await fs.readFile(chunkFile, "utf-8");
    const { data, content } = matter(fileText);
    const bilingual = splitBilingual(content);
    result = {
      fm: data as ChunkFrontmatter,
      content_en: bilingual.en,
      content_pt: bilingual.pt,
    };
  } catch {
    result = null;
  }
  return result;
}
|
|
|
|
/**
 * Load every chunk listed in a document's index, in index order.
 *
 * Chunks are read one at a time. Entries whose chunk file cannot be read or
 * parsed are skipped, so the result may be shorter than the index.
 *
 * @param docId - Document identifier used to locate the archive directory.
 * @returns The successfully parsed chunks; `[]` when the document has no readable index.
 */
export async function readAllChunks(docId: string): Promise<ParsedChunk[]> {
  const collected: ParsedChunk[] = [];
  const index = await readIndex(docId);
  if (index) {
    for (const { chunk_id } of index.chunks) {
      const parsed = await readChunk(docId, chunk_id);
      if (parsed !== null) {
        collected.push(parsed);
      }
    }
  }
  return collected;
}
|
|
|
|
/**
 * Group a document's chunks by page.
 *
 * Map keys are `fm.page` values, inserted in the order in which
 * `readAllChunks` first yields a chunk for that page. Within each page the
 * chunks are sorted ascending by `fm.order_in_page`.
 *
 * @param docId - Document identifier used to locate the archive directory.
 * @returns A map from page number to that page's chunks; empty when no chunks could be read.
 */
export async function readChunksByPage(docId: string): Promise<Map<number, ParsedChunk[]>> {
  const grouped = new Map<number, ParsedChunk[]>();
  for (const chunk of await readAllChunks(docId)) {
    const pageNo = chunk.fm.page;
    const bucket = grouped.get(pageNo);
    if (bucket === undefined) {
      grouped.set(pageNo, [chunk]);
    } else {
      bucket.push(chunk);
    }
  }
  grouped.forEach((bucket) => {
    bucket.sort((left, right) => left.fm.order_in_page - right.fm.order_in_page);
  });
  return grouped;
}
|