/**
 * Read agentic chunks from the raw/<docId>--subagent/ filesystem archive.
 *
 * The DB (Postgres + pgvector) is the retrieval layer, but the filesystem
 * remains source-of-truth. Server components use these helpers when they
 * need full chunk content (not just hits from retrieval).
 */
import fs from "node:fs/promises";
import path from "node:path";
import matter from "gray-matter";
import { UFO_ROOT } from "./wiki";

/** Directory that holds one `<docId>--subagent` archive per document. */
export const RAW = path.join(UFO_ROOT, "raw");

/**
 * YAML frontmatter of a single chunk file (`chunks/<chunk_id>.md`).
 *
 * NOTE(review): `readChunk` casts the parsed frontmatter to this type without
 * runtime validation, so the fields describe the expected shape only.
 */
export interface ChunkFrontmatter {
  chunk_id: string;
  type: string;
  page: number;
  order_in_page: number;
  order_global: number;
  bbox: { x: number; y: number; w: number; h: number } | null;
  classification: string | null;
  formatting?: string[];
  cross_page_hint?: string;
  prev_chunk?: string | null;
  next_chunk?: string | null;
  related_image?: string | null;
  related_table?: string | null;
  ocr_confidence?: number | null;
  ocr_source_lines?: number[];
  redaction_code?: string | null;
  redaction_inferred_content_type?: string | null;
  image_type?: string | null;
  ufo_anomaly_detected?: boolean;
  ufo_anomaly_type?: string | null;
  ufo_anomaly_rationale?: string | null;
  cryptid_anomaly_detected?: boolean;
  cryptid_anomaly_type?: string | null;
  cryptid_anomaly_rationale?: string | null;
  image_description_en?: string | null;
  image_description_pt_br?: string | null;
  extracted_text?: string | null;
  source_png?: string;
}

/** A chunk file after parsing: frontmatter plus the two language bodies. */
export interface ParsedChunk {
  fm: ChunkFrontmatter;
  content_en: string;
  content_pt: string;
}

/**
 * Shape of the archive's `_index.json`.
 *
 * NOTE(review): `readIndex` casts the parsed JSON to this type without
 * runtime validation.
 */
export interface ChunkIndex {
  doc_id: string;
  schema_version: string;
  total_pages: number;
  total_chunks: number;
  build_approach: string;
  build_model: string;
  build_at: string;
  chunks: Array<{
    chunk_id: string;
    type: string;
    page: number;
    order_in_page: number;
    order_global: number;
    file: string;
    bbox: { x: number; y: number; w: number; h: number };
    preview: string;
  }>;
}

/** Absolute path of the archive directory for `docId`. */
function archivePath(docId: string): string {
  return path.join(RAW, `${docId}--subagent`);
}

/**
 * Report whether a chunk archive exists for `docId`.
 *
 * @returns `true` when `_index.json` is accessible inside the archive,
 *   `false` on any access error.
 */
export async function hasChunks(docId: string): Promise<boolean> {
  try {
    await fs.access(path.join(archivePath(docId), "_index.json"));
    return true;
  } catch {
    return false;
  }
}

/**
 * Drop a leading `---` ... `---` frontmatter block and trim the remainder.
 * Text without a complete frontmatter block is returned trimmed, unchanged.
 */
function stripFrontmatter(raw: string): string {
  if (!raw.startsWith("---")) return raw.trim();

  // Index of the newline that precedes the closing fence.
  const closing = raw.indexOf("\n---", 3);
  if (closing === -1) return raw.trim();

  // The body starts after the newline that terminates the closing fence line.
  const fenceLineEnd = raw.indexOf("\n", closing + 1);
  // No newline after the closing fence: the file is frontmatter only, so the
  // body is empty (rather than the whole file, frontmatter included).
  if (fenceLineEnd === -1) return "";

  return raw.slice(fenceLineEnd + 1).trim();
}

/**
 * Clean LLM-generated reading version (raw/<docId>--subagent/reading.md), if
 * it exists.
 *
 * @returns The Markdown body without frontmatter, or `null` when the file
 *   cannot be read.
 */
export async function readReadingVersion(docId: string): Promise<string | null> {
  try {
    const raw = await fs.readFile(path.join(archivePath(docId), "reading.md"), "utf-8");
    return stripFrontmatter(raw);
  } catch {
    return null;
  }
}

/**
 * Load and parse the archive's `_index.json`.
 *
 * @returns The parsed index, or `null` when the file is missing, unreadable,
 *   or not valid JSON.
 */
export async function readIndex(docId: string): Promise<ChunkIndex | null> {
  try {
    const buf = await fs.readFile(path.join(archivePath(docId), "_index.json"), "utf-8");
    return JSON.parse(buf) as ChunkIndex;
  } catch {
    return null;
  }
}

/**
 * Extract the English and Portuguese text from a chunk body.
 *
 * Only the lines that themselves start with `**EN:**` / `**PT-BR:**` (after
 * trimming) are captured, with the marker removed; following lines are
 * ignored. If a marker appears more than once, the last occurrence wins.
 * A missing marker yields an empty string for that language.
 */
function splitBilingual(body: string): { en: string; pt: string } {
  let en = "";
  let pt = "";
  for (const line of body.split("\n")) {
    const s = line.trim();
    if (s.startsWith("**EN:**")) {
      en = s.replace(/^\*\*EN:\*\*\s*/, "");
    } else if (s.startsWith("**PT-BR:**")) {
      pt = s.replace(/^\*\*PT-BR:\*\*\s*/, "");
    }
  }
  return { en, pt };
}

/**
 * Read and parse a single chunk file (`chunks/<chunkId>.md`).
 *
 * @returns The parsed chunk, or `null` when the file cannot be read or its
 *   frontmatter cannot be parsed.
 */
export async function readChunk(docId: string, chunkId: string): Promise<ParsedChunk | null> {
  try {
    const p = path.join(archivePath(docId), "chunks", `${chunkId}.md`);
    const raw = await fs.readFile(p, "utf-8");
    const parsed = matter(raw);
    const { en, pt } = splitBilingual(parsed.content);
    return { fm: parsed.data as ChunkFrontmatter, content_en: en, content_pt: pt };
  } catch {
    return null;
  }
}

/**
 * Read every chunk listed in the document's index, in index order.
 *
 * Chunks whose file cannot be read are skipped. Reads are sequential, so at
 * most one chunk file is being read at a time.
 *
 * @returns The parsed chunks, or an empty array when there is no index.
 */
export async function readAllChunks(docId: string): Promise<ParsedChunk[]> {
  const idx = await readIndex(docId);
  if (!idx) return [];

  const chunks: ParsedChunk[] = [];
  for (const entry of idx.chunks) {
    const c = await readChunk(docId, entry.chunk_id);
    if (c) chunks.push(c);
  }
  return chunks;
}

/**
 * Return chunks grouped by page in reading order.
 *
 * Keys are the `page` values from the chunk frontmatter; each list is sorted
 * by ascending `order_in_page`. Pages are inserted into the map in the order
 * they are first encountered in the index.
 */
export async function readChunksByPage(docId: string): Promise<Map<number, ParsedChunk[]>> {
  const all = await readAllChunks(docId);
  const byPage = new Map<number, ParsedChunk[]>();

  for (const c of all) {
    let bucket = byPage.get(c.fm.page);
    if (!bucket) {
      bucket = [];
      byPage.set(c.fm.page, bucket);
    }
    bucket.push(c);
  }

  for (const list of byPage.values()) {
    list.sort((a, b) => a.fm.order_in_page - b.fm.order_in_page);
  }
  return byPage;
}