disclosure-bureau/web/lib/chunks.ts

/**
 * Read agentic chunks from raw/<doc-id>--subagent/ filesystem.
 *
 * The DB (Postgres + pgvector) is the retrieval layer, but the filesystem
 * remains source-of-truth. Server components use these helpers when they
 * need full chunk content (not just hits from retrieval).
 */
import fs from "node:fs/promises";
import path from "node:path";
import matter from "gray-matter";
import { UFO_ROOT } from "./wiki";

/**
 * Path of the `raw` directory directly under `UFO_ROOT`. The per-document
 * archive directories (`<doc-id>--subagent`) are resolved relative to it.
 */
export const RAW = path.join(UFO_ROOT, "raw");

/**
 * Expected shape of the frontmatter block of a chunk markdown file.
 *
 * `readChunk` casts the gray-matter `data` object to this type without any
 * runtime validation, so these field types describe what is expected on disk,
 * not something that has been checked.
 */
export interface ChunkFrontmatter {
  /** Chunk identifier. NOTE(review): presumably equal to the `<chunk_id>.md` file name — confirm. */
  chunk_id: string;
  /** Chunk kind. NOTE(review): the set of valid values is not defined in this file. */
  type: string;
  /** Page the chunk belongs to; used as the grouping key by `readChunksByPage`. */
  page: number;
  /** Ascending sort key within a page; used by `readChunksByPage`. */
  order_in_page: number;
  /** NOTE(review): presumably the chunk's position across the whole document — confirm. */
  order_global: number;
  /** Bounding box, or null. NOTE(review): units and origin are not established here — confirm with the producer. */
  bbox: { x: number; y: number; w: number; h: number } | null;
  classification: string | null;
  formatting?: string[];
  cross_page_hint?: string;
  // Links to other chunks/assets. NOTE(review): presumably chunk ids or file
  // names — this file never dereferences them, so confirm before relying on it.
  prev_chunk?: string | null;
  next_chunk?: string | null;
  related_image?: string | null;
  related_table?: string | null;
  // OCR metadata (not read by any helper in this file).
  ocr_confidence?: number | null;
  ocr_source_lines?: number[];
  // Redaction metadata (not read by any helper in this file).
  redaction_code?: string | null;
  redaction_inferred_content_type?: string | null;
  // Image-related metadata, including anomaly flags and bilingual descriptions
  // (not read by any helper in this file).
  image_type?: string | null;
  ufo_anomaly_detected?: boolean;
  ufo_anomaly_type?: string | null;
  ufo_anomaly_rationale?: string | null;
  cryptid_anomaly_detected?: boolean;
  cryptid_anomaly_type?: string | null;
  cryptid_anomaly_rationale?: string | null;
  image_description_en?: string | null;
  image_description_pt_br?: string | null;
  extracted_text?: string | null;
  /** NOTE(review): presumably the page image the chunk was derived from — confirm. */
  source_png?: string;
}

/** A chunk file after parsing: its frontmatter plus the two language bodies. */
export interface ParsedChunk {
  /** Frontmatter data as parsed by gray-matter (cast, not validated). */
  fm: ChunkFrontmatter;
  /** Text of the `**EN:**` line of the body; empty string when there is none. */
  content_en: string;
  /** Text of the `**PT-BR:**` line of the body; empty string when there is none. */
  content_pt: string;
}

/**
 * Expected shape of the `_index.json` file stored in a document's archive
 * directory. `readIndex` produces it by casting the `JSON.parse` result.
 */
export interface ChunkIndex {
  doc_id: string;
  schema_version: string;
  total_pages: number;
  total_chunks: number;
  // Build provenance. NOTE(review): formats (e.g. whether `build_at` is an
  // ISO timestamp) are not established in this file — confirm with the producer.
  build_approach: string;
  build_model: string;
  build_at: string;
  /**
   * One entry per chunk. `readAllChunks` walks this array in order and loads
   * each chunk by its `chunk_id`.
   */
  chunks: Array<{
    chunk_id: string;
    type: string;
    page: number;
    order_in_page: number;
    order_global: number;
    /** NOTE(review): not read in this file; chunks are located via `chunk_id` instead. */
    file: string;
    bbox: { x: number; y: number; w: number; h: number };
    preview: string;
  }>;
}

/**
 * Resolve the archive directory of a document: `<RAW>/<docId>--subagent`.
 *
 * The id is interpolated into a filesystem path, so the resolved directory is
 * checked to still lie inside `RAW`. An id carrying `../` segments (for
 * example one taken from a request URL) would otherwise let callers probe or
 * read files outside the archive root.
 *
 * @param docId - Document identifier.
 * @returns Path of the document's archive directory.
 * @throws Error when the resolved directory falls outside `RAW`. The callers
 *   in this file invoke this helper inside their `try` blocks, so for them a
 *   rejected id behaves like a missing archive.
 */
function archivePath(docId: string): string {
  const dir = path.join(RAW, `${docId}--subagent`);
  // `path.relative` yields "..", "../…" or (across Windows drives) an absolute
  // path exactly when `dir` is not located underneath RAW.
  const rel = path.relative(RAW, dir);
  const escapesRoot =
    rel === "" || rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
  if (escapesRoot) {
    throw new Error(`Invalid document id: ${JSON.stringify(docId)}`);
  }
  return dir;
}

/**
 * Report whether a document has a chunk archive on disk.
 *
 * The check is an `fs.access` probe for `_index.json` inside the document's
 * archive directory; any failure (missing file, permissions, ...) yields false.
 *
 * @param docId - Document identifier used to locate the archive directory.
 * @returns true when `_index.json` is accessible, false otherwise.
 */
export async function hasChunks(docId: string): Promise<boolean> {
  let found = false;
  try {
    const indexFile = path.join(archivePath(docId), "_index.json");
    await fs.access(indexFile);
    found = true;
  } catch {
    // Any failure is treated as "no chunks"; the error itself is discarded.
  }
  return found;
}

/**
 * Load and parse a document's `_index.json`.
 *
 * The parsed JSON is only accepted when it is an object whose `chunks`
 * property is an array. Without that check a truncated or hand-edited index
 * (e.g. `{}`) would be returned as a `ChunkIndex` and make consumers that
 * iterate `chunks` throw.
 *
 * @param docId - Document identifier used to locate the archive directory.
 * @returns The index, or null when the file is missing, unreadable, not valid
 *   JSON, or does not have the minimal expected shape.
 */
export async function readIndex(docId: string): Promise<ChunkIndex | null> {
  try {
    const buf = await fs.readFile(path.join(archivePath(docId), "_index.json"), "utf-8");
    const parsed: unknown = JSON.parse(buf);
    if (typeof parsed !== "object" || parsed === null) return null;
    if (!Array.isArray((parsed as { chunks?: unknown }).chunks)) return null;
    // Remaining fields are trusted as written by the index producer.
    return parsed as ChunkIndex;
  } catch {
    // Best-effort read: every failure is reported as "no index".
    return null;
  }
}

/**
 * Pull the English and Portuguese texts out of a chunk body.
 *
 * Each line is trimmed and inspected on its own: a line starting with
 * `**EN:**` supplies the English text and one starting with `**PT-BR:**`
 * supplies the Portuguese text (marker and the whitespace after it removed).
 * Only the marker line itself is captured; other lines are ignored. When a
 * marker occurs more than once the last occurrence wins.
 *
 * @param body - Markdown body of a chunk file (frontmatter already removed).
 * @returns Both texts; a language with no marker line is the empty string.
 */
function splitBilingual(body: string): { en: string; pt: string } {
  const result = { en: "", pt: "" };
  body
    .split("\n")
    .map((rawLine) => rawLine.trim())
    .forEach((text) => {
      if (text.startsWith("**EN:**")) {
        result.en = text.replace(/^\*\*EN:\*\*\s*/, "");
        return;
      }
      if (text.startsWith("**PT-BR:**")) {
        result.pt = text.replace(/^\*\*PT-BR:\*\*\s*/, "");
      }
    });
  return result;
}

/**
 * Read and parse a single chunk file: `<archive>/chunks/<chunkId>.md`.
 *
 * The frontmatter is parsed with gray-matter and cast to `ChunkFrontmatter`
 * without validation; the body is split into its English and Portuguese texts.
 *
 * The chunk id is interpolated into a filesystem path, so the resolved file is
 * checked to still lie inside the archive's `chunks` directory. An id carrying
 * `../` segments is rejected instead of being read from elsewhere on disk.
 *
 * @param docId - Document identifier used to locate the archive directory.
 * @param chunkId - Chunk identifier; the file name without the `.md` suffix.
 * @returns The parsed chunk, or null when the id escapes the chunks directory
 *   or the file cannot be read or parsed.
 */
export async function readChunk(docId: string, chunkId: string): Promise<ParsedChunk | null> {
  try {
    const chunksDir = path.join(archivePath(docId), "chunks");
    const p = path.join(chunksDir, `${chunkId}.md`);
    // "..", "../…" or an absolute result means `p` is not underneath chunksDir.
    const rel = path.relative(chunksDir, p);
    if (rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) return null;
    const raw = await fs.readFile(p, "utf-8");
    const parsed = matter(raw);
    const { en, pt } = splitBilingual(parsed.content);
    return { fm: parsed.data as ChunkFrontmatter, content_en: en, content_pt: pt };
  } catch {
    // Best-effort read: every failure is reported as "no such chunk".
    return null;
  }
}

/**
 * Load every chunk listed in a document's index, in index order.
 *
 * Chunk files are read in bounded batches: reads within a batch run
 * concurrently, batches run one after another. This avoids paying one full
 * filesystem round-trip per chunk sequentially while keeping the number of
 * simultaneously open files small. `Promise.all` preserves input order, so
 * the result order is the same as reading the chunks one by one.
 *
 * @param docId - Document identifier used to locate the archive directory.
 * @returns The parsed chunks; entries whose file cannot be read are skipped.
 *   Empty when the document has no index or the index has no `chunks` array.
 */
export async function readAllChunks(docId: string): Promise<ParsedChunk[]> {
  const idx = await readIndex(docId);
  if (!idx || !Array.isArray(idx.chunks)) return [];
  // Upper bound on concurrent file reads; keeps file-descriptor usage modest.
  const batchSize = 32;
  const chunks: ParsedChunk[] = [];
  for (let start = 0; start < idx.chunks.length; start += batchSize) {
    const batch = idx.chunks.slice(start, start + batchSize);
    const loaded = await Promise.all(batch.map((entry) => readChunk(docId, entry.chunk_id)));
    for (const chunk of loaded) {
      if (chunk) chunks.push(chunk);
    }
  }
  return chunks;
}

/**
 * Load all chunks of a document and bucket them by page.
 *
 * Map keys are the `page` values from the chunk frontmatter, inserted in the
 * order pages are first encountered; each bucket is sorted ascending by
 * `order_in_page`.
 *
 * @param docId - Document identifier used to locate the archive directory.
 * @returns Map from page to that page's chunks in reading order.
 */
export async function readChunksByPage(docId: string): Promise<Map<number, ParsedChunk[]>> {
  const grouped = new Map<number, ParsedChunk[]>();
  for (const chunk of await readAllChunks(docId)) {
    const pageNo = chunk.fm.page;
    const bucket = grouped.get(pageNo);
    if (bucket === undefined) {
      grouped.set(pageNo, [chunk]);
    } else {
      bucket.push(chunk);
    }
  }
  grouped.forEach((bucket) => {
    bucket.sort((left, right) => left.fm.order_in_page - right.fm.order_in_page);
  });
  return grouped;
}
baseline: Disclosure Bureau pipeline + Next.js UI + Supabase stack 2026-05-18 01:44:36 +00:00			`/**`
			`* Read agentic chunks from raw/<doc-id>--subagent/ filesystem.`
			`*`
			`* The DB (Postgres + pgvector) is the retrieval layer, but the filesystem`
			`* remains source-of-truth. Server components use these helpers when they`
			`* need full chunk content (not just hits from retrieval).`
			`*/`
			`import fs from "node:fs/promises";`
			`import path from "node:path";`
			`import matter from "gray-matter";`
			`import { UFO_ROOT } from "./wiki";`

			`export const RAW = path.join(UFO_ROOT, "raw");`

			`export interface ChunkFrontmatter {`
			`chunk_id: string;`
			`type: string;`
			`page: number;`
			`order_in_page: number;`
			`order_global: number;`
			`bbox: { x: number; y: number; w: number; h: number } \| null;`
			`classification: string \| null;`
			`formatting?: string[];`
			`cross_page_hint?: string;`
			`prev_chunk?: string \| null;`
			`next_chunk?: string \| null;`
			`related_image?: string \| null;`
			`related_table?: string \| null;`
			`ocr_confidence?: number \| null;`
			`ocr_source_lines?: number[];`
			`redaction_code?: string \| null;`
			`redaction_inferred_content_type?: string \| null;`
			`image_type?: string \| null;`
			`ufo_anomaly_detected?: boolean;`
			`ufo_anomaly_type?: string \| null;`
			`ufo_anomaly_rationale?: string \| null;`
			`cryptid_anomaly_detected?: boolean;`
			`cryptid_anomaly_type?: string \| null;`
			`cryptid_anomaly_rationale?: string \| null;`
			`image_description_en?: string \| null;`
			`image_description_pt_br?: string \| null;`
			`extracted_text?: string \| null;`
			`source_png?: string;`
			`}`

			`export interface ParsedChunk {`
			`fm: ChunkFrontmatter;`
			`content_en: string;`
			`content_pt: string;`
			`}`

			`export interface ChunkIndex {`
			`doc_id: string;`
			`schema_version: string;`
			`total_pages: number;`
			`total_chunks: number;`
			`build_approach: string;`
			`build_model: string;`
			`build_at: string;`
			`chunks: Array<{`
			`chunk_id: string;`
			`type: string;`
			`page: number;`
			`order_in_page: number;`
			`order_global: number;`
			`file: string;`
			`bbox: { x: number; y: number; w: number; h: number };`
			`preview: string;`
			`}>;`
			`}`

			`function archivePath(docId: string): string {`
			return path.join(RAW, `${docId}--subagent`);
			`}`

			`export async function hasChunks(docId: string): Promise<boolean> {`
			`try {`
			`await fs.access(path.join(archivePath(docId), "_index.json"));`
			`return true;`
			`} catch {`
			`return false;`
			`}`
			`}`

			`export async function readIndex(docId: string): Promise<ChunkIndex \| null> {`
			`try {`
			`const buf = await fs.readFile(path.join(archivePath(docId), "_index.json"), "utf-8");`
			`return JSON.parse(buf) as ChunkIndex;`
			`} catch {`
			`return null;`
			`}`
			`}`

			`function splitBilingual(body: string): { en: string; pt: string } {`
			`let en = "";`
			`let pt = "";`
			`for (const line of body.split("\n")) {`
			`const s = line.trim();`
			`if (s.startsWith("**EN:**")) en = s.replace(/^\*\*EN:\*\*\s*/, "");`
			`else if (s.startsWith("**PT-BR:**")) pt = s.replace(/^\*\*PT-BR:\*\*\s*/, "");`
			`}`
			`return { en, pt };`
			`}`

			`export async function readChunk(docId: string, chunkId: string): Promise<ParsedChunk \| null> {`
			`try {`
			const p = path.join(archivePath(docId), "chunks", `${chunkId}.md`);
			`const raw = await fs.readFile(p, "utf-8");`
			`const parsed = matter(raw);`
			`const { en, pt } = splitBilingual(parsed.content);`
			`return { fm: parsed.data as ChunkFrontmatter, content_en: en, content_pt: pt };`
			`} catch {`
			`return null;`
			`}`
			`}`

			`export async function readAllChunks(docId: string): Promise<ParsedChunk[]> {`
			`const idx = await readIndex(docId);`
			`if (!idx) return [];`
			`const chunks: ParsedChunk[] = [];`
			`for (const entry of idx.chunks) {`
			`const c = await readChunk(docId, entry.chunk_id);`
			`if (c) chunks.push(c);`
			`}`
			`return chunks;`
			`}`

			`/** Return chunks grouped by page in reading order. */`
			`export async function readChunksByPage(docId: string): Promise<Map<number, ParsedChunk[]>> {`
			`const all = await readAllChunks(docId);`
			`const byPage = new Map<number, ParsedChunk[]>();`
			`for (const c of all) {`
			`if (!byPage.has(c.fm.page)) byPage.set(c.fm.page, []);`
			`byPage.get(c.fm.page)!.push(c);`
			`}`
			`for (const list of byPage.values()) {`
			`list.sort((a, b) => a.fm.order_in_page - b.fm.order_in_page);`
			`}`
			`return byPage;`
			`}`