/**
 * /api/admin/stats — Corpus-wide analytics.
 *
 * Mixes filesystem reads (always available) with DB queries (when retrieval
 * layer is up). Gracefully degrades when DB is offline.
 */
import fs from "node:fs/promises";
import path from "node:path";
import matter from "gray-matter";
import { UFO_ROOT, WIKI, listDocuments } from "@/lib/wiki";
import { pgQuery } from "@/lib/retrieval/db";

export const runtime = "nodejs";
export const dynamic = "force-dynamic";

/** Sub-directories of `<WIKI>/entities` whose `.md` files are counted. */
const ENTITY_CLASSES = [
  "people",
  "organizations",
  "locations",
  "events",
  "uap-objects",
  "vehicles",
  "operations",
  "concepts",
];

/** String-keyed tally, e.g. collection name -> number of documents. */
type Counter = Record<string, number>;

/**
 * Creates an empty tally without a prototype, so keys taken from document
 * front matter (e.g. "constructor") cannot collide with inherited
 * `Object.prototype` members.
 */
function createCounter(): Counter {
  return Object.create(null) as Counter;
}

/** Adds one to `counter[key]`, starting from zero for unseen keys. */
function bump(counter: Counter, key: string): void {
  counter[key] = (counter[key] ?? 0) + 1;
}

/**
 * Coerces a front-matter value to a number, mapping anything that is not a
 * finite number (missing field, free text, ...) to 0. Without this a single
 * malformed document would turn the running total into NaN, which
 * `JSON.stringify` emits as `null`.
 */
function toFiniteNumber(value: unknown): number {
  const n = Number(value);
  return Number.isFinite(n) ? n : 0;
}

/**
 * Serialises `data` as a JSON response.
 *
 * @param data   Any JSON-serialisable value.
 * @param status HTTP status code, 200 by default.
 * @returns A `Response` with a JSON content type and a two-minute cache lifetime.
 */
function json(data: unknown, status = 200) {
  return new Response(JSON.stringify(data), {
    status,
    // NOTE(review): "public" caching on an admin route — confirm this is intended.
    headers: { "content-type": "application/json", "cache-control": "public, max-age=120" },
  });
}

/**
 * Collects statistics that only need the filesystem.
 *
 * Reads the front matter of every document returned by `listDocuments()`,
 * counts entity files per class, and counts chunk files under the
 * `--subagent` directories in `<UFO_ROOT>/raw`. Unreadable files and missing
 * directories are skipped (best effort) rather than failing the request.
 *
 * @returns Totals and per-category breakdowns for the on-disk corpus.
 */
async function fsStats() {
  // Document collection breakdown
  const docIds = await listDocuments();
  const collections = createCounter();
  const documentClass = createCounter();
  const contentClassification = createCounter();
  let totalPages = 0;
  let totalRedactions = 0;

  for (const id of docIds) {
    try {
      const raw = await fs.readFile(path.join(WIKI, "documents", `${id}.md`), "utf-8");
      const fm = matter(raw).data as Record<string, unknown>;

      bump(collections, String(fm.collection ?? "uncategorized"));
      bump(documentClass, String(fm.document_class ?? "unknown"));
      totalPages += toFiniteNumber(fm.page_count ?? 0);

      const cc = fm.content_classification;
      if (Array.isArray(cc)) {
        for (const tag of cc) {
          bump(contentClassification, String(tag));
        }
      }

      // Two field names are accepted for the redaction count; the first one present wins.
      const reds = fm.redactions_count ?? fm.total_redactions ?? 0;
      totalRedactions += toFiniteNumber(reds);
    } catch {
      /* skip: unreadable or unparsable document */
    }
  }

  // Entity counts per class
  const entityCounts = createCounter();
  for (const cls of ENTITY_CLASSES) {
    try {
      const dir = await fs.readdir(path.join(WIKI, "entities", cls));
      entityCounts[cls] = dir.filter((f) => f.endsWith(".md")).length;
    } catch {
      // A missing class directory is reported as zero entities.
      entityCounts[cls] = 0;
    }
  }

  // Chunks on disk per --subagent dir
  const rawRoot = path.join(UFO_ROOT, "raw");
  let chunksOnDisk = 0;
  let docsRebuilt = 0;
  try {
    const archives = (await fs.readdir(rawRoot)).filter((e) => e.endsWith("--subagent"));
    docsRebuilt = archives.length;
    for (const a of archives) {
      try {
        const c = await fs.readdir(path.join(rawRoot, a, "chunks"));
        chunksOnDisk += c.filter((f) => f.startsWith("c")).length;
      } catch {
        /* skip: archive without a chunks directory */
      }
    }
  } catch {
    /* skip: raw root missing or unreadable */
  }

  return {
    documents_total: docIds.length,
    documents_rebuilt_v2: docsRebuilt,
    pages_total: totalPages,
    chunks_on_disk: chunksOnDisk,
    redactions_total: totalRedactions,
    collections,
    document_class: documentClass,
    content_classification: contentClassification,
    entity_counts: entityCounts,
    entities_total: Object.values(entityCounts).reduce((s, n) => s + n, 0),
  };
}

/** Row of the chunk-count-per-type query. */
interface ChunkTypeRow {
  type: string;
  count: number;
}

/** Row of the chunk-count-per-classification query. */
interface ClassificationRow {
  classification: string | null;
  count: number;
}

/** Row of the chunk-count-per-document query. */
interface DocChunkRow {
  doc_id: string;
  count: number;
}

/** Row of the chunk-count-per-anomaly-type query. */
interface AnomalyRow {
  anomaly_type: string | null;
  count: number;
}

/**
 * Collects statistics from the database.
 *
 * All queries are issued concurrently. If any of them rejects, the whole
 * result degrades to `{ ok: false, error }` instead of throwing, so the
 * route can still return the filesystem statistics.
 *
 * @returns `{ ok: true, ... }` with the query results, or `{ ok: false, error }`.
 */
async function dbStats() {
  try {
    const [
      core,
      chunkTypes,
      classifications,
      topDocs,
      ufoTypes,
      cryptidCount,
      embedReady,
    ] = await Promise.all([
      pgQuery<{ docs: number; chunks: number; entities: number; mentions: number }>(
        `SELECT (SELECT COUNT(*) FROM public.documents)::INT AS docs, (SELECT COUNT(*) FROM public.chunks)::INT AS chunks, (SELECT COUNT(*) FROM public.entities)::INT AS entities, (SELECT COUNT(*) FROM public.entity_mentions)::INT AS mentions`,
        [],
      ),
      pgQuery<ChunkTypeRow>(
        `SELECT type, COUNT(*)::INT AS count FROM public.chunks GROUP BY type ORDER BY count DESC LIMIT 20`,
        [],
      ),
      pgQuery<ClassificationRow>(
        `SELECT classification, COUNT(*)::INT AS count FROM public.chunks WHERE classification IS NOT NULL GROUP BY classification ORDER BY count DESC LIMIT 10`,
        [],
      ),
      pgQuery<DocChunkRow>(
        `SELECT doc_id, COUNT(*)::INT AS count FROM public.chunks GROUP BY doc_id ORDER BY count DESC LIMIT 10`,
        [],
      ),
      pgQuery<AnomalyRow>(
        `SELECT ufo_anomaly_type AS anomaly_type, COUNT(*)::INT AS count FROM public.chunks WHERE ufo_anomaly = TRUE GROUP BY ufo_anomaly_type ORDER BY count DESC LIMIT 15`,
        [],
      ),
      pgQuery<{ count: number }>(
        `SELECT COUNT(*)::INT AS count FROM public.chunks WHERE cryptid_anomaly = TRUE`,
        [],
      ),
      pgQuery<{ count: number }>(
        `SELECT COUNT(*)::INT AS count FROM public.chunks WHERE embedding IS NOT NULL`,
        [],
      ),
    ]);

    return {
      ok: true,
      core: core[0],
      chunk_types: chunkTypes,
      classifications,
      top_docs_by_chunks: topDocs,
      ufo_anomaly_types: ufoTypes,
      cryptid_count: cryptidCount[0]?.count ?? 0,
      embedded_count: embedReady[0]?.count ?? 0,
    };
  } catch (e: unknown) {
    // A rejection is not guaranteed to be an Error instance; narrow before reading `.message`.
    return { ok: false, error: e instanceof Error ? e.message : String(e) };
  }
}

/**
 * GET /api/admin/stats
 *
 * Gathers the filesystem and database statistics concurrently and returns
 * them as `{ fs, db }`.
 */
export async function GET(): Promise<Response> {
  const [fsResult, dbResult] = await Promise.all([fsStats(), dbStats()]);
  return json({ fs: fsResult, db: dbResult });
}