disclosure-bureau/web/app/api/admin/stats/route.ts

189 lines
5.5 KiB
TypeScript
Raw Normal View History

/**
 * /api/admin/stats — corpus-wide analytics.
 *
 * Mixes filesystem reads (always available) with DB queries (available only
 * when the retrieval layer is up). Degrades gracefully when the DB is offline.
 */
import fs from "node:fs/promises";
import path from "node:path";
import matter from "gray-matter";
import { UFO_ROOT, WIKI, listDocuments } from "@/lib/wiki";
import { pgQuery } from "@/lib/retrieval/db";
// Route-level configuration exports. NOTE(review): these look like Next.js route
// segment config (the file lives at app/api/.../route.ts) — "nodejs" would select the
// Node.js runtime, which the node:fs / node:path imports in this file need, and
// "force-dynamic" would opt the handler out of static caching. Confirm against the
// framework version in use.
export const runtime = "nodejs";
export const dynamic = "force-dynamic";
// Entity class names. fsStats() treats each one as a sub-directory of
// `<WIKI>/entities/` and reports how many `.md` files it contains.
const ENTITY_CLASSES = [
"people",
"organizations",
"locations",
"events",
"uap-objects",
"vehicles",
"operations",
"concepts",
];
/**
 * Wraps a value in a JSON `Response`.
 *
 * @param data   Any JSON-serializable value; it is passed through `JSON.stringify`.
 * @param status HTTP status code of the response (defaults to 200).
 * @returns A `Response` with a JSON content type and a two-minute cache lifetime.
 */
function json(data: unknown, status = 200) {
  // NOTE(review): the response is marked `public` although the route sits under
  // /api/admin — confirm that shared caches are allowed to store it.
  const headers = {
    "content-type": "application/json",
    "cache-control": "public, max-age=120",
  };
  const body = JSON.stringify(data);
  return new Response(body, { status, headers });
}
/** Lists the entry names of `dir`, treating a missing or unreadable directory as empty. */
async function readDirOrEmpty(dir: string): Promise<string[]> {
  try {
    return await fs.readdir(dir);
  } catch {
    return [];
  }
}

/** Converts a front-matter value to a number; null/undefined and non-finite results become 0. */
function finiteOrZero(value: unknown): number {
  const n = Number(value ?? 0);
  return Number.isFinite(n) ? n : 0;
}

/** Adds one to the counter stored under `key`, starting from zero. */
function bumpTally(tally: Map<string, number>, key: string): void {
  tally.set(key, (tally.get(key) ?? 0) + 1);
}

/**
 * Gathers corpus statistics from the filesystem only.
 *
 * Reads the front matter of every document returned by `listDocuments()`, counts the
 * `.md` files of each entity class, and counts chunk files under the `--subagent`
 * directories in `<UFO_ROOT>/raw`.
 *
 * Everything after the initial `listDocuments()` call is best effort: a document that
 * cannot be read or parsed is skipped (it still counts towards `documents_total`), and
 * a missing directory counts as empty.
 *
 * @returns Totals plus per-collection, per-document-class, per-classification-tag and
 *          per-entity-class breakdowns.
 */
async function fsStats() {
  const docIds = await listDocuments();

  // Maps rather than plain objects: the keys come straight from front matter, and a
  // value such as "constructor" would otherwise hit an inherited Object member.
  const collections = new Map<string, number>();
  const documentClass = new Map<string, number>();
  const contentClassification = new Map<string, number>();
  let totalPages = 0;
  let totalRedactions = 0;

  for (const id of docIds) {
    try {
      const raw = await fs.readFile(path.join(WIKI, "documents", `${id}.md`), "utf-8");
      const fm = matter(raw).data as Record<string, unknown>;

      bumpTally(collections, String(fm.collection ?? "uncategorized"));
      bumpTally(documentClass, String(fm.document_class ?? "unknown"));

      // A malformed count (e.g. "n/a") must not turn the running total into NaN,
      // which JSON.stringify would then emit as null.
      totalPages += finiteOrZero(fm.page_count);

      const tags = fm.content_classification;
      if (Array.isArray(tags)) {
        for (const tag of tags) {
          bumpTally(contentClassification, String(tag));
        }
      }

      totalRedactions += finiteOrZero(fm.redactions_count ?? fm.total_redactions);
    } catch {
      /* best effort: skip documents that cannot be read or parsed */
    }
  }

  // Entity counts per class
  const entityCounts: Record<string, number> = {};
  for (const cls of ENTITY_CLASSES) {
    const entries = await readDirOrEmpty(path.join(WIKI, "entities", cls));
    entityCounts[cls] = entries.filter((f) => f.endsWith(".md")).length;
  }

  // Chunks on disk per --subagent dir
  const rawRoot = path.join(UFO_ROOT, "raw");
  const archives = (await readDirOrEmpty(rawRoot)).filter((e) => e.endsWith("--subagent"));
  let chunksOnDisk = 0;
  for (const archive of archives) {
    const chunkFiles = await readDirOrEmpty(path.join(rawRoot, archive, "chunks"));
    chunksOnDisk += chunkFiles.filter((f) => f.startsWith("c")).length;
  }

  return {
    documents_total: docIds.length,
    documents_rebuilt_v2: archives.length,
    pages_total: totalPages,
    chunks_on_disk: chunksOnDisk,
    redactions_total: totalRedactions,
    collections: Object.fromEntries(collections),
    document_class: Object.fromEntries(documentClass),
    content_classification: Object.fromEntries(contentClassification),
    entity_counts: entityCounts,
    entities_total: Object.values(entityCounts).reduce((s, n) => s + n, 0),
  };
}
/** Result row of the `GROUP BY type` query in dbStats(): a chunk type and the number of chunks of that type. */
interface ChunkTypeRow {
type: string;
count: number;
}
/**
* Result row of the `GROUP BY classification` query in dbStats().
* The type allows `null`, although that query filters with `classification IS NOT NULL`.
*/
interface ClassificationRow {
classification: string | null;
count: number;
}
/** Result row of the `GROUP BY doc_id` query in dbStats(): a document id and its chunk count. */
interface DocChunkRow {
doc_id: string;
count: number;
}
/**
* Result row of the anomaly breakdown query in dbStats(): `ufo_anomaly_type` (aliased to
* `anomaly_type`, possibly null) and the number of chunks with `ufo_anomaly = TRUE` for it.
*/
interface AnomalyRow {
anomaly_type: string | null;
count: number;
}
/**
 * Gathers corpus statistics from the database.
 *
 * Runs seven independent aggregate queries concurrently through `pgQuery`. Because they
 * are combined with `Promise.all`, a failure of any single query fails the whole batch.
 *
 * Never rejects: any failure is reported in the return value so the caller can still
 * serve the filesystem statistics.
 *
 * @returns `{ ok: true, ... }` with the query results, or `{ ok: false, error }` where
 *          `error` is always a string describing what went wrong.
 */
async function dbStats() {
  try {
    const [
      core,
      chunkTypes,
      classifications,
      topDocs,
      ufoTypes,
      cryptidCount,
      embedReady,
    ] = await Promise.all([
      // Table-level row counts, returned as a single row.
      pgQuery<{ docs: number; chunks: number; entities: number; mentions: number }>(
        `SELECT
(SELECT COUNT(*) FROM public.documents)::INT AS docs,
(SELECT COUNT(*) FROM public.chunks)::INT AS chunks,
(SELECT COUNT(*) FROM public.entities)::INT AS entities,
(SELECT COUNT(*) FROM public.entity_mentions)::INT AS mentions`,
        [],
      ),
      pgQuery<ChunkTypeRow>(
        `SELECT type, COUNT(*)::INT AS count FROM public.chunks GROUP BY type ORDER BY count DESC LIMIT 20`,
        [],
      ),
      pgQuery<ClassificationRow>(
        `SELECT classification, COUNT(*)::INT AS count FROM public.chunks WHERE classification IS NOT NULL GROUP BY classification ORDER BY count DESC LIMIT 10`,
        [],
      ),
      pgQuery<DocChunkRow>(
        `SELECT doc_id, COUNT(*)::INT AS count FROM public.chunks GROUP BY doc_id ORDER BY count DESC LIMIT 10`,
        [],
      ),
      pgQuery<AnomalyRow>(
        `SELECT ufo_anomaly_type AS anomaly_type, COUNT(*)::INT AS count FROM public.chunks WHERE ufo_anomaly = TRUE GROUP BY ufo_anomaly_type ORDER BY count DESC LIMIT 15`,
        [],
      ),
      pgQuery<{ count: number }>(
        `SELECT COUNT(*)::INT AS count FROM public.chunks WHERE cryptid_anomaly = TRUE`,
        [],
      ),
      pgQuery<{ count: number }>(
        `SELECT COUNT(*)::INT AS count FROM public.chunks WHERE embedding IS NOT NULL`,
        [],
      ),
    ]);
    return {
      ok: true,
      core: core[0],
      chunk_types: chunkTypes,
      classifications,
      top_docs_by_chunks: topDocs,
      ufo_anomaly_types: ufoTypes,
      cryptid_count: cryptidCount[0]?.count ?? 0,
      embedded_count: embedReady[0]?.count ?? 0,
    };
  } catch (e: unknown) {
    // A thrown value is not guaranteed to be an Error. Narrow instead of asserting so
    // that `error` is always a string and is never dropped from the JSON payload.
    return { ok: false, error: e instanceof Error ? e.message : String(e) };
  }
}
/**
 * GET handler for the stats route.
 *
 * Starts the filesystem and database collectors together, waits for both, and
 * returns their results as one JSON document under the keys `fs` and `db`.
 */
export async function GET() {
  const sources = Promise.all([fsStats(), dbStats()]);
  const [fromDisk, fromDb] = await sources;
  const payload = { fs: fromDisk, db: fromDb };
  return json(payload);
}