188 lines
5.5 KiB
TypeScript
188 lines
5.5 KiB
TypeScript
/**
 * /api/admin/stats — Corpus-wide analytics.
 *
 * Mixes filesystem reads (always available) with DB queries (when the
 * retrieval layer is up). Gracefully degrades when the DB is offline.
 */
|
|
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
import matter from "gray-matter";
|
|
import { UFO_ROOT, WIKI, listDocuments } from "@/lib/wiki";
|
|
import { pgQuery } from "@/lib/retrieval/db";
|
|
|
|
// NOTE(review): presumably a Next.js route segment option selecting the Node.js
// runtime (this module imports node:fs and node:path) — confirm against the framework.
export const runtime = "nodejs";
|
|
// NOTE(review): presumably a Next.js route segment option that opts this route
// out of static rendering — confirm against the framework.
export const dynamic = "force-dynamic";
|
|
|
|
/**
 * Entity class names. Each one is used as a subdirectory name under
 * `<WIKI>/entities` when counting entity files.
 */
const ENTITY_CLASSES: string[] = [
  "people", "organizations", "locations", "events",
  "uap-objects", "vehicles", "operations", "concepts",
];
|
|
|
|
/**
 * Builds a JSON `Response`.
 *
 * @param data   Value serialised with `JSON.stringify` to form the body.
 * @param status HTTP status code; defaults to 200.
 * @returns A response carrying JSON content-type and a 120-second public cache header.
 */
function json(data: unknown, status = 200) {
  const headers = {
    "content-type": "application/json",
    "cache-control": "public, max-age=120",
  };
  const body = JSON.stringify(data);
  return new Response(body, { status, headers });
}
|
|
|
|
/**
 * Creates an empty tally map with no prototype, so that keys taken from
 * document frontmatter (e.g. "constructor", "toString", "__proto__") start
 * from zero instead of colliding with inherited `Object.prototype` members.
 */
function newTally(): Record<string, number> {
  return Object.create(null) as Record<string, number>;
}

/** Adds one to `tally[key]`, treating an unseen key as zero. */
function bumpTally(tally: Record<string, number>, key: string): void {
  tally[key] = (tally[key] ?? 0) + 1;
}

/**
 * Coerces a frontmatter value to a number, returning 0 for anything that is
 * missing or not finite so a single bad value cannot turn a running total
 * into NaN.
 */
function toFiniteNumber(value: unknown): number {
  const n = Number(value);
  return Number.isFinite(n) ? n : 0;
}

/**
 * Gathers corpus statistics from the filesystem.
 *
 * Reads the frontmatter of every document returned by `listDocuments()`,
 * counts the `.md` files in each entity-class directory, and counts chunk
 * files under every `--subagent` directory in `<UFO_ROOT>/raw`.
 *
 * Documents and directories that cannot be read are skipped; note that
 * `documents_total` still reflects the full id list, including skipped ones.
 *
 * @returns Totals plus per-collection, per-class and per-tag breakdowns.
 */
async function fsStats() {
  // Document collection breakdown
  const docIds = await listDocuments();
  const collections = newTally();
  const documentClass = newTally();
  const contentClassification = newTally();
  let totalPages = 0;
  let totalRedactions = 0;

  for (const id of docIds) {
    try {
      const raw = await fs.readFile(path.join(WIKI, "documents", `${id}.md`), "utf-8");
      const fm = matter(raw).data as Record<string, unknown>;

      bumpTally(collections, String(fm.collection ?? "uncategorized"));
      bumpTally(documentClass, String(fm.document_class ?? "unknown"));
      totalPages += toFiniteNumber(fm.page_count);

      const tags = fm.content_classification;
      if (Array.isArray(tags)) {
        for (const tag of tags) {
          bumpTally(contentClassification, String(tag));
        }
      }

      // Two frontmatter spellings are accepted; the first one present wins.
      totalRedactions += toFiniteNumber(fm.redactions_count ?? fm.total_redactions);
    } catch {
      /* unreadable or unparsable document: leave it out of the breakdowns */
    }
  }

  // Entity counts per class
  const entityCounts: Record<string, number> = {};
  for (const cls of ENTITY_CLASSES) {
    try {
      const entries = await fs.readdir(path.join(WIKI, "entities", cls));
      entityCounts[cls] = entries.filter((f) => f.endsWith(".md")).length;
    } catch {
      // Missing or unreadable class directory counts as zero entities.
      entityCounts[cls] = 0;
    }
  }

  // Chunks on disk per --subagent dir
  const rawRoot = path.join(UFO_ROOT, "raw");
  let chunksOnDisk = 0;
  let docsRebuilt = 0;
  try {
    const archives = (await fs.readdir(rawRoot)).filter((e) => e.endsWith("--subagent"));
    docsRebuilt = archives.length;
    for (const archive of archives) {
      try {
        const chunkFiles = await fs.readdir(path.join(rawRoot, archive, "chunks"));
        chunksOnDisk += chunkFiles.filter((f) => f.startsWith("c")).length;
      } catch {
        /* archive without a readable chunks directory: contributes no chunks */
      }
    }
  } catch {
    /* raw root missing or unreadable: both counters stay at zero */
  }

  return {
    documents_total: docIds.length,
    documents_rebuilt_v2: docsRebuilt,
    pages_total: totalPages,
    chunks_on_disk: chunksOnDisk,
    redactions_total: totalRedactions,
    collections,
    document_class: documentClass,
    content_classification: contentClassification,
    entity_counts: entityCounts,
    entities_total: Object.values(entityCounts).reduce((sum, n) => sum + n, 0),
  };
}
|
|
|
|
/** Shared shape of every grouped-count row: the number of rows in the group. */
interface CountedRow {
  count: number;
}

/** One row of the chunk-count-by-type query. */
interface ChunkTypeRow extends CountedRow {
  type: string;
}

/** One row of the chunk-count-by-classification query. */
interface ClassificationRow extends CountedRow {
  classification: string | null;
}

/** One row of the chunk-count-by-document query. */
interface DocChunkRow extends CountedRow {
  doc_id: string;
}

/** One row of the chunk-count-by-anomaly-type query. */
interface AnomalyRow extends CountedRow {
  anomaly_type: string | null;
}
|
|
|
|
/**
 * Gathers corpus statistics from the database.
 *
 * Issues seven independent `pgQuery` calls concurrently: overall table
 * counts, chunk counts grouped by type / classification / document / UFO
 * anomaly type, and the number of cryptid-flagged and embedded chunks.
 *
 * Never rejects: any failure is reported as `{ ok: false, error }` so the
 * caller can still serve the other half of its response.
 *
 * @returns `{ ok: true, ... }` with the query results, or
 *          `{ ok: false, error }` carrying the failure message.
 */
async function dbStats() {
  try {
    const [
      core,
      chunkTypes,
      classifications,
      topDocs,
      ufoTypes,
      cryptidCount,
      embedReady,
    ] = await Promise.all([
      pgQuery<{ docs: number; chunks: number; entities: number; mentions: number }>(
        `SELECT
           (SELECT COUNT(*) FROM public.documents)::INT AS docs,
           (SELECT COUNT(*) FROM public.chunks)::INT AS chunks,
           (SELECT COUNT(*) FROM public.entities)::INT AS entities,
           (SELECT COUNT(*) FROM public.entity_mentions)::INT AS mentions`,
        [],
      ),
      pgQuery<ChunkTypeRow>(
        `SELECT type, COUNT(*)::INT AS count FROM public.chunks GROUP BY type ORDER BY count DESC LIMIT 20`,
        [],
      ),
      pgQuery<ClassificationRow>(
        `SELECT classification, COUNT(*)::INT AS count FROM public.chunks WHERE classification IS NOT NULL GROUP BY classification ORDER BY count DESC LIMIT 10`,
        [],
      ),
      pgQuery<DocChunkRow>(
        `SELECT doc_id, COUNT(*)::INT AS count FROM public.chunks GROUP BY doc_id ORDER BY count DESC LIMIT 10`,
        [],
      ),
      pgQuery<AnomalyRow>(
        `SELECT ufo_anomaly_type AS anomaly_type, COUNT(*)::INT AS count FROM public.chunks WHERE ufo_anomaly = TRUE GROUP BY ufo_anomaly_type ORDER BY count DESC LIMIT 15`,
        [],
      ),
      pgQuery<{ count: number }>(
        `SELECT COUNT(*)::INT AS count FROM public.chunks WHERE cryptid_anomaly = TRUE`,
        [],
      ),
      pgQuery<{ count: number }>(
        `SELECT COUNT(*)::INT AS count FROM public.chunks WHERE embedding IS NOT NULL`,
        [],
      ),
    ]);

    return {
      ok: true,
      core: core[0],
      chunk_types: chunkTypes,
      classifications,
      top_docs_by_chunks: topDocs,
      ufo_anomaly_types: ufoTypes,
      cryptid_count: cryptidCount[0]?.count ?? 0,
      embedded_count: embedReady[0]?.count ?? 0,
    };
  } catch (e: unknown) {
    // A thrown value is not guaranteed to be an Error; without narrowing, a
    // non-Error throwable would give `error: undefined`, which JSON drops.
    const message = e instanceof Error ? e.message : String(e);
    return { ok: false, error: message };
  }
}
|
|
|
|
/**
 * Handles GET /api/admin/stats.
 *
 * Runs the filesystem and database collectors concurrently and returns both
 * results in one JSON body under the `fs` and `db` keys.
 */
export async function GET() {
  const gathered = await Promise.all([fsStats(), dbStats()]);
  const payload = { fs: gathered[0], db: gathered[1] };
  return json(payload);
}
|