/**
 * Entity page data — SINGLE SOURCE OF TRUTH is the YAML frontmatter on disk.
 *
 * Why YAML and not the DB? Because the corpus has TWO independent extraction
 * layers (Haiku page-level, Sonnet chunk-level) and each catches a different
 * subset of entities. The DB's entity_mentions table is one of those signals —
 * useful for chat retrieval but incomplete for the entity catalog itself.
 *
 * Reading from disk lets us merge every signal into one stat (`total_mentions`)
 * via the maintain/42_sync_entity_stats.py pipeline and serve consistent
 * numbers everywhere in the UI.
 *
 * The DB is still queried for ONE thing: the actual chunk text for previews,
 * because we don't want to re-parse 21k chunk files on every page render.
 */
import fs from "node:fs/promises";
import path from "node:path";
import matter from "gray-matter";
import { pgQuery } from "./db";
import { WIKI } from "@/lib/wiki";

/** Maps an entity class to the sub-folder of `entities/` holding its markdown files. */
const FOLDER_BY_CLASS: Record<string, string> = {
  person: "people",
  organization: "organizations",
  location: "locations",
  event: "events",
  uap_object: "uap-objects",
  vehicle: "vehicles",
  operation: "operations",
  concept: "concepts",
};

export interface EntityCore {
  entity_pk: number | null; // db-side primary key; null if entity is wiki-only
  entity_class: string;
  entity_id: string;
  canonical_name: string;
  aliases: string[];
  total_mentions: number;
  documents_count: number;
  signal_strength: "strong" | "weak" | "orphan" | "unverified";
  signal_sources: {
    db_chunks: number;
    page_refs: number;
    cross_refs: number;
    text_refs: number;
  };
  mentioned_in: string[]; // [[doc-id/p007]] — structured page refs (Haiku)
  text_mentioned_in: string[]; // [[doc-id/p007]] — text-only matches (back-fill)
  referenced_by: string[]; // [[class/id]] cross-links
  enrichment_status: string | null;
  narrative_summary: string | null;
  narrative_summary_pt_br: string | null;
  summary_status: string | null;
}

/** Untyped frontmatter as returned by gray-matter; every field is validated on read. */
interface RawFm {
  [k: string]: unknown;
}

/** Accepted values for `signal_strength`; anything else is reported as "unverified". */
const SIGNAL_STRENGTHS: readonly EntityCore["signal_strength"][] = [
  "strong",
  "weak",
  "orphan",
  "unverified",
];

/** Matches `[[target]]` and `[[target|alias]]`, capturing only `target`. */
const WIKILINK_RE = /\[\[([^\]|]+)(?:\|[^\]]*)?\]\]/;

/** Upper bound on concurrent document reads while hydrating mention groups. */
const DOC_READ_CONCURRENCY = 16;

/**
 * Coerce a frontmatter value to a finite number.
 *
 * Finite numbers pass through; numeric strings are parsed. Blank strings,
 * non-numeric strings, NaN/Infinity and every other type yield `fallback`.
 */
function num(v: unknown, fallback = 0): number {
  if (typeof v === "number") return Number.isFinite(v) ? v : fallback;
  // Number("") and Number("  ") are 0, so blank strings must be rejected
  // explicitly or they would bypass the caller's fallback.
  if (typeof v === "string" && v.trim() !== "") {
    const parsed = Number(v);
    return Number.isFinite(parsed) ? parsed : fallback;
  }
  return fallback;
}

/** Return the string members of `v` if it is an array, otherwise an empty array. */
function arr(v: unknown): string[] {
  if (!Array.isArray(v)) return [];
  return v.filter((x): x is string => typeof x === "string");
}

/** Return `v` unchanged when it is a string with non-whitespace content, else null. */
function strOrNull(v: unknown): string | null {
  return typeof v === "string" && v.trim() ? v : null;
}

/** Narrow a frontmatter value to a key/value record, or an empty one if it is not an object. */
function asRecord(v: unknown): Record<string, unknown> {
  return typeof v === "object" && v !== null ? (v as Record<string, unknown>) : {};
}

/** Validate a frontmatter `signal_strength`, defaulting to "unverified". */
function toSignalStrength(v: unknown): EntityCore["signal_strength"] {
  return SIGNAL_STRENGTHS.find((s) => s === v) ?? "unverified";
}

/**
 * Join `fileName` onto `baseDir` and return the absolute path only if it
 * stays inside `baseDir`. Returns null for names that escape it (e.g. "../x").
 */
function resolveInside(baseDir: string, fileName: string): string | null {
  const base = path.resolve(baseDir);
  const target = path.join(base, fileName); // join() normalizes ".." segments
  return target.startsWith(base + path.sep) ? target : null;
}

/**
 * Read and parse the frontmatter of one entity's markdown file.
 *
 * @returns the raw frontmatter, or null when the class is unknown, the id
 *   would resolve outside the class folder, or the file cannot be read/parsed.
 */
async function readEntityYaml(
  entityClass: string,
  entityId: string,
): Promise<RawFm | null> {
  // Own-property check: a plain index lookup would return inherited members
  // such as "constructor", which are not folder names.
  const folder = Object.prototype.hasOwnProperty.call(FOLDER_BY_CLASS, entityClass)
    ? FOLDER_BY_CLASS[entityClass]
    : undefined;
  if (!folder) return null;

  const file = resolveInside(path.join(WIKI, "entities", folder), `${entityId}.md`);
  if (!file) return null;

  try {
    const raw = await fs.readFile(file, "utf-8");
    return matter(raw).data as RawFm;
  } catch {
    return null;
  }
}

/**
 * Load a single entity card from its YAML. Returns null if archived or
 * missing — keeps the route handler simple.
 *
 * The DB is consulted best-effort for the entity primary key and the
 * AI-generated summaries; a DB failure leaves those at null and the YAML
 * values are used instead.
 */
export async function getEntityCore(
  entityClass: string,
  entityId: string,
): Promise<EntityCore | null> {
  const fm = await readEntityYaml(entityClass, entityId);
  if (!fm) return null;

  // Best-effort lookup of the DB entity_pk so getEntityChunks can still
  // query by primary key. Also pull the AI-generated narrative summary
  // (W5.3 / migration 0008) for the entity detail page header.
  let entity_pk: number | null = null;
  let dbSummaryEn: string | null = null;
  let dbSummaryPt: string | null = null;
  let dbSummaryStatus: string | null = null;
  try {
    const rows = await pgQuery<{
      entity_pk: number;
      summary_en: string | null;
      summary_pt_br: string | null;
      summary_status: string | null;
    }>(
      `SELECT entity_pk, summary_en, summary_pt_br, summary_status FROM public.entities WHERE entity_class = $1 AND entity_id = $2 LIMIT 1`,
      [entityClass, entityId],
    );
    const row = rows[0];
    if (row) {
      entity_pk = row.entity_pk;
      dbSummaryEn = row.summary_en;
      dbSummaryPt = row.summary_pt_br;
      dbSummaryStatus = row.summary_status;
    }
  } catch {
    // Deliberately swallowed: the card is still renderable from YAML alone.
    entity_pk = null;
  }

  const sigSources = asRecord(fm.signal_sources);

  return {
    entity_pk,
    entity_class: entityClass,
    entity_id: entityId,
    canonical_name:
      typeof fm.canonical_name === "string" ? fm.canonical_name : entityId,
    aliases: arr(fm.aliases),
    total_mentions: num(fm.total_mentions, 0),
    documents_count: num(fm.documents_count, 0),
    signal_strength: toSignalStrength(fm.signal_strength),
    signal_sources: {
      db_chunks: num(sigSources.db_chunks, 0),
      page_refs: num(sigSources.page_refs, 0),
      cross_refs: num(sigSources.cross_refs, 0),
      text_refs: num(sigSources.text_refs, 0),
    },
    mentioned_in: arr(fm.mentioned_in),
    text_mentioned_in: arr(fm.text_mentioned_in),
    referenced_by: arr(fm.referenced_by),
    enrichment_status: strOrNull(fm.enrichment_status),
    // Prefer DB-stored AI-generated summaries (the curated layer the bureau
    // writes). Fall back to wiki YAML narrative if the DB is silent.
    narrative_summary: dbSummaryEn ?? strOrNull(fm.narrative_summary),
    narrative_summary_pt_br:
      dbSummaryPt ?? strOrNull(fm.narrative_summary_pt_br),
    summary_status: dbSummaryStatus ?? strOrNull(fm.summary_status),
  };
}

export interface EntityMentionGroup {
  doc_id: string;
  canonical_title: string | null;
  collection: string | null;
  page_count: number | null;
  classification: string | null;
  mention_count: number;
  pages: number[];
  text_only: boolean; // true when all refs came from the text back-fill (no structured Haiku evidence)
}

/** Document-level fields of a mention group that come from the document's own frontmatter. */
type DocMeta = Pick<
  EntityMentionGroup,
  "canonical_title" | "collection" | "page_count" | "classification"
>;

/**
 * Split a page reference such as "[[doc-id/p007]]" (optionally with a
 * "|alias") into its document id and page number.
 *
 * @returns null when no document id can be extracted; `page` is null when the
 *   reference has no parsable page part.
 */
function parsePageRef(ref: string): { docId: string; page: number | null } | null {
  const match = WIKILINK_RE.exec(ref);
  // Refs without wikilink delimiters are accepted as a bare "doc-id/p007".
  const target = (match?.[1] ?? ref).trim();
  const [docId, pageStr] = target.split("/", 2);
  if (!docId) return null;
  const page = pageStr ? Number.parseInt(pageStr.replace(/^p/, ""), 10) : NaN;
  return { docId, page: Number.isFinite(page) ? page : null };
}

/**
 * Read display metadata from wiki/documents/{doc-id}.md. Every field is null
 * when the file is missing, unreadable, or outside the documents folder.
 */
async function readDocMeta(docId: string): Promise<DocMeta> {
  const missing: DocMeta = {
    canonical_title: null,
    collection: null,
    page_count: null,
    classification: null,
  };
  const file = resolveInside(path.join(WIKI, "documents"), `${docId}.md`);
  if (!file) return missing;
  try {
    const dfm = matter(await fs.readFile(file, "utf-8")).data as Record<string, unknown>;
    return {
      canonical_title: strOrNull(dfm.canonical_title),
      collection: strOrNull(dfm.collection),
      page_count: num(dfm.page_count, 0) || null, // a count of 0 is reported as unknown
      classification: strOrNull(dfm.highest_classification),
    };
  } catch {
    /* doc missing — use raw id */
    return missing;
  }
}

/**
 * Group reverse-references by document. Derived from the YAML's mentioned_in[]
 * and text_mentioned_in[] (which the maintain script writes consolidating page
 * YAMLs), enriched with document metadata read from wiki/documents/{doc-id}.md.
 *
 * @param limit maximum number of documents returned (default 100).
 * @returns groups sorted by distinct mentioned pages, descending; an empty
 *   array when the entity file is missing.
 */
export async function getEntityMentionsByDoc(
  entityClass: string,
  entityId: string,
  limit = 100,
): Promise<EntityMentionGroup[]> {
  const fm = await readEntityYaml(entityClass, entityId);
  if (!fm) return [];

  const byDoc = new Map<string, { structured: Set<number>; text: Set<number> }>();
  const addRef = (ref: string, source: "structured" | "text"): void => {
    const parsed = parsePageRef(ref);
    if (!parsed) return;
    let sets = byDoc.get(parsed.docId);
    if (!sets) {
      sets = { structured: new Set<number>(), text: new Set<number>() };
      byDoc.set(parsed.docId, sets);
    }
    if (parsed.page !== null) sets[source].add(parsed.page);
  };
  for (const ref of arr(fm.mentioned_in)) addRef(ref, "structured");
  for (const ref of arr(fm.text_mentioned_in)) addRef(ref, "text");

  // Rank and truncate BEFORE touching the filesystem: ordering depends only on
  // the page sets, so documents that fall outside `limit` never need reading.
  const ranked = Array.from(byDoc, ([docId, sets]) => ({
    docId,
    pages: Array.from(new Set([...sets.structured, ...sets.text])).sort(
      (a, b) => a - b,
    ),
    textOnly: sets.structured.size === 0 && sets.text.size > 0,
  }))
    .sort((a, b) => b.pages.length - a.pages.length)
    .slice(0, limit);

  // Hydrate metadata in small parallel batches to bound open file handles.
  const groups: EntityMentionGroup[] = [];
  for (let i = 0; i < ranked.length; i += DOC_READ_CONCURRENCY) {
    const batch = ranked.slice(i, i + DOC_READ_CONCURRENCY);
    const hydrated = await Promise.all(
      batch.map(async (entry): Promise<EntityMentionGroup> => {
        const meta = await readDocMeta(entry.docId);
        return {
          doc_id: entry.docId,
          canonical_title: meta.canonical_title,
          collection: meta.collection,
          page_count: meta.page_count,
          classification: meta.classification,
          mention_count: entry.pages.length,
          pages: entry.pages,
          text_only: entry.textOnly,
        };
      }),
    );
    groups.push(...hydrated);
  }
  return groups;
}

export interface EntityChunkPreview {
  chunk_pk: number;
  doc_id: string;
  chunk_id: string;
  page: number;
  type: string;
  bbox: { x: number; y: number; w: number; h: number } | null;
  classification: string | null;
  content_pt: string | null;
  content_en: string | null;
  ufo_anomaly: boolean | null;
  ufo_anomaly_type: string | null;
}

/**
 * Top chunks that textually mention this entity. Reads from DB because
 * chunk content is big (we don't re-parse files at request time). Returns []
 * if the entity isn't indexed in the DB.
 *
 * Rows are ordered by `ufo_anomaly` descending (nulls last), then by document
 * and global chunk order.
 *
 * @param entityPk DB primary key from getEntityCore; null short-circuits to [].
 * @param limit maximum number of chunks returned (default 30).
 */
export async function getEntityChunks(
  entityPk: number | null,
  limit = 30,
): Promise<EntityChunkPreview[]> {
  if (entityPk == null) return [];
  return pgQuery<EntityChunkPreview>(
    `SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox, c.classification, c.content_pt, c.content_en, c.ufo_anomaly, c.ufo_anomaly_type FROM public.entity_mentions em JOIN public.chunks c ON c.chunk_pk = em.chunk_pk WHERE em.entity_pk = $1 ORDER BY c.ufo_anomaly DESC NULLS LAST, c.doc_id, c.order_global LIMIT $2`,
    [entityPk, limit],
  );
}

// Backwards-compat for callers that imported findEntity from the old path.
export { findEntity } from "./graph";