// disclosure-bureau/web/lib/retrieval/entity-pages.ts

/**
 * Entity page data — SINGLE SOURCE OF TRUTH is the YAML frontmatter on disk.
 *
 * Why YAML and not the DB? Because the corpus has TWO independent extraction
 * layers (Haiku page-level, Sonnet chunk-level) and each catches a different
 * subset of entities. The DB's entity_mentions table is one of those signals —
 * useful for chat retrieval but incomplete for the entity catalog itself.
 *
 * Reading from disk lets us merge every signal into one stat (`total_mentions`)
 * via the maintain/42_sync_entity_stats.py pipeline and serve consistent
 * numbers everywhere in the UI.
 *
 * The DB is still queried for ONE thing: the actual chunk text for previews,
 * because we don't want to re-parse 21k chunk files on every page render.
 */
import fs from "node:fs/promises";
import path from "node:path";
import matter from "gray-matter";
import { pgQuery } from "./db";
import { WIKI } from "@/lib/wiki";

/**
 * Maps an entity class (the value matched against `entity_class` in the DB)
 * to the sub-folder of `<WIKI>/entities/` that holds that class's pages.
 *
 * Built on a null-prototype object so that looking up an arbitrary
 * caller-supplied key such as "constructor", "toString" or "__proto__" yields
 * `undefined` instead of a member inherited from `Object.prototype`.
 */
const FOLDER_BY_CLASS: Record<string, string> = Object.assign(
  Object.create(null) as Record<string, string>,
  {
    person: "people",
    organization: "organizations",
    location: "locations",
    event: "events",
    uap_object: "uap-objects",
    vehicle: "vehicles",
    operation: "operations",
    concept: "concepts",
  },
);

/**
 * Entity card returned by getEntityCore. Apart from `entity_pk`,
 * `entity_class` and `entity_id`, every field is read from the entity page's
 * YAML frontmatter and defaulted when absent or of the wrong type.
 */
export interface EntityCore {
  // DB-side primary key from public.entities; null if the entity is wiki-only
  // (no matching row) or the lookup failed.
  entity_pk: number | null; // db-side primary key; null if entity is wiki-only
  // Echo of the class / id the card was requested with.
  entity_class: string;
  entity_id: string;
  // Frontmatter `canonical_name`; falls back to `entity_id` when not a string.
  canonical_name: string;
  // Non-string entries in the frontmatter list are dropped.
  aliases: string[];
  // Numeric stats from frontmatter; 0 when missing or not numeric.
  total_mentions: number;
  documents_count: number;
  // Any missing or unrecognised frontmatter value is reported as "unverified".
  signal_strength: "strong" | "weak" | "orphan" | "unverified";
  // Per-source counts from the frontmatter `signal_sources` map; each is 0
  // when absent.
  signal_sources: {
    db_chunks: number;
    page_refs: number;
    cross_refs: number;
    text_refs: number;
  };
  mentioned_in: string[];      // [[doc-id/p007]] — structured page refs (Haiku)
  text_mentioned_in: string[]; // [[doc-id/p007]] — text-only matches (back-fill)
  referenced_by: string[];     // [[class/id]] cross-links
  // Free-form frontmatter strings; null when missing or blank.
  enrichment_status: string | null;
  narrative_summary: string | null;
  narrative_summary_pt_br: string | null;
  summary_status: string | null;
}

/**
 * Untyped YAML frontmatter as returned by gray-matter. Values are `unknown`
 * on purpose: every read goes through a narrowing helper (num / arr /
 * strOrNull) or an explicit typeof check.
 */
interface RawFm {
  [k: string]: unknown;
}

/**
 * Coerce a loosely-typed frontmatter value to a finite number.
 *
 * @param v - Value to coerce; only numbers and numeric strings are accepted.
 * @param fallback - Returned when `v` is missing, blank, non-numeric or
 *   non-finite (NaN / ±Infinity). Defaults to 0.
 * @returns `v` as a finite number, otherwise `fallback`.
 */
function num(v: unknown, fallback = 0): number {
  if (typeof v === "number") return Number.isFinite(v) ? v : fallback;
  // Number("") and Number("   ") are 0, which would silently turn a blank
  // value into a real-looking number — treat blank strings as missing.
  if (typeof v === "string" && v.trim() !== "") {
    const parsed = Number(v);
    if (Number.isFinite(parsed)) return parsed;
  }
  return fallback;
}

/**
 * Narrow a frontmatter value to a list of strings.
 *
 * Anything that is not an array yields an empty list; array entries that are
 * not strings are dropped. Always returns a fresh array.
 */
function arr(v: unknown): string[] {
  return Array.isArray(v)
    ? v.filter((item): item is string => typeof item === "string")
    : [];
}

/**
 * Return `v` unchanged when it is a string with at least one non-whitespace
 * character; otherwise null. The returned string is NOT trimmed.
 */
function strOrNull(v: unknown): string | null {
  if (typeof v !== "string") return null;
  return v.trim().length > 0 ? v : null;
}

/**
 * Read the YAML frontmatter of `<WIKI>/entities/<folder>/<entityId>.md`.
 *
 * @param entityClass - Key into FOLDER_BY_CLASS selecting the sub-folder.
 * @param entityId - File stem of the entity page.
 * @returns The parsed frontmatter, or null when the class is unknown, the id
 *   would resolve outside the class folder, or the file cannot be read/parsed.
 *   Never throws.
 */
async function readEntityYaml(entityClass: string, entityId: string): Promise<RawFm | null> {
  // Own-property lookup only: a plain object also answers to inherited keys
  // ("constructor", "__proto__", ...) with truthy non-string values, which
  // would make the path call below throw instead of returning null.
  if (!Object.prototype.hasOwnProperty.call(FOLDER_BY_CLASS, entityClass)) return null;
  const folder = FOLDER_BY_CLASS[entityClass];
  if (typeof folder !== "string" || !folder) return null;

  // Containment check: the id is caller-supplied, so refuse anything that
  // resolves outside the class folder (e.g. "../../documents/x").
  const baseDir = path.resolve(WIKI, "entities", folder);
  const filePath = path.resolve(baseDir, `${entityId}.md`);
  const rel = path.relative(baseDir, filePath);
  if (rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) {
    return null;
  }

  try {
    const raw = await fs.readFile(filePath, "utf-8");
    return matter(raw).data as RawFm;
  } catch {
    // Missing/unreadable file or malformed frontmatter: treated as "no entity".
    return null;
  }
}

/**
 * Build the EntityCore card for one entity from its YAML frontmatter.
 *
 * Resolves to null whenever readEntityYaml yields nothing, so route handlers
 * only have a single "no card" case to deal with.
 * NOTE(review): the previous comment said archived entities also return
 * null; no explicit archive check is visible here — presumably archived
 * pages are simply absent from the entities folder. Confirm.
 *
 * @param entityClass - Entity class key (e.g. "person").
 * @param entityId - Entity id; doubles as the display name when the
 *   frontmatter has no string `canonical_name`.
 * @returns The card, with missing or malformed frontmatter fields defaulted,
 *   or null when no frontmatter could be loaded.
 */
export async function getEntityCore(
  entityClass: string,
  entityId: string,
): Promise<EntityCore | null> {
  const frontmatter = await readEntityYaml(entityClass, entityId);
  if (!frontmatter) return null;

  // The DB row is optional: the primary key only exists so getEntityChunks
  // can query by it. A missing row or a failing query both leave it null.
  let entityPk: number | null;
  try {
    const rows = await pgQuery<{ entity_pk: number }>(
      `SELECT entity_pk FROM public.entities
       WHERE entity_class = $1 AND entity_id = $2 LIMIT 1`,
      [entityClass, entityId],
    );
    const firstRow = rows[0];
    entityPk = firstRow?.entity_pk ?? null;
  } catch {
    entityPk = null;
  }

  // Anything that is not one of the known labels collapses to "unverified".
  const knownStrengths: ReadonlyArray<EntityCore["signal_strength"]> = [
    "strong",
    "weak",
    "orphan",
    "unverified",
  ];
  const rawStrength = frontmatter.signal_strength;
  const signalStrength = knownStrengths.find((label) => label === rawStrength) ?? "unverified";

  const sources = (frontmatter.signal_sources as Record<string, unknown> | undefined) ?? {};
  const rawName = frontmatter.canonical_name;

  return {
    entity_pk: entityPk,
    entity_class: entityClass,
    entity_id: entityId,
    canonical_name: typeof rawName === "string" ? rawName : entityId,
    aliases: arr(frontmatter.aliases),
    total_mentions: num(frontmatter.total_mentions, 0),
    documents_count: num(frontmatter.documents_count, 0),
    signal_strength: signalStrength,
    signal_sources: {
      db_chunks: num(sources.db_chunks, 0),
      page_refs: num(sources.page_refs, 0),
      cross_refs: num(sources.cross_refs, 0),
      text_refs: num(sources.text_refs, 0),
    },
    mentioned_in: arr(frontmatter.mentioned_in),
    text_mentioned_in: arr(frontmatter.text_mentioned_in),
    referenced_by: arr(frontmatter.referenced_by),
    enrichment_status: strOrNull(frontmatter.enrichment_status),
    narrative_summary: strOrNull(frontmatter.narrative_summary),
    narrative_summary_pt_br: strOrNull(frontmatter.narrative_summary_pt_br),
    summary_status: strOrNull(frontmatter.summary_status),
  };
}

/**
 * One document's worth of reverse-references to an entity, as produced by
 * getEntityMentionsByDoc.
 */
export interface EntityMentionGroup {
  // Part of the wikilink target before the "/" (e.g. "doc-id" in "[[doc-id/p007]]").
  doc_id: string;
  // The next four come from wiki/documents/<doc-id>.md frontmatter and stay
  // null when that file is missing or the field is absent/blank.
  canonical_title: string | null;
  collection: string | null;
  page_count: number | null; // null also when the frontmatter value is 0 or not numeric
  classification: string | null; // read from the `highest_classification` key
  // Number of distinct pages referencing the entity (structured ∪ text refs).
  mention_count: number;
  // Those distinct page numbers, ascending.
  pages: number[];
  text_only: boolean; // true when all refs came from the text back-fill (no structured Haiku evidence)
}

/**
 * Group an entity's reverse-references by document.
 *
 * References come from the entity YAML's `mentioned_in[]` (structured page
 * refs, consolidated by the maintain script) and `text_mentioned_in[]`
 * (text-only back-fill). Documents are ranked by number of distinct
 * referencing pages, descending (ties keep first-seen order), truncated to
 * `limit`, and only then enriched with metadata from
 * wiki/documents/<doc-id>.md — so metadata files are read for at most `limit`
 * documents, concurrently.
 *
 * @param entityClass - Entity class key (e.g. "person").
 * @param entityId - Entity id.
 * @param limit - Maximum number of document groups to return (default 100).
 * @returns Ranked groups; empty when the entity page cannot be loaded.
 */
export async function getEntityMentionsByDoc(
  entityClass: string,
  entityId: string,
  limit = 100,
): Promise<EntityMentionGroup[]> {
  const fm = await readEntityYaml(entityClass, entityId);
  if (!fm) return [];

  // Each ref looks like "[[doc-id/p007]]", optionally carrying a display
  // alias ("[[doc-id/p007|label]]"). Capture only the link target; the alias,
  // if present, is consumed and discarded.
  const wikilink = /\[\[([^\]|]+)(?:\|[^\]]*)?\]\]/;
  const byDoc = new Map<string, { structured: Set<number>; text: Set<number> }>();
  const addRef = (ref: string, source: "structured" | "text"): void => {
    const m = wikilink.exec(ref);
    // Refs without wikilink delimiters are used as-is.
    const target = (m ? m[1] : ref).trim();
    const [docId, pageStr] = target.split("/", 2);
    if (!docId) return;
    let sets = byDoc.get(docId);
    if (!sets) {
      sets = { structured: new Set(), text: new Set() };
      byDoc.set(docId, sets);
    }
    // "p007" -> 7. A ref with no parsable page still registers the document
    // but contributes no page.
    const pageNum = pageStr ? parseInt(pageStr.replace(/^p/, ""), 10) : NaN;
    if (Number.isFinite(pageNum)) sets[source].add(pageNum);
  };
  for (const r of arr(fm.mentioned_in)) addRef(r, "structured");
  for (const r of arr(fm.text_mentioned_in)) addRef(r, "text");

  // Ranking depends only on the page sets, so rank and truncate BEFORE
  // touching the filesystem.
  const ranked = Array.from(byDoc, ([docId, sets]) => ({
    docId,
    pages: Array.from(new Set<number>([...sets.structured, ...sets.text])).sort(
      (a, b) => a - b,
    ),
    textOnly: sets.structured.size === 0 && sets.text.size > 0,
  }));
  ranked.sort((a, b) => b.pages.length - a.pages.length);

  // Hydrate the surviving docs' metadata from wiki/documents/<doc-id>.md.
  return Promise.all(
    ranked.slice(0, limit).map(async ({ docId, pages, textOnly }): Promise<EntityMentionGroup> => {
      let canonical_title: string | null = null;
      let collection: string | null = null;
      let page_count: number | null = null;
      let classification: string | null = null;
      try {
        const docRaw = await fs.readFile(
          path.join(WIKI, "documents", `${docId}.md`),
          "utf-8",
        );
        const dfm = matter(docRaw).data as Record<string, unknown>;
        canonical_title = strOrNull(dfm.canonical_title);
        collection = strOrNull(dfm.collection);
        page_count = num(dfm.page_count, 0) || null;
        classification = strOrNull(dfm.highest_classification);
      } catch {
        /* doc missing — metadata stays null, caller falls back to the raw id */
      }
      return {
        doc_id: docId,
        canonical_title,
        collection,
        page_count,
        classification,
        mention_count: pages.length,
        pages,
        text_only: textOnly,
      };
    }),
  );
}

/**
 * Row shape returned by getEntityChunks. Field names mirror the columns
 * selected from public.chunks.
 */
export interface EntityChunkPreview {
  chunk_pk: number;
  doc_id: string;
  chunk_id: string;
  page: number;
  type: string;
  // NOTE(review): assumes the DB driver hands back c.bbox already parsed
  // into this object shape — confirm against the column type.
  bbox: { x: number; y: number; w: number; h: number } | null;
  classification: string | null;
  // Presumably the chunk text in Portuguese / English — confirm.
  content_pt: string | null;
  content_en: string | null;
  // Rows with ufo_anomaly = true are ordered first by getEntityChunks.
  ufo_anomaly: boolean | null;
  ufo_anomaly_type: string | null;
}

/**
 * Fetch preview rows for the chunks linked to an entity through
 * public.entity_mentions.
 *
 * Chunk text is served from the DB rather than re-parsed from chunk files at
 * request time. Anomaly-flagged chunks come first, then rows follow document
 * and global chunk order.
 *
 * @param entityPk - DB primary key of the entity; null/undefined means the
 *   entity is not indexed in the DB.
 * @param limit - Maximum number of rows (default 30).
 * @returns Matching rows, or [] without touching the DB when `entityPk` is
 *   null/undefined.
 */
export async function getEntityChunks(
  entityPk: number | null,
  limit = 30,
): Promise<EntityChunkPreview[]> {
  if (entityPk === null || entityPk === undefined) {
    return [];
  }

  const sql = `SELECT
       c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox, c.classification,
       c.content_pt, c.content_en, c.ufo_anomaly, c.ufo_anomaly_type
     FROM public.entity_mentions em
     JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
     WHERE em.entity_pk = $1
     ORDER BY c.ufo_anomaly DESC NULLS LAST, c.doc_id, c.order_global
     LIMIT $2`;
  const params = [entityPk, limit];

  return pgQuery<EntityChunkPreview>(sql, params);
}

// Backwards-compat re-export: findEntity is implemented in ./graph, but stays
// exposed from this module for callers that imported it from the old path.
export { findEntity } from "./graph";