disclosure-bureau/web/lib/retrieval/graph.ts

/**
 * Entity graph traversal — relacionamentos entre entidades, documentos e chunks.
 *
 * Construído a partir de:
 *   - `public.entity_mentions` (chunk ↔ entity, materializado por lint)
 *   - `public.entities` (com aliases + embedding)
 *   - `public.chunks` (com doc_id + page)
 *
 * Não usa graph DB — Postgres recursive CTEs + JOINs resolvem multi-hop até depth 4.
 */
import { pgQuery } from "./db";

export interface EntityNode {
  entity_pk: number;
  entity_class: string;
  entity_id: string;
  canonical_name: string;
  total_mentions: number;
  documents_count: number;
}

export interface GraphEdge {
  from_entity_pk: number;
  to_entity_pk: number;
  weight: number; // count of co-mentions
  via_chunks: number[]; // sample of chunk_pks where they co-occur
}

/** Find an entity by class+id or by canonical_name match. */
export async function findEntity(
  entityClass: string,
  entityIdOrName: string,
): Promise<EntityNode | null> {
  const rows = await pgQuery<EntityNode>(
    `SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count
     FROM public.entities
     WHERE entity_class = $1
       AND (entity_id = $2 OR canonical_name ILIKE $2 OR $2 = ANY(aliases))
     LIMIT 1`,
    [entityClass, entityIdOrName],
  );
  return rows[0] ?? null;
}

/** All entities co-mentioned with the given entity. Returns up to `limit` neighbors sorted by edge weight. */
export async function getNeighbors(
  entityPk: number,
  opts: { limit?: number; classes?: string[] } = {},
): Promise<Array<EntityNode & { weight: number; sample_chunks: number[] }>> {
  const limit = Math.min(opts.limit ?? 30, 100);
  const params: unknown[] = [entityPk];
  let classFilter = "";
  if (opts.classes && opts.classes.length > 0) {
    params.push(opts.classes);
    classFilter = `AND e.entity_class = ANY($${params.length}::text[])`;
  }
  params.push(limit);

  return pgQuery(
    `WITH coloc AS (
       SELECT em2.entity_pk AS other_pk,
              COUNT(*) AS weight,
              (array_agg(em1.chunk_pk))[1:5] AS sample_chunks
       FROM public.entity_mentions em1
       JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk
       WHERE em1.entity_pk = $1 AND em2.entity_pk <> $1
       GROUP BY em2.entity_pk
     )
     SELECT e.entity_pk, e.entity_class, e.entity_id, e.canonical_name,
            e.total_mentions, e.documents_count, c.weight, c.sample_chunks
     FROM coloc c
     JOIN public.entities e ON e.entity_pk = c.other_pk
     WHERE NOT e.is_generic ${classFilter}
     ORDER BY c.weight DESC
     LIMIT $${params.length}`,
    params,
  );
}

/** Paths between two entities via shared chunks, up to `maxHops` hops. */
export async function findPaths(
  fromPk: number,
  toPk: number,
  maxHops: number = 3,
): Promise<Array<{ path: number[]; hops: number }>> {
  if (maxHops < 1 || maxHops > 4) maxHops = 3;
  // Recursive CTE — explore through entity_mentions co-occurrence graph
  return pgQuery(
    `WITH RECURSIVE paths AS (
       SELECT ARRAY[$1::BIGINT, em2.entity_pk] AS path, 1 AS hops
       FROM public.entity_mentions em1
       JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk
       WHERE em1.entity_pk = $1 AND em2.entity_pk <> $1

       UNION ALL

       SELECT path || em2.entity_pk, hops + 1
       FROM paths p
       JOIN public.entity_mentions em1 ON em1.entity_pk = p.path[array_length(p.path, 1)]
       JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk
       WHERE em2.entity_pk <> ALL(p.path)
         AND p.hops < $3
     )
     SELECT path, hops
     FROM paths
     WHERE path[array_length(path, 1)] = $2
     ORDER BY hops ASC, path ASC
     LIMIT 10`,
    [fromPk, toPk, maxHops],
  );
}

/** Seed for the force-directed graph view — top-N entities + their internal edges.
 *  Filters out noise (very short canonical names — OCR fragments, abbreviations).
 *  Deduplicates by canonical_name + entity_class (keeps highest-mention version). */
export async function getGraphSeed(opts: {
  limit?: number;
  classes?: string[];
  minWeight?: number;
} = {}): Promise<{
  nodes: Array<EntityNode & { entity_class_short: string }>;
  links: Array<{ source: number; target: number; weight: number }>;
}> {
  const limit = Math.min(opts.limit ?? 40, 300);
  const minWeight = opts.minWeight ?? 3;
  const params: unknown[] = [limit];
  let classFilter = "";
  if (opts.classes && opts.classes.length > 0) {
    params.push(opts.classes);
    classFilter = `AND entity_class = ANY($${params.length}::text[])`;
  }

  const nodes = await pgQuery<EntityNode & { entity_class_short: string }>(
    `WITH ranked AS (
       SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count,
              LEFT(entity_class, 3) AS entity_class_short,
              ROW_NUMBER() OVER (
                PARTITION BY entity_class, LOWER(TRIM(canonical_name))
                ORDER BY total_mentions DESC NULLS LAST
              ) AS rn
       FROM public.entities
       WHERE LENGTH(TRIM(canonical_name)) >= 4
         AND canonical_name !~ '^[A-Z]{1,3}$'
         AND canonical_name !~ '^[0-9.()-]+$'
         AND NOT is_generic
         ${classFilter}
     )
     SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count, entity_class_short
     FROM ranked
     WHERE rn = 1
     ORDER BY total_mentions DESC NULLS LAST
     LIMIT $1`,
    params,
  );

  if (nodes.length === 0) return { nodes: [], links: [] };

  const pks = nodes.map((n) => n.entity_pk);
  // Edges where BOTH endpoints are in the top-N set
  const links = await pgQuery<{ source: number; target: number; weight: number }>(
    `SELECT em1.entity_pk AS source, em2.entity_pk AS target, COUNT(*)::INT AS weight
     FROM public.entity_mentions em1
     JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk
     WHERE em1.entity_pk = ANY($1::bigint[])
       AND em2.entity_pk = ANY($1::bigint[])
       AND em1.entity_pk < em2.entity_pk
     GROUP BY em1.entity_pk, em2.entity_pk
     HAVING COUNT(*) >= $2
     ORDER BY weight DESC
     LIMIT 2000`,
    [pks, minWeight],
  );

  return { nodes, links };
}

/** Chunks where two entities co-occur. */
export async function getCoMentionChunks(
  entityA: number,
  entityB: number,
  limit: number = 20,
): Promise<Array<{
  chunk_pk: number;
  doc_id: string;
  chunk_id: string;
  page: number;
  content_pt: string | null;
  content_en: string | null;
}>> {
  return pgQuery(
    `SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.content_pt, c.content_en
     FROM public.chunks c
     WHERE c.chunk_pk IN (
       SELECT em.chunk_pk
       FROM public.entity_mentions em
       WHERE em.entity_pk = $1
         AND em.chunk_pk IN (
           SELECT chunk_pk FROM public.entity_mentions WHERE entity_pk = $2
         )
     )
     ORDER BY c.doc_id, c.order_global
     LIMIT $3`,
    [entityA, entityB, limit],
  );
}