/** * Entity graph traversal — relacionamentos entre entidades, documentos e chunks. * * Construído a partir de: * - `public.entity_mentions` (chunk ↔ entity, materializado por lint) * - `public.entities` (com aliases + embedding) * - `public.chunks` (com doc_id + page) * * Não usa graph DB — Postgres recursive CTEs + JOINs resolvem multi-hop até depth 4. */ import { pgQuery } from "./db"; export interface EntityNode { entity_pk: number; entity_class: string; entity_id: string; canonical_name: string; total_mentions: number; documents_count: number; } export interface GraphEdge { from_entity_pk: number; to_entity_pk: number; weight: number; // count of co-mentions via_chunks: number[]; // sample of chunk_pks where they co-occur } /** Find an entity by class+id or by canonical_name match. */ export async function findEntity( entityClass: string, entityIdOrName: string, ): Promise { const rows = await pgQuery( `SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count FROM public.entities WHERE entity_class = $1 AND (entity_id = $2 OR canonical_name ILIKE $2 OR $2 = ANY(aliases)) LIMIT 1`, [entityClass, entityIdOrName], ); return rows[0] ?? null; } /** All entities co-mentioned with the given entity. Returns up to `limit` neighbors sorted by edge weight. */ export async function getNeighbors( entityPk: number, opts: { limit?: number; classes?: string[] } = {}, ): Promise> { const limit = Math.min(opts.limit ?? 30, 100); const params: unknown[] = [entityPk]; let classFilter = ""; if (opts.classes && opts.classes.length > 0) { params.push(opts.classes); classFilter = `AND e.entity_class = ANY($${params.length}::text[])`; } params.push(limit); return pgQuery( `WITH coloc AS ( SELECT em2.entity_pk AS other_pk, COUNT(*) AS weight, (array_agg(em1.chunk_pk))[1:5] AS sample_chunks FROM public.entity_mentions em1 JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk WHERE em1.entity_pk = $1 AND em2.entity_pk <> $1 GROUP BY em2.entity_pk ) SELECT e.entity_pk, e.entity_class, e.entity_id, e.canonical_name, e.total_mentions, e.documents_count, c.weight, c.sample_chunks FROM coloc c JOIN public.entities e ON e.entity_pk = c.other_pk WHERE NOT e.is_generic ${classFilter} ORDER BY c.weight DESC LIMIT $${params.length}`, params, ); } /** Paths between two entities via shared chunks, up to `maxHops` hops. */ export async function findPaths( fromPk: number, toPk: number, maxHops: number = 3, ): Promise> { if (maxHops < 1 || maxHops > 4) maxHops = 3; // Recursive CTE — explore through entity_mentions co-occurrence graph return pgQuery( `WITH RECURSIVE paths AS ( SELECT ARRAY[$1::BIGINT, em2.entity_pk] AS path, 1 AS hops FROM public.entity_mentions em1 JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk WHERE em1.entity_pk = $1 AND em2.entity_pk <> $1 UNION ALL SELECT path || em2.entity_pk, hops + 1 FROM paths p JOIN public.entity_mentions em1 ON em1.entity_pk = p.path[array_length(p.path, 1)] JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk WHERE em2.entity_pk <> ALL(p.path) AND p.hops < $3 ) SELECT path, hops FROM paths WHERE path[array_length(path, 1)] = $2 ORDER BY hops ASC, path ASC LIMIT 10`, [fromPk, toPk, maxHops], ); } /** Seed for the force-directed graph view — top-N entities + their internal edges. * Filters out noise (very short canonical names — OCR fragments, abbreviations). * Deduplicates by canonical_name + entity_class (keeps highest-mention version). */ export async function getGraphSeed(opts: { limit?: number; classes?: string[]; minWeight?: number; } = {}): Promise<{ nodes: Array; links: Array<{ source: number; target: number; weight: number }>; }> { const limit = Math.min(opts.limit ?? 40, 300); const minWeight = opts.minWeight ?? 3; const params: unknown[] = [limit]; let classFilter = ""; if (opts.classes && opts.classes.length > 0) { params.push(opts.classes); classFilter = `AND entity_class = ANY($${params.length}::text[])`; } const nodes = await pgQuery( `WITH ranked AS ( SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count, LEFT(entity_class, 3) AS entity_class_short, ROW_NUMBER() OVER ( PARTITION BY entity_class, LOWER(TRIM(canonical_name)) ORDER BY total_mentions DESC NULLS LAST ) AS rn FROM public.entities WHERE LENGTH(TRIM(canonical_name)) >= 4 AND canonical_name !~ '^[A-Z]{1,3}$' AND canonical_name !~ '^[0-9.()-]+$' AND NOT is_generic ${classFilter} ) SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count, entity_class_short FROM ranked WHERE rn = 1 ORDER BY total_mentions DESC NULLS LAST LIMIT $1`, params, ); if (nodes.length === 0) return { nodes: [], links: [] }; const pks = nodes.map((n) => n.entity_pk); // Edges where BOTH endpoints are in the top-N set const links = await pgQuery<{ source: number; target: number; weight: number }>( `SELECT em1.entity_pk AS source, em2.entity_pk AS target, COUNT(*)::INT AS weight FROM public.entity_mentions em1 JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk WHERE em1.entity_pk = ANY($1::bigint[]) AND em2.entity_pk = ANY($1::bigint[]) AND em1.entity_pk < em2.entity_pk GROUP BY em1.entity_pk, em2.entity_pk HAVING COUNT(*) >= $2 ORDER BY weight DESC LIMIT 2000`, [pks, minWeight], ); return { nodes, links }; } /** Chunks where two entities co-occur. */ export async function getCoMentionChunks( entityA: number, entityB: number, limit: number = 20, ): Promise> { return pgQuery( `SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.content_pt, c.content_en FROM public.chunks c WHERE c.chunk_pk IN ( SELECT em.chunk_pk FROM public.entity_mentions em WHERE em.entity_pk = $1 AND em.chunk_pk IN ( SELECT chunk_pk FROM public.entity_mentions WHERE entity_pk = $2 ) ) ORDER BY c.doc_id, c.order_global LIMIT $3`, [entityA, entityB, limit], ); }