disclosure-bureau/web/lib/retrieval/graph.ts
Luiz Gustavo a7e9dce6d2 rebuild entity layer from Sonnet-vision reextract pipeline
Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity
JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page
extraction. Add synthesize scripts to regenerate wiki/entities from the 116
_reextract.json (30), aggregate missing page.md from chunks (31), and reprocess
805 pages the doc-rebuilder agent dropped on context overflow (32). Add
maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and
typed relation extraction.

Web: wire relations API + entity-relations component; entity/timeline/doc
pages consume the rebuilt layer.

Note: raw/, processing/, wiki/ remain gitignored (bulk data managed
separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on
disk only. The 27 curated anchor events under wiki/entities/events/ are
preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00

205 lines
6.8 KiB
TypeScript

/**
* Entity graph traversal — relacionamentos entre entidades, documentos e chunks.
*
* Construído a partir de:
* - `public.entity_mentions` (chunk ↔ entity, materializado por lint)
* - `public.entities` (com aliases + embedding)
* - `public.chunks` (com doc_id + page)
*
* Não usa graph DB — Postgres recursive CTEs + JOINs resolvem multi-hop até depth 4.
*/
import { pgQuery } from "./db";
/**
 * Entity row in the shape selected from `public.entities` by the queries in
 * this module.
 */
export interface EntityNode {
  /** Key that `entity_mentions.entity_pk` is joined against. */
  entity_pk: number;
  /** Entity class; lookups and neighbor queries filter on it. */
  entity_class: string;
  /** Identifier matched together with `entity_class` when looking an entity up. */
  entity_id: string;
  /** Canonical name of the entity. */
  canonical_name: string;
  /** Mention counter; the graph seed ranks entities by it. */
  total_mentions: number;
  /** Document counter (presumably distinct documents mentioning the entity — confirm against the schema). */
  documents_count: number;
}
/** Weighted co-mention edge between two entities. */
export interface GraphEdge {
  /** `entity_pk` of the edge's first endpoint. */
  from_entity_pk: number;
  /** `entity_pk` of the edge's second endpoint. */
  to_entity_pk: number;
  /** Count of co-mentions. */
  weight: number;
  /** Sample of chunk_pks where the two entities co-occur. */
  via_chunks: number[];
}
/**
 * Find a single entity of the given class by id, canonical name or alias.
 *
 * A row matches when `entity_id` equals the argument, when `canonical_name`
 * matches it with `ILIKE` (case-insensitive; `%` and `_` in the argument act
 * as SQL wildcards), or when the argument is one of the row's `aliases`.
 *
 * When several rows match, the result is chosen deterministically: an exact
 * `entity_id` match wins, then a canonical-name match, then the row with the
 * most mentions, and finally the lowest `entity_pk`.
 *
 * @param entityClass - Value compared against `entity_class`.
 * @param entityIdOrName - Entity id, canonical name or alias to look up.
 * @returns The best matching entity, or `null` when nothing matches.
 */
export async function findEntity(
  entityClass: string,
  entityIdOrName: string,
): Promise<EntityNode | null> {
  const rows = await pgQuery<EntityNode>(
    `SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count
       FROM public.entities
      WHERE entity_class = $1
        AND (entity_id = $2 OR canonical_name ILIKE $2 OR $2 = ANY(aliases))
      ORDER BY (entity_id = $2) DESC NULLS LAST,
               (canonical_name ILIKE $2) DESC NULLS LAST,
               total_mentions DESC NULLS LAST,
               entity_pk ASC
      LIMIT 1`,
    [entityClass, entityIdOrName],
  );
  // Without the ORDER BY above, LIMIT 1 would return an arbitrary match.
  return rows[0] ?? null;
}
/**
 * List the entities that are co-mentioned with the given entity, i.e. that
 * share at least one chunk with it in `public.entity_mentions`.
 *
 * Entities flagged `is_generic` are excluded. Results are sorted by edge
 * weight (number of shared mention rows), heaviest first, with `entity_pk`
 * as a tie-breaker so the order is stable.
 *
 * @param entityPk - `entity_pk` of the entity whose neighbors are wanted.
 * @param opts.limit - Maximum number of neighbors; defaults to 30 and is
 *   clamped to an integer in the range 0..100 (NaN falls back to 30).
 * @param opts.classes - When non-empty, only neighbors whose `entity_class`
 *   is in this list are returned.
 * @returns Neighbor entities with their `weight` and up to five
 *   `sample_chunks` (chunk_pks where both entities occur).
 */
export async function getNeighbors(
  entityPk: number,
  opts: { limit?: number; classes?: string[] } = {},
): Promise<Array<EntityNode & { weight: number; sample_chunks: number[] }>> {
  // A negative or fractional LIMIT is rejected by Postgres, so normalize to
  // a whole number within the supported range before binding it.
  const requested = opts.limit ?? 30;
  const limit = Number.isNaN(requested)
    ? 30
    : Math.min(Math.max(Math.trunc(requested), 0), 100);

  const params: unknown[] = [entityPk];
  let classFilter = "";
  if (opts.classes && opts.classes.length > 0) {
    params.push(opts.classes);
    classFilter = `AND e.entity_class = ANY($${params.length}::text[])`;
  }
  params.push(limit);

  // COUNT(*) is BIGINT in Postgres; the ::INT cast mirrors the links query in
  // getGraphSeed so `weight` matches its declared `number` type.
  return pgQuery(
    `WITH coloc AS (
       SELECT em2.entity_pk AS other_pk,
              COUNT(*)::INT AS weight,
              (array_agg(em1.chunk_pk))[1:5] AS sample_chunks
         FROM public.entity_mentions em1
         JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk
        WHERE em1.entity_pk = $1 AND em2.entity_pk <> $1
        GROUP BY em2.entity_pk
     )
     SELECT e.entity_pk, e.entity_class, e.entity_id, e.canonical_name,
            e.total_mentions, e.documents_count, c.weight, c.sample_chunks
       FROM coloc c
       JOIN public.entities e ON e.entity_pk = c.other_pk
      WHERE NOT e.is_generic ${classFilter}
      ORDER BY c.weight DESC, e.entity_pk ASC
      LIMIT $${params.length}`,
    params,
  );
}
/**
 * Find paths between two entities through the co-mention graph, where two
 * entities are adjacent when they share a chunk in `public.entity_mentions`.
 *
 * Each returned `path` is a list of entity_pks that starts at `fromPk`, ends
 * at `toPk` and never visits the same entity twice; `hops` is the number of
 * edges in it. At most 10 distinct paths are returned, shortest first.
 *
 * @param fromPk - `entity_pk` the paths start from.
 * @param toPk - `entity_pk` the paths must end at.
 * @param maxHops - Maximum number of edges per path. Must be an integer in
 *   1..4; any other value (including NaN or a fraction) falls back to 3.
 * @returns The paths found, ordered by `hops` and then by `path`.
 */
export async function findPaths(
  fromPk: number,
  toPk: number,
  maxHops: number = 3,
): Promise<Array<{ path: number[]; hops: number }>> {
  const hopLimit =
    Number.isInteger(maxHops) && maxHops >= 1 && maxHops <= 4 ? maxHops : 3;

  // Recursive CTE over the entity_mentions co-occurrence graph.
  // - UNION (not UNION ALL): the join yields one row per shared chunk, so the
  //   same path would otherwise be repeated and could fill the LIMIT with
  //   copies of itself.
  // - Paths that already reached the target are not expanded further: they
  //   can never end at the target again because entities are not revisited.
  return pgQuery(
    `WITH RECURSIVE paths AS (
       SELECT ARRAY[$1::BIGINT, em2.entity_pk] AS path, 1 AS hops
         FROM public.entity_mentions em1
         JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk
        WHERE em1.entity_pk = $1 AND em2.entity_pk <> $1
       UNION
       SELECT p.path || em2.entity_pk, p.hops + 1
         FROM paths p
         JOIN public.entity_mentions em1 ON em1.entity_pk = p.path[array_length(p.path, 1)]
         JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk
        WHERE em2.entity_pk <> ALL(p.path)
          AND p.path[array_length(p.path, 1)] <> $2
          AND p.hops < $3
     )
     SELECT path, hops
       FROM paths
      WHERE path[array_length(path, 1)] = $2
      ORDER BY hops ASC, path ASC
      LIMIT 10`,
    [fromPk, toPk, hopLimit],
  );
}
/**
 * Build the seed for the force-directed graph view: the top-N entities by
 * mention count plus the co-mention edges among them.
 *
 * Node selection:
 * - drops noise: canonical names shorter than 4 characters after trimming,
 *   names of 1-3 capital letters, names made only of digits and `.()-`, and
 *   entities flagged `is_generic`;
 * - deduplicates by `entity_class` + lower-cased trimmed `canonical_name`,
 *   keeping the row with the most mentions (lowest `entity_pk` on a tie);
 * - adds `entity_class_short`, the first three characters of `entity_class`.
 *
 * Links are undirected pairs (`source` pk < `target` pk) whose endpoints are
 * both selected nodes, limited to the 2000 heaviest.
 *
 * @param opts.limit - Number of nodes; defaults to 40 and is clamped to an
 *   integer in 0..300 (NaN falls back to 40).
 * @param opts.classes - When non-empty, restricts nodes to these classes.
 * @param opts.minWeight - Minimum co-mention count for a link; defaults to 3.
 *   Fractions are rounded up; non-finite values fall back to 3.
 * @returns The selected nodes and the links between them; both empty when no
 *   entity qualifies.
 */
export async function getGraphSeed(opts: {
  limit?: number;
  classes?: string[];
  minWeight?: number;
} = {}): Promise<{
  nodes: Array<EntityNode & { entity_class_short: string }>;
  links: Array<{ source: number; target: number; weight: number }>;
}> {
  // Normalize numeric options so they bind cleanly as integer parameters:
  // Postgres rejects a negative LIMIT and non-integer text for integer types.
  const requestedLimit = opts.limit ?? 40;
  const limit = Number.isNaN(requestedLimit)
    ? 40
    : Math.min(Math.max(Math.trunc(requestedLimit), 0), 300);
  const requestedWeight = opts.minWeight ?? 3;
  // COUNT >= 2.2 is equivalent to COUNT >= 3, hence the ceil.
  const minWeight = Number.isFinite(requestedWeight) ? Math.ceil(requestedWeight) : 3;

  const params: unknown[] = [limit];
  let classFilter = "";
  if (opts.classes && opts.classes.length > 0) {
    params.push(opts.classes);
    classFilter = `AND entity_class = ANY($${params.length}::text[])`;
  }

  const nodes = await pgQuery<EntityNode & { entity_class_short: string }>(
    `WITH ranked AS (
       SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count,
              LEFT(entity_class, 3) AS entity_class_short,
              ROW_NUMBER() OVER (
                PARTITION BY entity_class, LOWER(TRIM(canonical_name))
                ORDER BY total_mentions DESC NULLS LAST, entity_pk ASC
              ) AS rn
         FROM public.entities
        WHERE LENGTH(TRIM(canonical_name)) >= 4
          AND canonical_name !~ '^[A-Z]{1,3}$'
          AND canonical_name !~ '^[0-9.()-]+$'
          AND NOT is_generic
          ${classFilter}
     )
     SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count, entity_class_short
       FROM ranked
      WHERE rn = 1
      ORDER BY total_mentions DESC NULLS LAST, entity_pk ASC
      LIMIT $1`,
    params,
  );
  if (nodes.length === 0) return { nodes: [], links: [] };

  const pks = nodes.map((n) => n.entity_pk);
  // Edges where BOTH endpoints are in the top-N set; the `<` comparison keeps
  // one row per unordered pair.
  const links = await pgQuery<{ source: number; target: number; weight: number }>(
    `SELECT em1.entity_pk AS source, em2.entity_pk AS target, COUNT(*)::INT AS weight
       FROM public.entity_mentions em1
       JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk
      WHERE em1.entity_pk = ANY($1::bigint[])
        AND em2.entity_pk = ANY($1::bigint[])
        AND em1.entity_pk < em2.entity_pk
      GROUP BY em1.entity_pk, em2.entity_pk
     HAVING COUNT(*) >= $2
      ORDER BY weight DESC, em1.entity_pk ASC, em2.entity_pk ASC
      LIMIT 2000`,
    [pks, minWeight],
  );
  return { nodes, links };
}
/**
 * Fetch the chunks in which both entities are mentioned.
 *
 * A chunk qualifies when `public.entity_mentions` has a row for it with each
 * of the two entity_pks. Rows come back ordered by `doc_id`, then
 * `order_global`.
 *
 * @param entityA - `entity_pk` of the first entity.
 * @param entityB - `entity_pk` of the second entity.
 * @param limit - Maximum number of chunks to return (default 20); passed to
 *   the query unchanged.
 * @returns The matching chunk rows.
 */
export async function getCoMentionChunks(
  entityA: number,
  entityB: number,
  limit: number = 20,
): Promise<Array<{
  chunk_pk: number;
  doc_id: string;
  chunk_id: string;
  page: number;
  content_pt: string | null;
  content_en: string | null;
}>> {
  // Assembled line by line so the statement text stays exactly as before.
  const statement = [
    "SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.content_pt, c.content_en",
    "FROM public.chunks c",
    "WHERE c.chunk_pk IN (",
    "SELECT em.chunk_pk",
    "FROM public.entity_mentions em",
    "WHERE em.entity_pk = $1",
    "AND em.chunk_pk IN (",
    "SELECT chunk_pk FROM public.entity_mentions WHERE entity_pk = $2",
    ")",
    ")",
    "ORDER BY c.doc_id, c.order_global",
    "LIMIT $3",
  ].join("\n");
  const bindings = [entityA, entityB, limit];
  return pgQuery(statement, bindings);
}