Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
205 lines
6.8 KiB
TypeScript
205 lines
6.8 KiB
TypeScript
/**
 * Entity graph traversal — relationships between entities, documents and chunks.
 *
 * Built from:
 *   - `public.entity_mentions` (chunk ↔ entity, materialized by lint)
 *   - `public.entities` (with aliases + embedding)
 *   - `public.chunks` (with doc_id + page)
 *
 * Does not use a graph DB — Postgres recursive CTEs + JOINs handle multi-hop
 * traversal up to depth 4.
 */
|
|
import { pgQuery } from "./db";
|
|
|
|
/** One row of `public.entities`, restricted to the columns the graph queries select. */
export interface EntityNode {
  /** Primary key; the value `public.entity_mentions.entity_pk` is joined against. */
  entity_pk: number;
  /** Entity class, used as a filter by the lookup and neighbor queries. */
  entity_class: string;
  /** Identifier of the entity within its class. */
  entity_id: string;
  /** Display name; also the key used when de-duplicating graph seed nodes. */
  canonical_name: string;
  /** Mention counter used to rank entities for the graph seed. */
  total_mentions: number;
  /** Presumably the number of distinct documents mentioning the entity — confirm against the schema. */
  documents_count: number;
}
|
|
|
|
/** Undirected co-mention edge between two entities. */
export interface GraphEdge {
  /** `entity_pk` of one endpoint. */
  from_entity_pk: number;
  /** `entity_pk` of the other endpoint. */
  to_entity_pk: number;
  /** Count of co-mentions. */
  weight: number;
  /** Sample of `chunk_pk` values where the two entities co-occur. */
  via_chunks: number[];
}
|
|
|
|
/**
 * Find an entity by class + id, or by a canonical-name / alias match.
 *
 * A row matches when, within `entityClass`, its `entity_id` equals the input,
 * its `canonical_name` matches the input under `ILIKE`, or the input is one of
 * its `aliases`. Because the name comparison uses `ILIKE`, any `%` or `_` in
 * the input acts as a wildcard.
 *
 * When several rows match, the exact `entity_id` match wins, then the row with
 * the most mentions; `entity_pk` is the final tie-breaker so the result is
 * deterministic.
 *
 * @param entityClass - Value compared against `entities.entity_class`.
 * @param entityIdOrName - Entity id, canonical name (pattern) or alias.
 * @returns The best-matching entity, or `null` when nothing matches.
 */
export async function findEntity(
  entityClass: string,
  entityIdOrName: string,
): Promise<EntityNode | null> {
  const rows = await pgQuery<EntityNode>(
    `SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count
       FROM public.entities
      WHERE entity_class = $1
        AND (entity_id = $2 OR canonical_name ILIKE $2 OR $2 = ANY(aliases))
      ORDER BY (entity_id = $2) DESC, total_mentions DESC NULLS LAST, entity_pk
      LIMIT 1`,
    [entityClass, entityIdOrName],
  );
  return rows[0] ?? null;
}
|
|
|
|
/**
 * All entities co-mentioned with the given entity, strongest edges first.
 *
 * Two entities are neighbors when they share at least one row pair in
 * `public.entity_mentions` with the same `chunk_pk`. Entities flagged
 * `is_generic` are excluded.
 *
 * @param entityPk - `entity_pk` of the entity whose neighbors are requested.
 * @param opts.limit - Maximum neighbors to return. Defaults to 30, is truncated
 *   to an integer and clamped to 0–100; non-finite values fall back to 30.
 * @param opts.classes - When non-empty, only neighbors whose `entity_class` is
 *   in this list are returned.
 * @returns Neighbor entities with `weight` (number of co-mention rows) and
 *   `sample_chunks` (up to five `chunk_pk` values where they co-occur).
 */
export async function getNeighbors(
  entityPk: number,
  opts: { limit?: number; classes?: string[] } = {},
): Promise<Array<EntityNode & { weight: number; sample_chunks: number[] }>> {
  // Guard the LIMIT parameter: Postgres rejects negative or non-integer values.
  const requestedLimit = opts.limit ?? 30;
  const limit = Number.isFinite(requestedLimit)
    ? Math.min(Math.max(Math.trunc(requestedLimit), 0), 100)
    : 30;

  const params: unknown[] = [entityPk];
  let classFilter = "";
  if (opts.classes && opts.classes.length > 0) {
    params.push(opts.classes);
    classFilter = `AND e.entity_class = ANY($${params.length}::text[])`;
  }
  params.push(limit);

  // COUNT(*) is cast to INT (as in the graph-seed edge query) so `weight`
  // arrives as a JS number rather than a bigint-as-string.
  return pgQuery(
    `WITH coloc AS (
       SELECT em2.entity_pk AS other_pk,
              COUNT(*)::INT AS weight,
              (array_agg(em1.chunk_pk))[1:5] AS sample_chunks
         FROM public.entity_mentions em1
         JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk
        WHERE em1.entity_pk = $1 AND em2.entity_pk <> $1
        GROUP BY em2.entity_pk
     )
     SELECT e.entity_pk, e.entity_class, e.entity_id, e.canonical_name,
            e.total_mentions, e.documents_count, c.weight, c.sample_chunks
       FROM coloc c
       JOIN public.entities e ON e.entity_pk = c.other_pk
      WHERE NOT e.is_generic ${classFilter}
      ORDER BY c.weight DESC, e.entity_pk
      LIMIT $${params.length}`,
    params,
  );
}
|
|
|
|
/**
 * Paths between two entities via shared chunks, up to `maxHops` hops.
 *
 * The search walks the co-occurrence graph implied by
 * `public.entity_mentions` (two entities are adjacent when they are mentioned
 * in the same chunk) with a recursive CTE. A path never visits the same
 * entity twice.
 *
 * @param fromPk - `entity_pk` where every path starts.
 * @param toPk - `entity_pk` where every returned path ends.
 * @param maxHops - Maximum number of edges per path. Must be an integer in
 *   1–4; any other value (including NaN and fractions) falls back to 3.
 * @returns Up to 10 distinct paths, shortest first; each `path` is the list
 *   of `entity_pk` values from `fromPk` to `toPk`.
 */
export async function findPaths(
  fromPk: number,
  toPk: number,
  maxHops: number = 3,
): Promise<Array<{ path: number[]; hops: number }>> {
  const hopLimit =
    Number.isInteger(maxHops) && maxHops >= 1 && maxHops <= 4 ? maxHops : 3;

  // Recursive CTE over the entity_mentions co-occurrence graph.
  //  - UNION (not UNION ALL): the joins yield one row per shared chunk, so
  //    the same path would otherwise be repeated and could fill LIMIT 10
  //    with copies of a single path.
  //  - A path that has already reached the target is not expanded further;
  //    since entities cannot repeat, its extensions could never end at the
  //    target again.
  return pgQuery(
    `WITH RECURSIVE paths AS (
       SELECT ARRAY[$1::BIGINT, em2.entity_pk] AS path, 1 AS hops
         FROM public.entity_mentions em1
         JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk
        WHERE em1.entity_pk = $1 AND em2.entity_pk <> $1

       UNION

       SELECT p.path || em2.entity_pk, p.hops + 1
         FROM paths p
         JOIN public.entity_mentions em1 ON em1.entity_pk = p.path[array_length(p.path, 1)]
         JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk
        WHERE em2.entity_pk <> ALL(p.path)
          AND p.path[array_length(p.path, 1)] <> $2
          AND p.hops < $3
     )
     SELECT path, hops
       FROM paths
      WHERE path[array_length(path, 1)] = $2
      ORDER BY hops ASC, path ASC
      LIMIT 10`,
    [fromPk, toPk, hopLimit],
  );
}
|
|
|
|
/**
 * Seed for the force-directed graph view — top-N entities plus the edges
 * between them.
 *
 * Node selection filters out noise (canonical names shorter than four
 * characters, all-caps abbreviations, purely numeric/punctuation names and
 * entities flagged `is_generic`) and de-duplicates by
 * `entity_class` + case-insensitive trimmed `canonical_name`, keeping the
 * row with the most mentions.
 *
 * @param opts.limit - Maximum number of nodes. Defaults to 40, is truncated to
 *   an integer and clamped to 0–300; non-finite values fall back to 40.
 * @param opts.classes - When non-empty, only entities of these classes are
 *   considered.
 * @param opts.minWeight - Minimum co-mention count for an edge to be kept.
 *   Defaults to 3; rounded up to an integer of at least 1; non-finite values
 *   fall back to 3.
 * @returns `nodes` ranked by `total_mentions`, and up to 2000 `links` whose
 *   endpoints are both in `nodes` (each pair reported once, with
 *   `source < target`).
 */
export async function getGraphSeed(opts: {
  limit?: number;
  classes?: string[];
  minWeight?: number;
} = {}): Promise<{
  nodes: Array<EntityNode & { entity_class_short: string }>;
  links: Array<{ source: number; target: number; weight: number }>;
}> {
  // Both values are bound as integer SQL parameters, so normalise them here
  // instead of letting Postgres reject a negative, fractional or NaN value.
  const requestedLimit = opts.limit ?? 40;
  const limit = Number.isFinite(requestedLimit)
    ? Math.min(Math.max(Math.trunc(requestedLimit), 0), 300)
    : 40;
  const requestedMinWeight = opts.minWeight ?? 3;
  const minWeight = Number.isFinite(requestedMinWeight)
    ? Math.max(Math.ceil(requestedMinWeight), 1)
    : 3;

  const params: unknown[] = [limit];
  let classFilter = "";
  if (opts.classes && opts.classes.length > 0) {
    params.push(opts.classes);
    classFilter = `AND entity_class = ANY($${params.length}::text[])`;
  }

  const nodes = await pgQuery<EntityNode & { entity_class_short: string }>(
    `WITH ranked AS (
       SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count,
              LEFT(entity_class, 3) AS entity_class_short,
              ROW_NUMBER() OVER (
                PARTITION BY entity_class, LOWER(TRIM(canonical_name))
                ORDER BY total_mentions DESC NULLS LAST
              ) AS rn
         FROM public.entities
        WHERE LENGTH(TRIM(canonical_name)) >= 4
          AND canonical_name !~ '^[A-Z]{1,3}$'
          AND canonical_name !~ '^[0-9.()-]+$'
          AND NOT is_generic
          ${classFilter}
     )
     SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count, entity_class_short
       FROM ranked
      WHERE rn = 1
      ORDER BY total_mentions DESC NULLS LAST
      LIMIT $1`,
    params,
  );

  // No nodes means no possible edges; skip the second round trip.
  if (nodes.length === 0) return { nodes: [], links: [] };

  const pks = nodes.map((n) => n.entity_pk);
  // Edges where BOTH endpoints are in the top-N set. The `<` comparison
  // reports each unordered pair once and drops self-pairs.
  const links = await pgQuery<{ source: number; target: number; weight: number }>(
    `SELECT em1.entity_pk AS source, em2.entity_pk AS target, COUNT(*)::INT AS weight
       FROM public.entity_mentions em1
       JOIN public.entity_mentions em2 ON em1.chunk_pk = em2.chunk_pk
      WHERE em1.entity_pk = ANY($1::bigint[])
        AND em2.entity_pk = ANY($1::bigint[])
        AND em1.entity_pk < em2.entity_pk
      GROUP BY em1.entity_pk, em2.entity_pk
     HAVING COUNT(*) >= $2
      ORDER BY weight DESC
      LIMIT 2000`,
    [pks, minWeight],
  );

  return { nodes, links };
}
|
|
|
|
/**
 * Chunks where two entities co-occur.
 *
 * A chunk qualifies when `public.entity_mentions` has a row for it with
 * `entityA` and a row for it with `entityB`. Results are ordered by `doc_id`,
 * then `order_global`.
 *
 * @param entityA - `entity_pk` of the first entity.
 * @param entityB - `entity_pk` of the second entity.
 * @param limit - Maximum number of chunks. Defaults to 20; truncated to an
 *   integer and floored at 0; non-finite values fall back to 20.
 * @returns The matching rows from `public.chunks`.
 */
export async function getCoMentionChunks(
  entityA: number,
  entityB: number,
  limit: number = 20,
): Promise<Array<{
  chunk_pk: number;
  doc_id: string;
  chunk_id: string;
  page: number;
  content_pt: string | null;
  content_en: string | null;
}>> {
  // Postgres rejects a negative or non-integer LIMIT parameter, so sanitise it.
  const rowLimit = Number.isFinite(limit) ? Math.max(Math.trunc(limit), 0) : 20;

  return pgQuery(
    `SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.content_pt, c.content_en
       FROM public.chunks c
      WHERE c.chunk_pk IN (
        SELECT em.chunk_pk
          FROM public.entity_mentions em
         WHERE em.entity_pk = $1
           AND em.chunk_pk IN (
             SELECT chunk_pk FROM public.entity_mentions WHERE entity_pk = $2
           )
      )
      ORDER BY c.doc_id, c.order_global
      LIMIT $3`,
    [entityA, entityB, rowLimit],
  );
}
|