The hybrid_search RPC always returns up to recall_k dense neighbours, so a query for a term absent from the corpus (e.g. "varginha") returned its 12 nearest vectors — irrelevant chunks like PAGE_NUMBER "1". Two bugs: the reranker was skipped whenever results <= top_k, and there was no relevance floor. Now always run the cross-encoder reranker (BGE-reranker-v2-m3, normalized sigmoid) and drop hits below 0.02. Verified: "varginha" → 0 results; "roswell"/"tic tac"/"disco voador" → relevant hits on top (reranker cleanly separates 0.0001 garbage from 0.03-0.27 matches). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
177 lines
5.5 KiB
TypeScript
177 lines
5.5 KiB
TypeScript
/**
 * Hybrid retrieval: BM25 (tsvector) + dense (pgvector) → RRF fusion → reranker.
 *
 * Stage 1 (in Postgres via public.hybrid_search_chunks RPC):
 *   - tsvector keyword recall on en_unaccent or pt_unaccent
 *   - dense cosine on BGE-M3 embedding (1024 dim)
 *   - RRF score combines both rankings
 *   - filters: doc_id, type, classification, ufo_only
 *
 * Stage 2 (in Node via embed-service /rerank):
 *   - Cross-encoder rerank of top-N candidates
 *   - Candidates whose rerank score falls below the relevance floor (0.02)
 *     are dropped, so a query with no real match can return zero hits
 *
 * Returns chunks sorted by final reranked score, with all metadata for
 * citation rendering (bbox, page, type, classification, image refs).
 */
|
|
import { embedQuery, rerank, toPgVectorLiteral } from "./embed";
|
|
import { pgQuery } from "./db";
|
|
|
|
/**
 * One chunk row as returned by the retrieval helpers in this module.
 *
 * The same shape is used for ranked search hits and for direct lookups; the
 * ranking fields carry placeholder values (0 / null) for direct lookups.
 */
export interface ChunkHit {
  /** Value of the `chunk_pk` column of the chunk row. */
  chunk_pk: number;

  /** Identifier of the document the chunk belongs to. */
  doc_id: string;

  /** Identifier of the chunk; looked up together with `doc_id`. */
  chunk_id: string;

  /** Page the chunk is on. */
  page: number;

  /** Chunk type label (one of the search filters). */
  type: string;

  /**
   * Bounding box of the chunk for citation rendering, or null when absent.
   * NOTE(review): units and origin are not visible in this file — confirm.
   */
  bbox: { x: number; y: number; w: number; h: number } | null;

  /** English text of the chunk, or null when not available. */
  content_en: string | null;

  /** Portuguese text of the chunk, or null when not available. */
  content_pt: string | null;

  /** Classification label of the chunk, or null when absent. */
  classification: string | null;

  /** Score reported by the search RPC; 0 for direct lookups. */
  score: number;

  /** Rank in the keyword ranking; null for direct lookups. */
  bm25_rank: number | null;

  /** Rank in the dense-vector ranking; null for direct lookups. */
  dense_rank: number | null;

  /** Cross-encoder score; only set on hits that went through the reranker. */
  rerank_score?: number;
}
|
|
|
|
/** Options accepted by `hybridSearch`. Only `query` is required. */
export interface HybridSearchOptions {
  /** Free-text query. A blank or whitespace-only query yields no results. */
  query: string;

  /**
   * Query language (default "pt"). Forwarded to the search RPC and used to
   * pick which content field is preferred as the reranker input.
   */
  lang?: "pt" | "en";

  /** Filter forwarded to the search RPC (default null). */
  doc_id?: string | null;

  /** Filter forwarded to the search RPC (default null). */
  type?: string | null;

  /** Filter forwarded to the search RPC (default null). */
  classification?: string | null;

  /** Filter forwarded to the search RPC (default false). */
  ufo_only?: boolean;

  /** Postgres recall window (default 100) — top-k from RRF before rerank. */
  recall_k?: number;

  /** Final list size after rerank (default 20). */
  top_k?: number;

  /**
   * Skip the reranker (faster, lower precision). Skipping it also skips the
   * relevance floor, so weakly related chunks may be returned.
   */
  no_rerank?: boolean;
}
|
|
|
|
/**
 * Run the two-stage hybrid retrieval pipeline for a free-text query.
 *
 * Stage 1 embeds the query and calls the `public.hybrid_search_chunks` RPC,
 * which returns up to `recall_k` candidates. Stage 2 scores every candidate
 * with the cross-encoder reranker, drops candidates below the relevance
 * floor, sorts the rest by rerank score (highest first) and keeps `top_k`.
 *
 * The relevance floor is NOT applied when `no_rerank` is set or when the
 * reranker call fails; in both cases the first `top_k` rows are returned in
 * the order the RPC produced them.
 *
 * @param opts Search options; see `HybridSearchOptions` for defaults.
 * @returns Matching chunks, at most `top_k` of them. Empty when the query is
 *   blank, when `top_k` is zero, negative or NaN, when the RPC returns no
 *   rows, or when no candidate reaches the relevance floor.
 * @throws Whatever `embedQuery` or `pgQuery` throw; only reranker errors are
 *   handled here.
 */
export async function hybridSearch(opts: HybridSearchOptions): Promise<ChunkHit[]> {
  const {
    query,
    lang = "pt",
    doc_id = null,
    type = null,
    classification = null,
    ufo_only = false,
    recall_k = 100,
    top_k = 20,
    no_rerank = false,
  } = opts;

  // Array.prototype.slice treats a negative end index as an offset from the
  // tail, so an unchecked negative top_k would return almost every row
  // instead of none. Normalise it once and use `limit` everywhere below.
  const limit = Number.isNaN(top_k) ? 0 : Math.max(0, Math.trunc(top_k));

  // Nothing to search for, or nothing requested: skip the embedding and DB work.
  if (!query.trim() || limit === 0) return [];

  // 1. Embed the query
  const q_embedding = await embedQuery(query);

  // 2. Call hybrid_search_chunks RPC
  // NOTE(review): the trailing literal 60 is presumably the RRF constant —
  // confirm against the SQL function definition.
  const sql = `
    SELECT *
    FROM public.hybrid_search_chunks(
      $1, $2::vector, $3, $4, $5, $6, $7, $8, 60
    )
  `;
  const rows = await pgQuery<ChunkHit>(sql, [
    query,
    toPgVectorLiteral(q_embedding),
    lang,
    doc_id,
    type,
    classification,
    ufo_only,
    recall_k,
  ]);

  if (rows.length === 0) return [];

  if (no_rerank) {
    return rows.slice(0, limit);
  }

  // 3. Rerank — the cross-encoder score is BOTH the relevance ordering AND the
  //    noise filter. The RPC always returns up to recall_k dense neighbours even
  //    when the term has zero lexical match (e.g. a name absent from the corpus),
  //    so without this filter a nonsense query returns its k nearest vectors.
  //    BGE-reranker-v2-m3 with normalize=true → sigmoid 0..1; ~0 means irrelevant.
  const RELEVANCE_MIN = 0.02;

  // Prefer the text in the query language; `||` (not `??`) is deliberate so an
  // empty string also falls through to the other language.
  const candidateTexts = rows.map((r) =>
    lang === "en" ? r.content_en || r.content_pt || "" : r.content_pt || r.content_en || "",
  );

  let scores: number[];
  try {
    scores = await rerank(query, candidateTexts);
  } catch (err) {
    // Reranker unavailable — fall back to RRF order (no relevance filter
    // possible). Log it so an outage that disables the filter is visible.
    console.warn("hybridSearch: reranker failed; returning un-reranked results", err);
    return rows.slice(0, limit);
  }

  return rows
    // A candidate with no score is treated as irrelevant (score 0).
    .map((r, i) => ({ ...r, rerank_score: scores[i] ?? 0 }))
    .filter((hit) => hit.rerank_score >= RELEVANCE_MIN)
    .sort((a, b) => b.rerank_score - a.rerank_score)
    .slice(0, limit);
}
|
|
|
|
/**
 * Look up a single chunk by document and chunk identifier, without computing
 * an embedding. Intended for citation expansion.
 *
 * The ranking fields of the result are placeholders: `score` is 0 and both
 * rank fields are null.
 *
 * @param doc_id Document identifier to match.
 * @param chunk_id Chunk identifier to match.
 * @returns The first matching row, or null when there is none.
 */
export async function getChunk(doc_id: string, chunk_id: string): Promise<ChunkHit | null> {
  const sql = `SELECT chunk_pk, doc_id, chunk_id, page, type, bbox, content_en, content_pt,
            classification, 0::DOUBLE PRECISION AS score,
            NULL::INT AS bm25_rank, NULL::INT AS dense_rank
     FROM public.chunks
     WHERE doc_id = $1 AND chunk_id = $2`;

  const [firstMatch] = await pgQuery<ChunkHit>(sql, [doc_id, chunk_id]);
  return firstMatch ?? null;
}
|
|
|
|
/**
 * List anomaly-flagged chunks. Useful for "show me all UFO sightings" without
 * embedding.
 *
 * Column names are interpolated into the SQL text, but only from the two
 * fixed sets selected by `kind`; caller-supplied values (`doc_id`, `limit`)
 * are always bound as query parameters.
 *
 * @param opts.kind Which anomaly flag to list: "ufo" or "cryptid".
 * @param opts.doc_id When truthy, restrict the listing to that document.
 * @param opts.limit Maximum number of rows. Defaults to 50 and is truncated
 *   to an integer and clamped to the range 0..200; NaN falls back to 50.
 * @returns Flagged chunks ordered by `doc_id`, then `order_global`.
 */
export async function listAnomalies(opts: {
  kind: "ufo" | "cryptid";
  doc_id?: string | null;
  limit?: number;
}): Promise<Array<{
  chunk_id: string;
  doc_id: string;
  page: number;
  anomaly_type: string | null;
  rationale: string | null;
  content_en: string | null;
  content_pt: string | null;
}>> {
  // Row shape derived from the declared return type, so the query result is
  // typed directly instead of being cast.
  type AnomalyRow = Awaited<ReturnType<typeof listAnomalies>>[number];

  const isUfo = opts.kind === "ufo";
  const col = isUfo ? "ufo_anomaly" : "cryptid_anomaly";
  const typeCol = isUfo ? "ufo_anomaly_type" : "cryptid_anomaly_type";
  const ratCol = isUfo ? "ufo_rationale" : "cryptid_rationale";

  // LIMIT rejects negative values and a bound bigint cannot be fractional,
  // so normalise before binding rather than letting the query fail.
  const requested = opts.limit ?? 50;
  const limit = Number.isNaN(requested)
    ? 50
    : Math.min(Math.max(Math.trunc(requested), 0), 200);

  const params: unknown[] = [];
  let where = `WHERE ${col} = TRUE`;
  if (opts.doc_id) {
    params.push(opts.doc_id);
    where += ` AND doc_id = $${params.length}`;
  }
  // The limit is always the last placeholder, whatever filters precede it.
  params.push(limit);

  return pgQuery<AnomalyRow>(
    `SELECT chunk_id, doc_id, page, ${typeCol} AS anomaly_type, ${ratCol} AS rationale,
            content_en, content_pt
     FROM public.chunks
     ${where}
     ORDER BY doc_id, order_global
     LIMIT $${params.length}`,
    params,
  );
}
|
|
|
|
/**
 * Load every chunk of one page straight from the database, ordered by
 * `order_in_page`.
 *
 * The ranking fields of each row are placeholders: `score` is 0 and both
 * rank fields are null.
 *
 * @param doc_id Document identifier to match.
 * @param page Page to load.
 * @returns The page's chunks in page order; empty when nothing matches.
 */
export async function getPageChunks(doc_id: string, page: number): Promise<ChunkHit[]> {
  const sql = `SELECT chunk_pk, doc_id, chunk_id, page, type, bbox, content_en, content_pt,
            classification, 0::DOUBLE PRECISION AS score,
            NULL::INT AS bm25_rank, NULL::INT AS dense_rank
     FROM public.chunks
     WHERE doc_id = $1 AND page = $2
     ORDER BY order_in_page`;
  const params = [doc_id, page];

  return pgQuery<ChunkHit>(sql, params);
}
|