fix search: rerank-gate results so absent terms return nothing
The hybrid_search RPC always returns up to recall_k dense neighbours, so a query for a term absent from the corpus (e.g. "varginha") returned its 12 nearest vectors — irrelevant chunks like PAGE_NUMBER "1".

Two bugs: the reranker was skipped whenever results <= top_k, and there was no relevance floor. Now the cross-encoder reranker (BGE-reranker-v2-m3, normalized sigmoid) runs whenever no_rerank is not set, and hits scoring below 0.02 are dropped. If the reranker call fails, results fall back to RRF order with no relevance filter.

Verified: "varginha" → 0 results; "roswell"/"tic tac"/"disco voador" → relevant hits on top (the reranker cleanly separates ~0.0001 garbage from 0.03–0.27 matches).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
ebc6fa41e9
commit
4865f974b6
1 changed file with 17 additions and 10 deletions
|
|
@@ -85,25 +85,32 @@ export async function hybridSearch(opts: HybridSearchOptions): Promise<ChunkHit[
|
|||
|
||||
if (rows.length === 0) return [];
|
||||
|
||||
// 3. Optional rerank
|
||||
if (no_rerank || rows.length <= top_k) {
|
||||
// 3. Rerank — the cross-encoder score is BOTH the relevance ordering AND the
|
||||
// noise filter. The RPC always returns up to recall_k dense neighbours even
|
||||
// when the term has zero lexical match (e.g. a name absent from the corpus),
|
||||
// so without this filter a nonsense query returns its k nearest vectors.
|
||||
// BGE-reranker-v2-m3 with normalize=true → sigmoid 0..1; ~0 means irrelevant.
|
||||
const RELEVANCE_MIN = 0.02;
|
||||
|
||||
if (no_rerank) {
|
||||
return rows.slice(0, top_k);
|
||||
}
|
||||
|
||||
const candidateTexts = rows.map((r) => {
|
||||
if (lang === "en") return r.content_en || r.content_pt || "";
|
||||
return r.content_pt || r.content_en || "";
|
||||
});
|
||||
const candidateTexts = rows.map((r) =>
|
||||
lang === "en" ? r.content_en || r.content_pt || "" : r.content_pt || r.content_en || "",
|
||||
);
|
||||
let scores: number[] = [];
|
||||
try {
|
||||
scores = await rerank(query, candidateTexts);
|
||||
} catch {
|
||||
// Reranker unavailable — return RRF order
|
||||
// Reranker unavailable — fall back to RRF order (no relevance filter possible)
|
||||
return rows.slice(0, top_k);
|
||||
}
|
||||
const reranked = rows.map((r, i) => ({ ...r, rerank_score: scores[i] }));
|
||||
reranked.sort((a, b) => (b.rerank_score ?? 0) - (a.rerank_score ?? 0));
|
||||
return reranked.slice(0, top_k);
|
||||
return rows
|
||||
.map((r, i) => ({ ...r, rerank_score: scores[i] ?? 0 }))
|
||||
.filter((r) => (r.rerank_score ?? 0) >= RELEVANCE_MIN)
|
||||
.sort((a, b) => (b.rerank_score ?? 0) - (a.rerank_score ?? 0))
|
||||
.slice(0, top_k);
|
||||
}
|
||||
|
||||
/** Quick chunk lookup by chunk_id (no embedding). For citation expansion. */
|
||||
|
|
|
|||
Loading…
Reference in a new issue