From 4865f974b671d72a1537af918e0bf01da67a14f5 Mon Sep 17 00:00:00 2001 From: Luiz Gustavo Date: Thu, 21 May 2026 14:46:49 -0300 Subject: [PATCH] fix search: rerank-gate results so absent terms return nothing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hybrid_search RPC always returns up to recall_k dense neighbours, so a query for a term absent from the corpus (e.g. "varginha") returned its 12 nearest vectors — irrelevant chunks like PAGE_NUMBER "1". Two bugs: the reranker was skipped whenever results <= top_k, and there was no relevance floor. Now always run the cross-encoder reranker (BGE-reranker-v2-m3, normalized sigmoid) and drop hits below 0.02. Verified: "varginha" → 0 results; "roswell"/"tic tac"/"disco voador" → relevant hits on top (reranker cleanly separates 0.0001 garbage from 0.03-0.27 matches). Co-Authored-By: Claude Opus 4.7 --- web/lib/retrieval/hybrid.ts | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/web/lib/retrieval/hybrid.ts b/web/lib/retrieval/hybrid.ts index b157868..df16bbb 100644 --- a/web/lib/retrieval/hybrid.ts +++ b/web/lib/retrieval/hybrid.ts @@ -85,25 +85,32 @@ export async function hybridSearch(opts: HybridSearchOptions): Promise { - if (lang === "en") return r.content_en || r.content_pt || ""; - return r.content_pt || r.content_en || ""; - }); + const candidateTexts = rows.map((r) => + lang === "en" ? r.content_en || r.content_pt || "" : r.content_pt || r.content_en || "", + ); let scores: number[] = []; try { scores = await rerank(query, candidateTexts); } catch { - // Reranker unavailable — return RRF order + // Reranker unavailable — fall back to RRF order (no relevance filter possible) return rows.slice(0, top_k); } - const reranked = rows.map((r, i) => ({ ...r, rerank_score: scores[i] })); - reranked.sort((a, b) => (b.rerank_score ?? 0) - (a.rerank_score ?? 0)); - return reranked.slice(0, top_k); + return rows + .map((r, i) => ({ ...r, rerank_score: scores[i] ?? 0 })) + .filter((r) => (r.rerank_score ?? 0) >= RELEVANCE_MIN) + .sort((a, b) => (b.rerank_score ?? 0) - (a.rerank_score ?? 0)) + .slice(0, top_k); } /** Quick chunk lookup by chunk_id (no embedding). For citation expansion. */