search: gate dense recall by cosine-distance threshold in the RPC
Root-cause fix for "search returns garbage for absent terms". The hybrid RPC's dense branch always returned its k nearest vectors regardless of distance, so a query for a term not in the corpus (e.g. "varginha") surfaced unrelated chunks. The cross-encoder reranker would filter these but costs 18-62s on CPU — unusable for interactive search. Add max_dense_dist (default 0.40) to hybrid_search_chunks: dense neighbours beyond that cosine distance are dropped server-side. Calibrated from measured distances — strong semantic match ~0.12-0.20, no real match ~0.46-0.53. BM25 full-text still matches literal terms; the reranker becomes opt-in refinement. Verified live: varginha/abducao → 0 results, disco voador/roswell → relevant results, all in <1s. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
4865f974b6
commit
504b20fa5c
2 changed files with 17 additions and 10 deletions
|
|
@ -150,6 +150,10 @@ CREATE INDEX IF NOT EXISTS entity_mentions_chunk_idx ON public.entity_mentions (
|
|||
CREATE INDEX IF NOT EXISTS entity_mentions_entity_idx ON public.entity_mentions (entity_pk);
|
||||
|
||||
-- 8. Hybrid search RPC: BM25 + dense + RRF fusion server-side
|
||||
-- Drop prior signatures so adding the max_dense_dist param replaces rather than
|
||||
-- overloads the function (Postgres treats different arg lists as distinct fns).
|
||||
DROP FUNCTION IF EXISTS public.hybrid_search_chunks(TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT);
|
||||
DROP FUNCTION IF EXISTS public.hybrid_search_chunks(TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT, DOUBLE PRECISION);
|
||||
CREATE OR REPLACE FUNCTION public.hybrid_search_chunks(
|
||||
q_text TEXT,
|
||||
q_embedding vector(1024),
|
||||
|
|
@ -159,7 +163,12 @@ CREATE OR REPLACE FUNCTION public.hybrid_search_chunks(
|
|||
q_classification TEXT DEFAULT NULL,
|
||||
q_ufo_only BOOLEAN DEFAULT FALSE,
|
||||
k INT DEFAULT 100,
|
||||
rrf_k INT DEFAULT 60
|
||||
rrf_k INT DEFAULT 60,
|
||||
-- Max cosine distance for a dense neighbour to count as relevant. Beyond this
|
||||
-- the vector is noise (measured: strong semantic match ~0.12-0.20, no real
|
||||
-- match ~0.46-0.53). Keeps absent-term queries from returning nearest-vector
|
||||
-- garbage without needing the CPU-slow cross-encoder reranker.
|
||||
max_dense_dist DOUBLE PRECISION DEFAULT 0.40
|
||||
)
|
||||
RETURNS TABLE (
|
||||
chunk_pk BIGINT,
|
||||
|
|
@ -206,6 +215,7 @@ BEGIN
|
|||
row_number() OVER (ORDER BY c.embedding <=> q_embedding)::INT AS r
|
||||
FROM public.chunks c
|
||||
WHERE c.embedding IS NOT NULL
|
||||
AND (c.embedding <=> q_embedding) < max_dense_dist
|
||||
AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
|
||||
AND (q_type IS NULL OR c.type = q_type)
|
||||
AND (q_classification IS NULL OR c.classification = q_classification)
|
||||
|
|
|
|||
|
|
@ -83,15 +83,14 @@ export async function hybridSearch(opts: HybridSearchOptions): Promise<ChunkHit[
|
|||
recall_k,
|
||||
]);
|
||||
|
||||
// Relevance gating happens server-side in the RPC: BM25 only matches real
|
||||
// terms, and the dense branch is bounded by max_dense_dist so absent-term
|
||||
// queries return no nearest-vector noise. So `rows` is already clean.
|
||||
if (rows.length === 0) return [];
|
||||
|
||||
// 3. Rerank — the cross-encoder score is BOTH the relevance ordering AND the
|
||||
// noise filter. The RPC always returns up to recall_k dense neighbours even
|
||||
// when the term has zero lexical match (e.g. a name absent from the corpus),
|
||||
// so without this filter a nonsense query returns its k nearest vectors.
|
||||
// BGE-reranker-v2-m3 with normalize=true → sigmoid 0..1; ~0 means irrelevant.
|
||||
const RELEVANCE_MIN = 0.02;
|
||||
|
||||
// 3. Optional cross-encoder rerank for finer ordering. It's CPU-slow
|
||||
// (seconds per ~dozen candidates), so it's opt-in (rerank=1); the default
|
||||
// fast path trusts the RPC's RRF order over the already-gated candidates.
|
||||
if (no_rerank) {
|
||||
return rows.slice(0, top_k);
|
||||
}
|
||||
|
|
@ -103,12 +102,10 @@ export async function hybridSearch(opts: HybridSearchOptions): Promise<ChunkHit[
|
|||
try {
|
||||
scores = await rerank(query, candidateTexts);
|
||||
} catch {
|
||||
// Reranker unavailable — fall back to RRF order (no relevance filter possible)
|
||||
return rows.slice(0, top_k);
|
||||
}
|
||||
return rows
|
||||
.map((r, i) => ({ ...r, rerank_score: scores[i] ?? 0 }))
|
||||
.filter((r) => (r.rerank_score ?? 0) >= RELEVANCE_MIN)
|
||||
.sort((a, b) => (b.rerank_score ?? 0) - (a.rerank_score ?? 0))
|
||||
.slice(0, top_k);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue