From 504b20fa5c7bb383494653a48ca2ae2b7fa79e8f Mon Sep 17 00:00:00 2001 From: Luiz Gustavo Date: Thu, 21 May 2026 16:36:56 -0300 Subject: [PATCH] search: gate dense recall by cosine-distance threshold in the RPC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root-cause fix for "search returns garbage for absent terms". The hybrid RPC's dense branch always returned its k nearest vectors regardless of distance, so a query for a term not in the corpus (e.g. "varginha") surfaced unrelated chunks. The cross-encoder reranker would filter these but costs 18-62s on CPU — unusable for interactive search. Add max_dense_dist (default 0.40) to hybrid_search_chunks: dense neighbours beyond that cosine distance are dropped server-side. Calibrated from measured distances — strong semantic match ~0.12-0.20, no real match ~0.46-0.53. BM25 full-text still matches literal terms; the reranker becomes opt-in refinement. Verified live: varginha/abducao → 0, disco voador/roswell → relevant, all <1s. Co-Authored-By: Claude Opus 4.7 --- .../supabase/migrations/0002_chunks_retrieval.sql | 12 +++++++++++- web/lib/retrieval/hybrid.ts | 15 ++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/infra/supabase/migrations/0002_chunks_retrieval.sql b/infra/supabase/migrations/0002_chunks_retrieval.sql index 3bf103a..c11a8cd 100644 --- a/infra/supabase/migrations/0002_chunks_retrieval.sql +++ b/infra/supabase/migrations/0002_chunks_retrieval.sql @@ -150,6 +150,10 @@ CREATE INDEX IF NOT EXISTS entity_mentions_chunk_idx ON public.entity_mentions ( CREATE INDEX IF NOT EXISTS entity_mentions_entity_idx ON public.entity_mentions (entity_pk); -- 8. Hybrid search RPC: BM25 + dense + RRF fusion server-side +-- Drop prior signatures so adding the max_dense_dist param replaces rather than +-- overloads the function (Postgres treats different arg lists as distinct fns). +DROP FUNCTION IF EXISTS public.hybrid_search_chunks(TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT); +DROP FUNCTION IF EXISTS public.hybrid_search_chunks(TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT, DOUBLE PRECISION); CREATE OR REPLACE FUNCTION public.hybrid_search_chunks( q_text TEXT, q_embedding vector(1024), @@ -159,7 +163,12 @@ CREATE OR REPLACE FUNCTION public.hybrid_search_chunks( q_classification TEXT DEFAULT NULL, q_ufo_only BOOLEAN DEFAULT FALSE, k INT DEFAULT 100, - rrf_k INT DEFAULT 60 + rrf_k INT DEFAULT 60, + -- Max cosine distance for a dense neighbour to count as relevant. Beyond this + -- the vector is noise (measured: strong semantic match ~0.12-0.20, no real + -- match ~0.46-0.53). Keeps absent-term queries from returning nearest-vector + -- garbage without needing the CPU-slow cross-encoder reranker. + max_dense_dist DOUBLE PRECISION DEFAULT 0.40 ) RETURNS TABLE ( chunk_pk BIGINT, @@ -206,6 +215,7 @@ BEGIN row_number() OVER (ORDER BY c.embedding <=> q_embedding)::INT AS r FROM public.chunks c WHERE c.embedding IS NOT NULL + AND (c.embedding <=> q_embedding) < max_dense_dist AND (q_doc_id IS NULL OR c.doc_id = q_doc_id) AND (q_type IS NULL OR c.type = q_type) AND (q_classification IS NULL OR c.classification = q_classification) diff --git a/web/lib/retrieval/hybrid.ts b/web/lib/retrieval/hybrid.ts index df16bbb..5474409 100644 --- a/web/lib/retrieval/hybrid.ts +++ b/web/lib/retrieval/hybrid.ts @@ -83,15 +83,14 @@ export async function hybridSearch(opts: HybridSearchOptions): Promise ({ ...r, rerank_score: scores[i] ?? 0 })) - .filter((r) => (r.rerank_score ?? 0) >= RELEVANCE_MIN) .sort((a, b) => (b.rerank_score ?? 0) - (a.rerank_score ?? 0)) .slice(0, top_k); }