From 504b20fa5c7bb383494653a48ca2ae2b7fa79e8f Mon Sep 17 00:00:00 2001
From: Luiz Gustavo <gutomec@gmail.com>
Date: Thu, 21 May 2026 16:36:56 -0300
Subject: [PATCH] search: gate dense recall by cosine-distance threshold in the
 RPC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root-cause fix for "search returns garbage for absent terms". The hybrid RPC's
dense branch always returned its k nearest vectors regardless of distance, so a
query for a term not in the corpus (e.g. "varginha") surfaced unrelated chunks.
The cross-encoder reranker would filter these but costs 18-62s on CPU —
unusable for interactive search.

Add max_dense_dist (default 0.40) to hybrid_search_chunks: dense neighbours
at or beyond that cosine distance are dropped server-side. The default is
calibrated from measured distances — strong semantic match ~0.12-0.20, no real
match ~0.46-0.53 — so it sits in the gap between the two ranges. BM25 full-text
still matches literal terms; the reranker becomes an opt-in refinement.

Verified live: queries for "varginha" and "abducao" return 0 results, queries
for "disco voador" and "roswell" return relevant results, and every query
completes in under 1s.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../supabase/migrations/0002_chunks_retrieval.sql | 12 +++++++++++-
 web/lib/retrieval/hybrid.ts                       | 15 ++++++---------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/infra/supabase/migrations/0002_chunks_retrieval.sql b/infra/supabase/migrations/0002_chunks_retrieval.sql
index 3bf103a..c11a8cd 100644
--- a/infra/supabase/migrations/0002_chunks_retrieval.sql
+++ b/infra/supabase/migrations/0002_chunks_retrieval.sql
@@ -150,6 +150,10 @@ CREATE INDEX IF NOT EXISTS entity_mentions_chunk_idx ON public.entity_mentions (
 CREATE INDEX IF NOT EXISTS entity_mentions_entity_idx ON public.entity_mentions (entity_pk);
 
 -- 8. Hybrid search RPC: BM25 + dense + RRF fusion server-side
+-- Drop prior signatures so adding the max_dense_dist param replaces rather than
+-- overloads the function (Postgres treats different arg lists as distinct fns).
+DROP FUNCTION IF EXISTS public.hybrid_search_chunks(TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT);
+DROP FUNCTION IF EXISTS public.hybrid_search_chunks(TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT, DOUBLE PRECISION);
 CREATE OR REPLACE FUNCTION public.hybrid_search_chunks(
   q_text       TEXT,
   q_embedding  vector(1024),
@@ -159,7 +163,12 @@ CREATE OR REPLACE FUNCTION public.hybrid_search_chunks(
   q_classification TEXT DEFAULT NULL,
   q_ufo_only   BOOLEAN DEFAULT FALSE,
   k            INT DEFAULT 100,
-  rrf_k        INT DEFAULT 60
+  rrf_k        INT DEFAULT 60,
+  -- Exclusive upper bound on a dense neighbour's cosine distance; at or beyond
+  -- it the match is treated as noise (measured: strong semantic match
+  -- ~0.12-0.20, no real match ~0.46-0.53). Keeps absent-term queries from
+  -- returning nearest-vector garbage without the CPU-slow cross-encoder reranker.
+  max_dense_dist DOUBLE PRECISION DEFAULT 0.40
 )
 RETURNS TABLE (
   chunk_pk    BIGINT,
@@ -206,6 +215,7 @@ BEGIN
            row_number() OVER (ORDER BY c.embedding <=> q_embedding)::INT AS r
     FROM public.chunks c
     WHERE c.embedding IS NOT NULL
+      AND (c.embedding <=> q_embedding) < max_dense_dist
       AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
       AND (q_type IS NULL OR c.type = q_type)
       AND (q_classification IS NULL OR c.classification = q_classification)
diff --git a/web/lib/retrieval/hybrid.ts b/web/lib/retrieval/hybrid.ts
index df16bbb..5474409 100644
--- a/web/lib/retrieval/hybrid.ts
+++ b/web/lib/retrieval/hybrid.ts
@@ -83,15 +83,14 @@ export async function hybridSearch(opts: HybridSearchOptions): Promise<ChunkHit[
     recall_k,
   ]);
 
+  // Relevance gating happens server-side in the RPC: BM25 only matches real
+  // terms, and the dense branch is bounded by max_dense_dist so absent-term
+  // queries return no nearest-vector noise. So `rows` is already clean.
   if (rows.length === 0) return [];
 
-  // 3. Rerank — the cross-encoder score is BOTH the relevance ordering AND the
-  // noise filter. The RPC always returns up to recall_k dense neighbours even
-  // when the term has zero lexical match (e.g. a name absent from the corpus),
-  // so without this filter a nonsense query returns its k nearest vectors.
-  // BGE-reranker-v2-m3 with normalize=true → sigmoid 0..1; ~0 means irrelevant.
-  const RELEVANCE_MIN = 0.02;
-
+  // 3. Optional cross-encoder rerank for finer ordering. It's CPU-slow
+  // (seconds per ~dozen candidates), so it's opt-in (rerank=1); the default
+  // fast path trusts the RPC's RRF order over the already-gated candidates.
   if (no_rerank) {
     return rows.slice(0, top_k);
   }
@@ -103,12 +102,10 @@ export async function hybridSearch(opts: HybridSearchOptions): Promise<ChunkHit[
   try {
     scores = await rerank(query, candidateTexts);
   } catch {
-    // Reranker unavailable — fall back to RRF order (no relevance filter possible)
     return rows.slice(0, top_k);
   }
   return rows
     .map((r, i) => ({ ...r, rerank_score: scores[i] ?? 0 }))
-    .filter((r) => (r.rerank_score ?? 0) >= RELEVANCE_MIN)
     .sort((a, b) => (b.rerank_score ?? 0) - (a.rerank_score ?? 0))
     .slice(0, top_k);
 }