/**
 * Hybrid retrieval: BM25 (tsvector) + dense (pgvector) → RRF fusion → reranker.
 *
 * Stage 1 (in Postgres via public.hybrid_search_chunks RPC):
 *   - tsvector keyword recall on en_unaccent or pt_unaccent
 *   - dense cosine on BGE-M3 embedding (1024 dim)
 *   - RRF score combines both rankings
 *   - filters: doc_id, type, classification, ufo_only
 *
 * Stage 2 (in Node via embed-service /rerank):
 *   - Cross-encoder rerank of top-N candidates
 *
 * Returns chunks sorted by final reranked score, with all metadata for
 * citation rendering (bbox, page, type, classification, image refs).
 */
import { embedQuery, rerank, toPgVectorLiteral } from "./embed";
import { pgQuery } from "./db";

/** One retrieved chunk plus the ranking signals that produced it. */
export interface ChunkHit {
  chunk_pk: number;
  doc_id: string;
  chunk_id: string;
  page: number;
  type: string;
  bbox: { x: number; y: number; w: number; h: number } | null;
  content_en: string | null;
  content_pt: string | null;
  classification: string | null;
  score: number;
  bm25_rank: number | null;
  dense_rank: number | null;
  /** Only set when the cross-encoder rerank stage actually ran. */
  rerank_score?: number;
}

export interface HybridSearchOptions {
  query: string;
  lang?: "pt" | "en";
  doc_id?: string | null;
  type?: string | null;
  classification?: string | null;
  ufo_only?: boolean;
  /** Postgres recall window (default 100) — top-k from RRF before rerank. */
  recall_k?: number;
  /** Final list size after rerank (default 20). */
  top_k?: number;
  /** Skip reranker (faster, lower precision). */
  no_rerank?: boolean;
}

/**
 * Row shape returned by {@link listAnomalies}.
 *
 * NOTE(review): field names mirror the SELECT list in `listAnomalies`; the
 * field types (in particular nullability) are reconstructed, not read from
 * the schema — confirm against `public.chunks`.
 */
export interface AnomalyHit {
  chunk_id: string;
  doc_id: string;
  page: number;
  anomaly_type: string | null;
  rationale: string | null;
  content_en: string | null;
  content_pt: string | null;
}

const DEFAULT_RECALL_K = 100;
const DEFAULT_TOP_K = 20;
const DEFAULT_ANOMALY_LIMIT = 50;
const MAX_ANOMALY_LIMIT = 200;

/**
 * Column list that projects a `public.chunks` row into the `ChunkHit` shape.
 * Ranking fields are filled with neutral placeholders because no search ran.
 */
const CHUNK_HIT_COLUMNS = `chunk_pk, doc_id, chunk_id, page, type, bbox,
            content_en, content_pt, classification,
            0::DOUBLE PRECISION AS score,
            NULL::INT AS bm25_rank,
            NULL::INT AS dense_rank`;

/** Per-kind column names used by `listAnomalies`; fixed strings, never user input. */
const ANOMALY_COLUMNS = {
  ufo: {
    flag: "ufo_anomaly",
    type: "ufo_anomaly_type",
    rationale: "ufo_rationale",
  },
  cryptid: {
    flag: "cryptid_anomaly",
    type: "cryptid_anomaly_type",
    rationale: "cryptid_rationale",
  },
} as const;

/**
 * Normalise a caller-supplied count into a non-negative integer.
 *
 * Missing or NaN values yield `fallback`; fractions are truncated; negatives
 * clamp to 0; anything above `max` clamps to `max`. This keeps values that
 * would make `Array.prototype.slice` count from the end, or make Postgres
 * reject a `LIMIT`, from ever leaving this module.
 */
function toCount(
  value: number | null | undefined,
  fallback: number,
  max: number = Number.MAX_SAFE_INTEGER,
): number {
  if (value == null || Number.isNaN(value)) return fallback;
  return Math.min(Math.max(Math.trunc(value), 0), max);
}

/**
 * Run the two-stage hybrid search.
 *
 * @param opts Query text plus optional filters and sizing knobs.
 * @returns Up to `top_k` hits. When the reranker ran, hits carry
 *   `rerank_score` and are ordered by it (descending); otherwise they are in
 *   the order the RPC returned them. A blank query yields `[]`.
 * @throws Whatever `embedQuery` or `pgQuery` throw. Reranker failures are
 *   NOT propagated — the function falls back to the RPC order.
 */
export async function hybridSearch(
  opts: HybridSearchOptions,
): Promise<ChunkHit[]> {
  const {
    query,
    lang = "pt",
    doc_id = null,
    type = null,
    classification = null,
    ufo_only = false,
    no_rerank = false,
  } = opts;
  const recallK = toCount(opts.recall_k, DEFAULT_RECALL_K);
  const topK = toCount(opts.top_k, DEFAULT_TOP_K);

  if (!query.trim()) return [];

  // 1. Embed the query
  const q_embedding = await embedQuery(query);

  // 2. Call hybrid_search_chunks RPC.
  //    NOTE(review): the trailing literal 60 is presumably the RRF k
  //    constant — confirm against the RPC definition.
  const sql = `
    SELECT * FROM public.hybrid_search_chunks(
      $1, $2::vector, $3, $4, $5, $6, $7, $8, 60
    )
  `;
  const rows = await pgQuery<ChunkHit>(sql, [
    query,
    toPgVectorLiteral(q_embedding),
    lang,
    doc_id,
    type,
    classification,
    ufo_only,
    recallK,
  ]);

  // Relevance gating happens server-side in the RPC: BM25 only matches real
  // terms, and the dense branch is bounded by max_dense_dist so absent-term
  // queries return no nearest-vector noise. So `rows` is already clean.
  if (rows.length === 0) return [];

  // 3. Optional cross-encoder rerank for finer ordering. It's CPU-slow
  //    (seconds per ~dozen candidates); the fast path trusts the RPC's RRF
  //    order over the already-gated candidates.
  //    NOTE(review): the original comment calls rerank "opt-in (rerank=1)",
  //    but this function's own default is `no_rerank = false`, i.e. rerank
  //    runs unless the caller disables it — confirm the caller passes
  //    `no_rerank: true` by default.
  if (no_rerank) {
    return rows.slice(0, topK);
  }

  // Rerank against the text in the requested language, falling back to the
  // other language (then to an empty string) when a translation is missing.
  const candidateTexts = rows.map((r) =>
    lang === "en"
      ? r.content_en || r.content_pt || ""
      : r.content_pt || r.content_en || "",
  );

  let scores: number[] = [];
  try {
    scores = await rerank(query, candidateTexts);
  } catch (err: unknown) {
    // Best-effort stage: a reranker outage must not fail the search, but it
    // should be visible in logs rather than silently swallowed.
    console.warn(
      "hybridSearch: rerank failed, falling back to RRF order:",
      err,
    );
    return rows.slice(0, topK);
  }

  return rows
    .map((r, i) => ({ ...r, rerank_score: scores[i] ?? 0 }))
    .sort((a, b) => (b.rerank_score ?? 0) - (a.rerank_score ?? 0))
    .slice(0, topK);
}

/**
 * Quick chunk lookup by chunk_id (no embedding). For citation expansion.
 *
 * @returns The first matching chunk, or `null` when none matches.
 */
export async function getChunk(
  doc_id: string,
  chunk_id: string,
): Promise<ChunkHit | null> {
  const rows = await pgQuery<ChunkHit>(
    `SELECT ${CHUNK_HIT_COLUMNS}
       FROM public.chunks
      WHERE doc_id = $1 AND chunk_id = $2`,
    [doc_id, chunk_id],
  );
  return rows[0] ?? null;
}

/**
 * List anomaly-flagged chunks. Useful for "show me all UFO sightings"
 * without embedding.
 *
 * @param opts.kind Which anomaly flag to filter on.
 * @param opts.doc_id Optional document filter; falsy values disable it.
 * @param opts.limit Maximum rows (default 50). Sanitised to an integer in
 *   [0, 200] so negative, fractional or NaN values cannot reach `LIMIT`.
 * @returns Matching chunks ordered by `doc_id, order_global`.
 */
export async function listAnomalies(opts: {
  kind: "ufo" | "cryptid";
  doc_id?: string | null;
  limit?: number;
}): Promise<AnomalyHit[]> {
  // Anything other than "ufo" selects the cryptid columns, as before.
  const cols =
    opts.kind === "ufo" ? ANOMALY_COLUMNS.ufo : ANOMALY_COLUMNS.cryptid;
  const limit = toCount(opts.limit, DEFAULT_ANOMALY_LIMIT, MAX_ANOMALY_LIMIT);

  // Placeholders are numbered from params.length so the optional doc_id
  // filter and the LIMIT always line up with their values.
  const params: unknown[] = [];
  let where = `WHERE ${cols.flag} = TRUE`;
  if (opts.doc_id) {
    params.push(opts.doc_id);
    where += ` AND doc_id = $${params.length}`;
  }
  params.push(limit);

  return pgQuery<AnomalyHit>(
    `SELECT chunk_id, doc_id, page,
            ${cols.type} AS anomaly_type,
            ${cols.rationale} AS rationale,
            content_en, content_pt
       FROM public.chunks
       ${where}
      ORDER BY doc_id, order_global
      LIMIT $${params.length}`,
    params,
  );
}

/** Assemble a single page (chunks ordered by `order_in_page`) directly from DB. */
export async function getPageChunks(
  doc_id: string,
  page: number,
): Promise<ChunkHit[]> {
  return pgQuery<ChunkHit>(
    `SELECT ${CHUNK_HIT_COLUMNS}
       FROM public.chunks
      WHERE doc_id = $1 AND page = $2
      ORDER BY order_in_page`,
    [doc_id, page],
  );
}