// disclosure-bureau/web/lib/retrieval/hybrid.ts

/**
 * Hybrid retrieval: BM25 (tsvector) + dense (pgvector) → RRF fusion → reranker.
 *
 * Stage 1 (in Postgres via public.hybrid_search_chunks RPC):
 *   - tsvector keyword recall on en_unaccent or pt_unaccent
 *   - dense cosine on BGE-M3 embedding (1024 dim)
 *   - RRF score combines both rankings
 *   - filters: doc_id, type, classification, ufo_only
 *
 * Stage 2 (in Node via embed-service /rerank):
 *   - Cross-encoder rerank of top-N candidates
 *
 * Returns chunks sorted by reranked score when the reranker runs, otherwise in
 * the RRF order produced by the RPC, with all metadata for citation rendering
 * (bbox, page, type, classification, image refs).
 */
import { embedQuery, rerank, toPgVectorLiteral } from "./embed";
import { pgQuery } from "./db";

/**
 * One chunk row as returned by the retrieval queries in this module.
 *
 * `hybridSearch` types the rows of the `public.hybrid_search_chunks` RPC as
 * this shape; `getChunk` and `getPageChunks` select the same columns directly
 * from `public.chunks` and fill the ranking fields with placeholders.
 */
export interface ChunkHit {
  /** `chunk_pk` column of `public.chunks` — presumably the numeric primary key (TODO confirm). */
  chunk_pk: number;
  /** Identifier of the document the chunk belongs to. */
  doc_id: string;
  /** Chunk identifier; `getChunk` looks a chunk up by `doc_id` + `chunk_id`. */
  chunk_id: string;
  /** Page of the chunk. NOTE(review): 0- vs 1-based is not visible from this file. */
  page: number;
  /** Chunk type; also usable as a hybrid-search filter. Allowed values are not visible here. */
  type: string;
  /** Bounding box for citation rendering, or null. Units and origin: TODO confirm. */
  bbox: { x: number; y: number; w: number; h: number } | null;
  /** English text of the chunk, if any. */
  content_en: string | null;
  /** Portuguese text of the chunk, if any. */
  content_pt: string | null;
  /** Classification label, or null; also usable as a hybrid-search filter. */
  classification: string | null;
  /**
   * Ranking score. `getChunk` and `getPageChunks` always return 0 here; for
   * hybrid search this is presumably the RRF score computed by the RPC
   * (confirm against the SQL function).
   */
  score: number;
  /** Presumably the rank in the BM25 branch; always null for direct lookups. */
  bm25_rank: number | null;
  /** Presumably the rank in the dense-vector branch; always null for direct lookups. */
  dense_rank: number | null;
  /** Cross-encoder score; set by `hybridSearch` only when the reranker actually ran. */
  rerank_score?: number;
}

/** Options accepted by `hybridSearch`. Only `query` is required. */
export interface HybridSearchOptions {
  /** Free-text query. A blank or whitespace-only query yields an empty result. */
  query: string;
  /**
   * Language passed to the RPC; also decides which text (`content_pt` or
   * `content_en`) is preferred when building reranker candidates. Default "pt".
   */
  lang?: "pt" | "en";
  /** Optional document filter passed to the RPC (null when omitted). */
  doc_id?: string | null;
  /** Optional chunk-type filter passed to the RPC (null when omitted). */
  type?: string | null;
  /** Optional classification filter passed to the RPC (null when omitted). */
  classification?: string | null;
  /** `ufo_only` filter flag passed to the RPC. Default false. */
  ufo_only?: boolean;
  /** Postgres recall window (default 100) — top-k from RRF before rerank. */
  recall_k?: number;
  /** Final list size after rerank (default 20). */
  top_k?: number;
  /** Skip reranker (faster, lower precision). Back-compat shortcut for
   *  `rerank_strategy: "never"`. When true it overrides `rerank_strategy`. */
  no_rerank?: boolean;
  /**
   * W2-TD#8: rerank policy.
   *   - "always"        — always run the cross-encoder (highest precision,
   *                       slowest, 5–8s on CPU)
   *   - "when_top_k_gt" — rerank only when `top_k > rerank_threshold`
   *                       (default threshold 15). RRF order from the RPC is
   *                       usually good enough for the tight head of results;
   *                       the reranker pays off when re-sorting a wider list.
   *                       This is the new default — autocomplete / chat
   *                       top-10 calls now skip rerank for free.
   *   - "never"         — same as `no_rerank: true`.
   */
  rerank_strategy?: "always" | "when_top_k_gt" | "never";
  /** Threshold for `when_top_k_gt`. Default 15 (per ADR-001). */
  rerank_threshold?: number;
}

/**
 * Run hybrid retrieval for a free-text query.
 *
 * Steps:
 *   1. Embed the query with `embedQuery`.
 *   2. Call the `public.hybrid_search_chunks` RPC with the query text, the
 *      embedding, the filters and `recall_k`.
 *   3. Optionally reorder the returned rows with the cross-encoder (`rerank`),
 *      depending on the effective rerank strategy.
 *
 * Reranking is best-effort: if the reranker throws, or answers with scores
 * that cannot be aligned with the candidates (wrong length, non-finite
 * values), the RPC order is returned unchanged.
 *
 * @param opts - Search options; see `HybridSearchOptions` for defaults.
 * @returns At most `top_k` hits. Hits carry `rerank_score` only when the
 *   reranker ran successfully. An empty array is returned for a blank query,
 *   a non-positive `top_k`, or when the RPC returns no rows.
 * @throws Whatever `embedQuery` or `pgQuery` reject with — only reranker
 *   failures are absorbed.
 */
export async function hybridSearch(opts: HybridSearchOptions): Promise<ChunkHit[]> {
  const {
    query,
    lang = "pt",
    doc_id = null,
    type = null,
    classification = null,
    ufo_only = false,
    recall_k = 100,
    top_k = 20,
    no_rerank = false,
    rerank_strategy = "when_top_k_gt",
    rerank_threshold = 15,
  } = opts;

  // Effective strategy: explicit `no_rerank=true` always wins (back-compat).
  const strategy: "always" | "when_top_k_gt" | "never" =
    no_rerank ? "never" : rerank_strategy;

  if (!query.trim()) return [];

  // Nothing was requested. `!(top_k > 0)` also rejects NaN. Without this guard
  // a negative top_k would reach `rows.slice(0, top_k)`, which counts from the
  // END of the array and returns almost the entire recall window.
  if (!(top_k > 0)) return [];

  // 1. Embed the query
  const q_embedding = await embedQuery(query);

  // 2. Call hybrid_search_chunks RPC. The trailing literal 60 is a fixed RPC
  // argument — presumably the RRF k constant; confirm against the SQL function.
  const sql = `
    SELECT *
    FROM public.hybrid_search_chunks(
      $1, $2::vector, $3, $4, $5, $6, $7, $8, 60
    )
  `;
  const rows = await pgQuery<ChunkHit>(sql, [
    query,
    toPgVectorLiteral(q_embedding),
    lang,
    doc_id,
    type,
    classification,
    ufo_only,
    recall_k,
  ]);

  // Relevance gating happens server-side in the RPC: BM25 only matches real
  // terms, and the dense branch is bounded by max_dense_dist so absent-term
  // queries return no nearest-vector noise. So `rows` is already clean.
  if (rows.length === 0) return [];

  // Head of the list in the order the RPC produced; used whenever reranking is
  // skipped or cannot be trusted.
  const rrfHead = (): ChunkHit[] => rows.slice(0, top_k);

  // 3. Optional cross-encoder rerank for finer ordering. It's CPU-slow
  // (seconds per ~dozen candidates). Strategy resolution (W2-TD#8 / ADR-001):
  //   - "never"         → skip
  //   - "when_top_k_gt" → skip when top_k ≤ threshold (RRF is good enough
  //                       for a small head)
  //   - "always"        → run unconditionally
  if (strategy === "never" ||
      (strategy === "when_top_k_gt" && top_k <= rerank_threshold)) {
    return rrfHead();
  }

  // Prefer the text in the requested language, fall back to the other one.
  const candidateTexts = rows.map((r) =>
    lang === "en" ? r.content_en || r.content_pt || "" : r.content_pt || r.content_en || "",
  );

  let scores: number[];
  try {
    scores = await rerank(query, candidateTexts);
  } catch (err: unknown) {
    // Best-effort stage: keep serving RRF order, but leave a trace so a
    // degraded reranker does not go unnoticed.
    console.warn("[hybridSearch] rerank failed; falling back to RRF order", err);
    return rrfHead();
  }

  // Scores are matched to rows by index. A reply of the wrong length cannot be
  // aligned, and NaN/Infinity would make the sort comparator inconsistent, so
  // in both cases the RRF order is more trustworthy than a partial rerank.
  const usable =
    Array.isArray(scores) &&
    scores.length === rows.length &&
    scores.every((s) => Number.isFinite(s));
  if (!usable) {
    console.warn("[hybridSearch] rerank returned unusable scores; falling back to RRF order");
    return rrfHead();
  }

  // `.map` creates a fresh array, so the in-place sort never touches `rows`.
  // Array sort is stable, so ties keep their RRF order.
  return rows
    .map((r, i) => ({ ...r, rerank_score: scores[i] ?? 0 }))
    .sort((a, b) => b.rerank_score - a.rerank_score)
    .slice(0, top_k);
}

/**
 * Fetch a single chunk by its (doc_id, chunk_id) pair, without any embedding
 * work. Intended for citation expansion.
 *
 * Ranking fields are placeholders: `score` is 0 and both ranks are NULL.
 *
 * @returns The first matching row, or null when nothing matches.
 */
export async function getChunk(doc_id: string, chunk_id: string): Promise<ChunkHit | null> {
  const lookupSql =
    `SELECT chunk_pk, doc_id, chunk_id, page, type, bbox, content_en, content_pt,
            classification, 0::DOUBLE PRECISION AS score,
            NULL::INT AS bm25_rank, NULL::INT AS dense_rank
     FROM public.chunks
     WHERE doc_id = $1 AND chunk_id = $2`;
  const [firstMatch] = await pgQuery<ChunkHit>(lookupSql, [doc_id, chunk_id]);
  return firstMatch ?? null;
}

/** One row returned by `listAnomalies`. */
export interface AnomalyHit {
  chunk_id: string;
  doc_id: string;
  page: number;
  anomaly_type: string | null;
  rationale: string | null;
  content_en: string | null;
  content_pt: string | null;
}

/**
 * List anomaly-flagged chunks. Useful for "show me all UFO sightings" without embedding.
 *
 * @param opts.kind - Which anomaly flag to filter on; selects the
 *   `ufo_*` or `cryptid_*` column family of `public.chunks`.
 * @param opts.doc_id - Optional document filter; omitted, null or empty means
 *   all documents.
 * @param opts.limit - Maximum rows to return. Defaults to 50 and is truncated
 *   to an integer and clamped to 0..200; NaN falls back to the default.
 * @returns Matching rows ordered by `doc_id`, then `order_global`.
 */
export async function listAnomalies(opts: {
  kind: "ufo" | "cryptid";
  doc_id?: string | null;
  limit?: number;
}): Promise<AnomalyHit[]> {
  // Column names are interpolated into the SQL text, so they must only ever
  // come from these fixed literals — never from caller-supplied strings.
  const col = opts.kind === "ufo" ? "ufo_anomaly" : "cryptid_anomaly";
  const typeCol = opts.kind === "ufo" ? "ufo_anomaly_type" : "cryptid_anomaly_type";
  const ratCol = opts.kind === "ufo" ? "ufo_rationale" : "cryptid_rationale";

  // The limit is bound into SQL LIMIT, so it has to be a non-negative whole
  // number; a negative, fractional or NaN value would be a database error.
  const requested = opts.limit ?? 50;
  const limit = Number.isNaN(requested)
    ? 50
    : Math.max(0, Math.min(Math.trunc(requested), 200));

  // Placeholders are numbered by position in `params`, so the optional doc_id
  // filter shifts the LIMIT placeholder from $1 to $2.
  const params: unknown[] = [];
  let where = `WHERE ${col} = TRUE`;
  if (opts.doc_id) {
    params.push(opts.doc_id);
    where += ` AND doc_id = $${params.length}`;
  }
  params.push(limit);

  return pgQuery<AnomalyHit>(
    `SELECT chunk_id, doc_id, page, ${typeCol} AS anomaly_type, ${ratCol} AS rationale,
            content_en, content_pt
     FROM public.chunks
     ${where}
     ORDER BY doc_id, order_global
     LIMIT $${params.length}`,
    params,
  );
}

/**
 * Load every chunk of one page of a document straight from the database,
 * ordered by `order_in_page`.
 *
 * Ranking fields are placeholders: `score` is 0 and both ranks are NULL.
 */
export async function getPageChunks(doc_id: string, page: number): Promise<ChunkHit[]> {
  const pageSql =
    `SELECT chunk_pk, doc_id, chunk_id, page, type, bbox, content_en, content_pt,
            classification, 0::DOUBLE PRECISION AS score,
            NULL::INT AS bm25_rank, NULL::INT AS dense_rank
     FROM public.chunks
     WHERE doc_id = $1 AND page = $2
     ORDER BY order_in_page`;
  const pageParams = [doc_id, page];
  return pgQuery<ChunkHit>(pageSql, pageParams);
}