disclosure-bureau/web/lib/retrieval/hybrid.ts
Luiz Gustavo eaf282c535
Some checks failed
CI / Web — typecheck + lint + build (push) Failing after 40s
CI / Scripts — Python smoke (push) Failing after 3s
CI / Web — npm audit (push) Failing after 29s
CI / Retrieval — golden set (Recall@5 + MRR) (push) Failing after 3s
W2: rerank opt-in, analyze_image_region tool, RAG eval, graph cleanup, ADRs
- TD#8 hybrid.ts: rerank_strategy {always|when_top_k_gt|never} + threshold
  (default skips rerank for top_k ≤ 15; chat tool uses threshold 10)
- O11 vision.ts + tools.ts: analyze_image_region tool — sharp-crops the
  bbox, claude CLI reads the temp PNG via Read tool, Sonnet vision answers
- TD#12 /graph: SigmaGraph replaces ForceGraphCanvas; react-force-graph-2d
  uninstalled (-37 transitive deps); force-graph-canvas.tsx deleted
- TD#27 messages/route.ts gatherContext slice sizes via CTX_* env vars
- TD#22 tests/rag/: golden.yaml (15 queries) + run.py (Recall@k + MRR +
  negative-pass rate) + baseline.json + CI job in .forgejo/workflows/ci.yml
- docs/adrs/: ADR-001..005 published from systems-atelier deliverables

Verified live on disclosure.top: top_k=5 path skips rerank (6.7s embed-only,
was 12-15s with rerank); rerank=always still available on demand.
First RAG baseline: Recall@5 = 0.2083, MRR = 0.25, Negative pass = 1.0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 19:20:09 -03:00

200 lines
6.7 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Hybrid retrieval: BM25 (tsvector) + dense (pgvector) → RRF fusion → reranker.
*
* Stage 1 (in Postgres via public.hybrid_search_chunks RPC):
* - tsvector keyword recall on en_unaccent or pt_unaccent
* - dense cosine on BGE-M3 embedding (1024 dim)
* - RRF score combines both rankings
* - filters: doc_id, type, classification, ufo_only
*
* Stage 2 (in Node via embed-service /rerank; optional, see `rerank_strategy`):
* - Cross-encoder rerank of the candidates recalled by Stage 1
*
* Returns chunks sorted by rerank score when Stage 2 runs, otherwise in the
* RRF order produced by the RPC, with all metadata for citation rendering
* (bbox, page, type, classification, image refs).
*/
import { embedQuery, rerank, toPgVectorLiteral } from "./embed";
import { pgQuery } from "./db";
/**
 * One chunk row as returned by the retrieval helpers in this module.
 *
 * The same shape is used for ranked search results (`hybridSearch`) and for
 * direct lookups (`getChunk`, `getPageChunks`); direct lookups fill the
 * ranking fields with neutral values (`score = 0`, both ranks `null`).
 */
export interface ChunkHit {
  /** Value of the `chunk_pk` column of `public.chunks`. */
  chunk_pk: number;
  /** Document the chunk belongs to. */
  doc_id: string;
  /** Chunk identifier; `getChunk` looks chunks up by `(doc_id, chunk_id)`. */
  chunk_id: string;
  /** Page the chunk sits on. */
  page: number;
  /** Chunk type (one of the search filters). */
  type: string;
  /** Bounding box of the chunk, or `null` when none is stored. Units and origin are not defined in this file. */
  bbox: { x: number; y: number; w: number; h: number } | null;
  /** English text of the chunk, if present. */
  content_en: string | null;
  /** Portuguese text of the chunk, if present. */
  content_pt: string | null;
  /** Classification label (one of the search filters), if present. */
  classification: string | null;
  /** Ranking score from the search RPC; `0` for direct lookups. */
  score: number;
  /** Rank in the BM25 branch of the search RPC; `null` for direct lookups. */
  bm25_rank: number | null;
  /** Rank in the dense-vector branch of the search RPC; `null` for direct lookups. */
  dense_rank: number | null;
  /** Cross-encoder score; only set by `hybridSearch` when the reranker actually ran. */
  rerank_score?: number;
}
/**
 * Arguments accepted by `hybridSearch`. Only `query` is required; every other
 * field falls back to the default noted on it.
 */
export interface HybridSearchOptions {
  /** Free-text query. A blank / whitespace-only query yields no results. */
  query: string;
  /** Query language, forwarded to the search RPC and used to pick the text that is reranked. Default `"pt"`. */
  lang?: "pt" | "en";
  /** Restrict results to one document. Default `null` (no filter). */
  doc_id?: string | null;
  /** Restrict results to one chunk type. Default `null` (no filter). */
  type?: string | null;
  /** Restrict results to one classification. Default `null` (no filter). */
  classification?: string | null;
  /** Forwarded to the search RPC as its `ufo_only` filter. Default `false`. */
  ufo_only?: boolean;
  /** Postgres recall window: how many RRF-ranked rows are fetched before any rerank. Default 100. */
  recall_k?: number;
  /** Size of the final result list. Default 20. */
  top_k?: number;
  /**
   * Skip the reranker (faster, lower precision). Back-compat shortcut for
   * `rerank_strategy: "never"`; when `true` it overrides `rerank_strategy`.
   */
  no_rerank?: boolean;
  /**
   * W2-TD#8: rerank policy.
   *
   * - `"always"`: run the cross-encoder on every call. Highest precision,
   *   slowest (seconds on CPU).
   * - `"when_top_k_gt"`: run it only when `top_k > rerank_threshold`. The RRF
   *   order from the RPC is usually good enough for a short head of results;
   *   the reranker pays off when re-sorting a wider list. This is the default,
   *   so autocomplete / chat top-10 calls skip the rerank cost.
   * - `"never"`: same as `no_rerank: true`.
   */
  rerank_strategy?: "always" | "when_top_k_gt" | "never";
  /** Cut-off used by `"when_top_k_gt"`. Default 15 (per ADR-001). */
  rerank_threshold?: number;
}
/**
 * Run the hybrid retrieval pipeline for one query.
 *
 * Steps: embed the query, call the `public.hybrid_search_chunks` RPC to get
 * RRF-ranked candidates, then optionally re-sort them with the cross-encoder
 * reranker (see `rerank_strategy`).
 *
 * @param opts Search options; see `HybridSearchOptions` for defaults.
 * @returns At most `top_k` hits. They are ordered by `rerank_score` when the
 *   reranker ran, otherwise in the order returned by the RPC. Returns `[]` for
 *   a blank query, a non-positive `top_k`, or when the RPC finds nothing.
 * @throws Whatever `embedQuery` or `pgQuery` throw. Reranker failures are NOT
 *   propagated: they are logged and the RPC order is returned instead.
 */
export async function hybridSearch(opts: HybridSearchOptions): Promise<ChunkHit[]> {
  const {
    query,
    lang = "pt",
    doc_id = null,
    type = null,
    classification = null,
    ufo_only = false,
    recall_k = 100,
    top_k = 20,
    no_rerank = false,
    rerank_strategy = "when_top_k_gt",
    rerank_threshold = 15,
  } = opts;

  // Effective strategy: explicit `no_rerank=true` always wins (back-compat).
  const strategy: "always" | "when_top_k_gt" | "never" = no_rerank ? "never" : rerank_strategy;

  if (!query.trim()) return [];

  // `Array.prototype.slice(0, n)` treats a negative `n` as an offset from the
  // end, so a negative top_k would return "all but the last |n|" rows instead
  // of nothing. Normalise once (NaN also collapses to 0) and bail out before
  // paying for the embedding and the DB round trip.
  const limit = top_k > 0 ? top_k : 0;
  if (limit === 0) return [];

  // 1. Embed the query
  const q_embedding = await embedQuery(query);

  // 2. Call hybrid_search_chunks RPC. The trailing literal 60 is presumably
  //    the RRF k constant (confirm against the RPC definition).
  const sql = `
SELECT *
FROM public.hybrid_search_chunks(
$1, $2::vector, $3, $4, $5, $6, $7, $8, 60
)
`;
  const rows = await pgQuery<ChunkHit>(sql, [
    query,
    toPgVectorLiteral(q_embedding),
    lang,
    doc_id,
    type,
    classification,
    ufo_only,
    recall_k,
  ]);

  // Relevance gating happens server-side in the RPC: BM25 only matches real
  // terms, and the dense branch is bounded by max_dense_dist so absent-term
  // queries return no nearest-vector noise. So `rows` is already clean.
  if (rows.length === 0) return [];

  // 3. Optional cross-encoder rerank for finer ordering. It's CPU-slow
  //    (seconds per ~dozen candidates). Strategy resolution (W2-TD#8 / ADR-001):
  //      - "never"         → skip
  //      - "when_top_k_gt" → skip when top_k ≤ threshold (RRF is good enough
  //                          for a small head)
  //      - "always"        → run unconditionally
  const skipRerank =
    strategy === "never" || (strategy === "when_top_k_gt" && top_k <= rerank_threshold);
  if (skipRerank) return rows.slice(0, limit);

  // Rerank on the text in the query language, falling back to the other one.
  const candidateTexts = rows.map((r) =>
    lang === "en" ? r.content_en || r.content_pt || "" : r.content_pt || r.content_en || "",
  );

  let scores: number[];
  try {
    scores = await rerank(query, candidateTexts);
  } catch (err) {
    // Best-effort stage: keep serving the RRF order, but leave a trace so a
    // broken reranker does not go unnoticed.
    console.warn("hybridSearch: rerank failed, falling back to RRF order", err);
    return rows.slice(0, limit);
  }

  // Scores are paired with rows by index. If the counts differ the pairing is
  // unreliable, and zero-filling the gaps could rank unscored chunks above
  // genuinely scored ones, so treat it like a reranker failure.
  if (scores.length !== rows.length) {
    console.warn(
      `hybridSearch: rerank returned ${scores.length} scores for ${rows.length} candidates, falling back to RRF order`,
    );
    return rows.slice(0, limit);
  }

  return rows
    .map((r, i) => ({ ...r, rerank_score: scores[i] ?? 0 }))
    .sort((a, b) => b.rerank_score - a.rerank_score)
    .slice(0, limit);
}
/**
 * Fetch a single chunk by its `(doc_id, chunk_id)` pair, without any
 * embedding or ranking work. Intended for citation expansion.
 *
 * The ranking fields of the returned hit are placeholders: `score` is 0 and
 * both rank columns are NULL.
 *
 * @returns The first matching row, or `null` when no row matches.
 */
export async function getChunk(doc_id: string, chunk_id: string): Promise<ChunkHit | null> {
  const lookupSql = `SELECT chunk_pk, doc_id, chunk_id, page, type, bbox, content_en, content_pt,
classification, 0::DOUBLE PRECISION AS score,
NULL::INT AS bm25_rank, NULL::INT AS dense_rank
FROM public.chunks
WHERE doc_id = $1 AND chunk_id = $2`;
  const [firstMatch] = await pgQuery<ChunkHit>(lookupSql, [doc_id, chunk_id]);
  return firstMatch ?? null;
}
/** One row returned by `listAnomalies`. */
export interface AnomalyHit {
  chunk_id: string;
  doc_id: string;
  page: number;
  /** Value of the `<kind>_anomaly_type` column. */
  anomaly_type: string | null;
  /** Value of the `<kind>_rationale` column. */
  rationale: string | null;
  content_en: string | null;
  content_pt: string | null;
}

/**
 * List anomaly-flagged chunks. Useful for "show me all UFO sightings" without
 * embedding.
 *
 * @param opts.kind   Which flag family to read: `"ufo"` uses the `ufo_*`
 *   columns, anything else uses the `cryptid_*` columns.
 * @param opts.doc_id Optional document filter; omitted or empty means all documents.
 * @param opts.limit  Maximum rows to return. Defaults to 50, is truncated to an
 *   integer and clamped to the range 0..200. A NaN value falls back to the default.
 * @returns Flagged chunks ordered by `doc_id`, then `order_global`. Returns `[]`
 *   without querying when the effective limit is 0.
 */
export async function listAnomalies(opts: {
  kind: "ufo" | "cryptid";
  doc_id?: string | null;
  limit?: number;
}): Promise<AnomalyHit[]> {
  // Column names are chosen from fixed literals only; no caller-supplied text
  // is ever interpolated into the SQL below.
  const isUfo = opts.kind === "ufo";
  const col = isUfo ? "ufo_anomaly" : "cryptid_anomaly";
  const typeCol = isUfo ? "ufo_anomaly_type" : "cryptid_anomaly_type";
  const ratCol = isUfo ? "ufo_rationale" : "cryptid_rationale";

  // LIMIT must be a non-negative integer. Previously only the upper bound was
  // enforced, so negative, fractional or NaN values were sent to Postgres as-is.
  const requested = opts.limit ?? 50;
  const limit = Number.isNaN(requested)
    ? 50
    : Math.max(0, Math.min(Math.trunc(requested), 200));
  if (limit === 0) return [];

  const params: unknown[] = [];
  let where = `WHERE ${col} = TRUE`;
  if (opts.doc_id) {
    params.push(opts.doc_id);
    where += ` AND doc_id = $${params.length}`;
  }
  // The limit is always the last placeholder, whatever filters came before it.
  params.push(limit);

  return pgQuery<AnomalyHit>(
    `SELECT chunk_id, doc_id, page, ${typeCol} AS anomaly_type, ${ratCol} AS rationale,
content_en, content_pt
FROM public.chunks
${where}
ORDER BY doc_id, order_global
LIMIT $${params.length}`,
    params,
  );
}
/**
 * Load every chunk of one page of a document directly from `public.chunks`,
 * ordered by `order_in_page`.
 *
 * No ranking is involved: `score` is 0 and both rank columns are NULL on
 * every returned hit.
 */
export async function getPageChunks(doc_id: string, page: number): Promise<ChunkHit[]> {
  const pageSql = `SELECT chunk_pk, doc_id, chunk_id, page, type, bbox, content_en, content_pt,
classification, 0::DOUBLE PRECISION AS score,
NULL::INT AS bm25_rank, NULL::INT AS dense_rank
FROM public.chunks
WHERE doc_id = $1 AND page = $2
ORDER BY order_in_page`;
  const pageChunks = await pgQuery<ChunkHit>(pageSql, [doc_id, page]);
  return pageChunks;
}