// Review notes. These lines sit ahead of the first `diff --git` header, outside every hunk.
//
// NOTE(review): in this copy the patch's line breaks are collapsed (many hunk lines share one
// physical line), so the original indentation of context/added lines cannot be read back from
// here -- confirm against the real commit before trying to apply it.
//
// detectives/dupin.ts
//   - renderChunkBlock: the per-chunk header changes from
//     `<doc_id>/p<page zero-padded to 3>#<chunk_id>` to `chunk <i + 1>`, followed by separate
//     `doc_id:`, `chunk_id:` and `page:` lines. The padded `pageStr` local is removed, so the
//     page number is now emitted unpadded.
//   - runDupin: `hits` becomes reassignable. The first hybridSearch is scoped to
//     `task.doc_id ?? null`. When `task.doc_id` is set and fewer than 2 hits come back, a second
//     hybridSearch runs with `doc_id: null`; its result replaces `hits` only when it is strictly
//     longer, and `scope_widened` is set to true in that case only.
//   - The "dupin_grounded" audit payload gains `scope_widened`. Its `doc_id` field is still
//     `task.doc_id ?? null`, i.e. the requested scope, even after widening.
//   - NOTE(review): the widened result replaces the scoped hits wholesale; whether a scoped hit
//     is still present in the corpus-wide result depends on hybridSearch, which is not visible
//     in this patch -- verify.
//
// tools/write_contradiction.ts
//   - normalizeChunkId (new): returns the trailing `c` + four-or-more-digits match of `raw`, or
//     `raw` unchanged when nothing matches.
//   - resolveChunk: now passes the normalized id as the `chunk_id = $2` query parameter.
//   - NOTE(review): the regex is anchored only at the end of the string, so ANY prefix is
//     dropped, not just the documented "p007#" and "p007/" forms (e.g. "abc0042" -> "c0042"),
//     and trailing whitespace prevents a match so the raw value is queried as-is. Consider
//     trimming and requiring a `#` or `/` boundary if that leniency is unintended.
diff --git a/investigator-runtime/src/detectives/dupin.ts b/investigator-runtime/src/detectives/dupin.ts index 6c28ea0..7f76f03 100644 --- a/investigator-runtime/src/detectives/dupin.ts +++ b/investigator-runtime/src/detectives/dupin.ts @@ -35,11 +35,13 @@ export interface DupinTask { } function renderChunkBlock(hits: SearchHit[], lang: "pt" | "en"): string { - const blocks = hits.map((h) => { + const blocks = hits.map((h, i) => { const text = (lang === "en" ? h.content_en : h.content_pt) || h.content_en || h.content_pt || ""; - const pageStr = String(h.page).padStart(3, "0"); return [ - `--- ${h.doc_id}/p${pageStr}#${h.chunk_id} ---`, + `--- chunk ${i + 1} ---`, + `doc_id: ${h.doc_id}`, + `chunk_id: ${h.chunk_id}`, + `page: ${h.page}`, `type: ${h.type}`, h.classification ? `classification: ${h.classification}` : null, "", @@ -108,13 +110,33 @@ export async function runDupin(task: DupinTask): Promise< const lang: "pt" | "en" = task.lang ?? "pt"; const k = task.context_chunks ?? 18; - const hits = await hybridSearch({ + // Pass 1: scoped to the requested doc, if any. + let hits = await hybridSearch({ query: task.topic, lang, doc_id: task.doc_id ?? null, top_k: k, recall_k: 80, }); + let scope_widened = false; + + // Pass 2: if a doc_id was set and the head is too thin for a tournament, + // widen the scope to the whole corpus. Cross-doc contradictions are + // valuable too (one doc says X, another says Y). + if (task.doc_id && hits.length < 2) { + const widened = await hybridSearch({ + query: task.topic, + lang, + doc_id: null, + top_k: k, + recall_k: 80, + }); + if (widened.length > hits.length) { + hits = widened; + scope_widened = true; + } + } + await audit({ event: "dupin_grounded", job_id: task.job_id, @@ -122,6 +144,7 @@ export async function runDupin(task: DupinTask): Promise< topic: task.topic, n_chunks: hits.length, doc_id: task.doc_id ?? 
null, + scope_widened, }); if (hits.length < 2) { return { skipped: true, reason: "insufficient_corpus" }; diff --git a/investigator-runtime/src/tools/write_contradiction.ts b/investigator-runtime/src/tools/write_contradiction.ts index 56ba7a6..d45391a 100644 --- a/investigator-runtime/src/tools/write_contradiction.ts +++ b/investigator-runtime/src/tools/write_contradiction.ts @@ -48,10 +48,20 @@ interface ResolvedPosition extends ContradictionPosition { page: number; } +/** + * Strip page-prefix idioms detectives sometimes emit. Canonical chunk_id is + * just `c0042`. Forms accepted: "c0042", "p007#c0042", "p007/c0042". + */ +function normalizeChunkId(raw: string): string { + const m = raw.match(/c\d{4,}$/); + return m ? m[0] : raw; +} + async function resolveChunk(doc_id: string, chunk_id: string): Promise<{ chunk_pk: number; page: number } | null> { + const cid = normalizeChunkId(chunk_id); const row = await queryOne<{ chunk_pk: number; page: number }>( `SELECT chunk_pk, page FROM public.chunks WHERE doc_id = $1 AND chunk_id = $2`, - [doc_id, chunk_id], + [doc_id, cid], ); return row ?? null; }