disclosure-bureau/investigator-runtime/src/detectives/holmes.ts

/**
 * holmes.ts — hypothesis tournament detective.
 *
 * Workflow (matches agentic-layer-spec sec 7):
 *  1. The runtime grounds Holmes with a small corpus shortlist via
 *     hybridSearch — Holmes never gets the whole DB, just the relevant 8-15
 *     chunks.
 *  2. Claude Sonnet 4.6 reads the question + chunks, emits a JSON array of
 *     2-3 rival hypotheses with priors/posteriors/citations.
 *  3. The runtime parses the array and calls writeHypothesis() for each.
 *     The writer enforces posterior bounds + Tetlock band + FK to evidence.
 *
 * Holmes does NOT get tool calls. All grounding is pre-fed; all writes are
 * applied by the runtime after validation (sa-security gate #2).
 */
import { readFile } from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { audit } from "../lib/audit";
import { callClaude } from "../lib/claude";
import { env } from "../lib/env";
import { hybridSearch, type SearchHit } from "../lib/search";
import { writeHypothesis, type WriteHypothesisArgs } from "../tools/write_hypothesis";

// Directory that contains this module, derived from the ESM module URL.
const HERE = path.dirname(fileURLToPath(import.meta.url));
// Absolute path to prompts/holmes.md, resolved two directories above this
// module; runHolmes() reads this file and passes it as the system prompt.
const PROMPT_PATH = path.resolve(HERE, "..", "..", "prompts", "holmes.md");

/** Input for one Holmes run (see runHolmes). */
export interface HolmesTask {
  /** Job identifier; attached to every audit event and passed to writeHypothesis(). */
  job_id: string;
  /** The question under investigation; used as the search query and echoed into the prompt. */
  question: string;
  /** Optional scope narrowing — restrict the search to one doc / entity. */
  doc_id?: string;
  /** Preferred chunk language for search and rendering. runHolmes defaults to "pt". */
  lang?: "pt" | "en";
  /** How many chunks to feed Holmes. Default 12. */
  context_chunks?: number;
  /** Per-task budget cap in USD; runHolmes falls back to env.BUDGET_CAP_USD_PER_JOB when omitted. */
  budget_cap_usd?: number;
}

/**
 * Render search hits as the plain-text corpus shortlist that is embedded in
 * the Holmes prompt.
 *
 * Each hit becomes a block of metadata lines (chunk ordinal, wiki-link id,
 * type and, when present, classification), followed by a blank separator
 * line and the chunk text truncated to 1200 characters. Blocks are separated
 * from each other by one blank line.
 *
 * @param hits - Search hits to render, in the order given.
 * @param lang - Preferred content language; the other language is used as a
 *   fallback when the preferred text is empty.
 * @returns The rendered shortlist, or "" when `hits` is empty.
 */
function renderChunkBlock(hits: SearchHit[], lang: "pt" | "en"): string {
  const blocks = hits.map((h, i) => {
    const text = (lang === "en" ? h.content_en : h.content_pt) || h.content_en || h.content_pt || "";
    const pageStr = String(h.page).padStart(3, "0");
    // Only the optional classification line may be absent; filter on null
    // explicitly so that no intentionally empty line is ever discarded.
    const header = [
      `--- chunk ${i + 1} ---`,
      `id: [[${h.doc_id}/p${pageStr}#${h.chunk_id}]]`,
      `type: ${h.type}`,
      h.classification ? `classification: ${h.classification}` : null,
    ].filter((line): line is string => line !== null);
    const body = text.slice(0, 1200);
    // A blank line separates metadata from the chunk text. The previous
    // `.filter(Boolean)` removed that "" separator along with the null entry,
    // so the text ran directly into the metadata lines. A hit without any
    // text still renders as metadata only.
    return body ? [...header, "", body].join("\n") : header.join("\n");
  });
  return blocks.join("\n\n");
}

/**
 * Assemble the user prompt for Holmes: the question, the rendered corpus
 * shortlist and the fixed task instructions, as markdown-style sections.
 *
 * @param task - The task supplying the question and the optional doc scope.
 * @param hits - Search hits to embed as the corpus shortlist.
 * @param lang - Language preference forwarded to the chunk renderer.
 * @returns The complete prompt text.
 */
function buildPrompt(task: HolmesTask, hits: SearchHit[], lang: "pt" | "en"): string {
  const shortlist = renderChunkBlock(hits, lang);
  const scopeNote = task.doc_id ? `, scoped to ${task.doc_id}` : "";
  const shortlistHeading = `## Corpus shortlist (${hits.length} chunks${scopeNote})`;

  // The instruction paragraph is hard-wrapped; its lines are joined with
  // single newlines, whereas sections are separated by a blank line.
  const instructions = [
    "Build 2-3 rival hypotheses about the question above. Each must cite at",
    "least one chunk via [[doc-id/pNNN#cNNNN]] in argument_for and",
    "argument_against. Assign priors + posteriors summing roughly to 1.0.",
    "Emit the JSON array exactly as specified by the system prompt — no prose,",
    "no code fence, no preamble.",
  ].join("\n");

  const sections = [
    `# Question to investigate`,
    task.question,
    shortlistHeading,
    shortlist,
    "## Your task",
    instructions,
  ];
  return sections.join("\n\n");
}

/**
 * Extract the JSON array of hypotheses from the model's raw response.
 *
 * Tolerates a surrounding markdown code fence and stray text before the first
 * "[" or after the last "]". The literal sentinel `NO_HYPOTHESES` (bare or
 * inside a code fence) means the model declined to answer.
 *
 * @param text - Raw response text from the model.
 * @returns The parsed array, or null when the response is the
 *   `NO_HYPOTHESES` sentinel.
 * @throws Error when no "[" ... "]" span exists or the parsed value is not an
 *   array.
 * @throws SyntaxError when the bracketed span is not valid JSON.
 */
function extractJsonArray(text: string): unknown[] | null {
  const t = text.trim();
  if (t === "NO_HYPOTHESES") return null;
  const stripped = t.replace(/^```(?:json)?\s*\n?/i, "").replace(/\n?```\s*$/i, "").trim();
  // The sentinel may arrive wrapped in a code fence, just like the array can.
  if (stripped === "NO_HYPOTHESES") return null;
  const first = stripped.indexOf("[");
  const last = stripped.lastIndexOf("]");
  // `last < first` also covers `last === -1`, and rejects a "]" that only
  // appears before the first "[" (which would otherwise slice to "").
  if (first === -1 || last < first) {
    throw new Error(`holmes returned no JSON array: ${t.slice(0, 200)}`);
  }
  let parsed: unknown;
  try {
    parsed = JSON.parse(stripped.slice(first, last + 1));
  } catch (e) {
    // Keep the SyntaxError type JSON.parse raises, but add response context.
    const detail = e instanceof Error ? e.message : String(e);
    throw new SyntaxError(`holmes returned malformed JSON (${detail}): ${t.slice(0, 200)}`);
  }
  if (!Array.isArray(parsed)) throw new Error("holmes JSON is not an array");
  return parsed;
}

/** True when `v` is a non-null, non-array object whose fields can be read safely. */
function isPlainObject(v: unknown): v is Record<string, unknown> {
  return typeof v === "object" && v !== null && !Array.isArray(v);
}

/**
 * Coerce one element of the model's JSON array into writeHypothesis() arguments.
 *
 * @param question - The task question, copied onto every hypothesis.
 * @param raw - One untrusted element of the parsed JSON array.
 * @returns The arguments, or null when the element is not an object or has no
 *   non-empty string `position`.
 */
function toHypothesisArgs(question: string, raw: unknown): WriteHypothesisArgs | null {
  // Elements such as null, numbers or nested arrays are malformed; reading
  // fields off null would throw, so reject them up front.
  if (!isPlainObject(raw)) return null;

  // Only a real string is a usable position; stringifying an object would
  // produce "[object Object]".
  const position = typeof raw.position === "string" ? raw.position.trim() : "";
  if (!position) return null;

  // Keep only references that carry a non-empty string evidence_id.
  const evidenceRefs = Array.isArray(raw.evidence_refs)
    ? (raw.evidence_refs as Array<{ evidence_id?: string; supports?: boolean; weight?: number }>)
        .filter((r): r is { evidence_id: string; supports?: boolean; weight?: number } =>
          typeof r?.evidence_id === "string" && r.evidence_id.length > 0)
    : [];

  return {
    question,
    position,
    argument_for: typeof raw.argument_for === "string" ? raw.argument_for : undefined,
    argument_against: typeof raw.argument_against === "string" ? raw.argument_against : undefined,
    // Missing or non-numeric values become NaN here and are passed to
    // writeHypothesis() unchanged.
    prior: Number(raw.prior),
    posterior: Number(raw.posterior),
    confidence_band: raw.confidence_band as WriteHypothesisArgs["confidence_band"],
    evidence_refs: evidenceRefs,
  };
}

/**
 * Run the Holmes hypothesis tournament for one task.
 *
 * Steps:
 *  1. Ground the question with hybridSearch() and audit the shortlist size.
 *  2. Call the model with the system prompt from PROMPT_PATH and no tools,
 *     then audit cost, token and duration figures.
 *  3. Parse the response and call writeHypothesis() for up to the first three
 *     array elements. Malformed elements are skipped; a failing write is
 *     audited as `write_hypothesis_failed` and does not abort the others.
 *
 * @param task - The question, optional scope and limits for this run.
 * @returns `{ hypotheses }` with one entry per successful write (possibly
 *   empty), or `{ skipped: true, reason }` when the search returned no chunks
 *   (`no_corpus_match`) or the model answered `NO_HYPOTHESES`.
 * @throws Whatever extractJsonArray() throws for an unparseable response;
 *   errors from hybridSearch, readFile, callClaude and audit are also not
 *   caught here.
 */
export async function runHolmes(task: HolmesTask): Promise<
  | { hypotheses: Array<{ hypothesis_id: string; case_file: string }> }
  | { skipped: true; reason: string }
> {
  const detective = "holmes@detective";
  const lang: "pt" | "en" = task.lang ?? "pt";
  const k = task.context_chunks ?? 12;

  // 1. Ground with hybrid_search.
  const hits = await hybridSearch({
    query: task.question,
    lang,
    doc_id: task.doc_id ?? null,
    top_k: k,
    recall_k: 60,
  });
  await audit({
    event: "holmes_grounded",
    job_id: task.job_id,
    detective,
    question: task.question,
    n_chunks: hits.length,
    doc_id: task.doc_id ?? null,
  });
  if (hits.length === 0) {
    return { skipped: true, reason: "no_corpus_match" };
  }

  // 2. Call Claude.
  const systemPrompt = await readFile(PROMPT_PATH, "utf-8");
  const prompt = buildPrompt(task, hits, lang);
  const llm = await callClaude({
    prompt,
    systemPrompt,
    model: env.CLAUDE_MODEL,
    allowedTools: [],
    timeoutMs: env.JOB_TIMEOUT_SECONDS * 1000,
    budgetCapUsd: task.budget_cap_usd ?? env.BUDGET_CAP_USD_PER_JOB,
  });
  await audit({
    event: "detective_completed",
    job_id: task.job_id,
    detective,
    cost_usd: llm.costUsd,
    tokens_in: llm.tokensIn,
    tokens_out: llm.tokensOut,
    duration_ms: llm.durationMs,
  });

  console.error(`[holmes] response (${llm.text.length} chars): ${llm.text.slice(0, 800)}`);

  // 3. Parse + write.
  const arr = extractJsonArray(llm.text);
  if (arr === null) return { skipped: true, reason: "NO_HYPOTHESES" };

  const out: Array<{ hypothesis_id: string; case_file: string }> = [];
  // Only the first three elements are considered, matching the 2-3 rivals
  // the prompt asks for.
  for (const raw of arr.slice(0, 3)) {
    const args = toHypothesisArgs(task.question, raw);
    if (args === null) continue;
    try {
      const r = await writeHypothesis(args, { job_id: task.job_id, detective });
      out.push(r);
    } catch (e) {
      // One rejected rival must not sink the others: record it and move on.
      await audit({
        event: "write_hypothesis_failed",
        job_id: task.job_id,
        detective,
        error: e instanceof Error ? e.message : String(e),
        position: args.position.slice(0, 200),
      });
    }
  }
  return { hypotheses: out };
}
W3.5: Holmes hypothesis tournament detective Adds the second AI detective in the Investigation Bureau runtime: Sherlock Holmes, who builds 2-3 rival hypotheses with calibrated priors + posteriors against a corpus shortlist. Pipeline: 1. hybridSearch() grounds Holmes with 8-15 chunks via the same hybrid_search_chunks RPC the web uses (BM25 + dense + RRF). Default max_dense_dist=0.55 (runtime favors recall over precision; web's /api/search/hybrid stays at 0.40 for chat). 2. claude-sonnet-4-6 emits a strict JSON array with position + argument_for + argument_against + prior + posterior + confidence_band + evidence_refs. Citations use [[doc-id/pNNN#cNNNN]] wiki-links. 3. writeHypothesis() validates posterior ∈ [0,1], auto-corrects the Tetlock band from the posterior (high ≥0.90, medium 0.60-0.89, low 0.30-0.59, speculation <0.30), checks evidence_refs FK against public.evidence, INSERTs into public.hypotheses + writes case/hypotheses/H-NNNN.md. Discipline guarantees (prompts/holmes.md): - posteriors across rivals sum to ≈1.0 - no claim without chunk citation - prefer lower band when ambiguous (anti-inflation) - declarative one-sentence position, no hedging - emit `NO_HYPOTHESES` when corpus is silent (refuses to fabricate) Smoke test (Sandia green fireballs 1948-49): - H-0001 prior 0.5 → posterior 0.2 (speculation): natural meteoric - H-0002 prior 0.3 → posterior 0.4 (low): classified weapons / tests - H-0003 prior 0.2 → posterior 0.4 (low): genuinely unidentified Bayesian update visible: "natural meteoric" prior dropped 60%; both rivals climbed. 4 unique chunk citations across the 3 hypotheses. orchestrator dispatches `hypothesis_tournament` kind via runHolmes; job marked `failed` if all rivals error, `complete` otherwise. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-05-24 00:19:43 +00:00			`/**`
			`* holmes.ts — hypothesis tournament detective.`
			`*`
			`* Workflow (matches agentic-layer-spec sec 7):`
			`* 1. The runtime grounds Holmes with a small corpus shortlist via`
			`* hybridSearch — Holmes never gets the whole DB, just the relevant 8-15`
			`* chunks.`
			`* 2. Claude Sonnet 4.6 reads the question + chunks, emits a JSON array of`
			`* 2-3 rival hypotheses with priors/posteriors/citations.`
			`* 3. The runtime parses the array and calls writeHypothesis() for each.`
			`* The writer enforces posterior bounds + Tetlock band + FK to evidence.`
			`*`
			`* Holmes does NOT get tool calls. All grounding is pre-fed; all writes are`
			`* applied by the runtime after validation (sa-security gate #2).`
			`*/`
			`import { readFile } from "node:fs/promises";`
			`import path from "node:path";`
			`import { fileURLToPath } from "node:url";`
			`import { audit } from "../lib/audit";`
			`import { callClaude } from "../lib/claude";`
			`import { env } from "../lib/env";`
			`import { hybridSearch, type SearchHit } from "../lib/search";`
			`import { writeHypothesis, type WriteHypothesisArgs } from "../tools/write_hypothesis";`

			`const HERE = path.dirname(fileURLToPath(import.meta.url));`
			`const PROMPT_PATH = path.resolve(HERE, "..", "..", "prompts", "holmes.md");`

			`export interface HolmesTask {`
			`job_id: string;`
			`question: string;`
			`/** Optional scope narrowing — restrict the search to one doc / entity. */`
			`doc_id?: string;`
			`lang?: "pt" \| "en";`
			`/** How many chunks to feed Holmes. Default 12. */`
			`context_chunks?: number;`
			`budget_cap_usd?: number;`
			`}`

			`function renderChunkBlock(hits: SearchHit[], lang: "pt" \| "en"): string {`
			`const blocks = hits.map((h, i) => {`
			`const text = (lang === "en" ? h.content_en : h.content_pt) \|\| h.content_en \|\| h.content_pt \|\| "";`
			`const pageStr = String(h.page).padStart(3, "0");`
			`return [`
			`--- chunk ${i + 1} ---`,
			`id: [[${h.doc_id}/p${pageStr}#${h.chunk_id}]]`,
			`type: ${h.type}`,
			h.classification ? `classification: ${h.classification}` : null,
			`"",`
			`text.slice(0, 1200),`
			`].filter(Boolean).join("\n");`
			`});`
			`return blocks.join("\n\n");`
			`}`

			`function buildPrompt(task: HolmesTask, hits: SearchHit[], lang: "pt" \| "en"): string {`
			`const block = renderChunkBlock(hits, lang);`
			`return [`
			`# Question to investigate`,
			`"",`
			`task.question,`
			`"",`
			`## Corpus shortlist (${hits.length} chunks${task.doc_id ? `, scoped to ${task.doc_id}` : ""})`,
			`"",`
			`block,`
			`"",`
			`"## Your task",`
			`"",`
			`"Build 2-3 rival hypotheses about the question above. Each must cite at",`
			`"least one chunk via [[doc-id/pNNN#cNNNN]] in argument_for and",`
			`"argument_against. Assign priors + posteriors summing roughly to 1.0.",`
			`"Emit the JSON array exactly as specified by the system prompt — no prose,",`
			`"no code fence, no preamble.",`
			`].join("\n");`
			`}`

			`function extractJsonArray(text: string): unknown[] \| null {`
			`const t = text.trim();`
			`if (t === "NO_HYPOTHESES") return null;`
			const stripped = t.replace(/^```(?:json)?\s\n?/i, "").replace(/\n?```\s$/i, "");
			`const first = stripped.indexOf("[");`
			`const last = stripped.lastIndexOf("]");`
			`if (first === -1 \|\| last === -1) {`
			throw new Error(`holmes returned no JSON array: ${t.slice(0, 200)}`);
			`}`
			`const parsed = JSON.parse(stripped.slice(first, last + 1));`
			`if (!Array.isArray(parsed)) throw new Error("holmes JSON is not an array");`
			`return parsed;`
			`}`

			`export async function runHolmes(task: HolmesTask): Promise<`
			`\| { hypotheses: Array<{ hypothesis_id: string; case_file: string }> }`
			`\| { skipped: true; reason: string }`
			`> {`
			`const lang: "pt" \| "en" = task.lang ?? "pt";`
			`const k = task.context_chunks ?? 12;`

			`// 1. Ground with hybrid_search.`
			`const hits = await hybridSearch({`
			`query: task.question,`
			`lang,`
			`doc_id: task.doc_id ?? null,`
			`top_k: k,`
			`recall_k: 60,`
			`});`
			`await audit({`
			`event: "holmes_grounded",`
			`job_id: task.job_id,`
			`detective: "holmes@detective",`
			`question: task.question,`
			`n_chunks: hits.length,`
			`doc_id: task.doc_id ?? null,`
			`});`
			`if (hits.length === 0) {`
			`return { skipped: true, reason: "no_corpus_match" };`
			`}`

			`// 2. Call Claude.`
			`const systemPrompt = await readFile(PROMPT_PATH, "utf-8");`
			`const prompt = buildPrompt(task, hits, lang);`
			`const llm = await callClaude({`
			`prompt,`
			`systemPrompt,`
			`model: env.CLAUDE_MODEL,`
			`allowedTools: [],`
			`timeoutMs: env.JOB_TIMEOUT_SECONDS * 1000,`
			`budgetCapUsd: task.budget_cap_usd ?? env.BUDGET_CAP_USD_PER_JOB,`
			`});`
			`await audit({`
			`event: "detective_completed",`
			`job_id: task.job_id,`
			`detective: "holmes@detective",`
			`cost_usd: llm.costUsd,`
			`tokens_in: llm.tokensIn,`
			`tokens_out: llm.tokensOut,`
			`duration_ms: llm.durationMs,`
			`});`

			console.error(`[holmes] response (${llm.text.length} chars): ${llm.text.slice(0, 800)}`);

			`// 3. Parse + write.`
			`const arr = extractJsonArray(llm.text);`
			`if (arr === null) return { skipped: true, reason: "NO_HYPOTHESES" };`

			`const out: Array<{ hypothesis_id: string; case_file: string }> = [];`
			`for (const raw of arr.slice(0, 3)) {`
			`const args: WriteHypothesisArgs = {`
			`question: task.question,`
			`position: String((raw as { position?: unknown }).position ?? "").trim(),`
			`argument_for: typeof (raw as { argument_for?: unknown }).argument_for === "string"`
			`? (raw as { argument_for: string }).argument_for : undefined,`
			`argument_against: typeof (raw as { argument_against?: unknown }).argument_against === "string"`
			`? (raw as { argument_against: string }).argument_against : undefined,`
			`prior: Number((raw as { prior?: unknown }).prior),`
			`posterior: Number((raw as { posterior?: unknown }).posterior),`
			`confidence_band: (raw as { confidence_band?: WriteHypothesisArgs["confidence_band"] }).confidence_band,`
			`evidence_refs: Array.isArray((raw as { evidence_refs?: unknown }).evidence_refs)`
			`? (raw as { evidence_refs: Array<{ evidence_id?: string; supports?: boolean; weight?: number }> }).evidence_refs`
			`.filter((r): r is { evidence_id: string; supports?: boolean; weight?: number } =>`
			`typeof r?.evidence_id === "string" && r.evidence_id.length > 0)`
			`: [],`
			`};`
			`if (!args.position) continue;`
			`try {`
			`const r = await writeHypothesis(args, { job_id: task.job_id, detective: "holmes@detective" });`
			`out.push(r);`
			`} catch (e) {`
			`await audit({`
			`event: "write_hypothesis_failed",`
			`job_id: task.job_id,`
			`detective: "holmes@detective",`
			`error: (e as Error).message,`
			`position: args.position.slice(0, 200),`
			`});`
			`}`
			`}`
			`return { hypotheses: out };`
			`}`