// disclosure-bureau/investigator-runtime/src/detectives/holmes.ts

/**
 * holmes.ts — hypothesis tournament detective.
 *
 * Workflow (matches agentic-layer-spec sec 7):
 *  1. The runtime grounds Holmes with a small corpus shortlist via
 *     hybridSearch — Holmes never gets the whole DB, just the relevant 8-15
 *     chunks.
 *  2. Claude Sonnet 4.6 reads the question + chunks, emits a JSON array of
 *     2-3 rival hypotheses with priors/posteriors/citations.
 *  3. The runtime parses the array and calls writeHypothesis() for each.
 *     The writer enforces posterior bounds + Tetlock band + FK to evidence.
 *
 * Holmes does NOT get tool calls. All grounding is pre-fed; all writes are
 * applied by the runtime after validation (sa-security gate #2).
 */
import { readFile } from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { audit } from "../lib/audit";
import { callClaude } from "../lib/claude";
import { env } from "../lib/env";
import { hybridSearch, type SearchHit } from "../lib/search";
import { writeHypothesis, type WriteHypothesisArgs } from "../tools/write_hypothesis";

// Directory that contains this module file, derived from the module URL.
const HERE = path.dirname(fileURLToPath(import.meta.url));
// Location of the Holmes system prompt: prompts/holmes.md, two directory
// levels above this module's directory. Read on every runHolmes() call.
const PROMPT_PATH = path.resolve(HERE, "..", "..", "prompts", "holmes.md");

/** Input describing one Holmes run; consumed by runHolmes(). */
export interface HolmesTask {
  /** Job identifier; attached to every audit event and passed to the
   *  hypothesis writer. */
  job_id: string;
  /** The question to investigate. Rendered as the EN question in the prompt
   *  and also used verbatim as the hybrid-search query. */
  question: string;
  /** Optional PT-BR mirror of the question. If omitted, the EN question is
   *  written into both language slots of each hypothesis. */
  question_pt_br?: string;
  /** Optional scope narrowing — restrict the search to one doc / entity. */
  doc_id?: string;
  /** Language passed to the search and used to pick which chunk text is
   *  shown to the model. Defaults to "pt". */
  lang?: "pt" | "en";
  /** How many chunks to feed Holmes. Default 12. */
  context_chunks?: number;
  /** Spend cap handed to the Claude call; falls back to
   *  env.BUDGET_CAP_USD_PER_JOB when omitted. */
  budget_cap_usd?: number;
}

/**
 * Render search hits as the plain-text corpus shortlist shown to Holmes.
 *
 * Each hit becomes one block made of:
 *  - a `--- chunk N ---` header (1-based),
 *  - the citation id `[[doc_id/pNNN#chunk_id]]` with the page zero-padded to
 *    three digits,
 *  - the chunk type,
 *  - a `classification:` line, only when the hit has a classification,
 *  - a blank separator line followed by the chunk text, truncated to 1200
 *    characters (both omitted when the hit has no text at all).
 *
 * Blocks are separated from each other by one blank line.
 *
 * @param hits Search hits, rendered in the given order.
 * @param lang Preferred language of the chunk text; the other language is
 *             used as a fallback when the preferred text is empty.
 * @returns The rendered shortlist, or an empty string when `hits` is empty.
 */
function renderChunkBlock(hits: SearchHit[], lang: "pt" | "en"): string {
  const blocks = hits.map((h, i) => {
    const text = (lang === "en" ? h.content_en : h.content_pt) || h.content_en || h.content_pt || "";
    const pageStr = String(h.page).padStart(3, "0");
    const lines: string[] = [
      `--- chunk ${i + 1} ---`,
      `id: [[${h.doc_id}/p${pageStr}#${h.chunk_id}]]`,
      `type: ${h.type}`,
    ];
    if (h.classification) lines.push(`classification: ${h.classification}`);
    // Optional lines are pushed conditionally instead of being filtered out
    // afterwards: a truthiness filter would also delete the "" separator and
    // glue the chunk text directly onto the metadata lines.
    const body = text.slice(0, 1200);
    if (body) lines.push("", body);
    return lines.join("\n");
  });
  return blocks.join("\n\n");
}

/**
 * Assemble the user prompt for Holmes: the question (EN, plus PT-BR when the
 * task supplies one), the rendered corpus shortlist, and the task
 * instructions.
 *
 * Sections are separated by blank lines so the markdown headings stand on
 * their own.
 *
 * @param task Task supplying the question(s) and the optional doc scope.
 * @param hits Search hits to render into the shortlist section.
 * @param lang Language preference forwarded to the chunk renderer.
 * @returns The complete prompt text.
 */
function buildPrompt(task: HolmesTask, hits: SearchHit[], lang: "pt" | "en"): string {
  const block = renderChunkBlock(hits, lang);
  const ptQ = task.question_pt_br?.trim();
  const scope = task.doc_id ? `, scoped to ${task.doc_id}` : "";
  const lines: Array<string | null> = [
    `# Question to investigate`,
    "",
    `**EN.** ${task.question}`,
    ptQ ? `**PT-BR.** ${ptQ}` : null,
    "",
    `## Corpus shortlist (${hits.length} chunks${scope})`,
    "",
    block,
    "",
    "## Your task",
    "",
    "Build 2-3 rival hypotheses about the question above. Each must cite at",
    "least one chunk via [[doc-id/pNNN#cNNNN]] in both argument_for and",
    "argument_against (EN) and in argument_for_pt_br and",
    "argument_against_pt_br (PT-BR). Assign priors + posteriors summing",
    "roughly to 1.0. Emit the JSON array exactly as specified by the system",
    "prompt — no prose, no code fence, no preamble. **Bilingual is mandatory:",
    "every narrative field appears in both EN and PT-BR.**",
  ];
  // Remove only the null placeholder of the optional PT-BR line. A truthiness
  // filter would also strip every "" spacer and collapse the sections.
  return lines.filter((line): line is string => line !== null).join("\n");
}

/**
 * Pull the hypotheses array out of the model's raw reply.
 *
 * A leading/trailing markdown code fence is stripped first. If what remains
 * starts with the `NO_HYPOTHESES` sentinel (optionally wrapped in backticks,
 * case-insensitive) the model declined to answer and `null` is returned.
 * Otherwise everything from the first `[` to the last `]` is parsed as JSON.
 *
 * @param text Raw model output.
 * @returns The parsed array, or `null` for the `NO_HYPOTHESES` sentinel.
 * @throws {Error} When no `[` … `]` span exists in the reply, or the parsed
 *         value is not an array.
 * @throws {SyntaxError} When the bracketed span is not valid JSON; the
 *         message includes the parser detail and the start of the reply.
 */
function extractJsonArray(text: string): unknown[] | null {
  const t = text.trim();
  const stripped = t.replace(/^```(?:json)?\s*\n?/i, "").replace(/\n?```\s*$/i, "");
  // Checked after fence stripping so a fenced sentinel is still a skip.
  if (/^`?NO_HYPOTHESES`?\b/i.test(stripped)) return null;
  const first = stripped.indexOf("[");
  const last = stripped.lastIndexOf("]");
  // `last < first` also covers a missing "]" (-1) and a "]" that only occurs
  // before the first "[", which would otherwise slice to an empty string.
  if (first === -1 || last < first) {
    throw new Error(`holmes returned no JSON array: ${t.slice(0, 200)}`);
  }
  let parsed: unknown;
  try {
    parsed = JSON.parse(stripped.slice(first, last + 1));
  } catch (e) {
    const detail = e instanceof Error ? e.message : String(e);
    // Still a SyntaxError, as JSON.parse would throw, but with reply context.
    throw new SyntaxError(`holmes returned invalid JSON (${detail}): ${t.slice(0, 200)}`);
  }
  if (!Array.isArray(parsed)) throw new Error("holmes JSON is not an array");
  return parsed;
}

/** True when `value` is a non-null, non-array object (a JSON object). */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** The trimmed text when `value` is a non-blank string, otherwise undefined. */
function trimmedOrUndefined(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}

/**
 * Run one Holmes hypothesis tournament for `task`.
 *
 * Steps:
 *  1. Ground the question with hybridSearch (top `context_chunks`, default
 *     12) and audit `holmes_grounded`. With zero hits the run is skipped
 *     with reason `no_corpus_match` and no model call is made.
 *  2. Read the system prompt from disk, call Claude with no tools allowed,
 *     and audit `detective_completed` with cost/token/duration figures.
 *  3. Parse the reply and hand at most the first three array elements to
 *     writeHypothesis. Elements that are not JSON objects, or that lack a
 *     non-blank string `position`, are ignored. A writer failure is audited
 *     as `write_hypothesis_failed` and does not stop the remaining entries.
 *
 * `prior`, `posterior` and `confidence_band` are forwarded as received
 * (numbers coerced with Number()); NOTE(review): the file header says the
 * writer validates them — confirm in write_hypothesis.
 *
 * @param task The question, optional scope and limits for this run.
 * @returns `{ hypotheses }` with whatever the writer returned for each
 *          successful write (possibly empty), or `{ skipped: true, reason }`
 *          when there was no corpus match or the model answered
 *          `NO_HYPOTHESES`.
 * @throws Propagates failures from hybridSearch, audit, readFile and
 *         callClaude, and the parse errors thrown by extractJsonArray.
 */
export async function runHolmes(task: HolmesTask): Promise<
  | { hypotheses: Array<{ hypothesis_id: string; case_file: string }> }
  | { skipped: true; reason: string }
> {
  const detective = "holmes@detective";
  const lang: "pt" | "en" = task.lang ?? "pt";
  const topK = task.context_chunks ?? 12;

  // 1. Ground with hybrid_search.
  const hits = await hybridSearch({
    query: task.question,
    lang,
    doc_id: task.doc_id ?? null,
    top_k: topK,
    recall_k: 60,
  });
  await audit({
    event: "holmes_grounded",
    job_id: task.job_id,
    detective,
    question: task.question,
    n_chunks: hits.length,
    doc_id: task.doc_id ?? null,
  });
  if (hits.length === 0) {
    return { skipped: true, reason: "no_corpus_match" };
  }

  // 2. Call Claude.
  const systemPrompt = await readFile(PROMPT_PATH, "utf-8");
  const prompt = buildPrompt(task, hits, lang);
  const llm = await callClaude({
    prompt,
    systemPrompt,
    model: env.CLAUDE_MODEL,
    allowedTools: [],
    timeoutMs: env.JOB_TIMEOUT_SECONDS * 1000,
    budgetCapUsd: task.budget_cap_usd ?? env.BUDGET_CAP_USD_PER_JOB,
  });
  await audit({
    event: "detective_completed",
    job_id: task.job_id,
    detective,
    cost_usd: llm.costUsd,
    tokens_in: llm.tokensIn,
    tokens_out: llm.tokensOut,
    duration_ms: llm.durationMs,
  });

  console.error(`[holmes] response (${llm.text.length} chars): ${llm.text.slice(0, 800)}`);

  // 3. Parse + write.
  const arr = extractJsonArray(llm.text);
  if (arr === null) return { skipped: true, reason: "NO_HYPOTHESES" };

  const out: Array<{ hypothesis_id: string; case_file: string }> = [];
  for (const raw of arr.slice(0, 3)) {
    // Model output is untrusted: a null/primitive/array element must not
    // crash the run with a TypeError on property access.
    if (!isJsonObject(raw)) continue;

    // Only a real string counts as a position; String() on an object would
    // produce "[object Object]" and get written as a hypothesis.
    const position = trimmedOrUndefined(raw.position);
    if (position === undefined) continue;

    const args: WriteHypothesisArgs = {
      question:                task.question,
      question_pt_br:          task.question_pt_br ?? task.question,
      position,
      position_pt_br:          trimmedOrUndefined(raw.position_pt_br),
      argument_for:            trimmedOrUndefined(raw.argument_for),
      argument_for_pt_br:      trimmedOrUndefined(raw.argument_for_pt_br),
      argument_against:        trimmedOrUndefined(raw.argument_against),
      argument_against_pt_br:  trimmedOrUndefined(raw.argument_against_pt_br),
      prior:                   Number(raw.prior),
      posterior:               Number(raw.posterior),
      confidence_band:         raw.confidence_band as WriteHypothesisArgs["confidence_band"],
      // Keep only references that carry a non-empty string evidence_id.
      evidence_refs: Array.isArray(raw.evidence_refs)
        ? (raw.evidence_refs as Array<{ evidence_id?: string; supports?: boolean; weight?: number }>)
            .filter((x): x is { evidence_id: string; supports?: boolean; weight?: number } =>
              typeof x?.evidence_id === "string" && x.evidence_id.length > 0)
        : [],
    };
    try {
      const written = await writeHypothesis(args, { job_id: task.job_id, detective });
      out.push(written);
    } catch (e) {
      // One rejected hypothesis must not discard its rivals: record and go on.
      await audit({
        event: "write_hypothesis_failed",
        job_id: task.job_id,
        detective,
        error: e instanceof Error ? e.message : String(e),
        position: args.position.slice(0, 200),
      });
    }
  }
  return { hypotheses: out };
}