178 lines
6.5 KiB
TypeScript
178 lines
6.5 KiB
TypeScript
|
|
/**
 * holmes.ts — hypothesis tournament detective.
 *
 * Workflow (matches agentic-layer-spec sec 7):
 *   1. The runtime grounds Holmes with a small corpus shortlist via
 *      hybridSearch — Holmes never gets the whole DB, just the relevant 8-15
 *      chunks.
 *   2. Claude Sonnet 4.6 reads the question + chunks, emits a JSON array of
 *      2-3 rival hypotheses with priors/posteriors/citations.
 *   3. The runtime parses the array and calls writeHypothesis() for each.
 *      The writer enforces posterior bounds + Tetlock band + FK to evidence.
 *
 * Holmes does NOT get tool calls. All grounding is pre-fed; all writes are
 * applied by the runtime after validation (sa-security gate #2).
 */
|
||
|
|
import { readFile } from "node:fs/promises";
|
||
|
|
import path from "node:path";
|
||
|
|
import { fileURLToPath } from "node:url";
|
||
|
|
import { audit } from "../lib/audit";
|
||
|
|
import { callClaude } from "../lib/claude";
|
||
|
|
import { env } from "../lib/env";
|
||
|
|
import { hybridSearch, type SearchHit } from "../lib/search";
|
||
|
|
import { writeHypothesis, type WriteHypothesisArgs } from "../tools/write_hypothesis";
|
||
|
|
|
||
|
|
// Directory that contains this module, derived from the module's own URL.
const HERE = path.dirname(fileURLToPath(import.meta.url));

// Location of the Holmes system prompt: prompts/holmes.md, two directory
// levels above this module.
const PROMPT_PATH = path.join(HERE, "..", "..", "prompts", "holmes.md");
|
||
|
|
|
||
|
|
/** Input describing one Holmes investigation, as consumed by runHolmes. */
export interface HolmesTask {
  /** Job identifier; runHolmes attaches it to audit events and hypothesis writes. */
  job_id: string;
  /** The question to investigate; also used as the corpus search query. */
  question: string;
  /** Optional scope narrowing — restrict the search to one doc / entity. */
  doc_id?: string;
  /** Preferred language for chunk text. runHolmes uses "pt" when omitted. */
  lang?: "pt" | "en";
  /** How many chunks to feed Holmes. Default 12. */
  context_chunks?: number;
  /** Spend cap in USD for the model call; runHolmes falls back to env.BUDGET_CAP_USD_PER_JOB. */
  budget_cap_usd?: number;
}
|
||
|
|
|
||
|
|
/**
 * Renders search hits as the numbered plain-text chunk blocks embedded in the
 * Holmes prompt.
 *
 * Each block consists of a `--- chunk N ---` header, an
 * `id: [[doc/pNNN#chunk]]` citation handle (page zero-padded to 3 digits),
 * the chunk type, an optional classification line, one blank separator line,
 * and at most the first 1200 characters of the chunk text. Blocks are
 * separated from each other by a blank line.
 *
 * @param hits - Search hits to render, in display order.
 * @param lang - Preferred text language; when that text is empty the English
 *   text and then the Portuguese text are tried before falling back to "".
 * @returns The rendered blocks, or "" when `hits` is empty.
 */
function renderChunkBlock(hits: SearchHit[], lang: "pt" | "en"): string {
  const blocks = hits.map((hit, index) => {
    const text =
      (lang === "en" ? hit.content_en : hit.content_pt) || hit.content_en || hit.content_pt || "";
    const pageStr = String(hit.page).padStart(3, "0");
    const lines: Array<string | null> = [
      `--- chunk ${index + 1} ---`,
      `id: [[${hit.doc_id}/p${pageStr}#${hit.chunk_id}]]`,
      `type: ${hit.type}`,
      hit.classification ? `classification: ${hit.classification}` : null,
      "",
      text.slice(0, 1200),
    ];
    // Drop only the absent optional line. filter(Boolean) would also discard
    // the "" entry and glue the chunk text straight onto the metadata lines.
    return lines.filter((line): line is string => line !== null).join("\n");
  });
  return blocks.join("\n\n");
}
|
||
|
|
|
||
|
|
/**
 * Assembles the user prompt for Holmes from three sections: the question,
 * the rendered corpus shortlist, and the fixed task instructions.
 *
 * @param task - Supplies the question and, when present, the doc_id shown in
 *   the shortlist heading.
 * @param hits - Chunks to embed; their count appears in the shortlist heading.
 * @param lang - Language preference forwarded to the chunk renderer.
 * @returns The prompt text, with lines joined by "\n".
 */
function buildPrompt(task: HolmesTask, hits: SearchHit[], lang: "pt" | "en"): string {
  const scopeNote = task.doc_id ? `, scoped to ${task.doc_id}` : "";

  const questionSection = [`# Question to investigate`, "", task.question, ""];

  const corpusSection = [
    `## Corpus shortlist (${hits.length} chunks${scopeNote})`,
    "",
    renderChunkBlock(hits, lang),
    "",
  ];

  const taskSection = [
    "## Your task",
    "",
    "Build 2-3 rival hypotheses about the question above. Each must cite at",
    "least one chunk via [[doc-id/pNNN#cNNNN]] in argument_for and",
    "argument_against. Assign priors + posteriors summing roughly to 1.0.",
    "Emit the JSON array exactly as specified by the system prompt — no prose,",
    "no code fence, no preamble.",
  ];

  return [...questionSection, ...corpusSection, ...taskSection].join("\n");
}
|
||
|
|
|
||
|
|
/**
 * Extracts the JSON array of hypotheses from the raw model response.
 *
 * The response may be wrapped in a Markdown code fence and may carry text
 * before or after the array; everything from the first "[" to the last "]"
 * is parsed.
 *
 * @param text - Raw text returned by the model.
 * @returns The parsed array, or `null` when the response is the
 *   `NO_HYPOTHESES` sentinel (bare or inside a code fence).
 * @throws Error when no "[" ... "]" span exists, or the parsed value is not
 *   an array.
 * @throws SyntaxError when the bracketed span is not valid JSON; the message
 *   includes the parser detail and the first 200 characters of the response.
 */
function extractJsonArray(text: string): unknown[] | null {
  const sentinel = "NO_HYPOTHESES";
  const trimmed = text.trim();
  if (trimmed === sentinel) return null;

  const unfenced = trimmed
    .replace(/^```(?:json)?\s*\n?/i, "")
    .replace(/\n?```\s*$/i, "")
    .trim();
  // The sentinel may arrive wrapped in a code fence despite the instructions.
  if (unfenced === sentinel) return null;

  const first = unfenced.indexOf("[");
  const last = unfenced.lastIndexOf("]");
  // `last < first` also covers a missing "]" (-1) and a "]" that only occurs
  // before the first "[", which would otherwise hand "" to JSON.parse.
  if (first === -1 || last < first) {
    throw new Error(`holmes returned no JSON array: ${trimmed.slice(0, 200)}`);
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(unfenced.slice(first, last + 1));
  } catch (e: unknown) {
    const detail = e instanceof Error ? e.message : String(e);
    // Still a SyntaxError, as JSON.parse would throw, but with enough context
    // to diagnose the response from the error alone.
    throw new SyntaxError(`holmes returned malformed JSON (${detail}): ${trimmed.slice(0, 200)}`);
  }
  if (!Array.isArray(parsed)) throw new Error("holmes JSON is not an array");
  return parsed;
}
|
||
|
|
|
||
|
|
/**
 * Coerces one element of the model's JSON array into writeHypothesis arguments.
 *
 * The element is untrusted model output, so every field is checked before use.
 *
 * @param raw - One element of the parsed array; may be any JSON value.
 * @param question - The question the hypothesis answers.
 * @returns The arguments, or `null` when `raw` is not an object or has no
 *   non-empty string `position`.
 */
function toHypothesisArgs(raw: unknown, question: string): WriteHypothesisArgs | null {
  // A null or primitive element has no fields to read; reading `.position`
  // from null would throw and abort the whole run.
  if (typeof raw !== "object" || raw === null) return null;
  const entry = raw as Record<string, unknown>;

  // Only a real string is accepted: String() on an object would produce
  // "[object Object]" and be written as if it were a hypothesis.
  const position = typeof entry.position === "string" ? entry.position.trim() : "";
  if (!position) return null;

  type EvidenceRef = { evidence_id: string; supports?: boolean; weight?: number };
  const refs: Array<Partial<EvidenceRef>> = Array.isArray(entry.evidence_refs)
    ? entry.evidence_refs
    : [];

  return {
    question,
    position,
    argument_for: typeof entry.argument_for === "string" ? entry.argument_for : undefined,
    argument_against: typeof entry.argument_against === "string" ? entry.argument_against : undefined,
    // Missing or non-numeric values become NaN and are passed to the writer as-is.
    prior: Number(entry.prior),
    posterior: Number(entry.posterior),
    confidence_band: entry.confidence_band as WriteHypothesisArgs["confidence_band"],
    // Keep only refs that carry a non-empty string evidence_id.
    evidence_refs: refs.filter(
      (r): r is EvidenceRef => typeof r?.evidence_id === "string" && r.evidence_id.length > 0,
    ),
  };
}

/**
 * Runs one Holmes investigation: ground, ask the model, persist hypotheses.
 *
 * Steps:
 *   1. Search the corpus for `task.question` (optionally scoped to
 *      `task.doc_id`) and emit a `holmes_grounded` audit event.
 *   2. Call the model with the system prompt read from PROMPT_PATH and an
 *      empty tool allow-list, then emit a `detective_completed` audit event
 *      and log the first 800 characters of the response to stderr.
 *   3. Parse the response and call writeHypothesis for at most the first
 *      three array entries. Entries that are not objects or lack a string
 *      `position` are skipped; a failing write is recorded as a
 *      `write_hypothesis_failed` audit event and does not stop the loop.
 *
 * @param task - The investigation request.
 * @returns `{ skipped: true, reason }` when the search finds nothing
 *   ("no_corpus_match") or the model answers with the NO_HYPOTHESES sentinel;
 *   otherwise the results of the successful writes (possibly an empty list).
 * @throws Whatever the search, prompt read, model call, audit calls or
 *   response parsing throw — none of these are caught here.
 */
export async function runHolmes(task: HolmesTask): Promise<
  | { hypotheses: Array<{ hypothesis_id: string; case_file: string }> }
  | { skipped: true; reason: string }
> {
  const lang: "pt" | "en" = task.lang ?? "pt";
  const k = task.context_chunks ?? 12;

  // 1. Ground with hybrid_search.
  const hits = await hybridSearch({
    query: task.question,
    lang,
    doc_id: task.doc_id ?? null,
    top_k: k,
    recall_k: 60,
  });
  await audit({
    event: "holmes_grounded",
    job_id: task.job_id,
    detective: "holmes@detective",
    question: task.question,
    n_chunks: hits.length,
    doc_id: task.doc_id ?? null,
  });
  if (hits.length === 0) {
    return { skipped: true, reason: "no_corpus_match" };
  }

  // 2. Call Claude.
  const systemPrompt = await readFile(PROMPT_PATH, "utf-8");
  const prompt = buildPrompt(task, hits, lang);
  const llm = await callClaude({
    prompt,
    systemPrompt,
    model: env.CLAUDE_MODEL,
    allowedTools: [],
    timeoutMs: env.JOB_TIMEOUT_SECONDS * 1000,
    budgetCapUsd: task.budget_cap_usd ?? env.BUDGET_CAP_USD_PER_JOB,
  });
  await audit({
    event: "detective_completed",
    job_id: task.job_id,
    detective: "holmes@detective",
    cost_usd: llm.costUsd,
    tokens_in: llm.tokensIn,
    tokens_out: llm.tokensOut,
    duration_ms: llm.durationMs,
  });

  console.error(`[holmes] response (${llm.text.length} chars): ${llm.text.slice(0, 800)}`);

  // 3. Parse + write.
  const arr = extractJsonArray(llm.text);
  if (arr === null) return { skipped: true, reason: "NO_HYPOTHESES" };

  const out: Array<{ hypothesis_id: string; case_file: string }> = [];
  // Only the first three entries are considered, even if the model emits more.
  for (const raw of arr.slice(0, 3)) {
    const args = toHypothesisArgs(raw, task.question);
    if (args === null) continue;
    try {
      const r = await writeHypothesis(args, { job_id: task.job_id, detective: "holmes@detective" });
      out.push(r);
    } catch (e: unknown) {
      // One rejected hypothesis must not discard the others: record and move on.
      await audit({
        event: "write_hypothesis_failed",
        job_id: task.job_id,
        detective: "holmes@detective",
        error: e instanceof Error ? e.message : String(e),
        position: args.position.slice(0, 200),
      });
    }
  }
  return { hypotheses: out };
}
|