disclosure-bureau/investigator-runtime/src/tools/write_calibration.ts
Luiz Gustavo 67185ff518
Some checks failed
CI / Web — typecheck + lint + build (push) Failing after 40s
CI / Scripts — Python smoke (push) Failing after 3s
CI / Web — npm audit (push) Failing after 31s
CI / Retrieval — golden set (Recall@5 + MRR) (push) Failing after 4s
W3.9: surface the Investigation Bureau on the homepage + /bureau hub
Closes a UX gap the user surfaced: W3.5-3.8 built 8 detectives, 4 new
URL endpoints (/jobs/[id], /h/[id], /c/[slug], /api/h/[id]/red-team)
and a chat tool, but the homepage was unchanged — the bureau was
invisible unless you knew the URL or asked the chat to invoke
request_investigation.

Homepage (web/app/page.tsx):
  - Title `▍ war.gov/ufo — Investigative Wiki` → `▍ The Disclosure Bureau`
  - Subtitle expanded from "Holmes · Poirot · Dupin · Locard" to all 8
    detectives (Holmes · Locard · Dupin · Schneier · Poirot · Taleb ·
    Tetlock · Case-Writer)
  - New `🔎 bureau` topbar link (gold, between graph/stats and batch)
  - BureauSnapshot inserted right after the header

BureauSnapshot (web/components/bureau-snapshot.tsx) — server component:
  - 8 detective tiles with role labels (each in its tone color)
  - 6 clickable counters (evidence / hypotheses / contradictions /
    witnesses / outliers / case reports) — anchor to /bureau#section
  - 6 "recent artefacts" columns surfacing the last 3-4 of each kind:
    hypotheses with prior→posterior + band + ↳reviewed_by marker,
    contradictions with topic + resolution_status, evidence with
    Grade badge + verbatim quote, outliers with title + scope.kind,
    witness analyses with canonical_name + credibility + verdict,
    case reports with slug + link to /c/<slug>
  - "Recent jobs" strip linking to /jobs/[id] color-coded by status
  - Reports read from /data/ufo/case/reports/ via fs.readdir + stat,
    sorted by mtime — no DB round-trip needed for that section

/bureau (web/app/bureau/page.tsx) — full hub:
  - Header with full counts
  - 7 sections (anchored to homepage counter links): Case reports,
    Hypotheses, Evidence, Contradictions, Outliers, Witnesses,
    Recent jobs table — each rendering up to 100 rows
  - Reports section parses frontmatter from each .md to surface topic
    + n_hypotheses + n_evidence on the card

Runtime fixes batched in:
  - Poirot: coerce entity_pk via Number() — node-postgres returns
    BIGINT as string by default; writer's Number.isFinite() rejected
    it as "person_entity_pk required" (j-edgar-hoover retry path)
  - Tetlock: write_calibration rationale cap 600 → 1200 chars. Prompt
    still asks ≤ 600 but a 2× slack beats failing the job on honest
    analysis. Observed live: Tetlock emitted ~620 chars on H-0003 and
    the writer rejected the entire calibration.
  - Case-Writer: Promise.all of 5 queries × max_parallel=2 jobs
    demanded up to 10 connections against the investigator role's
    rolconnlimit=4 → "too many connections for role investigator".
    Sequentialized — the LLM call is the hot path, not these queries.

Smoke results visible now on the homepage:
  - 3 hypotheses (H-0001/2/3) about green fireballs origin
  - 3 contradictions (R-0001/2/3) about color, geographic confinement,
    exclusive-green vs multicolored
  - 2 evidence cards (E-0002/3) Grade B
  - 3 outliers (G-0001/2/3) — including Taleb's deliberate
    meteor-shower-camouflage flag
  - 1 case report at /c/green-fireballs-sandia (Watson 13.4 KB,
    five-act narrative, fully cited)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 22:41:28 -03:00

165 lines
6.1 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* write_calibration.ts — Tetlock's primary writer.
*
* UPDATEs public.hypotheses (posterior + confidence_band + reviewed_by +
* updated_at, plus status = 'superseded' when the action is "supersede") and
* APPENDS an entry to the "## Calibration history" section of the H-NNNN.md
* case file, creating the section if it is missing. Each calibration includes
* a timestamp + old/new posterior + recommended_action + rationale.
*/
import { readFile, writeFile } from "node:fs/promises";
import path from "node:path";
import { audit } from "../lib/audit";
import { env } from "../lib/env";
import { query, queryOne } from "../lib/pg";
/** Input payload for writeCalibration: one calibration verdict for a single hypothesis. */
export interface WriteCalibrationArgs {
/** Hypothesis identifier; writeCalibration requires the form H-NNNN (exactly four digits). */
hypothesis_id: string;
/** Updated posterior; writeCalibration rejects non-finite values and values outside [0, 1]. */
new_posterior: number;
/**
* Band label for the new posterior. writeCalibration overwrites this field
* with the band derived from new_posterior, so the caller's value is not trusted.
*/
new_confidence_band: "high" | "medium" | "low" | "speculation";
/**
* Signed posterior movement, rendered with three decimals in the case file.
* Presumably new_posterior - old_posterior — the writer does not verify this.
*/
delta: number;
/** Free-text justification; writeCalibration requires it to be non-blank and at most 1200 characters. */
rationale: string;
/** Verdict on the hypothesis; "supersede" additionally sets the row's status to 'superseded'. */
recommended_action: "keep" | "downgrade" | "upgrade" | "supersede";
/** Why the hypothesis is superseded; must be non-blank when recommended_action is "supersede". */
supersede_reason?: string;
/** previous posterior captured at call time — used in the case-file row. */
old_posterior: number | null;
/** Previous band captured at call time; the case-file row shows "—" when this is null. */
old_confidence_band: string | null;
}
/** Provenance attached to a calibration: which job and which detective produced it. */
export interface WriteCalibrationContext {
/** Job identifier; written into the case-file entry and the audit event. */
job_id: string;
/** Detective name; stored as reviewed_by on the hypothesis row and shown in the case-file entry. */
detective: string;
}
/** Markdown heading of the case-file section that collects calibration entries. */
const SECTION_MARKER = "## Calibration history";
/**
 * Maps a posterior probability onto its confidence band.
 *
 * Bands are defined by inclusive lower bounds: 0.90 → "high", 0.60 → "medium",
 * 0.30 → "low"; anything below (or a value that compares false against every
 * bound, such as NaN) is "speculation".
 */
function bandFromPosterior(p: number): "high" | "medium" | "low" | "speculation" {
  // Ordered from the highest floor down so the first match wins.
  const floors = [
    [0.9, "high"],
    [0.6, "medium"],
    [0.3, "low"],
  ] as const;
  for (const [floor, band] of floors) {
    if (p >= floor) {
      return band;
    }
  }
  return "speculation";
}
/**
 * Renders one calibration entry as a Markdown fragment for the case file.
 *
 * The fragment consists of a level-3 heading (`### <ISO timestamp> — <action>`),
 * an attribution line naming the detective and job, an old/new table for the
 * posterior, band and delta, and the rationale. A supersede reason is added
 * only when the action is "supersede" and a reason was supplied.
 *
 * @param args - Calibration payload; `delta` must be a number because it is
 *   formatted with `toFixed(3)`.
 * @param ctx - Job and detective used for the attribution line.
 * @returns The Markdown fragment, terminated by a single newline.
 */
function buildSection(args: WriteCalibrationArgs, ctx: WriteCalibrationContext): string {
  const ts = new Date().toISOString();
  // Non-negative deltas get an explicit "+"; negative ones already carry "-".
  const signedDelta = `${args.delta >= 0 ? "+" : ""}${args.delta.toFixed(3)}`;
  const rows = [
    // Separator keeps the timestamp and the action from fusing into one token.
    `### ${ts} — ${args.recommended_action}`,
    "",
    `_Calibrated by ${ctx.detective} — job \`${ctx.job_id}\`._`,
    "",
    `| field | old | new |`,
    `|---|---|---|`,
    `| posterior | ${args.old_posterior ?? "—"} | **${args.new_posterior}** |`,
    `| band | ${args.old_confidence_band ?? "—"} | **${args.new_confidence_band}** |`,
    `| delta | — | ${signedDelta} |`,
    "",
    `**Rationale.** ${args.rationale}`,
  ];
  if (args.recommended_action === "supersede" && args.supersede_reason) {
    rows.push("", `**Supersede reason.** ${args.supersede_reason}`);
  }
  // Trailing empty row makes the joined fragment end with a newline.
  rows.push("");
  return rows.join("\n");
}
/**
 * Adds a calibration entry to a case file's calibration-history section.
 *
 * Calibration history is append-only (the calibrator can be invoked many
 * times and each datapoint matters), so existing entries are never rewritten.
 *
 * - If no line equals the section heading, the heading and the entry are
 *   appended at the end of the document.
 * - Otherwise the entry is inserted at the end of that section, i.e. just
 *   before the next level-1 or level-2 heading, or at the end of the document
 *   when the section is the last one.
 *
 * @param existing - Current contents of the case file.
 * @param section - Rendered entry; expected to end with a newline.
 * @param marker - Heading line that opens the history section.
 * @returns The updated document.
 */
function appendCalibration(existing: string, section: string, marker: string = SECTION_MARKER): string {
  const doc = existing.trimEnd();
  const lines = doc.split("\n");

  // Match the heading as a whole line so a marker on the first line is found
  // and headings that merely start with the marker text are not.
  const start = lines.findIndex((line) => line.trimEnd() === marker);
  if (start === -1) {
    const prefix = doc === "" ? "" : doc + "\n\n";
    return prefix + marker + "\n\n" + section;
  }

  // The section runs until the next heading of the same or a higher level.
  const end = lines.findIndex((line, i) => i > start && /^#{1,2}\s/.test(line));
  if (end === -1) {
    // History is the last section: append at the end of the document.
    return doc + "\n" + section;
  }

  // History is followed by other sections: insert the entry before them so it
  // does not end up under an unrelated heading.
  const head = lines.slice(0, end).join("\n").trimEnd();
  const tail = lines.slice(end).join("\n");
  return head + "\n" + section + "\n" + tail + "\n";
}
/**
 * Persists one calibration of a hypothesis: updates its row in
 * public.hypotheses, appends an entry to its Markdown case file and emits a
 * "write_calibration" audit event.
 *
 * All input validation, the existence check, the case-file read and the
 * rendering of the new entry happen before the UPDATE, so a malformed payload
 * or a missing case file cannot leave the database changed without a matching
 * history entry. The UPDATE and the file write are still not atomic: a failing
 * write after a successful UPDATE leaves the row updated.
 *
 * Side effect on the argument: `body.new_confidence_band` is overwritten with
 * the band derived from `body.new_posterior`.
 *
 * @param body - Calibration payload (see WriteCalibrationArgs).
 * @param ctx - Job and detective performing the calibration.
 * @returns The hypothesis id, the case-file path, the new posterior and the action.
 * @throws Error when hypothesis_id is not H-NNNN, new_posterior is outside
 *   [0, 1], delta is not a finite number, rationale is blank or longer than
 *   1200 characters, recommended_action is unknown, supersede_reason is missing
 *   for a supersede, the hypothesis row does not exist, or the case file
 *   cannot be read.
 */
export async function writeCalibration(
  body: WriteCalibrationArgs,
  ctx: WriteCalibrationContext,
): Promise<{ hypothesis_id: string; case_file: string; new_posterior: number; recommended_action: string }> {
  if (!body.hypothesis_id?.match(/^H-\d{4}$/)) {
    throw new Error(`bad hypothesis_id: ${body.hypothesis_id}`);
  }
  if (!Number.isFinite(body.new_posterior) || body.new_posterior < 0 || body.new_posterior > 1) {
    throw new Error(`new_posterior out of range: ${body.new_posterior}`);
  }
  // delta is formatted with toFixed() in the case-file entry; reject anything
  // that is not a finite number here instead of failing after the DB write.
  if (!Number.isFinite(body.delta)) {
    throw new Error(`delta must be a finite number: ${body.delta}`);
  }
  // Force the band to match the posterior — Tetlock can mis-label.
  body.new_confidence_band = bandFromPosterior(body.new_posterior);
  if (!body.rationale?.trim()) throw new Error("rationale required");
  // Soft cap: 1200 chars. Tetlock often writes 600-800 of substantive
  // reasoning + chunk citations; the prompt asks for ≤ 600 but a 2× slack
  // beats failing the job on an honest analysis.
  if (body.rationale.length > 1200) throw new Error(`rationale too long (${body.rationale.length} > 1200)`);
  const action = body.recommended_action;
  if (!["keep", "downgrade", "upgrade", "supersede"].includes(action)) {
    throw new Error(`bad recommended_action: ${action}`);
  }
  if (action === "supersede" && !body.supersede_reason?.trim()) {
    throw new Error("supersede_reason required when action == supersede");
  }

  // Verify hypothesis exists.
  const h = await queryOne<{ hypothesis_id: string; status: string }>(
    `SELECT hypothesis_id, status FROM public.hypotheses WHERE hypothesis_id = $1`,
    [body.hypothesis_id],
  );
  if (!h) throw new Error(`hypothesis not found: ${body.hypothesis_id}`);

  // Load the case file and render the new contents BEFORE touching the row, so
  // a missing file aborts the calibration without a partial write.
  const file = path.join(env.CASE_ROOT, "hypotheses", `${body.hypothesis_id}.md`);
  let existing: string;
  try {
    existing = await readFile(file, "utf-8");
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    throw new Error(`hypothesis case file missing: ${file} (${reason})`);
  }
  const next = appendCalibration(existing, buildSection(body, ctx));

  // UPDATE DB: posterior + band (always), status='superseded' when action=='supersede'.
  if (action === "supersede") {
    await query(
      `UPDATE public.hypotheses
          SET posterior = $1, confidence_band = $2, status = 'superseded',
              reviewed_by = $3, updated_at = NOW()
        WHERE hypothesis_id = $4`,
      [body.new_posterior, body.new_confidence_band, ctx.detective, body.hypothesis_id],
    );
  } else if (action === "keep" && Math.abs(body.delta) < 0.005) {
    // Pure keep with no movement — only touch updated_at + reviewed_by.
    await query(
      `UPDATE public.hypotheses
          SET reviewed_by = $1, updated_at = NOW()
        WHERE hypothesis_id = $2`,
      [ctx.detective, body.hypothesis_id],
    );
  } else {
    await query(
      `UPDATE public.hypotheses
          SET posterior = $1, confidence_band = $2,
              reviewed_by = $3, updated_at = NOW()
        WHERE hypothesis_id = $4`,
      [body.new_posterior, body.new_confidence_band, ctx.detective, body.hypothesis_id],
    );
  }

  // Append calibration row to the case file.
  await writeFile(file, next, "utf-8");

  await audit({
    event: "write_calibration",
    job_id: ctx.job_id,
    detective: ctx.detective,
    hypothesis_id: body.hypothesis_id,
    new_posterior: body.new_posterior,
    new_confidence_band: body.new_confidence_band,
    delta: body.delta,
    recommended_action: action,
    file,
  });

  return {
    hypothesis_id: body.hypothesis_id,
    case_file: file,
    new_posterior: body.new_posterior,
    recommended_action: action,
  };
}