disclosure-bureau/web/lib/chat/vision.ts

/**
 * vision.ts — answer questions about an image region via Claude Code OAuth.
 *
 * Pattern matches the project's existing vision pipeline (02-vision-page.py):
 *   1. Crop the PNG of the requested page to the requested bbox.
 *   2. Spawn `claude -p --model sonnet --allowedTools Read` and instruct the
 *      model to Read the local PNG path and answer the user's question.
 *
 * Uses the user's Claude Code OAuth (Max 20x). Per W1.2 budget policy, the
 * agentic worker may use Opus 4.7 without hard cap, but `analyze_image_region`
 * runs synchronously in the chat path — keep it on Sonnet for latency.
 */
import { spawn } from "node:child_process";
import { mkdtemp, unlink, rmdir } from "node:fs/promises";
import path from "node:path";
import os from "node:os";
import sharp from "sharp";
import { PROCESSING } from "@/lib/wiki";

/**
 * Parse a timeout expressed in milliseconds, falling back when it is unusable.
 *
 * A missing, non-numeric, zero or negative setting must not reach `setTimeout`:
 * such delays are treated as ~1 ms, which would kill the CLI immediately.
 *
 * @param raw - Raw setting (typically an environment variable); may be undefined.
 * @param fallbackMs - Value returned when `raw` is not a finite number > 0.
 * @returns A finite, strictly positive number of milliseconds.
 */
function parseTimeoutMs(raw: string | undefined, fallbackMs: number): number {
  const parsed = Number(raw);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallbackMs;
}

/** Model name passed to `claude --model`; override with VISION_MODEL. */
const MODEL = process.env.VISION_MODEL || "sonnet";
/** Wall-clock limit for one CLI call, in ms; override with VISION_TIMEOUT_MS. */
const TIMEOUT_MS = parseTimeoutMs(process.env.VISION_TIMEOUT_MS, 120_000);

/** Input for `analyzeImageRegion`: which page region to inspect and what to ask about it. */
export interface AnalyzeImageRegionArgs {
  /** Document identifier; used as the directory name under the PNG root when locating the page image. */
  doc_id: string;
  /** Page number, >= 1. Page images are named `p-001.png`, `p-002.png`, ... */
  page: number;
  /** Region to crop, as fractions of the page width/height (normalized 0..1). */
  bbox: { x: number; y: number; w: number; h: number };
  /** The question the model should answer about the cropped region. */
  question: string;
  /** Optional context to ground the model (e.g., "this is an FBI memo from 1947"). */
  context?: string;
  /**
   * Output language hint. "en" answers in English; any other value, or
   * omitting it, answers in Brazilian Portuguese (pt-br).
   */
  lang?: "pt" | "en";
}

/** Result returned by `analyzeImageRegion`. */
export interface AnalyzeImageRegionResult {
  /** The model's answer text, with surrounding whitespace trimmed. */
  answer: string;
  /** Model name that was passed to the CLI (the module's MODEL constant). */
  model: string;
  /** Duration reported by the CLI, or the locally measured elapsed time when none is reported. */
  duration_ms: number;
  /** The bbox from the request, echoed back unchanged. */
  bbox: { x: number; y: number; w: number; h: number };
  /** Relative `/api/crop?...` URL built from the same document, page and bbox. */
  crop_url: string;
}

/**
 * Build the PNG file name for a page: `p-`, the page number left-padded with
 * zeros to at least three digits, then `.png`.
 */
function pageFilename(page: number): string {
  const digits = `${page}`;
  const padded = digits.padStart(3, "0");
  return "p-" + padded + ".png";
}

/**
 * Crop a bbox region of a page PNG and write the result to a temp file.
 *
 * The bbox is interpreted as fractions of the page size and clamped so the
 * extracted rectangle always lies inside the image (at least 1x1 px). The crop
 * is downscaled to at most 1024 px wide and never enlarged.
 *
 * @param args - Request; only `doc_id`, `page` and `bbox` are read here.
 * @returns The crop file path and the freshly created temp directory holding
 *   it. The caller owns both and must delete them.
 * @throws Error if `doc_id` resolves outside the PNG root, if the source PNG
 *   has no readable dimensions, or if sharp fails (in which case the temp
 *   directory is removed before rethrowing).
 */
async function cropToTempFile(args: AnalyzeImageRegionArgs): Promise<{ path: string; dir: string }> {
  const pngRoot = path.resolve(PROCESSING, "png");
  const sourcePath = path.join(PROCESSING, "png", args.doc_id, pageFilename(args.page));

  // doc_id arrives from a tool call; refuse values such as "../.." that would
  // make the lookup leave the PNG root.
  const relative = path.relative(pngRoot, path.resolve(sourcePath));
  if (relative === ".." || relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative)) {
    throw new Error(`doc_id escapes the PNG root: ${args.doc_id}`);
  }

  const meta = await sharp(sourcePath).metadata();
  const pageWidth = meta.width ?? 0;
  const pageHeight = meta.height ?? 0;
  if (pageWidth === 0 || pageHeight === 0) throw new Error(`source PNG unreadable: ${sourcePath}`);

  // Clamp the origin to the last pixel row/column so that at least one pixel
  // remains available; otherwise x >= 1 (or y >= 1) would produce an extract
  // area outside the image.
  const left = Math.min(pageWidth - 1, Math.max(0, Math.round(args.bbox.x * pageWidth)));
  const top = Math.min(pageHeight - 1, Math.max(0, Math.round(args.bbox.y * pageHeight)));
  const width = Math.max(1, Math.min(pageWidth - left, Math.round(args.bbox.w * pageWidth)));
  const height = Math.max(1, Math.min(pageHeight - top, Math.round(args.bbox.h * pageHeight)));

  const dir = await mkdtemp(path.join(os.tmpdir(), "analyze-image-"));
  const filePath = path.join(dir, "crop.png");
  try {
    await sharp(sourcePath)
      .extract({ left, top, width, height })
      .resize({ width: Math.min(1024, width), withoutEnlargement: true })
      .png()
      .toFile(filePath);
  } catch (err) {
    // The caller never learns about `dir` when we throw, so clean up here.
    // File first, then directory: rmdir only succeeds on an empty directory.
    await unlink(filePath).catch(() => undefined);
    await rmdir(dir).catch(() => undefined);
    throw err;
  }
  return { path: filePath, dir };
}

/**
 * Assemble the prompt handed to the CLI: where to Read the crop, which
 * document/page it came from, optional caller context, the question, and the
 * answering rules.
 *
 * @param args - Request; `doc_id`, `page`, `question`, `context` and `lang` are read.
 * @param cropPath - Local path of the cropped PNG the model should Read.
 * @returns The prompt as a single newline-joined string.
 */
function buildPrompt(args: AnalyzeImageRegionArgs, cropPath: string): string {
  // Anything other than "en" (including an omitted lang) is answered in Portuguese.
  let answerLanguage = "Brazilian Portuguese (pt-br)";
  if (args.lang === "en") answerLanguage = "English";

  // The context entry brings its own leading blank lines; when there is no
  // context it still occupies one empty line of the prompt.
  const contextEntry = args.context ? `\n\nContext: ${args.context}` : "";

  const preamble = [
    `Use the Read tool on this local PNG file: ${cropPath}`,
    "",
    `The image is a cropped region from document "${args.doc_id}", page ${args.page}.`,
    contextEntry,
    "",
  ];
  const ask = [
    `Answer this question about what is visible in the image, in ${answerLanguage}:`,
    "",
    args.question,
    "",
  ];
  const rules = [
    "Rules:",
    "- Be factual and concise (3-8 sentences unless the question requires more).",
    "- If text is visible, transcribe the relevant portion verbatim (do not translate).",
    "- If the image is unclear or empty, say so explicitly. Don't invent.",
    "- Do not call any tool besides Read on the provided path.",
  ];

  return preamble.concat(ask, rules).join("\n");
}

/**
 * Spawn `claude -p` with the given prompt and return the fields of interest
 * from its JSON output.
 *
 * The CLI is run with the configured model, JSON output, at most two turns and
 * only the Read tool allowed. The prompt is passed after `--` so it is never
 * parsed as an option.
 *
 * @param prompt - Full prompt text.
 * @returns The answer text, the duration (CLI-reported, else measured locally)
 *   and, when the CLI reports them as numbers, cost and token counts.
 * @throws Error (via rejection) when the process cannot be spawned, exceeds
 *   TIMEOUT_MS (it is then killed with SIGKILL), exits non-zero, prints
 *   something that is not a JSON object, or reports `is_error`.
 */
function callClaudeCli(prompt: string): Promise<{ result: string; durationMs: number; costUsd?: number; tokensIn?: number; tokensOut?: number }> {
  const asNumber = (v: unknown): number | undefined => (typeof v === "number" ? v : undefined);

  return new Promise((resolve, reject) => {
    const startedAt = Date.now();
    const child = spawn(
      "claude",
      [
        "-p",
        "--model", MODEL,
        "--output-format", "json",
        "--max-turns", "2",
        "--allowedTools", "Read",
        "--",
        prompt,
      ],
      { stdio: ["ignore", "pipe", "pipe"], env: { ...process.env } },
    );

    // Decode at the stream level: converting each chunk on its own corrupts any
    // multi-byte UTF-8 character (e.g. accented letters) that straddles a chunk
    // boundary.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (chunk: string) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk: string) => {
      stderr += chunk;
    });

    const timer = setTimeout(() => {
      child.kill("SIGKILL");
      // The later "close" event also calls reject, which is a no-op once settled.
      reject(new Error(`claude CLI timeout > ${TIMEOUT_MS}ms`));
    }, TIMEOUT_MS);

    child.on("error", (e) => {
      clearTimeout(timer);
      reject(e);
    });

    child.on("close", (code) => {
      clearTimeout(timer);
      if (code !== 0) {
        // Fall back to stdout when stderr is empty so the error is not blank.
        const detail = stderr.trim() ? stderr : stdout;
        return reject(new Error(`claude CLI rc=${code}: ${detail.slice(-300)}`));
      }

      let parsed: unknown;
      try {
        parsed = JSON.parse(stdout);
      } catch (e) {
        return reject(new Error(`claude stdout parse: ${e instanceof Error ? e.message : String(e)}`));
      }
      if (typeof parsed !== "object" || parsed === null) {
        return reject(new Error("claude stdout parse: expected a JSON object"));
      }

      const cli = parsed as Record<string, unknown>;
      // Only accept a string result; callers call string methods on it.
      const resultText = typeof cli.result === "string" ? cli.result : "";
      if (cli.is_error) return reject(new Error(`claude error: ${resultText.slice(0, 300)}`));

      const usage =
        typeof cli.usage === "object" && cli.usage !== null ? (cli.usage as Record<string, unknown>) : {};
      resolve({
        result: resultText,
        durationMs: asNumber(cli.duration_ms) || Date.now() - startedAt,
        costUsd: asNumber(cli.total_cost_usd),
        tokensIn: asNumber(usage.input_tokens),
        tokensOut: asNumber(usage.output_tokens),
      });
    });
  });
}

/**
 * Answer a question about a rectangular region of a document page.
 *
 * Validates the request, crops the region to a temporary PNG, asks the CLI
 * about it, and always removes the temporary file and directory afterwards.
 *
 * @param args - Document, page, normalized bbox, question and optional
 *   context/language hint.
 * @returns The trimmed answer, the model name, the duration, the echoed bbox
 *   and an `/api/crop` URL for the same region.
 * @throws Error when `doc_id` or `question` is empty, `page` is not an integer
 *   >= 1, any bbox field is not a finite number, or cropping / the CLI call
 *   fails.
 */
export async function analyzeImageRegion(args: AnalyzeImageRegionArgs): Promise<AnalyzeImageRegionResult> {
  if (!args.doc_id) throw new Error("doc_id required");
  // Page numbers map to file names, so a fractional page can never exist.
  if (!Number.isInteger(args.page) || args.page < 1) throw new Error("page must be an integer >= 1");
  const bbox = args.bbox;
  if (!bbox || ![bbox.x, bbox.y, bbox.w, bbox.h].every((v) => Number.isFinite(v))) {
    throw new Error("bbox {x,y,w,h} required (normalized 0..1)");
  }
  if (!args.question?.trim()) throw new Error("question required");

  const { path: cropPath, dir } = await cropToTempFile(args);
  try {
    const t0 = Date.now();
    const prompt = buildPrompt(args, cropPath);
    const out = await callClaudeCli(prompt);
    const cropUrl =
      `/api/crop?doc=${encodeURIComponent(args.doc_id)}` +
      `&page=${args.page}&x=${args.bbox.x}&y=${args.bbox.y}&w=${args.bbox.w}&h=${args.bbox.h}&w_px=640`;
    return {
      answer: out.result.trim(),
      model: MODEL,
      duration_ms: out.durationMs || Date.now() - t0,
      bbox: args.bbox,
      crop_url: cropUrl,
    };
  } finally {
    // Best-effort cleanup, but strictly ordered: rmdir fails on a non-empty
    // directory, so the file must be gone before the directory is removed.
    await unlink(cropPath).catch(() => undefined);
    await rmdir(dir).catch(() => undefined);
  }
}