/**
 * vision.ts — answer questions about an image region via Claude Code OAuth.
 *
 * Pattern matches the project's existing vision pipeline (02-vision-page.py):
 *   1. Crop the PNG of the requested page to the requested bbox.
 *   2. Spawn `claude -p --model sonnet --allowedTools Read` and instruct the
 *      model to Read the local PNG path and answer the user's question.
 *
 * Uses the user's Claude Code OAuth (Max 20x). Per W1.2 budget policy, the
 * agentic worker may use Opus 4.7 without hard cap, but `analyze_image_region`
 * runs synchronously in the chat path — keep it on Sonnet for latency.
 */
import { spawn } from "node:child_process";
import { mkdtemp, unlink, rmdir } from "node:fs/promises";
import path from "node:path";
import os from "node:os";
import sharp from "sharp";
import { PROCESSING } from "@/lib/wiki";

/** Timeout used when VISION_TIMEOUT_MS is unset or unusable. */
const DEFAULT_TIMEOUT_MS = 120_000;

/** Largest delay Node's timers accept; larger values are coerced to 1 ms. */
const MAX_TIMER_DELAY_MS = 2_147_483_647;

/** Model alias passed to `claude --model`; overridable via VISION_MODEL. */
const MODEL = process.env.VISION_MODEL || "sonnet";

/** Hard limit for one CLI invocation; overridable via VISION_TIMEOUT_MS. */
const TIMEOUT_MS = parseTimeoutMs(process.env.VISION_TIMEOUT_MS);

/** Normalized bounding box: fractions of the page width/height. */
type BBox = { x: number; y: number; w: number; h: number };

const BBOX_KEYS = ["x", "y", "w", "h"] as const;

export interface AnalyzeImageRegionArgs {
  doc_id: string;
  page: number;
  bbox: { x: number; y: number; w: number; h: number };
  question: string;
  /** Optional context to ground the model (e.g., "this is an FBI memo from 1947"). */
  context?: string;
  /** Output language hint. Defaults to "pt" (answered in Brazilian Portuguese). */
  lang?: "pt" | "en";
}

export interface AnalyzeImageRegionResult {
  answer: string;
  model: string;
  duration_ms: number;
  bbox: { x: number; y: number; w: number; h: number };
  crop_url: string;
}

/** Subset of the `claude --output-format json` payload that this module reads. */
interface ClaudeCliOutput {
  is_error?: boolean;
  result?: unknown;
  duration_ms?: number;
  total_cost_usd?: number;
  usage?: { input_tokens?: number; output_tokens?: number };
}

/**
 * Parse the timeout override.
 *
 * Falls back to the default when the value is missing, non-numeric or < 1,
 * and caps it at the timer maximum. Without this, `setTimeout` would receive
 * NaN / 0 / an oversized delay and fire almost immediately.
 */
function parseTimeoutMs(raw: string | undefined): number {
  const parsed = Number(raw || DEFAULT_TIMEOUT_MS);
  if (!Number.isFinite(parsed) || parsed < 1) return DEFAULT_TIMEOUT_MS;
  return Math.min(Math.floor(parsed), MAX_TIMER_DELAY_MS);
}

/** Restrict `value` to the inclusive range [lo, hi]. */
function clamp(value: number, lo: number, hi: number): number {
  return Math.max(lo, Math.min(hi, value));
}

/** True when `bbox` is an object whose x, y, w and h are all finite numbers. */
function isFiniteBBox(bbox: unknown): bbox is BBox {
  if (typeof bbox !== "object" || bbox === null) return false;
  const fields = bbox as Record<string, unknown>;
  return BBOX_KEYS.every((key) => Number.isFinite(fields[key]));
}

/** File name of a page PNG: the page number zero-padded to three digits. */
function pageFilename(page: number): string {
  return `p-${String(page).padStart(3, "0")}.png`;
}

/**
 * Best-effort removal of a crop file and its temp directory.
 *
 * The two steps run in sequence: the directory can only be removed once the
 * file inside it is gone. Failures are ignored on purpose — the files live
 * under the OS temp dir. Never rejects.
 */
async function removeCrop(filePath: string, dir: string): Promise<void> {
  await unlink(filePath).catch(() => undefined);
  await rmdir(dir).catch(() => undefined);
}

/**
 * Crop a bbox region of a page PNG and write the result to a temp file.
 *
 * The bbox is interpreted as fractions of the source image size and clamped
 * so the extracted area always lies inside the image (at least 1x1 px). The
 * crop is downscaled to at most 1024 px wide and never enlarged.
 *
 * @returns The crop file path and the temp directory that contains it; the
 *          caller is responsible for removing both.
 * @throws If `doc_id` resolves outside the PNG root, if the source PNG has no
 *         readable dimensions, or if the image pipeline fails.
 */
async function cropToTempFile(args: AnalyzeImageRegionArgs): Promise<{ path: string; dir: string }> {
  // NOTE(review): assumes PROCESSING is a directory path string — confirm in "@/lib/wiki".
  const pngRoot = path.resolve(PROCESSING, "png");
  const sourcePath = path.join(PROCESSING, "png", args.doc_id, pageFilename(args.page));

  // doc_id arrives from the caller; refuse values such as "../.." that would
  // make the lookup leave the PNG root.
  if (!path.resolve(sourcePath).startsWith(pngRoot + path.sep)) {
    throw new Error(`doc_id resolves outside the PNG root: ${args.doc_id}`);
  }

  const meta = await sharp(sourcePath).metadata();
  const W = meta.width ?? 0;
  const H = meta.height ?? 0;
  if (W === 0 || H === 0) throw new Error(`source PNG unreadable: ${sourcePath}`);

  // Keep the origin inside the image so that width/height can always be >= 1
  // without the extract area running past the right/bottom edge.
  const left = clamp(Math.round(args.bbox.x * W), 0, W - 1);
  const top = clamp(Math.round(args.bbox.y * H), 0, H - 1);
  const width = clamp(Math.round(args.bbox.w * W), 1, W - left);
  const height = clamp(Math.round(args.bbox.h * H), 1, H - top);

  const dir = await mkdtemp(path.join(os.tmpdir(), "analyze-image-"));
  const filePath = path.join(dir, "crop.png");
  try {
    await sharp(sourcePath)
      .extract({ left, top, width, height })
      .resize({ width: Math.min(1024, width), withoutEnlargement: true })
      .png()
      .toFile(filePath);
  } catch (err) {
    // The caller never learns about `dir` when we throw, so clean up here.
    await removeCrop(filePath, dir);
    throw err;
  }
  return { path: filePath, dir };
}

/**
 * Build the prompt that tells the model to Read the crop and answer the question.
 *
 * The answer language is English when `args.lang` is "en", otherwise
 * Brazilian Portuguese. `args.context`, when present, is added as a
 * "Context:" paragraph.
 */
function buildPrompt(args: AnalyzeImageRegionArgs, cropPath: string): string {
  const lang = args.lang === "en" ? "English" : "Brazilian Portuguese (pt-br)";
  const ctx = args.context ? `\n\nContext: ${args.context}` : "";
  return [
    `Use the Read tool on this local PNG file: ${cropPath}`,
    "",
    `The image is a cropped region from document "${args.doc_id}", page ${args.page}.`,
    ctx,
    "",
    `Answer this question about what is visible in the image, in ${lang}:`,
    "",
    args.question,
    "",
    "Rules:",
    "- Be factual and concise (3-8 sentences unless the question requires more).",
    "- If text is visible, transcribe the relevant portion verbatim (do not translate).",
    "- If the image is unclear or empty, say so explicitly. Don't invent.",
    "- Do not call any tool besides Read on the provided path.",
  ].join("\n");
}

/**
 * Spawn `claude -p` and return the JSON output result.
 *
 * The prompt is passed as a single argv entry after `--`, so it is never
 * interpreted by a shell or parsed as a CLI flag.
 *
 * @returns The model's text answer plus the duration/cost/token figures
 *          reported by the CLI (duration falls back to wall-clock time).
 * @throws On spawn failure, timeout (the child is SIGKILLed), non-zero exit
 *         code, an `is_error` payload, or stdout that is not valid JSON.
 */
function callClaudeCli(
  prompt: string,
): Promise<{ result: string; durationMs: number; costUsd?: number; tokensIn?: number; tokensOut?: number }> {
  return new Promise((resolve, reject) => {
    const t0 = Date.now();
    const child = spawn(
      "claude",
      [
        "-p",
        "--model", MODEL,
        "--output-format", "json",
        "--max-turns", "2",
        "--allowedTools", "Read",
        "--",
        prompt,
      ],
      { stdio: ["ignore", "pipe", "pipe"], env: { ...process.env } },
    );

    // Decode at the stream level: converting each Buffer chunk on its own
    // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");

    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (chunk: string) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk: string) => {
      stderr += chunk;
    });

    // A later "close" event will call reject again; that is a no-op because
    // the promise is already settled.
    const timer = setTimeout(() => {
      child.kill("SIGKILL");
      reject(new Error(`claude CLI timeout > ${TIMEOUT_MS}ms`));
    }, TIMEOUT_MS);

    child.on("error", (e) => {
      clearTimeout(timer);
      reject(e);
    });

    child.on("close", (code) => {
      clearTimeout(timer);
      if (code !== 0) {
        reject(new Error(`claude CLI rc=${code}: ${stderr.slice(-300)}`));
        return;
      }
      try {
        const cli = JSON.parse(stdout) as ClaudeCliOutput;
        // Anything other than a string answer is treated as "no answer" so
        // callers can safely use string methods on `result`.
        const text = typeof cli.result === "string" ? cli.result : "";
        if (cli.is_error) {
          reject(new Error(`claude error: ${text.slice(0, 300)}`));
          return;
        }
        resolve({
          result: text,
          durationMs: cli.duration_ms || Date.now() - t0,
          costUsd: cli.total_cost_usd,
          tokensIn: cli.usage?.input_tokens,
          tokensOut: cli.usage?.output_tokens,
        });
      } catch (e) {
        reject(new Error(`claude stdout parse: ${e instanceof Error ? e.message : String(e)}`));
      }
    });
  });
}

/**
 * Answer a question about a rectangular region of a document page.
 *
 * Crops the region to a temporary PNG, asks the Claude CLI to read it and
 * answer `args.question`, then removes the temporary files.
 *
 * @param args Document/page/bbox selector, the question, and optional
 *             context and language hints.
 * @returns The trimmed answer, the model used, the duration in ms, the
 *          requested bbox and a `/api/crop` URL for the same region.
 * @throws If an argument is missing or malformed, the page PNG cannot be
 *         cropped, or the CLI call fails or times out.
 */
export async function analyzeImageRegion(args: AnalyzeImageRegionArgs): Promise<AnalyzeImageRegionResult> {
  if (typeof args.doc_id !== "string" || !args.doc_id) throw new Error("doc_id required");
  if (!Number.isInteger(args.page) || args.page < 1) throw new Error("page must be an integer >= 1");
  if (!isFiniteBBox(args.bbox)) {
    throw new Error("bbox {x,y,w,h} required (normalized 0..1)");
  }
  if (typeof args.question !== "string" || !args.question.trim()) throw new Error("question required");

  const { path: cropPath, dir } = await cropToTempFile(args);
  try {
    const t0 = Date.now();
    const prompt = buildPrompt(args, cropPath);
    const out = await callClaudeCli(prompt);
    const cropUrl =
      `/api/crop?doc=${encodeURIComponent(args.doc_id)}` +
      `&page=${args.page}&x=${args.bbox.x}&y=${args.bbox.y}&w=${args.bbox.w}&h=${args.bbox.h}&w_px=640`;
    return {
      answer: out.result.trim(),
      model: MODEL,
      duration_ms: out.durationMs || Date.now() - t0,
      bbox: args.bbox,
      crop_url: cropUrl,
    };
  } finally {
    // Best-effort cleanup. Crop is in $TMPDIR, OS will reap if we miss.
    // Runs file-then-directory; removeCrop swallows its own errors.
    await removeCrop(cropPath, dir);
  }
}