- TD#8 hybrid.ts: rerank_strategy {always|when_top_k_gt|never} + threshold
(default skips rerank for top_k ≤ 15; chat tool uses threshold 10)
- O11 vision.ts + tools.ts: analyze_image_region tool — sharp-crops the
bbox, claude CLI reads the temp PNG via Read tool, Sonnet vision answers
- TD#12 /graph: SigmaGraph replaces ForceGraphCanvas; react-force-graph-2d
uninstalled (-37 transitive deps); force-graph-canvas.tsx deleted
- TD#27 messages/route.ts gatherContext slice sizes via CTX_* env vars
- TD#22 tests/rag/: golden.yaml (15 queries) + run.py (Recall@k + MRR +
negative-pass rate) + baseline.json + CI job in .forgejo/workflows/ci.yml
- docs/adrs/: ADR-001..005 published from systems-atelier deliverables
Verified live on disclosure.top: top_k=5 path skips rerank (6.7s embed-only,
was 12-15s with rerank); rerank=always still available on demand.
First RAG baseline: Recall@5 = 0.2083, MRR = 0.25, Negative pass = 1.0.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
165 lines
6.3 KiB
TypeScript
165 lines
6.3 KiB
TypeScript
/**
|
|
* vision.ts — answer questions about an image region via Claude Code OAuth.
|
|
*
|
|
* Pattern matches the project's existing vision pipeline (02-vision-page.py):
|
|
* 1. Crop the PNG of the requested page to the requested bbox.
|
|
* 2. Spawn `claude -p --model sonnet --allowedTools Read` and instruct the
|
|
* model to Read the local PNG path and answer the user's question.
|
|
*
|
|
* Uses the user's Claude Code OAuth (Max 20x). Per W1.2 budget policy, the
|
|
* agentic worker may use Opus 4.7 without hard cap, but `analyze_image_region`
|
|
* runs synchronously in the chat path — keep it on Sonnet for latency.
|
|
*/
|
|
import { spawn } from "node:child_process";
|
|
import { mkdtemp, unlink, rmdir } from "node:fs/promises";
|
|
import path from "node:path";
|
|
import os from "node:os";
|
|
import sharp from "sharp";
|
|
import { PROCESSING } from "@/lib/wiki";
|
|
|
|
/** Model alias passed to `claude --model`; override with the VISION_MODEL env var. */
const MODEL = process.env.VISION_MODEL || "sonnet";

/** Timeout used when VISION_TIMEOUT_MS is unset or unusable. */
const DEFAULT_TIMEOUT_MS = 120_000;

/** Largest delay `setTimeout` honours (2^31 - 1 ms); larger values are coerced to 1 ms. */
const MAX_TIMEOUT_MS = 2_147_483_647;

/**
 * Parse a timeout (milliseconds) from an env-var string.
 *
 * Falls back to DEFAULT_TIMEOUT_MS when the value is missing, empty,
 * non-numeric, zero or negative — `setTimeout` would treat any of those as a
 * 1 ms delay and the CLI call would be killed immediately. Values above the
 * timer maximum are capped for the same reason.
 *
 * @param raw Raw env-var value, or undefined when the variable is unset.
 * @returns A finite timeout in the range (0, MAX_TIMEOUT_MS].
 */
function resolveTimeoutMs(raw: string | undefined): number {
  const parsed = Number(raw);
  if (!raw || !Number.isFinite(parsed) || parsed <= 0) return DEFAULT_TIMEOUT_MS;
  return Math.min(parsed, MAX_TIMEOUT_MS);
}

/** Wall-clock budget for one `claude` CLI invocation; override with VISION_TIMEOUT_MS. */
const TIMEOUT_MS = resolveTimeoutMs(process.env.VISION_TIMEOUT_MS);
|
|
|
|
/** Input for `analyzeImageRegion`: which page region to look at and what to ask about it. */
export interface AnalyzeImageRegionArgs {
  /** Document identifier; used as the directory name under the page-PNG root. */
  doc_id: string;
  /** 1-based page number (validated as `>= 1`). */
  page: number;
  /** Region to crop, expressed as fractions (0..1) of the page width (x, w) and height (y, h). */
  bbox: { x: number; y: number; w: number; h: number };
  /** The question the vision model should answer about the cropped region. */
  question: string;
  /** Optional context to ground the model (e.g., "this is an FBI memo from 1947"). */
  context?: string;
  /**
   * Output language hint. Only "en" selects English; any other value,
   * including omission, yields Brazilian Portuguese (pt-br).
   */
  lang?: "pt" | "en";
}
|
|
|
|
/** What `analyzeImageRegion` hands back to its caller. */
export interface AnalyzeImageRegionResult {
  /** The model's answer text, trimmed of surrounding whitespace. */
  answer: string;
  /** Model alias that was passed to the CLI. */
  model: string;
  /** Duration in milliseconds: the CLI-reported figure when present, otherwise measured locally. */
  duration_ms: number;
  /** The bbox from the request, echoed back unchanged. */
  bbox: {
    x: number;
    y: number;
    w: number;
    h: number;
  };
  /** `/api/crop` URL carrying the same doc, page and bbox as query parameters. */
  crop_url: string;
}
|
|
|
|
/** Build the PNG file name for a page: the page number left-padded with zeros to 3 characters. */
function pageFilename(page: number): string {
  const padded = `${page}`.padStart(3, "0");
  return "p-" + padded + ".png";
}
|
|
|
|
/**
 * Crop a bbox region of a page PNG and write the result to a temp file.
 *
 * `args.bbox` is read as fractions of the source image's width/height and
 * converted to a pixel rectangle that is clamped to lie fully inside the
 * image (at least 1x1 px). The crop is downscaled to at most 1024 px wide
 * and never enlarged.
 *
 * @param args Request carrying `doc_id`, `page` and `bbox`.
 * @returns The path of the written `crop.png` and the temp directory that holds it;
 *          the caller owns both and is responsible for removing them.
 * @throws Error when `doc_id` resolves outside the page-PNG root, when the
 *         source PNG has no readable dimensions, or when sharp fails.
 */
async function cropToTempFile(args: AnalyzeImageRegionArgs): Promise<{ path: string; dir: string }> {
  const pngRoot = path.join(PROCESSING, "png");
  const sourcePath = path.join(pngRoot, args.doc_id, pageFilename(args.page));

  // doc_id arrives from tool-call arguments: refuse anything (e.g. "../..")
  // that would make the path escape the page-PNG root.
  const relative = path.relative(pngRoot, sourcePath);
  if (relative === ".." || relative.startsWith(".." + path.sep) || path.isAbsolute(relative)) {
    throw new Error(`doc_id resolves outside the PNG root: ${args.doc_id}`);
  }

  const meta = await sharp(sourcePath).metadata();
  const W = meta.width ?? 0;
  const H = meta.height ?? 0;
  if (W === 0 || H === 0) throw new Error(`source PNG unreadable: ${sourcePath}`);

  // Clamp the origin to the last pixel row/column so that at least one pixel
  // remains to the right/below it; otherwise x >= 1 or y >= 1 would yield an
  // extract area outside the image and sharp would reject it.
  const left = Math.min(W - 1, Math.max(0, Math.round(args.bbox.x * W)));
  const top = Math.min(H - 1, Math.max(0, Math.round(args.bbox.y * H)));
  const width = Math.max(1, Math.min(W - left, Math.round(args.bbox.w * W)));
  const height = Math.max(1, Math.min(H - top, Math.round(args.bbox.h * H)));

  const dir = await mkdtemp(path.join(os.tmpdir(), "analyze-image-"));
  const filePath = path.join(dir, "crop.png");
  try {
    await sharp(sourcePath)
      .extract({ left, top, width, height })
      .resize({ width: Math.min(1024, width), withoutEnlargement: true })
      .png()
      .toFile(filePath);
  } catch (err) {
    // The caller never learns about `dir` when we throw, so clean up here.
    // File first, then the (now empty) directory; both best-effort.
    await unlink(filePath).catch(() => undefined);
    await rmdir(dir).catch(() => undefined);
    throw err;
  }
  return { path: filePath, dir };
}
|
|
|
|
/**
 * Assemble the text prompt for the CLI: where the crop lives, which document
 * and page it came from, optional caller context, the question, and the
 * answering rules. Lines are joined with "\n".
 */
function buildPrompt(args: AnalyzeImageRegionArgs, cropPath: string): string {
  // Anything other than an explicit "en" is answered in Brazilian Portuguese.
  const answerLanguage = args.lang === "en" ? "English" : "Brazilian Portuguese (pt-br)";
  const contextBlock = args.context ? `\n\nContext: ${args.context}` : "";

  const preamble = [
    `Use the Read tool on this local PNG file: ${cropPath}`,
    "",
    `The image is a cropped region from document "${args.doc_id}", page ${args.page}.`,
    contextBlock,
    "",
    `Answer this question about what is visible in the image, in ${answerLanguage}:`,
    "",
    args.question,
    "",
  ];

  const rules = [
    "Rules:",
    "- Be factual and concise (3-8 sentences unless the question requires more).",
    "- If text is visible, transcribe the relevant portion verbatim (do not translate).",
    "- If the image is unclear or empty, say so explicitly. Don't invent.",
    "- Do not call any tool besides Read on the provided path.",
  ];

  return [...preamble, ...rules].join("\n");
}
|
|
|
|
/**
 * Spawn `claude -p` and return the JSON output result.
 *
 * The prompt is passed as a single argv entry after `--`, so it is never
 * interpreted by a shell or as a CLI flag. The child is restricted to the
 * Read tool and two turns, and is SIGKILLed after TIMEOUT_MS.
 *
 * @param prompt Full prompt text for the CLI.
 * @returns The CLI's `result` text plus timing/cost/token figures when the CLI reports them.
 * @throws Error (via rejection) on spawn failure, timeout, non-zero exit code,
 *         unparseable stdout, or a payload flagged `is_error`.
 */
function callClaudeCli(prompt: string): Promise<{ result: string; durationMs: number; costUsd?: number; tokensIn?: number; tokensOut?: number }> {
  return new Promise((resolve, reject) => {
    const t0 = Date.now();
    const child = spawn(
      "claude",
      [
        "-p",
        "--model", MODEL,
        "--output-format", "json",
        "--max-turns", "2",
        "--allowedTools", "Read",
        "--",
        prompt,
      ],
      { stdio: ["ignore", "pipe", "pipe"], env: { ...process.env } },
    );

    // Decode at the stream level: converting each Buffer chunk on its own
    // corrupts multi-byte UTF-8 characters (accents, etc.) that happen to be
    // split across two chunks.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");

    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (chunk: string) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk: string) => {
      stderr += chunk;
    });

    const timer = setTimeout(() => {
      child.kill("SIGKILL");
      reject(new Error(`claude CLI timeout > ${TIMEOUT_MS}ms`));
    }, TIMEOUT_MS);

    child.on("error", (e) => {
      clearTimeout(timer);
      reject(e);
    });

    child.on("close", (code) => {
      clearTimeout(timer);
      // After a timeout this still fires (code is null); the extra reject is
      // a no-op because the promise is already settled.
      if (code !== 0) {
        return reject(new Error(`claude CLI rc=${code}: ${stderr.slice(-300)}`));
      }
      try {
        // Only the fields read below are described; everything is optional
        // because the payload comes from an external process.
        const cli = JSON.parse(stdout) as {
          is_error?: boolean;
          result?: string;
          duration_ms?: number;
          total_cost_usd?: number;
          usage?: { input_tokens?: number; output_tokens?: number };
        };
        if (cli.is_error) {
          return reject(new Error(`claude error: ${(cli.result || "").slice(0, 300)}`));
        }
        resolve({
          result: cli.result || "",
          durationMs: cli.duration_ms || Date.now() - t0,
          costUsd: cli.total_cost_usd,
          tokensIn: cli.usage?.input_tokens,
          tokensOut: cli.usage?.output_tokens,
        });
      } catch (e) {
        reject(new Error(`claude stdout parse: ${e instanceof Error ? e.message : String(e)}`));
      }
    });
  });
}
|
|
|
|
/**
 * Answer a question about a rectangular region of a document page.
 *
 * Validates the arguments, crops the region to a temp PNG, asks the `claude`
 * CLI to read that file and answer, then removes the temp file and directory
 * whether or not the CLI call succeeded.
 *
 * @param args Document/page/bbox selector plus the question and optional hints.
 * @returns The trimmed answer, the model alias used, the duration, the echoed
 *          bbox, and an `/api/crop` URL for the same region.
 * @throws Error when an argument is missing or malformed, or when cropping or
 *         the CLI call fails.
 */
export async function analyzeImageRegion(args: AnalyzeImageRegionArgs): Promise<AnalyzeImageRegionResult> {
  if (!args.doc_id) throw new Error("doc_id required");
  // A fractional page can never match a page file name, so reject it up front
  // instead of surfacing a missing-file error from the image library.
  if (!Number.isInteger(args.page) || args.page < 1) throw new Error("page must be an integer >= 1");
  if (
    !args.bbox ||
    ![args.bbox.x, args.bbox.y, args.bbox.w, args.bbox.h].every((v) => Number.isFinite(v))
  ) {
    throw new Error("bbox {x,y,w,h} required (normalized 0..1)");
  }
  if (!args.question?.trim()) throw new Error("question required");

  const { path: cropPath, dir } = await cropToTempFile(args);
  try {
    const t0 = Date.now();
    const prompt = buildPrompt(args, cropPath);
    const out = await callClaudeCli(prompt);
    const cropUrl =
      `/api/crop?doc=${encodeURIComponent(args.doc_id)}` +
      `&page=${args.page}&x=${args.bbox.x}&y=${args.bbox.y}&w=${args.bbox.w}&h=${args.bbox.h}&w_px=640`;
    return {
      answer: out.result.trim(),
      model: MODEL,
      duration_ms: out.durationMs || Date.now() - t0,
      bbox: args.bbox,
      crop_url: cropUrl,
    };
  } finally {
    // Best-effort cleanup. Order matters: the directory can only be removed
    // once the file inside it is gone, so wait for unlink before rmdir
    // (running them concurrently lets rmdir fail with ENOTEMPTY and leak the dir).
    await unlink(cropPath).catch(() => undefined);
    await rmdir(dir).catch(() => undefined);
  }
}
|