/**
 * Sherlock's tool kit — OpenAI-style function-calling schema + local handlers.
 *
 * Each tool has:
 *   - definition: JSON Schema sent to the model
 *   - handler:    Node function that runs locally and returns a JSON-serializable result
 *
 * Tools called by the model trigger AG-UI events streamed to the frontend
 * (tool_start, tool_result, navigate). The frontend renders these inline in
 * the message AND, for `navigate_to`, can offer a clickable button to scroll
 * the UI to a target page.
 *
 * Retrieval stack (chunks-aware):
 *   - hybrid_search   → BM25 + dense (BGE-M3) + RRF + BGE-Reranker rerank
 *   - read_chunk      → fetch a single chunk by chunk_id (cite-then-quote)
 *   - list_anomalies  → all UFO/cryptid-flagged chunks (cheap, no LLM)
 *   - get_page_chunks → assemble one page from chunks
 * Wiki-aware fallbacks (when DB not available or richer entity data needed):
 *   - read_page, read_document, read_entity, search_corpus (legacy grep)
 *   - navigate_to     → emit clickable button to scroll UI
 */
import fs from "node:fs/promises";
import path from "node:path";
import {
  WIKI,
  readDocument,
  readPage,
  readEntity,
  listDocuments,
  listPages,
  classKeyToFolder,
} from "../wiki";
import {
  hybridSearch,
  getChunk,
  listAnomalies,
  getPageChunks,
  type ChunkHit,
} from "../retrieval/hybrid";
import {
  findEntity,
  getNeighbors,
  findPaths,
  getCoMentionChunks,
} from "../retrieval/graph";

/** Function-calling tool schema in the OpenAI `tools` format. */
export interface ToolDefinition {
  type: "function";
  function: {
    name: string;
    description: string;
    /** JSON Schema describing the tool arguments. */
    parameters: Record<string, unknown>;
  };
}

/** Per-call context handed to every tool handler. */
export interface ToolHandlerContext {
  /** Currently-viewed location, if any, to bias search. */
  doc_id?: string | null;
  page_id?: string | null;
  /** UI language preference (pt | en). */
  lang?: "pt" | "en";
  /** Optional sink for inline AG-UI artifacts (citations, crops, entity cards).
   * When provided, tools may push typed artifacts that the UI renders inline
   * alongside the tool block. Safe to leave undefined for non-streaming callers.
   */
  emitArtifact?: (artifact: import("./agui").Artifact) => void;
}

/**
 * Local implementation of a tool. Receives the raw (untrusted) arguments
 * produced by the model and resolves to a JSON-serializable result.
 * Handlers report failures as `{ error: ... }` objects instead of throwing.
 */
export interface ToolHandler {
  (args: Record<string, unknown>, ctx: ToolHandlerContext): Promise<unknown>;
}

/* ─── Tool defs ─────────────────────────────────────────────────────────── */

const hybrid_search_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "hybrid_search",
    description:
      "PRIMARY semantic search over the entire UAP/UFO corpus chunks. " +
      "Combines BM25 keyword recall + BGE-M3 dense embeddings + cross-encoder rerank. " +
      "Returns up to top_k chunks with chunk_id, doc_id, page, bbox, text snippets, " +
      "classification, and relevance score. Use this for any question about content. " +
      "Filter with doc_id to scope to one document; type to restrict chunk type " +
      "(paragraph, heading, stamp, etc.); ufo_only=true to retrieve only anomaly-flagged chunks.",
    parameters: {
      type: "object",
      properties: {
        query: { type: "string", description: "Natural language query, PT or EN." },
        lang: { type: "string", enum: ["pt", "en"], description: "Search language (default pt)." },
        doc_id: { type: "string", description: "Optional: restrict to one document." },
        type: {
          type: "string",
          description:
            "Optional chunk-type filter: paragraph, heading, table_marker, image, stamp, signature, " +
            "address_block, classification_marking, redaction, footer, marginalia, form_field.",
        },
        classification: {
          type: "string",
          description: "Optional: SECRET, CONFIDENTIAL, RESTRICTED, NOFORN.",
        },
        ufo_only: { type: "boolean", description: "Only chunks flagged with UFO anomaly." },
        top_k: { type: "integer", description: "Number of final results (default 20, max 50)." },
      },
      required: ["query"],
    },
  },
};

const read_chunk_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "read_chunk",
    description:
      "Read ONE chunk in full (verbatim text EN+PT, full bbox, metadata, anomaly flags). " +
      "Use AFTER hybrid_search to expand a citation before quoting the user.",
    parameters: {
      type: "object",
      properties: {
        doc_id: { type: "string" },
        chunk_id: { type: "string", description: "e.g. 'c0042'" },
      },
      required: ["doc_id", "chunk_id"],
    },
  },
};

const get_page_chunks_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "get_page_chunks",
    description:
      "Get all chunks of one page in reading order. Use to reconstruct a page or to " +
      "answer 'what's on page N of doc X' questions with full structure.",
    parameters: {
      type: "object",
      properties: {
        doc_id: { type: "string" },
        page: { type: "integer", description: "Page number (1-indexed)." },
      },
      required: ["doc_id", "page"],
    },
  },
};

const list_anomalies_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "list_anomalies",
    description:
      "List all chunks flagged with a UFO or cryptid anomaly. Cheap query (no embedding). " +
      "Use for 'show me all sightings', 'all spherical objects', 'cryptid encounters'.",
    parameters: {
      type: "object",
      properties: {
        kind: { type: "string", enum: ["ufo", "cryptid"] },
        doc_id: { type: "string", description: "Optional: restrict to one doc." },
        limit: { type: "integer", description: "Max results (default 50)." },
      },
      required: ["kind"],
    },
  },
};

const read_page_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "read_page",
    description:
      "Read the legacy wiki page record for context (vision_description, " +
      "entities_extracted, content_classification). Useful WHEN the doc isn't in the new " +
      "chunk index yet OR you need page-level vision metadata. Prefer hybrid_search + " +
      "read_chunk for content questions.",
    parameters: {
      type: "object",
      properties: {
        doc_id: { type: "string" },
        page: { type: "string", description: "e.g. 'p007' or '7'." },
      },
      required: ["doc_id", "page"],
    },
  },
};

const read_document_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "read_document",
    description:
      "Get the consolidated overview of a document — summary, page index, " +
      "content_classification, key entities.",
    parameters: {
      type: "object",
      properties: { doc_id: { type: "string" } },
      required: ["doc_id"],
    },
  },
};

const read_entity_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "read_entity",
    description:
      "Read the detail of an entity (person, organization, location, event, " +
      "uap_object, vehicle, operation, concept) including enrichment from WebSearch.",
    parameters: {
      type: "object",
      properties: {
        class: {
          type: "string",
          enum: [
            "person",
            "organization",
            "location",
            "event",
            "uap_object",
            "vehicle",
            "operation",
            "concept",
          ],
        },
        id: { type: "string", description: "kebab-case id, e.g. 'j-edgar-hoover'." },
      },
      required: ["class", "id"],
    },
  },
};

const search_corpus_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "search_corpus",
    description:
      "Legacy keyword-only search over document IDs, titles, and entity IDs. " +
      "Prefer hybrid_search for content questions. Use this only to find entities/docs by name.",
    parameters: {
      type: "object",
      properties: {
        query: { type: "string" },
        scope: { type: "string", enum: ["all", "documents", "entities"] },
      },
      required: ["query"],
    },
  },
};

const entity_neighbors_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "entity_neighbors",
    description:
      "List entities co-mentioned with a given entity in the corpus chunks. " +
      "Use to answer 'who/what is connected to X' questions. Returns up to " +
      "limit neighbors sorted by edge weight (number of shared chunks).",
    parameters: {
      type: "object",
      properties: {
        class: {
          type: "string",
          enum: ["person", "organization", "location", "event", "uap_object", "vehicle", "operation", "concept"],
        },
        id: { type: "string", description: "kebab-case id or canonical name." },
        filter_classes: {
          type: "array",
          items: { type: "string" },
          description: "Optional: restrict neighbors to these entity classes.",
        },
        limit: { type: "integer", description: "Max neighbors (default 30, max 100)." },
      },
      required: ["class", "id"],
    },
  },
};

const entity_path_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "entity_path",
    description:
      "Find paths between two entities via shared chunks (multi-hop). Useful for " +
      "'how is X connected to Y' or 'show the trail between Hoover and Project Sign'.",
    parameters: {
      type: "object",
      properties: {
        from_class: { type: "string" },
        from_id: { type: "string" },
        to_class: { type: "string" },
        to_id: { type: "string" },
        max_hops: { type: "integer", description: "1-4 (default 3)." },
      },
      required: ["from_class", "from_id", "to_class", "to_id"],
    },
  },
};

const co_mention_chunks_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "co_mention_chunks",
    description:
      "Return chunks where two specific entities both appear. Use after entity_neighbors " +
      "to inspect the actual passages connecting them.",
    parameters: {
      type: "object",
      properties: {
        a_class: { type: "string" },
        a_id: { type: "string" },
        b_class: { type: "string" },
        b_id: { type: "string" },
        limit: { type: "integer", description: "Default 20, max 100." },
      },
      required: ["a_class", "a_id", "b_class", "b_id"],
    },
  },
};

const analyze_image_region_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "analyze_image_region",
    description:
      "Vision tool — answer a question about a cropped region of a document page. " +
      "Use this when the user asks about a photograph, diagram, sketch, signature, " +
      "stamp, redaction, or any visual element where the chunk's text description " +
      "isn't enough. The model reads the actual pixels via Sonnet vision. " +
      "Get the bbox + page from a prior hybrid_search hit (each chunk carries bbox). " +
      "Cost: ~$0.005–$0.02 per call. Use sparingly; prefer hybrid_search first.",
    parameters: {
      type: "object",
      properties: {
        doc_id: { type: "string" },
        page: { type: "integer", description: "1-indexed page number" },
        bbox: {
          type: "object",
          description: "Normalized bbox (0..1) of the region to analyze.",
          properties: {
            x: { type: "number" },
            y: { type: "number" },
            w: { type: "number" },
            h: { type: "number" },
          },
          required: ["x", "y", "w", "h"],
        },
        question: { type: "string", description: "What you want to know about the image." },
        context: { type: "string", description: "Optional: prose context that grounds the model." },
      },
      required: ["doc_id", "page", "bbox", "question"],
    },
  },
};

const navigate_to_tool: ToolDefinition = {
  type: "function",
  function: {
    name: "navigate_to",
    description:
      "Offer the user a clickable button to navigate the main UI to a specific " +
      "doc, page, or chunk anchor. Target examples: '/d/<doc_id>', '/d/<doc_id>/p007', " +
      "'/d/<doc_id>/p007#c0042'. Frontend renders the button — does NOT auto-redirect.",
    parameters: {
      type: "object",
      properties: {
        target: { type: "string" },
        label: { type: "string", description: "Short button text (max 40 chars)." },
      },
      required: ["target", "label"],
    },
  },
};

/** Every tool schema advertised to the model, in presentation order. */
export const TOOL_DEFINITIONS: ToolDefinition[] = [
  hybrid_search_tool,
  read_chunk_tool,
  get_page_chunks_tool,
  list_anomalies_tool,
  entity_neighbors_tool,
  entity_path_tool,
  co_mention_chunks_tool,
  read_page_tool,
  read_document_tool,
  read_entity_tool,
  search_corpus_tool,
  analyze_image_region_tool,
  navigate_to_tool,
];

/* ─── Helpers ───────────────────────────────────────────────────────────── */

/** Max citation/crop artifacts emitted per hybrid_search call. */
const MAX_INLINE_ARTIFACTS = 6;

/** Max hits returned by the legacy keyword search. */
const MAX_LEGACY_HITS = 8;

/**
 * Entity folders under `<WIKI>/entities`, paired with the class key used by
 * the `read_entity` / `entity_neighbors` schemas, so a search hit's `type`
 * can be passed straight back as a `class` argument.
 */
const ENTITY_FOLDERS: ReadonlyArray<readonly [folder: string, classKey: string]> = [
  ["people", "person"],
  ["organizations", "organization"],
  ["locations", "location"],
  ["events", "event"],
  ["uap-objects", "uap_object"],
  ["vehicles", "vehicle"],
  ["operations", "operation"],
  ["concepts", "concept"],
];

/** Resolve the response language: explicit override → context preference → "pt". */
function pickLang(ctx: ToolHandlerContext, override?: unknown): "pt" | "en" {
  if (override === "en" || override === "pt") return override;
  return ctx.lang === "en" ? "en" : "pt";
}

/** Human-readable message for anything that can land in a `catch` clause. */
function errorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Coerce a model-supplied count to an integer in `[1, max]`.
 * Missing, non-numeric, zero, negative, and sub-1 values yield `fallback`.
 */
function clampInt(value: unknown, fallback: number, max: number): number {
  const n = Math.trunc(Number(value));
  if (!Number.isFinite(n) || n < 1) return fallback;
  return Math.min(n, max);
}

/** Trimmed string if `value` is a non-blank string, otherwise `null`. */
function optString(value: unknown): string | null {
  if (typeof value !== "string") return null;
  const s = value.trim();
  return s ? s : null;
}

/** Relevance score of a hit (rerank score when present), rounded to 4 decimals. */
function hitScore(h: ChunkHit): number {
  return Number((h.rerank_score ?? h.score).toFixed(4));
}

/** First 300 characters of a hit's text in the requested language. */
function hitSnippet(h: ChunkHit, lang: "pt" | "en"): string {
  const text = lang === "en" ? h.content_en : h.content_pt;
  return (text || "").slice(0, 300);
}

/** Trim a retrieval hit down to the fields the model needs to cite it. */
function compactHit(h: ChunkHit, lang: "pt" | "en") {
  return {
    chunk_id: h.chunk_id,
    doc_id: h.doc_id,
    page: h.page,
    type: h.type,
    classification: h.classification,
    bbox: h.bbox,
    snippet: hitSnippet(h, lang),
    score: hitScore(h),
    href: `/d/${h.doc_id}#${h.chunk_id}`,
  };
}

/**
 * Extract a whitespace-normalized excerpt of `text`, starting up to 60
 * characters before the first occurrence of the first query word that has at
 * least 3 characters. Falls back to the start of `text` when nothing matches.
 */
function snippet(text: string, query: string, len = 200): string {
  const lc = text.toLowerCase();
  const q = query.toLowerCase().split(/\s+/).find((w) => w.length >= 3) ?? "";
  const i = q ? lc.indexOf(q) : -1;
  const start = i >= 0 ? Math.max(0, i - 60) : 0;
  return text.slice(start, start + len).replace(/\s+/g, " ").trim();
}

/**
 * Validate a model-supplied bounding box. Accepts numbers or numeric strings;
 * requires finite values, a non-negative origin, and a positive size.
 */
function parseBbox(value: unknown): { x: number; y: number; w: number; h: number } | null {
  if (typeof value !== "object" || value === null) return null;
  const raw = value as Record<string, unknown>;
  const toNum = (v: unknown): number => {
    if (typeof v === "number") return v;
    if (typeof v === "string" && v.trim() !== "") return Number(v);
    return NaN;
  };
  const x = toNum(raw.x);
  const y = toNum(raw.y);
  const w = toNum(raw.w);
  const h = toNum(raw.h);
  if (![x, y, w, h].every(Number.isFinite)) return null;
  if (x < 0 || y < 0 || w <= 0 || h <= 0) return null;
  return { x, y, w, h };
}

/**
 * Emit one citation (+ optional crop_image) artifact per hit so the UI can
 * render inline cards next to the assistant text. Only the first
 * MAX_INLINE_ARTIFACTS hits are emitted to avoid flooding the chat with crops
 * when top_k is large.
 */
function emitHitArtifacts(
  hits: readonly ChunkHit[],
  lang: "pt" | "en",
  emit: NonNullable<ToolHandlerContext["emitArtifact"]>,
): void {
  for (const h of hits.slice(0, MAX_INLINE_ARTIFACTS)) {
    emit({
      kind: "citation",
      chunk_id: h.chunk_id,
      doc_id: h.doc_id,
      page: h.page,
      type: h.type,
      classification: h.classification,
      bbox: h.bbox ?? null,
      snippet: hitSnippet(h, lang),
      score: hitScore(h),
    });
    // A crop only makes sense for a bbox with a real area.
    if (h.bbox && h.bbox.w > 0 && h.bbox.h > 0) {
      const bb = h.bbox;
      const src =
        `/api/crop?doc=${encodeURIComponent(h.doc_id)}` +
        `&page=${h.page}&x=${bb.x}&y=${bb.y}&w=${bb.w}&h=${bb.h}&w_px=640`;
      emit({
        kind: "crop_image",
        src,
        doc_id: h.doc_id,
        page: h.page,
        chunk_id: h.chunk_id,
        alt_en: (h.content_en || h.chunk_id).slice(0, 120),
        alt_pt: (h.content_pt || h.chunk_id).slice(0, 120),
      });
    }
  }
}

/* ─── Tool handlers ─────────────────────────────────────────────────────── */

/**
 * `hybrid_search` — run the chunk retrieval pipeline and return compact hits.
 * Falls back to the currently-viewed document (`ctx.doc_id`) when the model
 * gives no `doc_id`. Retrieval failures are reported as
 * `{ error: "retrieval_unavailable" }` with a hint to use `search_corpus`.
 */
async function handleHybridSearch(
  args: Record<string, unknown>,
  ctx: ToolHandlerContext,
): Promise<unknown> {
  const query = String(args.query ?? "").trim();
  if (!query) return { error: "empty_query", hits: [] };
  const lang = pickLang(ctx, args.lang);
  const top_k = clampInt(args.top_k, 20, 50);
  try {
    const hits = await hybridSearch({
      query,
      lang,
      doc_id: optString(args.doc_id) || ctx.doc_id || null,
      type: optString(args.type),
      classification: optString(args.classification),
      ufo_only: Boolean(args.ufo_only),
      top_k,
      // W2-TD#8: chat is latency-sensitive — skip rerank when ≤10 candidates.
      // The model only cites the first few hits anyway and BGE-Reranker
      // adds 5-8s on CPU. RRF order from the RPC is plenty for the head.
      rerank_strategy: "when_top_k_gt",
      rerank_threshold: 10,
    });
    if (ctx.emitArtifact) emitHitArtifacts(hits, lang, ctx.emitArtifact);
    return { query, lang, count: hits.length, hits: hits.map((h) => compactHit(h, lang)) };
  } catch (e) {
    return {
      error: "retrieval_unavailable",
      message: errorMessage(e),
      fallback: "use search_corpus (legacy keyword)",
    };
  }
}

/** `read_chunk` — return one chunk in full (both languages) by doc_id + chunk_id. */
async function handleReadChunk(args: Record<string, unknown>): Promise<unknown> {
  const doc_id = String(args.doc_id ?? "").trim();
  const chunk_id = String(args.chunk_id ?? "").trim();
  if (!doc_id || !chunk_id) return { error: "missing_args" };
  try {
    const c = await getChunk(doc_id, chunk_id);
    if (!c) return { error: "not_found", doc_id, chunk_id };
    return {
      chunk_id: c.chunk_id,
      doc_id: c.doc_id,
      page: c.page,
      type: c.type,
      bbox: c.bbox,
      classification: c.classification,
      content_en: c.content_en,
      content_pt: c.content_pt,
      href: `/d/${c.doc_id}#${c.chunk_id}`,
    };
  } catch (e) {
    return { error: "retrieval_unavailable", message: errorMessage(e) };
  }
}

/**
 * `get_page_chunks` — return every chunk of one page, with each chunk's text
 * truncated to 500 characters per language. `page` must be a positive integer.
 */
async function handleGetPageChunks(args: Record<string, unknown>): Promise<unknown> {
  const doc_id = String(args.doc_id ?? "").trim();
  const page = Number(args.page);
  if (!doc_id || !Number.isInteger(page) || page < 1) return { error: "bad_args" };
  try {
    const chunks = await getPageChunks(doc_id, page);
    return {
      doc_id,
      page,
      count: chunks.length,
      chunks: chunks.map((c) => ({
        chunk_id: c.chunk_id,
        type: c.type,
        bbox: c.bbox,
        classification: c.classification,
        content_en: (c.content_en || "").slice(0, 500),
        content_pt: (c.content_pt || "").slice(0, 500),
      })),
    };
  } catch (e) {
    return { error: "retrieval_unavailable", message: errorMessage(e) };
  }
}

/**
 * `list_anomalies` — list anomaly-flagged chunks. Any `kind` other than
 * "cryptid" is treated as "ufo"; `limit` defaults to 50 and is capped at 200.
 */
async function handleListAnomalies(
  args: Record<string, unknown>,
  ctx: ToolHandlerContext,
): Promise<unknown> {
  const kind = args.kind === "cryptid" ? "cryptid" : "ufo";
  const doc_id = optString(args.doc_id) || ctx.doc_id || null;
  const limit = clampInt(args.limit, 50, 200);
  try {
    const rows = await listAnomalies({ kind, doc_id, limit });
    return { kind, doc_id, count: rows.length, anomalies: rows };
  } catch (e) {
    return { error: "retrieval_unavailable", message: errorMessage(e) };
  }
}

/**
 * `search_corpus` — legacy case-insensitive substring search.
 * Documents match on id, canonical title, or the first 2000 characters of the
 * body; entities match on their file id only. Returns at most MAX_LEGACY_HITS
 * hits, documents first. An unrecognized `scope` is treated as "all".
 */
async function handleSearch(args: Record<string, unknown>): Promise<unknown> {
  const query = String(args.query ?? "").trim();
  const rawScope = args.scope;
  const scope: "all" | "documents" | "entities" =
    rawScope === "documents" || rawScope === "entities" ? rawScope : "all";
  if (!query) return { error: "empty_query", hits: [] };

  const ql = query.toLowerCase();
  const hits: Array<{ type: string; id: string; title: string; snippet: string; href: string }> = [];
  const full = (): boolean => hits.length >= MAX_LEGACY_HITS;

  if (scope === "all" || scope === "documents") {
    const ids = await listDocuments();
    // Sequential on purpose: the loop stops reading files once enough hits are found.
    for (const id of ids) {
      if (full()) break;
      const f = await readDocument(id);
      if (!f) continue;
      const title = String(f.fm.canonical_title ?? id);
      const hay = `${id} ${title} ${f.body.slice(0, 2000)}`.toLowerCase();
      if (!hay.includes(ql)) continue;
      hits.push({
        type: "document",
        id,
        title,
        snippet: snippet(f.body, query),
        href: `/d/${id}`,
      });
    }
  }

  if ((scope === "all" || scope === "entities") && !full()) {
    for (const [folder, classKey] of ENTITY_FOLDERS) {
      if (full()) break;
      const dir = path.join(WIKI, "entities", folder);
      try {
        const entries = await fs.readdir(dir);
        for (const file of entries) {
          if (full()) break;
          if (!file.endsWith(".md")) continue;
          const id = file.slice(0, -".md".length);
          if (!id.toLowerCase().includes(ql)) continue;
          const content = await fs.readFile(path.join(dir, file), "utf-8");
          const cname = content.match(/canonical_name:\s*([^\n]+)/)?.[1]?.trim() ?? id;
          hits.push({
            type: classKey,
            id,
            title: cname,
            snippet: id,
            href: `/e/${folder}/${id}`,
          });
        }
      } catch {
        /* dir missing or unreadable — best-effort search, skip this class */
      }
    }
  }
  return { query, scope, hits };
}

/**
 * `read_page` — return the legacy wiki record of one page.
 * `page` may be a page id ("p007"), a loosely formatted id ("p7", "P007"), or
 * a bare number ("7"); everything is normalized to `p` + zero-padded number.
 */
async function handleReadPage(args: Record<string, unknown>): Promise<unknown> {
  const doc_id = String(args.doc_id ?? "").trim();
  let page = String(args.page ?? "").trim();
  if (!/^p\d{3}$/.test(page)) {
    // Strip an optional "p" prefix so "p7" parses the same way as "7".
    const n = parseInt(page.replace(/^p/i, ""), 10);
    if (!Number.isFinite(n) || n < 0) return { error: "bad_page" };
    page = `p${String(n).padStart(3, "0")}`;
  }
  const md = await readPage(doc_id, page);
  if (!md) return { error: "not_found", doc_id, page };
  return {
    doc_id,
    page,
    page_type: md.fm.page_type,
    language: md.fm.language_detected,
    content_classification: md.fm.content_classification,
    redactions_count: Array.isArray(md.fm.redactions) ? (md.fm.redactions as never[]).length : 0,
    vision_description: md.fm.vision_description,
    vision_description_pt_br: md.fm.vision_description_pt_br,
    entities_extracted: md.fm.entities_extracted,
    body_excerpt: md.body.slice(0, 2000),
  };
}

/**
 * `read_document` — return a document overview: front-matter fields, the
 * first 20 entries of the page index, and the first 2000 characters of the body.
 */
async function handleReadDocument(args: Record<string, unknown>): Promise<unknown> {
  const doc_id = String(args.doc_id ?? "").trim();
  const md = await readDocument(doc_id);
  if (!md) return { error: "not_found", doc_id };
  const pages = await listPages(doc_id);
  return {
    doc_id,
    canonical_title: md.fm.canonical_title,
    collection: md.fm.collection,
    document_class: md.fm.document_class,
    page_count: pages.length,
    pages_index: pages.slice(0, 20),
    content_classification: md.fm.content_classification,
    languages_detected: md.fm.languages_detected,
    key_entities: md.fm.key_entities,
    executive_summary: md.body.slice(0, 2000),
  };
}

/**
 * `read_entity` — return an entity's front-matter fields and the first 2000
 * characters of its body. `class` is mapped to a folder via `classKeyToFolder`;
 * an unmapped class yields `{ error: "bad_class" }`.
 */
async function handleReadEntity(args: Record<string, unknown>): Promise<unknown> {
  const cls = String(args.class ?? "").trim();
  const id = String(args.id ?? "").trim();
  const folder = classKeyToFolder(cls);
  if (!folder) return { error: "bad_class", cls };
  const md = await readEntity(folder, id);
  if (!md) return { error: "not_found", cls, id };
  return {
    class: folder,
    id,
    canonical_name: md.fm.canonical_name,
    aliases: md.fm.aliases,
    total_mentions: md.fm.total_mentions,
    enrichment_status: md.fm.enrichment_status,
    external_sources: md.fm.external_sources,
    disambiguation_note: md.fm.disambiguation_note,
    body_excerpt: md.body.slice(0, 2000),
  };
}

/**
 * `entity_neighbors` — list graph neighbors of an entity.
 * `limit` defaults to 30 (max 100). `filter_classes` is honored only when it
 * is an array; non-string and empty entries are dropped.
 */
async function handleEntityNeighbors(args: Record<string, unknown>): Promise<unknown> {
  const cls = String(args.class ?? "").trim();
  const id = String(args.id ?? "").trim();
  if (!cls || !id) return { error: "missing_args" };
  const filterClasses = Array.isArray(args.filter_classes)
    ? args.filter_classes.filter((c): c is string => typeof c === "string" && c.length > 0)
    : undefined;
  const limit = clampInt(args.limit, 30, 100);
  try {
    const ent = await findEntity(cls, id);
    if (!ent) return { error: "entity_not_found", class: cls, id };
    const neighbors = await getNeighbors(ent.entity_pk, { limit, classes: filterClasses });
    return { entity: ent, count: neighbors.length, neighbors };
  } catch (e) {
    return { error: "graph_unavailable", message: errorMessage(e) };
  }
}

/**
 * `entity_path` — find paths between two entities.
 * `max_hops` defaults to 3 and is clamped to 1–4.
 */
async function handleEntityPath(args: Record<string, unknown>): Promise<unknown> {
  const fromCls = String(args.from_class ?? "").trim();
  const fromId = String(args.from_id ?? "").trim();
  const toCls = String(args.to_class ?? "").trim();
  const toId = String(args.to_id ?? "").trim();
  const maxHops = clampInt(args.max_hops, 3, 4);
  if (!fromCls || !fromId || !toCls || !toId) return { error: "missing_args" };
  try {
    const [a, b] = await Promise.all([findEntity(fromCls, fromId), findEntity(toCls, toId)]);
    if (!a) return { error: "from_not_found", class: fromCls, id: fromId };
    if (!b) return { error: "to_not_found", class: toCls, id: toId };
    const paths = await findPaths(a.entity_pk, b.entity_pk, maxHops);
    return { from: a, to: b, max_hops: maxHops, paths };
  } catch (e) {
    return { error: "graph_unavailable", message: errorMessage(e) };
  }
}

/**
 * `co_mention_chunks` — return chunks in which both entities appear.
 * `limit` defaults to 20 (max 100).
 */
async function handleCoMentionChunks(args: Record<string, unknown>): Promise<unknown> {
  const aCls = String(args.a_class ?? "").trim();
  const aId = String(args.a_id ?? "").trim();
  const bCls = String(args.b_class ?? "").trim();
  const bId = String(args.b_id ?? "").trim();
  const limit = clampInt(args.limit, 20, 100);
  if (!aCls || !aId || !bCls || !bId) return { error: "missing_args" };
  try {
    const [a, b] = await Promise.all([findEntity(aCls, aId), findEntity(bCls, bId)]);
    if (!a || !b) return { error: "entity_not_found", a: aId, b: bId };
    const chunks = await getCoMentionChunks(a.entity_pk, b.entity_pk, limit);
    return { a, b, count: chunks.length, chunks };
  } catch (e) {
    return { error: "graph_unavailable", message: errorMessage(e) };
  }
}

/**
 * `navigate_to` — validate a navigation target and echo it back for the UI.
 * Only site-relative paths are accepted: the target must start with a single
 * "/" and contain no backslash, because browsers resolve "//host" and "/\host"
 * as links to another origin. The label is truncated to 40 characters.
 */
async function handleNavigate(args: Record<string, unknown>): Promise<unknown> {
  const target = String(args.target ?? "").trim();
  const label = String(args.label ?? "").slice(0, 40);
  if (!target.startsWith("/")) return { error: "target_must_start_with_slash", target };
  if (target.startsWith("//") || target.includes("\\")) {
    return { error: "target_must_be_site_relative", target };
  }
  return { ok: true, target, label };
}

/**
 * `analyze_image_region` — ask the vision module a question about a page crop.
 * Arguments are validated before the call because each call is billed:
 * `page` must be a positive integer and `bbox` must pass `parseBbox`.
 * The vision module is imported lazily, on first use.
 */
async function handleAnalyzeImageRegion(
  args: Record<string, unknown>,
  ctx: ToolHandlerContext,
): Promise<unknown> {
  const doc_id = String(args.doc_id ?? "").trim();
  const page = Number(args.page);
  const question = String(args.question ?? "").trim();
  if (!doc_id || !page || args.bbox == null || !question) return { error: "missing_args" };
  if (!Number.isInteger(page) || page < 1) return { error: "bad_args" };
  const bbox = parseBbox(args.bbox);
  if (!bbox) return { error: "bad_bbox" };
  try {
    const { analyzeImageRegion } = await import("./vision");
    const out = await analyzeImageRegion({
      doc_id,
      page,
      bbox,
      question,
      context: typeof args.context === "string" ? args.context : undefined,
      lang: pickLang(ctx),
    });
    if (ctx.emitArtifact) {
      ctx.emitArtifact({
        kind: "crop_image",
        src: out.crop_url,
        doc_id,
        page,
        alt_en: question.slice(0, 120),
        alt_pt: question.slice(0, 120),
      });
    }
    return out;
  } catch (e) {
    return { error: "vision_failed", message: errorMessage(e) };
  }
}

/** Handler registry keyed by tool name; keys match `TOOL_DEFINITIONS[*].function.name`. */
export const TOOL_HANDLERS: Record<string, ToolHandler> = {
  hybrid_search: handleHybridSearch,
  read_chunk: handleReadChunk,
  get_page_chunks: handleGetPageChunks,
  list_anomalies: handleListAnomalies,
  entity_neighbors: handleEntityNeighbors,
  entity_path: handleEntityPath,
  co_mention_chunks: handleCoMentionChunks,
  read_page: handleReadPage,
  read_document: handleReadDocument,
  read_entity: handleReadEntity,
  search_corpus: handleSearch,
  analyze_image_region: handleAnalyzeImageRegion,
  navigate_to: handleNavigate,
};