- TD#8 hybrid.ts: rerank_strategy {always|when_top_k_gt|never} + threshold
(default skips rerank for top_k ≤ 15; chat tool uses threshold 10)
- O11 vision.ts + tools.ts: analyze_image_region tool — sharp-crops the
bbox, claude CLI reads the temp PNG via Read tool, Sonnet vision answers
- TD#12 /graph: SigmaGraph replaces ForceGraphCanvas; react-force-graph-2d
uninstalled (-37 transitive deps); force-graph-canvas.tsx deleted
- TD#27 messages/route.ts gatherContext slice sizes via CTX_* env vars
- TD#22 tests/rag/: golden.yaml (15 queries) + run.py (Recall@k + MRR +
negative-pass rate) + baseline.json + CI job in .forgejo/workflows/ci.yml
- docs/adrs/: ADR-001..005 published from systems-atelier deliverables
Verified live on disclosure.top: top_k=5 path skips rerank (6.7s embed-only,
was 12-15s with rerank); rerank=always still available on demand.
First RAG baseline: Recall@5 = 0.2083, MRR = 0.25, Negative pass = 1.0.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
178 lines
6.5 KiB
Python
178 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
tests/rag/run.py — Golden RAG evaluation.
|
|
|
|
Reads tests/rag/golden.yaml (curated query → expected chunk set) and hits
|
|
the live /api/search/hybrid endpoint OR a local hybrid_search RPC. Computes
|
|
Recall@5 and MRR per query, plus aggregates. Writes a JSON report to
|
|
tests/rag/last_run.json and compares with tests/rag/baseline.json.
|
|
|
|
CI gate: if Recall@5 drops more than --max-recall-drop (default 0.05) from
|
|
baseline, exit 1.
|
|
|
|
Usage:
|
|
python3 tests/rag/run.py # uses prod URL
|
|
python3 tests/rag/run.py --url http://localhost:3000 # local dev
|
|
python3 tests/rag/run.py --refresh-baseline # accept current as baseline
|
|
python3 tests/rag/run.py --top-k 10 --rerank never
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import json
|
|
import sys
|
|
import urllib.parse
|
|
import urllib.request
|
|
import urllib.error
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
sys.exit("pip install pyyaml")
|
|
|
|
# Directory containing this script; every data file below lives next to it.
ROOT = Path(__file__).resolve().parent
# Curated queries with their expected chunks (input).
GOLDEN = ROOT.joinpath("golden.yaml")
# Reference metrics the current run is compared against.
BASELINE = ROOT.joinpath("baseline.json")
# Full report of the most recent run (output).
LAST_RUN = ROOT.joinpath("last_run.json")
|
|
|
|
|
|
def search(base_url: str, q: str, lang: str, top_k: int, rerank: str) -> list[dict]:
    """Query the hybrid search endpoint and return its list of hits.

    Sends a GET to ``<base_url>/api/search/hybrid`` with ``q``, ``lang`` and
    ``top_k``. The ``rerank`` parameter is only forwarded when it is
    ``"never"`` or ``"always"``; any other value is omitted from the request.

    Returns the ``"hits"`` entry of the JSON response (``[]`` when the key is
    absent). Any failure (HTTP error status, network error, malformed
    response) is reported on stderr and also yields ``[]``.

    NOTE(review): a failed request is indistinguishable from "no results" for
    the caller, so a negative-set query whose request fails is scored as a
    pass by ``evaluate``.
    """
    query = {"q": q, "lang": lang, "top_k": str(top_k)}
    if rerank in ("never", "always"):
        query["rerank"] = rerank
    endpoint = base_url.rstrip("/") + "/api/search/hybrid?" + urllib.parse.urlencode(query)

    try:
        with urllib.request.urlopen(endpoint, timeout=30) as resp:
            payload = json.loads(resp.read())
        return payload.get("hits", [])
    except urllib.error.HTTPError as err:
        # Must precede the generic handler so the status code is reported.
        problem = f"HTTP {err.code}"
    except Exception as err:
        problem = f"{err}"

    sys.stderr.write(f" ! {problem} on {q!r}\n")
    return []
|
|
|
|
|
|
def evaluate(golden: list[dict], hits_by_id: dict[str, list[dict]], k: int) -> dict:
    """Score search results against the golden set.

    Args:
        golden: Query entries. Each needs an ``"id"`` and may carry
            ``"expected_chunks"``, a list of ``{"doc": ..., "chunk": ...}``
            mappings. An entry with no expected chunks is a negative-set query.
        hits_by_id: Hits per query id. Each hit is matched on its
            ``"doc_id"`` / ``"chunk_id"`` values; queries with no entry are
            treated as having returned nothing.
        k: Cut-off; only the first ``k`` hits of each query are scored.

    Returns:
        A report dict with the aggregate ``recall_at_k`` and ``mrr`` (means
        over positive queries, ``0.0`` when there are none),
        ``negative_pass_rate`` (``1.0`` when there are no negative queries),
        the query counts, and a ``per_query`` breakdown.

    Raises:
        KeyError: If a golden entry lacks ``"id"``, or an expected chunk lacks
            ``"doc"`` or ``"chunk"``.
    """
    per_query: list[dict] = []
    pos_recalls: list[float] = []
    pos_mrrs: list[float] = []
    neg_pass = 0
    neg_total = 0

    for q in golden:
        qid = q["id"]
        expected = {(e["doc"], e["chunk"]) for e in (q.get("expected_chunks") or [])}
        topk = hits_by_id.get(qid, [])[:k]

        if not expected:
            # Negative-set query: deliberately strict, it passes only when
            # the search returned nothing at all within the top-k.
            neg_total += 1
            ok = not topk
            per_query.append({
                "id": qid, "negative": True, "ok": ok,
                "n_hits": len(topk),
            })
            if ok:
                neg_pass += 1
            continue

        hit_keys = [(h.get("doc_id"), h.get("chunk_id")) for h in topk]
        # Count distinct expected chunks, not matching hits: a chunk returned
        # twice in the top-k must not push recall above 1.0.
        present = len(expected.intersection(hit_keys))
        recall = present / len(expected)
        # Reciprocal rank of the first matching hit (1-indexed); 0 if none.
        rr = next(
            (1.0 / rank for rank, key in enumerate(hit_keys, start=1) if key in expected),
            0.0,
        )
        per_query.append({
            "id": qid, "negative": False,
            "recall_at_k": round(recall, 4),
            "mrr": round(rr, 4),
            "n_expected": len(expected),
            "n_present": present,
        })
        pos_recalls.append(recall)
        pos_mrrs.append(rr)

    return {
        "k": k,
        "n_queries": len(per_query),
        "n_positive": len(pos_recalls),
        "n_negative": neg_total,
        "recall_at_k": round(sum(pos_recalls) / len(pos_recalls), 4) if pos_recalls else 0.0,
        "mrr": round(sum(pos_mrrs) / len(pos_mrrs), 4) if pos_mrrs else 0.0,
        "negative_pass_rate": round(neg_pass / neg_total, 4) if neg_total else 1.0,
        "per_query": per_query,
    }
|
|
|
|
|
|
def main() -> int:
    """Run the golden-set evaluation and apply the recall gate.

    Loads the queries from ``GOLDEN``, runs each through ``search``, scores
    them with ``evaluate`` and writes the report to ``LAST_RUN``. With
    ``--refresh-baseline`` the aggregate metrics are stored in ``BASELINE``
    instead of being compared against it.

    Returns:
        ``1`` when recall dropped from the baseline by more than
        ``--max-recall-drop``; ``0`` otherwise, including when no baseline
        exists yet or when it was just refreshed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--url", default="https://disclosure.top",
                    help="Base URL of the deployment to evaluate")
    ap.add_argument("--top-k", type=int, default=5)
    ap.add_argument("--rerank", choices=["always", "when_top_k_gt", "never"],
                    default="when_top_k_gt")
    ap.add_argument("--refresh-baseline", action="store_true",
                    help="Overwrite baseline.json with this run (acknowledged regression).")
    ap.add_argument("--max-recall-drop", type=float, default=0.05)
    args = ap.parse_args()

    # Explicit UTF-8: the platform default encoding may not be able to decode
    # non-ASCII question text.
    data = yaml.safe_load(GOLDEN.read_text(encoding="utf-8"))
    queries = data["queries"]
    print(f"= running {len(queries)} queries against {args.url} (k={args.top_k}, rerank={args.rerank})")

    hits_by_id = {}
    for q in queries:
        # At least 10 hits are always requested, even when scoring a smaller
        # k; evaluate() truncates to args.top_k.
        # NOTE(review): the server therefore never sees top_k < 10 from this
        # script — confirm that is the intended path to measure.
        hits = search(args.url, q["question"], q.get("lang", "pt"),
                      top_k=max(args.top_k, 10), rerank=args.rerank)
        hits_by_id[q["id"]] = hits
        first = hits[0].get("chunk_id") if hits else "-"
        print(f" {q['id']:24s} → {len(hits):2d} hits (first={first})")

    report = evaluate(queries, hits_by_id, k=args.top_k)
    report["url"] = args.url
    report["top_k"] = args.top_k
    report["rerank"] = args.rerank

    LAST_RUN.write_text(json.dumps(report, indent=2), encoding="utf-8")
    print(f"\n— wrote {LAST_RUN}")
    print(f" Recall@{args.top_k} = {report['recall_at_k']:.4f}")
    print(f" MRR = {report['mrr']:.4f}")
    print(f" Negative pass = {report['negative_pass_rate']:.4f}")

    if args.refresh_baseline:
        BASELINE.write_text(json.dumps({
            "url": args.url, "top_k": args.top_k, "rerank": args.rerank,
            "recall_at_k": report["recall_at_k"],
            "mrr": report["mrr"],
            "negative_pass_rate": report["negative_pass_rate"],
        }, indent=2), encoding="utf-8")
        print(f"\n✓ baseline refreshed: {BASELINE}")
        return 0

    if not BASELINE.exists():
        print("\n! no baseline yet — run with --refresh-baseline to create one")
        return 0

    baseline = json.loads(BASELINE.read_text(encoding="utf-8"))

    # A baseline recorded at a different k or rerank strategy is not a
    # like-for-like reference; warn but still apply the gate.
    mismatched = [
        key for key in ("top_k", "rerank")
        if key in baseline and baseline[key] != report[key]
    ]
    if mismatched:
        sys.stderr.write(
            f" ! baseline was recorded with a different {', '.join(mismatched)}; "
            "the recall comparison may not be meaningful\n"
        )

    drop = baseline["recall_at_k"] - report["recall_at_k"]
    print(f"\n baseline Recall@{args.top_k} = {baseline['recall_at_k']:.4f} (Δ {-drop:+.4f})")
    if drop > args.max_recall_drop:
        print(f"\n✗ GATE FAILED: Recall@{args.top_k} dropped {drop:.4f} > {args.max_recall_drop}")
        return 1
    print(f"\n✓ gate passed (drop ≤ {args.max_recall_drop})")
    return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|