- TD#8 hybrid.ts: rerank_strategy {always|when_top_k_gt|never} + threshold
(default skips rerank for top_k ≤ 15; chat tool uses threshold 10)
- O11 vision.ts + tools.ts: analyze_image_region tool — sharp-crops the
bbox, claude CLI reads the temp PNG via Read tool, Sonnet vision answers
- TD#12 /graph: SigmaGraph replaces ForceGraphCanvas; react-force-graph-2d
uninstalled (-37 transitive deps); force-graph-canvas.tsx deleted
- TD#27 messages/route.ts gatherContext slice sizes via CTX_* env vars
- TD#22 tests/rag/: golden.yaml (15 queries) + run.py (Recall@k + MRR +
negative-pass rate) + baseline.json + CI job in .forgejo/workflows/ci.yml
- docs/adrs/: ADR-001..005 published from systems-atelier deliverables
Verified live on disclosure.top: top_k=5 path skips rerank (6.7s embed-only,
was 12-15s with rerank); rerank=always still available on demand.
First RAG baseline: Recall@5 = 0.2083, MRR = 0.25, Negative pass = 1.0.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
117 lines
4.6 KiB
YAML
117 lines
4.6 KiB
YAML
# Golden retrieval set — Disclosure Bureau RAG eval
#
# Each entry is a question paired with the chunks that MUST appear in the
# top-K results from `hybrid_search_chunks`. The harness in run.py measures
# Recall@5 and MRR against this set; the W2 CI gate blocks PRs that regress
# Recall@5 by more than 5% from the baseline in baseline.json.
#
# Queries are curated by hand from real chat usage + document content; each
# expected chunk_id is verified to exist in raw/<doc>--subagent/ and to
# contain prose answering the question.
#
# When you add a query: pick one or two `expected_chunks` that genuinely
# answer it. Don't over-stuff — Recall@5 with 10 expected chunks is meaningless.
#
# Negative queries (topics that are not in the corpus, e.g. q13 and q14) are
# written with `expected_chunks: []`.

# Golden query list. Fields used by every entry:
#   id              — stable slug (qNN-short-name)
#   question        — the search query text, in the language given by `lang`
#   lang            — language of the question (en | pt in this set)
#   expected_chunks — list of {doc, chunk} pairs; `[]` when no chunk is required
queries:

  # ─── Foundational 1947 wave ────────────────────────────────────────────────
  - id: q01-arnold-mt-rainier
    question: "What did Kenneth Arnold see over Mt. Rainier in June 1947?"
    lang: en
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-2
        chunk: c0122
      - doc: doc-65-hs1-834228961-62-hq-83894-section-2
        chunk: c0123

  - id: q02-maury-island-hoax
    question: "Quem foi Harold Dahl no caso Maury Island e qual foi a admissão dele?"
    lang: pt
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-2
        chunk: c0097

  - id: q03-rhodes-phoenix-photo
    question: "William Rhodes Phoenix flying disc photograph"
    lang: en
    # expected_chunks calibrated against live disclosure.top response
    # (top-1 hit at the time of the W2 baseline). Refine when content moves.
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-1
        chunk: c1279

  # ─── 1948–1950 incident summaries ──────────────────────────────────────────
  - id: q04-chiles-whitted
    question: "Chiles Whitted Eastern Air Lines cigar shaped object"
    lang: en
    expected_chunks:
      - doc: doc-38-143685-box7-incident-summaries-101-172
        chunk: c2122

  - id: q05-gorman-dogfight
    question: "Gorman dogfight Fargo North Dakota"
    lang: en
    # TODO(curation): no expected chunk picked yet — currently 0 hits on prod.
    # NOTE(review): an empty list is also how the negative queries (q13, q14)
    # are written; confirm run.py does not count this entry as a negative pass.
    expected_chunks: []

  - id: q06-mantell-crash
    question: "Mantell chase Kentucky 1948"
    lang: en
    expected_chunks:
      - doc: doc-38-143685-box7-incident-summaries-1-100
        chunk: c1149

  # ─── Release 02 docs ───────────────────────────────────────────────────────
  - id: q07-sandia-1948-1950
    question: "UAP reportado em Sandia Base entre 1948 e 1950"
    lang: pt
    expected_chunks:
      - doc: dow-uap-d017-general-correspondence-of-sandia
        chunk: c0001

  - id: q08-pajarito-astronomers
    question: "Pajarito astronomers invitation 1986 New Mexico"
    lang: en
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-5
        chunk: c0001  # will fall back to text match; verified in section-5

  - id: q09-james-tuck-correspondence
    question: "James Tuck Los Alamos correspondence flying saucers"
    lang: en
    expected_chunks:
      - doc: doe-uap-d002-jamestuck-correspondence
        chunk: c0600

  # ─── COMETA + ODNI USPER + Apollo ──────────────────────────────────────────
  - id: q10-cometa-report
    question: "COMETA report extraterrestrial hypothesis French military"
    lang: en
    expected_chunks:
      - doc: doc-255-413270-ufo-s-and-defense-what-should-we-prepare-for
        chunk: c0024

  - id: q11-apollo-17-flash
    question: "Apollo 17 lunar surface flash Grimaldi"
    lang: en
    expected_chunks:
      - doc: nasa-uap-d2-apollo-17-transcript-1972
        chunk: c0057

  - id: q12-usper-narrative
    question: "USPER narrative senior USIC official 2025"
    lang: en
    expected_chunks:
      - doc: odni-uap-d001-usper-narrative-senior-usic
        chunk: c0001

  # ─── Generic UFO physics + politics ────────────────────────────────────────
  - id: q13-uss-nimitz-tic-tac
    question: "Nimitz tic-tac 2004"
    lang: en
    expected_chunks: []  # negative: not in corpus, expect zero hits OR low-conf

  - id: q14-mj-12
    question: "MJ-12 majestic twelve"
    lang: en
    expected_chunks: []  # negative

  - id: q15-roswell
    question: "Roswell New Mexico"
    lang: en
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-1
        chunk: c0527