# disclosure-bureau/tests/rag/golden.yaml

# Golden retrieval set — Disclosure Bureau RAG eval
#
# Each entry is a question paired with the chunks that MUST appear in the
# top-K results from `hybrid_search_chunks`. The harness in run.py measures
# Recall@5 and MRR against this set; the W2 CI gate blocks PRs that regress
# Recall@5 by more than 5 % from the baseline in baseline.json.
#
# Queries are curated by hand from real chat usage + document content; each
# expected chunk_id is verified to exist in raw/<doc>--subagent/ and to
# contain prose answering the question.
#
# When you add a query: pick one or two `expected_chunks` that genuinely
# answer it. Don't over-stuff — Recall@5 with 10 expected chunks is meaningless.
queries:
# Entry shape: `id`, `question`, `lang`, and `expected_chunks` — a list of
# {doc, chunk} mappings naming the chunks expected for that question.
#
# ─── Foundational 1947 wave ────────────────────────────────────────────────
- id: q01-arnold-mt-rainier
  question: 'What did Kenneth Arnold see over Mt. Rainier in June 1947?'
  lang: en
  # Two chunks from the same document.
  expected_chunks:
  - doc: doc-65-hs1-834228961-62-hq-83894-section-2
    chunk: c0122
  - doc: doc-65-hs1-834228961-62-hq-83894-section-2
    chunk: c0123
- id: q02-maury-island-hoax
  # Portuguese-language question (lang: pt).
  question: 'Quem foi Harold Dahl no caso Maury Island e qual foi a admissão dele?'
  lang: pt
  expected_chunks:
  - doc: doc-65-hs1-834228961-62-hq-83894-section-2
    chunk: c0097
- id: q03-rhodes-phoenix-photo
  question: 'William Rhodes Phoenix flying disc photograph'
  lang: en
  # Calibrated against the live disclosure.top response: this chunk was the
  # top-1 hit at the time of the W2 baseline. Refine when content moves.
  expected_chunks:
  - doc: doc-65-hs1-834228961-62-hq-83894-section-1
    chunk: c1279
# ─── 1948–1950 incident summaries ──────────────────────────────────────────
- id: q04-chiles-whitted
  question: 'Chiles Whitted Eastern Air Lines cigar shaped object'
  lang: en
  expected_chunks:
  - doc: doc-38-143685-box7-incident-summaries-101-172
    chunk: c2122
- id: q05-gorman-dogfight
  question: 'Gorman dogfight Fargo North Dakota'
  lang: en
  # Currently 0 hits on prod — flagged for golden curation.
  # NOTE(review): presumably a placeholder until a chunk is curated rather
  # than a deliberate negative case; confirm how the harness scores an
  # empty list.
  expected_chunks: []
- id: q06-mantell-crash
  question: 'Mantell chase Kentucky 1948'
  lang: en
  expected_chunks:
  - doc: doc-38-143685-box7-incident-summaries-1-100
    chunk: c1149
# ─── Release 02 docs ───────────────────────────────────────────────────────
- id: q07-sandia-1948-1950
  # Portuguese-language question (lang: pt).
  question: 'UAP reportado em Sandia Base entre 1948 e 1950'
  lang: pt
  # NOTE(review): this doc id starts with `dow-` while q09 below uses
  # `doe-`; confirm against the corpus that the prefix is intentional.
  expected_chunks:
  - doc: dow-uap-d017-general-correspondence-of-sandia
    chunk: c0001
- id: q08-pajarito-astronomers
  question: 'Pajarito astronomers invitation 1986 New Mexico'
  lang: en
  # Will fall back to text match; verified in section-5.
  expected_chunks:
  - doc: doc-65-hs1-834228961-62-hq-83894-section-5
    chunk: c0001
- id: q09-james-tuck-correspondence
  question: 'James Tuck Los Alamos correspondence flying saucers'
  lang: en
  expected_chunks:
  - doc: doe-uap-d002-jamestuck-correspondence
    chunk: c0600
# ─── COMETA + ODNI USPER + Apollo ──────────────────────────────────────────
- id: q10-cometa-report
  question: 'COMETA report extraterrestrial hypothesis French military'
  lang: en
  expected_chunks:
  - doc: doc-255-413270-ufo-s-and-defense-what-should-we-prepare-for
    chunk: c0024
- id: q11-apollo-17-flash
  question: 'Apollo 17 lunar surface flash Grimaldi'
  lang: en
  expected_chunks:
  - doc: nasa-uap-d2-apollo-17-transcript-1972
    chunk: c0057
- id: q12-usper-narrative
  question: 'USPER narrative senior USIC official 2025'
  lang: en
  expected_chunks:
  - doc: odni-uap-d001-usper-narrative-senior-usic
    chunk: c0001
# ─── Generic UFO physics + politics ────────────────────────────────────────
- id: q13-uss-nimitz-tic-tac
  question: 'Nimitz tic-tac 2004'
  lang: en
  # Negative case: not in corpus, expect zero hits OR low-confidence hits.
  expected_chunks: []
- id: q14-mj-12
  question: 'MJ-12 majestic twelve'
  lang: en
  # Negative case.
  expected_chunks: []
- id: q15-roswell
  question: 'Roswell New Mexico'
  lang: en
  expected_chunks:
  - doc: doc-65-hs1-834228961-62-hq-83894-section-1
    chunk: c0527