---
# Golden retrieval set — Disclosure Bureau RAG eval
#
# Each entry is a question paired with the chunks that MUST appear in the
# top-K results from `hybrid_search_chunks`. The harness in run.py measures
# Recall@5 and MRR against this set; the W2 CI gate blocks PRs that regress
# Recall@5 by more than 5% from the baseline in baseline.json.
#
# Queries are curated by hand from real chat usage + document content; each
# expected chunk_id is verified to exist in raw/--subagent/ and to
# contain prose answering the question.
#
# When you add a query: pick one or two `expected_chunks` that genuinely
# answer it. Don't over-stuff — Recall@5 with 10 expected chunks is meaningless.
#
# Entry fields:
#   id               slug identifying the query (`qNN-short-name`)
#   question         query text, written in the language given by `lang`
#   lang             language code of the question (`en` or `pt` in this set)
#   expected_chunks  list of `doc` + `chunk` pairs; an empty list (`[]`) marks
#                    an entry with no golden chunk — see the note on each one

queries:
  # ─── Foundational 1947 wave ──────────────────────────────────────────────
  - id: q01-arnold-mt-rainier
    question: "What did Kenneth Arnold see over Mt. Rainier in June 1947?"
    lang: en
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-2
        chunk: c0122
      - doc: doc-65-hs1-834228961-62-hq-83894-section-2
        chunk: c0123

  - id: q02-maury-island-hoax
    question: "Quem foi Harold Dahl no caso Maury Island e qual foi a admissão dele?"
    lang: pt
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-2
        chunk: c0097

  - id: q03-rhodes-phoenix-photo
    question: "William Rhodes Phoenix flying disc photograph"
    lang: en
    # expected_chunks calibrated against live disclosure.top response
    # (top-1 hit at the time of the W2 baseline). Refine when content moves.
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-1
        chunk: c1279

  # ─── 1948–1950 incident summaries ────────────────────────────────────────
  - id: q04-chiles-whitted
    question: "Chiles Whitted Eastern Air Lines cigar shaped object"
    lang: en
    expected_chunks:
      - doc: doc-38-143685-box7-incident-summaries-101-172
        chunk: c2122

  - id: q05-gorman-dogfight
    question: "Gorman dogfight Fargo North Dakota"
    lang: en
    # currently 0 hits on prod — flag for golden curation
    expected_chunks: []

  - id: q06-mantell-crash
    question: "Mantell chase Kentucky 1948"
    lang: en
    expected_chunks:
      - doc: doc-38-143685-box7-incident-summaries-1-100
        chunk: c1149

  # ─── Release 02 docs ─────────────────────────────────────────────────────
  - id: q07-sandia-1948-1950
    question: "UAP reportado em Sandia Base entre 1948 e 1950"
    lang: pt
    expected_chunks:
      - doc: dow-uap-d017-general-correspondence-of-sandia
        chunk: c0001

  - id: q08-pajarito-astronomers
    question: "Pajarito astronomers invitation 1986 New Mexico"
    lang: en
    expected_chunks:
      # will fall back to text match; verified in section-5
      - doc: doc-65-hs1-834228961-62-hq-83894-section-5
        chunk: c0001

  - id: q09-james-tuck-correspondence
    question: "James Tuck Los Alamos correspondence flying saucers"
    lang: en
    expected_chunks:
      - doc: doe-uap-d002-jamestuck-correspondence
        chunk: c0600

  # ─── COMETA + ODNI USPER + Apollo ────────────────────────────────────────
  - id: q10-cometa-report
    question: "COMETA report extraterrestrial hypothesis French military"
    lang: en
    expected_chunks:
      - doc: doc-255-413270-ufo-s-and-defense-what-should-we-prepare-for
        chunk: c0024

  - id: q11-apollo-17-flash
    question: "Apollo 17 lunar surface flash Grimaldi"
    lang: en
    expected_chunks:
      - doc: nasa-uap-d2-apollo-17-transcript-1972
        chunk: c0057

  - id: q12-usper-narrative
    question: "USPER narrative senior USIC official 2025"
    lang: en
    expected_chunks:
      - doc: odni-uap-d001-usper-narrative-senior-usic
        chunk: c0001

  # ─── Generic UFO physics + politics ──────────────────────────────────────
  - id: q13-uss-nimitz-tic-tac
    question: "Nimitz tic-tac 2004"
    lang: en
    # negative: not in corpus, expect zero hits OR low-conf
    expected_chunks: []

  - id: q14-mj-12
    question: "MJ-12 majestic twelve"
    lang: en
    # negative
    expected_chunks: []

  - id: q15-roswell
    question: "Roswell New Mexico"
    lang: en
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-1
        chunk: c0527