# disclosure-bureau/tests/rag/golden.yaml

# Golden retrieval set — Disclosure Bureau RAG eval
#
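# Each entry is a question paired with the chunks that MUST appear in the
# top-K results from `hybrid_search_chunks`. The harness in run.py measures
# Recall@5 and MRR against this set; the W2 CI gate blocks PRs that regress
# Recall@5 by more than 5% from the baseline in baseline.json.
#
# An entry with `expected_chunks: []` is either a negative query (nothing in
# the corpus should match) or one still awaiting curation; the inline comment
# on that entry says which.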
#
# Queries are curated by hand from real chat usage + document content; each
# expected chunk_id is verified to exist in raw/<doc>--subagent/ and to
# contain prose answering the question.
#
# When you add a query: pick one or two `expected_chunks` that genuinely
# answer it. Don't over-stuff — Recall@5 with 10 expected chunks is meaningless.

# Entry fields, as used by every item below:
#   id              — slug of the form `qNN-<topic>`
#   question        — the query text
#   lang            — language of `question` (`en` or `pt` in this file)
#   expected_chunks — list of `doc` / `chunk` pairs; may be empty (`[]`)
queries:

  # ─── Foundational 1947 wave ────────────────────────────────────────────────
  # q01 expects two consecutive chunk ids from the same document.
  - id: q01-arnold-mt-rainier
    question: "What did Kenneth Arnold see over Mt. Rainier in June 1947?"
    lang: en
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-2
        chunk: c0122
      - doc: doc-65-hs1-834228961-62-hq-83894-section-2
        chunk: c0123

  # Portuguese-language query (lang: pt).
  - id: q02-maury-island-hoax
    question: "Quem foi Harold Dahl no caso Maury Island e qual foi a admissão dele?"
    lang: pt
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-2
        chunk: c0097

  - id: q03-rhodes-phoenix-photo
    question: "William Rhodes Phoenix flying disc photograph"
    lang: en
    # expected_chunks calibrated against live disclosure.top response
    # (top-1 hit at the time of the W2 baseline). Refine when content moves.
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-1
        chunk: c1279

  # ─── 1948–1950 incident summaries ──────────────────────────────────────────
  - id: q04-chiles-whitted
    question: "Chiles Whitted Eastern Air Lines cigar shaped object"
    lang: en
    expected_chunks:
      - doc: doc-38-143685-box7-incident-summaries-101-172
        chunk: c2122

  # TODO(golden-curation): no expected chunk has been picked for this query.
  # NOTE(review): the list looks empty because curation is pending, not
  # because this is a negative control — confirm and fill in.
  - id: q05-gorman-dogfight
    question: "Gorman dogfight Fargo North Dakota"
    lang: en
    expected_chunks: []  # currently 0 hits on prod — flag for golden curation

  - id: q06-mantell-crash
    question: "Mantell chase Kentucky 1948"
    lang: en
    expected_chunks:
      - doc: doc-38-143685-box7-incident-summaries-1-100
        chunk: c1149

  # ─── Release 02 docs ───────────────────────────────────────────────────────
  # Portuguese-language query (lang: pt).
  - id: q07-sandia-1948-1950
    question: "UAP reportado em Sandia Base entre 1948 e 1950"
    lang: pt
    expected_chunks:
      - doc: dow-uap-d017-general-correspondence-of-sandia
        chunk: c0001

  - id: q08-pajarito-astronomers
    question: "Pajarito astronomers invitation 1986 New Mexico"
    lang: en
    # c0001 will fall back to text match; verified in section-5.
    # NOTE(review): this entry points at a doc-65-hs1 document while the
    # neighbouring entries in this section use dow-/doe- ids — confirm the
    # doc id is intended.
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-5
        chunk: c0001

  - id: q09-james-tuck-correspondence
    question: "James Tuck Los Alamos correspondence flying saucers"
    lang: en
    expected_chunks:
      - doc: doe-uap-d002-jamestuck-correspondence
        chunk: c0600

  # ─── COMETA + ODNI USPER + Apollo ──────────────────────────────────────────
  # Each entry in this group expects exactly one chunk.
  - id: q10-cometa-report
    question: "COMETA report extraterrestrial hypothesis French military"
    lang: en
    expected_chunks:
      - doc: doc-255-413270-ufo-s-and-defense-what-should-we-prepare-for
        chunk: c0024

  - id: q11-apollo-17-flash
    question: "Apollo 17 lunar surface flash Grimaldi"
    lang: en
    expected_chunks:
      - doc: nasa-uap-d2-apollo-17-transcript-1972
        chunk: c0057

  - id: q12-usper-narrative
    question: "USPER narrative senior USIC official 2025"
    lang: en
    expected_chunks:
      - doc: odni-uap-d001-usper-narrative-senior-usic
        chunk: c0001

  # ─── Generic UFO physics + politics ────────────────────────────────────────
  # q13 and q14 are negative controls: their expected_chunks lists are
  # intentionally empty.
  - id: q13-uss-nimitz-tic-tac
    question: "Nimitz tic-tac 2004"
    lang: en
    expected_chunks: []  # negative: not in corpus, expect zero hits OR low-conf

  - id: q14-mj-12
    question: "MJ-12 majestic twelve"
    lang: en
    expected_chunks: []  # negative

  - id: q15-roswell
    question: "Roswell New Mexico"
    lang: en
    expected_chunks:
      - doc: doc-65-hs1-834228961-62-hq-83894-section-1
        chunk: c0527