#!/usr/bin/env python3 """ 60_meili_index.py — Push documents + chunks into Meilisearch for autocomplete. W1 deliverable. Meilisearch is the typo-tolerant prefix-aware search engine in the stack; it complements Postgres BM25 + pgvector (used by the chat). The goal here is fast `/search` autocomplete that shows matching docs and chunks as the user types — sub-30ms. Indexes created: - documents id=doc_id, fields=[canonical_title, collection, doc_id] - chunks id=chunk_pk, fields=[doc_id, chunk_id, page, content_en, content_pt] Idempotent: re-running upserts. Skip `--reset` to rebuild from scratch. Run from inside the disclosure-internal network OR with --meili-url override. The default reads MEILI_MASTER_KEY + MEILISEARCH_URL from env. Usage: python3 scripts/maintain/60_meili_index.py python3 scripts/maintain/60_meili_index.py --reset python3 scripts/maintain/60_meili_index.py --doc-id """ from __future__ import annotations import argparse import json import os import sys from typing import Any try: import psycopg import requests except ImportError as e: sys.exit(f"pip install psycopg[binary] requests # missing: {e}") DATABASE_URL = os.getenv("DATABASE_URL") or os.getenv("SUPABASE_DB_URL") MEILI_URL = os.getenv("MEILISEARCH_URL", "http://meilisearch:7700") MEILI_KEY = os.getenv("MEILI_MASTER_KEY") or os.getenv("MEILISEARCH_API_KEY", "") BATCH = int(os.getenv("MEILI_BATCH", "1000")) def meili(method: str, path: str, body: Any = None) -> dict: headers = {"Authorization": f"Bearer {MEILI_KEY}", "Content-Type": "application/json"} r = requests.request(method, f"{MEILI_URL}{path}", headers=headers, data=json.dumps(body) if body is not None else None, timeout=120) r.raise_for_status() return r.json() if r.text else {} def ensure_index(uid: str, primary_key: str, searchable: list[str], filterable: list[str]): """Create the index if missing, then set settings.""" try: meili("POST", "/indexes", {"uid": uid, "primaryKey": primary_key}) print(f" created index {uid}") except requests.HTTPError as e: # 409 = already exists, OK. if e.response.status_code not in (400, 409): raise meili("PATCH", f"/indexes/{uid}/settings", { "searchableAttributes": searchable, "filterableAttributes": filterable, "displayedAttributes": ["*"], "rankingRules": ["words", "typo", "proximity", "attribute", "sort", "exactness"], "typoTolerance": {"enabled": True, "minWordSizeForTypos": {"oneTypo": 4, "twoTypos": 8}}, }) def push(uid: str, docs: list[dict]): if not docs: return meili("POST", f"/indexes/{uid}/documents", docs) def main() -> int: ap = argparse.ArgumentParser() ap.add_argument("--reset", action="store_true", help="Delete and recreate indexes") ap.add_argument("--doc-id", help="Reindex only one doc") args = ap.parse_args() if not DATABASE_URL: sys.exit("DATABASE_URL not set") if not MEILI_KEY: sys.exit("MEILI_MASTER_KEY not set") if args.reset and not args.doc_id: print("Resetting indexes...") for uid in ("documents", "chunks"): try: meili("DELETE", f"/indexes/{uid}") except requests.HTTPError: pass ensure_index("documents", "doc_id", searchable=["canonical_title", "collection", "doc_id"], filterable=["collection", "classification"]) ensure_index("chunks", "chunk_pk", searchable=["content_pt", "content_en", "doc_id", "chunk_id"], filterable=["doc_id", "type", "classification", "ufo_anomaly", "is_searchable"]) with psycopg.connect(DATABASE_URL) as conn, conn.cursor() as cur: # documents where_doc = "WHERE doc_id = %s" if args.doc_id else "" params = (args.doc_id,) if args.doc_id else () cur.execute(f""" SELECT doc_id, canonical_title, collection, classification FROM public.documents {where_doc} """, params) rows = cur.fetchall() docs = [{"doc_id": r[0], "canonical_title": r[1] or r[0], "collection": r[2] or "", "classification": r[3] or ""} for r in rows] print(f"documents → meili: {len(docs)}") for i in range(0, len(docs), BATCH): push("documents", docs[i:i+BATCH]) # chunks (only searchable ones — drops scaffolding noise) where_chunk = "WHERE c.is_searchable" + (" AND c.doc_id = %s" if args.doc_id else "") cur.execute(f""" SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.content_en, c.content_pt, c.classification, c.ufo_anomaly FROM public.chunks c {where_chunk} """, params) chunks: list[dict] = [] total = 0 for r in cur: chunks.append({ "chunk_pk": r[0], "doc_id": r[1], "chunk_id": r[2], "page": r[3], "type": r[4], "content_en": (r[5] or "")[:2000], "content_pt": (r[6] or "")[:2000], "classification": r[7] or "", "ufo_anomaly": bool(r[8]), "is_searchable": True, }) if len(chunks) >= BATCH: push("chunks", chunks) total += len(chunks) chunks = [] print(f" pushed {total} chunks...") if chunks: push("chunks", chunks) total += len(chunks) print(f"chunks → meili: {total}") print("\n✓ done. Indexer enqueued; meili processes asynchronously.") print(f" Verify: curl -H 'Authorization: Bearer ...' {MEILI_URL}/indexes/chunks/stats") return 0 if __name__ == "__main__": sys.exit(main())