disclosure-bureau/scripts/maintain/60_meili_index.py
Luiz Gustavo 55cac8a395
Some checks failed
CI / Web — typecheck + lint + build (push) Failing after 1m30s
CI / Scripts — Python smoke (push) Failing after 32s
CI / Web — npm audit (push) Failing after 37s
W0+W1+W1.2: security hardening, observability, autocomplete, glitchtip, forgejo CI
W0 — security hardening (5 fixes verified live on disclosure.top)
- middleware: gate /api/admin/* same as /admin/* (F1)
- imgproxy: tighten LOCAL_FILESYSTEM_ROOT from / to /var/lib/storage (F2)
- studio: real basic-auth label (bcrypt hash, middleware reference) (F3)
- relations: ENABLE ROW LEVEL SECURITY + public SELECT policy (F4)
- migration 0003: fold is_searchable + hybrid_search update into canonical (TD#2)

W1 — observability + resilience + autocomplete
- studio: HOSTNAME=0.0.0.0 so Next.js binds on loopback for healthcheck
- compose: PG_POOL_MAX=20, CLAUDE_CODE_OAUTH_TOKEN gated by separate env
- claude-code.ts: subprocess timeout configurable (CLAUDE_CODE_TIMEOUT_MS)
- openrouter.ts: retry with exponential backoff + Retry-After + in-memory
  circuit breaker (promotes FALLBACK after CB_THRESHOLD failures)
- lib/logger.ts: pino logger (NDJSON prod / pretty dev) + withRequest helper
- middleware: mints correlation_id, stamps x-correlation-id response header,
  emits structured http_request log per /api/* call
- messages/route.ts: switch to structured logger
- 60_meili_index.py: push documents + chunks into Meilisearch
- /api/search/autocomplete: parallel meili search (docs + chunks), 5-8ms p50
- search-autocomplete.tsx: debounced dropdown wired into search-panel

W1.2 — Glitchtip + Forgejo self-hosted
- compose: glitchtip-redis + glitchtip-web + glitchtip-worker (v4.2)
- compose: forgejo + forgejo-runner (server v9, runner v6) with group_add=988
- @sentry/nextjs SDK wired (instrumentation.ts + sentry.{client,server}.config.ts)
- /api/admin/throw smoke endpoint (gated by W0-F1 middleware)
- Synthetic event ingestion verified at glitchtip.disclosure.top
- forgejo.disclosure.top up, repo discadmin/disclosure-bureau created,
  runner registered (labels: ubuntu-latest, docker)
- .forgejo/workflows/ci.yml: typecheck + lint + build + npm audit + python
  syntax + compose validation

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 18:18:42 -03:00

151 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
60_meili_index.py — Push documents + chunks into Meilisearch for autocomplete.
W1 deliverable. Meilisearch is the typo-tolerant prefix-aware search engine in
the stack; it complements Postgres BM25 + pgvector (used by the chat). The
goal here is fast `/search` autocomplete that shows matching docs and chunks
as the user types — sub-30ms.
Indexes created:
- documents id=doc_id, fields=[canonical_title, collection, doc_id]
- chunks id=chunk_pk, fields=[doc_id, chunk_id, page, content_en, content_pt]
Idempotent: re-running upserts. Pass `--reset` to rebuild from scratch.
Run from inside the disclosure-internal network OR with --meili-url override.
The default reads MEILI_MASTER_KEY + MEILISEARCH_URL from env.
Usage:
python3 scripts/maintain/60_meili_index.py
python3 scripts/maintain/60_meili_index.py --reset
python3 scripts/maintain/60_meili_index.py --doc-id <id>
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from typing import Any
try:
import psycopg
import requests
except ImportError as e:
sys.exit(f"pip install psycopg[binary] requests # missing: {e}")
# Postgres connection string; DATABASE_URL wins over SUPABASE_DB_URL when both are set.
DATABASE_URL = os.getenv("DATABASE_URL") or os.getenv("SUPABASE_DB_URL")
# Base URL of the Meilisearch HTTP API; request paths are appended to it verbatim.
MEILI_URL = os.getenv("MEILISEARCH_URL", "http://meilisearch:7700")
# Key sent as a Bearer token; MEILI_MASTER_KEY wins over MEILISEARCH_API_KEY.
MEILI_KEY = os.getenv("MEILI_MASTER_KEY") or os.getenv("MEILISEARCH_API_KEY", "")
# Number of records per upload request. Clamped to at least 1: a zero value
# would make range(..., step=0) raise, and a negative value would make the
# batching loops upload nothing (documents) or one record per request (chunks).
BATCH = max(1, int(os.getenv("MEILI_BATCH", "1000")))
def meili(method: str, path: str, body: Any = None) -> dict:
    """Send one authenticated JSON request to Meilisearch.

    Args:
        method: HTTP verb, e.g. "POST", "PATCH", "DELETE".
        path: API path appended to MEILI_URL (expected to start with "/").
        body: JSON-serialisable payload; when None no request body is sent.

    Returns:
        The decoded JSON response, or an empty dict when the response has no
        body text.

    Raises:
        requests.HTTPError: for any 4xx/5xx response.
    """
    payload = None if body is None else json.dumps(body)
    response = requests.request(
        method,
        f"{MEILI_URL}{path}",
        headers={
            "Authorization": f"Bearer {MEILI_KEY}",
            "Content-Type": "application/json",
        },
        data=payload,
        timeout=120,
    )
    response.raise_for_status()
    if not response.text:
        return {}
    return response.json()
def _index_already_exists(err) -> bool:
    """Return True when an HTTPError from index creation means "already exists"."""
    response = err.response
    # NOTE: compare against None explicitly; a requests.Response is falsy for 4xx/5xx.
    if response is None:
        return False
    if response.status_code == 409:
        return True
    if response.status_code != 400:
        return False
    # A 400 is only benign when the server says the index exists; any other
    # 400 (e.g. an invalid uid or primary key) is a real error.
    try:
        payload = response.json()
    except ValueError:
        return False
    return isinstance(payload, dict) and payload.get("code") == "index_already_exists"


def ensure_index(uid: str, primary_key: str, searchable: list[str], filterable: list[str]):
    """Create the index if missing, then (re)apply its settings.

    Args:
        uid: Index identifier.
        primary_key: Name of the document field used as the primary key.
        searchable: Attributes to search in, in priority order.
        filterable: Attributes that may be used in filters.

    Raises:
        requests.HTTPError: if index creation fails for a reason other than
            the index already existing, or if the settings update fails.
    """
    try:
        meili("POST", "/indexes", {"uid": uid, "primaryKey": primary_key})
        print(f" created index {uid}")
    except requests.HTTPError as e:
        # Only "already exists" is safe to ignore; anything else must surface.
        if not _index_already_exists(e):
            raise
    # Settings are sent on every run so an existing index picks up changes.
    meili("PATCH", f"/indexes/{uid}/settings", {
        "searchableAttributes": searchable,
        "filterableAttributes": filterable,
        "displayedAttributes": ["*"],
        "rankingRules": ["words", "typo", "proximity", "attribute", "sort", "exactness"],
        "typoTolerance": {"enabled": True, "minWordSizeForTypos": {"oneTypo": 4, "twoTypos": 8}},
    })
def push(uid: str, docs: list[dict]):
    """Upload a batch of records to the given index; an empty batch is a no-op.

    Args:
        uid: Target index identifier.
        docs: Records to send in a single request.
    """
    if docs:
        meili("POST", f"/indexes/{uid}/documents", docs)
def main() -> int:
    """Copy documents and searchable chunks from Postgres into Meilisearch.

    Command-line flags:
        --reset      Delete both indexes before recreating them (ignored, with
                     a warning, when --doc-id is also given).
        --doc-id     Restrict the run to a single document and its chunks.
        --meili-url  Override the Meilisearch base URL for this run.

    Records are only added or replaced; nothing is removed from an index
    unless --reset is used.

    Returns:
        0 on success. Missing configuration terminates the process via
        ``sys.exit`` with an explanatory message.
    """
    # meili() reads MEILI_URL at call time, so rebinding it here is enough
    # for the --meili-url override to take effect everywhere.
    global MEILI_URL

    ap = argparse.ArgumentParser()
    ap.add_argument("--reset", action="store_true", help="Delete and recreate indexes")
    ap.add_argument("--doc-id", help="Reindex only one doc")
    ap.add_argument("--meili-url", default=MEILI_URL,
                    help="Meilisearch base URL (defaults to MEILISEARCH_URL from env)")
    args = ap.parse_args()
    # Paths passed to meili() start with "/", so drop any trailing slash.
    MEILI_URL = args.meili_url.rstrip("/")

    if not DATABASE_URL:
        sys.exit("DATABASE_URL not set")
    if not MEILI_KEY:
        sys.exit("MEILI_MASTER_KEY not set")

    # Guard locally so a zero/negative MEILI_BATCH cannot break the loops below.
    batch_size = max(1, BATCH)

    if args.reset and args.doc_id:
        print("warning: --reset ignored because --doc-id was given", file=sys.stderr)
    elif args.reset:
        print("Resetting indexes...")
        for uid in ("documents", "chunks"):
            try:
                meili("DELETE", f"/indexes/{uid}")
            except requests.HTTPError as err:
                # A missing index is fine; report anything else instead of
                # hiding it, but keep the reset best-effort.
                status = err.response.status_code if err.response is not None else None
                if status != 404:
                    print(f"warning: could not delete index {uid}: {err}", file=sys.stderr)

    ensure_index("documents", "doc_id",
                 searchable=["canonical_title", "collection", "doc_id"],
                 filterable=["collection", "classification"])
    ensure_index("chunks", "chunk_pk",
                 searchable=["content_pt", "content_en", "doc_id", "chunk_id"],
                 filterable=["doc_id", "type", "classification", "ufo_anomaly", "is_searchable"])

    with psycopg.connect(DATABASE_URL) as conn, conn.cursor() as cur:
        # The optional doc filter is a fixed SQL fragment; the value itself is
        # always passed as a bound parameter.
        params = (args.doc_id,) if args.doc_id else ()

        # documents
        where_doc = "WHERE doc_id = %s" if args.doc_id else ""
        cur.execute(f"""
            SELECT doc_id, canonical_title, collection, classification
            FROM public.documents {where_doc}
        """, params)
        docs = [
            {
                "doc_id": doc_id,
                # Fall back to the id so every document has a searchable title.
                "canonical_title": title or doc_id,
                "collection": collection or "",
                "classification": classification or "",
            }
            for doc_id, title, collection, classification in cur.fetchall()
        ]
        print(f"documents → meili: {len(docs)}")
        for start in range(0, len(docs), batch_size):
            push("documents", docs[start:start + batch_size])

        # chunks (only searchable ones — drops scaffolding noise)
        where_chunk = "WHERE c.is_searchable" + (" AND c.doc_id = %s" if args.doc_id else "")
        cur.execute(f"""
            SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type,
                   c.content_en, c.content_pt, c.classification, c.ufo_anomaly
            FROM public.chunks c
            {where_chunk}
        """, params)
        pending: list[dict] = []
        total = 0
        for (chunk_pk, doc_id, chunk_id, page, chunk_type,
             content_en, content_pt, classification, ufo_anomaly) in cur:
            pending.append({
                "chunk_pk": chunk_pk,
                "doc_id": doc_id,
                "chunk_id": chunk_id,
                "page": page,
                "type": chunk_type,
                # Text is capped at 2000 characters per language.
                "content_en": (content_en or "")[:2000],
                "content_pt": (content_pt or "")[:2000],
                "classification": classification or "",
                "ufo_anomaly": bool(ufo_anomaly),
                # The query already filtered on is_searchable.
                "is_searchable": True,
            })
            if len(pending) >= batch_size:
                push("chunks", pending)
                total += len(pending)
                pending = []
                print(f" pushed {total} chunks...")
        if pending:
            push("chunks", pending)
            total += len(pending)
        print(f"chunks → meili: {total}")

    print("\n✓ done. Indexer enqueued; meili processes asynchronously.")
    print(f" Verify: curl -H 'Authorization: Bearer ...' {MEILI_URL}/indexes/chunks/stats")
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())