W0 — security hardening (5 fixes verified live on disclosure.top)
- middleware: gate /api/admin/* same as /admin/* (F1)
- imgproxy: tighten LOCAL_FILESYSTEM_ROOT from / to /var/lib/storage (F2)
- studio: real basic-auth label (bcrypt hash, middleware reference) (F3)
- relations: ENABLE ROW LEVEL SECURITY + public SELECT policy (F4)
- migration 0003: fold is_searchable + hybrid_search update into canonical (TD#2)
W1 — observability + resilience + autocomplete
- studio: HOSTNAME=0.0.0.0 so Next.js binds on all interfaces (incl. loopback) for healthcheck
- compose: PG_POOL_MAX=20, CLAUDE_CODE_OAUTH_TOKEN gated by separate env
- claude-code.ts: subprocess timeout configurable (CLAUDE_CODE_TIMEOUT_MS)
- openrouter.ts: retry with exponential backoff + Retry-After + in-memory
circuit breaker (promotes FALLBACK after CB_THRESHOLD failures)
- lib/logger.ts: pino logger (NDJSON prod / pretty dev) + withRequest helper
- middleware: mints correlation_id, stamps x-correlation-id response header,
emits structured http_request log per /api/* call
- messages/route.ts: switch to structured logger
- 60_meili_index.py: push documents + chunks into Meilisearch
- /api/search/autocomplete: parallel meili search (docs + chunks), 5-8ms p50
- search-autocomplete.tsx: debounced dropdown wired into search-panel
W1.2 — Glitchtip + Forgejo self-hosted
- compose: glitchtip-redis + glitchtip-web + glitchtip-worker (v4.2)
- compose: forgejo + forgejo-runner (server v9, runner v6) with group_add=988
- @sentry/nextjs SDK wired (instrumentation.ts + sentry.{client,server}.config.ts)
- /api/admin/throw smoke endpoint (gated by W0-F1 middleware)
- Synthetic event ingestion verified at glitchtip.disclosure.top
- forgejo.disclosure.top up, repo discadmin/disclosure-bureau created,
runner registered (labels: ubuntu-latest, docker)
- .forgejo/workflows/ci.yml: typecheck + lint + build + npm audit + python
syntax + compose validation
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
151 lines
5.8 KiB
Python
151 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
60_meili_index.py — Push documents + chunks into Meilisearch for autocomplete.
|
|
|
|
W1 deliverable. Meilisearch is the typo-tolerant prefix-aware search engine in
|
|
the stack; it complements Postgres BM25 + pgvector (used by the chat). The
|
|
goal here is fast `/search` autocomplete that shows matching docs and chunks
|
|
as the user types — sub-30ms.
|
|
|
|
Indexes created:
|
|
- documents id=doc_id, fields=[canonical_title, collection, doc_id]
|
|
- chunks id=chunk_pk, fields=[doc_id, chunk_id, page, content_en, content_pt]
|
|
|
|
Idempotent: re-running upserts. Pass `--reset` to rebuild from scratch.
|
|
|
|
Run from inside the disclosure-internal network OR override MEILISEARCH_URL in the environment.
|
|
The default reads MEILI_MASTER_KEY + MEILISEARCH_URL from env.
|
|
|
|
Usage:
|
|
python3 scripts/maintain/60_meili_index.py
|
|
python3 scripts/maintain/60_meili_index.py --reset
|
|
python3 scripts/maintain/60_meili_index.py --doc-id <id>
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
from typing import Any
|
|
|
|
try:
|
|
import psycopg
|
|
import requests
|
|
except ImportError as e:
|
|
sys.exit(f"pip install psycopg[binary] requests # missing: {e}")
|
|
|
|
# Postgres connection string; SUPABASE_DB_URL is accepted as a fallback name.
DATABASE_URL = os.getenv("DATABASE_URL") or os.getenv("SUPABASE_DB_URL")
# Base URL of the Meilisearch HTTP API (no trailing slash; paths are appended as-is).
MEILI_URL = os.getenv("MEILISEARCH_URL", "http://meilisearch:7700")
# Bearer token sent with every Meilisearch request; empty string means "not configured".
MEILI_KEY = os.getenv("MEILI_MASTER_KEY") or os.getenv("MEILISEARCH_API_KEY", "")
# Number of records per upload request. Clamped to >= 1: a value of 0 would make
# range(..., step=0) raise, and a negative value would silently upload nothing.
BATCH = max(1, int(os.getenv("MEILI_BATCH", "1000")))
|
|
|
|
|
|
def meili(method: str, path: str, body: Any = None) -> dict:
    """Send one authenticated JSON request to Meilisearch and decode the reply.

    Args:
        method: HTTP verb passed straight to ``requests.request``.
        path: URL path appended verbatim to ``MEILI_URL``.
        body: JSON-serialisable payload; no request body is sent when it is None.

    Returns:
        The parsed JSON response, or an empty dict when the response text is empty.

    Raises:
        requests.HTTPError: via ``raise_for_status`` on an error status.
    """
    payload = None if body is None else json.dumps(body)
    response = requests.request(
        method,
        f"{MEILI_URL}{path}",
        headers={
            "Authorization": f"Bearer {MEILI_KEY}",
            "Content-Type": "application/json",
        },
        data=payload,
        timeout=120,
    )
    response.raise_for_status()
    if not response.text:
        return {}
    return response.json()
|
|
|
|
|
|
def _meili_error_code(response) -> str:
    """Return the error code carried by a Meilisearch error response, or "" if unreadable."""
    try:
        payload = response.json()
    except ValueError:
        return ""
    if not isinstance(payload, dict):
        return ""
    # NOTE(review): "code" is the documented field; "errorCode" is checked as well
    # on the assumption that older server releases used that name — confirm.
    return payload.get("code") or payload.get("errorCode") or ""


def ensure_index(uid: str, primary_key: str, searchable: list[str], filterable: list[str]) -> None:
    """Create the index if missing, then (re)apply its settings.

    Args:
        uid: Index identifier.
        primary_key: Name of the document field used as primary key.
        searchable: Value sent as ``searchableAttributes``.
        filterable: Value sent as ``filterableAttributes``.

    Raises:
        requests.HTTPError: when index creation fails for a reason other than
            "already exists", or when the settings update is rejected.
    """
    try:
        meili("POST", "/indexes", {"uid": uid, "primaryKey": primary_key})
        print(f" created index {uid}")
    except requests.HTTPError as e:
        status = e.response.status_code
        # 409 = already exists, OK. A 400 is tolerated only when the server says
        # the same thing through its error code; any other 400 (e.g. an invalid
        # uid) is a real failure and must not be hidden.
        already_exists = status == 409 or (
            status == 400 and _meili_error_code(e.response) == "index_already_exists"
        )
        if not already_exists:
            raise
    meili("PATCH", f"/indexes/{uid}/settings", {
        "searchableAttributes": searchable,
        "filterableAttributes": filterable,
        "displayedAttributes": ["*"],
        "rankingRules": ["words", "typo", "proximity", "attribute", "sort", "exactness"],
        "typoTolerance": {"enabled": True, "minWordSizeForTypos": {"oneTypo": 4, "twoTypos": 8}},
    })
|
|
|
|
|
|
def push(uid: str, docs: list[dict]):
    """POST ``docs`` to the ``/documents`` endpoint of index ``uid``.

    An empty batch is a no-op: no request is sent.
    """
    if docs:
        meili("POST", f"/indexes/{uid}/documents", docs)
|
|
|
|
|
|
def _reset_indexes() -> None:
    """Ask Meilisearch to delete both indexes so they can be rebuilt from scratch."""
    print("Resetting indexes...")
    for uid in ("documents", "chunks"):
        try:
            meili("DELETE", f"/indexes/{uid}")
        except requests.HTTPError as e:
            # A missing index (404) is expected on a first run. Anything else is
            # reported instead of silently dropped, but stays non-fatal so the
            # reset remains best-effort.
            status = e.response.status_code if e.response is not None else None
            if status != 404:
                print(f" warning: could not delete index {uid}: {e}", file=sys.stderr)


def _index_documents(cur, doc_id: str | None, batch_size: int) -> int:
    """Upload rows of public.documents (optionally one doc) and return the row count."""
    where_doc = "WHERE doc_id = %s" if doc_id else ""
    params = (doc_id,) if doc_id else ()
    # Only a fixed SQL fragment is interpolated; the doc_id value is bound as a parameter.
    cur.execute(f"""
        SELECT doc_id, canonical_title, collection, classification
        FROM public.documents {where_doc}
    """, params)
    docs = [
        {
            "doc_id": row[0],
            "canonical_title": row[1] or row[0],  # fall back to the id when untitled
            "collection": row[2] or "",
            "classification": row[3] or "",
        }
        for row in cur.fetchall()
    ]
    print(f"documents → meili: {len(docs)}")
    for start in range(0, len(docs), batch_size):
        push("documents", docs[start:start + batch_size])
    return len(docs)


def _index_chunks(cur, doc_id: str | None, batch_size: int) -> int:
    """Upload searchable rows of public.chunks in batches and return the row count."""
    # chunks (only searchable ones — drops scaffolding noise)
    where_chunk = "WHERE c.is_searchable" + (" AND c.doc_id = %s" if doc_id else "")
    params = (doc_id,) if doc_id else ()
    cur.execute(f"""
        SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type,
               c.content_en, c.content_pt, c.classification, c.ufo_anomaly
        FROM public.chunks c
        {where_chunk}
    """, params)
    pending: list[dict] = []
    total = 0
    for row in cur:
        pending.append({
            "chunk_pk": row[0],
            "doc_id": row[1],
            "chunk_id": row[2],
            "page": row[3],
            "type": row[4],
            # Text is truncated to 2000 characters per language before upload.
            "content_en": (row[5] or "")[:2000],
            "content_pt": (row[6] or "")[:2000],
            "classification": row[7] or "",
            "ufo_anomaly": bool(row[8]),
            # Always True here: the query already filters on c.is_searchable.
            "is_searchable": True,
        })
        if len(pending) >= batch_size:
            push("chunks", pending)
            total += len(pending)
            pending = []
            print(f" pushed {total} chunks...")
    if pending:
        push("chunks", pending)
        total += len(pending)
    print(f"chunks → meili: {total}")
    return total


def main() -> int:
    """Command-line entry point: sync documents and chunks from Postgres into Meilisearch.

    Flags:
        --reset: delete both indexes before re-creating them (ignored, with a
            warning, when --doc-id is also given so a single-doc run can never
            wipe the whole index).
        --doc-id: restrict the sync to one document and its chunks.

    Returns:
        0 on success. Exits early via ``sys.exit`` when DATABASE_URL or the
        Meilisearch key is not configured.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--reset", action="store_true", help="Delete and recreate indexes")
    ap.add_argument("--doc-id", help="Reindex only one doc")
    args = ap.parse_args()

    if not DATABASE_URL:
        sys.exit("DATABASE_URL not set")
    if not MEILI_KEY:
        sys.exit("MEILI_MASTER_KEY not set")

    if args.reset:
        if args.doc_id:
            print("warning: --reset ignored because --doc-id was given", file=sys.stderr)
        else:
            _reset_indexes()

    ensure_index("documents", "doc_id",
                 searchable=["canonical_title", "collection", "doc_id"],
                 filterable=["collection", "classification"])
    ensure_index("chunks", "chunk_pk",
                 searchable=["content_pt", "content_en", "doc_id", "chunk_id"],
                 filterable=["doc_id", "type", "classification", "ufo_anomaly", "is_searchable"])

    # Guard against a zero/negative batch size, which would break range() stepping
    # or silently upload nothing.
    batch_size = max(1, BATCH)

    with psycopg.connect(DATABASE_URL) as conn, conn.cursor() as cur:
        _index_documents(cur, args.doc_id, batch_size)
        _index_chunks(cur, args.doc_id, batch_size)

    print("\n✓ done. Indexer enqueued; meili processes asynchronously.")
    print(f" Verify: curl -H 'Authorization: Bearer ...' {MEILI_URL}/indexes/chunks/stats")
    return 0
|
|
|
|
|
|
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|