disclosure-bureau/infra/supabase/migrations/0003_w0_hardening.sql
Luiz Gustavo 55cac8a395
Some checks failed
CI / Web — typecheck + lint + build (push) Failing after 1m30s
CI / Scripts — Python smoke (push) Failing after 32s
CI / Web — npm audit (push) Failing after 37s
W0+W1+W1.2: security hardening, observability, autocomplete, glitchtip, forgejo CI
W0 — security hardening (5 fixes verified live on disclosure.top)
- middleware: gate /api/admin/* same as /admin/* (F1)
- imgproxy: tighten LOCAL_FILESYSTEM_ROOT from / to /var/lib/storage (F2)
- studio: real basic-auth label (bcrypt hash, middleware reference) (F3)
- relations: ENABLE ROW LEVEL SECURITY + public SELECT policy (F4)
- migration 0003: fold is_searchable + hybrid_search update into canonical (TD#2)

W1 — observability + resilience + autocomplete
- studio: HOSTNAME=0.0.0.0 so Next.js binds on all interfaces (loopback included) for the healthcheck
- compose: PG_POOL_MAX=20, CLAUDE_CODE_OAUTH_TOKEN gated by separate env
- claude-code.ts: subprocess timeout configurable (CLAUDE_CODE_TIMEOUT_MS)
- openrouter.ts: retry with exponential backoff + Retry-After + in-memory
  circuit breaker (promotes FALLBACK after CB_THRESHOLD failures)
- lib/logger.ts: pino logger (NDJSON prod / pretty dev) + withRequest helper
- middleware: mints correlation_id, stamps x-correlation-id response header,
  emits structured http_request log per /api/* call
- messages/route.ts: switch to structured logger
- 60_meili_index.py: push documents + chunks into Meilisearch
- /api/search/autocomplete: parallel meili search (docs + chunks), 5-8ms p50
- search-autocomplete.tsx: debounced dropdown wired into search-panel

W1.2 — Glitchtip + Forgejo self-hosted
- compose: glitchtip-redis + glitchtip-web + glitchtip-worker (v4.2)
- compose: forgejo + forgejo-runner (server v9, runner v6) with group_add=988
- @sentry/nextjs SDK wired (instrumentation.ts + sentry.{client,server}.config.ts)
- /api/admin/throw smoke endpoint (gated by W0-F1 middleware)
- Synthetic event ingestion verified at glitchtip.disclosure.top
- forgejo.disclosure.top up, repo discadmin/disclosure-bureau created,
  runner registered (labels: ubuntu-latest, docker)
- .forgejo/workflows/ci.yml: typecheck + lint + build + npm audit + python
  syntax + compose validation

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 18:18:42 -03:00

172 lines
6.6 KiB
PL/PgSQL

-- 0003_w0_hardening.sql
--
-- W0 hardening migration. Folds two ad-hoc maintenance scripts into the
-- canonical migration stream so a clean install on a fresh VPS produces a
-- secured, fully-searchable database without any post-bootstrap scripts.
--
-- F4 — RLS on public.relations (drift vs every other public.* table).
-- TD#2 — is_searchable column + reclassification + partial index, AND the
-- updated hybrid_search_chunks() that honors it. (Previously lived
-- in scripts/maintain/47_mark_unsearchable_chunks.sql + 48_*.sql.)
--
-- Idempotent. Safe to re-run.
BEGIN;

-- Ownership note: public.chunks, public.entities and public.relations belong
-- to `supabase_admin`, not `postgres`. The row-level-security statements in
-- this file are owner-level DDL, so run the migration as the owning role:
--
--   docker exec -i disclosure-db psql -U supabase_admin < 0003_w0_hardening.sql
--
-- `supabase_admin` authenticates via socket trust inside the local container.
-- NOTE(review): a genuine superuser bypasses ownership checks, so if `postgres`
-- is rejected here it is presumably not a superuser in this image — confirm.

-- ─────────────────────────────────────────────────────────────────────────
-- F4 · RLS on public.relations
-- ─────────────────────────────────────────────────────────────────────────

-- Recreate the read policy from scratch so a re-run never trips over an
-- existing policy of the same name. PERMISSIVE / TO PUBLIC are the CREATE
-- POLICY defaults, written out here so the intent is visible: every role may
-- read every row.
DROP POLICY IF EXISTS relations_read ON public.relations;
CREATE POLICY relations_read
    ON public.relations
    AS PERMISSIVE
    FOR SELECT
    TO PUBLIC
    USING (true);

-- Turn enforcement on; without a matching policy, non-owner roles see no rows.
ALTER TABLE public.relations
    ENABLE ROW LEVEL SECURITY;

-- Table-level privilege for the API roles (RLS filters rows, it does not
-- grant access by itself).
GRANT SELECT
    ON public.relations
    TO anon, authenticated;
-- ─────────────────────────────────────────────────────────────────────────
-- TD#2 · is_searchable column + reclassification + partial index
-- ─────────────────────────────────────────────────────────────────────────
-- Add the flag. NOT NULL DEFAULT TRUE: every existing and future chunk is
-- searchable unless the reclassification below decides otherwise.
ALTER TABLE public.chunks
    ADD COLUMN IF NOT EXISTS is_searchable BOOLEAN NOT NULL DEFAULT TRUE;

-- Reclassify in a single pass.
--
-- The desired flag is computed for every chunk, but only rows whose stored
-- value differs are written. A re-run on an already-classified table therefore
-- updates zero rows instead of rewriting the whole table three times.
--
-- Rules (a chunk is searchable unless one of these matches):
--   1. Pure layout/noise types are never searchable.
--   2. Structural types (headings, form fields, routing blocks, ...) are not
--      searchable when their text is shorter than 50 characters; English
--      content is measured first, Portuguese as fallback, '' if both are NULL.
-- A NULL `type` matches neither rule and stays searchable (CASE falls to ELSE).
--
-- Assumes chunk_pk uniquely identifies a chunk (hybrid_search_chunks joins on
-- it the same way) — confirm it is the table's primary key.
UPDATE public.chunks AS c
SET is_searchable = d.wanted
FROM (
    SELECT
        src.chunk_pk,
        CASE
            WHEN src.type IN (
                'page_number',
                'blank',
                'stamp',
                'classification_banner',
                'classification_marking'
            ) THEN FALSE
            WHEN src.type IN (
                'salutation',
                'complimentary_close',
                'section_heading',
                'section_header',
                'heading',
                'title',
                'subtitle',
                'date_line',
                'bulleted_item',
                'field_value',
                'field_entry',
                'table_marker',
                'form_field',
                'form_header',
                'routing_block',
                'distribution_list',
                'file_number',
                'marginalia'
            )
            AND LENGTH(COALESCE(src.content_en, src.content_pt, '')) < 50 THEN FALSE
            ELSE TRUE
        END AS wanted
    FROM public.chunks AS src
) AS d
WHERE d.chunk_pk = c.chunk_pk
  AND c.is_searchable <> d.wanted;

-- Partial index covering only the searchable chunks.
CREATE INDEX IF NOT EXISTS chunks_searchable_idx
    ON public.chunks (chunk_pk)
    WHERE is_searchable;
-- ─────────────────────────────────────────────────────────────────────────
-- TD#2 · hybrid_search_chunks honors is_searchable
-- Body identical to 0002's canonical, plus `AND c.is_searchable` in both
-- the bm25 and dense CTEs. Replaces the function in place.
-- ─────────────────────────────────────────────────────────────────────────
-- Drop both known earlier signatures (9-arg and 10-arg) so exactly one
-- definition of hybrid_search_chunks exists after this migration.
DROP FUNCTION IF EXISTS public.hybrid_search_chunks(TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT);
DROP FUNCTION IF EXISTS public.hybrid_search_chunks(TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT, DOUBLE PRECISION);

-- hybrid_search_chunks: lexical + vector search over public.chunks, fused with
-- reciprocal-rank scoring. Only chunks with is_searchable = TRUE are candidates.
--
-- Parameters
--   q_text            free-text query, parsed with websearch_to_tsquery.
--   q_embedding       query embedding compared against chunks.embedding.
--   q_lang            'en' uses the English text-search config and ts_en;
--                     any other value (including the default 'pt') uses the
--                     Portuguese config and ts_pt.
--   q_doc_id          optional exact filter on doc_id (NULL = no filter).
--   q_type            optional exact filter on type (NULL = no filter).
--   q_classification  optional exact filter on classification (NULL = no filter).
--   q_ufo_only        when TRUE, keep only chunks with ufo_anomaly = TRUE.
--   k                 cap on candidates per branch and on returned rows; k + 1
--                     is also the stand-in rank for a branch that missed a chunk.
--   rrf_k             constant added to each rank in the fusion formula.
--   max_dense_dist    exclusive upper bound on the `<=>` distance for the
--                     vector branch.
--
-- Returns one row per fused chunk, best score first:
--   score       1 / (rrf_k + bm25 rank) + 1 / (rrf_k + dense rank).
--   bm25_rank   rank in the lexical branch, NULL if the chunk was not in it.
--   dense_rank  rank in the vector branch, NULL if the chunk was not in it.
CREATE OR REPLACE FUNCTION public.hybrid_search_chunks(
    q_text           TEXT,
    q_embedding      vector(1024),
    q_lang           TEXT DEFAULT 'pt',
    q_doc_id         TEXT DEFAULT NULL,
    q_type           TEXT DEFAULT NULL,
    q_classification TEXT DEFAULT NULL,
    q_ufo_only       BOOLEAN DEFAULT FALSE,
    k                INT DEFAULT 100,
    rrf_k            INT DEFAULT 60,
    max_dense_dist   DOUBLE PRECISION DEFAULT 0.40
)
RETURNS TABLE (
    chunk_pk       BIGINT,
    doc_id         TEXT,
    chunk_id       TEXT,
    page           INT,
    type           TEXT,
    bbox           JSONB,
    content_en     TEXT,
    content_pt     TEXT,
    classification TEXT,
    score          DOUBLE PRECISION,
    bm25_rank      INT,
    dense_rank     INT
)
LANGUAGE plpgsql STABLE AS $$
BEGIN
    RETURN QUERY
    WITH
    -- Parsed query, built once with the config matching q_lang.
    ts_q AS (
        SELECT CASE WHEN q_lang = 'en'
                    THEN websearch_to_tsquery('public.en_unaccent'::regconfig, q_text)
                    ELSE websearch_to_tsquery('public.pt_unaccent'::regconfig, q_text)
               END AS q
    ),
    -- Lexical branch: rank matching chunks by ts_rank_cd, keep the best k.
    bm25 AS (
        SELECT c.chunk_pk,
               row_number() OVER (
                   ORDER BY
                       ts_rank_cd(
                           CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END,
                           (SELECT q FROM ts_q)
                       ) DESC NULLS LAST,
                       c.chunk_pk          -- stable order among equal ranks
               )::INT AS r
        FROM public.chunks c
        WHERE c.is_searchable
          AND (CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END) @@ (SELECT q FROM ts_q)
          AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
          AND (q_type IS NULL OR c.type = q_type)
          AND (q_classification IS NULL OR c.classification = q_classification)
          AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
        -- LIMIT without ORDER BY may keep any k rows; order by rank so the
        -- best k are the ones kept, as the dense branch already does.
        ORDER BY r
        LIMIT k
    ),
    -- Vector branch: nearest chunks under the distance threshold, best k.
    dense AS (
        SELECT c.chunk_pk,
               row_number() OVER (ORDER BY c.embedding <=> q_embedding)::INT AS r
        FROM public.chunks c
        WHERE c.is_searchable
          AND c.embedding IS NOT NULL
          AND (c.embedding <=> q_embedding) < max_dense_dist
          AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
          AND (q_type IS NULL OR c.type = q_type)
          AND (q_classification IS NULL OR c.classification = q_classification)
          AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
        ORDER BY c.embedding <=> q_embedding
        LIMIT k
    ),
    -- Fusion: a chunk found by only one branch is scored as rank k + 1 in
    -- the other, while its reported rank for that branch stays NULL.
    fused AS (
        SELECT COALESCE(b.chunk_pk, d.chunk_pk) AS chunk_pk,
               ((1.0::DOUBLE PRECISION / (rrf_k + COALESCE(b.r, k + 1))::DOUBLE PRECISION) +
                (1.0::DOUBLE PRECISION / (rrf_k + COALESCE(d.r, k + 1))::DOUBLE PRECISION)) AS score,
               b.r AS bm25_rank,
               d.r AS dense_rank
        FROM bm25 b
        FULL OUTER JOIN dense d USING (chunk_pk)
    )
    SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox,
           c.content_en, c.content_pt, c.classification,
           f.score, f.bm25_rank, f.dense_rank
    FROM fused f
    JOIN public.chunks c USING (chunk_pk)
    ORDER BY f.score DESC,
             c.chunk_pk                    -- deterministic order among equal scores
    LIMIT k;
END;
$$;

-- Dropping a function also drops its privileges, so grant again. The full
-- signature is spelled out so the statement stays unambiguous even if another
-- overload of the same name exists.
GRANT EXECUTE ON FUNCTION public.hybrid_search_chunks(
    TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT, DOUBLE PRECISION
) TO anon, authenticated;

COMMIT;