-- 0003_w0_hardening.sql
--
-- W0 hardening migration. Folds two ad-hoc maintenance scripts into the
-- canonical migration stream so a clean install on a fresh VPS produces a
-- secured, fully-searchable database without any post-bootstrap scripts.
--
--   F4   — RLS on public.relations (drift vs every other public.* table).
--   TD#2 — is_searchable column + reclassification + partial index, AND the
--          updated hybrid_search_chunks() that honors it. (Previously lived
--          in scripts/maintain/47_mark_unsearchable_chunks.sql + 48_*.sql.)
--
-- Idempotent. Safe to re-run.

BEGIN;

-- IMPORTANT: public.chunks / .entities / .relations are owned by
-- `supabase_admin` (not `postgres`). Postgres enforces ownership on RLS DDL
-- even for superusers. Run this migration as:
--
--   docker exec -i disclosure-db psql -U supabase_admin < 0003_w0_hardening.sql
--
-- The `supabase_admin` role has socket-trust auth on the local container.
--
-- NOTE(review): a genuine superuser bypasses ownership checks, so the
-- statement above presumably refers to a non-superuser `postgres` role in
-- this image — confirm.

-- ─────────────────────────────────────────────────────────────────────────
-- F4 · RLS on public.relations
-- ─────────────────────────────────────────────────────────────────────────

ALTER TABLE public.relations ENABLE ROW LEVEL SECURITY;

-- Drop-then-create keeps the policy definition re-runnable.
DROP POLICY IF EXISTS relations_read ON public.relations;

-- Read-only policy: every row is visible to SELECT. No policy is created for
-- INSERT/UPDATE/DELETE in this migration.
CREATE POLICY relations_read
    ON public.relations
    FOR SELECT
    USING (TRUE);

GRANT SELECT ON public.relations TO anon, authenticated;

-- ─────────────────────────────────────────────────────────────────────────
-- TD#2 · is_searchable column + reclassification + partial index
-- ─────────────────────────────────────────────────────────────────────────

ALTER TABLE public.chunks
    ADD COLUMN IF NOT EXISTS is_searchable BOOLEAN NOT NULL DEFAULT TRUE;

-- Step 1: reset to the baseline (everything searchable) so a re-run always
-- converges on the classification below. Only rows that are not already TRUE
-- are written; IS DISTINCT FROM also catches NULLs in case the column
-- pre-existed without the NOT NULL constraint.
UPDATE public.chunks
SET is_searchable = TRUE
WHERE is_searchable IS DISTINCT FROM TRUE;

-- Step 2: chunk types that are never searchable, whatever their content.
UPDATE public.chunks
SET is_searchable = FALSE
WHERE type IN (
        'page_number',
        'blank',
        'stamp',
        'classification_banner',
        'classification_marking'
    )
    AND is_searchable IS DISTINCT FROM FALSE;

-- Step 3: structural chunk types that are unsearchable only when their text
-- is shorter than 50 characters. Length is measured on content_en, falling
-- back to content_pt only when content_en IS NULL (an empty-string
-- content_en does not fall through).
UPDATE public.chunks
SET is_searchable = FALSE
WHERE type IN (
        'salutation',
        'complimentary_close',
        'section_heading',
        'section_header',
        'heading',
        'title',
        'subtitle',
        'date_line',
        'bulleted_item',
        'field_value',
        'field_entry',
        'table_marker',
        'form_field',
        'form_header',
        'routing_block',
        'distribution_list',
        'file_number',
        'marginalia'
    )
    AND LENGTH(COALESCE(content_en, content_pt, '')) < 50
    AND is_searchable IS DISTINCT FROM FALSE;

-- Partial index covering only the searchable rows.
CREATE INDEX IF NOT EXISTS chunks_searchable_idx
    ON public.chunks (chunk_pk)
    WHERE is_searchable;

-- ─────────────────────────────────────────────────────────────────────────
-- TD#2 · hybrid_search_chunks honors is_searchable
--   Based on 0002's canonical body, plus `AND c.is_searchable` in both the
--   bm25 and dense CTEs. Replaces the function in place.
--   Deviations from 0002 made here:
--     * bm25 now has an explicit ORDER BY before its LIMIT, so the k rows
--       kept are the k best-ranked ones (LIMIT without ORDER BY returns an
--       unspecified subset).
--     * chunk_pk is used as a tie-breaker in the bm25 ranking and in the
--       final ordering, so equal scores yield a stable result order.
-- ─────────────────────────────────────────────────────────────────────────

-- Remove both historical signatures (9-arg and 10-arg) so exactly one
-- overload exists after this migration.
DROP FUNCTION IF EXISTS public.hybrid_search_chunks(
    TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT
);
DROP FUNCTION IF EXISTS public.hybrid_search_chunks(
    TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT, DOUBLE PRECISION
);

-- hybrid_search_chunks: full-text + vector search over public.chunks, fused
-- with reciprocal-rank scoring.
--
-- Parameters:
--   q_text            Query text, parsed with websearch_to_tsquery.
--   q_embedding       Query embedding compared to chunks.embedding via `<=>`.
--   q_lang            'en' selects ts_en / public.en_unaccent; any other
--                     value (including NULL) selects ts_pt / public.pt_unaccent.
--   q_doc_id          Optional exact filter on doc_id (NULL = no filter).
--   q_type            Optional exact filter on type (NULL = no filter).
--   q_classification  Optional exact filter on classification (NULL = no filter).
--   q_ufo_only        When TRUE, keep only rows with ufo_anomaly = TRUE.
--   k                 Max rows taken from each retriever and max rows returned.
--   rrf_k             Constant added to each rank in the fusion denominator.
--   max_dense_dist    Dense candidates must have `<=>` distance strictly
--                     below this value.
--
-- Returns one row per fused chunk, ordered by score descending:
--   score       1/(rrf_k + bm25 rank) + 1/(rrf_k + dense rank); a chunk
--               missing from one retriever is scored with rank k + 1 there.
--   bm25_rank   Rank in the full-text list, NULL if absent from it.
--   dense_rank  Rank in the vector list, NULL if absent from it.
CREATE OR REPLACE FUNCTION public.hybrid_search_chunks(
    q_text            TEXT,
    q_embedding       vector(1024),
    q_lang            TEXT DEFAULT 'pt',
    q_doc_id          TEXT DEFAULT NULL,
    q_type            TEXT DEFAULT NULL,
    q_classification  TEXT DEFAULT NULL,
    q_ufo_only        BOOLEAN DEFAULT FALSE,
    k                 INT DEFAULT 100,
    rrf_k             INT DEFAULT 60,
    max_dense_dist    DOUBLE PRECISION DEFAULT 0.40
)
RETURNS TABLE (
    chunk_pk        BIGINT,
    doc_id          TEXT,
    chunk_id        TEXT,
    page            INT,
    type            TEXT,
    bbox            JSONB,
    content_en      TEXT,
    content_pt      TEXT,
    classification  TEXT,
    score           DOUBLE PRECISION,
    bm25_rank       INT,
    dense_rank      INT
)
LANGUAGE plpgsql
STABLE
AS $$
BEGIN
    RETURN QUERY
    WITH
    -- Parsed query, built once with the text-search config for q_lang.
    ts_q AS (
        SELECT
            CASE
                WHEN q_lang = 'en'
                    THEN websearch_to_tsquery('public.en_unaccent'::regconfig, q_text)
                ELSE websearch_to_tsquery('public.pt_unaccent'::regconfig, q_text)
            END AS q
    ),
    -- Full-text candidates ranked by ts_rank_cd (best first).
    bm25 AS (
        SELECT
            c.chunk_pk,
            row_number() OVER (
                ORDER BY
                    ts_rank_cd(
                        CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END,
                        (SELECT q FROM ts_q)
                    ) DESC NULLS LAST,
                    c.chunk_pk  -- tie-breaker: stable ranks for equal scores
            )::INT AS r
        FROM public.chunks AS c
        WHERE c.is_searchable
            AND (CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END)
                @@ (SELECT q FROM ts_q)
            AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
            AND (q_type IS NULL OR c.type = q_type)
            AND (q_classification IS NULL OR c.classification = q_classification)
            AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
        -- Explicit ordering so LIMIT keeps ranks 1..k rather than an
        -- unspecified subset of the matches.
        ORDER BY r
        LIMIT k
    ),
    -- Vector candidates ranked by `<=>` distance (closest first), restricted
    -- to rows within max_dense_dist.
    dense AS (
        SELECT
            c.chunk_pk,
            row_number() OVER (ORDER BY c.embedding <=> q_embedding)::INT AS r
        FROM public.chunks AS c
        WHERE c.is_searchable
            AND c.embedding IS NOT NULL
            AND (c.embedding <=> q_embedding) < max_dense_dist
            AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
            AND (q_type IS NULL OR c.type = q_type)
            AND (q_classification IS NULL OR c.classification = q_classification)
            AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
        ORDER BY c.embedding <=> q_embedding
        LIMIT k
    ),
    -- Reciprocal-rank fusion. FULL OUTER JOIN keeps chunks found by only one
    -- retriever; their missing rank is scored as k + 1.
    fused AS (
        SELECT
            COALESCE(b.chunk_pk, d.chunk_pk) AS chunk_pk,
            (
                (1.0::DOUBLE PRECISION
                    / (rrf_k + COALESCE(b.r, k + 1))::DOUBLE PRECISION)
                + (1.0::DOUBLE PRECISION
                    / (rrf_k + COALESCE(d.r, k + 1))::DOUBLE PRECISION)
            ) AS score,
            b.r AS bm25_rank,
            d.r AS dense_rank
        FROM bm25 AS b
        FULL OUTER JOIN dense AS d USING (chunk_pk)
    )
    SELECT
        c.chunk_pk,
        c.doc_id,
        c.chunk_id,
        c.page,
        c.type,
        c.bbox,
        c.content_en,
        c.content_pt,
        c.classification,
        f.score,
        f.bm25_rank,
        f.dense_rank
    FROM fused AS f
    INNER JOIN public.chunks AS c USING (chunk_pk)
    -- chunk_pk breaks score ties so repeated calls return the same order.
    ORDER BY f.score DESC, c.chunk_pk
    LIMIT k;
END
$$;

-- Grant on the exact signature so the statement stays unambiguous even if
-- another overload of this name is ever present.
GRANT EXECUTE ON FUNCTION public.hybrid_search_chunks(
    TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT, DOUBLE PRECISION
) TO anon, authenticated;

COMMIT;