W0 — security hardening (5 fixes verified live on disclosure.top)
- middleware: gate /api/admin/* same as /admin/* (F1)
- imgproxy: tighten LOCAL_FILESYSTEM_ROOT from / to /var/lib/storage (F2)
- studio: real basic-auth label (bcrypt hash, middleware reference) (F3)
- relations: ENABLE ROW LEVEL SECURITY + public SELECT policy (F4)
- migration 0003: fold is_searchable + hybrid_search update into canonical (TD#2)
W1 — observability + resilience + autocomplete
- studio: HOSTNAME=0.0.0.0 so Next.js listens on all interfaces (incl. loopback) for the healthcheck
- compose: PG_POOL_MAX=20, CLAUDE_CODE_OAUTH_TOKEN gated by separate env
- claude-code.ts: subprocess timeout configurable (CLAUDE_CODE_TIMEOUT_MS)
- openrouter.ts: retry with exponential backoff + Retry-After + in-memory
circuit breaker (promotes FALLBACK after CB_THRESHOLD failures)
- lib/logger.ts: pino logger (NDJSON prod / pretty dev) + withRequest helper
- middleware: mints correlation_id, stamps x-correlation-id response header,
emits structured http_request log per /api/* call
- messages/route.ts: switch to structured logger
- 60_meili_index.py: push documents + chunks into Meilisearch
- /api/search/autocomplete: parallel meili search (docs + chunks), 5-8ms p50
- search-autocomplete.tsx: debounced dropdown wired into search-panel
W1.2 — Glitchtip + Forgejo self-hosted
- compose: glitchtip-redis + glitchtip-web + glitchtip-worker (v4.2)
- compose: forgejo + forgejo-runner (server v9, runner v6) with group_add=988
- @sentry/nextjs SDK wired (instrumentation.ts + sentry.{client,server}.config.ts)
- /api/admin/throw smoke endpoint (gated by W0-F1 middleware)
- Synthetic event ingestion verified at glitchtip.disclosure.top
- forgejo.disclosure.top up, repo discadmin/disclosure-bureau created,
runner registered (labels: ubuntu-latest, docker)
- .forgejo/workflows/ci.yml: typecheck + lint + build + npm audit + python
syntax + compose validation
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
172 lines
6.6 KiB
PL/PgSQL
172 lines
6.6 KiB
PL/PgSQL
-- 0003_w0_hardening.sql
--
-- W0 hardening migration. Folds two ad-hoc maintenance scripts into the
-- canonical migration stream so a clean install on a fresh VPS produces a
-- secured, fully-searchable database without any post-bootstrap scripts.
--
--   F4   — RLS on public.relations (drift vs every other public.* table).
--   TD#2 — is_searchable column + reclassification + partial index, AND the
--          updated hybrid_search_chunks() that honors it. (Previously lived
--          in scripts/maintain/47_mark_unsearchable_chunks.sql + 48_*.sql.)
--
-- Idempotent. Safe to re-run.

BEGIN;

-- Ownership note: public.chunks, public.entities and public.relations belong
-- to `supabase_admin`, not `postgres`. ENABLE ROW LEVEL SECURITY and the
-- policy DDL below are only accepted from the table owner (or a genuine
-- superuser), so feed this file to psql as the owning role:
--
--   docker exec -i disclosure-db psql -U supabase_admin < 0003_w0_hardening.sql
--
-- Inside the local container `supabase_admin` authenticates via socket trust.

-- ─────────────────────────────────────────────────────────────────────────
-- F4 · Row-level security for public.relations
-- ─────────────────────────────────────────────────────────────────────────

-- Switch RLS on; re-running against a table that already has it is harmless.
ALTER TABLE public.relations
    ENABLE ROW LEVEL SECURITY;

-- CREATE POLICY has no IF NOT EXISTS / OR REPLACE form, so the policy is
-- dropped first to keep the migration re-runnable.
DROP POLICY IF EXISTS relations_read
    ON public.relations;

-- Read-only policy: every row is visible to every role. PERMISSIVE and
-- TO PUBLIC are the CREATE POLICY defaults, written out here for clarity.
CREATE POLICY relations_read
    ON public.relations
    AS PERMISSIVE
    FOR SELECT
    TO PUBLIC
    USING (TRUE);

-- Table-level privilege for the two API roles.
GRANT SELECT
    ON TABLE public.relations
    TO anon, authenticated;

-- ─────────────────────────────────────────────────────────────────────────
-- TD#2 · is_searchable column + reclassification + partial index
-- ─────────────────────────────────────────────────────────────────────────

-- New rows (and all existing rows when the column is first added) default to
-- searchable.
ALTER TABLE public.chunks
    ADD COLUMN IF NOT EXISTS is_searchable BOOLEAN NOT NULL DEFAULT TRUE;

-- Reclassify every chunk in ONE pass.
--
-- The desired flag is computed once per row and written only where it differs
-- from the stored value, so a re-run on an already-classified table touches
-- zero rows (no table rewrite, no index churn, no WAL burst). The resulting
-- state is the same as "reset everything to TRUE, then demote":
--
--   * layout-only chunk types are never searchable;
--   * structural chunk types are unsearchable only when their text (English
--     first, Portuguese as fallback) is shorter than 50 characters;
--   * everything else -- including rows whose type is NULL -- is searchable.
--
-- NOTE(review): the self-join assumes chunk_pk uniquely identifies a row of
-- public.chunks and is never NULL (hybrid_search_chunks joins on it the same
-- way) -- confirm against the table definition in the earlier migrations.
UPDATE public.chunks AS c
SET is_searchable = w.wanted
FROM (
    SELECT
        s.chunk_pk,
        CASE
            WHEN s.type IN (
                'page_number',
                'blank',
                'stamp',
                'classification_banner',
                'classification_marking'
            ) THEN FALSE
            WHEN s.type IN (
                'salutation',
                'complimentary_close',
                'section_heading',
                'section_header',
                'heading',
                'title',
                'subtitle',
                'date_line',
                'bulleted_item',
                'field_value',
                'field_entry',
                'table_marker',
                'form_field',
                'form_header',
                'routing_block',
                'distribution_list',
                'file_number',
                'marginalia'
            )
            AND LENGTH(COALESCE(s.content_en, s.content_pt, '')) < 50 THEN FALSE
            ELSE TRUE
        END AS wanted
    FROM public.chunks AS s
) AS w
WHERE w.chunk_pk = c.chunk_pk
  AND c.is_searchable IS DISTINCT FROM w.wanted;

-- Partial index covering only the searchable rows.
CREATE INDEX IF NOT EXISTS chunks_searchable_idx
    ON public.chunks (chunk_pk)
    WHERE is_searchable;

-- ─────────────────────────────────────────────────────────────────────────
-- TD#2 · hybrid_search_chunks honors is_searchable
--   Based on 0002's canonical body, with three changes:
--     1. `c.is_searchable` filter in both the bm25 and dense CTEs;
--     2. explicit ORDER BY before LIMIT in the bm25 CTE (LIMIT without
--        ORDER BY does not guarantee which rows are kept);
--     3. chunk_pk tie-breaker on the final ordering so equal RRF scores
--        come back in a stable order.
--   Replaces the function in place.
-- ─────────────────────────────────────────────────────────────────────────
DROP FUNCTION IF EXISTS public.hybrid_search_chunks(TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT);
DROP FUNCTION IF EXISTS public.hybrid_search_chunks(TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT, DOUBLE PRECISION);

-- hybrid_search_chunks: full-text + vector search over public.chunks, fused
-- with reciprocal-rank fusion (RRF).
--
-- Parameters
--   q_text            query text, parsed with websearch_to_tsquery
--   q_embedding       query vector, compared to chunks.embedding with `<=>`
--   q_lang            'en' selects the en_unaccent config and ts_en column;
--                     any other value selects pt_unaccent / ts_pt
--   q_doc_id          optional equality filter on doc_id (NULL = no filter)
--   q_type            optional equality filter on type (NULL = no filter)
--   q_classification  optional equality filter on classification (NULL = no filter)
--   q_ufo_only        when TRUE, keep only rows with ufo_anomaly = TRUE
--   k                 max rows kept per branch and max rows returned
--   rrf_k             constant added to each rank in the RRF denominator
--   max_dense_dist    exclusive upper bound on the `<=>` distance
--
-- Returns one row per chunk with its fused score and the rank it obtained in
-- each branch (NULL when the chunk was not found by that branch).
CREATE OR REPLACE FUNCTION public.hybrid_search_chunks(
    q_text            TEXT,
    q_embedding       vector(1024),
    q_lang            TEXT DEFAULT 'pt',
    q_doc_id          TEXT DEFAULT NULL,
    q_type            TEXT DEFAULT NULL,
    q_classification  TEXT DEFAULT NULL,
    q_ufo_only        BOOLEAN DEFAULT FALSE,
    k                 INT DEFAULT 100,
    rrf_k             INT DEFAULT 60,
    max_dense_dist    DOUBLE PRECISION DEFAULT 0.40
)
RETURNS TABLE (
    chunk_pk        BIGINT,
    doc_id          TEXT,
    chunk_id        TEXT,
    page            INT,
    type            TEXT,
    bbox            JSONB,
    content_en      TEXT,
    content_pt      TEXT,
    classification  TEXT,
    score           DOUBLE PRECISION,
    bm25_rank       INT,
    dense_rank      INT
)
LANGUAGE plpgsql STABLE AS $$
BEGIN
    -- Column references are table-qualified throughout: the RETURNS TABLE
    -- column names are also PL/pgSQL variables inside this body.
    RETURN QUERY
    WITH
    -- Parsed query in the language-specific text-search configuration.
    ts_q AS (
        SELECT CASE WHEN q_lang = 'en'
                    THEN websearch_to_tsquery('public.en_unaccent'::regconfig, q_text)
                    ELSE websearch_to_tsquery('public.pt_unaccent'::regconfig, q_text)
               END AS q
    ),
    -- Lexical branch: rows matching the tsquery, ranked by ts_rank_cd.
    bm25 AS (
        SELECT c.chunk_pk,
               row_number() OVER (ORDER BY
                   ts_rank_cd(
                       CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END,
                       (SELECT q FROM ts_q)
                   ) DESC NULLS LAST
               )::INT AS r
        FROM public.chunks c
        WHERE c.is_searchable
          AND (CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END) @@ (SELECT q FROM ts_q)
          AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
          AND (q_type IS NULL OR c.type = q_type)
          AND (q_classification IS NULL OR c.classification = q_classification)
          AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
        -- Keep exactly ranks 1..k; without this ORDER BY the LIMIT could
        -- legally return any k of the matching rows.
        ORDER BY r
        LIMIT k
    ),
    -- Vector branch: nearest rows by `<=>` distance, capped at max_dense_dist.
    dense AS (
        SELECT c.chunk_pk,
               row_number() OVER (ORDER BY c.embedding <=> q_embedding)::INT AS r
        FROM public.chunks c
        WHERE c.is_searchable
          AND c.embedding IS NOT NULL
          AND (c.embedding <=> q_embedding) < max_dense_dist
          AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
          AND (q_type IS NULL OR c.type = q_type)
          AND (q_classification IS NULL OR c.classification = q_classification)
          AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
        ORDER BY c.embedding <=> q_embedding
        LIMIT k
    ),
    -- RRF: score = 1/(rrf_k + lexical rank) + 1/(rrf_k + vector rank).
    -- A chunk missing from one branch is scored there as rank k + 1.
    fused AS (
        SELECT COALESCE(b.chunk_pk, d.chunk_pk) AS chunk_pk,
               ((1.0::DOUBLE PRECISION / (rrf_k + COALESCE(b.r, k + 1))::DOUBLE PRECISION) +
                (1.0::DOUBLE PRECISION / (rrf_k + COALESCE(d.r, k + 1))::DOUBLE PRECISION)) AS score,
               b.r AS bm25_rank,
               d.r AS dense_rank
        FROM bm25 b
        FULL OUTER JOIN dense d USING (chunk_pk)
    )
    SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox,
           c.content_en, c.content_pt, c.classification,
           f.score, f.bm25_rank, f.dense_rank
    FROM fused f
    JOIN public.chunks c USING (chunk_pk)
    -- chunk_pk breaks ties between equal RRF scores so repeated calls return
    -- rows in the same order.
    ORDER BY f.score DESC, c.chunk_pk
    LIMIT k;
END
$$;

-- Let the API roles call the search function. The argument types are spelled
-- out so the grant targets exactly the 10-argument signature defined in this
-- file; a bare function name fails with "function name is not unique" when
-- any other overload of hybrid_search_chunks exists in the schema.
GRANT EXECUTE ON FUNCTION public.hybrid_search_chunks(
    TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT, DOUBLE PRECISION
) TO anon, authenticated;

COMMIT;