-- File: disclosure-bureau/infra/supabase/migrations/0002_chunks_retrieval.sql

-- 253 lines
-- 9.6 KiB
-- PL/PgSQL

-- The Disclosure Bureau — chunks retrieval schema (v0.2.0)
-- Enables hybrid retrieval (BM25 + dense embeddings + reranker) over the
-- agentic chunks produced by scripts/28-batch-rebuild-all.py.
--
-- Safe to re-run. Apply via Supabase Studio SQL editor OR psql.
-- 1. Extensions this migration depends on (pgvector + trigram + unaccent).
--    IF NOT EXISTS keeps the script re-runnable. Per the original note, the
--    Supabase image ships pgvector and pg_trgm.
CREATE EXTENSION IF NOT EXISTS unaccent;  -- dictionary used by the *_unaccent text search configs
CREATE EXTENSION IF NOT EXISTS pg_trgm;   -- gin_trgm_ops for the trigram content indexes
CREATE EXTENSION IF NOT EXISTS vector;    -- vector(1024) columns and HNSW indexes
-- 2. Multilingual unaccent text search configs (EN + PT-BR).
--    Each config is a copy of the stock language config in which the token
--    types hword, hword_part and word are remapped to the dictionary chain
--    unaccent -> <language>_stem.
--
--    The existence checks are restricted to the `public` schema: the rest of
--    this migration refers to the configs as public.pt_unaccent /
--    public.en_unaccent, so a same-named config living in another schema must
--    not cause creation to be skipped.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_catalog.pg_ts_config
        WHERE cfgname = 'pt_unaccent'
          AND cfgnamespace = 'public'::regnamespace
    ) THEN
        CREATE TEXT SEARCH CONFIGURATION public.pt_unaccent ( COPY = pg_catalog.portuguese );
        ALTER TEXT SEARCH CONFIGURATION public.pt_unaccent
            ALTER MAPPING FOR hword, hword_part, word
            WITH unaccent, portuguese_stem;
    END IF;

    IF NOT EXISTS (
        SELECT 1
        FROM pg_catalog.pg_ts_config
        WHERE cfgname = 'en_unaccent'
          AND cfgnamespace = 'public'::regnamespace
    ) THEN
        CREATE TEXT SEARCH CONFIGURATION public.en_unaccent ( COPY = pg_catalog.english );
        ALTER TEXT SEARCH CONFIGURATION public.en_unaccent
            ALTER MAPPING FOR hword, hword_part, word
            WITH unaccent, english_stem;
    END IF;
END
$$;
-- 3. documents: one row per document
--    (mirrors the frontmatter highlights of wiki/documents/<doc-id>.md).
CREATE TABLE IF NOT EXISTS public.documents (
    doc_id          TEXT        PRIMARY KEY,
    canonical_title TEXT,
    collection      TEXT,
    document_class  TEXT,
    page_count      INTEGER,
    classification  TEXT,
    content_class   TEXT[],
    -- Defaults to the schema version this migration introduces.
    schema_version  TEXT        NOT NULL DEFAULT '0.2.0',
    build_approach  TEXT,
    build_model     TEXT,
    built_at        TIMESTAMPTZ,
    -- Stamped by the database at insert time unless supplied explicitly.
    ingested_at     TIMESTAMPTZ NOT NULL DEFAULT now(),
    raw_path        TEXT
);

-- Lookup by collection, and newest-first listing by build time.
CREATE INDEX IF NOT EXISTS documents_collection_idx
    ON public.documents (collection);
CREATE INDEX IF NOT EXISTS documents_built_at_idx
    ON public.documents (built_at DESC);
-- 4. chunks: the retrieval unit, one row per chunk file (raw/<doc>/chunks/c*.md).
--    Embedding width is 1024 (BGE-M3 dense).
CREATE TABLE IF NOT EXISTS public.chunks (
    -- Identity and position within the parent document.
    chunk_pk             BIGSERIAL PRIMARY KEY,
    doc_id               TEXT      NOT NULL
                                   REFERENCES public.documents (doc_id) ON DELETE CASCADE,
    chunk_id             TEXT      NOT NULL,
    page                 INTEGER   NOT NULL,
    order_in_page        INTEGER   NOT NULL,
    order_global         INTEGER   NOT NULL,
    type                 TEXT      NOT NULL,
    bbox                 JSONB,

    -- Text content per language, plus extraction metadata.
    content_en           TEXT,
    content_pt           TEXT,
    ocr_confidence       REAL,
    classification       TEXT,
    formatting           TEXT[],
    cross_page_hint      TEXT,
    prev_chunk           TEXT,
    next_chunk           TEXT,
    related_image        TEXT,
    related_table        TEXT,
    redaction_code       TEXT,
    redaction_inferred   TEXT,
    image_type           TEXT,

    -- Anomaly flags; both default to FALSE.
    ufo_anomaly          BOOLEAN   NOT NULL DEFAULT FALSE,
    ufo_anomaly_type     TEXT,
    ufo_rationale        TEXT,
    cryptid_anomaly      BOOLEAN   NOT NULL DEFAULT FALSE,
    cryptid_anomaly_type TEXT,
    cryptid_rationale    TEXT,

    image_desc_en        TEXT,
    image_desc_pt        TEXT,
    source_png           TEXT,

    -- Dense embedding (nullable).
    embedding            vector(1024),

    -- Full-text vectors derived from the content columns; NULL content is
    -- treated as the empty string.
    ts_en                tsvector GENERATED ALWAYS AS (
                             to_tsvector('public.en_unaccent', COALESCE(content_en, ''))
                         ) STORED,
    ts_pt                tsvector GENERATED ALWAYS AS (
                             to_tsvector('public.pt_unaccent', COALESCE(content_pt, ''))
                         ) STORED,

    -- A chunk_id is unique within its document.
    CONSTRAINT chunks_doc_chunk_uk UNIQUE (doc_id, chunk_id)
);
-- 5. Indexes on public.chunks.

-- B-tree: page-ordered access within a document, and filtering by type.
CREATE INDEX IF NOT EXISTS chunks_doc_id_page_idx
    ON public.chunks (doc_id, page, order_in_page);
CREATE INDEX IF NOT EXISTS chunks_type_idx
    ON public.chunks (type);

-- Partial B-tree: only rows that carry a classification / a raised anomaly flag.
CREATE INDEX IF NOT EXISTS chunks_classification_idx
    ON public.chunks (classification)
    WHERE classification IS NOT NULL;
CREATE INDEX IF NOT EXISTS chunks_ufo_idx
    ON public.chunks (ufo_anomaly)
    WHERE ufo_anomaly = TRUE;
CREATE INDEX IF NOT EXISTS chunks_cryptid_idx
    ON public.chunks (cryptid_anomaly)
    WHERE cryptid_anomaly = TRUE;

-- GIN over the generated tsvector columns (full-text search).
CREATE INDEX IF NOT EXISTS chunks_ts_en_idx
    ON public.chunks USING GIN (ts_en);
CREATE INDEX IF NOT EXISTS chunks_ts_pt_idx
    ON public.chunks USING GIN (ts_pt);

-- HNSW vector index, cosine distance. No build options are passed, so the
-- extension defaults apply (m=16, ef_construction=64 per the original note; tune later).
CREATE INDEX IF NOT EXISTS chunks_embedding_hnsw_idx
    ON public.chunks USING hnsw (embedding vector_cosine_ops);

-- Trigram GIN on the raw content for fuzzy ILIKE.
CREATE INDEX IF NOT EXISTS chunks_content_en_trgm_idx
    ON public.chunks USING GIN (content_en gin_trgm_ops);
CREATE INDEX IF NOT EXISTS chunks_content_pt_trgm_idx
    ON public.chunks USING GIN (content_pt gin_trgm_ops);
-- 6. entities: flattened from wiki/entities/<class>/<id>.md (post-lint).
CREATE TABLE IF NOT EXISTS public.entities (
    entity_pk         BIGSERIAL PRIMARY KEY,
    entity_class      TEXT      NOT NULL,
    entity_id         TEXT      NOT NULL,
    canonical_name    TEXT      NOT NULL,
    aliases           TEXT[],
    embedding         vector(1024),
    -- Counters start at zero.
    total_mentions    INTEGER   NOT NULL DEFAULT 0,
    documents_count   INTEGER   NOT NULL DEFAULT 0,
    enrichment_status TEXT,
    last_ingest       TIMESTAMPTZ,

    -- An entity_id is unique within its class.
    CONSTRAINT entities_uk UNIQUE (entity_class, entity_id)
);

-- Name lookup, alias array lookup (GIN), and cosine-distance HNSW over the
-- entity embedding.
CREATE INDEX IF NOT EXISTS entities_canonical_name_idx
    ON public.entities (canonical_name);
CREATE INDEX IF NOT EXISTS entities_aliases_idx
    ON public.entities USING GIN (aliases);
CREATE INDEX IF NOT EXISTS entities_embedding_hnsw_idx
    ON public.entities USING hnsw (embedding vector_cosine_ops);
-- 7. entity_mentions: link table chunk <-> entity (materialized from lint).
--    Rows are removed automatically when either the chunk or the entity is deleted.
CREATE TABLE IF NOT EXISTS public.entity_mentions (
    mention_pk   BIGSERIAL PRIMARY KEY,
    chunk_pk     BIGINT    NOT NULL
                           REFERENCES public.chunks (chunk_pk) ON DELETE CASCADE,
    entity_pk    BIGINT    NOT NULL
                           REFERENCES public.entities (entity_pk) ON DELETE CASCADE,
    surface_form TEXT,

    -- At most one row per (chunk, entity) pair.
    CONSTRAINT entity_mentions_uk UNIQUE (chunk_pk, entity_pk)
);

-- One index per side of the link.
-- NOTE(review): the index backing entity_mentions_uk already leads with
-- chunk_pk, so entity_mentions_chunk_idx may be redundant. Confirm before dropping.
CREATE INDEX IF NOT EXISTS entity_mentions_chunk_idx
    ON public.entity_mentions (chunk_pk);
CREATE INDEX IF NOT EXISTS entity_mentions_entity_idx
    ON public.entity_mentions (entity_pk);
-- 8. Hybrid search RPC: full-text rank + dense similarity, fused server-side
--    with Reciprocal Rank Fusion (RRF).
--
-- Parameters
--   q_text            free-text query, parsed with websearch_to_tsquery.
--   q_embedding       1024-dim query embedding. When NULL the dense leg is
--                     skipped and only the full-text leg contributes.
--   q_lang            'en' searches ts_en with en_unaccent; any other value
--                     (including NULL) searches ts_pt with pt_unaccent.
--   q_doc_id, q_type, q_classification
--                     optional equality filters; NULL means "no filter".
--   q_ufo_only        when TRUE, restrict to chunks with ufo_anomaly = TRUE.
--   k                 max candidates taken from each leg, and max rows returned.
--   rrf_k             RRF smoothing constant added to every rank.
--
-- Returns up to k rows ordered by fused score (highest first, ties broken by
-- chunk_pk). score = 1/(rrf_k + bm25 rank) + 1/(rrf_k + dense rank); a chunk
-- missing from one leg is scored as if ranked k + 1 there, and its
-- bm25_rank / dense_rank output is NULL for that leg.
CREATE OR REPLACE FUNCTION public.hybrid_search_chunks(
    q_text           TEXT,
    q_embedding      vector(1024),
    q_lang           TEXT    DEFAULT 'pt', -- 'pt' | 'en'
    q_doc_id         TEXT    DEFAULT NULL,
    q_type           TEXT    DEFAULT NULL,
    q_classification TEXT    DEFAULT NULL,
    q_ufo_only       BOOLEAN DEFAULT FALSE,
    k                INT     DEFAULT 100,
    rrf_k            INT     DEFAULT 60
)
RETURNS TABLE (
    chunk_pk       BIGINT,
    doc_id         TEXT,
    chunk_id       TEXT,
    page           INT,
    type           TEXT,
    bbox           JSONB,
    content_en     TEXT,
    content_pt     TEXT,
    classification TEXT,
    score          DOUBLE PRECISION,
    bm25_rank      INT,
    dense_rank     INT
)
LANGUAGE plpgsql STABLE AS $$
BEGIN
    RETURN QUERY
    WITH
    -- Parsed query in the language-specific text search configuration.
    ts_q AS (
        SELECT CASE WHEN q_lang = 'en'
                    THEN websearch_to_tsquery('public.en_unaccent'::regconfig, q_text)
                    ELSE websearch_to_tsquery('public.pt_unaccent'::regconfig, q_text)
               END AS q
    ),
    -- Full-text leg. The inner query picks the k best matches with an explicit
    -- ORDER BY (a bare LIMIT would keep an arbitrary subset); the outer query
    -- numbers them 1..k. chunk_pk breaks ties so ranks are reproducible.
    bm25 AS (
        SELECT m.chunk_pk,
               row_number() OVER (ORDER BY m.rank_score DESC NULLS LAST, m.chunk_pk)::INT AS r
        FROM (
            SELECT c.chunk_pk,
                   ts_rank_cd(
                       CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END,
                       (SELECT q FROM ts_q)
                   ) AS rank_score
            FROM public.chunks c
            WHERE (CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END) @@ (SELECT q FROM ts_q)
              AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
              AND (q_type IS NULL OR c.type = q_type)
              AND (q_classification IS NULL OR c.classification = q_classification)
              AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
            ORDER BY rank_score DESC NULLS LAST, c.chunk_pk
            LIMIT k
        ) m
    ),
    -- Dense leg: k nearest chunks by cosine distance. Skipped entirely when no
    -- query embedding is supplied; otherwise every distance would be NULL and
    -- k arbitrary chunks would be fused into the result. The ORDER BY is kept
    -- as the bare distance expression so the HNSW index remains usable.
    dense AS (
        SELECT c.chunk_pk,
               row_number() OVER (ORDER BY c.embedding <=> q_embedding)::INT AS r
        FROM public.chunks c
        WHERE q_embedding IS NOT NULL
          AND c.embedding IS NOT NULL
          AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
          AND (q_type IS NULL OR c.type = q_type)
          AND (q_classification IS NULL OR c.classification = q_classification)
          AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
        ORDER BY c.embedding <=> q_embedding
        LIMIT k
    ),
    -- RRF: sum of reciprocal ranks; a leg that did not return the chunk
    -- contributes as if the chunk were ranked k + 1.
    fused AS (
        SELECT COALESCE(b.chunk_pk, d.chunk_pk) AS chunk_pk,
               ((1.0::DOUBLE PRECISION / (rrf_k + COALESCE(b.r, k + 1))::DOUBLE PRECISION) +
                (1.0::DOUBLE PRECISION / (rrf_k + COALESCE(d.r, k + 1))::DOUBLE PRECISION)) AS score,
               b.r AS bm25_rank,
               d.r AS dense_rank
        FROM bm25 b
        FULL OUTER JOIN dense d USING (chunk_pk)
    )
    SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox,
           c.content_en, c.content_pt, c.classification,
           f.score, f.bm25_rank, f.dense_rank
    FROM fused f
    JOIN public.chunks c USING (chunk_pk)
    -- chunk_pk tie-breaker keeps the returned order stable across calls.
    ORDER BY f.score DESC, c.chunk_pk
    LIMIT k;
END;
$$;
-- 9. Row-level security: every table is readable by anyone (SELECT policy
--    USING (TRUE)); no write policies are defined here, writes go through
--    service_role. Each policy is dropped before being recreated so the
--    script can be re-run.

-- documents
ALTER TABLE public.documents ENABLE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS documents_read ON public.documents;
CREATE POLICY documents_read ON public.documents
    FOR SELECT USING (TRUE);

-- chunks
ALTER TABLE public.chunks ENABLE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS chunks_read ON public.chunks;
CREATE POLICY chunks_read ON public.chunks
    FOR SELECT USING (TRUE);

-- entities
ALTER TABLE public.entities ENABLE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS entities_read ON public.entities;
CREATE POLICY entities_read ON public.entities
    FOR SELECT USING (TRUE);

-- entity_mentions
ALTER TABLE public.entity_mentions ENABLE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS entity_mentions_read ON public.entity_mentions;
CREATE POLICY entity_mentions_read ON public.entity_mentions
    FOR SELECT USING (TRUE);

-- Table read access and RPC execution for the anon and authenticated roles.
GRANT SELECT
    ON public.documents, public.chunks, public.entities, public.entity_mentions
    TO anon, authenticated;
GRANT EXECUTE
    ON FUNCTION public.hybrid_search_chunks
    TO anon, authenticated;