-- The Disclosure Bureau — chunks retrieval schema (v0.2.0)
-- Enables hybrid retrieval (BM25 + dense embeddings + reranker) over the
-- agentic chunks produced by scripts/28-batch-rebuild-all.py.
--
-- Safe to re-run. Apply via Supabase Studio SQL editor OR psql.

-- 1. pgvector + trigram extensions (Supabase image ships both), plus unaccent,
--    which the accent-insensitive text search configurations below depend on.
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS unaccent;

-- 2. Multilingual unaccent text search config (EN + PT-BR)
--    Each config is a copy of the built-in language config whose word-like
--    token types are first passed through `unaccent`, then stemmed.
--    The existence checks are scoped to schema `public`: the rest of this file
--    refers to the configs as public.pt_unaccent / public.en_unaccent, so a
--    same-named config in some other schema must not suppress their creation.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_catalog.pg_ts_config
        WHERE cfgname = 'pt_unaccent'
          AND cfgnamespace = 'public'::regnamespace
    ) THEN
        CREATE TEXT SEARCH CONFIGURATION public.pt_unaccent (
            COPY = pg_catalog.portuguese
        );
        ALTER TEXT SEARCH CONFIGURATION public.pt_unaccent
            ALTER MAPPING FOR hword, hword_part, word
            WITH unaccent, portuguese_stem;
    END IF;

    IF NOT EXISTS (
        SELECT 1
        FROM pg_catalog.pg_ts_config
        WHERE cfgname = 'en_unaccent'
          AND cfgnamespace = 'public'::regnamespace
    ) THEN
        CREATE TEXT SEARCH CONFIGURATION public.en_unaccent (
            COPY = pg_catalog.english
        );
        ALTER TEXT SEARCH CONFIGURATION public.en_unaccent
            ALTER MAPPING FOR hword, hword_part, word
            WITH unaccent, english_stem;
    END IF;
END
$$;

-- 3. documents — 1 row per doc (mirrors wiki/documents/.md frontmatter highlights)
CREATE TABLE IF NOT EXISTS public.documents (
    doc_id          TEXT PRIMARY KEY,
    canonical_title TEXT,
    collection      TEXT,
    document_class  TEXT,
    page_count      INT,
    classification  TEXT,
    content_class   TEXT[],
    schema_version  TEXT NOT NULL DEFAULT '0.2.0',
    build_approach  TEXT,
    build_model     TEXT,
    built_at        TIMESTAMPTZ,
    ingested_at     TIMESTAMPTZ NOT NULL DEFAULT NOW(),  -- set at insert time
    raw_path        TEXT
);

CREATE INDEX IF NOT EXISTS documents_collection_idx
    ON public.documents (collection);
CREATE INDEX IF NOT EXISTS documents_built_at_idx
    ON public.documents (built_at DESC);

-- 4. chunks — the retrieval unit. 1 row per chunk file (raw//chunks/c*.md).
--    1024 dims = BGE-M3 dense.
CREATE TABLE IF NOT EXISTS public.chunks (
    -- Surrogate key plus the owning document; deleting a document removes
    -- its chunks (ON DELETE CASCADE).
    chunk_pk             BIGSERIAL PRIMARY KEY,
    doc_id               TEXT NOT NULL
                         REFERENCES public.documents (doc_id) ON DELETE CASCADE,
    chunk_id             TEXT NOT NULL,

    -- Position and kind
    page                 INT NOT NULL,
    order_in_page        INT NOT NULL,
    order_global         INT NOT NULL,
    type                 TEXT NOT NULL,
    bbox                 JSONB,

    -- Text content (one column per language) and per-chunk metadata
    content_en           TEXT,
    content_pt           TEXT,
    ocr_confidence       REAL,
    classification       TEXT,
    formatting           TEXT[],
    cross_page_hint      TEXT,
    prev_chunk           TEXT,
    next_chunk           TEXT,
    related_image        TEXT,
    related_table        TEXT,
    redaction_code       TEXT,
    redaction_inferred   TEXT,
    image_type           TEXT,

    -- Anomaly flags default to FALSE and can never be NULL
    ufo_anomaly          BOOLEAN NOT NULL DEFAULT FALSE,
    ufo_anomaly_type     TEXT,
    ufo_rationale        TEXT,
    cryptid_anomaly      BOOLEAN NOT NULL DEFAULT FALSE,
    cryptid_anomaly_type TEXT,
    cryptid_rationale    TEXT,

    image_desc_en        TEXT,
    image_desc_pt        TEXT,
    source_png           TEXT,

    -- 1024-dimensional dense embedding; nullable
    embedding            vector(1024),

    -- Stored tsvectors, recomputed by Postgres whenever the content column
    -- changes. NULL content is indexed as the empty string.
    ts_en                tsvector GENERATED ALWAYS AS (
                             to_tsvector('public.en_unaccent'::regconfig, COALESCE(content_en, ''))
                         ) STORED,
    ts_pt                tsvector GENERATED ALWAYS AS (
                             to_tsvector('public.pt_unaccent'::regconfig, COALESCE(content_pt, ''))
                         ) STORED,

    -- chunk_id is unique within a document, not globally
    CONSTRAINT chunks_doc_chunk_uk UNIQUE (doc_id, chunk_id)
);

-- 5.
-- indexes

-- B-tree indexes backing the equality filters and per-document page ordering.
CREATE INDEX IF NOT EXISTS chunks_doc_id_page_idx
    ON public.chunks (doc_id, page, order_in_page);

CREATE INDEX IF NOT EXISTS chunks_type_idx
    ON public.chunks (type);

-- Partial indexes: only rows that actually carry the attribute are indexed.
CREATE INDEX IF NOT EXISTS chunks_classification_idx
    ON public.chunks (classification)
    WHERE classification IS NOT NULL;

CREATE INDEX IF NOT EXISTS chunks_ufo_idx
    ON public.chunks (ufo_anomaly)
    WHERE ufo_anomaly = TRUE;

CREATE INDEX IF NOT EXISTS chunks_cryptid_idx
    ON public.chunks (cryptid_anomaly)
    WHERE cryptid_anomaly = TRUE;

-- Full-text search over the generated tsvector columns.
CREATE INDEX IF NOT EXISTS chunks_ts_en_idx
    ON public.chunks USING GIN (ts_en);

CREATE INDEX IF NOT EXISTS chunks_ts_pt_idx
    ON public.chunks USING GIN (ts_pt);

-- HNSW vector index — m=16, ef_construction=64 (defaults; tune later)
CREATE INDEX IF NOT EXISTS chunks_embedding_hnsw_idx
    ON public.chunks USING hnsw (embedding vector_cosine_ops);

-- Trigram index on content for fuzzy ILIKE
CREATE INDEX IF NOT EXISTS chunks_content_en_trgm_idx
    ON public.chunks USING GIN (content_en gin_trgm_ops);

CREATE INDEX IF NOT EXISTS chunks_content_pt_trgm_idx
    ON public.chunks USING GIN (content_pt gin_trgm_ops);

-- 6. entities — flattened from wiki/entities//.md (post-lint)
CREATE TABLE IF NOT EXISTS public.entities (
    entity_pk         BIGSERIAL PRIMARY KEY,
    entity_class      TEXT NOT NULL,
    entity_id         TEXT NOT NULL,
    canonical_name    TEXT NOT NULL,
    aliases           TEXT[],
    embedding         vector(1024),            -- same dimensionality as chunks.embedding
    total_mentions    INT NOT NULL DEFAULT 0,
    documents_count   INT NOT NULL DEFAULT 0,
    enrichment_status TEXT,
    last_ingest       TIMESTAMPTZ,

    -- entity_id is unique per class, not globally
    CONSTRAINT entities_uk UNIQUE (entity_class, entity_id)
);

CREATE INDEX IF NOT EXISTS entities_canonical_name_idx
    ON public.entities (canonical_name);

-- GIN over the array column supports array containment/overlap lookups on aliases.
CREATE INDEX IF NOT EXISTS entities_aliases_idx
    ON public.entities USING GIN (aliases);

CREATE INDEX IF NOT EXISTS entities_embedding_hnsw_idx
    ON public.entities USING hnsw (embedding vector_cosine_ops);

-- 7.
-- entity_mentions — link table chunk ↔ entity (materialized from lint)
-- At most one row per (chunk, entity) pair; rows disappear with either parent.
CREATE TABLE IF NOT EXISTS public.entity_mentions (
    mention_pk   BIGSERIAL PRIMARY KEY,
    chunk_pk     BIGINT NOT NULL REFERENCES public.chunks (chunk_pk) ON DELETE CASCADE,
    entity_pk    BIGINT NOT NULL REFERENCES public.entities (entity_pk) ON DELETE CASCADE,
    surface_form TEXT,
    CONSTRAINT entity_mentions_uk UNIQUE (chunk_pk, entity_pk)
);

-- NOTE(review): chunk_pk is already the leading column of the unique index
-- behind entity_mentions_uk, so this index looks redundant — kept as-is.
CREATE INDEX IF NOT EXISTS entity_mentions_chunk_idx
    ON public.entity_mentions (chunk_pk);
CREATE INDEX IF NOT EXISTS entity_mentions_entity_idx
    ON public.entity_mentions (entity_pk);

-- 8. Hybrid search RPC: BM25 + dense + RRF fusion server-side
-- Drop prior signatures so adding the max_dense_dist param replaces rather than
-- overloads the function (Postgres treats different arg lists as distinct fns).
DROP FUNCTION IF EXISTS public.hybrid_search_chunks(TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT);
DROP FUNCTION IF EXISTS public.hybrid_search_chunks(TEXT, vector, TEXT, TEXT, TEXT, TEXT, BOOLEAN, INT, INT, DOUBLE PRECISION);

-- hybrid_search_chunks: fuse a lexical ranking and a dense (cosine) ranking of
-- public.chunks with Reciprocal Rank Fusion and return the top k rows.
--
-- Parameters
--   q_text            free-text query, parsed with websearch_to_tsquery
--   q_embedding       query embedding (1024 dims) for the dense leg
--   q_lang            'en' searches ts_en with en_unaccent; any other value
--                     searches ts_pt with pt_unaccent
--   q_doc_id, q_type, q_classification
--                     optional equality filters; NULL disables the filter
--   q_ufo_only        when TRUE, restrict to chunks with ufo_anomaly = TRUE
--   k                 max candidates per leg AND max rows returned
--   rrf_k             RRF smoothing constant
--   max_dense_dist    cosine-distance cutoff for the dense leg
--
-- Returns one row per fused chunk with:
--   score      = 1/(rrf_k + lexical rank) + 1/(rrf_k + dense rank), where a
--                chunk missing from one leg is scored as rank k + 1 there
--   bm25_rank  = rank in the lexical leg (NULL if absent)
--   dense_rank = rank in the dense leg (NULL if absent)
--
-- The lexical leg is named "bm25" but is ranked with ts_rank_cd.
CREATE OR REPLACE FUNCTION public.hybrid_search_chunks(
    q_text            TEXT,
    q_embedding       vector(1024),
    q_lang            TEXT DEFAULT 'pt',  -- 'pt' | 'en'
    q_doc_id          TEXT DEFAULT NULL,
    q_type            TEXT DEFAULT NULL,
    q_classification  TEXT DEFAULT NULL,
    q_ufo_only        BOOLEAN DEFAULT FALSE,
    k                 INT DEFAULT 100,
    rrf_k             INT DEFAULT 60,
    -- Max cosine distance for a dense neighbour to count as relevant. Beyond this
    -- the vector is noise (measured: strong semantic match ~0.12-0.20, no real
    -- match ~0.46-0.53). Keeps absent-term queries from returning nearest-vector
    -- garbage without needing the CPU-slow cross-encoder reranker.
    max_dense_dist    DOUBLE PRECISION DEFAULT 0.40
)
RETURNS TABLE (
    chunk_pk       BIGINT,
    doc_id         TEXT,
    chunk_id       TEXT,
    page           INT,
    type           TEXT,
    bbox           JSONB,
    content_en     TEXT,
    content_pt     TEXT,
    classification TEXT,
    score          DOUBLE PRECISION,
    bm25_rank      INT,
    dense_rank     INT
)
LANGUAGE plpgsql
STABLE
AS $$
BEGIN
    RETURN QUERY
    -- Parse the query text once, with the config matching the requested language.
    WITH ts_q AS (
        SELECT
            CASE
                WHEN q_lang = 'en'
                    THEN websearch_to_tsquery('public.en_unaccent'::regconfig, q_text)
                ELSE websearch_to_tsquery('public.pt_unaccent'::regconfig, q_text)
            END AS q
    ),
    -- Lexical leg: top-k matching chunks ranked by ts_rank_cd.
    bm25 AS (
        SELECT
            c.chunk_pk,
            row_number() OVER (
                ORDER BY
                    ts_rank_cd(
                        CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END,
                        (SELECT ts_q.q FROM ts_q)
                    ) DESC NULLS LAST,
                    c.chunk_pk  -- deterministic order among equal ranks
            )::INT AS r
        FROM public.chunks c
        WHERE (CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END) @@ (SELECT ts_q.q FROM ts_q)
          AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
          AND (q_type IS NULL OR c.type = q_type)
          AND (q_classification IS NULL OR c.classification = q_classification)
          AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
        -- Without an explicit ORDER BY, LIMIT may keep an arbitrary k rows
        -- rather than the k best-ranked ones.
        ORDER BY r
        LIMIT k
    ),
    -- Dense leg: nearest neighbours by cosine distance, within the cutoff.
    -- The ORDER BY stays a bare distance expression (no tie-breaker) so it keeps
    -- the shape pgvector documents for index-assisted nearest-neighbour scans.
    dense AS (
        SELECT
            c.chunk_pk,
            row_number() OVER (ORDER BY c.embedding <=> q_embedding)::INT AS r
        FROM public.chunks c
        WHERE c.embedding IS NOT NULL
          AND (c.embedding <=> q_embedding) < max_dense_dist
          AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
          AND (q_type IS NULL OR c.type = q_type)
          AND (q_classification IS NULL OR c.classification = q_classification)
          AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
        ORDER BY c.embedding <=> q_embedding
        LIMIT k
    ),
    -- Reciprocal Rank Fusion; a chunk absent from one leg is treated as rank k + 1.
    fused AS (
        SELECT
            COALESCE(b.chunk_pk, d.chunk_pk) AS chunk_pk,
            ((1.0::DOUBLE PRECISION / (rrf_k + COALESCE(b.r, k + 1))::DOUBLE PRECISION)
             + (1.0::DOUBLE PRECISION / (rrf_k + COALESCE(d.r, k + 1))::DOUBLE PRECISION)) AS score,
            b.r AS bm25_rank,
            d.r AS dense_rank
        FROM bm25 b
        FULL OUTER JOIN dense d
            ON d.chunk_pk = b.chunk_pk
    )
    SELECT
        c.chunk_pk,
        c.doc_id,
        c.chunk_id,
        c.page,
        c.type,
        c.bbox,
        c.content_en,
        c.content_pt,
        c.classification,
        f.score,
        f.bm25_rank,
        f.dense_rank
    FROM fused f
    INNER JOIN public.chunks c
        ON c.chunk_pk = f.chunk_pk
    ORDER BY
        f.score DESC,
        c.chunk_pk  -- stable output when fused scores tie
    LIMIT k;
END
$$;

-- 9.
-- RLS — chunks/entities are public read; writes via service_role
-- For each table: turn RLS on, then (re)create a single SELECT policy that
-- admits every row. DROP-then-CREATE keeps the script re-runnable. This section
-- defines no INSERT/UPDATE/DELETE policies.

-- documents
ALTER TABLE public.documents ENABLE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS documents_read ON public.documents;
CREATE POLICY documents_read
    ON public.documents
    FOR SELECT
    USING (TRUE);

-- chunks
ALTER TABLE public.chunks ENABLE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS chunks_read ON public.chunks;
CREATE POLICY chunks_read
    ON public.chunks
    FOR SELECT
    USING (TRUE);

-- entities
ALTER TABLE public.entities ENABLE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS entities_read ON public.entities;
CREATE POLICY entities_read
    ON public.entities
    FOR SELECT
    USING (TRUE);

-- entity_mentions
ALTER TABLE public.entity_mentions ENABLE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS entity_mentions_read ON public.entity_mentions;
CREATE POLICY entity_mentions_read
    ON public.entity_mentions
    FOR SELECT
    USING (TRUE);

-- Read access on the tables and execute access on the search RPC for the
-- anon and authenticated roles.
GRANT SELECT
    ON public.documents,
       public.chunks,
       public.entities,
       public.entity_mentions
    TO anon, authenticated;

GRANT EXECUTE
    ON FUNCTION public.hybrid_search_chunks
    TO anon, authenticated;