253 lines
9.6 KiB
PL/PgSQL
253 lines
9.6 KiB
PL/PgSQL
-- The Disclosure Bureau — chunks retrieval schema (v0.2.0)
|
|
-- Enables hybrid retrieval (BM25 + dense embeddings + reranker) over the
|
|
-- agentic chunks produced by scripts/28-batch-rebuild-all.py.
|
|
--
|
|
-- Safe to re-run. Apply via Supabase Studio SQL editor OR psql.
|
|
|
|
-- 1. Required extensions (original note: the Supabase image ships pgvector and pg_trgm).
--    unaccent : dictionary referenced by the pt_unaccent / en_unaccent text search configs
--    pg_trgm  : provides gin_trgm_ops, used by the trigram GIN indexes on chunk content
--    vector   : provides the vector(1024) type and the hnsw access method
-- IF NOT EXISTS keeps each statement a no-op on re-run.
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS vector;
|
|
|
|
-- 2. Accent-insensitive text search configurations (PT-BR + EN).
--    Each configuration is a copy of the built-in language configuration whose
--    word-like token types (hword, hword_part, word) are routed through the
--    `unaccent` dictionary before the language stemmer.
--
--    The existence checks are scoped to the `public` schema on purpose:
--    pg_ts_config.cfgname is only unique per namespace, so a configuration with
--    the same name in another schema must not stop public.* from being created
--    (the generated tsvector columns on public.chunks reference public.*_unaccent).
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_catalog.pg_ts_config
        WHERE cfgname = 'pt_unaccent'
          AND cfgnamespace = 'public'::regnamespace
    ) THEN
        CREATE TEXT SEARCH CONFIGURATION public.pt_unaccent ( COPY = pg_catalog.portuguese );
        ALTER TEXT SEARCH CONFIGURATION public.pt_unaccent
            ALTER MAPPING FOR hword, hword_part, word
            WITH unaccent, portuguese_stem;
    END IF;

    IF NOT EXISTS (
        SELECT 1
        FROM pg_catalog.pg_ts_config
        WHERE cfgname = 'en_unaccent'
          AND cfgnamespace = 'public'::regnamespace
    ) THEN
        CREATE TEXT SEARCH CONFIGURATION public.en_unaccent ( COPY = pg_catalog.english );
        ALTER TEXT SEARCH CONFIGURATION public.en_unaccent
            ALTER MAPPING FOR hword, hword_part, word
            WITH unaccent, english_stem;
    END IF;
END
$$;
|
|
|
|
-- 3. documents: one row per document
--    (mirrors wiki/documents/<doc-id>.md frontmatter highlights).
--    doc_id is the primary key; chunks reference it.
CREATE TABLE IF NOT EXISTS public.documents (
    doc_id           TEXT         NOT NULL,
    canonical_title  TEXT,
    collection       TEXT,
    document_class   TEXT,
    page_count       INTEGER,
    classification   TEXT,
    content_class    TEXT[],
    schema_version   TEXT         NOT NULL DEFAULT '0.2.0',
    build_approach   TEXT,
    build_model      TEXT,
    built_at         TIMESTAMPTZ,
    ingested_at      TIMESTAMPTZ  NOT NULL DEFAULT now(),  -- set at insert time
    raw_path         TEXT,
    -- Same name PostgreSQL assigns to an inline PRIMARY KEY on this table.
    CONSTRAINT documents_pkey PRIMARY KEY (doc_id)
);
|
|
|
|
-- Secondary indexes on documents: newest-first by build time, and by collection.
CREATE INDEX IF NOT EXISTS documents_built_at_idx
    ON public.documents USING btree (built_at DESC);
CREATE INDEX IF NOT EXISTS documents_collection_idx
    ON public.documents USING btree (collection);
|
|
|
|
-- 4. chunks: the retrieval unit, one row per chunk file (raw/<doc>/chunks/c*.md).
--    embedding is vector(1024); 1024 dims = BGE-M3 dense.
--    ts_en / ts_pt are stored generated columns, so the tsvectors are recomputed
--    by PostgreSQL whenever content_en / content_pt change.
CREATE TABLE IF NOT EXISTS public.chunks (
    -- identity and position within the document
    chunk_pk              BIGSERIAL,
    doc_id                TEXT     NOT NULL,
    chunk_id              TEXT     NOT NULL,
    page                  INTEGER  NOT NULL,
    order_in_page         INTEGER  NOT NULL,
    order_global          INTEGER  NOT NULL,
    type                  TEXT     NOT NULL,
    bbox                  JSONB,

    -- bilingual content and per-chunk metadata
    content_en            TEXT,
    content_pt            TEXT,
    ocr_confidence        REAL,
    classification        TEXT,
    formatting            TEXT[],
    cross_page_hint       TEXT,
    prev_chunk            TEXT,
    next_chunk            TEXT,
    related_image         TEXT,
    related_table         TEXT,
    redaction_code        TEXT,
    redaction_inferred    TEXT,
    image_type            TEXT,

    -- anomaly flags (default FALSE) with optional type and rationale
    ufo_anomaly           BOOLEAN  NOT NULL DEFAULT FALSE,
    ufo_anomaly_type      TEXT,
    ufo_rationale         TEXT,
    cryptid_anomaly       BOOLEAN  NOT NULL DEFAULT FALSE,
    cryptid_anomaly_type  TEXT,
    cryptid_rationale     TEXT,

    image_desc_en         TEXT,
    image_desc_pt         TEXT,
    source_png            TEXT,

    -- retrieval columns
    embedding             vector(1024),
    ts_en                 tsvector GENERATED ALWAYS AS (
        to_tsvector('public.en_unaccent'::regconfig, COALESCE(content_en, ''))
    ) STORED,
    ts_pt                 tsvector GENERATED ALWAYS AS (
        to_tsvector('public.pt_unaccent'::regconfig, COALESCE(content_pt, ''))
    ) STORED,

    -- Constraint names below are the ones PostgreSQL assigns to the inline forms.
    CONSTRAINT chunks_pkey PRIMARY KEY (chunk_pk),
    CONSTRAINT chunks_doc_id_fkey FOREIGN KEY (doc_id)
        REFERENCES public.documents (doc_id) ON DELETE CASCADE,
    CONSTRAINT chunks_doc_chunk_uk UNIQUE (doc_id, chunk_id)
);
|
|
|
|
-- 5. Indexes on chunks.

-- B-tree: positional lookup inside a document, plus the filter columns.
CREATE INDEX IF NOT EXISTS chunks_doc_id_page_idx
    ON public.chunks USING btree (doc_id, page, order_in_page);
CREATE INDEX IF NOT EXISTS chunks_type_idx
    ON public.chunks USING btree (type);

-- Partial indexes: only rows that can match the corresponding filter are indexed.
CREATE INDEX IF NOT EXISTS chunks_classification_idx
    ON public.chunks USING btree (classification)
    WHERE classification IS NOT NULL;
CREATE INDEX IF NOT EXISTS chunks_ufo_idx
    ON public.chunks USING btree (ufo_anomaly)
    WHERE ufo_anomaly = TRUE;
CREATE INDEX IF NOT EXISTS chunks_cryptid_idx
    ON public.chunks USING btree (cryptid_anomaly)
    WHERE cryptid_anomaly = TRUE;

-- Full-text search: GIN over the generated tsvector columns.
CREATE INDEX IF NOT EXISTS chunks_ts_en_idx
    ON public.chunks USING GIN (ts_en);
CREATE INDEX IF NOT EXISTS chunks_ts_pt_idx
    ON public.chunks USING GIN (ts_pt);

-- Dense retrieval: HNSW with cosine distance. No WITH (...) options are given,
-- so the index is built with m=16, ef_construction=64 (defaults; tune later).
CREATE INDEX IF NOT EXISTS chunks_embedding_hnsw_idx
    ON public.chunks USING hnsw (embedding vector_cosine_ops);

-- Trigram GIN indexes on the raw content, for fuzzy ILIKE matching.
CREATE INDEX IF NOT EXISTS chunks_content_en_trgm_idx
    ON public.chunks USING GIN (content_en gin_trgm_ops);
CREATE INDEX IF NOT EXISTS chunks_content_pt_trgm_idx
    ON public.chunks USING GIN (content_pt gin_trgm_ops);
|
|
|
|
-- 6. entities: flattened from wiki/entities/<class>/<id>.md (post-lint).
--    An entity is identified by (entity_class, entity_id); entity_pk is the
--    surrogate key that entity_mentions references.
CREATE TABLE IF NOT EXISTS public.entities (
    entity_pk          BIGSERIAL,
    entity_class       TEXT     NOT NULL,
    entity_id          TEXT     NOT NULL,
    canonical_name     TEXT     NOT NULL,
    aliases            TEXT[],
    embedding          vector(1024),
    total_mentions     INTEGER  NOT NULL DEFAULT 0,
    documents_count    INTEGER  NOT NULL DEFAULT 0,
    enrichment_status  TEXT,
    last_ingest        TIMESTAMPTZ,
    -- Same name PostgreSQL assigns to an inline PRIMARY KEY on this table.
    CONSTRAINT entities_pkey PRIMARY KEY (entity_pk),
    CONSTRAINT entities_uk UNIQUE (entity_class, entity_id)
);
|
|
|
|
-- Indexes on entities: exact-name lookup, array operators over aliases (GIN),
-- and cosine-distance nearest-neighbour search over the embedding (HNSW).
CREATE INDEX IF NOT EXISTS entities_canonical_name_idx
    ON public.entities USING btree (canonical_name);
CREATE INDEX IF NOT EXISTS entities_aliases_idx
    ON public.entities USING GIN (aliases);
CREATE INDEX IF NOT EXISTS entities_embedding_hnsw_idx
    ON public.entities USING hnsw (embedding vector_cosine_ops);
|
|
|
|
-- 7. entity_mentions: link table chunk <-> entity (materialized from lint).
--    At most one row per (chunk, entity) pair; rows are removed automatically
--    when either the chunk or the entity is deleted.
CREATE TABLE IF NOT EXISTS public.entity_mentions (
    mention_pk    BIGSERIAL,
    chunk_pk      BIGINT  NOT NULL,
    entity_pk     BIGINT  NOT NULL,
    surface_form  TEXT,
    -- PK / FK names below are the ones PostgreSQL assigns to the inline forms.
    CONSTRAINT entity_mentions_pkey PRIMARY KEY (mention_pk),
    CONSTRAINT entity_mentions_chunk_pk_fkey FOREIGN KEY (chunk_pk)
        REFERENCES public.chunks (chunk_pk) ON DELETE CASCADE,
    CONSTRAINT entity_mentions_entity_pk_fkey FOREIGN KEY (entity_pk)
        REFERENCES public.entities (entity_pk) ON DELETE CASCADE,
    CONSTRAINT entity_mentions_uk UNIQUE (chunk_pk, entity_pk)
);
|
|
|
|
-- Lookup indexes for both directions of the chunk <-> entity link.
-- NOTE(review): entity_mentions_uk UNIQUE (chunk_pk, entity_pk) already provides
-- an index led by chunk_pk, so entity_mentions_chunk_idx looks redundant; kept
-- here so the schema stays unchanged.
CREATE INDEX IF NOT EXISTS entity_mentions_entity_idx
    ON public.entity_mentions USING btree (entity_pk);
CREATE INDEX IF NOT EXISTS entity_mentions_chunk_idx
    ON public.entity_mentions USING btree (chunk_pk);
|
|
|
|
-- 8. Hybrid search RPC: lexical + dense retrieval fused server-side with
--    Reciprocal Rank Fusion (RRF).
--
--    The lexical leg ranks with ts_rank_cd over the generated tsvector columns;
--    its rank is exposed as bm25_rank (name kept for callers), although
--    ts_rank_cd is PostgreSQL's cover-density ranking rather than true BM25.
--
-- Parameters:
--   q_text            Query text, parsed with websearch_to_tsquery.
--   q_embedding       Query vector; compared to chunks.embedding with <=>.
--                     NULL disables the dense leg (lexical-only search).
--   q_lang            'en' searches ts_en with en_unaccent; any other value
--                     (default 'pt') searches ts_pt with pt_unaccent.
--   q_doc_id          Optional equality filter on doc_id (NULL = no filter).
--   q_type            Optional equality filter on type (NULL = no filter).
--   q_classification  Optional equality filter on classification (NULL = no filter).
--   q_ufo_only        When TRUE, only chunks with ufo_anomaly = TRUE are considered.
--   k                 Size of each candidate list and of the final result.
--   rrf_k             RRF constant added to every rank in the denominator.
--
-- Returns: up to k chunks ordered by fused score (highest first). bm25_rank /
--   dense_rank are NULL when the chunk was not in that leg's top k; such a
--   missing rank is scored as rank k + 1.
CREATE OR REPLACE FUNCTION public.hybrid_search_chunks(
    q_text            TEXT,
    q_embedding       vector(1024),
    q_lang            TEXT    DEFAULT 'pt',  -- 'pt' | 'en'
    q_doc_id          TEXT    DEFAULT NULL,
    q_type            TEXT    DEFAULT NULL,
    q_classification  TEXT    DEFAULT NULL,
    q_ufo_only        BOOLEAN DEFAULT FALSE,
    k                 INT     DEFAULT 100,
    rrf_k             INT     DEFAULT 60
)
RETURNS TABLE (
    chunk_pk        BIGINT,
    doc_id          TEXT,
    chunk_id        TEXT,
    page            INT,
    type            TEXT,
    bbox            JSONB,
    content_en      TEXT,
    content_pt      TEXT,
    classification  TEXT,
    score           DOUBLE PRECISION,
    bm25_rank       INT,
    dense_rank      INT
)
LANGUAGE plpgsql STABLE AS $$
BEGIN
    -- Every column reference below is table-qualified: the RETURNS TABLE names
    -- are also PL/pgSQL variables, and a bare reference would be ambiguous.
    RETURN QUERY
    WITH
    -- Parse the query text once, with the configuration matching q_lang.
    ts_q AS (
        SELECT CASE WHEN q_lang = 'en'
                    THEN websearch_to_tsquery('public.en_unaccent'::regconfig, q_text)
                    ELSE websearch_to_tsquery('public.pt_unaccent'::regconfig, q_text)
               END AS q
    ),
    -- Lexical leg: top-k matching chunks by ts_rank_cd.
    bm25 AS (
        SELECT c.chunk_pk,
               row_number() OVER (
                   ORDER BY
                       ts_rank_cd(
                           CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END,
                           (SELECT q FROM ts_q)
                       ) DESC NULLS LAST,
                       c.chunk_pk          -- deterministic order among equal ranks
               )::INT AS r
        FROM public.chunks c
        WHERE (CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END) @@ (SELECT q FROM ts_q)
          AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
          AND (q_type IS NULL OR c.type = q_type)
          AND (q_classification IS NULL OR c.classification = q_classification)
          AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
        -- Without this ORDER BY, LIMIT may keep an arbitrary k rows instead of
        -- the k best-ranked ones.
        ORDER BY r
        LIMIT k
    ),
    -- Dense leg: top-k nearest chunks by distance to q_embedding. The ORDER BY is
    -- kept to the bare distance expression (no tie-breaker) so it can still be
    -- served by the HNSW index.
    dense AS (
        SELECT c.chunk_pk,
               row_number() OVER (ORDER BY c.embedding <=> q_embedding)::INT AS r
        FROM public.chunks c
        WHERE q_embedding IS NOT NULL   -- NULL vector: all distances NULL, ranks meaningless
          AND c.embedding IS NOT NULL
          AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
          AND (q_type IS NULL OR c.type = q_type)
          AND (q_classification IS NULL OR c.classification = q_classification)
          AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
        ORDER BY c.embedding <=> q_embedding
        LIMIT k
    ),
    -- RRF: score = 1/(rrf_k + lexical rank) + 1/(rrf_k + dense rank);
    -- a chunk absent from one leg is scored there as rank k + 1.
    fused AS (
        SELECT COALESCE(b.chunk_pk, d.chunk_pk) AS chunk_pk,
               ((1.0::DOUBLE PRECISION / (rrf_k + COALESCE(b.r, k + 1))::DOUBLE PRECISION) +
                (1.0::DOUBLE PRECISION / (rrf_k + COALESCE(d.r, k + 1))::DOUBLE PRECISION)) AS score,
               b.r AS bm25_rank,
               d.r AS dense_rank
        FROM bm25 b
        FULL OUTER JOIN dense d ON d.chunk_pk = b.chunk_pk
    )
    SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox,
           c.content_en, c.content_pt, c.classification,
           f.score, f.bm25_rank, f.dense_rank
    FROM fused f
    JOIN public.chunks c ON c.chunk_pk = f.chunk_pk
    ORDER BY f.score DESC,
             c.chunk_pk             -- stable cut-off when scores tie at the limit
    LIMIT k;
END
$$;
|
|
|
|
-- 9. Row-level security: every table is readable by anyone.
--    Only SELECT policies are defined here, so with RLS enabled no other
--    operation is allowed through a policy (original note: writes go via
--    service_role). Each policy is dropped and recreated so the script can be
--    re-run. AS PERMISSIVE and TO public are the CREATE POLICY defaults,
--    written out explicitly.

-- documents
ALTER TABLE public.documents ENABLE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS documents_read ON public.documents;
CREATE POLICY documents_read ON public.documents
    AS PERMISSIVE FOR SELECT TO public
    USING (TRUE);

-- chunks
ALTER TABLE public.chunks ENABLE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS chunks_read ON public.chunks;
CREATE POLICY chunks_read ON public.chunks
    AS PERMISSIVE FOR SELECT TO public
    USING (TRUE);

-- entities
ALTER TABLE public.entities ENABLE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS entities_read ON public.entities;
CREATE POLICY entities_read ON public.entities
    AS PERMISSIVE FOR SELECT TO public
    USING (TRUE);

-- entity_mentions
ALTER TABLE public.entity_mentions ENABLE ROW LEVEL SECURITY;
DROP POLICY IF EXISTS entity_mentions_read ON public.entity_mentions;
CREATE POLICY entity_mentions_read ON public.entity_mentions
    AS PERMISSIVE FOR SELECT TO public
    USING (TRUE);

-- Table read access and RPC execution for the anon and authenticated roles.
GRANT SELECT
    ON public.documents, public.chunks, public.entities, public.entity_mentions
    TO anon, authenticated;
GRANT EXECUTE
    ON FUNCTION public.hybrid_search_chunks
    TO anon, authenticated;
|