disclosure-bureau/scripts/maintain/47_mark_unsearchable_chunks.sql

65 lines
1.7 KiB
MySQL
Raw Normal View History

-- 47_mark_unsearchable_chunks.sql
-- Add an `is_searchable` flag to public.chunks and turn it OFF for purely
-- structural fragments that carry no informational content (salutations,
-- page numbers, classification banners, isolated headings, etc).
--
-- These chunks still exist for page reconstruction; they just don't pollute
-- search/retrieval results anymore.
--
-- Idempotent: re-running re-applies the same rules.
-- Everything up to COMMIT runs as one transaction, so readers never observe
-- a half-reclassified table.
BEGIN;

-- Add the flag column. New and existing rows start out searchable (TRUE).
-- IF NOT EXISTS turns this into a no-op when the script is run again.
ALTER TABLE public.chunks
    ADD COLUMN IF NOT EXISTS is_searchable BOOLEAN NOT NULL DEFAULT TRUE;
-- Reclassify every chunk in a single pass, writing only the rows whose flag
-- actually has to change.
--
-- A chunk is "noise" (not searchable) when either rule matches:
--   1. its type is semantic-free formatting / scaffolding, whatever its length;
--   2. its type is a structural fragment AND its text is shorter than
--      50 characters (long-form letterheads & such stay searchable).
-- Text length is measured on content_en, falling back to content_pt, then ''.
--
-- The target state is is_searchable = NOT noise, so the rows that need a
-- write are exactly those where is_searchable currently EQUALS noise.
-- Flipping the flag on just those rows gives the same final state as
-- "reset everything to TRUE, then switch the noise rows off", but a re-run on
-- unchanged data updates zero rows instead of rewriting the whole table.
--
-- COALESCE(..., FALSE): a NULL type makes the IN tests evaluate to NULL; such
-- rows are treated as not-noise and therefore stay / become searchable.
-- The comparison relies on is_searchable being NOT NULL, which is how this
-- script declares the column.
--
-- NOTE(review): COALESCE only skips NULL, so an empty-string content_en hides
-- a populated content_pt when measuring length. Confirm content_en is never ''.
UPDATE public.chunks
SET is_searchable = NOT is_searchable
WHERE is_searchable = COALESCE(
        -- Rule 1: always-noise types
        type IN (
            'page_number',
            'blank',
            'stamp',
            'classification_banner',
            'classification_marking'
        )
        -- Rule 2: noise only when the text is short (< 50 chars)
        OR (
            type IN (
                'salutation',
                'complimentary_close',
                'section_heading',
                'section_header',
                'heading',
                'title',
                'subtitle',
                'date_line',
                'bulleted_item',
                'field_value',
                'field_entry',
                'table_marker',
                'form_field',
                'form_header',
                'routing_block',
                'distribution_list',
                'file_number',
                'marginalia'
            )
            AND LENGTH(COALESCE(content_en, content_pt, '')) < 50
        ),
        FALSE
    );
-- Partial index over chunk_pk that contains only the searchable rows
-- (roughly 83% of the table, per the original note on this script).
-- This statement does NOT shrink any vector / full-text index by itself:
-- it only lets the planner enumerate searchable chunk_pk values cheaply for
-- queries whose WHERE clause includes is_searchable.
-- NOTE(review): if the vector / FTS indexes should skip noise rows too, they
-- need their own "WHERE is_searchable" predicate where they are defined.
CREATE INDEX IF NOT EXISTS chunks_searchable_idx
    ON public.chunks USING btree (chunk_pk)
    WHERE is_searchable;

-- Publish the column, the reclassification and the index atomically.
COMMIT;
-- Diagnostic counters (read-only, runs after the transaction has committed):
-- one row per is_searchable value with the number of chunks and their
-- rounded average text length.
WITH chunk_len AS (
    -- Same length measure as the classification rule:
    -- content_en, else content_pt, else the empty string.
    SELECT
        is_searchable,
        LENGTH(COALESCE(content_en, content_pt, '')) AS text_len
    FROM public.chunks
)
SELECT
    is_searchable,
    COUNT(*) AS n,
    ROUND(AVG(text_len)) AS avg_len
FROM chunk_len
GROUP BY is_searchable
ORDER BY is_searchable;