-- 47_mark_unsearchable_chunks.sql
--
-- Add an `is_searchable` flag to public.chunks and turn it OFF for purely
-- structural fragments that carry no informational content (salutations,
-- page numbers, classification banners, isolated headings, etc.).
--
-- These chunks still exist for page reconstruction; they just no longer
-- pollute search/retrieval results.
--
-- Idempotent: re-running re-applies the same rules, and rows whose flag is
-- already correct are left untouched.

BEGIN;

ALTER TABLE public.chunks
    ADD COLUMN IF NOT EXISTS is_searchable BOOLEAN NOT NULL DEFAULT TRUE;

-- Reclassify the whole table in a single pass.
--
-- The COALESCE(...) expression below is the "is noise" rule. A row's flag is
-- wrong exactly when is_searchable equals "is noise" (the two must be
-- opposites), so only those rows are selected and their flag is flipped.
-- This yields the same final state as "reset everything to TRUE, then clear
-- the noise rows", but writes each row at most once and writes nothing at all
-- when the table is already classified.
--
-- COALESCE(..., FALSE): a NULL `type` makes the IN tests yield NULL; such rows
-- match no rule and therefore stay searchable.
--
-- NOTE(review): the flip relies on is_searchable never being NULL, which the
-- ALTER TABLE above guarantees when it creates the column. If the column
-- already existed as nullable, rows holding NULL would be skipped — confirm.
--
-- NOTE(review): the length rule measures content_en and falls back to
-- content_pt only when content_en IS NULL; an empty-string content_en would
-- hide a populated content_pt. Presumably absent text is stored as NULL —
-- verify against the loader.
UPDATE public.chunks
SET is_searchable = NOT is_searchable
WHERE is_searchable = COALESCE(
    -- Always-noise types (semantic-free formatting / scaffolding)
    type IN (
        'page_number',
        'blank',
        'stamp',
        'classification_banner',
        'classification_marking'
    )
    -- Noise only when the text is short (< 50 chars); long-form letterheads
    -- and the like stay searchable
    OR (
        type IN (
            'salutation',
            'complimentary_close',
            'section_heading',
            'section_header',
            'heading',
            'title',
            'subtitle',
            'date_line',
            'bulleted_item',
            'field_value',
            'field_entry',
            'table_marker',
            'form_field',
            'form_header',
            'routing_block',
            'distribution_list',
            'file_number',
            'marginalia'
        )
        AND LENGTH(COALESCE(content_en, content_pt, '')) < 50
    ),
    FALSE
);

-- Partial index on chunk_pk covering only the searchable rows (about 83% of
-- the table when this migration was written). It does not by itself restrict
-- any vector / full-text index; those need their own `WHERE is_searchable`
-- predicate to exclude noise rows.
CREATE INDEX IF NOT EXISTS chunks_searchable_idx
    ON public.chunks (chunk_pk)
    WHERE is_searchable;

COMMIT;

-- Diagnostic counters: row count and average text length per flag value.
SELECT
    is_searchable,
    COUNT(*) AS n,
    ROUND(AVG(LENGTH(COALESCE(content_en, content_pt, '')))) AS avg_len
FROM public.chunks
GROUP BY is_searchable
ORDER BY is_searchable;