-- 47_mark_unsearchable_chunks.sql
-- Add an `is_searchable` flag to public.chunks and turn it OFF for purely
-- structural fragments that carry no informational content (salutations,
-- page numbers, classification banners, isolated headings, etc.).
--
-- These chunks still exist for page reconstruction; they just don't pollute
-- search/retrieval results anymore.
--
-- Idempotent: re-running re-applies the same rules.
|
||
|
|
|
||
|
|
-- Everything up to the matching COMMIT runs as a single transaction.
BEGIN;

-- Add the flag column if it is not there yet. It is declared NOT NULL with a
-- default of TRUE, so a chunk is searchable unless a later rule says otherwise.
ALTER TABLE public.chunks
    ADD COLUMN IF NOT EXISTS is_searchable BOOLEAN DEFAULT TRUE NOT NULL;
|
||
|
|
|
||
|
|
-- Reclassify every chunk in a single pass.
--
-- A chunk is "noise" (is_searchable must be FALSE) when either:
--   1. its type is always semantic-free formatting / scaffolding, or
--   2. its type is only noise when the text is short (< 50 chars); long-form
--      letterheads and similar stay searchable.
-- Every other chunk (including one whose type is NULL, which matches neither
-- list) must be searchable.
--
-- Instead of resetting the whole table to TRUE and then switching noise rows
-- back off, only rows whose current flag disagrees with the rules are touched:
-- the flag is wrong exactly when is_searchable equals the "is noise" verdict,
-- and flipping it makes it right. Re-running when nothing changed therefore
-- updates zero rows, while the end state is the same as a full reset + reapply.
--
-- NOTE(review): COALESCE picks content_en whenever it is non-NULL, even if it
-- is an empty string, so a chunk with content_en = '' and a long content_pt is
-- measured as length 0. Behavior kept as-is; confirm whether that is intended.
UPDATE public.chunks
SET is_searchable = NOT is_searchable
WHERE is_searchable = (
    CASE
        -- Always-noise types
        WHEN type IN (
            'page_number',
            'blank',
            'stamp',
            'classification_banner',
            'classification_marking'
        ) THEN TRUE
        -- Noise only when the text is short
        WHEN type IN (
            'salutation',
            'complimentary_close',
            'section_heading',
            'section_header',
            'heading',
            'title',
            'subtitle',
            'date_line',
            'bulleted_item',
            'field_value',
            'field_entry',
            'table_marker',
            'form_field',
            'form_header',
            'routing_block',
            'distribution_list',
            'file_number',
            'marginalia'
        ) AND LENGTH(COALESCE(content_en, content_pt, '')) < 50 THEN TRUE
        -- Anything else, including NULL type, is real content
        ELSE FALSE
    END
);
|
||
|
|
|
||
|
|
-- Partial index on chunk_pk that contains only rows flagged as searchable.
-- NOTE(review): this statement indexes chunk_pk only; whether the vector / FTS
-- indexes are restricted to searchable rows is not visible here — confirm.
CREATE INDEX IF NOT EXISTS chunks_searchable_idx
    ON public.chunks USING btree (chunk_pk)
    WHERE is_searchable;

-- Close the transaction opened at the top of the script.
COMMIT;
|
||
|
|
|
||
|
|
-- Diagnostic counters: for each value of is_searchable, report how many chunks
-- carry it and the rounded mean length of their text (content_en if present,
-- otherwise content_pt, otherwise the empty string).
WITH chunk_lengths AS (
    SELECT
        is_searchable,
        LENGTH(COALESCE(content_en, content_pt, '')) AS text_len
    FROM public.chunks
)
SELECT
    is_searchable,
    COUNT(*) AS n,
    ROUND(AVG(text_len)) AS avg_len
FROM chunk_lengths
GROUP BY is_searchable
ORDER BY is_searchable;
|