disclosure-bureau/scripts/maintain/47_mark_unsearchable_chunks.sql
Luiz Gustavo a7e9dce6d2 rebuild entity layer from Sonnet-vision reextract pipeline
Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity
JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page
extraction. Add synthesize scripts to regenerate wiki/entities from the 116
_reextract.json (30), aggregate missing page.md from chunks (31), and reprocess
805 pages the doc-rebuilder agent dropped on context overflow (32). Add
maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and
typed relation extraction.

Web: wire relations API + entity-relations component; entity/timeline/doc
pages consume the rebuilt layer.

Note: raw/, processing/, wiki/ remain gitignored (bulk data managed
separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on
disk only. The 27 curated anchor events under wiki/entities/events/ are
preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00

64 lines
1.7 KiB
PL/PgSQL

-- 47_mark_unsearchable_chunks.sql
-- Add an `is_searchable` flag to public.chunks and turn it OFF for purely
-- structural fragments that carry no informational content (salutations,
-- page numbers, classification banners, isolated headings, etc).
--
-- These chunks still exist for page reconstruction; they just don't pollute
-- search/retrieval results anymore.
--
-- Idempotent: re-running re-applies the same rules.
BEGIN;
ALTER TABLE public.chunks
ADD COLUMN IF NOT EXISTS is_searchable BOOLEAN NOT NULL DEFAULT TRUE;
-- Reset all to true first so reclassification is clean
UPDATE public.chunks SET is_searchable = TRUE;
-- Always-noise types (semantic-free formatting / scaffolding)
UPDATE public.chunks SET is_searchable = FALSE
WHERE type IN (
'page_number',
'blank',
'stamp',
'classification_banner',
'classification_marking'
);
-- Noise when text is short (< 50 chars) — long form letterheads & such stay searchable
UPDATE public.chunks SET is_searchable = FALSE
WHERE type IN (
'salutation',
'complimentary_close',
'section_heading',
'section_header',
'heading',
'title',
'subtitle',
'date_line',
'bulleted_item',
'field_value',
'field_entry',
'table_marker',
'form_field',
'form_header',
'routing_block',
'distribution_list',
'file_number',
'marginalia'
)
AND LENGTH(COALESCE(content_en, content_pt, '')) < 50;
-- Partial index: only the searchable ~83% of rows are indexed in vector / fts
CREATE INDEX IF NOT EXISTS chunks_searchable_idx
ON public.chunks (chunk_pk) WHERE is_searchable;
COMMIT;
-- Diagnostic counters
SELECT
is_searchable,
COUNT(*) AS n,
ROUND(AVG(LENGTH(COALESCE(content_en, content_pt, '')))) AS avg_len
FROM public.chunks GROUP BY is_searchable ORDER BY is_searchable;