Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
64 lines
1.7 KiB
PL/PgSQL
64 lines
1.7 KiB
PL/PgSQL
-- 47_mark_unsearchable_chunks.sql
-- Add an `is_searchable` flag to public.chunks and turn it OFF for purely
-- structural fragments that carry no informational content (salutations,
-- page numbers, classification banners, isolated headings, etc).
--
-- These chunks still exist for page reconstruction; they just don't pollute
-- search/retrieval results anymore.
--
-- Idempotent: re-running re-applies the same rules.

BEGIN;

-- Add the flag. Existing rows pick up the default (TRUE) when the column is
-- first created; IF NOT EXISTS makes this a no-op on later runs.
ALTER TABLE public.chunks
    ADD COLUMN IF NOT EXISTS is_searchable BOOLEAN NOT NULL DEFAULT TRUE;

-- Reclassify in a single pass, writing only the rows whose flag is wrong.
--
-- The parenthesised expression below answers "is this chunk noise?".
-- The flag must always be the opposite of that answer, so any row where
-- is_searchable = is_noise is misclassified and gets flipped. Rows that are
-- already correct are not touched, which means a re-run with unchanged data
-- updates zero rows (no full-table rewrite, no avoidable bloat or WAL).
--
-- `IS TRUE` folds an UNKNOWN result (e.g. NULL type) into "not noise", so
-- such rows stay searchable and the NOT NULL column never receives a NULL.
--
-- NOTE(review): COALESCE only falls through on NULL. If content_en can be an
-- empty string while content_pt holds real text, the measured length is 0
-- and the chunk is treated as short. Confirm whether '' occurs in content_en.
UPDATE public.chunks
SET is_searchable = NOT is_searchable
WHERE is_searchable = (
    (
        -- Always-noise types (semantic-free formatting / scaffolding)
        type IN (
            'page_number',
            'blank',
            'stamp',
            'classification_banner',
            'classification_marking'
        )
        -- Noise only when the text is short (< 50 chars); long-form
        -- letterheads and similar stay searchable
        OR (
            type IN (
                'salutation',
                'complimentary_close',
                'section_heading',
                'section_header',
                'heading',
                'title',
                'subtitle',
                'date_line',
                'bulleted_item',
                'field_value',
                'field_entry',
                'table_marker',
                'form_field',
                'form_header',
                'routing_block',
                'distribution_list',
                'file_number',
                'marginalia'
            )
            AND LENGTH(COALESCE(content_en, content_pt, '')) < 50
        )
    ) IS TRUE
);

-- Partial index on chunk_pk that contains only searchable rows, for queries
-- that filter on is_searchable.
-- NOTE(review): this statement does not change any vector / full-text index.
-- If those are meant to skip unsearchable chunks, they need their own
-- `WHERE is_searchable` predicate.
CREATE INDEX IF NOT EXISTS chunks_searchable_idx
    ON public.chunks (chunk_pk)
    WHERE is_searchable;

COMMIT;

-- Diagnostic counters: one row per flag value with the number of chunks and
-- the rounded mean text length (content_en, falling back to content_pt).
WITH chunk_len AS (
    SELECT
        is_searchable,
        LENGTH(COALESCE(content_en, content_pt, '')) AS text_len
    FROM public.chunks
)
SELECT
    is_searchable,
    COUNT(*) AS n,
    ROUND(AVG(text_len)) AS avg_len
FROM chunk_len
GROUP BY is_searchable
ORDER BY is_searchable;