#!/usr/bin/env bash
# Full dedup pipeline:
#   1. Layer 1 (deterministic merges)
#   2. Layer 2 (fuzzy trigram with numeric + code-suffix guards)
#   3. DB remap (entity_mentions → canonical entity_pk)
#   4. Re-sync signal_strength + total_mentions
#   5. Re-run text backfill
#
# The script changes into /Users/guto/ufo itself and sources
# infra/disclosure-stack/.env from there. After sourcing, POSTGRES_PASSWORD
# must be set; VPS_PASSWORD, VPS_PORT, VPS_USER and VPS_HOST are only needed
# when the SSH tunnel has to be (re)opened. DATABASE_URL is derived and
# exported here, pointing at the tunnel's local port.
set -euo pipefail

readonly REPO_DIR=/Users/guto/ufo
readonly TUNNEL_PORT=5433

#######################################
# Report whether something is listening on the local tunnel port.
# Globals:   TUNNEL_PORT (read)
# Returns:   0 if a listener on TUNNEL_PORT is found, non-zero otherwise
#            (including when `ss` is unavailable).
#######################################
tunnel_is_listening() {
  local listeners
  # Capture the listing first rather than piping into `grep -q`: under
  # pipefail an early grep exit can SIGPIPE the producer and make the whole
  # pipeline look like "no listener".
  listeners=$(ss -ltn 2>/dev/null || true)
  # Anchor on ":<port>" followed by whitespace/EOL so that e.g. 15433 or
  # 54330 do not count as a match.
  grep -Eq ":${TUNNEL_PORT}([[:space:]]|\$)" <<<"$listeners"
}

#######################################
# Open the SSH port-forward (local TUNNEL_PORT -> 172.27.0.2:5432 via the
# VPS) unless a listener is already present.
# Globals:   TUNNEL_PORT, VPS_PASSWORD, VPS_PORT, VPS_USER, VPS_HOST (read)
# Returns:   0 on success; aborts the script (set -e) if ssh fails.
#######################################
ensure_tunnel() {
  if tunnel_is_listening; then
    return 0
  fi

  # Only required on this path, so validate them here with clear messages.
  : "${VPS_PASSWORD:?VPS_PASSWORD must be set to open the SSH tunnel}"
  : "${VPS_PORT:?VPS_PORT must be set to open the SSH tunnel}"
  : "${VPS_USER:?VPS_USER must be set to open the SSH tunnel}"
  : "${VPS_HOST:?VPS_HOST must be set to open the SSH tunnel}"

  # Best-effort cleanup of a stale forwarder; pkill exits non-zero when
  # nothing matched, which is not an error here.
  pkill -f "ssh.*${TUNNEL_PORT}:172" 2>/dev/null || true
  sleep 1

  # Hand the password over via the SSHPASS environment variable (-e) instead
  # of -p, so it does not appear in the process argument list.
  SSHPASS="$VPS_PASSWORD" sshpass -e ssh -o StrictHostKeyChecking=accept-new \
    -p "$VPS_PORT" -fN -L "${TUNNEL_PORT}:172.27.0.2:5432" \
    "${VPS_USER}@${VPS_HOST}"
  # Give the backgrounded ssh a moment before the first DB connection.
  sleep 2
}

#######################################
# Run one pipeline step: print its banner, run the Python script with
# stderr merged into stdout, and show only the last lines of its output.
# Arguments: $1 - banner line to print
#            $2 - number of trailing output lines to keep
#            $3.. - script path and arguments passed to python3
# Returns:   non-zero (aborting the script under set -e/pipefail) if the
#            Python script fails.
#######################################
run_step() {
  local banner=$1
  local keep=$2
  shift 2

  echo "$banner"
  python3 "$@" 2>&1 | tail -n "$keep"
  echo ""
}

#######################################
# Print active / archived / total counts of *.md files under wiki/entities,
# relative to the current directory.
#######################################
print_entity_counts() {
  echo "=== Done. Final entity counts ==="
  python3 -c "
from pathlib import Path
p = Path('wiki/entities')
active = sum(1 for f in p.rglob('*.md') if '_archived' not in f.parts)
archived = sum(1 for f in (p / '_archived').rglob('*.md')) if (p / '_archived').exists() else 0
print(f' active: {active}')
print(f' archived: {archived}')
print(f' total: {active + archived}')
"
}

main() {
  cd "$REPO_DIR"
  # shellcheck disable=SC1091 -- .env lives outside the linted tree
  source "$REPO_DIR/infra/disclosure-stack/.env"
  export DATABASE_URL="postgres://postgres:${POSTGRES_PASSWORD}@localhost:${TUNNEL_PORT}/postgres"

  # Ensure tunnel
  ensure_tunnel

  run_step "=== [1/5] Layer 1 dedup (deterministic) — already applied; re-running idempotent" 5 \
    scripts/maintain/49_dedup_aggressive.py
  run_step "=== [2/5] Layer 2 dedup (fuzzy trigram) ===" 8 \
    scripts/maintain/50_dedup_fuzzy_trigram.py
  run_step "=== [3/5] Remap entity_mentions in DB ===" 10 \
    scripts/maintain/51_remap_entity_mentions.py
  run_step "=== [4/5] Resync signal_strength ===" 10 \
    scripts/maintain/42_sync_entity_stats.py --fix-obj-names
  run_step "=== [5/5] Re-run text backfill on the new canonical set ===" 8 \
    scripts/maintain/46_text_backfill_mentions.py

  print_entity_counts
}

main "$@"