#!/usr/bin/env bash
#
# Full dedup pipeline:
#   1. Layer 1 (deterministic merges)
#   2. Layer 2 (fuzzy trigram with numeric + code-suffix guards)
#   3. DB remap (entity_mentions → canonical entity_pk)
#   4. Re-sync signal_strength + total_mentions
#   5. Re-run text backfill
#
# Usage: run with no arguments; the script changes into REPO_ROOT itself.
#
# Requires infra/disclosure-stack/.env (under REPO_ROOT) to define:
#   POSTGRES_PASSWORD                              - used to build DATABASE_URL
#   VPS_PASSWORD, VPS_PORT, VPS_USER, VPS_HOST     - only read when the SSH
#                                                    tunnel has to be opened
# DATABASE_URL is built and exported here; any value already present in the
# environment is overwritten.
#
# External tools: python3, ssh, sshpass, pkill.

set -euo pipefail

readonly REPO_ROOT=/Users/guto/ufo
readonly ENV_FILE="${REPO_ROOT}/infra/disclosure-stack/.env"
# Local end of the SSH port-forward; DATABASE_URL points at this port.
readonly TUNNEL_PORT=5433
# Address the forward targets on the remote side of the tunnel.
readonly REMOTE_DB_ADDR=172.27.0.2:5432
# Upper bound, in one-second polls, on waiting for a fresh tunnel.
readonly TUNNEL_WAIT_TRIES=10

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Succeed when something accepts TCP connections on 127.0.0.1:TUNNEL_PORT.
# Uses Bash's built-in /dev/tcp redirection rather than `ss` (Linux-only) so
# the check also works on hosts without iproute2, and so only the exact port
# matches (a `grep 5433` over `ss` output also matches e.g. 15433).
port_is_open() {
  # Subshell: fd 3 is closed automatically as soon as the probe finishes.
  (exec 3<>"/dev/tcp/127.0.0.1/${TUNNEL_PORT}") 2>/dev/null
}

# Make sure the SSH port-forward to the database is up; open it if it is not.
# Globals read: TUNNEL_PORT, REMOTE_DB_ADDR, TUNNEL_WAIT_TRIES, VPS_*.
# Exits the script when the tunnel cannot be established.
ensure_tunnel() {
  if port_is_open; then
    return 0
  fi

  : "${VPS_PASSWORD:?VPS_PASSWORD must be set in ${ENV_FILE}}"
  : "${VPS_PORT:?VPS_PORT must be set in ${ENV_FILE}}"
  : "${VPS_USER:?VPS_USER must be set in ${ENV_FILE}}"
  : "${VPS_HOST:?VPS_HOST must be set in ${ENV_FILE}}"

  # Best effort: remove a stale forwarder that no longer accepts connections.
  # pkill exits non-zero when nothing matched, which is fine here.
  pkill -f "ssh.*${TUNNEL_PORT}:172" 2>/dev/null || true
  sleep 1

  # sshpass -e reads the password from $SSHPASS, keeping it out of argv (and
  # therefore out of `ps` output). ExitOnForwardFailure makes ssh fail instead
  # of silently backgrounding itself when the local port cannot be bound.
  SSHPASS="$VPS_PASSWORD" sshpass -e ssh \
    -o StrictHostKeyChecking=accept-new \
    -o ExitOnForwardFailure=yes \
    -p "$VPS_PORT" -fN \
    -L "${TUNNEL_PORT}:${REMOTE_DB_ADDR}" \
    "${VPS_USER}@${VPS_HOST}"

  # Poll for readiness instead of sleeping a fixed interval and hoping.
  local attempt
  for ((attempt = 0; attempt < TUNNEL_WAIT_TRIES; attempt++)); do
    if port_is_open; then
      return 0
    fi
    sleep 1
  done
  die "SSH tunnel on localhost:${TUNNEL_PORT} did not come up"
}

# Run one maintenance script, showing only the tail of its combined output.
# Arguments: $1 - number of trailing lines to show
#            $2 - script path (relative to REPO_ROOT); remaining args are
#                 passed through to the script.
# With `set -o pipefail` a failing script fails the pipeline, and `set -e`
# then stops the whole run before later steps execute.
run_step() {
  local -r tail_lines=$1
  shift
  python3 "$@" 2>&1 | tail -n "$tail_lines"
}

# Count markdown files under wiki/entities, split by whether the path
# contains an `_archived` component, and print the three totals.
print_entity_counts() {
  # Quoted heredoc delimiter: the shell performs no expansion on the Python.
  python3 - <<'PY'
from pathlib import Path
p = Path('wiki/entities')
active = sum(1 for f in p.rglob('*.md') if '_archived' not in f.parts)
archived = sum(1 for f in (p / '_archived').rglob('*.md')) if (p / '_archived').exists() else 0
print(f' active: {active}')
print(f' archived: {archived}')
print(f' total: {active + archived}')
PY
}

main() {
  cd "$REPO_ROOT" || die "cannot cd to ${REPO_ROOT}"

  [[ -r "$ENV_FILE" ]] || die "missing env file: ${ENV_FILE}"
  # shellcheck source=/dev/null
  source "$ENV_FILE"
  : "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set in ${ENV_FILE}}"
  export DATABASE_URL="postgres://postgres:${POSTGRES_PASSWORD}@localhost:${TUNNEL_PORT}/postgres"

  ensure_tunnel

  echo "=== [1/5] Layer 1 dedup (deterministic) — already applied; re-running idempotent"
  run_step 5 scripts/maintain/49_dedup_aggressive.py

  echo ""
  echo "=== [2/5] Layer 2 dedup (fuzzy trigram) ==="
  run_step 8 scripts/maintain/50_dedup_fuzzy_trigram.py

  echo ""
  echo "=== [3/5] Remap entity_mentions in DB ==="
  run_step 10 scripts/maintain/51_remap_entity_mentions.py

  echo ""
  echo "=== [4/5] Resync signal_strength ==="
  run_step 10 scripts/maintain/42_sync_entity_stats.py --fix-obj-names

  echo ""
  echo "=== [5/5] Re-run text backfill on the new canonical set ==="
  run_step 8 scripts/maintain/46_text_backfill_mentions.py

  echo ""
  echo "=== Done. Final entity counts ==="
  print_entity_counts
}

main "$@"