disclosure-bureau/scripts/29-auto-resume-batch.sh

80 lines
2.5 KiB
Bash
Raw Permalink Normal View History

#!/usr/bin/env bash
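# auto-resume-batch.sh — retries the full batch rebuild every 30min until the
# Anthropic quota is back. There is no separate probe call: each attempt
# launches scripts/28-batch-rebuild-all.py itself and watches whether it
# survives its first 90s (an exit within 90s is treated as "quota still
# throttled", unless new archives appeared, in which case it relaunches at once).
#
# Stops itself when:
# - the batch is still running 90s after launch (interpreted as quota back);
#   the batch keeps running in the background and this script exits 0
# - MAX_ATTEMPTS attempts have been used up (exits 1)
# Note: batch completion (summary.json with successes >= queue_size) is not
# checked by this script.
#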
# Usage:
# nohup ./scripts/29-auto-resume-batch.sh > /tmp/auto-resume.log 2>&1 &
# Unset variables are errors and a pipeline fails if any stage fails.
# errexit (-e) is not enabled, so failing commands must be checked explicitly.
set -uo pipefail

# Project root. Defaults to the original hard-coded location; export UFO_ROOT
# before launching to run against a different checkout.
UFO_ROOT=${UFO_ROOT:-/Users/guto/ufo}
# NOTE(review): LOG_DIR is not referenced anywhere in the visible script —
# confirm nothing else depends on it before removing.
LOG_DIR="$UFO_ROOT/raw/_batch-rebuild"
SLEEP_BETWEEN=${SLEEP_BETWEEN:-1800} # seconds between attempts (default 30min)
MAX_ATTEMPTS=${MAX_ATTEMPTS:-24}     # default 24 × 30min = 12h ceiling

# Both values feed `sleep` and integer comparisons; reject anything that is
# not a plain non-negative integer instead of failing later inside the loop.
case "$SLEEP_BETWEEN$MAX_ATTEMPTS" in
  *[!0-9]*)
    echo "SLEEP_BETWEEN and MAX_ATTEMPTS must be non-negative integers" >&2
    exit 2
    ;;
esac
# log WORD... — prints the arguments (joined by spaces) on one stdout line,
# prefixed with the current UTC time as "[HH:MM:SSZ] ".
log() {
  local stamp
  stamp=$(date -u +%H:%M:%SZ)
  printf '[%s] %s\n' "$stamp" "$*"
}
# Prints how many entries match "$UFO_ROOT"/raw/*--subagent (the "archive
# count"). Uses a glob loop rather than parsing `ls` output, so odd file names
# and very long match lists cannot skew the number.
count_archived() {
  local entry total=0
  for entry in "$UFO_ROOT"/raw/*--subagent; do
    # An unmatched glob stays literal; skip names that do not exist.
    [[ -e "$entry" || -L "$entry" ]] || continue
    total=$((total + 1))
  done
  printf '%s\n' "$total"
}

# Main retry loop: launch the batch orchestrator, watch it for the first
# $probe_window seconds, and either hand over to it (exit 0) or back off and
# try again, up to MAX_ATTEMPTS times.
probe_window=90 # seconds the orchestrator must survive to count as real work
attempt=0
while [ "$attempt" -lt "$MAX_ATTEMPTS" ]; do
  attempt=$((attempt + 1))
  log "attempt $attempt/$MAX_ATTEMPTS — probing batch"

  # Check if anything is already running — bail early
  if pgrep -f "28-batch-rebuild-all.py" >/dev/null; then
    log "batch already running, sleeping ${SLEEP_BETWEEN}s and re-checking"
    sleep "$SLEEP_BETWEEN"
    continue
  fi

  # Snapshot current archive count
  before=$(count_archived)
  log " archived before: $before"

  # The orchestrator is started via a relative path, so a failed cd must not
  # be ignored (errexit is off): abort instead of launching from elsewhere.
  if ! cd "$UFO_ROOT"; then
    log " cannot cd to $UFO_ROOT; aborting"
    exit 1
  fi

  # Kick off batch (will early-abort if quota still throttled)
  batch_log="/tmp/batch-rebuild-auto-$attempt.log"
  python3 scripts/28-batch-rebuild-all.py --workers 2 \
    > "$batch_log" 2>&1 &
  batch_pid=$!
  log " started python orchestrator PID=$batch_pid"

  # Wait for either:
  # - process exits (early-abort or done)
  # - $probe_window seconds elapsed without exit (it's running real work)
  for ((tick = 0; tick < probe_window; tick++)); do
    kill -0 "$batch_pid" 2>/dev/null || break
    sleep 1
  done

  if kill -0 "$batch_pid" 2>/dev/null; then
    # Still running after the window → real work, leave it alone and exit
    log " ✓ batch is making real progress (still running after ${probe_window}s)"
    log " auto-resume exits; full batch continues in background"
    log " monitor: tail -f $batch_log"
    exit 0
  fi

  # Process exited within the window — reap it so its exit status is known.
  batch_rc=0
  wait "$batch_pid" || batch_rc=$?
  log " orchestrator exit status: $batch_rc"

  # 126/127 are the shell's "cannot execute" / "command not found" statuses:
  # python3 itself could not be started, so waiting for quota cannot help.
  if [ "$batch_rc" -eq 126 ] || [ "$batch_rc" -eq 127 ]; then
    log " python3 could not be executed (see $batch_log); aborting"
    exit 1
  fi

  # Otherwise it must have hit quota or completed
  after=$(count_archived)
  delta=$((after - before))
  log " process exited fast (likely quota); archived delta: $delta"

  if [ "$delta" -gt 0 ]; then
    log " ✓ some docs were processed — re-launching immediately"
    sleep 5
    continue
  fi

  log " 💤 quota still throttled; sleeping ${SLEEP_BETWEEN}s"
  sleep "$SLEEP_BETWEEN"
done

log "max attempts reached, giving up. re-run manually."
exit 1