80 lines
2.5 KiB
Bash
80 lines
2.5 KiB
Bash
|
|
#!/usr/bin/env bash
|
|||
|
|
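# auto-resume-batch.sh — relaunches the batch rebuild every 30min until the
# Anthropic quota is available again.
# There is no separate probe request: each attempt starts the full batch
# (scripts/28-batch-rebuild-all.py) and watches it for 90s. A batch that exits
# within that window is treated as still quota-throttled (or already done), and
# the script sleeps before the next attempt.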
|
|||
|
|
#
|
|||
|
|
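# Stops itself when:
#   - batch reports successful completion (summary.json with successes >= queue_size)
#     NOTE(review): not implemented — nothing in this script reads summary.json;
#     a batch that finishes in under 90s is handled like a quota abort.
#   - a launched batch runs 90s without exiting (interpreted as quota back);
#     the batch keeps running in the background and this script exits 0
#   - MAX_ATTEMPTS launches were tried without the above happening (exits 1)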
|
|||
|
|
#
|
|||
|
|
# Usage:
|
|||
|
|
# nohup ./scripts/29-auto-resume-batch.sh > /tmp/auto-resume.log 2>&1 &
|
|||
|
|
|
|||
|
|
# Strict-ish mode: unset variables are errors and pipelines report failures.
# `-e` is intentionally NOT enabled: the script relies on inspecting non-zero
# exit statuses (pgrep, kill -0, the batch itself) instead of aborting on them.
set -uo pipefail

# Configuration. Every value can be overridden from the environment; the
# defaults are the values this script has always used.
#   UFO_ROOT       project checkout containing scripts/ and raw/
#   LOG_DIR        batch-rebuild directory under raw/ (derived from UFO_ROOT)
#   SLEEP_BETWEEN  seconds to sleep between attempts (1800 = 30min)
#   MAX_ATTEMPTS   number of attempts before giving up (24 x 30min = 12h ceiling)
readonly UFO_ROOT="${UFO_ROOT:-/Users/guto/ufo}"
readonly LOG_DIR="$UFO_ROOT/raw/_batch-rebuild"
readonly SLEEP_BETWEEN="${SLEEP_BETWEEN:-1800}"
readonly MAX_ATTEMPTS="${MAX_ATTEMPTS:-24}"
|
|||
|
|
|
|||
|
|
# log — write one line to stdout, prefixed with the current UTC time as
# [HH:MM:SSZ]. All arguments are joined into a single message.
log() {
  local stamp
  stamp=$(date -u +%H:%M:%SZ)
  printf '[%s] %s\n' "$stamp" "$*"
}
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
# Main retry loop.
#
# Each attempt launches scripts/28-batch-rebuild-all.py in the background and
# watches it for up to 90 seconds:
#   - still running after 90s -> treated as real progress; this script exits 0
#                                and leaves the batch running in the background
#   - exited within 90s       -> treated as quota abort (or completion); if new
#                                archive dirs appeared, relaunch after 5s,
#                                otherwise sleep SLEEP_BETWEEN seconds
# After MAX_ATTEMPTS attempts the script gives up with exit status 1.
#
# Reads from the enclosing script: UFO_ROOT, SLEEP_BETWEEN, MAX_ATTEMPTS, log.
# ---------------------------------------------------------------------------

# count_archived — print how many "$UFO_ROOT"/raw/*--subagent entries exist.
# Uses a glob instead of parsing `ls` output.
count_archived() {
  local entry total=0
  for entry in "$UFO_ROOT"/raw/*--subagent; do
    # An unmatched glob stays literal; the existence test filters that out
    # (-L keeps dangling symlinks, which `ls -d` would also have listed).
    if [[ -e "$entry" || -L "$entry" ]]; then
      total=$((total + 1))
    fi
  done
  printf '%s\n' "$total"
}

# Fail fast on a broken setup. Without these checks python would exit
# immediately and every attempt would be misread as "quota still throttled".
cd "$UFO_ROOT" || { log "cannot cd to $UFO_ROOT, giving up."; exit 1; }
if [[ ! -f scripts/28-batch-rebuild-all.py ]]; then
  log "scripts/28-batch-rebuild-all.py not found under $UFO_ROOT, giving up."
  exit 1
fi

attempt=0
while (( attempt < MAX_ATTEMPTS )); do
  attempt=$((attempt + 1))
  log "attempt $attempt/$MAX_ATTEMPTS — probing batch"

  # Check if anything is already running — bail early
  if pgrep -f "28-batch-rebuild-all.py" >/dev/null; then
    log "batch already running, sleeping ${SLEEP_BETWEEN}s and re-checking"
    sleep "$SLEEP_BETWEEN"
    continue
  fi

  # Snapshot current archive count
  before=$(count_archived)
  log "  archived before: $before"

  # Kick off batch (will early-abort if quota still throttled).
  # NOTE: the log path is predictable by design so it can be tailed by name.
  python3 scripts/28-batch-rebuild-all.py --workers 2 \
    > "/tmp/batch-rebuild-auto-$attempt.log" 2>&1 &
  batch_pid=$!
  log "  started python orchestrator PID=$batch_pid"

  # Wait for either:
  #   - process exits (early-abort or done)
  #   - 90s elapsed without exit (means it's actually running real work)
  waited=0
  while (( waited < 90 )) && kill -0 "$batch_pid" 2>/dev/null; do
    sleep 1
    waited=$((waited + 1))
  done

  if kill -0 "$batch_pid" 2>/dev/null; then
    # Still running after 90s → real work, leave it alone and exit auto-resume
    log "  ✓ batch is making real progress (still running after 90s)"
    log "  auto-resume exits; full batch continues in background"
    log "  monitor: tail -f /tmp/batch-rebuild-auto-$attempt.log"
    exit 0
  fi

  # Process exited within 90s — must have hit quota or completed.
  # Reap it and record its exit status so the log shows how it ended.
  batch_status=0
  wait "$batch_pid" 2>/dev/null || batch_status=$?
  log "  orchestrator exit status: $batch_status"

  after=$(count_archived)
  delta=$((after - before))
  log "  process exited fast (likely quota); archived delta: $delta"

  if (( delta > 0 )); then
    log "  ✓ some docs were processed — re-launching immediately"
    sleep 5
    continue
  fi

  log "  💤 quota still throttled; sleeping ${SLEEP_BETWEEN}s"
  sleep "$SLEEP_BETWEEN"
done

log "max attempts reached, giving up. re-run manually."
exit 1
|