Files
2026-05-09 12:01:17 +08:00

67 lines
1.8 KiB
Bash
Executable File

#!/usr/bin/env bash
# Stop a training run by name: sends SIGTERM to the run's process group and
# waits up to GRACE_SECONDS for the recorded leader PID to exit.

# Strict mode: abort on unhandled errors, on unset variables, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Root directory for artifacts; may be overridden from the environment.
: "${ARTIFACT_ROOT:=/ssd1/yi/artifacts}"
# Directory holding the per-run <train_name>.pid / <train_name>.env files.
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"

# Seconds to wait after SIGTERM before giving up; may be overridden from the
# environment.
: "${GRACE_SECONDS:=600}"
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   writes the usage message to stdout
#######################################
usage() {
  printf '%s\n' \
    'Usage:' \
    'bash stop_training.sh <train_name>' \
    'Environment overrides:' \
    'GRACE_SECONDS=600'
}
# The single positional argument is the name of the training run to stop.
# `${1:-}` keeps `set -u` from aborting when no argument is given.
train_name=${1:-}
case "$train_name" in
  -h | --help)
    # Explicit help request: print usage on stdout and succeed.
    usage
    exit 0
    ;;
  "")
    # A missing/empty name is a usage error, not a success: exiting 0 here
    # would let a caller believe a run had been stopped when nothing was done.
    # 64 (EX_USAGE) is used so it cannot be confused with this script's other
    # exit codes (1 = PID file not found, 2 = grace period exceeded).
    echo "Error: missing required argument <train_name>." >&2
    usage >&2
    exit 64
    ;;
esac
# Per-run state files kept under RUN_STATE_DIR:
#   <train_name>.env - run metadata (may contain a PGID= line)
#   <train_name>.pid - PID of the training process
meta_file="${RUN_STATE_DIR}/${train_name}.env"
pid_file="${RUN_STATE_DIR}/${train_name}.pid"

# Without a PID file there is nothing to stop; report it and fail.
[[ -f "$pid_file" ]] || {
  echo "PID file not found: ${pid_file}" >&2
  exit 1
}
# Read the recorded training PID. All whitespace (including the CR of a
# CRLF-terminated file) is stripped so that a padded but otherwise valid PID
# is not mistaken for a dead process.
pid=$(cat "$pid_file")
pid=${pid//[[:space:]]/}
# Anything that is not a positive decimal integer is handled like an empty PID
# file. This also keeps 0 and negative values away from `kill`, where they
# address process groups instead of a single process.
if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]]; then
  pid=""
fi
if [ -z "$pid" ] || ! kill -0 "$pid" 2>/dev/null; then
  # `kill -0` also fails when the process exists but we are not allowed to
  # signal it (e.g. it belongs to another user). Such a run is still alive, so
  # its state files must not be removed.
  if [ -n "$pid" ] && ps -p "$pid" >/dev/null 2>&1; then
    echo "Process ${pid} for train_name=${train_name} exists but cannot be signalled (permission denied?); state files left untouched." >&2
    exit 1
  fi
  # No live process behind the PID file: drop the stale state and succeed.
  echo "Training is not running for train_name=${train_name}; cleaning stale state."
  rm -f "$pid_file" "$meta_file"
  exit 0
fi
# Resolve the process group ID (PGID) of the training process so the whole
# group can be signalled. Prefer the live value reported by ps; `|| true`
# keeps `set -e` / `pipefail` from aborting if the process vanished meanwhile.
pgid=$(ps -o pgid= -p "$pid" | tr -d '[:space:]' || true)
# Fall back to the first PGID= entry recorded in the metadata file.
if [ -z "$pgid" ] && [ -f "$meta_file" ]; then
  pgid=$(grep '^PGID=' "$meta_file" | head -n 1 | cut -d= -f2- | tr -d '[:space:]' || true)
fi
# Accept only a decimal PGID greater than 1. A malformed value would make
# `kill` fail, and "-1" as a kill target means "every process the caller may
# signal", which must never be reached from a bad metadata entry.
if [[ ! "$pgid" =~ ^[1-9][0-9]*$ ]] || [ "$pgid" = "1" ]; then
  pgid=""
fi
echo "Sending SIGTERM to training process group: train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"
# A negative target addresses the whole process group; without a usable PGID
# only the recorded PID can be signalled.
if [ -n "$pgid" ]; then
  term_target="-${pgid}"
else
  echo "Process group unknown; sending SIGTERM to pid ${pid} only." >&2
  term_target="$pid"
fi
# `--` stops option parsing so the negative target is never read as a flag.
# The failure is handled explicitly: under `set -e` a bare failing `kill`
# would abort the script silently (its stderr is discarded).
if ! kill -TERM -- "$term_target" 2>/dev/null; then
  # If the process is already gone the signal was simply too late; carry on
  # so the stale state still gets cleaned up. Otherwise report the failure.
  if kill -0 "$pid" 2>/dev/null; then
    echo "Failed to send SIGTERM to ${term_target}; training (pid=${pid}) is still running." >&2
    exit 1
  fi
fi
# Poll every 5 seconds until the training process (pid) exits or the grace
# period runs out. SECONDS is bash's running count of seconds since the shell
# started, so `deadline` is an absolute point on that clock.
deadline=$((SECONDS + GRACE_SECONDS))
until ! kill -0 "$pid" 2>/dev/null; do
  if ((SECONDS < deadline)); then
    sleep 5
    continue
  fi
  # Grace period exhausted: never escalate to SIGKILL automatically (a
  # checkpoint may still be being written); tell the operator how to do it
  # and exit with status 2, leaving the state files in place.
  {
    echo "Training did not exit within ${GRACE_SECONDS}s."
    echo "If checkpoint saving is still running, wait and inspect logs before forcing termination."
    if [[ -z "$pgid" ]]; then
      echo "Force kill manually if needed: kill -KILL ${pid}"
    else
      echo "Force kill manually if needed: kill -KILL -${pgid}"
    fi
  } >&2
  exit 2
done

# The process is gone: remove its state files and report success.
rm -f "$pid_file" "$meta_file"
echo "Stopped training: train_name=${train_name}"