#!/usr/bin/env bash
#
# stop_training.sh - gracefully stop a training run.
#
# Looks up the PID recorded in ${ARTIFACT_ROOT}/run_state/<train_name>.pid,
# sends SIGTERM to the run's process group, then polls until the process is
# gone or GRACE_SECONDS have elapsed.
#
# Usage:
#   bash stop_training.sh <train_name>
#
# Environment overrides:
#   ARTIFACT_ROOT  directory that contains run_state/ (default: /ssd1/yi/artifacts)
#   GRACE_SECONDS  seconds to wait after SIGTERM       (default: 600)

set -euo pipefail

# Absolute directory of this script.
# NOTE(review): not referenced anywhere in the visible part of the script;
# kept in case other tooling relies on it.
# shellcheck disable=SC2034
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

ARTIFACT_ROOT=${ARTIFACT_ROOT:-/ssd1/yi/artifacts}
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
GRACE_SECONDS=${GRACE_SECONDS:-600}

# Validate the grace period before anything is signalled: the value is later
# used in shell arithmetic, where a malformed value would abort the script
# after SIGTERM has already been sent.
if ! [[ "$GRACE_SECONDS" =~ ^[0-9]+$ ]]; then
  echo "GRACE_SECONDS must be a non-negative integer, got: ${GRACE_SECONDS}" >&2
  exit 1
fi
# Force base 10 so values with leading zeros (e.g. 08) are not parsed as octal.
GRACE_SECONDS=$((10#${GRACE_SECONDS}))

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout (callers redirect it when needed)
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'
Usage:
bash stop_training.sh <train_name>

Environment overrides:
GRACE_SECONDS=600
EOF
}

# The first positional argument names the run; it selects the state files
# under RUN_STATE_DIR.
train_name=${1:-}

case "$train_name" in
  -h | --help)
    # Help was explicitly requested: print to stdout and report success.
    usage
    exit 0
    ;;
  '')
    # A missing argument is a usage error: report it on stderr with a
    # non-zero status so callers do not mistake it for a successful stop.
    usage >&2
    exit 1
    ;;
esac

# State files for this run: the PID of the training process and its metadata.
pid_file="${RUN_STATE_DIR}/${train_name}.pid"
meta_file="${RUN_STATE_DIR}/${train_name}.env"

# Without a PID file there is nothing to stop.
if [[ ! -f "$pid_file" ]]; then
  echo "PID file not found: ${pid_file}" >&2
  exit 1
fi

# Read the recorded PID and drop any whitespace (trailing spaces, CR, ...)
# so an otherwise valid PID is not rejected by kill.
pid=$(<"$pid_file")
pid=${pid//[[:space:]]/}

# The run counts as stopped when the PID is not a positive integer or no such
# process can be signalled. Requiring a positive integer also keeps values
# like 0 or -1 away from kill, where they address whole process groups rather
# than a single process.
if ! [[ "$pid" =~ ^[1-9][0-9]*$ ]] || ! kill -0 "$pid" 2>/dev/null; then
  echo "Training is not running for train_name=${train_name}; cleaning stale state."
  rm -f -- "$pid_file" "$meta_file"
  exit 0
fi

# Resolve the process group of the training process. ps can fail if the
# process exits in the meantime; `|| true` keeps set -e / pipefail from
# aborting so the fallbacks below can run.
pgid=$(ps -o pgid= -p "$pid" | tr -d ' ' || true)
if [[ -z "$pgid" && -f "$meta_file" ]]; then
  # Fall back to the PGID= entry recorded in the run's metadata file.
  pgid=$(grep '^PGID=' "$meta_file" | cut -d= -f2- || true)
fi
# Only a positive integer is a usable process group id; discard anything else
# so it is never turned into a kill target.
if ! [[ "$pgid" =~ ^[1-9][0-9]*$ ]]; then
  pgid=""
fi

echo "Sending SIGTERM to training process group: train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"

# A negative target addresses the whole process group; when no group id is
# known, signal the training process itself instead.
if [[ -n "$pgid" ]]; then
  term_target="-${pgid}"
else
  term_target="$pid"
fi

# `--` stops option parsing so the negative target is not read as an option.
if ! kill -TERM -- "$term_target" 2>/dev/null; then
  if kill -0 "$pid" 2>/dev/null; then
    echo "Failed to send SIGTERM to ${term_target}; pid=${pid} is still running." >&2
    exit 1
  fi
  # Otherwise the process exited between the liveness check and the signal;
  # continue so the remaining steps of the script still run.
fi

# SECONDS is bash's built-in count of seconds since the shell started; the
# deadline is expressed on that clock.
deadline=$((SECONDS + GRACE_SECONDS))

# Poll until the training process can no longer be signalled.
while kill -0 "$pid" 2>/dev/null; do
  if ((SECONDS >= deadline)); then
    echo "Training did not exit within ${GRACE_SECONDS}s." >&2
    echo "If checkpoint saving is still running, wait and inspect logs before forcing termination." >&2
    # Never escalate to SIGKILL automatically; only tell the operator how.
    if [[ -n "$pgid" ]]; then
      echo "Force kill manually if needed: kill -KILL -${pgid}" >&2
    else
      echo "Force kill manually if needed: kill -KILL ${pid}" >&2
    fi
    exit 2
  fi

  # Poll every 5 seconds, but never sleep past the deadline so that a grace
  # period shorter than the poll interval is still honoured.
  remaining=$((deadline - SECONDS))
  sleep "$((remaining < 5 ? remaining : 5))"
done

# The process is gone: drop its state files and report success.
rm -f -- "$pid_file" "$meta_file"
echo "Stopped training: train_name=${train_name}"