diff --git a/scripts/kaiyuan2b-training/stop_training.sh b/scripts/kaiyuan2b-training/stop_training.sh
index 0d2e297..e81149b 100755
--- a/scripts/kaiyuan2b-training/stop_training.sh
+++ b/scripts/kaiyuan2b-training/stop_training.sh
@@ -2,9 +2,9 @@
 set -euo pipefail
 
 SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
-ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
+ARTIFACT_ROOT=${ARTIFACT_ROOT:-/ssd1/yi/artifacts}
 RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
-GRACE_SECONDS=${GRACE_SECONDS:-300}
+GRACE_SECONDS=${GRACE_SECONDS:-600}
 
 usage() {
   cat <<'EOF'
@@ -12,7 +12,7 @@ Usage:
   bash stop_training.sh
 
 Environment overrides:
-  GRACE_SECONDS=300
+  GRACE_SECONDS=600
 EOF
 }
 
@@ -43,11 +43,12 @@ if [ -z "$pgid" ] && [ -f "$meta_file" ]; then
 fi
 
 echo "Sending SIGTERM to training process group: train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"
-if [ -n "$pgid" ]; then
-  kill -TERM "-${pgid}" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true
-else
-  kill -TERM "$pid" 2>/dev/null || true
-fi
+# Signal the whole process group when its id is known; if the id is empty or
+# the group signal fails, signal the leader pid instead. A failed kill must not
+# abort the script under `set -e`: the wait loop below handles the outcome.
+if [ -z "$pgid" ] || ! kill -TERM "-${pgid}" 2>/dev/null; then
+  kill -TERM "$pid" 2>/dev/null || true
+fi
 
 deadline=$((SECONDS + GRACE_SECONDS))
 while kill -0 "$pid" 2>/dev/null; do