fix: correct stop-training script

This commit is contained in:
2026-05-09 12:01:17 +08:00
parent 0008288964
commit 02868ec01a

View File

@@ -2,9 +2,9 @@
set -euo pipefail set -euo pipefail
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts} ARTIFACT_ROOT=${ARTIFACT_ROOT:-/ssd1/yi/artifacts}
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state" RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
GRACE_SECONDS=${GRACE_SECONDS:-300} GRACE_SECONDS=${GRACE_SECONDS:-600}
usage() { usage() {
cat <<'EOF' cat <<'EOF'
@@ -12,7 +12,7 @@ Usage:
bash stop_training.sh <train_name> bash stop_training.sh <train_name>
Environment overrides: Environment overrides:
GRACE_SECONDS=300 GRACE_SECONDS=600
EOF EOF
} }
@@ -43,11 +43,9 @@ if [ -z "$pgid" ] && [ -f "$meta_file" ]; then
fi fi
echo "Sending SIGTERM to training process group: train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}" echo "Sending SIGTERM to training process group: train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"
if [ -n "$pgid" ]; then # send SIGTERM to every process in the training process group
kill -TERM "-${pgid}" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true kill -SIGTERM "-${pgid}" 2>/dev/null
else
kill -TERM "$pid" 2>/dev/null || true
fi
deadline=$((SECONDS + GRACE_SECONDS)) deadline=$((SECONDS + GRACE_SECONDS))
while kill -0 "$pid" 2>/dev/null; do while kill -0 "$pid" 2>/dev/null; do