fix: update stop-training script defaults (artifact root, grace period) and SIGTERM handling

This commit is contained in:
2026-05-09 12:01:17 +08:00
parent 0008288964
commit 02868ec01a

View File

@@ -2,9 +2,9 @@
# Strict mode: exit on a failed command, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail
# Absolute path of the directory that contains this script.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# Artifact root directory; an ARTIFACT_ROOT value from the environment wins.
# NOTE(review): the default is assigned twice with different paths. `${VAR:-default}`
# only applies while the variable is unset or empty, so the second assignment
# can never change the value. This looks like the old and new lines of a diff
# shown without +/- markers — confirm which default is intended.
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/ssd1/yi/artifacts}
# The `run_state` subdirectory of the artifact root.
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
# Grace period in seconds (environment override wins); it is added to SECONDS
# to compute the deadline for the wait loop after SIGTERM is sent.
# NOTE(review): same duplication as ARTIFACT_ROOT above — the second default
# never takes effect as written; confirm whether 300 or 600 is intended.
GRACE_SECONDS=${GRACE_SECONDS:-300}
GRACE_SECONDS=${GRACE_SECONDS:-600}
usage() {
cat <<'EOF'
@@ -12,7 +12,7 @@ Usage:
bash stop_training.sh <train_name>
Environment overrides:
GRACE_SECONDS=300
GRACE_SECONDS=600
EOF
}
@@ -43,11 +43,9 @@ if [ -z "$pgid" ] && [ -f "$meta_file" ]; then
fi
# Ask the training job to shut down gracefully.
# When a process-group id is known, signal the whole group (negative pid) so
# every process in it receives SIGTERM; if that cannot be delivered, or no
# pgid is known, fall back to signalling the recorded pid directly.
# Every kill is allowed to fail (`|| true`): the target may already have
# exited, and under `set -e` an unguarded failure would silently abort the
# script before the grace-period wait that follows.
echo "Sending SIGTERM to training process group: train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"
if [ -n "$pgid" ]; then
  kill -TERM "-${pgid}" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true
else
  kill -TERM "$pid" 2>/dev/null || true
fi
deadline=$((SECONDS + GRACE_SECONDS))
while kill -0 "$pid" 2>/dev/null; do