fix: fixed stop-training script
This commit is contained in:
@@ -2,9 +2,9 @@
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
|
||||
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/ssd1/yi/artifacts}
|
||||
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
|
||||
GRACE_SECONDS=${GRACE_SECONDS:-300}
|
||||
GRACE_SECONDS=${GRACE_SECONDS:-600}
|
||||
|
||||
usage() {
|
||||
cat <<'EOF'
|
||||
@@ -12,7 +12,7 @@ Usage:
|
||||
bash stop_training.sh <train_name>
|
||||
|
||||
Environment overrides:
|
||||
GRACE_SECONDS=300
|
||||
GRACE_SECONDS=600
|
||||
EOF
|
||||
}
|
||||
|
||||
@@ -43,11 +43,9 @@ if [ -z "$pgid" ] && [ -f "$meta_file" ]; then
|
||||
fi
|
||||
|
||||
echo "Sending SIGTERM to training process group: train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"
|
||||
if [ -n "$pgid" ]; then
|
||||
kill -TERM "-${pgid}" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true
|
||||
else
|
||||
kill -TERM "$pid" 2>/dev/null || true
|
||||
fi
|
||||
# Send SIGTERM to every process in the training's process group (a negative id targets the whole group)
|
||||
kill -SIGTERM "-${pgid}" 2>/dev/null
|
||||
|
||||
|
||||
deadline=$((SECONDS + GRACE_SECONDS))
|
||||
while kill -0 "$pid" 2>/dev/null; do
|
||||
|
||||
Reference in New Issue
Block a user