fix: fixed stop-training script
This commit is contained in:
@@ -2,9 +2,9 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||||
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
|
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/ssd1/yi/artifacts}
|
||||||
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
|
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
|
||||||
GRACE_SECONDS=${GRACE_SECONDS:-300}
|
GRACE_SECONDS=${GRACE_SECONDS:-600}
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
cat <<'EOF'
|
cat <<'EOF'
|
||||||
@@ -12,7 +12,7 @@ Usage:
|
|||||||
bash stop_training.sh <train_name>
|
bash stop_training.sh <train_name>
|
||||||
|
|
||||||
Environment overrides:
|
Environment overrides:
|
||||||
GRACE_SECONDS=300
|
GRACE_SECONDS=600
|
||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -43,11 +43,9 @@ if [ -z "$pgid" ] && [ -f "$meta_file" ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Sending SIGTERM to training process group: train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"
|
echo "Sending SIGTERM to training process group: train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"
|
||||||
if [ -n "$pgid" ]; then
|
# Send SIGTERM to every process in the training process group (negative PGID targets the whole group).
|
||||||
kill -TERM "-${pgid}" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true
|
kill -SIGTERM "-${pgid}" 2>/dev/null
|
||||||
else
|
|
||||||
kill -TERM "$pid" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
|
|
||||||
deadline=$((SECONDS + GRACE_SECONDS))
|
deadline=$((SECONDS + GRACE_SECONDS))
|
||||||
while kill -0 "$pid" 2>/dev/null; do
|
while kill -0 "$pid" 2>/dev/null; do
|
||||||
|
|||||||
Reference in New Issue
Block a user