#!/usr/bin/env bash
#
# Stop a multi-node training run.
#
# For every host listed in HOSTS (position idx in the list), this script runs
#   bash stop_training.sh "<train_name>_node<idx>"
# from the directory this script lives in. The local host is handled with a
# plain `bash -lc`; every other host is reached over ssh. When CONTAINER_NAME
# is non-empty the command is wrapped in `docker exec`.
#
# Run with -h / --help for the argument and environment-variable summary.

set -euo pipefail

# Absolute directory containing this script; the per-host command `cd`s here
# before invoking stop_training.sh.
# CDPATH is cleared for the `cd` because, when it is set, `cd` may print the
# resolved directory on stdout, which would corrupt the captured value.
# `--` protects against a path that begins with a dash.
SCRIPT_DIR=$(CDPATH= cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)

#######################################
# Print the command-line synopsis and the recognised environment overrides.
# Arguments: none
# Outputs:   help text on stdout (callers redirect it if they want stderr)
# Returns:   exit status of `cat`
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  # NOTE(review): the HOSTS, ARTIFACT_ROOT and GRACE_SECONDS lines match the
  # defaults used by the script; the CONTAINER_NAME line is an example value
  # only (the script leaves CONTAINER_NAME empty unless it is set).
  cat <<'EOF'
Usage:
  bash stop_multinode_training.sh <train_name>

Environment overrides:
  HOSTS="g0033 g0034 g0035 g0036"
  CONTAINER_NAME=megatron-ngc25-training
  ARTIFACT_ROOT=/apps/yi/model_training/artifacts
  GRACE_SECONDS=600
EOF
}

# ---------------------------------------------------------------------------
# Entry point.
#
# Arguments:
#   $1  train_name - base name; node <idx> is stopped as "<train_name>_node<idx>"
# Environment (all optional):
#   HOSTS           whitespace-separated host list (default: g0033..g0036)
#   CONTAINER_NAME  if non-empty, run the stop command via `docker exec` in it
#   ARTIFACT_ROOT   forwarded to stop_training.sh
#   GRACE_SECONDS   forwarded to stop_training.sh
# Exit status:
#   0 after all hosts were attempted (per-host failures are reported on stderr
#     but, as before, do not change the exit status),
#   2 on a usage error (missing train_name, empty host list).
# ---------------------------------------------------------------------------
train_name=${1:-}

case "$train_name" in
  -h | --help)
    usage
    exit 0
    ;;
  '')
    # A missing required argument is an error, not a successful run: report it
    # on stderr with a non-zero status so automation cannot mistake it for a
    # completed stop.
    usage >&2
    exit 2
    ;;
esac

read -r -a HOST_ARRAY <<< "${HOSTS:-g0033 g0034 g0035 g0036}"
if (( ${#HOST_ARRAY[@]} == 0 )); then
  printf 'error: HOSTS does not contain any host name\n' >&2
  exit 2
fi

CONTAINER_NAME=${CONTAINER_NAME:-}
artifact_root=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
grace_seconds=${GRACE_SECONDS:-600}

# Resolve the local host names once instead of on every iteration. A failing
# `hostname` leaves the value empty, which can never equal a HOSTS entry, so
# that host is simply treated as remote.
local_host_short=$(hostname -s || true)
local_host_full=$(hostname || true)

failed_hosts=()
for idx in "${!HOST_ARRAY[@]}"; do
  host=${HOST_ARRAY[idx]}
  node_train_name="${train_name}_node${idx}"

  # Every interpolated value is shell-quoted with %q so that spaces, quotes,
  # `$` or backticks in a path or name reach stop_training.sh literally
  # instead of being re-interpreted by the shell that receives the command.
  printf -v stop_cmd \
    'cd %q && ARTIFACT_ROOT=%q GRACE_SECONDS=%q bash stop_training.sh %q' \
    "$SCRIPT_DIR" "$artifact_root" "$grace_seconds" "$node_train_name"

  if [[ -n "$CONTAINER_NAME" ]]; then
    # One extra quoting layer: the command is parsed again by the bash that
    # docker starts inside the container.
    printf -v remote_cmd 'docker exec %q bash -lc %q' "$CONTAINER_NAME" "$stop_cmd"
  else
    remote_cmd=$stop_cmd
  fi

  echo "Stopping host=${host}, train_name=${node_train_name}"

  rc=0
  if [[ "$host" == "$local_host_short" || "$host" == "$local_host_full" ]]; then
    bash -lc "$remote_cmd" || rc=$?
  else
    # ssh hands a single string to the remote shell, hence the %q re-quoting.
    ssh "$host" "bash -lc $(printf '%q' "$remote_cmd")" || rc=$?
  fi

  # Best effort: keep going so one bad host does not prevent the others from
  # being stopped, but make the failure visible instead of discarding it.
  if (( rc != 0 )); then
    printf 'warning: stop failed on host=%s (exit %d); continuing\n' "$host" "$rc" >&2
    failed_hosts+=("$host")
  fi
done

if (( ${#failed_hosts[@]} > 0 )); then
  printf 'warning: stop did not succeed on: %s\n' "${failed_hosts[*]}" >&2
fi