#!/usr/bin/env bash
#
# Stop a multi-node training run by invoking stop_training.sh once per host.
#
# Usage:
#   bash stop_multinode_training.sh TRAIN_NAME
#
# For the host at index i of HOSTS, stop_training.sh is called with the name
# "TRAIN_NAME_node<i>". Hosts matching the local hostname are handled with a
# local shell; every other host is reached through ssh.
#
# Environment overrides:
#   HOSTS            whitespace-separated host list
#                    (default: "g0033 g0034 g0035 g0036")
#   CONTAINER_NAME   when non-empty, the stop command is wrapped in
#                    `docker exec <name> bash -lc ...` (default: empty)
#   ARTIFACT_ROOT    forwarded to stop_training.sh
#                    (default: /apps/yi/model_training/artifacts)
#   GRACE_SECONDS    forwarded to stop_training.sh (default: 600)
#
# Exit status:
#   0  help was requested, or every host was attempted (best effort: failures
#      on individual hosts are reported on stderr but do not abort the run)
#   2  TRAIN_NAME was not supplied
set -euo pipefail

# Directory containing this script. The same path is used for the `cd` on
# every target host. NOTE(review): this presumes the script directory exists
# at the same path on each host / inside the container -- confirm.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

#######################################
# Print the help text.
# Outputs: usage text on stdout
#######################################
usage() {
  cat <<'EOF'
Usage:
  bash stop_multinode_training.sh TRAIN_NAME

Environment overrides:
  HOSTS="g0033 g0034 g0035 g0036"
  CONTAINER_NAME=megatron-ngc25-training
  ARTIFACT_ROOT=/apps/yi/model_training/artifacts
  GRACE_SECONDS=600
EOF
}

#######################################
# Build the shell command that stops one node's training job.
# Every interpolated value is quoted with printf %q so that spaces, quotes,
# `$` and backticks in paths or names survive re-parsing by `bash -lc`.
# Globals:   SCRIPT_DIR, ARTIFACT_ROOT, GRACE_SECONDS (read)
# Arguments: $1 - per-node training name
#            $2 - container name; empty means "run directly on the host"
# Outputs:   the command string on stdout
#######################################
build_stop_command() {
  local node_train_name=$1
  local container=$2
  local stop_cmd

  printf -v stop_cmd \
    'cd %q && ARTIFACT_ROOT=%q GRACE_SECONDS=%q bash stop_training.sh %q' \
    "$SCRIPT_DIR" \
    "${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}" \
    "${GRACE_SECONDS:-600}" \
    "$node_train_name"

  if [[ -n "$container" ]]; then
    # The inner command is quoted once more because `bash -lc` inside the
    # container receives it as a single argument.
    printf 'docker exec %q bash -lc %q' "$container" "$stop_cmd"
  else
    printf '%s' "$stop_cmd"
  fi
}

#######################################
# Parse arguments and stop the training job on every configured host.
# Globals:   HOSTS, CONTAINER_NAME (read)
# Arguments: $1 - training name, or -h / --help
#######################################
main() {
  local train_name=${1:-}

  case "$train_name" in
    -h | --help)
      usage
      exit 0
      ;;
    '')
      # A missing required argument is an error, not a successful run.
      printf 'error: missing required TRAIN_NAME argument\n' >&2
      usage >&2
      exit 2
      ;;
  esac

  local -a hosts=()
  read -r -a hosts <<< "${HOSTS:-g0033 g0034 g0035 g0036}"
  local container=${CONTAINER_NAME:-}

  # Resolve the local host names once instead of on every iteration. A failing
  # `hostname` must not abort the script under `set -e`; an empty value simply
  # never matches a host entry.
  local local_short local_full
  local_short=$(hostname -s) || local_short=
  local_full=$(hostname) || local_full=

  local idx host node_train_name remote_cmd rc
  for idx in "${!hosts[@]}"; do
    host=${hosts[$idx]}
    node_train_name="${train_name}_node${idx}"
    remote_cmd=$(build_stop_command "$node_train_name" "$container")

    echo "Stopping host=${host}, train_name=${node_train_name}"

    rc=0
    if [[ "$host" == "$local_short" || "$host" == "$local_full" ]]; then
      bash -lc "$remote_cmd" || rc=$?
    else
      # ssh hands the command string to the remote shell, so the whole command
      # is quoted once more to reach the remote `bash -lc` as one argument.
      ssh "$host" "bash -lc $(printf '%q' "$remote_cmd")" || rc=$?
    fi

    # Best effort: keep going so the remaining hosts are still stopped, but
    # make the failure visible instead of discarding it.
    if ((rc != 0)); then
      printf 'warning: stop command failed on host=%s (exit status %d); continuing\n' \
        "$host" "$rc" >&2
    fi
  done
}

main "$@"