feat: incorporate multi-device training scripts and README

This commit is contained in:
2026-05-09 21:35:42 +08:00
parent 02868ec01a
commit 75eacf00c2
6 changed files with 1082 additions and 10 deletions

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env bash
#
# Stop a training run that was started on several hosts.
# Takes one argument, <train_name>; see usage() for the environment
# variables that can be overridden.

# Strict mode: abort on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Absolute path of the directory that holds this script. It is used later to
# locate stop_training.sh regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout, one item per line
# Returns:   0 when the text was written
# Note: the HOSTS, ARTIFACT_ROOT and GRACE_SECONDS values shown are the ones
#       the script falls back to; CONTAINER_NAME is only a sample value, the
#       script itself defaults it to empty.
#######################################
usage() {
  # Single-quoted arguments keep every line literal (no expansion), exactly
  # like a quoted here-document would.
  printf '%s\n' \
    'Usage:' \
    'bash stop_multinode_training.sh <train_name>' \
    'Environment overrides:' \
    'HOSTS="g0033 g0034 g0035 g0036"' \
    'CONTAINER_NAME=megatron-ngc25-training' \
    'ARTIFACT_ROOT=/apps/yi/model_training/artifacts' \
    'GRACE_SECONDS=600'
}
# ---------------------------------------------------------------------------
# Main flow: validate the argument, then ask every host to stop its share of
# the training run.
#
# Arguments:
#   $1  train_name - base name of the run; host number <idx> (0-based position
#                    in HOSTS) is stopped under the name "<train_name>_node<idx>".
# Environment (all optional):
#   HOSTS           whitespace-separated host list (default: g0033..g0036)
#   CONTAINER_NAME  when non-empty, the stop command is run through
#                   `docker exec <CONTAINER_NAME>`; default is empty (no container)
#   ARTIFACT_ROOT   forwarded to stop_training.sh
#   GRACE_SECONDS   forwarded to stop_training.sh
# Exit status:
#   0  help was requested, or every host was attempted (per-host failures are
#      reported on stderr but do not change the exit status: best-effort stop)
#   2  usage error (missing train_name, or HOSTS names no host)
# ---------------------------------------------------------------------------
train_name=${1:-}
case "$train_name" in
  -h | --help)
    usage
    exit 0
    ;;
  '')
    # A missing required argument is an error, not a help request: report it
    # on stderr and exit non-zero so callers do not mistake it for a
    # successful stop.
    printf 'error: missing required <train_name> argument\n' >&2
    usage >&2
    exit 2
    ;;
esac

read -r -a HOST_ARRAY <<< "${HOSTS:-g0033 g0034 g0035 g0036}"
if ((${#HOST_ARRAY[@]} == 0)); then
  # HOSTS was set but contained only whitespace; doing nothing and reporting
  # success would be misleading.
  printf 'error: HOSTS does not name any host\n' >&2
  exit 2
fi

CONTAINER_NAME=${CONTAINER_NAME:-}
artifact_root=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
grace_seconds=${GRACE_SECONDS:-600}

# Resolve the local host names once instead of on every iteration. A failing
# `hostname` simply means no entry is treated as local.
local_short=$(hostname -s) || local_short=
local_full=$(hostname) || local_full=

for idx in "${!HOST_ARRAY[@]}"; do
  host=${HOST_ARRAY[$idx]}
  node_train_name="${train_name}_node${idx}"

  # Build the command with %q so that quotes, `$`, backticks or spaces in any
  # of the values reach stop_training.sh literally instead of being
  # re-interpreted by the receiving shell.
  # NOTE(review): the same SCRIPT_DIR path is used on every host, which
  # presumably relies on a shared filesystem - confirm.
  printf -v stop_cmd \
    'cd %q && ARTIFACT_ROOT=%q GRACE_SECONDS=%q bash stop_training.sh %q' \
    "$SCRIPT_DIR" "$artifact_root" "$grace_seconds" "$node_train_name"

  if [[ -n "$CONTAINER_NAME" ]]; then
    # Run inside the container; the whole stop command becomes one quoted
    # argument of the container's `bash -lc`.
    printf -v remote_cmd 'docker exec %q bash -lc %q' \
      "$CONTAINER_NAME" "$stop_cmd"
  else
    remote_cmd=$stop_cmd
  fi

  echo "Stopping host=${host}, train_name=${node_train_name}"

  rc=0
  if [[ "$host" == "$local_short" || "$host" == "$local_full" ]]; then
    # This machine is one of the hosts: no ssh hop needed.
    bash -lc "$remote_cmd" || rc=$?
  else
    # ssh hands the string to the remote login shell, so the command needs one
    # more level of quoting to arrive at `bash -lc` as a single argument.
    ssh "$host" "bash -lc $(printf '%q' "$remote_cmd")" || rc=$?
  fi

  # Best-effort: keep going so the remaining hosts are still stopped, but make
  # the failure visible instead of discarding it.
  if ((rc != 0)); then
    printf 'warning: stop on host=%s train_name=%s exited with status %d; continuing\n' \
      "$host" "$node_train_name" "$rc" >&2
  fi
done