feat: incorporate multi-device training scripts and README
This commit is contained in:
44
scripts/kaiyuan2b-training/stop_multinode_training.sh
Executable file
44
scripts/kaiyuan2b-training/stop_multinode_training.sh
Executable file
@@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env bash
#
# Stop a multi-node training run: for every host in HOSTS, run
# stop_training.sh (from this script's directory) for that node's
# "<train_name>_node<idx>" job, locally or over ssh.

# Strict mode: abort on unhandled errors, unset variables and failed
# pipeline stages.
set -euo pipefail

# Absolute directory containing this script. CDPATH is cleared and cd's
# stdout discarded because, with CDPATH set, `cd` on a relative path may
# print the directory it resolved, which would otherwise be captured
# together with the output of `pwd`.
SCRIPT_DIR=$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null && pwd)

#######################################
# Print the command-line help text.
# The values shown for HOSTS, ARTIFACT_ROOT and GRACE_SECONDS are the
# defaults this script applies when the variable is unset or empty.
# CONTAINER_NAME is only an example value: it defaults to empty, in which
# case the stop command is run without `docker exec`.
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'
Usage:
  bash stop_multinode_training.sh <train_name>

Environment overrides:
  HOSTS="g0033 g0034 g0035 g0036"
  CONTAINER_NAME=megatron-ngc25-training
  ARTIFACT_ROOT=/apps/yi/model_training/artifacts
  GRACE_SECONDS=600
EOF
}

# The only positional argument is the base training name; each node's job
# name is derived from it later by appending "_node<idx>".
train_name=${1:-}

case "$train_name" in
  -h | --help)
    # Help was explicitly requested: print to stdout and succeed.
    usage
    exit 0
    ;;
  '')
    # Required argument missing: this is a usage error, so report it on
    # stderr with a non-zero status so that callers and automation do not
    # mistake "nothing was stopped" for success.
    usage >&2
    exit 2
    ;;
esac

# ---------------------------------------------------------------------------
# Fan the stop request out to every host.
#
# Environment (all optional):
#   HOSTS           whitespace-separated host list; position in the list is
#                   the node index used in "<train_name>_node<idx>".
#   CONTAINER_NAME  when non-empty, the stop command is wrapped in
#                   `docker exec <CONTAINER_NAME> bash -lc ...`.
#   ARTIFACT_ROOT   forwarded to stop_training.sh.
#   GRACE_SECONDS   forwarded to stop_training.sh.
#
# The loop is best-effort: a failure on one host is reported on stderr but
# does not prevent the remaining hosts from being processed, and does not
# change the script's exit status.
# ---------------------------------------------------------------------------
read -r -a HOST_ARRAY <<< "${HOSTS:-g0033 g0034 g0035 g0036}"
CONTAINER_NAME=${CONTAINER_NAME:-}
artifact_root=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
grace_seconds=${GRACE_SECONDS:-600}

# A whitespace-only HOSTS yields an empty array; fail loudly instead of
# silently stopping nothing.
if (( ${#HOST_ARRAY[@]} == 0 )); then
  printf 'error: HOSTS does not contain any host name\n' >&2
  exit 1
fi

# Resolve the local host names once instead of on every iteration. A failing
# `hostname` simply means no entry is treated as local.
local_short=$(hostname -s) || local_short=''
local_full=$(hostname) || local_full=''

for idx in "${!HOST_ARRAY[@]}"; do
  host=${HOST_ARRAY[$idx]}
  node_train_name="${train_name}_node${idx}"

  # Build the command with %q so every value survives re-parsing by the
  # inner `bash -lc` unchanged, even if it contains spaces, quotes, `$` or
  # backticks.
  # NOTE(review): this assumes SCRIPT_DIR is reachable under the same path on
  # each host (and inside the container when CONTAINER_NAME is set) — confirm.
  printf -v stop_cmd \
    'cd %q && ARTIFACT_ROOT=%q GRACE_SECONDS=%q bash stop_training.sh %q' \
    "$SCRIPT_DIR" "$artifact_root" "$grace_seconds" "$node_train_name"

  if [[ -n "$CONTAINER_NAME" ]]; then
    printf -v remote_cmd 'docker exec %q bash -lc %q' \
      "$CONTAINER_NAME" "$stop_cmd"
  else
    remote_cmd=$stop_cmd
  fi

  echo "Stopping host=${host}, train_name=${node_train_name}"

  # Run directly when the entry names this machine, otherwise over ssh.
  # The status is captured with `|| rc=$?` so `set -e` does not abort the
  # loop on the first failing host.
  rc=0
  if [[ "$host" == "$local_short" || "$host" == "$local_full" ]]; then
    bash -lc "$remote_cmd" || rc=$?
  else
    # ssh hands its command to the remote shell as a single string, so the
    # whole command is quoted once more for that extra parsing layer.
    ssh "$host" "bash -lc $(printf '%q' "$remote_cmd")" || rc=$?
  fi

  if (( rc != 0 )); then
    printf 'warning: stop command failed on host=%s train_name=%s (exit %d); continuing\n' \
      "$host" "$node_train_name" "$rc" >&2
  fi
done
Reference in New Issue
Block a user