feat: incorporate multi-device training scripts and README
This commit is contained in:
163
scripts/kaiyuan2b-training/start_multinode_training.sh
Executable file
163
scripts/kaiyuan2b-training/start_multinode_training.sh
Executable file
@@ -0,0 +1,163 @@
|
||||
#!/usr/bin/env bash
#
# Launch a multi-node training run: one background trainer process per
# cluster host, coordinated via torch-distributed style environment
# variables (see usage below).
set -euo pipefail

# Absolute directory holding this script, independent of the caller's CWD.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Root directory for all run artifacts; callers may override ARTIFACT_ROOT.
ARTIFACT_ROOT="${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}"
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"   # per-node pid/metadata files
LOG_DIR="${ARTIFACT_ROOT}/logs"              # per-node training logs
|
||||
|
||||
# Print invocation help and every environment override the script honors.
# Fix: ARTIFACT_ROOT, NNODES and NCCL_IB_DISABLE are read by the script but
# were previously missing from this help text.
usage() {
  cat <<'EOF'
Usage:
  bash start_multinode_training.sh <model> [mode] [train_name]

Default cluster:
  g0033,g0034,g0035,g0036 with 8 GPUs per host.

Environment overrides:
  ARTIFACT_ROOT=/apps/yi/model_training/artifacts
  HOSTS="g0033 g0034 g0035 g0036"
  NNODES=4
  MASTER_ADDR=g0033
  MASTER_PORT=6000
  NPROC_PER_NODE=8
  ZERO_STAGE=0|1|2
  CONTAINER_NAME=megatron-ngc25-training
  NCCL_SOCKET_IFNAME=eth0
  GLOO_SOCKET_IFNAME=eth0
  NCCL_IB_HCA=mlx5_0,mlx5_1
  NCCL_IB_DISABLE=0|1
  NCCL_DEBUG=INFO
  CHECKPOINT_KEEP_RECENT=3
  CHECKPOINT_CLEANUP_INTERVAL_SECONDS=300
  EXTRA_ARGS="--exit-duration-in-mins 120"

Examples:
  bash start_multinode_training.sh qwen3_1p7b qwen3_1p7b_smoke_yi qwen3_32gpu
  CONTAINER_NAME=megatron-ngc25-training bash start_multinode_training.sh qwen3_1p7b qwen3_1p7b_smoke_yi qwen3_32gpu
  ZERO_STAGE=1 bash start_multinode_training.sh qwen3_1p7b phase1 qwen3_phase1_zero1
EOF
}
|
||||
|
||||
# ----- Positional arguments -------------------------------------------------
model=${1:-}
mode=${2:-}
train_name=${3:-}

# No model, or an explicit help flag: print usage and exit successfully.
case "$model" in
  ''|-h|--help)
    usage
    exit 0
    ;;
esac

# Map the model name to its training script, and fill in per-model defaults
# for mode/train_name when the caller left them blank.
case "$model" in
  gpt_smoke)
    train_script="${SCRIPT_DIR}/training_smoke_gpt2.sh"
    : "${mode:=smoke}"
    : "${train_name:=smoke_gpt_multinode}"
    ;;
  qwen3_1p7b)
    train_script="${SCRIPT_DIR}/training_smoke_qwen3_1p7b.sh"
    : "${mode:=qwen3_1p7b_smoke_yi}"
    : "${train_name:=qwen3_1p7b_multinode}"
    ;;
  *)
    echo "Unknown model: $model" >&2
    usage >&2
    exit 1
    ;;
esac
|
||||
|
||||
# ----------------------------------------------------------------------------
# Cluster topology and launcher configuration (all overridable via env vars).
# ----------------------------------------------------------------------------
read -r -a HOST_ARRAY <<< "${HOSTS:-g0033 g0034 g0035 g0036}"
# Fix: an empty HOSTS previously crashed later on ${HOST_ARRAY[0]} with an
# opaque "unbound variable" error under set -u; fail early and clearly.
if [ "${#HOST_ARRAY[@]}" -eq 0 ]; then
  echo "HOSTS must contain at least one hostname" >&2
  exit 1
fi
NNODES=${NNODES:-${#HOST_ARRAY[@]}}
NPROC_PER_NODE=${NPROC_PER_NODE:-8}
MASTER_ADDR=${MASTER_ADDR:-${HOST_ARRAY[0]}}   # rank-0 host by default
MASTER_PORT=${MASTER_PORT:-6000}
ZERO_STAGE=${ZERO_STAGE:-0}
CONTAINER_NAME=${CONTAINER_NAME:-}             # empty: run on the host directly
# NCCL/GLOO networking knobs; empty means "let the library decide".
NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-}
GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-}
NCCL_IB_HCA=${NCCL_IB_HCA:-}
NCCL_DEBUG=${NCCL_DEBUG:-}
NCCL_IB_DISABLE=${NCCL_IB_DISABLE:-}
CHECKPOINT_KEEP_RECENT=${CHECKPOINT_KEEP_RECENT:-3}
CHECKPOINT_CLEANUP_INTERVAL_SECONDS=${CHECKPOINT_CLEANUP_INTERVAL_SECONDS:-300}
# Always request a clean shutdown on signal; user-supplied EXTRA_ARGS follow.
EXTRA_ARGS="--exit-signal-handler ${EXTRA_ARGS:-}"

mkdir -p "$RUN_STATE_DIR" "$LOG_DIR"

echo "Starting multinode training: model=${model}, mode=${mode}, train_name=${train_name}"
echo "Hosts: ${HOST_ARRAY[*]}"
echo "Distributed: nnodes=${NNODES}, nproc_per_node=${NPROC_PER_NODE}, master=${MASTER_ADDR}:${MASTER_PORT}, zero_stage=${ZERO_STAGE}"
if [ -n "$CONTAINER_NAME" ]; then
  echo "Container: ${CONTAINER_NAME}"
fi
|
||||
|
||||
# Launch one background training process per host, recording its PID/PGID and
# launch metadata so the companion stop script can tear the run down later.
#
# The per-node command is built in two expansion layers:
#   - the outer <<EOF here-doc expands *now*, on the launcher, baking the
#     per-node values (paths, ranks, master address, NCCL settings) into text;
#   - escaped dollars (\$) survive that local expansion and are evaluated on
#     the target host instead (e.g. \$! captures the remote trainer's PID);
#   - the inner <<'RUN_CMD' here-doc is consumed on the target host; its
#     quoted delimiter passes the already-baked text through unchanged into
#     the remote $run_cmd variable, which is then run via `bash -lc`.
for idx in "${!HOST_ARRAY[@]}"; do
host=${HOST_ARRAY[$idx]}
# The array index doubles as the distributed node rank.
node_rank=$idx
node_train_name="${train_name}_node${node_rank}"
# Per-node bookkeeping files (read back by stop_multinode_training.sh).
pid_file="${RUN_STATE_DIR}/${node_train_name}.pid"
meta_file="${RUN_STATE_DIR}/${node_train_name}.env"
log_file="${LOG_DIR}/${node_train_name}.log"

remote_cmd=$(cat <<EOF
set -euo pipefail
run_cmd=\$(cat <<'RUN_CMD'
mkdir -p "$RUN_STATE_DIR" "$LOG_DIR"
cd "$SCRIPT_DIR"
ARTIFACT_ROOT="$ARTIFACT_ROOT" \\
CHECKPOINT_KEEP_RECENT="$CHECKPOINT_KEEP_RECENT" \\
CHECKPOINT_CLEANUP_INTERVAL_SECONDS="$CHECKPOINT_CLEANUP_INTERVAL_SECONDS" \\
NPROC_PER_NODE="$NPROC_PER_NODE" \\
NNODES="$NNODES" \\
NODE_RANK="$node_rank" \\
MASTER_ADDR="$MASTER_ADDR" \\
MASTER_PORT="$MASTER_PORT" \\
ZERO_STAGE="$ZERO_STAGE" \\
NCCL_SOCKET_IFNAME="$NCCL_SOCKET_IFNAME" \\
GLOO_SOCKET_IFNAME="$GLOO_SOCKET_IFNAME" \\
NCCL_IB_HCA="$NCCL_IB_HCA" \\
NCCL_DEBUG="$NCCL_DEBUG" \\
NCCL_IB_DISABLE="$NCCL_IB_DISABLE" \\
EXTRA_ARGS="$EXTRA_ARGS" \\
setsid bash "$train_script" "$mode" "$train_name" > "$log_file" 2>&1 < /dev/null &
pid=\$!
pgid=\$(ps -o pgid= -p "\$pid" | tr -d ' ' || true)
printf '%s\n' "\$pid" > "$pid_file"
cat > "$meta_file" <<META
MODEL=$model
MODE=$mode
TRAIN_NAME=$train_name
NODE_TRAIN_NAME=$node_train_name
HOST=$host
NODE_RANK=$node_rank
NNODES=$NNODES
NPROC_PER_NODE=$NPROC_PER_NODE
MASTER_ADDR=$MASTER_ADDR
MASTER_PORT=$MASTER_PORT
ZERO_STAGE=$ZERO_STAGE
NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME
GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME
NCCL_IB_HCA=$NCCL_IB_HCA
NCCL_DEBUG=$NCCL_DEBUG
NCCL_IB_DISABLE=$NCCL_IB_DISABLE
PID=\$pid
PGID=\$pgid
LOG_FILE=$log_file
TRAIN_SCRIPT=$train_script
META
echo "host=$host node_rank=$node_rank pid=\$pid pgid=\${pgid:-unknown} log=$log_file"
RUN_CMD
)
if [ -n "$CONTAINER_NAME" ]; then
docker exec "$CONTAINER_NAME" bash -lc "\$run_cmd"
else
bash -lc "\$run_cmd"
fi
EOF
)

# Run locally when the target is this machine; otherwise dispatch over ssh.
# printf %q re-quotes the whole command so it survives ssh's extra round of
# remote word splitting.
# NOTE(review): assumes non-interactive (key-based) ssh access to each host
# and, when CONTAINER_NAME is set, an already-running container — confirm.
if [ "$host" = "$(hostname -s)" ] || [ "$host" = "$(hostname)" ]; then
bash -lc "$remote_cmd"
else
ssh "$host" "bash -lc $(printf '%q' "$remote_cmd")"
fi
done

# Final summary: point the operator at the matching teardown command.
echo "Launched ${NNODES} nodes. Stop all nodes with: bash ${SCRIPT_DIR}/stop_multinode_training.sh ${train_name}"
|
||||
Reference in New Issue
Block a user