#!/usr/bin/env bash
# Launcher preamble for multinode Megatron training: parses positional args
# (model, mode, train_name), resolves the training script for the chosen
# model, and establishes cluster/distributed defaults (overridable via env).
set -euo pipefail

# Directory containing this script; training scripts are resolved relative to it.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
LOG_DIR="${ARTIFACT_ROOT}/logs"

# Print help text. Quoted heredoc delimiter: no expansion inside the body.
usage() {
  cat <<'EOF'
Usage: bash start_multinode_training.sh [model] [mode] [train_name]

Default cluster: g0033,g0034,g0035,g0036 with 8 GPUs per host.

Environment overrides:
  HOSTS="g0033 g0034 g0035 g0036"
  MASTER_ADDR=g0033
  MASTER_PORT=6000
  NPROC_PER_NODE=8
  ZERO_STAGE=0|1|2
  CONTAINER_NAME=megatron-ngc25-training
  NCCL_SOCKET_IFNAME=eth0
  GLOO_SOCKET_IFNAME=eth0
  NCCL_IB_HCA=mlx5_0,mlx5_1
  NCCL_DEBUG=INFO
  CHECKPOINT_KEEP_RECENT=3
  CHECKPOINT_CLEANUP_INTERVAL_SECONDS=300
  EXTRA_ARGS="--exit-duration-in-mins 120"

Examples:
  bash start_multinode_training.sh qwen3_1p7b qwen3_1p7b_smoke_yi qwen3_32gpu
  CONTAINER_NAME=megatron-ngc25-training bash start_multinode_training.sh qwen3_1p7b qwen3_1p7b_smoke_yi qwen3_32gpu
  ZERO_STAGE=1 bash start_multinode_training.sh qwen3_1p7b phase1 qwen3_phase1_zero1
EOF
}

# Positional arguments; all optional so that -h / no-args can show help.
model=${1:-}
mode=${2:-}
train_name=${3:-}

if [ -z "$model" ] || [ "$model" = "-h" ] || [ "$model" = "--help" ]; then
  usage
  exit 0
fi

# Map the model name to its training script and per-model defaults for
# mode/train_name (only applied when the caller left them empty).
case "$model" in
  gpt_smoke)
    train_script="${SCRIPT_DIR}/training_smoke_gpt2.sh"
    mode=${mode:-smoke}
    train_name=${train_name:-smoke_gpt_multinode}
    ;;
  qwen3_1p7b)
    train_script="${SCRIPT_DIR}/training_smoke_qwen3_1p7b.sh"
    mode=${mode:-qwen3_1p7b_smoke_yi}
    train_name=${train_name:-qwen3_1p7b_multinode}
    ;;
  *)
    echo "Unknown model: $model" >&2
    usage >&2
    exit 1
    ;;
esac

# Cluster topology: HOSTS is a space-separated list; NNODES defaults to its size.
read -r -a HOST_ARRAY <<< "${HOSTS:-g0033 g0034 g0035 g0036}"
NNODES=${NNODES:-${#HOST_ARRAY[@]}}
NPROC_PER_NODE=${NPROC_PER_NODE:-8}
# Rendezvous master defaults to the first host in the list.
MASTER_ADDR=${MASTER_ADDR:-${HOST_ARRAY[0]}}
MASTER_PORT=${MASTER_PORT:-6000}
ZERO_STAGE=${ZERO_STAGE:-0}
CONTAINER_NAME=${CONTAINER_NAME:-}
# NCCL/Gloo knobs: empty string means "leave unset / use library defaults".
NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-}
GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-}
NCCL_IB_HCA=${NCCL_IB_HCA:-}
NCCL_DEBUG=${NCCL_DEBUG:-}
NCCL_IB_DISABLE=${NCCL_IB_DISABLE:-}
CHECKPOINT_KEEP_RECENT=${CHECKPOINT_KEEP_RECENT:-3} CHECKPOINT_CLEANUP_INTERVAL_SECONDS=${CHECKPOINT_CLEANUP_INTERVAL_SECONDS:-300} EXTRA_ARGS="--exit-signal-handler ${EXTRA_ARGS:-}" mkdir -p "$RUN_STATE_DIR" "$LOG_DIR" echo "Starting multinode training: model=${model}, mode=${mode}, train_name=${train_name}" echo "Hosts: ${HOST_ARRAY[*]}" echo "Distributed: nnodes=${NNODES}, nproc_per_node=${NPROC_PER_NODE}, master=${MASTER_ADDR}:${MASTER_PORT}, zero_stage=${ZERO_STAGE}" if [ -n "$CONTAINER_NAME" ]; then echo "Container: ${CONTAINER_NAME}" fi for idx in "${!HOST_ARRAY[@]}"; do host=${HOST_ARRAY[$idx]} node_rank=$idx node_train_name="${train_name}_node${node_rank}" pid_file="${RUN_STATE_DIR}/${node_train_name}.pid" meta_file="${RUN_STATE_DIR}/${node_train_name}.env" log_file="${LOG_DIR}/${node_train_name}.log" remote_cmd=$(cat < "$log_file" 2>&1 < /dev/null & pid=\$! pgid=\$(ps -o pgid= -p "\$pid" | tr -d ' ' || true) printf '%s\n' "\$pid" > "$pid_file" cat > "$meta_file" <