#!/usr/bin/env bash
#
# Launch Megatron-LM `pretrain_gpt.py` on a single node with 8 processes via
# torchrun, wrapped in an `nsys profile` session.
#
# Usage: <script> [MODE] [TRAIN_NAME]
#   MODE        qwen3_1p7b_smoke (default) | phase1
#   TRAIN_NAME  sub-directory name used for TensorBoard logs and checkpoints
#               (default: qwen3_1p7b_smoke)
#
# The `source params/...` paths are relative, so the script must be started
# from the directory that contains `params/`.
set -euo pipefail

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export CUDA_DEVICE_MAX_CONNECTIONS=1

MODE=${1:-qwen3_1p7b_smoke}
TRAIN_NAME=${2:-qwen3_1p7b_smoke}

MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
ARTIFACT_ROOT=/apps/yi/model_training/artifacts
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"

# Parameter files. They are expected to define OPTIM_ARGS, MODEL_ARGS,
# DATA_ARGS and HPARAMS, which are consumed below (their contents are not
# visible from this script).
source params/optim_common.sh
source params/qwen3_1p7b/model.sh
source params/qwen3_1p7b/data_phase1_smoke.sh
source params/qwen3_1p7b/hparams.sh

# NOTE: the *_ARGS variables are whitespace-separated flag strings and are
# kept as plain strings under their original names, since the sourced
# parameter files may read them.
LOGGING_ARGS="
  --tensorboard-dir ${TB_DIR}
  --tensorboard-log-interval 1
  --log-interval 1
  --log-timers-to-tensorboard
  --log-validation-ppl-to-tensorboard
  --log-memory-to-tensorboard
  --log-world-size-to-tensorboard
  --log-num-zeros-in-grad
  --log-device-memory-used
  --log-throughput
  --log-params-norm
"

PRECISION_ARGS="--bf16"

PARALLEL_ARGS="
  --tensor-model-parallel-size 1
  --pipeline-model-parallel-size 1
"
# Alternative layout, currently disabled:
# PARALLEL_ARGS="
#   --tensor-model-parallel-size 2
#   --sequence-parallel
# "

# Select the run-specific arguments for the requested mode.
# `${VAR?msg}` aborts with a readable message only when VAR is unset; an
# empty value is still accepted, exactly as a bare "$VAR" would be.
case "$MODE" in
  qwen3_1p7b_smoke)
    # NOTE(review): hparams.sh was already sourced above; this second source
    # looks redundant but is kept in case the file is order-sensitive.
    source params/qwen3_1p7b/hparams.sh
    RUN_ARGS="${HPARAMS?HPARAMS must be set by params/qwen3_1p7b/hparams.sh}"
    ;;
  phase1)
    source params/phase1_full.sh
    RUN_ARGS="${PHASE_ARGS?PHASE_ARGS must be set by params/phase1_full.sh}"
    ;;
  *)
    # Diagnostics belong on stderr so they are not lost in redirected stdout.
    echo "Unknown mode: $MODE" >&2
    exit 1
    ;;
esac

# Fail early, with a clear message, if a sourced file did not define the
# variable the launch command depends on (otherwise `set -u` would abort
# with a bare "unbound variable" at launch time).
: "${MODEL_ARGS?MODEL_ARGS must be set by params/qwen3_1p7b/model.sh}"
: "${OPTIM_ARGS?OPTIM_ARGS must be set by params/optim_common.sh}"
: "${DATA_ARGS?DATA_ARGS must be set by params/qwen3_1p7b/data_phase1_smoke.sh}"

mkdir -p "$CKPT_DIR" "$TB_DIR"

DISTRIBUTED_ARGS="
  --nproc_per_node 8
  --nnodes 1
  --node_rank 0
  --master_addr localhost
  --master_port 6000
"

# nsys is started with --capture-range=cudaProfilerApi, and the training
# script receives --profile / --profile-step-start 10 / --profile-step-end 12
# / --profile-ranks 0 1.
# NOTE(review): presumably Megatron-LM opens the profiler range around those
# steps on those ranks — confirm against the Megatron-LM version in use.
#
# The *_ARGS expansions are intentionally unquoted: each one is a
# whitespace-separated list of flags that must be split into separate words.
# shellcheck disable=SC2086
nsys profile \
  -s none \
  -t cuda,nvtx,cudnn,cublas \
  -o megatron_8gpu_%p \
  --force-overwrite true \
  --capture-range=cudaProfilerApi \
  --capture-range-end=stop \
  --cuda-graph-trace=node \
  torchrun $DISTRIBUTED_ARGS \
  "${MEGATRON_PATH}/pretrain_gpt.py" \
  $MODEL_ARGS \
  $OPTIM_ARGS \
  $PRECISION_ARGS \
  $PARALLEL_ARGS \
  $DATA_ARGS \
  $RUN_ARGS \
  $LOGGING_ARGS \
  --save "$CKPT_DIR" \
  --load "$CKPT_DIR" \
  --enable-cuda-graph \
  --cuda-graph-warmup-steps 3 \
  --profile \
  --profile-step-start 10 \
  --profile-step-end 12 \
  --profile-ranks 0 1 \
  --transformer-impl transformer_engine \
  --cross-entropy-loss-fusion \
  --cross-entropy-fusion-impl te
  # Currently disabled flags:
  #--use-distributed-optimizer
  #--overlap-grad-reduce \
  # --overlap-param-gather \