Initial Commit
This commit is contained in:
94
scripts/kaiyuan2b-profiling/training_smoke_qwen3_1p7b.sh
Normal file
94
scripts/kaiyuan2b-profiling/training_smoke_qwen3_1p7b.sh
Normal file
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env bash
#
# Launcher for the qwen3_1p7b smoke / profiling training run.
#
# Usage: training_smoke_qwen3_1p7b.sh [MODE] [TRAIN_NAME]
#   MODE        selects which run arguments are used further down
#               (default: qwen3_1p7b_smoke)
#   TRAIN_NAME  sub-directory name for TensorBoard logs and checkpoints
#               (default: qwen3_1p7b_smoke)
#
# NOTE: the params/*.sh files below are sourced through relative paths, so the
# script has to be started from the directory that contains params/.
set -euo pipefail

# Restrict the job to GPU indices 0-7.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export CUDA_DEVICE_MAX_CONNECTIONS=1

MODE=${1:-qwen3_1p7b_smoke}
TRAIN_NAME=${2:-qwen3_1p7b_smoke}

# Fixed installation and output locations.
MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
ARTIFACT_ROOT=/apps/yi/model_training/artifacts
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"

# Parameter fragments. They are presumably where OPTIM_ARGS, MODEL_ARGS,
# DATA_ARGS and HPARAMS (all used later in this script) get defined --
# verify against the params/ files. With `set -e` active, a missing file
# aborts the script here.
source params/optim_common.sh
source params/qwen3_1p7b/model.sh
source params/qwen3_1p7b/data_phase1_smoke.sh
source params/qwen3_1p7b/hparams.sh
# Argument groups for pretrain_gpt.py.
#
# Each group is a plain whitespace-separated string: the launch command
# expands these variables unquoted so that the shell splits them into
# individual arguments. Values therefore must not contain spaces.

# Logging / TensorBoard flags; every logging interval is set to 1.
LOGGING_ARGS="
  --tensorboard-dir ${TB_DIR}
  --tensorboard-log-interval 1
  --log-interval 1
  --log-timers-to-tensorboard
  --log-validation-ppl-to-tensorboard
  --log-memory-to-tensorboard
  --log-world-size-to-tensorboard
  --log-num-zeros-in-grad
  --log-device-memory-used
  --log-throughput
  --log-params-norm
"

# Numeric precision flag.
PRECISION_ARGS="--bf16"

# Model parallelism: tensor- and pipeline-parallel sizes are both 1.
PARALLEL_ARGS="
  --tensor-model-parallel-size 1
  --pipeline-model-parallel-size 1
"
# Alternative layout kept for reference (tensor parallel 2 + sequence parallel):
# PARALLEL_ARGS="
#   --tensor-model-parallel-size 2
#   --sequence-parallel
# "
# Pick the run-specific argument string (RUN_ARGS) for the requested MODE.
# An unrecognised MODE is reported on stderr and aborts with status 1.
case "$MODE" in
  qwen3_1p7b_smoke)
    # hparams.sh is also sourced unconditionally near the top of the script;
    # the second source is kept so behaviour stays exactly as before.
    source params/qwen3_1p7b/hparams.sh
    RUN_ARGS="$HPARAMS"
    ;;
  phase1)
    source params/phase1_full.sh
    RUN_ARGS="$PHASE_ARGS"
    ;;
  *)
    echo "Unknown mode: $MODE" >&2
    exit 1
    ;;
esac
# Create the checkpoint and TensorBoard directories up front (no error if
# they already exist).
mkdir -p "$CKPT_DIR" "$TB_DIR"

# torchrun launcher options: one node (rank 0) with 8 processes, using
# localhost:6000 as the master address/port. Kept as a whitespace-separated
# string because the launch command expands it unquoted.
DISTRIBUTED_ARGS="
  --nproc_per_node 8
  --nnodes 1
  --node_rank 0
  --master_addr localhost
  --master_port 6000
"
# Launch pretrain_gpt.py through torchrun, wrapped in `nsys profile`.
#
# nsys side: the report name contains %p, and the capture range is tied to
# the CUDA profiler API (--capture-range=cudaProfilerApi with
# --capture-range-end=stop), so collection is driven by the training process
# rather than covering the whole run.
# Training side: --profile with --profile-step-start 10 / --profile-step-end 12
# and --profile-ranks 0 1 is passed to pretrain_gpt.py, together with CUDA
# graphs (3 warm-up steps) and the transformer_engine implementation.
#
# The *_ARGS variables are intentionally left unquoted: each holds several
# whitespace-separated arguments that must be split into separate words.
# shellcheck disable=SC2086
nsys profile \
  -s none \
  -t cuda,nvtx,cudnn,cublas \
  -o megatron_8gpu_%p \
  --force-overwrite true \
  --capture-range=cudaProfilerApi \
  --capture-range-end=stop \
  --cuda-graph-trace=node \
  torchrun $DISTRIBUTED_ARGS \
  "$MEGATRON_PATH/pretrain_gpt.py" \
  $MODEL_ARGS \
  $OPTIM_ARGS \
  $PRECISION_ARGS \
  $PARALLEL_ARGS \
  $DATA_ARGS \
  $RUN_ARGS \
  $LOGGING_ARGS \
  --save "$CKPT_DIR" \
  --load "$CKPT_DIR" \
  --enable-cuda-graph \
  --cuda-graph-warmup-steps 3 \
  --profile \
  --profile-step-start 10 \
  --profile-step-end 12 \
  --profile-ranks 0 1 \
  --transformer-impl transformer_engine \
  --cross-entropy-loss-fusion \
  --cross-entropy-fusion-impl te

# Currently disabled flags, kept for reference:
#--use-distributed-optimizer
#--overlap-grad-reduce \
# --overlap-param-gather \
Reference in New Issue
Block a user