Files
pretrain_kaiyuan2b/scripts/kaiyuan2b-training/training_smoke_qwen3_1p7b.sh
2026-05-06 15:06:07 +08:00

84 lines
1.9 KiB
Bash

#!/usr/bin/env bash
#
# Launcher for a Megatron-LM GPT pretraining run using the qwen3_1p7b
# parameter files.
#
# Usage: training_smoke_qwen3_1p7b.sh [MODE] [TRAIN_NAME]
#   MODE        run profile to launch      (default: qwen3_1p7b_smoke_yi)
#   TRAIN_NAME  name of the per-run TensorBoard and checkpoint
#               sub-directories             (default: qwen3_1p7b_smoke_yi)
#
# Abort on any failing command, unset variable, or failing pipeline stage.
set -euo pipefail

# CUDA environment exported to every process started by this script:
# eight visible devices (0-7) and a single device connection.
export CUDA_DEVICE_MAX_CONNECTIONS=1
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Positional arguments, both optional.
MODE="${1:-qwen3_1p7b_smoke_yi}"
TRAIN_NAME="${2:-qwen3_1p7b_smoke_yi}"

# Fixed locations of the Megatron-LM checkout, the artifact tree and the
# directory that holds this script's parameter files.
MEGATRON_PATH='/apps/yi/model_training/Megatron-LM'
ARTIFACT_ROOT='/apps/yi/model_training/artifacts'
SCRIPT_DIR='/apps/yi/model_training/scripts/kaiyuan2b-training'
PARAMS_DIR="$SCRIPT_DIR/params"

# Per-run output directories, keyed by TRAIN_NAME.
TB_DIR="$ARTIFACT_ROOT/tb_logs/$TRAIN_NAME"
CKPT_DIR="$ARTIFACT_ROOT/checkpoints/$TRAIN_NAME"

# Load the parameter files in this exact order; later files may depend on
# (or override) variables set by earlier ones.
# NOTE(review): these files are presumably what define MODEL_ARGS,
# OPTIM_ARGS, DATA_ARGS and HPARAMS used further down - confirm.
for params_file in \
  optim_common.sh \
  qwen3_1p7b/model.sh \
  qwen3_1p7b/data_phase1_smoke.sh \
  qwen3_1p7b/hparams.sh; do
  # shellcheck disable=SC1090 # path is assembled at run time
  source "${PARAMS_DIR}/${params_file}"
done
# Flag strings below are consumed through unquoted expansion on the launch
# command line, so each one is a whitespace-separated list of tokens. Every
# string keeps its trailing newline so that it stays separated from
# whatever follows it on that command line.

# TensorBoard output directory, plus a TensorBoard interval and a log
# interval of 1 each.
LOGGING_ARGS="
--tensorboard-dir ${TB_DIR}
--tensorboard-log-interval 1
--log-interval 1
"
# Extra quantities written to TensorBoard.
LOGGING_ARGS+='--log-timers-to-tensorboard
--log-validation-ppl-to-tensorboard
--log-memory-to-tensorboard
--log-world-size-to-tensorboard
'
# Further per-run diagnostics.
LOGGING_ARGS+='--log-num-zeros-in-grad
--log-device-memory-used
--log-throughput
--log-params-norm
'

# Run in bf16 precision.
PRECISION_ARGS='--bf16'

# Active layout: tensor-parallel size 1 and pipeline-parallel size 1.
PARALLEL_ARGS='
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 1
'
# Disabled alternative: tensor-parallel size 2 with sequence parallelism.
# PARALLEL_ARGS="
# --tensor-model-parallel-size 2
# --sequence-parallel
# "
# Pick the run-specific flag string (RUN_ARGS) for the requested MODE.
#   qwen3_1p7b_smoke_yi  -> HPARAMS    from params/qwen3_1p7b/hparams.sh
#   phase1               -> PHASE_ARGS from params/phase1_full.sh
# Any other value is rejected with exit status 1. HPARAMS and PHASE_ARGS are
# expected to be set by the sourced files; under `set -u` the script aborts
# if they are not.
case "$MODE" in
  qwen3_1p7b_smoke_yi)
    # NOTE(review): hparams.sh is also sourced once during setup; the second
    # load is kept because its order-sensitivity cannot be judged from
    # here - confirm whether it is redundant.
    source "${PARAMS_DIR}/qwen3_1p7b/hparams.sh"
    RUN_ARGS="$HPARAMS"
    ;;
  phase1)
    source "${PARAMS_DIR}/phase1_full.sh"
    RUN_ARGS="$PHASE_ARGS"
    ;;
  *)
    # Diagnostics belong on stderr so they are not mixed into captured stdout.
    printf 'Unknown mode: %s\n' "$MODE" >&2
    printf 'Supported modes: qwen3_1p7b_smoke_yi, phase1\n' >&2
    exit 1
    ;;
esac
# Create the checkpoint and TensorBoard directories for this run; -p makes
# this a no-op when they already exist.
mkdir -p "${CKPT_DIR}" "${TB_DIR}"

# torchrun launcher options, kept as one whitespace-separated string that is
# expanded unquoted on the launch command line.
# One node (rank 0) running 8 worker processes, matching the eight devices
# listed in CUDA_VISIBLE_DEVICES.
DISTRIBUTED_ARGS='
--nproc_per_node 8
--nnodes 1
--node_rank 0
'
# Master address and port on the local host.
DISTRIBUTED_ARGS+='--master_addr localhost
--master_port 6000
'
# Launch Megatron-LM's pretrain_gpt.py under torchrun.
#
# Every *_ARGS variable is a whitespace-separated flag string, so it is
# expanded unquoted on purpose to let the shell split it into individual
# arguments. Real paths (the entry script and the checkpoint directory) are
# quoted. Each continuation line ends in " \" with a space before the
# backslash, so adjacent tokens can never be glued together regardless of
# how the flag strings happen to end.
#
# Checkpoints are both saved to and loaded from CKPT_DIR. The trailing flags
# enable CUDA graphs (3 warm-up steps), select the transformer_engine
# implementation, and turn on the fused cross-entropy loss with the "te"
# implementation.
# NOTE(review): MODEL_ARGS, OPTIM_ARGS and DATA_ARGS are presumably set by
# the sourced parameter files - confirm.
# shellcheck disable=SC2086 # word splitting of the *_ARGS strings is intended
torchrun $DISTRIBUTED_ARGS \
  "${MEGATRON_PATH}/pretrain_gpt.py" \
  $MODEL_ARGS \
  $OPTIM_ARGS \
  $PRECISION_ARGS \
  $PARALLEL_ARGS \
  $DATA_ARGS \
  $RUN_ARGS \
  $LOGGING_ARGS \
  --save "$CKPT_DIR" \
  --load "$CKPT_DIR" \
  --enable-cuda-graph \
  --cuda-graph-warmup-steps 3 \
  --transformer-impl transformer_engine \
  --cross-entropy-loss-fusion \
  --cross-entropy-fusion-impl te