Initial Commit

2026-05-06 15:06:07 +08:00
parent b5ac2c8ed5
commit f154c1611d
29 changed files with 1068 additions and 0 deletions

View File

@@ -0,0 +1,69 @@
#!/bin/bash
# Start a text generation server for the smoke_gpt checkpoint (the downscaled Qwen3-style smoke model).
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
# <Path to checkpoint (e.g. /345m)>
CHECKPOINT=/apps/yi/model_training/artifacts/checkpoints/smoke_gpt
# <Path to vocab.json (e.g. /gpt2-vocab.json)>
VOCAB_FILE=/apps/yi/model_training/data/tokenizer/vocab.json
# <Path to merges.txt (e.g. /gpt2-merges.txt)>
MERGE_FILE=/apps/yi/model_training/data/tokenizer/merges.txt
# <Path to tokenizer>
TOKENIZER_PATH=/apps/yi/model_training/data/tokenizer
MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
export CUDA_DEVICE_MAX_CONNECTIONS=1
# pip install flask-restful
# torchrun $DISTRIBUTED_ARGS $MEGATRON_PATH/tools/run_text_generation_server.py \
# --tensor-model-parallel-size 1 \
# --pipeline-model-parallel-size 1 \
# --num-layers 12 \
# --hidden-size 3072 \
# --load ${CHECKPOINT} \
# --num-attention-heads 8 \
# --num-query-groups 4 \
# --max-position-embeddings 4096 \
# --fp16 \
# --micro-batch-size 1 \
# --seq-length 1024 \
# --temperature 1.0 \
# --top_p 0.9 \
# --seed 42 \
# --tokenizer-type GPT2BPETokenizer
# --vocab-file $VOCAB_FILE \
# --merge-file $MERGE_FILE \
torchrun $DISTRIBUTED_ARGS $MEGATRON_PATH/tools/run_text_generation_server.py \
--load $CHECKPOINT \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 12 \
--hidden-size 768 \
--ffn-hidden-size 3072 \
--num-attention-heads 8 \
--num-query-groups 4 \
--group-query-attention \
--seq-length 4096 \
--max-position-embeddings 4096 \
--position-embedding-type rope \
--rotary-base 10000 \
--swiglu \
--disable-bias-linear \
--normalization RMSNorm \
--untie-embeddings-and-output-weights \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model $TOKENIZER_PATH \
--bf16 \
--micro-batch-size 1
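Once the server is up, a quick request confirms it is generating. A minimal sketch, assuming the default behaviour of Megatron's text generation server (a flask-restful app listening on port 5000 whose /api endpoint accepts a PUT with a JSON list of prompts):

# send one prompt and ask for 32 new tokens (endpoint and port are assumptions)
curl -X PUT http://localhost:5000/api \
  -H "Content-Type: application/json" \
  -d '{"prompts": ["Hello, my name is"], "tokens_to_generate": 32}'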

View File

@@ -0,0 +1,21 @@
DATA_DIR=/ssd/yi/converted_data/megatron_phase1
START=0
END=50
DATA_PATHS=""
for idx in $(seq -f "%05g" $START $END); do
DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done
DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936
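Before a run is launched against these paths, it is worth confirming that every shard the loop references actually exists. A small check, assuming Megatron's indexed-dataset layout in which each *_text_document prefix is backed by a .bin/.idx pair:

# flag any shard whose .bin or .idx file is missing
for idx in $(seq -f "%05g" $START $END); do
  prefix="${DATA_DIR}/phase1_part-${idx}_text_document"
  [ -f "${prefix}.bin" ] && [ -f "${prefix}.idx" ] || echo "missing shard: ${prefix}"
done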

View File

@@ -0,0 +1,21 @@
DATA_DIR=/ssd/yi/converted_data/megatron_phase1
START=0
END=50
DATA_PATHS=""
for idx in $(seq -f "%05g" $START $END); do
DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done
DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936

View File

@@ -0,0 +1,13 @@
HPARAM_ARGS="
--micro-batch-size 32
--global-batch-size 2048
--train-iters 15000
--eval-iters 10
--eval-interval 1000
--save-interval 1000
--log-interval 1
--lr 1e-3
--min-lr 1e-3
--lr-decay-style constant
--lr-warmup-iters 10
"

View File

@@ -0,0 +1,18 @@
# downscaled Qwen3 arch to simulate GPT-2-scale training
MODEL_ARGS="
--seq-length 4096
--hidden-size 768
--ffn-hidden-size 3072
--num-layers 12
--num-attention-heads 8
--num-query-groups 4
--rotary-base 10000
--init-method-std 0.018
--group-query-attention
--max-position-embeddings 4096
--position-embedding-type rope
--swiglu
--disable-bias-linear
--normalization RMSNorm
--untie-embeddings-and-output-weights
"

View File

@@ -0,0 +1,10 @@
# note: decoupled_weight_decay defaults to True, so the Adam optimizer behaves as AdamW
OPTIM_ARGS="
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--weight-decay 0.1
--clip-grad 1.0
"

View File

@@ -0,0 +1,21 @@
DATA_DIR=/ssd/yi/converted_data/megatron_phase1
START=0
END=210
DATA_PATHS=""
for idx in $(seq -f "%05g" $START $END); do
DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done
DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936

View File

@@ -0,0 +1,13 @@
HPARAMS="
--micro-batch-size 16
--global-batch-size 2048
--train-iters 19760
--eval-iters 10
--eval-interval 1000
--save-interval 1000
--log-interval 1
--lr 5e-3
--min-lr 5e-3
--lr-decay-style constant
--lr-warmup-iters 10
"

View File

@@ -0,0 +1,19 @@
# note: official Qwen3 training uses QK norm, which Megatron does not officially support here
MODEL_ARGS="
--seq-length 4096
--hidden-size 2048
--ffn-hidden-size 6144
--num-layers 28
--num-attention-heads 16
--num-query-groups 8
--rotary-base 10000
--init-method-std 0.018
--group-query-attention
--max-position-embeddings 4096
--position-embedding-type rope
--swiglu
--disable-bias-linear
--normalization RMSNorm
--untie-embeddings-and-output-weights
"

View File

@@ -0,0 +1,66 @@
#!/usr/bin/env bash
set -euo pipefail
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_DEVICE_MAX_CONNECTIONS=1
MODE=${1:-smoke}   # currently unused here; kept for parity with the qwen3 launcher
TRAIN_NAME=${2:-smoke_gpt}
MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
ARTIFACT_ROOT=/apps/yi/model_training/artifacts
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
source params/optim_common.sh
source params/gpt_smoke/model.sh
source params/gpt_smoke/data.sh
source params/gpt_smoke/hparams.sh
RUN_ARGS=$HPARAM_ARGS
LOGGING_ARGS="
--tensorboard-dir ${TB_DIR}
--tensorboard-log-interval 1
--log-interval 1
--log-timers-to-tensorboard
--log-validation-ppl-to-tensorboard
--log-memory-to-tensorboard
--log-world-size-to-tensorboard
--log-num-zeros-in-grad
--log-device-memory-used
--log-throughput
--log-params-norm
"
PRECISION_ARGS="--bf16"
PARALLEL_ARGS="
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 1
"
# PARALLEL_ARGS="
# --tensor-model-parallel-size 2
# --sequence-parallel
# "
mkdir -p "$CKPT_DIR" "$TB_DIR"
DISTRIBUTED_ARGS="
--nproc_per_node 8
--nnodes 1
--node_rank 0
--master_addr localhost
--master_port 6000
"
torchrun $DISTRIBUTED_ARGS \
$MEGATRON_PATH/pretrain_gpt.py \
$MODEL_ARGS \
$OPTIM_ARGS \
$PRECISION_ARGS \
$PARALLEL_ARGS \
$DATA_ARGS \
$RUN_ARGS \
$LOGGING_ARGS \
--save "$CKPT_DIR" \
--load "$CKPT_DIR"
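A likely invocation and a way to watch the run, assuming this file is the smoke-run entry point (the two positional args map to MODE and TRAIN_NAME above; the script name below is illustrative):

bash run_gpt_smoke.sh smoke smoke_gpt   # script name is an assumption
tensorboard --logdir /apps/yi/model_training/artifacts/tb_logs/smoke_gpt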

View File

@@ -0,0 +1,94 @@
#!/usr/bin/env bash
set -euo pipefail
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export CUDA_DEVICE_MAX_CONNECTIONS=1
MODE=${1:-qwen3_1p7b_smoke}
TRAIN_NAME=${2:-qwen3_1p7b_smoke}
MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
ARTIFACT_ROOT=/apps/yi/model_training/artifacts
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
source params/optim_common.sh
source params/qwen3_1p7b/model.sh
source params/qwen3_1p7b/data_phase1_smoke.sh
source params/qwen3_1p7b/hparams.sh
LOGGING_ARGS="
--tensorboard-dir ${TB_DIR}
--tensorboard-log-interval 1
--log-interval 1
--log-timers-to-tensorboard
--log-validation-ppl-to-tensorboard
--log-memory-to-tensorboard
--log-world-size-to-tensorboard
--log-num-zeros-in-grad
--log-device-memory-used
--log-throughput
--log-params-norm
"
PRECISION_ARGS="--bf16"
PARALLEL_ARGS="
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 1
"
# PARALLEL_ARGS="
# --tensor-model-parallel-size 2
# --sequence-parallel
# "
if [ "$MODE" = "qwen3_1p7b_smoke" ]; then
source params/qwen3_1p7b/hparams.sh
RUN_ARGS="$HPARAMS"
elif [ "$MODE" = "phase1" ]; then
source params/phase1_full.sh
RUN_ARGS="$PHASE_ARGS"
else
echo "Unknown mode: $MODE"
exit 1
fi
mkdir -p "$CKPT_DIR" "$TB_DIR"
DISTRIBUTED_ARGS="
--nproc_per_node 8
--nnodes 1
--node_rank 0
--master_addr localhost
--master_port 6000
"
nsys profile \
-s none \
-t cuda,nvtx,cudnn,cublas \
-o megatron_8gpu_%p \
--force-overwrite true \
--capture-range=cudaProfilerApi \
--capture-range-end=stop \
--cuda-graph-trace=node \
torchrun $DISTRIBUTED_ARGS \
$MEGATRON_PATH/pretrain_gpt.py \
$MODEL_ARGS \
$OPTIM_ARGS \
$PRECISION_ARGS \
$PARALLEL_ARGS \
$DATA_ARGS \
$RUN_ARGS \
$LOGGING_ARGS \
--save "$CKPT_DIR" \
--load "$CKPT_DIR" \
--enable-cuda-graph \
--cuda-graph-warmup-steps 3 \
--profile \
--profile-step-start 10 \
--profile-step-end 12 \
--profile-ranks 0 1 \
--transformer-impl transformer_engine \
--cross-entropy-loss-fusion \
--cross-entropy-fusion-impl te
#--use-distributed-optimizer
#--overlap-grad-reduce \
# --overlap-param-gather \
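Because nsys is launched with --capture-range=cudaProfilerApi, tracing only covers the window Megatron opens between --profile-step-start and --profile-step-end on the ranks listed in --profile-ranks. Each profiled process writes a megatron_8gpu_<pid>.nsys-rep file, which can be opened in the Nsight Systems GUI or summarized on the command line (assuming an nsys version with the stats subcommand):

# summarize kernel and NVTX activity from one rank's report
nsys stats megatron_8gpu_<pid>.nsys-rep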