Initial Commit
69 scripts/kaiyuan2b-training/eval_smoke_gpt2.sh Normal file
@@ -0,0 +1,69 @@
#!/bin/bash

# Serve the smoke-test GPT checkpoint with Megatron's text generation server.

DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

# <Path to checkpoint (e.g. /345m)>
CHECKPOINT=/apps/yi/model_training/artifacts/checkpoints/smoke_gpt

# <Path to vocab.json (e.g. /gpt2-vocab.json)>
VOCAB_FILE=/apps/yi/model_training/data/tokenizer/vocab.json

# <Path to merges.txt (e.g. /gpt2-merges.txt)>
MERGE_FILE=/apps/yi/model_training/data/tokenizer/merges.txt

# <Path to tokenizer>
TOKENIZER_PATH=/apps/yi/model_training/data/tokenizer

MEGATRON_PATH=/apps/yi/model_training/Megatron-LM

export CUDA_DEVICE_MAX_CONNECTIONS=1

# pip install flask-restful

# Earlier GPT2BPE serving variant, kept for reference:
# torchrun $DISTRIBUTED_ARGS $MEGATRON_PATH/tools/run_text_generation_server.py \
#     --tensor-model-parallel-size 1 \
#     --pipeline-model-parallel-size 1 \
#     --num-layers 12 \
#     --hidden-size 3072 \
#     --load ${CHECKPOINT} \
#     --num-attention-heads 8 \
#     --num-query-groups 4 \
#     --max-position-embeddings 4096 \
#     --fp16 \
#     --micro-batch-size 1 \
#     --seq-length 1024 \
#     --temperature 1.0 \
#     --top_p 0.9 \
#     --seed 42 \
#     --tokenizer-type GPT2BPETokenizer \
#     --vocab-file $VOCAB_FILE \
#     --merge-file $MERGE_FILE

torchrun $DISTRIBUTED_ARGS $MEGATRON_PATH/tools/run_text_generation_server.py \
    --load $CHECKPOINT \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 12 \
    --hidden-size 768 \
    --ffn-hidden-size 3072 \
    --num-attention-heads 8 \
    --num-query-groups 4 \
    --group-query-attention \
    --seq-length 4096 \
    --max-position-embeddings 4096 \
    --position-embedding-type rope \
    --rotary-base 10000 \
    --swiglu \
    --disable-bias-linear \
    --normalization RMSNorm \
    --untie-embeddings-and-output-weights \
    --tokenizer-type HuggingFaceTokenizer \
    --tokenizer-model $TOKENIZER_PATH \
    --bf16 \
    --micro-batch-size 1
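
# Once the server is up it can be queried over HTTP. A minimal sketch, assuming
# Megatron's default flask-restful endpoint (PUT /api on port 5000); adjust
# host/port if your checkout differs:
#   curl -X PUT http://localhost:5000/api \
#        -H 'Content-Type: application/json' \
#        -d '{"prompts": ["Hello, my name is"], "tokens_to_generate": 32}'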
21 scripts/kaiyuan2b-training/params/data_phase1.sh.back Normal file
@@ -0,0 +1,21 @@
DATA_DIR=/ssd/yi/converted_data/megatron_phase1

START=0
END=50

DATA_PATHS=""
for idx in $(seq -f "%05g" $START $END); do
    DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done

DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"

# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936
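
# Note: Megatron's --data-path takes alternating "weight prefix" pairs, so the
# loop above expands to
#   --data-path 1 ${DATA_DIR}/phase1_part-00000_text_document 1 ${DATA_DIR}/phase1_part-00001_text_document ...
# giving every shard equal weight, and --split 999,1,0 carves the blend into
# 99.9% train / 0.1% validation / 0% test.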
21 scripts/kaiyuan2b-training/params/gpt_smoke/data.sh Normal file
@@ -0,0 +1,21 @@
DATA_DIR=/ssd/yi/converted_data/megatron_phase1

START=0
END=50

DATA_PATHS=""
for idx in $(seq -f "%05g" $START $END); do
    DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done

DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"

# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936
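
# seq -f "%05g" is inclusive on both ends, so START=0 END=50 blends 51 shards
# (phase1_part-00000 through phase1_part-00050), each with weight 1. Same shard
# list and format as params/data_phase1.sh.back.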
13 scripts/kaiyuan2b-training/params/gpt_smoke/hparams.sh Normal file
@@ -0,0 +1,13 @@
HPARAM_ARGS="
--micro-batch-size 32
--global-batch-size 2048
--train-iters 15000
--eval-iters 10
--eval-interval 1000
--save-interval 1000
--log-interval 1
--lr 1e-3
--min-lr 1e-3
--lr-decay-style constant
--lr-warmup-iters 10
"
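
# Token budget: 2048 (global batch) x 4096 (seq length, from
# params/gpt_smoke/model.sh) = 8,388,608 tokens per iteration, so 15,000 iters
# is roughly 126B tokens.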
18 scripts/kaiyuan2b-training/params/gpt_smoke/model.sh Normal file
@@ -0,0 +1,18 @@
# Downscaled Qwen3 architecture, to simulate GPT-2-scale training.
MODEL_ARGS="
--seq-length 4096
--hidden-size 768
--ffn-hidden-size 3072
--num-layers 12
--num-attention-heads 8
--num-query-groups 4
--rotary-base 10000
--init-method-std 0.018
--group-query-attention
--max-position-embeddings 4096
--position-embedding-type rope
--swiglu
--disable-bias-linear
--normalization RMSNorm
--untie-embeddings-and-output-weights
"
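
# Rough size, assuming the 151,936-token vocab noted in data.sh: per layer,
# GQA attention (Q/O 768x768, K/V 768x384) is ~1.8M params and the SwiGLU FFN
# (3 x 768x3072) is ~7.1M, so 12 layers give ~106M; untied input/output
# embeddings add 2 x 151,936 x 768 ~= 233M, for ~0.34B total.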
10 scripts/kaiyuan2b-training/params/optim_common.sh Normal file
@@ -0,0 +1,10 @@
# Note: decoupled_weight_decay defaults to True, so the adam optimizer acts as AdamW.

OPTIM_ARGS="
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--weight-decay 0.1
--clip-grad 1.0
"
21 scripts/kaiyuan2b-training/params/qwen3_1p7b/data_phase1_smoke.sh Normal file
@@ -0,0 +1,21 @@
DATA_DIR=/ssd/yi/converted_data/megatron_phase1

START=0
END=210

DATA_PATHS=""
for idx in $(seq -f "%05g" $START $END); do
    DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done

DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"

# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936
13 scripts/kaiyuan2b-training/params/qwen3_1p7b/hparams.sh Normal file
@@ -0,0 +1,13 @@
HPARAMS="
--micro-batch-size 16
--global-batch-size 2048
--train-iters 87000
--eval-iters 10
--eval-interval 1000
--save-interval 1000
--log-interval 1
--lr 5e-3
--min-lr 5e-3
--lr-decay-style constant
--lr-warmup-iters 10
"
19 scripts/kaiyuan2b-training/params/qwen3_1p7b/model.sh Normal file
@@ -0,0 +1,19 @@
# Note: official Qwen3 training uses QK norm, which Megatron does not officially support.

MODEL_ARGS="
--seq-length 4096
--hidden-size 2048
--ffn-hidden-size 6144
--num-layers 28
--num-attention-heads 16
--num-query-groups 8
--rotary-base 10000
--init-method-std 0.018
--group-query-attention
--max-position-embeddings 4096
--position-embedding-type rope
--swiglu
--disable-bias-linear
--normalization RMSNorm
--untie-embeddings-and-output-weights
"
66 scripts/kaiyuan2b-training/training_smoke_gpt2.sh Normal file
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
set -euo pipefail

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_DEVICE_MAX_CONNECTIONS=1

# MODE is accepted but currently unused here (cf. the mode branch in
# training_smoke_qwen3_1p7b.sh); only TRAIN_NAME is used below.
MODE=${1:-smoke}
TRAIN_NAME=${2:-smoke_gpt}

MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
ARTIFACT_ROOT=/apps/yi/model_training/artifacts
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"

# Note: these params paths are relative, so run this script from
# scripts/kaiyuan2b-training.
source params/optim_common.sh
source params/gpt_smoke/model.sh
source params/gpt_smoke/data.sh
source params/gpt_smoke/hparams.sh

RUN_ARGS="$HPARAM_ARGS"

LOGGING_ARGS="
--tensorboard-dir ${TB_DIR}
--tensorboard-log-interval 1
--log-interval 1
--log-timers-to-tensorboard
--log-validation-ppl-to-tensorboard
--log-memory-to-tensorboard
--log-world-size-to-tensorboard
--log-num-zeros-in-grad
--log-device-memory-used
--log-throughput
--log-params-norm
"

PRECISION_ARGS="--bf16"
PARALLEL_ARGS="
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 1
"
# PARALLEL_ARGS="
# --tensor-model-parallel-size 2
# --sequence-parallel
# "

mkdir -p "$CKPT_DIR" "$TB_DIR"

DISTRIBUTED_ARGS="
--nproc_per_node 8
--nnodes 1
--node_rank 0
--master_addr localhost
--master_port 6000
"

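# The *_ARGS strings are deliberately left unquoted in the torchrun call below
# so that bash word-splits them into individual flags. With TP=1 and PP=1 on
# 8 GPUs, data parallelism is 8, so each iteration runs
# global_batch / (micro_batch * dp) = 2048 / (32 * 8) = 8 gradient-accumulation
# microbatches per rank.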
torchrun $DISTRIBUTED_ARGS \
    $MEGATRON_PATH/pretrain_gpt.py \
    $MODEL_ARGS \
    $OPTIM_ARGS \
    $PRECISION_ARGS \
    $PARALLEL_ARGS \
    $DATA_ARGS \
    $RUN_ARGS \
    $LOGGING_ARGS \
    --save "$CKPT_DIR" \
    --load "$CKPT_DIR"
83 scripts/kaiyuan2b-training/training_smoke_qwen3_1p7b.sh Normal file
@@ -0,0 +1,83 @@
#!/usr/bin/env bash
set -euo pipefail

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_DEVICE_MAX_CONNECTIONS=1

MODE=${1:-qwen3_1p7b_smoke_yi}
TRAIN_NAME=${2:-qwen3_1p7b_smoke_yi}

MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
ARTIFACT_ROOT=/apps/yi/model_training/artifacts
SCRIPT_DIR=/apps/yi/model_training/scripts/kaiyuan2b-training
PARAMS_DIR="${SCRIPT_DIR}/params"
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"

source "${PARAMS_DIR}/optim_common.sh"
source "${PARAMS_DIR}/qwen3_1p7b/model.sh"
source "${PARAMS_DIR}/qwen3_1p7b/data_phase1_smoke.sh"

LOGGING_ARGS="
--tensorboard-dir ${TB_DIR}
--tensorboard-log-interval 1
--log-interval 1
--log-timers-to-tensorboard
--log-validation-ppl-to-tensorboard
--log-memory-to-tensorboard
--log-world-size-to-tensorboard
--log-num-zeros-in-grad
--log-device-memory-used
--log-throughput
--log-params-norm
"

PRECISION_ARGS="--bf16"
PARALLEL_ARGS="
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 1
"
# PARALLEL_ARGS="
# --tensor-model-parallel-size 2
# --sequence-parallel
# "

# Select hyperparameters by mode.
if [ "$MODE" = "qwen3_1p7b_smoke_yi" ]; then
    source "${PARAMS_DIR}/qwen3_1p7b/hparams.sh"
    RUN_ARGS="$HPARAMS"
elif [ "$MODE" = "phase1" ]; then
    source "${PARAMS_DIR}/phase1_full.sh"
    RUN_ARGS="$PHASE_ARGS"
else
    echo "Unknown mode: $MODE"
    exit 1
fi

mkdir -p "$CKPT_DIR" "$TB_DIR"

DISTRIBUTED_ARGS="
--nproc_per_node 8
--nnodes 1
--node_rank 0
--master_addr localhost
--master_port 6000
"

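# With TP=1 and PP=1 on 8 GPUs, data parallelism is 8, so each iteration runs
# global_batch / (micro_batch * dp) = 2048 / (16 * 8) = 16 gradient-accumulation
# microbatches per rank.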
torchrun $DISTRIBUTED_ARGS \
    $MEGATRON_PATH/pretrain_gpt.py \
    $MODEL_ARGS \
    $OPTIM_ARGS \
    $PRECISION_ARGS \
    $PARALLEL_ARGS \
    $DATA_ARGS \
    $RUN_ARGS \
    $LOGGING_ARGS \
    --save "$CKPT_DIR" \
    --load "$CKPT_DIR" \
    --enable-cuda-graph \
    --cuda-graph-warmup-steps 3 \
    --transformer-impl transformer_engine \
    --cross-entropy-loss-fusion \
    --cross-entropy-fusion-impl te