Initial Commit

This commit is contained in:
2026-05-06 15:06:07 +08:00
parent b5ac2c8ed5
commit f154c1611d
29 changed files with 1068 additions and 0 deletions

View File

@@ -0,0 +1,21 @@
# Build the Megatron data arguments for phase-1 shards.
# DATA_DIR/START/END may be overridden from the environment; defaults match
# the original hard-coded values, so standalone behavior is unchanged.
DATA_DIR=${DATA_DIR:-/ssd/yi/converted_data/megatron_phase1}
START=${START:-0}
END=${END:-50}
DATA_PATHS=""
# Each shard contributes a "<weight> <path-prefix>" pair; the leading 1
# appears to be a uniform per-shard blend weight — confirm against the
# Megatron --data-path docs.
# %05g zero-pads the shard index to match the converter's file naming.
for idx in $(seq -f "%05g" "$START" "$END"); do
  DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done
DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"
# Legacy GPT-2 style tokenizer flags, kept for reference:
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936

View File

@@ -0,0 +1,21 @@
# Data sources for phase 1: shards 00000..00050, each listed with blend
# weight 1, plus the shared HuggingFace tokenizer configuration.
DATA_DIR=/ssd/yi/converted_data/megatron_phase1
START=0
END=50
DATA_PATHS=""
for (( shard = START; shard <= END; shard++ )); do
  # Zero-pad to five digits to match the converter's shard file names.
  printf -v shard_id '%05d' "$shard"
  DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${shard_id}_text_document"
done
DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936

View File

@@ -0,0 +1,13 @@
# Training hyper-parameters: constant learning rate with a short warmup.
hparam_opts=(
  '--micro-batch-size 32'
  '--global-batch-size 2048'
  '--train-iters 15000'
  '--eval-iters 10'
  '--eval-interval 1000'
  '--save-interval 1000'
  '--log-interval 1'
  '--lr 1e-3'
  '--min-lr 1e-3'
  '--lr-decay-style constant'
  '--lr-warmup-iters 10'
)
# Join with newlines, keeping the leading/trailing newline of the original
# quoted-string form.
HPARAM_ARGS=$(printf '\n%s' "${hparam_opts[@]}")$'\n'
unset hparam_opts

View File

@@ -0,0 +1,18 @@
# Down-scaled Qwen3-style architecture at roughly GPT-2 scale, used to
# simulate full training cheaply.
model_opts=(
  '--seq-length 4096'
  '--hidden-size 768'
  '--ffn-hidden-size 3072'
  '--num-layers 12'
  '--num-attention-heads 8'
  '--num-query-groups 4'
  '--rotary-base 10000'
  '--init-method-std 0.018'
  '--group-query-attention'
  '--max-position-embeddings 4096'
  '--position-embedding-type rope'
  '--swiglu'
  '--disable-bias-linear'
  '--normalization RMSNorm'
  '--untie-embeddings-and-output-weights'
)
# Rebuild the exact multi-line string of the original quoted assignment.
MODEL_ARGS=$'\n'
for opt in "${model_opts[@]}"; do
  MODEL_ARGS+="${opt}"$'\n'
done
unset model_opts opt

View File

@@ -0,0 +1,10 @@
# Optimizer settings. Per the original note, Megatron's adam defaults to
# decoupled_weight_decay=True, so "--optimizer adam" behaves as AdamW.
printf -v OPTIM_ARGS '\n%s' \
  '--optimizer adam' \
  '--adam-beta1 0.9' \
  '--adam-beta2 0.95' \
  '--adam-eps 1e-8' \
  '--weight-decay 0.1' \
  '--clip-grad 1.0'
OPTIM_ARGS+=$'\n'

View File

@@ -0,0 +1,21 @@
# Build the Megatron data arguments for phase-1 shards 00000..00210.
# DATA_DIR/START/END may be overridden from the environment; defaults match
# the original hard-coded values, so standalone behavior is unchanged.
DATA_DIR=${DATA_DIR:-/ssd/yi/converted_data/megatron_phase1}
START=${START:-0}
END=${END:-210}
DATA_PATHS=""
# Each shard contributes a "<weight> <path-prefix>" pair; the leading 1
# appears to be a uniform per-shard blend weight — confirm against the
# Megatron --data-path docs.
# %05g zero-pads the shard index to match the converter's file naming.
for idx in $(seq -f "%05g" "$START" "$END"); do
  DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done
DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"
# Legacy GPT-2 style tokenizer flags, kept for reference:
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936

View File

@@ -0,0 +1,13 @@
# Longer-run hyper-parameters: smaller micro-batch, higher constant LR.
hparam_opts=(
  '--micro-batch-size 16'
  '--global-batch-size 2048'
  '--train-iters 87000'
  '--eval-iters 10'
  '--eval-interval 1000'
  '--save-interval 1000'
  '--log-interval 1'
  '--lr 5e-3'
  '--min-lr 5e-3'
  '--lr-decay-style constant'
  '--lr-warmup-iters 10'
)
# Join with newlines, keeping the leading/trailing newline of the original
# quoted-string form.
HPARAMS=$(printf '\n%s' "${hparam_opts[@]}")$'\n'
unset hparam_opts

View File

@@ -0,0 +1,19 @@
# NOTE: upstream Qwen3 trains with QK-norm, which Megatron does not support
# officially, so it is omitted here (per the original author's note).
# The flag list is kept in a quoted heredoc; the $'\n' wrappers restore the
# leading/trailing newline of the original quoted-string assignment.
MODEL_ARGS=$'\n'"$(cat <<'EOF'
--seq-length 4096
--hidden-size 2048
--ffn-hidden-size 6144
--num-layers 28
--num-attention-heads 16
--num-query-groups 8
--rotary-base 10000
--init-method-std 0.018
--group-query-attention
--max-position-embeddings 4096
--position-embedding-type rope
--swiglu
--disable-bias-linear
--normalization RMSNorm
--untie-embeddings-and-output-weights
EOF
)"$'\n'