Initial Commit
This commit is contained in:
21
scripts/kaiyuan2b-profiling/params/data_phase1.sh.back
Normal file
21
scripts/kaiyuan2b-profiling/params/data_phase1.sh.back
Normal file
@@ -0,0 +1,21 @@
# Data configuration for Megatron phase-1 pre-training (sourced by the launcher).
# Defines: DATA_DIR, START, END, DATA_PATHS, DATA_ARGS.

# Root directory holding the converted Megatron-format dataset shards.
DATA_DIR=/ssd/yi/converted_data/megatron_phase1

# Inclusive shard-index range to include in the blend.
START=0
END=50

# Build the blended data-path list: one "<weight> <path>" pair per shard,
# each with weight 1 (uniform blending).
DATA_PATHS=""
for (( shard = START; shard <= END; shard++ )); do
  # Builtin printf -v replaces the external `seq -f "%05g"` call; '%05d'
  # produces the same zero-padded five-digit index for this range.
  printf -v idx '%05d' "$shard"
  DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done

DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"

# Alternative tokenizer flags kept for reference:
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936
21
scripts/kaiyuan2b-profiling/params/gpt_smoke/data.sh
Normal file
21
scripts/kaiyuan2b-profiling/params/gpt_smoke/data.sh
Normal file
@@ -0,0 +1,21 @@
# Data configuration for the GPT smoke-test run (sourced by the launcher).
# Defines: DATA_DIR, START, END, DATA_PATHS, DATA_ARGS.

# Root directory holding the converted Megatron-format dataset shards.
DATA_DIR=/ssd/yi/converted_data/megatron_phase1

# Inclusive shard-index range to include in the blend.
START=0
END=50

# Build the blended data-path list: one "<weight> <path>" pair per shard,
# each with weight 1 (uniform blending).
DATA_PATHS=""
for (( shard = START; shard <= END; shard++ )); do
  # Builtin printf -v replaces the external `seq -f "%05g"` call; '%05d'
  # produces the same zero-padded five-digit index for this range.
  printf -v idx '%05d' "$shard"
  DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done

DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"

# Alternative tokenizer flags kept for reference:
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936
13
scripts/kaiyuan2b-profiling/params/gpt_smoke/hparams.sh
Normal file
13
scripts/kaiyuan2b-profiling/params/gpt_smoke/hparams.sh
Normal file
@@ -0,0 +1,13 @@
# Training hyper-parameters for the GPT smoke-test run (sourced by the launcher).
# Exposes a single variable: HPARAM_ARGS.

# One CLI flag per entry; joined below into the newline-separated string
# the launcher passes through to the trainer.
_hparam_flag_lines=(
  '--micro-batch-size 32'
  '--global-batch-size 2048'
  '--train-iters 15000'
  '--eval-iters 10'
  '--eval-interval 1000'
  '--save-interval 1000'
  '--log-interval 1'
  '--lr 1e-3'
  '--min-lr 1e-3'
  '--lr-decay-style constant'
  '--lr-warmup-iters 10'
)
# A '\n' before every flag plus one trailing newline reproduces the original
# quoted-block layout byte for byte.
printf -v HPARAM_ARGS '\n%s' "${_hparam_flag_lines[@]}"
HPARAM_ARGS+=$'\n'
unset _hparam_flag_lines
18
scripts/kaiyuan2b-profiling/params/gpt_smoke/model.sh
Normal file
18
scripts/kaiyuan2b-profiling/params/gpt_smoke/model.sh
Normal file
@@ -0,0 +1,18 @@
# Model architecture for the GPT smoke-test run (sourced by the launcher).
# Downscaled Qwen3-style architecture used to simulate GPT-2-level training.
# Exposes a single variable: MODEL_ARGS.

# One CLI flag per entry; joined below into the newline-separated string
# the launcher passes through to the trainer.
_model_flag_lines=(
  '--seq-length 4096'
  '--hidden-size 768'
  '--ffn-hidden-size 3072'
  '--num-layers 12'
  '--num-attention-heads 8'
  '--num-query-groups 4'
  '--rotary-base 10000'
  '--init-method-std 0.018'
  '--group-query-attention'
  '--max-position-embeddings 4096'
  '--position-embedding-type rope'
  '--swiglu'
  '--disable-bias-linear'
  '--normalization RMSNorm'
  '--untie-embeddings-and-output-weights'
)
# A '\n' before every flag plus one trailing newline reproduces the original
# quoted-block layout byte for byte.
printf -v MODEL_ARGS '\n%s' "${_model_flag_lines[@]}"
MODEL_ARGS+=$'\n'
unset _model_flag_lines
10
scripts/kaiyuan2b-profiling/params/optim_common.sh
Normal file
10
scripts/kaiyuan2b-profiling/params/optim_common.sh
Normal file
@@ -0,0 +1,10 @@
# Shared optimizer settings (sourced by every training launcher).
# Exposes a single variable: OPTIM_ARGS.
# note: by default decoupled_weight_decay is True and adam optimizer acts as adamW

# One CLI flag per entry; joined below into the newline-separated string
# the launcher passes through to the trainer.
_optim_flag_lines=(
  '--optimizer adam'
  '--adam-beta1 0.9'
  '--adam-beta2 0.95'
  '--adam-eps 1e-8'
  '--weight-decay 0.1'
  '--clip-grad 1.0'
)
# A '\n' before every flag plus one trailing newline reproduces the original
# quoted-block layout byte for byte.
printf -v OPTIM_ARGS '\n%s' "${_optim_flag_lines[@]}"
OPTIM_ARGS+=$'\n'
unset _optim_flag_lines
@@ -0,0 +1,21 @@
# Data configuration for the qwen3_1p7b run (sourced by the launcher).
# Defines: DATA_DIR, START, END, DATA_PATHS, DATA_ARGS.

# Root directory holding the converted Megatron-format dataset shards.
DATA_DIR=/ssd/yi/converted_data/megatron_phase1

# Inclusive shard-index range to include in the blend.
START=0
END=210

# Build the blended data-path list: one "<weight> <path>" pair per shard,
# each with weight 1 (uniform blending).
DATA_PATHS=""
for (( shard = START; shard <= END; shard++ )); do
  # Builtin printf -v replaces the external `seq -f "%05g"` call; '%05d'
  # produces the same zero-padded five-digit index for this range.
  printf -v idx '%05d' "$shard"
  DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done

DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"

# Alternative tokenizer flags kept for reference:
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936
13
scripts/kaiyuan2b-profiling/params/qwen3_1p7b/hparams.sh
Normal file
13
scripts/kaiyuan2b-profiling/params/qwen3_1p7b/hparams.sh
Normal file
@@ -0,0 +1,13 @@
# Training hyper-parameters for the qwen3_1p7b run (sourced by the launcher).
# Exposes a single variable: HPARAMS.
# NOTE(review): the gpt_smoke variant names this HPARAM_ARGS — confirm which
# name the launcher reads before unifying; renamed here would break sourcing.

# One CLI flag per entry; joined below into the newline-separated string
# the launcher passes through to the trainer.
_hparam_flag_lines=(
  '--micro-batch-size 16'
  '--global-batch-size 2048'
  '--train-iters 19760'
  '--eval-iters 10'
  '--eval-interval 1000'
  '--save-interval 1000'
  '--log-interval 1'
  '--lr 5e-3'
  '--min-lr 5e-3'
  '--lr-decay-style constant'
  '--lr-warmup-iters 10'
)
# A '\n' before every flag plus one trailing newline reproduces the original
# quoted-block layout byte for byte.
printf -v HPARAMS '\n%s' "${_hparam_flag_lines[@]}"
HPARAMS+=$'\n'
unset _hparam_flag_lines
19
scripts/kaiyuan2b-profiling/params/qwen3_1p7b/model.sh
Normal file
19
scripts/kaiyuan2b-profiling/params/qwen3_1p7b/model.sh
Normal file
@@ -0,0 +1,19 @@
# Model architecture for the qwen3_1p7b run (sourced by the launcher).
# Exposes a single variable: MODEL_ARGS.
# note: official qwen3 training uses qk norm while megatron has no official support

# One CLI flag per entry; joined below into the newline-separated string
# the launcher passes through to the trainer.
_model_flag_lines=(
  '--seq-length 4096'
  '--hidden-size 2048'
  '--ffn-hidden-size 6144'
  '--num-layers 28'
  '--num-attention-heads 16'
  '--num-query-groups 8'
  '--rotary-base 10000'
  '--init-method-std 0.018'
  '--group-query-attention'
  '--max-position-embeddings 4096'
  '--position-embedding-type rope'
  '--swiglu'
  '--disable-bias-linear'
  '--normalization RMSNorm'
  '--untie-embeddings-and-output-weights'
)
# A '\n' before every flag plus one trailing newline reproduces the original
# quoted-block layout byte for byte.
printf -v MODEL_ARGS '\n%s' "${_model_flag_lines[@]}"
MODEL_ARGS+=$'\n'
unset _model_flag_lines
Reference in New Issue
Block a user