Files
pretrain_kaiyuan2b/scripts/kaiyuan2b-profiling/params/qwen3_1p7b/model.sh
2026-05-06 15:06:07 +08:00

19 lines
474 B
Bash

# Model-architecture flags for the qwen3_1p7b profiling preset.
#
# MODEL_ARGS is a single newline-separated string (it starts and ends with a
# newline); presumably the launcher expands it unquoted onto the training
# command line -- verify against the caller.
#
# Note: the official Qwen3 training recipe uses QK-norm, which Megatron has no
# official support for, so no QK-norm flag is set here.
# NOTE(review): newer Megatron-LM releases appear to expose --qk-layernorm;
# confirm whether the statement above still holds for the version in use.
# NOTE(review): the published Qwen3-1.7B config appears to use a larger RoPE
# base and tied embeddings; confirm the values below are intentional.
_qwen3_model_flags=(
  # Training sequence length.
  '--seq-length 4096'

  # Transformer dimensions.
  '--hidden-size 2048'
  '--ffn-hidden-size 6144'
  '--num-layers 28'

  # Attention layout: 16 heads over 8 query groups (see
  # --group-query-attention further down).
  '--num-attention-heads 16'
  '--num-query-groups 8'

  # Rotary base and weight-initialisation standard deviation.
  '--rotary-base 10000'
  '--init-method-std 0.018'
  '--group-query-attention'

  # Positional encoding.
  '--max-position-embeddings 4096'
  '--position-embedding-type rope'

  # MLP activation, bias-free linear layers, norm type, untied embeddings.
  '--swiglu'
  '--disable-bias-linear'
  '--normalization RMSNorm'
  '--untie-embeddings-and-output-weights'
)

# The empty first argument emits the leading newline; every flag is then
# written followed by a newline, so the value ends with a newline as well.
printf -v MODEL_ARGS '%s\n' '' "${_qwen3_model_flags[@]}"

# Drop the scratch array so MODEL_ARGS is the only variable left behind.
unset _qwen3_model_flags