# downscaled qwen3 arch, simulate gpt2-level training
#
# MODEL_ARGS is a plain string holding whitespace-separated command-line
# flags, one flag (plus its value, if any) per line. It is a string rather
# than an array, so a consumer must expand it UNQUOTED ($MODEL_ARGS) for the
# flags to split into separate words; none of the values contain spaces or
# glob characters, so word splitting is safe here.
# NOTE(review): the flag names look like Megatron-LM style training
# arguments -- confirm against the launcher that consumes this variable.
#
# Values that are set to agree with each other in this config:
#   * --seq-length and --max-position-embeddings are both 4096.
#   * --group-query-attention is enabled with 4 query groups for 8 heads.
MODEL_ARGS="
  --seq-length 4096
  --hidden-size 768
  --ffn-hidden-size 3072
  --num-layers 12
  --num-attention-heads 8
  --num-query-groups 4
  --rotary-base 10000
  --init-method-std 0.018
  --group-query-attention
  --max-position-embeddings 4096
  --position-embedding-type rope
  --swiglu
  --disable-bias-linear
  --normalization RMSNorm
  --untie-embeddings-and-output-weights
"