# downscaled Qwen3 arch to simulate GPT-2-level training
MODEL_ARGS="
    --seq-length 4096
    --hidden-size 768
    --ffn-hidden-size 3072
    --num-layers 12
    --num-attention-heads 8
    --num-query-groups 4
    --rotary-base 10000
    --init-method-std 0.018
    --group-query-attention
    --max-position-embeddings 4096
    --position-embedding-type rope
    --swiglu
    --disable-bias-linear
    --normalization RMSNorm
    --untie-embeddings-and-output-weights
"
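
# A minimal launch sketch showing how MODEL_ARGS would be consumed, assuming
# Megatron-LM's standard pretrain_gpt.py entry point. The batch sizes, learning
# rate, iteration count, tokenizer choice, and paths below are illustrative
# placeholders, not values taken from this config.
TRAINING_ARGS="
    --micro-batch-size 4
    --global-batch-size 256
    --lr 3.0e-4
    --lr-decay-style cosine
    --train-iters 100000
"

DATA_PATH=/path/to/data_text_document   # placeholder: preprocessed dataset prefix
TOKENIZER=/path/to/tokenizer            # placeholder: HF tokenizer directory

# MODEL_ARGS and TRAINING_ARGS are expanded unquoted on purpose so the shell
# word-splits them back into individual flags.
torchrun --nproc_per_node=8 pretrain_gpt.py \
    $MODEL_ARGS \
    $TRAINING_ARGS \
    --data-path $DATA_PATH \
    --tokenizer-type HuggingFaceTokenizer \
    --tokenizer-model $TOKENIZER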