# Model architecture flags, kept as a single whitespace-separated string.
#
# Note: official Qwen3 training uses QK norm, while Megatron has no official
# support for it, so no QK-norm flag appears in this list.
#
# Summary of what the flags below set:
#   - sequence length and max position embeddings: 4096
#   - hidden size 2048, FFN hidden size 6144, 28 layers
#   - 16 attention heads with grouped-query attention (8 query groups)
#   - RoPE position embeddings with rotary base 10000
#   - SwiGLU activation, RMSNorm normalization, no bias in linear layers
#   - untied input embedding and output weights
#   - init-method std 0.018
#
# The value is one string (not an array) so it can be expanded unquoted and
# word-split onto a command line. Each backslash-newline inside the double
# quotes is removed by the shell, so the resulting value is a single line
# with single spaces between tokens plus one leading and one trailing space.
# Continuation lines must therefore start at column 0 and must not have any
# whitespace after the trailing backslash.
MODEL_ARGS=" \
--seq-length 4096 \
--hidden-size 2048 \
--ffn-hidden-size 6144 \
--num-layers 28 \
--num-attention-heads 16 \
--num-query-groups 8 \
--rotary-base 10000 \
--init-method-std 0.018 \
--group-query-attention \
--max-position-embeddings 4096 \
--position-embedding-type rope \
--swiglu \
--disable-bias-linear \
--normalization RMSNorm \
--untie-embeddings-and-output-weights "