# Note: official Qwen3 training uses QK-norm, which Megatron does not officially support.

# MODEL_ARGS: model-architecture command-line flags gathered into a single
# string. The value begins and ends with a newline and carries one flag per
# line, each indented by two spaces. It is assembled in segments below so
# related flags can be commented together; the final value is one string.
# Presumably expanded unquoted onto a training command line - confirm
# against the script that consumes it.

# Sequence length.
MODEL_ARGS="
  --seq-length 4096
"

# Hidden/FFN widths, layer count, and attention head / query-group counts.
MODEL_ARGS="${MODEL_ARGS}  --hidden-size 2048
  --ffn-hidden-size 6144
  --num-layers 28
  --num-attention-heads 16
  --num-query-groups 8
"

# Rotary base and weight-init standard deviation.
# NOTE(review): rotary base is 10000 here; confirm this matches the intended
# reference configuration for the model being trained.
MODEL_ARGS="${MODEL_ARGS}  --rotary-base 10000
  --init-method-std 0.018
"

# Grouped-query attention switch and rotary position-embedding settings.
# --max-position-embeddings is set to the same value as --seq-length above.
MODEL_ARGS="${MODEL_ARGS}  --group-query-attention
  --max-position-embeddings 4096
  --position-embedding-type rope
"

# Activation, bias, normalization, and embedding-tying options.
MODEL_ARGS="${MODEL_ARGS}  --swiglu
  --disable-bias-linear
  --normalization RMSNorm
  --untie-embeddings-and-output-weights
"