Initial Commit

This commit is contained in:
2026-05-06 15:06:07 +08:00
parent b5ac2c8ed5
commit f154c1611d
29 changed files with 1068 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
# Model-architecture flags for a downscaled Qwen3-style network, sized to
# simulate GPT-2-level training.
#
# Summary of the values set below:
#   - 12 layers, hidden size 768, FFN hidden size 3072 (4 x hidden size).
#   - 8 attention heads and 4 query groups, with the group-query-attention
#     flag set (768 / 8 = 96 hidden units per head).
#   - Position embedding type "rope" with rotary base 10000; sequence length
#     and max position embeddings are both 4096.
#   - swiglu, RMSNorm normalization, disable-bias-linear, and
#     untie-embeddings-and-output-weights flags are all set.
#   - init-method-std is 0.018.
#
# NOTE(review): the flag names look like Megatron-LM training arguments;
# confirm against the script that consumes MODEL_ARGS.
# NOTE(review): the value is a single multi-line string that begins and ends
# with a newline, so the consumer presumably expands it unquoted
# ($MODEL_ARGS) to get one word per token; verify at the call site.
MODEL_ARGS="
--seq-length 4096
--hidden-size 768
--ffn-hidden-size 3072
--num-layers 12
--num-attention-heads 8
--num-query-groups 4
--rotary-base 10000
--init-method-std 0.018
--group-query-attention
--max-position-embeddings 4096
--position-embedding-type rope
--swiglu
--disable-bias-linear
--normalization RMSNorm
--untie-embeddings-and-output-weights
"