torchrun --nproc_per_node=1 pretrain_gpt.py \
    --num-layers 2 \
    --hidden-size 256 \
    --ffn-hidden-size 1024 \
    --num-attention-heads 4 \
    --seq-length 512 \
    --max-position-embeddings 512 \
    --micro-batch-size 1 \
    --global-batch-size 8 \
    --train-iters 200 \
    --lr 1e-4 \
    --min-lr 1e-5 \
    --lr-decay-style cosine \
    --lr-warmup-iters 2 \
    --weight-decay 0.01 \
    --clip-grad 1.0 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --init-method-std 0.02 \
    --tokenizer-type HuggingFaceTokenizer \
    --tokenizer-model /apps/yi/kaiyuan_pretraining/tokenizer \
    --data-path 1.0 /apps/yi/kaiyuan_pretraining/megatron_phase1/phase1_part-00000_text_document \
    --split 949,50,1 \
    --bf16 \
    --save /apps/yi/checkpoints/tiny-test-full \
    --save-interval 20 \
    --tensorboard-dir /apps/yi/tb_logs/tiny-test-full \
    --log-interval 1 \
    --eval-interval 10 \
    --eval-iters 20
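This is a minimal single-GPU smoke-test configuration (2 layers, hidden size 256, 200 iterations) for validating the tokenizer, data path, and checkpointing end to end before scaling up. To watch the loss curves while it runs, point TensorBoard at the directory passed to --tensorboard-dir above (assuming tensorboard is installed in the environment):

tensorboard --logdir /apps/yi/tb_logs/tiny-test-full

If the run is interrupted, it can be resumed from the latest saved checkpoint by rerunning the same command with Megatron-LM's --load flag pointed at the save directory, i.e. adding --load /apps/yi/checkpoints/tiny-test-full.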