#!/bin/bash
# This example starts a Megatron-LM text generation server for a small smoke-test GPT checkpoint.

DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT=/apps/yi/model_training/artifacts/checkpoints/smoke_gpt
TOKENIZER_PATH=/apps/yi/model_training/data/tokenizer
# Only needed for the GPT2BPETokenizer variant kept below for reference:
# VOCAB_FILE=/apps/yi/model_training/data/tokenizer/vocab.json
# MERGE_FILE=/apps/yi/model_training/data/tokenizer/merges.txt
MEGATRON_PATH=/apps/yi/model_training/Megatron-LM

export CUDA_DEVICE_MAX_CONNECTIONS=1

# The text generation server requires flask-restful:
# pip install flask-restful

# Earlier GPT2BPETokenizer variant, kept commented out for reference:
# torchrun $DISTRIBUTED_ARGS $MEGATRON_PATH/tools/run_text_generation_server.py \
#     --tensor-model-parallel-size 1 \
#     --pipeline-model-parallel-size 1 \
#     --num-layers 12 \
#     --hidden-size 3072 \
#     --load ${CHECKPOINT} \
#     --num-attention-heads 8 \
#     --num-query-groups 4 \
#     --max-position-embeddings 4096 \
#     --fp16 \
#     --micro-batch-size 1 \
#     --seq-length 1024 \
#     --temperature 1.0 \
#     --top_p 0.9 \
#     --seed 42 \
#     --tokenizer-type GPT2BPETokenizer \
#     --vocab-file $VOCAB_FILE \
#     --merge-file $MERGE_FILE

torchrun $DISTRIBUTED_ARGS $MEGATRON_PATH/tools/run_text_generation_server.py \
    --load $CHECKPOINT \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 12 \
    --hidden-size 768 \
    --ffn-hidden-size 3072 \
    --num-attention-heads 8 \
    --num-query-groups 4 \
    --group-query-attention \
    --seq-length 4096 \
    --max-position-embeddings 4096 \
    --position-embedding-type rope \
    --rotary-base 10000 \
    --swiglu \
    --disable-bias-linear \
    --normalization RMSNorm \
    --untie-embeddings-and-output-weights \
    --tokenizer-type HuggingFaceTokenizer \
    --tokenizer-model $TOKENIZER_PATH \
    --bf16 \
    --micro-batch-size 1 \
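
# Usage sketch (assumption, not part of the launch command above): Megatron's text
# generation server typically listens on port 5000 and accepts PUT requests on /api
# with a JSON body of prompts; adjust host/port if your Megatron version differs.
# Once the server is up, it can be queried from another shell, e.g.:
#
# curl 'http://localhost:5000/api' -X 'PUT' \
#      -H 'Content-Type: application/json; charset=UTF-8' \
#      -d '{"prompts": ["Hello, my name is"], "tokens_to_generate": 32}'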