#!/bin/bash
# Launch the Megatron-LM text generation (inference) server for the
# Qwen3-1.7B smoke-test checkpoint (28 layers, hidden 2048, GQA 16q/8kv).
# NOTE(review): the original header said "345M model", which contradicts the
# hyperparameters below — corrected to match the actual config.
#
# Prerequisite: pip install flask-restful
set -euo pipefail

# torchrun launch settings: single node, single process/GPU.
# Kept as an array so each flag survives as its own word (no unquoted
# word-splitting).
DISTRIBUTED_ARGS=(
  --nproc_per_node 1
  --nnodes 1
  --node_rank 0
  --master_addr localhost
  --master_port 6000
)

# Paths. CHECKPOINT and TOKENIZER_PATH were commented out in the original but
# are referenced below, so they expanded empty at runtime — restored here with
# environment-override defaults (CHECKPOINT=/other/path ./script still works).
CHECKPOINT=${CHECKPOINT:-/apps/yi/model_training/artifacts/checkpoints/qwen3_1p7b_smoke_yi}
# VOCAB_FILE=/apps/yi/model_training/data/tokenizer/vocab.json
# MERGE_FILE=/apps/yi/model_training/data/tokenizer/merges.txt
TOKENIZER_PATH=${TOKENIZER_PATH:-/apps/yi/model_training/data/tokenizer}
MEGATRON_PATH=${MEGATRON_PATH:-/apps/yi/model_training/Megatron-LM}

# Serialize kernel launches per device; required by Megatron for correct
# overlap behavior.
export CUDA_DEVICE_MAX_CONNECTIONS=1

# Model architecture flags must match the checkpoint exactly.
# (Original passed --micro-batch-size twice; duplicate removed.)
torchrun "${DISTRIBUTED_ARGS[@]}" "${MEGATRON_PATH}/tools/run_text_generation_server.py" \
  --load "${CHECKPOINT}" \
  --tensor-model-parallel-size 1 \
  --pipeline-model-parallel-size 1 \
  --num-layers 28 \
  --hidden-size 2048 \
  --ffn-hidden-size 6144 \
  --num-attention-heads 16 \
  --num-query-groups 8 \
  --group-query-attention \
  --seq-length 4096 \
  --max-position-embeddings 4096 \
  --position-embedding-type rope \
  --rotary-base 10000 \
  --swiglu \
  --disable-bias-linear \
  --normalization RMSNorm \
  --untie-embeddings-and-output-weights \
  --tokenizer-type HuggingFaceTokenizer \
  --tokenizer-model "${TOKENIZER_PATH}" \
  --bf16 \
  --micro-batch-size 1 \
  --inference-max-requests 1