#!/usr/bin/env bash
#
# Start `vllm serve` for one model on a contiguous range of GPU indices.
#
# Usage:
#   <script> MODEL_PATH MODEL_NAME API_KEY PORT TP FIRST_GPU LAST_GPU
#
# Arguments:
#   $1 MODEL_PATH  model argument handed to `vllm serve`
#   $2 MODEL_NAME  value for --served-model-name
#   $3 API_KEY     value for --api-key
#   $4 PORT        value for --port
#   $5 TP          value for --tensor-parallel-size
#   $6 FIRST_GPU   first GPU index of the range (inclusive)
#   $7 LAST_GPU    last GPU index of the range (inclusive)
#
# Exit status: 2 on bad arguments, otherwise the status of `vllm serve`.
set -euo pipefail

if [ "$#" -lt 7 ]; then
  printf 'Usage: %s MODEL_PATH MODEL_NAME API_KEY PORT TP FIRST_GPU LAST_GPU\n' \
    "${0##*/}" >&2
  exit 2
fi

MODEL_PATH=$1
MODEL_NAME=$2
API_KEY=$3
PORT=$4
TP=$5

# Reject anything that is not a plain non-negative integer before it reaches
# seq or vllm, so a typo fails here with a clear message.
for numeric_arg in "$PORT" "$TP" "$6" "$7"; do
  case "$numeric_arg" in
    '' | *[!0-9]*)
      printf 'error: expected a non-negative integer, got "%s"\n' \
        "$numeric_arg" >&2
      exit 2
      ;;
  esac
done

# A reversed range makes seq print nothing, which would leave
# CUDA_VISIBLE_DEVICES empty; fail loudly instead.
if [ "$6" -gt "$7" ]; then
  printf 'error: FIRST_GPU (%s) must not exceed LAST_GPU (%s)\n' "$6" "$7" >&2
  exit 2
fi

# Comma-separated list of indices from FIRST_GPU to LAST_GPU, e.g. "0,1,2,3".
VISIABLE_DEVICES=$(seq -s, "$6" "$7")

# The API key is deliberately not printed so it does not end up in logs.
echo "MODEL-PATH ${MODEL_PATH} API_key <redacted> PORT ${PORT} TP ${TP} cuda visiable devices : ${VISIABLE_DEVICES}"

# NOTE(review): the key is still passed via argv and is therefore visible in
# the process list; check whether an environment-variable alternative is
# available in the deployed vLLM version.
CUDA_VISIBLE_DEVICES="${VISIABLE_DEVICES}" vllm serve "$MODEL_PATH" \
  --port "$PORT" \
  --tensor-parallel-size "$TP" \
  --served-model-name "$MODEL_NAME" \
  --enable-chunked-prefill \
  --enforce-eager \
  --api-key "$API_KEY" \
  --disable-log-requests \
  --max_model_len 8192