#!/bin/bash
#
# Start a vLLM server for the model named in MODEL, listening on PORT.
#
# Usage: run this script directly; it takes no arguments.
# Requires: the `vllm` command on PATH.

# Abort on command failure, on use of an unset variable, and on a failure
# anywhere in a pipeline.
set -euo pipefail

# Model identifier handed to `vllm serve`.
MODEL="OpenGVLab/InternVL2_5-8B"

# Optional environment settings, intentionally left disabled.
# Uncomment the ones that are needed on the target machine.
# export CUDA_DEVICE_ORDER="PCI_BUS_ID"
# export NCCL_P2P_DISABLE=1
# export CUDA_VISIBLE_DEVICES="0"
# export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Port passed to the server via --port.
PORT=8000

# Each flag sits on its own continuation line so that single options can be
# edited or removed without touching the others. Expansions are quoted so a
# value containing whitespace or glob characters stays one argument.
# NOTE(review): flag spellings are kept exactly as in the original script;
# confirm them against the installed vLLM version's CLI reference.
vllm serve "$MODEL" \
  --port "$PORT" \
  --trust-remote-code \
  --limit-mm-per-prompt image=4 \
  --max-model-len 8192 \
  --gpu-memory-utilization 0.97 \
  --disable-log-requests