#!/bin/bash
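# Launch script for the Space: start a vLLM OpenAI-compatible server in the
# background, wait for it to become healthy, then run the Gradio UI.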
echo "Starting vLLM server..."
# Start vLLM in the background with logging
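# Note: --gpu-memory-utilization is capped at 0.6, presumably to leave GPU
# headroom for the rest of the app on a shared device.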
python3 -m vllm.entrypoints.openai.api_server \
    --model numind/NuMarkdown-8B-Thinking \
    --port 8000 \
    --host 0.0.0.0 \
    --max-model-len 20000 \
    --gpu-memory-utilization 0.6 > "$HOME/app/vllm.log" 2>&1 &
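# $! expands to the PID of the most recently backgrounded job (the vLLM server)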
VLLM_PID=$!
echo "vLLM started with PID: $VLLM_PID"
# Wait for vLLM to be ready
echo "Waiting for vLLM server to start..."
for i in {1..90}; do
    if curl -s http://localhost:8000/v1/models > /dev/null; then
        echo "vLLM server is ready!"
        break
    fi
    echo "Waiting... ($i/90)"
    sleep 2
done
# The readiness loop can time out (90 attempts x 2 s, about 3 minutes), so confirm the server is actually up
if ! curl -s http://localhost:8000/v1/models > /dev/null; then
    echo "ERROR: vLLM server failed to start!"
    echo "vLLM logs:"
    cat "$HOME/app/vllm.log"
    exit 1
fi
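# Quick smoke test (a sketch, not part of the original flow): the server speaks
# the OpenAI chat API, so once it is up a request like this should get a reply:
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "numind/NuMarkdown-8B-Thinking",
#          "messages": [{"role": "user", "content": "Hello"}]}'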
echo "Starting Gradio app..."
# Start Gradio app in the foreground
python3 "$HOME/app/app.py"