MixtureOfInputs / server.py
yzhuang's picture
update
f4cd2da
raw
history blame
975 Bytes
import http.client
import os
import subprocess
import threading
import time
import urllib.request

import spaces
def setup_mixinputs():
    """Run the ``mixinputs setup`` command and wait for it to finish.

    Raises:
        subprocess.CalledProcessError: if the command exits with a
            non-zero status (``check=True``).
    """
    setup_command = ["mixinputs", "setup"]
    subprocess.run(setup_command, check=True)
# @spaces.GPU(duration=240)
def launch_vllm_server(beta=1.0):
    """Start ``vllm serve`` for Qwen/Qwen3-4B on port 8000 and block until it exits.

    The child process inherits the current environment with two additions:
    ``MIXINPUTS_BETA`` (the string form of *beta*) and ``VLLM_USE_V1="1"``.
    The exit status of the server process is not checked.

    Args:
        beta: Value exported to the child process as ``MIXINPUTS_BETA``.
    """
    # Inherit the parent environment, then layer the overrides on top.
    server_env = {
        **os.environ,
        "MIXINPUTS_BETA": str(beta),
        "VLLM_USE_V1": "1",
    }

    # Command-line options passed to the server.
    serve_flags = [
        "--tensor-parallel-size", "1",
        "--enforce-eager",
        "--max-model-len", "2048",
        "--max-seq-len-to-capture", "2048",
        "--max-num-seqs", "1",
        "--port", "8000",
        "--disable-async-output-proc",
    ]
    serve_command = ["vllm", "serve", "Qwen/Qwen3-4B", *serve_flags]

    # Blocks for the lifetime of the server process.
    subprocess.run(serve_command, env=server_env)
def _wait_for_vllm(url="http://localhost:8000/health", timeout=60.0, interval=1.0):
    """Poll *url* until it answers HTTP 200 or roughly *timeout* seconds pass.

    Args:
        url: Health-check URL of the server started by this module.
        timeout: Upper bound, in seconds, on the total time spent waiting.
        interval: Per-request timeout and pause between attempts, in seconds.

    Returns:
        True if the endpoint answered with status 200 before the deadline,
        False otherwise.
    """
    deadline = time.monotonic() + timeout
    while True:
        try:
            with urllib.request.urlopen(url, timeout=interval) as response:
                if response.status == 200:
                    return True
        except (OSError, http.client.HTTPException):
            # Connection refused / timed out / non-2xx reply: the server is
            # not serving requests yet, so keep retrying until the deadline.
            pass
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        time.sleep(min(interval, remaining))


# Step 1: Run the mixinputs setup command (raises if it fails).
setup_mixinputs()

# Step 2: Launch the vLLM server in a background daemon thread so that it
# does not keep the interpreter alive on shutdown.
threading.Thread(target=launch_vllm_server, daemon=True).start()

# Step 3: Wait for the server to come up. Instead of always sleeping a fixed
# 60 seconds, poll the health endpoint and continue as soon as it responds;
# the 60-second cap preserves the previous worst-case startup delay.
if not _wait_for_vllm():
    print("vLLM server did not report healthy within 60 seconds; continuing anyway.")