# app.py ── launch vLLM inside a Hugging Face Space (with clean shutdown)
import os, signal, sys, atexit, time, socket, subprocess
import spaces # only needed for the GPU decorator
# ----------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------
def _wait_for_port(host: str, port: int, timeout: int = 240):
"""Block until (host, port) accepts TCP connections or timeout."""
deadline = time.time() + timeout
while time.time() < deadline:
with socket.socket() as sock:
sock.settimeout(2)
if sock.connect_ex((host, port)) == 0:
return
time.sleep(1)
raise RuntimeError(f"vLLM server on {host}:{port} never came up")
def _kill_proc_tree(proc: subprocess.Popen):
"""SIGTERM the whole process-group started by `proc` (if still alive)."""
if proc and proc.poll() is None: # still running
pgid = os.getpgid(proc.pid)
os.killpg(pgid, signal.SIGTERM) # graceful
try:
proc.wait(15)
except subprocess.TimeoutExpired:
os.killpg(pgid, signal.SIGKILL) # force
# ----------------------------------------------------------------------
# Setup – runs on *CPU* only; fast.
# ----------------------------------------------------------------------
def setup_mixinputs():
subprocess.run(["mixinputs", "setup"], check=True)
# ----------------------------------------------------------------------
# Serve – runs on the GPU; heavy, so we mark it.
# ----------------------------------------------------------------------
@spaces.GPU  # mark the heavy call so the Space attaches its GPU (see import above)
def launch_vllm_server(beta: float = 1.0, port: int = 8000) -> subprocess.Popen:
    env = os.environ.copy()
    env["MIXINPUTS_BETA"] = str(beta)
    env["VLLM_USE_V1"] = "1"
    cmd = [
        "vllm", "serve",
        "Qwen/Qwen3-4B",
        "--tensor-parallel-size", "1",
        "--enforce-eager",
        "--max-model-len", "2048",
        "--max-seq-len-to-capture", "2048",
        "--max-num-seqs", "1",
        "--port", str(port),
    ]
    # new session ⇒ its own process-group
    proc = subprocess.Popen(cmd, env=env, start_new_session=True)
    _wait_for_port("localhost", port)   # block until ready
    return proc

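# If a stricter readiness check than a raw TCP connect is wanted, vLLM's
# OpenAI-compatible server also exposes a GET /health route; a minimal
# sketch (assuming the default route layout and port 8000) that could
# replace the socket poll in _wait_for_port:
#
#   import urllib.request
#   urllib.request.urlopen("http://localhost:8000/health", timeout=2)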
# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------
if __name__ == "__main__":
    setup_mixinputs()                    # fast
    server_proc = launch_vllm_server()   # heavy
    # Ensures the GPU process dies when the Space stops / reloads
    atexit.register(_kill_proc_tree, server_proc)
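    # atexit alone does not run on a bare SIGTERM, which is how the Space
    # runtime stops the container. A minimal sketch (assuming no other
    # SIGTERM handling is needed) that turns SIGTERM into a normal
    # interpreter exit so the cleanup above still fires:
    signal.signal(signal.SIGTERM, lambda signum, frame: sys.exit(0))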
    # ---- your Gradio / FastAPI app goes below ----
    # e.g. import gradio as gr
    #      with gr.Blocks(teardown=lambda: _kill_proc_tree(server_proc)) as demo:
    #          ...
    #          demo.launch(server_name="0.0.0.0", server_port=7860)
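    #
    # Whatever UI sits on top talks to the server over its OpenAI-compatible
    # API. A hedged sketch (assuming the default /v1/chat/completions route
    # and the `requests` package being available):
    #
    #   import requests
    #   r = requests.post(
    #       "http://localhost:8000/v1/chat/completions",
    #       json={"model": "Qwen/Qwen3-4B",
    #             "messages": [{"role": "user", "content": "Hello!"}]},
    #       timeout=60,
    #   )
    #   print(r.json()["choices"][0]["message"]["content"])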
    #
    # For this snippet we’ll just block forever so the container
    # doesn’t exit immediately.
    try:
        server_proc.wait()
    except KeyboardInterrupt:
        pass