# app.py ── launch vLLM inside a Hugging Face Space (with clean shutdown)
import os, signal, sys, atexit, time, socket, subprocess
import spaces          # only needed for the GPU decorator

# ----------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------
def _wait_for_port(host: str, port: int, timeout: int = 240):
    """Block until (host, port) accepts TCP connections or timeout."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        with socket.socket() as sock:
            sock.settimeout(2)
            if sock.connect_ex((host, port)) == 0:
                return
        time.sleep(1)
    raise RuntimeError(f"vLLM server on {host}:{port} never came up")
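
# A TCP probe only proves the port is open. To wait until the model is
# actually loaded, vLLM's OpenAI-compatible server also exposes a
# GET /health route; a stricter stdlib-only probe could look like this
# (a sketch, assuming the default /health route):
#
#   import urllib.request, urllib.error
#   def _wait_for_health(port: int, timeout: int = 240):
#       deadline = time.time() + timeout
#       while time.time() < deadline:
#           try:
#               urllib.request.urlopen(
#                   f"http://localhost:{port}/health", timeout=2)
#               return                           # 200 OK ⇒ server is ready
#           except (urllib.error.URLError, OSError):
#               time.sleep(1)
#       raise RuntimeError("vLLM /health never came up")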

def _kill_proc_tree(proc: subprocess.Popen):
    """SIGTERM the whole process-group started by `proc` (if still alive)."""
    if proc and proc.poll() is None:           # still running
        pgid = os.getpgid(proc.pid)
        os.killpg(pgid, signal.SIGTERM)        # graceful
        try:
            proc.wait(timeout=15)
        except subprocess.TimeoutExpired:
            os.killpg(pgid, signal.SIGKILL)    # force
            proc.wait()                        # reap so no zombie is left

# ----------------------------------------------------------------------
# Setup – runs on *CPU* only; fast.
# ----------------------------------------------------------------------
def setup_mixinputs():
    subprocess.run(["mixinputs", "setup"], check=True)

# ----------------------------------------------------------------------
# Serve – runs on the GPU; heavy, so we mark it.
# ----------------------------------------------------------------------
@spaces.GPU                                    # heavy: runs on the Space's GPU
def launch_vllm_server(beta: float = 1.0, port: int = 8000) -> subprocess.Popen:
    env = os.environ.copy()
    env["MIXINPUTS_BETA"] = str(beta)
    env["VLLM_USE_V1"] = "1"

    cmd = [
        "vllm", "serve",
        "Qwen/Qwen3-4B",
        "--tensor-parallel-size", "1",
        "--enforce-eager",
        "--max-model-len", "2048",
        "--max-seq-len-to-capture", "2048",
        "--max-num-seqs", "1",
        "--port", str(port)
    ]

    # new session ⇒ its own process-group
    proc = subprocess.Popen(cmd, env=env, start_new_session=True)
    _wait_for_port("localhost", port)          # block until ready
    return proc
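
# Usage note: MIXINPUTS_BETA is passed via the environment at launch, so
# trying a different mixing coefficient means restarting the server, e.g.:
#   _kill_proc_tree(server_proc)
#   server_proc = launch_vllm_server(beta=0.5)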

# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------
if __name__ == "__main__":
    setup_mixinputs()                          # fast
    server_proc = launch_vllm_server()         # heavy

    # Ensure the GPU process dies when the Space stops / reloads.
    # atexit alone won't fire on an unhandled SIGTERM (which is what a
    # stopping Space sends), so turn SIGTERM into a normal exit first.
    signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
    atexit.register(_kill_proc_tree, server_proc)

    # ---- your Gradio / FastAPI app goes below ----
    #   e.g. import gradio as gr
    #        with gr.Blocks(teardown=lambda: _kill_proc_tree(server_proc)) as demo:
    #            ...
    #        demo.launch(server_name="0.0.0.0", server_port=7860)
    #
    # For this snippet we’ll just block forever so the container
    # doesn’t exit immediately.
    try:
        server_proc.wait()
    except KeyboardInterrupt:
        pass
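
# ----------------------------------------------------------------------
# Example client (sketch)
# ----------------------------------------------------------------------
# Once up, the server speaks vLLM's OpenAI-compatible API, so any OpenAI
# client can query it. A minimal stdlib sketch, run from another process
# (e.g. a Gradio handler); the payload shape follows the standard
# /v1/chat/completions route:
#
#   import json, urllib.request
#   payload = {
#       "model": "Qwen/Qwen3-4B",
#       "messages": [{"role": "user", "content": "Say hello."}],
#       "max_tokens": 64,
#   }
#   req = urllib.request.Request(
#       "http://localhost:8000/v1/chat/completions",
#       data=json.dumps(payload).encode(),
#       headers={"Content-Type": "application/json"},
#   )
#   with urllib.request.urlopen(req) as resp:
#       print(json.loads(resp.read())["choices"][0]["message"]["content"])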