import os

import gradio as gr
import requests
import spaces

from server import setup_mixinputs, launch_vllm_server

# OpenAI-compatible chat completions endpoint exposed by the local vLLM server.
API_URL = "http://localhost:8000/v1/chat/completions"
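# NOTE: the code below assumes an OpenAI-compatible vLLM server is already
# listening on port 8000 and serving the model named in the payload; the
# commented-out setup_mixinputs()/launch_vllm_server() calls inside
# chat_with_moi() are the alternative in-process setup.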
@spaces.GPU(duration=120)  # reserve a ZeroGPU slot for up to 120 s per call
def chat_with_moi(message, history, temperature, top_p, beta):
    # Set the MIXINPUTS_BETA env var *per request* so the MoI hooks pick it up.
    # (Note: this only affects code running in this process, not a server that
    # was launched separately.)
    os.environ["MIXINPUTS_BETA"] = str(beta)

    # setup_mixinputs()
    # launch_vllm_server(beta=beta)

    payload = {
        "model": "Qwen/QwQ-32B",  # must match the model your vLLM server is serving
        "messages": [{"role": "user", "content": message}],
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": 512,
    }

    try:
        response = requests.post(API_URL, json=payload, timeout=120)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        return f"[ERROR] {e}"
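# --- Optional: streaming variant (a sketch, not wired into the UI below) ---
# Assumes the server implements the OpenAI-style `"stream": true` SSE protocol,
# which vLLM's OpenAI-compatible endpoint supports.
import json

def stream_with_moi(message, temperature, top_p, beta):
    os.environ["MIXINPUTS_BETA"] = str(beta)
    payload = {
        "model": "Qwen/QwQ-32B",
        "messages": [{"role": "user", "content": message}],
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": 512,
        "stream": True,  # ask the server for incremental SSE chunks
    }
    with requests.post(API_URL, json=payload, stream=True, timeout=120) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            # SSE lines look like b'data: {...json chunk...}' or b'data: [DONE]'
            if not line.startswith(b"data: "):
                continue
            chunk = line[len(b"data: "):]
            if chunk == b"[DONE]":
                break
            delta = json.loads(chunk)["choices"][0]["delta"]
            yield delta.get("content", "")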
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧪 Mixture of Inputs (MoI) Demo with vLLM")

    with gr.Row():
        temperature = gr.Slider(0.0, 1.5, value=0.7, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
        beta = gr.Slider(0.0, 10.0, value=1.0, label="MoI Beta")

    chatbot = gr.Chatbot()
    message = gr.Textbox(label="Your message")
    send_btn = gr.Button("Send")
    history = gr.State([])

    def respond(user_message, chat_history, temperature, top_p, beta):
        reply = chat_with_moi(user_message, chat_history, temperature, top_p, beta)
        chat_history = chat_history + [(user_message, reply)]
        return chat_history, chat_history

    send_btn.click(
        respond,
        inputs=[message, history, temperature, top_p, beta],
        outputs=[chatbot, history],
    )

demo.launch()
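# Assumed local workflow: start the server first (e.g. `vllm serve Qwen/QwQ-32B
# --port 8000`), then run `python app.py` and open the printed Gradio URL.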