import gradio as gr
import requests
import os
import spaces  # HF Spaces SDK; imported for the Space runtime, not used directly below
from server import setup_mixinputs, launch_vllm_server

# OpenAI-compatible chat completions endpoint exposed by the local vLLM server
API_URL = "http://localhost:8000/v1/chat/completions"

def chat_with_moi(message, history, temperature, top_p, beta):
    # Set the MIXINPUTS_BETA env var per request. Note that this only
    # updates this Gradio process's environment; a separately launched
    # vLLM server sees the value only if it reads it itself (e.g., at
    # startup via launch_vllm_server below).
    os.environ["MIXINPUTS_BETA"] = str(beta)
    # setup_mixinputs()
    # launch_vllm_server(beta=beta)

    # Forward the prior turns so the model sees the conversation context;
    # history arrives as a list of (user, assistant) tuples from the Chatbot.
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    payload = {
        "model": "Qwen/QwQ-32B",  # must match the model your vLLM server is serving
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": 512,
    }
    try:
        response = requests.post(API_URL, json=payload, timeout=120)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        return f"[ERROR] {e}"
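
# The endpoint follows the OpenAI chat-completions schema, so a successful
# response is expected to look roughly like (a sketch, not captured output):
#
#   {
#     "choices": [
#       {"index": 0, "message": {"role": "assistant", "content": "..."}}
#     ],
#     ...
#   }
#
# which is why the reply above is read from choices[0].message.content.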

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧪 Mixture of Inputs (MoI) Demo with vLLM")

    with gr.Row():
        temperature = gr.Slider(0.0, 1.5, value=0.7, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
        beta = gr.Slider(0.0, 10.0, value=1.0, label="MoI Beta")

    chatbot = gr.Chatbot()
    message = gr.Textbox(label="Your message")
    send_btn = gr.Button("Send")
    history = gr.State([])  # list of (user, assistant) tuples

    def respond(user_message, chat_history, temperature, top_p, beta):
        reply = chat_with_moi(user_message, chat_history, temperature, top_p, beta)
        chat_history = chat_history + [(user_message, reply)]
        # Return the updated history twice: once to render in the Chatbot,
        # once to persist in the State component.
        return chat_history, chat_history

    send_btn.click(
        respond,
        inputs=[message, history, temperature, top_p, beta],
        outputs=[chatbot, history],
    )

demo.launch()
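
# Usage note: this client assumes a vLLM OpenAI-compatible server is already
# listening on localhost:8000. A typical way to start one (assuming the
# MoI-patched vLLM that server.py configures is installed) would be:
#
#   MIXINPUTS_BETA=1.0 vllm serve Qwen/QwQ-32B --port 8000
#
# This command is a sketch of the expected setup, not taken from this repo;
# alternatively, uncomment setup_mixinputs()/launch_vllm_server() above.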