# app.py
import json
import threading

import requests
import sseclient
import gradio as gr

from server import setup_mixinputs, launch_vllm_server
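# NOTE: `server` is the Space's local helper module; setup_mixinputs() is assumed
# to apply the Mixture-of-Inputs (MoI) patches and launch_vllm_server() to start
# the OpenAI-compatible vLLM backend on localhost:8000.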
API_URL = "http://localhost:8000/v1/chat/completions"


def stream_completion(message, history, max_tokens, temperature, top_p, beta):
    """
    Gradio callback: takes the newest user message + full chat history,
    returns an updated history while streaming assistant tokens.
    """
    # ------- build OpenAI-style message list (no system prompt) -------------
    messages = []
    for usr, bot in history:
        if usr:
            messages.append({"role": "user", "content": usr})
        if bot:
            messages.append({"role": "assistant", "content": bot})
    messages.append({"role": "user", "content": message})
    payload = {
        "model": "Qwen/Qwen3-4B",
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": int(max_tokens),
        "stream": True,
    }
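    # MoI-specific: the beta from the UI slider is forwarded per request via a
    # custom HTTP header, so it can be changed live without restarting vLLM.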
    headers = {
        "Content-Type": "application/json",
        "X-MIXINPUTS-BETA": str(beta),
    }
    try:
        resp = requests.post(API_URL, json=payload, stream=True, headers=headers, timeout=60)
        resp.raise_for_status()

        # sseclient-py wraps the streaming HTTP response and yields one
        # server-sent event per chunk of the OpenAI-style stream
        client = sseclient.SSEClient(resp)

        assistant = ""
        for event in client.events():
            if event.data.strip() == "[DONE]":
                break
            # each chunk carries an incremental delta; "content" may be absent or
            # null (e.g. the initial role-only chunk), so fall back to ""
            delta = json.loads(event.data)["choices"][0]["delta"].get("content") or ""
            assistant += delta
            yield history + [(message, assistant)]  # update the chat box live

    except Exception as err:
        yield history + [(message, f"[ERROR] {err}")]
# ----------------------- UI ---------------------------------------------
with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
    gr.Markdown(
        "## 🎨 Mixture of Inputs (MoI) Demo \n"
        "Streaming vLLM demo with dynamic **beta** adjustment in MoI; "
        "a higher beta means less blending."
    )
    # sliders first – all on one row
    with gr.Row():
        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI Beta")
        temperature = gr.Slider(0.1, 1.0, value=0.6, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.80, step=0.05, label="Top-p")
        max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")

    chatbot = gr.Chatbot(height=450)
    user_box = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
    clear_btn = gr.Button("Clear chat")
    # wiring
    user_box.submit(
        stream_completion,
        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
        outputs=chatbot,
    )
    clear_btn.click(lambda: None, None, chatbot, queue=False)
if __name__ == "__main__":
    setup_mixinputs()
    # run the vLLM backend in a daemon thread so the Gradio UI starts immediately;
    # requests sent before the model finishes loading surface as [ERROR] in the chat
    threading.Thread(target=launch_vllm_server, daemon=True).start()
    demo.launch()
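
# For reference, the same MoI endpoint can be exercised without the UI, using the
# payload and headers built above (a sketch; adjust host/port to your setup):
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -H "X-MIXINPUTS-BETA: 1.0" \
#     -d '{"model": "Qwen/Qwen3-4B",
#          "messages": [{"role": "user", "content": "Hello"}],
#          "stream": true}'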