# app.py
import json

import requests
import sseclient
import gradio as gr

import server  # local module; presumably imported for its side effects — TODO confirm

API_URL = "http://localhost:8000/v1/chat/completions"
def stream_completion(message, history, max_tokens, temperature, top_p, beta):
    """Stream one assistant reply from the local vLLM chat endpoint.

    Gradio callback: takes the newest user message plus the full chat
    history and yields an updated history while assistant tokens stream in.

    Args:
        message: Latest user message.
        history: List of (user, assistant) pairs from the Chatbot widget.
        max_tokens: Cap on generated tokens (slider yields a float; coerced to int).
        temperature: Sampling temperature forwarded to the server.
        top_p: Nucleus-sampling threshold forwarded to the server.
        beta: MoI blending coefficient, sent via the X-MIXINPUTS-BETA header.

    Yields:
        ``history`` extended with ``(message, partial_reply)`` after each
        received token, so the chat box updates live. On any failure a
        single ``(message, "[ERROR] ...")`` entry is yielded instead.
    """
    # ---- build an OpenAI-style message list (no system prompt) -------------
    messages = []
    for usr, bot in history:
        if usr:
            messages.append({"role": "user", "content": usr})
        if bot:
            messages.append({"role": "assistant", "content": bot})
    messages.append({"role": "user", "content": message})

    payload = {
        "model": "Qwen/Qwen3-4B",
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": int(max_tokens),
        "stream": True,
    }
    headers = {
        "Content-Type": "application/json",
        "X-MIXINPUTS-BETA": str(beta),  # per-request MoI beta override
    }

    try:
        # Context manager closes the streaming connection even if the
        # Gradio consumer stops iterating this generator early.
        with requests.post(
            API_URL, json=payload, stream=True, headers=headers, timeout=60
        ) as resp:
            resp.raise_for_status()
            client = sseclient.SSEClient(resp)

            assistant = ""
            for event in client.events():
                if event.data.strip() == "[DONE]":
                    break
                chunk = json.loads(event.data)
                choices = chunk.get("choices") or []
                if not choices:
                    # Keep-alive / usage-only chunks carry no delta; the old
                    # unconditional ["choices"][0] indexing crashed on them.
                    continue
                assistant += choices[0].get("delta", {}).get("content", "") or ""
                yield history + [(message, assistant)]  # update the chat box live
    except Exception as err:  # deliberately broad: surface the error in the UI
        yield history + [(message, f"[ERROR] {err}")]
# ----------------------------- UI ---------------------------------------
with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
    gr.Markdown(
        "## 🎨 Mixture of Inputs (MoI) Demo \n"
        "Streaming vLLM demo with dynamic **beta** adjustment in MoI, higher beta means less blending."
    )

    # Sampling / MoI controls — all on one row.
    with gr.Row():
        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI Beta")
        temperature = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
        max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")

    chatbot = gr.Chatbot(height=450)
    user_box = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
    clear_btn = gr.Button("Clear chat")

    # Wiring: Enter in the textbox streams a completion into the chat box.
    user_box.submit(
        stream_completion,
        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
        outputs=chatbot,
    )
    # Clear bypasses the queue so the reset is instantaneous.
    clear_btn.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch()