# app.py
import json
import threading

import requests
import sseclient
import gradio as gr

from server import setup_mixinputs, launch_vllm_server
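# NOTE: `server` is the Space's local helper module; setup_mixinputs() is assumed
# to apply the Mixture-of-Inputs (MoI) patches and launch_vllm_server() to start
# the OpenAI-compatible vLLM backend on localhost:8000.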
API_URL = "http://localhost:8000/v1/chat/completions"


def stream_completion(message, history, max_tokens, temperature, top_p, beta):
    """
    Gradio callback: takes the newest user message + full chat history,
    returns an updated history while streaming assistant tokens.
    """
    # ------- build OpenAI-style message list (no system prompt) -------------
    messages = []
    for usr, bot in history:
        if usr:
            messages.append({"role": "user", "content": usr})
        if bot:
            messages.append({"role": "assistant", "content": bot})
    messages.append({"role": "user", "content": message})
    payload = {
        "model": "Qwen/Qwen3-4B",
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": int(max_tokens),
        "stream": True,
    }
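    # MoI-specific: the beta from the UI slider is forwarded per request via a
    # custom HTTP header, so it can be changed live without restarting vLLM.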
    headers = {
        "Content-Type": "application/json",
        "X-MIXINPUTS-BETA": str(beta),
    }
    try:
        resp = requests.post(API_URL, json=payload, stream=True, headers=headers, timeout=60)
        resp.raise_for_status()

        # sseclient-py wraps the streaming HTTP response and yields one
        # server-sent event per chunk of the OpenAI-style stream
        client = sseclient.SSEClient(resp)

        assistant = ""
        for event in client.events():
            if event.data.strip() == "[DONE]":
                break
            # each chunk carries an incremental delta; "content" may be absent or
            # null (e.g. the initial role-only chunk), so fall back to ""
            delta = json.loads(event.data)["choices"][0]["delta"].get("content") or ""
            assistant += delta
            yield history + [(message, assistant)]  # update the chat box live

    except Exception as err:
        yield history + [(message, f"[ERROR] {err}")]
# ----------------------- UI ---------------------------------------------
with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
    gr.Markdown(
        "## 🎨 Mixture of Inputs (MoI) Demo \n"
        "Streaming vLLM demo with dynamic **beta** adjustment in MoI; "
        "a higher beta means less blending."
    )
    # sliders first – all on one row
    with gr.Row():
        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI Beta")
        temperature = gr.Slider(0.1, 1.0, value=0.6, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.80, step=0.05, label="Top-p")
        max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")

    chatbot = gr.Chatbot(height=450)
    user_box = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
    clear_btn = gr.Button("Clear chat")
    # wiring
    user_box.submit(
        stream_completion,
        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
        outputs=chatbot,
    )
    clear_btn.click(lambda: None, None, chatbot, queue=False)
if __name__ == "__main__":
    setup_mixinputs()
    # run the vLLM backend in a daemon thread so the Gradio UI starts immediately;
    # requests sent before the model finishes loading surface as [ERROR] in the chat
    threading.Thread(target=launch_vllm_server, daemon=True).start()
    demo.launch()
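
# For reference, the same MoI endpoint can be exercised without the UI, using the
# payload and headers built above (a sketch; adjust host/port to your setup):
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -H "X-MIXINPUTS-BETA: 1.0" \
#     -d '{"model": "Qwen/Qwen3-4B",
#          "messages": [{"role": "user", "content": "Hello"}],
#          "stream": true}'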