"""Gradio chat demo that streams responses from a (local) OpenAI‑compatible endpoint using the official `openai` Python SDK. The server is assumed to be running at http://0.0.0.0:8000 with the v1 REST routes. A custom header `X‑MIXINPUTS‑BETA` is forwarded so MoI can adjust its blending strength at runtime. Launch with: python app_openai.py """ from __future__ import annotations import os import openai import gradio as gr # ────────────────────────────────────────────────────────────────────────────── # OpenAI client configuration # ────────────────────────────────────────────────────────────────────────────── # ``openai`` still expects an API key even if the backend ignores it, so we use # a dummy value when none is provided. The *base_url* points to the local # vLLM server that speaks the OpenAI REST dialect. # ----------------------------------------------------------------------------- openai.api_key = os.getenv("OPENAI_API_KEY", "EMPTY") openai.base_url = "http://0.0.0.0:8000/v1" # ────────────────────────────────────────────────────────────────────────────── # Chat handler # ────────────────────────────────────────────────────────────────────────────── def stream_completion(message: str, history: list[tuple[str, str]], max_tokens: int, temperature: float, top_p: float, beta: float): """Gradio callback that yields streaming assistant replies. The function reconstructs the conversation *excluding* any system prompt and then calls ``openai.chat.completions.create`` with ``stream=True``. Each incoming delta is appended to an ``assistant`` buffer which is sent back to the Chatbot component for real‑time display. """ # Build OpenAI‑style message list from prior turns messages: list[dict[str, str]] = [] for user_msg, assistant_msg in history: if user_msg: messages.append({"role": "user", "content": user_msg}) if assistant_msg: messages.append({"role": "assistant", "content": assistant_msg}) # Current user input comes last messages.append({"role": "user", "content": message}) try: # Kick off streaming completion response = openai.chat.completions.create( model="Qwen/Qwen3-4B", messages=messages, temperature=temperature, top_p=top_p, max_tokens=max_tokens, stream=True, # Forward MoI blending coefficient to the backend extra_headers={"X-MIXINPUTS-BETA": str(beta)}, ) assistant = "" for chunk in response: # ``delta.content`` is None for e.g. role announcements; guard with or "" delta = chunk.choices[0].delta.content or "" assistant += delta yield history + [(message, assistant)] # live update except Exception as err: # pylint: disable=broad-except yield history + [(message, f"[ERROR] {err}")] # ────────────────────────────────────────────────────────────────────────────── # Gradio UI # ────────────────────────────────────────────────────────────────────────────── with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo: gr.Markdown( "## 🎨 Mixture of Inputs (MoI) Demo \n" "Streaming vLLM demo with dynamic **beta** adjustment in MoI " "(higher beta → less blending)." 
# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────────────────────────────────────
with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
    gr.Markdown(
        "## 🎨 Mixture of Inputs (MoI) Demo \n"
        "Streaming vLLM demo with dynamic **beta** adjustment in MoI "
        "(higher beta → less blending)."
    )

    with gr.Row():  # sliders first
        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI β")
        temperature = gr.Slider(0.1, 1.0, value=0.6, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.80, step=0.05, label="Top-p")
        max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")

    chatbot = gr.Chatbot(height=450)
    user_box = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
    clear_btn = gr.Button("Clear chat")

    user_box.submit(
        fn=stream_completion,
        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
        outputs=chatbot,
    )
    clear_btn.click(lambda: None, None, chatbot, queue=False)

# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    demo.launch()
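
# ──────────────────────────────────────────────────────────────────────────────
# Backend note (assumption, not part of the app): the demo expects an
# OpenAI-compatible server on http://0.0.0.0:8000. With stock vLLM, launching
# one would look like:
#
#     vllm serve Qwen/Qwen3-4B --host 0.0.0.0 --port 8000
#
# Reading X-MIXINPUTS-BETA requires a MoI-enabled backend, so treat the
# command above as a sketch and check the MoI documentation for the exact
# launch flags.
# ──────────────────────────────────────────────────────────────────────────────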