File size: 2,721 Bytes
c1965a3
 
a846510
f873ce7
c1965a3
4c07c1e
a846510
4c07c1e
c1965a3
 
 
 
 
 
 
 
 
 
 
 
 
f873ce7
4c07c1e
a846510
c1965a3
f873ce7
a846510
 
c1965a3
f873ce7
a846510
f873ce7
 
c1965a3
f873ce7
 
a846510
c1965a3
 
 
4c07c1e
c1965a3
f873ce7
c1965a3
f873ce7
c1965a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c07c1e
c1965a3
 
 
4c07c1e
c1965a3
 
 
 
 
 
 
a846510
f873ce7
c1965a3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# app.py
import json
import requests
import sseclient
import gradio as gr

API_URL = "http://localhost:8000/v1/chat/completions"


def stream_completion(message, history, max_tokens, temperature, top_p, beta):
    """
    Gradio callback: stream an assistant reply for the newest user message.

    Parameters
    ----------
    message : str
        The newest user message from the textbox.
    history : list[tuple[str, str]]
        Prior (user, assistant) turn pairs from the Chatbot component.
    max_tokens : int | float
        Upper bound on generated tokens (cast to int for the API).
    temperature, top_p : float
        Standard OpenAI-style sampling parameters.
    beta : float
        Mixture-of-Inputs beta, forwarded via the X-MIXINPUTS-BETA header.

    Yields
    ------
    list[tuple[str, str]]
        The full updated history, re-yielded on every streamed token so the
        Chatbot renders the reply live. On failure, a single history entry
        carrying an "[ERROR] ..." message is yielded instead.
    """
    # ------- build OpenAI-style message list (no system prompt) -------------
    messages = []
    for usr, bot in history:
        if usr:
            messages.append({"role": "user", "content": usr})
        if bot:
            messages.append({"role": "assistant", "content": bot})
    messages.append({"role": "user", "content": message})

    payload = {
        "model": "Qwen/Qwen3-4B",
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": int(max_tokens),
        "stream": True,
    }
    headers = {
        "Content-Type": "application/json",
        "X-MIXINPUTS-BETA": str(beta),
    }

    try:
        # Context manager guarantees the streaming connection is released
        # even if the generator is closed early or a chunk fails to parse.
        with requests.post(
            API_URL, json=payload, stream=True, headers=headers, timeout=60
        ) as resp:
            resp.raise_for_status()
            client = sseclient.SSEClient(resp)

            assistant = ""
            for event in client.events():
                data = event.data.strip()
                if not data:
                    continue  # keep-alive / comment events carry no JSON
                if data == "[DONE]":
                    break
                choices = json.loads(data).get("choices") or []
                if not choices:
                    continue  # e.g. a final usage-only chunk has no choices
                # "content" may be absent (role chunk) or explicitly null.
                delta = choices[0].get("delta", {}).get("content") or ""
                assistant += delta
                yield history + [(message, assistant)]  # update the chat box live

    except Exception as err:
        # Surface any failure (connection refused, bad JSON, HTTP error)
        # directly in the chat box rather than crashing the callback.
        yield history + [(message, f"[ERROR] {err}")]


# ----------------------- UI ---------------------------------------------
# Components are registered in creation order inside the Blocks context,
# so the statement order below determines the rendered layout.
with gr.Blocks(title="🧪 Mixture of Inputs (MoI) Demo") as demo:
    gr.Markdown(
        "## 🧪 Mixture of Inputs (MoI) Demo  \n"
        "Streaming local vLLM demo with dynamic **beta** adjustment."
    )

    # sliders first – all on one row
    with gr.Row():
        max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")
        temperature = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI Beta")

    chatbot = gr.Chatbot(height=450)
    user_box = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
    clear_btn = gr.Button("Clear chat")

    # wiring
    # stream_completion is a generator, so each yielded history updates the
    # Chatbot live; input order must match the callback's signature.
    user_box.submit(
        stream_completion,
        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
        outputs=chatbot,
    )
    # Returning None resets the Chatbot; queue=False clears immediately
    # without waiting behind any in-flight generation.
    clear_btn.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch()