"""Gradio chat demo that streams responses from a (local) OpenAI‑compatible
endpoint using the official `openai` Python SDK.  The server is assumed to be
running at http://0.0.0.0:8000 with the v1 REST routes.  A custom header
`X‑MIXINPUTS‑BETA` is forwarded so MoI can adjust its blending strength at
runtime.

Launch with:
    python app_openai.py
"""

from __future__ import annotations

import os
import openai
import gradio as gr

# ──────────────────────────────────────────────────────────────────────────────
# OpenAI client configuration
# ──────────────────────────────────────────────────────────────────────────────
# ``openai`` still expects an API key even if the backend ignores it, so we use
# a dummy value when none is provided.  The *base_url* points to the local
# vLLM server that speaks the OpenAI REST dialect.
# -----------------------------------------------------------------------------
openai.api_key = os.getenv("OPENAI_API_KEY", "EMPTY")
openai.base_url = "http://0.0.0.0:8000/v1"

# ──────────────────────────────────────────────────────────────────────────────
# Chat handler
# ──────────────────────────────────────────────────────────────────────────────

def stream_completion(message: str,
                      history: list[tuple[str, str]],
                      max_tokens: int,
                      temperature: float,
                      top_p: float,
                      beta: float):
    """Gradio callback that yields streaming assistant replies.

    The function reconstructs the conversation *excluding* any system prompt
    and then calls ``openai.chat.completions.create`` with ``stream=True``.
    Each incoming delta is appended to an ``assistant`` buffer which is sent
    back to the Chatbot component for real‑time display.
    """

    # Build OpenAI‑style message list from prior turns
    messages: list[dict[str, str]] = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Current user input comes last
    messages.append({"role": "user", "content": message})

    try:
        # Kick off streaming completion
        response = openai.chat.completions.create(
            model="Qwen/Qwen3-4B",
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stream=True,
            # Forward MoI blending coefficient to the backend
            extra_headers={"X-MIXINPUTS-BETA": str(beta)},
        )

        assistant = ""
        for chunk in response:
            # ``delta.content`` is None for e.g. role announcements; guard with or ""
            delta = chunk.choices[0].delta.content or ""
            assistant += delta
            yield history + [(message, assistant)]  # live update

    except Exception as err:  # pylint: disable=broad-except
        yield history + [(message, f"[ERROR] {err}")]


# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────────────────────────────────────

with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
    gr.Markdown(
        "## 🎨 Mixture of Inputs (MoI) Demo  \n"
        "Streaming vLLM demo with dynamic **beta** adjustment in MoI "
        "(higher beta → less blending)."
    )

    with gr.Row():  # sliders first
        beta        = gr.Slider(0.0, 10.0, value=1.0,  step=0.1,  label="MoI β")
        temperature = gr.Slider(0.1, 1.0,  value=0.6,  step=0.1,  label="Temperature")
        top_p       = gr.Slider(0.1, 1.0,  value=0.80, step=0.05, label="Top‑p")
        max_tokens  = gr.Slider(1,   2048, value=512,  step=1,    label="Max new tokens")

    chatbot   = gr.Chatbot(height=450)
    user_box  = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
    clear_btn = gr.Button("Clear chat")

    user_box.submit(
        fn=stream_completion,
        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
        outputs=chatbot,
    )

    clear_btn.click(lambda: None, None, chatbot, queue=False)

# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    demo.launch()