Spaces:

amd
/

gpt-oss-120b-chatbot

Running on CPU Upgrade

App Files Files Community

mahdicv committed 12 days ago

Commit

f1b7ce9

1 Parent(s): 94b2d6e

initial commit to add working code

Browse files

Files changed (5) hide show

README.md +7 -7
app.py +102 -50
gateway.py +69 -0
requirements.txt +1 -1
utils.py +12 -0

README.md CHANGED Viewed

@@ -1,14 +1,14 @@
 ---
-title: Gpt Oss 120b Chatbot
-emoji: 💬
-colorFrom: yellow
-colorTo: purple
 sdk: gradio
-sdk_version: 5.0.1
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: 'Alpha chatbot using OpenAI'' gpt-oss-120b released on day 0. '
 ---
-An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).

 ---
+title: Openai Amd Modelx Internal
+emoji: 💻
+colorFrom: red
+colorTo: pink
 sdk: gradio
+sdk_version: 5.36.2
 app_file: app.py
 pinned: false
 license: apache-2.0
+short_description: 'internal repo to test '
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,64 +1,116 @@
-import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
     additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
     ],
 )
 if __name__ == "__main__":
-    demo.launch()

+import os, re, logging, gradio as gr
+from openai import OpenAI
+from gateway import request_generation
+from utils import LATEX_DELIMS
+# Credentials and endpoint of the OpenAI-compatible backend, read from the environment.
+openai_api_key = os.getenv("API_KEY")
+openai_api_base = os.getenv("API_ENDPOINT")
+# Model identifier sent with each request; empty string when MODEL_NAME is unset.
+MODEL = os.getenv("MODEL_NAME", "")
+# NOTE(review): generate() hands the key and base URL to request_generation()
+# rather than using this module-level client -- confirm whether it is still needed.
+client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
+# Tunables, each overridable through an environment variable of the same name.
+MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
+CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
+# Queue capacity defaults to four times the concurrency limit.
+QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))
+logging.basicConfig(level=logging.INFO)
+def format_analysis_response(text):
+    """Render raw model output as Markdown with separate analysis/response parts.
+
+    The text is searched for the first ``analysis ... assistantfinal`` span.
+    When found, the text between the two markers becomes the italicised
+    "Analysis" section and everything after the matched ``assistantfinal``
+    marker becomes the "Response" section. When no such span exists the
+    stripped input is returned unchanged.
+
+    Args:
+        text: Complete model output as a string.
+
+    Returns:
+        The formatted Markdown string, or ``text.strip()`` if the markers
+        are absent.
+    """
+    m = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL)
+    if m is None:
+        return text.strip()
+    reasoning = m.group(1).strip()
+    # Slice after the marker the regex actually matched. Re-splitting on the
+    # first "assistantfinal" in the whole text could pick an earlier marker
+    # and leak the analysis span into the response.
+    response = text[m.end():].strip()
+    return (
+        f"**🤔 Analysis:**\n\n*{reasoning}*\n\n---\n\n"
+        f"**💬 Response:**\n\n{response}"
+    )
+def generate(message, history,
+             system_prompt, temperature,
+             frequency_penalty=0.0, presence_penalty=0.0,
+             max_new_tokens=MAX_NEW_TOKENS):
+    """Stream a chat reply for the Gradio chat interface.
+
+    Args:
+        message: The user's new prompt.
+        history: Prior turns, either as role/content dicts or as
+            (user, assistant) pairs.
+        system_prompt: System message forwarded to the backend.
+        temperature: Sampling temperature forwarded to the backend.
+        frequency_penalty: Forwarded to the backend. Defaults to 0.0 so the
+            function can be called when the UI supplies no control for it.
+        presence_penalty: Forwarded to the backend. Defaults to 0.0 for the
+            same reason.
+        max_new_tokens: Completion token limit; defaults to MAX_NEW_TOKENS.
+
+    Yields:
+        Progressive snapshots of the reply, then the final formatted text.
+        On failure a single error string is yielded instead of raising.
+    """
+    if not message or not message.strip():
+        yield "Please enter a prompt."
+        return
+    msgs = []
+    for h in history or []:
+        if isinstance(h, dict):
+            # Forward only the role/content pair; any other keys carried by
+            # the history entry are not part of the chat-completions schema.
+            if h.get("role") and h.get("content"):
+                msgs.append({"role": h["role"], "content": h["content"]})
+        elif isinstance(h, (list, tuple)) and len(h) == 2:
+            u, a = h
+            if u:
+                msgs.append({"role": "user", "content": u})
+            if a:
+                msgs.append({"role": "assistant", "content": a})
+    logging.info("[User] %s", message)
+    logging.info("[System] %s | Temp=%s", system_prompt, temperature)
+    collected, buffer = "", ""
+    yielded_once = False
+    try:
+        for delta in request_generation(
+            api_key=openai_api_key, api_base=openai_api_base,
+            message=message, system_prompt=system_prompt,
+            model_name=MODEL, chat_history=msgs,
+            temperature=temperature,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+            max_new_tokens=max_new_tokens,
+        ):
+            if not delta:
+                continue
+            collected += delta
+            buffer += delta
+            if not yielded_once:
+                # Emit the first piece immediately so the UI shows activity.
+                yield delta
+                buffer = ""
+                yielded_once = True
+                continue
+            # Afterwards refresh the UI only on a newline or after ~150 chars.
+            if "\n" in buffer or len(buffer) > 150:
+                yield collected
+                buffer = ""
+        final = format_analysis_response(collected)
+        # An odd number of "$" would leave a math delimiter open; close it.
+        if final.count("$") % 2:
+            final += "$"
+        yield final
+    except Exception as e:
+        logging.exception("Stream failed")
+        yield f"❌ Error: {e}"
+# Chat UI definition: generate() is the streaming handler and the chat history
+# uses the "messages" format.
+chatbot_ui = gr.ChatInterface(
+    fn=generate,
+    type="messages",
+    chatbot=gr.Chatbot(
+        label="OSS vLLM Chatbot",
+        type="messages",
+        scale=2,
+        height=600,
+        latex_delimiters=LATEX_DELIMS,
+    ),
+    stop_btn=True,
+    # NOTE(review): generate() accepts five extra parameters; only the first
+    # two (system prompt, temperature) are wired to UI controls here.
     additional_inputs=[
+        gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
+        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
     ],
+    # Each example is a one-element list holding the prompt text.
+    examples=[
+        ["Explain the difference between supervised and unsupervised learning."],
+        ["Summarize the plot of Inception in two sentences."],
+        ["Show me the LaTeX for the quadratic formula."],
+        ["What are advantages of AMD Instinct MI300X GPU?"],
+        ["Derive the gradient of softmax cross-entropy loss."],
+        ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
+    ],
+    # title="Open-source GPT-OSS-120B on AMD MI300X",
+    title=" GPT-OSS-120B on AMD MI300X",
+    description="This Space is an Alpha release that demonstrates gpt-oss-120b model running on AMD MI300 infrastructure. The space is built with Apache 2.0 License.",
 )
 if __name__ == "__main__":
+    # Configure the request queue (size and default concurrency come from the
+    # module-level settings), then launch the app.
+    chatbot_ui.queue(max_size=QUEUE_SIZE,
+                     default_concurrency_limit=CONCURRENCY_LIMIT).launch()

gateway.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import logging
+from openai import OpenAI
+from typing import List, Generator, Optional
+# Configure root logging at INFO level when this module is imported.
+logging.basicConfig(level=logging.INFO)
+def request_generation(
+    api_key: str,
+    api_base: str,
+    message: str,
+    system_prompt: str,
+    model_name: str,
+    chat_history: Optional[List[dict]] = None,
+    temperature: float = 0.3,
+    frequency_penalty: float = 0.0,
+    presence_penalty: float = 0.0,
+    max_new_tokens: int = 1024,
+    tools: Optional[List[dict]] = None,
+    tool_choice: Optional[str] = None,
+) -> Generator[str, None, None]:
+    """
+    Sends a streaming chat request to an OpenAI-compatible backend using the official OpenAI client.
+    Buffers output to improve LaTeX rendering.
+
+    Args:
+        api_key: API key passed to the OpenAI client.
+        api_base: Base URL of the backend.
+        message: The new user message, appended after the history.
+        system_prompt: Content of the leading system message.
+        model_name: Model identifier sent with the request.
+        chat_history: Optional earlier messages inserted between the system
+            message and the new user message.
+        temperature: Sampling temperature.
+        frequency_penalty: Frequency penalty sent with the request.
+        presence_penalty: Presence penalty sent with the request.
+        max_new_tokens: Sent as ``max_tokens``.
+        tools: Optional tool definitions; only sent when truthy.
+        tool_choice: Optional tool choice; only sent when truthy.
+
+    Yields:
+        Buffered pieces of the reply, flushed whenever the buffer contains a
+        newline or exceeds 150 characters, plus any remainder at the end.
+        If the request fails, a single ``"Error: ..."`` string is yielded
+        instead of raising.
+    """
+    client = OpenAI(api_key=api_key, base_url=api_base)
+    messages = [{"role": "system", "content": system_prompt}]
+    if chat_history:
+        messages.extend(chat_history)
+    messages.append({"role": "user", "content": message})
+    request_args = {
+        "model": model_name,
+        "messages": messages,
+        "temperature": temperature,
+        "frequency_penalty": frequency_penalty,
+        "presence_penalty": presence_penalty,
+        "max_tokens": max_new_tokens,
+        "stream": True,
+    }
+    if tools:
+        request_args["tools"] = tools
+    if tool_choice:
+        request_args["tool_choice"] = tool_choice
+    logging.info("[Gateway] Request to %s | Model: %s", api_base, model_name)
+    try:
+        stream = client.chat.completions.create(**request_args)
+        buffer = ""
+        try:
+            for chunk in stream:
+                # Some chunks carry no choices; indexing [0] on them would
+                # raise IndexError and abort the stream.
+                if not chunk.choices:
+                    continue
+                buffer += chunk.choices[0].delta.content or ""
+                if "\n" in buffer or len(buffer) > 150:
+                    yield buffer
+                    buffer = ""
+        finally:
+            # Runs when the consumer stops iterating early as well, so the
+            # underlying HTTP response is always released.
+            stream.close()
+        if buffer:
+            yield buffer
+    except Exception as e:
+        logging.exception("[Gateway] Streaming failed")
+        yield f"Error: {e}"

requirements.txt CHANGED Viewed

	@@ -1 +1 @@
1	- ~~huggingface_hub==0.25.2~~


1	+ openai

utils.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# ----------------------------------------------------------------------
+# KaTeX delimiter config for Gradio
+# ----------------------------------------------------------------------
+# Each (left, right, display) triple becomes one delimiter entry; "display"
+# is True for the "$$" and "\[ \]" pairs and False for "$" and "\( \)".
+LATEX_DELIMS = [
+    {"left": left, "right": right, "display": display}
+    for left, right, display in (
+        ("$$", "$$", True),
+        ("$", "$", False),
+        ("\\[", "\\]", True),
+        ("\\(", "\\)", False),
+    )
+]