Nymbo committed · Commit 2d6eaa5 · verified · 1 Parent(s): 5b8ad4f

Update app.py

Files changed (1):
  1. app.py  +202 -127
app.py CHANGED
@@ -1,22 +1,22 @@
 import gradio as gr
 from openai import OpenAI
 import os
-import requests  # Added for potential future use, though OpenAI client handles it now
 
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
-if not ACCESS_TOKEN:
-    print("Warning: HF_TOKEN environment variable not set. Authentication might fail.")
-else:
-    print("Access token loaded.")
 
-# Base URLs for different providers
-HF_INFERENCE_BASE_URL = "https://api-inference.huggingface.co/v1/"
-CEREBRAS_ROUTER_BASE_URL = "https://router.huggingface.co/cerebras/v1/"  # Use base URL for OpenAI client
 
-# Default provider
-DEFAULT_PROVIDER = "hf-inference"
 
-# --- Main Respond Function ---
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -27,66 +27,51 @@ def respond(
     frequency_penalty,
     seed,
     custom_model,
-    inference_provider  # New argument for provider selection
 ):
-
-    print(f"--- New Request ---")
-    print(f"Selected Inference Provider: {inference_provider}")
     print(f"Received message: {message}")
-    # print(f"History: {history}")  # Can be verbose
     print(f"System message: {system_message}")
     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
     print(f"Selected model (custom_model): {custom_model}")
-
-    # Determine the base URL based on the selected provider
-    if inference_provider == "cerebras":
-        base_url = CEREBRAS_ROUTER_BASE_URL
-        print(f"Using Cerebras Router endpoint: {base_url}")
-    else:  # Default to hf-inference
-        base_url = HF_INFERENCE_BASE_URL
-        print(f"Using HF Inference API endpoint: {base_url}")
-
-    # Initialize the OpenAI client dynamically for each request
-    try:
-        client = OpenAI(
-            base_url=base_url,
-            api_key=ACCESS_TOKEN,
-        )
-        print("OpenAI client initialized for the request.")
-    except Exception as e:
-        print(f"Error initializing OpenAI client: {e}")
-        yield f"Error: Could not initialize API client for provider {inference_provider}. Check token and endpoint."
-        return
 
     # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None
 
     messages = [{"role": "system", "content": system_message}]
-    # print("Initial messages array constructed.")  # Less verbose logging
 
     # Add conversation history to the context
     for val in history:
-        user_part, assistant_part = val[0], val[1]
-        if user_part: messages.append({"role": "user", "content": user_part})
-        if assistant_part: messages.append({"role": "assistant", "content": assistant_part})
 
     # Append the latest user message
     messages.append({"role": "user", "content": message})
-    # print("Full message context prepared.")  # Less verbose logging
 
     # If user provided a model, use that; otherwise, fall back to a default model
-    # Ensure a default model is always set if custom_model is empty
-    model_to_use = custom_model.strip() if custom_model.strip() else "meta-llama/Llama-3.3-70B-Instruct"
     print(f"Model selected for inference: {model_to_use}")
 
-    # Start streaming response
     response = ""
-    print(f"Sending request to {inference_provider} via {base_url}...")
-
-    try:
-        stream = client.chat.completions.create(
             model=model_to_use,
             max_tokens=max_tokens,
             stream=True,
@@ -95,60 +80,138 @@ def respond(
             frequency_penalty=frequency_penalty,
             seed=seed,
             messages=messages,
-        )
-        for message_chunk in stream:
             token_text = message_chunk.choices[0].delta.content
-            # Handle potential None or empty tokens gracefully
-            if token_text:
-                # print(f"Received token: {token_text}")  # Very verbose
                 response += token_text
                 yield response
-            # Handle potential finish reason if needed (e.g., length)
-            # finish_reason = message_chunk.choices[0].finish_reason
-            # if finish_reason:
-            #     print(f"Stream finished with reason: {finish_reason}")
-
-    except Exception as e:
-        print(f"Error during API call to {inference_provider}: {e}")
-        yield f"Error: API call failed. Details: {str(e)}"
-        return  # Stop generation on error
-
     print("Completed response generation.")
 
-# --- GRADIO UI Elements ---
 
-chatbot = gr.Chatbot(height=600, show_copy_button=True, placeholder="Select a model and provider, then begin chatting", layout="panel")
 print("Chatbot interface created.")
 
-# Moved these inside the Accordion later
-system_message_box = gr.Textbox(value="You are a helpful assistant.", label="System Prompt")
-max_tokens_slider = gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max new tokens")  # Increased default
-temperature_slider = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")  # Adjusted range
-top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
-frequency_penalty_slider = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
-seed_slider = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")
 custom_model_box = gr.Textbox(
     value="",
-    label="Custom Model Path",
-    info="(Optional) Provide a Hugging Face model path. Overrides featured model selection.",
     placeholder="meta-llama/Llama-3.3-70B-Instruct"
 )
 
-# New UI Element for Provider Selection (will be placed in Accordion)
-inference_provider_radio = gr.Radio(
     choices=["hf-inference", "cerebras"],
-    value=DEFAULT_PROVIDER,
     label="Inference Provider",
-    info=f"Select the backend API. Default: {DEFAULT_PROVIDER}"
 )
-print("Inference provider radio button created.")
 
 
-# --- Gradio Chat Interface Definition ---
 demo = gr.ChatInterface(
     fn=respond,
     additional_inputs=[
-        # Order matters: must match the 'respond' function signature
         system_message_box,
         max_tokens_slider,
         temperature_slider,
@@ -156,36 +219,59 @@ demo = gr.ChatInterface(
         frequency_penalty_slider,
         seed_slider,
         custom_model_box,
-        inference_provider_radio,  # Added the new input
     ],
     fill_height=True,
     chatbot=chatbot,
     theme="Nymbo/Nymbo_Theme",
-    title="Multi-Provider Chat Hub",
-    description="Chat with various models using different inference backends (HF Inference API or Cerebras via HF Router)."
 )
 print("ChatInterface object created.")
 
-# --- Add Accordions for Settings within the Demo context ---
 with demo:
-    # Model Selection Accordion (existing logic)
     with gr.Accordion("Model Selection", open=False):
-        model_search_box = gr.Textbox(label="Filter Featured Models", placeholder="Search...", lines=1)
         print("Model search box created.")
 
-        # Example models list (keep your extensive list)
         models_list = [
-            "meta-llama/Llama-3.3-70B-Instruct", "meta-llama/Llama-3.1-70B-Instruct", "meta-llama/Llama-3.1-8B-Instruct",
-            "NousResearch/Hermes-3-Llama-3.1-8B", "mistralai/Mistral-Nemo-Instruct-2407", "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "mistralai/Mistral-7B-Instruct-v0.3", "Qwen/Qwen3-32B", "microsoft/Phi-3.5-mini-instruct",
-            # Add the rest of your models here...
         ]
         print("Models list initialized.")
 
         featured_model_radio = gr.Radio(
-            label="Select a Featured Model",
             choices=models_list,
-            value="meta-llama/Llama-3.3-70B-Instruct",  # Default featured model
            interactive=True
         )
         print("Featured models radio button created.")
@@ -193,44 +279,33 @@ with demo:
         def filter_models(search_term):
             print(f"Filtering models with search term: {search_term}")
             filtered = [m for m in models_list if search_term.lower() in m.lower()]
-            # Ensure a valid value is selected if the current one is filtered out
-            current_value = featured_model_radio.value
-            if current_value not in filtered and filtered:
-                new_value = filtered[0]  # Select the first available filtered model
-            elif not filtered:
-                new_value = None  # Or handle empty case as needed
-            else:
-                new_value = current_value  # Keep current if still valid
             print(f"Filtered models: {filtered}")
-            return gr.update(choices=filtered, value=new_value)
 
 
-        def set_custom_model_from_radio(selected_model):
-            """Updates the Custom Model text box when a featured model is selected."""
-            print(f"Featured model selected: {selected_model}")
-            return selected_model  # Directly return the selected model name
-
-        model_search_box.change(fn=filter_models, inputs=model_search_box, outputs=featured_model_radio)
-        featured_model_radio.change(fn=set_custom_model_from_radio, inputs=featured_model_radio, outputs=custom_model_box)
-        print("Model selection events linked.")
-
-    # Advanced Settings Accordion (New)
     with gr.Accordion("Advanced Settings", open=False):
-        # Place the provider selection and parameter sliders here
-        gr.Markdown("Configure inference parameters and select the backend provider.")
-        # Add the UI elements defined earlier into this accordion
-        gr.Textbox(value="You are a helpful assistant.", label="System Prompt").render()  # Render system_message_box here
-        inference_provider_radio.render()  # Render the provider radio here
-        max_tokens_slider.render()
-        temperature_slider.render()
-        top_p_slider.render()
-        frequency_penalty_slider.render()
-        seed_slider.render()
-        print("Advanced settings accordion created with provider selection and parameters.")
-
 
-print("Gradio interface fully initialized.")
 
 if __name__ == "__main__":
     print("Launching the demo application.")
-    demo.launch(show_api=False)
 
The updated app.py, as it reads after this commit (added lines marked with +):

 import gradio as gr
 from openai import OpenAI
 import os
+import requests
+import json
 
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
+print("Access token loaded.")
 
+# Initialize the OpenAI client for HF Inference
+hf_client = OpenAI(
+    base_url="https://api-inference.huggingface.co/v1/",
+    api_key=ACCESS_TOKEN,
+)
+print("HF Inference OpenAI client initialized.")
 
+# Cerebras API endpoint
+CEREBRAS_API_URL = "https://router.huggingface.co/cerebras/v1/chat/completions"
 
 def respond(
     message,
     history: list[tuple[str, str]],
     frequency_penalty,
     seed,
     custom_model,
+    provider  # New parameter for provider selection
 ):
     print(f"Received message: {message}")
+    print(f"History: {history}")
     print(f"System message: {system_message}")
     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
     print(f"Selected model (custom_model): {custom_model}")
+    print(f"Selected provider: {provider}")
 
     # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None
 
+    # Prepare messages for API
     messages = [{"role": "system", "content": system_message}]
+    print("Initial messages array constructed.")
 
     # Add conversation history to the context
     for val in history:
+        user_part = val[0]
+        assistant_part = val[1]
+        if user_part:
+            messages.append({"role": "user", "content": user_part})
+            print(f"Added user message to context: {user_part}")
+        if assistant_part:
+            messages.append({"role": "assistant", "content": assistant_part})
+            print(f"Added assistant message to context: {assistant_part}")
 
     # Append the latest user message
     messages.append({"role": "user", "content": message})
+    print("Latest user message appended.")
 
     # If user provided a model, use that; otherwise, fall back to a default model
+    model_to_use = custom_model.strip() if custom_model.strip() != "" else "meta-llama/Llama-3.3-70B-Instruct"
     print(f"Model selected for inference: {model_to_use}")
 
+    # Start with an empty string to build the response as tokens stream in
     response = ""
+
+    # Handle different providers
+    if provider == "hf-inference":
+        print("Using HF Inference API.")
+        # Use the OpenAI client for HF Inference
+        for message_chunk in hf_client.chat.completions.create(
             model=model_to_use,
             max_tokens=max_tokens,
             stream=True,
             frequency_penalty=frequency_penalty,
             seed=seed,
             messages=messages,
+        ):
             token_text = message_chunk.choices[0].delta.content
+            if token_text is not None:  # Handle None values that might come in stream
+                print(f"Received token: {token_text}")
                 response += token_text
                 yield response
+
+    elif provider == "cerebras":
+        print("Using Cerebras API via HF Router.")
+
+        # Prepare headers and payload for the Cerebras API
+        headers = {
+            "Authorization": f"Bearer {ACCESS_TOKEN}",
+            "Content-Type": "application/json"
+        }
+
+        payload = {
+            "model": model_to_use,
+            "messages": messages,
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "frequency_penalty": frequency_penalty,
+            "stream": True
+        }
+
+        if seed is not None:
+            payload["seed"] = seed
+
+        # Make the streaming request to Cerebras
+        with requests.post(
+            CEREBRAS_API_URL,
+            headers=headers,
+            json=payload,
+            stream=True
+        ) as req:
+            # Handle Server-Sent Events (SSE) format
+            for line in req.iter_lines():
+                if line:
+                    # Skip the "data: " prefix
+                    if line.startswith(b'data: '):
+                        line = line[6:]
+
+                    # Skip "[DONE]" message
+                    if line == b'[DONE]':
+                        continue
+
+                    try:
+                        # Parse the JSON chunk
+                        chunk = json.loads(line)
+                        token_text = chunk.get("choices", [{}])[0].get("delta", {}).get("content")
+
+                        if token_text:
+                            print(f"Received Cerebras token: {token_text}")
+                            response += token_text
+                            yield response
+                    except json.JSONDecodeError as e:
+                        print(f"Error decoding JSON: {e}, Line: {line}")
+                        continue
+
     print("Completed response generation.")
 
+# GRADIO UI
 
+chatbot = gr.Chatbot(height=600, show_copy_button=True, placeholder="Select a model and begin chatting", layout="panel")
 print("Chatbot interface created.")
 
+system_message_box = gr.Textbox(value="", placeholder="You are a helpful assistant.", label="System Prompt")
+
+max_tokens_slider = gr.Slider(
+    minimum=1,
+    maximum=4096,
+    value=512,
+    step=1,
+    label="Max new tokens"
+)
+temperature_slider = gr.Slider(
+    minimum=0.1,
+    maximum=4.0,
+    value=0.7,
+    step=0.1,
+    label="Temperature"
+)
+top_p_slider = gr.Slider(
+    minimum=0.1,
+    maximum=1.0,
+    value=0.95,
+    step=0.05,
+    label="Top-P"
+)
+frequency_penalty_slider = gr.Slider(
+    minimum=-2.0,
+    maximum=2.0,
+    value=0.0,
+    step=0.1,
+    label="Frequency Penalty"
+)
+seed_slider = gr.Slider(
+    minimum=-1,
+    maximum=65535,
+    value=-1,
+    step=1,
+    label="Seed (-1 for random)"
+)
+
+# The custom_model_box is what the respond function sees as "custom_model"
 custom_model_box = gr.Textbox(
     value="",
+    label="Custom Model",
+    info="(Optional) Provide a custom Hugging Face model path. Overrides any selected featured model.",
     placeholder="meta-llama/Llama-3.3-70B-Instruct"
 )
 
+# New provider selection radio
+provider_radio = gr.Radio(
     choices=["hf-inference", "cerebras"],
+    value="hf-inference",
     label="Inference Provider",
+    info="Select which inference provider to use"
 )
 
+def set_custom_model_from_radio(selected):
+    """
+    This function will get triggered whenever someone picks a model from the 'Featured Models' radio.
+    We will update the Custom Model text box with that selection automatically.
+    """
+    print(f"Featured model selected: {selected}")
+    return selected
 
 demo = gr.ChatInterface(
     fn=respond,
     additional_inputs=[
         system_message_box,
         max_tokens_slider,
         temperature_slider,
         frequency_penalty_slider,
         seed_slider,
         custom_model_box,
+        provider_radio,  # Add provider selection to inputs
     ],
     fill_height=True,
     chatbot=chatbot,
     theme="Nymbo/Nymbo_Theme",
 )
 print("ChatInterface object created.")
 
 with demo:
     with gr.Accordion("Model Selection", open=False):
+        model_search_box = gr.Textbox(
+            label="Filter Models",
+            placeholder="Search for a featured model...",
+            lines=1
+        )
         print("Model search box created.")
 
         models_list = [
+            "meta-llama/Llama-3.3-70B-Instruct",
+            "meta-llama/Llama-3.1-70B-Instruct",
+            "meta-llama/Llama-3.0-70B-Instruct",
+            "meta-llama/Llama-3.2-3B-Instruct",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "meta-llama/Llama-3.1-8B-Instruct",
+            "NousResearch/Hermes-3-Llama-3.1-8B",
+            "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+            "mistralai/Mistral-Nemo-Instruct-2407",
+            "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "mistralai/Mistral-7B-Instruct-v0.3",
+            "mistralai/Mistral-7B-Instruct-v0.2",
+            "Qwen/Qwen3-235B-A22B",
+            "Qwen/Qwen3-32B",
+            "Qwen/Qwen2.5-72B-Instruct",
+            "Qwen/Qwen2.5-3B-Instruct",
+            "Qwen/Qwen2.5-0.5B-Instruct",
+            "Qwen/QwQ-32B",
+            "Qwen/Qwen2.5-Coder-32B-Instruct",
+            "microsoft/Phi-3.5-mini-instruct",
+            "microsoft/Phi-3-mini-128k-instruct",
+            "microsoft/Phi-3-mini-4k-instruct",
+            "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+            "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+            "HuggingFaceH4/zephyr-7b-beta",
+            "HuggingFaceTB/SmolLM2-360M-Instruct",
+            "tiiuae/falcon-7b-instruct",
+            "01-ai/Yi-1.5-34B-Chat",
         ]
         print("Models list initialized.")
 
         featured_model_radio = gr.Radio(
+            label="Select a model below",
             choices=models_list,
+            value="meta-llama/Llama-3.3-70B-Instruct",
             interactive=True
         )
         print("Featured models radio button created.")
 
         def filter_models(search_term):
             print(f"Filtering models with search term: {search_term}")
             filtered = [m for m in models_list if search_term.lower() in m.lower()]
             print(f"Filtered models: {filtered}")
+            return gr.update(choices=filtered)
 
+        model_search_box.change(
+            fn=filter_models,
+            inputs=model_search_box,
+            outputs=featured_model_radio
+        )
+        print("Model search box change event linked.")
 
+        featured_model_radio.change(
+            fn=set_custom_model_from_radio,
+            inputs=featured_model_radio,
+            outputs=custom_model_box
+        )
+        print("Featured model radio button change event linked.")
+
+    # Add new accordion for advanced settings including provider selection
     with gr.Accordion("Advanced Settings", open=False):
+        # The provider_radio is already defined above, we're just adding it to the UI here
+        gr.Markdown("### Inference Provider")
+        gr.Markdown("Select which provider to use for inference. Default is Hugging Face Inference API.")
+        # Provider radio is already included in the additional_inputs
+        gr.Markdown("Note: Different providers may support different models and parameters.")
 
+print("Gradio interface initialized.")
 
 if __name__ == "__main__":
     print("Launching the demo application.")
+    demo.launch(show_api=True)
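
For reference, a minimal sketch of exercising the updated respond generator outside the Gradio UI. This is not part of the commit; it assumes HF_TOKEN is exported, that the file is saved as app.py on the import path, and that the four parameter names not shown in the diff (system_message, max_tokens, temperature, top_p) match how the function body uses them.

# sketch: stream one reply through respond() without launching the UI
from app import respond  # assumption: the diffed file is importable as app.py

last = ""
for partial in respond(
    message="Hello there!",
    history=[],                # no prior turns
    system_message="You are a helpful assistant.",
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
    frequency_penalty=0.0,
    seed=-1,                   # -1 is converted to None (random) inside respond()
    custom_model="",           # empty -> falls back to meta-llama/Llama-3.3-70B-Instruct
    provider="cerebras",       # or "hf-inference"; "cerebras" exercises the SSE branch
):
    last = partial             # respond() yields the accumulated text on each token

print(last)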