Nymbo committed on
Commit 1cee504 · verified · 1 Parent(s): 2d6eaa5

Update app.py

Files changed (1):
  1. app.py +67 -88
app.py CHANGED
@@ -1,21 +1,14 @@
 import gradio as gr
-from openai import OpenAI
+from huggingface_hub import InferenceClient
 import os
-import requests
 import json

 ACCESS_TOKEN = os.getenv("HF_TOKEN")
 print("Access token loaded.")

-# Initialize the OpenAI client for HF Inference
-hf_client = OpenAI(
-    base_url="https://api-inference.huggingface.co/v1/",
-    api_key=ACCESS_TOKEN,
-)
-print("HF Inference OpenAI client initialized.")
-
-# Cerebras API endpoint
-CEREBRAS_API_URL = "https://router.huggingface.co/cerebras/v1/chat/completions"
+# Initialize the HF Inference Client
+client = InferenceClient(token=ACCESS_TOKEN)
+print("Hugging Face Inference Client initialized.")

 def respond(
     message,
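The import swap above collapses two code paths (an OpenAI-compatible client plus a hand-rolled Cerebras endpoint) into a single `huggingface_hub.InferenceClient`. A minimal sketch of that setup, assuming `huggingface_hub` >= 0.28 (the series that introduced inference-provider routing), where a provider can also be pinned when the client is constructed:

```python
import os
from huggingface_hub import InferenceClient

token = os.getenv("HF_TOKEN")  # same env var the Space reads

# Default client: requests go to the standard hf-inference backend.
client = InferenceClient(token=token)

# Assumption for illustration: recent releases accept provider= at
# construction time to route every call through one provider.
cerebras_client = InferenceClient(provider="cerebras", token=token)
```

Pinning the provider on the client keeps the per-call arguments identical regardless of backend, which is what lets the single respond() path below replace the old per-provider branches.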
@@ -41,7 +34,7 @@ def respond(
     if seed == -1:
         seed = None

-    # Prepare messages for API
+    # Prepare messages in the format expected by the API
     messages = [{"role": "system", "content": system_message}]
     print("Initial messages array constructed.")

@@ -66,80 +59,45 @@ def respond(

     # Start with an empty string to build the response as tokens stream in
     response = ""
+    print(f"Sending request to {provider} provider.")
+
+    # Prepare parameters for the chat completion request
+    parameters = {
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "frequency_penalty": frequency_penalty,
+    }

-    # Handle different providers
-    if provider == "hf-inference":
-        print("Using HF Inference API.")
-        # Use the OpenAI client for HF Inference
-        for message_chunk in hf_client.chat.completions.create(
+    if seed is not None:
+        parameters["seed"] = seed
+
+    # Use the InferenceClient for making the request with proper provider selection
+    try:
+        # Create a generator for the streaming response
+        stream = client.chat_completion(
             model=model_to_use,
-            max_tokens=max_tokens,
-            stream=True,
-            temperature=temperature,
-            top_p=top_p,
-            frequency_penalty=frequency_penalty,
-            seed=seed,
             messages=messages,
-        ):
-            token_text = message_chunk.choices[0].delta.content
-            if token_text is not None:  # Handle None values that might come in stream
-                print(f"Received token: {token_text}")
-                response += token_text
-                yield response
-
-    elif provider == "cerebras":
-        print("Using Cerebras API via HF Router.")
-
-        # Prepare headers and payload for the Cerebras API
-        headers = {
-            "Authorization": f"Bearer {ACCESS_TOKEN}",
-            "Content-Type": "application/json"
-        }
-
-        payload = {
-            "model": model_to_use,
-            "messages": messages,
-            "max_tokens": max_tokens,
-            "temperature": temperature,
-            "top_p": top_p,
-            "frequency_penalty": frequency_penalty,
-            "stream": True
-        }
-
-        if seed is not None:
-            payload["seed"] = seed
+            stream=True,
+            provider=provider,  # Use the selected provider
+            **parameters  # Pass all other parameters
+        )

-        # Make the streaming request to Cerebras
-        with requests.post(
-            CEREBRAS_API_URL,
-            headers=headers,
-            json=payload,
-            stream=True
-        ) as req:
-            # Handle Server-Sent Events (SSE) format
-            for line in req.iter_lines():
-                if line:
-                    # Skip the "data: " prefix
-                    if line.startswith(b'data: '):
-                        line = line[6:]
-
-                    # Skip "[DONE]" message
-                    if line == b'[DONE]':
-                        continue
-
-                    try:
-                        # Parse the JSON chunk
-                        chunk = json.loads(line)
-                        token_text = chunk.get("choices", [{}])[0].get("delta", {}).get("content")
-
-                        if token_text:
-                            print(f"Received Cerebras token: {token_text}")
-                            response += token_text
-                            yield response
-                    except json.JSONDecodeError as e:
-                        print(f"Error decoding JSON: {e}, Line: {line}")
-                        continue
-
+        # Process the streaming response
+        for chunk in stream:
+            if hasattr(chunk, 'choices') and len(chunk.choices) > 0:
+                # Extract the content from the response
+                if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
+                    token_text = chunk.choices[0].delta.content
+                    if token_text:
+                        print(f"Received token: {token_text}")
+                        response += token_text
+                        yield response
+    except Exception as e:
+        print(f"Error during inference: {e}")
+        response += f"\nError: {str(e)}"
+        yield response
+
     print("Completed response generation.")

 # GRADIO UI
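The replacement body funnels every provider through one streaming call. A self-contained sketch of that pattern, with assumptions flagged: in the documented `chat_completion` API the length cap is named `max_tokens` (not `max_new_tokens`) and provider routing is normally fixed on the client, so the per-call `provider=` and `max_new_tokens=` keywords forwarded above may not be accepted by every `huggingface_hub` release:

```python
import os
from huggingface_hub import InferenceClient

# Assumption: provider pinned at construction rather than passed per call.
client = InferenceClient(provider="cerebras", token=os.getenv("HF_TOKEN"))

stream = client.chat_completion(
    model="meta-llama/Llama-3.3-70B-Instruct",  # illustrative model choice
    messages=[{"role": "user", "content": "Say hello."}],
    stream=True,
    max_tokens=128,  # documented name for the token limit
)

# Chunks mirror the OpenAI delta shape: choices[0].delta.content.
for chunk in stream:
    delta = chunk.choices[0].delta.content if chunk.choices else None
    if delta:
        print(delta, end="", flush=True)
```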
@@ -193,12 +151,22 @@ custom_model_box = gr.Textbox(
     placeholder="meta-llama/Llama-3.3-70B-Instruct"
 )

-# New provider selection radio
+# Available providers as of April 2025
+providers_list = [
+    "hf-inference",  # Default Hugging Face Inference
+    "cerebras",      # Cerebras provider
+    "together",      # Together AI
+    "sambanova",     # SambaNova
+    "replicate",     # Replicate
+    "fal-ai"         # Fal.ai
+]
+
+# Provider selection radio
 provider_radio = gr.Radio(
-    choices=["hf-inference", "cerebras"],
+    choices=providers_list,
     value="hf-inference",
     label="Inference Provider",
-    info="Select which inference provider to use"
+    info="Select which inference provider to use. Uses your Hugging Face PRO credits."
 )

 def set_custom_model_from_radio(selected):
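The `provider_radio` defined in the hunk above only takes effect because it is passed to the chat function as an additional input. A hedged sketch of that wiring (names and the trivial respond body are illustrative, not the Space's exact code):

```python
import gradio as gr

provider_radio = gr.Radio(
    choices=["hf-inference", "cerebras", "together"],
    value="hf-inference",
    label="Inference Provider",
)

# Gradio appends each additional input after (message, history), so the
# selected provider arrives as the last positional argument.
def respond(message, history, provider):
    yield f"[{provider}] echo: {message}"  # placeholder streaming body

demo = gr.ChatInterface(fn=respond, additional_inputs=[provider_radio])

if __name__ == "__main__":
    demo.launch()
```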
@@ -298,11 +266,22 @@ with demo:

     # Add new accordion for advanced settings including provider selection
     with gr.Accordion("Advanced Settings", open=False):
-        # The provider_radio is already defined above, we're just adding it to the UI here
         gr.Markdown("### Inference Provider")
-        gr.Markdown("Select which provider to use for inference. Default is Hugging Face Inference API.")
+        gr.Markdown("Select which provider to use for inference. Uses your Hugging Face PRO credits.")
         # Provider radio is already included in the additional_inputs
-        gr.Markdown("Note: Different providers may support different models and parameters.")
+
+        gr.Markdown("""
+        ### Provider Information
+
+        - **hf-inference**: Default Hugging Face Inference API
+        - **cerebras**: Cerebras AI models via Hugging Face router
+        - **together**: Together AI models
+        - **sambanova**: SambaNova models
+        - **replicate**: Replicate models
+        - **fal-ai**: Fal.ai models
+
+        As a PRO user, you receive $2 of credits monthly across all providers.
+        """)

 print("Gradio interface initialized.")
