khurrameycon committed
Commit 3a240c4 · verified · 1 Parent(s): 5c6448d

Update app.py

Files changed (1):
  1. app.py +64 -32

app.py CHANGED
@@ -116,8 +116,17 @@ logger = logging.getLogger(__name__)
 class TextImageRequest(BaseModel):
     text: Optional[str] = None
     image_base64: Optional[str] = None
-    voice: str = "af_heart"
+    voice: str = "af_heart"  # Default voice that we know exists
     speed: float = 1.0
+
+    # List of known available voices - update this based on what's actually available
+    AVAILABLE_VOICES = ["af_heart"]  # Add more voices as they become available
+
+    # Validate that the voice exists
+    def validate_voice(self):
+        if self.voice not in self.AVAILABLE_VOICES:
+            return "af_heart"  # Default to a voice we know exists
+        return self.voice
 
 class AudioResponse(BaseModel):
     status: str
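A note on this hunk: an unannotated mutable class attribute like AVAILABLE_VOICES on a BaseModel trips Pydantic v2's non-annotated-attribute check (v1 tolerates it). Below is a minimal sketch of an equivalent, parse-time approach assuming Pydantic v2; the module-level constant and validator name are illustrative, not part of the commit:

```python
from typing import Optional
from pydantic import BaseModel, field_validator

AVAILABLE_VOICES = {"af_heart"}  # illustrative module-level constant

class TextImageRequest(BaseModel):
    text: Optional[str] = None
    image_base64: Optional[str] = None
    voice: str = "af_heart"
    speed: float = 1.0

    # Coerce unknown voices to the known-good default at parse time,
    # so handler code never sees an invalid value.
    @field_validator("voice")
    @classmethod
    def _clamp_voice(cls, v: str) -> str:
        return v if v in AVAILABLE_VOICES else "af_heart"
```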
@@ -145,31 +154,47 @@ def llm_chat_response(text, image_base64=None):
 
     logger.info("Initializing InferenceClient...")
     client = InferenceClient(
-        provider="hf-inference",  # Updated to the provider shown in the sample
+        provider="together",  # Using the provider shown in the sample
        api_key=HF_TOKEN
     )
 
-    # System message for better context
-    system_message = "You are a helpful assistant that provides concise responses."
-
     try:
+        # IMPORTANT: Following exactly the format from the sample code
         if image_base64:
             logger.info("Processing request with image")
+            prompt = text if text else "Describe this image in one sentence."
+
             messages = [
-                {"role": "system", "content": system_message},
-                {"role": "user", "content": [
-                    {"type": "text", "text": text if text else "Describe what you see in the image in one line only"},
-                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
-                ]}
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{image_base64}"
+                            }
+                        }
+                    ]
+                }
             ]
         else:
             logger.info("Processing text-only request")
             messages = [
-                {"role": "system", "content": system_message},
-                {"role": "user", "content": text + " Describe in one line only."}
+                {
+                    "role": "user",
+                    "content": text + " Describe in one line only."
+                }
             ]
 
         logger.info("Sending request to model...")
+        # Log the exact message structure we're sending
+        logger.info(f"Message structure: {messages}")
+
+        # Use the exact model name and parameters from the sample
         completion = client.chat.completions.create(
             model="meta-llama/Llama-3.2-11B-Vision-Instruct",
             messages=messages,
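For reference, the image_base64 field this hunk consumes is a bare base64 string; the handler itself prepends the "data:image/jpeg;base64," prefix. A minimal sketch of producing that payload from a local file (the helper name and path are illustrative):

```python
import base64

def to_image_b64(path: str) -> str:
    # Read a JPEG and return the bare base64 payload (no data: prefix);
    # the endpoint wraps it in a data URL itself.
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("ascii")

# image_base64 = to_image_b64("photo.jpg")
```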
@@ -178,27 +203,29 @@ def llm_chat_response(text, image_base64=None):
 
         logger.info(f"Received response from model")
 
-        # Simplified response handling based on the sample code
-        if not completion.choices or len(completion.choices) == 0:
-            logger.error("No choices returned from model.")
-            raise HTTPException(status_code=500, detail="Model returned no choices.")
+        # Very simple response handling exactly like the sample code
+        logger.info(f"Model response received: {completion}")
 
-        # Extract the content directly using the expected format
         try:
-            # Get message from first choice
-            message = completion.choices[0].message
-
-            # Extract content from message
-            if hasattr(message, "content"):
-                return message.content
-            elif isinstance(message, dict) and "content" in message:
-                return message["content"]
-            else:
-                logger.error(f"Unexpected message format: {message}")
-                raise HTTPException(status_code=500, detail="Unexpected message format from model")
+            # Extract response using the exact approach from the sample code
+            response = completion.choices[0].message.content
+            logger.info(f"Extracted response content: {response}")
+            return response
         except Exception as e:
             logger.error(f"Error extracting message content: {str(e)}")
-            raise HTTPException(status_code=500, detail=f"Failed to extract response content: {str(e)}")
+            logger.error(f"Attempting alternative extraction method...")
+
+            # Fallback method if the above fails
+            try:
+                if hasattr(completion.choices[0], "message"):
+                    if hasattr(completion.choices[0].message, "content"):
+                        return completion.choices[0].message.content
+
+                # Last resort - try accessing as dictionary
+                return completion.choices[0]["message"]["content"]
+            except Exception as e2:
+                logger.error(f"All extraction methods failed: {str(e2)}")
+                return "I couldn't process that input. Please try again with a different query."
 
     except Exception as e:
         logger.error(f"Error during model inference: {str(e)}")
@@ -250,12 +277,17 @@ async def generate_audio(request: TextImageRequest):
     text_reply = llm_chat_response(user_text, request.image_base64)
     logger.info(f"LLM response: {text_reply}")
 
+    # Validate voice parameter
+    validated_voice = request.validate_voice()
+    if validated_voice != request.voice:
+        logger.warning(f"Requested voice '{request.voice}' not available, using '{validated_voice}' instead")
+
     # Generate audio
-    logger.info(f"Generating audio using voice={request.voice}, speed={request.speed}")
+    logger.info(f"Generating audio using voice={validated_voice}, speed={request.speed}")
     try:
         generator = pipeline(
             text_reply,
-            voice=request.voice,
+            voice=validated_voice,
             speed=request.speed,
             split_pattern=r'\n+'
         )
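With the validation wired in, an unknown voice no longer fails the request; it is clamped to the default and logged. A hypothetical client call against the Space (the URL and route are assumptions, not taken from the commit):

```python
import requests

# Hypothetical host/route; substitute the deployed Space's actual endpoint.
resp = requests.post(
    "http://localhost:7860/generate-audio",
    json={"text": "Hello!", "voice": "af_nonexistent", "speed": 1.0},
)
# Server side: logs a warning and falls back to "af_heart".
print(resp.status_code, resp.headers.get("content-type"))
```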
@@ -273,7 +305,7 @@ async def generate_audio(request: TextImageRequest):
     # Convert to 16-bit signed integers
     pcm_data = (audio_numpy * 32767).astype(np.int16)
 
-    # Convert to bytes (automatically uses row-major order)
+    # Convert to bytes (automatically uses row-major order)
     raw_audio = pcm_data.tobytes()
 
     # Return PCM data with minimal necessary headers
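Since the endpoint returns raw 16-bit mono PCM rather than a container format, a client has to wrap it itself. A sketch of writing it out as WAV; the 24 kHz rate is an assumption about the TTS pipeline's output, so adjust to the actual sample rate:

```python
import wave
import numpy as np

def pcm16_to_wav(raw_audio: bytes, path: str, sample_rate: int = 24000) -> None:
    # Minimal WAV container around raw 16-bit mono PCM.
    with wave.open(path, "wb") as w:
        w.setnchannels(1)       # mono
        w.setsampwidth(2)       # 2 bytes per sample = 16-bit
        w.setframerate(sample_rate)
        w.writeframes(raw_audio)

# Stand-in waveform, mirroring the server-side float -> int16 conversion:
audio_numpy = np.sin(np.linspace(0, 440 * 2 * np.pi, 24000))
pcm16_to_wav((audio_numpy * 32767).astype(np.int16).tobytes(), "out.wav")
```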
 