khurrameycon committed
Commit 60e1507 · verified · 1 Parent(s): 3a240c4

Update app.py

Files changed (1): app.py (+92 -107)
app.py CHANGED
@@ -94,20 +94,19 @@
 
 # return Response("No audio generated", status_code=400)
 
-from fastapi import FastAPI, Response, HTTPException
-from fastapi.responses import FileResponse, JSONResponse
+from fastapi import FastAPI, Response, HTTPException, Request
+from fastapi.responses import JSONResponse
+from fastapi.staticfiles import StaticFiles
 from kokoro import KPipeline
-import soundfile as sf
 import os
 import numpy as np
 import torch
 from huggingface_hub import InferenceClient
 from pydantic import BaseModel
 import base64
-from io import BytesIO
-from PIL import Image
 import logging
 from typing import Optional
+import uuid
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -118,14 +117,13 @@ class TextImageRequest(BaseModel):
     image_base64: Optional[str] = None
     voice: str = "af_heart"  # Default voice that we know exists
     speed: float = 1.0
-
+
     # List of known available voices - update this based on what's actually available
     AVAILABLE_VOICES = ["af_heart"]  # Add more voices as they become available
-
-    # Validate that the voice exists
+
     def validate_voice(self):
         if self.voice not in self.AVAILABLE_VOICES:
-            return "af_heart"  # Default to a voice we know exists
+            return "af_heart"  # Default to a known available voice
         return self.voice
 
 class AudioResponse(BaseModel):
@@ -143,8 +141,14 @@ app = FastAPI(
     version="1.0.0"
 )
 
+# Create and mount static images directory so images are accessible via URL
+STATIC_DIR = "static_images"
+if not os.path.exists(STATIC_DIR):
+    os.makedirs(STATIC_DIR)
+app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
+
 def llm_chat_response(text, image_base64=None):
-    """Function to get responses from LLM with text and optionally image input."""
+    """Get responses from LLM with text and optionally an image input."""
     try:
         HF_TOKEN = os.getenv("HF_TOKEN")
         logger.info("Checking HF_TOKEN...")
@@ -154,98 +158,92 @@ def llm_chat_response(text, image_base64=None):
 
         logger.info("Initializing InferenceClient...")
         client = InferenceClient(
-            provider="together",  # Using the provider shown in the sample
+            provider="hf-inference",  # Using correct provider as per sample
             api_key=HF_TOKEN
         )
 
-        try:
-            # IMPORTANT: Following exactly the format from the sample code
-            if image_base64:
-                logger.info("Processing request with image")
-                prompt = text if text else "Describe this image in one sentence."
-
-                messages = [
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "text",
-                                "text": prompt
-                            },
-                            {
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": f"data:image/jpeg;base64,{image_base64}"
-                                }
-                            }
-                        ]
-                    }
-                ]
-            else:
-                logger.info("Processing text-only request")
-                messages = [
-                    {
-                        "role": "user",
-                        "content": text + " Describe in one line only."
-                    }
-                ]
-
-            logger.info("Sending request to model...")
-            # Log the exact message structure we're sending
-            logger.info(f"Message structure: {messages}")
-
-            # Use the exact model name and parameters from the sample
-            completion = client.chat.completions.create(
-                model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-                messages=messages,
-                max_tokens=500
-            )
-
-            logger.info(f"Received response from model")
-
-            # Very simple response handling exactly like the sample code
-            logger.info(f"Model response received: {completion}")
-
+        if image_base64:
+            logger.info("Processing request with image")
+            # Save the base64 image to the static folder
+            filename = f"{uuid.uuid4()}.jpg"
+            image_path = os.path.join(STATIC_DIR, filename)
             try:
-                # Extract response using the exact approach from the sample code
-                response = completion.choices[0].message.content
-                logger.info(f"Extracted response content: {response}")
-                return response
+                image_data = base64.b64decode(image_base64)
             except Exception as e:
-                logger.error(f"Error extracting message content: {str(e)}")
-                logger.error(f"Attempting alternative extraction method...")
-
-                # Fallback method if the above fails
-                try:
-                    if hasattr(completion.choices[0], "message"):
-                        if hasattr(completion.choices[0].message, "content"):
-                            return completion.choices[0].message.content
-
-                    # Last resort - try accessing as dictionary
-                    return completion.choices[0]["message"]["content"]
-                except Exception as e2:
-                    logger.error(f"All extraction methods failed: {str(e2)}")
-                    return "I couldn't process that input. Please try again with a different query."
-
+                logger.error(f"Error decoding base64 image: {str(e)}")
+                raise HTTPException(status_code=400, detail="Invalid base64 image data")
+            with open(image_path, "wb") as f:
+                f.write(image_data)
+            # Construct image URL (assumes BASE_URL environment variable or defaults to localhost)
+            base_url = os.getenv("BASE_URL", "http://localhost:8000")
+            image_url = f"{base_url}/static/{filename}"
+            prompt = text if text else "Describe this image in one sentence."
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url
+                            }
+                        }
+                    ]
+                }
+            ]
+        else:
+            logger.info("Processing text-only request")
+            messages = [
+                {
+                    "role": "user",
+                    "content": text + " Describe in one line only."
+                }
+            ]
+
+        logger.info("Sending request to model...")
+        logger.info(f"Message structure: {messages}")
+
+        completion = client.chat.completions.create(
+            model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+            messages=messages,
+            max_tokens=500
+        )
+
+        logger.info("Received response from model")
+        logger.info(f"Model response received: {completion}")
+
+        try:
+            response = completion.choices[0].message.content
+            logger.info(f"Extracted response content: {response}")
+            return response
         except Exception as e:
-            logger.error(f"Error during model inference: {str(e)}")
-            # Fallback response in case of error
-            return "I couldn't process that input. Please try again with a different image or text query."
+            logger.error(f"Error extracting message content: {str(e)}")
+            try:
+                if hasattr(completion.choices[0], "message") and hasattr(completion.choices[0].message, "content"):
+                    return completion.choices[0].message.content
+                return completion.choices[0]["message"]["content"]
+            except Exception as e2:
+                logger.error(f"All extraction methods failed: {str(e2)}")
+                return "I couldn't process that input. Please try again with a different query."
 
     except Exception as e:
         logger.error(f"Error in llm_chat_response: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
-# Initialize pipeline once at startup
+# Initialize the audio generation pipeline once at startup
 try:
     logger.info("Initializing KPipeline...")
     pipeline = KPipeline(lang_code='a')
     logger.info("KPipeline initialized successfully")
 except Exception as e:
     logger.error(f"Failed to initialize KPipeline: {str(e)}")
-    # We'll let the app start anyway, but log the error
+    # The app starts regardless but logs the error
 
-@app.post("/generate", response_model=None, responses={
+@app.post("/generate", responses={
    200: {"content": {"application/octet-stream": {}}},
    400: {"model": ErrorResponse},
    500: {"model": ErrorResponse}
@@ -254,14 +252,12 @@ async def generate_audio(request: TextImageRequest):
     """
     Generate audio from text and optionally analyze an image.
 
-    - If text is provided, uses that as input
-    - If image is provided, analyzes the image
-    - Converts the LLM response to speech using the specified voice and speed
+    - If text is provided, it is used as input.
+    - If an image is provided (base64), it is saved and a URL is generated for processing.
+    - The LLM response is then converted to speech.
     """
     try:
-        logger.info(f"Received audio generation request")
-
-        # If no text is provided but image is provided, use default prompt
+        logger.info("Received audio generation request")
         user_text = request.text if request.text is not None else ""
         if not user_text and request.image_base64:
             user_text = "Describe what you see in the image"
@@ -272,17 +268,14 @@ async def generate_audio(request: TextImageRequest):
                 content={"error": "Request must include either text or image_base64"}
             )
 
-        # Generate response using text and image if provided
         logger.info("Getting LLM response...")
         text_reply = llm_chat_response(user_text, request.image_base64)
         logger.info(f"LLM response: {text_reply}")
 
-        # Validate voice parameter
         validated_voice = request.validate_voice()
         if validated_voice != request.voice:
             logger.warning(f"Requested voice '{request.voice}' not available, using '{validated_voice}' instead")
 
-        # Generate audio
         logger.info(f"Generating audio using voice={validated_voice}, speed={request.speed}")
         try:
             generator = pipeline(
@@ -292,28 +285,20 @@
                 split_pattern=r'\n+'
             )
 
-            # Process only the first segment for demo
             for i, (gs, ps, audio) in enumerate(generator):
                 logger.info(f"Audio generated successfully: segment {i}")
-
                 # Convert PyTorch tensor to NumPy array
                 audio_numpy = audio.cpu().numpy()
-
-                # Convert to 16-bit PCM
-                # Ensure the audio is in the range [-1, 1]
+                # Clip values to range [-1, 1] and convert to 16-bit PCM
                 audio_numpy = np.clip(audio_numpy, -1, 1)
-                # Convert to 16-bit signed integers
                 pcm_data = (audio_numpy * 32767).astype(np.int16)
-
-                # Convert to bytes (automatically uses row-major order)
                 raw_audio = pcm_data.tobytes()
 
-                # Return PCM data with minimal necessary headers
                 return Response(
                     content=raw_audio,
                     media_type="application/octet-stream",
                     headers={
-                        "Content-Disposition": f'attachment; filename="output.pcm"',
+                        "Content-Disposition": 'attachment; filename="output.pcm"',
                         "X-Sample-Rate": "24000",
                        "X-Bits-Per-Sample": "16",
                        "X-Endianness": "little"
@@ -342,18 +327,18 @@
 
 @app.get("/")
 async def root():
-    return {"message": "Welcome to the Text-to-Speech API with Vision Support. Use POST /generate endpoint with 'text' and optionally 'image_base64' for queries."}
+    return {"message": "Welcome to the Text-to-Speech API with Vision Support. Use POST /generate with 'text' and optionally 'image_base64' for queries."}
 
 @app.exception_handler(404)
-async def not_found_handler(request, exc):
+async def not_found_handler(request: Request, exc):
     return JSONResponse(
         status_code=404,
         content={"error": "Endpoint not found. Please use POST /generate for queries."}
     )
 
 @app.exception_handler(405)
-async def method_not_allowed_handler(request, exc):
+async def method_not_allowed_handler(request: Request, exc):
     return JSONResponse(
         status_code=405,
         content={"error": "Method not allowed. Please check the API documentation."}
-)
+    )
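
For reference, a minimal client sketch for the /generate endpoint this commit touches. It is not part of the commit: the host URL is an assumption (a local dev server), and the raw PCM reply is wrapped in a WAV container using the sample rate, bit depth, and endianness the endpoint advertises in its response headers; mono output is assumed since the pipeline yields a one-dimensional audio array.

# Client sketch (not from the commit). Assumptions: server at localhost:8000, mono output.
import wave

import requests

SERVER = "http://localhost:8000"  # assumed local dev server

resp = requests.post(
    f"{SERVER}/generate",
    json={"text": "What is the capital of France?", "voice": "af_heart", "speed": 1.0},
)
resp.raise_for_status()

# Wrap the raw 16-bit little-endian PCM in a WAV container so players can open it.
sample_rate = int(resp.headers.get("X-Sample-Rate", "24000"))
with wave.open("output.wav", "wb") as wav:
    wav.setnchannels(1)            # assumption: mono (pipeline yields a 1-D array)
    wav.setsampwidth(2)            # 16-bit, per X-Bits-Per-Sample
    wav.setframerate(sample_rate)  # 24000 Hz, per X-Sample-Rate
    wav.writeframes(resp.content)  # WAV frames are little-endian, matching X-Endianness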
 
94
 
95
  # return Response("No audio generated", status_code=400)
96
 
97
+ from fastapi import FastAPI, Response, HTTPException, Request
98
+ from fastapi.responses import JSONResponse
99
+ from fastapi.staticfiles import StaticFiles
100
  from kokoro import KPipeline
 
101
  import os
102
  import numpy as np
103
  import torch
104
  from huggingface_hub import InferenceClient
105
  from pydantic import BaseModel
106
  import base64
 
 
107
  import logging
108
  from typing import Optional
109
+ import uuid
110
 
111
  # Set up logging
112
  logging.basicConfig(level=logging.INFO)
 
117
  image_base64: Optional[str] = None
118
  voice: str = "af_heart" # Default voice that we know exists
119
  speed: float = 1.0
120
+
121
  # List of known available voices - update this based on what's actually available
122
  AVAILABLE_VOICES = ["af_heart"] # Add more voices as they become available
123
+
 
124
  def validate_voice(self):
125
  if self.voice not in self.AVAILABLE_VOICES:
126
+ return "af_heart" # Default to a known available voice
127
  return self.voice
128
 
129
  class AudioResponse(BaseModel):
 
141
  version="1.0.0"
142
  )
143
 
144
+ # Create and mount static images directory so images are accessible via URL
145
+ STATIC_DIR = "static_images"
146
+ if not os.path.exists(STATIC_DIR):
147
+ os.makedirs(STATIC_DIR)
148
+ app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
149
+
150
  def llm_chat_response(text, image_base64=None):
151
+ """Get responses from LLM with text and optionally an image input."""
152
  try:
153
  HF_TOKEN = os.getenv("HF_TOKEN")
154
  logger.info("Checking HF_TOKEN...")
 
158
 
159
  logger.info("Initializing InferenceClient...")
160
  client = InferenceClient(
161
+ provider="hf-inference", # Using correct provider as per sample
162
  api_key=HF_TOKEN
163
  )
164
 
165
+ if image_base64:
166
+ logger.info("Processing request with image")
167
+ # Save the base64 image to the static folder
168
+ filename = f"{uuid.uuid4()}.jpg"
169
+ image_path = os.path.join(STATIC_DIR, filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  try:
171
+ image_data = base64.b64decode(image_base64)
 
 
 
172
  except Exception as e:
173
+ logger.error(f"Error decoding base64 image: {str(e)}")
174
+ raise HTTPException(status_code=400, detail="Invalid base64 image data")
175
+ with open(image_path, "wb") as f:
176
+ f.write(image_data)
177
+ # Construct image URL (assumes BASE_URL environment variable or defaults to localhost)
178
+ base_url = os.getenv("BASE_URL", "http://localhost:8000")
179
+ image_url = f"{base_url}/static/{filename}"
180
+ prompt = text if text else "Describe this image in one sentence."
181
+ messages = [
182
+ {
183
+ "role": "user",
184
+ "content": [
185
+ {
186
+ "type": "text",
187
+ "text": prompt
188
+ },
189
+ {
190
+ "type": "image_url",
191
+ "image_url": {
192
+ "url": image_url
193
+ }
194
+ }
195
+ ]
196
+ }
197
+ ]
198
+ else:
199
+ logger.info("Processing text-only request")
200
+ messages = [
201
+ {
202
+ "role": "user",
203
+ "content": text + " Describe in one line only."
204
+ }
205
+ ]
206
+
207
+ logger.info("Sending request to model...")
208
+ logger.info(f"Message structure: {messages}")
209
+
210
+ completion = client.chat.completions.create(
211
+ model="meta-llama/Llama-3.2-11B-Vision-Instruct",
212
+ messages=messages,
213
+ max_tokens=500
214
+ )
215
+
216
+ logger.info("Received response from model")
217
+ logger.info(f"Model response received: {completion}")
218
+
219
+ try:
220
+ response = completion.choices[0].message.content
221
+ logger.info(f"Extracted response content: {response}")
222
+ return response
223
  except Exception as e:
224
+ logger.error(f"Error extracting message content: {str(e)}")
225
+ try:
226
+ if hasattr(completion.choices[0], "message") and hasattr(completion.choices[0].message, "content"):
227
+ return completion.choices[0].message.content
228
+ return completion.choices[0]["message"]["content"]
229
+ except Exception as e2:
230
+ logger.error(f"All extraction methods failed: {str(e2)}")
231
+ return "I couldn't process that input. Please try again with a different query."
232
 
233
  except Exception as e:
234
  logger.error(f"Error in llm_chat_response: {str(e)}")
235
  raise HTTPException(status_code=500, detail=str(e))
236
 
237
+ # Initialize the audio generation pipeline once at startup
238
  try:
239
  logger.info("Initializing KPipeline...")
240
  pipeline = KPipeline(lang_code='a')
241
  logger.info("KPipeline initialized successfully")
242
  except Exception as e:
243
  logger.error(f"Failed to initialize KPipeline: {str(e)}")
244
+ # The app starts regardless but logs the error
245
 
246
+ @app.post("/generate", responses={
247
  200: {"content": {"application/octet-stream": {}}},
248
  400: {"model": ErrorResponse},
249
  500: {"model": ErrorResponse}
 
252
  """
253
  Generate audio from text and optionally analyze an image.
254
 
255
+ - If text is provided, it is used as input.
256
+ - If an image is provided (base64), it is saved and a URL is generated for processing.
257
+ - The LLM response is then converted to speech.
258
  """
259
  try:
260
+ logger.info("Received audio generation request")
 
 
261
  user_text = request.text if request.text is not None else ""
262
  if not user_text and request.image_base64:
263
  user_text = "Describe what you see in the image"
 
268
  content={"error": "Request must include either text or image_base64"}
269
  )
270
 
 
271
  logger.info("Getting LLM response...")
272
  text_reply = llm_chat_response(user_text, request.image_base64)
273
  logger.info(f"LLM response: {text_reply}")
274
 
 
275
  validated_voice = request.validate_voice()
276
  if validated_voice != request.voice:
277
  logger.warning(f"Requested voice '{request.voice}' not available, using '{validated_voice}' instead")
278
 
 
279
  logger.info(f"Generating audio using voice={validated_voice}, speed={request.speed}")
280
  try:
281
  generator = pipeline(
 
285
  split_pattern=r'\n+'
286
  )
287
 
 
288
  for i, (gs, ps, audio) in enumerate(generator):
289
  logger.info(f"Audio generated successfully: segment {i}")
 
290
  # Convert PyTorch tensor to NumPy array
291
  audio_numpy = audio.cpu().numpy()
292
+ # Clip values to range [-1, 1] and convert to 16-bit PCM
 
 
293
  audio_numpy = np.clip(audio_numpy, -1, 1)
 
294
  pcm_data = (audio_numpy * 32767).astype(np.int16)
 
 
295
  raw_audio = pcm_data.tobytes()
296
 
 
297
  return Response(
298
  content=raw_audio,
299
  media_type="application/octet-stream",
300
  headers={
301
+ "Content-Disposition": 'attachment; filename="output.pcm"',
302
  "X-Sample-Rate": "24000",
303
  "X-Bits-Per-Sample": "16",
304
  "X-Endianness": "little"
 
327
 
328
  @app.get("/")
329
  async def root():
330
+ return {"message": "Welcome to the Text-to-Speech API with Vision Support. Use POST /generate with 'text' and optionally 'image_base64' for queries."}
331
 
332
  @app.exception_handler(404)
333
+ async def not_found_handler(request: Request, exc):
334
  return JSONResponse(
335
  status_code=404,
336
  content={"error": "Endpoint not found. Please use POST /generate for queries."}
337
  )
338
 
339
  @app.exception_handler(405)
340
+ async def method_not_allowed_handler(request: Request, exc):
341
  return JSONResponse(
342
  status_code=405,
343
  content={"error": "Method not allowed. Please check the API documentation."}
344
+ )