khurrameycon committed
Commit 2bb03a8 · verified · 1 Parent(s): a1a0caf

Update app.py

Files changed (1): app.py +209 -78
app.py CHANGED
@@ -95,9 +95,8 @@
 # return Response("No audio generated", status_code=400)
 
 
-
-from fastapi import FastAPI, Response
-from fastapi.responses import FileResponse
+from fastapi import FastAPI, Response, HTTPException
+from fastapi.responses import FileResponse, JSONResponse
 from kokoro import KPipeline
 import soundfile as sf
 import os
@@ -108,98 +107,230 @@ from pydantic import BaseModel
 import base64
 from io import BytesIO
 from PIL import Image
+import logging
+from typing import Optional
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 class TextImageRequest(BaseModel):
-    text: str = None
-    image_base64: str = None
+    text: Optional[str] = None
+    image_base64: Optional[str] = None
     voice: str = "af_heart"
     speed: float = 1.0
 
+class AudioResponse(BaseModel):
+    status: str
+    message: str
+
+class ErrorResponse(BaseModel):
+    error: str
+    detail: Optional[str] = None
+
+# Initialize FastAPI app
+app = FastAPI(
+    title="Text-to-Speech API with Vision Support",
+    description="API for generating speech from text with optional image analysis",
+    version="1.0.0"
+)
+
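Note: with text and image_base64 both Optional now, a request needs only one of the two. For reference, a minimal sketch of a request body matching the updated TextImageRequest model (values are illustrative; voice and speed show the model defaults):

    # Illustrative /generate payload; send either "text" or "image_base64".
    payload = {
        "text": "What does Kokoro sound like?",  # hypothetical prompt
        "voice": "af_heart",  # model default
        "speed": 1.0,         # model default
    }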
 def llm_chat_response(text, image_base64=None):
-    HF_TOKEN = os.getenv("HF_TOKEN")
-    client = InferenceClient(api_key=HF_TOKEN)
-
-    # For image + text requests, we need to use the conversational format
-    # with proper message structure
-    system_message = "You are a helpful assistant that provides concise responses."
-
+    """Function to get responses from LLM with text and optionally image input."""
     try:
-        if image_base64:
-            messages = [
-                {"role": "system", "content": system_message},
-                {"role": "user", "content": [
-                    {"type": "text", "text": text if text else "Describe what you see in the image in one line only"},
-                    {"type": "image", "source": {"data": f"data:image/jpeg;base64,{image_base64}"}}
-                ]}
-            ]
-        else:
-            messages = [
-                {"role": "system", "content": system_message},
-                {"role": "user", "content": text + " Describe in one line only."}
-            ]
+        HF_TOKEN = os.getenv("HF_TOKEN")
+        logger.info("Checking HF_TOKEN...")
+        if not HF_TOKEN:
+            logger.error("HF_TOKEN not found in environment variables")
+            raise HTTPException(status_code=500, detail="HF_TOKEN not configured")
 
-        # Call the API
-        response_from_llama = client.chat.completions.create(
-            model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-            messages=messages,
-            max_tokens=500
+        logger.info("Initializing InferenceClient...")
+        client = InferenceClient(
+            provider="sambanova",  # Specify provider if needed
+            api_key=HF_TOKEN
         )
 
-        return response_from_llama.choices[0].message['content']
+        # System message for better context
+        system_message = "You are a helpful assistant that provides concise responses."
+
+        try:
+            if image_base64:
+                logger.info("Processing request with image")
+                messages = [
+                    {"role": "system", "content": system_message},
+                    {"role": "user", "content": [
+                        {"type": "text", "text": text if text else "Describe what you see in the image in one line only"},
+                        {"type": "image", "source": {"data": f"data:image/jpeg;base64,{image_base64}"}}
+                    ]}
+                ]
+            else:
+                logger.info("Processing text-only request")
+                messages = [
+                    {"role": "system", "content": system_message},
+                    {"role": "user", "content": text + " Describe in one line only."}
+                ]
+
+            logger.info("Sending request to model...")
+            completion = client.chat.completions.create(
+                model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+                messages=messages,
+                max_tokens=500
+            )
+
+            logger.info("Received response from model")
+
+            # Handle potential different response formats
+            if not completion.choices or len(completion.choices) == 0:
+                logger.error("No choices returned from model.")
+                raise HTTPException(status_code=500, detail="Model returned no choices.")
+
+            # Extract the response message from the first choice
+            choice = completion.choices[0]
+            response_message = None
+
+            if hasattr(choice, "message"):
+                response_message = choice.message
+            elif isinstance(choice, dict):
+                response_message = choice.get("message")
+
+            if not response_message:
+                logger.error(f"Response message is empty: {choice}")
+                raise HTTPException(status_code=500, detail="Model response did not include a message.")
+
+            content = None
+            if isinstance(response_message, dict):
+                content = response_message.get("content")
+            if content is None and hasattr(response_message, "content"):
+                content = response_message.content
+
+            if not content:
+                logger.error(f"Message content is missing: {response_message}")
+                raise HTTPException(status_code=500, detail="Model message did not include content.")
+
+            return content
+
+        except Exception as e:
+            logger.error(f"Error during model inference: {str(e)}")
+            # Fallback response in case of error
+            return "I couldn't process that input. Please try again with a different image or text query."
+
     except Exception as e:
-        print(f"Error calling LLM API: {e}")
-        # Fallback response in case of error
-        return "I couldn't process that input. Please try again with a different image or text query."
+        logger.error(f"Error in llm_chat_response: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
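Note: the user message embeds the image as a data: URL, so clients should send image_base64 as a bare base64 string without the data:image/jpeg;base64, prefix (the server adds it). A minimal sketch of preparing that field, assuming a JPEG at an illustrative path:

    import base64

    # Encode the raw JPEG bytes; the server prepends the data-URL prefix.
    with open("photo.jpg", "rb") as f:
        image_base64 = base64.b64encode(f.read()).decode("utf-8")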
 
-app = FastAPI()
 # Initialize pipeline once at startup
-pipeline = KPipeline(lang_code='a')
+try:
+    logger.info("Initializing KPipeline...")
+    pipeline = KPipeline(lang_code='a')
+    logger.info("KPipeline initialized successfully")
+except Exception as e:
+    logger.error(f"Failed to initialize KPipeline: {str(e)}")
+    # We'll let the app start anyway, but log the error
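Note: as written, a failed KPipeline() init leaves the name pipeline unbound, so the first /generate call raises a NameError that only surfaces through the generic 500 handler. A defensive sketch, not part of this commit, that keeps the name bound:

    # Bind pipeline up front so a failed init yields None rather than NameError.
    pipeline = None
    try:
        logger.info("Initializing KPipeline...")
        pipeline = KPipeline(lang_code='a')
    except Exception as e:
        logger.error(f"Failed to initialize KPipeline: {str(e)}")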
 
-@app.post("/generate")
+@app.post("/generate", response_model=None, responses={
+    200: {"content": {"application/octet-stream": {}}},
+    400: {"model": ErrorResponse},
+    500: {"model": ErrorResponse}
+})
 async def generate_audio(request: TextImageRequest):
-    # If no text is provided but image is provided, use default prompt
-    user_text = request.text
-    if user_text is None and request.image_base64:
-        user_text = "Describe what you see in the image"
-    elif user_text is None:
-        user_text = ""
-
-    # Generate response using text and image if provided
-    text_reply = llm_chat_response(user_text, request.image_base64)
-
-    # Generate audio
-    generator = pipeline(
-        text_reply,
-        voice=request.voice,
-        speed=request.speed,
-        split_pattern=r'\n+'
-    )
-
-    # Process only the first segment for demo
-    for i, (gs, ps, audio) in enumerate(generator):
-        # Convert PyTorch tensor to NumPy array
-        audio_numpy = audio.cpu().numpy()
-
-        # Convert to 16-bit PCM
-        # Ensure the audio is in the range [-1, 1]
-        audio_numpy = np.clip(audio_numpy, -1, 1)
-
-        # Convert to 16-bit signed integers
-        pcm_data = (audio_numpy * 32767).astype(np.int16)
-
-        # Convert to bytes (automatically uses row-major order)
-        raw_audio = pcm_data.tobytes()
-
-        # Return PCM data with minimal necessary headers
-        return Response(
-            content=raw_audio,
-            media_type="application/octet-stream",
-            headers={
-                "Content-Disposition": f'attachment; filename="output.pcm"',
-                "X-Sample-Rate": "24000",
-                "X-Bits-Per-Sample": "16",
-                "X-Endianness": "little"
-            }
-        )
-
-    return Response("No audio generated", status_code=400)
+    """
+    Generate audio from text and optionally analyze an image.
+
+    - If text is provided, uses that as input
+    - If image is provided, analyzes the image
+    - Converts the LLM response to speech using the specified voice and speed
+    """
+    try:
+        logger.info("Received audio generation request")
+
+        # If no text is provided but image is provided, use default prompt
+        user_text = request.text if request.text is not None else ""
+        if not user_text and request.image_base64:
+            user_text = "Describe what you see in the image"
+        elif not user_text and not request.image_base64:
+            logger.error("Neither text nor image provided in request")
+            return JSONResponse(
+                status_code=400,
+                content={"error": "Request must include either text or image_base64"}
+            )
+
+        # Generate response using text and image if provided
+        logger.info("Getting LLM response...")
+        text_reply = llm_chat_response(user_text, request.image_base64)
+        logger.info(f"LLM response: {text_reply}")
+
+        # Generate audio
+        logger.info(f"Generating audio using voice={request.voice}, speed={request.speed}")
+        try:
+            generator = pipeline(
+                text_reply,
+                voice=request.voice,
+                speed=request.speed,
+                split_pattern=r'\n+'
+            )
+
+            # Process only the first segment for demo
+            for i, (gs, ps, audio) in enumerate(generator):
+                logger.info(f"Audio generated successfully: segment {i}")
+
+                # Convert PyTorch tensor to NumPy array
+                audio_numpy = audio.cpu().numpy()
+
+                # Convert to 16-bit PCM
+                # Ensure the audio is in the range [-1, 1]
+                audio_numpy = np.clip(audio_numpy, -1, 1)
+                # Convert to 16-bit signed integers
+                pcm_data = (audio_numpy * 32767).astype(np.int16)
+
+                # Convert to bytes (automatically uses row-major order)
+                raw_audio = pcm_data.tobytes()
+
+                # Return PCM data with minimal necessary headers
+                return Response(
+                    content=raw_audio,
+                    media_type="application/octet-stream",
+                    headers={
+                        "Content-Disposition": f'attachment; filename="output.pcm"',
+                        "X-Sample-Rate": "24000",
+                        "X-Bits-Per-Sample": "16",
+                        "X-Endianness": "little"
+                    }
+                )
+
+            logger.error("No audio segments generated")
+            return JSONResponse(
+                status_code=400,
+                content={"error": "No audio generated", "detail": "The pipeline did not produce any audio"}
+            )
+
+        except Exception as e:
+            logger.error(f"Error generating audio: {str(e)}")
+            return JSONResponse(
+                status_code=500,
+                content={"error": "Audio generation failed", "detail": str(e)}
+            )
+
+    except Exception as e:
+        logger.error(f"Unexpected error in generate_audio endpoint: {str(e)}")
+        return JSONResponse(
+            status_code=500,
+            content={"error": "Internal server error", "detail": str(e)}
+        )
+
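Note: the endpoint returns headerless PCM, so most players won't open the body as-is; the X-* headers carry what a client needs to wrap it. A round-trip sketch using the advertised parameters (24 kHz, 16-bit, little-endian; mono is an assumption since channel count isn't advertised, and the host URL is illustrative):

    import wave
    import requests

    # Illustrative host; payload fields mirror TextImageRequest.
    resp = requests.post(
        "http://localhost:8000/generate",
        json={"text": "Say hello", "voice": "af_heart", "speed": 1.0},
    )
    resp.raise_for_status()

    # Wrap the raw PCM in a WAV container using the response headers.
    rate = int(resp.headers["X-Sample-Rate"])            # 24000
    width = int(resp.headers["X-Bits-Per-Sample"]) // 8  # 16 bits -> 2 bytes
    with wave.open("output.wav", "wb") as wav:
        wav.setnchannels(1)  # mono assumed; not advertised by the server
        wav.setsampwidth(width)
        wav.setframerate(rate)
        wav.writeframes(resp.content)  # wave writes little-endian PCM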
+@app.get("/")
+async def root():
+    return {"message": "Welcome to the Text-to-Speech API with Vision Support. Use POST /generate endpoint with 'text' and optionally 'image_base64' for queries."}
+
+@app.exception_handler(404)
+async def not_found_handler(request, exc):
+    return JSONResponse(
+        status_code=404,
+        content={"error": "Endpoint not found. Please use POST /generate for queries."}
+    )
+
+@app.exception_handler(405)
+async def method_not_allowed_handler(request, exc):
+    return JSONResponse(
+        status_code=405,
+        content={"error": "Method not allowed. Please check the API documentation."}
+    )