khurrameycon committed
Commit 313833e · verified · 1 Parent(s): 4196bc7

Update app.py

Files changed (1)
  1. app.py  +28 -35
app.py CHANGED
@@ -106,7 +106,7 @@ from typing import Optional, ClassVar, List
 from huggingface_hub import InferenceClient
 import numpy as np
 import torch
-from kokoro import KPipeline  # Assuming you have this pipeline for audio generation
+from kokoro import KPipeline  # Your audio generation pipeline
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -115,7 +115,7 @@ logger = logging.getLogger(__name__)
 # Create FastAPI app
 app = FastAPI(
     title="Text-to-Speech API with Vision Support",
-    description="This API uses meta-llama/Llama-3.2-11B-Vision-Instruct, which requires an image input.",
+    description="This API uses meta-llama/Llama-3.2-11B-Vision-Instruct which requires an image input.",
     version="1.0.0"
 )
 
@@ -140,17 +140,12 @@ class TextImageRequest(BaseModel):
             return "af_heart"
         return self.voice
 
-# (Optional) Pydantic models for responses
-class AudioResponse(BaseModel):
-    status: str
-    message: str
-
+# Pydantic model for error responses
 class ErrorResponse(BaseModel):
     error: str
     detail: Optional[str] = None
 
-# Function to call the LLM model following the reference code exactly
-def llm_chat_response(text: str, image_base64: str) -> str:
+def llm_chat_response(prompt: str, image_base64: str) -> str:
     HF_TOKEN = os.getenv("HF_TOKEN")
     logger.info("Checking HF_TOKEN...")
     if not HF_TOKEN:
@@ -163,7 +158,7 @@ def llm_chat_response(text: str, image_base64: str) -> str:
         api_key=HF_TOKEN
     )
 
-    # Save the base64-encoded image locally so it is accessible via a URL
+    # Save the base64-encoded image locally
     filename = f"{uuid.uuid4()}.jpg"
     image_path = os.path.join(STATIC_DIR, filename)
     try:
@@ -176,19 +171,25 @@ def llm_chat_response(text: str, image_base64: str) -> str:
         f.write(image_data)
 
     # Construct the public URL for the saved image.
-    # BASE_URL should be set to your public URL if not running locally.
+    # Set BASE_URL to your public URL if needed.
     base_url = os.getenv("BASE_URL", "http://localhost:8000")
     image_url = f"{base_url}/static/{filename}"
 
-    # Build the message exactly as in the reference code.
-    # This model requires a list with two items: one for text and one for the image.
-    prompt = text if text else "Describe this image in one sentence."
+    # Build the message payload exactly as in the reference:
     messages = [
         {
             "role": "user",
             "content": [
-                {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}}
+                {
+                    "type": "text",
+                    "text": prompt
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                }
             ]
         }
     ]
@@ -198,7 +199,7 @@ def llm_chat_response(text: str, image_base64: str) -> str:
         completion = client.chat.completions.create(
             model="meta-llama/Llama-3.2-11B-Vision-Instruct",
             messages=messages,
-            max_tokens=500
+            max_tokens=500,
         )
         response = completion.choices[0].message.content
         logger.info(f"Extracted response: {response}")
@@ -207,14 +208,14 @@ def llm_chat_response(text: str, image_base64: str) -> str:
         logger.error(f"Error during model inference: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
-# Initialize audio generation pipeline (your audio conversion pipeline)
+# Initialize the audio generation pipeline (KPipeline)
 try:
     logger.info("Initializing KPipeline...")
     pipeline = KPipeline(lang_code='a')
     logger.info("KPipeline initialized successfully")
 except Exception as e:
     logger.error(f"Failed to initialize KPipeline: {str(e)}")
-    # The API can still run, but audio generation will fail.
+    # The API will run but audio generation will fail if the pipeline is not ready.
 
 @app.post("/generate", responses={
     200: {"content": {"application/octet-stream": {}}},
@@ -224,44 +225,37 @@ except Exception as e:
 async def generate_audio(request: TextImageRequest):
     """
     Generate audio from a multimodal (text+image) input.
-    This model does not support text-only inputs.
+    This model requires an image input.
     """
     logger.info("Received generation request")
-    # Ensure an image is provided because the model is multimodal.
+
+    # The model requires an image; if missing, return an error.
     if not request.image_base64:
         raise HTTPException(status_code=400, detail="This model requires an image input.")
 
-    # Get the text prompt. If none is provided, use a default.
-    user_text = request.text if request.text else "Describe this image in one sentence."
-
-    # Get the LLM's response
+    prompt = request.text if request.text else "Describe this image in one sentence."
     logger.info("Calling the LLM model")
-    text_reply = llm_chat_response(user_text, request.image_base64)
+    text_reply = llm_chat_response(prompt, request.image_base64)
     logger.info(f"LLM response: {text_reply}")
 
-    # Validate voice parameter (if needed for audio generation)
    validated_voice = request.validate_voice()
    if validated_voice != request.voice:
        logger.warning(f"Voice '{request.voice}' not available; using '{validated_voice}' instead")
 
-    # Convert the text reply to audio using your audio pipeline
+    # Convert the text reply to audio using the KPipeline.
     logger.info(f"Generating audio using voice={validated_voice}, speed={request.speed}")
     try:
-        # Generate audio segments (assumes pipeline yields segments)
         generator = pipeline(
             text_reply,
             voice=validated_voice,
             speed=request.speed,
             split_pattern=r'\n+'
         )
-        for i, (gs, ps, audio) in enumerate(generator):
-            logger.info(f"Audio generated, segment {i}")
-            # Convert audio tensor to 16-bit PCM bytes
+        for _, _, audio in generator:
             audio_numpy = audio.cpu().numpy()
             audio_numpy = np.clip(audio_numpy, -1, 1)
             pcm_data = (audio_numpy * 32767).astype(np.int16)
             raw_audio = pcm_data.tobytes()
-
             return Response(
                 content=raw_audio,
                 media_type="application/octet-stream",
@@ -279,7 +273,7 @@ async def generate_audio(request: TextImageRequest):
 
 @app.get("/")
 async def root():
-    return {"message": "Welcome! Use POST /generate with text and image_base64."}
+    return {"message": "Welcome to the Text-to-Speech API with Vision Support. Use POST /generate with text and image_base64."}
 
 @app.exception_handler(404)
 async def not_found_handler(request: Request, exc):
@@ -288,4 +282,3 @@ async def not_found_handler(request: Request, exc):
 @app.exception_handler(405)
 async def method_not_allowed_handler(request: Request, exc):
     return JSONResponse(status_code=405, content={"error": "Method not allowed."})
-
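Note: llm_chat_response writes the uploaded image into STATIC_DIR and passes the model a URL of the form {base_url}/static/{filename}, so app.py also needs a static mount that is not part of this diff. A minimal sketch of that setup follows; the directory name "static" and the mount path are assumptions inferred from the URL built above, not code from this commit.

# Minimal sketch of the static mount implied by image_url = f"{base_url}/static/{filename}".
# The actual STATIC_DIR value is not shown in this diff; "static" is an assumption.
import os
from fastapi.staticfiles import StaticFiles

STATIC_DIR = "static"
os.makedirs(STATIC_DIR, exist_ok=True)

# `app` is the FastAPI instance created earlier in app.py.
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")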
 
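For completeness, here is a hedged example of calling the updated endpoint from a client. The request fields (text, image_base64, voice, speed) and the raw 16-bit PCM response body come from the handler above; the server URL, the example image path, and the 24 kHz mono format used to wrap the bytes into a WAV file are assumptions, so check them against your deployment and your Kokoro pipeline settings.

# Example client for POST /generate -- a sketch, not part of this commit.
# Assumptions: server at http://localhost:8000, an image at example.jpg,
# and 24000 Hz mono 16-bit output from the Kokoro pipeline (verify for your setup).
import base64
import wave

import requests

with open("example.jpg", "rb") as f:
    image_base64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "text": "Describe this image in one sentence.",
    "image_base64": image_base64,
    "voice": "af_heart",
    "speed": 1.0,
}

resp = requests.post("http://localhost:8000/generate", json=payload, timeout=120)
resp.raise_for_status()

# The endpoint returns raw 16-bit PCM bytes; wrap them in a WAV container for playback.
with wave.open("reply.wav", "wb") as wav_file:
    wav_file.setnchannels(1)      # mono
    wav_file.setsampwidth(2)      # 16-bit samples
    wav_file.setframerate(24000)  # assumed Kokoro sample rate
    wav_file.writeframes(resp.content)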