TTS_API_Image_fallback

Sleeping

App Files Files Community

khurrameycon committed on Apr 5

Commit

c034a74

verified ·

1 Parent(s): e1bc235

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -42

app.py CHANGED Viewed

@@ -1,79 +1,196 @@
 from fastapi import FastAPI, Response
 from fastapi.responses import FileResponse
 from kokoro import KPipeline
 import soundfile as sf
 import os
 import numpy as np
-import torch
 from huggingface_hub import InferenceClient
-def llm_chat_response(text):
     HF_TOKEN = os.getenv("HF_TOKEN")
     client = InferenceClient(api_key=HF_TOKEN)
     messages = [
-	{
-		"role": "user",
-		"content": [
-			{
-				"type": "text",
-				"text": text + str('describe in one line only')
-			} #,
-			# {
-			# 	"type": "image_url",
-			# 	"image_url": {
-			# 		"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
-			# 	}
-			# }
-            ]
-	}
     ]
     response_from_llama = client.chat.completions.create(
-    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-	messages=messages,
-	max_tokens=500)
     return response_from_llama.choices[0].message['content']
 app = FastAPI()
 # Initialize pipeline once at startup
 pipeline = KPipeline(lang_code='a')
 @app.post("/generate")
-async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
-    text_reply = llm_chat_response(text)
     # Generate audio
     generator = pipeline(
         text_reply,
-        voice=voice,
-        speed=speed,
         split_pattern=r'\n+'
     )
-    # # Save first segment only for demo
-    # for i, (gs, ps, audio) in enumerate(generator):
-    #     sf.write(f"output_{i}.wav", audio, 24000)
-    #     return FileResponse(
-    #         f"output_{i}.wav",
-    #         media_type="audio/wav",
-    #         filename="output.wav"
-    #     )
-    # return Response("No audio generated", status_code=400)
     # Process only the first segment for demo
     for i, (gs, ps, audio) in enumerate(generator):
         # Convert PyTorch tensor to NumPy array
         audio_numpy = audio.cpu().numpy()
-        # Convert to 16-bit PCM
         # Ensure the audio is in the range [-1, 1]
         audio_numpy = np.clip(audio_numpy, -1, 1)
         # Convert to 16-bit signed integers
         pcm_data = (audio_numpy * 32767).astype(np.int16)

+# from fastapi import FastAPI, Response
+# from fastapi.responses import FileResponse
+# from kokoro import KPipeline
+# import soundfile as sf
+# import os
+# import numpy as np
+# import torch
+# from huggingface_hub import InferenceClient
+# def llm_chat_response(text):
+#     HF_TOKEN = os.getenv("HF_TOKEN")
+#     client = InferenceClient(api_key=HF_TOKEN)
+#     messages = [
+# 	{
+# 		"role": "user",
+# 		"content": [
+# 			{
+# 				"type": "text",
+# 				"text": text + str('describe in one line only')
+# 			} #,
+# 			# {
+# 			# 	"type": "image_url",
+# 			# 	"image_url": {
+# 			# 		"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+# 			# 	}
+# 			# }
+#             ]
+# 	}
+#     ]
+#     response_from_llama = client.chat.completions.create(
+#     model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+# 	messages=messages,
+# 	max_tokens=500)
+#     return response_from_llama.choices[0].message['content']
+# app = FastAPI()
+# # Initialize pipeline once at startup
+# pipeline = KPipeline(lang_code='a')
+# @app.post("/generate")
+# async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
+#     text_reply = llm_chat_response(text)
+#     # Generate audio
+#     generator = pipeline(
+#         text_reply,
+#         voice=voice,
+#         speed=speed,
+#         split_pattern=r'\n+'
+#     )
+#     # # Save first segment only for demo
+#     # for i, (gs, ps, audio) in enumerate(generator):
+#     #     sf.write(f"output_{i}.wav", audio, 24000)
+#     #     return FileResponse(
+#     #         f"output_{i}.wav",
+#     #         media_type="audio/wav",
+#     #         filename="output.wav"
+#     #     )
+#     # return Response("No audio generated", status_code=400)
+#     # Process only the first segment for demo
+#     for i, (gs, ps, audio) in enumerate(generator):
+#         # Convert PyTorch tensor to NumPy array
+#         audio_numpy = audio.cpu().numpy()
+#         # Convert to 16-bit PCM
+#         # Ensure the audio is in the range [-1, 1]
+#         audio_numpy = np.clip(audio_numpy, -1, 1)
+#         # Convert to 16-bit signed integers
+#         pcm_data = (audio_numpy * 32767).astype(np.int16)
+#         # Convert to bytes (automatically uses row-major order)
+#         raw_audio = pcm_data.tobytes()
+#         # Return PCM data with minimal necessary headers
+#         return Response(
+#             content=raw_audio,
+#             media_type="application/octet-stream",
+#             headers={
+#                 "Content-Disposition": f'attachment; filename="output.pcm"',
+#                 "X-Sample-Rate": "24000",
+#                 "X-Bits-Per-Sample": "16",
+#                 "X-Endianness": "little"
+#             }
+#         )
+#     return Response("No audio generated", status_code=400)
 from fastapi import FastAPI, Response
 from fastapi.responses import FileResponse
 from kokoro import KPipeline
 import soundfile as sf
 import os
 import numpy as np
+import torch
 from huggingface_hub import InferenceClient
+from pydantic import BaseModel
+import base64
+from io import BytesIO
+from typing import Optional
+from PIL import Image
+class TextImageRequest(BaseModel):
+    """Request body for the ``/generate`` endpoint.
+
+    Both ``text`` and ``image_base64`` may be omitted or sent as JSON null;
+    the handler substitutes a default prompt when ``text`` is missing.
+    """
+
+    # Optional[...] so an explicit null is accepted, not only an omitted field.
+    text: Optional[str] = None
+    # Base64-encoded image bytes, forwarded to llm_chat_response().
+    image_base64: Optional[str] = None
+    # Forwarded unchanged to the pipeline call as voice= / speed=.
+    voice: str = "af_heart"
+    speed: float = 1.0
+def llm_chat_response(text, image_base64=None):
+    """Ask the vision model for a one-line reply to a prompt and optional image.
+
+    Args:
+        text: User prompt. May be empty or None, in which case only the
+            fixed "describe in one line only" instruction is sent.
+        image_base64: Optional base64-encoded image. If it cannot be decoded
+            or opened as an image, the error is printed and the request is
+            sent with text only.
+
+    Returns:
+        The ``content`` of the first choice in the chat-completion response.
+    """
     HF_TOKEN = os.getenv("HF_TOKEN")
     client = InferenceClient(api_key=HF_TOKEN)
+    # Join with a space so the instruction does not run into the user's text
+    # (previously "...in the image" + "describe..." were glued together).
+    prompt = " ".join(
+        part for part in (text, "describe in one line only") if part
+    )
+    message_content = [
+        {
+            "type": "text",
+            "text": prompt
+        }
+    ]
+    # If image_base64 is provided, validate it and add it to the message content
+    if image_base64:
+        try:
+            image_bytes = base64.b64decode(image_base64)
+            # Opening the image validates the payload and reveals its format;
+            # the context manager releases the PIL handle afterwards.
+            with Image.open(BytesIO(image_bytes)) as img:
+                mime_type = Image.MIME.get(img.format, "image/jpeg")
+        except Exception as e:
+            # Deliberate best effort: a bad image must not fail the request,
+            # so fall back to a text-only prompt.
+            print(f"Error processing image: {e}")
+        else:
+            # Re-encode so stray whitespace in the input cannot corrupt the URL.
+            clean_b64 = base64.b64encode(image_bytes).decode("ascii")
+            # Chat-completion messages carry images as an "image_url" part;
+            # inline data is passed as a data: URL.
+            message_content.append({
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:{mime_type};base64,{clean_b64}"
+                }
+            })
     messages = [
+        {
+            "role": "user",
+            "content": message_content
+        }
     ]
     response_from_llama = client.chat.completions.create(
+        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+        messages=messages,
+        max_tokens=500
+    )
     return response_from_llama.choices[0].message['content']
 app = FastAPI()
 # Build the Kokoro pipeline once at module import so every /generate request reuses it
 pipeline = KPipeline(lang_code='a')
 @app.post("/generate")
+async def generate_audio(request: TextImageRequest):
+    # If no text is provided but image is provided, use default prompt
+    user_text = request.text
+    if user_text is None and request.image_base64:
+        user_text = "Describe what you see in the image"
+    elif user_text is None:
+        user_text = ""
+    # Generate response using text and image if provided
+    text_reply = llm_chat_response(user_text, request.image_base64)
     # Generate audio
     generator = pipeline(
         text_reply,
+        voice=request.voice,
+        speed=request.speed,
         split_pattern=r'\n+'
     )
     # Process only the first segment for demo
     for i, (gs, ps, audio) in enumerate(generator):
         # Convert PyTorch tensor to NumPy array
         audio_numpy = audio.cpu().numpy()
+        # Convert to 16-bit PCM
         # Ensure the audio is in the range [-1, 1]
         audio_numpy = np.clip(audio_numpy, -1, 1)
         # Convert to 16-bit signed integers
         pcm_data = (audio_numpy * 32767).astype(np.int16)