Spaces:

sagaxlearn
/

TTS_API

Sleeping

App Files Community

khurrameycon committed on Apr 7

Commit

af836e4

verified ·

1 Parent(s): 9354354

Update app.py

Browse files

Files changed (1) hide show

app.py +123 -49

app.py CHANGED Viewed

@@ -1,7 +1,108 @@
 from fastapi import FastAPI, Response
 from fastapi.responses import FileResponse
 from kokoro import KPipeline
-import soundfile as sf
 import os
 import numpy as np
 import torch
@@ -10,33 +111,27 @@ from huggingface_hub import InferenceClient
 def llm_chat_response(text):
     HF_TOKEN = os.getenv("HF_TOKEN")
     client = InferenceClient(
-    provider="hf-inference",
-    api_key=HF_TOKEN,)
-    response_from_llama = client.chat.completions.create(
-        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-        messages=[
             {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "Describe this image in one sentence."
-                    }#,
-                    # {
-                    #     "type": "image_url",
-                    #     "image_url": {
-                    #         "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
-                    #     }
-                    # }
-                ]
             }
-        ],
         max_tokens=500,
     )
     return response_from_llama.choices[0].message['content']
 app = FastAPI()
@@ -46,10 +141,9 @@ pipeline = KPipeline(lang_code='a')
 @app.post("/generate")
 async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
     text_reply = llm_chat_response(text)
-    # Generate audio
     generator = pipeline(
         text_reply,
         voice=voice,
@@ -57,43 +151,23 @@ async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0)
         split_pattern=r'\n+'
     )
-    # # Save first segment only for demo
-    # for i, (gs, ps, audio) in enumerate(generator):
-    #     sf.write(f"output_{i}.wav", audio, 24000)
-    #     return FileResponse(
-    #         f"output_{i}.wav",
-    #         media_type="audio/wav",
-    #         filename="output.wav"
-    #     )
-    # return Response("No audio generated", status_code=400)
-    # Process only the first segment for demo
     for i, (gs, ps, audio) in enumerate(generator):
-        # Convert PyTorch tensor to NumPy array
         audio_numpy = audio.cpu().numpy()
-        # Convert to 16-bit PCM
-        # Ensure the audio is in the range [-1, 1]
         audio_numpy = np.clip(audio_numpy, -1, 1)
-        # Convert to 16-bit signed integers
         pcm_data = (audio_numpy * 32767).astype(np.int16)
-        # Convert to bytes (automatically uses row-major order)
         raw_audio = pcm_data.tobytes()
-        # Return PCM data with minimal necessary headers
         return Response(
             content=raw_audio,
             media_type="application/octet-stream",
             headers={
-                "Content-Disposition": f'attachment; filename="output.pcm"',
                 "X-Sample-Rate": "24000",
                 "X-Bits-Per-Sample": "16",
                 "X-Endianness": "little"
             }
         )
-    return Response("No audio generated", status_code=400)

+# from fastapi import FastAPI, Response
+# from fastapi.responses import FileResponse
+# from kokoro import KPipeline
+# import soundfile as sf
+# import os
+# import numpy as np
+# import torch
+# from huggingface_hub import InferenceClient
+# def llm_chat_response(text):
+#     HF_TOKEN = os.getenv("HF_TOKEN")
+#     client = InferenceClient(
+#     provider="hf-inference",
+#     api_key=HF_TOKEN,)
+#     response_from_llama = client.chat.completions.create(
+#         model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+#         messages=[
+#             {
+#                 "role": "user",
+#                 "content": [
+#                     {
+#                         "type": "text",
+#                         "text": "Describe this image in one sentence."
+#                     }#,
+#                     # {
+#                     #     "type": "image_url",
+#                     #     "image_url": {
+#                     #         "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+#                     #     }
+#                     # }
+#                 ]
+#             }
+#         ],
+#         max_tokens=500,
+#     )
+#     return response_from_llama.choices[0].message['content']
+# app = FastAPI()
+# # Initialize pipeline once at startup
+# pipeline = KPipeline(lang_code='a')
+# @app.post("/generate")
+# async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
+#     text_reply = llm_chat_response(text)
+#     # Generate audio
+#     generator = pipeline(
+#         text_reply,
+#         voice=voice,
+#         speed=speed,
+#         split_pattern=r'\n+'
+#     )
+#     # # Save first segment only for demo
+#     # for i, (gs, ps, audio) in enumerate(generator):
+#     #     sf.write(f"output_{i}.wav", audio, 24000)
+#     #     return FileResponse(
+#     #         f"output_{i}.wav",
+#     #         media_type="audio/wav",
+#     #         filename="output.wav"
+#     #     )
+#     # return Response("No audio generated", status_code=400)
+#     # Process only the first segment for demo
+#     for i, (gs, ps, audio) in enumerate(generator):
+#         # Convert PyTorch tensor to NumPy array
+#         audio_numpy = audio.cpu().numpy()
+#         # Convert to 16-bit PCM
+#         # Ensure the audio is in the range [-1, 1]
+#         audio_numpy = np.clip(audio_numpy, -1, 1)
+#         # Convert to 16-bit signed integers
+#         pcm_data = (audio_numpy * 32767).astype(np.int16)
+#         # Convert to bytes (automatically uses row-major order)
+#         raw_audio = pcm_data.tobytes()
+#         # Return PCM data with minimal necessary headers
+#         return Response(
+#             content=raw_audio,
+#             media_type="application/octet-stream",
+#             headers={
+#                 "Content-Disposition": f'attachment; filename="output.pcm"',
+#                 "X-Sample-Rate": "24000",
+#                 "X-Bits-Per-Sample": "16",
+#                 "X-Endianness": "little"
+#             }
+#         )
+#     return Response("No audio generated", status_code=400)
 from fastapi import FastAPI, Response
 from fastapi.responses import FileResponse
 from kokoro import KPipeline
 import os
 import numpy as np
 import torch
 def llm_chat_response(text):
     HF_TOKEN = os.getenv("HF_TOKEN")
     client = InferenceClient(
+        provider="sambanova",  # Use the provider that supports conversational image-text tasks.
+        api_key=HF_TOKEN,
+    )
+    # Build the message payload; here we append a prompt suffix when no image is involved.
+    messages = [{
+        "role": "user",
+        "content": [
             {
+                "type": "text",
+                "text": text + " describe in one line only"
             }
+        ]
+    }]
+    response_from_llama = client.chat.completions.create(
+        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+        messages=messages,
         max_tokens=500,
     )
     return response_from_llama.choices[0].message['content']
 app = FastAPI()
 @app.post("/generate")
 async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
     text_reply = llm_chat_response(text)
+    # Generate audio using the pipeline
     generator = pipeline(
         text_reply,
         voice=voice,
         split_pattern=r'\n+'
     )
+    # Process only the first segment for demonstration
     for i, (gs, ps, audio) in enumerate(generator):
+        # Convert PyTorch tensor to NumPy array and prepare 16-bit PCM data
         audio_numpy = audio.cpu().numpy()
         audio_numpy = np.clip(audio_numpy, -1, 1)
         pcm_data = (audio_numpy * 32767).astype(np.int16)
         raw_audio = pcm_data.tobytes()
         return Response(
             content=raw_audio,
             media_type="application/octet-stream",
             headers={
+                "Content-Disposition": 'attachment; filename="output.pcm"',
                 "X-Sample-Rate": "24000",
                 "X-Bits-Per-Sample": "16",
                 "X-Endianness": "little"
             }
         )
+    return Response("No audio generated", status_code=400)