from fastapi import FastAPI, Response
from kokoro import KPipeline
import os
import numpy as np
from huggingface_hub import InferenceClient
from pydantic import BaseModel
import base64
from io import BytesIO
from PIL import Image
from typing import Optional


class TextImageRequest(BaseModel):
    text: Optional[str] = None
    image_base64: Optional[str] = None
    voice: str = "af_heart"
    speed: float = 1.0


def llm_chat_response(text, image_base64=None):
    HF_TOKEN = os.getenv("HF_TOKEN")
    client = InferenceClient(api_key=HF_TOKEN)

    message_content = [
        {
            "type": "text",
            "text": text + " describe in one line only"
        }
    ]

    # If image_base64 is provided, add it to the message content
    if image_base64:
        # Decode the base64 payload and validate that it is a proper image
        try:
            image_bytes = base64.b64decode(image_base64)
            Image.open(BytesIO(image_bytes))

            # Add the image as an OpenAI-style "image_url" content part using a
            # base64 data URL (MIME type assumed to be JPEG; adjust if needed)
            message_content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image_base64}"
                }
            })
        except Exception as e:
            print(f"Error processing image: {e}")

    messages = [
        {
            "role": "user",
            "content": message_content
        }
    ]

    response_from_llama = client.chat.completions.create(
model="meta-llama/Llama-3.2-11B-Vision-Instruct", messages=messages, max_tokens=500 ) return response_from_llama.choices[0].message['content'] app = FastAPI() # Initialize pipeline once at startup pipeline = KPipeline(lang_code='a') @app.post("/generate") async def generate_audio(request: TextImageRequest): # If no text is provided but image is provided, use default prompt user_text = request.text if user_text is None and request.image_base64: user_text = "Describe what you see in the image" elif user_text is None: user_text = "" # Generate response using text and image if provided text_reply = llm_chat_response(user_text, request.image_base64) # Generate audio generator = pipeline( text_reply, voice=request.voice, speed=request.speed, split_pattern=r'\n+' ) # Process only the first segment for demo for i, (gs, ps, audio) in enumerate(generator): # Convert PyTorch tensor to NumPy array audio_numpy = audio.cpu().numpy() # Convert to 16-bit PCM # Ensure the audio is in the range [-1, 1] audio_numpy = np.clip(audio_numpy, -1, 1) # Convert to 16-bit signed integers pcm_data = (audio_numpy * 32767).astype(np.int16) # Convert to bytes (automatically uses row-major order) raw_audio = pcm_data.tobytes() # Return PCM data with minimal necessary headers return Response( content=raw_audio, media_type="application/octet-stream", headers={ "Content-Disposition": f'attachment; filename="output.pcm"', "X-Sample-Rate": "24000", "X-Bits-Per-Sample": "16", "X-Endianness": "little" } ) return Response("No audio generated", status_code=400)