from fastapi import FastAPI, Response
from fastapi.responses import FileResponse
from kokoro import KPipeline
import soundfile as sf
import os
import numpy as np
import torch

app = FastAPI()

# Initialize the TTS pipeline once at startup so every request reuses it
# instead of paying the model-load cost per call.
pipeline = KPipeline(lang_code='a')


@app.post("/generate")
async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
    """Synthesize ``text`` and return raw 16-bit little-endian PCM audio.

    Parameters
    ----------
    text:
        Input text; it is split into segments on blank-line boundaries
        (``\\n+``) by the pipeline.
    voice:
        Voice preset name passed through to the kokoro pipeline.
    speed:
        Playback-speed multiplier passed through to the pipeline.

    Returns
    -------
    Response
        ``application/octet-stream`` body containing 16-bit signed PCM
        (sample rate advertised via the ``X-Sample-Rate: 24000`` header),
        or a 400 response when the pipeline yields no audio.
    """
    generator = pipeline(
        text,
        voice=voice,
        speed=speed,
        split_pattern=r'\n+'
    )

    # Collect every generated segment. The previous implementation returned
    # inside the loop, so only the first segment was ever delivered and the
    # rest of the text was silently dropped.
    segments = []
    for _, _, audio in generator:
        # kokoro presumably yields torch tensors — TODO confirm; accept a
        # plain ndarray too so a CPU-only / pre-converted result still works.
        if isinstance(audio, torch.Tensor):
            segments.append(audio.cpu().numpy())
        else:
            segments.append(np.asarray(audio))

    if not segments:
        return Response("No audio generated", status_code=400)

    full_audio = segments[0] if len(segments) == 1 else np.concatenate(segments)

    # Convert float audio in [-1, 1] to 16-bit signed PCM; clip first so
    # out-of-range samples don't wrap around on the integer cast.
    full_audio = np.clip(full_audio, -1.0, 1.0)
    pcm_data = (full_audio * 32767).astype(np.int16)
    raw_audio = pcm_data.tobytes()  # little-endian on all common platforms

    # Return raw PCM with the stream parameters the client needs to play it.
    return Response(
        content=raw_audio,
        media_type="application/octet-stream",
        headers={
            "Content-Disposition": 'attachment; filename="output.pcm"',
            "X-Sample-Rate": "24000",
            "X-Bits-Per-Sample": "16",
            "X-Endianness": "little",
        },
    )