"""Minimal FastAPI service that exposes the Kokoro TTS pipeline as raw PCM audio."""

import os

import numpy as np
import soundfile as sf
from fastapi import FastAPI, Response
from fastapi.responses import FileResponse
from kokoro import KPipeline

# Sample rate reported to clients in the X-Sample-Rate header.
# NOTE(review): assumes the Kokoro pipeline emits 24 kHz audio -- confirm
# against the kokoro documentation if the model or version changes.
SAMPLE_RATE = 24000

app = FastAPI()

# Initialize pipeline once at startup
pipeline = KPipeline(lang_code='a')


def _to_pcm16_le(audio) -> bytes:
    """Convert float samples to little-endian signed 16-bit PCM bytes.

    Args:
        audio: Array-like of samples nominally in the range [-1, 1]. Objects
            exposing ``detach()`` (presumably torch tensors -- TODO confirm
            what the pipeline yields) are first moved to a CPU NumPy array.

    Returns:
        The samples as raw bytes in C (row-major) order, two bytes per
        sample, least-significant byte first.
    """
    # Duck-typed tensor unwrap so the NumPy-only code below also works when
    # the pipeline hands back a tensor instead of an ndarray.
    if hasattr(audio, "detach"):
        audio = audio.detach().cpu().numpy()

    # Clamp out-of-range samples so scaling cannot overflow int16.
    samples = np.clip(np.asarray(audio), -1, 1)

    # "<i2" pins little-endian byte order regardless of the host CPU, so the
    # payload always matches the X-Endianness header sent with it.
    return (samples * 32767).astype("<i2").tobytes()


@app.post("/generate")
async def generate_audio(
    text: str,
    voice: str = "af_heart",
    speed: float = 1.0,
) -> Response:
    """Synthesize speech for ``text`` and return the first segment as raw PCM.

    Only the first segment yielded by the pipeline is returned (demo
    behaviour); any further segments are not generated.

    Args:
        text: Text to synthesize. The pipeline is asked to split it on runs
            of newlines.
        voice: Voice identifier forwarded to the pipeline.
        speed: Speed value forwarded to the pipeline.

    Returns:
        A 200 response with ``application/octet-stream`` content holding
        headerless 16-bit little-endian PCM, described by the ``X-Sample-Rate``,
        ``X-Bits-Per-Sample`` and ``X-Endianness`` headers; or a 400 response
        with the body ``No audio generated`` when the pipeline yields no
        usable audio.
    """
    # NOTE: this call is synchronous, so it occupies the event loop for the
    # duration of synthesis. Left as-is; offloading it to a worker thread
    # would require knowing whether the pipeline is thread-safe.
    segments = pipeline(
        text,
        voice=voice,
        speed=speed,
        split_pattern=r'\n+',
    )

    # Pull just the first segment; the generator is not advanced any further.
    first = next(iter(segments), None)
    if first is None:
        return Response("No audio generated", status_code=400)

    _gs, _ps, audio = first
    if audio is None:
        # Treat a segment without audio the same as no segment at all
        # instead of failing inside the PCM conversion.
        return Response("No audio generated", status_code=400)

    # Return headerless PCM; the custom headers carry the format details a
    # client needs to play it. (To serve a WAV container instead, write the
    # samples with soundfile and return a FileResponse.)
    return Response(
        content=_to_pcm16_le(audio),
        media_type="application/octet-stream",
        headers={
            "Content-Disposition": 'attachment; filename="output.pcm"',
            "X-Sample-Rate": str(SAMPLE_RATE),
            "X-Bits-Per-Sample": "16",
            "X-Endianness": "little",
        },
    )