# TTS_API_Image / app.py
# Source: Hugging Face file viewer (raw / history / blame view), 1.83 kB.
# Uploaded by khurrameycon; commit fce7c66 (verified): "updated with PCM".
import os

import numpy as np
import soundfile as sf
from fastapi import FastAPI, Response
from fastapi.responses import FileResponse
from kokoro import KPipeline
# ASGI application object; the @app.post route below registers its handler on it.
app = FastAPI()
# Initialize pipeline once at startup
# Built at import time so the /generate handler reuses this single KPipeline
# instance instead of constructing one per request.
# NOTE(review): lang_code='a' presumably selects American English in Kokoro — confirm.
pipeline = KPipeline(lang_code='a')
def _to_pcm16_le(audio) -> bytes:
    """Convert float audio samples to raw signed 16-bit little-endian PCM."""
    # NOTE(review): some Kokoro releases appear to yield torch tensors rather
    # than ndarrays — confirm. Anything tensor-like is moved to a CPU ndarray.
    if hasattr(audio, "detach"):
        audio = audio.detach().cpu().numpy()
    # Clamp to [-1, 1] first so scaling cannot overflow the int16 range.
    samples = np.clip(np.asarray(audio), -1, 1)
    # '<i2' pins the byte order to little-endian regardless of the host CPU;
    # a plain np.int16 would serialize in native order, contradicting the
    # X-Endian header on big-endian machines.
    return (samples * 32767).astype("<i2").tobytes()


@app.post("/generate")
async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
    """Synthesize speech for *text* and return it as headerless 16-bit PCM.

    Only the first segment that carries audio is returned (demo behaviour);
    the text is split into segments on runs of newlines.

    Args:
        text: Text to synthesize.
        voice: Voice identifier passed through to the pipeline.
        speed: Speed factor passed through to the pipeline.

    Returns:
        A ``Response`` whose body is mono, 24 kHz, signed 16-bit little-endian
        PCM, with the format described in ``X-*`` headers; or a 400 response
        with the body ``"No audio generated"`` when no segment produced audio.
    """
    segments = pipeline(
        text,
        voice=voice,
        speed=speed,
        split_pattern=r'\n+'
    )

    # Each item unpacks to three values; only the third (audio) is used.
    for _, _, audio in segments:
        if audio is None:
            # A segment with no audio cannot be encoded; try the next one.
            continue

        # NOTE(review): the registered audio/L16 media type (RFC 2586/3551)
        # specifies network (big-endian) byte order, while this payload is
        # little-endian as declared in X-Endian. Left unchanged so existing
        # clients keep working — confirm which contract consumers rely on.
        return Response(
            content=_to_pcm16_le(audio),
            media_type="audio/l16",  # Linear PCM
            headers={
                "Content-Disposition": 'attachment; filename="output.pcm"',
                "X-Sample-Rate": "24000",
                "X-Bits-Per-Sample": "16",
                "X-Channels": "1",
                "X-Encoding": "signed-integer",
                "X-Endian": "little",
            },
        )

    return Response("No audio generated", status_code=400)