# Aittsg / app.py (Hugging Face Space file, author: Athspi)
# Commit 5875fa2 (verified): "Update app.py", 3.36 kB
import os
import requests
import wave
import base64
from fastapi import FastAPI, Form
from fastapi.responses import FileResponse, JSONResponse
from dotenv import load_dotenv
# Read the Gemini API key from the environment (a local .env file is loaded
# first) and refuse to start without it.
load_dotenv()
API_KEY = os.environ.get("GEMINI_API_KEY")
if not API_KEY:
    raise ValueError("Missing GEMINI_API_KEY in .env")

# generateContent REST endpoint of the TTS model; the API key travels as the
# ``key`` query parameter.
BASE_URL = "".join(
    (
        "https://generativelanguage.googleapis.com/",
        "v1beta/models/gemini-2.5-flash-preview-tts:",
        "generateContent",
        f"?key={API_KEY}",
    )
)

# ASGI application object that the route decorators below attach to.
app = FastAPI(title="Gemini TTS Space")
def save_wav(path: str, pcm: bytes, channels=1, rate=24000, width=2):
    """Write raw PCM frames to ``path`` as an uncompressed WAV file.

    Args:
        path: Destination file name (opened for binary writing).
        pcm: Raw PCM sample bytes to store as the audio frames.
        channels: Number of audio channels written to the header.
        rate: Sampling rate in Hz written to the header.
        width: Sample width in bytes written to the header.
    """
    # nframes is given as 0 (the writer's initial value); writeframes()
    # patches the header with the real count once the data is written.
    header = (channels, width, rate, 0, "NONE", "not compressed")
    with wave.open(path, "wb") as writer:
        writer.setparams(header)
        writer.writeframes(pcm)
@app.get("/")
def health():
    """Liveness probe: return a fixed status message."""
    status_message = "Gemini TTS Space is live!"
    return {"status": status_message}
@app.post("/single_tts")
def single_tts(prompt: str = Form(...), voice_name: str = Form(...)):
    """Synthesize ``prompt`` with one prebuilt voice and return it as a WAV file.

    Form fields:
        prompt: Text sent to the model as the single content part.
        voice_name: Prebuilt voice name forwarded in ``speechConfig``.

    Returns:
        ``FileResponse`` (``audio/wav``, download name ``single_output.wav``)
        on success. ``JSONResponse`` mirroring the upstream status and body
        when the API answers with a non-200 status, or a 502 ``JSONResponse``
        when the API is unreachable or its reply carries no decodable audio.
    """
    # Imported locally because the module's top-level imports do not
    # provide these names.
    import tempfile
    from fastapi import BackgroundTasks

    payload = {
        "model": "gemini-2.5-flash-preview-tts",
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {
            # NOTE(review): confirm the API accepts "audio/wav" here; the
            # request is otherwise unchanged from the original.
            "responseMimeType": "audio/wav",
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {
                    "prebuiltVoiceConfig": {"voiceName": voice_name}
                }
            },
        },
    }

    try:
        # A timeout keeps a stalled upstream from pinning a worker forever.
        resp = requests.post(BASE_URL, json=payload, timeout=120)
    except requests.RequestException:
        # The exception text is deliberately not echoed: it can embed
        # BASE_URL, which carries the API key in its query string.
        return JSONResponse(
            status_code=502,
            content={"error": "Could not reach the Gemini API"},
        )

    try:
        body = resp.json()
    except ValueError:
        # Upstream sent something that is not JSON (e.g. a proxy error page).
        body = None

    if resp.status_code != 200:
        error_content = body if body is not None else {"error": resp.text}
        return JSONResponse(status_code=resp.status_code, content=error_content)

    try:
        data_b64 = body["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
        pcm = base64.b64decode(data_b64)
    except (KeyError, IndexError, TypeError, ValueError):
        return JSONResponse(
            status_code=502,
            content={"error": "Gemini API response contained no audio data"},
        )

    # Each request gets its own file: a fixed shared path would let
    # concurrent requests overwrite audio that is still being streamed.
    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        save_wav(tmp_path, pcm)
    except Exception:
        os.remove(tmp_path)
        raise

    # Delete the temporary file once the response has been sent.
    cleanup = BackgroundTasks()
    cleanup.add_task(os.remove, tmp_path)
    return FileResponse(
        tmp_path,
        media_type="audio/wav",
        filename="single_output.wav",
        background=cleanup,
    )
@app.post("/multi_tts")
def multi_tts(
    prompt: str = Form(...),
    speaker1: str = Form(...), voice1: str = Form(...),
    speaker2: str = Form(...), voice2: str = Form(...)
):
    """Synthesize a two-speaker ``prompt`` and return it as a WAV file.

    Form fields:
        prompt: Text sent to the model as the single content part.
        speaker1, speaker2: Speaker labels forwarded in the
            ``speakerVoiceConfigs`` entries.
        voice1, voice2: Prebuilt voice names paired with ``speaker1`` and
            ``speaker2`` respectively.

    Returns:
        ``FileResponse`` (``audio/wav``, download name ``multi_output.wav``)
        on success. ``JSONResponse`` mirroring the upstream status and body
        when the API answers with a non-200 status, or a 502 ``JSONResponse``
        when the API is unreachable or its reply carries no decodable audio.
    """
    # Imported locally because the module's top-level imports do not
    # provide these names.
    import tempfile
    from fastapi import BackgroundTasks

    speaker_configs = [
        {
            "speaker": speaker,
            "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice}},
        }
        for speaker, voice in ((speaker1, voice1), (speaker2, voice2))
    ]
    payload = {
        "model": "gemini-2.5-flash-preview-tts",
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {
            # NOTE(review): confirm the API accepts "audio/wav" here; the
            # request is otherwise unchanged from the original.
            "responseMimeType": "audio/wav",
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "multiSpeakerVoiceConfig": {
                    "speakerVoiceConfigs": speaker_configs
                }
            },
        },
    }

    try:
        # A timeout keeps a stalled upstream from pinning a worker forever.
        resp = requests.post(BASE_URL, json=payload, timeout=120)
    except requests.RequestException:
        # The exception text is deliberately not echoed: it can embed
        # BASE_URL, which carries the API key in its query string.
        return JSONResponse(
            status_code=502,
            content={"error": "Could not reach the Gemini API"},
        )

    try:
        body = resp.json()
    except ValueError:
        # Upstream sent something that is not JSON (e.g. a proxy error page).
        body = None

    if resp.status_code != 200:
        error_content = body if body is not None else {"error": resp.text}
        return JSONResponse(status_code=resp.status_code, content=error_content)

    try:
        data_b64 = body["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
        pcm = base64.b64decode(data_b64)
    except (KeyError, IndexError, TypeError, ValueError):
        return JSONResponse(
            status_code=502,
            content={"error": "Gemini API response contained no audio data"},
        )

    # Each request gets its own file: a fixed shared path would let
    # concurrent requests overwrite audio that is still being streamed.
    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        save_wav(tmp_path, pcm)
    except Exception:
        os.remove(tmp_path)
        raise

    # Delete the temporary file once the response has been sent.
    cleanup = BackgroundTasks()
    cleanup.add_task(os.remove, tmp_path)
    return FileResponse(
        tmp_path,
        media_type="audio/wav",
        filename="multi_output.wav",
        background=cleanup,
    )