Spaces:
Running
Running
File size: 3,358 Bytes
4a54590 5011794 7f0d329 5011794 a09df36 01919eb 5011794 01919eb 7f0d329 a09df36 5875fa2 01919eb a09df36 5875fa2 a09df36 01919eb 4a54590 01919eb 4a54590 01919eb 5011794 5875fa2 7f0d329 5875fa2 7f0d329 01919eb 7f0d329 5875fa2 7f0d329 01919eb 7f0d329 01919eb 5875fa2 7f0d329 5875fa2 7f0d329 5011794 01919eb 7f0d329 5875fa2 5011794 01919eb a09df36 01919eb a09df36 5875fa2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import base64
import io
import os
import wave

import requests
from dotenv import load_dotenv
from fastapi import FastAPI, Form
from fastapi.responses import FileResponse, JSONResponse, Response
# Load the API key; a .env file is consulted first via python-dotenv.
load_dotenv()
API_KEY = os.environ.get("GEMINI_API_KEY")
if API_KEY is None or API_KEY == "":
    raise ValueError("Missing GEMINI_API_KEY in .env")

# REST endpoint of the TTS model's generateContent method.
_GENERATE_CONTENT_ENDPOINT = (
    "https://generativelanguage.googleapis.com/"
    "v1beta/models/gemini-2.5-flash-preview-tts:"
    "generateContent"
)
# The API key travels as the `key` query parameter, so BASE_URL is a secret:
# do not log it or echo it back to clients.
BASE_URL = _GENERATE_CONTENT_ENDPOINT + f"?key={API_KEY}"

app = FastAPI(title="Gemini TTS Space")
def save_wav(
    path: str,
    pcm: bytes,
    channels: int = 1,
    rate: int = 24000,
    width: int = 2,
) -> None:
    """Wrap raw PCM audio in a WAV container and write it to *path*.

    Args:
        path: Destination. A ``str`` file name or an ``os.PathLike`` object
            is opened as a file on disk; any other object is handed to
            ``wave.open`` unchanged, which treats it as a writable binary
            file-like object (for example ``io.BytesIO``).
        pcm: Raw audio frames, written verbatim after the header.
        channels: Number of interleaved channels (default 1).
        rate: Sampling frequency in Hz (default 24000).
        width: Sample width in bytes (default 2).

    Raises:
        wave.Error: If ``wave`` rejects one of the audio parameters.
        OSError: If the destination file cannot be opened or written.
    """
    # wave.open() historically special-cases only ``str``; a pathlib.Path
    # would be mistaken for a file object and fail on the first write.
    target = os.fspath(path) if isinstance(path, os.PathLike) else path
    with wave.open(target, "wb") as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(width)
        wav_file.setframerate(rate)
        wav_file.writeframes(pcm)
@app.get("/")
def health():
    """Liveness endpoint: return a fixed status message."""
    status_message = "Gemini TTS Space is live!"
    return {"status": status_message}
# (connect, read) timeouts in seconds for calls to the Gemini API. Without a
# timeout, requests waits indefinitely and a stalled upstream would block the
# request handler forever.
_GEMINI_TIMEOUT = (10, 120)


def _prebuilt_voice(voice_name: str) -> dict:
    """Return the voiceConfig object selecting a prebuilt voice by name."""
    return {"prebuiltVoiceConfig": {"voiceName": voice_name}}


def _tts_payload(prompt: str, speech_config: dict) -> dict:
    """Build the generateContent request body shared by both TTS endpoints."""
    return {
        "model": "gemini-2.5-flash-preview-tts",
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {
            # NOTE(review): Google's published TTS request examples do not set
            # responseMimeType -- confirm the API accepts "audio/wav" here.
            "responseMimeType": "audio/wav",
            "responseModalities": ["AUDIO"],
            "speechConfig": speech_config,
        },
    }


def _error(status_code: int, message: str, **extra):
    """Return a JSON error response shaped as ``{"error": {"message": ...}}``."""
    return JSONResponse(
        status_code=status_code,
        content={"error": {"message": message, **extra}},
    )


def _synthesize(payload: dict, download_name: str):
    """Send *payload* to Gemini and turn the reply into a WAV download.

    Args:
        payload: JSON body for the generateContent call.
        download_name: File name advertised in Content-Disposition.

    Returns:
        An ``audio/wav`` response on success. Otherwise a ``JSONResponse``:
        the upstream status and body when Gemini answers with a non-200
        status, 504 when the call times out, and 502 when the call fails at
        the network level or the reply carries no decodable audio.
    """
    try:
        resp = requests.post(BASE_URL, json=payload, timeout=_GEMINI_TIMEOUT)
    except requests.Timeout:
        return _error(504, "Timed out waiting for the Gemini API")
    except requests.RequestException as exc:
        # Only the exception type is reported: requests' messages can embed
        # the request URL, and BASE_URL contains the API key.
        return _error(502, f"Gemini API request failed ({type(exc).__name__})")

    try:
        body = resp.json()
    except ValueError:
        # Proxies and gateways may answer with HTML or plain text.
        body = None

    if resp.status_code != 200:
        if body is None:
            body = {"error": {"message": resp.text}}
        return JSONResponse(status_code=resp.status_code, content=body)

    try:
        data_b64 = body["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
        pcm = base64.b64decode(data_b64)
    except (KeyError, IndexError, TypeError, ValueError):
        # A 200 reply without inline audio; forward the upstream body so the
        # caller can see what was returned instead.
        return _error(
            502,
            "Gemini response did not contain audio data",
            upstream=body,
        )

    # Build the WAV in memory: a shared on-disk file name would be
    # overwritten by concurrent requests while an earlier response might
    # still be streaming it.
    buffer = io.BytesIO()
    save_wav(buffer, pcm)
    return Response(
        content=buffer.getvalue(),
        media_type="audio/wav",
        headers={
            "Content-Disposition": f'attachment; filename="{download_name}"'
        },
    )


@app.post("/single_tts")
def single_tts(prompt: str = Form(...), voice_name: str = Form(...)):
    """Synthesize *prompt* with a single prebuilt voice.

    Form fields:
        prompt: Text to speak.
        voice_name: Name of the prebuilt voice to use.

    Returns:
        A WAV download named ``single_output.wav``, or a JSON error response
        (see ``_synthesize``).
    """
    speech_config = {"voiceConfig": _prebuilt_voice(voice_name)}
    return _synthesize(_tts_payload(prompt, speech_config), "single_output.wav")


@app.post("/multi_tts")
def multi_tts(
    prompt: str = Form(...),
    speaker1: str = Form(...), voice1: str = Form(...),
    speaker2: str = Form(...), voice2: str = Form(...)
):
    """Synthesize a two-speaker *prompt*, one prebuilt voice per speaker.

    Form fields:
        prompt: Text to speak.
        speaker1, speaker2: Speaker labels sent as the ``speaker`` field of
            each voice configuration.
        voice1, voice2: Prebuilt voice names for the respective speakers.

    Returns:
        A WAV download named ``multi_output.wav``, or a JSON error response
        (see ``_synthesize``).
    """
    speaker_voices = [
        {"speaker": speaker, "voiceConfig": _prebuilt_voice(voice)}
        for speaker, voice in ((speaker1, voice1), (speaker2, voice2))
    ]
    speech_config = {
        "multiSpeakerVoiceConfig": {"speakerVoiceConfigs": speaker_voices}
    }
    return _synthesize(_tts_payload(prompt, speech_config), "multi_output.wav")
|