File size: 3,358 Bytes
4a54590
5011794
 
 
7f0d329
 
5011794
a09df36
01919eb
5011794
01919eb
 
7f0d329
a09df36
5875fa2
01919eb
 
 
 
 
 
a09df36
5875fa2
a09df36
01919eb
 
4a54590
01919eb
4a54590
01919eb
5011794
5875fa2
 
 
 
7f0d329
5875fa2
7f0d329
01919eb
7f0d329
5875fa2
 
 
 
 
 
 
 
7f0d329
 
 
01919eb
 
 
7f0d329
01919eb
 
5875fa2
 
 
 
7f0d329
 
 
 
5875fa2
 
7f0d329
5011794
01919eb
7f0d329
5875fa2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5011794
 
 
 
01919eb
 
 
a09df36
01919eb
 
a09df36
5875fa2
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
import requests
import wave
import base64
from fastapi import FastAPI, Form
from fastapi.responses import FileResponse, JSONResponse
from dotenv import load_dotenv

# Configuration: the Gemini API key is read from the environment after
# load_dotenv() has run; the module refuses to import without it.
load_dotenv()
API_KEY = os.getenv("GEMINI_API_KEY")
if API_KEY is None or API_KEY == "":
    raise ValueError("Missing GEMINI_API_KEY in .env")

# generateContent endpoint of the TTS model; the key is passed as the
# "key" query parameter, so this URL must be treated as a secret.
BASE_URL = (
    "https://generativelanguage.googleapis.com/"
    "v1beta/models/gemini-2.5-flash-preview-tts:"
    "generateContent"
    "?key=" + API_KEY
)

# ASGI application object the route decorators below attach to.
app = FastAPI(title="Gemini TTS Space")

def save_wav(path: str, pcm: bytes, channels=1, rate=24000, width=2):
    """Write raw PCM frames to ``path`` as an uncompressed WAV file.

    Args:
        path: Destination handed to ``wave.open`` in write mode.
        pcm: Raw audio frames to store.
        channels: Number of audio channels (default 1).
        rate: Sampling rate in frames per second (default 24000).
        width: Sample width in bytes (default 2).
    """
    writer = wave.open(path, "wb")
    try:
        # setparams applies channels, sample width and frame rate in that
        # order; the frame count is patched in when the frames are written.
        writer.setparams((channels, width, rate, 0, "NONE", "not compressed"))
        writer.writeframes(pcm)
    finally:
        writer.close()

@app.get("/")
def health():
    """Liveness endpoint: return a fixed status payload."""
    status_message = "Gemini TTS Space is live!"
    return {"status": status_message}

# (connect, read) timeouts in seconds for calls to the Gemini endpoint.
# Without a timeout, requests waits forever on a stalled upstream; the read
# timeout is generous because speech synthesis can take a while.
_GEMINI_TIMEOUT = (10, 300)


def _tts_response(prompt: str, speech_config: dict, download_name: str):
    """Call the Gemini TTS endpoint and turn the result into an HTTP response.

    Shared by the single- and multi-speaker endpoints, which differ only in
    the ``speechConfig`` they send and the download filename they advertise.

    Args:
        prompt: Text sent as the single content part of the request.
        speech_config: Value for ``generationConfig.speechConfig``.
        download_name: Filename advertised in ``Content-Disposition``.

    Returns:
        A ``Response`` carrying the WAV bytes on success, otherwise a
        ``JSONResponse`` describing the failure: the upstream status and body
        when the API answered with a non-200 status, or 502 when the API
        could not be reached or its reply held no decodable audio.
    """
    import tempfile

    from fastapi.responses import Response

    payload = {
        "model": "gemini-2.5-flash-preview-tts",
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {
            "responseMimeType": "audio/wav",
            "responseModalities": ["AUDIO"],
            "speechConfig": speech_config,
        },
    }

    try:
        resp = requests.post(BASE_URL, json=payload, timeout=_GEMINI_TIMEOUT)
    except requests.RequestException as exc:
        # Report only the exception type: the exception message can embed
        # the request URL, which carries the API key as a query parameter.
        return JSONResponse(
            status_code=502,
            content={"error": f"Gemini request failed: {type(exc).__name__}"},
        )

    # The upstream body is not guaranteed to be JSON (e.g. a proxy error page).
    try:
        body = resp.json()
    except ValueError:
        body = None

    if resp.status_code != 200:
        content = body if body is not None else {"error": resp.text}
        return JSONResponse(status_code=resp.status_code, content=content)

    try:
        data_b64 = body["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
        pcm = base64.b64decode(data_b64)
    except (KeyError, IndexError, TypeError, ValueError):
        # A 200 reply without inline audio, or with undecodable base64.
        return JSONResponse(
            status_code=502,
            content={"error": "Gemini response did not contain audio data"},
        )

    # Write to a per-request temporary directory instead of a fixed filename
    # in the working directory: concurrent requests would otherwise overwrite
    # each other's audio while it is still being sent to the client.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, download_name)
        save_wav(wav_path, pcm)
        with open(wav_path, "rb") as fh:
            wav_bytes = fh.read()

    return Response(
        content=wav_bytes,
        media_type="audio/wav",
        headers={"Content-Disposition": f'attachment; filename="{download_name}"'},
    )


@app.post("/single_tts")
def single_tts(prompt: str = Form(...), voice_name: str = Form(...)):
    """Synthesize ``prompt`` with one prebuilt voice and return it as WAV.

    Args:
        prompt: Text to be spoken (required form field).
        voice_name: Name of the prebuilt voice to use (required form field).

    Returns:
        An ``audio/wav`` attachment named ``single_output.wav`` on success,
        or a JSON error response (see ``_tts_response``).
    """
    speech_config = {
        "voiceConfig": {
            "prebuiltVoiceConfig": {"voiceName": voice_name}
        }
    }
    return _tts_response(prompt, speech_config, "single_output.wav")


@app.post("/multi_tts")
def multi_tts(
    prompt: str = Form(...),
    speaker1: str = Form(...), voice1: str = Form(...),
    speaker2: str = Form(...), voice2: str = Form(...)
):
    """Synthesize a two-speaker ``prompt`` and return it as WAV.

    Args:
        prompt: Text to be spoken (required form field).
        speaker1: Label of the first speaker, sent as its ``speaker`` value.
        voice1: Prebuilt voice name assigned to ``speaker1``.
        speaker2: Label of the second speaker, sent as its ``speaker`` value.
        voice2: Prebuilt voice name assigned to ``speaker2``.

    Returns:
        An ``audio/wav`` attachment named ``multi_output.wav`` on success,
        or a JSON error response (see ``_tts_response``).
    """
    speaker_voice_configs = [
        {
            "speaker": speaker,
            "voiceConfig": {
                "prebuiltVoiceConfig": {"voiceName": voice}
            },
        }
        for speaker, voice in ((speaker1, voice1), (speaker2, voice2))
    ]
    speech_config = {
        "multiSpeakerVoiceConfig": {
            "speakerVoiceConfigs": speaker_voice_configs
        }
    }
    return _tts_response(prompt, speech_config, "multi_output.wav")