Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,38 +1,83 @@
|
|
1 |
-
from fastapi import FastAPI,
|
2 |
-
from
|
3 |
-
|
4 |
-
import wave
|
5 |
import os
|
|
|
|
|
|
|
|
|
6 |
|
7 |
-
|
8 |
|
9 |
-
|
|
|
|
|
10 |
|
11 |
-
|
12 |
-
raise ValueError("Set GOOGLE_API_KEY environment variable.")
|
13 |
|
14 |
-
|
15 |
|
16 |
-
|
|
|
|
|
17 |
|
18 |
-
def wave_file(filename,
|
19 |
with wave.open(filename, "wb") as wf:
|
20 |
wf.setnchannels(channels)
|
21 |
wf.setsampwidth(sample_width)
|
22 |
wf.setframerate(rate)
|
23 |
-
wf.writeframes(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
-
|
26 |
-
|
27 |
-
response = model.generate_content(
|
28 |
-
text,
|
29 |
-
generation_config={"response_mime_type": "audio/wav"},
|
30 |
-
response_modality="AUDIO"
|
31 |
-
)
|
32 |
|
33 |
-
|
34 |
|
35 |
-
|
36 |
-
|
37 |
|
38 |
-
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, HTTPException
|
2 |
+
from pydantic import BaseModel
|
3 |
+
from typing import Dict
|
|
|
4 |
import os
|
5 |
+
import requests
|
6 |
+
import wave
|
7 |
+
import base64
|
8 |
+
from dotenv import load_dotenv
|
9 |
|
10 |
+
# Load variables from a local .env file (if one exists) before reading the
# environment below.
load_dotenv()

# The API key is mandatory: fail at import time instead of on the first request.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("Missing GEMINI_API_KEY")

# generateContent endpoint of the Gemini TTS preview model.
API_URL = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    "gemini-2.5-flash-preview-tts:generateContent"
)

# ASGI application object; the route decorators below register on it.
app = FastAPI()
|
19 |
|
20 |
+
class TTSRequest(BaseModel):
    """Request body for the multi-speaker text-to-speech endpoint."""

    # Text forwarded to the model as the single content part.
    prompt: str

    # Maps each speaker name to a prebuilt voice name,
    # e.g. { "Joe": "Kore", "Jane": "Puck" }.
    speakers: Dict[str, str]
|
23 |
|
24 |
+
def wave_file(filename, pcm_data, channels=1, rate=24000, sample_width=2):
    """Write raw PCM frames to a WAV container.

    Args:
        filename: Destination path (``str`` or ``os.PathLike``) or a writable
            binary file-like object.
        pcm_data: Raw PCM audio bytes to store as the frame data.
        channels: Number of audio channels written to the header.
        rate: Sample rate in Hz written to the header.
        sample_width: Bytes per sample written to the header.

    Raises:
        wave.Error: If the ``wave`` module rejects the header parameters.
        OSError: If the destination cannot be opened or written.
    """
    # wave.open() only special-cases ``str`` paths; normalise path-like
    # objects (e.g. pathlib.Path) so they are opened as files rather than
    # being treated as already-open file objects.
    if isinstance(filename, os.PathLike):
        filename = os.fsdecode(filename)

    with wave.open(filename, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(pcm_data)
|
30 |
+
|
31 |
+
@app.post("/dynamic_multispeaker_tts")
def dynamic_multispeaker_tts(request: TTSRequest):
    """Generate multi-speaker speech via the Gemini API and save it to disk.

    Builds one voice configuration per entry in ``request.speakers``, sends
    ``request.prompt`` to the Gemini TTS endpoint, decodes the returned
    base64 audio and writes it to ``output.wav`` in the working directory.

    Args:
        request: Prompt text plus a mapping of speaker name to voice name.

    Returns:
        A dict with a ``message`` key confirming the file was written.

    Raises:
        HTTPException: With status 500 when the upstream call fails, returns
            a non-200 status, or its body does not contain decodable audio.
    """
    headers = {
        "Content-Type": "application/json",
        # Gemini API keys are sent in this header; they are not OAuth bearer
        # tokens, so an "Authorization: Bearer" header is rejected.
        "x-goog-api-key": GEMINI_API_KEY,
    }

    # Build speaker voice config list dynamically
    speaker_voice_configs = [
        {
            "speaker": speaker,
            "voiceConfig": {
                "prebuiltVoiceConfig": {
                    "voiceName": voice_name,
                },
            },
        }
        for speaker, voice_name in request.speakers.items()
    ]

    # The REST schema expects the audio modality and speech settings inside
    # generationConfig, not at the top level of the request body.
    payload = {
        "contents": [{"parts": [{"text": request.prompt}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "multiSpeakerVoiceConfig": {
                    "speakerVoiceConfigs": speaker_voice_configs,
                },
            },
        },
    }

    try:
        # A timeout keeps a stalled upstream call from blocking the worker
        # indefinitely.
        response = requests.post(
            API_URL, headers=headers, json=payload, timeout=120
        )
    except requests.RequestException as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    if response.status_code != 200:
        raise HTTPException(status_code=500, detail=response.text)

    try:
        result = response.json()
        audio_data_b64 = result["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
        audio_data = base64.b64decode(audio_data_b64)

        # NOTE(review): the decoded bytes are presumably raw PCM matching
        # wave_file's defaults (24 kHz, 16-bit, mono) - confirm against the
        # Gemini TTS documentation.
        # NOTE(review): fixed output path, so concurrent requests overwrite
        # each other's file.
        output_path = "output.wav"
        wave_file(output_path, audio_data)
    except (KeyError, IndexError, TypeError, ValueError, wave.Error, OSError) as e:
        # Covers a non-JSON body, an unexpected response shape, invalid
        # base64, and failures while writing the WAV file.
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {"message": "Audio generated successfully. Download 'output.wav'."}
|
80 |
|
81 |
+
@app.get("/")
def root():
    """Return a static status message confirming the service is reachable."""
    status = {"message": "Gemini Multi-Speaker TTS is running!"}
    return status
|