Athspi committed on
Commit
5875fa2
·
verified ·
1 Parent(s): 01919eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -34
app.py CHANGED
@@ -12,7 +12,7 @@ API_KEY = os.getenv("GEMINI_API_KEY")
12
  if not API_KEY:
13
  raise ValueError("Missing GEMINI_API_KEY in .env")
14
 
15
- # Base URL with key=… param
16
  BASE_URL = (
17
  "https://generativelanguage.googleapis.com/"
18
  "v1beta/models/gemini-2.5-flash-preview-tts:"
@@ -20,28 +20,32 @@ BASE_URL = (
20
  f"?key={API_KEY}"
21
  )
22
 
23
- app = FastAPI()
24
 
25
  def save_wav(path: str, pcm: bytes, channels=1, rate=24000, width=2):
26
- """Write raw PCM bytes to a WAV file."""
27
  with wave.open(path, "wb") as wf:
28
  wf.setnchannels(channels)
29
  wf.setsampwidth(width)
30
  wf.setframerate(rate)
31
  wf.writeframes(pcm)
32
 
 
 
 
 
33
  @app.post("/single_tts")
34
- def single_tts(
35
- prompt: str = Form(...),
36
- voice_name: str = Form(...)
37
- ):
38
  payload = {
39
  "model": "gemini-2.5-flash-preview-tts",
40
  "contents": [{"parts": [{"text": prompt}]}],
41
- "generationConfig": {"responseMimeType": "audio/wav"},
42
- "responseModality": "AUDIO",
43
- "speechConfig": {
44
- "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice_name}}
 
 
 
 
45
  }
46
  }
47
 
@@ -51,31 +55,40 @@ def single_tts(
51
 
52
  data_b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
53
  pcm = base64.b64decode(data_b64)
54
- out = "single_output.wav"
55
- save_wav(out, pcm)
56
- return FileResponse(out, media_type="audio/wav", filename=out)
 
57
 
58
  @app.post("/multi_tts")
59
  def multi_tts(
60
  prompt: str = Form(...),
61
- speaker1: str = Form(...),
62
- voice1: str = Form(...),
63
- speaker2: str = Form(...),
64
- voice2: str = Form(...)
65
  ):
66
  payload = {
67
  "model": "gemini-2.5-flash-preview-tts",
68
  "contents": [{"parts": [{"text": prompt}]}],
69
- "generationConfig": {"responseMimeType": "audio/wav"},
70
- "responseModality": "AUDIO",
71
- "speechConfig": {
72
- "multiSpeakerVoiceConfig": {
73
- "speakerVoiceConfigs": [
74
- {"speaker": speaker1,
75
- "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice1}}},
76
- {"speaker": speaker2,
77
- "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice2}}},
78
- ]
 
 
 
 
 
 
 
 
 
 
79
  }
80
  }
81
  }
@@ -86,10 +99,7 @@ def multi_tts(
86
 
87
  data_b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
88
  pcm = base64.b64decode(data_b64)
89
- out = "multi_output.wav"
90
- save_wav(out, pcm)
91
- return FileResponse(out, media_type="audio/wav", filename=out)
92
 
93
- @app.get("/")
94
- def health():
95
- return {"status": "Gemini TTS FastAPI up and running!"}
 
12
  if not API_KEY:
13
  raise ValueError("Missing GEMINI_API_KEY in .env")
14
 
15
+ # REST endpoint with API key as query param
16
  BASE_URL = (
17
  "https://generativelanguage.googleapis.com/"
18
  "v1beta/models/gemini-2.5-flash-preview-tts:"
 
20
  f"?key={API_KEY}"
21
  )
22
 
23
+ app = FastAPI(title="Gemini TTS Space")
24
 
25
  def save_wav(path: str, pcm: bytes, channels=1, rate=24000, width=2):
 
26
  with wave.open(path, "wb") as wf:
27
  wf.setnchannels(channels)
28
  wf.setsampwidth(width)
29
  wf.setframerate(rate)
30
  wf.writeframes(pcm)
31
 
32
+ @app.get("/")
33
+ def health():
34
+ return {"status": "Gemini TTS Space is live!"}
35
+
36
  @app.post("/single_tts")
37
+ def single_tts(prompt: str = Form(...), voice_name: str = Form(...)):
 
 
 
38
  payload = {
39
  "model": "gemini-2.5-flash-preview-tts",
40
  "contents": [{"parts": [{"text": prompt}]}],
41
+ "generationConfig": {
42
+ "responseMimeType": "audio/wav",
43
+ "responseModalities": ["AUDIO"],
44
+ "speechConfig": {
45
+ "voiceConfig": {
46
+ "prebuiltVoiceConfig": {"voiceName": voice_name}
47
+ }
48
+ }
49
  }
50
  }
51
 
 
55
 
56
  data_b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
57
  pcm = base64.b64decode(data_b64)
58
+
59
+ out_path = "single_output.wav"
60
+ save_wav(out_path, pcm)
61
+ return FileResponse(out_path, media_type="audio/wav", filename=out_path)
62
 
63
  @app.post("/multi_tts")
64
  def multi_tts(
65
  prompt: str = Form(...),
66
+ speaker1: str = Form(...), voice1: str = Form(...),
67
+ speaker2: str = Form(...), voice2: str = Form(...)
 
 
68
  ):
69
  payload = {
70
  "model": "gemini-2.5-flash-preview-tts",
71
  "contents": [{"parts": [{"text": prompt}]}],
72
+ "generationConfig": {
73
+ "responseMimeType": "audio/wav",
74
+ "responseModalities": ["AUDIO"],
75
+ "speechConfig": {
76
+ "multiSpeakerVoiceConfig": {
77
+ "speakerVoiceConfigs": [
78
+ {
79
+ "speaker": speaker1,
80
+ "voiceConfig": {
81
+ "prebuiltVoiceConfig": {"voiceName": voice1}
82
+ }
83
+ },
84
+ {
85
+ "speaker": speaker2,
86
+ "voiceConfig": {
87
+ "prebuiltVoiceConfig": {"voiceName": voice2}
88
+ }
89
+ }
90
+ ]
91
+ }
92
  }
93
  }
94
  }
 
99
 
100
  data_b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
101
  pcm = base64.b64decode(data_b64)
 
 
 
102
 
103
+ out_path = "multi_output.wav"
104
+ save_wav(out_path, pcm)
105
+ return FileResponse(out_path, media_type="audio/wav", filename=out_path)