Athspi committed on
Commit
a71d68c
·
verified ·
1 Parent(s): 5875fa2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -14
app.py CHANGED
@@ -12,7 +12,7 @@ API_KEY = os.getenv("GEMINI_API_KEY")
12
  if not API_KEY:
13
  raise ValueError("Missing GEMINI_API_KEY in .env")
14
 
15
- # REST endpoint with API key as query param
16
  BASE_URL = (
17
  "https://generativelanguage.googleapis.com/"
18
  "v1beta/models/gemini-2.5-flash-preview-tts:"
@@ -23,6 +23,7 @@ BASE_URL = (
23
  app = FastAPI(title="Gemini TTS Space")
24
 
25
  def save_wav(path: str, pcm: bytes, channels=1, rate=24000, width=2):
 
26
  with wave.open(path, "wb") as wf:
27
  wf.setnchannels(channels)
28
  wf.setsampwidth(width)
@@ -34,12 +35,15 @@ def health():
34
  return {"status": "Gemini TTS Space is live!"}
35
 
36
  @app.post("/single_tts")
37
- def single_tts(prompt: str = Form(...), voice_name: str = Form(...)):
 
 
 
 
38
  payload = {
39
  "model": "gemini-2.5-flash-preview-tts",
40
  "contents": [{"parts": [{"text": prompt}]}],
41
- "generationConfig": {
42
- "responseMimeType": "audio/wav",
43
  "responseModalities": ["AUDIO"],
44
  "speechConfig": {
45
  "voiceConfig": {
@@ -53,12 +57,12 @@ def single_tts(prompt: str = Form(...), voice_name: str = Form(...)):
53
  if resp.status_code != 200:
54
  return JSONResponse(status_code=resp.status_code, content=resp.json())
55
 
 
56
  data_b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
57
  pcm = base64.b64decode(data_b64)
58
-
59
- out_path = "single_output.wav"
60
- save_wav(out_path, pcm)
61
- return FileResponse(out_path, media_type="audio/wav", filename=out_path)
62
 
63
  @app.post("/multi_tts")
64
  def multi_tts(
@@ -69,8 +73,7 @@ def multi_tts(
69
  payload = {
70
  "model": "gemini-2.5-flash-preview-tts",
71
  "contents": [{"parts": [{"text": prompt}]}],
72
- "generationConfig": {
73
- "responseMimeType": "audio/wav",
74
  "responseModalities": ["AUDIO"],
75
  "speechConfig": {
76
  "multiSpeakerVoiceConfig": {
@@ -99,7 +102,6 @@ def multi_tts(
99
 
100
  data_b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
101
  pcm = base64.b64decode(data_b64)
102
-
103
- out_path = "multi_output.wav"
104
- save_wav(out_path, pcm)
105
- return FileResponse(out_path, media_type="audio/wav", filename=out_path)
 
12
  if not API_KEY:
13
  raise ValueError("Missing GEMINI_API_KEY in .env")
14
 
15
+ # REST endpoint (API key as query param)
16
  BASE_URL = (
17
  "https://generativelanguage.googleapis.com/"
18
  "v1beta/models/gemini-2.5-flash-preview-tts:"
 
23
  app = FastAPI(title="Gemini TTS Space")
24
 
25
  def save_wav(path: str, pcm: bytes, channels=1, rate=24000, width=2):
26
+ """Write raw PCM bytes to a WAV file."""
27
  with wave.open(path, "wb") as wf:
28
  wf.setnchannels(channels)
29
  wf.setsampwidth(width)
 
35
  return {"status": "Gemini TTS Space is live!"}
36
 
37
  @app.post("/single_tts")
38
+ def single_tts(
39
+ prompt: str = Form(...),
40
+ voice_name: str = Form(...)
41
+ ):
42
+ # Build payload with all TTS settings under `config`
43
  payload = {
44
  "model": "gemini-2.5-flash-preview-tts",
45
  "contents": [{"parts": [{"text": prompt}]}],
46
+ "config": {
 
47
  "responseModalities": ["AUDIO"],
48
  "speechConfig": {
49
  "voiceConfig": {
 
57
  if resp.status_code != 200:
58
  return JSONResponse(status_code=resp.status_code, content=resp.json())
59
 
60
+ # Decode and save
61
  data_b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
62
  pcm = base64.b64decode(data_b64)
63
+ out = "single_output.wav"
64
+ save_wav(out, pcm)
65
+ return FileResponse(out, media_type="audio/wav", filename=out)
 
66
 
67
  @app.post("/multi_tts")
68
  def multi_tts(
 
73
  payload = {
74
  "model": "gemini-2.5-flash-preview-tts",
75
  "contents": [{"parts": [{"text": prompt}]}],
76
+ "config": {
 
77
  "responseModalities": ["AUDIO"],
78
  "speechConfig": {
79
  "multiSpeakerVoiceConfig": {
 
102
 
103
  data_b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
104
  pcm = base64.b64decode(data_b64)
105
+ out = "multi_output.wav"
106
+ save_wav(out, pcm)
107
+ return FileResponse(out, media_type="audio/wav", filename=out)