Spaces:

Athspi-aitools
/

Aittsg

Running

App Files Files Community

Athspi committed on Jun 26

Commit

5875fa2

verified ·

1 Parent(s): 01919eb

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -34

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ API_KEY = os.getenv("GEMINI_API_KEY")
 if not API_KEY:
     raise ValueError("Missing GEMINI_API_KEY in .env")
-# Base URL with key=… param
 BASE_URL = (
     "https://generativelanguage.googleapis.com/"
     "v1beta/models/gemini-2.5-flash-preview-tts:"
@@ -20,28 +20,32 @@ BASE_URL = (
     f"?key={API_KEY}"
 )
-app = FastAPI()
 def save_wav(path: str, pcm: bytes, channels=1, rate=24000, width=2):
-    """Write raw PCM bytes to a WAV file."""
     with wave.open(path, "wb") as wf:
         wf.setnchannels(channels)
         wf.setsampwidth(width)
         wf.setframerate(rate)
         wf.writeframes(pcm)
 @app.post("/single_tts")
-def single_tts(
-    prompt: str = Form(...),
-    voice_name: str = Form(...)
-):
     payload = {
         "model": "gemini-2.5-flash-preview-tts",
         "contents": [{"parts": [{"text": prompt}]}],
-        "generationConfig": {"responseMimeType": "audio/wav"},
-        "responseModality": "AUDIO",
-        "speechConfig": {
-            "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice_name}}
         }
     }
@@ -51,31 +55,40 @@ def single_tts(
     data_b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
     pcm = base64.b64decode(data_b64)
-    out = "single_output.wav"
-    save_wav(out, pcm)
-    return FileResponse(out, media_type="audio/wav", filename=out)
 @app.post("/multi_tts")
 def multi_tts(
     prompt: str = Form(...),
-    speaker1: str = Form(...),
-    voice1: str = Form(...),
-    speaker2: str = Form(...),
-    voice2: str = Form(...)
 ):
     payload = {
         "model": "gemini-2.5-flash-preview-tts",
         "contents": [{"parts": [{"text": prompt}]}],
-        "generationConfig": {"responseMimeType": "audio/wav"},
-        "responseModality": "AUDIO",
-        "speechConfig": {
-            "multiSpeakerVoiceConfig": {
-                "speakerVoiceConfigs": [
-                    {"speaker": speaker1,
-                     "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice1}}},
-                    {"speaker": speaker2,
-                     "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice2}}},
-                ]
             }
         }
     }
@@ -86,10 +99,7 @@ def multi_tts(
     data_b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
     pcm = base64.b64decode(data_b64)
-    out = "multi_output.wav"
-    save_wav(out, pcm)
-    return FileResponse(out, media_type="audio/wav", filename=out)
-@app.get("/")
-def health():
-    return {"status": "Gemini TTS FastAPI up and running!"}

 if not API_KEY:
     raise ValueError("Missing GEMINI_API_KEY in .env")
+# REST endpoint with API key as query param
 BASE_URL = (
     "https://generativelanguage.googleapis.com/"
     "v1beta/models/gemini-2.5-flash-preview-tts:"
     f"?key={API_KEY}"
 )
+app = FastAPI(title="Gemini TTS Space")
 def save_wav(path: str, pcm: bytes, channels=1, rate=24000, width=2):
     """Write raw PCM audio bytes to ``path`` as an uncompressed WAV file.

     ``channels``, ``width`` (sample width in bytes) and ``rate`` (frames
     per second) are stored in the WAV header; ``pcm`` is written as the
     frame data without modification.
     """
     # Frame count is left at 0 here; the wave module fixes up the header
     # from the data actually written when the file is closed.
     header = (channels, width, rate, 0, "NONE", "not compressed")
     with wave.open(path, "wb") as wav_out:
         wav_out.setparams(header)
         wav_out.writeframes(pcm)
+@app.get("/")
+def health():
+    return {"status": "Gemini TTS Space is live!"}
 @app.post("/single_tts")
+def single_tts(prompt: str = Form(...), voice_name: str = Form(...)):
     payload = {
         "model": "gemini-2.5-flash-preview-tts",
         "contents": [{"parts": [{"text": prompt}]}],
+        "generationConfig": {
+            "responseMimeType": "audio/wav",
+            "responseModalities": ["AUDIO"],
+            "speechConfig": {
+                "voiceConfig": {
+                    "prebuiltVoiceConfig": {"voiceName": voice_name}
+                }
+            }
         }
     }
     data_b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
     pcm = base64.b64decode(data_b64)
+    out_path = "single_output.wav"
+    save_wav(out_path, pcm)
+    return FileResponse(out_path, media_type="audio/wav", filename=out_path)
 @app.post("/multi_tts")
 def multi_tts(
     prompt: str = Form(...),
+    speaker1: str = Form(...), voice1: str = Form(...),
+    speaker2: str = Form(...), voice2: str = Form(...)
 ):
     payload = {
         "model": "gemini-2.5-flash-preview-tts",
         "contents": [{"parts": [{"text": prompt}]}],
+        "generationConfig": {
+            "responseMimeType": "audio/wav",
+            "responseModalities": ["AUDIO"],
+            "speechConfig": {
+                "multiSpeakerVoiceConfig": {
+                    "speakerVoiceConfigs": [
+                        {
+                            "speaker": speaker1,
+                            "voiceConfig": {
+                                "prebuiltVoiceConfig": {"voiceName": voice1}
+                            }
+                        },
+                        {
+                            "speaker": speaker2,
+                            "voiceConfig": {
+                                "prebuiltVoiceConfig": {"voiceName": voice2}
+                            }
+                        }
+                    ]
+                }
             }
         }
     }
     data_b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
     pcm = base64.b64decode(data_b64)
+    out_path = "multi_output.wav"
+    save_wav(out_path, pcm)
+    return FileResponse(out_path, media_type="audio/wav", filename=out_path)