Athspi committed on
Commit
7f0d329
·
verified ·
1 Parent(s): 514466d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -26
app.py CHANGED
@@ -1,61 +1,116 @@
1
- from fastapi import FastAPI, HTTPException
2
- from pydantic import BaseModel
3
- from typing import Dict
4
  import os
5
  import requests
6
  import wave
7
  import base64
 
 
8
  from dotenv import load_dotenv
9
 
 
10
  load_dotenv()
11
-
12
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 
13
  if not GEMINI_API_KEY:
14
- raise ValueError("Missing GEMINI_API_KEY")
15
 
 
16
  API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent"
17
 
18
  app = FastAPI()
19
 
20
- class TTSRequest(BaseModel):
21
- prompt: str
22
- speakers: Dict[str, str] # e.g. { "Joe": "Kore", "Jane": "Puck" }
23
-
24
- def wave_file(filename, pcm_data, channels=1, rate=24000, sample_width=2):
25
  with wave.open(filename, "wb") as wf:
26
  wf.setnchannels(channels)
27
  wf.setsampwidth(sample_width)
28
  wf.setframerate(rate)
29
  wf.writeframes(pcm_data)
30
 
31
- @app.post("/dynamic_multispeaker_tts")
32
- def dynamic_multispeaker_tts(request: TTSRequest):
 
 
 
 
33
  headers = {
34
  "Content-Type": "application/json",
35
  "Authorization": f"Bearer {GEMINI_API_KEY}"
36
  }
37
 
38
- # Build speaker voice config list dynamically
39
- speaker_voice_configs = []
40
- for speaker, voice_name in request.speakers.items():
41
- speaker_voice_configs.append({
42
- "speaker": speaker,
 
 
43
  "voiceConfig": {
44
  "prebuiltVoiceConfig": {
45
  "voiceName": voice_name
46
  }
47
  }
48
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  payload = {
51
- "contents": [{"parts": [{"text": request.prompt}]}],
52
  "generationConfig": {
53
  "responseMimeType": "audio/wav"
54
  },
55
  "responseModality": "AUDIO",
56
  "speechConfig": {
57
  "multiSpeakerVoiceConfig": {
58
- "speakerVoiceConfigs": speaker_voice_configs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  }
60
  }
61
  }
@@ -63,21 +118,23 @@ def dynamic_multispeaker_tts(request: TTSRequest):
63
  response = requests.post(API_URL, headers=headers, json=payload)
64
 
65
  if response.status_code != 200:
66
- raise HTTPException(status_code=500, detail=response.text)
67
 
68
  result = response.json()
 
69
  try:
70
  audio_data_b64 = result["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
71
  audio_data = base64.b64decode(audio_data_b64)
72
 
73
- output_path = "output.wav"
74
- wave_file(output_path, audio_data)
75
 
76
- return {"message": "Audio generated successfully. Download 'output.wav'."}
77
 
78
  except Exception as e:
79
- raise HTTPException(status_code=500, detail=str(e))
80
 
 
81
  @app.get("/")
82
  def root():
83
- return {"message": "Gemini Multi-Speaker TTS is running!"}
 
 
 
 
1
  import os
2
  import requests
3
  import wave
4
  import base64
5
+ from fastapi import FastAPI, Form
6
+ from fastapi.responses import FileResponse, JSONResponse
7
  from dotenv import load_dotenv
8
 
9
# Pull settings from a local .env file (if one exists) before reading them.
load_dotenv()

# Fail fast at import time: every TTS endpoint needs this key, so there is no
# point starting the server without it.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if GEMINI_API_KEY in (None, ""):
    raise ValueError("Missing GEMINI_API_KEY in .env")

# generateContent endpoint of the Gemini TTS preview model.
API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent"

# ASGI application object that the route decorators below attach to.
app = FastAPI()
20
 
21
+ # Utility to save PCM audio as a WAV file
22
def save_wav(filename, pcm_data, channels=1, rate=24000, sample_width=2):
    """Wrap raw PCM frames in a WAV container and write them to ``filename``.

    Args:
        filename: Destination handed straight to ``wave.open`` in write mode.
        pcm_data: Raw PCM bytes to store as the audio frames.
        channels: Number of audio channels recorded in the header.
        rate: Sampling rate in Hz recorded in the header.
        sample_width: Bytes per sample recorded in the header.
    """
    writer = wave.open(filename, "wb")
    try:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname);
        # nframes is corrected by the writer once the frames are written.
        writer.setparams(
            (channels, sample_width, rate, 0, "NONE", "not compressed")
        )
        writer.writeframes(pcm_data)
    finally:
        # Closing finalizes the header sizes even if writing failed midway.
        writer.close()
28
 
29
+ # ✅ Single speaker TTS endpoint
30
# Upper bound (seconds) on one Gemini call so a stalled upstream request
# cannot hold a worker thread forever.
_GEMINI_TIMEOUT_SECONDS = 60


def _voice_config(voice_name):
    """Build the ``voiceConfig`` fragment selecting a prebuilt Gemini voice."""
    return {"prebuiltVoiceConfig": {"voiceName": voice_name}}


def _synthesize(prompt, speech_config, download_name):
    """Send ``prompt`` to Gemini TTS and convert the reply to an HTTP response.

    Args:
        prompt: Text to be spoken.
        speech_config: Value for ``generationConfig.speechConfig``.
        download_name: File name advertised in ``Content-Disposition``.

    Returns:
        An ``audio/wav`` response on success, otherwise a ``JSONResponse``
        with status 500 and an ``{"error": ...}`` body.
    """
    # Local imports keep this block self-contained.
    import io

    from fastapi.responses import Response

    headers = {
        "Content-Type": "application/json",
        # API keys are passed in x-goog-api-key; "Authorization: Bearer" is
        # reserved for OAuth access tokens and is rejected for plain keys.
        "x-goog-api-key": GEMINI_API_KEY,
    }
    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        # Audio output is requested through responseModalities, and the voice
        # settings must be nested inside generationConfig.
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": speech_config,
        },
    }

    try:
        response = requests.post(
            API_URL,
            headers=headers,
            json=payload,
            timeout=_GEMINI_TIMEOUT_SECONDS,
        )
    except requests.RequestException as exc:
        # Network failure or timeout: report it like any other upstream error.
        return JSONResponse(status_code=500, content={"error": str(exc)})

    if response.status_code != 200:
        return JSONResponse(status_code=500, content={"error": response.text})

    try:
        result = response.json()
        audio_data_b64 = result["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
        audio_data = base64.b64decode(audio_data_b64)

        # Build the WAV in memory instead of a fixed file name on disk, so
        # concurrent requests cannot overwrite each other's audio.
        buffer = io.BytesIO()
        save_wav(buffer, audio_data)
    except Exception as e:
        # Request boundary: surface malformed or unexpected replies to the
        # client as a JSON error instead of an unhandled exception.
        return JSONResponse(status_code=500, content={"error": str(e)})

    return Response(
        content=buffer.getvalue(),
        media_type="audio/wav",
        headers={
            "Content-Disposition": f'attachment; filename="{download_name}"',
        },
    )


# Single-speaker TTS endpoint
@app.post("/single_tts")
def single_tts(
    prompt: str = Form(...),
    voice_name: str = Form(...),
):
    """Speak ``prompt`` with one prebuilt voice and return it as a WAV download.

    Form fields:
        prompt: Text to synthesize.
        voice_name: Name of the prebuilt Gemini voice to use.
    """
    speech_config = {"voiceConfig": _voice_config(voice_name)}
    return _synthesize(prompt, speech_config, "single_output.wav")


# Multi-speaker TTS endpoint
@app.post("/multi_tts")
def multi_tts(
    prompt: str = Form(...),
    speaker1_name: str = Form(...),
    speaker1_voice: str = Form(...),
    speaker2_name: str = Form(...),
    speaker2_voice: str = Form(...),
):
    """Speak a two-speaker ``prompt`` and return it as a WAV download.

    Form fields:
        prompt: Text to synthesize.
        speaker1_name / speaker2_name: Speaker labels sent as ``speaker``;
            presumably they must match the names used in the prompt text.
        speaker1_voice / speaker2_voice: Prebuilt voice for each speaker.
    """
    speakers = (
        (speaker1_name, speaker1_voice),
        (speaker2_name, speaker2_voice),
    )
    speech_config = {
        "multiSpeakerVoiceConfig": {
            "speakerVoiceConfigs": [
                {"speaker": name, "voiceConfig": _voice_config(voice)}
                for name, voice in speakers
            ]
        }
    }
    return _synthesize(prompt, speech_config, "multi_output.wav")
136
 
137
# Liveness route
@app.get("/")
def root():
    """Return a fixed JSON message showing that the service is running."""
    status = {"message": "Gemini TTS API is live!"}
    return status