Athspi committed on
Commit
405a073
·
verified ·
1 Parent(s): c9eaebb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +186 -61
app.py CHANGED
@@ -1,71 +1,196 @@
 
 
 
 
1
  import os
2
  import wave
 
3
  import gradio as gr
4
- import google.generativeai as genai
5
-
6
- # Set your API Key (or via Hugging Face Secrets / os.environ)
7
- GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
8
-
9
- if not GOOGLE_API_KEY:
10
- raise ValueError("Please set your GOOGLE_API_KEY environment variable.")
11
-
12
- # Configure Generative AI
13
- genai.configure(api_key=GOOGLE_API_KEY)
14
-
15
- # Initialize Gemini TTS model
16
- model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-tts")
17
-
18
- # Function to save raw PCM data to WAV file
19
- def save_wave(filename, pcm_data, channels=1, rate=24000, sample_width=2):
20
- with wave.open(filename, 'wb') as wf:
21
- wf.setnchannels(channels)
22
- wf.setsampwidth(sample_width)
23
- wf.setframerate(rate)
24
- wf.writeframes(pcm_data)
25
-
26
- # Function to handle TTS generation
27
- def generate_tts(text):
28
- if not text.strip():
29
- return None, "Please enter some text."
30
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  try:
32
- response = model.generate_content(
33
- text,
34
- generation_config={"response_mime_type": "audio/wav"},
35
- response_modality="AUDIO"
36
- )
37
-
38
- # Extract audio data from response
39
- audio_data = response.candidates[0].content.parts[0].inline_data.data
40
-
41
- output_filename = "output.wav"
42
- save_wave(output_filename, audio_data)
43
-
44
- return output_filename, "Audio generated successfully!"
45
-
46
  except Exception as e:
47
- return None, f"Error: {str(e)}"
48
-
49
- # Gradio Interface
50
- with gr.Blocks() as demo:
51
- gr.Markdown("## πŸŽ™οΈ Gemini 2.5 Text-to-Speech Demo")
52
-
53
- with gr.Row():
54
- text_input = gr.Textbox(label="Enter text to convert to speech")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- with gr.Row():
57
- submit_button = gr.Button("Generate Speech")
58
 
59
- with gr.Row():
60
- audio_output = gr.Audio(label="Generated Audio", type="filepath")
61
- status_output = gr.Textbox(label="Status")
62
 
63
- submit_button.click(
64
- fn=generate_tts,
65
- inputs=[text_input],
66
- outputs=[audio_output, status_output]
67
- )
68
 
69
- # Launch Gradio app
70
  if __name__ == "__main__":
71
- demo.launch()
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.responses import FileResponse
3
+ from google import genai
4
+ from google.genai import types
5
  import os
6
  import wave
7
+ import tempfile
8
  import gradio as gr
9
+ import uvicorn
10
+ from typing import Optional
11
+ from pydantic import BaseModel
12
+
13
+ # ─── Configuration ────────────────────────────────────────────────────────────
14
class Config:
    """Static settings shared by the HTTP API and the Gradio UI."""

    # One slot per environment variable, in order. A variable that is not
    # set contributes None, so consumers must skip falsy entries.
    API_KEYS = [
        os.getenv(env_name)
        for env_name in ("GOOGLE_API_KEY_1", "GOOGLE_API_KEY_2")
    ]

    # Model identifiers offered to callers.
    MODEL_CHOICES = [
        "gemini-2.5-flash-preview-tts",
        "gemini-2.5-pro-preview-tts",
    ]

    # Prebuilt voice names offered to callers, in alphabetical order.
    VOICE_NAMES = [
        "Achernar",
        "Achird",
        "Algenib",
        "Algieba",
        "Alnilam",
        "Aoede",
        "Autonoe",
        "Callirrhoe",
        "Charon",
        "Despina",
        "Enceladus",
        "Erinome",
        "Fenrir",
        "Gacrux",
        "Iapetus",
        "Kore",
        "Laomedeia",
        "Leda",
        "Orus",
        "Puck",
        "Pulcherrima",
        "Rasalgethi",
        "Sadachbia",
        "Sadaltager",
        "Schedar",
        "Sulafat",
        "Umbriel",
        "Vindemiatrix",
        "Zephyr",
        "Zubenelgenubi",
    ]
26
+
27
+ # ─── API Models ───────────────────────────────────────────────────────────────
28
class TTSRequest(BaseModel):
    """Request payload describing one text-to-speech generation."""

    # Text to convert to speech (required).
    text: str
    # Model identifier; defaults to the first entry of Config.MODEL_CHOICES.
    model: str = Config.MODEL_CHOICES[0]
    # Prebuilt voice name; defaults to the first entry of Config.VOICE_NAMES.
    voice_name: str = Config.VOICE_NAMES[0]
    # Optional caller-supplied API key; None when the caller provides none.
    api_key: Optional[str] = None
33
+
34
+ # ─── Core TTS Engine ──────────────────────────────────────────────────────────
35
class TTSEngine:
    """Generate speech audio through Gemini, trying several API keys in turn.

    Clients for the keys listed in ``Config.API_KEYS`` are created once at
    construction time and kept in ``self.clients`` (name -> client).
    """

    def __init__(self):
        self.clients = {}
        self.init_clients()

    def init_clients(self):
        """Create one client per configured (non-empty) API key.

        A key whose client cannot be constructed is reported on stdout and
        skipped, so one bad key does not prevent startup.
        """
        for idx, key in enumerate(Config.API_KEYS):
            if not key:
                continue
            try:
                self.clients[f"client_{idx}"] = genai.Client(api_key=key)
            except Exception as e:
                print(f"Failed to initialize client with key {idx}: {str(e)}")

    def generate_audio(self, request: "TTSRequest") -> str:
        """Synthesize ``request.text`` and return the path of a WAV file.

        The caller-supplied key (if any) is tried first, then every
        pre-configured client. The first successful API response is the one
        that gets converted to audio, so each request costs at most one
        successful API call.

        Raises:
            HTTPException: status 500 when no key produces a response or
                the response carries no audio payload.
        """
        candidates = []
        if request.api_key:
            try:
                candidates.append(genai.Client(api_key=request.api_key))
            except Exception as e:
                print(f"Failed to initialize client with user-provided key: {e}")
        candidates.extend(self.clients.values())

        last_error = None
        for client in candidates:
            try:
                response = self._request_speech(client, request)
            except Exception as e:
                # Remember why this key failed and move on to the next one.
                last_error = e
            else:
                break
        else:
            # Reached when there were no candidates or every one of them failed.
            detail = str(last_error) if last_error else "No working API keys available"
            raise HTTPException(status_code=500, detail=detail)

        try:
            return self.create_wave_file(self._extract_pcm(response))
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @staticmethod
    def _request_speech(client, request):
        """Issue a single audio-only generate_content call for ``request``."""
        voice_config = types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                voice_name=request.voice_name
            )
        )
        return client.models.generate_content(
            model=request.model,
            contents=request.text,
            config=types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(voice_config=voice_config),
            ),
        )

    @staticmethod
    def _extract_pcm(response):
        """Return the inline audio bytes of the first part of the first candidate.

        Raises:
            ValueError: when the response has no candidate, part or payload.
        """
        candidates = response.candidates
        if candidates and candidates[0].content and candidates[0].content.parts:
            inline = candidates[0].content.parts[0].inline_data
            if inline is not None and inline.data:
                return inline.data
        raise ValueError("Invalid response format from API")

    @staticmethod
    def create_wave_file(pcm_bytes: bytes) -> str:
        """Write ``pcm_bytes`` to a new temporary WAV file and return its path.

        The audio is written as mono, 2 bytes per sample, 24000 Hz. The file
        is not deleted automatically; removing it is up to the caller.
        """
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        try:
            # Hand the already-open handle to ``wave`` instead of reopening the
            # file by name: reopening an open temporary file fails on Windows.
            # The wave writer is closed first (patching the header), then the
            # underlying file.
            with tmp_file, wave.open(tmp_file, "wb") as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)
                wf.setframerate(24000)
                wf.writeframes(pcm_bytes)
        except Exception:
            # Do not leave a half-written file behind when writing fails.
            os.unlink(tmp_file.name)
            raise
        return tmp_file.name
118
+
119
+ # ─── FastAPI App ──────────────────────────────────────────────────────────────
120
app = FastAPI(title="Gemini TTS API", version="1.0")
engine = TTSEngine()


@app.post("/generate/")
def generate_tts(request: TTSRequest):
    """Generate speech for ``request`` and return it as an ``audio/wav`` file.

    Declared with plain ``def`` on purpose: ``engine.generate_audio`` makes a
    blocking call, and FastAPI runs non-async handlers in a threadpool
    instead of on the event loop.

    Raises:
        HTTPException: propagated unchanged from the engine; any other
            failure is reported as status 500 with the error text.
    """
    try:
        audio_path = engine.generate_audio(request)
        return FileResponse(audio_path, media_type="audio/wav")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
132
+
133
+ # ─── Gradio Interface ─────────────────────────────────────────────────────────
134
def create_gradio_interface():
    """Build and return the Gradio Blocks UI for the TTS generator.

    The left column holds model, voice and optional API key; the right column
    holds the text input, the trigger button, the audio player and an error
    box. The error box starts hidden and is revealed only when a generation
    attempt fails.
    """
    with gr.Blocks(title="Gemini TTS", theme=gr.themes.Soft()) as interface:
        gr.Markdown("## 🎀 Gemini TTS Voice Generator")

        with gr.Row():
            with gr.Column():
                model_choice = gr.Dropdown(
                    choices=Config.MODEL_CHOICES,
                    value=Config.MODEL_CHOICES[0],
                    label="Model"
                )
                voice_name = gr.Dropdown(
                    choices=Config.VOICE_NAMES,
                    value=Config.VOICE_NAMES[0],
                    label="Voice"
                )
                api_key = gr.Textbox(
                    label="API Key (optional)",
                    type="password",
                    placeholder="Leave empty to use configured keys"
                )

            with gr.Column():
                text_input = gr.Textbox(
                    label="Text to Convert",
                    lines=5,
                    placeholder="Enter text here..."
                )
                generate_btn = gr.Button("Generate Speech")
                audio_output = gr.Audio(label="Generated Audio")
                error_output = gr.Textbox(label="Error", visible=False)

        def generate(text, model, voice, key):
            """Return (audio path, error-box update) for one button click."""
            # Reject blank input locally instead of spending an API call on it.
            if not text or not text.strip():
                return None, gr.update(value="Please enter some text.", visible=True)
            try:
                request = TTSRequest(text=text, model=model, voice_name=voice, api_key=key or None)
                audio_path = engine.generate_audio(request)
            except Exception as e:
                # Prefer an HTTPException's ``detail`` when present; reveal the
                # error box so the failure is actually shown to the user.
                message = getattr(e, "detail", None) or str(e)
                return None, gr.update(value=str(message), visible=True)
            # Success: clear and hide any error left over from an earlier attempt.
            return audio_path, gr.update(value="", visible=False)

        generate_btn.click(
            generate,
            inputs=[text_input, model_choice, voice_name, api_key],
            outputs=[audio_output, error_output]
        )

    return interface
 
181
 
182
+ # ─── Deployment Options ───────────────────────────────────────────────────────
183
gradio_app = create_gradio_interface()

# For Hugging Face Spaces: serve the Gradio UI at the root of the FastAPI app.
app = gr.mount_gradio_app(app, gradio_app, path="/")

# For local development
if __name__ == "__main__":
    # The import string must name this module (app.py), hence "app:app".
    # ``workers`` is intentionally not passed: uvicorn does not support
    # multiple workers together with ``reload=True``.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
    )