Athspi committed on
Commit
515f8f3
·
verified ·
1 Parent(s): 25a22ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -171
app.py CHANGED
@@ -1,196 +1,133 @@
1
- from fastapi import FastAPI, HTTPException
2
- from fastapi.responses import FileResponse
 
3
  from google import genai
4
  from google.genai import types
5
- import os
6
  import wave
7
- import tempfile
8
- import gradio as gr
9
- import uvicorn
10
  from typing import Optional
11
  from pydantic import BaseModel
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # ─── Configuration ────────────────────────────────────────────────────────────
14
- class Config:
15
- API_KEYS = [
16
- os.getenv("GOOGLE_API_KEY_1"),
17
- os.getenv("GOOGLE_API_KEY_2")
18
- ]
19
- MODEL_CHOICES = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]
20
- VOICE_NAMES = [
21
- "Achernar", "Achird", "Algenib", "Algieba", "Alnilam", "Aoede", "Autonoe",
22
- "Callirrhoe", "Charon", "Despina", "Enceladus", "Erinome", "Fenrir", "Gacrux",
23
- "Iapetus", "Kore", "Laomedeia", "Leda", "Orus", "Puck", "Pulcherrima", "Rasalgethi",
24
- "Sadachbia", "Sadaltager", "Schedar", "Sulafat", "Umbriel", "Vindemiatrix", "Zephyr", "Zubenelgenubi"
25
- ]
26
 
27
- # ─── API Models ───────────────────────────────────────────────────────────────
28
  class TTSRequest(BaseModel):
29
  text: str
30
- model: str = Config.MODEL_CHOICES[0]
31
- voice_name: str = Config.VOICE_NAMES[0]
32
- api_key: Optional[str] = None
 
 
33
 
34
- # ─── Core TTS Engine ──────────────────────────────────────────────────────────
35
- class TTSEngine:
36
- def __init__(self):
37
- self.clients = {}
38
- self.init_clients()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- def init_clients(self):
41
- for idx, key in enumerate(Config.API_KEYS):
42
- if key:
43
- try:
44
- self.clients[f"client_{idx}"] = genai.Client(api_key=key)
45
- except Exception as e:
46
- print(f"Failed to initialize client with key {idx}: {str(e)}")
47
 
48
- def generate_audio(self, request: TTSRequest) -> str:
49
- client = None
50
- used_key = request.api_key
51
-
52
- # Try user-provided key first
53
- if used_key:
54
- try:
55
- client = genai.Client(api_key=used_key)
56
- except Exception:
57
- pass
58
-
59
- # Fallback to pre-configured clients
60
- if not client:
61
- for client_name, existing_client in self.clients.items():
62
- try:
63
- response = existing_client.models.generate_content(
64
- model=request.model,
65
- contents=request.text,
66
- config=types.GenerateContentConfig(
67
- response_modalities=["AUDIO"],
68
- speech_config=types.SpeechConfig(
69
- voice_config=types.VoiceConfig(
70
- prebuilt_voice_config=types.PrebuiltVoiceConfig(
71
- voice_name=request.voice_name
72
- )
73
- )
74
- ),
75
- ),
76
- )
77
- client = existing_client
78
- used_key = client_name
79
- break
80
- except Exception:
81
- continue
82
 
83
- if not client:
84
- raise HTTPException(status_code=500, detail="No working API keys available")
85
 
86
- try:
87
- response = client.models.generate_content(
88
- model=request.model,
89
- contents=request.text,
90
- config=types.GenerateContentConfig(
91
- response_modalities=["AUDIO"],
92
- speech_config=types.SpeechConfig(
93
- voice_config=types.VoiceConfig(
94
- prebuilt_voice_config=types.PrebuiltVoiceConfig(
95
- voice_name=request.voice_name
96
- )
97
  )
98
- ),
99
  ),
100
  )
101
-
102
- if response.candidates and response.candidates[0].content.parts:
103
- pcm_data = response.candidates[0].content.parts[0].inline_data.data
104
- return self.create_wave_file(pcm_data)
105
- raise ValueError("Invalid response format from API")
106
- except Exception as e:
107
- raise HTTPException(status_code=500, detail=str(e))
108
-
109
- @staticmethod
110
- def create_wave_file(pcm_bytes: bytes) -> str:
111
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
112
- with wave.open(tmp_file.name, "wb") as wf:
113
- wf.setnchannels(1)
114
- wf.setsampwidth(2)
115
- wf.setframerate(24000)
116
- wf.writeframes(pcm_bytes)
117
- return tmp_file.name
118
-
119
- # ─── FastAPI App ──────────────────────────────────────────────────────────────
120
- app = FastAPI(title="Gemini TTS API", version="1.0")
121
- engine = TTSEngine()
122
-
123
- @app.post("/generate/")
124
- async def generate_tts(request: TTSRequest):
125
- try:
126
- audio_path = engine.generate_audio(request)
127
- return FileResponse(audio_path, media_type="audio/wav")
128
- except HTTPException:
129
- raise
130
- except Exception as e:
131
- raise HTTPException(status_code=500, detail=str(e))
132
-
133
- # ─── Gradio Interface ─────────────────────────────────────────────────────────
134
- def create_gradio_interface():
135
- with gr.Blocks(title="Gemini TTS", theme=gr.themes.Soft()) as interface:
136
- gr.Markdown("## 🎀 Gemini TTS Voice Generator")
137
 
138
- with gr.Row():
139
- with gr.Column():
140
- model_choice = gr.Dropdown(
141
- choices=Config.MODEL_CHOICES,
142
- value=Config.MODEL_CHOICES[0],
143
- label="Model"
144
- )
145
- voice_name = gr.Dropdown(
146
- choices=Config.VOICE_NAMES,
147
- value=Config.VOICE_NAMES[0],
148
- label="Voice"
149
- )
150
- api_key = gr.Textbox(
151
- label="API Key (optional)",
152
- type="password",
153
- placeholder="Leave empty to use configured keys"
154
- )
155
-
156
- with gr.Column():
157
- text_input = gr.Textbox(
158
- label="Text to Convert",
159
- lines=5,
160
- placeholder="Enter text here..."
161
- )
162
- generate_btn = gr.Button("Generate Speech")
163
- audio_output = gr.Audio(label="Generated Audio")
164
- error_output = gr.Textbox(label="Error", visible=False)
165
 
166
- def generate(text, model, voice, key):
167
- try:
168
- request = TTSRequest(text=text, model=model, voice_name=voice, api_key=key or None)
169
- audio_path = engine.generate_audio(request)
170
- return audio_path, ""
171
- except Exception as e:
172
- return None, str(e)
173
 
174
- generate_btn.click(
175
- generate,
176
- inputs=[text_input, model_choice, voice_name, api_key],
177
- outputs=[audio_output, error_output]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  )
179
-
180
- return interface
181
 
182
- # ─── Deployment Options ───────────────────────────────────────────────────────
183
- gradio_app = create_gradio_interface()
 
184
 
185
- # For Hugging Face Spaces
186
- app = gr.mount_gradio_app(app, gradio_app, path="/")
 
 
 
 
 
187
 
188
- # For local development
189
  if __name__ == "__main__":
190
- uvicorn.run(
191
- "main:app",
192
- host="0.0.0.0",
193
- port=8000,
194
- reload=True,
195
- workers=2
196
- )
 
1
+ from fastapi import FastAPI, HTTPException, Request
2
+ from fastapi.responses import FileResponse, JSONResponse
3
+ from fastapi.staticfiles import StaticFiles
4
  from google import genai
5
  from google.genai import types
 
6
  import wave
7
+ import os
8
+ import uuid
 
9
  from typing import Optional
10
  from pydantic import BaseModel
11
+ from pathlib import Path
12
+
13
+ app = FastAPI(
14
+ title="Google GenAI TTS API",
15
+ description="API for text-to-speech conversion using Google GenAI",
16
+ version="1.0.0",
17
+ docs_url="/docs",
18
+ redoc_url=None
19
+ )
20
+
21
+ # Configuration
22
+ AUDIO_OUTPUT_DIR = "audio_output"
23
+ Path(AUDIO_OUTPUT_DIR).mkdir(exist_ok=True)
24
 
25
+ # Mount static files for Hugging Face Spaces
26
+ app.mount("/static", StaticFiles(directory="static"), name="static")
 
 
 
 
 
 
 
 
 
 
 
27
 
 
28
  class TTSRequest(BaseModel):
29
  text: str
30
+ voice_name: Optional[str] = "Kore"
31
+ cheerful: Optional[bool] = True
32
+ sample_rate: Optional[int] = 24000
33
+ channels: Optional[int] = 1
34
+ sample_width: Optional[int] = 2
35
 
36
+ def initialize_genai_client():
37
+ """Initialize the GenAI client with API key from environment"""
38
+ api_key = os.getenv("GEMINI_API_KEY")
39
+ if not api_key:
40
+ raise ValueError("GEMINI_API_KEY environment variable not set")
41
+ return genai.Client(api_key=api_key)
42
+
43
+ def generate_wave_file(filename: str, pcm_data: bytes, channels: int, rate: int, sample_width: int):
44
+ """Generate a WAV file from PCM data"""
45
+ with wave.open(filename, "wb") as wf:
46
+ wf.setnchannels(channels)
47
+ wf.setsampwidth(sample_width)
48
+ wf.setframerate(rate)
49
+ wf.writeframes(pcm_data)
50
+
51
+ @app.post("/api/generate-tts/")
52
+ async def generate_tts(request: TTSRequest):
53
+ """
54
+ Generate speech from text using Google GenAI TTS
55
 
56
+ Parameters:
57
+ - text: The text to convert to speech
58
+ - voice_name: Voice to use (default: 'Kore')
59
+ - cheerful: Whether to speak cheerfully (default: True)
60
+ - sample_rate: Audio sample rate (default: 24000)
61
+ - channels: Number of audio channels (default: 1)
62
+ - sample_width: Sample width in bytes (default: 2)
63
 
64
+ Returns:
65
+ - JSON with file URL or error message
66
+ """
67
+ try:
68
+ client = initialize_genai_client()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
+ text_to_speak = f"Say cheerfully: {request.text}" if request.cheerful else request.text
 
71
 
72
+ response = client.models.generate_content(
73
+ model="gemini-2.5-flash-preview-tts",
74
+ contents=text_to_speak,
75
+ config=types.GenerateContentConfig(
76
+ response_modalities=["AUDIO"],
77
+ speech_config=types.SpeechConfig(
78
+ voice_config=types.VoiceConfig(
79
+ prebuilt_voice_config=types.PrebuiltVoiceConfig(
80
+ voice_name=request.voice_name,
 
 
81
  )
82
+ )
83
  ),
84
  )
85
+ )
86
+
87
+ if not response.candidates or not response.candidates[0].content.parts:
88
+ raise HTTPException(status_code=500, detail="No audio data received from GenAI")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
+ audio_data = response.candidates[0].content.parts[0].inline_data.data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ file_name = f"tts_{uuid.uuid4().hex}.wav"
93
+ file_path = os.path.join(AUDIO_OUTPUT_DIR, file_name)
 
 
 
 
 
94
 
95
+ generate_wave_file(
96
+ file_path,
97
+ audio_data,
98
+ channels=request.channels,
99
+ rate=request.sample_rate,
100
+ sample_width=request.sample_width
101
+ )
102
+
103
+ # For Hugging Face Spaces, we need to return the URL where the file can be accessed
104
+ file_url = f"/static/{file_name}"
105
+ os.rename(file_path, f"static/{file_name}")
106
+
107
+ return JSONResponse({
108
+ "status": "success",
109
+ "audio_url": file_url,
110
+ "filename": file_name
111
+ })
112
+
113
+ except Exception as e:
114
+ return JSONResponse(
115
+ {"status": "error", "message": str(e)},
116
+ status_code=500
117
  )
 
 
118
 
119
+ @app.get("/")
120
+ async def root():
121
+ return {"message": "Google GenAI TTS API is running"}
122
 
123
+ # Error handler
124
+ @app.exception_handler(Exception)
125
+ async def generic_exception_handler(request: Request, exc: Exception):
126
+ return JSONResponse(
127
+ status_code=500,
128
+ content={"message": f"An error occurred: {str(exc)}"}
129
+ )
130
 
 
131
  if __name__ == "__main__":
132
+ import uvicorn
133
+ uvicorn.run(app, host="0.0.0.0", port=8080)