Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,196 +1,133 @@
|
|
1 |
-
from fastapi import FastAPI, HTTPException
|
2 |
-
from fastapi.responses import FileResponse
|
|
|
3 |
from google import genai
|
4 |
from google.genai import types
|
5 |
-
import os
|
6 |
import wave
|
7 |
-
import
|
8 |
-
import
|
9 |
-
import uvicorn
|
10 |
from typing import Optional
|
11 |
from pydantic import BaseModel
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
-
#
|
14 |
-
|
15 |
-
API_KEYS = [
|
16 |
-
os.getenv("GOOGLE_API_KEY_1"),
|
17 |
-
os.getenv("GOOGLE_API_KEY_2")
|
18 |
-
]
|
19 |
-
MODEL_CHOICES = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]
|
20 |
-
VOICE_NAMES = [
|
21 |
-
"Achernar", "Achird", "Algenib", "Algieba", "Alnilam", "Aoede", "Autonoe",
|
22 |
-
"Callirrhoe", "Charon", "Despina", "Enceladus", "Erinome", "Fenrir", "Gacrux",
|
23 |
-
"Iapetus", "Kore", "Laomedeia", "Leda", "Orus", "Puck", "Pulcherrima", "Rasalgethi",
|
24 |
-
"Sadachbia", "Sadaltager", "Schedar", "Sulafat", "Umbriel", "Vindemiatrix", "Zephyr", "Zubenelgenubi"
|
25 |
-
]
|
26 |
|
27 |
-
# ─── API Models ───────────────────────────────────────────────────────────────
|
28 |
class TTSRequest(BaseModel):
|
29 |
text: str
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
|
|
33 |
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
if used_key:
|
54 |
-
try:
|
55 |
-
client = genai.Client(api_key=used_key)
|
56 |
-
except Exception:
|
57 |
-
pass
|
58 |
-
|
59 |
-
# Fallback to pre-configured clients
|
60 |
-
if not client:
|
61 |
-
for client_name, existing_client in self.clients.items():
|
62 |
-
try:
|
63 |
-
response = existing_client.models.generate_content(
|
64 |
-
model=request.model,
|
65 |
-
contents=request.text,
|
66 |
-
config=types.GenerateContentConfig(
|
67 |
-
response_modalities=["AUDIO"],
|
68 |
-
speech_config=types.SpeechConfig(
|
69 |
-
voice_config=types.VoiceConfig(
|
70 |
-
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
71 |
-
voice_name=request.voice_name
|
72 |
-
)
|
73 |
-
)
|
74 |
-
),
|
75 |
-
),
|
76 |
-
)
|
77 |
-
client = existing_client
|
78 |
-
used_key = client_name
|
79 |
-
break
|
80 |
-
except Exception:
|
81 |
-
continue
|
82 |
|
83 |
-
if
|
84 |
-
raise HTTPException(status_code=500, detail="No working API keys available")
|
85 |
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
voice_name=request.voice_name
|
96 |
-
)
|
97 |
)
|
98 |
-
)
|
99 |
),
|
100 |
)
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
raise ValueError("Invalid response format from API")
|
106 |
-
except Exception as e:
|
107 |
-
raise HTTPException(status_code=500, detail=str(e))
|
108 |
-
|
109 |
-
@staticmethod
|
110 |
-
def create_wave_file(pcm_bytes: bytes) -> str:
    """Write raw PCM audio to a temporary WAV file and return its path.

    The audio is stored as mono, 2 bytes per sample, at 24000 Hz. The file
    is created with ``delete=False``, so it is not removed automatically
    when the handle closes.

    Args:
        pcm_bytes: Raw PCM frames to store.

    Returns:
        Filesystem path of the newly written ``.wav`` file.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        # Give ``wave`` the already-open handle instead of reopening the path:
        # opening a NamedTemporaryFile a second time by name fails on Windows
        # while the first handle is still open.
        with wave.open(tmp_file, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(24000)
            wf.writeframes(pcm_bytes)
        return tmp_file.name
|
118 |
-
|
119 |
-
# ─── FastAPI App ──────────────────────────────────────────────────────────────
|
120 |
-
app = FastAPI(title="Gemini TTS API", version="1.0")
|
121 |
-
engine = TTSEngine()
|
122 |
-
|
123 |
-
@app.post("/generate/")
async def generate_tts(request: TTSRequest):
    """Synthesize speech for the request and return it as a WAV file response.

    Raises:
        HTTPException: Re-raised unchanged when the engine raises one;
            any other exception is converted to a 500 with its message.
    """
    try:
        wav_path = engine.generate_audio(request)
        reply = FileResponse(wav_path, media_type="audio/wav")
    except HTTPException:
        # Already carries its own status code and detail; pass it through.
        raise
    except Exception as err:
        raise HTTPException(status_code=500, detail=str(err))
    return reply
|
132 |
-
|
133 |
-
# ─── Gradio Interface ─────────────────────────────────────────────────────────
|
134 |
-
def create_gradio_interface():
|
135 |
-
with gr.Blocks(title="Gemini TTS", theme=gr.themes.Soft()) as interface:
|
136 |
-
gr.Markdown("## π€ Gemini TTS Voice Generator")
|
137 |
|
138 |
-
|
139 |
-
with gr.Column():
|
140 |
-
model_choice = gr.Dropdown(
|
141 |
-
choices=Config.MODEL_CHOICES,
|
142 |
-
value=Config.MODEL_CHOICES[0],
|
143 |
-
label="Model"
|
144 |
-
)
|
145 |
-
voice_name = gr.Dropdown(
|
146 |
-
choices=Config.VOICE_NAMES,
|
147 |
-
value=Config.VOICE_NAMES[0],
|
148 |
-
label="Voice"
|
149 |
-
)
|
150 |
-
api_key = gr.Textbox(
|
151 |
-
label="API Key (optional)",
|
152 |
-
type="password",
|
153 |
-
placeholder="Leave empty to use configured keys"
|
154 |
-
)
|
155 |
-
|
156 |
-
with gr.Column():
|
157 |
-
text_input = gr.Textbox(
|
158 |
-
label="Text to Convert",
|
159 |
-
lines=5,
|
160 |
-
placeholder="Enter text here..."
|
161 |
-
)
|
162 |
-
generate_btn = gr.Button("Generate Speech")
|
163 |
-
audio_output = gr.Audio(label="Generated Audio")
|
164 |
-
error_output = gr.Textbox(label="Error", visible=False)
|
165 |
|
166 |
-
|
167 |
-
|
168 |
-
request = TTSRequest(text=text, model=model, voice_name=voice, api_key=key or None)
|
169 |
-
audio_path = engine.generate_audio(request)
|
170 |
-
return audio_path, ""
|
171 |
-
except Exception as e:
|
172 |
-
return None, str(e)
|
173 |
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
)
|
179 |
-
|
180 |
-
return interface
|
181 |
|
182 |
-
|
183 |
-
|
|
|
184 |
|
185 |
-
#
|
186 |
-
app
|
|
|
|
|
|
|
|
|
|
|
187 |
|
188 |
-
# For local development
|
189 |
if __name__ == "__main__":
|
190 |
-
uvicorn
|
191 |
-
|
192 |
-
host="0.0.0.0",
|
193 |
-
port=8000,
|
194 |
-
reload=True,
|
195 |
-
workers=2
|
196 |
-
)
|
|
|
1 |
+
from fastapi import FastAPI, HTTPException, Request
|
2 |
+
from fastapi.responses import FileResponse, JSONResponse
|
3 |
+
from fastapi.staticfiles import StaticFiles
|
4 |
from google import genai
|
5 |
from google.genai import types
|
|
|
6 |
import wave
|
7 |
+
import os
|
8 |
+
import uuid
|
|
|
9 |
from typing import Optional
|
10 |
from pydantic import BaseModel
|
11 |
+
from pathlib import Path
|
12 |
+
|
13 |
+
app = FastAPI(
    title="Google GenAI TTS API",
    description="API for text-to-speech conversion using Google GenAI",
    version="1.0.0",
    docs_url="/docs",
    redoc_url=None,
)

# Configuration
AUDIO_OUTPUT_DIR = "audio_output"
STATIC_DIR = "static"

# Create both directories up front. StaticFiles verifies that its directory
# exists when it is constructed, so mounting a missing "static" folder would
# abort the import of this module.
Path(AUDIO_OUTPUT_DIR).mkdir(exist_ok=True)
Path(STATIC_DIR).mkdir(exist_ok=True)

# Mount static files for Hugging Face Spaces
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
|
|
28 |
class TTSRequest(BaseModel):
    """Request body accepted by the text-to-speech endpoint."""

    # Text to convert to speech.
    text: str
    # Prebuilt voice name forwarded to the speech configuration.
    voice_name: Optional[str] = "Kore"
    # When truthy, the endpoint prefixes the text with "Say cheerfully: ".
    cheerful: Optional[bool] = True
    # WAV header values used when the generated audio is written to disk.
    # NOTE(review): Optional[...] also admits an explicit null for these
    # fields - confirm that is intended.
    sample_rate: Optional[int] = 24000
    channels: Optional[int] = 1
    sample_width: Optional[int] = 2
|
35 |
|
36 |
+
def initialize_genai_client():
    """Build a GenAI client from the ``GEMINI_API_KEY`` environment variable.

    Returns:
        A ``genai.Client`` configured with the key.

    Raises:
        ValueError: If the variable is unset or empty.
    """
    key = os.environ.get("GEMINI_API_KEY")
    if key:
        return genai.Client(api_key=key)
    raise ValueError("GEMINI_API_KEY environment variable not set")
|
42 |
+
|
43 |
+
def generate_wave_file(filename: str, pcm_data: bytes, channels: int, rate: int, sample_width: int):
    """Write raw PCM frames to ``filename`` as an uncompressed WAV file.

    Args:
        filename: Destination path; opened for binary writing.
        pcm_data: Raw PCM frames to store.
        channels: Number of audio channels.
        rate: Frame rate in Hz.
        sample_width: Sample width in bytes.
    """
    # Tuple order expected by setparams:
    # (nchannels, sampwidth, framerate, nframes, comptype, compname).
    # The frame count starts at 0 and is corrected when the writer closes.
    header = (channels, sample_width, rate, 0, "NONE", "not compressed")
    writer = wave.open(filename, "wb")
    with writer:
        writer.setparams(header)
        writer.writeframes(pcm_data)
|
50 |
+
|
51 |
+
@app.post("/api/generate-tts/")
async def generate_tts(request: TTSRequest):
    """
    Generate speech from text using Google GenAI TTS

    Parameters:
    - text: The text to convert to speech
    - voice_name: Voice to use (default: 'Kore')
    - cheerful: Whether to speak cheerfully (default: True)
    - sample_rate: Audio sample rate (default: 24000)
    - channels: Number of audio channels (default: 1)
    - sample_width: Sample width in bytes (default: 2)

    Returns:
    - JSON with file URL or error message
    """
    # Function-scope imports keep this handler self-contained.
    import shutil

    from fastapi.concurrency import run_in_threadpool

    def error_response(message: str) -> JSONResponse:
        # Single place that defines the error payload shape.
        return JSONResponse(
            {"status": "error", "message": message},
            status_code=500
        )

    try:
        client = initialize_genai_client()

        text_to_speak = f"Say cheerfully: {request.text}" if request.cheerful else request.text

        # The request model accepts explicit nulls for these fields; fall back
        # to the documented defaults instead of passing None downstream.
        voice_name = "Kore" if request.voice_name is None else request.voice_name
        channels = 1 if request.channels is None else request.channels
        sample_rate = 24000 if request.sample_rate is None else request.sample_rate
        sample_width = 2 if request.sample_width is None else request.sample_width

        # The SDK call is synchronous; run it in a worker thread so the event
        # loop stays free to serve other requests while audio is generated.
        response = await run_in_threadpool(
            client.models.generate_content,
            model="gemini-2.5-flash-preview-tts",
            contents=text_to_speak,
            config=types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
                            voice_name=voice_name,
                        )
                    )
                ),
            )
        )

        # Walk the response defensively: any level may be missing or empty.
        candidates = response.candidates
        content = candidates[0].content if candidates else None
        parts = content.parts if content is not None else None
        inline_data = parts[0].inline_data if parts else None
        audio_data = inline_data.data if inline_data is not None else None
        if not audio_data:
            # Returned directly rather than raised, so the blanket handler
            # below does not rewrap it with a status-code prefix.
            return error_response("No audio data received from GenAI")

        file_name = f"tts_{uuid.uuid4().hex}.wav"
        file_path = os.path.join(AUDIO_OUTPUT_DIR, file_name)

        generate_wave_file(
            file_path,
            audio_data,
            channels=channels,
            rate=sample_rate,
            sample_width=sample_width
        )

        # For Hugging Face Spaces, we need to return the URL where the file can be accessed
        file_url = f"/static/{file_name}"
        # shutil.move falls back to copy-and-delete when source and target are
        # on different filesystems, where os.rename raises OSError.
        shutil.move(file_path, os.path.join("static", file_name))

        return JSONResponse({
            "status": "success",
            "audio_url": file_url,
            "filename": file_name
        })

    except Exception as e:
        return error_response(str(e))
|
|
|
|
|
118 |
|
119 |
+
@app.get("/")
async def root():
    """Return a static message confirming the service is running."""
    status = {"message": "Google GenAI TTS API is running"}
    return status
|
122 |
|
123 |
+
# Error handler
|
124 |
+
@app.exception_handler(Exception)
async def generic_exception_handler(request: Request, exc: Exception):
    """Convert an exception routed to this handler into a JSON 500 response.

    The response body has a single ``message`` key containing the
    exception's string form.
    """
    detail = "An error occurred: " + str(exc)
    payload = {"message": detail}
    return JSONResponse(content=payload, status_code=500)
|
130 |
|
|
|
131 |
if __name__ == "__main__":
    # Serve the app with uvicorn on all interfaces, port 8080, when this
    # file is executed as a script.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8080}
    uvicorn.run(app, **server_options)
|
|
|
|
|
|
|
|
|
|