Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -1,22 +1,17 @@
 import os
 
-# Set environment variables BEFORE any imports
 os.environ["TORCHDYNAMO_DISABLE"] = "1"
 os.environ["TORCH_COMPILE_DISABLE"] = "1"
 os.environ["PYTORCH_DISABLE_CUDNN_BENCHMARK"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-# Set CUDA environment to help with unsloth GPU detection (only if not ZeroGPU)
-if not os.getenv("ZERO_GPU"):
-    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Force GPU visibility
-    os.environ["FORCE_CUDA"] = "1"  # Force CUDA usage
-
 import torch
 import gradio as gr
 import numpy as np
 import spaces
 import logging
 from huggingface_hub import login
+import threading
 import time
 
 torch._dynamo.config.disable = True
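The retained `os.environ` block only works because it executes before `import torch`: the flags are consulted while torch (and `torch._dynamo`) initialize, so exporting them later may have no effect. A minimal sketch of the ordering; the exact read-time inside torch is an assumption, which is presumably why app.py also disables dynamo in code:

```python
import os

# Must be exported before `import torch`; setting them afterwards is too
# late to influence import-time configuration.
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch

# Belt-and-braces duplicate of the env flag, exactly as app.py does it:
# this works even if the variable was not visible at import time.
torch._dynamo.config.disable = True
```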
@@ -29,23 +24,16 @@ hf_token = os.getenv("HF_TOKEN")
 if hf_token:
     login(token=hf_token)
 
-# …
-…
-…
-…
-if torch.cuda.is_available():
-    device = "cuda"
-    logger.info("Using CUDA for inference.")
-elif torch.backends.mps.is_available():
-    device = "mps"
-    logger.info("Using MPS for inference.")
-else:
-    device = "cpu"
-    logger.info("Using CPU for inference.")
+# Global variables for model caching (like your old working version)
+_tts_model = None
+_speakers_dict = None
+_model_initialized = False
+_initialization_in_progress = False
 
 def get_speakers_dict():
-    """Get speakers dictionary using the …
+    """Get speakers dictionary using the correct import structure"""
     try:
+        # Try new structure first
         from maliba_ai.config.settings import Speakers
         return {
             "Adama": Speakers.Adama,
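The four `_`-prefixed globals added here implement a process-wide cache: the model object is built once per worker and every later request reuses it. A minimal sketch of the pattern, with a hypothetical `load_model()` standing in for `BambaraTTSInference()`:

```python
_model = None  # module-level cache, shared by all requests in this process

def load_model():
    # hypothetical stand-in for the expensive BambaraTTSInference() setup
    return object()

def get_model():
    global _model
    if _model is None:       # only the first caller pays the load cost
        _model = load_model()
    return _model            # everyone else gets the cached instance

assert get_model() is get_model()
```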
@@ -60,62 +48,70 @@ def get_speakers_dict():
             "Amara": Speakers.Amara
         }
     except Exception as e:
-        logger.error(f"Failed to import …
-        # Fallback to …
+        logger.error(f"Failed to import from settings: {e}")
+        # Fallback to old structure (like your working version)
         try:
-            from maliba_ai.config.…
+            from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou
             return {
-                "Adama": …
-                "Moussa": …
-                "Bourama": …
-                "Modibo": …
-                "Seydou": …
+                "Adama": Adame,
+                "Moussa": Moussa,
+                "Bourama": Bourama,
+                "Modibo": Modibo,
+                "Seydou": Seydou
             }
-        except:
-            logger.error("Failed to import …
+        except Exception as e2:
+            logger.error(f"Failed to import speakers: {e2}")
             return {}
 
-
-def initialize_tts_model():
+@spaces.GPU()
+def initialize_model_once():
+    """Initialize model exactly like your old working version"""
+    global _tts_model, _speakers_dict, _model_initialized, _initialization_in_progress
+
+    if _model_initialized:
+        logger.info("Model already initialized, returning existing instance")
+        return _tts_model, _speakers_dict
+
+    if _initialization_in_progress:
+        logger.info("Initialization already in progress, waiting...")
+        for _ in range(50):
+            time.sleep(0.1)
+            if _model_initialized:
+                return _tts_model, _speakers_dict
+
+    _initialization_in_progress = True
+
     try:
-
-        if os.getenv("ZERO_GPU") or "zero" in str(os.getenv("SPACE_ID", "")).lower():
-            logger.info("ZeroGPU environment detected - skipping global initialization")
-            return None
-
-        # Only try global init if CUDA is actually available and initialized
-        if not torch.cuda.is_available():
-            logger.info("CUDA not available - skipping global initialization")
-            return None
-
-        logger.info("Attempting global TTS model initialization...")
+        logger.info("Initializing Bambara TTS model...")
         start_time = time.time()
 
-        # …
-        from maliba_ai.tts …
+        # Use the same import as your old working version
+        from maliba_ai.tts import BambaraTTSInference
 
-        # Initialize model
         model = BambaraTTSInference()
+        speakers = get_speakers_dict()
+
+        if not speakers:
+            raise ValueError("Failed to load speakers dictionary")
+
+        _tts_model = model
+        _speakers_dict = speakers
+        _model_initialized = True
 
         elapsed = time.time() - start_time
-        logger.info(f"…
+        logger.info(f"Model initialized successfully in {elapsed:.2f} seconds!")
 
-        return …
+        return _tts_model, _speakers_dict
 
     except Exception as e:
-        logger.error(f"Failed to initialize …
-
-
-
-
-speakers_dict = get_speakers_dict()
-logger.info(f"Available speakers: {list(speakers_dict.keys())}")
-
-# Try to initialize model globally only if not in ZeroGPU environment
-tts_model = initialize_tts_model()
+        logger.error(f"Failed to initialize model: {e}")
+        _initialization_in_progress = False
+        raise e
+    finally:
+        _initialization_in_progress = False
 
 def validate_inputs(text, temperature, top_k, top_p, max_tokens):
-    """…
+    """Same validation as your old version"""
     if not text or not text.strip():
         return False, "Please enter some Bambara text."
 
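The new `initialize_model_once` guards overlapping first requests by polling `_initialization_in_progress` for up to five seconds (50 × 0.1 s) and then falling through to initialize anyway. The hunk also adds `import threading`, which the polling loop never uses; the same guarantee is usually written with a lock, where waiters block instead of spinning and the check-then-set race disappears. A sketch under that assumption, with a hypothetical `_load()` in place of the real model construction:

```python
import threading

_lock = threading.Lock()
_model = None

def _load():
    # hypothetical stand-in for BambaraTTSInference() plus speaker loading
    return object()

def initialize_model_once():
    global _model
    if _model is not None:       # fast path once initialized, no locking
        return _model
    with _lock:                  # exactly one thread loads; others block here
        if _model is None:       # re-check after acquiring the lock
            _model = _load()
    return _model
```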
@@ -128,44 +124,33 @@ def validate_inputs(text, temperature, top_k, top_p, max_tokens):
     if not (0.1 <= top_p <= 1.0):
         return False, "Top-P must be between 0.1 and 1.0"
 
-    if len(text.strip()) > 1000:
-        return False, "Text is too long. Please use shorter text (max 1000 characters)."
-
     return True, ""
 
 @spaces.GPU()
 def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
-    """Generate speech - …
-    global tts_model
-
+    """Generate speech - exactly like your old working version"""
     if not text.strip():
         return None, "Please enter some Bambara text."
 
     try:
-
-        if tts_model is None:
-            logger.info("Global model initialization failed, initializing with GPU decorator...")
-            from maliba_ai.tts import BambaraTTSInference
-            tts_model = BambaraTTSInference()
-            logger.info("Model initialized successfully with GPU decorator!")
+        tts, speakers = initialize_model_once()
 
-        if not tts_model:
-            return None, "❌ …
+        if not tts or not speakers:
+            return None, "❌ Model not properly initialized"
 
-        if speaker_name not in speakers_dict:
-            available_speakers = list(speakers_dict.keys())
+        if speaker_name not in speakers:
+            available_speakers = list(speakers.keys())
             return None, f"❌ Speaker '{speaker_name}' not found. Available: {available_speakers}"
 
-        speaker = speakers_dict[speaker_name]
-        logger.info(f"…
+        speaker = speakers[speaker_name]
+        logger.info(f"Using speaker: {speaker_name}")
 
-        # Validate inputs if using advanced settings
         if use_advanced:
             is_valid, error_msg = validate_inputs(text, temperature, top_k, top_p, max_tokens)
             if not is_valid:
                 return None, f"❌ {error_msg}"
 
-            waveform = tts_model.generate_speech(
+            waveform = tts.generate_speech(
                 text=text.strip(),
                 speaker_id=speaker,
                 temperature=temperature,
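Both `initialize_model_once` and `generate_speech` carry `@spaces.GPU()`: on ZeroGPU hardware a GPU is attached only while a decorated function runs, which is why the model cannot be created at module import time. The decorator also accepts a duration hint; a sketch (the 120-second figure is an illustrative choice, not from this diff):

```python
import spaces

@spaces.GPU(duration=120)  # ask ZeroGPU for up to ~120 s of GPU per call
def generate(text: str):
    # On ZeroGPU Spaces, CUDA is only guaranteed inside decorated calls;
    # module-level code runs without a GPU attached.
    import torch
    return torch.cuda.is_available()
```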
@@ -174,104 +159,85 @@ def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p,
                 max_new_audio_tokens=int(max_tokens)
             )
         else:
-
-            waveform = tts_model.generate_speech(
+            waveform = tts.generate_speech(
                 text=text.strip(),
                 speaker_id=speaker
             )
 
         if waveform is None or waveform.size == 0:
-            return None, "…
-
-        # Ensure waveform is in correct format
-        if isinstance(waveform, torch.Tensor):
-            waveform = waveform.cpu().numpy()
-
-        # Normalize audio to prevent clipping
-        if np.max(np.abs(waveform)) > 0:
-            waveform = waveform / np.max(np.abs(waveform)) * 0.9
+            return None, "Failed to generate audio. Please try again."
 
         sample_rate = 16000
         return (sample_rate, waveform), f"✅ Audio generated successfully for speaker {speaker_name}"
 
     except Exception as e:
-        logger.error(f"Speech generation failed: {e}"…
+        logger.error(f"Speech generation failed: {e}")
         return None, f"❌ Error: {str(e)}"
 
-# …
-
+# Use available speakers (try to get 10, fallback to 5)
+def get_speaker_names():
+    speakers = get_speakers_dict()
+    if speakers:
+        return list(speakers.keys())
+    return ["Adama", "Moussa", "Bourama", "Modibo", "Seydou"]
+
+SPEAKER_NAMES = get_speaker_names()
 
 # Examples with variety of lengths and speakers matched to content
 examples = [
     ["Aw ni ce", "Adama"],  # Natural conversational greeting
-    ["Mali bɛna diya kɔsɛbɛ, ka a da a kan baara bɛ ka kɛ.", "Bakary"],
-    ["Ne bɛ se ka sɛbɛnni yɛlɛma ka kɛ kuma ye", "Moussa"],
-    ["I ka kɛnɛ wa?", "Ngolo"],
-    ["Lakɔli karamɔgɔw tun tɛ ka se ka sɛbɛnni kɛ ka ɲɛ walanda kan wa denmisɛnw tun tɛ ka se ka o sɛbɛnni ninnu ye, kuma tɛ ka u kalan. Denmisɛnw kɛra kunfinw ye.", "Bourama"],
-    ["sigikafɔ kɔnɔ jamanaw ni ɲɔgɔn cɛ, olu ye a haminankow ye, wa o ko ninnu ka kan ka kɛ sariya ani tilennenya kɔnɔ.", "Ibrahima"],
-    ["Aw ni ce. Ne tɔgɔ ye Adama. Awɔ, ne ye maliden de ye. Aw Sanbɛ Sanbɛ. San min tɛ ɲinan ye, an bɛɛ ka jɛ ka o seli ɲɔgɔn fɛ, hɛɛrɛ ni lafiya la. Ala ka Mali suma. Ala ka Mali yiriwa. Ala ka Mali taa ɲɛ. Ala ka an ka seliw caya. Ala ka yafa an bɛɛ ma.", "Amara"],
-    ["An dɔlakelen bɛ masike bilenman don ka tɔw gɛn.", "Modibo"],
-    ["Aw ni ce. Seidu bɛ aw fo wa aw ka yafa a ma, ka da a kan tuma dɔw la kow ka can.", "Amadou"],
-    ["Bamanankan ye kan ɲuman ye", "Seydou"],  # Balanced characteristics for simple statement
+    ["Mali bɛna diya kɔsɛbɛ, ka a da a kan baara bɛ ka kɛ.", "Bakary" if "Bakary" in SPEAKER_NAMES else "Moussa"],
+    ["Ne bɛ se ka sɛbɛnni yɛlɛma ka kɛ kuma ye", "Moussa"],
+    ["I ka kɛnɛ wa?", "Ngolo" if "Ngolo" in SPEAKER_NAMES else "Modibo"],
+    ["Lakɔli karamɔgɔw tun tɛ ka se ka sɛbɛnni kɛ ka ɲɛ walanda kan wa denmisɛnw tun tɛ ka se ka o sɛbɛnni ninnu ye, kuma tɛ ka u kalan. Denmisɛnw kɛra kunfinw ye.", "Bourama"],
+    ["sigikafɔ kɔnɔ jamanaw ni ɲɔgɔn cɛ, olu ye a haminankow ye, wa o ko ninnu ka kan ka kɛ sariya ani tilennenya kɔnɔ.", "Ibrahima" if "Ibrahima" in SPEAKER_NAMES else "Seydou"],
+    ["Aw ni ce. Ne tɔgɔ ye Adama. Awɔ, ne ye maliden de ye. Aw Sanbɛ Sanbɛ. San min tɛ ɲinan ye, an bɛɛ ka jɛ ka o seli ɲɔgɔn fɛ, hɛɛrɛ ni lafiya la. Ala ka Mali suma. Ala ka Mali yiriwa. Ala ka Mali taa ɲɛ. Ala ka an ka seliw caya. Ala ka yafa an bɛɛ ma.", "Amara" if "Amara" in SPEAKER_NAMES else "Moussa"],
+    ["An dɔlakelen bɛ masike bilenman don ka tɔw gɛn.", "Modibo"],
+    ["Aw ni ce. Seidu bɛ aw fo wa aw ka yafa a ma, ka da a kan tuma dɔw la kow ka can.", "Amadou" if "Amadou" in SPEAKER_NAMES else "Modibo"],
 ]
 
 def build_interface():
-    """Build the Gradio interface …
+    """Build the Gradio interface - simplified like your old working version"""
 
-    with gr.Blocks(
-        …
-        …
-        css="""
-        .main-header { text-align: center; margin-bottom: 2rem; }
-        .status-box { margin-top: 1rem; }
-        """
-    ) as demo:
-
-        gr.Markdown(f"""
-        …
-        …
-        …
-        …
-        Convert Bambara text to natural-sounding speech using our state-of-the-art neural TTS system.
-
-        **Bambara** is spoken by millions of people in Mali and West Africa 🌍
-
-        **Status**: {'✅ Model pre-loaded' if tts_model is not None else '⏳ Model loads on first request (ZeroGPU optimized)'}
-        """, elem_classes=["main-header"])
+    with gr.Blocks(title="Bambara TTS - MALIBA-AI") as demo:
+        gr.Markdown("""
+        # 🎤 Bambara Text-to-Speech
+
+        **Powered by MALIBA-AI**
+
+        Convert Bambara text to speech using our state-of-the-art TTS model.
+
+        **Bambara** is spoken by millions of people in Mali and West Africa.
+        """)
 
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
                     label="📝 Bambara Text",
-                    placeholder="…
-                    lines=…
-                    max_lines=…
+                    placeholder="Type your Bambara text here...",
+                    lines=3,
+                    max_lines=10,
                     value="I ni ce"
                 )
 
                 speaker_dropdown = gr.Dropdown(
                     choices=SPEAKER_NAMES,
-                    value=…
+                    value="Bourama" if "Bourama" in SPEAKER_NAMES else SPEAKER_NAMES[0],
                     label="🗣️ Speaker Voice",
-                    info=f"Choose from {len(SPEAKER_NAMES)} authentic voices…
+                    info=f"Choose from {len(SPEAKER_NAMES)} authentic voices"
                 )
 
-                generate_btn = gr.Button(
-                    "🎵 Generate Speech",
-                    variant="primary",
-                    size="lg"
-                )
+                generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
 
             with gr.Column(scale=1):
                 use_advanced = gr.Checkbox(
-                    label="⚙️ Advanced Settings",
+                    label="⚙️ Use Advanced Settings",
                     value=False,
-                    info="…
+                    info="Enable to customize generation parameters"
                 )
 
                 with gr.Group(visible=False) as advanced_group:
-                    gr.Markdown("…
+                    gr.Markdown("**Advanced Parameters:**")
 
                     temperature = gr.Slider(
                         minimum=0.1,
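Note what the cleanup above also drops: the old version converted torch tensors to NumPy and peak-normalized the waveform before returning it, while the new code returns it as produced. If the library already emits normalized NumPy audio this is redundant; if clipping reappears, the removed logic was essentially this (reconstructed from the deleted lines):

```python
import numpy as np
import torch

def to_playable(waveform):
    # Gradio's type="numpy" audio output expects a NumPy array.
    if isinstance(waveform, torch.Tensor):
        waveform = waveform.cpu().numpy()
    # Peak-normalize to 90% of full scale so playback cannot clip.
    peak = np.max(np.abs(waveform))
    if peak > 0:
        waveform = waveform / peak * 0.9
    return waveform
```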
@@ -279,7 +245,7 @@ def build_interface():
                         value=0.8,
                         step=0.1,
                         label="Temperature",
-                        info="Higher = more varied …
+                        info="Higher = more varied"
                     )
 
                     top_k = gr.Slider(
@@ -287,8 +253,7 @@ def build_interface():
                         maximum=100,
                         value=50,
                         step=5,
-                        label="Top-K",
-                        info="Vocabulary selection size"
+                        label="Top-K"
                     )
 
                     top_p = gr.Slider(
@@ -296,8 +261,7 @@ def build_interface():
                         maximum=1.0,
                         value=0.9,
                         step=0.05,
-                        label="Top-P",
-                        info="Nucleus sampling threshold"
+                        label="Top-P"
                     )
 
                     max_tokens = gr.Slider(
@@ -305,8 +269,7 @@ def build_interface():
                         maximum=4096,
                         value=2048,
                         step=256,
-                        label="Max Length",
-                        info="Maximum audio duration"
+                        label="Max Length"
                     )
 
                 gr.Markdown("### 🔊 Generated Audio")
@@ -314,70 +277,41 @@ def build_interface():
                 audio_output = gr.Audio(
                     label="Generated Speech",
                     type="numpy",
-                    interactive=False,
-                    show_download_button=True
+                    interactive=False
                 )
 
                 status_output = gr.Textbox(
                     label="Status",
                     interactive=False,
                     show_label=False,
-                    container=False,
-                    elem_classes=["status-box"]
+                    container=False
                 )
 
-        with gr.Accordion("…
+        with gr.Accordion("Try These Examples", open=True):
             def load_example(text, speaker):
                 return text, speaker, False, 0.8, 50, 0.9, 2048
 
-            gr.Markdown("**Click any example below…
+            gr.Markdown("**Click any example below:**")
 
-            with gr.Row():
-                for i, (text, speaker) in enumerate(examples[:5]):
-                    btn = gr.Button(
-                        f"🔹 {text[:25]}{'...' if len(text) > 25 else ''}",
-                        size="sm"
-                    )
-                    btn.click(
-                        fn=lambda t=text, s=speaker: load_example(t, s),
-                        outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
-                    )
-
-            with gr.Row():
-                for i, (text, speaker) in enumerate(examples[5:]):
-                    btn = gr.Button(
-                        f"🔹 {text[:25]}{'...' if len(text) > 25 else ''}",
-                        size="sm"
-                    )
-                    btn.click(
-                        fn=lambda t=text, s=speaker: load_example(t, s),
-                        outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
-                    )
+            for i, (text, speaker) in enumerate(examples):
+                btn = gr.Button(f"{text[:30]}{'...' if len(text) > 30 else ''}", size="sm")
+                btn.click(
+                    fn=lambda t=text, s=speaker: load_example(t, s),
+                    outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
+                )
 
-        with gr.Accordion("…
+        with gr.Accordion("About", open=False):
             gr.Markdown(f"""
             ## About MALIBA-AI Bambara TTS
 
             - **🎯 Purpose**: First open-source Text-to-Speech system for Bambara language
             - **🗣️ Speakers**: {len(SPEAKER_NAMES)} different authentic voices
             - **🔊 Quality**: 16kHz neural speech synthesis
-            - **⚡ Performance**: …
+            - **⚡ Performance**: Model loads once and stays in memory
             - **📱 Usage**: Educational, accessibility, and cultural preservation
 
-            ### 🎭 …
-
-            - **Bourama**: Most stable and accurate (recommended)
-            - **Adama**: Natural conversational tone
-            - **Moussa**: Clear pronunciation for educational content
-            - **Modibo**: Expressive delivery for storytelling
-            - **Seydou**: Balanced characteristics for general use
-            - **Amadou**: Warm and friendly voice
-            - **Bakary**: Deep, authoritative tone
-            - **Ngolo**: Youthful and energetic
-            - **Ibrahima**: Calm and measured delivery
-            - **Amara**: Melodic and smooth
-
-            **Model Architecture**: Built on state-of-the-art neural TTS with Bambara-specific optimizations
+            ### 🎭 Available Speakers:
+            {', '.join(SPEAKER_NAMES)}
 
             **License**: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 (CC BY-NC-SA 4.0)
 
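One detail worth keeping from the example-button loop, present in both versions: `lambda t=text, s=speaker:` binds the loop variables as default arguments. A plain closure would make every button load the last example, because closures capture variables rather than values. A minimal illustration:

```python
# Closures capture variables, not values: every callback sees the final i.
callbacks = [lambda: i for i in range(3)]
print([f() for f in callbacks])   # [2, 2, 2]

# Default arguments are evaluated at definition time, freezing each value —
# the same trick as `lambda t=text, s=speaker:` in the button loop above.
callbacks = [lambda i=i: i for i in range(3)]
print([f() for f in callbacks])   # [0, 1, 2]
```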
@@ -386,7 +320,6 @@ def build_interface():
             **MALIBA-AI Mission**: Ensuring no Malian is left behind by technological advances 🇲🇱
             """)
 
-        # Event handlers
         def toggle_advanced(use_adv):
             return gr.Group(visible=use_adv)
 
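`toggle_advanced` returns a component update whose visibility mirrors the checkbox; wiring it to the checkbox's change event (in the unchanged lines just below) is what shows and hides the group. A self-contained sketch of the same pattern; `gr.update(visible=...)` is the version-agnostic spelling, while app.py returns a `gr.Group`, which older Gradio releases also accept:

```python
import gradio as gr

with gr.Blocks() as demo:
    show = gr.Checkbox(label="Show advanced", value=False)
    with gr.Group(visible=False) as panel:
        gr.Slider(0.1, 2.0, value=0.8, label="Temperature")

    # The handler returns a visibility update for the group.
    show.change(fn=lambda v: gr.update(visible=v), inputs=[show], outputs=[panel])

# demo.launch()  # wiring sketch only
```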
@@ -396,7 +329,6 @@ def build_interface():
             outputs=[advanced_group]
         )
 
-        # Generate speech on button click
         generate_btn.click(
             fn=generate_speech,
             inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
@@ -404,7 +336,6 @@ def build_interface():
             show_progress=True
         )
 
-        # Generate speech on Enter key
        text_input.submit(
             fn=generate_speech,
             inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
@@ -416,20 +347,17 @@ def build_interface():
 
 def main():
     """Main function to launch the Gradio interface"""
-    logger.info("Starting …
+    logger.info("Starting Bambara TTS Gradio interface.")
 
-    # …
+    # DO NOT preload - let it initialize on first request only (like your working version)
     interface = build_interface()
-
-    # Launch interface
     interface.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=False,
-        show_error=True
+        share=False
     )
 
-    logger.info("Gradio interface launched successfully…
+    logger.info("Gradio interface launched successfully.")
 
 if __name__ == "__main__":
     main()