Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -1,17 +1,21 @@
|
|
1 |
import os
|
2 |
|
|
|
3 |
os.environ["TORCHDYNAMO_DISABLE"] = "1"
|
4 |
os.environ["TORCH_COMPILE_DISABLE"] = "1"
|
5 |
os.environ["PYTORCH_DISABLE_CUDNN_BENCHMARK"] = "1"
|
6 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
7 |
|
|
|
|
|
|
|
|
|
8 |
import torch
|
9 |
import gradio as gr
|
10 |
import numpy as np
|
11 |
import spaces
|
12 |
import logging
|
13 |
from huggingface_hub import login
|
14 |
-
import threading
|
15 |
import time
|
16 |
|
17 |
torch._dynamo.config.disable = True
|
@@ -24,75 +28,81 @@ hf_token = os.getenv("HF_TOKEN")
|
|
24 |
if hf_token:
|
25 |
login(token=hf_token)
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
def get_speakers_dict():
|
34 |
-
"""Get speakers dictionary
|
35 |
try:
|
36 |
-
from maliba_ai.config.
|
37 |
return {
|
38 |
-
"Adama":
|
39 |
-
"Moussa": Moussa,
|
40 |
-
"Bourama": Bourama,
|
41 |
-
"Modibo": Modibo,
|
42 |
-
"Seydou": Seydou
|
|
|
|
|
|
|
|
|
|
|
43 |
}
|
44 |
except Exception as e:
|
45 |
-
logger.error(f"Failed to import speakers: {e}")
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
if _model_initialized:
|
62 |
-
return _tts_model, _speakers_dict
|
63 |
-
|
64 |
-
|
65 |
-
_initialization_in_progress = True
|
66 |
-
|
67 |
try:
|
68 |
-
logger.info("Initializing Bambara TTS model...")
|
69 |
start_time = time.time()
|
70 |
|
71 |
-
|
|
|
72 |
|
|
|
73 |
model = BambaraTTSInference()
|
74 |
-
speakers = get_speakers_dict()
|
75 |
-
|
76 |
-
if not speakers:
|
77 |
-
raise ValueError("Failed to load speakers dictionary")
|
78 |
-
|
79 |
-
_tts_model = model
|
80 |
-
_speakers_dict = speakers
|
81 |
-
_model_initialized = True
|
82 |
|
83 |
elapsed = time.time() - start_time
|
84 |
-
logger.info(f"Model initialized successfully in {elapsed:.2f} seconds!")
|
85 |
|
86 |
-
return
|
87 |
|
88 |
except Exception as e:
|
89 |
-
logger.error(f"Failed to initialize model: {e}")
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
def validate_inputs(text, temperature, top_k, top_p, max_tokens):
|
|
|
96 |
if not text or not text.strip():
|
97 |
return False, "Please enter some Bambara text."
|
98 |
|
@@ -105,32 +115,44 @@ def validate_inputs(text, temperature, top_k, top_p, max_tokens):
|
|
105 |
if not (0.1 <= top_p <= 1.0):
|
106 |
return False, "Top-P must be between 0.1 and 1.0"
|
107 |
|
|
|
|
|
|
|
108 |
return True, ""
|
109 |
|
110 |
@spaces.GPU()
|
111 |
def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
|
|
|
|
|
|
|
112 |
if not text.strip():
|
113 |
return None, "Please enter some Bambara text."
|
114 |
|
115 |
try:
|
116 |
-
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
-
if not
|
119 |
-
return None, "β
|
120 |
|
121 |
-
if speaker_name not in
|
122 |
-
available_speakers = list(
|
123 |
return None, f"β Speaker '{speaker_name}' not found. Available: {available_speakers}"
|
124 |
|
125 |
-
speaker =
|
126 |
-
logger.info(f"
|
127 |
|
|
|
128 |
if use_advanced:
|
129 |
is_valid, error_msg = validate_inputs(text, temperature, top_k, top_p, max_tokens)
|
130 |
if not is_valid:
|
131 |
return None, f"β {error_msg}"
|
132 |
|
133 |
-
waveform =
|
134 |
text=text.strip(),
|
135 |
speaker_id=speaker,
|
136 |
temperature=temperature,
|
@@ -139,76 +161,104 @@ def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p,
|
|
139 |
max_new_audio_tokens=int(max_tokens)
|
140 |
)
|
141 |
else:
|
142 |
-
|
|
|
143 |
text=text.strip(),
|
144 |
speaker_id=speaker
|
145 |
)
|
146 |
|
147 |
if waveform is None or waveform.size == 0:
|
148 |
-
return None, "Failed to generate audio. Please try again."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
|
150 |
sample_rate = 16000
|
151 |
return (sample_rate, waveform), f"β
Audio generated successfully for speaker {speaker_name}"
|
152 |
|
153 |
except Exception as e:
|
154 |
-
logger.error(f"Speech generation failed: {e}")
|
155 |
return None, f"β Error: {str(e)}"
|
156 |
|
157 |
-
|
|
|
|
|
158 |
|
159 |
examples = [
|
160 |
-
["Aw ni ce", "Adama"],
|
161 |
-
["Mali bΙna diya kΙsΙbΙ,
|
162 |
-
["Ne bΙ se ka sΙbΙnni yΙlΙma ka kΙ kuma ye", "
|
163 |
-
["I ka kΙnΙ wa?", "
|
164 |
-
["LakΙli karamΙgΙw tun tΙ ka se ka sΙbΙnni kΙ ka Ι²Ι walanda kan wa denmisΙnw tun tΙ ka se ka o sΙbΙnni ninnu ye,
|
165 |
-
["sigikafΙ kΙnΙ jamanaw ni Ι²ΙgΙn cΙ, olu ye a haminankow ye, wa o ko ninnu ka kan ka kΙ sariya ani tilennenya kΙnΙ.", "
|
166 |
-
["Aw ni ce. Ne tΙgΙ ye Adama. AwΙ,
|
167 |
-
["An dΙlakelen bΙ masike bilenman don ka tΙw gΙn.", "
|
168 |
-
["Aw ni ce. Seidu bΙ aw fo wa aw ka yafa a ma, ka da a kan tuma dΙw la kow ka can.", "
|
|
|
169 |
]
|
170 |
|
171 |
def build_interface():
|
172 |
"""Build the Gradio interface for Bambara TTS"""
|
173 |
|
174 |
-
with gr.Blocks(
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
|
|
|
|
|
|
179 |
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
|
185 |
with gr.Row():
|
186 |
with gr.Column(scale=2):
|
187 |
text_input = gr.Textbox(
|
188 |
label="π Bambara Text",
|
189 |
-
placeholder="Type your Bambara text here
|
190 |
-
lines=
|
191 |
-
max_lines=
|
192 |
value="I ni ce"
|
193 |
)
|
194 |
|
195 |
speaker_dropdown = gr.Dropdown(
|
196 |
choices=SPEAKER_NAMES,
|
197 |
-
value="
|
198 |
-
label="π£οΈ Speaker Voice"
|
|
|
199 |
)
|
200 |
|
201 |
-
generate_btn = gr.Button(
|
|
|
|
|
|
|
|
|
202 |
|
203 |
with gr.Column(scale=1):
|
204 |
use_advanced = gr.Checkbox(
|
205 |
-
label="βοΈ
|
206 |
value=False,
|
207 |
-
info="
|
208 |
)
|
209 |
|
210 |
with gr.Group(visible=False) as advanced_group:
|
211 |
-
gr.Markdown("
|
212 |
|
213 |
temperature = gr.Slider(
|
214 |
minimum=0.1,
|
@@ -216,7 +266,7 @@ def build_interface():
|
|
216 |
value=0.8,
|
217 |
step=0.1,
|
218 |
label="Temperature",
|
219 |
-
info="Higher = more varied"
|
220 |
)
|
221 |
|
222 |
top_k = gr.Slider(
|
@@ -224,7 +274,8 @@ def build_interface():
|
|
224 |
maximum=100,
|
225 |
value=50,
|
226 |
step=5,
|
227 |
-
label="Top-K"
|
|
|
228 |
)
|
229 |
|
230 |
top_p = gr.Slider(
|
@@ -232,7 +283,8 @@ def build_interface():
|
|
232 |
maximum=1.0,
|
233 |
value=0.9,
|
234 |
step=0.05,
|
235 |
-
label="Top-P"
|
|
|
236 |
)
|
237 |
|
238 |
max_tokens = gr.Slider(
|
@@ -240,7 +292,8 @@ def build_interface():
|
|
240 |
maximum=4096,
|
241 |
value=2048,
|
242 |
step=256,
|
243 |
-
label="Max Length"
|
|
|
244 |
)
|
245 |
|
246 |
gr.Markdown("### π Generated Audio")
|
@@ -248,39 +301,79 @@ def build_interface():
|
|
248 |
audio_output = gr.Audio(
|
249 |
label="Generated Speech",
|
250 |
type="numpy",
|
251 |
-
interactive=False
|
|
|
252 |
)
|
253 |
|
254 |
status_output = gr.Textbox(
|
255 |
label="Status",
|
256 |
interactive=False,
|
257 |
show_label=False,
|
258 |
-
container=False
|
|
|
259 |
)
|
260 |
|
261 |
-
with gr.Accordion("Try These Examples", open=True):
|
262 |
def load_example(text, speaker):
|
263 |
return text, speaker, False, 0.8, 50, 0.9, 2048
|
264 |
|
265 |
-
gr.Markdown("**Click any example below:**")
|
266 |
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
|
274 |
-
with gr.Accordion("About", open=False):
|
275 |
-
gr.Markdown("""
|
276 |
-
|
277 |
-
|
278 |
-
-
|
279 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
|
281 |
-
**Model
|
|
|
|
|
|
|
|
|
|
|
|
|
282 |
""")
|
283 |
|
|
|
284 |
def toggle_advanced(use_adv):
|
285 |
return gr.Group(visible=use_adv)
|
286 |
|
@@ -290,6 +383,7 @@ def build_interface():
|
|
290 |
outputs=[advanced_group]
|
291 |
)
|
292 |
|
|
|
293 |
generate_btn.click(
|
294 |
fn=generate_speech,
|
295 |
inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
|
@@ -297,6 +391,7 @@ def build_interface():
|
|
297 |
show_progress=True
|
298 |
)
|
299 |
|
|
|
300 |
text_input.submit(
|
301 |
fn=generate_speech,
|
302 |
inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
|
@@ -308,17 +403,20 @@ def build_interface():
|
|
308 |
|
309 |
def main():
|
310 |
"""Main function to launch the Gradio interface"""
|
311 |
-
logger.info("Starting Bambara TTS Gradio interface
|
312 |
|
313 |
-
#
|
314 |
interface = build_interface()
|
|
|
|
|
315 |
interface.launch(
|
316 |
server_name="0.0.0.0",
|
317 |
server_port=7860,
|
318 |
-
share=False
|
|
|
319 |
)
|
320 |
|
321 |
-
logger.info("Gradio interface launched successfully
|
322 |
|
323 |
if __name__ == "__main__":
|
324 |
main()
|
|
|
1 |
import os
|
2 |
|
3 |
+
# Set environment variables BEFORE any imports
|
4 |
os.environ["TORCHDYNAMO_DISABLE"] = "1"
|
5 |
os.environ["TORCH_COMPILE_DISABLE"] = "1"
|
6 |
os.environ["PYTORCH_DISABLE_CUDNN_BENCHMARK"] = "1"
|
7 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
8 |
|
9 |
+
# Set CUDA environment to help with unsloth GPU detection
|
10 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Force GPU visibility
|
11 |
+
os.environ["FORCE_CUDA"] = "1" # Force CUDA usage
|
12 |
+
|
13 |
import torch
|
14 |
import gradio as gr
|
15 |
import numpy as np
|
16 |
import spaces
|
17 |
import logging
|
18 |
from huggingface_hub import login
|
|
|
19 |
import time
|
20 |
|
21 |
torch._dynamo.config.disable = True
|
|
|
28 |
if hf_token:
|
29 |
login(token=hf_token)
|
30 |
|
31 |
+
# Select the inference device, preferring CUDA, then Apple MPS, then CPU.
# The MPS probe only runs when CUDA is unavailable.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Log exactly one line naming the backend that was chosen.
logger.info(
    {
        "cuda": "Using CUDA for inference.",
        "mps": "Using MPS for inference.",
        "cpu": "Using CPU for inference.",
    }[device]
)
|
41 |
|
42 |
def get_speakers_dict():
    """Return a mapping of speaker display names to ``Speakers`` attributes.

    The full ten-voice set is attempted first. If that fails for any reason
    (for example one of the attributes is missing), the five core voices are
    attempted on their own. If that also fails, an empty dict is returned so
    callers can detect that no speakers are available.

    Returns:
        dict: ``{name: Speakers.<name>}`` in a fixed order (core voices first),
        or ``{}`` when neither the full nor the core set could be loaded.
    """
    core_names = ("Adama", "Moussa", "Bourama", "Modibo", "Seydou")
    extra_names = ("Amadou", "Bakary", "Ngolo", "Ibrahima", "Amara")

    try:
        # Imported lazily so a missing/broken package is handled here instead
        # of crashing the module at import time.
        from maliba_ai.config.settings import Speakers
        return {name: getattr(Speakers, name) for name in core_names + extra_names}
    except Exception as e:
        logger.error(f"Failed to import all speakers: {e}")

    # Fallback to core speakers only
    try:
        from maliba_ai.config.settings import Speakers
        return {name: getattr(Speakers, name) for name in core_names}
    except Exception:
        # ``except Exception`` (not a bare ``except``) so KeyboardInterrupt and
        # SystemExit still propagate; exc_info keeps the root cause in the log.
        logger.error("Failed to import even core speakers", exc_info=True)
        return {}
|
73 |
|
74 |
+
def initialize_tts_model():
    """Try to construct the Bambara TTS model and report how long it took.

    Returns:
        The ``BambaraTTSInference`` instance on success, or ``None`` when any
        step raised; in that case a log line notes that initialization is
        deferred to the first request.
    """
    try:
        logger.info("Initializing Bambara TTS model globally...")
        started_at = time.time()

        # Imported inside the try so an import failure is handled the same way
        # as a model-construction failure.
        from maliba_ai.tts import BambaraTTSInference

        tts = BambaraTTSInference()

        duration = time.time() - started_at
        logger.info(f"TTS Model initialized successfully in {duration:.2f} seconds!")
    except Exception as err:
        logger.error(f"Failed to initialize TTS model: {err}")
        logger.info("Model will be initialized on first request instead")
        return None
    else:
        return tts
|
95 |
+
|
96 |
+
# Build the speaker mapping once at import time and log which voices it holds.
speakers_dict = get_speakers_dict()
logger.info(f"Available speakers: {list(speakers_dict)}")

# Attempt an eager, module-level model load (like ASR space).
# initialize_tts_model() returns None on failure; generate_speech then
# initializes the model on the first request instead.
tts_model = initialize_tts_model()
|
103 |
|
104 |
def validate_inputs(text, temperature, top_k, top_p, max_tokens):
|
105 |
+
"""Validate user inputs"""
|
106 |
if not text or not text.strip():
|
107 |
return False, "Please enter some Bambara text."
|
108 |
|
|
|
115 |
if not (0.1 <= top_p <= 1.0):
|
116 |
return False, "Top-P must be between 0.1 and 1.0"
|
117 |
|
118 |
+
if len(text.strip()) > 1000:
|
119 |
+
return False, "Text is too long. Please use shorter text (max 1000 characters)."
|
120 |
+
|
121 |
return True, ""
|
122 |
|
123 |
@spaces.GPU()
|
124 |
def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
|
125 |
+
"""Generate speech - with fallback initialization if global init failed"""
|
126 |
+
global tts_model
|
127 |
+
|
128 |
if not text.strip():
|
129 |
return None, "Please enter some Bambara text."
|
130 |
|
131 |
try:
|
132 |
+
# If global initialization failed, try to initialize here with GPU decorator
|
133 |
+
if tts_model is None:
|
134 |
+
logger.info("Global model initialization failed, initializing with GPU decorator...")
|
135 |
+
from maliba_ai.tts import BambaraTTSInference
|
136 |
+
tts_model = BambaraTTSInference()
|
137 |
+
logger.info("Model initialized successfully with GPU decorator!")
|
138 |
|
139 |
+
if not speakers_dict:
|
140 |
+
return None, "β Speakers not properly loaded"
|
141 |
|
142 |
+
if speaker_name not in speakers_dict:
|
143 |
+
available_speakers = list(speakers_dict.keys())
|
144 |
return None, f"β Speaker '{speaker_name}' not found. Available: {available_speakers}"
|
145 |
|
146 |
+
speaker = speakers_dict[speaker_name]
|
147 |
+
logger.info(f"Generating speech with speaker: {speaker_name}")
|
148 |
|
149 |
+
# Validate inputs if using advanced settings
|
150 |
if use_advanced:
|
151 |
is_valid, error_msg = validate_inputs(text, temperature, top_k, top_p, max_tokens)
|
152 |
if not is_valid:
|
153 |
return None, f"β {error_msg}"
|
154 |
|
155 |
+
waveform = tts_model.generate_speech(
|
156 |
text=text.strip(),
|
157 |
speaker_id=speaker,
|
158 |
temperature=temperature,
|
|
|
161 |
max_new_audio_tokens=int(max_tokens)
|
162 |
)
|
163 |
else:
|
164 |
+
# Use default settings
|
165 |
+
waveform = tts_model.generate_speech(
|
166 |
text=text.strip(),
|
167 |
speaker_id=speaker
|
168 |
)
|
169 |
|
170 |
if waveform is None or waveform.size == 0:
|
171 |
+
return None, "β Failed to generate audio. Please try again with different text."
|
172 |
+
|
173 |
+
# Ensure waveform is in correct format
|
174 |
+
if isinstance(waveform, torch.Tensor):
|
175 |
+
waveform = waveform.cpu().numpy()
|
176 |
+
|
177 |
+
# Normalize audio to prevent clipping
|
178 |
+
if np.max(np.abs(waveform)) > 0:
|
179 |
+
waveform = waveform / np.max(np.abs(waveform)) * 0.9
|
180 |
|
181 |
sample_rate = 16000
|
182 |
return (sample_rate, waveform), f"β
Audio generated successfully for speaker {speaker_name}"
|
183 |
|
184 |
except Exception as e:
|
185 |
+
logger.error(f"Speech generation failed: {e}", exc_info=True)
|
186 |
return None, f"β Error: {str(e)}"
|
187 |
|
188 |
+
# Get available speakers for dropdown
|
189 |
+
SPEAKER_NAMES = list(speakers_dict.keys()) if speakers_dict else ["Adama", "Moussa", "Bourama", "Modibo", "Seydou"]
|
190 |
+
|
191 |
|
192 |
examples = [
|
193 |
+
["Aw ni ce", "Adama"],
|
194 |
+
["Mali bΙna diya kΙsΙbΙ, ka a da a kan baara bΙ ka kΙ.", "Bakary"],
|
195 |
+
["Ne bΙ se ka sΙbΙnni yΙlΙma ka kΙ kuma ye", "Moussa"],
|
196 |
+
["I ka kΙnΙ wa?", "Ngolo"],
|
197 |
+
["LakΙli karamΙgΙw tun tΙ ka se ka sΙbΙnni kΙ ka Ι²Ι walanda kan wa denmisΙnw tun tΙ ka se ka o sΙbΙnni ninnu ye, kuma tΙ ka u kalan. DenmisΙnw kΙra kunfinw ye.", "Bourama"],
|
198 |
+
["sigikafΙ kΙnΙ jamanaw ni Ι²ΙgΙn cΙ, olu ye a haminankow ye, wa o ko ninnu ka kan ka kΙ sariya ani tilennenya kΙnΙ.", "Ibrahima"],
|
199 |
+
["Aw ni ce. Ne tΙgΙ ye Adama. AwΙ, ne ye maliden de ye. Aw SanbΙ SanbΙ. San min tΙ Ι²inan ye, an bΙΙ ka jΙ ka o seli Ι²ΙgΙn fΙ, hΙΙrΙ ni lafiya la. Ala ka Mali suma. Ala ka Mali yiriwa. Ala ka Mali taa Ι²Ι. Ala ka an ka seliw caya. Ala ka yafa an bΙΙ ma.", "Amara"],
|
200 |
+
["An dΙlakelen bΙ masike bilenman don ka tΙw gΙn.", "Modibo"],
|
201 |
+
["Aw ni ce. Seidu bΙ aw fo wa aw ka yafa a ma, ka da a kan tuma dΙw la kow ka can.", "Amadou"],
|
202 |
+
["Bamanankan ye kan Ι²uman ye", "Seydou"],
|
203 |
]
|
204 |
|
205 |
def build_interface():
|
206 |
"""Build the Gradio interface for Bambara TTS"""
|
207 |
|
208 |
+
with gr.Blocks(
|
209 |
+
title="Bambara TTS - MALIBA-AI",
|
210 |
+
theme=gr.themes.Soft(),
|
211 |
+
css="""
|
212 |
+
.main-header { text-align: center; margin-bottom: 2rem; }
|
213 |
+
.status-box { margin-top: 1rem; }
|
214 |
+
"""
|
215 |
+
) as demo:
|
216 |
|
217 |
+
with gr.Row():
|
218 |
+
gr.Markdown(f"""
|
219 |
+
# π€ Bambara Text-to-Speech
|
220 |
+
|
221 |
+
**Powered by MALIBA-AI** | *First Open-Source Bambara TTS*
|
222 |
+
|
223 |
+
Convert Bambara text to natural-sounding speech using our state-of-the-art neural TTS system.
|
224 |
+
|
225 |
+
**Bambara** is spoken by millions of people in Mali and West Africa π
|
226 |
+
|
227 |
+
**Status**: {'β
Model loaded' if tts_model is not None else 'β³ Model will load on first request'}
|
228 |
+
""", elem_classes=["main-header"])
|
229 |
|
230 |
with gr.Row():
|
231 |
with gr.Column(scale=2):
|
232 |
text_input = gr.Textbox(
|
233 |
label="π Bambara Text",
|
234 |
+
placeholder="I ni ce... (Type your Bambara text here)",
|
235 |
+
lines=4,
|
236 |
+
max_lines=8,
|
237 |
value="I ni ce"
|
238 |
)
|
239 |
|
240 |
speaker_dropdown = gr.Dropdown(
|
241 |
choices=SPEAKER_NAMES,
|
242 |
+
value=SPEAKER_NAMES[0] if SPEAKER_NAMES else "Bourama", # Default to most stable speaker
|
243 |
+
label="π£οΈ Speaker Voice",
|
244 |
+
info=f"Choose from {len(SPEAKER_NAMES)} authentic voices (Bourama recommended for best quality)"
|
245 |
)
|
246 |
|
247 |
+
generate_btn = gr.Button(
|
248 |
+
"π΅ Generate Speech",
|
249 |
+
variant="primary",
|
250 |
+
size="lg"
|
251 |
+
)
|
252 |
|
253 |
with gr.Column(scale=1):
|
254 |
use_advanced = gr.Checkbox(
|
255 |
+
label="βοΈ Advanced Settings",
|
256 |
value=False,
|
257 |
+
info="Customize generation parameters"
|
258 |
)
|
259 |
|
260 |
with gr.Group(visible=False) as advanced_group:
|
261 |
+
gr.Markdown("**π§ Advanced Parameters:**")
|
262 |
|
263 |
temperature = gr.Slider(
|
264 |
minimum=0.1,
|
|
|
266 |
value=0.8,
|
267 |
step=0.1,
|
268 |
label="Temperature",
|
269 |
+
info="Higher = more varied speech"
|
270 |
)
|
271 |
|
272 |
top_k = gr.Slider(
|
|
|
274 |
maximum=100,
|
275 |
value=50,
|
276 |
step=5,
|
277 |
+
label="Top-K",
|
278 |
+
info="Vocabulary selection size"
|
279 |
)
|
280 |
|
281 |
top_p = gr.Slider(
|
|
|
283 |
maximum=1.0,
|
284 |
value=0.9,
|
285 |
step=0.05,
|
286 |
+
label="Top-P",
|
287 |
+
info="Nucleus sampling threshold"
|
288 |
)
|
289 |
|
290 |
max_tokens = gr.Slider(
|
|
|
292 |
maximum=4096,
|
293 |
value=2048,
|
294 |
step=256,
|
295 |
+
label="Max Audio Length",
|
296 |
+
info="Maximum audio duration"
|
297 |
)
|
298 |
|
299 |
gr.Markdown("### π Generated Audio")
|
|
|
301 |
audio_output = gr.Audio(
|
302 |
label="Generated Speech",
|
303 |
type="numpy",
|
304 |
+
interactive=False,
|
305 |
+
show_download_button=True
|
306 |
)
|
307 |
|
308 |
status_output = gr.Textbox(
|
309 |
label="Status",
|
310 |
interactive=False,
|
311 |
show_label=False,
|
312 |
+
container=False,
|
313 |
+
elem_classes=["status-box"]
|
314 |
)
|
315 |
|
316 |
+
with gr.Accordion("π Try These Examples", open=True):
|
317 |
def load_example(text, speaker):
|
318 |
return text, speaker, False, 0.8, 50, 0.9, 2048
|
319 |
|
320 |
+
gr.Markdown("**Click any example below to try it:**")
|
321 |
|
322 |
+
with gr.Row():
|
323 |
+
for i, (text, speaker) in enumerate(examples[:5]):
|
324 |
+
btn = gr.Button(
|
325 |
+
f"πΉ {text[:25]}{'...' if len(text) > 25 else ''}",
|
326 |
+
size="sm"
|
327 |
+
)
|
328 |
+
btn.click(
|
329 |
+
fn=lambda t=text, s=speaker: load_example(t, s),
|
330 |
+
outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
|
331 |
+
)
|
332 |
+
|
333 |
+
with gr.Row():
|
334 |
+
for i, (text, speaker) in enumerate(examples[5:]):
|
335 |
+
btn = gr.Button(
|
336 |
+
f"πΉ {text[:25]}{'...' if len(text) > 25 else ''}",
|
337 |
+
size="sm"
|
338 |
+
)
|
339 |
+
btn.click(
|
340 |
+
fn=lambda t=text, s=speaker: load_example(t, s),
|
341 |
+
outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
|
342 |
+
)
|
343 |
|
344 |
+
with gr.Accordion("βΉοΈ About", open=False):
|
345 |
+
gr.Markdown(f"""
|
346 |
+
## About MALIBA-AI Bambara TTS
|
347 |
+
|
348 |
+
- **π― Purpose**: First open-source Text-to-Speech system for Bambara language
|
349 |
+
- **π£οΈ Speakers**: {len(SPEAKER_NAMES)} different authentic voices
|
350 |
+
- **π Quality**: 16kHz neural speech synthesis
|
351 |
+
- **β‘ Performance**: Optimized for real-time generation
|
352 |
+
- **π± Usage**: Educational, accessibility, and cultural preservation
|
353 |
+
|
354 |
+
### π Speaker Characteristics:
|
355 |
+
|
356 |
+
- **Bourama**: Most stable and accurate (recommended)
|
357 |
+
- **Adama**: Natural conversational tone
|
358 |
+
- **Moussa**: Clear pronunciation for educational content
|
359 |
+
- **Modibo**: Expressive delivery for storytelling
|
360 |
+
- **Seydou**: Balanced characteristics for general use
|
361 |
+
- **Amadou**: Warm and friendly voice
|
362 |
+
- **Bakary**: Deep, authoritative tone
|
363 |
+
- **Ngolo**: Youthful and energetic
|
364 |
+
- **Ibrahima**: Calm and measured delivery
|
365 |
+
- **Amara**: Melodic and smooth
|
366 |
|
367 |
+
**Model Architecture**: Built on state-of-the-art neural TTS with Bambara-specific optimizations
|
368 |
+
|
369 |
+
**License**: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 (CC BY-NC-SA 4.0)
|
370 |
+
|
371 |
+
---
|
372 |
+
|
373 |
+
**MALIBA-AI Mission**: Ensuring no Malian is left behind by technological advances π²π±
|
374 |
""")
|
375 |
|
376 |
+
# Event handlers
|
377 |
def toggle_advanced(use_adv):
|
378 |
return gr.Group(visible=use_adv)
|
379 |
|
|
|
383 |
outputs=[advanced_group]
|
384 |
)
|
385 |
|
386 |
+
# Generate speech on button click
|
387 |
generate_btn.click(
|
388 |
fn=generate_speech,
|
389 |
inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
|
|
|
391 |
show_progress=True
|
392 |
)
|
393 |
|
394 |
+
# Generate speech on Enter key
|
395 |
text_input.submit(
|
396 |
fn=generate_speech,
|
397 |
inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
|
|
|
403 |
|
404 |
def main():
    """Build the Gradio interface and serve it on 0.0.0.0:7860."""
    logger.info("Starting MALIBA-AI Bambara TTS Gradio interface...")

    # Server settings collected in one place: listen on all interfaces,
    # fixed port, no public share link, surface errors in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }

    build_interface().launch(**launch_options)

    # NOTE(review): if launch() blocks until the server stops, this line is
    # only logged at shutdown — confirm against the Gradio version in use.
    logger.info("Gradio interface launched successfully!")
|
420 |
|
421 |
if __name__ == "__main__":
|
422 |
main()
|