sudoping01 committed on
Commit
002d115
·
verified ·
1 Parent(s): 1dc725e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -32
app.py CHANGED
@@ -33,25 +33,31 @@ _initialization_in_progress = False
33
  def get_speakers_dict():
34
  """Get speakers dictionary using the new SDK structure"""
35
  try:
36
- # Use new structure from maliba_ai.config.settings (as shown in README)
37
  from maliba_ai.config.settings import Speakers
38
- return {
39
- "Adama": Speakers.Adama,
40
- "Moussa": Speakers.Moussa,
41
- "Bourama": Speakers.Bourama,
42
- "Modibo": Speakers.Modibo,
43
- "Seydou": Speakers.Seydou,
44
- "Amadou": Speakers.Amadou,
45
- "Bakary": Speakers.Bakary,
46
- "Ngolo": Speakers.Ngolo,
47
- "Ibrahima": Speakers.Ibrahima,
48
- "Amara": Speakers.Amara
49
- }
 
 
 
 
 
50
  except Exception as e:
51
  logger.error(f"Failed to import from new settings structure: {e}")
52
  # Fallback to old structure if new one fails
53
  try:
54
  from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou
 
55
  return {
56
  "Adama": Adame,
57
  "Moussa": Moussa,
@@ -167,6 +173,18 @@ def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p,
167
  if waveform is None or waveform.size == 0:
168
  return None, "Failed to generate audio. Please try again."
169
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  sample_rate = 16000
171
  return (sample_rate, waveform), f"✅ Audio generated successfully for speaker {speaker_name}"
172
 
@@ -174,32 +192,51 @@ def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p,
174
  logger.error(f"Speech generation failed: {e}")
175
  return None, f"❌ Error: {str(e)}"
176
 
177
- # Use available speakers (prioritize new SDK structure with 10 speakers)
178
  def get_speaker_names():
179
  speakers = get_speakers_dict()
180
- if speakers and len(speakers) >= 10:
181
- # New SDK with all 10 speakers
182
- return ["Bourama", "Adama", "Moussa", "Modibo", "Seydou", "Amadou", "Bakary", "Ngolo", "Ibrahima", "Amara"]
183
- elif speakers:
184
- # Return whatever speakers are available
185
- return list(speakers.keys())
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  else:
187
- # Fallback to old 5 speakers
188
- return ["Adama", "Moussa", "Bourama", "Modibo", "Seydou"]
 
189
 
190
  SPEAKER_NAMES = get_speaker_names()
191
 
192
- # Examples with variety of lengths and speakers matched to content
193
  examples = [
194
  ["Aw ni ce", "Adama"], # Natural conversational greeting
195
- ["Mali bɛna diya kɔsɛbɛ, ka a da a kan baara bɛ ka kɛ.", "Bakary" if "Bakary" in SPEAKER_NAMES else "Moussa"],
196
- ["Ne bɛ se ka sɛbɛnni yɛlɛma ka kɛ kuma ye", "Moussa"],
197
- ["I ka kɛnɛ wa?", "Ngolo" if "Ngolo" in SPEAKER_NAMES else "Modibo"],
198
- ["Lakɔli karamɔgɔw tun tɛ ka se ka sɛbɛnni kɛ ka ɲɛ walanda kan wa denmisɛnw tun tɛ ka se ka o sɛbɛnni ninnu ye, kuma tɛ ka u kalan. Denmisɛnw kɛra kunfinw ye.", "Bourama"],
199
- ["sigikafɔ kɔnɔ jamanaw ni ɲɔgɔn cɛ, olu ye a haminankow ye, wa o ko ninnu ka kan ka kɛ sariya ani tilennenya kɔnɔ.", "Ibrahima" if "Ibrahima" in SPEAKER_NAMES else "Seydou"],
200
- ["Aw ni ce. Ne tɔgɔ ye Adama. Awɔ, ne ye maliden de ye. Aw Sanbɛ Sanbɛ. San min tɛ ɲinan ye, an bɛɛ ka jɛ ka o seli ɲɔgɔn fɛ, hɛɛrɛ ni lafiya la. Ala ka Mali suma. Ala ka Mali yiriwa. Ala ka Mali taa ɲɛ. Ala ka an ka seliw caya. Ala ka yafa an bɛɛ ma.", "Amara" if "Amara" in SPEAKER_NAMES else "Moussa"],
201
- ["An dɔlakelen bɛ masike bilenman don ka tɔw gɛn.", "Modibo"],
202
- ["Aw ni ce. Seidu bɛ aw fo wa aw ka yafa a ma, ka da a kan tuma dɔw la kow ka can.", "Amadou" if "Amadou" in SPEAKER_NAMES else "Modibo"],
 
 
 
 
 
203
  ]
204
 
205
  def build_interface():
@@ -283,7 +320,8 @@ def build_interface():
283
  audio_output = gr.Audio(
284
  label="Generated Speech",
285
  type="numpy",
286
- interactive=False
 
287
  )
288
 
289
  status_output = gr.Textbox(
 
33
  def get_speakers_dict():
34
  """Get speakers dictionary using the new SDK structure"""
35
  try:
36
+ # Try the new structure first - check what's actually available
37
  from maliba_ai.config.settings import Speakers
38
+
39
+ # Get all available speaker attributes dynamically
40
+ available_speakers = {}
41
+ # Updated speaker list with all 10 speakers in preferred order
42
+ speaker_names = ["Bourama", "Adama", "Moussa", "Modibo", "Seydou",
43
+ "Amadou", "Bakary", "Ngolo", "Ibrahima", "Amara"]
44
+
45
+ for name in speaker_names:
46
+ if hasattr(Speakers, name):
47
+ available_speakers[name] = getattr(Speakers, name)
48
+
49
+ if available_speakers:
50
+ logger.info(f"Loaded {len(available_speakers)} speakers from new structure: {list(available_speakers.keys())}")
51
+ return available_speakers
52
+ else:
53
+ raise AttributeError("No speakers found in new structure")
54
+
55
  except Exception as e:
56
  logger.error(f"Failed to import from new settings structure: {e}")
57
  # Fallback to old structure if new one fails
58
  try:
59
  from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou
60
+ logger.info("Using fallback old speaker structure")
61
  return {
62
  "Adama": Adame,
63
  "Moussa": Moussa,
 
173
  if waveform is None or waveform.size == 0:
174
  return None, "Failed to generate audio. Please try again."
175
 
176
+ # Convert to numpy if it's a tensor
177
+ if isinstance(waveform, torch.Tensor):
178
+ waveform = waveform.cpu().numpy()
179
+
180
+ # Ensure proper audio format (convert float32 to int16 range but keep as float for Gradio)
181
+ if waveform.dtype == np.float32:
182
+ # Normalize to [-1, 1] range if needed
183
+ if np.max(np.abs(waveform)) > 1.0:
184
+ waveform = waveform / np.max(np.abs(waveform))
185
+ # Keep as float32 but ensure proper range for Gradio
186
+ waveform = np.clip(waveform, -1.0, 1.0)
187
+
188
  sample_rate = 16000
189
  return (sample_rate, waveform), f"✅ Audio generated successfully for speaker {speaker_name}"
190
 
 
192
  logger.error(f"Speech generation failed: {e}")
193
  return None, f"❌ Error: {str(e)}"
194
 
195
+ # Use available speakers (detect what's actually available, prioritize Bourama)
196
  def get_speaker_names():
197
  speakers = get_speakers_dict()
198
+ if speakers:
199
+ speaker_list = list(speakers.keys())
200
+ # Reorder to match preferred order (Bourama first)
201
+ preferred_order = ["Bourama", "Adama", "Moussa", "Modibo", "Seydou",
202
+ "Amadou", "Bakary", "Ngolo", "Ibrahima", "Amara"]
203
+
204
+ # Sort available speakers according to preferred order
205
+ ordered_speakers = []
206
+ for speaker in preferred_order:
207
+ if speaker in speaker_list:
208
+ ordered_speakers.append(speaker)
209
+
210
+ # Add any remaining speakers not in preferred list
211
+ for speaker in speaker_list:
212
+ if speaker not in ordered_speakers:
213
+ ordered_speakers.append(speaker)
214
+
215
+ logger.info(f"Available speakers: {ordered_speakers}")
216
+ return ordered_speakers
217
  else:
218
+ # Final fallback with Bourama first
219
+ logger.warning("No speakers loaded, using fallback list")
220
+ return ["Bourama", "Adama", "Moussa", "Modibo", "Seydou"]
221
 
222
  SPEAKER_NAMES = get_speaker_names()
223
 
224
+ # Examples with variety of lengths and speakers matched to their characteristics
225
  examples = [
226
  ["Aw ni ce", "Adama"], # Natural conversational greeting
227
+ ["Mali bɛna diya kɔsɛbɛ, ka a da a kan baara bɛ ka kɛ.", "Moussa"], # Clear pronunciation for informative content
228
+ ["Ne bɛ se ka sɛbɛnni yɛlɛma ka kɛ kuma ye", "Bourama"], # Most stable for educational content
229
+ ["I ka kɛnɛ wa?", "Modibo"], # Expressive delivery for questions
230
+ ["Lakɔli karamɔgɔw tun tɛ ka se ka sɛbɛnni kɛ ka ɲɛ walanda kan wa denmisɛnw tun tɛ ka se ka o sɛbɛnni ninnu ye, kuma tɛ ka u kalan. Denmisɛnw kɛra kunfinw ye.", "Adama"], # Natural conversational tone for longer explanation
231
+ ["sigikafɔ kɔnɔ jamanaw ni ɲɔgɔn cɛ, olu ye a haminankow ye, wa o ko ninnu ka kan ka kɛ sariya ani tilennenya kɔnɔ.", "Seydou"], # Balanced characteristics for formal content
232
+ ["Aw ni ce. Ne tɔgɔ ye Adama. Awɔ, ne ye maliden de ye. Aw Sanbɛ Sanbɛ. San min tɛ ɲinan ye, an bɛɛ ka jɛ ka o seli ɲɔgɔn fɛ, hɛɛrɛ ni lafiya la. Ala ka Mali suma. Ala ka Mali yiriwa. Ala ka Mali taa ɲɛ. Ala ka an ka seliw caya. Ala ka yafa an bɛɛ ma.", "Moussa"], # Clear pronunciation for heartfelt long message
233
+ ["An dɔlakelen bɛ masike bilenman don ka tɔw gɛn.", "Bourama"], # Most stable for complex statement
234
+ ["Aw ni ce. Seidu bɛ aw fo wa aw ka yafa a ma, ka da a kan tuma dɔw la kow ka can.", "Modibo"], # Expressive delivery for personal greeting
235
+ ["To tɔ nantan ni lafiya, o ka fisa ni so fa dumuniba kɛlɛma ye.", "Amadou"], # Warm and friendly voice for wisdom saying
236
+ ["Mali ye jamana ɲuman ye!", "Bakary"], # Deep, authoritative tone for patriotic statement
237
+ ["An ka ɲɔgɔn dɛmɛ ka baara kɛ ɲɔgɔn fɛ", "Ngolo"], # Youthful and energetic for collaboration
238
+ ["Hakili to yɔrɔ min na, sabali bɛ yen", "Ibrahima"], # Calm and measured for philosophical thought
239
+ ["Dɔnko ɲuman ye, a bɛ dɔn mɔgɔ kɔnɔ", "Amara"], # Melodic and smooth for poetic expression
240
  ]
241
 
242
  def build_interface():
 
320
  audio_output = gr.Audio(
321
  label="Generated Speech",
322
  type="numpy",
323
+ interactive=False,
324
+ format="wav" # Specify WAV format to help with conversion
325
  )
326
 
327
  status_output = gr.Textbox(