Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -1,10 +1,15 @@
 import os
 
+# Disable problematic optimizations for ZeroGPU compatibility
 os.environ["TORCHDYNAMO_DISABLE"] = "1"
 os.environ["TORCH_COMPILE_DISABLE"] = "1"
 os.environ["PYTORCH_DISABLE_CUDNN_BENCHMARK"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
+# Disable Unsloth optimizations that cause issues in ZeroGPU
+os.environ["UNSLOTH_DISABLE"] = "1"
+os.environ["DISABLE_UNSLOTH"] = "1"
+
 import torch
 import gradio as gr
 import numpy as np
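The added flags only take effect if they are set before `torch` (and anything that wraps it) is imported, which is why the block sits at the very top of `app.py`. A minimal sketch of that ordering, for illustration only and not part of this commit:

```python
# Sketch: configure the environment first, then import torch.
# If torch were imported above these lines, TorchDynamo/torch.compile
# could already be active with its default settings.
import os

os.environ["TORCHDYNAMO_DISABLE"] = "1"         # turn off TorchDynamo tracing
os.environ["TORCH_COMPILE_DISABLE"] = "1"       # turn off torch.compile
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence tokenizers fork warnings

import torch  # imported only after the flags are in place

print(torch.__version__)
```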
@@ -24,50 +29,38 @@ hf_token = os.getenv("HF_TOKEN")
 if hf_token:
     login(token=hf_token)
 
-# Global variables for model caching
+# Global variables for model caching
 _tts_model = None
 _speakers_dict = None
 _model_initialized = False
 _initialization_in_progress = False
 
 def get_speakers_dict():
-    """Get speakers dictionary using the …
+    """Get speakers dictionary using the correct SDK structure"""
     try:
-        # …
+        # Import the Speakers class (not individual speakers)
         from maliba_ai.config.settings import Speakers
 
-        # …
-        …
-        …
-        …
-        …
+        # Access all 10 speakers through the Speakers class
+        speakers_dict = {
+            "Adama": Speakers.Adama,
+            "Moussa": Speakers.Moussa,
+            "Bourama": Speakers.Bourama,
+            "Modibo": Speakers.Modibo,
+            "Seydou": Speakers.Seydou,
+            "Amadou": Speakers.Amadou,
+            "Bakary": Speakers.Bakary,
+            "Ngolo": Speakers.Ngolo,
+            "Amara": Speakers.Amara,
+            "Ibrahima": Speakers.Ibrahima
+        }
+
+        logger.info(f"🎤 Successfully loaded {len(speakers_dict)} speakers: {list(speakers_dict.keys())}")
+        return speakers_dict
 
-        for name in speaker_names:
-            if hasattr(Speakers, name):
-                available_speakers[name] = getattr(Speakers, name)
-
-        if available_speakers:
-            logger.info(f"Loaded {len(available_speakers)} speakers from new structure: {list(available_speakers.keys())}")
-            return available_speakers
-        else:
-            raise AttributeError("No speakers found in new structure")
-
     except Exception as e:
-        logger.error(f"Failed to import …
-
-        try:
-            from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou
-            logger.info("Using fallback old speaker structure")
-            return {
-                "Adama": Adame,
-                "Moussa": Moussa,
-                "Bourama": Bourama,
-                "Modibo": Modibo,
-                "Seydou": Seydou
-            }
-        except Exception as e2:
-            logger.error(f"Failed to import speakers: {e2}")
-            return {}
+        logger.error(f"❌ Failed to import Speakers class: {e}")
+        return {}
 
 @spaces.GPU()
 def initialize_model_once():
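For comparison, the explicit dictionary added above can also be generated from a name list with `getattr`, which is essentially what the removed loop did. A minimal sketch, assuming the `Speakers` class exposes the ten attributes named in this commit:

```python
# Sketch: build the speaker mapping dynamically instead of listing each
# attribute by hand; names missing from a given maliba_ai version are skipped.
from maliba_ai.config.settings import Speakers

names = ["Adama", "Moussa", "Bourama", "Modibo", "Seydou",
         "Amadou", "Bakary", "Ngolo", "Amara", "Ibrahima"]

speakers_dict = {name: getattr(Speakers, name)
                 for name in names if hasattr(Speakers, name)}
```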
@@ -91,8 +84,8 @@ def initialize_model_once():
         logger.info("Initializing Bambara TTS model...")
         start_time = time.time()
 
-        # Use the …
-        from maliba_ai.tts import BambaraTTSInference
+        # Use the correct import path
+        from maliba_ai.tts.inference import BambaraTTSInference
 
         model = BambaraTTSInference()
         speakers = get_speakers_dict()
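Both import paths appear in this diff (`maliba_ai.tts` before, `maliba_ai.tts.inference` after). If the Space ever needs to tolerate either SDK layout, a defensive import is one option; a sketch under that assumption, not what the commit itself does:

```python
# Sketch: prefer the newer module path, fall back to the older one.
try:
    from maliba_ai.tts.inference import BambaraTTSInference
except ImportError:
    from maliba_ai.tts import BambaraTTSInference

model = BambaraTTSInference()
```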
@@ -221,7 +214,7 @@ def get_speaker_names():
 
 SPEAKER_NAMES = get_speaker_names()
 
-# Examples
+# Examples representing ALL 10 speakers - with fallbacks for missing speakers
 examples = [
     ["Aw ni ce", "Adama"],  # Natural conversational greeting
     ["Mali bɛna diya kɔsɛbɛ, ka a da a kan baara bɛ ka kɛ.", "Moussa"],  # Clear pronunciation for informative content
@@ -239,6 +232,31 @@ examples = [
     ["Dɔnko ɲuman ye, a bɛ dɔn mɔgɔ kɔnɔ", "Amara"],  # Melodic and smooth for poetic expression
 ]
 
+def get_safe_examples():
+    """Get examples with speaker fallbacks for missing speakers"""
+    safe_examples = []
+
+    # Fallback mapping for missing speakers
+    fallback_speakers = {
+        "Amadou": "Adama",     # Warm -> Natural conversational
+        "Bakary": "Modibo",    # Authoritative -> Expressive
+        "Ngolo": "Adama",      # Youthful -> Natural conversational
+        "Ibrahima": "Seydou",  # Calm -> Balanced
+        "Amara": "Moussa"      # Melodic -> Clear pronunciation
+    }
+
+    for text, speaker in examples:
+        # Use original speaker if available, otherwise use fallback
+        if speaker in SPEAKER_NAMES:
+            safe_examples.append([text, speaker])
+        elif speaker in fallback_speakers and fallback_speakers[speaker] in SPEAKER_NAMES:
+            safe_examples.append([text, fallback_speakers[speaker]])
+        else:
+            # Final fallback to first available speaker
+            safe_examples.append([text, SPEAKER_NAMES[0]])
+
+    return safe_examples
+
 def build_interface():
     """Build the Gradio interface - simplified like your old working version"""
 
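The same fallback selection can be written more compactly with `dict.get`. A minimal sketch using the names defined above (`examples`, `SPEAKER_NAMES`, the fallback mapping) purely as an illustration:

```python
# Sketch: pick the example's speaker if available, otherwise its mapped
# fallback, otherwise the first speaker in the list.
def pick_speaker(speaker, speaker_names, fallbacks):
    candidate = speaker if speaker in speaker_names else fallbacks.get(speaker, speaker_names[0])
    return candidate if candidate in speaker_names else speaker_names[0]

def get_safe_examples_compact(examples, speaker_names, fallbacks):
    return [[text, pick_speaker(speaker, speaker_names, fallbacks)]
            for text, speaker in examples]
```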
@@ -246,9 +264,9 @@ def build_interface():
         gr.Markdown("""
         # 🎤 Bambara Text-to-Speech
 
-        **Powered by MALIBA-AI**
+        **Powered by MALIBA-AI** | *First Open-Source Bambara TTS*
 
-        Convert Bambara text to speech using our state-of-the-art TTS
+        Convert Bambara text to natural-sounding speech using our state-of-the-art neural TTS system.
 
         **Bambara** is spoken by millions of people in Mali and West Africa.
         """)
@@ -267,7 +285,7 @@
                 choices=SPEAKER_NAMES,
                 value="Bourama" if "Bourama" in SPEAKER_NAMES else SPEAKER_NAMES[0],
                 label="🗣️ Speaker Voice",
-                info=f"Choose from {len(SPEAKER_NAMES)} authentic voices"
+                info=f"Choose from {len(SPEAKER_NAMES)} authentic Bambara voices"
             )
 
             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
@@ -321,7 +339,7 @@
                 label="Generated Speech",
                 type="numpy",
                 interactive=False,
-                format="wav"
+                format="wav"
             )
 
             status_output = gr.Textbox(
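With `type="numpy"`, the Gradio audio component expects the generation callback to hand back a `(sample_rate, samples)` tuple. A standalone sketch of that contract using a synthetic 16 kHz tone (the rate quoted in the About section), not the real TTS model:

```python
import numpy as np

def tone_16khz(freq_hz: float = 440.0, seconds: float = 1.0):
    # Stand-in for real TTS output: Gradio's numpy audio format is a
    # (sample_rate, samples) tuple of int and np.ndarray.
    sr = 16000
    t = np.linspace(0.0, seconds, int(sr * seconds), endpoint=False)
    return sr, (0.2 * np.sin(2 * np.pi * freq_hz * t)).astype(np.float32)
```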
@@ -337,7 +355,10 @@
 
             gr.Markdown("**Click any example below:**")
 
-            for i, (text, speaker) in enumerate(examples):
+            # Use safe examples with fallbacks for missing speakers
+            safe_examples = get_safe_examples()
+
+            for i, (text, speaker) in enumerate(safe_examples):
                 btn = gr.Button(f"{text[:30]}{'...' if len(text) > 30 else ''}", size="sm")
                 btn.click(
                     fn=lambda t=text, s=speaker: load_example(t, s),
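The `fn=lambda t=text, s=speaker: ...` pattern in the loop above is deliberate: default arguments freeze the current loop values, whereas a plain closure would see only the last iteration by the time a button is clicked. A quick standalone illustration:

```python
# Sketch: late binding vs. default-argument binding in a loop of callbacks.
texts = ["a", "b", "c"]

late_bound = [lambda: text for text in texts]
frozen = [lambda t=text: t for text in texts]

print([f() for f in late_bound])  # ['c', 'c', 'c']
print([f() for f in frozen])      # ['a', 'b', 'c']
```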
@@ -349,7 +370,7 @@
         ## About MALIBA-AI Bambara TTS
 
         - **🎯 Purpose**: First open-source Text-to-Speech system for Bambara language
-        - **🗣️ Speakers**: {len(SPEAKER_NAMES)}
+        - **🗣️ Speakers**: {len(SPEAKER_NAMES)} authentic Bambara voices
         - **🔊 Quality**: 16kHz neural speech synthesis
         - **⚡ Performance**: Model loads once and stays in memory
         - **📱 Usage**: Educational, accessibility, and cultural preservation
@@ -357,18 +378,6 @@ def build_interface():
         ### 🎭 Available Speakers:
         {', '.join(SPEAKER_NAMES)}
 
-        ### 🎯 Speaker Characteristics:
-        - **Bourama**: Most stable and accurate (recommended)
-        - **Adama**: Natural conversational tone
-        - **Moussa**: Clear pronunciation for educational content
-        - **Modibo**: Expressive delivery for storytelling
-        - **Seydou**: Balanced characteristics for general use
-        - **Amadou**: Warm and friendly voice
-        - **Bakary**: Deep, authoritative tone
-        - **Ngolo**: Youthful and energetic
-        - **Ibrahima**: Calm and measured delivery
-        - **Amara**: Melodic and smooth
-
         **License**: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 (CC BY-NC-SA 4.0)
 
         ---
@@ -404,8 +413,7 @@ def build_interface():
 def main():
     """Main function to launch the Gradio interface"""
     logger.info("Starting Bambara TTS Gradio interface.")
-
-    # DO NOT preload - let it initialize on first request only (like your working version)
+
     interface = build_interface()
     interface.launch(
         server_name="0.0.0.0",