import os
# Set environment variables BEFORE any imports
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TORCH_COMPILE_DISABLE"] = "1"
os.environ["PYTORCH_DISABLE_CUDNN_BENCHMARK"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Set CUDA environment to help with unsloth GPU detection
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Force GPU visibility
os.environ["FORCE_CUDA"] = "1" # Force CUDA usage
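# Note: TORCHDYNAMO_DISABLE / TORCH_COMPILE_DISABLE turn off torch.compile graph capture
# (disabled again below via torch._dynamo.config), and CUDA_VISIBLE_DEVICES / FORCE_CUDA are
# set so that downstream libraries (unsloth, per the comment above) see the GPU at import time.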
import torch
import gradio as gr
import numpy as np
import spaces
import logging
from huggingface_hub import login
import time
torch._dynamo.config.disable = True
torch._dynamo.config.suppress_errors = True
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
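# HF_TOKEN is read from the environment (e.g. a Space secret); when it is not set, login is
# simply skipped and only public Hub assets are downloadable.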
# Check GPU availability
if torch.cuda.is_available():
    device = "cuda"
    logger.info("Using CUDA for inference.")
elif torch.backends.mps.is_available():
    device = "mps"
    logger.info("Using MPS for inference.")
else:
    device = "cpu"
    logger.info("Using CPU for inference.")
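# The detected `device` is only logged; it is not passed anywhere else in this file. Actual
# placement presumably happens inside the maliba_ai library and, on Spaces, inside the
# @spaces.GPU-decorated handler below (an assumption about the library's internals).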
def get_speakers_dict():
    """Get speakers dictionary using the new package structure"""
    try:
        from maliba_ai.config.settings import Speakers
        return {
            "Adama": Speakers.Adama,
            "Moussa": Speakers.Moussa,
            "Bourama": Speakers.Bourama,
            "Modibo": Speakers.Modibo,
            "Seydou": Speakers.Seydou,
            "Amadou": Speakers.Amadou,
            "Bakary": Speakers.Bakary,
            "Ngolo": Speakers.Ngolo,
            "Ibrahima": Speakers.Ibrahima,
            "Amara": Speakers.Amara
        }
    except Exception as e:
        logger.error(f"Failed to import all speakers: {e}")
        # Fallback to core speakers only
        try:
            from maliba_ai.config.settings import Speakers
            return {
                "Adama": Speakers.Adama,
                "Moussa": Speakers.Moussa,
                "Bourama": Speakers.Bourama,
                "Modibo": Speakers.Modibo,
                "Seydou": Speakers.Seydou
            }
        except Exception:
            logger.error("Failed to import even core speakers")
            return {}
def initialize_tts_model():
    """Initialize TTS model globally - similar to ASR space pattern"""
    try:
        logger.info("Initializing Bambara TTS model globally...")
        start_time = time.time()

        # Import and initialize the TTS model
        from maliba_ai.tts import BambaraTTSInference
        model = BambaraTTSInference()

        elapsed = time.time() - start_time
        logger.info(f"TTS Model initialized successfully in {elapsed:.2f} seconds!")
        return model
    except Exception as e:
        logger.error(f"Failed to initialize TTS model: {e}")
        logger.info("Model will be initialized on first request instead")
        return None
# Initialize speakers dictionary
speakers_dict = get_speakers_dict()
logger.info(f"Available speakers: {list(speakers_dict.keys())}")
# Try to initialize model globally (like ASR space)
# If it fails due to GPU detection, it will be None and we'll init on first request
tts_model = initialize_tts_model()
def validate_inputs(text, temperature, top_k, top_p, max_tokens):
    """Validate user inputs"""
    if not text or not text.strip():
        return False, "Please enter some Bambara text."
    if not (0.001 <= temperature <= 2.0):
        return False, "Temperature must be between 0.001 and 2.0"
    if not (1 <= top_k <= 100):
        return False, "Top-K must be between 1 and 100"
    if not (0.1 <= top_p <= 1.0):
        return False, "Top-P must be between 0.1 and 1.0"
    if len(text.strip()) > 1000:
        return False, "Text is too long. Please use shorter text (max 1000 characters)."
    return True, ""
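# Example calls to validate_inputs(), using the same defaults as the UI sliders below:
#   validate_inputs("Aw ni ce", 0.8, 50, 0.9, 2048)  -> (True, "")
#   validate_inputs("", 0.8, 50, 0.9, 2048)          -> (False, "Please enter some Bambara text.")
# Note that max_tokens is accepted for symmetry with the UI but is not range-checked here.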
@spaces.GPU()
def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
    """Generate speech - with fallback initialization if global init failed"""
    global tts_model

    if not text.strip():
        return None, "Please enter some Bambara text."

    try:
        # If global initialization failed, try to initialize here with GPU decorator
        if tts_model is None:
            logger.info("Global model initialization failed, initializing with GPU decorator...")
            from maliba_ai.tts import BambaraTTSInference
            tts_model = BambaraTTSInference()
            logger.info("Model initialized successfully with GPU decorator!")

        if not speakers_dict:
            return None, "❌ Speakers not properly loaded"

        if speaker_name not in speakers_dict:
            available_speakers = list(speakers_dict.keys())
            return None, f"❌ Speaker '{speaker_name}' not found. Available: {available_speakers}"

        speaker = speakers_dict[speaker_name]
        logger.info(f"Generating speech with speaker: {speaker_name}")

        # Validate inputs if using advanced settings
        if use_advanced:
            is_valid, error_msg = validate_inputs(text, temperature, top_k, top_p, max_tokens)
            if not is_valid:
                return None, f"❌ {error_msg}"

            waveform = tts_model.generate_speech(
                text=text.strip(),
                speaker_id=speaker,
                temperature=temperature,
                top_k=int(top_k),
                top_p=top_p,
                max_new_audio_tokens=int(max_tokens)
            )
        else:
            # Use default settings
            waveform = tts_model.generate_speech(
                text=text.strip(),
                speaker_id=speaker
            )

        if waveform is None or waveform.size == 0:
            return None, "❌ Failed to generate audio. Please try again with different text."

        # Ensure waveform is in correct format
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.cpu().numpy()

        # Normalize audio to prevent clipping
        if np.max(np.abs(waveform)) > 0:
            waveform = waveform / np.max(np.abs(waveform)) * 0.9

        sample_rate = 16000
        return (sample_rate, waveform), f"✅ Audio generated successfully for speaker {speaker_name}"

    except Exception as e:
        logger.error(f"Speech generation failed: {e}", exc_info=True)
        return None, f"❌ Error: {str(e)}"
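# Usage sketch for generate_speech() (this is exactly what the Gradio handlers below call, and
# it can also be invoked directly once the model and speakers have loaded), with the UI defaults:
#   (sr, audio), status = generate_speech("Aw ni ce", "Bourama", False, 0.8, 50, 0.9, 2048)
#   # sr is 16000 and audio is a normalized numpy waveform suitable for gr.Audio(type="numpy")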
# Get available speakers for dropdown
SPEAKER_NAMES = list(speakers_dict.keys()) if speakers_dict else ["Adama", "Moussa", "Bourama", "Modibo", "Seydou"]
examples = [
    ["Aw ni ce", "Adama"],
    ["Mali bɛna diya kɔsɛbɛ, ka a da a kan baara bɛ ka kɛ.", "Bakary"],
    ["Ne bɛ se ka sɛbɛnni yɛlɛma ka kɛ kuma ye", "Moussa"],
    ["I ka kɛnɛ wa?", "Ngolo"],
    ["Lakɔli karamɔgɔw tun tɛ ka se ka sɛbɛnni kɛ ka ɲɛ walanda kan wa denmisɛnw tun tɛ ka se ka o sɛbɛnni ninnu ye, kuma tɛ ka u kalan. Denmisɛnw kɛra kunfinw ye.", "Bourama"],
    ["sigikafɔ kɔnɔ jamanaw ni ɲɔgɔn cɛ, olu ye a haminankow ye, wa o ko ninnu ka kan ka kɛ sariya ani tilennenya kɔnɔ.", "Ibrahima"],
    ["Aw ni ce. Ne tɔgɔ ye Adama. Awɔ, ne ye maliden de ye. Aw Sanbɛ Sanbɛ. San min tɛ ɲinan ye, an bɛɛ ka jɛ ka o seli ɲɔgɔn fɛ, hɛɛrɛ ni lafiya la. Ala ka Mali suma. Ala ka Mali yiriwa. Ala ka Mali taa ɲɛ. Ala ka an ka seliw caya. Ala ka yafa an bɛɛ ma.", "Amara"],
    ["An dɔlakelen bɛ masike bilenman don ka tɔw gɛn.", "Modibo"],
    ["Aw ni ce. Seidu bɛ aw fo wa aw ka yafa a ma, ka da a kan tuma dɔw la kow ka can.", "Amadou"],
    ["Bamanankan ye kan ɲuman ye", "Seydou"],
]
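# Each example pairs a Bambara sentence with one of the speakers above; build_interface()
# below turns this list into clickable example buttons.
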
def build_interface():
    """Build the Gradio interface for Bambara TTS"""
    with gr.Blocks(
        title="Bambara TTS - MALIBA-AI",
        theme=gr.themes.Soft(),
        css="""
        .main-header { text-align: center; margin-bottom: 2rem; }
        .status-box { margin-top: 1rem; }
        """
    ) as demo:
        with gr.Row():
            gr.Markdown(f"""
            # 🎤 Bambara Text-to-Speech

            **Powered by MALIBA-AI** | *First Open-Source Bambara TTS*

            Convert Bambara text to natural-sounding speech using our state-of-the-art neural TTS system.

            **Bambara** is spoken by millions of people in Mali and West Africa 🌍

            **Status**: {'✅ Model loaded' if tts_model is not None else '⏳ Model will load on first request'}
            """, elem_classes=["main-header"])
        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="📝 Bambara Text",
                    placeholder="I ni ce... (Type your Bambara text here)",
                    lines=4,
                    max_lines=8,
                    value="I ni ce"
                )

                speaker_dropdown = gr.Dropdown(
                    choices=SPEAKER_NAMES,
                    value=SPEAKER_NAMES[0] if SPEAKER_NAMES else "Bourama",  # Default to most stable speaker
                    label="🗣️ Speaker Voice",
                    info=f"Choose from {len(SPEAKER_NAMES)} authentic voices (Bourama recommended for best quality)"
                )

                generate_btn = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                use_advanced = gr.Checkbox(
                    label="⚙️ Advanced Settings",
                    value=False,
                    info="Customize generation parameters"
                )

                with gr.Group(visible=False) as advanced_group:
                    gr.Markdown("**🔧 Advanced Parameters:**")

                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=0.8,
                        step=0.1,
                        label="Temperature",
                        info="Higher = more varied speech"
                    )

                    top_k = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=50,
                        step=5,
                        label="Top-K",
                        info="Vocabulary selection size"
                    )

                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.9,
                        step=0.05,
                        label="Top-P",
                        info="Nucleus sampling threshold"
                    )

                    max_tokens = gr.Slider(
                        minimum=256,
                        maximum=4096,
                        value=2048,
                        step=256,
                        label="Max Audio Length",
                        info="Maximum audio duration"
                    )
        gr.Markdown("### 🔊 Generated Audio")
        audio_output = gr.Audio(
            label="Generated Speech",
            type="numpy",
            interactive=False,
            show_download_button=True
        )

        status_output = gr.Textbox(
            label="Status",
            interactive=False,
            show_label=False,
            container=False,
            elem_classes=["status-box"]
        )
        with gr.Accordion("📚 Try These Examples", open=True):
            def load_example(text, speaker):
                return text, speaker, False, 0.8, 50, 0.9, 2048

            gr.Markdown("**Click any example below to try it:**")

            with gr.Row():
                for i, (text, speaker) in enumerate(examples[:5]):
                    btn = gr.Button(
                        f"🔹 {text[:25]}{'...' if len(text) > 25 else ''}",
                        size="sm"
                    )
                    btn.click(
                        fn=lambda t=text, s=speaker: load_example(t, s),
                        outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
                    )

            with gr.Row():
                for i, (text, speaker) in enumerate(examples[5:]):
                    btn = gr.Button(
                        f"🔹 {text[:25]}{'...' if len(text) > 25 else ''}",
                        size="sm"
                    )
                    btn.click(
                        fn=lambda t=text, s=speaker: load_example(t, s),
                        outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
                    )
        with gr.Accordion("ℹ️ About", open=False):
            gr.Markdown(f"""
            ## About MALIBA-AI Bambara TTS

            - **🎯 Purpose**: First open-source Text-to-Speech system for the Bambara language
            - **🗣️ Speakers**: {len(SPEAKER_NAMES)} different authentic voices
            - **🔊 Quality**: 16kHz neural speech synthesis
            - **⚡ Performance**: Optimized for real-time generation
            - **📱 Usage**: Educational, accessibility, and cultural preservation

            ### 🎭 Speaker Characteristics:
            - **Bourama**: Most stable and accurate (recommended)
            - **Adama**: Natural conversational tone
            - **Moussa**: Clear pronunciation for educational content
            - **Modibo**: Expressive delivery for storytelling
            - **Seydou**: Balanced characteristics for general use
            - **Amadou**: Warm and friendly voice
            - **Bakary**: Deep, authoritative tone
            - **Ngolo**: Youthful and energetic
            - **Ibrahima**: Calm and measured delivery
            - **Amara**: Melodic and smooth

            **Model Architecture**: Built on state-of-the-art neural TTS with Bambara-specific optimizations

            **License**: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 (CC BY-NC-SA 4.0)

            ---

            **MALIBA-AI Mission**: Ensuring no Malian is left behind by technological advances 🇲🇱
            """)
        # Event handlers
        def toggle_advanced(use_adv):
            return gr.Group(visible=use_adv)

        use_advanced.change(
            fn=toggle_advanced,
            inputs=[use_advanced],
            outputs=[advanced_group]
        )

        # Generate speech on button click
        generate_btn.click(
            fn=generate_speech,
            inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
            outputs=[audio_output, status_output],
            show_progress=True
        )

        # Generate speech on Enter key
        text_input.submit(
            fn=generate_speech,
            inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
            outputs=[audio_output, status_output],
            show_progress=True
        )

    return demo
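# Minimal alternative entry point (a sketch for local experiments, not what the Space runs):
# the gr.Blocks returned by build_interface() can also be launched directly:
#   demo = build_interface()
#   demo.launch()
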
def main():
    """Main function to launch the Gradio interface"""
    logger.info("Starting MALIBA-AI Bambara TTS Gradio interface...")

    # Build interface
    interface = build_interface()

    # Launch interface
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )

    logger.info("Gradio interface launched successfully!")


if __name__ == "__main__":
    main()
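# Run locally with `python app.py`; the app is then served on http://localhost:7860,
# matching the server_name/server_port configured in main().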