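"""Gradio Space for MALIBA-AI Bambara text-to-speech.

The TTS model is loaded lazily on the first GPU request and cached in
module-level globals; the interface offers speaker selection and optional
sampling controls.
"""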
import os
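# Disable torch.compile/TorchDynamo and cuDNN benchmarking via environment
# variables before torch is imported.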
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TORCH_COMPILE_DISABLE"] = "1"
os.environ["PYTORCH_DISABLE_CUDNN_BENCHMARK"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
import gradio as gr
import numpy as np
import spaces
import logging
from huggingface_hub import login
import threading
import time
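# Belt-and-suspenders: also disable TorchDynamo through its runtime config API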
torch._dynamo.config.disable = True
torch._dynamo.config.suppress_errors = True
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
hf_token = os.getenv("HF_TOKEN")
if hf_token:
login(token=hf_token)
# Module-level cache so the model and speakers are loaded only once per process
_tts_model = None
_speakers_dict = None
_model_initialized = False
_initialization_in_progress = False
def get_speakers_dict():
"""Get speakers dictionary using the correct import structure"""
try:
# Try new structure first
from maliba_ai.config.settings import Speakers
return {
"Adama": Speakers.Adama,
"Moussa": Speakers.Moussa,
"Bourama": Speakers.Bourama,
"Modibo": Speakers.Modibo,
"Seydou": Speakers.Seydou,
"Amadou": Speakers.Amadou,
"Bakary": Speakers.Bakary,
"Ngolo": Speakers.Ngolo,
"Ibrahima": Speakers.Ibrahima,
"Amara": Speakers.Amara
}
except Exception as e:
logger.error(f"Failed to import from settings: {e}")
        # Fall back to the legacy speaker module (5 speakers)
try:
from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou
return {
"Adama": Adame,
"Moussa": Moussa,
"Bourama": Bourama,
"Modibo": Modibo,
"Seydou": Seydou
}
except Exception as e2:
logger.error(f"Failed to import speakers: {e2}")
return {}
@spaces.GPU()
def initialize_model_once():
"""Initialize model exactly like your old working version"""
global _tts_model, _speakers_dict, _model_initialized, _initialization_in_progress
if _model_initialized:
logger.info("Model already initialized, returning existing instance")
return _tts_model, _speakers_dict
if _initialization_in_progress:
logger.info("Initialization already in progress, waiting...")
for _ in range(50):
time.sleep(0.1)
if _model_initialized:
return _tts_model, _speakers_dict
_initialization_in_progress = True
try:
logger.info("Initializing Bambara TTS model...")
start_time = time.time()
        # Deferred import so the heavy TTS dependency is only loaded when the model is first initialized
from maliba_ai.tts import BambaraTTSInference
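        # Constructing the inference class loads the model weights; this is the slow step being timed below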
model = BambaraTTSInference()
speakers = get_speakers_dict()
if not speakers:
raise ValueError("Failed to load speakers dictionary")
_tts_model = model
_speakers_dict = speakers
_model_initialized = True
elapsed = time.time() - start_time
logger.info(f"Model initialized successfully in {elapsed:.2f} seconds!")
return _tts_model, _speakers_dict
except Exception as e:
logger.error(f"Failed to initialize model: {e}")
        raise
finally:
_initialization_in_progress = False
def validate_inputs(text, temperature, top_k, top_p, max_tokens):
"""Same validation as your old version"""
if not text or not text.strip():
return False, "Please enter some Bambara text."
if not (0.001 <= temperature <= 2.0):
return False, "Temperature must be between 0.001 and 2.0"
if not (1 <= top_k <= 100):
return False, "Top-K must be between 1 and 100"
if not (0.1 <= top_p <= 1.0):
return False, "Top-P must be between 0.1 and 1.0"
return True, ""
@spaces.GPU()
def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
"""Generate speech - exactly like your old working version"""
if not text.strip():
return None, "Please enter some Bambara text."
try:
tts, speakers = initialize_model_once()
if not tts or not speakers:
return None, "❌ Model not properly initialized"
if speaker_name not in speakers:
available_speakers = list(speakers.keys())
return None, f"❌ Speaker '{speaker_name}' not found. Available: {available_speakers}"
speaker = speakers[speaker_name]
logger.info(f"Using speaker: {speaker_name}")
if use_advanced:
is_valid, error_msg = validate_inputs(text, temperature, top_k, top_p, max_tokens)
if not is_valid:
return None, f"❌ {error_msg}"
waveform = tts.generate_speech(
text=text.strip(),
speaker_id=speaker,
temperature=temperature,
top_k=int(top_k),
top_p=top_p,
max_new_audio_tokens=int(max_tokens)
)
else:
waveform = tts.generate_speech(
text=text.strip(),
speaker_id=speaker
)
if waveform is None or waveform.size == 0:
return None, "Failed to generate audio. Please try again."
sample_rate = 16000
        return (sample_rate, waveform), f"✅ Audio generated successfully for speaker {speaker_name}"
except Exception as e:
logger.error(f"Speech generation failed: {e}")
return None, f"❌ Error: {str(e)}"
# Resolve the available speaker names (10 with the new package layout, 5 with the legacy fallback)
def get_speaker_names():
speakers = get_speakers_dict()
if speakers:
return list(speakers.keys())
return ["Adama", "Moussa", "Bourama", "Modibo", "Seydou"]
SPEAKER_NAMES = get_speaker_names()
# Example prompts of varying length, with speakers matched to the content
examples = [
    ["Aw ni ce", "Adama"],  # Natural conversational greeting
    ["Mali bɛna diya kɔsɛbɛ, ka a da a kan baara bɛ ka kɛ.", "Bakary" if "Bakary" in SPEAKER_NAMES else "Moussa"],
    ["Ne bɛ se ka sɛbɛnni yɛlɛma ka kɛ kuma ye", "Moussa"],
    ["I ka kɛnɛ wa?", "Ngolo" if "Ngolo" in SPEAKER_NAMES else "Modibo"],
    ["Lakɔli karamɔgɔw tun tɛ ka se ka sɛbɛnni kɛ ka ɲɛ walanda kan wa denmisɛnw tun tɛ ka se ka o sɛbɛnni ninnu ye, kuma tɛ ka u kalan. Denmisɛnw kɛra kunfinw ye.", "Bourama"],
    ["sigikafɔ kɔnɔ jamanaw ni ɲɔgɔn cɛ, olu ye a haminankow ye, wa o ko ninnu ka kan ka kɛ sariya ani tilennenya kɔnɔ.", "Ibrahima" if "Ibrahima" in SPEAKER_NAMES else "Seydou"],
    ["Aw ni ce. Ne tɔgɔ ye Adama. Awɔ, ne ye maliden de ye. Aw Sanbɛ Sanbɛ. San min tɛ ɲinan ye, an bɛɛ ka jɛ ka o seli ɲɔgɔn fɛ, hɛɛrɛ ni lafiya la. Ala ka Mali suma. Ala ka Mali yiriwa. Ala ka Mali taa ɲɛ. Ala ka an ka seliw caya. Ala ka yafa an bɛɛ ma.", "Amara" if "Amara" in SPEAKER_NAMES else "Moussa"],
    ["An dɔlakelen bɛ masike bilenman don ka tɔw gɛn.", "Modibo"],
    ["Aw ni ce. Seidu bɛ aw fo wa aw ka yafa a ma, ka da a kan tuma dɔw la kow ka can.", "Amadou" if "Amadou" in SPEAKER_NAMES else "Modibo"],
]
def build_interface():
"""Build the Gradio interface - simplified like your old working version"""
with gr.Blocks(title="Bambara TTS - MALIBA-AI") as demo:
gr.Markdown("""
# 🎤 Bambara Text-to-Speech
**Powered by MALIBA-AI**
Convert Bambara text to speech using our state-of-the-art TTS model.
**Bambara** is spoken by millions of people in Mali and West Africa.
""")
with gr.Row():
with gr.Column(scale=2):
text_input = gr.Textbox(
label="πŸ“ Bambara Text",
placeholder="Type your Bambara text here...",
lines=3,
max_lines=10,
value="I ni ce"
)
speaker_dropdown = gr.Dropdown(
choices=SPEAKER_NAMES,
value="Bourama" if "Bourama" in SPEAKER_NAMES else SPEAKER_NAMES[0],
label="πŸ—£οΈ Speaker Voice",
info=f"Choose from {len(SPEAKER_NAMES)} authentic voices"
)
generate_btn = gr.Button("🎡 Generate Speech", variant="primary", size="lg")
with gr.Column(scale=1):
use_advanced = gr.Checkbox(
label="βš™οΈ Use Advanced Settings",
value=False,
info="Enable to customize generation parameters"
)
with gr.Group(visible=False) as advanced_group:
gr.Markdown("**Advanced Parameters:**")
temperature = gr.Slider(
minimum=0.1,
maximum=2.0,
value=0.8,
step=0.1,
label="Temperature",
info="Higher = more varied"
)
top_k = gr.Slider(
minimum=1,
maximum=100,
value=50,
step=5,
label="Top-K"
)
top_p = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.9,
step=0.05,
label="Top-P"
)
max_tokens = gr.Slider(
minimum=256,
maximum=4096,
value=2048,
step=256,
label="Max Length"
)
gr.Markdown("### πŸ”Š Generated Audio")
audio_output = gr.Audio(
label="Generated Speech",
type="numpy",
interactive=False
)
status_output = gr.Textbox(
label="Status",
interactive=False,
show_label=False,
container=False
)
with gr.Accordion("Try These Examples", open=True):
def load_example(text, speaker):
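                # Loading an example resets the advanced settings to their default values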
return text, speaker, False, 0.8, 50, 0.9, 2048
gr.Markdown("**Click any example below:**")
for i, (text, speaker) in enumerate(examples):
btn = gr.Button(f"{text[:30]}{'...' if len(text) > 30 else ''}", size="sm")
btn.click(
fn=lambda t=text, s=speaker: load_example(t, s),
outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
)
with gr.Accordion("About", open=False):
gr.Markdown(f"""
## About MALIBA-AI Bambara TTS
- **🎯 Purpose**: First open-source Text-to-Speech system for the Bambara language
- **🗣️ Speakers**: {len(SPEAKER_NAMES)} different authentic voices
- **🔊 Quality**: 16 kHz neural speech synthesis
- **⚑ Performance**: Model loads once and stays in memory
- **📱 Usage**: Educational, accessibility, and cultural preservation
### 🎭 Available Speakers:
{', '.join(SPEAKER_NAMES)}
**License**: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 (CC BY-NC-SA 4.0)
---
**MALIBA-AI Mission**: Ensuring no Malian is left behind by technological advances 🇲🇱
""")
def toggle_advanced(use_adv):
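            # Returning a Group with the new visibility acts as an update to advanced_group (Gradio 4-style)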
return gr.Group(visible=use_adv)
use_advanced.change(
fn=toggle_advanced,
inputs=[use_advanced],
outputs=[advanced_group]
)
generate_btn.click(
fn=generate_speech,
inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
outputs=[audio_output, status_output],
show_progress=True
)
text_input.submit(
fn=generate_speech,
inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
outputs=[audio_output, status_output],
show_progress=True
)
return demo
def main():
"""Main function to launch the Gradio interface"""
logger.info("Starting Bambara TTS Gradio interface.")
    # Do not preload the model here; it is initialized lazily on the first GPU request
interface = build_interface()
    # launch() blocks here until the server is shut down
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
if __name__ == "__main__":
main()