Spaces:
Sleeping
Sleeping
import os | |
import tempfile | |
import torch | |
import numpy as np | |
import gradio as gr | |
import scipy.io.wavfile as wavfile | |
from pydub import AudioSegment | |
from transformers import VitsModel, AutoTokenizer | |
# ---------- Configuration -------------------------------------------------- | |
# Define available TTS models here. Add new entries as needed. | |
TTS_MODELS = { | |
"Ewe": { | |
"tokenizer": "FarmerlineML/Ewe-tts-2025_v2", | |
"checkpoint": "FarmerlineML/Ewe-tts-2025_v2" | |
}, | |
"Swahili": { | |
"tokenizer": "FarmerlineML/swahili-tts-2025", | |
"checkpoint": "FarmerlineML/Swahili-tts-2025_part4" | |
}, | |
"Krio": { | |
"tokenizer": "FarmerlineML/Krio-TTS", | |
"checkpoint": "FarmerlineML/Krio-TTS" | |
}, | |
} | |
device = "cuda" if torch.cuda.is_available() else "cpu" | |
# ---------- Load all models & tokenizers ----------------------------------- | |
models = {} | |
tokenizers = {} | |
for name, paths in TTS_MODELS.items(): | |
print(f"Loading {name} model...") | |
model = VitsModel.from_pretrained(paths["checkpoint"]).to(device) | |
model.eval() | |
# Apply clear-speech inference parameters (tweak per model if desired) | |
model.noise_scale = 0.8 | |
model.noise_scale_duration = 0.667 | |
model.speaking_rate = 0.75 | |
models[name] = model | |
tokenizers[name] = AutoTokenizer.from_pretrained(paths["tokenizer"]) | |
# ---------- Utility: WAV ➔ MP3 Conversion ----------------------------------- | |
def _wav_to_mp3(wave_np: np.ndarray, sr: int) -> str: | |
"""Convert int16 numpy waveform to an MP3 temp file, return its path.""" | |
# Ensure int16 for pydub | |
if wave_np.dtype != np.int16: | |
wave_np = (wave_np * 32767).astype(np.int16) | |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf: | |
wavfile.write(tf.name, sr, wave_np) | |
wav_path = tf.name | |
mp3_path = wav_path.replace(".wav", ".mp3") | |
AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3", bitrate="64k") | |
os.remove(wav_path) | |
return mp3_path | |
# ---------- TTS Generation --------------------------------------------------- | |
def tts_generate(model_name: str, text: str): | |
"""Generate speech for `text` using the selected model.""" | |
if not text: | |
return None | |
model = models[model_name] | |
tokenizer = tokenizers[model_name] | |
inputs = tokenizer(text, return_tensors="pt").to(device) | |
with torch.no_grad(): | |
wave = model(**inputs).waveform[0].cpu().numpy() | |
return _wav_to_mp3(wave, model.config.sampling_rate) | |
# ---------- Gradio Interface ------------------------------------------------ | |
examples = [ | |
["Ewe", "kpovitɔwo dometɔ ɖeka lé kaƒomɔ ɖe asi eye wòɖo ŋɔtsitsyɔnu."], | |
["Ewe", "ɖeviawo ƒe gbɔsɔsɔ me anɔ abe enyi. fi si ɖeviwo le la ƒo ɖi. ɖeviawo kɔ nu kake aɖewo ɖe asi ɖewo hā nɔ wonuiwo kplɔm."], | |
["Ewe", "amewo le yɔƒe me eye aɖake le wogbɔ. wodo awu yibɔ ŋutsu aɖe le kponyi fam le akɔ fam ne nyɔnu aɖe."], | |
["Swahili", "zao kusaidia kuondoa umaskini na kujenga kampeni za mwamko wa virusi vya ukimwi amezitembelea"], | |
["Swahili", "Kidole hiki ni tofauti na vidole vingine kwa sababu mwelekeo wake ni wa pekee."], | |
["Swahili", "Tafadhali hakikisha umefunga mlango kabla ya kuondoka."], | |
["Krio", "Wetin na yu nem?"], | |
["Krio", "aw yu de du"], | |
["Krio", "A de go skul"], | |
] | |
demo = gr.Interface( | |
fn=tts_generate, | |
inputs=[ | |
gr.Dropdown(choices=list(TTS_MODELS.keys()), value="Swahili", label="Choose TTS Model"), | |
gr.Textbox(lines=3, placeholder="Enter text here", label="Input Text") | |
], | |
outputs=gr.Audio(type="filepath", label="Audio", autoplay=True), | |
title="Multi‐Model Text-to-Speech", | |
description=( | |
"Select a TTS model from the dropdown and enter text to generate speech." | |
), | |
examples=examples, | |
cache_examples=True, | |
) | |
if __name__ == "__main__": | |
demo.launch() | |