import os
import tempfile
import torch
import numpy as np
import gradio as gr
import scipy.io.wavfile as wavfile
from pydub import AudioSegment
from transformers import VitsModel, AutoTokenizer
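
# Note: pydub's MP3 export shells out to an ffmpeg (or libav) binary, which
# must be installed and on PATH for the WAV ➔ MP3 conversion below.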
# ---------- Configuration --------------------------------------------------
# Define available TTS models here. Add new entries as needed.
TTS_MODELS = {
    "Ewe": {
        "tokenizer": "FarmerlineML/Ewe-tts-2025_v2",
        "checkpoint": "FarmerlineML/Ewe-tts-2025_v2",
    },
    "Swahili": {
        "tokenizer": "FarmerlineML/swahili-tts-2025",
        "checkpoint": "FarmerlineML/Swahili-tts-2025_part4",
    },
    "Krio": {
        "tokenizer": "FarmerlineML/Krio-TTS",
        "checkpoint": "FarmerlineML/Krio-TTS",
    },
}
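# Note: "tokenizer" and "checkpoint" may point at different repos, as the
# Swahili entry does; the two are loaded independently below.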
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---------- Load all models & tokenizers -----------------------------------
models = {}
tokenizers = {}
for name, paths in TTS_MODELS.items():
    print(f"Loading {name} model...")
    model = VitsModel.from_pretrained(paths["checkpoint"]).to(device)
    model.eval()
    # Clear-speech inference settings (tweak per model if desired):
    # noise_scale controls variation in the generated speech,
    # noise_scale_duration controls variation in the predicted durations,
    # and speaking_rate < 1.0 slows the output down.
    model.noise_scale = 0.8
    model.noise_scale_duration = 0.667
    model.speaking_rate = 0.75
    models[name] = model
    tokenizers[name] = AutoTokenizer.from_pretrained(paths["tokenizer"])

# ---------- Utility: WAV ➔ MP3 Conversion -----------------------------------
def _wav_to_mp3(wave_np: np.ndarray, sr: int) -> str:
    """Convert a numpy waveform to a temporary MP3 file and return its path."""
    # pydub expects int16 samples; clip float output to [-1, 1] before scaling
    # so out-of-range samples don't wrap around.
    if wave_np.dtype != np.int16:
        wave_np = (np.clip(wave_np, -1.0, 1.0) * 32767).astype(np.int16)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf:
        wavfile.write(tf.name, sr, wave_np)
        wav_path = tf.name
    mp3_path = wav_path.replace(".wav", ".mp3")
    AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3", bitrate="64k")
    os.remove(wav_path)
    return mp3_path
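# Hypothetical quick check: one second of silence at 16 kHz becomes a small MP3.
#   _wav_to_mp3(np.zeros(16000, dtype=np.int16), 16000)  # -> "/tmp/tmpXXXX.mp3"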

# ---------- TTS Generation ---------------------------------------------------
def tts_generate(model_name: str, text: str):
    """Generate speech for `text` using the selected model."""
    if not text:
        return None
    model = models[model_name]
    tokenizer = tokenizers[model_name]
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        wave = model(**inputs).waveform[0].cpu().numpy()
    return _wav_to_mp3(wave, model.config.sampling_rate)
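# Hypothetical usage, reusing one of the example prompts below:
#   tts_generate("Krio", "Wetin na yu nem?")  # -> path to a temporary MP3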

# ---------- Gradio Interface ------------------------------------------------
examples = [
    ["Ewe", "kpovitɔwo dometɔ ɖeka lé kaƒomɔ ɖe asi eye wòɖo ŋɔtsitsyɔnu."],
    ["Ewe", "ɖeviawo ƒe gbɔsɔsɔ me anɔ abe enyi. fi si ɖeviwo le la ƒo ɖi. ɖeviawo kɔ nu kake aɖewo ɖe asi ɖewo hā nɔ wonuiwo kplɔm."],
    ["Ewe", "amewo le yɔƒe me eye aɖake le wogbɔ. wodo awu yibɔ ŋutsu aɖe le kponyi fam le akɔ fam ne nyɔnu aɖe."],
    ["Swahili", "zao kusaidia kuondoa umaskini na kujenga kampeni za mwamko wa virusi vya ukimwi amezitembelea"],
    ["Swahili", "Kidole hiki ni tofauti na vidole vingine kwa sababu mwelekeo wake ni wa pekee."],
    ["Swahili", "Tafadhali hakikisha umefunga mlango kabla ya kuondoka."],
    ["Krio", "Wetin na yu nem?"],
    ["Krio", "aw yu de du"],
    ["Krio", "A de go skul"],
]
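# With cache_examples=True, Gradio runs tts_generate on every example at
# startup and caches the audio, so clicking an example plays immediately.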
demo = gr.Interface(
    fn=tts_generate,
    inputs=[
        gr.Dropdown(choices=list(TTS_MODELS.keys()), value="Swahili", label="Choose TTS Model"),
        gr.Textbox(lines=3, placeholder="Enter text here", label="Input Text"),
    ],
    outputs=gr.Audio(type="filepath", label="Audio", autoplay=True),
    title="Multi-Model Text-to-Speech",
    description="Select a TTS model from the dropdown and enter text to generate speech.",
    examples=examples,
    cache_examples=True,
)

if __name__ == "__main__":
    demo.launch()
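# Running `python app.py` starts the Gradio server and prints a local URL
# once all three models have finished loading.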