Spaces:
Running
Running
| import gradio as gr | |
| import torch | |
| from transformers import VitsModel, AutoTokenizer | |
| import soundfile as sf | |
| import tempfile | |
# Language name offered in the UI -> Hugging Face checkpoint id of the
# matching MMS text-to-speech model.
LANG_MODEL_MAP = {
    "English": "facebook/mms-tts-eng",
    "Hindi": "facebook/mms-tts-hin",
    "Tamil": "facebook/mms-tts-tam",
    "Malayalam": "facebook/mms-tts-mal",
    "Kannada": "facebook/mms-tts-kan",
}

# Run on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Already-loaded (tokenizer, model) pairs, keyed by checkpoint id.
cache = {}
def load_model_and_tokenizer(language):
    """Return the ``(tokenizer, model)`` pair for ``language``.

    The pair is loaded the first time a checkpoint is requested and is then
    served from the module-level ``cache`` on every later call.

    Args:
        language: A key of ``LANG_MODEL_MAP``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been moved to ``device``.

    Raises:
        KeyError: If ``language`` is not a key of ``LANG_MODEL_MAP``.
    """
    checkpoint = LANG_MODEL_MAP[language]
    pair = cache.get(checkpoint)
    if pair is None:
        # First request for this checkpoint: load both parts, then remember
        # them so the next call skips the load entirely.
        pair = (
            AutoTokenizer.from_pretrained(checkpoint),
            VitsModel.from_pretrained(checkpoint).to(device),
        )
        cache[checkpoint] = pair
    return pair
def tts(language, text):
    """Synthesize ``text`` as speech with the MMS model chosen by ``language``.

    Args:
        language: A key of ``LANG_MODEL_MAP`` (the language dropdown value).
        text: The text to speak.

    Returns:
        Path of a ``.wav`` file containing the generated audio. The file is
        created with ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
        KeyError: If ``language`` is not a key of ``LANG_MODEL_MAP``.
    """
    if not text or not text.strip():
        # Nothing to synthesize: show a readable message in the UI instead of
        # passing an empty prompt on to the tokenizer and model.
        raise gr.Error("Please enter some text to synthesize.")

    tokenizer, model = load_model_and_tokenizer(language)
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model(**inputs)

    # The model returns the waveform as (batch, samples), while soundfile
    # reads a 2-D array as (frames, channels). Take the single batch item so
    # a 1-D mono signal is written rather than one frame of many channels.
    waveform = output.waveform[0].cpu().numpy()

    # Reserve a unique path, then close the handle before writing to it by
    # name so the file is never open twice at once.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        wav_path = f.name

    # Use the rate the checkpoint was trained at rather than a fixed number.
    sf.write(wav_path, waveform, samplerate=model.config.sampling_rate)
    return wav_path
# Input widgets, listed in the same order as the parameters of tts().
_language_input = gr.Dropdown(
    choices=list(LANG_MODEL_MAP),
    label="Select Language",
)
_text_input = gr.Textbox(
    label="Enter Text",
    placeholder="Type something...",
)

# tts() returns a file path, so the audio component is configured to take one.
_audio_output = gr.Audio(type="filepath", label="Synthesized Audio")

iface = gr.Interface(
    fn=tts,
    inputs=[_language_input, _text_input],
    outputs=_audio_output,
    title="Multilingual Text-to-Speech (MMS)",
    description="Generate speech in English, Hindi, Tamil, Malayalam, or Kannada using Meta's MMS TTS models.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()