# Hugging Face Space: Multilingual TTS demo (MMS).
import torch
import torchaudio

import gradio as gr
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    VitsModel,
)
# Maps the UI language label to the matching Meta MMS-TTS checkpoint on the Hub.
LANG_MODEL_MAP = {
    "English": "facebook/mms-tts-eng",
    "Hindi": "facebook/mms-tts-hin",
    "Tamil": "facebook/mms-tts-tam",
    "Malayalam": "facebook/mms-tts-mal",
    "Kannada": "facebook/mms-tts-kan",
}
# Prefer GPU when available; `cache` memoizes (tokenizer, model) pairs per
# checkpoint so each model is downloaded/loaded at most once per process.
device = "cuda" if torch.cuda.is_available() else "cpu"
cache = {}
def load_model_and_processor(language):
    """Return a cached (tokenizer, model) pair for the given UI language.

    Args:
        language: A key of LANG_MODEL_MAP (e.g. "English").

    Returns:
        Tuple of (tokenizer, model), with the model already moved to `device`.

    Raises:
        KeyError: If `language` is not in LANG_MODEL_MAP.
    """
    model_name = LANG_MODEL_MAP[language]
    if model_name not in cache:
        # MMS-TTS checkpoints are VITS text-to-waveform models. Loading them
        # with AutoModelForSpeechSeq2Seq fails (VitsConfig is not in that
        # auto-class mapping), so use AutoTokenizer + VitsModel explicitly.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = VitsModel.from_pretrained(model_name).to(device)
        cache[model_name] = (tokenizer, model)
    return cache[model_name]
def synthesize(language, text):
    """Synthesize speech for `text` in `language` using the matching MMS model.

    Args:
        language: A key of LANG_MODEL_MAP selecting the TTS checkpoint.
        text: The input string to vocalize.

    Returns:
        (sampling_rate, waveform) where waveform is a 1-D float numpy array —
        the (sr, data) tuple format expected by gr.Audio(type="numpy").
    """
    tokenizer, model = load_model_and_processor(language)
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        # VITS is a direct text-to-waveform model: the forward pass returns
        # the audio tensor. It has no `.generate()`, and there are no token
        # ids to `batch_decode` — the original path (decoding ids, then
        # torchaudio.load on the decoded string) could never produce audio.
        waveform = model(**inputs).waveform  # shape: (batch, num_samples)
    sampling_rate = model.config.sampling_rate
    # .cpu() is required before .numpy() when running on CUDA.
    return sampling_rate, waveform.squeeze().cpu().numpy()
# Gradio UI: a language dropdown plus a text box, wired to `synthesize`.
# The Audio output consumes the (sampling_rate, numpy_waveform) tuple.
iface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Dropdown(choices=list(LANG_MODEL_MAP.keys()), label="Select Language"),
        gr.Textbox(label="Enter Text", placeholder="Type something..."),
    ],
    outputs=gr.Audio(label="Synthesized Speech", type="numpy"),
    title="Multilingual TTS - MMS Facebook",
    description="A Gradio demo for multilingual TTS using Meta's MMS models. Supports English, Hindi, Tamil, Malayalam, and Kannada.",
)
if __name__ == "__main__":
    # Launch the Gradio server only when run as a script (not on import).
    iface.launch()