File size: 1,675 Bytes
5e6c5bb
c573494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66d0bf1
 
 
 
c573494
 
66d0bf1
c573494
 
 
66d0bf1
 
c573494
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import gradio as gr
import torch
import torchaudio
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    VitsModel,
)

# Language name offered in the UI -> checkpoint name handed to
# ``from_pretrained`` when that language is requested.
LANG_MODEL_MAP = {
    "English": "facebook/mms-tts-eng",
    "Hindi": "facebook/mms-tts-hin",
    "Tamil": "facebook/mms-tts-tam",
    "Malayalam": "facebook/mms-tts-mal",
    "Kannada": "facebook/mms-tts-kan",
}

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Already-loaded checkpoints keyed by checkpoint name, so each language is
# loaded at most once per process.
cache = {}

def load_model_and_processor(language):
    """Return the cached ``(tokenizer, model)`` pair for *language*.

    The MMS TTS checkpoints listed in ``LANG_MODEL_MAP`` are VITS models, so
    they are loaded with ``VitsModel`` and ``AutoTokenizer``.
    ``AutoModelForSpeechSeq2Seq`` is for speech-to-text models and cannot
    load them.

    Args:
        language: A key of ``LANG_MODEL_MAP``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been moved to ``device``.

    Raises:
        KeyError: If *language* is not a key of ``LANG_MODEL_MAP``.
    """
    model_name = LANG_MODEL_MAP[language]
    if model_name not in cache:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = VitsModel.from_pretrained(model_name).to(device)
        cache[model_name] = (tokenizer, model)
    return cache[model_name]


def synthesize(language, text):
    """Synthesize *text* as speech in *language*.

    Args:
        language: Language name selected in the UI (a key of
            ``LANG_MODEL_MAP``), or ``None`` when nothing is selected.
        text: The text to speak.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, where ``waveform`` is a 1-D
        NumPy array. This is the shape ``gr.Audio(type="numpy")`` accepts.

    Raises:
        gr.Error: If no supported language is selected or *text* is blank,
            so the UI shows a readable message instead of a traceback.
    """
    if language not in LANG_MODEL_MAP:
        raise gr.Error("Please select a language.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    tokenizer, model = load_model_and_processor(language)

    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        # VITS emits the waveform directly from a forward pass; there is no
        # generate()/batch_decode() step and nothing to read back from disk.
        waveform = model(**inputs).waveform

    # Single-item batch -> 1-D array on the CPU; the sample rate comes from
    # the checkpoint's own config.
    return model.config.sampling_rate, waveform[0].cpu().numpy()

# Gradio front-end: a language dropdown and a text box feed ``synthesize``,
# whose result is rendered by an audio component.
iface = gr.Interface(
    title="Multilingual TTS - MMS Facebook",
    description="A Gradio demo for multilingual TTS using Meta's MMS models. Supports English, Hindi, Tamil, Malayalam, and Kannada.",
    fn=synthesize,
    inputs=[
        # Choices are the language names, in LANG_MODEL_MAP order.
        gr.Dropdown(label="Select Language", choices=list(LANG_MODEL_MAP)),
        gr.Textbox(placeholder="Type something...", label="Enter Text"),
    ],
    outputs=gr.Audio(type="numpy", label="Synthesized Speech"),
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()