Spaces:

leenag
/

Multilingual_TTS

Running

App Files Community

leenag committed on May 7

Commit

9ce846a

verified ·

1 Parent(s): 99f9cdf

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -20

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import gradio as gr
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-from transformers import VitsModel, AutoTokenizer
 import torch
-import torchaudio
 LANG_MODEL_MAP = {
     "English": "facebook/mms-tts-eng",
@@ -15,35 +15,35 @@ LANG_MODEL_MAP = {
 device = "cuda" if torch.cuda.is_available() else "cpu"
 cache = {}
-def load_model_and_processor(language):
     model_name = LANG_MODEL_MAP[language]
     if model_name not in cache:
-        processor = AutoProcessor.from_pretrained(model_name)
-        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name).to(device)
-        cache[model_name] = (processor, model)
     return cache[model_name]
-def synthesize(language, text):
-    processor, model = load_model_and_processor(language)
-    inputs = processor(text=text, return_tensors="pt").to(device)
     with torch.no_grad():
-        generated_ids = model.generate(**inputs)
-    audio = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    # Decode and return waveform
-    waveform, sr = torchaudio.load(audio)
-    return sr, waveform.squeeze().numpy()
 iface = gr.Interface(
-    fn=synthesize,
     inputs=[
         gr.Dropdown(choices=list(LANG_MODEL_MAP.keys()), label="Select Language"),
         gr.Textbox(label="Enter Text", placeholder="Type something...")
     ],
-    outputs=gr.Audio(label="Synthesized Speech", type="numpy"),
-    title="Multilingual TTS - MMS Facebook",
-    description="A Gradio demo for multilingual TTS using Meta's MMS models. Supports English, Hindi, Tamil, Malayalam, and Kannada."
 )
 if __name__ == "__main__":

 import gradio as gr
 import torch
+from transformers import VitsModel, AutoTokenizer
+import soundfile as sf
+import tempfile
 LANG_MODEL_MAP = {
     "English": "facebook/mms-tts-eng",
 device = "cuda" if torch.cuda.is_available() else "cpu"
 cache = {}
+def load_model_and_tokenizer(language):
     model_name = LANG_MODEL_MAP[language]
     if model_name not in cache:
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = VitsModel.from_pretrained(model_name).to(device)
+        cache[model_name] = (tokenizer, model)
     return cache[model_name]
+def tts(language, text):
+    tokenizer, model = load_model_and_tokenizer(language)
+    inputs = tokenizer(text, return_tensors="pt").to(device)
     with torch.no_grad():
+        output = model(**inputs)
+    # Save waveform to temp file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+        sf.write(f.name, output.waveform.cpu().numpy(), samplerate=16000)
+        return f.name
 iface = gr.Interface(
+    fn=tts,
     inputs=[
         gr.Dropdown(choices=list(LANG_MODEL_MAP.keys()), label="Select Language"),
         gr.Textbox(label="Enter Text", placeholder="Type something...")
     ],
+    outputs=gr.Audio(type="filepath", label="Synthesized Audio"),
+    title="Multilingual Text-to-Speech (MMS)",
+    description="Generate speech in English, Hindi, Tamil, Malayalam, or Kannada using Meta's MMS TTS models."
 )
 if __name__ == "__main__":