Spaces:

leenag
/

Multilingual_TTS

Running

App Files Files Community

leenag committed on May 7

Commit

c573494

verified ·

1 Parent(s): 6716bc1

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -87

app.py CHANGED Viewed

@@ -1,95 +1,49 @@
-import torch
-import soundfile as sf
-import uuid
 import gradio as gr
-import numpy as np
-import re
-from parler_tts import ParlerTTSForConditionalGeneration
-from transformers import AutoTokenizer
-# Load model and tokenizers
-model_name = "ai4bharat/indic-parler-tts"
-device = "cpu"
-print("Loading model...")
-model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device).eval()
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-desc_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
-print("Applying dynamic quantization...")
-quantized_model = torch.quantization.quantize_dynamic(
-    model,
-    {torch.nn.Linear},
-    dtype=torch.qint8
-)
-# Sentence splitter
-def split_text(text, max_len=150):
-    chunks = re.split(r'(?<=[.!?]) +', text)
-    refined = []
-    for chunk in chunks:
-        if len(chunk) <= max_len:
-            refined.append(chunk)
-        else:
-            words = chunk.split()
-            temp = []
-            buf_len = 0
-            for word in words:
-                temp.append(word)
-                buf_len += len(word) + 1
-                if buf_len > max_len:
-                    refined.append(' '.join(temp))
-                    temp = []
-                    buf_len = 0
-            if temp:
-                refined.append(' '.join(temp))
-    return refined
-# Core TTS function
-def synthesize(language, text, gender, emotion, speed):
-    description = (
-        f"A native {language.lower()} female speaker with an expressive tone."
-    )
-    audio_chunks = []
-    text_chunks = split_text(text)
-    for chunk in text_chunks:
-        # New tokenization for each chunk
-        desc_input = desc_tokenizer(description, return_tensors="pt").to(device)
-        prompt_input = tokenizer(chunk, return_tensors="pt").to(device)
-        with torch.no_grad():
-            output = quantized_model.generate(
-                input_ids=desc_input.input_ids,
-                attention_mask=desc_input.attention_mask,
-                prompt_input_ids=prompt_input.input_ids,
-                prompt_attention_mask=torch.ones_like(prompt_input.input_ids).to(device)
-            )
-        audio = output.cpu().numpy().squeeze()
-        audio_chunks.append(audio)
-    full_audio = np.concatenate(audio_chunks)
-    filename = f"{uuid.uuid4().hex}.wav"
-    sf.write(filename, full_audio, quantized_model.config.sampling_rate)
-    return filename
-# Gradio UI
 iface = gr.Interface(
     fn=synthesize,
     inputs=[
-        gr.Dropdown(["Malayalam", "Hindi", "Tamil", "English", "Kannada"], label="Language"),
-        gr.Textbox(label="Text to Synthesize", lines=6, placeholder="Enter your sentence here..."),
-        # gr.Radio(["Male", "Female"], label="Speaker Gender"),
-        # gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Emotion"),
-        # gr.Dropdown(["Slow", "Moderate", "Fast"], label="Speaking Rate"),
-        #gr.Dropdown(["Low", "Normal", "High"], label="Pitch"),
-        #gr.Dropdown(["Basic", "Refined"], label="Voice Quality"),
     ],
-    outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
-    title="Multilingual Indic TTS (Quantized + Chunked)",
-    description="CPU-based TTS with quantized Parler-TTS and chunked input for Malayalam, Hindi, Tamil, and English.",
 )
-iface.launch()

 import gradio as gr
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+import torch
+import torchaudio
+# Maps the language name shown in the UI dropdown to the Hugging Face Hub
+# model id that is loaded for that language.
+LANG_MODEL_MAP = {
+    "English": "facebook/mms-tts-eng",
+    "Hindi": "facebook/mms-tts-hin",
+    "Tamil": "facebook/mms-tts-tam",
+    "Malayalam": "facebook/mms-tts-mal",
+    "Kannada": "facebook/mms-tts-kan"
+}
+# Run on the GPU when one is available, otherwise fall back to the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Loaded (text processor, model) pairs keyed by model id, so each checkpoint
+# is only loaded once per process.
+cache = {}
+def load_model_and_processor(language):
+    model_name = LANG_MODEL_MAP[language]
+    if model_name not in cache:
+        processor = AutoProcessor.from_pretrained(model_name)
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name).to(device)
+        cache[model_name] = (processor, model)
+    return cache[model_name]
+def synthesize(language, text):
+    processor, model = load_model_and_processor(language)
+    inputs = processor(text=text, return_tensors="pt").to(device)
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs)
+    audio = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    # Decode and return waveform
+    waveform, sr = torchaudio.load(audio)
+    return sr, waveform.squeeze().numpy()
 iface = gr.Interface(
     fn=synthesize,
     inputs=[
+        # Order matters: the components are passed positionally to
+        # synthesize(language, text).
+        gr.Dropdown(choices=list(LANG_MODEL_MAP.keys()), label="Select Language"),
+        gr.Textbox(label="Enter Text", placeholder="Type something...")
     ],
+    # type="numpy" matches the (sample_rate, waveform) tuple that synthesize returns.
+    outputs=gr.Audio(label="Synthesized Speech", type="numpy"),
+    title="Multilingual TTS - MMS Facebook",
+    description="A Gradio demo for multilingual TTS using Meta's MMS models. Supports English, Hindi, Tamil, Malayalam, and Kannada."
 )
+# Only start the web server when this file is executed as a script, so that
+# importing the module does not launch the app.
+if __name__ == "__main__":
+    iface.launch()