"""Multilingual text-to-speech Gradio app built on Meta's MMS VITS checkpoints."""

import tempfile

import gradio as gr
import soundfile as sf
import torch
from transformers import AutoTokenizer, VitsModel

# Display name -> Hugging Face MMS-TTS checkpoint id.
LANG_MODEL_MAP = {
    "English": "facebook/mms-tts-eng",
    "Hindi": "facebook/mms-tts-hin",
    "Tamil": "facebook/mms-tts-tam",
    "Malayalam": "facebook/mms-tts-mal",
    "Kannada": "facebook/mms-tts-kan",
}

device = "cuda" if torch.cuda.is_available() else "cpu"

# checkpoint id -> (tokenizer, model); each checkpoint is loaded at most once.
cache = {}


def load_model_and_tokenizer(language):
    """Return the cached ``(tokenizer, model)`` pair for *language*.

    Loads the checkpoint on first use and moves the model to *device*.
    Raises ``KeyError`` if *language* is not in ``LANG_MODEL_MAP``.
    """
    model_name = LANG_MODEL_MAP[language]
    if model_name not in cache:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = VitsModel.from_pretrained(model_name).to(device)
        model.eval()  # inference only; disables dropout etc.
        cache[model_name] = (tokenizer, model)
    return cache[model_name]


def tts(language, text):
    """Synthesize *text* in *language* and return the path to a mono WAV file."""
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    tokenizer, model = load_model_and_tokenizer(language)
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model(**inputs)

    # VitsModel returns waveform with shape (batch, samples). soundfile would
    # treat a (1, N) array as 1 frame x N channels, so take the first item to
    # get a proper 1-D mono signal.
    waveform = output.waveform[0].cpu().numpy()

    # Use the checkpoint's own sampling rate instead of hard-coding 16 kHz.
    sampling_rate = model.config.sampling_rate

    # Close the temp-file handle before writing: on Windows an open
    # NamedTemporaryFile cannot be reopened by soundfile.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        out_path = f.name
    sf.write(out_path, waveform, samplerate=sampling_rate)
    return out_path


iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Dropdown(choices=list(LANG_MODEL_MAP.keys()), label="Select Language"),
        gr.Textbox(label="Enter Text", placeholder="Type something..."),
    ],
    outputs=gr.Audio(type="filepath", label="Synthesized Audio"),
    title="Multilingual Text-to-Speech (MMS)",
    description="Generate speech in English, Hindi, Tamil, Malayalam, or Kannada using Meta's MMS TTS models.",
)

if __name__ == "__main__":
    iface.launch()