"""Gradio demo that converts text to speech with Facebook's MMS-TTS model."""

import gradio as gr
import torch
from transformers import VitsModel, VitsTokenizer

# Load the English MMS-TTS checkpoint and its tokenizer
MODEL_NAME = "facebook/mms-tts-eng"
tokenizer = VitsTokenizer.from_pretrained(MODEL_NAME)
model = VitsModel.from_pretrained(MODEL_NAME)

# Run on GPU when available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def synthesize_speech(text):
    """Convert input text to speech and return (sample_rate, waveform) for Gradio."""
    try:
        if not text.strip():
            raise ValueError("Text input cannot be empty")

        # Tokenize the text and move the tensors to the model's device
        inputs = tokenizer(text, return_tensors="pt").to(device)

        # Generate the waveform without tracking gradients
        with torch.no_grad():
            speech = model(**inputs).waveform.cpu().squeeze().numpy()

        sample_rate = model.config.sampling_rate
        return sample_rate, speech

    except Exception as e:
        # Surface the failure in the Gradio UI; returning an extra value would
        # not match the interface's single Audio output
        raise gr.Error(f"Error: {str(e)}")


interface = gr.Interface(
    fn=synthesize_speech,
    inputs=gr.Textbox(
        label="Input Text",
        placeholder="Enter text to synthesize...",
        lines=3,
    ),
    outputs=gr.Audio(
        label="Generated Speech",
        type="numpy",
    ),
    title="MMS-TTS English Text-to-Speech",
    description="Convert text to speech using Facebook's MMS-TTS-ENG model",
    examples=[
        ["Hello! This is a text-to-speech demonstration."],
        ["The quick brown fox jumps over the lazy dog."],
        ["Natural language processing is fascinating!"],
    ],
)


if __name__ == "__main__":
    # Expose the demo on all network interfaces; use launch() for localhost only
    interface.launch(server_name="0.0.0.0")