import gradio as gr
import torch
from transformers import VitsModel, VitsTokenizer

# Load the MMS-TTS model and tokenizer from Hugging Face.
# NOTE: the "tam" suffix selects the Tamil checkpoint; the UI text and
# examples below are kept in sync with this choice.
MODEL_NAME = "facebook/mms-tts-tam"
tokenizer = VitsTokenizer.from_pretrained(MODEL_NAME)
model = VitsModel.from_pretrained(MODEL_NAME)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def synthesize_speech(text: str) -> tuple:
    """Synthesize speech for *text* with the loaded MMS-TTS model.

    Args:
        text: The text to convert to speech.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``sample_rate`` comes
        from ``model.config.sampling_rate`` and ``waveform`` is the model
        output moved to the CPU, squeezed, and converted to a NumPy array.

    Raises:
        gr.Error: If *text* is empty/whitespace-only, or if tokenization
            or inference fails. Gradio displays the message in the UI.
    """
    # Validate up front so the user gets a precise message instead of a
    # generic inference failure.
    if not text or not text.strip():
        raise gr.Error("Text input cannot be empty")

    try:
        # Tokenize the input and move the tensors to the model's device.
        inputs = tokenizer(text, return_tensors="pt").to(device)

        # Inference only: gradients are not needed.
        with torch.no_grad():
            waveform = model(**inputs).waveform
        speech = waveform.cpu().squeeze().numpy()
    except Exception as exc:
        # The output component is gr.Audio, so an error string cannot be
        # returned as a value; raise gr.Error to surface it to the user.
        raise gr.Error(f"Error: {exc}") from exc

    return model.config.sampling_rate, speech


# Create the Gradio interface.
interface = gr.Interface(
    fn=synthesize_speech,
    inputs=gr.Textbox(
        label="Input Text",
        placeholder="Enter text to synthesize...",
        lines=3,
    ),
    outputs=gr.Audio(
        label="Generated Speech",
        type="numpy",
    ),
    title="MMS-TTS Tamil Text-to-Speech",
    description=f"Convert Tamil text to speech using the {MODEL_NAME} model",
    examples=[
        ["வணக்கம்! இது ஒரு உரை ஒலியாக்க செயல்விளக்கம்."],
        ["தமிழ் ஒரு அழகான மொழி."],
        ["இன்று வானிலை மிகவும் நன்றாக உள்ளது."],
    ],
)

# Launch the application.
if __name__ == "__main__":
    # Binds to all network interfaces when CUDA is available; otherwise
    # passes None so Gradio uses its default server name.
    interface.launch(
        server_name="0.0.0.0" if torch.cuda.is_available() else None
    )