Spaces:

Athspi
/

Gggggg

Sleeping

File size: 1,697 Bytes

b0033f2
 
1f5fde3
b0033f2
1f5fde3
4ded59c
1f5fde3
 
b0033f2
1f5fde3
 
 
b0033f2
 
 
 
1f5fde3
 
 
 
a8b416b
1f5fde3
 
 
 
 
 
 
 
b0033f2
1f5fde3
b0033f2
 
 
 
 
 
 
 
 
1f5fde3
 
 
 
b0033f2
1f5fde3
b0033f2
1f5fde3
b0033f2
1f5fde3
b0033f2
 
 
1f5fde3
b0033f2
1f5fde3

import gradio as gr
import torch
from transformers import VitsModel, VitsTokenizer

# Hugging Face Hub identifier of the MMS-TTS checkpoint served by this app
MODEL_NAME = "facebook/mms-tts-tam"

# Pick the compute device first: CUDA when PyTorch can see a GPU, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the tokenizer and the model weights, then place the model on `device`
tokenizer = VitsTokenizer.from_pretrained(MODEL_NAME)
model = VitsModel.from_pretrained(MODEL_NAME).to(device)

def synthesize_speech(text):
    """Synthesize speech for *text* with the module-level VITS model.

    Args:
        text: The text to convert to speech.

    Returns:
        A ``(sample_rate, waveform)`` tuple, the shape expected by a
        ``gr.Audio(type="numpy")`` output: ``sample_rate`` comes from
        ``model.config.sampling_rate`` and ``waveform`` is the squeezed
        model output as a NumPy array.

    Raises:
        gr.Error: If *text* is empty/whitespace-only, or if tokenization or
            inference fails. Gradio shows the message to the user.
    """
    # Validate up front. Returning an ("Error: ...", None) tuple here would be
    # interpreted by the Audio output as (sample_rate, data) and break
    # rendering, so failures are reported by raising gr.Error instead.
    if text is None or not text.strip():
        raise gr.Error("Text input cannot be empty")

    try:
        # Tokenize input text and move the tensors to the model's device
        inputs = tokenizer(text, return_tensors="pt").to(device)

        # Generate speech; gradients are not needed for inference
        with torch.no_grad():
            waveform = model(**inputs).waveform
    except Exception as e:
        # Top-level UI boundary: surface any failure as a readable message
        raise gr.Error(f"Error: {e}") from e

    # Bring the result back to the CPU and drop singleton dimensions
    speech = waveform.cpu().squeeze().numpy()

    # Return sample rate and waveform
    return (model.config.sampling_rate, speech)

# Create Gradio interface.
# The UI text and examples are written for Tamil because MODEL_NAME points at
# the Tamil ("tam") MMS-TTS checkpoint; keep them in sync if the checkpoint
# is ever changed to another language.
interface = gr.Interface(
    fn=synthesize_speech,
    inputs=gr.Textbox(
        label="Input Text",
        placeholder="Enter Tamil text to synthesize...",
        lines=3,
    ),
    outputs=gr.Audio(
        label="Generated Speech",
        type="numpy",
    ),
    title="MMS-TTS Tamil Text-to-Speech",
    description=(
        "Convert Tamil text to speech using Facebook's MMS-TTS model "
        f"({MODEL_NAME})"
    ),
    examples=[
        # "Hello! This is a text-to-speech demonstration."
        ["வணக்கம்! இது ஒரு உரை-பேச்சு மாற்ற செயல்விளக்கம்."],
        # "Tamil is an ancient and beautiful language."
        ["தமிழ் ஒரு பழமையான மற்றும் அழகான மொழி."],
        # "Natural language processing is very interesting!"
        ["இயற்கை மொழி செயலாக்கம் மிகவும் சுவாரஸ்யமானது!"],
    ],
)

# Launch the application
if __name__ == "__main__":
    # Bind to all network interfaces only when a CUDA device is present;
    # otherwise pass None and let Gradio choose its default host.
    # NOTE(review): tying the bind address to GPU availability looks
    # unintentional -- confirm this is the desired deployment behaviour.
    if torch.cuda.is_available():
        host = "0.0.0.0"
    else:
        host = None
    interface.launch(server_name=host)