Athspi committed on
Commit 1f5fde3 · verified · 1 Parent(s): a8b416b

Update app.py

Files changed (1)
  1. app.py +30 -39
app.py CHANGED
@@ -1,44 +1,34 @@
 import gradio as gr
 import torch
-import torchaudio
+from transformers import VitsModel, VitsTokenizer
 
-# Initialize MMS-TTS pipeline
-def load_models():
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    bundle = torchaudio.pipelines.MMS_TTS.get_bundle("eng")
-
-    # Load components
-    text_processor = bundle.get_text_processor()
-    tacotron2 = bundle.get_tacotron2().to(device)
-    vocoder = bundle.get_vocoder().to(device)
-
-    return text_processor, tacotron2, vocoder, device
+# Load the MMS-TTS model and tokenizer from Hugging Face
+MODEL_NAME = "facebook/mms-tts-eng"
+tokenizer = VitsTokenizer.from_pretrained(MODEL_NAME)
+model = VitsModel.from_pretrained(MODEL_NAME)
 
-text_processor, tacotron2, vocoder, device = load_models()
+# Set up device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
 
 def synthesize_speech(text):
     try:
         if not text.strip():
-            return None, "Please enter some text to synthesize"
-
-        with torch.inference_mode():
-            # Process text
-            processed, lengths = text_processor(text)
-            processed = processed.to(device)
-            lengths = lengths.to(device)
-
-            # Generate mel spectrogram
-            mel_spec, mel_lengths = tacotron2(processed, lengths)
-
-            # Generate waveform
-            waveform = vocoder(mel_spec)
-
-            # Convert to numpy array
-            waveform = waveform.cpu().squeeze().numpy()
-            return (bundle.sample_rate, waveform), None
+            raise ValueError("Text input cannot be empty")
+
+        # Tokenize input text
+        inputs = tokenizer(text, return_tensors="pt").to(device)
 
+        # Generate speech
+        with torch.no_grad():
+            speech = model(**inputs).waveform.cpu().squeeze().numpy()
+
+        # Return sample rate and waveform
+        sample_rate = model.config.sampling_rate
+        return (sample_rate, speech)
+
     except Exception as e:
-        return None, f"Error: {str(e)}"
+        return f"Error: {str(e)}", None
 
 # Create Gradio interface
 interface = gr.Interface(
@@ -48,18 +38,19 @@ interface = gr.Interface(
         placeholder="Enter text to synthesize...",
         lines=3
     ),
-    outputs=[
-        gr.Audio(label="Generated Speech"),
-        gr.Textbox(label="Error Message", visible=False)
-    ],
+    outputs=gr.Audio(
+        label="Generated Speech",
+        type="numpy"
+    ),
     title="MMS-TTS English Text-to-Speech",
-    description="Convert text to speech using Facebook's MMS-TTS model",
+    description="Convert text to speech using Facebook's MMS-TTS-ENG model",
     examples=[
-        ["Hello! This is a working text-to-speech demonstration."],
+        ["Hello! This is a text-to-speech demonstration."],
         ["The quick brown fox jumps over the lazy dog."],
-        ["Natural language processing is truly fascinating!"]
+        ["Natural language processing is fascinating!"]
     ]
 )
 
+# Launch the application
 if __name__ == "__main__":
-    interface.launch()
+    interface.launch(server_name="0.0.0.0" if torch.cuda.is_available() else None)
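
A quick way to sanity-check the new transformers-based inference path outside of Gradio is to call the model directly and write the waveform to a file. This is a minimal sketch, not part of the commit: it assumes torch, transformers, and scipy are installed, that the facebook/mms-tts-eng checkpoint can be downloaded, and the output filename out.wav is arbitrary.

    # Standalone check of the VITS-based MMS-TTS path used in the updated app.py
    import torch
    import scipy.io.wavfile
    from transformers import VitsModel, VitsTokenizer

    tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
    model = VitsModel.from_pretrained("facebook/mms-tts-eng")

    text = "The quick brown fox jumps over the lazy dog."
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        waveform = model(**inputs).waveform  # shape: (batch, num_samples)

    # The model reports its output rate in model.config.sampling_rate,
    # the same value synthesize_speech() returns alongside the waveform.
    scipy.io.wavfile.write("out.wav", rate=model.config.sampling_rate,
                           data=waveform.squeeze().cpu().numpy())

The (sample_rate, numpy_array) tuple returned by synthesize_speech is the format gr.Audio(type="numpy") expects, which is why the outputs list with the hidden error Textbox was collapsed to a single Audio component in this commit.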