Athspi committed on
Commit
a8b416b
·
verified ·
1 Parent(s): 8da735f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -28
app.py CHANGED
@@ -2,40 +2,43 @@ import gradio as gr
2
  import torch
3
  import torchaudio
4
 
5
- # Load MMS-TTS components
6
- bundle = torchaudio.pipelines.MMS_TTS_ENG
7
- text_processor = bundle.get_text_processor()
8
- tacotron2 = bundle.get_tacotron2()
9
- waveglow = bundle.get_waveglow()
 
 
 
 
 
 
10
 
11
- # Set up device
12
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13
- tacotron2 = tacotron2.to(device)
14
- waveglow = waveglow.to(device)
15
 
16
  def synthesize_speech(text):
17
  try:
18
  if not text.strip():
19
- raise ValueError("Text input cannot be empty")
20
-
21
  with torch.inference_mode():
22
- # Process text input
23
  processed, lengths = text_processor(text)
24
  processed = processed.to(device)
25
  lengths = lengths.to(device)
26
 
27
- # Generate spectrogram
28
- spec, spec_lengths = tacotron2(processed, lengths)
29
 
30
  # Generate waveform
31
- waveform, lengths = waveglow(spec, spec_lengths)
32
 
33
- # Convert to numpy array for Gradio
34
  waveform = waveform.cpu().squeeze().numpy()
35
- return (bundle.sample_rate, waveform)
36
-
37
  except Exception as e:
38
- return f"Error: {str(e)}", None
39
 
40
  # Create Gradio interface
41
  interface = gr.Interface(
@@ -45,19 +48,18 @@ interface = gr.Interface(
45
  placeholder="Enter text to synthesize...",
46
  lines=3
47
  ),
48
- outputs=gr.Audio(
49
- label="Generated Speech",
50
- type="numpy"
51
- ),
52
  title="MMS-TTS English Text-to-Speech",
53
- description="Convert text to speech using Facebook's MMS-TTS-ENG model",
54
  examples=[
55
- ["Hello! This is a text-to-speech demonstration."],
56
  ["The quick brown fox jumps over the lazy dog."],
57
- ["Natural language processing is fascinating!"]
58
  ]
59
  )
60
 
61
- # Launch the application
62
  if __name__ == "__main__":
63
- interface.launch(server_name="0.0.0.0" if torch.cuda.is_available() else None)
 
2
  import torch
3
  import torchaudio
4
 
5
+ # Initialize MMS-TTS pipeline
6
def load_models():
    """Create the text-to-speech components and move the models to a device.

    The device is CUDA when ``torch.cuda.is_available()`` reports a GPU and
    CPU otherwise. The text processor stays where the bundle created it; the
    Tacotron2 model and the vocoder are moved onto the chosen device.

    Returns:
        A 4-tuple ``(text_processor, tacotron2, vocoder, device)``.
    """
    if torch.cuda.is_available():
        target_device = torch.device("cuda")
    else:
        target_device = torch.device("cpu")

    # NOTE(review): the public torchaudio.pipelines reference lists Tacotron2
    # TTS bundles and MMS_FA but no ``MMS_TTS`` entry -- confirm this lookup
    # works with the installed torchaudio version.
    tts_bundle = torchaudio.pipelines.MMS_TTS.get_bundle("eng")

    processor = tts_bundle.get_text_processor()

    acoustic_model = tts_bundle.get_tacotron2()
    acoustic_model = acoustic_model.to(target_device)

    wave_generator = tts_bundle.get_vocoder()
    wave_generator = wave_generator.to(target_device)

    # The bundle itself is intentionally not part of the result; callers only
    # receive the four objects below.
    return processor, acoustic_model, wave_generator, target_device
16
 
17
+ text_processor, tacotron2, vocoder, device = load_models()
 
 
 
18
 
def synthesize_speech(text):
    """Convert ``text`` to speech and return it in Gradio's output format.

    Uses the module-level ``text_processor``, ``tacotron2``, ``vocoder`` and
    ``device`` objects. The calls below assume those objects follow
    torchaudio's Tacotron2TTSBundle interface (``Tacotron2.infer`` and a
    vocoder that returns ``(waveforms, lengths)`` and exposes
    ``sample_rate``) -- TODO confirm against the bundle actually loaded.

    Args:
        text: Text to synthesize. ``None``, empty and whitespace-only values
            are rejected before any model is run.

    Returns:
        A 2-tuple ``(audio, message)`` matching the two interface outputs.
        On success ``audio`` is ``(sample_rate, waveform)`` with ``waveform``
        a NumPy array and ``message`` is ``None``. On failure ``audio`` is
        ``None`` and ``message`` is a human-readable explanation.
    """
    try:
        # Treat a missing value like empty input instead of letting
        # ``None.strip()`` surface as a confusing AttributeError message.
        if text is None or not text.strip():
            return None, "Please enter some text to synthesize"

        with torch.inference_mode():
            # Process text
            processed, lengths = text_processor(text)
            processed = processed.to(device)
            lengths = lengths.to(device)

            # Generate mel spectrogram. Calling the model directly runs the
            # training ``forward`` pass, which also expects target
            # spectrograms; ``infer`` is the generation entry point.
            mel_spec, mel_lengths, _ = tacotron2.infer(processed, lengths)

            # Generate waveform. The vocoder returns (waveforms, lengths);
            # only the audio samples are needed here.
            waveform, _ = vocoder(mel_spec, mel_lengths)

        # Convert to numpy array
        waveform = waveform.cpu().squeeze().numpy()

        # The bundle object is local to ``load_models`` and not visible here,
        # so the sample rate is read from the vocoder instead.
        return (vocoder.sample_rate, waveform), None

    except Exception as e:
        # UI boundary: report the failure as a message rather than crashing
        # the request handler.
        return None, f"Error: {e}"
42
 
43
  # Create Gradio interface
44
  interface = gr.Interface(
 
48
  placeholder="Enter text to synthesize...",
49
  lines=3
50
  ),
51
+ outputs=[
52
+ gr.Audio(label="Generated Speech"),
53
+ gr.Textbox(label="Error Message", visible=False)
54
+ ],
55
  title="MMS-TTS English Text-to-Speech",
56
+ description="Convert text to speech using Facebook's MMS-TTS model",
57
  examples=[
58
+ ["Hello! This is a working text-to-speech demonstration."],
59
  ["The quick brown fox jumps over the lazy dog."],
60
+ ["Natural language processing is truly fascinating!"]
61
  ]
62
  )
63
 
 
64
  if __name__ == "__main__":
65
+ interface.launch()