shukdevdatta123 committed on
Commit
0294388
·
verified ·
1 Parent(s): 4cc61f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -53
app.py CHANGED
@@ -1,57 +1,68 @@
1
  import gradio as gr
2
- from TTS.api import TTS
3
- import numpy as np
4
- from scipy.io import wavfile
5
  import tempfile
6
- import os
7
-
8
- # Load the YourTTS model once at startup
9
- tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)
10
- sample_rate = tts.synthesizer.output_sample_rate
11
-
12
- def generate_speech(reference_audio, text):
13
- """
14
- Generate speech audio mimicking the voice from the reference audio.
15
-
16
- Parameters:
17
- reference_audio (str): Filepath to the uploaded voice sample.
18
- text (str): Text to convert to speech.
19
-
20
- Returns:
21
- str: Path to the generated audio file
22
- """
23
- # Generate speech using the reference audio and text
24
- wav = tts.tts(text=text, speaker_wav=reference_audio, language="en")
25
- # Convert list to numpy array
26
- wav_np = np.array(wav, dtype=np.float32)
27
-
28
- # Create a temporary file to save the audio
29
- temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
30
- temp_file_path = temp_file.name
31
- # Save the audio to the temporary file
32
- wavfile.write(temp_file_path, sample_rate, wav_np)
33
- temp_file.close()
34
-
35
- return temp_file_path
36
-
37
- # Build the Gradio interface
38
- with gr.Blocks(title="Voice Cloning TTS") as app:
39
- gr.Markdown("## Voice Cloning Text-to-Speech")
40
- gr.Markdown("Upload a short voice sample in English, then enter text to hear it in your voice!")
41
-
42
- with gr.Row():
43
- audio_input = gr.Audio(type="filepath", label="Upload Your Voice Sample (English)")
44
- text_input = gr.Textbox(label="Enter Text to Convert to Speech", placeholder="e.g., I love chocolate")
45
-
46
- generate_btn = gr.Button("Generate Speech")
47
- audio_output = gr.Audio(label="Generated Speech", interactive=False)
48
-
49
- # Connect the button to the generation function
50
- generate_btn.click(
51
- fn=generate_speech,
52
- inputs=[audio_input, text_input],
53
- outputs=audio_output
54
  )
55
 
56
- # Launch the application
57
- app.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import torch
3
+ import torchaudio
 
4
  import tempfile
5
+ from transformers import (
6
+ SpeechT5Processor,
7
+ SpeechT5ForTextToSpeech,
8
+ SpeechT5HifiGan
9
+ )
10
+ import soundfile as sf
11
+
12
+ # 1) Load models at startup
13
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
14
+ tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
15
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
16
+
17
+ SAMPLE_RATE = 16000 # SpeechT5 always uses 16 kHz
18
+
19
+ def generate_speech(reference_wav, text):
20
+ # 2) Load and (if needed) resample the reference audio
21
+ speech_array, sr = torchaudio.load(reference_wav)
22
+ if sr != SAMPLE_RATE:
23
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
24
+ speech_array = resampler(speech_array)
25
+ speech_array = speech_array.squeeze(0) # mono (1, n_samples) → (n_samples,); multi-channel input is left unchanged
26
+
27
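+ # 3) Compute speaker embeddings (NOTE(review): SpeechT5Processor does not appear to expose `speaker_encoder` — confirm; x-vector embeddings normally come from a separate speaker-embedding model)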
28
+ with torch.no_grad():
29
+ speaker_embeds = processor.speaker_encoder(
30
+ speech_array, sampling_rate=SAMPLE_RATE
31
+ )
32
+
33
+ # 4) Prepare text and generate speech
34
+ inputs = processor(text=text, return_tensors="pt")
35
+ with torch.no_grad():
36
+ speech = tts_model.generate_speech(
37
+ inputs["input_ids"],
38
+ speaker_embeds,
39
+ vocoder=vocoder
40
+ )
41
+
42
+ # 5) Save to a temp WAV and return path
43
+ tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
44
+ sf.write(tmp.name, speech.cpu().numpy(), SAMPLE_RATE)
45
+ return tmp.name
46
+
47
+ # 6) Build Gradio interface
48
+ with gr.Blocks(title="SpeechT5 Voice Cloning TTS") as app:
49
+ gr.Markdown("## Voice Cloning Text-to-Speech with SpeechT5")
50
+ gr.Markdown(
51
+ "Upload a short English voice sample, type any text, "
52
+ "and hear it spoken back in **your** voice!"
53
  )
54
 
55
+ with gr.Row():
56
+ audio_in = gr.Audio(type="filepath", label="Your Voice Sample (wav/16 kHz)")
57
+ txt_in = gr.Textbox(
58
+ label="Text to Synthesize",
59
+ placeholder="e.g., ``Hello, this is my cloned voice!``"
60
+ )
61
+
62
+ btn = gr.Button("Generate Speech")
63
+ audio_out = gr.Audio(label="Cloned Speech Output", interactive=False)
64
+
65
+ btn.click(fn=generate_speech, inputs=[audio_in, txt_in], outputs=audio_out)
66
+
67
+ if __name__ == "__main__":
68
+ app.launch()