Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,57 +1,68 @@
|
|
1 |
import gradio as gr
|
2 |
-
|
3 |
-
import
|
4 |
-
from scipy.io import wavfile
|
5 |
import tempfile
|
6 |
-
import
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
#
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
)
|
55 |
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
import torch
|
3 |
+
import torchaudio
|
|
|
4 |
import tempfile
|
5 |
+
from transformers import (
|
6 |
+
SpeechT5Processor,
|
7 |
+
SpeechT5ForTextToSpeech,
|
8 |
+
SpeechT5HifiGan
|
9 |
+
)
|
10 |
+
import soundfile as sf
|
11 |
+
|
12 |
+
# 1) Create the processor, the text-to-speech model and the vocoder once,
#    when the module is imported, so every request reuses the same objects.
processor = SpeechT5Processor.from_pretrained(
    pretrained_model_name_or_path="microsoft/speecht5_tts",
)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(
    pretrained_model_name_or_path="microsoft/speecht5_tts",
)
vocoder = SpeechT5HifiGan.from_pretrained(
    pretrained_model_name_or_path="microsoft/speecht5_hifigan",
)
|
16 |
+
|
17 |
+
SAMPLE_RATE = 16000 # SpeechT5 always uses 16 kHz
|
18 |
+
|
19 |
+
def generate_speech(reference_wav, text):
    """Synthesize ``text`` using speaker embeddings taken from ``reference_wav``.

    Args:
        reference_wav: Path to the reference recording (the Gradio audio
            input in this file is declared with ``type="filepath"``).
        text: The text to synthesize.

    Returns:
        Path to a ``.wav`` file holding the generated speech, written at
        ``SAMPLE_RATE``. The file is created with ``delete=False``, so it is
        not removed automatically.

    Raises:
        ValueError: If no reference recording or no (non-blank) text is given.
    """
    # 1) Validate up front so the user gets a clear message instead of an
    #    opaque failure from deep inside the audio / model libraries.
    if not reference_wav:
        raise ValueError("A reference voice sample is required.")
    if not text or not text.strip():
        raise ValueError("Text to synthesize must not be empty.")

    # 2) Load and (if needed) resample the reference audio
    speech_array, sr = torchaudio.load(reference_wav)
    if sr != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
        speech_array = resampler(speech_array)

    # Collapse the channel axis by averaging. For a single-channel recording
    # this yields the same samples as ``squeeze(0)``; for stereo (or more)
    # it produces a 1-D mono signal, where ``squeeze(0)`` would have been a
    # no-op and left a 2-D tensor.
    if speech_array.dim() > 1:
        speech_array = speech_array.mean(dim=0)

    # 3) Compute speaker embeddings
    # NOTE(review): SpeechT5Processor is documented as wrapping only a
    # feature extractor and a tokenizer; confirm that ``speaker_encoder``
    # actually exists on it. If it does not, this line raises AttributeError
    # and a separate speaker-embedding (x-vector) model is needed, which
    # would be a new dependency for this file.
    with torch.no_grad():
        speaker_embeds = processor.speaker_encoder(
            speech_array, sampling_rate=SAMPLE_RATE
        )

    # 4) Prepare text and generate speech
    inputs = processor(text=text, return_tensors="pt")
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"],
            speaker_embeds,
            vocoder=vocoder,
        )

    # 5) Reserve a temp file name, closing the handle immediately so it is
    #    not leaked and so the path can be reopened for writing on platforms
    #    that forbid opening an already-open file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, speech.cpu().numpy(), SAMPLE_RATE)
    return out_path
|
46 |
+
|
47 |
+
# 6) Assemble the Gradio page: heading, usage hint, inputs, button, output.
with gr.Blocks(title="SpeechT5 Voice Cloning TTS") as app:
    gr.Markdown(value="## Voice Cloning Text-to-Speech with SpeechT5")
    gr.Markdown(
        value=(
            "Upload a short English voice sample, type any text, "
            "and hear it spoken back in **your** voice!"
        )
    )

    # Reference recording and text prompt share one row.
    with gr.Row():
        audio_in = gr.Audio(label="Your Voice Sample (wav/16 kHz)", type="filepath")
        txt_in = gr.Textbox(
            placeholder="e.g., ``Hello, this is my cloned voice!``",
            label="Text to Synthesize",
        )

    btn = gr.Button(value="Generate Speech")
    audio_out = gr.Audio(interactive=False, label="Cloned Speech Output")

    # Clicking the button runs generate_speech on the two inputs and sends
    # its return value to the output audio component.
    btn.click(generate_speech, [audio_in, txt_in], audio_out)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()
|