Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,68 +1,58 @@
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
-
import torchaudio
|
4 |
import tempfile
|
5 |
-
from transformers import (
|
6 |
-
SpeechT5Processor,
|
7 |
-
SpeechT5ForTextToSpeech,
|
8 |
-
SpeechT5HifiGan
|
9 |
-
)
|
10 |
import soundfile as sf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
#
|
13 |
-
|
14 |
-
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
|
15 |
-
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
16 |
-
|
17 |
-
SAMPLE_RATE = 16000 # SpeechT5 always uses 16 kHz :contentReference[oaicite:0]{index=0}
|
18 |
-
|
19 |
-
def generate_speech(reference_wav, text):
|
20 |
-
# 2) Load and (if needed) resample the reference audio
|
21 |
-
speech_array, sr = torchaudio.load(reference_wav)
|
22 |
-
if sr != SAMPLE_RATE:
|
23 |
-
resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
|
24 |
-
speech_array = resampler(speech_array)
|
25 |
-
speech_array = speech_array.squeeze(0) # (channels=1) → (n_samples,)
|
26 |
-
|
27 |
-
# 3) Compute speaker embeddings
|
28 |
-
with torch.no_grad():
|
29 |
-
speaker_embeds = processor.speaker_encoder(
|
30 |
-
speech_array, sampling_rate=SAMPLE_RATE
|
31 |
-
)
|
32 |
-
|
33 |
-
# 4) Prepare text and generate speech
|
34 |
-
inputs = processor(text=text, return_tensors="pt")
|
35 |
-
with torch.no_grad():
|
36 |
-
speech = tts_model.generate_speech(
|
37 |
-
inputs["input_ids"],
|
38 |
-
speaker_embeds,
|
39 |
-
vocoder=vocoder
|
40 |
-
)
|
41 |
-
|
42 |
-
# 5) Save to a temp WAV and return path
|
43 |
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
44 |
-
sf.write(tmp.name,
|
45 |
return tmp.name
|
46 |
|
47 |
-
#
|
48 |
-
with gr.Blocks(title="
|
49 |
-
gr.Markdown("## Voice Cloning
|
50 |
gr.Markdown(
|
51 |
-
"Upload a
|
52 |
"and hear it spoken back in **your** voice!"
|
53 |
)
|
54 |
|
55 |
with gr.Row():
|
56 |
-
|
57 |
-
|
58 |
-
label="Text to Synthesize",
|
59 |
-
placeholder="e.g., ``Hello, this is my cloned voice!``"
|
60 |
-
)
|
61 |
|
62 |
-
|
63 |
-
|
64 |
|
65 |
-
|
|
|
|
|
|
|
|
|
66 |
|
67 |
if __name__ == "__main__":
|
68 |
app.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import torch
|
|
|
3 |
import tempfile
|
|
|
|
|
|
|
|
|
|
|
4 |
import soundfile as sf
|
5 |
+
from tortoise.api import TextToSpeech
|
6 |
+
from tortoise.utils.audio import load_audio
|
7 |
+
|
8 |
+
# 1) Initialize the Tortoise TTS engine at startup
|
9 |
+
# Build the Tortoise engine once at module import; generate_speech() reuses
# this single instance for every request instead of constructing its own.
tts = TextToSpeech()  # per the original note: downloads and caches models automatically (behaviour of the library, not visible in this file)
|
10 |
+
|
11 |
+
# 2) Define a helper to generate speech from a reference clip + text
|
12 |
+
def generate_speech(reference_audio_path, text):
    """Synthesize ``text`` in the voice of a reference recording.

    Args:
        reference_audio_path: Filesystem path of the reference clip. It is
            loaded through Tortoise's ``load_audio`` with a requested rate
            of 22050 Hz.
        text: The string to synthesize.

    Returns:
        Path of a newly created temporary ``.wav`` file, written with a
        sample rate of 24000 Hz. The file is created with ``delete=False``,
        so the caller owns its cleanup.

    Raises:
        ValueError: If no reference clip was supplied or ``text`` is empty
            or whitespace-only.
    """
    # Fail early with a readable message instead of an opaque error from
    # deep inside the audio loader or the model.
    if not reference_audio_path:
        raise ValueError("Please upload a reference voice clip.")
    if not text or not text.strip():
        raise ValueError("Please enter some text to synthesize.")

    # Tortoise's load_audio names its second parameter `sampling_rate`, so the
    # previous `sr=22050` keyword raised TypeError. Passing the rate
    # positionally works regardless of the parameter's name.
    ref_waveform = load_audio(reference_audio_path, 22050)

    # One reference clip, 'fast' preset for a speed/quality trade-off.
    # NOTE(review): the result is presumably a (1, S) tensor at 24 kHz, as
    # the original comment stated; confirm against the Tortoise version used.
    output_tensor = tts.tts_with_preset(
        text,
        voice_samples=[ref_waveform],
        preset="fast",
    )

    # Drop singleton dimensions and move to host memory for soundfile.
    wav_np = output_tensor.squeeze().cpu().numpy()

    # Reserve a unique path, then close the handle before soundfile reopens
    # it: this avoids leaking a file descriptor per request and avoids
    # writing to a file that is still held open (which fails on Windows).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, wav_np, samplerate=24000)
    return out_path
|
35 |
|
36 |
+
# 3) Build the Gradio interface
|
37 |
+
# Gradio front end: a reference-clip upload and a text box side by side,
# a button that runs generate_speech, and a read-only player for the result.
with gr.Blocks(title="Tortoise Voice Cloning TTS") as app:
    # Page heading followed by a short usage hint.
    gr.Markdown("## Voice Cloning with Tortoise TTS")
    gr.Markdown(
        "Upload a ~10 sec WAV clip (22 050 Hz), enter English text, "
        "and hear it spoken back in **your** voice!"
    )

    # Inputs share a single row.
    with gr.Row():
        voice_sample = gr.Audio(
            label="Upload Reference Voice (22 050 Hz WAV)",
            type="filepath",
        )
        text_input = gr.Textbox(
            placeholder="e.g., Hello, world!",
            label="Text to Synthesize",
        )

    generate_btn = gr.Button("Generate Speech")
    output_audio = gr.Audio(
        interactive=False,
        label="Cloned Speech Output (24 kHz)",
    )

    # Clicking the button feeds both inputs to generate_speech and sends the
    # returned value to the output player.
    generate_btn.click(
        generate_speech,
        inputs=[voice_sample, text_input],
        outputs=output_audio,
    )

# Start the server only when this file is executed directly.
if __name__ == "__main__":
    app.launch()
|