# Hugging Face Space (status: Running)
import tempfile

import gradio as gr
import soundfile as sf
import torch
import torchaudio
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)
# 1) Load models at startup
# The processor and the TTS model are loaded from the same checkpoint; the
# HiFi-GAN vocoder comes from its own checkpoint.
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)

# Sample rate (Hz) used for both the reference audio and the written output.
SAMPLE_RATE = 16_000  # SpeechT5 always uses 16 kHz
def generate_speech(reference_wav, text):
    """Synthesize ``text`` using the voice heard in ``reference_wav``.

    Args:
        reference_wav: Path to the reference recording, as readable by
            ``torchaudio.load``.
        text: The text to speak.

    Returns:
        Path to a newly created temporary ``.wav`` file containing the
        synthesized speech, written at ``SAMPLE_RATE``. The file is not
        deleted by this function.

    Raises:
        gr.Error: If no reference recording or no (non-blank) text is given.
    """
    # Fail early with a user-readable message instead of a stack trace from
    # deep inside torchaudio / the tokenizer.
    if not reference_wav:
        raise gr.Error("Please upload a voice sample first.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # 2) Load the reference audio, mix it down to mono, resample if needed
    speech_array, sr = torchaudio.load(reference_wav)  # (channels, n_samples)
    # Average over the channel axis rather than squeeze(0): squeeze is a no-op
    # for multi-channel files and would leave a 2-D tensor behind.
    speech_array = speech_array.mean(dim=0)  # (channels, n_samples) → (n_samples,)
    if sr != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
        speech_array = resampler(speech_array)

    # 3) Compute speaker embeddings
    # NOTE(review): `speaker_encoder` is not a documented attribute of
    # SpeechT5Processor, so this call likely raises AttributeError. SpeechT5
    # is normally fed x-vector speaker embeddings produced by a separate
    # speaker-verification model — confirm and replace.
    with torch.no_grad():
        speaker_embeds = processor.speaker_encoder(
            speech_array, sampling_rate=SAMPLE_RATE
        )

    # 4) Prepare text and generate speech
    inputs = processor(text=text, return_tensors="pt")
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"],
            speaker_embeds,
            vocoder=vocoder,
        )

    # 5) Save to a temp WAV and return path
    # Only the unique name is needed; close the handle before soundfile opens
    # the path so no descriptor is leaked and the file is not opened twice.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, speech.cpu().numpy(), SAMPLE_RATE)
    return out_path
# 6) Build Gradio interface
# NOTE(review): the nesting below is reconstructed — the two input widgets
# are assumed to share one row, with the button and the output underneath.
# Confirm against the deployed layout.
with gr.Blocks(title="SpeechT5 Voice Cloning TTS") as app:
    # Heading followed by a short usage blurb.
    gr.Markdown("## Voice Cloning Text-to-Speech with SpeechT5")
    intro_text = (
        "Upload a short English voice sample, type any text, "
        "and hear it spoken back in **your** voice!"
    )
    gr.Markdown(intro_text)

    # Inputs: reference recording (passed on as a file path) and the text.
    with gr.Row():
        audio_in = gr.Audio(
            label="Your Voice Sample (wav/16 kHz)",
            type="filepath",
        )
        txt_in = gr.Textbox(
            placeholder="e.g., ``Hello, this is my cloned voice!``",
            label="Text to Synthesize",
        )

    btn = gr.Button("Generate Speech")
    audio_out = gr.Audio(interactive=False, label="Cloned Speech Output")

    # Clicking the button runs generate_speech on the two inputs and sends
    # its return value to the output player.
    btn.click(generate_speech, [audio_in, txt_in], audio_out)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()