"""Gradio demo: voice-cloning text-to-speech built on SpeechT5.

Loads the SpeechT5 processor, TTS model and HiFi-GAN vocoder when the module
is imported, then serves a small web UI that takes a reference recording plus
some text and returns the synthesized speech as a WAV file.
"""

import gradio as gr
import torch
import torchaudio
import tempfile
from transformers import (
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan
)
import soundfile as sf

# 1) Load models at startup
# All three objects are created via ``from_pretrained`` at import time and
# kept as module globals, so every call to generate_speech reuses them.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder   = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Rate (Hz) the reference audio is resampled to and the output WAV is written at.
SAMPLE_RATE = 16000  # SpeechT5 always uses 16 kHz

def generate_speech(reference_wav, text):
    """Synthesize ``text`` using the speaker heard in ``reference_wav``.

    Args:
        reference_wav: Path to the reference recording. ``None`` or an empty
            value (no sample provided) is rejected.
        text: The text to speak. Blank or whitespace-only text is rejected.

    Returns:
        Path to a temporary ``.wav`` file written at ``SAMPLE_RATE``. The file
        is created with ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If the voice sample or the text is missing, or the
            recording contains no samples.
    """
    # 1) Validate inputs up front so the user gets a readable message instead
    #    of an opaque failure from the audio loader or the tokenizer.
    if not reference_wav:
        raise gr.Error("Please upload a voice sample first.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # 2) Load the reference audio; torchaudio returns (channels, n_samples).
    waveform, sr = torchaudio.load(reference_wav)
    if waveform.numel() == 0:
        raise gr.Error("The uploaded voice sample contains no audio.")

    # Down-mix by averaging over the channel axis. A plain squeeze(0) only
    # yields a 1-D signal for mono input and leaves stereo as (2, n_samples).
    waveform = waveform.mean(dim=0)
    if sr != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
        waveform = resampler(waveform)

    with torch.no_grad():
        # 3) Compute speaker embeddings
        # NOTE(review): the documented SpeechT5Processor API exposes a feature
        # extractor and a tokenizer; confirm that `speaker_encoder` exists on
        # the installed version. The reference examples derive x-vectors from
        # a separate speaker-verification model instead - TODO confirm/replace.
        speaker_embeds = processor.speaker_encoder(
            waveform, sampling_rate=SAMPLE_RATE
        )

        # 4) Prepare text and generate speech
        inputs = processor(text=text, return_tensors="pt")
        speech = tts_model.generate_speech(
            inputs["input_ids"],
            speaker_embeds,
            vocoder=vocoder,
        )

    # 5) Reserve a temp path, closing the handle before soundfile reopens it
    #    (avoids a leaked descriptor and a sharing violation on Windows).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, speech.cpu().numpy(), SAMPLE_RATE)
    return out_path

# 6) Build Gradio interface
# Everything created inside this `with` block belongs to the `app` Blocks
# object, which is launched by the __main__ guard below.
with gr.Blocks(title="SpeechT5 Voice Cloning TTS") as app:
    # Page heading and a short usage hint.
    gr.Markdown("## Voice Cloning Text-to-Speech with SpeechT5")
    gr.Markdown(
        "Upload a short English voice sample, type any text, "
        "and hear it spoken back in **your** voice!"
    )

    # Inputs: the reference recording and the text to speak.
    with gr.Row():
        # type="filepath": generate_speech passes this value straight to
        # torchaudio.load, so it must arrive as a path rather than raw samples.
        audio_in = gr.Audio(type="filepath", label="Your Voice Sample (wav/16 kHz)")
        txt_in   = gr.Textbox(
            label="Text to Synthesize",
            placeholder="e.g., ``Hello, this is my cloned voice!``"
        )

    btn  = gr.Button("Generate Speech")
    # Output-only player for the WAV path returned by generate_speech.
    audio_out = gr.Audio(label="Cloned Speech Output", interactive=False)

    # Clicking the button calls generate_speech(audio_in, txt_in) and sends
    # its return value to audio_out.
    btn.click(fn=generate_speech, inputs=[audio_in, txt_in], outputs=audio_out)

# Start the web server only when the file is run as a script, not on import.
if __name__ == "__main__":
    app.launch()