File size: 1,992 Bytes
4760b00
0294388
4cc61f6
0294388
e93cca7
 
 
 
8d69e71
e93cca7
 
 
 
 
 
 
 
6020e53
 
e93cca7
6020e53
e93cca7
 
 
 
 
0294388
8d69e71
e93cca7
0294388
8d69e71
0294388
 
e93cca7
 
6020e53
0294388
e93cca7
0294388
f74edeb
3220f5e
0294388
6020e53
 
0294388
6020e53
 
0294388
e93cca7
 
 
 
 
0294388
 
6020e53
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import gradio as gr
import torch
import tempfile
import soundfile as sf
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio

# 1) Initialize the Tortoise TTS engine once, at import time, so that every
#    call to generate_speech() reuses this single module-level instance
#    instead of constructing a new engine per request.
tts = TextToSpeech()  # Downloads and caches models automatically

# 2) Define a helper to generate speech from a reference clip + text
def generate_speech(reference_audio_path, text):
    """Synthesize ``text`` in the voice of a reference audio clip.

    Args:
        reference_audio_path: Filepath of the reference voice clip, as
            delivered by the ``gr.Audio(type="filepath")`` input. Expected
            to be empty/``None`` when nothing was uploaded.
        text: The string to synthesize.

    Returns:
        Path to a temporary ``.wav`` file written with a 24 kHz sample
        rate. The file is created with ``delete=False``, so it is not
        removed automatically.

    Raises:
        gr.Error: If no reference clip was supplied or ``text`` is blank,
            so the UI can show a readable message instead of an opaque
            failure from deeper in the pipeline.
    """
    # Fail fast on missing inputs before any expensive model work starts.
    if not reference_audio_path:
        raise gr.Error("Please upload a reference voice clip first.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # load_audio takes the sampling rate as its second positional argument.
    ref_waveform = load_audio(reference_audio_path, 22050)

    # Generate speech using the 'fast' preset
    # (alternatives: ultra_fast, standard, high_quality).
    output_tensor = tts.tts_with_preset(
        text,
        voice_samples=[ref_waveform],
        preset="fast",
    )

    # Drop singleton dimensions and move to host memory for soundfile.
    wav_np = output_tensor.squeeze().cpu().numpy()

    # Reserve a unique path, then close the handle *before* writing to it:
    # leaving it open leaks a file descriptor per request and can prevent
    # reopening the same path on Windows.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, wav_np, samplerate=24000)
    return output_path

# 3) Build the Gradio interface
#    Layout: a heading and instructions, one row holding the two inputs
#    (reference clip + text), a trigger button, and a read-only audio player.
with gr.Blocks(title="Tortoise Voice Cloning TTS") as app:
    gr.Markdown("## 🗣️ Voice Cloning with Tortoise TTS")
    gr.Markdown(
        "Upload a ~10 sec WAV clip (22 050 Hz), enter English text, "
        "and hear it spoken back in **your** voice!"
    )

    with gr.Row():
        # type="filepath" makes Gradio pass the upload to the callback as a
        # path string, which is what generate_speech() expects.
        voice_sample = gr.Audio(
            type="filepath",
            label="🎙️ Upload Reference Voice (22 050 Hz WAV)",
        )
        text_input = gr.Textbox(
            label="💬 Text to Synthesize",
            placeholder="e.g., Hello, world!",
        )

    generate_btn = gr.Button("🔊 Generate Speech")
    # Output-only player; it receives the file path returned by the callback.
    output_audio = gr.Audio(
        label="📢 Cloned Speech Output (24 kHz)",
        interactive=False,
    )

    # Clicking the button runs generate_speech(voice_sample, text_input)
    # and routes its return value into the output player.
    generate_btn.click(
        fn=generate_speech,
        inputs=[voice_sample, text_input],
        outputs=output_audio,
    )

# Start the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    app.launch()