File size: 1,992 Bytes
9414725
 
8ab8ed4
9414725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ab8ed4
 
9414725
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import gradio as gr
import torch
import tempfile
import soundfile as sf
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio

# 1) Initialize the Tortoise TTS engine once, at import time, so that every
#    call to generate_speech() below reuses this single module-level instance
#    instead of constructing a new engine per request.
# NOTE(review): the original note states that construction downloads and
# caches the model weights automatically - confirm against the Tortoise docs.
tts = TextToSpeech()

# 2) Define a helper to generate speech from a reference clip + text
def generate_speech(reference_audio_path, text):
    """Synthesize ``text`` in the voice of a reference clip.

    Args:
        reference_audio_path: Filepath of the reference recording. It is
            loaded through ``load_audio`` with a sampling rate of 22 050 Hz.
        text: The string to synthesize.

    Returns:
        Path to a temporary ``.wav`` file written at 24 kHz. The file is
        created with ``delete=False``, so it is NOT removed automatically;
        cleaning it up is left to the caller / the operating system.

    Raises:
        ValueError: If no reference clip was supplied or ``text`` is blank.
    """
    # Fail fast with a readable message: an empty UI input (None or "") would
    # otherwise surface as an obscure error from deep inside the model code.
    if not reference_audio_path:
        raise ValueError("Please upload a reference voice clip first.")
    if not text or not text.strip():
        raise ValueError("Please enter some text to synthesize.")

    # The sampling rate is passed as the second positional argument.
    ref_waveform = load_audio(reference_audio_path, 22050)

    # Generate speech using the 'fast' preset
    # (alternatives: ultra_fast, standard, high_quality).
    output_tensor = tts.tts_with_preset(
        text,
        voice_samples=[ref_waveform],
        preset="fast",
    )

    # Drop singleton dimensions and move to host memory for soundfile.
    wav_np = output_tensor.squeeze().cpu().numpy()

    # Reserve a unique path and close the handle *before* writing: leaving the
    # NamedTemporaryFile open leaks a file descriptor on every call and, on
    # Windows, prevents another writer from opening the same path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    sf.write(output_path, wav_np, samplerate=24000)
    return output_path

# 3) Build the Gradio interface
# NOTE: the labels below contain literal emoji, so this file must be saved
# and read as UTF-8 for them to render correctly in the browser.
with gr.Blocks(title="Tortoise Voice Cloning TTS") as app:
    gr.Markdown("## 🗣️ Voice Cloning with Tortoise TTS")
    gr.Markdown(
        "Upload a ~10 sec WAV clip (22 050 Hz), enter English text, "
        "and hear it spoken back in **your** voice!"
    )

    # Inputs side by side: the reference recording and the text to speak.
    with gr.Row():
        # type="filepath" so generate_speech receives a path string.
        voice_sample = gr.Audio(
            type="filepath",
            label="🎙️ Upload Reference Voice (22 050 Hz WAV)",
        )
        text_input = gr.Textbox(
            label="💬 Text to Synthesize",
            placeholder="e.g., Hello, world!",
        )

    generate_btn = gr.Button("🔊 Generate Speech")
    # Output-only player for the WAV path returned by generate_speech.
    output_audio = gr.Audio(
        label="📢 Cloned Speech Output (24 kHz)",
        interactive=False,
    )

    # Wire the button: (reference clip, text) -> synthesized audio file.
    generate_btn.click(
        fn=generate_speech,
        inputs=[voice_sample, text_input],
        outputs=output_audio,
    )

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    app.launch()