# Hugging Face Space (status: Running)
import gradio as gr | |
import torch | |
import tempfile | |
import soundfile as sf | |
from tortoise.api import TextToSpeech | |
from tortoise.utils.audio import load_audio | |
# 1) Initialize the Tortoise TTS engine once, at startup, as a module-level
#    object; generate_speech() below uses this shared instance.
tts = TextToSpeech()  # Downloads and caches models automatically
# 2) Define a helper to generate speech from a reference clip + text | |
def generate_speech(reference_audio_path, text):
    """Synthesize ``text`` in the voice of a reference recording.

    Args:
        reference_audio_path: Filepath to the reference clip (the UI asks for
            a WAV sampled at 22 050 Hz). It is read with ``load_audio`` using
            a sampling rate of 22 050.
        text: The string to synthesize.

    Returns:
        Path to a temporary WAV file written at 24 kHz. The file is created
        with ``delete=False``, so it is not removed automatically.

    Raises:
        gr.Error: If no reference clip was provided or ``text`` is blank, so
            the message is shown to the user instead of an opaque failure
            from the audio loader or the model.
    """
    # Gradio passes None when the user has not uploaded/recorded anything.
    if not reference_audio_path:
        raise gr.Error("Please upload a reference voice clip first.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # load_audio requires the sampling rate as a positional argument.
    ref_waveform = load_audio(reference_audio_path, 22050)

    # 'fast' preset (alternatives: ultra_fast, standard, high_quality).
    output_tensor = tts.tts_with_preset(
        text,
        voice_samples=[ref_waveform],
        preset="fast",
    )

    wav_np = output_tensor.squeeze().cpu().numpy()

    # Reserve a unique path, then close the handle *before* writing: leaving
    # it open leaks a file descriptor per request and prevents reopening the
    # file by name on Windows.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, wav_np, samplerate=24000)
    return out_path
# 3) Build the Gradio interface | |
# Top-level UI definition: a reference-audio upload and a text box feed
# generate_speech(), whose returned file path is shown in an audio player.
# NOTE(review): the nesting below (which widgets sit inside the Row) was
# reconstructed from a source that had lost its indentation — confirm.
with gr.Blocks(title="Tortoise Voice Cloning TTS") as app:
    gr.Markdown("## 🗣️ Voice Cloning with Tortoise TTS")
    gr.Markdown(
        "Upload a ~10 sec WAV clip (22 050 Hz), enter English text, "
        "and hear it spoken back in **your** voice!"
    )

    with gr.Row():
        # type="filepath" hands generate_speech a path string rather than
        # raw sample data.
        voice_sample = gr.Audio(
            type="filepath",
            label="🎙️ Upload Reference Voice (22 050 Hz WAV)",
        )
        text_input = gr.Textbox(
            label="💬 Text to Synthesize",
            placeholder="e.g., Hello, world!",
        )

    # NOTE(review): the original emoji on this button was unrecoverable from
    # the mis-encoded source; the rocket is an assumption — confirm.
    generate_btn = gr.Button("🚀 Generate Speech")
    output_audio = gr.Audio(
        label="📢 Cloned Speech Output (24 kHz)",
        interactive=False,
    )

    generate_btn.click(
        fn=generate_speech,
        inputs=[voice_sample, text_input],
        outputs=output_audio,
    )

# Only start the server when executed as a script, not when imported.
if __name__ == "__main__":
    app.launch()