Spaces:
Running
Running
File size: 1,992 Bytes
4760b00 0294388 4cc61f6 0294388 e93cca7 8d69e71 e93cca7 6020e53 e93cca7 6020e53 e93cca7 0294388 8d69e71 e93cca7 0294388 8d69e71 0294388 e93cca7 6020e53 0294388 e93cca7 0294388 f74edeb 3220f5e 0294388 6020e53 0294388 6020e53 0294388 e93cca7 0294388 6020e53 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
import gradio as gr
import torch
import tempfile
import soundfile as sf
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio
# 1) Initialize the Tortoise TTS engine at startup.
# The engine is constructed once, at module import time, and the same
# instance is reused by generate_speech() for every synthesis request.
tts = TextToSpeech()  # Downloads and caches models automatically
# 2) Define a helper to generate speech from a reference clip + text
def generate_speech(reference_audio_path, text):
    """Synthesize ``text`` in the voice of a reference recording.

    Args:
        reference_audio_path: Filepath of the reference voice clip. It is
            loaded through ``load_audio`` with a 22 050 Hz sampling rate.
        text: The string to synthesize.

    Returns:
        Path to a newly created temporary WAV file, written at 24 kHz.
        The file is created with ``delete=False``, so it is not removed
        automatically.

    Raises:
        ValueError: If no reference clip was supplied, or if ``text`` is
            empty or whitespace-only.
    """
    reference_rate = 22050  # sampling rate passed to load_audio
    output_rate = 24000  # sampling rate the generated audio is written at

    # Fail fast with a readable message instead of an opaque error raised
    # from inside audio loading or the model.
    if not reference_audio_path:
        raise ValueError("Please upload a reference voice clip.")
    if not text or not text.strip():
        raise ValueError("Please enter some text to synthesize.")

    # load_audio takes the sampling rate as a required positional argument.
    ref_waveform = load_audio(reference_audio_path, reference_rate)

    # Generate speech using 'fast' preset
    # (alternatives: ultra_fast, standard, high_quality)
    output_tensor = tts.tts_with_preset(
        text,
        voice_samples=[ref_waveform],
        preset="fast",
    )
    wav_np = output_tensor.squeeze().cpu().numpy()

    # Reserve a unique temp path, then close the handle *before* writing:
    # this avoids leaking a file descriptor per request and lets soundfile
    # reopen the path on platforms that forbid a second open (Windows).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, wav_np, samplerate=output_rate)
    return output_path
# 3) Build the Gradio interface
# Layout: a heading and instructions, one row holding the reference-clip
# upload and the text box, a button, and a read-only audio player for the
# result. Emoji in the labels are restored from mis-decoded (mojibake) text.
with gr.Blocks(title="Tortoise Voice Cloning TTS") as app:
    gr.Markdown("## 🗣️ Voice Cloning with Tortoise TTS")
    gr.Markdown(
        "Upload a ~10 sec WAV clip (22 050 Hz), enter English text, "
        "and hear it spoken back in **your** voice!"
    )

    with gr.Row():
        # type="filepath" hands generate_speech a path rather than raw samples.
        voice_sample = gr.Audio(
            type="filepath",
            label="🎙️ Upload Reference Voice (22 050 Hz WAV)",
        )
        text_input = gr.Textbox(
            label="💬 Text to Synthesize",
            placeholder="e.g., Hello, world!",
        )

    generate_btn = gr.Button("🔊 Generate Speech")
    output_audio = gr.Audio(
        label="📢 Cloned Speech Output (24 kHz)",
        interactive=False,
    )

    # Clicking the button runs generate_speech(voice_sample, text_input) and
    # plays back the WAV file whose path it returns.
    generate_btn.click(
        fn=generate_speech,
        inputs=[voice_sample, text_input],
        outputs=output_audio,
    )
if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script.
    app.launch()