# Page-scrape residue (not program code) - Spaces status: Running
# File size: 1,992 Bytes | revisions: 9414725 8ab8ed4 9414725 8ab8ed4 9414725
import gradio as gr
import torch
import tempfile
import soundfile as sf
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio
# 1) Initialize the Tortoise TTS engine once, at import time; generate_speech()
#    below reads this module-level instance on every call.
tts = TextToSpeech() # Downloads and caches models automatically
# 2) Define a helper to generate speech from a reference clip + text
def generate_speech(reference_audio_path, text):
    """Synthesize *text* in the voice of a reference recording.

    Args:
        reference_audio_path: Filesystem path to the reference clip (the UI
            asks for a WAV sampled at 22050 Hz). It is read with
            ``load_audio`` using 22050 as the sampling rate.
        text: The string to synthesize.

    Returns:
        Path to a newly created temporary ``.wav`` file written at 24 kHz.
        The file is created with ``delete=False``, so it is not removed
        automatically.

    Raises:
        ValueError: If no reference clip path was supplied, or if *text* is
            ``None``, empty, or whitespace-only.
    """
    # Fail early with a readable message instead of an opaque error from deep
    # inside the audio loader or the model (the UI presumably passes None when
    # nothing was uploaded -- confirm against the Gradio version in use).
    if not reference_audio_path:
        raise ValueError("Please upload a reference voice clip.")
    if text is None or not text.strip():
        raise ValueError("Please enter some text to synthesize.")

    # load_audio takes the sampling rate as a required positional argument.
    ref_waveform = load_audio(reference_audio_path, 22050)

    # Generate speech using the 'fast' preset
    # (alternatives: ultra_fast, standard, high_quality).
    output_tensor = tts.tts_with_preset(
        text,
        voice_samples=[ref_waveform],
        preset="fast",
    )
    wav_np = output_tensor.squeeze().cpu().numpy()

    # Reserve a unique .wav path and close the handle right away: the original
    # left it open, leaking one descriptor per call and making sf.write reopen
    # a file that was still held open (which fails on some platforms).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name

    sf.write(out_path, wav_np, samplerate=24000)
    return out_path
# 3) Build the Gradio interface
with gr.Blocks(title="Tortoise Voice Cloning TTS") as app:
    # NOTE(review): the emoji prefixes in the labels below look mis-encoded
    # (mojibake); the strings are reproduced unchanged -- confirm against the
    # original file. Widget nesting was reconstructed from syntax because the
    # indentation was lost -- confirm which widgets belong inside the Row.

    # Heading plus one paragraph of usage instructions.
    gr.Markdown("## π£οΈ Voice Cloning with Tortoise TTS")
    gr.Markdown(
        "Upload a ~10 sec WAV clip (22 050 Hz), "
        "enter English text, and hear it spoken back in **your** voice!"
    )

    # Inputs side by side: the reference recording and the text to speak.
    with gr.Row():
        voice_sample = gr.Audio(
            type="filepath",
            label="ποΈ Upload Reference Voice (22 050 Hz WAV)",
        )
        text_input = gr.Textbox(
            label="π¬ Text to Synthesize",
            placeholder="e.g., Hello, world!",
        )

    # Trigger button and the read-only player that receives the result.
    generate_btn = gr.Button("π Generate Speech")
    output_audio = gr.Audio(
        label="π’ Cloned Speech Output (24 kHz)",
        interactive=False,
    )

    # Clicking the button sends (clip path, text) to generate_speech and
    # routes the returned file path into the output player.
    generate_btn.click(
        generate_speech,
        [voice_sample, text_input],
        output_audio,
    )
if __name__ == "__main__":
    # Launch the Gradio app only when this file is executed as a script,
    # not when it is imported.
    app.launch()