shukdevdattaEX committed on
Commit
29543b1
·
verified ·
1 Parent(s): 63564a9

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +56 -0
  2. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import tempfile
4
+ import soundfile as sf
5
+ from tortoise.api import TextToSpeech
6
+ from tortoise.utils.audio import load_audio
7
+
8
+ # 1) Initialize the Tortoise TTS engine at startup
9
+ tts = TextToSpeech() # Downloads and caches models automatically
10
+
11
+ # 2) Define a helper to generate speech from a reference clip + text
12
def generate_speech(reference_audio_path, text):
    """Synthesize ``text`` using a reference recording as the voice sample.

    Args:
        reference_audio_path: Filesystem path to the reference voice clip.
            It is read with ``load_audio`` using 22050 as the sampling-rate
            argument, so the clip is handed to Tortoise at that rate.
        text: The string to synthesize.

    Returns:
        Path to a newly created temporary ``.wav`` file written at 24 kHz.
        The file is created with ``delete=False``; it is not removed
        automatically, so the caller (here, Gradio) can serve it.

    Raises:
        gr.Error: If no reference clip was supplied or ``text`` is blank.
    """
    # Gradio hands the callback None when nothing was uploaded. Fail early
    # with a message the UI can display instead of an opaque error raised
    # from inside the audio loader or the model.
    if not reference_audio_path:
        raise gr.Error("Please upload a reference voice clip first.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # The sampling rate is a required positional argument of load_audio.
    ref_waveform = load_audio(reference_audio_path, 22050)

    # 'fast' preset (alternatives: ultra_fast, standard, high_quality).
    output_tensor = tts.tts_with_preset(
        text,
        voice_samples=[ref_waveform],
        preset="fast",
    )

    wav_np = output_tensor.squeeze().cpu().numpy()

    # Reserve a unique path, then close our handle before soundfile reopens
    # the file by name. Leaving the handle open leaks a descriptor on every
    # request and prevents the reopen on Windows.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, wav_np, samplerate=24000)
    return out_path
33
+
34
+ # 3) Build the Gradio interface
35
# Assemble the page: instructions, the two inputs side by side, a trigger
# button and the audio player that receives the synthesized file.
with gr.Blocks(title="Tortoise Voice Cloning TTS") as app:
    gr.Markdown("## 🗣️ Voice Cloning with Tortoise TTS")
    gr.Markdown(
        "Upload a ~10 sec WAV clip (22 050 Hz), enter English text, "
        "and hear it spoken back in **your** voice!"
    )

    with gr.Row():
        # type="filepath" makes Gradio pass the upload to the callback as a
        # path string, which is what generate_speech expects.
        voice_sample = gr.Audio(type="filepath", label="🎙️ Upload Reference Voice (22 050 Hz WAV)")
        text_input = gr.Textbox(label="💬 Text to Synthesize", placeholder="e.g., Hello, world!")

    generate_btn = gr.Button("🔊 Generate Speech")
    output_audio = gr.Audio(label="📢 Cloned Speech Output (24 kHz)", interactive=False)

    # Wire the button: (reference clip, text) -> path of the generated WAV.
    generate_btn.click(
        fn=generate_speech,
        inputs=[voice_sample, text_input],
        outputs=output_audio,
    )

# Only start the server when run as a script, not when imported.
if __name__ == "__main__":
    app.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ transformers
4
+ datasets
5
+ librosa
6
+ soundfile
7
+ numpy
8
+ sentencepiece
9
+ TTS
10
+ tortoise-tts
11
+ pycryptodome