shukdevdatta123 committed on
Commit
e93cca7
·
verified ·
1 Parent(s): 0294388

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -50
app.py CHANGED
@@ -1,68 +1,58 @@
1
  import gradio as gr
2
  import torch
3
- import torchaudio
4
  import tempfile
5
- from transformers import (
6
- SpeechT5Processor,
7
- SpeechT5ForTextToSpeech,
8
- SpeechT5HifiGan
9
- )
10
  import soundfile as sf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- # 1) Load models at startup
13
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
14
- tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
15
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
16
-
17
- SAMPLE_RATE = 16000 # SpeechT5 always uses 16 kHz
18
-
19
- def generate_speech(reference_wav, text):
20
- # 2) Load and (if needed) resample the reference audio
21
- speech_array, sr = torchaudio.load(reference_wav)
22
- if sr != SAMPLE_RATE:
23
- resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
24
- speech_array = resampler(speech_array)
25
- speech_array = speech_array.squeeze(0) # (channels=1) → (n_samples,)
26
-
27
- # 3) Compute speaker embeddings
28
- with torch.no_grad():
29
- speaker_embeds = processor.speaker_encoder(
30
- speech_array, sampling_rate=SAMPLE_RATE
31
- )
32
-
33
- # 4) Prepare text and generate speech
34
- inputs = processor(text=text, return_tensors="pt")
35
- with torch.no_grad():
36
- speech = tts_model.generate_speech(
37
- inputs["input_ids"],
38
- speaker_embeds,
39
- vocoder=vocoder
40
- )
41
-
42
- # 5) Save to a temp WAV and return path
43
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
44
- sf.write(tmp.name, speech.cpu().numpy(), SAMPLE_RATE)
45
  return tmp.name
46
 
47
- # 6) Build Gradio interface
48
- with gr.Blocks(title="SpeechT5 Voice Cloning TTS") as app:
49
- gr.Markdown("## Voice Cloning Text-to-Speech with SpeechT5")
50
  gr.Markdown(
51
- "Upload a short English voice sample, type any text, "
52
  "and hear it spoken back in **your** voice!"
53
  )
54
 
55
  with gr.Row():
56
- audio_in = gr.Audio(type="filepath", label="Your Voice Sample (wav/16 kHz)")
57
- txt_in = gr.Textbox(
58
- label="Text to Synthesize",
59
- placeholder="e.g., ``Hello, this is my cloned voice!``"
60
- )
61
 
62
- btn = gr.Button("Generate Speech")
63
- audio_out = gr.Audio(label="Cloned Speech Output", interactive=False)
64
 
65
- btn.click(fn=generate_speech, inputs=[audio_in, txt_in], outputs=audio_out)
 
 
 
 
66
 
67
  if __name__ == "__main__":
68
  app.launch()
 
1
  import gradio as gr
2
  import torch
 
3
  import tempfile
 
 
 
 
 
4
  import soundfile as sf
5
+ from tortoise.api import TextToSpeech
6
+ from tortoise.utils.audio import load_audio
7
+
8
+ # 1) Initialize the Tortoise TTS engine at startup
9
+ tts = TextToSpeech() # downloads and caches models automatically
10
+
11
+ # 2) Define a helper to generate speech from a reference clip + text
12
def generate_speech(reference_audio_path, text):
    """Synthesize ``text`` using a reference clip as the voice sample.

    Args:
        reference_audio_path: Filesystem path to the reference recording.
            It is handed to ``load_audio`` with a 22050 Hz target rate.
        text: The string to synthesize.

    Returns:
        Path to a temporary ``.wav`` file written with a 24000 Hz sample
        rate. The file is created with ``delete=False``, so it is not
        removed automatically.

    Raises:
        ValueError: If no reference clip was supplied or ``text`` is blank.
    """
    # Fail early with a readable message instead of an opaque error from
    # deep inside the audio loader or the synthesis engine.
    if not reference_audio_path:
        raise ValueError("Please provide a reference voice clip.")
    if not text or not text.strip():
        raise ValueError("Please enter some text to synthesize.")

    reference_rate = 22050
    output_rate = 24000

    # The rate is passed positionally: the loader's second parameter is
    # named `sampling_rate`, so the previous `sr=` keyword raised TypeError.
    ref_waveform = load_audio(reference_audio_path, reference_rate)

    # One reference clip, 'fast' preset as a speed/quality trade-off.
    output_tensor = tts.tts_with_preset(
        text,
        voice_samples=[ref_waveform],
        preset="fast",
    )

    # Drop singleton dimensions and move to host memory for soundfile.
    wav_np = output_tensor.squeeze().cpu().numpy()

    # Reserve a unique file name, then close the handle before writing so
    # the descriptor is not leaked and the path can be reopened by name.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, wav_np, samplerate=output_rate)
    return out_path
35
 
36
# 3) Assemble the web UI: a reference clip and a text prompt go in,
#    the synthesized recording comes out.
with gr.Blocks(title="Tortoise Voice Cloning TTS") as app:
    gr.Markdown("## Voice Cloning with Tortoise TTS")
    intro_text = (
        "Upload a ~10 sec WAV clip (22 050 Hz), enter English text, "
        "and hear it spoken back in **your** voice!"
    )
    gr.Markdown(intro_text)

    # Both inputs share a single row.
    with gr.Row():
        reference_clip = gr.Audio(
            label="Upload Reference Voice (22 050 Hz WAV)",
            type="filepath",
        )
        prompt_box = gr.Textbox(
            placeholder="e.g., Hello, world!",
            label="Text to Synthesize",
        )

    run_button = gr.Button("Generate Speech")
    result_player = gr.Audio(
        interactive=False,
        label="Cloned Speech Output (24 kHz)",
    )

    # Clicking the button passes (clip path, text) to generate_speech and
    # routes its return value to the output audio component.
    handler_inputs = [reference_clip, prompt_box]
    run_button.click(
        fn=generate_speech,
        inputs=handler_inputs,
        outputs=result_player,
    )


if __name__ == "__main__":
    app.launch()