Spaces:

shukdevdatta123
/

VocalForge-AI

Running

App Files Files Community

shukdevdatta123 committed about 1 month ago

Commit

e93cca7

verified ·

1 Parent(s): 0294388

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -50

app.py CHANGED Viewed

@@ -1,68 +1,58 @@
 import gradio as gr
 import torch
-import torchaudio
 import tempfile
-from transformers import (
-    SpeechT5Processor,
-    SpeechT5ForTextToSpeech,
-    SpeechT5HifiGan
-)
 import soundfile as sf
-# 1) Load models at startup
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-vocoder   = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-SAMPLE_RATE = 16000  # SpeechT5 always uses 16 kHz :contentReference[oaicite:0]{index=0}
-def generate_speech(reference_wav, text):
-    # 2) Load and (if needed) resample the reference audio
-    speech_array, sr = torchaudio.load(reference_wav)
-    if sr != SAMPLE_RATE:
-        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
-        speech_array = resampler(speech_array)
-    speech_array = speech_array.squeeze(0)  # (channels=1) → (n_samples,)
-    # 3) Compute speaker embeddings
-    with torch.no_grad():
-        speaker_embeds = processor.speaker_encoder(
-            speech_array, sampling_rate=SAMPLE_RATE
-        )
-    # 4) Prepare text and generate speech
-    inputs = processor(text=text, return_tensors="pt")
-    with torch.no_grad():
-        speech = tts_model.generate_speech(
-            inputs["input_ids"],
-            speaker_embeds,
-            vocoder=vocoder
-        )
-    # 5) Save to a temp WAV and return path
     tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-    sf.write(tmp.name, speech.cpu().numpy(), SAMPLE_RATE)
     return tmp.name
-# 6) Build Gradio interface
-with gr.Blocks(title="SpeechT5 Voice Cloning TTS") as app:
-    gr.Markdown("## Voice Cloning Text-to-Speech with SpeechT5")
     gr.Markdown(
-        "Upload a short English voice sample, type any text, "
         "and hear it spoken back in **your** voice!"
     )
     with gr.Row():
-        audio_in = gr.Audio(type="filepath", label="Your Voice Sample (wav/16 kHz)")
-        txt_in   = gr.Textbox(
-            label="Text to Synthesize",
-            placeholder="e.g., ``Hello, this is my cloned voice!``"
-        )
-    btn  = gr.Button("Generate Speech")
-    audio_out = gr.Audio(label="Cloned Speech Output", interactive=False)
-    btn.click(fn=generate_speech, inputs=[audio_in, txt_in], outputs=audio_out)
 if __name__ == "__main__":
     app.launch()

 import gradio as gr
 import torch
 import tempfile
 import soundfile as sf
+from tortoise.api import TextToSpeech
+from tortoise.utils.audio import load_audio
+# 1) Initialize the Tortoise TTS engine at startup
+tts = TextToSpeech()  # downloads and caches models automatically
+# 2) Define a helper to generate speech from a reference clip + text
+def generate_speech(reference_audio_path, text):
+    """
+    Clone the voice in a reference clip and speak `text` with it.
+    reference_audio_path: filepath to the reference clip (loaded at 22 050 Hz)
+    text: the string to synthesize
+    returns: path to a temporary WAV file written at 24 kHz
+    """
+    # Load the reference clip at 22 050 Hz. The rate is passed positionally because
+    # tortoise's load_audio() names that parameter `sampling_rate`; `sr=` raises TypeError.
+    ref_waveform = load_audio(reference_audio_path, 22050)
+    # Synthesize one clip with the 'fast' preset for a decent speed/quality tradeoff
+    output_tensor = tts.tts_with_preset(
+        text,
+        voice_samples=[ref_waveform],
+        preset="fast"
+    )
+    # Drop singleton dimensions and convert to NumPy before writing the WAV
+    wav_np = output_tensor.squeeze().cpu().numpy()
     tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    sf.write(tmp.name, wav_np, samplerate=24000)  # written by path; delete=False keeps the file for the caller
     return tmp.name
+# 3) Build the Gradio interface: voice upload + text box in one row, a button wired to generate_speech, and an audio output
+with gr.Blocks(title="Tortoise Voice Cloning TTS") as app:
+    gr.Markdown("## Voice Cloning with Tortoise TTS")
     gr.Markdown(
+        "Upload a ~10 sec WAV clip (22 050 Hz), enter English text, "
         "and hear it spoken back in **your** voice!"
     )
     with gr.Row():
+        voice_sample = gr.Audio(type="filepath", label="Upload Reference Voice (22 050 Hz WAV)")
+        text_input   = gr.Textbox(label="Text to Synthesize", placeholder="e.g., Hello, world!")
+    generate_btn = gr.Button("Generate Speech")
+    output_audio = gr.Audio(label="Cloned Speech Output (24 kHz)", interactive=False)
+    generate_btn.click(
+        fn=generate_speech,
+        inputs=[voice_sample, text_input],
+        outputs=output_audio
+    )
 if __name__ == "__main__":
     app.launch()