Spaces:

shukdevdatta123
/

VocalForge-AI

Running

App Files Files Community

shukdevdatta123 committed on 29 days ago

Commit

0294388

verified ·

1 Parent(s): 4cc61f6

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -53

app.py CHANGED Viewed

@@ -1,57 +1,68 @@
 import gradio as gr
-from TTS.api import TTS
-import numpy as np
-from scipy.io import wavfile
 import tempfile
-import os
-# Load the YourTTS model once at startup
-tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)
-sample_rate = tts.synthesizer.output_sample_rate
-def generate_speech(reference_audio, text):
-    """
-    Generate speech audio mimicking the voice from the reference audio.
-    Parameters:
-    reference_audio (str): Filepath to the uploaded voice sample.
-    text (str): Text to convert to speech.
-    Returns:
-    str: Path to the generated audio file
-    """
-    # Generate speech using the reference audio and text
-    wav = tts.tts(text=text, speaker_wav=reference_audio, language="en")
-    # Convert list to numpy array
-    wav_np = np.array(wav, dtype=np.float32)
-    # Create a temporary file to save the audio
-    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-    temp_file_path = temp_file.name
-    # Save the audio to the temporary file
-    wavfile.write(temp_file_path, sample_rate, wav_np)
-    temp_file.close()
-    return temp_file_path
-# Build the Gradio interface
-with gr.Blocks(title="Voice Cloning TTS") as app:
-    gr.Markdown("## Voice Cloning Text-to-Speech")
-    gr.Markdown("Upload a short voice sample in English, then enter text to hear it in your voice!")
-    with gr.Row():
-        audio_input = gr.Audio(type="filepath", label="Upload Your Voice Sample (English)")
-        text_input = gr.Textbox(label="Enter Text to Convert to Speech", placeholder="e.g., I love chocolate")
-    generate_btn = gr.Button("Generate Speech")
-    audio_output = gr.Audio(label="Generated Speech", interactive=False)
-    # Connect the button to the generation function
-    generate_btn.click(
-        fn=generate_speech,
-        inputs=[audio_input, text_input],
-        outputs=audio_output
     )
-# Launch the application
-app.launch()

 import gradio as gr
+import torch
+import torchaudio
 import tempfile
+from transformers import (
+    SpeechT5Processor,
+    SpeechT5ForTextToSpeech,
+    SpeechT5HifiGan
+)
+import soundfile as sf
+# 1) Load the SpeechT5 processor, text-to-speech model, and HiFi-GAN vocoder once, at import time
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+vocoder   = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+SAMPLE_RATE = 16000  # SpeechT5 always uses 16 kHz; reference audio is resampled to this rate and output is written at it
+def generate_speech(reference_wav, text):
+    # 2) Load and (if needed) resample the reference audio
+    speech_array, sr = torchaudio.load(reference_wav)
+    if sr != SAMPLE_RATE:
+        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
+        speech_array = resampler(speech_array)
+    speech_array = speech_array.squeeze(0)  # (channels=1) → (n_samples,)
+    # 3) Compute speaker embeddings
+    with torch.no_grad():
+        speaker_embeds = processor.speaker_encoder(
+            speech_array, sampling_rate=SAMPLE_RATE
+        )
+    # 4) Prepare text and generate speech
+    inputs = processor(text=text, return_tensors="pt")
+    with torch.no_grad():
+        speech = tts_model.generate_speech(
+            inputs["input_ids"],
+            speaker_embeds,
+            vocoder=vocoder
+        )
+    # 5) Save to a temp WAV and return path
+    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    sf.write(tmp.name, speech.cpu().numpy(), SAMPLE_RATE)
+    return tmp.name
+# 6) Build the Gradio interface: voice-sample and text inputs, a trigger button, and a player for the result
+with gr.Blocks(title="SpeechT5 Voice Cloning TTS") as app:
+    gr.Markdown("## Voice Cloning Text-to-Speech with SpeechT5")
+    gr.Markdown(
+        "Upload a short English voice sample, type any text, "
+        "and hear it spoken back in **your** voice!"
     )
+    with gr.Row():  # both inputs share one row
+        audio_in = gr.Audio(type="filepath", label="Your Voice Sample (wav/16 kHz)")  # type="filepath": handler receives a path
+        txt_in   = gr.Textbox(
+            label="Text to Synthesize",
+            placeholder="e.g., ``Hello, this is my cloned voice!``"
+        )
+    btn  = gr.Button("Generate Speech")
+    audio_out = gr.Audio(label="Cloned Speech Output", interactive=False)  # output only; not editable by the user
+    btn.click(fn=generate_speech, inputs=[audio_in, txt_in], outputs=audio_out)  # returned WAV path feeds audio_out
+if __name__ == "__main__":  # launch only when executed as a script, not on import
+    app.launch()