# VocalForge-AI / app.py
# Last commit: "Update app.py" (0294388, verified) by shukdevdatta123; file size 2.35 kB
import gradio as gr
import torch
import torchaudio
import tempfile
from transformers import (
SpeechT5Processor,
SpeechT5ForTextToSpeech,
SpeechT5HifiGan
)
import soundfile as sf
# 1) Load models at startup
# Everything below runs once at import time; the resulting objects are
# module-level globals shared by every call to generate_speech().
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)

# SpeechT5 always uses 16 kHz audio; this rate is used both for the
# reference recording and for the WAV file that is written out.
SAMPLE_RATE = 16_000
def generate_speech(reference_wav, text):
    """Synthesize ``text`` in the voice heard in ``reference_wav``.

    Args:
        reference_wav: Path to the reference recording, or ``None``/empty
            when the user has not supplied one.
        text: The text to speak.

    Returns:
        Path to a temporary WAV file (written at ``SAMPLE_RATE``) holding
        the generated speech. The file is created with ``delete=False`` and
        is not removed by this function.

    Raises:
        gr.Error: If the reference audio or the text is missing.
    """
    # Fail early with a message Gradio can show to the user instead of an
    # obscure traceback from the audio loader or the tokenizer.
    if not reference_wav:
        raise gr.Error("Please upload a voice sample first.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # 2) Load and (if needed) resample the reference audio
    speech_array, sr = torchaudio.load(reference_wav)
    if sr != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sr, new_freq=SAMPLE_RATE
        )
        speech_array = resampler(speech_array)
    # Downmix by averaging over the channel axis: (channels, n_samples) →
    # (n_samples,). Unlike squeeze(0), this also collapses stereo input;
    # for a single channel the samples are unchanged.
    speech_array = speech_array.mean(dim=0)

    # 3) Compute speaker embeddings
    # NOTE(review): the documented SpeechT5Processor wraps a feature
    # extractor and a tokenizer only — confirm that `speaker_encoder`
    # exists in the installed transformers version. SpeechT5 examples
    # usually derive the speaker x-vector with a separate
    # speaker-verification model, which is not a dependency of this file.
    with torch.no_grad():
        speaker_embeds = processor.speaker_encoder(
            speech_array, sampling_rate=SAMPLE_RATE
        )

    # 4) Prepare text and generate speech
    inputs = processor(text=text, return_tensors="pt")
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"],
            speaker_embeds,
            vocoder=vocoder,
        )

    # 5) Save to a temp WAV and return path
    # Only the unique name is needed. Closing the handle before soundfile
    # opens the path avoids leaking a file descriptor per request and keeps
    # this working on platforms that refuse to reopen an open temp file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, speech.cpu().numpy(), SAMPLE_RATE)
    return out_path
# 6) Build Gradio interface
# NOTE(review): the layout assumes the button and the output player sit
# below the input row rather than inside it — confirm against the running app.
_INTRO_TEXT = (
    "Upload a short English voice sample, type any text, "
    "and hear it spoken back in **your** voice!"
)

with gr.Blocks(title="SpeechT5 Voice Cloning TTS") as app:
    gr.Markdown("## Voice Cloning Text-to-Speech with SpeechT5")
    gr.Markdown(_INTRO_TEXT)

    # Inputs: the reference recording (passed on as a file path) and the
    # text to be spoken.
    with gr.Row():
        audio_in = gr.Audio(
            label="Your Voice Sample (wav/16 kHz)",
            type="filepath",
        )
        txt_in = gr.Textbox(
            placeholder="e.g., ``Hello, this is my cloned voice!``",
            label="Text to Synthesize",
        )

    btn = gr.Button("Generate Speech")
    audio_out = gr.Audio(interactive=False, label="Cloned Speech Output")

    # Clicking the button runs generate_speech on both inputs and plays
    # the returned file in the output component.
    btn.click(
        fn=generate_speech,
        inputs=[audio_in, txt_in],
        outputs=audio_out,
    )


if __name__ == "__main__":
    app.launch()