Spaces:

shukdevdatta123
/

VocalForge-AI

Running

App Files Files Community

VocalForge-AI / app.py

shukdevdatta123

Update app.py

0294388 verified about 1 month ago

raw

history blame

2.35 kB

	import gradio as gr
	import torch
	import torchaudio
	import tempfile
	from transformers import (
	SpeechT5Processor,
	SpeechT5ForTextToSpeech,
	SpeechT5HifiGan
	)
	import soundfile as sf

	# 1) Instantiate the pretrained SpeechT5 components once, at import time,
	#    so every request reuses the same processor, acoustic model and vocoder.
	SAMPLE_RATE = 16000  # SpeechT5 always uses 16 kHz

	_TTS_CHECKPOINT = "microsoft/speecht5_tts"
	_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

	processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
	tts_model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
	vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)

	def generate_speech(reference_wav, text):
	    """Synthesize ``text`` using a voice sample as the speaker reference.

	    Args:
	        reference_wav: Path to the reference recording (the Gradio audio
	            input in this file is configured with ``type="filepath"``), or
	            ``None`` when nothing has been uploaded.
	        text: The text to be spoken.

	    Returns:
	        Path to a temporary ``.wav`` file, written at ``SAMPLE_RATE``, that
	        holds the generated speech. The file is created with
	        ``delete=False`` and is not removed by this function.

	    Raises:
	        gr.Error: If the reference audio or the text is missing.
	    """
	    # Fail early with a user-visible message instead of an opaque loader error.
	    if not reference_wav:
	        raise gr.Error("Please upload a voice sample first.")
	    if not text or not text.strip():
	        raise gr.Error("Please enter some text to synthesize.")

	    # 2) Load the reference audio, downmix to mono and (if needed) resample
	    speech_array, sr = torchaudio.load(reference_wav)
	    # Averaging over the channel axis yields (n_samples,) for mono and
	    # multi-channel files alike; squeeze(0) left stereo input as (2, n).
	    speech_array = speech_array.mean(dim=0)
	    if sr != SAMPLE_RATE:
	        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
	        speech_array = resampler(speech_array)

	    with torch.no_grad():
	        # 3) Compute speaker embeddings
	        # NOTE(review): `speaker_encoder` does not appear to be a documented
	        # attribute of SpeechT5Processor, so this call may raise
	        # AttributeError. SpeechT5 examples usually obtain x-vector speaker
	        # embeddings from a separate speaker-verification model -- confirm
	        # and replace once such a dependency is approved.
	        speaker_embeds = processor.speaker_encoder(
	            speech_array, sampling_rate=SAMPLE_RATE
	        )

	        # 4) Prepare text and generate speech
	        inputs = processor(text=text, return_tensors="pt")
	        speech = tts_model.generate_speech(
	            inputs["input_ids"],
	            speaker_embeds,
	            vocoder=vocoder
	        )

	    # 5) Save to a temp WAV and return path.
	    # Only the unique name is needed: close the handle before soundfile
	    # reopens the path, so no descriptor leaks and the write also works on
	    # platforms that forbid opening an already-open temp file.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	        out_path = tmp.name
	    sf.write(out_path, speech.cpu().numpy(), SAMPLE_RATE)
	    return out_path

	# 6) Assemble the Gradio UI: a heading and short instructions, a row with
	#    the two inputs, a trigger button, and a read-only player for the result.
	with gr.Blocks(title="SpeechT5 Voice Cloning TTS") as app:
	    gr.Markdown("## Voice Cloning Text-to-Speech with SpeechT5")
	    gr.Markdown(
	        "Upload a short English voice sample, type any text, "
	        "and hear it spoken back in your voice!"
	    )

	    # Inputs side by side: the reference recording (handed to the callback
	    # as a file path) and the text to synthesize.
	    with gr.Row():
	        audio_in = gr.Audio(
	            type="filepath",
	            label="Your Voice Sample (wav/16 kHz)",
	        )
	        txt_in = gr.Textbox(
	            label="Text to Synthesize",
	            placeholder="e.g., ``Hello, this is my cloned voice!``",
	        )

	    btn = gr.Button("Generate Speech")
	    audio_out = gr.Audio(label="Cloned Speech Output", interactive=False)

	    # Clicking the button runs generate_speech on both inputs and sends the
	    # returned value to the output player.
	    btn.click(
	        fn=generate_speech,
	        inputs=[audio_in, txt_in],
	        outputs=audio_out,
	    )

	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    app.launch()