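"""Gradio interface for Zonos v0.1 text-to-speech.

Loads the Transformer or Hybrid checkpoint on demand and exposes the model's
conditioning controls (speaker cloning, emotion, pitch, speaking rate, ...) in a web UI.
"""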
import torch
import torchaudio
import gradio as gr
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes

device = "cuda"
CURRENT_MODEL_TYPE = None
CURRENT_MODEL = None
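
# Only one model is resident on the GPU at a time; switching the model type in
# the UI frees the previous instance before loading the new one.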
def load_model_if_needed(model_choice: str):
    global CURRENT_MODEL_TYPE, CURRENT_MODEL
    if CURRENT_MODEL_TYPE != model_choice:
        # Free the previously loaded model and its VRAM before switching variants.
        if CURRENT_MODEL is not None:
            del CURRENT_MODEL
            torch.cuda.empty_cache()
        print(f"Loading {model_choice} model...")
        if model_choice == "Transformer":
            CURRENT_MODEL = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device)
        else:
            CURRENT_MODEL = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device=device)
        # Cast to bfloat16 and switch to eval mode for inference.
        CURRENT_MODEL.to(device)
        CURRENT_MODEL.bfloat16()
        CURRENT_MODEL.eval()
        CURRENT_MODEL_TYPE = model_choice
        print(f"{model_choice} model loaded successfully!")
    else:
        print(f"{model_choice} model is already loaded.")
    return CURRENT_MODEL


def update_ui(model_choice):
    """
    Dynamically show/hide UI elements based on the model's conditioners.
    We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
    """
    model = load_model_if_needed(model_choice)
    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
    print("Conditioners in this model:", cond_names)

    text_update = gr.update(visible=("espeak" in cond_names))
    language_update = gr.update(visible=("espeak" in cond_names))
    speaker_audio_update = gr.update(visible=("speaker" in cond_names))
    prefix_audio_update = gr.update(visible=True)
    skip_speaker_update = gr.update(visible=("speaker" in cond_names))
    skip_emotion_update = gr.update(visible=("emotion" in cond_names))
    emotion1_update = gr.update(visible=("emotion" in cond_names))
    emotion2_update = gr.update(visible=("emotion" in cond_names))
    emotion3_update = gr.update(visible=("emotion" in cond_names))
    emotion4_update = gr.update(visible=("emotion" in cond_names))
    emotion5_update = gr.update(visible=("emotion" in cond_names))
    emotion6_update = gr.update(visible=("emotion" in cond_names))
    emotion7_update = gr.update(visible=("emotion" in cond_names))
    emotion8_update = gr.update(visible=("emotion" in cond_names))
    skip_vqscore_8_update = gr.update(visible=("vqscore_8" in cond_names))
    vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
    fmax_slider_update = gr.update(visible=("fmax" in cond_names))
    skip_fmax_update = gr.update(visible=("fmax" in cond_names))
    pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
    skip_pitch_std_update = gr.update(visible=("pitch_std" in cond_names))
    speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
    skip_speaking_rate_update = gr.update(visible=("speaking_rate" in cond_names))
    dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
    skip_dnsmos_ovrl_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
    speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
    skip_speaker_noised_update = gr.update(visible=("speaker_noised" in cond_names))
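
    # Return the updates in the exact order expected by the outputs lists wired
    # up in build_interface() below.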
    return (
        text_update,  # 1
        language_update,  # 2
        speaker_audio_update,  # 3
        prefix_audio_update,  # 4
        skip_speaker_update,  # 5
        skip_emotion_update,  # 6
        emotion1_update,  # 7
        emotion2_update,  # 8
        emotion3_update,  # 9
        emotion4_update,  # 10
        emotion5_update,  # 11
        emotion6_update,  # 12
        emotion7_update,  # 13
        emotion8_update,  # 14
        skip_vqscore_8_update,  # 15
        vq_single_slider_update,  # 16
        fmax_slider_update,  # 17
        skip_fmax_update,  # 18
        pitch_std_slider_update,  # 19
        skip_pitch_std_update,  # 20
        speaking_rate_slider_update,  # 21
        skip_speaking_rate_update,  # 22
        dnsmos_slider_update,  # 23
        skip_dnsmos_ovrl_update,  # 24
        speaker_noised_checkbox_update,  # 25
        skip_speaker_noised_update,  # 26
    )


def generate_audio(
    model_choice,
    text,
    language,
    speaker_audio,
    prefix_audio,
    skip_speaker,
    skip_emotion,
    e1,
    e2,
    e3,
    e4,
    e5,
    e6,
    e7,
    e8,
    skip_vqscore_8,
    vq_single,
    fmax,
    skip_fmax,
    pitch_std,
    skip_pitch_std,
    speaking_rate,
    skip_speaking_rate,
    dnsmos_ovrl,
    skip_dnsmos_ovrl,
    speaker_noised,
    skip_speaker_noised,
    cfg_scale,
    min_p,
    seed,
):
    """
    Generates audio based on the provided UI parameters.
    We do NOT use language_id or ctc_loss even if the model has them.
    """
    selected_model = load_model_if_needed(model_choice)
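
    # Collect the conditioners the user chose to skip; these are passed to the
    # model as unconditional keys rather than explicit values.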
    uncond_keys = []
    if skip_speaker:
        uncond_keys.append("speaker")
    if skip_emotion:
        uncond_keys.append("emotion")
    if skip_vqscore_8:
        uncond_keys.append("vqscore_8")
    if skip_fmax:
        uncond_keys.append("fmax")
    if skip_pitch_std:
        uncond_keys.append("pitch_std")
    if skip_speaking_rate:
        uncond_keys.append("speaking_rate")
    if skip_dnsmos_ovrl:
        uncond_keys.append("dnsmos_ovrl")
    if skip_speaker_noised:
        uncond_keys.append("speaker_noised")

    # Gradio may deliver numeric values as ints or strings; normalize the types.
    speaker_noised_bool = bool(speaker_noised)
    fmax = float(fmax)
    pitch_std = float(pitch_std)
    speaking_rate = float(speaking_rate)
    dnsmos_ovrl = float(dnsmos_ovrl)
    cfg_scale = float(cfg_scale)
    min_p = float(min_p)
    seed = int(seed)
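
    # The autoencoder produces roughly 86 codec frames per second, so this caps
    # generation at about 30 seconds of audio.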
    max_new_tokens = 86 * 30
    torch.manual_seed(seed)
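
    # Build a speaker embedding from the reference clip unless speaker
    # conditioning is being skipped.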
    speaker_embedding = None
    if speaker_audio is not None and not skip_speaker:
        wav, sr = torchaudio.load(speaker_audio)
        speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
        speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
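
    # If prefix audio was supplied, downmix it to mono, resample it to the
    # autoencoder's rate, and encode it; generation continues from these codes.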
    audio_prefix_codes = None
    if prefix_audio is not None:
        wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
        wav_prefix = wav_prefix.mean(0, keepdim=True)
        wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
        wav_prefix = wav_prefix.to(device, dtype=torch.float32)
        with torch.autocast(device, dtype=torch.float32):
            audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
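
    # Pack the eight emotion sliders into a (1, 8) tensor and replicate the
    # single VQ score across the 8 codebooks.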
    emotion_tensor = torch.tensor(
        [[float(e1), float(e2), float(e3), float(e4), float(e5), float(e6), float(e7), float(e8)]], device=device
    )
    vq_val = float(vq_single)
    vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)

    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker_embedding,
        emotion=emotion_tensor,
        vqscore_8=vq_tensor,
        fmax=fmax,
        pitch_std=pitch_std,
        speaking_rate=speaking_rate,
        dnsmos_ovrl=dnsmos_ovrl,
        speaker_noised=speaker_noised_bool,
        device=device,
        unconditional_keys=uncond_keys,
    )
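
    # Prepare the conditioning prefix and autoregressively sample discrete audio codes.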
    conditioning = selected_model.prepare_conditioning(cond_dict)
    codes = selected_model.generate(
        prefix_conditioning=conditioning,
        audio_prefix_codes=audio_prefix_codes,
        max_new_tokens=max_new_tokens,
        cfg_scale=cfg_scale,
        batch_size=1,
        sampling_params=dict(min_p=min_p),
    )

    wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
    sr_out = selected_model.autoencoder.sampling_rate
    # Keep only the first channel if the decoder returns multi-channel audio.
    if wav_out.dim() == 2 and wav_out.size(0) > 1:
        wav_out = wav_out[0:1, :]
    return sr_out, wav_out.squeeze().numpy()
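
# A minimal sketch of calling generate_audio() directly (no UI), e.g. for a
# smoke test. The positional arguments mirror the `inputs=` list wired to the
# button in build_interface(); the values below are illustrative, not tuned.
#
#   sr, audio = generate_audio(
#       "Transformer", "Hello from Zonos!", "en-us",
#       None, None,            # speaker_audio, prefix_audio
#       False, False,          # skip_speaker, skip_emotion
#       0.6, 0.05, 0.05, 0.05, 0.05, 0.05, 0.5, 0.6,  # emotions e1..e8
#       True, 0.78,            # skip_vqscore_8, vq_single
#       22050.0, False,        # fmax, skip_fmax
#       20.0, False,           # pitch_std, skip_pitch_std
#       15.0, False,           # speaking_rate, skip_speaking_rate
#       4.0, True,             # dnsmos_ovrl, skip_dnsmos_ovrl
#       False, False,          # speaker_noised, skip_speaker_noised
#       2.0, 0.1, 420,         # cfg_scale, min_p, seed
#   )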


def build_interface():
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                model_choice = gr.Dropdown(
                    choices=["Hybrid", "Transformer"],
                    value="Transformer",
                    label="Zonos Model Type",
                    info="Select the model variant to use.",
                )
                text = gr.Textbox(
                    label="Text to Synthesize", value="Zonos uses eSpeak for text to phoneme conversion!", lines=4
                )
                language = gr.Dropdown(
                    choices=supported_language_codes,
                    value="en-us",
                    label="Language Code",
                    info="Select a language code.",
                )
                prefix_audio = gr.Audio(
                    value="assets/silence_100ms.wav",
                    label="Optional Prefix Audio (continue from this audio)",
                    type="filepath",
                )
            with gr.Column():
                speaker_audio = gr.Audio(
                    label="Optional Speaker Audio (for cloning)",
                    type="filepath",
                )
                speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)
        with gr.Column():
            gr.Markdown("## Conditioning Parameters")
            with gr.Row():
                dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="DNSMOS Overall")
                fmax_slider = gr.Slider(0, 24000, value=22050, step=1, label="Fmax (Hz)")
                vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="VQ Score")
                pitch_std_slider = gr.Slider(0.0, 400.0, value=20.0, step=1, label="Pitch Std")
                speaking_rate_slider = gr.Slider(0.0, 40.0, value=15.0, step=1, label="Speaking Rate")

            gr.Markdown("### Emotion Sliders")
            with gr.Row():
                emotion1 = gr.Slider(0.0, 1.0, 0.6, 0.05, label="Happiness")
                emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness")
                emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust")
                emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear")
            with gr.Row():
                emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise")
                emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger")
                emotion7 = gr.Slider(0.0, 1.0, 0.5, 0.05, label="Other")
                emotion8 = gr.Slider(0.0, 1.0, 0.6, 0.05, label="Neutral")

            gr.Markdown("### Unconditional Toggles")
            with gr.Row():
                skip_speaker = gr.Checkbox(label="Skip Speaker", value=False)
                skip_emotion = gr.Checkbox(label="Skip Emotion", value=False)
                skip_vqscore_8 = gr.Checkbox(label="Skip VQ Score", value=True)
                skip_fmax = gr.Checkbox(label="Skip Fmax", value=False)
                skip_pitch_std = gr.Checkbox(label="Skip Pitch Std", value=False)
                skip_speaking_rate = gr.Checkbox(label="Skip Speaking Rate", value=False)
                skip_dnsmos_ovrl = gr.Checkbox(label="Skip DNSMOS", value=True)
                skip_speaker_noised = gr.Checkbox(label="Skip Noised Speaker", value=False)

        with gr.Column():
            gr.Markdown("## Generation Parameters")
            with gr.Row():
                cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
                min_p_slider = gr.Slider(0.0, 1.0, 0.1, 0.01, label="Min P")
                seed_number = gr.Number(label="Seed", value=420, precision=0)

        generate_button = gr.Button("Generate Audio")
        output_audio = gr.Audio(label="Generated Audio", type="numpy")
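
        # Show or hide controls whenever the model type changes, based on which
        # conditioners the selected model actually exposes.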
        model_choice.change(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[
                text,  # 1
                language,  # 2
                speaker_audio,  # 3
                prefix_audio,  # 4
                skip_speaker,  # 5
                skip_emotion,  # 6
                emotion1,  # 7
                emotion2,  # 8
                emotion3,  # 9
                emotion4,  # 10
                emotion5,  # 11
                emotion6,  # 12
                emotion7,  # 13
                emotion8,  # 14
                skip_vqscore_8,  # 15
                vq_single_slider,  # 16
                fmax_slider,  # 17
                skip_fmax,  # 18
                pitch_std_slider,  # 19
                skip_pitch_std,  # 20
                speaking_rate_slider,  # 21
                skip_speaking_rate,  # 22
                dnsmos_slider,  # 23
                skip_dnsmos_ovrl,  # 24
                speaker_noised_checkbox,  # 25
                skip_speaker_noised,  # 26
            ],
        )

        # On page load, trigger the same UI refresh.
        demo.load(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[
                text,
                language,
                speaker_audio,
                prefix_audio,
                skip_speaker,
                skip_emotion,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                skip_vqscore_8,
                vq_single_slider,
                fmax_slider,
                skip_fmax,
                pitch_std_slider,
                skip_pitch_std,
                speaking_rate_slider,
                skip_speaking_rate,
                dnsmos_slider,
                skip_dnsmos_ovrl,
                speaker_noised_checkbox,
                skip_speaker_noised,
            ],
        )

        # Generate audio on button click.
        generate_button.click(
            fn=generate_audio,
            inputs=[
                model_choice,
                text,
                language,
                speaker_audio,
                prefix_audio,
                skip_speaker,
                skip_emotion,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                skip_vqscore_8,
                vq_single_slider,
                fmax_slider,
                skip_fmax,
                pitch_std_slider,
                skip_pitch_std,
                speaking_rate_slider,
                skip_speaking_rate,
                dnsmos_slider,
                skip_dnsmos_ovrl,
                speaker_noised_checkbox,
                skip_speaker_noised,
                cfg_scale_slider,
                min_p_slider,
                seed_number,
            ],
            outputs=[output_audio],
        )

    return demo


if __name__ == "__main__":
    demo = build_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)