# VoiceClone-TTS / app.py
import os
import shlex
import subprocess
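# flash-attn, mamba-ssm, and causal-conv1d ship CUDA extensions that are awkward to build on
# Hugging Face Spaces, so they are installed at runtime from prebuilt wheels pinned to
# CUDA 12 / torch 2.4 / Python 3.10. FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE tells flash-attn's
# setup to skip compiling its CUDA kernels.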
subprocess.run(
shlex.split("pip install flash-attn --no-build-isolation"),
env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
check=True,
)
subprocess.run(
shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
check=True,
)
subprocess.run(
shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
check=True,
)
import spaces
import torch
import torchaudio
import gradio as gr
from os import getenv
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes
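# Load both Zonos checkpoints once at startup and freeze them for inference; the UI switches
# between the transformer and hybrid variants without reloading weights.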
device = "cuda"
MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
for model in MODELS.values():
model.requires_grad_(False).eval()
def update_ui(model_choice):
"""
Dynamically show/hide UI elements based on the model's conditioners.
We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
"""
model = MODELS[model_choice]
cond_names = [c.name for c in model.prefix_conditioner.conditioners]
print("Conditioners in this model:", cond_names)
text_update = gr.update(visible=("espeak" in cond_names))
language_update = gr.update(visible=("espeak" in cond_names))
speaker_audio_update = gr.update(visible=("speaker" in cond_names))
prefix_audio_update = gr.update(visible=True)
emotion1_update = gr.update(visible=("emotion" in cond_names))
emotion2_update = gr.update(visible=("emotion" in cond_names))
emotion3_update = gr.update(visible=("emotion" in cond_names))
emotion4_update = gr.update(visible=("emotion" in cond_names))
emotion5_update = gr.update(visible=("emotion" in cond_names))
emotion6_update = gr.update(visible=("emotion" in cond_names))
emotion7_update = gr.update(visible=("emotion" in cond_names))
emotion8_update = gr.update(visible=("emotion" in cond_names))
vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
fmax_slider_update = gr.update(visible=("fmax" in cond_names))
pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
unconditional_keys_update = gr.update(
choices=[name for name in cond_names if name not in ("espeak", "language_id")]
)
return (
text_update,
language_update,
speaker_audio_update,
prefix_audio_update,
emotion1_update,
emotion2_update,
emotion3_update,
emotion4_update,
emotion5_update,
emotion6_update,
emotion7_update,
emotion8_update,
vq_single_slider_update,
fmax_slider_update,
pitch_std_slider_update,
speaking_rate_slider_update,
dnsmos_slider_update,
speaker_noised_checkbox_update,
unconditional_keys_update,
)
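# On ZeroGPU Spaces, @spaces.GPU attaches a GPU to this function call for up to 120 seconds.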
@spaces.GPU(duration=120)
def generate_audio(
model_choice,
text,
language,
speaker_audio,
prefix_audio,
e1,
e2,
e3,
e4,
e5,
e6,
e7,
e8,
vq_single,
fmax,
pitch_std,
speaking_rate,
dnsmos_ovrl,
speaker_noised,
cfg_scale,
min_p,
seed,
randomize_seed,
unconditional_keys,
progress=gr.Progress(),
):
"""
Generates audio based on the provided UI parameters.
We do NOT use language_id or ctc_loss even if the model has them.
"""
selected_model = MODELS[model_choice]
speaker_noised_bool = bool(speaker_noised)
fmax = float(fmax)
pitch_std = float(pitch_std)
speaking_rate = float(speaking_rate)
dnsmos_ovrl = float(dnsmos_ovrl)
cfg_scale = float(cfg_scale)
min_p = float(min_p)
seed = int(seed)
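# The autoencoder emits roughly 86 codec frames per second, so this caps output at about 30 s.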
max_new_tokens = 86 * 30
if randomize_seed:
seed = torch.randint(0, 2**32 - 1, (1,)).item()
torch.manual_seed(seed)
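# Voice cloning: derive a speaker embedding from the reference recording, unless "speaker"
# has been marked unconditional in the UI.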
speaker_embedding = None
if speaker_audio is not None and "speaker" not in unconditional_keys:
wav, sr = torchaudio.load(speaker_audio)
speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
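# Optional prefix audio is downmixed to mono, resampled to the autoencoder's rate, and encoded
# into codes so that generation continues from it.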
audio_prefix_codes = None
if prefix_audio is not None:
wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
wav_prefix = wav_prefix.mean(0, keepdim=True)
wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
wav_prefix = wav_prefix.to(device, dtype=torch.float32)
with torch.autocast(device, dtype=torch.float32):
audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)
vq_val = float(vq_single)
vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)
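# Bundle every control into one conditioning dict; keys listed in unconditional_keys are made
# unconditional by make_cond_dict instead of being constrained to the UI values.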
cond_dict = make_cond_dict(
text=text,
language=language,
speaker=speaker_embedding,
emotion=emotion_tensor,
vqscore_8=vq_tensor,
fmax=fmax,
pitch_std=pitch_std,
speaking_rate=speaking_rate,
dnsmos_ovrl=dnsmos_ovrl,
speaker_noised=speaker_noised_bool,
device=device,
unconditional_keys=unconditional_keys,
)
conditioning = selected_model.prepare_conditioning(cond_dict)
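# Rough progress estimate: assume ~400 characters span 30 seconds of speech and ~86 generation
# steps per second of audio, then report progress against that step total.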
estimated_generation_duration = 30 * len(text) / 400
estimated_total_steps = int(estimated_generation_duration * 86)
def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
progress((step, estimated_total_steps))
return True
codes = selected_model.generate(
prefix_conditioning=conditioning,
audio_prefix_codes=audio_prefix_codes,
max_new_tokens=max_new_tokens,
cfg_scale=cfg_scale,
batch_size=1,
sampling_params=dict(min_p=min_p),
callback=update_progress,
)
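# Decode the generated codes back to a waveform and keep a single channel for playback.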
wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
sr_out = selected_model.autoencoder.sampling_rate
if wav_out.dim() == 2 and wav_out.size(0) > 1:
wav_out = wav_out[0:1, :]
return (sr_out, wav_out.squeeze().numpy()), seed
# Custom CSS for pastel gradient background and enhanced UI
custom_css = """
.gradio-container {
background: linear-gradient(135deg, #f3e7ff, #e6f0ff, #ffe6f2, #e6fff9);
background-size: 400% 400%;
animation: gradient 15s ease infinite;
}
@keyframes gradient {
0% {
background-position: 0% 50%;
}
50% {
background-position: 100% 50%;
}
100% {
background-position: 0% 50%;
}
}
.container {
max-width: 1200px;
margin: 0 auto;
padding: 20px;
}
.panel {
background-color: rgba(255, 255, 255, 0.7);
border-radius: 16px;
padding: 20px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
margin-bottom: 16px;
backdrop-filter: blur(5px);
transition: all 0.3s ease;
}
.panel:hover {
box-shadow: 0 6px 16px rgba(0, 0, 0, 0.12);
transform: translateY(-2px);
}
.title {
font-size: 1.2em;
font-weight: 600;
margin-bottom: 12px;
color: #6a3ea1;
border-bottom: 2px solid #f0e6ff;
padding-bottom: 8px;
}
.slider-container {
background-color: rgba(255, 255, 255, 0.5);
border-radius: 10px;
padding: 10px;
margin: 5px 0;
}
/* Make sliders more appealing */
input[type=range] {
height: 5px;
appearance: none;
width: 100%;
border-radius: 3px;
background: linear-gradient(90deg, #9c83e0, #83b1e0);
}
.generate-button {
background: linear-gradient(90deg, #a673ff, #7c4dff);
color: white;
border: none;
border-radius: 8px;
padding: 12px 24px;
font-size: 16px;
font-weight: 500;
cursor: pointer;
transition: all 0.3s ease;
box-shadow: 0 4px 10px rgba(124, 77, 255, 0.2);
display: block;
width: 100%;
margin: 20px 0;
}
.generate-button:hover {
background: linear-gradient(90deg, #9c5eff, #6a3aff);
box-shadow: 0 6px 15px rgba(124, 77, 255, 0.3);
transform: translateY(-2px);
}
/* Tabs styling */
.tabs {
display: flex;
border-bottom: 1px solid #e0e0e0;
margin-bottom: 20px;
}
.tab {
padding: 10px 20px;
cursor: pointer;
transition: all 0.3s ease;
background-color: transparent;
border: none;
color: #666;
}
.tab.active {
color: #7c4dff;
border-bottom: 3px solid #7c4dff;
font-weight: 600;
}
/* Emotion sliders container */
.emotion-grid {
display: grid;
grid-template-columns: repeat(4, 1fr);
gap: 12px;
}
/* Header styling */
.app-header {
text-align: center;
margin-bottom: 25px;
}
.app-header h1 {
font-size: 2.5em;
color: #6a3ea1;
margin-bottom: 8px;
font-weight: 700;
}
.app-header p {
font-size: 1.1em;
color: #666;
margin-bottom: 20px;
}
/* Audio player styling */
.audio-output {
margin-top: 20px;
}
/* Make output area more prominent */
.output-container {
background-color: rgba(255, 255, 255, 0.85);
border-radius: 16px;
padding: 24px;
box-shadow: 0 8px 18px rgba(0, 0, 0, 0.1);
margin-top: 20px;
}
"""
def build_interface():
# Build interface with enhanced visual elements and layout
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
# Header section
with gr.Column(elem_classes="app-header"):
gr.Markdown("# ✨ Zonos Text-to-Speech Generator ✨")
gr.Markdown("Create natural-sounding speech with customizable voice characteristics")
# Main content container
with gr.Column(elem_classes="container"):
# First panel - Text & Model Selection
with gr.Column(elem_classes="panel"):
gr.Markdown('<div class="title">💬 Text & Model Configuration</div>')
with gr.Row():
with gr.Column(scale=2):
model_choice = gr.Dropdown(
choices=MODEL_NAMES,
value="Zyphra/Zonos-v0.1-transformer",
label="Zonos Model Type",
info="Select the model variant to use.",
)
text = gr.Textbox(
label="Text to Synthesize",
value="Zonos uses eSpeak for text to phoneme conversion!",
lines=4,
max_length=500,
)
language = gr.Dropdown(
choices=supported_language_codes,
value="en-us",
label="Language Code",
info="Select a language code.",
)
with gr.Column(scale=1):
prefix_audio = gr.Audio(
value="assets/silence_100ms.wav",
label="Optional Prefix Audio (continue from this audio)",
type="filepath",
)
# Second panel - Voice Characteristics
with gr.Column(elem_classes="panel"):
gr.Markdown('<div class="title">🎤 Voice Characteristics</div>')
with gr.Row():
with gr.Column(scale=1):
speaker_audio = gr.Audio(
label="Optional Speaker Audio (for voice cloning)",
type="filepath",
)
speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)
with gr.Column(scale=2):
with gr.Row():
with gr.Column():
dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="Voice Quality", elem_classes="slider-container")
fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Frequency Max (Hz)", elem_classes="slider-container")
vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="Voice Clarity", elem_classes="slider-container")
with gr.Column():
pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Variation", elem_classes="slider-container")
speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", elem_classes="slider-container")
# Third panel - Generation Parameters
with gr.Column(elem_classes="panel"):
gr.Markdown('<div class="title">⚙️ Generation Parameters</div>')
with gr.Row():
with gr.Column():
cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="Guidance Scale", elem_classes="slider-container")
min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P (Randomness)", elem_classes="slider-container")
with gr.Column():
seed_number = gr.Number(label="Seed", value=420, precision=0)
randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
# Emotion Panel with Tabbed Interface
with gr.Accordion("🎭 Emotion Settings", open=False, elem_classes="panel"):
gr.Markdown(
"Adjust these sliders to control the emotional tone of the generated speech.\n"
"For a neutral voice, keep 'Neutral' high and other emotions low."
)
with gr.Row(elem_classes="emotion-grid"):
emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness", elem_classes="slider-container")
emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness", elem_classes="slider-container")
emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust", elem_classes="slider-container")
emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear", elem_classes="slider-container")
with gr.Row(elem_classes="emotion-grid"):
emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise", elem_classes="slider-container")
emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger", elem_classes="slider-container")
emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other", elem_classes="slider-container")
emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral", elem_classes="slider-container")
# Advanced Settings Panel
with gr.Accordion("⚡ Advanced Settings", open=False, elem_classes="panel"):
gr.Markdown(
"### Unconditional Toggles\n"
"Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
)
unconditional_keys = gr.CheckboxGroup(
[
"speaker",
"emotion",
"vqscore_8",
"fmax",
"pitch_std",
"speaking_rate",
"dnsmos_ovrl",
"speaker_noised",
],
value=["emotion"],
label="Unconditional Keys",
)
# Generate Button and Output Area
with gr.Column(elem_classes="panel output-container"):
gr.Markdown('<div class="title">🔊 Generate & Output</div>')
generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")
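# Switching models re-runs update_ui so that only controls backed by the selected model's
# conditioners remain visible; the same refresh also runs once on page load.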
model_choice.change(
fn=update_ui,
inputs=[model_choice],
outputs=[
text,
language,
speaker_audio,
prefix_audio,
emotion1,
emotion2,
emotion3,
emotion4,
emotion5,
emotion6,
emotion7,
emotion8,
vq_single_slider,
fmax_slider,
pitch_std_slider,
speaking_rate_slider,
dnsmos_slider,
speaker_noised_checkbox,
unconditional_keys,
],
)
# On page load, trigger the same UI refresh
demo.load(
fn=update_ui,
inputs=[model_choice],
outputs=[
text,
language,
speaker_audio,
prefix_audio,
emotion1,
emotion2,
emotion3,
emotion4,
emotion5,
emotion6,
emotion7,
emotion8,
vq_single_slider,
fmax_slider,
pitch_std_slider,
speaking_rate_slider,
dnsmos_slider,
speaker_noised_checkbox,
unconditional_keys,
],
)
# Generate audio on button click
generate_button.click(
fn=generate_audio,
inputs=[
model_choice,
text,
language,
speaker_audio,
prefix_audio,
emotion1,
emotion2,
emotion3,
emotion4,
emotion5,
emotion6,
emotion7,
emotion8,
vq_single_slider,
fmax_slider,
pitch_std_slider,
speaking_rate_slider,
dnsmos_slider,
speaker_noised_checkbox,
cfg_scale_slider,
min_p_slider,
seed_number,
randomize_seed_toggle,
unconditional_keys,
],
outputs=[output_audio, seed_number],
)
return demo
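# Launch on 0.0.0.0:7860 (the standard Spaces port) with the MCP server enabled; a public
# share link can be requested via the environment, e.g. GRADIO_SHARE=true python app.py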
if __name__ == "__main__":
demo = build_interface()
share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
demo.launch(server_name="0.0.0.0", server_port=7860, share=share, mcp_server=True)