# Imports
import gradio as gr
import spaces
import os
import torch
import torchaudio
import time
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes
# Variables
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # optional Hugging Face token read from the environment
device = "cuda"  # target device for the model and all conditioning tensors
REPO = "Zyphra/Zonos-v0.1-transformer"
model = Zonos.from_pretrained(REPO, device=device)  # loaded once at startup
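# Assuming Zonos behaves like a standard torch.nn.Module, an inference-only
# guard could optionally be applied after loading (an assumption about usage,
# not part of the original app):
#   model.requires_grad_(False).eval()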
# Functions
def patch_cuda():
    """Backfill CUDA device properties that some Triton/torch.compile code
    paths query but that may be missing on virtualized GPU backends."""
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            p = torch.cuda.get_device_properties(i)
            if not hasattr(p, "regs_per_multiprocessor"):
                setattr(p, "regs_per_multiprocessor", 65536)
            if not hasattr(p, "max_threads_per_multi_processor"):
                setattr(p, "max_threads_per_multi_processor", 2048)
@spaces.GPU
def generate(text, language, speaker_audio, emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral, clarity, fmax, pitch_std, speaking_rate, dnsmos_ovrl, cfg_scale, min_p, steps, seed, randomize_seed):
    # Derive a fresh seed from the wall clock when requested, then seed torch
    # so generation is reproducible for a given seed.
    if randomize_seed:
        seed = int(time.time())
    torch.manual_seed(seed)
    # Optional voice cloning: embed the reference clip if one was provided.
    speaker_embedding = None
    if speaker_audio is not None:
        wav, sr = torchaudio.load(speaker_audio)
        speaker_embedding = model.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16)
    # Eight emotion weights, in the order Zonos expects, plus a (1, 8) VQ-score tensor.
    emotion_tensor = torch.tensor(
        [emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral],
        device=device,
        dtype=torch.bfloat16,
    )
    vq_tensor = torch.tensor([clarity] * 8, device=device, dtype=torch.bfloat16).unsqueeze(0)
    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker_embedding,
        emotion=emotion_tensor,
        vqscore_8=vq_tensor,
        fmax=float(fmax),
        pitch_std=float(pitch_std),
        speaking_rate=float(speaking_rate),
        dnsmos_ovrl=float(dnsmos_ovrl),
        device=device,
    )
    conditioning = model.prepare_conditioning(cond_dict)
    codes = model.generate(
        prefix_conditioning=conditioning,
        max_new_tokens=int(steps),
        cfg_scale=float(cfg_scale),
        batch_size=1,
        sampling_params=dict(min_p=float(min_p)),
    )
    # Decode the generated codes back to a waveform; keep the first channel only.
    wav_out = model.autoencoder.decode(codes).cpu().detach()
    sr_out = model.autoencoder.sampling_rate
    if wav_out.dim() == 2 and wav_out.size(0) > 1:
        wav_out = wav_out[0:1, :]
    return (sr_out, wav_out.squeeze().numpy())
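# A minimal smoke test for generate() outside the UI (a sketch with hypothetical
# values; requires a CUDA device and is not executed by the app itself):
#   sr, audio = generate(
#       "hello, world!", "en-us", None,  # text, language, no reference voice
#       0, 0, 0, 0, 0, 0, 0, 1,          # emotions: happy..other zero, neutral 1
#       0.8, 24000, 30.0, 15.0, 5.0,     # clarity, fmax, pitch std, rate, dnsmos
#       2.5, 0.05, 300, 42, False,       # guidance, min p, steps, seed, randomize
#   )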
# Initialize
patch_cuda()
with gr.Blocks() as main:
    text = gr.Textbox(label="text", value="hello, world!")
    language = gr.Dropdown(choices=supported_language_codes, value="en-us", label="language")
    speaker_audio = gr.Audio(label="voice reference", type="filepath")
    # step is keyword-only in Gradio 4, so value/step are passed by name.
    clarity_slider = gr.Slider(0.5, 0.8, value=0.8, step=0.01, label="clarity")
    steps_slider = gr.Slider(1, 3000, value=300, step=1, label="steps")
    dnsmos_slider = gr.Slider(1.0, 5.0, value=5.0, step=0.1, label="quality")
    fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="fmax")
    pitch_std_slider = gr.Slider(0.0, 300.0, value=30.0, step=1, label="pitch std")
    speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.1, label="rate")
    cfg_scale_slider = gr.Slider(1.0, 5.0, value=2.5, step=0.1, label="guidance")
    min_p_slider = gr.Slider(0.0, 1.0, value=0.05, step=0.01, label="min p")
    with gr.Row():
        e1 = gr.Slider(0.0, 1.0, value=0.0, step=0.01, label="happy")
        e2 = gr.Slider(0.0, 1.0, value=0.0, step=0.01, label="sad")
        e3 = gr.Slider(0.0, 1.0, value=0.0, step=0.01, label="disgust")
        e4 = gr.Slider(0.0, 1.0, value=0.0, step=0.01, label="fear")
        e5 = gr.Slider(0.0, 1.0, value=0.0, step=0.01, label="surprise")
        e6 = gr.Slider(0.0, 1.0, value=0.0, step=0.01, label="anger")
        e7 = gr.Slider(0.0, 1.0, value=0.0, step=0.01, label="other")
        e8 = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="neutral")
    seed_number = gr.Number(label="seed", value=42, precision=0)
    randomize_seed_toggle = gr.Checkbox(label="randomize seed", value=True)
    generate_button = gr.Button("generate")
    output_audio = gr.Audio(label="output", type="numpy", autoplay=True)
    generate_button.click(
        fn=generate,
        inputs=[text, language, speaker_audio, e1, e2, e3, e4, e5, e6, e7, e8, clarity_slider, fmax_slider, pitch_std_slider, speaking_rate_slider, dnsmos_slider, cfg_scale_slider, min_p_slider, steps_slider, seed_number, randomize_seed_toggle],
        outputs=output_audio,
    )
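# Queuing could optionally be enabled before launch so concurrent requests are
# serialized on the single GPU (queue() is standard Gradio, but enabling it
# here is an assumption, not part of the original app):
#   main.queue()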
main.launch()