# Imports
import gradio as gr
import spaces
import os
import torch
import torchaudio
import time

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes

# Variables
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # read from the environment; not used below

device = "cuda"
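# On ZeroGPU the `spaces` runtime provisions the CUDA device; `@spaces.GPU`
# below marks the entry point that actually needs it.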

# Load the Zonos v0.1 transformer checkpoint once at startup.
REPO = "Zyphra/Zonos-v0.1-transformer"
model = Zonos.from_pretrained(REPO, device=device)

# Functions
def patch_cuda():
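    # Workaround: some virtualized GPUs (e.g. Hugging Face ZeroGPU) expose CUDA
    # device properties without these fields, which Triton/torch.compile may
    # query; backfill typical defaults so compiled kernels can run.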
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            p = torch.cuda.get_device_properties(i)
            if not hasattr(p, "regs_per_multiprocessor"):
                setattr(p, "regs_per_multiprocessor", 65536)
            if not hasattr(p, "max_threads_per_multi_processor"):
                setattr(p, "max_threads_per_multi_processor", 2048)

@spaces.GPU
def generate(
    text, language, speaker_audio,
    emotion_happy, emotion_sad, emotion_disgust, emotion_fear,
    emotion_surprise, emotion_anger, emotion_other, emotion_neutral,
    clarity, fmax, pitch_std, speaking_rate, dnsmos_ovrl,
    cfg_scale, min_p, steps, seed, randomize_seed,
):
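    # One text-to-speech pass: build the conditioning dictionary, sample
    # acoustic tokens, and decode them to a waveform.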
    # Derive a fresh seed from the wall clock when randomization is requested.
    if randomize_seed:
        seed = int(time.time())
    torch.manual_seed(seed)

    # Optional voice cloning: embed the reference clip if one was provided.
    speaker_embedding = None
    if speaker_audio is not None:
        wav, sr = torchaudio.load(speaker_audio)
        speaker_embedding = model.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16)

    # Pack the eight emotion sliders and the clarity (VQ score) control into
    # the tensors that make_cond_dict expects.
    emotion_tensor = torch.tensor(
        [emotion_happy, emotion_sad, emotion_disgust, emotion_fear,
         emotion_surprise, emotion_anger, emotion_other, emotion_neutral],
        device=device, dtype=torch.bfloat16,
    )
    vq_tensor = torch.tensor([clarity] * 8, device=device, dtype=torch.bfloat16).unsqueeze(0)

    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker_embedding,
        emotion=emotion_tensor,
        vqscore_8=vq_tensor,
        fmax=float(fmax),
        pitch_std=float(pitch_std),
        speaking_rate=float(speaking_rate),
        dnsmos_ovrl=float(dnsmos_ovrl),
        device=device,
    )
    
    conditioning = model.prepare_conditioning(cond_dict)

    # Autoregressively sample acoustic tokens; `steps` caps max_new_tokens and
    # thus roughly bounds the output length.
    codes = model.generate(
        prefix_conditioning=conditioning,
        max_new_tokens=int(steps),
        cfg_scale=float(cfg_scale),
        batch_size=1,
        sampling_params=dict(min_p=float(min_p)),
    )

    # Decode tokens to audio and return (sample_rate, samples) for gr.Audio.
    wav_out = model.autoencoder.decode(codes).cpu().detach()
    sr_out = model.autoencoder.sampling_rate

    # Keep only the first channel if the decoder returns multi-channel audio.
    if wav_out.dim() == 2 and wav_out.size(0) > 1:
        wav_out = wav_out[0:1, :]

    return (sr_out, wav_out.squeeze().numpy())

# Initialize
patch_cuda()

with gr.Blocks() as main:
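    # Controls map one-to-one onto generate()'s parameters (wired up in .click below).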
    text = gr.Textbox(label="text", value="hello, world!")
    language = gr.Dropdown(choices=supported_language_codes, value="en-us", label="language")
    speaker_audio = gr.Audio(label="voice reference", type="filepath")

    # Positional slider args: (minimum, maximum, initial value, step).
    clarity_slider = gr.Slider(0.5, 0.8, 0.8, 0.01, label="clarity")
    steps_slider = gr.Slider(1, 3000, 300, 1, label="steps")

    dnsmos_slider = gr.Slider(1.0, 5.0, 5.0, 0.1, label="quality")
    fmax_slider = gr.Slider(0, 24000, 24000, 1, label="fmax")
    pitch_std_slider = gr.Slider(0.0, 300.0, 30.0, 1, label="pitch std")
    speaking_rate_slider = gr.Slider(5.0, 30.0, 15.0, 0.1, label="rate")

    cfg_scale_slider = gr.Slider(1.0, 5.0, 2.5, 0.1, label="guidance")
    min_p_slider = gr.Slider(0.0, 1.0, 0.05, 0.01, label="min p")

    with gr.Row():
        e1 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="happy")
        e2 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="sad")
        e3 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="disgust")
        e4 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="fear")
        e5 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="surprise")
        e6 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="anger")
        e7 = gr.Slider(0.0, 1.0, 0.0, 0.01, label="other")
        e8 = gr.Slider(0.0, 1.0, 1.0, 0.01, label="neutral")

    seed_number = gr.Number(label="seed", value=42, precision=0)
    randomize_seed_toggle = gr.Checkbox(label="randomize seed", value=True)

    generate_button = gr.Button("generate")
    output_audio = gr.Audio(label="output", type="numpy", autoplay=True)

    generate_button.click(
        fn=generate,
        inputs=[
            text, language, speaker_audio,
            e1, e2, e3, e4, e5, e6, e7, e8,
            clarity_slider, fmax_slider, pitch_std_slider, speaking_rate_slider,
            dnsmos_slider, cfg_scale_slider, min_p_slider, steps_slider,
            seed_number, randomize_seed_toggle,
        ],
        outputs=output_audio,
    )

main.launch()