Staticaliza committed (verified)
Commit b5cb70e · 1 parent: 67c56ad

Update app.py

Files changed (1): app.py (+48 / −93)
app.py CHANGED
@@ -1,99 +1,54 @@
- import torch, torchaudio, gradio as gr, spaces
- from functools import lru_cache
  from zonos.model import Zonos
- from zonos.conditioning import make_cond_dict, supported_language_codes

- MODEL_ID = "Zyphra/Zonos-v0.1-transformer"

- @lru_cache(maxsize = 1)
- def get_model():
-     return Zonos.from_pretrained(MODEL_ID, device = "cuda").eval().requires_grad_(False)

- @spaces.GPU(duration = 120)
- def synthesize(
-     text,
-     language,
-     ref_audio,
-     pitch_std,
-     speaking_rate,
-     guidance_scale,
-     seed,
-     happiness,
-     sadness,
-     disgust,
-     fear,
-     surprise,
-     anger,
-     other,
-     neutral
- ):
-     model = get_model()
-     if seed == 0:
-         seed = int(torch.randint(0, 2 ** 32 - 1, (1,)).item())
-     torch.manual_seed(seed)
-     speaker = None
-     if ref_audio:
-         wav, sr = torchaudio.load(ref_audio)
-         speaker = model.make_speaker_embedding(wav, sr).to("cuda", dtype = torch.bfloat16)
-     emotion = torch.tensor(
-         [happiness, sadness, disgust, fear, surprise, anger, other, neutral],
-         device = "cuda"
-     )
-     cond = make_cond_dict(
-         text = text,
-         language = language,
-         speaker = speaker,
-         emotion = emotion,
-         fmax = 24_000.0,
-         pitch_std = float(pitch_std),
-         speaking_rate = float(speaking_rate),
-         device = "cuda"
-     )
-     conditioning = model.prepare_conditioning(cond)
-     codes = model.generate(conditioning, cfg_scale = float(guidance_scale), max_new_tokens = 2_590)
-     wav_out = model.autoencoder.decode(codes).cpu().squeeze()
-     return (model.autoencoder.sampling_rate, wav_out.numpy()), seed

- with gr.Blocks() as demo:
-     gr.Markdown("## zonos v0.1 tts")
-     text = gr.Textbox(label = "text to synthesize", lines = 3)
-     language = gr.Dropdown(supported_language_codes, value = "en-us", label = "language")
-     ref_audio = gr.Audio(label = "reference audio (zeroshot tts)", type = "filepath")
-     pitch_std = gr.Slider(0.0, 300.0, 45.0, step = 1, label = "pitch variation")
-     speaking_rate = gr.Slider(5.0, 30.0, 15.0, step = 0.5, label = "speaking rate")
-     guidance_scale = gr.Slider(1.0, 5.0, 2.0, step = 0.1, label = "guidance scale")
-     seed = gr.Number(value = 0, label = "seed (0 = random)", precision = 0)
-     gr.Markdown("### emotion settings")
-     happiness = gr.Slider(0, 1, 0.0, step = 0.01, label = "happiness")
-     sadness = gr.Slider(0, 1, 0.0, step = 0.01, label = "sadness")
-     disgust = gr.Slider(0, 1, 0.0, step = 0.01, label = "disgust")
-     fear = gr.Slider(0, 1, 0.0, step = 0.01, label = "fear")
-     surprise = gr.Slider(0, 1, 0.0, step = 0.01, label = "surprise")
-     anger = gr.Slider(0, 1, 0.0, step = 0.01, label = "anger")
-     other = gr.Slider(0, 1, 0.0, step = 0.01, label = "other")
-     neutral = gr.Slider(0, 1, 1.0, step = 0.01, label = "neutral")
-     btn = gr.Button("synthesize")
-     out_audio = gr.Audio(label = "output")
-     out_seed = gr.Number(label = "used seed", interactive = False)
-     btn.click(
-         synthesize,
-         [
-             text,
-             language,
-             ref_audio,
-             pitch_std,
-             speaking_rate,
-             guidance_scale,
-             seed,
-             happiness,
-             sadness,
-             disgust,
-             fear,
-             surprise,
-             anger,
-             other,
-             neutral
-         ],
-         [out_audio, out_seed]
-     )
      demo.launch()
 
+ # app.py
+ import gradio as gr
+ import torch, torchaudio, spaces
  from zonos.model import Zonos
+ from zonos.conditioning import make_cond_dict

+ cpu = "cpu"
+ model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device=cpu)

+ def _speaker_embed(audio):
+     if audio is None:
+         return None
+     sr, wav = audio
+     wav = torch.tensor(wav).unsqueeze(0)
+     return model.make_speaker_embedding(wav, sr)

+ @spaces.GPU
+ def tts(text, language, speaker_audio, emotion_vec, speaking_rate, pitch_std):
+     speaker = _speaker_embed(speaker_audio)
+     emotion = [float(x) for x in emotion_vec.split(",")] if emotion_vec else None
+     model.to("cuda")
+     with torch.no_grad():
+         cond = make_cond_dict(
+             text=text,
+             language=language,
+             speaker=speaker,
+             emotion=emotion,
+             speaking_rate=float(speaking_rate),
+             pitch_std=float(pitch_std),
+         )
+         conditioning = model.prepare_conditioning(cond)
+         codes = model.generate(conditioning)
+         wav = model.autoencoder.decode(codes)[0].cpu()
+     model.to(cpu)
+     torch.cuda.empty_cache()
+     return (model.autoencoder.sampling_rate, wav.numpy())

+ langs = ["en-us", "fr-fr", "ja", "de-de", "zh"]
+ demo = gr.Interface(
+     fn=tts,
+     inputs=[
+         gr.Textbox(label="text"),
+         gr.Dropdown(langs, value="en-us", label="language"),
+         gr.Audio(type="numpy", label="speaker reference (optional)"),
+         gr.Textbox(value="0.3,0,0,0,0,0,0.2,0.5", label="emotion (8 comma-sep floats)"),
+         gr.Slider(0, 40, value=15, step=1, label="speaking_rate"),
+         gr.Slider(0, 400, value=20, step=1, label="pitch_std"),
+     ],
+     outputs=gr.Audio(label="generated speech"),
+     title="zonos-v0.1 zerogpu tts",
+ )
+
+ if __name__ == "__main__":
      demo.launch()
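
For reference, a minimal smoke test of the rewritten tts() entry point outside the Gradio UI. This is an illustrative sketch, not part of the commit: it assumes app.py is importable, that a CUDA device is available for the @spaces.GPU-decorated call, and it uses placeholder audio and parameter values.

# Illustrative smoke test for the new tts() signature (not part of the commit).
# Assumes app.py is importable and a CUDA device is available.
import numpy as np
from app import tts

# Gradio's type="numpy" audio component yields a (sample_rate, waveform) tuple;
# one second of silence stands in for a real speaker reference here.
ref = (44_100, np.zeros(44_100, dtype=np.float32))

sr, wav = tts(
    text="hello from the updated space",
    language="en-us",
    speaker_audio=ref,                     # pass None to skip the speaker embedding
    emotion_vec="0.3,0,0,0,0,0,0.2,0.5",   # 8 comma-separated floats, as in the UI default
    speaking_rate=15,
    pitch_std=20,
)
print(sr, wav.shape)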