Staticaliza committed
Commit d9a9c0c · verified · 1 Parent(s): 8c97384

Update app.py

Files changed (1): app.py (+114 -36)
app.py CHANGED
@@ -1,38 +1,116 @@
-# app.py
-import os, gradio as gr, spaces
-
-# ---- deactivate deepspeed CUDA build BEFORE anything else imports it
-os.environ["DS_ACCELERATOR"] = "cpu"  # force deepspeed to CPU backend
-os.environ["DS_BUILD_OPS"] = "0"      # skip compiling custom ops
-# ------------------------------------
-
-from huggingface_hub import snapshot_download
-from indextts.infer import IndexTTS
-
-model_dir = snapshot_download("IndexTeam/IndexTTS-1.5",
-                              local_dir="checkpoints",
-                              local_dir_use_symlinks=False)
-cfg_path = os.path.join(model_dir, "config.yaml")
-
-tts = None
-def load():
-    global tts
-    if tts is None:
-        tts = IndexTTS(model_dir=model_dir, cfg_path=cfg_path)  # now imports deepspeed safely
-
-@spaces.GPU
-def synth(ref_wav, prompt):
-    load()
-    out = "out.wav"
-    tts.infer(ref_wav, prompt, out)
-    return out
+import torch
+import torchaudio
+import gradio as gr
+import spaces
+from functools import lru_cache
+from zonos.model import Zonos
+from zonos.conditioning import make_cond_dict, supported_language_codes
+
+MODEL_ID = "Zyphra/Zonos-v0.1-transformer"
+
+@lru_cache(maxsize=1)
+def load_model():
+    return Zonos.from_pretrained(MODEL_ID, device="cuda").eval().requires_grad_(False)
+
+@spaces.GPU(duration=120)
+def tts(
+    text,
+    language,
+    ref_audio,
+    fmax,
+    pitch_std,
+    speaking_rate,
+    guidance_scale,
+    seed,
+    happiness,
+    sadness,
+    disgust,
+    fear,
+    surprise,
+    anger,
+    other,
+    neutral,
+):
+    model = load_model()
+    if seed == 0:
+        seed = int(torch.randint(0, 2**32 - 1, (1,)).item())
+    torch.manual_seed(seed)
+
+    speaker = None
+    if ref_audio is not None:
+        wav, sr = torchaudio.load(ref_audio)
+        speaker = model.make_speaker_embedding(wav, sr).to("cuda", dtype=torch.bfloat16)
+
+    emotion = torch.tensor(
+        [happiness, sadness, disgust, fear, surprise, anger, other, neutral], device="cuda"
+    )
+    cond = make_cond_dict(
+        text=text,
+        language=language,
+        speaker=speaker,
+        emotion=emotion,
+        fmax=float(fmax),
+        pitch_std=float(pitch_std),
+        speaking_rate=float(speaking_rate),
+        device="cuda",
+    )
+    conditioning = model.prepare_conditioning(cond)
+    codes = model.generate(conditioning, cfg_scale=float(guidance_scale), max_new_tokens=2590)
+    wav_out = model.autoencoder.decode(codes).cpu().squeeze()
+    return (model.autoencoder.sampling_rate, wav_out.numpy()), seed
+
 
 with gr.Blocks() as demo:
-    gr.Markdown("# index-tts 1.5 – zero-gpu space")
-    txt = gr.Textbox(label="text prompt")
-    ref = gr.Audio(label="reference voice", type="filepath")
-    gen = gr.Audio(label="generated speech", type="filepath")
-    gr.Button("generate").click(synth, [ref, txt], gen)
-
-demo.queue()
-demo.launch(show_api=False, ssr_mode=False)
+    gr.Markdown("## zonos v0.1 tts")
+    text = gr.Textbox(label="text to synthesize", lines=3)
+    language = gr.Dropdown(
+        choices=supported_language_codes, value="en-us", label="language"
+    )
+    ref_audio = gr.Audio(label="reference audio (zeroshot tts)", type="filepath")
+
+    fmax = gr.Slider(0, 24000, 24000, step=1, label="frequency max (Hz)")
+    pitch_std = gr.Slider(0.0, 300.0, 45.0, step=1, label="pitch variation")
+    speaking_rate = gr.Slider(5.0, 30.0, 15.0, step=0.5, label="speaking rate")
+
+    guidance_scale = gr.Slider(1.0, 5.0, 2.0, step=0.1, label="guidance scale")
+    seed = gr.Number(value=0, label="seed (0 = random)", precision=0)
+
+    gr.Markdown("### emotion settings")
+    happiness = gr.Slider(0, 1, 0.0, step=0.01, label="happiness")
+    sadness = gr.Slider(0, 1, 0.0, step=0.01, label="sadness")
+    disgust = gr.Slider(0, 1, 0.0, step=0.01, label="disgust")
+    fear = gr.Slider(0, 1, 0.0, step=0.01, label="fear")
+    surprise = gr.Slider(0, 1, 0.0, step=0.01, label="surprise")
+    anger = gr.Slider(0, 1, 0.0, step=0.01, label="anger")
+    other = gr.Slider(0, 1, 0.0, step=0.01, label="other")
+    neutral = gr.Slider(0, 1, 1.0, step=0.01, label="neutral")
+
+    btn = gr.Button("synthesize")
+    out_audio = gr.Audio(label="output")
+    out_seed = gr.Number(label="used seed", interactive=False)
+
+    btn.click(
+        tts,
+        [
+            text,
+            language,
+            ref_audio,
+            fmax,
+            pitch_std,
+            speaking_rate,
+            guidance_scale,
+            seed,
+            happiness,
+            sadness,
+            disgust,
+            fear,
+            surprise,
+            anger,
+            other,
+            neutral,
+        ],
+        [out_audio, out_seed],
+    )
+
+if __name__ == "__main__":
+    demo.launch()
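
Since the new launch() no longer passes show_api=False, the endpoint can be driven remotely. Below is a minimal sketch using gradio_client; the Space id is a hypothetical placeholder, and api_name="/tts" assumes Gradio's default naming for the tts handler:

from gradio_client import Client

client = Client("Staticaliza/placeholder-space")  # hypothetical Space id
audio_path, used_seed = client.predict(
    "Hello from Zonos.",   # text
    "en-us",               # language
    None,                  # ref_audio: optional reference clip for zero-shot cloning
    24000,                 # fmax (Hz)
    45.0,                  # pitch_std
    15.0,                  # speaking_rate
    2.0,                   # guidance_scale
    0,                     # seed (0 = random; the seed actually used is returned)
    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,  # emotion weights, happiness..neutral
    api_name="/tts",
)
print(audio_path, used_seed)

The positional arguments mirror the tts signature in the diff above; the call returns the generated audio file path and the seed that was used.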