_Noxty committed · verified
Commit dd3664d · 1 Parent(s): 391f668

Upload 14 files

Files changed (14)
  1. .env +9 -0
  2. .gitignore +28 -0
  3. __init__.py +0 -0
  4. config.json +1 -0
  5. config.py +254 -0
  6. download_models.py +79 -0
  7. infer-web.py +1619 -0
  8. infer_batch_rvc.py +72 -0
  9. infer_cli.py +67 -0
  10. modules.py +304 -0
  11. pipeline.py +457 -0
  12. pyproject.toml +64 -0
  13. requirements.txt +34 -0
  14. utils.py +33 -0
.env ADDED
@@ -0,0 +1,9 @@
+ OPENBLAS_NUM_THREADS = 1
+ no_proxy = localhost, 127.0.0.1, ::1
+
+ # You can change the locations of the models, etc. here
+ weight_root = assets/weights
+ weight_uvr5_root = assets/uvr5_weights
+ index_root = logs
+ outside_index_root = assets/indices
+ rmvpe_root = assets/rmvpe
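
These variables are consumed at startup through python-dotenv: infer-web.py calls load_dotenv() and then reads each path with os.getenv (see that file below). A minimal sketch of the lookup, assuming the defaults in this .env:

    import os
    from dotenv import load_dotenv

    load_dotenv()  # reads .env from the current working directory
    weight_root = os.getenv("weight_root")  # -> "assets/weights"
    index_root = os.getenv("index_root")    # -> "logs"
    rmvpe_root = os.getenv("rmvpe_root")    # -> "assets/rmvpe"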
.gitignore ADDED
@@ -0,0 +1,28 @@
+ .DS_Store
+ __pycache__
+ /TEMP
+ *.pyd
+ .venv
+ /opt
+ tools/aria2c/
+ tools/flag.txt
+
+ # Imported from huggingface.co/lj1995/VoiceConversionWebUI
+ /pretrained
+ /pretrained_v2
+ /uvr5_weights
+ hubert_base.pt
+ rmvpe.onnx
+ rmvpe.pt
+
+ # Generated by RVC
+ /logs
+ /weights
+
+ # To set a Python version for the project
+ .tool-versions
+
+ /runtime
+ /assets/weights/*
+ ffmpeg.*
+ ffprobe.*
__init__.py ADDED
(empty file)
config.json ADDED
@@ -0,0 +1 @@
+ {"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "VoiceMeeter Output (VB-Audio Vo", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi", "sr_type": "sr_device", "threhold": -60.0, "pitch": 12.0, "formant": 0.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.15, "crossfade_length": 0.08, "extra_time": 2.0, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"}
config.py ADDED
@@ -0,0 +1,254 @@
+ import argparse
+ import os
+ import sys
+ import json
+ import shutil
+ from multiprocessing import cpu_count
+
+ import torch
+
+ try:
+     import intel_extension_for_pytorch as ipex  # pylint: disable=import-error, unused-import
+
+     if torch.xpu.is_available():
+         from infer.modules.ipex import ipex_init
+
+         ipex_init()
+ except Exception:  # pylint: disable=broad-exception-caught
+     pass
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ version_config_list = [
+     "v1/32k.json",
+     "v1/40k.json",
+     "v1/48k.json",
+     "v2/48k.json",
+     "v2/32k.json",
+ ]
+
+
+ def singleton_variable(func):
+     def wrapper(*args, **kwargs):
+         if not wrapper.instance:
+             wrapper.instance = func(*args, **kwargs)
+         return wrapper.instance
+
+     wrapper.instance = None
+     return wrapper
+
+
+ @singleton_variable
+ class Config:
+     def __init__(self):
+         self.device = "cuda:0"
+         self.is_half = True
+         self.use_jit = False
+         self.n_cpu = 0
+         self.gpu_name = None
+         self.json_config = self.load_config_json()
+         self.gpu_mem = None
+         (
+             self.python_cmd,
+             self.listen_port,
+             self.iscolab,
+             self.noparallel,
+             self.noautoopen,
+             self.dml,
+         ) = self.arg_parse()
+         self.instead = ""
+         self.preprocess_per = 3.7
+         self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
+
+     @staticmethod
+     def load_config_json() -> dict:
+         d = {}
+         for config_file in version_config_list:
+             p = f"configs/inuse/{config_file}"
+             if not os.path.exists(p):
+                 shutil.copy(f"configs/{config_file}", p)
+             with open(f"configs/inuse/{config_file}", "r") as f:
+                 d[config_file] = json.load(f)
+         return d
+
+     @staticmethod
+     def arg_parse() -> tuple:
+         exe = sys.executable or "python"
+         parser = argparse.ArgumentParser()
+         parser.add_argument("--port", type=int, default=7865, help="Listen port")
+         parser.add_argument("--pycmd", type=str, default=exe, help="Python command")
+         parser.add_argument("--colab", action="store_true", help="Launch in colab")
+         parser.add_argument(
+             "--noparallel", action="store_true", help="Disable parallel processing"
+         )
+         parser.add_argument(
+             "--noautoopen",
+             action="store_true",
+             help="Do not open in browser automatically",
+         )
+         parser.add_argument(
+             "--dml",
+             action="store_true",
+             help="torch_dml",
+         )
+         cmd_opts = parser.parse_args()
+
+         cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
+
+         return (
+             cmd_opts.pycmd,
+             cmd_opts.port,
+             cmd_opts.colab,
+             cmd_opts.noparallel,
+             cmd_opts.noautoopen,
+             cmd_opts.dml,
+         )
+
+     # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
+     # check `getattr` and try it for compatibility
+     @staticmethod
+     def has_mps() -> bool:
+         if not torch.backends.mps.is_available():
+             return False
+         try:
+             torch.zeros(1).to(torch.device("mps"))
+             return True
+         except Exception:
+             return False
+
+     @staticmethod
+     def has_xpu() -> bool:
+         if hasattr(torch, "xpu") and torch.xpu.is_available():
+             return True
+         else:
+             return False
+
+     def use_fp32_config(self):
+         for config_file in version_config_list:
+             self.json_config[config_file]["train"]["fp16_run"] = False
+             with open(f"configs/inuse/{config_file}", "r") as f:
+                 strr = f.read().replace("true", "false")
+             with open(f"configs/inuse/{config_file}", "w") as f:
+                 f.write(strr)
+             logger.info("overwrite " + config_file)
+         self.preprocess_per = 3.0
+         logger.info("overwrite preprocess_per to %d" % (self.preprocess_per))
+
+     def device_config(self) -> tuple:
+         if torch.cuda.is_available():
+             if self.has_xpu():
+                 self.device = self.instead = "xpu:0"
+                 self.is_half = True
+             i_device = int(self.device.split(":")[-1])
+             self.gpu_name = torch.cuda.get_device_name(i_device)
+             if (
+                 ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
+                 or "P40" in self.gpu_name.upper()
+                 or "P10" in self.gpu_name.upper()
+                 or "1060" in self.gpu_name
+                 or "1070" in self.gpu_name
+                 or "1080" in self.gpu_name
+             ):
+                 logger.info("Found GPU %s, force to fp32", self.gpu_name)
+                 self.is_half = False
+                 self.use_fp32_config()
+             else:
+                 logger.info("Found GPU %s", self.gpu_name)
+             self.gpu_mem = int(
+                 torch.cuda.get_device_properties(i_device).total_memory
+                 / 1024
+                 / 1024
+                 / 1024
+                 + 0.4
+             )
+             if self.gpu_mem <= 4:
+                 self.preprocess_per = 3.0
+         elif self.has_mps():
+             logger.info("No supported Nvidia GPU found")
+             self.device = self.instead = "mps"
+             self.is_half = False
+             self.use_fp32_config()
+         else:
+             logger.info("No supported Nvidia GPU found")
+             self.device = self.instead = "cpu"
+             self.is_half = False
+             self.use_fp32_config()
+
+         if self.n_cpu == 0:
+             self.n_cpu = cpu_count()
+
+         if self.is_half:
+             # Config for 6 GB VRAM
+             x_pad = 3
+             x_query = 10
+             x_center = 60
+             x_max = 65
+         else:
+             # Config for 5 GB VRAM
+             x_pad = 1
+             x_query = 6
+             x_center = 38
+             x_max = 41
+
+         if self.gpu_mem is not None and self.gpu_mem <= 4:
+             x_pad = 1
+             x_query = 5
+             x_center = 30
+             x_max = 32
+         if self.dml:
+             logger.info("Use DirectML instead")
+             if (
+                 os.path.exists(
+                     "runtime\\Lib\\site-packages\\onnxruntime\\capi\\DirectML.dll"
+                 )
+                 == False
+             ):
+                 try:
+                     os.rename(
+                         "runtime\\Lib\\site-packages\\onnxruntime",
+                         "runtime\\Lib\\site-packages\\onnxruntime-cuda",
+                     )
+                 except:
+                     pass
+                 try:
+                     os.rename(
+                         "runtime\\Lib\\site-packages\\onnxruntime-dml",
+                         "runtime\\Lib\\site-packages\\onnxruntime",
+                     )
+                 except:
+                     pass
+             # if self.device != "cpu":
+             import torch_directml
+
+             self.device = torch_directml.device(torch_directml.default_device())
+             self.is_half = False
+         else:
+             if self.instead:
+                 logger.info(f"Use {self.instead} instead")
+             if (
+                 os.path.exists(
+                     "runtime\\Lib\\site-packages\\onnxruntime\\capi\\onnxruntime_providers_cuda.dll"
+                 )
+                 == False
+             ):
+                 try:
+                     os.rename(
+                         "runtime\\Lib\\site-packages\\onnxruntime",
+                         "runtime\\Lib\\site-packages\\onnxruntime-dml",
+                     )
+                 except:
+                     pass
+                 try:
+                     os.rename(
+                         "runtime\\Lib\\site-packages\\onnxruntime-cuda",
+                         "runtime\\Lib\\site-packages\\onnxruntime",
+                     )
+                 except:
+                     pass
+         logger.info(
+             "Half-precision floating-point: %s, device: %s"
+             % (self.is_half, self.device)
+         )
+         return x_pad, x_query, x_center, x_max
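
The singleton_variable decorator above caches the first call's return value on the wrapper function itself, so decorating the Config class makes every later Config() call return the same shared instance. A minimal sketch of the pattern in isolation (the Settings class is hypothetical):

    def singleton_variable(func):
        def wrapper(*args, **kwargs):
            if not wrapper.instance:  # first call constructs the instance
                wrapper.instance = func(*args, **kwargs)
            return wrapper.instance   # later calls reuse it
        wrapper.instance = None
        return wrapper

    @singleton_variable
    class Settings:
        pass

    assert Settings() is Settings()  # the same object both times

Note the falsy check would re-run the constructor if it ever returned a falsy value; that cannot happen for a class instance such as Config.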
download_models.py ADDED
@@ -0,0 +1,79 @@
+ import os
+ from pathlib import Path
+ import requests
+
+ RVC_DOWNLOAD_LINK = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/"
+
+ BASE_DIR = Path(__file__).resolve().parent.parent
+
+
+ def dl_model(link, model_name, dir_name):
+     with requests.get(f"{link}{model_name}") as r:
+         r.raise_for_status()
+         os.makedirs(os.path.dirname(dir_name / model_name), exist_ok=True)
+         with open(dir_name / model_name, "wb") as f:
+             for chunk in r.iter_content(chunk_size=8192):
+                 f.write(chunk)
+
+
+ if __name__ == "__main__":
+     print("Downloading hubert_base.pt...")
+     dl_model(RVC_DOWNLOAD_LINK, "hubert_base.pt", BASE_DIR / "assets/hubert")
+     print("Downloading rmvpe.pt...")
+     dl_model(RVC_DOWNLOAD_LINK, "rmvpe.pt", BASE_DIR / "assets/rmvpe")
+     print("Downloading vocals.onnx...")
+     dl_model(
+         RVC_DOWNLOAD_LINK + "uvr5_weights/onnx_dereverb_By_FoxJoy/",
+         "vocals.onnx",
+         BASE_DIR / "assets/uvr5_weights/onnx_dereverb_By_FoxJoy",
+     )
+
+     rvc_models_dir = BASE_DIR / "assets/pretrained"
+
+     print("Downloading pretrained models:")
+
+     model_names = [
+         "D32k.pth",
+         "D40k.pth",
+         "D48k.pth",
+         "G32k.pth",
+         "G40k.pth",
+         "G48k.pth",
+         "f0D32k.pth",
+         "f0D40k.pth",
+         "f0D48k.pth",
+         "f0G32k.pth",
+         "f0G40k.pth",
+         "f0G48k.pth",
+     ]
+     for model in model_names:
+         print(f"Downloading {model}...")
+         dl_model(RVC_DOWNLOAD_LINK + "pretrained/", model, rvc_models_dir)
+
+     rvc_models_dir = BASE_DIR / "assets/pretrained_v2"
+
+     print("Downloading pretrained models v2:")
+
+     for model in model_names:
+         print(f"Downloading {model}...")
+         dl_model(RVC_DOWNLOAD_LINK + "pretrained_v2/", model, rvc_models_dir)
+
+     print("Downloading uvr5_weights:")
+
+     rvc_models_dir = BASE_DIR / "assets/uvr5_weights"
+
+     model_names = [
+         "HP2-%E4%BA%BA%E5%A3%B0vocals%2B%E9%9D%9E%E4%BA%BA%E5%A3%B0instrumentals.pth",
+         "HP2_all_vocals.pth",
+         "HP3_all_vocals.pth",
+         "HP5-%E4%B8%BB%E6%97%8B%E5%BE%8B%E4%BA%BA%E5%A3%B0vocals%2B%E5%85%B6%E4%BB%96instrumentals.pth",
+         "HP5_only_main_vocal.pth",
+         "VR-DeEchoAggressive.pth",
+         "VR-DeEchoDeReverb.pth",
+         "VR-DeEchoNormal.pth",
+     ]
+     for model in model_names:
+         print(f"Downloading {model}...")
+         dl_model(RVC_DOWNLOAD_LINK + "uvr5_weights/", model, rvc_models_dir)
+
+     print("All models downloaded!")
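
One caveat worth flagging: requests.get without stream=True buffers the whole response body in memory before iter_content yields it, which is costly for multi-hundred-megabyte checkpoints. A hypothetical streaming variant of dl_model (same interface, not part of this commit):

    import os
    import requests

    def dl_model_streaming(link, model_name, dir_name):
        # stream=True keeps only one chunk in memory at a time
        with requests.get(f"{link}{model_name}", stream=True) as r:
            r.raise_for_status()
            os.makedirs(os.path.dirname(dir_name / model_name), exist_ok=True)
            with open(dir_name / model_name, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)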
infer-web.py ADDED
@@ -0,0 +1,1619 @@
+ import os
+ import sys
+ from dotenv import load_dotenv
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ load_dotenv()
+ from infer.modules.vc.modules import VC
+ from infer.modules.uvr5.modules import uvr
+ from infer.lib.train.process_ckpt import (
+     change_info,
+     extract_small_model,
+     merge,
+     show_info,
+ )
+ from i18n.i18n import I18nAuto
+ from configs.config import Config
+ from sklearn.cluster import MiniBatchKMeans
+ import torch, platform
+ import numpy as np
+ import gradio as gr
+ import faiss
+ import fairseq
+ import pathlib
+ import json
+ from time import sleep
+ from subprocess import Popen
+ from random import shuffle
+ import warnings
+ import traceback
+ import threading
+ import shutil
+ import logging
+
+
+ logging.getLogger("numba").setLevel(logging.WARNING)
+ logging.getLogger("httpx").setLevel(logging.WARNING)
+
+ logger = logging.getLogger(__name__)
+
+ tmp = os.path.join(now_dir, "TEMP")
+ shutil.rmtree(tmp, ignore_errors=True)
+ shutil.rmtree("%s/runtime/Lib/site-packages/infer_pack" % (now_dir), ignore_errors=True)
+ shutil.rmtree("%s/runtime/Lib/site-packages/uvr5_pack" % (now_dir), ignore_errors=True)
+ os.makedirs(tmp, exist_ok=True)
+ os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True)
+ os.makedirs(os.path.join(now_dir, "assets/weights"), exist_ok=True)
+ os.environ["TEMP"] = tmp
+ warnings.filterwarnings("ignore")
+ torch.manual_seed(114514)
+
+
+ config = Config()
+ vc = VC(config)
+
+
+ if config.dml == True:
+
+     def forward_dml(ctx, x, scale):
+         ctx.scale = scale
+         res = x.clone().detach()
+         return res
+
+     fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml
+ i18n = I18nAuto()
+ logger.info(i18n)
+ # Check whether any NVIDIA GPU is available for training and accelerated inference
+ ngpu = torch.cuda.device_count()
+ gpu_infos = []
+ mem = []
+ if_gpu_ok = False
+
+ if torch.cuda.is_available() or ngpu != 0:
+     for i in range(ngpu):
+         gpu_name = torch.cuda.get_device_name(i)
+         if any(
+             value in gpu_name.upper()
+             for value in [
+                 "10",
+                 "16",
+                 "20",
+                 "30",
+                 "40",
+                 "A2",
+                 "A3",
+                 "A4",
+                 "P4",
+                 "A50",
+                 "500",
+                 "A60",
+                 "70",
+                 "80",
+                 "90",
+                 "M4",
+                 "T4",
+                 "TITAN",
+                 "4060",
+                 "L",
+                 "6000",
+             ]
+         ):
+             # A10#A100#V100#A40#P40#M40#K80#A4500
+             if_gpu_ok = True  # at least one usable NVIDIA GPU
+             gpu_infos.append("%s\t%s" % (i, gpu_name))
+             mem.append(
+                 int(
+                     torch.cuda.get_device_properties(i).total_memory
+                     / 1024
+                     / 1024
+                     / 1024
+                     + 0.4
+                 )
+             )
+ if if_gpu_ok and len(gpu_infos) > 0:
+     gpu_info = "\n".join(gpu_infos)
+     default_batch_size = min(mem) // 2
+ else:
+     gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
+     default_batch_size = 1
+ gpus = "-".join([i[0] for i in gpu_infos])
+
+
+ class ToolButton(gr.Button, gr.components.FormComponent):
+     """Small button with single emoji as text, fits inside gradio forms"""
+
+     def __init__(self, **kwargs):
+         super().__init__(variant="tool", **kwargs)
+
+     def get_block_name(self):
+         return "button"
+
+
+ weight_root = os.getenv("weight_root")
+ weight_uvr5_root = os.getenv("weight_uvr5_root")
+ index_root = os.getenv("index_root")
+ outside_index_root = os.getenv("outside_index_root")
+
+ names = []
+ for name in os.listdir(weight_root):
+     if name.endswith(".pth"):
+         names.append(name)
+ index_paths = []
+
+
+ def lookup_indices(index_root):
+     global index_paths
+     for root, dirs, files in os.walk(index_root, topdown=False):
+         for name in files:
+             if name.endswith(".index") and "trained" not in name:
+                 index_paths.append("%s/%s" % (root, name))
+
+
+ lookup_indices(index_root)
+ lookup_indices(outside_index_root)
+ uvr5_names = []
+ for name in os.listdir(weight_uvr5_root):
+     if name.endswith(".pth") or "onnx" in name:
+         uvr5_names.append(name.replace(".pth", ""))
+
+
+ def change_choices():
+     names = []
+     for name in os.listdir(weight_root):
+         if name.endswith(".pth"):
+             names.append(name)
+     index_paths = []
+     for root, dirs, files in os.walk(index_root, topdown=False):
+         for name in files:
+             if name.endswith(".index") and "trained" not in name:
+                 index_paths.append("%s/%s" % (root, name))
+     return {"choices": sorted(names), "__type__": "update"}, {
+         "choices": sorted(index_paths),
+         "__type__": "update",
+     }
+
+
+ def clean():
+     return {"value": "", "__type__": "update"}
+
+
+ def export_onnx(ModelPath, ExportedPath):
+     from infer.modules.onnx.export import export_onnx as eo
+
+     eo(ModelPath, ExportedPath)
+
+
+ sr_dict = {
+     "32k": 32000,
+     "40k": 40000,
+     "48k": 48000,
+ }
+
+
+ def if_done(done, p):
+     while 1:
+         if p.poll() is None:
+             sleep(0.5)
+         else:
+             break
+     done[0] = True
+
+
+ def if_done_multi(done, ps):
+     while 1:
+         # poll() returning None means the process has not finished yet
+         # keep waiting as long as any process is still running
+         flag = 1
+         for p in ps:
+             if p.poll() is None:
+                 flag = 0
+                 sleep(0.5)
+                 break
+         if flag == 1:
+             break
+     done[0] = True
+
+
+ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
+     sr = sr_dict[sr]
+     os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
+     f = open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "w")
+     f.close()
+     cmd = '"%s" infer/modules/train/preprocess.py "%s" %s %s "%s/logs/%s" %s %.1f' % (
+         config.python_cmd,
+         trainset_dir,
+         sr,
+         n_p,
+         now_dir,
+         exp_dir,
+         config.noparallel,
+         config.preprocess_per,
+     )
+     logger.info("Execute: " + cmd)
+     # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
+     p = Popen(cmd, shell=True)
+     # Under gradio, Popen output can only be read all at once after the process exits,
+     # so poll a log file on a timer instead of reading stdout line by line
+     done = [False]
+     threading.Thread(
+         target=if_done,
+         args=(
+             done,
+             p,
+         ),
+     ).start()
+     while 1:
+         with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
+             yield (f.read())
+         sleep(1)
+         if done[0]:
+             break
+     with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
+         log = f.read()
+     logger.info(log)
+     yield log
+
+
+ # but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2])
+ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvpe):
+     gpus = gpus.split("-")
+     os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
+     f = open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "w")
+     f.close()
+     if if_f0:
+         if f0method != "rmvpe_gpu":
+             cmd = (
+                 '"%s" infer/modules/train/extract/extract_f0_print.py "%s/logs/%s" %s %s'
+                 % (
+                     config.python_cmd,
+                     now_dir,
+                     exp_dir,
+                     n_p,
+                     f0method,
+                 )
+             )
+             logger.info("Execute: " + cmd)
+             p = Popen(
+                 cmd, shell=True, cwd=now_dir
+             )  # , stdin=PIPE, stdout=PIPE,stderr=PIPE
+             # poll the log file on a timer (see note in preprocess_dataset)
+             done = [False]
+             threading.Thread(
+                 target=if_done,
+                 args=(
+                     done,
+                     p,
+                 ),
+             ).start()
+         else:
+             if gpus_rmvpe != "-":
+                 gpus_rmvpe = gpus_rmvpe.split("-")
+                 leng = len(gpus_rmvpe)
+                 ps = []
+                 for idx, n_g in enumerate(gpus_rmvpe):
+                     cmd = (
+                         '"%s" infer/modules/train/extract/extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s '
+                         % (
+                             config.python_cmd,
+                             leng,
+                             idx,
+                             n_g,
+                             now_dir,
+                             exp_dir,
+                             config.is_half,
+                         )
+                     )
+                     logger.info("Execute: " + cmd)
+                     p = Popen(
+                         cmd, shell=True, cwd=now_dir
+                     )  # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
+                     ps.append(p)
+                 # poll the log file on a timer (see note in preprocess_dataset)
+                 done = [False]
+                 threading.Thread(
+                     target=if_done_multi,  #
+                     args=(
+                         done,
+                         ps,
+                     ),
+                 ).start()
+             else:
+                 cmd = (
+                     config.python_cmd
+                     + ' infer/modules/train/extract/extract_f0_rmvpe_dml.py "%s/logs/%s" '
+                     % (
+                         now_dir,
+                         exp_dir,
+                     )
+                 )
+                 logger.info("Execute: " + cmd)
+                 p = Popen(
+                     cmd, shell=True, cwd=now_dir
+                 )  # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
+                 p.wait()
+                 done = [True]
+         while 1:
+             with open(
+                 "%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r"
+             ) as f:
+                 yield (f.read())
+             sleep(1)
+             if done[0]:
+                 break
+         with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
+             log = f.read()
+         logger.info(log)
+         yield log
+     # Launch a separate process for each part
+     """
+     n_part=int(sys.argv[1])
+     i_part=int(sys.argv[2])
+     i_gpu=sys.argv[3]
+     exp_dir=sys.argv[4]
+     os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
+     """
+     leng = len(gpus)
+     ps = []
+     for idx, n_g in enumerate(gpus):
+         cmd = (
+             '"%s" infer/modules/train/extract_feature_print.py %s %s %s %s "%s/logs/%s" %s %s'
+             % (
+                 config.python_cmd,
+                 config.device,
+                 leng,
+                 idx,
+                 n_g,
+                 now_dir,
+                 exp_dir,
+                 version19,
+                 config.is_half,
+             )
+         )
+         logger.info("Execute: " + cmd)
+         p = Popen(
+             cmd, shell=True, cwd=now_dir
+         )  # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
+         ps.append(p)
+     # poll the log file on a timer (see note in preprocess_dataset)
+     done = [False]
+     threading.Thread(
+         target=if_done_multi,
+         args=(
+             done,
+             ps,
+         ),
+     ).start()
+     while 1:
+         with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
+             yield (f.read())
+         sleep(1)
+         if done[0]:
+             break
+     with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
+         log = f.read()
+     logger.info(log)
+     yield log
+
+
+ def get_pretrained_models(path_str, f0_str, sr2):
+     if_pretrained_generator_exist = os.access(
+         "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK
+     )
+     if_pretrained_discriminator_exist = os.access(
+         "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK
+     )
+     if not if_pretrained_generator_exist:
+         logger.warning(
+             "assets/pretrained%s/%sG%s.pth does not exist; will not use pretrained model",
+             path_str,
+             f0_str,
+             sr2,
+         )
+     if not if_pretrained_discriminator_exist:
+         logger.warning(
+             "assets/pretrained%s/%sD%s.pth does not exist; will not use pretrained model",
+             path_str,
+             f0_str,
+             sr2,
+         )
+     return (
+         (
+             "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
+             if if_pretrained_generator_exist
+             else ""
+         ),
+         (
+             "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
+             if if_pretrained_discriminator_exist
+             else ""
+         ),
+     )
+
+
+ def change_sr2(sr2, if_f0_3, version19):
+     path_str = "" if version19 == "v1" else "_v2"
+     f0_str = "f0" if if_f0_3 else ""
+     return get_pretrained_models(path_str, f0_str, sr2)
+
+
+ def change_version19(sr2, if_f0_3, version19):
+     path_str = "" if version19 == "v1" else "_v2"
+     if sr2 == "32k" and version19 == "v1":
+         sr2 = "40k"
+     to_return_sr2 = (
+         {"choices": ["40k", "48k"], "__type__": "update", "value": sr2}
+         if version19 == "v1"
+         else {"choices": ["40k", "48k", "32k"], "__type__": "update", "value": sr2}
+     )
+     f0_str = "f0" if if_f0_3 else ""
+     return (
+         *get_pretrained_models(path_str, f0_str, sr2),
+         to_return_sr2,
+     )
+
+
+ def change_f0(if_f0_3, sr2, version19):  # f0method8,pretrained_G14,pretrained_D15
+     path_str = "" if version19 == "v1" else "_v2"
+     return (
+         {"visible": if_f0_3, "__type__": "update"},
+         {"visible": if_f0_3, "__type__": "update"},
+         *get_pretrained_models(path_str, "f0" if if_f0_3 == True else "", sr2),
+     )
+
+
+ # but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16])
+ def click_train(
+     exp_dir1,
+     sr2,
+     if_f0_3,
+     spk_id5,
+     save_epoch10,
+     total_epoch11,
+     batch_size12,
+     if_save_latest13,
+     pretrained_G14,
+     pretrained_D15,
+     gpus16,
+     if_cache_gpu17,
+     if_save_every_weights18,
+     version19,
+ ):
+     # Build the filelist
+     exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
+     os.makedirs(exp_dir, exist_ok=True)
+     gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir)
+     feature_dir = (
+         "%s/3_feature256" % (exp_dir)
+         if version19 == "v1"
+         else "%s/3_feature768" % (exp_dir)
+     )
+     if if_f0_3:
+         f0_dir = "%s/2a_f0" % (exp_dir)
+         f0nsf_dir = "%s/2b-f0nsf" % (exp_dir)
+         names = (
+             set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)])
+             & set([name.split(".")[0] for name in os.listdir(feature_dir)])
+             & set([name.split(".")[0] for name in os.listdir(f0_dir)])
+             & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
+         )
+     else:
+         names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set(
+             [name.split(".")[0] for name in os.listdir(feature_dir)]
+         )
+     opt = []
+     for name in names:
+         if if_f0_3:
+             opt.append(
+                 "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"
+                 % (
+                     gt_wavs_dir.replace("\\", "\\\\"),
+                     name,
+                     feature_dir.replace("\\", "\\\\"),
+                     name,
+                     f0_dir.replace("\\", "\\\\"),
+                     name,
+                     f0nsf_dir.replace("\\", "\\\\"),
+                     name,
+                     spk_id5,
+                 )
+             )
+         else:
+             opt.append(
+                 "%s/%s.wav|%s/%s.npy|%s"
+                 % (
+                     gt_wavs_dir.replace("\\", "\\\\"),
+                     name,
+                     feature_dir.replace("\\", "\\\\"),
+                     name,
+                     spk_id5,
+                 )
+             )
+     fea_dim = 256 if version19 == "v1" else 768
+     if if_f0_3:
+         for _ in range(2):
+             opt.append(
+                 "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
+                 % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5)
+             )
+     else:
+         for _ in range(2):
+             opt.append(
+                 "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
+                 % (now_dir, sr2, now_dir, fea_dim, spk_id5)
+             )
+     shuffle(opt)
+     with open("%s/filelist.txt" % exp_dir, "w") as f:
+         f.write("\n".join(opt))
+     logger.debug("Write filelist done")
+     # Generate config (not needed if it already exists)
+     # cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0"
+     logger.info("Use gpus: %s", str(gpus16))
+     if pretrained_G14 == "":
+         logger.info("No pretrained Generator")
+     if pretrained_D15 == "":
+         logger.info("No pretrained Discriminator")
+     if version19 == "v1" or sr2 == "40k":
+         config_path = "v1/%s.json" % sr2
+     else:
+         config_path = "v2/%s.json" % sr2
+     config_save_path = os.path.join(exp_dir, "config.json")
+     if not pathlib.Path(config_save_path).exists():
+         with open(config_save_path, "w", encoding="utf-8") as f:
+             json.dump(
+                 config.json_config[config_path],
+                 f,
+                 ensure_ascii=False,
+                 indent=4,
+                 sort_keys=True,
+             )
+             f.write("\n")
+     if gpus16:
+         cmd = (
+             '"%s" infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s'
+             % (
+                 config.python_cmd,
+                 exp_dir1,
+                 sr2,
+                 1 if if_f0_3 else 0,
+                 batch_size12,
+                 gpus16,
+                 total_epoch11,
+                 save_epoch10,
+                 "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
+                 "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
+                 1 if if_save_latest13 == i18n("是") else 0,
+                 1 if if_cache_gpu17 == i18n("是") else 0,
+                 1 if if_save_every_weights18 == i18n("是") else 0,
+                 version19,
+             )
+         )
+     else:
+         cmd = (
+             '"%s" infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s'
+             % (
+                 config.python_cmd,
+                 exp_dir1,
+                 sr2,
+                 1 if if_f0_3 else 0,
+                 batch_size12,
+                 total_epoch11,
+                 save_epoch10,
+                 "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
+                 "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
+                 1 if if_save_latest13 == i18n("是") else 0,
+                 1 if if_cache_gpu17 == i18n("是") else 0,
+                 1 if if_save_every_weights18 == i18n("是") else 0,
+                 version19,
+             )
+         )
+     logger.info("Execute: " + cmd)
+     p = Popen(cmd, shell=True, cwd=now_dir)
+     p.wait()
+     return "Training finished. Check the console log or train.log under the experiment folder."
+
+
+ # but4.click(train_index, [exp_dir1], info3)
+ def train_index(exp_dir1, version19):
+     # exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
+     exp_dir = "logs/%s" % (exp_dir1)
+     os.makedirs(exp_dir, exist_ok=True)
+     feature_dir = (
+         "%s/3_feature256" % (exp_dir)
+         if version19 == "v1"
+         else "%s/3_feature768" % (exp_dir)
+     )
+     if not os.path.exists(feature_dir):
+         return "Please run feature extraction first!"
+     listdir_res = list(os.listdir(feature_dir))
+     if len(listdir_res) == 0:
+         return "Please run feature extraction first!"
+     infos = []
+     npys = []
+     for name in sorted(listdir_res):
+         phone = np.load("%s/%s" % (feature_dir, name))
+         npys.append(phone)
+     big_npy = np.concatenate(npys, 0)
+     big_npy_idx = np.arange(big_npy.shape[0])
+     np.random.shuffle(big_npy_idx)
+     big_npy = big_npy[big_npy_idx]
+     if big_npy.shape[0] > 2e5:
+         infos.append("Trying k-means on %s shape to 10k centers." % big_npy.shape[0])
+         yield "\n".join(infos)
+         try:
+             big_npy = (
+                 MiniBatchKMeans(
+                     n_clusters=10000,
+                     verbose=True,
+                     batch_size=256 * config.n_cpu,
+                     compute_labels=False,
+                     init="random",
+                 )
+                 .fit(big_npy)
+                 .cluster_centers_
+             )
+         except:
+             info = traceback.format_exc()
+             logger.info(info)
+             infos.append(info)
+             yield "\n".join(infos)
+
+     np.save("%s/total_fea.npy" % exp_dir, big_npy)
+     n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
+     infos.append("%s,%s" % (big_npy.shape, n_ivf))
+     yield "\n".join(infos)
+     index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf)
+     # index = faiss.index_factory(256if version19=="v1"else 768, "IVF%s,PQ128x4fs,RFlat"%n_ivf)
+     infos.append("training")
+     yield "\n".join(infos)
+     index_ivf = faiss.extract_index_ivf(index)  #
+     index_ivf.nprobe = 1
+     index.train(big_npy)
+     faiss.write_index(
+         index,
+         "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
+         % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+     )
+     infos.append("adding")
+     yield "\n".join(infos)
+     batch_size_add = 8192
+     for i in range(0, big_npy.shape[0], batch_size_add):
+         index.add(big_npy[i : i + batch_size_add])
+     faiss.write_index(
+         index,
+         "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
+         % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+     )
+     infos.append(
+         "Successfully built index added_IVF%s_Flat_nprobe_%s_%s_%s.index"
+         % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
+     )
+     try:
+         link = os.link if platform.system() == "Windows" else os.symlink
+         link(
+             "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
+             % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+             "%s/%s_IVF%s_Flat_nprobe_%s_%s_%s.index"
+             % (
+                 outside_index_root,
+                 exp_dir1,
+                 n_ivf,
+                 index_ivf.nprobe,
+                 exp_dir1,
+                 version19,
+             ),
+         )
+         infos.append("Linked the index to external location %s" % (outside_index_root))
+     except:
+         infos.append("Failed to link the index to external location %s" % (outside_index_root))
+
+     # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
+     # infos.append("Successfully built index added_IVF%s_Flat_FastScan_%s.index"%(n_ivf,version19))
+     yield "\n".join(infos)
+
+
+ # but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)
+ def train1key(
+     exp_dir1,
+     sr2,
+     if_f0_3,
+     trainset_dir4,
+     spk_id5,
+     np7,
+     f0method8,
+     save_epoch10,
+     total_epoch11,
+     batch_size12,
+     if_save_latest13,
+     pretrained_G14,
+     pretrained_D15,
+     gpus16,
+     if_cache_gpu17,
+     if_save_every_weights18,
+     version19,
+     gpus_rmvpe,
+ ):
+     infos = []
+
+     def get_info_str(strr):
+         infos.append(strr)
+         return "\n".join(infos)
+
+     # step 1: process the data
+     yield get_info_str(i18n("step1:正在处理数据"))
+     [get_info_str(_) for _ in preprocess_dataset(trainset_dir4, exp_dir1, sr2, np7)]
+
+     # step 2a: extract pitch
+     yield get_info_str(i18n("step2:正在提取音高&正在提取特征"))
+     [
+         get_info_str(_)
+         for _ in extract_f0_feature(
+             gpus16, np7, f0method8, if_f0_3, exp_dir1, version19, gpus_rmvpe
+         )
+     ]
+
+     # step 3a: train the model
+     yield get_info_str(i18n("step3a:正在训练模型"))
+     click_train(
+         exp_dir1,
+         sr2,
+         if_f0_3,
+         spk_id5,
+         save_epoch10,
+         total_epoch11,
+         batch_size12,
+         if_save_latest13,
+         pretrained_G14,
+         pretrained_D15,
+         gpus16,
+         if_cache_gpu17,
+         if_save_every_weights18,
+         version19,
+     )
+     yield get_info_str(
+         i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log")
+     )
+
+     # step 3b: train the index
+     [get_info_str(_) for _ in train_index(exp_dir1, version19)]
+     yield get_info_str(i18n("全流程结束!"))
+
+
+ # ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
+ def change_info_(ckpt_path):
+     if not os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log")):
+         return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
+     try:
+         with open(
+             ckpt_path.replace(os.path.basename(ckpt_path), "train.log"), "r"
+         ) as f:
+             info = eval(f.read().strip("\n").split("\n")[0].split("\t")[-1])
+             sr, f0 = info["sample_rate"], info["if_f0"]
+             version = "v2" if ("version" in info and info["version"] == "v2") else "v1"
+             return sr, str(f0), version
+     except:
+         traceback.print_exc()
+         return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
+
+
+ F0GPUVisible = config.dml == False
+
+
+ def change_f0_method(f0method8):
+     if f0method8 == "rmvpe_gpu":
+         visible = F0GPUVisible
+     else:
+         visible = False
+     return {"visible": visible, "__type__": "update"}
+
+
+ with gr.Blocks(title="RVC WebUI") as app:
+     gr.Markdown("## RVC WebUI")
+     gr.Markdown(
+         value=i18n(
+             "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>."
+         )
+     )
+     with gr.Tabs():
+         with gr.TabItem(i18n("模型推理")):
+             with gr.Row():
+                 sid0 = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
+                 with gr.Column():
+                     refresh_button = gr.Button(
+                         i18n("刷新音色列表和索引路径"), variant="primary"
+                     )
+                     clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
+                 spk_item = gr.Slider(
+                     minimum=0,
+                     maximum=2333,
+                     step=1,
+                     label=i18n("请选择说话人id"),
+                     value=0,
+                     visible=False,
+                     interactive=True,
+                 )
+                 clean_button.click(
+                     fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean"
+                 )
+             with gr.TabItem(i18n("单次推理")):
+                 with gr.Group():
+                     with gr.Row():
+                         with gr.Column():
+                             vc_transform0 = gr.Number(
+                                 label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"),
+                                 value=0,
+                             )
+                             input_audio0 = gr.Textbox(
+                                 label=i18n(
+                                     "输入待处理音频文件路径(默认是正确格式示例)"
+                                 ),
+                                 placeholder="C:\\Users\\Desktop\\audio_example.wav",
+                             )
+                             file_index1 = gr.Textbox(
+                                 label=i18n(
+                                     "特征检索库文件路径,为空则使用下拉的选择结果"
+                                 ),
+                                 placeholder="C:\\Users\\Desktop\\model_example.index",
+                                 interactive=True,
+                             )
+                             file_index2 = gr.Dropdown(
+                                 label=i18n("自动检测index路径,下拉式选择(dropdown)"),
+                                 choices=sorted(index_paths),
+                                 interactive=True,
+                             )
+                             f0method0 = gr.Radio(
+                                 label=i18n(
+                                     "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
+                                 ),
+                                 choices=(
+                                     ["pm", "harvest", "crepe", "rmvpe"]
+                                     if config.dml == False
+                                     else ["pm", "harvest", "rmvpe"]
+                                 ),
+                                 value="rmvpe",
+                                 interactive=True,
+                             )
+
+                         with gr.Column():
+                             resample_sr0 = gr.Slider(
+                                 minimum=0,
+                                 maximum=48000,
+                                 label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
+                                 value=0,
+                                 step=1,
+                                 interactive=True,
+                             )
+                             rms_mix_rate0 = gr.Slider(
+                                 minimum=0,
+                                 maximum=1,
+                                 label=i18n(
+                                     "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"
+                                 ),
+                                 value=0.25,
+                                 interactive=True,
+                             )
+                             protect0 = gr.Slider(
+                                 minimum=0,
+                                 maximum=0.5,
+                                 label=i18n(
+                                     "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"
+                                 ),
+                                 value=0.33,
+                                 step=0.01,
+                                 interactive=True,
+                             )
+                             filter_radius0 = gr.Slider(
+                                 minimum=0,
+                                 maximum=7,
+                                 label=i18n(
+                                     ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"
+                                 ),
+                                 value=3,
+                                 step=1,
+                                 interactive=True,
+                             )
+                             index_rate1 = gr.Slider(
+                                 minimum=0,
+                                 maximum=1,
+                                 label=i18n("检索特征占比"),
+                                 value=0.75,
+                                 interactive=True,
+                             )
+                             f0_file = gr.File(
+                                 label=i18n(
+                                     "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"
+                                 ),
+                                 visible=False,
+                             )
+
+                         refresh_button.click(
+                             fn=change_choices,
+                             inputs=[],
+                             outputs=[sid0, file_index2],
+                             api_name="infer_refresh",
+                         )
+                         # file_big_npy1 = gr.Textbox(
+                         #     label=i18n("特征文件路径"),
+                         #     value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
+                         #     interactive=True,
+                         # )
+                 with gr.Group():
+                     with gr.Column():
+                         but0 = gr.Button(i18n("转换"), variant="primary")
+                         with gr.Row():
+                             vc_output1 = gr.Textbox(label=i18n("输出信息"))
+                             vc_output2 = gr.Audio(
+                                 label=i18n("输出音频(右下角三个点,点了可以下载)")
+                             )
+
+                         but0.click(
+                             vc.vc_single,
+                             [
+                                 spk_item,
+                                 input_audio0,
+                                 vc_transform0,
+                                 f0_file,
+                                 f0method0,
+                                 file_index1,
+                                 file_index2,
+                                 # file_big_npy1,
+                                 index_rate1,
+                                 filter_radius0,
+                                 resample_sr0,
+                                 rms_mix_rate0,
+                                 protect0,
+                             ],
+                             [vc_output1, vc_output2],
+                             api_name="infer_convert",
+                         )
+             with gr.TabItem(i18n("批量推理")):
+                 gr.Markdown(
+                     value=i18n(
+                         "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. "
+                     )
+                 )
+                 with gr.Row():
+                     with gr.Column():
+                         vc_transform1 = gr.Number(
+                             label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"),
+                             value=0,
+                         )
+                         opt_input = gr.Textbox(
+                             label=i18n("指定输出文件夹"), value="opt"
+                         )
+                         file_index3 = gr.Textbox(
+                             label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
+                             value="",
+                             interactive=True,
+                         )
+                         file_index4 = gr.Dropdown(
+                             label=i18n("自动检测index路径,下拉式选择(dropdown)"),
+                             choices=sorted(index_paths),
+                             interactive=True,
+                         )
+                         f0method1 = gr.Radio(
+                             label=i18n(
+                                 "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
+                             ),
+                             choices=(
+                                 ["pm", "harvest", "crepe", "rmvpe"]
+                                 if config.dml == False
+                                 else ["pm", "harvest", "rmvpe"]
+                             ),
+                             value="rmvpe",
+                             interactive=True,
+                         )
+                         format1 = gr.Radio(
+                             label=i18n("导出文件格式"),
+                             choices=["wav", "flac", "mp3", "m4a"],
+                             value="wav",
+                             interactive=True,
+                         )
+
+                         refresh_button.click(
+                             fn=lambda: change_choices()[1],
+                             inputs=[],
+                             outputs=file_index4,
+                             api_name="infer_refresh_batch",
+                         )
+                         # file_big_npy2 = gr.Textbox(
+                         #     label=i18n("特征文件路径"),
+                         #     value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
+                         #     interactive=True,
+                         # )
+
+                     with gr.Column():
+                         resample_sr1 = gr.Slider(
+                             minimum=0,
+                             maximum=48000,
+                             label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
+                             value=0,
+                             step=1,
+                             interactive=True,
+                         )
+                         rms_mix_rate1 = gr.Slider(
+                             minimum=0,
+                             maximum=1,
+                             label=i18n(
+                                 "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"
+                             ),
+                             value=1,
+                             interactive=True,
+                         )
+                         protect1 = gr.Slider(
+                             minimum=0,
+                             maximum=0.5,
+                             label=i18n(
+                                 "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"
+                             ),
+                             value=0.33,
+                             step=0.01,
+                             interactive=True,
+                         )
+                         filter_radius1 = gr.Slider(
+                             minimum=0,
+                             maximum=7,
+                             label=i18n(
+                                 ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"
+                             ),
+                             value=3,
+                             step=1,
+                             interactive=True,
+                         )
+                         index_rate2 = gr.Slider(
+                             minimum=0,
+                             maximum=1,
+                             label=i18n("检索特征占比"),
+                             value=1,
+                             interactive=True,
+                         )
+                     with gr.Row():
+                         dir_input = gr.Textbox(
+                             label=i18n(
+                                 "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"
+                             ),
+                             placeholder="C:\\Users\\Desktop\\input_vocal_dir",
+                         )
+                         inputs = gr.File(
+                             file_count="multiple",
+                             label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹"),
+                         )
+
+                     with gr.Row():
+                         but1 = gr.Button(i18n("转换"), variant="primary")
+                         vc_output3 = gr.Textbox(label=i18n("输出信息"))
+
+                     but1.click(
+                         vc.vc_multi,
+                         [
+                             spk_item,
+                             dir_input,
+                             opt_input,
+                             inputs,
+                             vc_transform1,
+                             f0method1,
+                             file_index3,
+                             file_index4,
+                             # file_big_npy2,
+                             index_rate2,
+                             filter_radius1,
+                             resample_sr1,
+                             rms_mix_rate1,
+                             protect1,
+                             format1,
+                         ],
+                         [vc_output3],
+                         api_name="infer_convert_batch",
+                     )
+             sid0.change(
+                 fn=vc.get_vc,
+                 inputs=[sid0, protect0, protect1],
+                 outputs=[spk_item, protect0, protect1, file_index2, file_index4],
+                 api_name="infer_change_voice",
+             )
+         with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
+             with gr.Group():
+                 gr.Markdown(
+                     value=i18n(
+                         "人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br>  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br>&emsp;(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。"
+                     )
+                 )
+                 with gr.Row():
+                     with gr.Column():
+                         dir_wav_input = gr.Textbox(
+                             label=i18n("输入待处理音频文件夹路径"),
+                             placeholder="C:\\Users\\Desktop\\todo-songs",
+                         )
+                         wav_inputs = gr.File(
+                             file_count="multiple",
+                             label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹"),
+                         )
+                     with gr.Column():
+                         model_choose = gr.Dropdown(
+                             label=i18n("模型"), choices=uvr5_names
+                         )
+                         agg = gr.Slider(
+                             minimum=0,
+                             maximum=20,
+                             step=1,
+                             label="人声提取激进程度",
+                             value=10,
+                             interactive=True,
+                             visible=False,  # not exposed for adjustment yet
+                         )
+                         opt_vocal_root = gr.Textbox(
+                             label=i18n("指定输出主人声文件夹"), value="opt"
+                         )
+                         opt_ins_root = gr.Textbox(
+                             label=i18n("指定输出非主人声文件夹"), value="opt"
+                         )
+                         format0 = gr.Radio(
+                             label=i18n("导出文件格式"),
+                             choices=["wav", "flac", "mp3", "m4a"],
+                             value="flac",
+                             interactive=True,
+                         )
+                         but2 = gr.Button(i18n("转换"), variant="primary")
+                         vc_output4 = gr.Textbox(label=i18n("输出信息"))
+                         but2.click(
+                             uvr,
+                             [
+                                 model_choose,
+                                 dir_wav_input,
+                                 opt_vocal_root,
+                                 wav_inputs,
+                                 opt_ins_root,
+                                 agg,
+                                 format0,
+                             ],
+                             [vc_output4],
+                             api_name="uvr_convert",
+                         )
+         with gr.TabItem(i18n("训练")):
+             gr.Markdown(
+                 value=i18n(
+                     "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. "
+                 )
+             )
+             with gr.Row():
+                 exp_dir1 = gr.Textbox(label=i18n("输入实验名"), value="mi-test")
+                 sr2 = gr.Radio(
+                     label=i18n("目标采样率"),
+                     choices=["40k", "48k"],
+                     value="40k",
+                     interactive=True,
+                 )
+                 if_f0_3 = gr.Radio(
+                     label=i18n("模型是否带音高指导(唱歌一定要, 语音可以不要)"),
+                     choices=[True, False],
+                     value=True,
+                     interactive=True,
+                 )
+                 version19 = gr.Radio(
+                     label=i18n("版本"),
+                     choices=["v1", "v2"],
+                     value="v2",
+                     interactive=True,
+                     visible=True,
+                 )
+                 np7 = gr.Slider(
+                     minimum=0,
+                     maximum=config.n_cpu,
+                     step=1,
+                     label=i18n("提取音高和处理数据使用的CPU进程数"),
+                     value=int(np.ceil(config.n_cpu / 1.5)),
+                     interactive=True,
+                 )
+             with gr.Group():  # single speaker for now; support for up to 4 speakers planned  # data processing
+                 gr.Markdown(
+                     value=i18n(
+                         "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. "
+                     )
+                 )
+                 with gr.Row():
+                     trainset_dir4 = gr.Textbox(
+                         label=i18n("输入训练文件夹路径"),
+                         value=i18n("E:\\语音音频+标注\\米津玄师\\src"),
+                     )
+                     spk_id5 = gr.Slider(
+                         minimum=0,
+                         maximum=4,
+                         step=1,
+                         label=i18n("请指定说话人id"),
+                         value=0,
+                         interactive=True,
+                     )
+                     but1 = gr.Button(i18n("处理数据"), variant="primary")
+                     info1 = gr.Textbox(label=i18n("输出信息"), value="")
+                     but1.click(
+                         preprocess_dataset,
+                         [trainset_dir4, exp_dir1, sr2, np7],
+                         [info1],
+                         api_name="train_preprocess",
+                     )
+             with gr.Group():
+                 gr.Markdown(
+                     value=i18n(
+                         "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)"
+                     )
+                 )
+                 with gr.Row():
+                     with gr.Column():
+                         gpus6 = gr.Textbox(
+                             label=i18n(
+                                 "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"
+                             ),
+                             value=gpus,
+                             interactive=True,
+                             visible=F0GPUVisible,
+                         )
+                         gpu_info9 = gr.Textbox(
+                             label=i18n("显卡信息"), value=gpu_info, visible=F0GPUVisible
+                         )
+                     with gr.Column():
+                         f0method8 = gr.Radio(
+                             label=i18n(
+                                 "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU"
+                             ),
+                             choices=["pm", "harvest", "dio", "rmvpe", "rmvpe_gpu"],
+                             value="rmvpe_gpu",
+                             interactive=True,
+                         )
+                         gpus_rmvpe = gr.Textbox(
+                             label=i18n(
+                                 "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程"
+                             ),
+                             value="%s-%s" % (gpus, gpus),
+                             interactive=True,
+                             visible=F0GPUVisible,
+                         )
+                     but2 = gr.Button(i18n("特征提取"), variant="primary")
+                     info2 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
+                     f0method8.change(
+                         fn=change_f0_method,
+                         inputs=[f0method8],
+                         outputs=[gpus_rmvpe],
+                     )
+                     but2.click(
+                         extract_f0_feature,
+                         [
+                             gpus6,
+                             np7,
+                             f0method8,
+                             if_f0_3,
+                             exp_dir1,
+                             version19,
+                             gpus_rmvpe,
+                         ],
+                         [info2],
+                         api_name="train_extract_f0_feature",
+                     )
+             with gr.Group():
+                 gr.Markdown(value=i18n("step3: 填写训练设置, 开始训练模型和索引"))
+                 with gr.Row():
+                     save_epoch10 = gr.Slider(
+                         minimum=1,
+                         maximum=50,
+                         step=1,
+                         label=i18n("保存频率save_every_epoch"),
+                         value=5,
+                         interactive=True,
+                     )
+                     total_epoch11 = gr.Slider(
+                         minimum=2,
+                         maximum=1000,
+                         step=1,
+                         label=i18n("总训练轮数total_epoch"),
+                         value=20,
+                         interactive=True,
+                     )
+                     batch_size12 = gr.Slider(
+                         minimum=1,
+                         maximum=40,
+                         step=1,
+                         label=i18n("每张显卡的batch_size"),
+                         value=default_batch_size,
+                         interactive=True,
+                     )
+                     if_save_latest13 = gr.Radio(
+                         label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"),
+                         choices=[i18n("是"), i18n("否")],
+                         value=i18n("否"),
+                         interactive=True,
+                     )
+                     if_cache_gpu17 = gr.Radio(
+                         label=i18n(
+                             "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速"
+                         ),
+                         choices=[i18n("是"), i18n("否")],
+                         value=i18n("否"),
+                         interactive=True,
+                     )
+                     if_save_every_weights18 = gr.Radio(
+                         label=i18n(
+                             "是否在每次保存时间点将最终小模型保存至weights文件夹"
+                         ),
+                         choices=[i18n("是"), i18n("否")],
+                         value=i18n("否"),
+                         interactive=True,
+                     )
+                 with gr.Row():
+                     pretrained_G14 = gr.Textbox(
+                         label=i18n("加载预训练底模G路径"),
+                         value="assets/pretrained_v2/f0G40k.pth",
+                         interactive=True,
+                     )
+                     pretrained_D15 = gr.Textbox(
+                         label=i18n("加载预训练底模D路径"),
+                         value="assets/pretrained_v2/f0D40k.pth",
+                         interactive=True,
+                     )
+                     sr2.change(
+                         change_sr2,
+                         [sr2, if_f0_3, version19],
+                         [pretrained_G14, pretrained_D15],
+                     )
+                     version19.change(
+                         change_version19,
+                         [sr2, if_f0_3, version19],
+                         [pretrained_G14, pretrained_D15, sr2],
+                     )
+                     if_f0_3.change(
+                         change_f0,
+                         [if_f0_3, sr2, version19],
+                         [f0method8, gpus_rmvpe, pretrained_G14, pretrained_D15],
+                     )
+                     gpus16 = gr.Textbox(
+                         label=i18n(
+                             "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"
+                         ),
+                         value=gpus,
+                         interactive=True,
+                     )
+                     but3 = gr.Button(i18n("训练模型"), variant="primary")
+                     but4 = gr.Button(i18n("训练特征索引"), variant="primary")
+                     but5 = gr.Button(i18n("一键训练"), variant="primary")
+                     info3 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=10)
+                     but3.click(
+                         click_train,
+                         [
+                             exp_dir1,
+                             sr2,
+                             if_f0_3,
+                             spk_id5,
+                             save_epoch10,
+                             total_epoch11,
+                             batch_size12,
+                             if_save_latest13,
+                             pretrained_G14,
+                             pretrained_D15,
+                             gpus16,
+                             if_cache_gpu17,
+                             if_save_every_weights18,
+                             version19,
+                         ],
+                         info3,
+                         api_name="train_start",
+                     )
+                     but4.click(train_index, [exp_dir1, version19], info3)
+                     but5.click(
+                         train1key,
+                         [
+                             exp_dir1,
+                             sr2,
+                             if_f0_3,
+                             trainset_dir4,
+                             spk_id5,
+                             np7,
+                             f0method8,
+                             save_epoch10,
+                             total_epoch11,
+                             batch_size12,
+                             if_save_latest13,
+                             pretrained_G14,
+                             pretrained_D15,
+                             gpus16,
+                             if_cache_gpu17,
+                             if_save_every_weights18,
+                             version19,
+                             gpus_rmvpe,
+                         ],
+                         info3,
+                         api_name="train_start_all",
+                     )
+
+         with gr.TabItem(i18n("ckpt处理")):
+             with gr.Group():
+                 gr.Markdown(value=i18n("模型融合, 可用于测试音色融合"))
+                 with gr.Row():
+                     ckpt_a = gr.Textbox(
+                         label=i18n("A模型路径"), value="", interactive=True
+                     )
+                     ckpt_b = gr.Textbox(
+                         label=i18n("B模型路径"), value="", interactive=True
+                     )
+                     alpha_a = gr.Slider(
+                         minimum=0,
+                         maximum=1,
+                         label=i18n("A模型权重"),
+                         value=0.5,
+                         interactive=True,
+                     )
+                 with gr.Row():
+                     sr_ = gr.Radio(
+                         label=i18n("目标采样率"),
+                         choices=["40k", "48k"],
+                         value="40k",
+                         interactive=True,
+                     )
+                     if_f0_ = gr.Radio(
+                         label=i18n("模型是否带音高指导"),
+                         choices=[i18n("是"), i18n("否")],
+                         value=i18n("是"),
+                         interactive=True,
+                     )
+                     info__ = gr.Textbox(
+                         label=i18n("要置入的模型信息"),
+                         value="",
+                         max_lines=8,
+                         interactive=True,
+                     )
+                     name_to_save0 = gr.Textbox(
+                         label=i18n("保存的模型名不带后缀"),
+                         value="",
+                         max_lines=1,
+                         interactive=True,
+                     )
+                     version_2 = gr.Radio(
+                         label=i18n("模型版本型号"),
+                         choices=["v1", "v2"],
+                         value="v1",
+                         interactive=True,
+                     )
+                 with gr.Row():
+                     but6 = gr.Button(i18n("融合"), variant="primary")
+                     info4 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
+                 but6.click(
+                     merge,
+                     [
+                         ckpt_a,
+                         ckpt_b,
+                         alpha_a,
+                         sr_,
+                         if_f0_,
+                         info__,
+                         name_to_save0,
+                         version_2,
+                     ],
+                     info4,
+                     api_name="ckpt_merge",
+                 )  # def merge(path1,path2,alpha1,sr,f0,info):
+             with gr.Group():
+                 gr.Markdown(
+                     value=i18n("修改模型信息(仅支持weights文件夹下提取的小模型文件)")
+                 )
+                 with gr.Row():
+                     ckpt_path0 = gr.Textbox(
+                         label=i18n("模型路径"), value="", interactive=True
+                     )
+                     info_ = gr.Textbox(
+                         label=i18n("要改的模型信息"),
+                         value="",
+                         max_lines=8,
+                         interactive=True,
+                     )
+                     name_to_save1 = gr.Textbox(
+                         label=i18n("保存的文件名, 默认空为和源文件同名"),
+                         value="",
+                         max_lines=8,
+                         interactive=True,
+                     )
+                 with gr.Row():
+                     but7 = gr.Button(i18n("修改"), variant="primary")
+                     info5 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
+                 but7.click(
+                     change_info,
+                     [ckpt_path0, info_, name_to_save1],
+                     info5,
+                     api_name="ckpt_modify",
+                 )
+             with gr.Group():
+                 gr.Markdown(
+                     value=i18n("查看模型信息(仅支持weights文件夹下提取的小模型文件)")
+                 )
+                 with gr.Row():
+                     ckpt_path1 = gr.Textbox(
+                         label=i18n("模型路径"), value="", interactive=True
+                     )
+                     but8 = gr.Button(i18n("查看"), variant="primary")
+                     info6 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
+                 but8.click(show_info, [ckpt_path1], info6, api_name="ckpt_show")
+             with gr.Group():
+                 gr.Markdown(
+                     value=i18n(
+                         "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况"
+                     )
+                 )
+                 with gr.Row():
+                     ckpt_path2 = gr.Textbox(
+                         label=i18n("模型路径"),
+                         value="E:\\codes\\py39\\logs\\mi-test_f0_48k\\G_23333.pth",
+                         interactive=True,
+                     )
+                     save_name = gr.Textbox(
+                         label=i18n("保存名"), value="", interactive=True
+                     )
+                     sr__ = gr.Radio(
+                         label=i18n("目标采样率"),
+                         choices=["32k", "40k", "48k"],
+                         value="40k",
+                         interactive=True,
+                     )
+                     if_f0__ = gr.Radio(
+                         label=i18n("模型是否带音高指导,1是0否"),
+                         choices=["1", "0"],
+                         value="1",
+                         interactive=True,
+                     )
+                     version_1 = gr.Radio(
+                         label=i18n("模型版本型号"),
+                         choices=["v1", "v2"],
+                         value="v2",
+                         interactive=True,
+                     )
+                     info___ = gr.Textbox(
+                         label=i18n("要置入的模型信息"),
+                         value="",
+                         max_lines=8,
+                         interactive=True,
+                     )
+                     but9 = gr.Button(i18n("提取"), variant="primary")
+                     info7 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
+                     ckpt_path2.change(
+                         change_info_, [ckpt_path2], [sr__, if_f0__, version_1]
+                     )
+                 but9.click(
+                     extract_small_model,
+                     [ckpt_path2, save_name, sr__, if_f0__, info___, version_1],
+                     info7,
+                     api_name="ckpt_extract",
+                 )
+
+         with gr.TabItem(i18n("Onnx导出")):
+             with gr.Row():
+                 ckpt_dir = gr.Textbox(
+                     label=i18n("RVC模型路径"), value="", interactive=True
+                 )
+             with gr.Row():
+                 onnx_dir = gr.Textbox(
+                     label=i18n("Onnx输出路径"), value="", interactive=True
+                 )
+             with gr.Row():
+                 infoOnnx = gr.Label(label="info")
+             with gr.Row():
+                 butOnnx = gr.Button(i18n("导出Onnx模型"), variant="primary")
+             butOnnx.click(
+                 export_onnx, [ckpt_dir, onnx_dir], infoOnnx, api_name="export_onnx"
+             )
+
+         tab_faq = i18n("常见问题解答")
+         with gr.TabItem(tab_faq):
+             try:
+                 if tab_faq == "常见问题解答":
+                     with open("docs/cn/faq.md", "r", encoding="utf8") as f:
+                         info = f.read()
+                 else:
+                     with open("docs/en/faq_en.md", "r", encoding="utf8") as f:
+                         info = f.read()
+                 gr.Markdown(value=info)
+             except:
+                 gr.Markdown(traceback.format_exc())
+
+     if config.iscolab:
+         app.queue(concurrency_count=511, max_size=1022).launch(share=True)
+     else:
+         app.queue(concurrency_count=511, max_size=1022).launch(
+             server_name="0.0.0.0",
+             inbrowser=not config.noautoopen,
+             server_port=config.listen_port,
+             quiet=True,
+         )
infer_batch_rvc.py ADDED
@@ -0,0 +1,72 @@
+ import argparse
+ import os
+ import sys
+
+ print("Command-line arguments:", sys.argv)
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+
+ import tqdm as tq
+ from dotenv import load_dotenv
+ from scipy.io import wavfile
+
+ from config import Config
+ from modules import VC
+
+
+ def arg_parse() -> argparse.Namespace:
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--f0up_key", type=int, default=0)
+     parser.add_argument("--input_path", type=str, help="input path")
+     parser.add_argument("--index_path", type=str, help="index path")
+     parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
+     parser.add_argument("--opt_path", type=str, help="opt path")
+     parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
+     parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
+     parser.add_argument("--device", type=str, help="device")
+     # a flag rather than type=bool: argparse's bool() treats any non-empty
+     # string, including "False", as True
+     parser.add_argument("--is_half", action="store_true", help="use half precision")
+     parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
+     parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
+     parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
+     parser.add_argument("--protect", type=float, default=0.33, help="protect")
+
+     args = parser.parse_args()
+     sys.argv = sys.argv[:1]
+
+     return args
+
+
+ def main():
+     load_dotenv()
+     args = arg_parse()
+     config = Config()
+     config.device = args.device if args.device else config.device
+     config.is_half = args.is_half if args.is_half else config.is_half
+     vc = VC(config)
+     vc.get_vc(args.model_name)
+     os.makedirs(args.opt_path, exist_ok=True)  # wavfile.write does not create directories
+     audios = os.listdir(args.input_path)
+     for file in tq.tqdm(audios):
+         if file.endswith(".wav"):
+             file_path = os.path.join(args.input_path, file)
+             _, wav_opt = vc.vc_single(
+                 0,
+                 file_path,
+                 args.f0up_key,
+                 None,
+                 args.f0method,
+                 args.index_path,
+                 None,
+                 args.index_rate,
+                 args.filter_radius,
+                 args.resample_sr,
+                 args.rms_mix_rate,
+                 args.protect,
+             )
+             out_path = os.path.join(args.opt_path, file)
+             wavfile.write(out_path, wav_opt[0], wav_opt[1])
+
+
+ if __name__ == "__main__":
+     main()
infer_cli.py ADDED
@@ -0,0 +1,67 @@
+ import argparse
+ import os
+ import sys
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ from dotenv import load_dotenv
+ from scipy.io import wavfile
+
+ # imported from the repository root, matching the flat layout used by infer_batch_rvc.py
+ from config import Config
+ from modules import VC
+
+ ####
+ # USAGE
+ #
+ # In your Terminal or CMD or whatever, e.g.:
+ # python infer_cli.py --model_name <model.pth> --input_path <input.wav> \
+ #     --opt_path <output.wav> --index_path <model.index> --f0method harvest
+
+
+ def arg_parse() -> argparse.Namespace:
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--f0up_key", type=int, default=0)
+     parser.add_argument("--input_path", type=str, help="input path")
+     parser.add_argument("--index_path", type=str, help="index path")
+     parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
+     parser.add_argument("--opt_path", type=str, help="opt path")
+     parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
+     parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
+     parser.add_argument("--device", type=str, help="device")
+     # a flag rather than type=bool: argparse's bool() treats any non-empty
+     # string, including "False", as True
+     parser.add_argument("--is_half", action="store_true", help="use half precision")
+     parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
+     parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
+     parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
+     parser.add_argument("--protect", type=float, default=0.33, help="protect")
+
+     args = parser.parse_args()
+     sys.argv = sys.argv[:1]
+
+     return args
+
+
+ def main():
+     load_dotenv()
+     args = arg_parse()
+     config = Config()
+     config.device = args.device if args.device else config.device
+     config.is_half = args.is_half if args.is_half else config.is_half
+     vc = VC(config)
+     vc.get_vc(args.model_name)
+     _, wav_opt = vc.vc_single(
+         0,
+         args.input_path,
+         args.f0up_key,
+         None,
+         args.f0method,
+         args.index_path,
+         None,
+         args.index_rate,
+         args.filter_radius,
+         args.resample_sr,
+         args.rms_mix_rate,
+         args.protect,
+     )
+     wavfile.write(args.opt_path, wav_opt[0], wav_opt[1])
+
+
+ if __name__ == "__main__":
+     main()
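A quick aside on these entry points: the same VC class that infer_cli.py drives from the command line can be called directly from Python. A minimal sketch, not part of the upload, assuming the .env defaults are in place and using the hypothetical file names my_model.pth / input.wav / output.wav:

import os
import sys

sys.path.append(os.getcwd())
from dotenv import load_dotenv
from scipy.io import wavfile

from config import Config
from modules import VC

load_dotenv()               # exports weight_root, index_root, etc. for the helpers
vc = VC(Config())
vc.get_vc("my_model.pth")   # resolved against weight_root
# argument order mirrors infer_cli.py: sid, path, f0up_key, f0_file, f0method,
# file_index, file_index2, index_rate, filter_radius, resample_sr, rms_mix_rate, protect
info, (sr, audio) = vc.vc_single(
    0, "input.wav", 0, None, "harvest", "", None, 0.66, 3, 0, 1.0, 0.33
)
print(info)
wavfile.write("output.wav", sr, audio)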
modules.py ADDED
@@ -0,0 +1,304 @@
+ import os
+ import traceback
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ import numpy as np
+ import soundfile as sf
+ import torch
+ from io import BytesIO
+
+ from infer.lib.audio import load_audio, wav2
+ from infer.lib.infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+ # pipeline.py and utils.py ship at the repository root in this upload
+ from pipeline import Pipeline
+ from utils import *
+
+
+ class VC:
+     def __init__(self, config):
+         self.n_spk = None
+         self.tgt_sr = None
+         self.net_g = None
+         self.pipeline = None
+         self.cpt = None
+         self.version = None
+         self.if_f0 = None
+         self.hubert_model = None
+
+         self.config = config
+
+     def get_vc(self, sid, *to_return_protect):
+         logger.info("Get sid: " + sid)
+
+         to_return_protect0 = {
+             "visible": self.if_f0 != 0,
+             "value": (
+                 to_return_protect[0] if self.if_f0 != 0 and to_return_protect else 0.5
+             ),
+             "__type__": "update",
+         }
+         to_return_protect1 = {
+             "visible": self.if_f0 != 0,
+             "value": (
+                 to_return_protect[1] if self.if_f0 != 0 and to_return_protect else 0.33
+             ),
+             "__type__": "update",
+         }
+
+         if sid == "" or sid == []:
+             if (
+                 self.hubert_model is not None
+             ):  # because of UI polling, check whether sid switched from a loaded model to none
+                 logger.info("Clean model cache")
+                 del self.net_g, self.n_spk, self.hubert_model, self.tgt_sr  # ,cpt
+                 self.hubert_model = self.net_g = self.n_spk = self.tgt_sr = None
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+                 ### without the rebuild-and-delete below, the memory is not fully released
+                 self.if_f0 = self.cpt.get("f0", 1)
+                 self.version = self.cpt.get("version", "v1")
+                 if self.version == "v1":
+                     if self.if_f0 == 1:
+                         self.net_g = SynthesizerTrnMs256NSFsid(
+                             *self.cpt["config"], is_half=self.config.is_half
+                         )
+                     else:
+                         self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"])
+                 elif self.version == "v2":
+                     if self.if_f0 == 1:
+                         self.net_g = SynthesizerTrnMs768NSFsid(
+                             *self.cpt["config"], is_half=self.config.is_half
+                         )
+                     else:
+                         self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"])
+                 del self.net_g, self.cpt
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+             return (
+                 {"visible": False, "__type__": "update"},
+                 {
+                     "visible": True,
+                     "value": to_return_protect0,
+                     "__type__": "update",
+                 },
+                 {
+                     "visible": True,
+                     "value": to_return_protect1,
+                     "__type__": "update",
+                 },
+                 "",
+                 "",
+             )
+         person = f'{os.getenv("weight_root")}/{sid}'
+         logger.info(f"Loading: {person}")
+
+         self.cpt = torch.load(person, map_location="cpu")
+         self.tgt_sr = self.cpt["config"][-1]
+         self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+         self.if_f0 = self.cpt.get("f0", 1)
+         self.version = self.cpt.get("version", "v1")
+
+         synthesizer_class = {
+             ("v1", 1): SynthesizerTrnMs256NSFsid,
+             ("v1", 0): SynthesizerTrnMs256NSFsid_nono,
+             ("v2", 1): SynthesizerTrnMs768NSFsid,
+             ("v2", 0): SynthesizerTrnMs768NSFsid_nono,
+         }
+
+         self.net_g = synthesizer_class.get(
+             (self.version, self.if_f0), SynthesizerTrnMs256NSFsid
+         )(*self.cpt["config"], is_half=self.config.is_half)
+
+         del self.net_g.enc_q
+
+         self.net_g.load_state_dict(self.cpt["weight"], strict=False)
+         self.net_g.eval().to(self.config.device)
+         if self.config.is_half:
+             self.net_g = self.net_g.half()
+         else:
+             self.net_g = self.net_g.float()
+
+         self.pipeline = Pipeline(self.tgt_sr, self.config)
+         n_spk = self.cpt["config"][-3]
+         index = {"value": get_index_path_from_model(sid), "__type__": "update"}
+         logger.info("Select index: " + index["value"])
+
+         return (
+             (
+                 {"visible": True, "maximum": n_spk, "__type__": "update"},
+                 to_return_protect0,
+                 to_return_protect1,
+                 index,
+                 index,
+             )
+             if to_return_protect
+             else {"visible": True, "maximum": n_spk, "__type__": "update"}
+         )
+
+     def vc_single(
+         self,
+         sid,
+         input_audio_path,
+         f0_up_key,
+         f0_file,
+         f0_method,
+         file_index,
+         file_index2,
+         index_rate,
+         filter_radius,
+         resample_sr,
+         rms_mix_rate,
+         protect,
+     ):
+         if input_audio_path is None:
+             return "You need to upload an audio", None
+         f0_up_key = int(f0_up_key)
+         try:
+             audio = load_audio(input_audio_path, 16000)
+             audio_max = np.abs(audio).max() / 0.95
+             if audio_max > 1:
+                 audio /= audio_max
+             times = [0, 0, 0]
+
+             if self.hubert_model is None:
+                 self.hubert_model = load_hubert(self.config)
+
+             if file_index:
+                 file_index = (
+                     file_index.strip(" ")
+                     .strip('"')
+                     .strip("\n")
+                     .strip('"')
+                     .strip(" ")
+                     .replace("trained", "added")
+                 )
+             elif file_index2:
+                 file_index = file_index2
+             else:
+                 file_index = ""  # guard against common user typos by substituting automatically
+
+             audio_opt = self.pipeline.pipeline(
+                 self.hubert_model,
+                 self.net_g,
+                 sid,
+                 audio,
+                 input_audio_path,
+                 times,
+                 f0_up_key,
+                 f0_method,
+                 file_index,
+                 index_rate,
+                 self.if_f0,
+                 filter_radius,
+                 self.tgt_sr,
+                 resample_sr,
+                 rms_mix_rate,
+                 self.version,
+                 protect,
+                 f0_file,
+             )
+             if self.tgt_sr != resample_sr >= 16000:  # chained comparison: resample only to a valid target
+                 tgt_sr = resample_sr
+             else:
+                 tgt_sr = self.tgt_sr
+             index_info = (
+                 "Index:\n%s." % file_index
+                 if os.path.exists(file_index)
+                 else "Index not used."
+             )
+             return (
+                 "Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs."
+                 % (index_info, *times),
+                 (tgt_sr, audio_opt),
+             )
+         except Exception:
+             info = traceback.format_exc()
+             logger.warning(info)
+             return info, (None, None)
+
+     def vc_multi(
+         self,
+         sid,
+         dir_path,
+         opt_root,
+         paths,
+         f0_up_key,
+         f0_method,
+         file_index,
+         file_index2,
+         index_rate,
+         filter_radius,
+         resample_sr,
+         rms_mix_rate,
+         protect,
+         format1,
+     ):
+         try:
+             dir_path = (
+                 dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+             )  # strip stray spaces, quotes, and newlines that users copy along with paths
+             opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+             os.makedirs(opt_root, exist_ok=True)
+             try:
+                 if dir_path != "":
+                     paths = [
+                         os.path.join(dir_path, name) for name in os.listdir(dir_path)
+                     ]
+                 else:
+                     paths = [path.name for path in paths]
+             except Exception:
+                 traceback.print_exc()
+                 paths = [path.name for path in paths]
+             infos = []
+             for path in paths:
+                 info, opt = self.vc_single(
+                     sid,
+                     path,
+                     f0_up_key,
+                     None,
+                     f0_method,
+                     file_index,
+                     file_index2,
+                     # file_big_npy,
+                     index_rate,
+                     filter_radius,
+                     resample_sr,
+                     rms_mix_rate,
+                     protect,
+                 )
+                 if "Success" in info:
+                     try:
+                         tgt_sr, audio_opt = opt
+                         if format1 in ["wav", "flac"]:
+                             sf.write(
+                                 "%s/%s.%s"
+                                 % (opt_root, os.path.basename(path), format1),
+                                 audio_opt,
+                                 tgt_sr,
+                             )
+                         else:
+                             path = "%s/%s.%s" % (
+                                 opt_root,
+                                 os.path.basename(path),
+                                 format1,
+                             )
+                             with BytesIO() as wavf:
+                                 sf.write(wavf, audio_opt, tgt_sr, format="wav")
+                                 wavf.seek(0, 0)
+                                 with open(path, "wb") as outf:
+                                     wav2(wavf, outf, format1)
+                     except Exception:
+                         info += traceback.format_exc()
+                 infos.append("%s->%s" % (os.path.basename(path), info))
+                 yield "\n".join(infos)
+             yield "\n".join(infos)
+         except Exception:
+             yield traceback.format_exc()
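A standalone illustration of the feature retrieval that vc_single delegates via file_index: Pipeline.vc (in pipeline.py below) searches a faiss index of stored HuBERT features, averages the 8 nearest neighbours with inverse-square-distance weights, and blends the result with the live features by index_rate. A minimal sketch on random data, not part of the upload; the 768-dimensional shape matches the v2 models, and a flat L2 index stands in for the real trained index:

import faiss
import numpy as np

big_npy = np.random.rand(1000, 768).astype("float32")  # toy stand-in for the stored features
feats = np.random.rand(50, 768).astype("float32")      # toy stand-in for the query features
index = faiss.IndexFlatL2(768)
index.add(big_npy)

# same scheme as Pipeline.vc: 8 nearest neighbours, inverse-square-distance weights
score, ix = index.search(feats, k=8)
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
retrieved = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

index_rate = 0.66
blended = retrieved * index_rate + (1 - index_rate) * feats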
pipeline.py ADDED
@@ -0,0 +1,457 @@
+ import os
+ import sys
+ import traceback
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ from functools import lru_cache
+ from time import time as ttime
+
+ import faiss
+ import librosa
+ import numpy as np
+ import parselmouth
+ import pyworld
+ import torch
+ import torch.nn.functional as F
+ import torchcrepe
+ from scipy import signal
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)  # 5th-order 48 Hz high-pass
+
+ input_audio_path2wav = {}
+
+
+ @lru_cache
+ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
+     audio = input_audio_path2wav[input_audio_path]
+     f0, t = pyworld.harvest(
+         audio,
+         fs=fs,
+         f0_ceil=f0max,
+         f0_floor=f0min,
+         frame_period=frame_period,
+     )
+     f0 = pyworld.stonemask(audio, f0, t, fs)
+     return f0
+
+
+ def change_rms(data1, sr1, data2, sr2, rate):  # data1: source audio, data2: converted audio, rate: weight of data2's envelope
+     # print(data1.max(),data2.max())
+     rms1 = librosa.feature.rms(
+         y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
+     )  # one RMS point every half second
+     rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
+     rms1 = torch.from_numpy(rms1)
+     rms1 = F.interpolate(
+         rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
+     ).squeeze()
+     rms2 = torch.from_numpy(rms2)
+     rms2 = F.interpolate(
+         rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
+     ).squeeze()
+     rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
+     data2 *= (
+         torch.pow(rms1, torch.tensor(1 - rate))
+         * torch.pow(rms2, torch.tensor(rate - 1))
+     ).numpy()
+     return data2
+
+
+ class Pipeline(object):
+     def __init__(self, tgt_sr, config):
+         self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
+             config.x_pad,
+             config.x_query,
+             config.x_center,
+             config.x_max,
+             config.is_half,
+         )
+         self.sr = 16000  # HuBERT input sample rate
+         self.window = 160  # samples per frame (10 ms at 16 kHz)
+         self.t_pad = self.sr * self.x_pad  # padding added before and after each chunk
+         self.t_pad_tgt = tgt_sr * self.x_pad
+         self.t_pad2 = self.t_pad * 2
+         self.t_query = self.sr * self.x_query  # search radius around each candidate cut point
+         self.t_center = self.sr * self.x_center  # spacing between candidate cut points
+         self.t_max = self.sr * self.x_max  # below this length, no cut-point search is needed
+         self.device = config.device
+
+     def get_f0(
+         self,
+         input_audio_path,
+         x,
+         p_len,
+         f0_up_key,
+         f0_method,
+         filter_radius,
+         inp_f0=None,
+     ):
+         global input_audio_path2wav
+         time_step = self.window / self.sr * 1000
+         f0_min = 50
+         f0_max = 1100
+         f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+         if f0_method == "pm":
+             f0 = (
+                 parselmouth.Sound(x, self.sr)
+                 .to_pitch_ac(
+                     time_step=time_step / 1000,
+                     voicing_threshold=0.6,
+                     pitch_floor=f0_min,
+                     pitch_ceiling=f0_max,
+                 )
+                 .selected_array["frequency"]
+             )
+             pad_size = (p_len - len(f0) + 1) // 2
+             if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                 f0 = np.pad(
+                     f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                 )
+         elif f0_method == "harvest":
+             input_audio_path2wav[input_audio_path] = x.astype(np.double)
+             f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+             if filter_radius > 2:
+                 f0 = signal.medfilt(f0, 3)
+         elif f0_method == "crepe":
+             model = "full"
+             # Pick a batch size that doesn't cause memory errors on your gpu
+             batch_size = 512
+             # Compute pitch on the device configured for this pipeline
+             audio = torch.tensor(np.copy(x))[None].float()
+             f0, pd = torchcrepe.predict(
+                 audio,
+                 self.sr,
+                 self.window,
+                 f0_min,
+                 f0_max,
+                 model,
+                 batch_size=batch_size,
+                 device=self.device,
+                 return_periodicity=True,
+             )
+             pd = torchcrepe.filter.median(pd, 3)
+             f0 = torchcrepe.filter.mean(f0, 3)
+             f0[pd < 0.1] = 0
+             f0 = f0[0].cpu().numpy()
+         elif f0_method == "rmvpe":
+             if not hasattr(self, "model_rmvpe"):
+                 from infer.lib.rmvpe import RMVPE
+
+                 logger.info(
+                     "Loading rmvpe model,%s" % "%s/rmvpe.pt" % os.environ["rmvpe_root"]
+                 )
+                 self.model_rmvpe = RMVPE(
+                     "%s/rmvpe.pt" % os.environ["rmvpe_root"],
+                     is_half=self.is_half,
+                     device=self.device,
+                 )
+             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+
+             if "privateuseone" in str(self.device):  # release ONNX Runtime memory
+                 del self.model_rmvpe.model
+                 del self.model_rmvpe
+                 logger.info("Cleaning onnxruntime memory")
+
+         f0 *= pow(2, f0_up_key / 12)
+         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+         tf0 = self.sr // self.window  # f0 frames per second
+         if inp_f0 is not None:
+             delta_t = np.round(
+                 (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+             ).astype("int16")
+             replace_f0 = np.interp(
+                 list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+             )
+             shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
+             f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
+                 :shape
+             ]
+         # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+         f0bak = f0.copy()
+         f0_mel = 1127 * np.log(1 + f0 / 700)
+         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+             f0_mel_max - f0_mel_min
+         ) + 1
+         f0_mel[f0_mel <= 1] = 1
+         f0_mel[f0_mel > 255] = 255
+         f0_coarse = np.rint(f0_mel).astype(np.int32)
+         return f0_coarse, f0bak  # 1-0
+
+     def vc(
+         self,
+         model,
+         net_g,
+         sid,
+         audio0,
+         pitch,
+         pitchf,
+         times,
+         index,
+         big_npy,
+         index_rate,
+         version,
+         protect,
+     ):  # ,file_index,file_big_npy
+         feats = torch.from_numpy(audio0)
+         if self.is_half:
+             feats = feats.half()
+         else:
+             feats = feats.float()
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+
+         inputs = {
+             "source": feats.to(self.device),
+             "padding_mask": padding_mask,
+             "output_layer": 9 if version == "v1" else 12,
+         }
+         t0 = ttime()
+         with torch.no_grad():
+             logits = model.extract_features(**inputs)
+             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
+         if protect < 0.5 and pitch is not None and pitchf is not None:
+             feats0 = feats.clone()
+         if (
+             not isinstance(index, type(None))
+             and not isinstance(big_npy, type(None))
+             and index_rate != 0
+         ):
+             npy = feats[0].cpu().numpy()
+             if self.is_half:
+                 npy = npy.astype("float32")
+
+             # _, I = index.search(npy, 1)
+             # npy = big_npy[I.squeeze()]
+
+             score, ix = index.search(npy, k=8)
+             weight = np.square(1 / score)
+             weight /= weight.sum(axis=1, keepdims=True)
+             npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+
+             if self.is_half:
+                 npy = npy.astype("float16")
+             feats = (
+                 torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                 + (1 - index_rate) * feats
+             )
+
+         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+         if protect < 0.5 and pitch is not None and pitchf is not None:
+             feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                 0, 2, 1
+             )
+         t1 = ttime()
+         p_len = audio0.shape[0] // self.window
+         if feats.shape[1] < p_len:
+             p_len = feats.shape[1]
+             if pitch is not None and pitchf is not None:
+                 pitch = pitch[:, :p_len]
+                 pitchf = pitchf[:, :p_len]
+
+         if protect < 0.5 and pitch is not None and pitchf is not None:
+             pitchff = pitchf.clone()
+             pitchff[pitchf > 0] = 1
+             pitchff[pitchf < 1] = protect
+             pitchff = pitchff.unsqueeze(-1)
+             feats = feats * pitchff + feats0 * (1 - pitchff)
+             feats = feats.to(feats0.dtype)
+         p_len = torch.tensor([p_len], device=self.device).long()
+         with torch.no_grad():
+             hasp = pitch is not None and pitchf is not None
+             arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid)
+             audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy()
+             del hasp, arg
+         del feats, p_len, padding_mask
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         t2 = ttime()
+         times[0] += t1 - t0
+         times[2] += t2 - t1
+         return audio1
+
+     def pipeline(
+         self,
+         model,
+         net_g,
+         sid,
+         audio,
+         input_audio_path,
+         times,
+         f0_up_key,
+         f0_method,
+         file_index,
+         index_rate,
+         if_f0,
+         filter_radius,
+         tgt_sr,
+         resample_sr,
+         rms_mix_rate,
+         version,
+         protect,
+         f0_file=None,
+     ):
+         if (
+             file_index != ""
+             # and file_big_npy != ""
+             # and os.path.exists(file_big_npy) == True
+             and os.path.exists(file_index)
+             and index_rate != 0
+         ):
+             try:
+                 index = faiss.read_index(file_index)
+                 # big_npy = np.load(file_big_npy)
+                 big_npy = index.reconstruct_n(0, index.ntotal)
+             except Exception:
+                 traceback.print_exc()
+                 index = big_npy = None
+         else:
+             index = big_npy = None
+         audio = signal.filtfilt(bh, ah, audio)
+         audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
+         opt_ts = []
+         if audio_pad.shape[0] > self.t_max:
+             audio_sum = np.zeros_like(audio)
+             for i in range(self.window):
+                 audio_sum += np.abs(audio_pad[i : i - self.window])
+             for t in range(self.t_center, audio.shape[0], self.t_center):
+                 opt_ts.append(
+                     t
+                     - self.t_query
+                     + np.where(
+                         audio_sum[t - self.t_query : t + self.t_query]
+                         == audio_sum[t - self.t_query : t + self.t_query].min()
+                     )[0][0]
+                 )
+         s = 0
+         audio_opt = []
+         t = None
+         t1 = ttime()
+         audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+         p_len = audio_pad.shape[0] // self.window
+         inp_f0 = None
+         if hasattr(f0_file, "name"):
+             try:
+                 with open(f0_file.name, "r") as f:
+                     lines = f.read().strip("\n").split("\n")
+                 inp_f0 = []
+                 for line in lines:
+                     inp_f0.append([float(i) for i in line.split(",")])
+                 inp_f0 = np.array(inp_f0, dtype="float32")
+             except Exception:
+                 traceback.print_exc()
+         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+         pitch, pitchf = None, None
+         if if_f0 == 1:
+             pitch, pitchf = self.get_f0(
+                 input_audio_path,
+                 audio_pad,
+                 p_len,
+                 f0_up_key,
+                 f0_method,
+                 filter_radius,
+                 inp_f0,
+             )
+             pitch = pitch[:p_len]
+             pitchf = pitchf[:p_len]
+             pitchf = pitchf.astype(np.float32)  # torch on mps/xpu rejects float64, so always cast down
+             pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+         t2 = ttime()
+         times[1] += t2 - t1
+         for t in opt_ts:
+             t = t // self.window * self.window
+             if if_f0 == 1:
+                 audio_opt.append(
+                     self.vc(
+                         model,
+                         net_g,
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                         pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                         times,
+                         index,
+                         big_npy,
+                         index_rate,
+                         version,
+                         protect,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             else:
+                 audio_opt.append(
+                     self.vc(
+                         model,
+                         net_g,
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         None,
+                         None,
+                         times,
+                         index,
+                         big_npy,
+                         index_rate,
+                         version,
+                         protect,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             s = t
+         if if_f0 == 1:
+             audio_opt.append(
+                 self.vc(
+                     model,
+                     net_g,
+                     sid,
+                     audio_pad[t:],
+                     pitch[:, t // self.window :] if t is not None else pitch,
+                     pitchf[:, t // self.window :] if t is not None else pitchf,
+                     times,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         else:
+             audio_opt.append(
+                 self.vc(
+                     model,
+                     net_g,
+                     sid,
+                     audio_pad[t:],
+                     None,
+                     None,
+                     times,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         audio_opt = np.concatenate(audio_opt)
+         if rms_mix_rate != 1:
+             audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
+         if tgt_sr != resample_sr >= 16000:  # chained comparison: resample only to a valid target
+             audio_opt = librosa.resample(
+                 audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+             )
+         audio_max = np.abs(audio_opt).max() / 0.99
+         max_int16 = 32768
+         if audio_max > 1:
+             max_int16 /= audio_max
+         audio_opt = (audio_opt * max_int16).astype(np.int16)
+         del pitch, pitchf, sid
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return audio_opt
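The rms_mix_rate parameter threaded through vc_single maps to rate in change_rms above: the converted audio is scaled pointwise by rms1^(1-rate) * rms2^(rate-1), where rms1 is the source envelope and rms2 the converted one. A quick numeric check of the endpoint behaviour (values chosen only for illustration):

rms1, rms2 = 0.20, 0.05  # source vs. converted RMS at some instant
for rate in (0.0, 0.5, 1.0):
    gain = rms1 ** (1 - rate) * rms2 ** (rate - 1)
    print(rate, gain)
# rate=1.0 -> gain 1.0            (keep the converted envelope; the no-op default)
# rate=0.5 -> gain 2.0            (geometric mean of the two extremes)
# rate=0.0 -> gain rms1/rms2 = 4  (force the source envelope onto the output)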
pyproject.toml ADDED
@@ -0,0 +1,64 @@
+ [tool.poetry]
+ name = "rvc-beta"
+ version = "0.1.0"
+ description = ""
+ authors = ["lj1995"]
+ license = "MIT"
+
+ [tool.poetry.dependencies]
+ python = "^3.9"
+ torch = "2.4.0"
+ torchaudio = "2.4.0"
+ Cython = "^3.0.11"
+ gradio = "3.34.0"
+ pydub = ">=0.25.1"
+ soundfile = ">=0.12.1"
+ ffmpeg-python = ">=0.2.0"
+ tensorboardX = "^2.6.2.2"
+ fairseq = "0.12.2"
+ faiss-cpu = "1.7.3"
+ Jinja2 = ">=3.1.2"
+ json5 = "^0.9.25"
+ librosa = "0.9.1"
+ llvmlite = "0.39.0"
+ Markdown = "^3.6"
+ matplotlib = ">=3.7.0"
+ matplotlib-inline = ">=0.1.3"
+ numba = "0.56.4"
+ numpy = "1.23.5"
+ scipy = "1.13.1"
+ praat-parselmouth = ">=0.4.2"
+ Pillow = ">=9.1.1"
+ pyworld = "0.3.2"
+ resampy = ">=0.4.2"
+ scikit-learn = "^1.5.1"
+ tensorboard = "^2.17.0"
+ tqdm = ">=4.63.1"
+ tornado = ">=6.1"
+ Werkzeug = ">=2.2.3"
+ uc-micro-py = ">=1.0.1"
+ sympy = ">=1.11.1"
+ tabulate = ">=0.8.10"
+ PyYAML = ">=6.0"
+ pyasn1 = ">=0.4.8"
+ pyasn1-modules = ">=0.2.8"
+ fsspec = ">=2022.11.0"
+ absl-py = ">=1.2.0"
+ audioread = "^3.0.1"
+ uvicorn = ">=0.21.1"
+ colorama = ">=0.4.5"
+ torchcrepe = "0.0.20"
+ python-dotenv = ">=1.0.0"
+ av = "^12.3.0"
+ joblib = ">=1.1.0"
+ httpx = "^0.27.0"
+ onnxruntime-gpu = "^1.18.1"
+ fastapi = "0.88"
+ torchfcpe = "^0.0.4"
+ ffmpy = "0.3.1"
+ torchvision = "0.19.0"
+
+ [tool.poetry.dev-dependencies]
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,34 @@
+ aria2
+ joblib>=1.1.0
+ numba==0.56.4
+ numpy==1.23.5
+ scipy
+ librosa==0.9.1
+ llvmlite==0.39.0
+ fairseq==0.12.2
+ faiss-cpu==1.7.3
+ gradio==3.34.0
+ Cython
+ pydub>=0.25.1
+ soundfile>=0.12.1
+ ffmpeg-python>=0.2.0
+ tensorboardX
+ Jinja2>=3.1.2
+ json5
+ Markdown
+ matplotlib>=3.7.0
+ matplotlib-inline>=0.1.3
+ praat-parselmouth>=0.4.2
+
+ tensorboard
+ tqdm>=4.63.1
+ tornado>=6.1
+ httpx
+ onnxruntime; sys_platform == 'darwin'
+ onnxruntime-gpu; sys_platform != 'darwin'
+ torchcrepe==0.0.20
+ fastapi==0.88
+ torchfcpe
+ ffmpy==0.3.1
+ python-dotenv>=1.0.0
+ av
utils.py ADDED
@@ -0,0 +1,33 @@
+ import os
+
+ from fairseq import checkpoint_utils
+
+
+ def get_index_path_from_model(sid):
+     # return the first ".index" file under index_root whose path contains the
+     # model name, skipping half-built "trained" indexes; "" if none is found
+     return next(
+         (
+             f
+             for f in [
+                 os.path.join(root, name)
+                 for root, _, files in os.walk(os.getenv("index_root"), topdown=False)
+                 for name in files
+                 if name.endswith(".index") and "trained" not in name
+             ]
+             if sid.split(".")[0] in f
+         ),
+         "",
+     )
+
+
+ def load_hubert(config):
+     # load the HuBERT content encoder used to extract features for conversion
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+         ["assets/hubert/hubert_base.pt"],
+         suffix="",
+     )
+     hubert_model = models[0]
+     hubert_model = hubert_model.to(config.device)
+     if config.is_half:
+         hubert_model = hubert_model.half()
+     else:
+         hubert_model = hubert_model.float()
+     return hubert_model.eval()
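Note that get_index_path_from_model resolves an index purely by walking the directory that index_root points at, so the environment must be loaded first. A minimal sketch, assuming a hypothetical my_model.pth with a matching my_model.index somewhere under index_root:

from dotenv import load_dotenv

from utils import get_index_path_from_model

load_dotenv()  # makes index_root visible to os.getenv
index_path = get_index_path_from_model("my_model.pth")
print(index_path or "no matching .index found")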