kevinwang676 BartPoint commited on
Commit
7b7cc52
·
0 Parent(s):

Duplicate from BartPoint/VoiceChange

Browse files

Co-authored-by: Bart Point <[email protected]>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +40 -0
  2. LICENSE +51 -0
  3. README.md +12 -0
  4. app_multi.py +469 -0
  5. config.py +17 -0
  6. hubert_base.pt +3 -0
  7. infer_pack/attentions.py +417 -0
  8. infer_pack/commons.py +166 -0
  9. infer_pack/models.py +1124 -0
  10. infer_pack/models_onnx.py +818 -0
  11. infer_pack/modules.py +522 -0
  12. infer_pack/modules/F0Predictor/DioF0Predictor.py +90 -0
  13. infer_pack/modules/F0Predictor/F0Predictor.py +16 -0
  14. infer_pack/modules/F0Predictor/HarvestF0Predictor.py +86 -0
  15. infer_pack/modules/F0Predictor/PMF0Predictor.py +97 -0
  16. infer_pack/modules/F0Predictor/__init__.py +0 -0
  17. infer_pack/onnx_inference.py +139 -0
  18. infer_pack/transforms.py +209 -0
  19. model/alphawann/Alpha.png +0 -0
  20. model/alphawann/Alpha2333333.pth +3 -0
  21. model/alphawann/added_IVF1322_Flat_nprobe_1.index +3 -0
  22. model/alphawann/config.json +11 -0
  23. model/angele/Angele.png +3 -0
  24. model/angele/angele.pth +3 -0
  25. model/angele/config.json +11 -0
  26. model/angele/trained_IVF1260_Flat_nprobe_1_v2.index +3 -0
  27. model/arianagrande/Ariana.png +0 -0
  28. model/arianagrande/added_IVF1033_Flat_nprobe_1_v2.index +3 -0
  29. model/arianagrande/arianagrande.pth +3 -0
  30. model/arianagrande/config.json +11 -0
  31. model/leto/Leto.png +3 -0
  32. model/leto/added_IVF408_Flat_nprobe_1_v2.index +3 -0
  33. model/leto/config.json +11 -0
  34. model/leto/leto.pth +3 -0
  35. model/macmiller/added_IVF2124_Flat_nprobe_1_v2.index +3 -0
  36. model/macmiller/config.json +11 -0
  37. model/macmiller/macmiller.png +3 -0
  38. model/macmiller/macmillerv3.pth +3 -0
  39. model/mickaeljackson/Mickael.png +0 -0
  40. model/mickaeljackson/added_IVF1448_Flat_nprobe_1_v2.index +3 -0
  41. model/mickaeljackson/config.json +11 -0
  42. model/mickaeljackson/michael-jackson.pth +3 -0
  43. model/oboy/Oboy.png +0 -0
  44. model/oboy/added_IVF419_Flat_nprobe_1_v2.index +3 -0
  45. model/oboy/config.json +11 -0
  46. model/oboy/oboy.pth +3 -0
  47. model/zamdane/added_IVF966_Flat_nprobe_1_v2.index +3 -0
  48. model/zamdane/config.json +11 -0
  49. model/zamdane/zamdane.png +0 -0
  50. model/zamdane/zamdane.pth +3 -0
.gitattributes ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ *.index filter=lfs diff=lfs merge=lfs -text
36
+ model/macmiller/macmiller.png filter=lfs diff=lfs merge=lfs -text
37
+ model/hamza/Hamza.png filter=lfs diff=lfs merge=lfs -text
38
+ model/gambino/Hamza.png filter=lfs diff=lfs merge=lfs -text
39
+ model/angele/Angele.png filter=lfs diff=lfs merge=lfs -text
40
+ model/leto/Leto.png filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 liujing04
4
+ Copyright (c) 2023 源文雨
5
+ Copyright (c) 2023 on9.moe Webslaves
6
+
7
+ 本软件及其相关代码以MIT协议开源,作者不对软件具备任何控制力,使用软件者、传播软件导出的声音者自负全责。
8
+ 如不认可该条款,则不能使用或引用软件包内任何代码和文件。
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15
+
16
+ 特此授予任何获得本软件和相关文档文件(以下简称“软件”)副本的人免费使用、复制、修改、合并、出版、分发、再授权和/或销售本软件的权利,以及授予本软件所提供的人使用本软件的权利,但须符合以下条件:
17
+ 上述版权声明和本许可声明应包含在软件的所有副本或实质部分中。
18
+ 软件是“按原样”提供的,没有任何明示或暗示的保证,包括但不限于适销性、适用于特定目的和不侵权的保证。在任何情况下,作者或版权持有人均不承担因软件或软件的使用或其他交易而产生、产生或与之相关的任何索赔、损害赔偿或其他责任,无论是在合同诉讼、侵权诉讼还是其他诉讼中。
19
+
20
+ 相关引用库协议如下:
21
+ #################
22
+ ContentVec
23
+ https://github.com/auspicious3000/contentvec/blob/main/LICENSE
24
+ MIT License
25
+ #################
26
+ VITS
27
+ https://github.com/jaywalnut310/vits/blob/main/LICENSE
28
+ MIT License
29
+ #################
30
+ HIFIGAN
31
+ https://github.com/jik876/hifi-gan/blob/master/LICENSE
32
+ MIT License
33
+ #################
34
+ gradio
35
+ https://github.com/gradio-app/gradio/blob/main/LICENSE
36
+ Apache License 2.0
37
+ #################
38
+ ffmpeg
39
+ https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv3
40
+ https://github.com/BtbN/FFmpeg-Builds/releases/download/autobuild-2021-02-28-12-32/ffmpeg-n4.3.2-160-gfbb9368226-win64-lgpl-4.3.zip
41
+ LPGLv3 License
42
+ MIT License
43
+ #################
44
+ ultimatevocalremovergui
45
+ https://github.com/Anjok07/ultimatevocalremovergui/blob/master/LICENSE
46
+ https://github.com/yang123qwe/vocal_separation_by_uvr5
47
+ MIT License
48
+ #################
49
+ audio-slicer
50
+ https://github.com/openvpi/audio-slicer/blob/main/LICENSE
51
+ MIT License
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: VoiceChange
3
+ emoji: 👀
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 3.28.3
8
+ app_file: app_multi.py
9
+ pinned: false
10
+ license: mit
11
+ duplicated_from: BartPoint/VoiceChange
12
+ ---
app_multi.py ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
+ from argparse import ArgumentParser
4
+
5
+ import asyncio
6
+ import json
7
+ import hashlib
8
+ from os import path, getenv
9
+
10
+ import gradio as gr
11
+
12
+ import torch
13
+
14
+ import numpy as np
15
+ import librosa
16
+
17
+ import edge_tts
18
+
19
+ import config
20
+ import util
21
+ from infer_pack.models import (
22
+ SynthesizerTrnMs768NSFsid,
23
+ SynthesizerTrnMs768NSFsid_nono
24
+ )
25
+ from vc_infer_pipeline import VC
26
+
27
+ # Reference: https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L21 # noqa
28
+ in_hf_space = getenv('SYSTEM') == 'spaces'
29
+
30
+ # Argument parsing
31
+ arg_parser = ArgumentParser()
32
+ arg_parser.add_argument(
33
+ '--hubert',
34
+ default=getenv('RVC_HUBERT', 'hubert_base.pt'),
35
+ help='path to hubert base model (default: hubert_base.pt)'
36
+ )
37
+ arg_parser.add_argument(
38
+ '--config',
39
+ default=getenv('RVC_MULTI_CFG', 'multi_config.json'),
40
+ help='path to config file (default: multi_config.json)'
41
+ )
42
+ arg_parser.add_argument(
43
+ '--api',
44
+ action='store_true',
45
+ help='enable api endpoint'
46
+ )
47
+ arg_parser.add_argument(
48
+ '--cache-examples',
49
+ action='store_true',
50
+ help='enable example caching, please remember delete gradio_cached_examples folder when example config has been modified' # noqa
51
+ )
52
+ args = arg_parser.parse_args()
53
+
54
+ app_css = '''
55
+ #model_info img {
56
+ max-width: 100px;
57
+ max-height: 100px;
58
+ float: right;
59
+ }
60
+
61
+ #model_info p {
62
+ margin: unset;
63
+ }
64
+ '''
65
+
66
+ app = gr.Blocks(
67
+ theme=gr.themes.Soft(primary_hue="orange", secondary_hue="slate"),
68
+ css=app_css,
69
+ analytics_enabled=False
70
+ )
71
+
72
+ # Load hubert model
73
+ hubert_model = util.load_hubert_model(config.device, args.hubert)
74
+ hubert_model.eval()
75
+
76
+ # Load models
77
+ multi_cfg = json.load(open(args.config, 'r'))
78
+ loaded_models = []
79
+
80
+ for model_name in multi_cfg.get('models'):
81
+ print(f'Loading model: {model_name}')
82
+
83
+ # Load model info
84
+ model_info = json.load(
85
+ open(path.join('model', model_name, 'config.json'), 'r')
86
+ )
87
+
88
+ # Load RVC checkpoint
89
+ cpt = torch.load(
90
+ path.join('model', model_name, model_info['model']),
91
+ map_location='cpu'
92
+ )
93
+ tgt_sr = cpt['config'][-1]
94
+ cpt['config'][-3] = cpt['weight']['emb_g.weight'].shape[0] # n_spk
95
+
96
+ if_f0 = cpt.get('f0', 1)
97
+ net_g: Union[SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono]
98
+ if if_f0 == 1:
99
+ net_g = SynthesizerTrnMs768NSFsid(
100
+ *cpt['config'],
101
+ is_half=util.is_half(config.device)
102
+ )
103
+ else:
104
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt['config'])
105
+
106
+ del net_g.enc_q
107
+
108
+ # According to original code, this thing seems necessary.
109
+ print(net_g.load_state_dict(cpt['weight'], strict=False))
110
+
111
+ net_g.eval().to(config.device)
112
+ net_g = net_g.half() if util.is_half(config.device) else net_g.float()
113
+
114
+ vc = VC(tgt_sr, config)
115
+
116
+ loaded_models.append(dict(
117
+ name=model_name,
118
+ metadata=model_info,
119
+ vc=vc,
120
+ net_g=net_g,
121
+ if_f0=if_f0,
122
+ target_sr=tgt_sr
123
+ ))
124
+
125
+ print(f'Models loaded: {len(loaded_models)}')
126
+
127
+ # Edge TTS speakers
128
+ tts_speakers_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices()) # noqa
129
+
130
+
131
+ # https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer-web.py#L118 # noqa
132
+ def vc_func(
133
+ input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
134
+ filter_radius, rms_mix_rate, resample_option
135
+ ):
136
+ if input_audio is None:
137
+ return (None, 'Please provide input audio.')
138
+
139
+ if model_index is None:
140
+ return (None, 'Please select a model.')
141
+
142
+ model = loaded_models[model_index]
143
+
144
+ # Reference: so-vits
145
+ (audio_samp, audio_npy) = input_audio
146
+
147
+ # https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L49
148
+ # Can be change well, we will see
149
+ if (audio_npy.shape[0] / audio_samp) > 320 and in_hf_space:
150
+ return (None, 'Input audio is longer than 60 secs.')
151
+
152
+ # Bloody hell: https://stackoverflow.com/questions/26921836/
153
+ if audio_npy.dtype != np.float32: # :thonk:
154
+ audio_npy = (
155
+ audio_npy / np.iinfo(audio_npy.dtype).max
156
+ ).astype(np.float32)
157
+
158
+ if len(audio_npy.shape) > 1:
159
+ audio_npy = librosa.to_mono(audio_npy.transpose(1, 0))
160
+
161
+ if audio_samp != 16000:
162
+ audio_npy = librosa.resample(
163
+ audio_npy,
164
+ orig_sr=audio_samp,
165
+ target_sr=16000
166
+ )
167
+
168
+ pitch_int = int(pitch_adjust)
169
+
170
+ resample = (
171
+ 0 if resample_option == 'Disable resampling'
172
+ else int(resample_option)
173
+ )
174
+
175
+ times = [0, 0, 0]
176
+
177
+ checksum = hashlib.sha512()
178
+ checksum.update(audio_npy.tobytes())
179
+
180
+ output_audio = model['vc'].pipeline(
181
+ hubert_model,
182
+ model['net_g'],
183
+ model['metadata'].get('speaker_id', 0),
184
+ audio_npy,
185
+ checksum.hexdigest(),
186
+ times,
187
+ pitch_int,
188
+ f0_method,
189
+ path.join('model', model['name'], model['metadata']['feat_index']),
190
+ feat_ratio,
191
+ model['if_f0'],
192
+ filter_radius,
193
+ model['target_sr'],
194
+ resample,
195
+ rms_mix_rate,
196
+ 'v2'
197
+ )
198
+
199
+ out_sr = (
200
+ resample if resample >= 16000 and model['target_sr'] != resample
201
+ else model['target_sr']
202
+ )
203
+
204
+ print(f'npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s')
205
+ return ((out_sr, output_audio), 'Success')
206
+
207
+
208
+ async def edge_tts_vc_func(
209
+ input_text, model_index, tts_speaker, pitch_adjust, f0_method, feat_ratio,
210
+ filter_radius, rms_mix_rate, resample_option
211
+ ):
212
+ if input_text is None:
213
+ return (None, 'Please provide TTS text.')
214
+
215
+ if tts_speaker is None:
216
+ return (None, 'Please select TTS speaker.')
217
+
218
+ if model_index is None:
219
+ return (None, 'Please select a model.')
220
+
221
+ speaker = tts_speakers_list[tts_speaker]['ShortName']
222
+ (tts_np, tts_sr) = await util.call_edge_tts(speaker, input_text)
223
+ return vc_func(
224
+ (tts_sr, tts_np),
225
+ model_index,
226
+ pitch_adjust,
227
+ f0_method,
228
+ feat_ratio,
229
+ filter_radius,
230
+ rms_mix_rate,
231
+ resample_option
232
+ )
233
+
234
+
235
+ def update_model_info(model_index):
236
+ if model_index is None:
237
+ return str(
238
+ '### Model info\n'
239
+ 'Please select a model from dropdown above.'
240
+ )
241
+
242
+ model = loaded_models[model_index]
243
+ model_icon = model['metadata'].get('icon', '')
244
+
245
+ return str(
246
+ '### Model info\n'
247
+ '![model icon]({icon})'
248
+ '**{name}**\n\n'
249
+ 'Author: {author}\n\n'
250
+ 'Source: {source}\n\n'
251
+ '{note}'
252
+ ).format(
253
+ name=model['metadata'].get('name'),
254
+ author=model['metadata'].get('author', 'Anonymous'),
255
+ source=model['metadata'].get('source', 'Unknown'),
256
+ note=model['metadata'].get('note', ''),
257
+ icon=(
258
+ model_icon
259
+ if model_icon.startswith(('http://', 'https://'))
260
+ else '/file/model/%s/%s' % (model['name'], model_icon)
261
+ )
262
+ )
263
+
264
+
265
+ def _example_vc(
266
+ input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
267
+ filter_radius, rms_mix_rate, resample_option
268
+ ):
269
+ (audio, message) = vc_func(
270
+ input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
271
+ filter_radius, rms_mix_rate, resample_option
272
+ )
273
+ return (
274
+ audio,
275
+ message,
276
+ update_model_info(model_index)
277
+ )
278
+
279
+
280
+ async def _example_edge_tts(
281
+ input_text, model_index, tts_speaker, pitch_adjust, f0_method, feat_ratio,
282
+ filter_radius, rms_mix_rate, resample_option
283
+ ):
284
+ (audio, message) = await edge_tts_vc_func(
285
+ input_text, model_index, tts_speaker, pitch_adjust, f0_method,
286
+ feat_ratio, filter_radius, rms_mix_rate, resample_option
287
+ )
288
+ return (
289
+ audio,
290
+ message,
291
+ update_model_info(model_index)
292
+ )
293
+
294
+
295
+ with app:
296
+ gr.Markdown(
297
+ '## A simplistic Web interface\n'
298
+ 'RVC interface, project based on [RVC-WebUI](https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI)' # thx noqa
299
+ 'A lot of inspiration from what\'s already out there, including [zomehwh/rvc-models](https://huggingface.co/spaces/zomehwh/rvc-models) & [DJQmUKV/rvc-inference](https://huggingface.co/spaces/DJQmUKV/rvc-inference).\n ' # thx noqa
300
+ )
301
+
302
+ with gr.Row():
303
+ with gr.Column():
304
+ with gr.Tab('Audio conversion'):
305
+ input_audio = gr.Audio(label='Input audio')
306
+
307
+ vc_convert_btn = gr.Button('Convert', variant='primary')
308
+
309
+ with gr.Tab('TTS conversion'):
310
+ tts_input = gr.TextArea(
311
+ label='TTS input text'
312
+ )
313
+ tts_speaker = gr.Dropdown(
314
+ [
315
+ '%s (%s)' % (
316
+ s['FriendlyName'],
317
+ s['Gender']
318
+ )
319
+ for s in tts_speakers_list
320
+ ],
321
+ label='TTS speaker',
322
+ type='index'
323
+ )
324
+
325
+ tts_convert_btn = gr.Button('Convert', variant='primary')
326
+
327
+ pitch_adjust = gr.Slider(
328
+ label='Pitch',
329
+ minimum=-24,
330
+ maximum=24,
331
+ step=1,
332
+ value=0
333
+ )
334
+ f0_method = gr.Radio(
335
+ label='f0 methods',
336
+ choices=['pm', 'harvest', 'crepe'],
337
+ value='pm',
338
+ interactive=True
339
+ )
340
+
341
+ with gr.Accordion('Advanced options', open=False):
342
+ feat_ratio = gr.Slider(
343
+ label='Feature ratio',
344
+ minimum=0,
345
+ maximum=1,
346
+ step=0.1,
347
+ value=0.6
348
+ )
349
+ filter_radius = gr.Slider(
350
+ label='Filter radius',
351
+ minimum=0,
352
+ maximum=7,
353
+ step=1,
354
+ value=3
355
+ )
356
+ rms_mix_rate = gr.Slider(
357
+ label='Volume envelope mix rate',
358
+ minimum=0,
359
+ maximum=1,
360
+ step=0.1,
361
+ value=1
362
+ )
363
+ resample_rate = gr.Dropdown(
364
+ [
365
+ 'Disable resampling',
366
+ '16000',
367
+ '22050',
368
+ '44100',
369
+ '48000'
370
+ ],
371
+ label='Resample rate',
372
+ value='Disable resampling'
373
+ )
374
+
375
+ with gr.Column():
376
+ # Model select
377
+ model_index = gr.Dropdown(
378
+ [
379
+ '%s - %s' % (
380
+ m['metadata'].get('source', 'Unknown'),
381
+ m['metadata'].get('name')
382
+ )
383
+ for m in loaded_models
384
+ ],
385
+ label='Model',
386
+ type='index'
387
+ )
388
+
389
+ # Model info
390
+ with gr.Box():
391
+ model_info = gr.Markdown(
392
+ '### Model info\n'
393
+ 'Please select a model from dropdown above.',
394
+ elem_id='model_info'
395
+ )
396
+
397
+ output_audio = gr.Audio(label='Output audio')
398
+ output_msg = gr.Textbox(label='Output message')
399
+
400
+ multi_examples = multi_cfg.get('examples')
401
+ if (
402
+ multi_examples and
403
+ multi_examples.get('vc') and multi_examples.get('tts_vc')
404
+ ):
405
+ with gr.Accordion('Sweet sweet examples', open=False):
406
+ with gr.Row():
407
+ # VC Example
408
+ if multi_examples.get('vc'):
409
+ gr.Examples(
410
+ label='Audio conversion examples',
411
+ examples=multi_examples.get('vc'),
412
+ inputs=[
413
+ input_audio, model_index, pitch_adjust, f0_method,
414
+ feat_ratio
415
+ ],
416
+ outputs=[output_audio, output_msg, model_info],
417
+ fn=_example_vc,
418
+ cache_examples=args.cache_examples,
419
+ run_on_click=args.cache_examples
420
+ )
421
+
422
+ # Edge TTS Example
423
+ if multi_examples.get('tts_vc'):
424
+ gr.Examples(
425
+ label='TTS conversion examples',
426
+ examples=multi_examples.get('tts_vc'),
427
+ inputs=[
428
+ tts_input, model_index, tts_speaker, pitch_adjust,
429
+ f0_method, feat_ratio
430
+ ],
431
+ outputs=[output_audio, output_msg, model_info],
432
+ fn=_example_edge_tts,
433
+ cache_examples=args.cache_examples,
434
+ run_on_click=args.cache_examples
435
+ )
436
+
437
+ vc_convert_btn.click(
438
+ vc_func,
439
+ [
440
+ input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
441
+ filter_radius, rms_mix_rate, resample_rate
442
+ ],
443
+ [output_audio, output_msg],
444
+ api_name='audio_conversion'
445
+ )
446
+
447
+ tts_convert_btn.click(
448
+ edge_tts_vc_func,
449
+ [
450
+ tts_input, model_index, tts_speaker, pitch_adjust, f0_method,
451
+ feat_ratio, filter_radius, rms_mix_rate, resample_rate
452
+ ],
453
+ [output_audio, output_msg],
454
+ api_name='tts_conversion'
455
+ )
456
+
457
+ model_index.change(
458
+ update_model_info,
459
+ inputs=[model_index],
460
+ outputs=[model_info],
461
+ show_progress=False,
462
+ queue=False
463
+ )
464
+
465
+ app.queue(
466
+ concurrency_count=1,
467
+ max_size=20,
468
+ api_open=args.api
469
+ ).launch()
config.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ import util
4
+
5
+ device = (
6
+ 'cuda:0' if torch.cuda.is_available()
7
+ else (
8
+ 'mps' if util.has_mps()
9
+ else 'cpu'
10
+ )
11
+ )
12
+ is_half = util.is_half(device)
13
+
14
+ x_pad = 3 if is_half else 1
15
+ x_query = 10 if is_half else 6
16
+ x_center = 60 if is_half else 38
17
+ x_max = 65 if is_half else 41
hubert_base.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f54b40fd2802423a5643779c4861af1e9ee9c1564dc9d32f54f20b5ffba7db96
3
+ size 189507909
infer_pack/attentions.py ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from infer_pack import commons
9
+ from infer_pack import modules
10
+ from infer_pack.modules import LayerNorm
11
+
12
+
13
+ class Encoder(nn.Module):
14
+ def __init__(
15
+ self,
16
+ hidden_channels,
17
+ filter_channels,
18
+ n_heads,
19
+ n_layers,
20
+ kernel_size=1,
21
+ p_dropout=0.0,
22
+ window_size=10,
23
+ **kwargs
24
+ ):
25
+ super().__init__()
26
+ self.hidden_channels = hidden_channels
27
+ self.filter_channels = filter_channels
28
+ self.n_heads = n_heads
29
+ self.n_layers = n_layers
30
+ self.kernel_size = kernel_size
31
+ self.p_dropout = p_dropout
32
+ self.window_size = window_size
33
+
34
+ self.drop = nn.Dropout(p_dropout)
35
+ self.attn_layers = nn.ModuleList()
36
+ self.norm_layers_1 = nn.ModuleList()
37
+ self.ffn_layers = nn.ModuleList()
38
+ self.norm_layers_2 = nn.ModuleList()
39
+ for i in range(self.n_layers):
40
+ self.attn_layers.append(
41
+ MultiHeadAttention(
42
+ hidden_channels,
43
+ hidden_channels,
44
+ n_heads,
45
+ p_dropout=p_dropout,
46
+ window_size=window_size,
47
+ )
48
+ )
49
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
50
+ self.ffn_layers.append(
51
+ FFN(
52
+ hidden_channels,
53
+ hidden_channels,
54
+ filter_channels,
55
+ kernel_size,
56
+ p_dropout=p_dropout,
57
+ )
58
+ )
59
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
60
+
61
+ def forward(self, x, x_mask):
62
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
63
+ x = x * x_mask
64
+ for i in range(self.n_layers):
65
+ y = self.attn_layers[i](x, x, attn_mask)
66
+ y = self.drop(y)
67
+ x = self.norm_layers_1[i](x + y)
68
+
69
+ y = self.ffn_layers[i](x, x_mask)
70
+ y = self.drop(y)
71
+ x = self.norm_layers_2[i](x + y)
72
+ x = x * x_mask
73
+ return x
74
+
75
+
76
+ class Decoder(nn.Module):
77
+ def __init__(
78
+ self,
79
+ hidden_channels,
80
+ filter_channels,
81
+ n_heads,
82
+ n_layers,
83
+ kernel_size=1,
84
+ p_dropout=0.0,
85
+ proximal_bias=False,
86
+ proximal_init=True,
87
+ **kwargs
88
+ ):
89
+ super().__init__()
90
+ self.hidden_channels = hidden_channels
91
+ self.filter_channels = filter_channels
92
+ self.n_heads = n_heads
93
+ self.n_layers = n_layers
94
+ self.kernel_size = kernel_size
95
+ self.p_dropout = p_dropout
96
+ self.proximal_bias = proximal_bias
97
+ self.proximal_init = proximal_init
98
+
99
+ self.drop = nn.Dropout(p_dropout)
100
+ self.self_attn_layers = nn.ModuleList()
101
+ self.norm_layers_0 = nn.ModuleList()
102
+ self.encdec_attn_layers = nn.ModuleList()
103
+ self.norm_layers_1 = nn.ModuleList()
104
+ self.ffn_layers = nn.ModuleList()
105
+ self.norm_layers_2 = nn.ModuleList()
106
+ for i in range(self.n_layers):
107
+ self.self_attn_layers.append(
108
+ MultiHeadAttention(
109
+ hidden_channels,
110
+ hidden_channels,
111
+ n_heads,
112
+ p_dropout=p_dropout,
113
+ proximal_bias=proximal_bias,
114
+ proximal_init=proximal_init,
115
+ )
116
+ )
117
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
118
+ self.encdec_attn_layers.append(
119
+ MultiHeadAttention(
120
+ hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
121
+ )
122
+ )
123
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
124
+ self.ffn_layers.append(
125
+ FFN(
126
+ hidden_channels,
127
+ hidden_channels,
128
+ filter_channels,
129
+ kernel_size,
130
+ p_dropout=p_dropout,
131
+ causal=True,
132
+ )
133
+ )
134
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
135
+
136
+ def forward(self, x, x_mask, h, h_mask):
137
+ """
138
+ x: decoder input
139
+ h: encoder output
140
+ """
141
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
142
+ device=x.device, dtype=x.dtype
143
+ )
144
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
145
+ x = x * x_mask
146
+ for i in range(self.n_layers):
147
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
148
+ y = self.drop(y)
149
+ x = self.norm_layers_0[i](x + y)
150
+
151
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
152
+ y = self.drop(y)
153
+ x = self.norm_layers_1[i](x + y)
154
+
155
+ y = self.ffn_layers[i](x, x_mask)
156
+ y = self.drop(y)
157
+ x = self.norm_layers_2[i](x + y)
158
+ x = x * x_mask
159
+ return x
160
+
161
+
162
+ class MultiHeadAttention(nn.Module):
163
+ def __init__(
164
+ self,
165
+ channels,
166
+ out_channels,
167
+ n_heads,
168
+ p_dropout=0.0,
169
+ window_size=None,
170
+ heads_share=True,
171
+ block_length=None,
172
+ proximal_bias=False,
173
+ proximal_init=False,
174
+ ):
175
+ super().__init__()
176
+ assert channels % n_heads == 0
177
+
178
+ self.channels = channels
179
+ self.out_channels = out_channels
180
+ self.n_heads = n_heads
181
+ self.p_dropout = p_dropout
182
+ self.window_size = window_size
183
+ self.heads_share = heads_share
184
+ self.block_length = block_length
185
+ self.proximal_bias = proximal_bias
186
+ self.proximal_init = proximal_init
187
+ self.attn = None
188
+
189
+ self.k_channels = channels // n_heads
190
+ self.conv_q = nn.Conv1d(channels, channels, 1)
191
+ self.conv_k = nn.Conv1d(channels, channels, 1)
192
+ self.conv_v = nn.Conv1d(channels, channels, 1)
193
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
194
+ self.drop = nn.Dropout(p_dropout)
195
+
196
+ if window_size is not None:
197
+ n_heads_rel = 1 if heads_share else n_heads
198
+ rel_stddev = self.k_channels**-0.5
199
+ self.emb_rel_k = nn.Parameter(
200
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
201
+ * rel_stddev
202
+ )
203
+ self.emb_rel_v = nn.Parameter(
204
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
205
+ * rel_stddev
206
+ )
207
+
208
+ nn.init.xavier_uniform_(self.conv_q.weight)
209
+ nn.init.xavier_uniform_(self.conv_k.weight)
210
+ nn.init.xavier_uniform_(self.conv_v.weight)
211
+ if proximal_init:
212
+ with torch.no_grad():
213
+ self.conv_k.weight.copy_(self.conv_q.weight)
214
+ self.conv_k.bias.copy_(self.conv_q.bias)
215
+
216
+ def forward(self, x, c, attn_mask=None):
217
+ q = self.conv_q(x)
218
+ k = self.conv_k(c)
219
+ v = self.conv_v(c)
220
+
221
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
222
+
223
+ x = self.conv_o(x)
224
+ return x
225
+
226
+ def attention(self, query, key, value, mask=None):
227
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
228
+ b, d, t_s, t_t = (*key.size(), query.size(2))
229
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
230
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
231
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
232
+
233
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
234
+ if self.window_size is not None:
235
+ assert (
236
+ t_s == t_t
237
+ ), "Relative attention is only available for self-attention."
238
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
239
+ rel_logits = self._matmul_with_relative_keys(
240
+ query / math.sqrt(self.k_channels), key_relative_embeddings
241
+ )
242
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
243
+ scores = scores + scores_local
244
+ if self.proximal_bias:
245
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
246
+ scores = scores + self._attention_bias_proximal(t_s).to(
247
+ device=scores.device, dtype=scores.dtype
248
+ )
249
+ if mask is not None:
250
+ scores = scores.masked_fill(mask == 0, -1e4)
251
+ if self.block_length is not None:
252
+ assert (
253
+ t_s == t_t
254
+ ), "Local attention is only available for self-attention."
255
+ block_mask = (
256
+ torch.ones_like(scores)
257
+ .triu(-self.block_length)
258
+ .tril(self.block_length)
259
+ )
260
+ scores = scores.masked_fill(block_mask == 0, -1e4)
261
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
262
+ p_attn = self.drop(p_attn)
263
+ output = torch.matmul(p_attn, value)
264
+ if self.window_size is not None:
265
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
266
+ value_relative_embeddings = self._get_relative_embeddings(
267
+ self.emb_rel_v, t_s
268
+ )
269
+ output = output + self._matmul_with_relative_values(
270
+ relative_weights, value_relative_embeddings
271
+ )
272
+ output = (
273
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
274
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
275
+ return output, p_attn
276
+
277
+ def _matmul_with_relative_values(self, x, y):
278
+ """
279
+ x: [b, h, l, m]
280
+ y: [h or 1, m, d]
281
+ ret: [b, h, l, d]
282
+ """
283
+ ret = torch.matmul(x, y.unsqueeze(0))
284
+ return ret
285
+
286
+ def _matmul_with_relative_keys(self, x, y):
287
+ """
288
+ x: [b, h, l, d]
289
+ y: [h or 1, m, d]
290
+ ret: [b, h, l, m]
291
+ """
292
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
293
+ return ret
294
+
295
+ def _get_relative_embeddings(self, relative_embeddings, length):
296
+ max_relative_position = 2 * self.window_size + 1
297
+ # Pad first before slice to avoid using cond ops.
298
+ pad_length = max(length - (self.window_size + 1), 0)
299
+ slice_start_position = max((self.window_size + 1) - length, 0)
300
+ slice_end_position = slice_start_position + 2 * length - 1
301
+ if pad_length > 0:
302
+ padded_relative_embeddings = F.pad(
303
+ relative_embeddings,
304
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
305
+ )
306
+ else:
307
+ padded_relative_embeddings = relative_embeddings
308
+ used_relative_embeddings = padded_relative_embeddings[
309
+ :, slice_start_position:slice_end_position
310
+ ]
311
+ return used_relative_embeddings
312
+
313
+ def _relative_position_to_absolute_position(self, x):
314
+ """
315
+ x: [b, h, l, 2*l-1]
316
+ ret: [b, h, l, l]
317
+ """
318
+ batch, heads, length, _ = x.size()
319
+ # Concat columns of pad to shift from relative to absolute indexing.
320
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
321
+
322
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
323
+ x_flat = x.view([batch, heads, length * 2 * length])
324
+ x_flat = F.pad(
325
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
326
+ )
327
+
328
+ # Reshape and slice out the padded elements.
329
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
330
+ :, :, :length, length - 1 :
331
+ ]
332
+ return x_final
333
+
334
+ def _absolute_position_to_relative_position(self, x):
335
+ """
336
+ x: [b, h, l, l]
337
+ ret: [b, h, l, 2*l-1]
338
+ """
339
+ batch, heads, length, _ = x.size()
340
+ # padd along column
341
+ x = F.pad(
342
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
343
+ )
344
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
345
+ # add 0's in the beginning that will skew the elements after reshape
346
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
347
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
348
+ return x_final
349
+
350
+ def _attention_bias_proximal(self, length):
351
+ """Bias for self-attention to encourage attention to close positions.
352
+ Args:
353
+ length: an integer scalar.
354
+ Returns:
355
+ a Tensor with shape [1, 1, length, length]
356
+ """
357
+ r = torch.arange(length, dtype=torch.float32)
358
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
359
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
360
+
361
+
362
+ class FFN(nn.Module):
363
+ def __init__(
364
+ self,
365
+ in_channels,
366
+ out_channels,
367
+ filter_channels,
368
+ kernel_size,
369
+ p_dropout=0.0,
370
+ activation=None,
371
+ causal=False,
372
+ ):
373
+ super().__init__()
374
+ self.in_channels = in_channels
375
+ self.out_channels = out_channels
376
+ self.filter_channels = filter_channels
377
+ self.kernel_size = kernel_size
378
+ self.p_dropout = p_dropout
379
+ self.activation = activation
380
+ self.causal = causal
381
+
382
+ if causal:
383
+ self.padding = self._causal_padding
384
+ else:
385
+ self.padding = self._same_padding
386
+
387
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
388
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
389
+ self.drop = nn.Dropout(p_dropout)
390
+
391
+ def forward(self, x, x_mask):
392
+ x = self.conv_1(self.padding(x * x_mask))
393
+ if self.activation == "gelu":
394
+ x = x * torch.sigmoid(1.702 * x)
395
+ else:
396
+ x = torch.relu(x)
397
+ x = self.drop(x)
398
+ x = self.conv_2(self.padding(x * x_mask))
399
+ return x * x_mask
400
+
401
+ def _causal_padding(self, x):
402
+ if self.kernel_size == 1:
403
+ return x
404
+ pad_l = self.kernel_size - 1
405
+ pad_r = 0
406
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
407
+ x = F.pad(x, commons.convert_pad_shape(padding))
408
+ return x
409
+
410
+ def _same_padding(self, x):
411
+ if self.kernel_size == 1:
412
+ return x
413
+ pad_l = (self.kernel_size - 1) // 2
414
+ pad_r = self.kernel_size // 2
415
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
416
+ x = F.pad(x, commons.convert_pad_shape(padding))
417
+ return x
infer_pack/commons.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+
8
+ def init_weights(m, mean=0.0, std=0.01):
9
+ classname = m.__class__.__name__
10
+ if classname.find("Conv") != -1:
11
+ m.weight.data.normal_(mean, std)
12
+
13
+
14
+ def get_padding(kernel_size, dilation=1):
15
+ return int((kernel_size * dilation - dilation) / 2)
16
+
17
+
18
+ def convert_pad_shape(pad_shape):
19
+ l = pad_shape[::-1]
20
+ pad_shape = [item for sublist in l for item in sublist]
21
+ return pad_shape
22
+
23
+
24
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
25
+ """KL(P||Q)"""
26
+ kl = (logs_q - logs_p) - 0.5
27
+ kl += (
28
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
29
+ )
30
+ return kl
31
+
32
+
33
+ def rand_gumbel(shape):
34
+ """Sample from the Gumbel distribution, protect from overflows."""
35
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
36
+ return -torch.log(-torch.log(uniform_samples))
37
+
38
+
39
+ def rand_gumbel_like(x):
40
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
41
+ return g
42
+
43
+
44
+ def slice_segments(x, ids_str, segment_size=4):
45
+ ret = torch.zeros_like(x[:, :, :segment_size])
46
+ for i in range(x.size(0)):
47
+ idx_str = ids_str[i]
48
+ idx_end = idx_str + segment_size
49
+ ret[i] = x[i, :, idx_str:idx_end]
50
+ return ret
51
+
52
+
53
+ def slice_segments2(x, ids_str, segment_size=4):
54
+ ret = torch.zeros_like(x[:, :segment_size])
55
+ for i in range(x.size(0)):
56
+ idx_str = ids_str[i]
57
+ idx_end = idx_str + segment_size
58
+ ret[i] = x[i, idx_str:idx_end]
59
+ return ret
60
+
61
+
62
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
63
+ b, d, t = x.size()
64
+ if x_lengths is None:
65
+ x_lengths = t
66
+ ids_str_max = x_lengths - segment_size + 1
67
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
68
+ ret = slice_segments(x, ids_str, segment_size)
69
+ return ret, ids_str
70
+
71
+
72
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
73
+ position = torch.arange(length, dtype=torch.float)
74
+ num_timescales = channels // 2
75
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
76
+ num_timescales - 1
77
+ )
78
+ inv_timescales = min_timescale * torch.exp(
79
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
80
+ )
81
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
82
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
83
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
84
+ signal = signal.view(1, channels, length)
85
+ return signal
86
+
87
+
88
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
89
+ b, channels, length = x.size()
90
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
91
+ return x + signal.to(dtype=x.dtype, device=x.device)
92
+
93
+
94
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
95
+ b, channels, length = x.size()
96
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
97
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
98
+
99
+
100
+ def subsequent_mask(length):
101
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
102
+ return mask
103
+
104
+
105
+ @torch.jit.script
106
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
107
+ n_channels_int = n_channels[0]
108
+ in_act = input_a + input_b
109
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
110
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
111
+ acts = t_act * s_act
112
+ return acts
113
+
114
+
115
+ def convert_pad_shape(pad_shape):
116
+ l = pad_shape[::-1]
117
+ pad_shape = [item for sublist in l for item in sublist]
118
+ return pad_shape
119
+
120
+
121
+ def shift_1d(x):
122
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
123
+ return x
124
+
125
+
126
+ def sequence_mask(length, max_length=None):
127
+ if max_length is None:
128
+ max_length = length.max()
129
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
130
+ return x.unsqueeze(0) < length.unsqueeze(1)
131
+
132
+
133
+ def generate_path(duration, mask):
134
+ """
135
+ duration: [b, 1, t_x]
136
+ mask: [b, 1, t_y, t_x]
137
+ """
138
+ device = duration.device
139
+
140
+ b, _, t_y, t_x = mask.shape
141
+ cum_duration = torch.cumsum(duration, -1)
142
+
143
+ cum_duration_flat = cum_duration.view(b * t_x)
144
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
145
+ path = path.view(b, t_x, t_y)
146
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
147
+ path = path.unsqueeze(1).transpose(2, 3) * mask
148
+ return path
149
+
150
+
151
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
152
+ if isinstance(parameters, torch.Tensor):
153
+ parameters = [parameters]
154
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
155
+ norm_type = float(norm_type)
156
+ if clip_value is not None:
157
+ clip_value = float(clip_value)
158
+
159
+ total_norm = 0
160
+ for p in parameters:
161
+ param_norm = p.grad.data.norm(norm_type)
162
+ total_norm += param_norm.item() ** norm_type
163
+ if clip_value is not None:
164
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
165
+ total_norm = total_norm ** (1.0 / norm_type)
166
+ return total_norm
infer_pack/models.py ADDED
@@ -0,0 +1,1124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math, pdb, os
2
+ from time import time as ttime
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from infer_pack import modules
7
+ from infer_pack import attentions
8
+ from infer_pack import commons
9
+ from infer_pack.commons import init_weights, get_padding
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from infer_pack.commons import init_weights
13
+ import numpy as np
14
+ from infer_pack import commons
15
+
16
+
17
+ class TextEncoder256(nn.Module):
18
+ def __init__(
19
+ self,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout,
27
+ f0=True,
28
+ ):
29
+ super().__init__()
30
+ self.out_channels = out_channels
31
+ self.hidden_channels = hidden_channels
32
+ self.filter_channels = filter_channels
33
+ self.n_heads = n_heads
34
+ self.n_layers = n_layers
35
+ self.kernel_size = kernel_size
36
+ self.p_dropout = p_dropout
37
+ self.emb_phone = nn.Linear(256, hidden_channels)
38
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
+ if f0 == True:
40
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
+ self.encoder = attentions.Encoder(
42
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
+ )
44
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
+
46
+ def forward(self, phone, pitch, lengths):
47
+ if pitch == None:
48
+ x = self.emb_phone(phone)
49
+ else:
50
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
+ x = self.lrelu(x)
53
+ x = torch.transpose(x, 1, -1) # [b, h, t]
54
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
+ x.dtype
56
+ )
57
+ x = self.encoder(x * x_mask, x_mask)
58
+ stats = self.proj(x) * x_mask
59
+
60
+ m, logs = torch.split(stats, self.out_channels, dim=1)
61
+ return m, logs, x_mask
62
+
63
+
64
+ class TextEncoder768(nn.Module):
65
+ def __init__(
66
+ self,
67
+ out_channels,
68
+ hidden_channels,
69
+ filter_channels,
70
+ n_heads,
71
+ n_layers,
72
+ kernel_size,
73
+ p_dropout,
74
+ f0=True,
75
+ ):
76
+ super().__init__()
77
+ self.out_channels = out_channels
78
+ self.hidden_channels = hidden_channels
79
+ self.filter_channels = filter_channels
80
+ self.n_heads = n_heads
81
+ self.n_layers = n_layers
82
+ self.kernel_size = kernel_size
83
+ self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(768, hidden_channels)
85
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
+ if f0 == True:
87
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
+ self.encoder = attentions.Encoder(
89
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
+ )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
+
93
+ def forward(self, phone, pitch, lengths):
94
+ if pitch == None:
95
+ x = self.emb_phone(phone)
96
+ else:
97
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
+ x = self.lrelu(x)
100
+ x = torch.transpose(x, 1, -1) # [b, h, t]
101
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
+ x.dtype
103
+ )
104
+ x = self.encoder(x * x_mask, x_mask)
105
+ stats = self.proj(x) * x_mask
106
+
107
+ m, logs = torch.split(stats, self.out_channels, dim=1)
108
+ return m, logs, x_mask
109
+
110
+
111
+ class ResidualCouplingBlock(nn.Module):
112
+ def __init__(
113
+ self,
114
+ channels,
115
+ hidden_channels,
116
+ kernel_size,
117
+ dilation_rate,
118
+ n_layers,
119
+ n_flows=4,
120
+ gin_channels=0,
121
+ ):
122
+ super().__init__()
123
+ self.channels = channels
124
+ self.hidden_channels = hidden_channels
125
+ self.kernel_size = kernel_size
126
+ self.dilation_rate = dilation_rate
127
+ self.n_layers = n_layers
128
+ self.n_flows = n_flows
129
+ self.gin_channels = gin_channels
130
+
131
+ self.flows = nn.ModuleList()
132
+ for i in range(n_flows):
133
+ self.flows.append(
134
+ modules.ResidualCouplingLayer(
135
+ channels,
136
+ hidden_channels,
137
+ kernel_size,
138
+ dilation_rate,
139
+ n_layers,
140
+ gin_channels=gin_channels,
141
+ mean_only=True,
142
+ )
143
+ )
144
+ self.flows.append(modules.Flip())
145
+
146
+ def forward(self, x, x_mask, g=None, reverse=False):
147
+ if not reverse:
148
+ for flow in self.flows:
149
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
+ else:
151
+ for flow in reversed(self.flows):
152
+ x = flow(x, x_mask, g=g, reverse=reverse)
153
+ return x
154
+
155
+ def remove_weight_norm(self):
156
+ for i in range(self.n_flows):
157
+ self.flows[i * 2].remove_weight_norm()
158
+
159
+
160
+ class PosteriorEncoder(nn.Module):
161
+ def __init__(
162
+ self,
163
+ in_channels,
164
+ out_channels,
165
+ hidden_channels,
166
+ kernel_size,
167
+ dilation_rate,
168
+ n_layers,
169
+ gin_channels=0,
170
+ ):
171
+ super().__init__()
172
+ self.in_channels = in_channels
173
+ self.out_channels = out_channels
174
+ self.hidden_channels = hidden_channels
175
+ self.kernel_size = kernel_size
176
+ self.dilation_rate = dilation_rate
177
+ self.n_layers = n_layers
178
+ self.gin_channels = gin_channels
179
+
180
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
+ self.enc = modules.WN(
182
+ hidden_channels,
183
+ kernel_size,
184
+ dilation_rate,
185
+ n_layers,
186
+ gin_channels=gin_channels,
187
+ )
188
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
+
190
+ def forward(self, x, x_lengths, g=None):
191
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
+ x.dtype
193
+ )
194
+ x = self.pre(x) * x_mask
195
+ x = self.enc(x, x_mask, g=g)
196
+ stats = self.proj(x) * x_mask
197
+ m, logs = torch.split(stats, self.out_channels, dim=1)
198
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
+ return z, m, logs, x_mask
200
+
201
+ def remove_weight_norm(self):
202
+ self.enc.remove_weight_norm()
203
+
204
+
205
+ class Generator(torch.nn.Module):
206
+ def __init__(
207
+ self,
208
+ initial_channel,
209
+ resblock,
210
+ resblock_kernel_sizes,
211
+ resblock_dilation_sizes,
212
+ upsample_rates,
213
+ upsample_initial_channel,
214
+ upsample_kernel_sizes,
215
+ gin_channels=0,
216
+ ):
217
+ super(Generator, self).__init__()
218
+ self.num_kernels = len(resblock_kernel_sizes)
219
+ self.num_upsamples = len(upsample_rates)
220
+ self.conv_pre = Conv1d(
221
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
222
+ )
223
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
+
225
+ self.ups = nn.ModuleList()
226
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
+ self.ups.append(
228
+ weight_norm(
229
+ ConvTranspose1d(
230
+ upsample_initial_channel // (2**i),
231
+ upsample_initial_channel // (2 ** (i + 1)),
232
+ k,
233
+ u,
234
+ padding=(k - u) // 2,
235
+ )
236
+ )
237
+ )
238
+
239
+ self.resblocks = nn.ModuleList()
240
+ for i in range(len(self.ups)):
241
+ ch = upsample_initial_channel // (2 ** (i + 1))
242
+ for j, (k, d) in enumerate(
243
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
+ ):
245
+ self.resblocks.append(resblock(ch, k, d))
246
+
247
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
+ self.ups.apply(init_weights)
249
+
250
+ if gin_channels != 0:
251
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
+
253
+ def forward(self, x, g=None):
254
+ x = self.conv_pre(x)
255
+ if g is not None:
256
+ x = x + self.cond(g)
257
+
258
+ for i in range(self.num_upsamples):
259
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
+ x = self.ups[i](x)
261
+ xs = None
262
+ for j in range(self.num_kernels):
263
+ if xs is None:
264
+ xs = self.resblocks[i * self.num_kernels + j](x)
265
+ else:
266
+ xs += self.resblocks[i * self.num_kernels + j](x)
267
+ x = xs / self.num_kernels
268
+ x = F.leaky_relu(x)
269
+ x = self.conv_post(x)
270
+ x = torch.tanh(x)
271
+
272
+ return x
273
+
274
+ def remove_weight_norm(self):
275
+ for l in self.ups:
276
+ remove_weight_norm(l)
277
+ for l in self.resblocks:
278
+ l.remove_weight_norm()
279
+
280
+
281
+ class SineGen(torch.nn.Module):
282
+ """Definition of sine generator
283
+ SineGen(samp_rate, harmonic_num = 0,
284
+ sine_amp = 0.1, noise_std = 0.003,
285
+ voiced_threshold = 0,
286
+ flag_for_pulse=False)
287
+ samp_rate: sampling rate in Hz
288
+ harmonic_num: number of harmonic overtones (default 0)
289
+ sine_amp: amplitude of sine-wavefrom (default 0.1)
290
+ noise_std: std of Gaussian noise (default 0.003)
291
+ voiced_thoreshold: F0 threshold for U/V classification (default 0)
292
+ flag_for_pulse: this SinGen is used inside PulseGen (default False)
293
+ Note: when flag_for_pulse is True, the first time step of a voiced
294
+ segment is always sin(np.pi) or cos(0)
295
+ """
296
+
297
+ def __init__(
298
+ self,
299
+ samp_rate,
300
+ harmonic_num=0,
301
+ sine_amp=0.1,
302
+ noise_std=0.003,
303
+ voiced_threshold=0,
304
+ flag_for_pulse=False,
305
+ ):
306
+ super(SineGen, self).__init__()
307
+ self.sine_amp = sine_amp
308
+ self.noise_std = noise_std
309
+ self.harmonic_num = harmonic_num
310
+ self.dim = self.harmonic_num + 1
311
+ self.sampling_rate = samp_rate
312
+ self.voiced_threshold = voiced_threshold
313
+
314
+ def _f02uv(self, f0):
315
+ # generate uv signal
316
+ uv = torch.ones_like(f0)
317
+ uv = uv * (f0 > self.voiced_threshold)
318
+ return uv
319
+
320
+ def forward(self, f0, upp):
321
+ """sine_tensor, uv = forward(f0)
322
+ input F0: tensor(batchsize=1, length, dim=1)
323
+ f0 for unvoiced steps should be 0
324
+ output sine_tensor: tensor(batchsize=1, length, dim)
325
+ output uv: tensor(batchsize=1, length, 1)
326
+ """
327
+ with torch.no_grad():
328
+ f0 = f0[:, None].transpose(1, 2)
329
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330
+ # fundamental component
331
+ f0_buf[:, :, 0] = f0[:, :, 0]
332
+ for idx in np.arange(self.harmonic_num):
333
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334
+ idx + 2
335
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336
+ rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化
337
+ rand_ini = torch.rand(
338
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
339
+ )
340
+ rand_ini[:, 0] = 0
341
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
342
+ tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化
343
+ tmp_over_one *= upp
344
+ tmp_over_one = F.interpolate(
345
+ tmp_over_one.transpose(2, 1),
346
+ scale_factor=upp,
347
+ mode="linear",
348
+ align_corners=True,
349
+ ).transpose(2, 1)
350
+ rad_values = F.interpolate(
351
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
352
+ ).transpose(
353
+ 2, 1
354
+ ) #######
355
+ tmp_over_one %= 1
356
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
357
+ cumsum_shift = torch.zeros_like(rad_values)
358
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
359
+ sine_waves = torch.sin(
360
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
361
+ )
362
+ sine_waves = sine_waves * self.sine_amp
363
+ uv = self._f02uv(f0)
364
+ uv = F.interpolate(
365
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
366
+ ).transpose(2, 1)
367
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
368
+ noise = noise_amp * torch.randn_like(sine_waves)
369
+ sine_waves = sine_waves * uv + noise
370
+ return sine_waves, uv, noise
371
+
372
+
373
+ class SourceModuleHnNSF(torch.nn.Module):
374
+ """SourceModule for hn-nsf
375
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
376
+ add_noise_std=0.003, voiced_threshod=0)
377
+ sampling_rate: sampling_rate in Hz
378
+ harmonic_num: number of harmonic above F0 (default: 0)
379
+ sine_amp: amplitude of sine source signal (default: 0.1)
380
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
381
+ note that amplitude of noise in unvoiced is decided
382
+ by sine_amp
383
+ voiced_threshold: threhold to set U/V given F0 (default: 0)
384
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
385
+ F0_sampled (batchsize, length, 1)
386
+ Sine_source (batchsize, length, 1)
387
+ noise_source (batchsize, length 1)
388
+ uv (batchsize, length, 1)
389
+ """
390
+
391
+ def __init__(
392
+ self,
393
+ sampling_rate,
394
+ harmonic_num=0,
395
+ sine_amp=0.1,
396
+ add_noise_std=0.003,
397
+ voiced_threshod=0,
398
+ is_half=True,
399
+ ):
400
+ super(SourceModuleHnNSF, self).__init__()
401
+
402
+ self.sine_amp = sine_amp
403
+ self.noise_std = add_noise_std
404
+ self.is_half = is_half
405
+ # to produce sine waveforms
406
+ self.l_sin_gen = SineGen(
407
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
408
+ )
409
+
410
+ # to merge source harmonics into a single excitation
411
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
412
+ self.l_tanh = torch.nn.Tanh()
413
+
414
+ def forward(self, x, upp=None):
415
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
416
+ if self.is_half:
417
+ sine_wavs = sine_wavs.half()
418
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
419
+ return sine_merge, None, None # noise, uv
420
+
421
+
422
+ class GeneratorNSF(torch.nn.Module):
423
+ def __init__(
424
+ self,
425
+ initial_channel,
426
+ resblock,
427
+ resblock_kernel_sizes,
428
+ resblock_dilation_sizes,
429
+ upsample_rates,
430
+ upsample_initial_channel,
431
+ upsample_kernel_sizes,
432
+ gin_channels,
433
+ sr,
434
+ is_half=False,
435
+ ):
436
+ super(GeneratorNSF, self).__init__()
437
+ self.num_kernels = len(resblock_kernel_sizes)
438
+ self.num_upsamples = len(upsample_rates)
439
+
440
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
441
+ self.m_source = SourceModuleHnNSF(
442
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
443
+ )
444
+ self.noise_convs = nn.ModuleList()
445
+ self.conv_pre = Conv1d(
446
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
447
+ )
448
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
449
+
450
+ self.ups = nn.ModuleList()
451
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
452
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
453
+ self.ups.append(
454
+ weight_norm(
455
+ ConvTranspose1d(
456
+ upsample_initial_channel // (2**i),
457
+ upsample_initial_channel // (2 ** (i + 1)),
458
+ k,
459
+ u,
460
+ padding=(k - u) // 2,
461
+ )
462
+ )
463
+ )
464
+ if i + 1 < len(upsample_rates):
465
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
466
+ self.noise_convs.append(
467
+ Conv1d(
468
+ 1,
469
+ c_cur,
470
+ kernel_size=stride_f0 * 2,
471
+ stride=stride_f0,
472
+ padding=stride_f0 // 2,
473
+ )
474
+ )
475
+ else:
476
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
477
+
478
+ self.resblocks = nn.ModuleList()
479
+ for i in range(len(self.ups)):
480
+ ch = upsample_initial_channel // (2 ** (i + 1))
481
+ for j, (k, d) in enumerate(
482
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
483
+ ):
484
+ self.resblocks.append(resblock(ch, k, d))
485
+
486
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
487
+ self.ups.apply(init_weights)
488
+
489
+ if gin_channels != 0:
490
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
491
+
492
+ self.upp = np.prod(upsample_rates)
493
+
494
+ def forward(self, x, f0, g=None):
495
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
496
+ har_source = har_source.transpose(1, 2)
497
+ x = self.conv_pre(x)
498
+ if g is not None:
499
+ x = x + self.cond(g)
500
+
501
+ for i in range(self.num_upsamples):
502
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
503
+ x = self.ups[i](x)
504
+ x_source = self.noise_convs[i](har_source)
505
+ x = x + x_source
506
+ xs = None
507
+ for j in range(self.num_kernels):
508
+ if xs is None:
509
+ xs = self.resblocks[i * self.num_kernels + j](x)
510
+ else:
511
+ xs += self.resblocks[i * self.num_kernels + j](x)
512
+ x = xs / self.num_kernels
513
+ x = F.leaky_relu(x)
514
+ x = self.conv_post(x)
515
+ x = torch.tanh(x)
516
+ return x
517
+
518
+ def remove_weight_norm(self):
519
+ for l in self.ups:
520
+ remove_weight_norm(l)
521
+ for l in self.resblocks:
522
+ l.remove_weight_norm()
523
+
524
+
525
+ sr2sr = {
526
+ "32k": 32000,
527
+ "40k": 40000,
528
+ "48k": 48000,
529
+ }
530
+
531
+
532
+ class SynthesizerTrnMs256NSFsid(nn.Module):
533
+ def __init__(
534
+ self,
535
+ spec_channels,
536
+ segment_size,
537
+ inter_channels,
538
+ hidden_channels,
539
+ filter_channels,
540
+ n_heads,
541
+ n_layers,
542
+ kernel_size,
543
+ p_dropout,
544
+ resblock,
545
+ resblock_kernel_sizes,
546
+ resblock_dilation_sizes,
547
+ upsample_rates,
548
+ upsample_initial_channel,
549
+ upsample_kernel_sizes,
550
+ spk_embed_dim,
551
+ gin_channels,
552
+ sr,
553
+ **kwargs
554
+ ):
555
+ super().__init__()
556
+ if type(sr) == type("strr"):
557
+ sr = sr2sr[sr]
558
+ self.spec_channels = spec_channels
559
+ self.inter_channels = inter_channels
560
+ self.hidden_channels = hidden_channels
561
+ self.filter_channels = filter_channels
562
+ self.n_heads = n_heads
563
+ self.n_layers = n_layers
564
+ self.kernel_size = kernel_size
565
+ self.p_dropout = p_dropout
566
+ self.resblock = resblock
567
+ self.resblock_kernel_sizes = resblock_kernel_sizes
568
+ self.resblock_dilation_sizes = resblock_dilation_sizes
569
+ self.upsample_rates = upsample_rates
570
+ self.upsample_initial_channel = upsample_initial_channel
571
+ self.upsample_kernel_sizes = upsample_kernel_sizes
572
+ self.segment_size = segment_size
573
+ self.gin_channels = gin_channels
574
+ # self.hop_length = hop_length#
575
+ self.spk_embed_dim = spk_embed_dim
576
+ self.enc_p = TextEncoder256(
577
+ inter_channels,
578
+ hidden_channels,
579
+ filter_channels,
580
+ n_heads,
581
+ n_layers,
582
+ kernel_size,
583
+ p_dropout,
584
+ )
585
+ self.dec = GeneratorNSF(
586
+ inter_channels,
587
+ resblock,
588
+ resblock_kernel_sizes,
589
+ resblock_dilation_sizes,
590
+ upsample_rates,
591
+ upsample_initial_channel,
592
+ upsample_kernel_sizes,
593
+ gin_channels=gin_channels,
594
+ sr=sr,
595
+ is_half=kwargs["is_half"],
596
+ )
597
+ self.enc_q = PosteriorEncoder(
598
+ spec_channels,
599
+ inter_channels,
600
+ hidden_channels,
601
+ 5,
602
+ 1,
603
+ 16,
604
+ gin_channels=gin_channels,
605
+ )
606
+ self.flow = ResidualCouplingBlock(
607
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
608
+ )
609
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
610
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
611
+
612
+ def remove_weight_norm(self):
613
+ self.dec.remove_weight_norm()
614
+ self.flow.remove_weight_norm()
615
+ self.enc_q.remove_weight_norm()
616
+
617
+ def forward(
618
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
619
+ ): # 这里ds是id,[bs,1]
620
+ # print(1,pitch.shape)#[bs,t]
621
+ g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
622
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
623
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
624
+ z_p = self.flow(z, y_mask, g=g)
625
+ z_slice, ids_slice = commons.rand_slice_segments(
626
+ z, y_lengths, self.segment_size
627
+ )
628
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
629
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
630
+ # print(-2,pitchf.shape,z_slice.shape)
631
+ o = self.dec(z_slice, pitchf, g=g)
632
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
633
+
634
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
635
+ g = self.emb_g(sid).unsqueeze(-1)
636
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
637
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
638
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
639
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
640
+ return o, x_mask, (z, z_p, m_p, logs_p)
641
+
642
+
643
+ class SynthesizerTrnMs768NSFsid(nn.Module):
644
+ def __init__(
645
+ self,
646
+ spec_channels,
647
+ segment_size,
648
+ inter_channels,
649
+ hidden_channels,
650
+ filter_channels,
651
+ n_heads,
652
+ n_layers,
653
+ kernel_size,
654
+ p_dropout,
655
+ resblock,
656
+ resblock_kernel_sizes,
657
+ resblock_dilation_sizes,
658
+ upsample_rates,
659
+ upsample_initial_channel,
660
+ upsample_kernel_sizes,
661
+ spk_embed_dim,
662
+ gin_channels,
663
+ sr,
664
+ **kwargs
665
+ ):
666
+ super().__init__()
667
+ if type(sr) == type("strr"):
668
+ sr = sr2sr[sr]
669
+ self.spec_channels = spec_channels
670
+ self.inter_channels = inter_channels
671
+ self.hidden_channels = hidden_channels
672
+ self.filter_channels = filter_channels
673
+ self.n_heads = n_heads
674
+ self.n_layers = n_layers
675
+ self.kernel_size = kernel_size
676
+ self.p_dropout = p_dropout
677
+ self.resblock = resblock
678
+ self.resblock_kernel_sizes = resblock_kernel_sizes
679
+ self.resblock_dilation_sizes = resblock_dilation_sizes
680
+ self.upsample_rates = upsample_rates
681
+ self.upsample_initial_channel = upsample_initial_channel
682
+ self.upsample_kernel_sizes = upsample_kernel_sizes
683
+ self.segment_size = segment_size
684
+ self.gin_channels = gin_channels
685
+ # self.hop_length = hop_length#
686
+ self.spk_embed_dim = spk_embed_dim
687
+ self.enc_p = TextEncoder768(
688
+ inter_channels,
689
+ hidden_channels,
690
+ filter_channels,
691
+ n_heads,
692
+ n_layers,
693
+ kernel_size,
694
+ p_dropout,
695
+ )
696
+ self.dec = GeneratorNSF(
697
+ inter_channels,
698
+ resblock,
699
+ resblock_kernel_sizes,
700
+ resblock_dilation_sizes,
701
+ upsample_rates,
702
+ upsample_initial_channel,
703
+ upsample_kernel_sizes,
704
+ gin_channels=gin_channels,
705
+ sr=sr,
706
+ is_half=kwargs["is_half"],
707
+ )
708
+ self.enc_q = PosteriorEncoder(
709
+ spec_channels,
710
+ inter_channels,
711
+ hidden_channels,
712
+ 5,
713
+ 1,
714
+ 16,
715
+ gin_channels=gin_channels,
716
+ )
717
+ self.flow = ResidualCouplingBlock(
718
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
719
+ )
720
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
721
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
722
+
723
+ def remove_weight_norm(self):
724
+ self.dec.remove_weight_norm()
725
+ self.flow.remove_weight_norm()
726
+ self.enc_q.remove_weight_norm()
727
+
728
+ def forward(
729
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
730
+ ): # 这里ds是id,[bs,1]
731
+ # print(1,pitch.shape)#[bs,t]
732
+ g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
733
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
734
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
735
+ z_p = self.flow(z, y_mask, g=g)
736
+ z_slice, ids_slice = commons.rand_slice_segments(
737
+ z, y_lengths, self.segment_size
738
+ )
739
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
740
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
741
+ # print(-2,pitchf.shape,z_slice.shape)
742
+ o = self.dec(z_slice, pitchf, g=g)
743
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
744
+
745
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
746
+ g = self.emb_g(sid).unsqueeze(-1)
747
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
748
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
749
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
750
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
751
+ return o, x_mask, (z, z_p, m_p, logs_p)
752
+
753
+
754
+ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
755
+ def __init__(
756
+ self,
757
+ spec_channels,
758
+ segment_size,
759
+ inter_channels,
760
+ hidden_channels,
761
+ filter_channels,
762
+ n_heads,
763
+ n_layers,
764
+ kernel_size,
765
+ p_dropout,
766
+ resblock,
767
+ resblock_kernel_sizes,
768
+ resblock_dilation_sizes,
769
+ upsample_rates,
770
+ upsample_initial_channel,
771
+ upsample_kernel_sizes,
772
+ spk_embed_dim,
773
+ gin_channels,
774
+ sr=None,
775
+ **kwargs
776
+ ):
777
+ super().__init__()
778
+ self.spec_channels = spec_channels
779
+ self.inter_channels = inter_channels
780
+ self.hidden_channels = hidden_channels
781
+ self.filter_channels = filter_channels
782
+ self.n_heads = n_heads
783
+ self.n_layers = n_layers
784
+ self.kernel_size = kernel_size
785
+ self.p_dropout = p_dropout
786
+ self.resblock = resblock
787
+ self.resblock_kernel_sizes = resblock_kernel_sizes
788
+ self.resblock_dilation_sizes = resblock_dilation_sizes
789
+ self.upsample_rates = upsample_rates
790
+ self.upsample_initial_channel = upsample_initial_channel
791
+ self.upsample_kernel_sizes = upsample_kernel_sizes
792
+ self.segment_size = segment_size
793
+ self.gin_channels = gin_channels
794
+ # self.hop_length = hop_length#
795
+ self.spk_embed_dim = spk_embed_dim
796
+ self.enc_p = TextEncoder256(
797
+ inter_channels,
798
+ hidden_channels,
799
+ filter_channels,
800
+ n_heads,
801
+ n_layers,
802
+ kernel_size,
803
+ p_dropout,
804
+ f0=False,
805
+ )
806
+ self.dec = Generator(
807
+ inter_channels,
808
+ resblock,
809
+ resblock_kernel_sizes,
810
+ resblock_dilation_sizes,
811
+ upsample_rates,
812
+ upsample_initial_channel,
813
+ upsample_kernel_sizes,
814
+ gin_channels=gin_channels,
815
+ )
816
+ self.enc_q = PosteriorEncoder(
817
+ spec_channels,
818
+ inter_channels,
819
+ hidden_channels,
820
+ 5,
821
+ 1,
822
+ 16,
823
+ gin_channels=gin_channels,
824
+ )
825
+ self.flow = ResidualCouplingBlock(
826
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
827
+ )
828
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
829
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
830
+
831
+ def remove_weight_norm(self):
832
+ self.dec.remove_weight_norm()
833
+ self.flow.remove_weight_norm()
834
+ self.enc_q.remove_weight_norm()
835
+
836
+ def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1]
837
+ g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
838
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
839
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
840
+ z_p = self.flow(z, y_mask, g=g)
841
+ z_slice, ids_slice = commons.rand_slice_segments(
842
+ z, y_lengths, self.segment_size
843
+ )
844
+ o = self.dec(z_slice, g=g)
845
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
846
+
847
+ def infer(self, phone, phone_lengths, sid, max_len=None):
848
+ g = self.emb_g(sid).unsqueeze(-1)
849
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
850
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
851
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
852
+ o = self.dec((z * x_mask)[:, :, :max_len], g=g)
853
+ return o, x_mask, (z, z_p, m_p, logs_p)
854
+
855
+
856
+ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
857
+ def __init__(
858
+ self,
859
+ spec_channels,
860
+ segment_size,
861
+ inter_channels,
862
+ hidden_channels,
863
+ filter_channels,
864
+ n_heads,
865
+ n_layers,
866
+ kernel_size,
867
+ p_dropout,
868
+ resblock,
869
+ resblock_kernel_sizes,
870
+ resblock_dilation_sizes,
871
+ upsample_rates,
872
+ upsample_initial_channel,
873
+ upsample_kernel_sizes,
874
+ spk_embed_dim,
875
+ gin_channels,
876
+ sr=None,
877
+ **kwargs
878
+ ):
879
+ super().__init__()
880
+ self.spec_channels = spec_channels
881
+ self.inter_channels = inter_channels
882
+ self.hidden_channels = hidden_channels
883
+ self.filter_channels = filter_channels
884
+ self.n_heads = n_heads
885
+ self.n_layers = n_layers
886
+ self.kernel_size = kernel_size
887
+ self.p_dropout = p_dropout
888
+ self.resblock = resblock
889
+ self.resblock_kernel_sizes = resblock_kernel_sizes
890
+ self.resblock_dilation_sizes = resblock_dilation_sizes
891
+ self.upsample_rates = upsample_rates
892
+ self.upsample_initial_channel = upsample_initial_channel
893
+ self.upsample_kernel_sizes = upsample_kernel_sizes
894
+ self.segment_size = segment_size
895
+ self.gin_channels = gin_channels
896
+ # self.hop_length = hop_length#
897
+ self.spk_embed_dim = spk_embed_dim
898
+ self.enc_p = TextEncoder768(
899
+ inter_channels,
900
+ hidden_channels,
901
+ filter_channels,
902
+ n_heads,
903
+ n_layers,
904
+ kernel_size,
905
+ p_dropout,
906
+ f0=False,
907
+ )
908
+ self.dec = Generator(
909
+ inter_channels,
910
+ resblock,
911
+ resblock_kernel_sizes,
912
+ resblock_dilation_sizes,
913
+ upsample_rates,
914
+ upsample_initial_channel,
915
+ upsample_kernel_sizes,
916
+ gin_channels=gin_channels,
917
+ )
918
+ self.enc_q = PosteriorEncoder(
919
+ spec_channels,
920
+ inter_channels,
921
+ hidden_channels,
922
+ 5,
923
+ 1,
924
+ 16,
925
+ gin_channels=gin_channels,
926
+ )
927
+ self.flow = ResidualCouplingBlock(
928
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
929
+ )
930
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
931
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
932
+
933
+ def remove_weight_norm(self):
934
+ self.dec.remove_weight_norm()
935
+ self.flow.remove_weight_norm()
936
+ self.enc_q.remove_weight_norm()
937
+
938
+ def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1]
939
+ g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
940
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
941
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
942
+ z_p = self.flow(z, y_mask, g=g)
943
+ z_slice, ids_slice = commons.rand_slice_segments(
944
+ z, y_lengths, self.segment_size
945
+ )
946
+ o = self.dec(z_slice, g=g)
947
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
948
+
949
+ def infer(self, phone, phone_lengths, sid, max_len=None):
950
+ g = self.emb_g(sid).unsqueeze(-1)
951
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
952
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
953
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
954
+ o = self.dec((z * x_mask)[:, :, :max_len], g=g)
955
+ return o, x_mask, (z, z_p, m_p, logs_p)
956
+
957
+
958
+ class MultiPeriodDiscriminator(torch.nn.Module):
959
+ def __init__(self, use_spectral_norm=False):
960
+ super(MultiPeriodDiscriminator, self).__init__()
961
+ periods = [2, 3, 5, 7, 11, 17]
962
+ # periods = [3, 5, 7, 11, 17, 23, 37]
963
+
964
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
965
+ discs = discs + [
966
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
967
+ ]
968
+ self.discriminators = nn.ModuleList(discs)
969
+
970
+ def forward(self, y, y_hat):
971
+ y_d_rs = [] #
972
+ y_d_gs = []
973
+ fmap_rs = []
974
+ fmap_gs = []
975
+ for i, d in enumerate(self.discriminators):
976
+ y_d_r, fmap_r = d(y)
977
+ y_d_g, fmap_g = d(y_hat)
978
+ # for j in range(len(fmap_r)):
979
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
980
+ y_d_rs.append(y_d_r)
981
+ y_d_gs.append(y_d_g)
982
+ fmap_rs.append(fmap_r)
983
+ fmap_gs.append(fmap_g)
984
+
985
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
986
+
987
+
988
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
989
+ def __init__(self, use_spectral_norm=False):
990
+ super(MultiPeriodDiscriminatorV2, self).__init__()
991
+ # periods = [2, 3, 5, 7, 11, 17]
992
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
993
+
994
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
995
+ discs = discs + [
996
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
997
+ ]
998
+ self.discriminators = nn.ModuleList(discs)
999
+
1000
+ def forward(self, y, y_hat):
1001
+ y_d_rs = [] #
1002
+ y_d_gs = []
1003
+ fmap_rs = []
1004
+ fmap_gs = []
1005
+ for i, d in enumerate(self.discriminators):
1006
+ y_d_r, fmap_r = d(y)
1007
+ y_d_g, fmap_g = d(y_hat)
1008
+ # for j in range(len(fmap_r)):
1009
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1010
+ y_d_rs.append(y_d_r)
1011
+ y_d_gs.append(y_d_g)
1012
+ fmap_rs.append(fmap_r)
1013
+ fmap_gs.append(fmap_g)
1014
+
1015
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1016
+
1017
+
1018
+ class DiscriminatorS(torch.nn.Module):
1019
+ def __init__(self, use_spectral_norm=False):
1020
+ super(DiscriminatorS, self).__init__()
1021
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1022
+ self.convs = nn.ModuleList(
1023
+ [
1024
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
1025
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
1026
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
1027
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
1028
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
1029
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
1030
+ ]
1031
+ )
1032
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
1033
+
1034
+ def forward(self, x):
1035
+ fmap = []
1036
+
1037
+ for l in self.convs:
1038
+ x = l(x)
1039
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1040
+ fmap.append(x)
1041
+ x = self.conv_post(x)
1042
+ fmap.append(x)
1043
+ x = torch.flatten(x, 1, -1)
1044
+
1045
+ return x, fmap
1046
+
1047
+
1048
+ class DiscriminatorP(torch.nn.Module):
1049
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
1050
+ super(DiscriminatorP, self).__init__()
1051
+ self.period = period
1052
+ self.use_spectral_norm = use_spectral_norm
1053
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1054
+ self.convs = nn.ModuleList(
1055
+ [
1056
+ norm_f(
1057
+ Conv2d(
1058
+ 1,
1059
+ 32,
1060
+ (kernel_size, 1),
1061
+ (stride, 1),
1062
+ padding=(get_padding(kernel_size, 1), 0),
1063
+ )
1064
+ ),
1065
+ norm_f(
1066
+ Conv2d(
1067
+ 32,
1068
+ 128,
1069
+ (kernel_size, 1),
1070
+ (stride, 1),
1071
+ padding=(get_padding(kernel_size, 1), 0),
1072
+ )
1073
+ ),
1074
+ norm_f(
1075
+ Conv2d(
1076
+ 128,
1077
+ 512,
1078
+ (kernel_size, 1),
1079
+ (stride, 1),
1080
+ padding=(get_padding(kernel_size, 1), 0),
1081
+ )
1082
+ ),
1083
+ norm_f(
1084
+ Conv2d(
1085
+ 512,
1086
+ 1024,
1087
+ (kernel_size, 1),
1088
+ (stride, 1),
1089
+ padding=(get_padding(kernel_size, 1), 0),
1090
+ )
1091
+ ),
1092
+ norm_f(
1093
+ Conv2d(
1094
+ 1024,
1095
+ 1024,
1096
+ (kernel_size, 1),
1097
+ 1,
1098
+ padding=(get_padding(kernel_size, 1), 0),
1099
+ )
1100
+ ),
1101
+ ]
1102
+ )
1103
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
1104
+
1105
+ def forward(self, x):
1106
+ fmap = []
1107
+
1108
+ # 1d to 2d
1109
+ b, c, t = x.shape
1110
+ if t % self.period != 0: # pad first
1111
+ n_pad = self.period - (t % self.period)
1112
+ x = F.pad(x, (0, n_pad), "reflect")
1113
+ t = t + n_pad
1114
+ x = x.view(b, c, t // self.period, self.period)
1115
+
1116
+ for l in self.convs:
1117
+ x = l(x)
1118
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1119
+ fmap.append(x)
1120
+ x = self.conv_post(x)
1121
+ fmap.append(x)
1122
+ x = torch.flatten(x, 1, -1)
1123
+
1124
+ return x, fmap
infer_pack/models_onnx.py ADDED
@@ -0,0 +1,818 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math, pdb, os
2
+ from time import time as ttime
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from infer_pack import modules
7
+ from infer_pack import attentions
8
+ from infer_pack import commons
9
+ from infer_pack.commons import init_weights, get_padding
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from infer_pack.commons import init_weights
13
+ import numpy as np
14
+ from infer_pack import commons
15
+
16
+
17
+ class TextEncoder256(nn.Module):
18
+ def __init__(
19
+ self,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout,
27
+ f0=True,
28
+ ):
29
+ super().__init__()
30
+ self.out_channels = out_channels
31
+ self.hidden_channels = hidden_channels
32
+ self.filter_channels = filter_channels
33
+ self.n_heads = n_heads
34
+ self.n_layers = n_layers
35
+ self.kernel_size = kernel_size
36
+ self.p_dropout = p_dropout
37
+ self.emb_phone = nn.Linear(256, hidden_channels)
38
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
+ if f0 == True:
40
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
+ self.encoder = attentions.Encoder(
42
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
+ )
44
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
+
46
+ def forward(self, phone, pitch, lengths):
47
+ if pitch == None:
48
+ x = self.emb_phone(phone)
49
+ else:
50
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
+ x = self.lrelu(x)
53
+ x = torch.transpose(x, 1, -1) # [b, h, t]
54
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
+ x.dtype
56
+ )
57
+ x = self.encoder(x * x_mask, x_mask)
58
+ stats = self.proj(x) * x_mask
59
+
60
+ m, logs = torch.split(stats, self.out_channels, dim=1)
61
+ return m, logs, x_mask
62
+
63
+
64
+ class TextEncoder768(nn.Module):
65
+ def __init__(
66
+ self,
67
+ out_channels,
68
+ hidden_channels,
69
+ filter_channels,
70
+ n_heads,
71
+ n_layers,
72
+ kernel_size,
73
+ p_dropout,
74
+ f0=True,
75
+ ):
76
+ super().__init__()
77
+ self.out_channels = out_channels
78
+ self.hidden_channels = hidden_channels
79
+ self.filter_channels = filter_channels
80
+ self.n_heads = n_heads
81
+ self.n_layers = n_layers
82
+ self.kernel_size = kernel_size
83
+ self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(768, hidden_channels)
85
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
+ if f0 == True:
87
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
+ self.encoder = attentions.Encoder(
89
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
+ )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
+
93
+ def forward(self, phone, pitch, lengths):
94
+ if pitch == None:
95
+ x = self.emb_phone(phone)
96
+ else:
97
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
+ x = self.lrelu(x)
100
+ x = torch.transpose(x, 1, -1) # [b, h, t]
101
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
+ x.dtype
103
+ )
104
+ x = self.encoder(x * x_mask, x_mask)
105
+ stats = self.proj(x) * x_mask
106
+
107
+ m, logs = torch.split(stats, self.out_channels, dim=1)
108
+ return m, logs, x_mask
109
+
110
+
111
+ class ResidualCouplingBlock(nn.Module):
112
+ def __init__(
113
+ self,
114
+ channels,
115
+ hidden_channels,
116
+ kernel_size,
117
+ dilation_rate,
118
+ n_layers,
119
+ n_flows=4,
120
+ gin_channels=0,
121
+ ):
122
+ super().__init__()
123
+ self.channels = channels
124
+ self.hidden_channels = hidden_channels
125
+ self.kernel_size = kernel_size
126
+ self.dilation_rate = dilation_rate
127
+ self.n_layers = n_layers
128
+ self.n_flows = n_flows
129
+ self.gin_channels = gin_channels
130
+
131
+ self.flows = nn.ModuleList()
132
+ for i in range(n_flows):
133
+ self.flows.append(
134
+ modules.ResidualCouplingLayer(
135
+ channels,
136
+ hidden_channels,
137
+ kernel_size,
138
+ dilation_rate,
139
+ n_layers,
140
+ gin_channels=gin_channels,
141
+ mean_only=True,
142
+ )
143
+ )
144
+ self.flows.append(modules.Flip())
145
+
146
+ def forward(self, x, x_mask, g=None, reverse=False):
147
+ if not reverse:
148
+ for flow in self.flows:
149
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
+ else:
151
+ for flow in reversed(self.flows):
152
+ x = flow(x, x_mask, g=g, reverse=reverse)
153
+ return x
154
+
155
+ def remove_weight_norm(self):
156
+ for i in range(self.n_flows):
157
+ self.flows[i * 2].remove_weight_norm()
158
+
159
+
160
+ class PosteriorEncoder(nn.Module):
161
+ def __init__(
162
+ self,
163
+ in_channels,
164
+ out_channels,
165
+ hidden_channels,
166
+ kernel_size,
167
+ dilation_rate,
168
+ n_layers,
169
+ gin_channels=0,
170
+ ):
171
+ super().__init__()
172
+ self.in_channels = in_channels
173
+ self.out_channels = out_channels
174
+ self.hidden_channels = hidden_channels
175
+ self.kernel_size = kernel_size
176
+ self.dilation_rate = dilation_rate
177
+ self.n_layers = n_layers
178
+ self.gin_channels = gin_channels
179
+
180
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
+ self.enc = modules.WN(
182
+ hidden_channels,
183
+ kernel_size,
184
+ dilation_rate,
185
+ n_layers,
186
+ gin_channels=gin_channels,
187
+ )
188
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
+
190
+ def forward(self, x, x_lengths, g=None):
191
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
+ x.dtype
193
+ )
194
+ x = self.pre(x) * x_mask
195
+ x = self.enc(x, x_mask, g=g)
196
+ stats = self.proj(x) * x_mask
197
+ m, logs = torch.split(stats, self.out_channels, dim=1)
198
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
+ return z, m, logs, x_mask
200
+
201
+ def remove_weight_norm(self):
202
+ self.enc.remove_weight_norm()
203
+
204
+
205
+ class Generator(torch.nn.Module):
206
+ def __init__(
207
+ self,
208
+ initial_channel,
209
+ resblock,
210
+ resblock_kernel_sizes,
211
+ resblock_dilation_sizes,
212
+ upsample_rates,
213
+ upsample_initial_channel,
214
+ upsample_kernel_sizes,
215
+ gin_channels=0,
216
+ ):
217
+ super(Generator, self).__init__()
218
+ self.num_kernels = len(resblock_kernel_sizes)
219
+ self.num_upsamples = len(upsample_rates)
220
+ self.conv_pre = Conv1d(
221
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
222
+ )
223
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
+
225
+ self.ups = nn.ModuleList()
226
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
+ self.ups.append(
228
+ weight_norm(
229
+ ConvTranspose1d(
230
+ upsample_initial_channel // (2**i),
231
+ upsample_initial_channel // (2 ** (i + 1)),
232
+ k,
233
+ u,
234
+ padding=(k - u) // 2,
235
+ )
236
+ )
237
+ )
238
+
239
+ self.resblocks = nn.ModuleList()
240
+ for i in range(len(self.ups)):
241
+ ch = upsample_initial_channel // (2 ** (i + 1))
242
+ for j, (k, d) in enumerate(
243
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
+ ):
245
+ self.resblocks.append(resblock(ch, k, d))
246
+
247
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
+ self.ups.apply(init_weights)
249
+
250
+ if gin_channels != 0:
251
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
+
253
+ def forward(self, x, g=None):
254
+ x = self.conv_pre(x)
255
+ if g is not None:
256
+ x = x + self.cond(g)
257
+
258
+ for i in range(self.num_upsamples):
259
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
+ x = self.ups[i](x)
261
+ xs = None
262
+ for j in range(self.num_kernels):
263
+ if xs is None:
264
+ xs = self.resblocks[i * self.num_kernels + j](x)
265
+ else:
266
+ xs += self.resblocks[i * self.num_kernels + j](x)
267
+ x = xs / self.num_kernels
268
+ x = F.leaky_relu(x)
269
+ x = self.conv_post(x)
270
+ x = torch.tanh(x)
271
+
272
+ return x
273
+
274
+ def remove_weight_norm(self):
275
+ for l in self.ups:
276
+ remove_weight_norm(l)
277
+ for l in self.resblocks:
278
+ l.remove_weight_norm()
279
+
280
+
281
+ class SineGen(torch.nn.Module):
282
+ """Definition of sine generator
283
+ SineGen(samp_rate, harmonic_num = 0,
284
+ sine_amp = 0.1, noise_std = 0.003,
285
+ voiced_threshold = 0,
286
+ flag_for_pulse=False)
287
+ samp_rate: sampling rate in Hz
288
+ harmonic_num: number of harmonic overtones (default 0)
289
+ sine_amp: amplitude of sine-wavefrom (default 0.1)
290
+ noise_std: std of Gaussian noise (default 0.003)
291
+ voiced_thoreshold: F0 threshold for U/V classification (default 0)
292
+ flag_for_pulse: this SinGen is used inside PulseGen (default False)
293
+ Note: when flag_for_pulse is True, the first time step of a voiced
294
+ segment is always sin(np.pi) or cos(0)
295
+ """
296
+
297
+ def __init__(
298
+ self,
299
+ samp_rate,
300
+ harmonic_num=0,
301
+ sine_amp=0.1,
302
+ noise_std=0.003,
303
+ voiced_threshold=0,
304
+ flag_for_pulse=False,
305
+ ):
306
+ super(SineGen, self).__init__()
307
+ self.sine_amp = sine_amp
308
+ self.noise_std = noise_std
309
+ self.harmonic_num = harmonic_num
310
+ self.dim = self.harmonic_num + 1
311
+ self.sampling_rate = samp_rate
312
+ self.voiced_threshold = voiced_threshold
313
+
314
+ def _f02uv(self, f0):
315
+ # generate uv signal
316
+ uv = torch.ones_like(f0)
317
+ uv = uv * (f0 > self.voiced_threshold)
318
+ return uv
319
+
320
+ def forward(self, f0, upp):
321
+ """sine_tensor, uv = forward(f0)
322
+ input F0: tensor(batchsize=1, length, dim=1)
323
+ f0 for unvoiced steps should be 0
324
+ output sine_tensor: tensor(batchsize=1, length, dim)
325
+ output uv: tensor(batchsize=1, length, 1)
326
+ """
327
+ with torch.no_grad():
328
+ f0 = f0[:, None].transpose(1, 2)
329
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330
+ # fundamental component
331
+ f0_buf[:, :, 0] = f0[:, :, 0]
332
+ for idx in np.arange(self.harmonic_num):
333
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334
+ idx + 2
335
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336
+ rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化
337
+ rand_ini = torch.rand(
338
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
339
+ )
340
+ rand_ini[:, 0] = 0
341
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
342
+ tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化
343
+ tmp_over_one *= upp
344
+ tmp_over_one = F.interpolate(
345
+ tmp_over_one.transpose(2, 1),
346
+ scale_factor=upp,
347
+ mode="linear",
348
+ align_corners=True,
349
+ ).transpose(2, 1)
350
+ rad_values = F.interpolate(
351
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
352
+ ).transpose(
353
+ 2, 1
354
+ ) #######
355
+ tmp_over_one %= 1
356
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
357
+ cumsum_shift = torch.zeros_like(rad_values)
358
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
359
+ sine_waves = torch.sin(
360
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
361
+ )
362
+ sine_waves = sine_waves * self.sine_amp
363
+ uv = self._f02uv(f0)
364
+ uv = F.interpolate(
365
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
366
+ ).transpose(2, 1)
367
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
368
+ noise = noise_amp * torch.randn_like(sine_waves)
369
+ sine_waves = sine_waves * uv + noise
370
+ return sine_waves, uv, noise
371
+
372
+
373
+ class SourceModuleHnNSF(torch.nn.Module):
374
+ """SourceModule for hn-nsf
375
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
376
+ add_noise_std=0.003, voiced_threshod=0)
377
+ sampling_rate: sampling_rate in Hz
378
+ harmonic_num: number of harmonic above F0 (default: 0)
379
+ sine_amp: amplitude of sine source signal (default: 0.1)
380
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
381
+ note that amplitude of noise in unvoiced is decided
382
+ by sine_amp
383
+ voiced_threshold: threhold to set U/V given F0 (default: 0)
384
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
385
+ F0_sampled (batchsize, length, 1)
386
+ Sine_source (batchsize, length, 1)
387
+ noise_source (batchsize, length 1)
388
+ uv (batchsize, length, 1)
389
+ """
390
+
391
+ def __init__(
392
+ self,
393
+ sampling_rate,
394
+ harmonic_num=0,
395
+ sine_amp=0.1,
396
+ add_noise_std=0.003,
397
+ voiced_threshod=0,
398
+ is_half=True,
399
+ ):
400
+ super(SourceModuleHnNSF, self).__init__()
401
+
402
+ self.sine_amp = sine_amp
403
+ self.noise_std = add_noise_std
404
+ self.is_half = is_half
405
+ # to produce sine waveforms
406
+ self.l_sin_gen = SineGen(
407
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
408
+ )
409
+
410
+ # to merge source harmonics into a single excitation
411
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
412
+ self.l_tanh = torch.nn.Tanh()
413
+
414
+ def forward(self, x, upp=None):
415
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
416
+ if self.is_half:
417
+ sine_wavs = sine_wavs.half()
418
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
419
+ return sine_merge, None, None # noise, uv
420
+
421
+
422
+ class GeneratorNSF(torch.nn.Module):
423
+ def __init__(
424
+ self,
425
+ initial_channel,
426
+ resblock,
427
+ resblock_kernel_sizes,
428
+ resblock_dilation_sizes,
429
+ upsample_rates,
430
+ upsample_initial_channel,
431
+ upsample_kernel_sizes,
432
+ gin_channels,
433
+ sr,
434
+ is_half=False,
435
+ ):
436
+ super(GeneratorNSF, self).__init__()
437
+ self.num_kernels = len(resblock_kernel_sizes)
438
+ self.num_upsamples = len(upsample_rates)
439
+
440
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
441
+ self.m_source = SourceModuleHnNSF(
442
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
443
+ )
444
+ self.noise_convs = nn.ModuleList()
445
+ self.conv_pre = Conv1d(
446
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
447
+ )
448
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
449
+
450
+ self.ups = nn.ModuleList()
451
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
452
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
453
+ self.ups.append(
454
+ weight_norm(
455
+ ConvTranspose1d(
456
+ upsample_initial_channel // (2**i),
457
+ upsample_initial_channel // (2 ** (i + 1)),
458
+ k,
459
+ u,
460
+ padding=(k - u) // 2,
461
+ )
462
+ )
463
+ )
464
+ if i + 1 < len(upsample_rates):
465
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
466
+ self.noise_convs.append(
467
+ Conv1d(
468
+ 1,
469
+ c_cur,
470
+ kernel_size=stride_f0 * 2,
471
+ stride=stride_f0,
472
+ padding=stride_f0 // 2,
473
+ )
474
+ )
475
+ else:
476
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
477
+
478
+ self.resblocks = nn.ModuleList()
479
+ for i in range(len(self.ups)):
480
+ ch = upsample_initial_channel // (2 ** (i + 1))
481
+ for j, (k, d) in enumerate(
482
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
483
+ ):
484
+ self.resblocks.append(resblock(ch, k, d))
485
+
486
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
487
+ self.ups.apply(init_weights)
488
+
489
+ if gin_channels != 0:
490
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
491
+
492
+ self.upp = np.prod(upsample_rates)
493
+
494
+ def forward(self, x, f0, g=None):
495
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
496
+ har_source = har_source.transpose(1, 2)
497
+ x = self.conv_pre(x)
498
+ if g is not None:
499
+ x = x + self.cond(g)
500
+
501
+ for i in range(self.num_upsamples):
502
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
503
+ x = self.ups[i](x)
504
+ x_source = self.noise_convs[i](har_source)
505
+ x = x + x_source
506
+ xs = None
507
+ for j in range(self.num_kernels):
508
+ if xs is None:
509
+ xs = self.resblocks[i * self.num_kernels + j](x)
510
+ else:
511
+ xs += self.resblocks[i * self.num_kernels + j](x)
512
+ x = xs / self.num_kernels
513
+ x = F.leaky_relu(x)
514
+ x = self.conv_post(x)
515
+ x = torch.tanh(x)
516
+ return x
517
+
518
+ def remove_weight_norm(self):
519
+ for l in self.ups:
520
+ remove_weight_norm(l)
521
+ for l in self.resblocks:
522
+ l.remove_weight_norm()
523
+
524
+
525
+ sr2sr = {
526
+ "32k": 32000,
527
+ "40k": 40000,
528
+ "48k": 48000,
529
+ }
530
+
531
+
532
+ class SynthesizerTrnMsNSFsidM(nn.Module):
533
+ def __init__(
534
+ self,
535
+ spec_channels,
536
+ segment_size,
537
+ inter_channels,
538
+ hidden_channels,
539
+ filter_channels,
540
+ n_heads,
541
+ n_layers,
542
+ kernel_size,
543
+ p_dropout,
544
+ resblock,
545
+ resblock_kernel_sizes,
546
+ resblock_dilation_sizes,
547
+ upsample_rates,
548
+ upsample_initial_channel,
549
+ upsample_kernel_sizes,
550
+ spk_embed_dim,
551
+ gin_channels,
552
+ sr,
553
+ **kwargs
554
+ ):
555
+ super().__init__()
556
+ if type(sr) == type("strr"):
557
+ sr = sr2sr[sr]
558
+ self.spec_channels = spec_channels
559
+ self.inter_channels = inter_channels
560
+ self.hidden_channels = hidden_channels
561
+ self.filter_channels = filter_channels
562
+ self.n_heads = n_heads
563
+ self.n_layers = n_layers
564
+ self.kernel_size = kernel_size
565
+ self.p_dropout = p_dropout
566
+ self.resblock = resblock
567
+ self.resblock_kernel_sizes = resblock_kernel_sizes
568
+ self.resblock_dilation_sizes = resblock_dilation_sizes
569
+ self.upsample_rates = upsample_rates
570
+ self.upsample_initial_channel = upsample_initial_channel
571
+ self.upsample_kernel_sizes = upsample_kernel_sizes
572
+ self.segment_size = segment_size
573
+ self.gin_channels = gin_channels
574
+ # self.hop_length = hop_length#
575
+ self.spk_embed_dim = spk_embed_dim
576
+ if self.gin_channels == 256:
577
+ self.enc_p = TextEncoder256(
578
+ inter_channels,
579
+ hidden_channels,
580
+ filter_channels,
581
+ n_heads,
582
+ n_layers,
583
+ kernel_size,
584
+ p_dropout,
585
+ )
586
+ else:
587
+ self.enc_p = TextEncoder768(
588
+ inter_channels,
589
+ hidden_channels,
590
+ filter_channels,
591
+ n_heads,
592
+ n_layers,
593
+ kernel_size,
594
+ p_dropout,
595
+ )
596
+ self.dec = GeneratorNSF(
597
+ inter_channels,
598
+ resblock,
599
+ resblock_kernel_sizes,
600
+ resblock_dilation_sizes,
601
+ upsample_rates,
602
+ upsample_initial_channel,
603
+ upsample_kernel_sizes,
604
+ gin_channels=gin_channels,
605
+ sr=sr,
606
+ is_half=kwargs["is_half"],
607
+ )
608
+ self.enc_q = PosteriorEncoder(
609
+ spec_channels,
610
+ inter_channels,
611
+ hidden_channels,
612
+ 5,
613
+ 1,
614
+ 16,
615
+ gin_channels=gin_channels,
616
+ )
617
+ self.flow = ResidualCouplingBlock(
618
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
619
+ )
620
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
621
+ self.speaker_map = None
622
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
623
+
624
+ def remove_weight_norm(self):
625
+ self.dec.remove_weight_norm()
626
+ self.flow.remove_weight_norm()
627
+ self.enc_q.remove_weight_norm()
628
+
629
+ def construct_spkmixmap(self, n_speaker):
630
+ self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
631
+ for i in range(n_speaker):
632
+ self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
633
+ self.speaker_map = self.speaker_map.unsqueeze(0)
634
+
635
+ def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
636
+ if self.speaker_map is not None: # [N, S] * [S, B, 1, H]
637
+ g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
638
+ g = g * self.speaker_map # [N, S, B, 1, H]
639
+ g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
640
+ g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
641
+ else:
642
+ g = g.unsqueeze(0)
643
+ g = self.emb_g(g).transpose(1, 2)
644
+
645
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
646
+ z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
647
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
648
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
649
+ return o
650
+
651
+
652
+ class MultiPeriodDiscriminator(torch.nn.Module):
653
+ def __init__(self, use_spectral_norm=False):
654
+ super(MultiPeriodDiscriminator, self).__init__()
655
+ periods = [2, 3, 5, 7, 11, 17]
656
+ # periods = [3, 5, 7, 11, 17, 23, 37]
657
+
658
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
659
+ discs = discs + [
660
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
661
+ ]
662
+ self.discriminators = nn.ModuleList(discs)
663
+
664
+ def forward(self, y, y_hat):
665
+ y_d_rs = [] #
666
+ y_d_gs = []
667
+ fmap_rs = []
668
+ fmap_gs = []
669
+ for i, d in enumerate(self.discriminators):
670
+ y_d_r, fmap_r = d(y)
671
+ y_d_g, fmap_g = d(y_hat)
672
+ # for j in range(len(fmap_r)):
673
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
674
+ y_d_rs.append(y_d_r)
675
+ y_d_gs.append(y_d_g)
676
+ fmap_rs.append(fmap_r)
677
+ fmap_gs.append(fmap_g)
678
+
679
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
680
+
681
+
682
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
683
+ def __init__(self, use_spectral_norm=False):
684
+ super(MultiPeriodDiscriminatorV2, self).__init__()
685
+ # periods = [2, 3, 5, 7, 11, 17]
686
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
687
+
688
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
689
+ discs = discs + [
690
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
691
+ ]
692
+ self.discriminators = nn.ModuleList(discs)
693
+
694
+ def forward(self, y, y_hat):
695
+ y_d_rs = [] #
696
+ y_d_gs = []
697
+ fmap_rs = []
698
+ fmap_gs = []
699
+ for i, d in enumerate(self.discriminators):
700
+ y_d_r, fmap_r = d(y)
701
+ y_d_g, fmap_g = d(y_hat)
702
+ # for j in range(len(fmap_r)):
703
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
704
+ y_d_rs.append(y_d_r)
705
+ y_d_gs.append(y_d_g)
706
+ fmap_rs.append(fmap_r)
707
+ fmap_gs.append(fmap_g)
708
+
709
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
710
+
711
+
712
+ class DiscriminatorS(torch.nn.Module):
713
+ def __init__(self, use_spectral_norm=False):
714
+ super(DiscriminatorS, self).__init__()
715
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
716
+ self.convs = nn.ModuleList(
717
+ [
718
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
719
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
720
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
721
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
722
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
723
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
724
+ ]
725
+ )
726
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
727
+
728
+ def forward(self, x):
729
+ fmap = []
730
+
731
+ for l in self.convs:
732
+ x = l(x)
733
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
734
+ fmap.append(x)
735
+ x = self.conv_post(x)
736
+ fmap.append(x)
737
+ x = torch.flatten(x, 1, -1)
738
+
739
+ return x, fmap
740
+
741
+
742
+ class DiscriminatorP(torch.nn.Module):
743
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
744
+ super(DiscriminatorP, self).__init__()
745
+ self.period = period
746
+ self.use_spectral_norm = use_spectral_norm
747
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
748
+ self.convs = nn.ModuleList(
749
+ [
750
+ norm_f(
751
+ Conv2d(
752
+ 1,
753
+ 32,
754
+ (kernel_size, 1),
755
+ (stride, 1),
756
+ padding=(get_padding(kernel_size, 1), 0),
757
+ )
758
+ ),
759
+ norm_f(
760
+ Conv2d(
761
+ 32,
762
+ 128,
763
+ (kernel_size, 1),
764
+ (stride, 1),
765
+ padding=(get_padding(kernel_size, 1), 0),
766
+ )
767
+ ),
768
+ norm_f(
769
+ Conv2d(
770
+ 128,
771
+ 512,
772
+ (kernel_size, 1),
773
+ (stride, 1),
774
+ padding=(get_padding(kernel_size, 1), 0),
775
+ )
776
+ ),
777
+ norm_f(
778
+ Conv2d(
779
+ 512,
780
+ 1024,
781
+ (kernel_size, 1),
782
+ (stride, 1),
783
+ padding=(get_padding(kernel_size, 1), 0),
784
+ )
785
+ ),
786
+ norm_f(
787
+ Conv2d(
788
+ 1024,
789
+ 1024,
790
+ (kernel_size, 1),
791
+ 1,
792
+ padding=(get_padding(kernel_size, 1), 0),
793
+ )
794
+ ),
795
+ ]
796
+ )
797
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
798
+
799
+ def forward(self, x):
800
+ fmap = []
801
+
802
+ # 1d to 2d
803
+ b, c, t = x.shape
804
+ if t % self.period != 0: # pad first
805
+ n_pad = self.period - (t % self.period)
806
+ x = F.pad(x, (0, n_pad), "reflect")
807
+ t = t + n_pad
808
+ x = x.view(b, c, t // self.period, self.period)
809
+
810
+ for l in self.convs:
811
+ x = l(x)
812
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
813
+ fmap.append(x)
814
+ x = self.conv_post(x)
815
+ fmap.append(x)
816
+ x = torch.flatten(x, 1, -1)
817
+
818
+ return x, fmap
infer_pack/modules.py ADDED
@@ -0,0 +1,522 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import scipy
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
+ from torch.nn.utils import weight_norm, remove_weight_norm
11
+
12
+ from infer_pack import commons
13
+ from infer_pack.commons import init_weights, get_padding
14
+ from infer_pack.transforms import piecewise_rational_quadratic_transform
15
+
16
+
17
+ LRELU_SLOPE = 0.1
18
+
19
+
20
+ class LayerNorm(nn.Module):
21
+ def __init__(self, channels, eps=1e-5):
22
+ super().__init__()
23
+ self.channels = channels
24
+ self.eps = eps
25
+
26
+ self.gamma = nn.Parameter(torch.ones(channels))
27
+ self.beta = nn.Parameter(torch.zeros(channels))
28
+
29
+ def forward(self, x):
30
+ x = x.transpose(1, -1)
31
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32
+ return x.transpose(1, -1)
33
+
34
+
35
+ class ConvReluNorm(nn.Module):
36
+ def __init__(
37
+ self,
38
+ in_channels,
39
+ hidden_channels,
40
+ out_channels,
41
+ kernel_size,
42
+ n_layers,
43
+ p_dropout,
44
+ ):
45
+ super().__init__()
46
+ self.in_channels = in_channels
47
+ self.hidden_channels = hidden_channels
48
+ self.out_channels = out_channels
49
+ self.kernel_size = kernel_size
50
+ self.n_layers = n_layers
51
+ self.p_dropout = p_dropout
52
+ assert n_layers > 1, "Number of layers should be larger than 0."
53
+
54
+ self.conv_layers = nn.ModuleList()
55
+ self.norm_layers = nn.ModuleList()
56
+ self.conv_layers.append(
57
+ nn.Conv1d(
58
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
59
+ )
60
+ )
61
+ self.norm_layers.append(LayerNorm(hidden_channels))
62
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
63
+ for _ in range(n_layers - 1):
64
+ self.conv_layers.append(
65
+ nn.Conv1d(
66
+ hidden_channels,
67
+ hidden_channels,
68
+ kernel_size,
69
+ padding=kernel_size // 2,
70
+ )
71
+ )
72
+ self.norm_layers.append(LayerNorm(hidden_channels))
73
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
74
+ self.proj.weight.data.zero_()
75
+ self.proj.bias.data.zero_()
76
+
77
+ def forward(self, x, x_mask):
78
+ x_org = x
79
+ for i in range(self.n_layers):
80
+ x = self.conv_layers[i](x * x_mask)
81
+ x = self.norm_layers[i](x)
82
+ x = self.relu_drop(x)
83
+ x = x_org + self.proj(x)
84
+ return x * x_mask
85
+
86
+
87
+ class DDSConv(nn.Module):
88
+ """
89
+ Dialted and Depth-Separable Convolution
90
+ """
91
+
92
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
93
+ super().__init__()
94
+ self.channels = channels
95
+ self.kernel_size = kernel_size
96
+ self.n_layers = n_layers
97
+ self.p_dropout = p_dropout
98
+
99
+ self.drop = nn.Dropout(p_dropout)
100
+ self.convs_sep = nn.ModuleList()
101
+ self.convs_1x1 = nn.ModuleList()
102
+ self.norms_1 = nn.ModuleList()
103
+ self.norms_2 = nn.ModuleList()
104
+ for i in range(n_layers):
105
+ dilation = kernel_size**i
106
+ padding = (kernel_size * dilation - dilation) // 2
107
+ self.convs_sep.append(
108
+ nn.Conv1d(
109
+ channels,
110
+ channels,
111
+ kernel_size,
112
+ groups=channels,
113
+ dilation=dilation,
114
+ padding=padding,
115
+ )
116
+ )
117
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
118
+ self.norms_1.append(LayerNorm(channels))
119
+ self.norms_2.append(LayerNorm(channels))
120
+
121
+ def forward(self, x, x_mask, g=None):
122
+ if g is not None:
123
+ x = x + g
124
+ for i in range(self.n_layers):
125
+ y = self.convs_sep[i](x * x_mask)
126
+ y = self.norms_1[i](y)
127
+ y = F.gelu(y)
128
+ y = self.convs_1x1[i](y)
129
+ y = self.norms_2[i](y)
130
+ y = F.gelu(y)
131
+ y = self.drop(y)
132
+ x = x + y
133
+ return x * x_mask
134
+
135
+
136
+ class WN(torch.nn.Module):
137
+ def __init__(
138
+ self,
139
+ hidden_channels,
140
+ kernel_size,
141
+ dilation_rate,
142
+ n_layers,
143
+ gin_channels=0,
144
+ p_dropout=0,
145
+ ):
146
+ super(WN, self).__init__()
147
+ assert kernel_size % 2 == 1
148
+ self.hidden_channels = hidden_channels
149
+ self.kernel_size = (kernel_size,)
150
+ self.dilation_rate = dilation_rate
151
+ self.n_layers = n_layers
152
+ self.gin_channels = gin_channels
153
+ self.p_dropout = p_dropout
154
+
155
+ self.in_layers = torch.nn.ModuleList()
156
+ self.res_skip_layers = torch.nn.ModuleList()
157
+ self.drop = nn.Dropout(p_dropout)
158
+
159
+ if gin_channels != 0:
160
+ cond_layer = torch.nn.Conv1d(
161
+ gin_channels, 2 * hidden_channels * n_layers, 1
162
+ )
163
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
164
+
165
+ for i in range(n_layers):
166
+ dilation = dilation_rate**i
167
+ padding = int((kernel_size * dilation - dilation) / 2)
168
+ in_layer = torch.nn.Conv1d(
169
+ hidden_channels,
170
+ 2 * hidden_channels,
171
+ kernel_size,
172
+ dilation=dilation,
173
+ padding=padding,
174
+ )
175
+ in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
176
+ self.in_layers.append(in_layer)
177
+
178
+ # last one is not necessary
179
+ if i < n_layers - 1:
180
+ res_skip_channels = 2 * hidden_channels
181
+ else:
182
+ res_skip_channels = hidden_channels
183
+
184
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
185
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
186
+ self.res_skip_layers.append(res_skip_layer)
187
+
188
+ def forward(self, x, x_mask, g=None, **kwargs):
189
+ output = torch.zeros_like(x)
190
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
191
+
192
+ if g is not None:
193
+ g = self.cond_layer(g)
194
+
195
+ for i in range(self.n_layers):
196
+ x_in = self.in_layers[i](x)
197
+ if g is not None:
198
+ cond_offset = i * 2 * self.hidden_channels
199
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
200
+ else:
201
+ g_l = torch.zeros_like(x_in)
202
+
203
+ acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
204
+ acts = self.drop(acts)
205
+
206
+ res_skip_acts = self.res_skip_layers[i](acts)
207
+ if i < self.n_layers - 1:
208
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
209
+ x = (x + res_acts) * x_mask
210
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
211
+ else:
212
+ output = output + res_skip_acts
213
+ return output * x_mask
214
+
215
+ def remove_weight_norm(self):
216
+ if self.gin_channels != 0:
217
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
218
+ for l in self.in_layers:
219
+ torch.nn.utils.remove_weight_norm(l)
220
+ for l in self.res_skip_layers:
221
+ torch.nn.utils.remove_weight_norm(l)
222
+
223
+
224
+ class ResBlock1(torch.nn.Module):
225
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
226
+ super(ResBlock1, self).__init__()
227
+ self.convs1 = nn.ModuleList(
228
+ [
229
+ weight_norm(
230
+ Conv1d(
231
+ channels,
232
+ channels,
233
+ kernel_size,
234
+ 1,
235
+ dilation=dilation[0],
236
+ padding=get_padding(kernel_size, dilation[0]),
237
+ )
238
+ ),
239
+ weight_norm(
240
+ Conv1d(
241
+ channels,
242
+ channels,
243
+ kernel_size,
244
+ 1,
245
+ dilation=dilation[1],
246
+ padding=get_padding(kernel_size, dilation[1]),
247
+ )
248
+ ),
249
+ weight_norm(
250
+ Conv1d(
251
+ channels,
252
+ channels,
253
+ kernel_size,
254
+ 1,
255
+ dilation=dilation[2],
256
+ padding=get_padding(kernel_size, dilation[2]),
257
+ )
258
+ ),
259
+ ]
260
+ )
261
+ self.convs1.apply(init_weights)
262
+
263
+ self.convs2 = nn.ModuleList(
264
+ [
265
+ weight_norm(
266
+ Conv1d(
267
+ channels,
268
+ channels,
269
+ kernel_size,
270
+ 1,
271
+ dilation=1,
272
+ padding=get_padding(kernel_size, 1),
273
+ )
274
+ ),
275
+ weight_norm(
276
+ Conv1d(
277
+ channels,
278
+ channels,
279
+ kernel_size,
280
+ 1,
281
+ dilation=1,
282
+ padding=get_padding(kernel_size, 1),
283
+ )
284
+ ),
285
+ weight_norm(
286
+ Conv1d(
287
+ channels,
288
+ channels,
289
+ kernel_size,
290
+ 1,
291
+ dilation=1,
292
+ padding=get_padding(kernel_size, 1),
293
+ )
294
+ ),
295
+ ]
296
+ )
297
+ self.convs2.apply(init_weights)
298
+
299
+ def forward(self, x, x_mask=None):
300
+ for c1, c2 in zip(self.convs1, self.convs2):
301
+ xt = F.leaky_relu(x, LRELU_SLOPE)
302
+ if x_mask is not None:
303
+ xt = xt * x_mask
304
+ xt = c1(xt)
305
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
306
+ if x_mask is not None:
307
+ xt = xt * x_mask
308
+ xt = c2(xt)
309
+ x = xt + x
310
+ if x_mask is not None:
311
+ x = x * x_mask
312
+ return x
313
+
314
+ def remove_weight_norm(self):
315
+ for l in self.convs1:
316
+ remove_weight_norm(l)
317
+ for l in self.convs2:
318
+ remove_weight_norm(l)
319
+
320
+
321
+ class ResBlock2(torch.nn.Module):
322
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
323
+ super(ResBlock2, self).__init__()
324
+ self.convs = nn.ModuleList(
325
+ [
326
+ weight_norm(
327
+ Conv1d(
328
+ channels,
329
+ channels,
330
+ kernel_size,
331
+ 1,
332
+ dilation=dilation[0],
333
+ padding=get_padding(kernel_size, dilation[0]),
334
+ )
335
+ ),
336
+ weight_norm(
337
+ Conv1d(
338
+ channels,
339
+ channels,
340
+ kernel_size,
341
+ 1,
342
+ dilation=dilation[1],
343
+ padding=get_padding(kernel_size, dilation[1]),
344
+ )
345
+ ),
346
+ ]
347
+ )
348
+ self.convs.apply(init_weights)
349
+
350
+ def forward(self, x, x_mask=None):
351
+ for c in self.convs:
352
+ xt = F.leaky_relu(x, LRELU_SLOPE)
353
+ if x_mask is not None:
354
+ xt = xt * x_mask
355
+ xt = c(xt)
356
+ x = xt + x
357
+ if x_mask is not None:
358
+ x = x * x_mask
359
+ return x
360
+
361
+ def remove_weight_norm(self):
362
+ for l in self.convs:
363
+ remove_weight_norm(l)
364
+
365
+
366
+ class Log(nn.Module):
367
+ def forward(self, x, x_mask, reverse=False, **kwargs):
368
+ if not reverse:
369
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
370
+ logdet = torch.sum(-y, [1, 2])
371
+ return y, logdet
372
+ else:
373
+ x = torch.exp(x) * x_mask
374
+ return x
375
+
376
+
377
+ class Flip(nn.Module):
378
+ def forward(self, x, *args, reverse=False, **kwargs):
379
+ x = torch.flip(x, [1])
380
+ if not reverse:
381
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
382
+ return x, logdet
383
+ else:
384
+ return x
385
+
386
+
387
+ class ElementwiseAffine(nn.Module):
388
+ def __init__(self, channels):
389
+ super().__init__()
390
+ self.channels = channels
391
+ self.m = nn.Parameter(torch.zeros(channels, 1))
392
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
393
+
394
+ def forward(self, x, x_mask, reverse=False, **kwargs):
395
+ if not reverse:
396
+ y = self.m + torch.exp(self.logs) * x
397
+ y = y * x_mask
398
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
399
+ return y, logdet
400
+ else:
401
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
402
+ return x
403
+
404
+
405
+ class ResidualCouplingLayer(nn.Module):
406
+ def __init__(
407
+ self,
408
+ channels,
409
+ hidden_channels,
410
+ kernel_size,
411
+ dilation_rate,
412
+ n_layers,
413
+ p_dropout=0,
414
+ gin_channels=0,
415
+ mean_only=False,
416
+ ):
417
+ assert channels % 2 == 0, "channels should be divisible by 2"
418
+ super().__init__()
419
+ self.channels = channels
420
+ self.hidden_channels = hidden_channels
421
+ self.kernel_size = kernel_size
422
+ self.dilation_rate = dilation_rate
423
+ self.n_layers = n_layers
424
+ self.half_channels = channels // 2
425
+ self.mean_only = mean_only
426
+
427
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
428
+ self.enc = WN(
429
+ hidden_channels,
430
+ kernel_size,
431
+ dilation_rate,
432
+ n_layers,
433
+ p_dropout=p_dropout,
434
+ gin_channels=gin_channels,
435
+ )
436
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
437
+ self.post.weight.data.zero_()
438
+ self.post.bias.data.zero_()
439
+
440
+ def forward(self, x, x_mask, g=None, reverse=False):
441
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
442
+ h = self.pre(x0) * x_mask
443
+ h = self.enc(h, x_mask, g=g)
444
+ stats = self.post(h) * x_mask
445
+ if not self.mean_only:
446
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
447
+ else:
448
+ m = stats
449
+ logs = torch.zeros_like(m)
450
+
451
+ if not reverse:
452
+ x1 = m + x1 * torch.exp(logs) * x_mask
453
+ x = torch.cat([x0, x1], 1)
454
+ logdet = torch.sum(logs, [1, 2])
455
+ return x, logdet
456
+ else:
457
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
458
+ x = torch.cat([x0, x1], 1)
459
+ return x
460
+
461
+ def remove_weight_norm(self):
462
+ self.enc.remove_weight_norm()
463
+
464
+
465
+ class ConvFlow(nn.Module):
466
+ def __init__(
467
+ self,
468
+ in_channels,
469
+ filter_channels,
470
+ kernel_size,
471
+ n_layers,
472
+ num_bins=10,
473
+ tail_bound=5.0,
474
+ ):
475
+ super().__init__()
476
+ self.in_channels = in_channels
477
+ self.filter_channels = filter_channels
478
+ self.kernel_size = kernel_size
479
+ self.n_layers = n_layers
480
+ self.num_bins = num_bins
481
+ self.tail_bound = tail_bound
482
+ self.half_channels = in_channels // 2
483
+
484
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
485
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
486
+ self.proj = nn.Conv1d(
487
+ filter_channels, self.half_channels * (num_bins * 3 - 1), 1
488
+ )
489
+ self.proj.weight.data.zero_()
490
+ self.proj.bias.data.zero_()
491
+
492
+ def forward(self, x, x_mask, g=None, reverse=False):
493
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
494
+ h = self.pre(x0)
495
+ h = self.convs(h, x_mask, g=g)
496
+ h = self.proj(h) * x_mask
497
+
498
+ b, c, t = x0.shape
499
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
500
+
501
+ unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
502
+ unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
503
+ self.filter_channels
504
+ )
505
+ unnormalized_derivatives = h[..., 2 * self.num_bins :]
506
+
507
+ x1, logabsdet = piecewise_rational_quadratic_transform(
508
+ x1,
509
+ unnormalized_widths,
510
+ unnormalized_heights,
511
+ unnormalized_derivatives,
512
+ inverse=reverse,
513
+ tails="linear",
514
+ tail_bound=self.tail_bound,
515
+ )
516
+
517
+ x = torch.cat([x0, x1], 1) * x_mask
518
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
519
+ if not reverse:
520
+ return x, logdet
521
+ else:
522
+ return x
infer_pack/modules/F0Predictor/DioF0Predictor.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
+ import pyworld
3
+ import numpy as np
4
+
5
+
6
+ class DioF0Predictor(F0Predictor):
7
+ def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
8
+ self.hop_length = hop_length
9
+ self.f0_min = f0_min
10
+ self.f0_max = f0_max
11
+ self.sampling_rate = sampling_rate
12
+
13
+ def interpolate_f0(self, f0):
14
+ """
15
+ 对F0进行插值处理
16
+ """
17
+
18
+ data = np.reshape(f0, (f0.size, 1))
19
+
20
+ vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
21
+ vuv_vector[data > 0.0] = 1.0
22
+ vuv_vector[data <= 0.0] = 0.0
23
+
24
+ ip_data = data
25
+
26
+ frame_number = data.size
27
+ last_value = 0.0
28
+ for i in range(frame_number):
29
+ if data[i] <= 0.0:
30
+ j = i + 1
31
+ for j in range(i + 1, frame_number):
32
+ if data[j] > 0.0:
33
+ break
34
+ if j < frame_number - 1:
35
+ if last_value > 0.0:
36
+ step = (data[j] - data[i - 1]) / float(j - i)
37
+ for k in range(i, j):
38
+ ip_data[k] = data[i - 1] + step * (k - i + 1)
39
+ else:
40
+ for k in range(i, j):
41
+ ip_data[k] = data[j]
42
+ else:
43
+ for k in range(i, frame_number):
44
+ ip_data[k] = last_value
45
+ else:
46
+ ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝
47
+ last_value = data[i]
48
+
49
+ return ip_data[:, 0], vuv_vector[:, 0]
50
+
51
+ def resize_f0(self, x, target_len):
52
+ source = np.array(x)
53
+ source[source < 0.001] = np.nan
54
+ target = np.interp(
55
+ np.arange(0, len(source) * target_len, len(source)) / target_len,
56
+ np.arange(0, len(source)),
57
+ source,
58
+ )
59
+ res = np.nan_to_num(target)
60
+ return res
61
+
62
+ def compute_f0(self, wav, p_len=None):
63
+ if p_len is None:
64
+ p_len = wav.shape[0] // self.hop_length
65
+ f0, t = pyworld.dio(
66
+ wav.astype(np.double),
67
+ fs=self.sampling_rate,
68
+ f0_floor=self.f0_min,
69
+ f0_ceil=self.f0_max,
70
+ frame_period=1000 * self.hop_length / self.sampling_rate,
71
+ )
72
+ f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
73
+ for index, pitch in enumerate(f0):
74
+ f0[index] = round(pitch, 1)
75
+ return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
76
+
77
+ def compute_f0_uv(self, wav, p_len=None):
78
+ if p_len is None:
79
+ p_len = wav.shape[0] // self.hop_length
80
+ f0, t = pyworld.dio(
81
+ wav.astype(np.double),
82
+ fs=self.sampling_rate,
83
+ f0_floor=self.f0_min,
84
+ f0_ceil=self.f0_max,
85
+ frame_period=1000 * self.hop_length / self.sampling_rate,
86
+ )
87
+ f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
88
+ for index, pitch in enumerate(f0):
89
+ f0[index] = round(pitch, 1)
90
+ return self.interpolate_f0(self.resize_f0(f0, p_len))
infer_pack/modules/F0Predictor/F0Predictor.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class F0Predictor(object):
2
+ def compute_f0(self, wav, p_len):
3
+ """
4
+ input: wav:[signal_length]
5
+ p_len:int
6
+ output: f0:[signal_length//hop_length]
7
+ """
8
+ pass
9
+
10
+ def compute_f0_uv(self, wav, p_len):
11
+ """
12
+ input: wav:[signal_length]
13
+ p_len:int
14
+ output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
15
+ """
16
+ pass
infer_pack/modules/F0Predictor/HarvestF0Predictor.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
+ import pyworld
3
+ import numpy as np
4
+
5
+
6
+ class HarvestF0Predictor(F0Predictor):
7
+ def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
8
+ self.hop_length = hop_length
9
+ self.f0_min = f0_min
10
+ self.f0_max = f0_max
11
+ self.sampling_rate = sampling_rate
12
+
13
+ def interpolate_f0(self, f0):
14
+ """
15
+ 对F0进行插值处理
16
+ """
17
+
18
+ data = np.reshape(f0, (f0.size, 1))
19
+
20
+ vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
21
+ vuv_vector[data > 0.0] = 1.0
22
+ vuv_vector[data <= 0.0] = 0.0
23
+
24
+ ip_data = data
25
+
26
+ frame_number = data.size
27
+ last_value = 0.0
28
+ for i in range(frame_number):
29
+ if data[i] <= 0.0:
30
+ j = i + 1
31
+ for j in range(i + 1, frame_number):
32
+ if data[j] > 0.0:
33
+ break
34
+ if j < frame_number - 1:
35
+ if last_value > 0.0:
36
+ step = (data[j] - data[i - 1]) / float(j - i)
37
+ for k in range(i, j):
38
+ ip_data[k] = data[i - 1] + step * (k - i + 1)
39
+ else:
40
+ for k in range(i, j):
41
+ ip_data[k] = data[j]
42
+ else:
43
+ for k in range(i, frame_number):
44
+ ip_data[k] = last_value
45
+ else:
46
+ ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝
47
+ last_value = data[i]
48
+
49
+ return ip_data[:, 0], vuv_vector[:, 0]
50
+
51
+ def resize_f0(self, x, target_len):
52
+ source = np.array(x)
53
+ source[source < 0.001] = np.nan
54
+ target = np.interp(
55
+ np.arange(0, len(source) * target_len, len(source)) / target_len,
56
+ np.arange(0, len(source)),
57
+ source,
58
+ )
59
+ res = np.nan_to_num(target)
60
+ return res
61
+
62
+ def compute_f0(self, wav, p_len=None):
63
+ if p_len is None:
64
+ p_len = wav.shape[0] // self.hop_length
65
+ f0, t = pyworld.harvest(
66
+ wav.astype(np.double),
67
+ fs=self.hop_length,
68
+ f0_ceil=self.f0_max,
69
+ f0_floor=self.f0_min,
70
+ frame_period=1000 * self.hop_length / self.sampling_rate,
71
+ )
72
+ f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs)
73
+ return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
74
+
75
+ def compute_f0_uv(self, wav, p_len=None):
76
+ if p_len is None:
77
+ p_len = wav.shape[0] // self.hop_length
78
+ f0, t = pyworld.harvest(
79
+ wav.astype(np.double),
80
+ fs=self.sampling_rate,
81
+ f0_floor=self.f0_min,
82
+ f0_ceil=self.f0_max,
83
+ frame_period=1000 * self.hop_length / self.sampling_rate,
84
+ )
85
+ f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
86
+ return self.interpolate_f0(self.resize_f0(f0, p_len))
infer_pack/modules/F0Predictor/PMF0Predictor.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
+ import parselmouth
3
+ import numpy as np
4
+
5
+
6
+ class PMF0Predictor(F0Predictor):
7
+ def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
8
+ self.hop_length = hop_length
9
+ self.f0_min = f0_min
10
+ self.f0_max = f0_max
11
+ self.sampling_rate = sampling_rate
12
+
13
+ def interpolate_f0(self, f0):
14
+ """
15
+ 对F0进行插值处理
16
+ """
17
+
18
+ data = np.reshape(f0, (f0.size, 1))
19
+
20
+ vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
21
+ vuv_vector[data > 0.0] = 1.0
22
+ vuv_vector[data <= 0.0] = 0.0
23
+
24
+ ip_data = data
25
+
26
+ frame_number = data.size
27
+ last_value = 0.0
28
+ for i in range(frame_number):
29
+ if data[i] <= 0.0:
30
+ j = i + 1
31
+ for j in range(i + 1, frame_number):
32
+ if data[j] > 0.0:
33
+ break
34
+ if j < frame_number - 1:
35
+ if last_value > 0.0:
36
+ step = (data[j] - data[i - 1]) / float(j - i)
37
+ for k in range(i, j):
38
+ ip_data[k] = data[i - 1] + step * (k - i + 1)
39
+ else:
40
+ for k in range(i, j):
41
+ ip_data[k] = data[j]
42
+ else:
43
+ for k in range(i, frame_number):
44
+ ip_data[k] = last_value
45
+ else:
46
+ ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝
47
+ last_value = data[i]
48
+
49
+ return ip_data[:, 0], vuv_vector[:, 0]
50
+
51
+ def compute_f0(self, wav, p_len=None):
52
+ x = wav
53
+ if p_len is None:
54
+ p_len = x.shape[0] // self.hop_length
55
+ else:
56
+ assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
57
+ time_step = self.hop_length / self.sampling_rate * 1000
58
+ f0 = (
59
+ parselmouth.Sound(x, self.sampling_rate)
60
+ .to_pitch_ac(
61
+ time_step=time_step / 1000,
62
+ voicing_threshold=0.6,
63
+ pitch_floor=self.f0_min,
64
+ pitch_ceiling=self.f0_max,
65
+ )
66
+ .selected_array["frequency"]
67
+ )
68
+
69
+ pad_size = (p_len - len(f0) + 1) // 2
70
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
71
+ f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
72
+ f0, uv = self.interpolate_f0(f0)
73
+ return f0
74
+
75
+ def compute_f0_uv(self, wav, p_len=None):
76
+ x = wav
77
+ if p_len is None:
78
+ p_len = x.shape[0] // self.hop_length
79
+ else:
80
+ assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
81
+ time_step = self.hop_length / self.sampling_rate * 1000
82
+ f0 = (
83
+ parselmouth.Sound(x, self.sampling_rate)
84
+ .to_pitch_ac(
85
+ time_step=time_step / 1000,
86
+ voicing_threshold=0.6,
87
+ pitch_floor=self.f0_min,
88
+ pitch_ceiling=self.f0_max,
89
+ )
90
+ .selected_array["frequency"]
91
+ )
92
+
93
+ pad_size = (p_len - len(f0) + 1) // 2
94
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
95
+ f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
96
+ f0, uv = self.interpolate_f0(f0)
97
+ return f0, uv
infer_pack/modules/F0Predictor/__init__.py ADDED
File without changes
infer_pack/onnx_inference.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import onnxruntime
2
+ import librosa
3
+ import numpy as np
4
+ import soundfile
5
+
6
+
7
+ class ContentVec:
8
+ def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
9
+ print("load model(s) from {}".format(vec_path))
10
+ if device == "cpu" or device is None:
11
+ providers = ["CPUExecutionProvider"]
12
+ elif device == "cuda":
13
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
14
+ else:
15
+ raise RuntimeError("Unsportted Device")
16
+ self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
17
+
18
+ def __call__(self, wav):
19
+ return self.forward(wav)
20
+
21
+ def forward(self, wav):
22
+ feats = wav
23
+ if feats.ndim == 2: # double channels
24
+ feats = feats.mean(-1)
25
+ assert feats.ndim == 1, feats.ndim
26
+ feats = np.expand_dims(np.expand_dims(feats, 0), 0)
27
+ onnx_input = {self.model.get_inputs()[0].name: feats}
28
+ logits = self.model.run(None, onnx_input)[0]
29
+ return logits.transpose(0, 2, 1)
30
+
31
+
32
+ def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
33
+ if f0_predictor == "pm":
34
+ from infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor
35
+
36
+ f0_predictor_object = PMF0Predictor(
37
+ hop_length=hop_length, sampling_rate=sampling_rate
38
+ )
39
+ elif f0_predictor == "harvest":
40
+ from infer_pack.modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor
41
+
42
+ f0_predictor_object = HarvestF0Predictor(
43
+ hop_length=hop_length, sampling_rate=sampling_rate
44
+ )
45
+ elif f0_predictor == "dio":
46
+ from infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor
47
+
48
+ f0_predictor_object = DioF0Predictor(
49
+ hop_length=hop_length, sampling_rate=sampling_rate
50
+ )
51
+ else:
52
+ raise Exception("Unknown f0 predictor")
53
+ return f0_predictor_object
54
+
55
+
56
+ class OnnxRVC:
57
+ def __init__(
58
+ self,
59
+ model_path,
60
+ sr=40000,
61
+ hop_size=512,
62
+ vec_path="vec-768-layer-12",
63
+ device="cpu",
64
+ ):
65
+ vec_path = f"pretrained/{vec_path}.onnx"
66
+ self.vec_model = ContentVec(vec_path, device)
67
+ if device == "cpu" or device is None:
68
+ providers = ["CPUExecutionProvider"]
69
+ elif device == "cuda":
70
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
71
+ else:
72
+ raise RuntimeError("Unsportted Device")
73
+ self.model = onnxruntime.InferenceSession(model_path, providers=providers)
74
+ self.sampling_rate = sr
75
+ self.hop_size = hop_size
76
+
77
+ def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd):
78
+ onnx_input = {
79
+ self.model.get_inputs()[0].name: hubert,
80
+ self.model.get_inputs()[1].name: hubert_length,
81
+ self.model.get_inputs()[2].name: pitch,
82
+ self.model.get_inputs()[3].name: pitchf,
83
+ self.model.get_inputs()[4].name: ds,
84
+ self.model.get_inputs()[5].name: rnd,
85
+ }
86
+ return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16)
87
+
88
+ def inference(
89
+ self,
90
+ raw_path,
91
+ sid,
92
+ f0_method="dio",
93
+ f0_up_key=0,
94
+ pad_time=0.5,
95
+ cr_threshold=0.02,
96
+ ):
97
+ f0_min = 50
98
+ f0_max = 1100
99
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
100
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
101
+ f0_predictor = get_f0_predictor(
102
+ f0_method,
103
+ hop_length=self.hop_size,
104
+ sampling_rate=self.sampling_rate,
105
+ threshold=cr_threshold,
106
+ )
107
+ wav, sr = librosa.load(raw_path, sr=self.sampling_rate)
108
+ org_length = len(wav)
109
+ if org_length / sr > 50.0:
110
+ raise RuntimeError("Reached Max Length")
111
+
112
+ wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000)
113
+ wav16k = wav16k
114
+
115
+ hubert = self.vec_model(wav16k)
116
+ hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
117
+ hubert_length = hubert.shape[1]
118
+
119
+ pitchf = f0_predictor.compute_f0(wav, hubert_length)
120
+ pitchf = pitchf * 2 ** (f0_up_key / 12)
121
+ pitch = pitchf.copy()
122
+ f0_mel = 1127 * np.log(1 + pitch / 700)
123
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
124
+ f0_mel_max - f0_mel_min
125
+ ) + 1
126
+ f0_mel[f0_mel <= 1] = 1
127
+ f0_mel[f0_mel > 255] = 255
128
+ pitch = np.rint(f0_mel).astype(np.int64)
129
+
130
+ pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32)
131
+ pitch = pitch.reshape(1, len(pitch))
132
+ ds = np.array([sid]).astype(np.int64)
133
+
134
+ rnd = np.random.randn(1, 192, hubert_length).astype(np.float32)
135
+ hubert_length = np.array([hubert_length]).astype(np.int64)
136
+
137
+ out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze()
138
+ out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant")
139
+ return out_wav[0:org_length]
infer_pack/transforms.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+ import numpy as np
5
+
6
+
7
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
8
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
9
+ DEFAULT_MIN_DERIVATIVE = 1e-3
10
+
11
+
12
+ def piecewise_rational_quadratic_transform(
13
+ inputs,
14
+ unnormalized_widths,
15
+ unnormalized_heights,
16
+ unnormalized_derivatives,
17
+ inverse=False,
18
+ tails=None,
19
+ tail_bound=1.0,
20
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
21
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
22
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
23
+ ):
24
+ if tails is None:
25
+ spline_fn = rational_quadratic_spline
26
+ spline_kwargs = {}
27
+ else:
28
+ spline_fn = unconstrained_rational_quadratic_spline
29
+ spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
30
+
31
+ outputs, logabsdet = spline_fn(
32
+ inputs=inputs,
33
+ unnormalized_widths=unnormalized_widths,
34
+ unnormalized_heights=unnormalized_heights,
35
+ unnormalized_derivatives=unnormalized_derivatives,
36
+ inverse=inverse,
37
+ min_bin_width=min_bin_width,
38
+ min_bin_height=min_bin_height,
39
+ min_derivative=min_derivative,
40
+ **spline_kwargs
41
+ )
42
+ return outputs, logabsdet
43
+
44
+
45
+ def searchsorted(bin_locations, inputs, eps=1e-6):
46
+ bin_locations[..., -1] += eps
47
+ return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
48
+
49
+
50
+ def unconstrained_rational_quadratic_spline(
51
+ inputs,
52
+ unnormalized_widths,
53
+ unnormalized_heights,
54
+ unnormalized_derivatives,
55
+ inverse=False,
56
+ tails="linear",
57
+ tail_bound=1.0,
58
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
59
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
60
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
61
+ ):
62
+ inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
63
+ outside_interval_mask = ~inside_interval_mask
64
+
65
+ outputs = torch.zeros_like(inputs)
66
+ logabsdet = torch.zeros_like(inputs)
67
+
68
+ if tails == "linear":
69
+ unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
70
+ constant = np.log(np.exp(1 - min_derivative) - 1)
71
+ unnormalized_derivatives[..., 0] = constant
72
+ unnormalized_derivatives[..., -1] = constant
73
+
74
+ outputs[outside_interval_mask] = inputs[outside_interval_mask]
75
+ logabsdet[outside_interval_mask] = 0
76
+ else:
77
+ raise RuntimeError("{} tails are not implemented.".format(tails))
78
+
79
+ (
80
+ outputs[inside_interval_mask],
81
+ logabsdet[inside_interval_mask],
82
+ ) = rational_quadratic_spline(
83
+ inputs=inputs[inside_interval_mask],
84
+ unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85
+ unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86
+ unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87
+ inverse=inverse,
88
+ left=-tail_bound,
89
+ right=tail_bound,
90
+ bottom=-tail_bound,
91
+ top=tail_bound,
92
+ min_bin_width=min_bin_width,
93
+ min_bin_height=min_bin_height,
94
+ min_derivative=min_derivative,
95
+ )
96
+
97
+ return outputs, logabsdet
98
+
99
+
100
+ def rational_quadratic_spline(
101
+ inputs,
102
+ unnormalized_widths,
103
+ unnormalized_heights,
104
+ unnormalized_derivatives,
105
+ inverse=False,
106
+ left=0.0,
107
+ right=1.0,
108
+ bottom=0.0,
109
+ top=1.0,
110
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
111
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
112
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
113
+ ):
114
+ if torch.min(inputs) < left or torch.max(inputs) > right:
115
+ raise ValueError("Input to a transform is not within its domain")
116
+
117
+ num_bins = unnormalized_widths.shape[-1]
118
+
119
+ if min_bin_width * num_bins > 1.0:
120
+ raise ValueError("Minimal bin width too large for the number of bins")
121
+ if min_bin_height * num_bins > 1.0:
122
+ raise ValueError("Minimal bin height too large for the number of bins")
123
+
124
+ widths = F.softmax(unnormalized_widths, dim=-1)
125
+ widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
126
+ cumwidths = torch.cumsum(widths, dim=-1)
127
+ cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
128
+ cumwidths = (right - left) * cumwidths + left
129
+ cumwidths[..., 0] = left
130
+ cumwidths[..., -1] = right
131
+ widths = cumwidths[..., 1:] - cumwidths[..., :-1]
132
+
133
+ derivatives = min_derivative + F.softplus(unnormalized_derivatives)
134
+
135
+ heights = F.softmax(unnormalized_heights, dim=-1)
136
+ heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
137
+ cumheights = torch.cumsum(heights, dim=-1)
138
+ cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
139
+ cumheights = (top - bottom) * cumheights + bottom
140
+ cumheights[..., 0] = bottom
141
+ cumheights[..., -1] = top
142
+ heights = cumheights[..., 1:] - cumheights[..., :-1]
143
+
144
+ if inverse:
145
+ bin_idx = searchsorted(cumheights, inputs)[..., None]
146
+ else:
147
+ bin_idx = searchsorted(cumwidths, inputs)[..., None]
148
+
149
+ input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
150
+ input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
151
+
152
+ input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
153
+ delta = heights / widths
154
+ input_delta = delta.gather(-1, bin_idx)[..., 0]
155
+
156
+ input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
157
+ input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
158
+
159
+ input_heights = heights.gather(-1, bin_idx)[..., 0]
160
+
161
+ if inverse:
162
+ a = (inputs - input_cumheights) * (
163
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
164
+ ) + input_heights * (input_delta - input_derivatives)
165
+ b = input_heights * input_derivatives - (inputs - input_cumheights) * (
166
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
167
+ )
168
+ c = -input_delta * (inputs - input_cumheights)
169
+
170
+ discriminant = b.pow(2) - 4 * a * c
171
+ assert (discriminant >= 0).all()
172
+
173
+ root = (2 * c) / (-b - torch.sqrt(discriminant))
174
+ outputs = root * input_bin_widths + input_cumwidths
175
+
176
+ theta_one_minus_theta = root * (1 - root)
177
+ denominator = input_delta + (
178
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
179
+ * theta_one_minus_theta
180
+ )
181
+ derivative_numerator = input_delta.pow(2) * (
182
+ input_derivatives_plus_one * root.pow(2)
183
+ + 2 * input_delta * theta_one_minus_theta
184
+ + input_derivatives * (1 - root).pow(2)
185
+ )
186
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
187
+
188
+ return outputs, -logabsdet
189
+ else:
190
+ theta = (inputs - input_cumwidths) / input_bin_widths
191
+ theta_one_minus_theta = theta * (1 - theta)
192
+
193
+ numerator = input_heights * (
194
+ input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
195
+ )
196
+ denominator = input_delta + (
197
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
198
+ * theta_one_minus_theta
199
+ )
200
+ outputs = input_cumheights + numerator / denominator
201
+
202
+ derivative_numerator = input_delta.pow(2) * (
203
+ input_derivatives_plus_one * theta.pow(2)
204
+ + 2 * input_delta * theta_one_minus_theta
205
+ + input_derivatives * (1 - theta).pow(2)
206
+ )
207
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
208
+
209
+ return outputs, logabsdet
model/alphawann/Alpha.png ADDED
model/alphawann/Alpha2333333.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ab58c53436b5bf37c44db159408c1ff23cc8fbaf586dd20cc363127a11ed040
3
+ size 55093677
model/alphawann/added_IVF1322_Flat_nprobe_1.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b28d20605fef0dcd0e11c4b108962fba6795321768cfdb4b66bad7fa26a8aec0
3
+ size 54607387
model/alphawann/config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Alpha2333333.pth",
3
+ "feat_index": "added_IVF1322_Flat_nprobe_1.index",
4
+ "speaker_id": 0,
5
+
6
+ "name": "Alpha Wann",
7
+ "author": "Findit",
8
+ "source": "Album",
9
+ "note": "Trained in RVC v1, All the album",
10
+ "icon": "Alpha.png"
11
+ }
model/angele/Angele.png ADDED

Git LFS Details

  • SHA256: ebc8b91d593b67833db8b2a0203183b846c1e7789d0a23f662dddb697491eb62
  • Pointer size: 132 Bytes
  • Size of remote file: 1.28 MB
model/angele/angele.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43d2990f49e108eb512ce00ba76866850d6c7270cb8ef39a42cfc9995c9a85d9
3
+ size 55216420
model/angele/config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "angele.pth",
3
+ "feat_index": "trained_IVF1260_Flat_nprobe_1_v2.index",
4
+ "speaker_id": 0,
5
+
6
+ "name": "Angele",
7
+ "author": "Aalex",
8
+ "source": "All",
9
+ "note": "Trained in RVC v2, All the album",
10
+ "icon": "Angele.png"
11
+ }
model/angele/trained_IVF1260_Flat_nprobe_1_v2.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21ab19068ec36e4d75ad280f8ee757981ab1f8da5ec9db807caa7828fb655e23
3
+ size 3870859
model/arianagrande/Ariana.png ADDED
model/arianagrande/added_IVF1033_Flat_nprobe_1_v2.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a47bef476fc29dde668b2727ad4ba3dbcf526a62bcea85204595d28b0b854bbb
3
+ size 127336579
model/arianagrande/arianagrande.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f8199a3f13fed7d6f71d98e89c8bf49cbc00701e0d6581383e320997fd8ed20
3
+ size 55226492
model/arianagrande/config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "arianagrande.pth",
3
+ "feat_index": "added_IVF1033_Flat_nprobe_1_v2.index",
4
+ "speaker_id": 0,
5
+
6
+ "name": "Ariana Grande",
7
+ "author": "Arithyst",
8
+ "source": "ALL",
9
+ "note": "7 minute dataset (All of the dataset are from her Pro-Tools Dataset), Trained in RVC v2, Crepe Hop Length - 30",
10
+ "icon": "Ariana.png"
11
+ }
model/leto/Leto.png ADDED

Git LFS Details

  • SHA256: 4a4e56fb45c003bb3f62cb9513aca3395528f7296bad6df3f3878c77ba37f577
  • Pointer size: 132 Bytes
  • Size of remote file: 2.27 MB
model/leto/added_IVF408_Flat_nprobe_1_v2.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb3ba7fad42b6c25a3d0019d84fd017851724aaf7bfcf80861cb9abceaf232e2
3
+ size 50296539
model/leto/config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "leto.pth",
3
+ "feat_index": "added_IVF408_Flat_nprobe_1_v2.index",
4
+ "speaker_id": 0,
5
+
6
+ "name": "Leto",
7
+ "author": "BartPoint",
8
+ "source": "Album",
9
+ "note": "Trained in RVC v2, Crepes 128, 600 epochs",
10
+ "icon": "Leto.png"
11
+ }
model/leto/leto.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:357488113dc890fdb15d68672d34e847a3355b58548e7bb3071aa1dec268b201
3
+ size 55216420
model/macmiller/added_IVF2124_Flat_nprobe_1_v2.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a83907d49984646512ee119fb5ee8e2402d45795c8d7244981820ce6d14dc11
3
+ size 261741619
model/macmiller/config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "macmillerv3.pth",
3
+ "feat_index": "added_IVF2124_Flat_nprobe_1_v2.index",
4
+ "speaker_id": 0,
5
+
6
+ "name": "MacMiller",
7
+ "source": "ALL",
8
+ "author": "HZY",
9
+ "note": "Trained on Crepe, 28 minute data-set, 300 epochs",
10
+ "icon": "macmiller.png"
11
+ }
model/macmiller/macmiller.png ADDED

Git LFS Details

  • SHA256: d1b4b3d0e625e4c0045f678b312463ec185b9ea36a241931049c5d363a7fe00f
  • Pointer size: 132 Bytes
  • Size of remote file: 2.14 MB
model/macmiller/macmillerv3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4b2e20687c0288c79b1ddbb07107c9c00e458af46f2e62f6c713328a459ae3e
3
+ size 55225115
model/mickaeljackson/Mickael.png ADDED
model/mickaeljackson/added_IVF1448_Flat_nprobe_1_v2.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f98424e9a36aa1fc3f753ba1fcded6f6992c8094cd383576c5af17aa8c9d301
3
+ size 178421459
model/mickaeljackson/config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "michael-jackson.pth",
3
+ "feat_index": "added_IVF1448_Flat_nprobe_1_v2.index",
4
+ "speaker_id": 0,
5
+
6
+ "name": "Mickael Jackson",
7
+ "author": "REU Music",
8
+ "source": "ALL",
9
+ "note": "Trained by me with a 20 minute dataset, based on ERA Off the Wall + Thriller",
10
+ "icon": "Mickael.png"
11
+ }
model/mickaeljackson/michael-jackson.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae7aecdd7fb262d2f38c24a24ad167cf7ac45e05437b51e1df0fe37d22e75f2f
3
+ size 55229246
model/oboy/Oboy.png ADDED
model/oboy/added_IVF419_Flat_nprobe_1_v2.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb6b348e7ccac14ba8fe1ff264d152393a6c07e44abae257744053a3c17fa457
3
+ size 51642499
model/oboy/config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "oboy.pth",
3
+ "feat_index": "added_IVF419_Flat_nprobe_1_v2.index",
4
+ "speaker_id": 0,
5
+
6
+ "name": "Oboy",
7
+ "author": "BartPoint",
8
+ "source": "ALL",
9
+ "note": "(Dataset not by BartPoint), Trained in RVC v2, Crepe 78",
10
+ "icon": "Oboy.png"
11
+ }
model/oboy/oboy.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4f1308db21eae7336098479e562d3d6d27eff2329b21c772a44f74d5d9c9f43
3
+ size 55192782
model/zamdane/added_IVF966_Flat_nprobe_1_v2.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37a95dfa85e895185ad8759753cbf60628bca7a7e9c3d96832b6f20184764fe8
3
+ size 119054459
model/zamdane/config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "zamdane.pth",
3
+ "feat_index": "added_IVF966_Flat_nprobe_1_v2.index",
4
+ "speaker_id": 0,
5
+
6
+ "name": "Zamdane",
7
+ "author": "Me",
8
+ "source": "ALL",
9
+ "note": "Will see after",
10
+ "icon": "zamdane.png"
11
+ }
model/zamdane/zamdane.png ADDED
model/zamdane/zamdane.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09a9278e6959fe2e8f0a5e926553004a36f6592594743abf5f8de89321a49887
3
+ size 55224197