kevinwang676 and Silentlin committed on
Commit
b2c3cd9
0 Parent(s):

Duplicate from Silentlin/DiffSinger


Co-authored-by: Jinglin Liu <[email protected]>

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +33 -0
  2. LICENSE +21 -0
  3. README.md +10 -0
  4. checkpoints/.gitattributes +1 -0
  5. checkpoints/.gitkeep +0 -0
  6. checkpoints/0102_xiaoma_pe/config.yaml +172 -0
  7. checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt +3 -0
  8. checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml +241 -0
  9. checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt +3 -0
  10. checkpoints/0228_opencpop_ds100_rel/config.yaml +342 -0
  11. checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt +3 -0
  12. checkpoints/0831_opencpop_ds1000/config.yaml +346 -0
  13. checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt +3 -0
  14. checkpoints/clean.py +12 -0
  15. checkpoints/cleaner.py +8 -0
  16. configs/config_base.yaml +42 -0
  17. configs/singing/base.yaml +42 -0
  18. configs/singing/fs2.yaml +3 -0
  19. configs/tts/base.yaml +95 -0
  20. configs/tts/base_zh.yaml +3 -0
  21. configs/tts/fs2.yaml +80 -0
  22. configs/tts/hifigan.yaml +21 -0
  23. configs/tts/lj/base_mel2wav.yaml +3 -0
  24. configs/tts/lj/base_text2mel.yaml +13 -0
  25. configs/tts/lj/fs2.yaml +3 -0
  26. configs/tts/lj/hifigan.yaml +3 -0
  27. configs/tts/lj/pwg.yaml +3 -0
  28. configs/tts/pwg.yaml +110 -0
  29. data/processed/ljspeech/dict.txt +77 -0
  30. data/processed/ljspeech/metadata_phone.csv +0 -0
  31. data/processed/ljspeech/mfa_dict.txt +0 -0
  32. data/processed/ljspeech/phone_set.json +1 -0
  33. data_gen/singing/binarize.py +398 -0
  34. data_gen/tts/base_binarizer.py +224 -0
  35. data_gen/tts/bin/binarize.py +20 -0
  36. data_gen/tts/binarizer_zh.py +59 -0
  37. data_gen/tts/data_gen_utils.py +347 -0
  38. data_gen/tts/txt_processors/base_text_processor.py +8 -0
  39. data_gen/tts/txt_processors/en.py +78 -0
  40. data_gen/tts/txt_processors/zh.py +41 -0
  41. data_gen/tts/txt_processors/zh_g2pM.py +72 -0
  42. docs/README-SVS-opencpop-cascade.md +111 -0
  43. docs/README-SVS-opencpop-e2e.md +107 -0
  44. docs/README-SVS-popcs.md +63 -0
  45. docs/README-SVS.md +76 -0
  46. docs/README-TTS.md +69 -0
  47. docs/README-zh.md +212 -0
  48. inference/svs/base_svs_infer.py +265 -0
  49. inference/svs/ds_cascade.py +56 -0
  50. inference/svs/ds_e2e.py +67 -0
.gitattributes ADDED
@@ -0,0 +1,33 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model_ckpt_steps* filter=lfs diff=lfs merge=lfs -text
+ checkpoints/0831_opencpop_ds1000 filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2021 Jinglin Liu
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: DiffSinger🎶 Diffusion for Singing Voice Synthesis
+ emoji: 🎶
+ colorFrom: purple
+ colorTo: blue
+ sdk: gradio
+ app_file: inference/svs/gradio/infer.py
+ pinned: false
+ duplicated_from: Silentlin/DiffSinger
+ ---
checkpoints/.gitattributes ADDED
@@ -0,0 +1 @@
+ model_ckpt_steps* filter=lfs diff=lfs merge=lfs -text
checkpoints/.gitkeep ADDED
File without changes
checkpoints/0102_xiaoma_pe/config.yaml ADDED
@@ -0,0 +1,172 @@
1
+ accumulate_grad_batches: 1
2
+ audio_num_mel_bins: 80
3
+ audio_sample_rate: 24000
4
+ base_config:
5
+ - configs/tts/lj/fs2.yaml
6
+ binarization_args:
7
+ shuffle: false
8
+ with_align: true
9
+ with_f0: true
10
+ with_f0cwt: true
11
+ with_spk_embed: true
12
+ with_txt: true
13
+ with_wav: false
14
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
15
+ binary_data_dir: data/binary/xiaoma1022_24k_128hop
16
+ check_val_every_n_epoch: 10
17
+ clip_grad_norm: 1
18
+ cwt_add_f0_loss: false
19
+ cwt_hidden_size: 128
20
+ cwt_layers: 2
21
+ cwt_loss: l1
22
+ cwt_std_scale: 0.8
23
+ debug: false
24
+ dec_ffn_kernel_size: 9
25
+ dec_layers: 4
26
+ decoder_type: fft
27
+ dict_dir: ''
28
+ dropout: 0.1
29
+ ds_workers: 4
30
+ dur_enc_hidden_stride_kernel:
31
+ - 0,2,3
32
+ - 0,2,3
33
+ - 0,1,3
34
+ dur_loss: mse
35
+ dur_predictor_kernel: 3
36
+ dur_predictor_layers: 2
37
+ enc_ffn_kernel_size: 9
38
+ enc_layers: 4
39
+ encoder_K: 8
40
+ encoder_type: fft
41
+ endless_ds: true
42
+ ffn_act: gelu
43
+ ffn_padding: SAME
44
+ fft_size: 512
45
+ fmax: 12000
46
+ fmin: 30
47
+ gen_dir_name: ''
48
+ hidden_size: 256
49
+ hop_size: 128
50
+ infer: false
51
+ lambda_commit: 0.25
52
+ lambda_energy: 0.1
53
+ lambda_f0: 1.0
54
+ lambda_ph_dur: 1.0
55
+ lambda_sent_dur: 1.0
56
+ lambda_uv: 1.0
57
+ lambda_word_dur: 1.0
58
+ load_ckpt: ''
59
+ log_interval: 100
60
+ loud_norm: false
61
+ lr: 2.0
62
+ max_epochs: 1000
63
+ max_eval_sentences: 1
64
+ max_eval_tokens: 60000
65
+ max_frames: 5000
66
+ max_input_tokens: 1550
67
+ max_sentences: 100000
68
+ max_tokens: 20000
69
+ max_updates: 60000
70
+ mel_loss: l1
71
+ mel_vmax: 1.5
72
+ mel_vmin: -6
73
+ min_level_db: -120
74
+ norm_type: gn
75
+ num_ckpt_keep: 3
76
+ num_heads: 2
77
+ num_sanity_val_steps: 5
78
+ num_spk: 1
79
+ num_test_samples: 20
80
+ num_valid_plots: 10
81
+ optimizer_adam_beta1: 0.9
82
+ optimizer_adam_beta2: 0.98
83
+ out_wav_norm: false
84
+ pitch_ar: false
85
+ pitch_enc_hidden_stride_kernel:
86
+ - 0,2,5
87
+ - 0,2,5
88
+ - 0,2,5
89
+ pitch_extractor_conv_layers: 2
90
+ pitch_loss: l1
91
+ pitch_norm: log
92
+ pitch_type: frame
93
+ pre_align_args:
94
+ allow_no_txt: false
95
+ denoise: false
96
+ forced_align: mfa
97
+ txt_processor: en
98
+ use_sox: false
99
+ use_tone: true
100
+ pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
101
+ predictor_dropout: 0.5
102
+ predictor_grad: 0.1
103
+ predictor_hidden: -1
104
+ predictor_kernel: 5
105
+ predictor_layers: 2
106
+ prenet_dropout: 0.5
107
+ prenet_hidden_size: 256
108
+ pretrain_fs_ckpt: ''
109
+ processed_data_dir: data/processed/ljspeech
110
+ profile_infer: false
111
+ raw_data_dir: data/raw/LJSpeech-1.1
112
+ ref_norm_layer: bn
113
+ reset_phone_dict: true
114
+ save_best: false
115
+ save_ckpt: true
116
+ save_codes:
117
+ - configs
118
+ - modules
119
+ - tasks
120
+ - utils
121
+ - usr
122
+ save_f0: false
123
+ save_gt: false
124
+ seed: 1234
125
+ sort_by_len: true
126
+ stop_token_weight: 5.0
127
+ task_cls: tasks.tts.pe.PitchExtractionTask
128
+ test_ids:
129
+ - 68
130
+ - 70
131
+ - 74
132
+ - 87
133
+ - 110
134
+ - 172
135
+ - 190
136
+ - 215
137
+ - 231
138
+ - 294
139
+ - 316
140
+ - 324
141
+ - 402
142
+ - 422
143
+ - 485
144
+ - 500
145
+ - 505
146
+ - 508
147
+ - 509
148
+ - 519
149
+ test_input_dir: ''
150
+ test_num: 523
151
+ test_set_name: test
152
+ train_set_name: train
153
+ use_denoise: false
154
+ use_energy_embed: false
155
+ use_gt_dur: false
156
+ use_gt_f0: false
157
+ use_pitch_embed: true
158
+ use_pos_embed: true
159
+ use_spk_embed: false
160
+ use_spk_id: false
161
+ use_split_spk_id: false
162
+ use_uv: true
163
+ use_var_enc: false
164
+ val_check_interval: 2000
165
+ valid_num: 348
166
+ valid_set_name: valid
167
+ vocoder: pwg
168
+ vocoder_ckpt: ''
169
+ warmup_updates: 2000
170
+ weight_decay: 0
171
+ win_size: 512
172
+ work_dir: checkpoints/0102_xiaoma_pe
checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53942abd8cb908b6d161e1ad7ff3d7d0dd6b204d5bf050613c9d00c56b185ceb
+ size 13047222
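The three lines above are a Git LFS pointer, not the checkpoint itself; the binary is materialized by `git lfs pull` according to the rules in `.gitattributes`. As a minimal illustration (not part of the commit), the sketch below parses such a pointer file and reports whether a local checkpoint still needs to be fetched; the helper name and logic are assumptions based only on the `version`/`oid`/`size` layout shown above.

```python
import sys

def read_lfs_pointer(path):
    """Return the pointer fields (version, oid, size), or None if the file is a real binary."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            first = f.readline()
            if not first.startswith('version https://git-lfs.github.com/spec/v1'):
                return None  # already the fetched binary, not a pointer
            fields = dict(line.strip().split(' ', 1) for line in f if line.strip())
            fields['version'] = first.split(' ', 1)[1].strip()
            return fields
    except UnicodeDecodeError:
        return None  # binary content cannot be a text pointer

if __name__ == '__main__':
    info = read_lfs_pointer(sys.argv[1])
    if info is None:
        print('real file: LFS object already fetched')
    else:
        print(f"pointer only: {info['oid']} ({info['size']} bytes) -- run `git lfs pull`")
```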
checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml ADDED
@@ -0,0 +1,241 @@
1
+ accumulate_grad_batches: 1
2
+ adam_b1: 0.8
3
+ adam_b2: 0.99
4
+ amp: false
5
+ audio_num_mel_bins: 80
6
+ audio_sample_rate: 24000
7
+ aux_context_window: 0
8
+ #base_config:
9
+ #- egs/egs_bases/singing/pwg.yaml
10
+ #- egs/egs_bases/tts/vocoder/hifigan.yaml
11
+ binarization_args:
12
+ reset_phone_dict: true
13
+ reset_word_dict: true
14
+ shuffle: false
15
+ trim_eos_bos: false
16
+ trim_sil: false
17
+ with_align: false
18
+ with_f0: true
19
+ with_f0cwt: false
20
+ with_linear: false
21
+ with_spk_embed: false
22
+ with_spk_id: true
23
+ with_txt: false
24
+ with_wav: true
25
+ with_word: false
26
+ binarizer_cls: data_gen.tts.singing.binarize.SingingBinarizer
27
+ binary_data_dir: data/binary/big_popcs_24k_hop128
28
+ check_val_every_n_epoch: 10
29
+ clip_grad_norm: 1
30
+ clip_grad_value: 0
31
+ datasets: []
32
+ debug: false
33
+ dec_ffn_kernel_size: 9
34
+ dec_layers: 4
35
+ dict_dir: ''
36
+ disc_start_steps: 40000
37
+ discriminator_grad_norm: 1
38
+ discriminator_optimizer_params:
39
+ eps: 1.0e-06
40
+ lr: 0.0002
41
+ weight_decay: 0.0
42
+ discriminator_params:
43
+ bias: true
44
+ conv_channels: 64
45
+ in_channels: 1
46
+ kernel_size: 3
47
+ layers: 10
48
+ nonlinear_activation: LeakyReLU
49
+ nonlinear_activation_params:
50
+ negative_slope: 0.2
51
+ out_channels: 1
52
+ use_weight_norm: true
53
+ discriminator_scheduler_params:
54
+ gamma: 0.999
55
+ step_size: 600
56
+ dropout: 0.1
57
+ ds_workers: 1
58
+ enc_ffn_kernel_size: 9
59
+ enc_layers: 4
60
+ endless_ds: true
61
+ ffn_act: gelu
62
+ ffn_padding: SAME
63
+ fft_size: 512
64
+ fmax: 12000
65
+ fmin: 30
66
+ frames_multiple: 1
67
+ gen_dir_name: ''
68
+ generator_grad_norm: 10
69
+ generator_optimizer_params:
70
+ eps: 1.0e-06
71
+ lr: 0.0002
72
+ weight_decay: 0.0
73
+ generator_params:
74
+ aux_channels: 80
75
+ dropout: 0.0
76
+ gate_channels: 128
77
+ in_channels: 1
78
+ kernel_size: 3
79
+ layers: 30
80
+ out_channels: 1
81
+ residual_channels: 64
82
+ skip_channels: 64
83
+ stacks: 3
84
+ upsample_net: ConvInUpsampleNetwork
85
+ upsample_params:
86
+ upsample_scales:
87
+ - 2
88
+ - 4
89
+ - 4
90
+ - 4
91
+ use_nsf: false
92
+ use_pitch_embed: true
93
+ use_weight_norm: true
94
+ generator_scheduler_params:
95
+ gamma: 0.999
96
+ step_size: 600
97
+ griffin_lim_iters: 60
98
+ hidden_size: 256
99
+ hop_size: 128
100
+ infer: false
101
+ lambda_adv: 1.0
102
+ lambda_cdisc: 4.0
103
+ lambda_energy: 0.0
104
+ lambda_f0: 0.0
105
+ lambda_mel: 5.0
106
+ lambda_mel_adv: 1.0
107
+ lambda_ph_dur: 0.0
108
+ lambda_sent_dur: 0.0
109
+ lambda_uv: 0.0
110
+ lambda_word_dur: 0.0
111
+ load_ckpt: ''
112
+ loud_norm: false
113
+ lr: 2.0
114
+ max_epochs: 1000
115
+ max_frames: 2400
116
+ max_input_tokens: 1550
117
+ max_samples: 8192
118
+ max_sentences: 20
119
+ max_tokens: 24000
120
+ max_updates: 3000000
121
+ max_valid_sentences: 1
122
+ max_valid_tokens: 60000
123
+ mel_loss: ssim:0.5|l1:0.5
124
+ mel_vmax: 1.5
125
+ mel_vmin: -6
126
+ min_frames: 0
127
+ min_level_db: -120
128
+ num_ckpt_keep: 3
129
+ num_heads: 2
130
+ num_mels: 80
131
+ num_sanity_val_steps: 5
132
+ num_spk: 100
133
+ num_test_samples: 0
134
+ num_valid_plots: 10
135
+ optimizer_adam_beta1: 0.9
136
+ optimizer_adam_beta2: 0.98
137
+ out_wav_norm: false
138
+ pitch_extractor: parselmouth
139
+ pitch_type: frame
140
+ pre_align_args:
141
+ allow_no_txt: false
142
+ denoise: false
143
+ sox_resample: true
144
+ sox_to_wav: false
145
+ trim_sil: false
146
+ txt_processor: zh
147
+ use_tone: false
148
+ pre_align_cls: data_gen.tts.singing.pre_align.SingingPreAlign
149
+ predictor_grad: 0.0
150
+ print_nan_grads: false
151
+ processed_data_dir: ''
152
+ profile_infer: false
153
+ raw_data_dir: ''
154
+ ref_level_db: 20
155
+ rename_tmux: true
156
+ rerun_gen: true
157
+ resblock: '1'
158
+ resblock_dilation_sizes:
159
+ - - 1
160
+ - 3
161
+ - 5
162
+ - - 1
163
+ - 3
164
+ - 5
165
+ - - 1
166
+ - 3
167
+ - 5
168
+ resblock_kernel_sizes:
169
+ - 3
170
+ - 7
171
+ - 11
172
+ resume_from_checkpoint: 0
173
+ save_best: true
174
+ save_codes: []
175
+ save_f0: true
176
+ save_gt: true
177
+ scheduler: rsqrt
178
+ seed: 1234
179
+ sort_by_len: true
180
+ stft_loss_params:
181
+ fft_sizes:
182
+ - 1024
183
+ - 2048
184
+ - 512
185
+ hop_sizes:
186
+ - 120
187
+ - 240
188
+ - 50
189
+ win_lengths:
190
+ - 600
191
+ - 1200
192
+ - 240
193
+ window: hann_window
194
+ task_cls: tasks.vocoder.hifigan.HifiGanTask
195
+ tb_log_interval: 100
196
+ test_ids: []
197
+ test_input_dir: ''
198
+ test_num: 50
199
+ test_prefixes: []
200
+ test_set_name: test
201
+ train_set_name: train
202
+ train_sets: ''
203
+ upsample_initial_channel: 512
204
+ upsample_kernel_sizes:
205
+ - 16
206
+ - 16
207
+ - 4
208
+ - 4
209
+ upsample_rates:
210
+ - 8
211
+ - 4
212
+ - 2
213
+ - 2
214
+ use_cdisc: false
215
+ use_cond_disc: false
216
+ use_fm_loss: false
217
+ use_gt_dur: true
218
+ use_gt_f0: true
219
+ use_mel_loss: true
220
+ use_ms_stft: false
221
+ use_pitch_embed: true
222
+ use_ref_enc: true
223
+ use_spec_disc: false
224
+ use_spk_embed: false
225
+ use_spk_id: false
226
+ use_split_spk_id: false
227
+ val_check_interval: 2000
228
+ valid_infer_interval: 10000
229
+ valid_monitor_key: val_loss
230
+ valid_monitor_mode: min
231
+ valid_set_name: valid
232
+ vocoder: pwg
233
+ vocoder_ckpt: ''
234
+ vocoder_denoise_c: 0.0
235
+ warmup_updates: 8000
236
+ weight_decay: 0
237
+ win_length: null
238
+ win_size: 512
239
+ window: hann
240
+ word_size: 3000
241
+ work_dir: checkpoints/0109_hifigan_bigpopcs_hop128
checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1cb68f3ce0c46ba0a8b6d49718f1fffdf5bd7bcab769a986fd2fd129835cc1d1
+ size 55827436
checkpoints/0228_opencpop_ds100_rel/config.yaml ADDED
@@ -0,0 +1,342 @@
1
+ K_step: 100
2
+ accumulate_grad_batches: 1
3
+ audio_num_mel_bins: 80
4
+ audio_sample_rate: 24000
5
+ base_config:
6
+ - usr/configs/popcs_ds_beta6.yaml
7
+ - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
8
+ binarization_args:
9
+ shuffle: false
10
+ with_align: true
11
+ with_f0: true
12
+ with_f0cwt: true
13
+ with_spk_embed: false
14
+ with_txt: true
15
+ with_wav: true
16
+ binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
17
+ binary_data_dir: data/binary/opencpop-midi-dp
18
+ check_val_every_n_epoch: 10
19
+ clip_grad_norm: 1
20
+ content_cond_steps: []
21
+ cwt_add_f0_loss: false
22
+ cwt_hidden_size: 128
23
+ cwt_layers: 2
24
+ cwt_loss: l1
25
+ cwt_std_scale: 0.8
26
+ datasets:
27
+ - popcs
28
+ debug: false
29
+ dec_ffn_kernel_size: 9
30
+ dec_layers: 4
31
+ decay_steps: 50000
32
+ decoder_type: fft
33
+ dict_dir: ''
34
+ diff_decoder_type: wavenet
35
+ diff_loss_type: l1
36
+ dilation_cycle_length: 4
37
+ dropout: 0.1
38
+ ds_workers: 4
39
+ dur_enc_hidden_stride_kernel:
40
+ - 0,2,3
41
+ - 0,2,3
42
+ - 0,1,3
43
+ dur_loss: mse
44
+ dur_predictor_kernel: 3
45
+ dur_predictor_layers: 5
46
+ enc_ffn_kernel_size: 9
47
+ enc_layers: 4
48
+ encoder_K: 8
49
+ encoder_type: fft
50
+ endless_ds: true
51
+ ffn_act: gelu
52
+ ffn_padding: SAME
53
+ fft_size: 512
54
+ fmax: 12000
55
+ fmin: 30
56
+ fs2_ckpt: ''
57
+ gaussian_start: true
58
+ gen_dir_name: ''
59
+ gen_tgt_spk_id: -1
60
+ hidden_size: 256
61
+ hop_size: 128
62
+ infer: false
63
+ keep_bins: 80
64
+ lambda_commit: 0.25
65
+ lambda_energy: 0.0
66
+ lambda_f0: 0.0
67
+ lambda_ph_dur: 1.0
68
+ lambda_sent_dur: 1.0
69
+ lambda_uv: 0.0
70
+ lambda_word_dur: 1.0
71
+ load_ckpt: ''
72
+ log_interval: 100
73
+ loud_norm: false
74
+ lr: 0.001
75
+ max_beta: 0.06
76
+ max_epochs: 1000
77
+ max_eval_sentences: 1
78
+ max_eval_tokens: 60000
79
+ max_frames: 8000
80
+ max_input_tokens: 1550
81
+ max_sentences: 48
82
+ max_tokens: 40000
83
+ max_updates: 160000
84
+ mel_loss: ssim:0.5|l1:0.5
85
+ mel_vmax: 1.5
86
+ mel_vmin: -6.0
87
+ min_level_db: -120
88
+ norm_type: gn
89
+ num_ckpt_keep: 3
90
+ num_heads: 2
91
+ num_sanity_val_steps: 1
92
+ num_spk: 1
93
+ num_test_samples: 0
94
+ num_valid_plots: 10
95
+ optimizer_adam_beta1: 0.9
96
+ optimizer_adam_beta2: 0.98
97
+ out_wav_norm: false
98
+ pe_ckpt: checkpoints/0102_xiaoma_pe
99
+ pe_enable: true
100
+ pitch_ar: false
101
+ pitch_enc_hidden_stride_kernel:
102
+ - 0,2,5
103
+ - 0,2,5
104
+ - 0,2,5
105
+ pitch_extractor: parselmouth
106
+ pitch_loss: l1
107
+ pitch_norm: log
108
+ pitch_type: frame
109
+ pre_align_args:
110
+ allow_no_txt: false
111
+ denoise: false
112
+ forced_align: mfa
113
+ txt_processor: zh_g2pM
114
+ use_sox: true
115
+ use_tone: false
116
+ pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
117
+ predictor_dropout: 0.5
118
+ predictor_grad: 0.1
119
+ predictor_hidden: -1
120
+ predictor_kernel: 5
121
+ predictor_layers: 5
122
+ prenet_dropout: 0.5
123
+ prenet_hidden_size: 256
124
+ pretrain_fs_ckpt: ''
125
+ processed_data_dir: data/processed/popcs
126
+ profile_infer: false
127
+ raw_data_dir: data/raw/popcs
128
+ ref_norm_layer: bn
129
+ rel_pos: true
130
+ reset_phone_dict: true
131
+ residual_channels: 256
132
+ residual_layers: 20
133
+ save_best: false
134
+ save_ckpt: true
135
+ save_codes:
136
+ - configs
137
+ - modules
138
+ - tasks
139
+ - utils
140
+ - usr
141
+ save_f0: true
142
+ save_gt: false
143
+ schedule_type: linear
144
+ seed: 1234
145
+ sort_by_len: true
146
+ spec_max:
147
+ - -0.79453
148
+ - -0.81116
149
+ - -0.61631
150
+ - -0.30679
151
+ - -0.13863
152
+ - -0.050652
153
+ - -0.11563
154
+ - -0.10679
155
+ - -0.091068
156
+ - -0.062174
157
+ - -0.075302
158
+ - -0.072217
159
+ - -0.063815
160
+ - -0.073299
161
+ - 0.007361
162
+ - -0.072508
163
+ - -0.050234
164
+ - -0.16534
165
+ - -0.26928
166
+ - -0.20782
167
+ - -0.20823
168
+ - -0.11702
169
+ - -0.070128
170
+ - -0.065868
171
+ - -0.012675
172
+ - 0.0015121
173
+ - -0.089902
174
+ - -0.21392
175
+ - -0.23789
176
+ - -0.28922
177
+ - -0.30405
178
+ - -0.23029
179
+ - -0.22088
180
+ - -0.21542
181
+ - -0.29367
182
+ - -0.30137
183
+ - -0.38281
184
+ - -0.4359
185
+ - -0.28681
186
+ - -0.46855
187
+ - -0.57485
188
+ - -0.47022
189
+ - -0.54266
190
+ - -0.44848
191
+ - -0.6412
192
+ - -0.687
193
+ - -0.6486
194
+ - -0.76436
195
+ - -0.49971
196
+ - -0.71068
197
+ - -0.69724
198
+ - -0.61487
199
+ - -0.55843
200
+ - -0.69773
201
+ - -0.57502
202
+ - -0.70919
203
+ - -0.82431
204
+ - -0.84213
205
+ - -0.90431
206
+ - -0.8284
207
+ - -0.77945
208
+ - -0.82758
209
+ - -0.87699
210
+ - -1.0532
211
+ - -1.0766
212
+ - -1.1198
213
+ - -1.0185
214
+ - -0.98983
215
+ - -1.0001
216
+ - -1.0756
217
+ - -1.0024
218
+ - -1.0304
219
+ - -1.0579
220
+ - -1.0188
221
+ - -1.05
222
+ - -1.0842
223
+ - -1.0923
224
+ - -1.1223
225
+ - -1.2381
226
+ - -1.6467
227
+ spec_min:
228
+ - -6.0
229
+ - -6.0
230
+ - -6.0
231
+ - -6.0
232
+ - -6.0
233
+ - -6.0
234
+ - -6.0
235
+ - -6.0
236
+ - -6.0
237
+ - -6.0
238
+ - -6.0
239
+ - -6.0
240
+ - -6.0
241
+ - -6.0
242
+ - -6.0
243
+ - -6.0
244
+ - -6.0
245
+ - -6.0
246
+ - -6.0
247
+ - -6.0
248
+ - -6.0
249
+ - -6.0
250
+ - -6.0
251
+ - -6.0
252
+ - -6.0
253
+ - -6.0
254
+ - -6.0
255
+ - -6.0
256
+ - -6.0
257
+ - -6.0
258
+ - -6.0
259
+ - -6.0
260
+ - -6.0
261
+ - -6.0
262
+ - -6.0
263
+ - -6.0
264
+ - -6.0
265
+ - -6.0
266
+ - -6.0
267
+ - -6.0
268
+ - -6.0
269
+ - -6.0
270
+ - -6.0
271
+ - -6.0
272
+ - -6.0
273
+ - -6.0
274
+ - -6.0
275
+ - -6.0
276
+ - -6.0
277
+ - -6.0
278
+ - -6.0
279
+ - -6.0
280
+ - -6.0
281
+ - -6.0
282
+ - -6.0
283
+ - -6.0
284
+ - -6.0
285
+ - -6.0
286
+ - -6.0
287
+ - -6.0
288
+ - -6.0
289
+ - -6.0
290
+ - -6.0
291
+ - -6.0
292
+ - -6.0
293
+ - -6.0
294
+ - -6.0
295
+ - -6.0
296
+ - -6.0
297
+ - -6.0
298
+ - -6.0
299
+ - -6.0
300
+ - -6.0
301
+ - -6.0
302
+ - -6.0
303
+ - -6.0
304
+ - -6.0
305
+ - -6.0
306
+ - -6.0
307
+ - -6.0
308
+ spk_cond_steps: []
309
+ stop_token_weight: 5.0
310
+ task_cls: usr.diffsinger_task.DiffSingerMIDITask
311
+ test_ids: []
312
+ test_input_dir: ''
313
+ test_num: 0
314
+ test_prefixes:
315
+ - "popcs-\u8BF4\u6563\u5C31\u6563"
316
+ - "popcs-\u9690\u5F62\u7684\u7FC5\u8180"
317
+ test_set_name: test
318
+ timesteps: 100
319
+ train_set_name: train
320
+ use_denoise: false
321
+ use_energy_embed: false
322
+ use_gt_dur: false
323
+ use_gt_f0: false
324
+ use_midi: true
325
+ use_nsf: true
326
+ use_pitch_embed: false
327
+ use_pos_embed: true
328
+ use_spk_embed: false
329
+ use_spk_id: false
330
+ use_split_spk_id: false
331
+ use_uv: true
332
+ use_var_enc: false
333
+ val_check_interval: 2000
334
+ valid_num: 0
335
+ valid_set_name: valid
336
+ vocoder: vocoders.hifigan.HifiGAN
337
+ vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
338
+ warmup_updates: 2000
339
+ wav2spec_eps: 1e-6
340
+ weight_decay: 0
341
+ win_size: 512
342
+ work_dir: checkpoints/0228_opencpop_ds100_rel
checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a8261f7415bb39eb80a19d4c27c0ea084f63af2fdf6b82e63fcbd9cd82fc90c
+ size 170226367
checkpoints/0831_opencpop_ds1000/config.yaml ADDED
@@ -0,0 +1,346 @@
1
+ K_step: 1000
2
+ accumulate_grad_batches: 1
3
+ audio_num_mel_bins: 80
4
+ audio_sample_rate: 24000
5
+ base_config:
6
+ - usr/configs/popcs_ds_beta6.yaml
7
+ - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
8
+ binarization_args:
9
+ shuffle: false
10
+ with_align: true
11
+ with_f0: true
12
+ with_f0cwt: true
13
+ with_spk_embed: false
14
+ with_txt: true
15
+ with_wav: true
16
+ binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
17
+ binary_data_dir: data/binary/opencpop-midi-dp
18
+ check_val_every_n_epoch: 10
19
+ clip_grad_norm: 1
20
+ content_cond_steps: []
21
+ cwt_add_f0_loss: false
22
+ cwt_hidden_size: 128
23
+ cwt_layers: 2
24
+ cwt_loss: l1
25
+ cwt_std_scale: 0.8
26
+ datasets:
27
+ - opencpop
28
+ debug: false
29
+ dec_ffn_kernel_size: 9
30
+ dec_layers: 4
31
+ decay_steps: 50000
32
+ decoder_type: fft
33
+ dict_dir: ''
34
+ diff_decoder_type: wavenet
35
+ diff_loss_type: l1
36
+ dilation_cycle_length: 4
37
+ dropout: 0.1
38
+ ds_workers: 4
39
+ dur_enc_hidden_stride_kernel:
40
+ - 0,2,3
41
+ - 0,2,3
42
+ - 0,1,3
43
+ dur_loss: mse
44
+ dur_predictor_kernel: 3
45
+ dur_predictor_layers: 5
46
+ enc_ffn_kernel_size: 9
47
+ enc_layers: 4
48
+ encoder_K: 8
49
+ encoder_type: fft
50
+ endless_ds: true
51
+ ffn_act: gelu
52
+ ffn_padding: SAME
53
+ fft_size: 512
54
+ fmax: 12000
55
+ fmin: 30
56
+ fs2_ckpt: ''
57
+ gaussian_start: true
58
+ gen_dir_name: ''
59
+ gen_tgt_spk_id: -1
60
+ hidden_size: 256
61
+ hop_size: 128
62
+ infer: false
63
+ keep_bins: 80
64
+ lambda_commit: 0.25
65
+ lambda_energy: 0.0
66
+ lambda_f0: 0.0
67
+ lambda_ph_dur: 1.0
68
+ lambda_sent_dur: 1.0
69
+ lambda_uv: 0.0
70
+ lambda_word_dur: 1.0
71
+ load_ckpt: ''
72
+ log_interval: 100
73
+ loud_norm: false
74
+ lr: 0.001
75
+ max_beta: 0.02
76
+ max_epochs: 1000
77
+ max_eval_sentences: 1
78
+ max_eval_tokens: 60000
79
+ max_frames: 8000
80
+ max_input_tokens: 1550
81
+ max_sentences: 48
82
+ max_tokens: 36000
83
+ max_updates: 320000
84
+ mel_loss: ssim:0.5|l1:0.5
85
+ mel_vmax: 1.5
86
+ mel_vmin: -6.0
87
+ min_level_db: -120
88
+ norm_type: gn
89
+ num_ckpt_keep: 3
90
+ num_heads: 2
91
+ num_sanity_val_steps: 1
92
+ num_spk: 1
93
+ num_test_samples: 0
94
+ num_valid_plots: 10
95
+ optimizer_adam_beta1: 0.9
96
+ optimizer_adam_beta2: 0.98
97
+ out_wav_norm: false
98
+ pe_ckpt: checkpoints/0102_xiaoma_pe
99
+ pe_enable: true
100
+ pitch_ar: false
101
+ pitch_enc_hidden_stride_kernel:
102
+ - 0,2,5
103
+ - 0,2,5
104
+ - 0,2,5
105
+ pitch_extractor: parselmouth
106
+ pitch_loss: l1
107
+ pitch_norm: log
108
+ pitch_type: frame
109
+ pre_align_args:
110
+ allow_no_txt: false
111
+ denoise: false
112
+ forced_align: mfa
113
+ txt_processor: zh_g2pM
114
+ use_sox: true
115
+ use_tone: false
116
+ pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
117
+ predictor_dropout: 0.5
118
+ predictor_grad: 0.1
119
+ predictor_hidden: -1
120
+ predictor_kernel: 5
121
+ predictor_layers: 5
122
+ prenet_dropout: 0.5
123
+ prenet_hidden_size: 256
124
+ pretrain_fs_ckpt: ''
125
+ processed_data_dir: xxx
126
+ profile_infer: false
127
+ raw_data_dir: data/raw/opencpop/segments
128
+ ref_norm_layer: bn
129
+ rel_pos: true
130
+ reset_phone_dict: true
131
+ residual_channels: 256
132
+ residual_layers: 20
133
+ save_best: false
134
+ save_ckpt: true
135
+ save_codes:
136
+ - configs
137
+ - modules
138
+ - tasks
139
+ - utils
140
+ - usr
141
+ save_f0: true
142
+ save_gt: false
143
+ schedule_type: linear
144
+ seed: 1234
145
+ sort_by_len: true
146
+ spec_max:
147
+ - -0.79453
148
+ - -0.81116
149
+ - -0.61631
150
+ - -0.30679
151
+ - -0.13863
152
+ - -0.050652
153
+ - -0.11563
154
+ - -0.10679
155
+ - -0.091068
156
+ - -0.062174
157
+ - -0.075302
158
+ - -0.072217
159
+ - -0.063815
160
+ - -0.073299
161
+ - 0.007361
162
+ - -0.072508
163
+ - -0.050234
164
+ - -0.16534
165
+ - -0.26928
166
+ - -0.20782
167
+ - -0.20823
168
+ - -0.11702
169
+ - -0.070128
170
+ - -0.065868
171
+ - -0.012675
172
+ - 0.0015121
173
+ - -0.089902
174
+ - -0.21392
175
+ - -0.23789
176
+ - -0.28922
177
+ - -0.30405
178
+ - -0.23029
179
+ - -0.22088
180
+ - -0.21542
181
+ - -0.29367
182
+ - -0.30137
183
+ - -0.38281
184
+ - -0.4359
185
+ - -0.28681
186
+ - -0.46855
187
+ - -0.57485
188
+ - -0.47022
189
+ - -0.54266
190
+ - -0.44848
191
+ - -0.6412
192
+ - -0.687
193
+ - -0.6486
194
+ - -0.76436
195
+ - -0.49971
196
+ - -0.71068
197
+ - -0.69724
198
+ - -0.61487
199
+ - -0.55843
200
+ - -0.69773
201
+ - -0.57502
202
+ - -0.70919
203
+ - -0.82431
204
+ - -0.84213
205
+ - -0.90431
206
+ - -0.8284
207
+ - -0.77945
208
+ - -0.82758
209
+ - -0.87699
210
+ - -1.0532
211
+ - -1.0766
212
+ - -1.1198
213
+ - -1.0185
214
+ - -0.98983
215
+ - -1.0001
216
+ - -1.0756
217
+ - -1.0024
218
+ - -1.0304
219
+ - -1.0579
220
+ - -1.0188
221
+ - -1.05
222
+ - -1.0842
223
+ - -1.0923
224
+ - -1.1223
225
+ - -1.2381
226
+ - -1.6467
227
+ spec_min:
228
+ - -6.0
229
+ - -6.0
230
+ - -6.0
231
+ - -6.0
232
+ - -6.0
233
+ - -6.0
234
+ - -6.0
235
+ - -6.0
236
+ - -6.0
237
+ - -6.0
238
+ - -6.0
239
+ - -6.0
240
+ - -6.0
241
+ - -6.0
242
+ - -6.0
243
+ - -6.0
244
+ - -6.0
245
+ - -6.0
246
+ - -6.0
247
+ - -6.0
248
+ - -6.0
249
+ - -6.0
250
+ - -6.0
251
+ - -6.0
252
+ - -6.0
253
+ - -6.0
254
+ - -6.0
255
+ - -6.0
256
+ - -6.0
257
+ - -6.0
258
+ - -6.0
259
+ - -6.0
260
+ - -6.0
261
+ - -6.0
262
+ - -6.0
263
+ - -6.0
264
+ - -6.0
265
+ - -6.0
266
+ - -6.0
267
+ - -6.0
268
+ - -6.0
269
+ - -6.0
270
+ - -6.0
271
+ - -6.0
272
+ - -6.0
273
+ - -6.0
274
+ - -6.0
275
+ - -6.0
276
+ - -6.0
277
+ - -6.0
278
+ - -6.0
279
+ - -6.0
280
+ - -6.0
281
+ - -6.0
282
+ - -6.0
283
+ - -6.0
284
+ - -6.0
285
+ - -6.0
286
+ - -6.0
287
+ - -6.0
288
+ - -6.0
289
+ - -6.0
290
+ - -6.0
291
+ - -6.0
292
+ - -6.0
293
+ - -6.0
294
+ - -6.0
295
+ - -6.0
296
+ - -6.0
297
+ - -6.0
298
+ - -6.0
299
+ - -6.0
300
+ - -6.0
301
+ - -6.0
302
+ - -6.0
303
+ - -6.0
304
+ - -6.0
305
+ - -6.0
306
+ - -6.0
307
+ - -6.0
308
+ spk_cond_steps: []
309
+ stop_token_weight: 5.0
310
+ task_cls: usr.diffsinger_task.DiffSingerMIDITask
311
+ test_ids: []
312
+ test_input_dir: ''
313
+ test_num: 0
314
+ test_prefixes:
315
+ - '2044'
316
+ - '2086'
317
+ - '2092'
318
+ - '2093'
319
+ - '2100'
320
+ test_set_name: test
321
+ timesteps: 1000
322
+ train_set_name: train
323
+ use_denoise: false
324
+ use_energy_embed: false
325
+ use_gt_dur: false
326
+ use_gt_f0: false
327
+ use_midi: true
328
+ use_nsf: true
329
+ use_pitch_embed: false
330
+ use_pos_embed: true
331
+ use_spk_embed: false
332
+ use_spk_id: false
333
+ use_split_spk_id: false
334
+ use_uv: true
335
+ use_var_enc: false
336
+ val_check_interval: 2000
337
+ valid_num: 0
338
+ valid_set_name: valid
339
+ vocoder: vocoders.hifigan.HifiGAN
340
+ vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
341
+ warmup_updates: 2000
342
+ wav2spec_eps: 1e-6
343
+ weight_decay: 0
344
+ win_size: 512
345
+ work_dir: checkpoints/0831_opencpop_ds1000
346
+ pndm_speedup: 10
checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:954a31208ee6afb6240d09454bb204c4fbc63cf70e2586bed0ab29b1dc964c9e
+ size 170269591
checkpoints/clean.py ADDED
@@ -0,0 +1,12 @@
+ import sys
+ import torch
+
+ if __name__ == '__main__':
+     ckpt_path = sys.argv[1]
+     checkpoint = torch.load(ckpt_path, map_location='cpu')
+     print(checkpoint['state_dict'].keys())
+     if 'model' in checkpoint['state_dict']:
+         checkpoint = {'state_dict': {'model': checkpoint['state_dict']['model']}}
+     else:
+         checkpoint = {'state_dict': {'model_gen': checkpoint['state_dict']['model_gen']}}
+     torch.save(checkpoint, ckpt_path, _use_new_zipfile_serialization=False)
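`clean.py` above slims a training checkpoint in place: it drops optimizer and scheduler state and keeps only the `model` weights (or `model_gen` for vocoder checkpoints). A hedged usage sketch, assuming the checkpoint layout produced by this repo's training tasks; the path below is one of the checkpoints added in this commit.

```python
# Shell invocation from the repo root:
#   python checkpoints/clean.py checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt
#
# Quick sanity check that the slimmed file still loads and holds only model weights:
import torch

ckpt = torch.load('checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt',
                  map_location='cpu')
assert set(ckpt.keys()) == {'state_dict'}                        # training state removed
assert set(ckpt['state_dict'].keys()) <= {'model', 'model_gen'}  # only model weights kept
weights = next(iter(ckpt['state_dict'].values()))
print(sum(p.numel() for p in weights.values()), 'parameters kept')
```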
checkpoints/cleaner.py ADDED
@@ -0,0 +1,8 @@
+ import sys
+ import torch
+
+ if __name__ == '__main__':
+     ckpt_path = sys.argv[1]
+     checkpoint = torch.load(ckpt_path, map_location='cpu')
+     checkpoint = {'state_dict': checkpoint['state_dict']}
+     torch.save(checkpoint, ckpt_path, _use_new_zipfile_serialization=False)
configs/config_base.yaml ADDED
@@ -0,0 +1,42 @@
1
+ # task
2
+ binary_data_dir: ''
3
+ work_dir: '' # experiment directory.
4
+ infer: false # infer
5
+ seed: 1234
6
+ debug: false
7
+ save_codes:
8
+ - configs
9
+ - modules
10
+ - tasks
11
+ - utils
12
+ - usr
13
+
14
+ #############
15
+ # dataset
16
+ #############
17
+ ds_workers: 1
18
+ test_num: 100
19
+ valid_num: 100
20
+ endless_ds: false
21
+ sort_by_len: true
22
+
23
+ #########
24
+ # train and eval
25
+ #########
26
+ load_ckpt: ''
27
+ save_ckpt: true
28
+ save_best: false
29
+ num_ckpt_keep: 3
30
+ clip_grad_norm: 0
31
+ accumulate_grad_batches: 1
32
+ log_interval: 100
33
+ num_sanity_val_steps: 5 # steps of validation at the beginning
34
+ check_val_every_n_epoch: 10
35
+ val_check_interval: 2000
36
+ max_epochs: 1000
37
+ max_updates: 160000
38
+ max_tokens: 31250
39
+ max_sentences: 100000
40
+ max_eval_tokens: -1
41
+ max_eval_sentences: -1
42
+ test_input_dir: ''
configs/singing/base.yaml ADDED
@@ -0,0 +1,42 @@
1
+ base_config:
2
+ - configs/tts/base.yaml
3
+ - configs/tts/base_zh.yaml
4
+
5
+
6
+ datasets: []
7
+ test_prefixes: []
8
+ test_num: 0
9
+ valid_num: 0
10
+
11
+ pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
12
+ binarizer_cls: data_gen.singing.binarize.SingingBinarizer
13
+ pre_align_args:
14
+ use_tone: false # for ZH
15
+ forced_align: mfa
16
+ use_sox: true
17
+ hop_size: 128 # Hop size.
18
+ fft_size: 512 # FFT size.
19
+ win_size: 512 # FFT size.
20
+ max_frames: 8000
21
+ fmin: 50 # Minimum freq in mel basis calculation.
22
+ fmax: 11025 # Maximum frequency in mel basis calculation.
23
+ pitch_type: frame
24
+
25
+ hidden_size: 256
26
+ mel_loss: "ssim:0.5|l1:0.5"
27
+ lambda_f0: 0.0
28
+ lambda_uv: 0.0
29
+ lambda_energy: 0.0
30
+ lambda_ph_dur: 0.0
31
+ lambda_sent_dur: 0.0
32
+ lambda_word_dur: 0.0
33
+ predictor_grad: 0.0
34
+ use_spk_embed: true
35
+ use_spk_id: false
36
+
37
+ max_tokens: 20000
38
+ max_updates: 400000
39
+ num_spk: 100
40
+ save_f0: true
41
+ use_gt_dur: true
42
+ use_gt_f0: true
configs/singing/fs2.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+ - configs/tts/fs2.yaml
+ - configs/singing/base.yaml
configs/tts/base.yaml ADDED
@@ -0,0 +1,95 @@
1
+ # task
2
+ base_config: configs/config_base.yaml
3
+ task_cls: ''
4
+ #############
5
+ # dataset
6
+ #############
7
+ raw_data_dir: ''
8
+ processed_data_dir: ''
9
+ binary_data_dir: ''
10
+ dict_dir: ''
11
+ pre_align_cls: ''
12
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
13
+ pre_align_args:
14
+ use_tone: true # for ZH
15
+ forced_align: mfa
16
+ use_sox: false
17
+ txt_processor: en
18
+ allow_no_txt: false
19
+ denoise: false
20
+ binarization_args:
21
+ shuffle: false
22
+ with_txt: true
23
+ with_wav: false
24
+ with_align: true
25
+ with_spk_embed: true
26
+ with_f0: true
27
+ with_f0cwt: true
28
+
29
+ loud_norm: false
30
+ endless_ds: true
31
+ reset_phone_dict: true
32
+
33
+ test_num: 100
34
+ valid_num: 100
35
+ max_frames: 1550
36
+ max_input_tokens: 1550
37
+ audio_num_mel_bins: 80
38
+ audio_sample_rate: 22050
39
+ hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
40
+ win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
41
+ fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
42
+ fmax: 7600 # To be increased/reduced depending on data.
43
+ fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
44
+ min_level_db: -100
45
+ num_spk: 1
46
+ mel_vmin: -6
47
+ mel_vmax: 1.5
48
+ ds_workers: 4
49
+
50
+ #########
51
+ # model
52
+ #########
53
+ dropout: 0.1
54
+ enc_layers: 4
55
+ dec_layers: 4
56
+ hidden_size: 384
57
+ num_heads: 2
58
+ prenet_dropout: 0.5
59
+ prenet_hidden_size: 256
60
+ stop_token_weight: 5.0
61
+ enc_ffn_kernel_size: 9
62
+ dec_ffn_kernel_size: 9
63
+ ffn_act: gelu
64
+ ffn_padding: 'SAME'
65
+
66
+
67
+ ###########
68
+ # optimization
69
+ ###########
70
+ lr: 2.0
71
+ warmup_updates: 8000
72
+ optimizer_adam_beta1: 0.9
73
+ optimizer_adam_beta2: 0.98
74
+ weight_decay: 0
75
+ clip_grad_norm: 1
76
+
77
+
78
+ ###########
79
+ # train and eval
80
+ ###########
81
+ max_tokens: 30000
82
+ max_sentences: 100000
83
+ max_eval_sentences: 1
84
+ max_eval_tokens: 60000
85
+ train_set_name: 'train'
86
+ valid_set_name: 'valid'
87
+ test_set_name: 'test'
88
+ vocoder: pwg
89
+ vocoder_ckpt: ''
90
+ profile_infer: false
91
+ out_wav_norm: false
92
+ save_gt: false
93
+ save_f0: false
94
+ gen_dir_name: ''
95
+ use_denoise: false
configs/tts/base_zh.yaml ADDED
@@ -0,0 +1,3 @@
+ pre_align_args:
+   txt_processor: zh_g2pM
+ binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer
configs/tts/fs2.yaml ADDED
@@ -0,0 +1,80 @@
1
+ base_config: configs/tts/base.yaml
2
+ task_cls: tasks.tts.fs2.FastSpeech2Task
3
+
4
+ # model
5
+ hidden_size: 256
6
+ dropout: 0.1
7
+ encoder_type: fft # fft|tacotron|tacotron2|conformer
8
+ encoder_K: 8 # for tacotron encoder
9
+ decoder_type: fft # fft|rnn|conv|conformer
10
+ use_pos_embed: true
11
+
12
+ # duration
13
+ predictor_hidden: -1
14
+ predictor_kernel: 5
15
+ predictor_layers: 2
16
+ dur_predictor_kernel: 3
17
+ dur_predictor_layers: 2
18
+ predictor_dropout: 0.5
19
+
20
+ # pitch and energy
21
+ use_pitch_embed: true
22
+ pitch_type: ph # frame|ph|cwt
23
+ use_uv: true
24
+ cwt_hidden_size: 128
25
+ cwt_layers: 2
26
+ cwt_loss: l1
27
+ cwt_add_f0_loss: false
28
+ cwt_std_scale: 0.8
29
+
30
+ pitch_ar: false
31
+ #pitch_embed_type: 0q
32
+ pitch_loss: 'l1' # l1|l2|ssim
33
+ pitch_norm: log
34
+ use_energy_embed: false
35
+
36
+ # reference encoder and speaker embedding
37
+ use_spk_id: false
38
+ use_split_spk_id: false
39
+ use_spk_embed: false
40
+ use_var_enc: false
41
+ lambda_commit: 0.25
42
+ ref_norm_layer: bn
43
+ pitch_enc_hidden_stride_kernel:
44
+ - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
45
+ - 0,2,5
46
+ - 0,2,5
47
+ dur_enc_hidden_stride_kernel:
48
+ - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
49
+ - 0,2,3
50
+ - 0,1,3
51
+
52
+
53
+ # mel
54
+ mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
55
+
56
+ # loss lambda
57
+ lambda_f0: 1.0
58
+ lambda_uv: 1.0
59
+ lambda_energy: 0.1
60
+ lambda_ph_dur: 1.0
61
+ lambda_sent_dur: 1.0
62
+ lambda_word_dur: 1.0
63
+ predictor_grad: 0.1
64
+
65
+ # train and eval
66
+ pretrain_fs_ckpt: ''
67
+ warmup_updates: 2000
68
+ max_tokens: 32000
69
+ max_sentences: 100000
70
+ max_eval_sentences: 1
71
+ max_updates: 120000
72
+ num_valid_plots: 5
73
+ num_test_samples: 0
74
+ test_ids: []
75
+ use_gt_dur: false
76
+ use_gt_f0: false
77
+
78
+ # exp
79
+ dur_loss: mse # huber|mol
80
+ norm_type: gn
configs/tts/hifigan.yaml ADDED
@@ -0,0 +1,21 @@
+ base_config: configs/tts/pwg.yaml
+ task_cls: tasks.vocoder.hifigan.HifiGanTask
+ resblock: "1"
+ adam_b1: 0.8
+ adam_b2: 0.99
+ upsample_rates: [ 8,8,2,2 ]
+ upsample_kernel_sizes: [ 16,16,4,4 ]
+ upsample_initial_channel: 128
+ resblock_kernel_sizes: [ 3,7,11 ]
+ resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ]
+
+ lambda_mel: 45.0
+
+ max_samples: 8192
+ max_sentences: 16
+
+ generator_params:
+   lr: 0.0002 # Generator's learning rate.
+   aux_context_window: 0 # Context window size for auxiliary feature.
+ discriminator_optimizer_params:
+   lr: 0.0002 # Discriminator's learning rate.
configs/tts/lj/base_mel2wav.yaml ADDED
@@ -0,0 +1,3 @@
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
+ processed_data_dir: 'data/processed/ljspeech'
+ binary_data_dir: 'data/binary/ljspeech_wav'
configs/tts/lj/base_text2mel.yaml ADDED
@@ -0,0 +1,13 @@
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
+ processed_data_dir: 'data/processed/ljspeech'
+ binary_data_dir: 'data/binary/ljspeech'
+ pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
+
+ pitch_type: cwt
+ mel_loss: l1
+ num_test_samples: 20
+ test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+             316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+ use_energy_embed: false
+ test_num: 523
+ valid_num: 348
configs/tts/lj/fs2.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+ - configs/tts/fs2.yaml
+ - configs/tts/lj/base_text2mel.yaml
configs/tts/lj/hifigan.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+ - configs/tts/hifigan.yaml
+ - configs/tts/lj/base_mel2wav.yaml
configs/tts/lj/pwg.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+ - configs/tts/pwg.yaml
+ - configs/tts/lj/base_mel2wav.yaml
configs/tts/pwg.yaml ADDED
@@ -0,0 +1,110 @@
1
+ base_config: configs/tts/base.yaml
2
+ task_cls: tasks.vocoder.pwg.PwgTask
3
+
4
+ binarization_args:
5
+ with_wav: true
6
+ with_spk_embed: false
7
+ with_align: false
8
+ test_input_dir: ''
9
+
10
+ ###########
11
+ # train and eval
12
+ ###########
13
+ max_samples: 25600
14
+ max_sentences: 5
15
+ max_eval_sentences: 1
16
+ max_updates: 1000000
17
+ val_check_interval: 2000
18
+
19
+
20
+ ###########################################################
21
+ # FEATURE EXTRACTION SETTING #
22
+ ###########################################################
23
+ sampling_rate: 22050 # Sampling rate.
24
+ fft_size: 1024 # FFT size.
25
+ hop_size: 256 # Hop size.
26
+ win_length: null # Window length.
27
+ # If set to null, it will be the same as fft_size.
28
+ window: "hann" # Window function.
29
+ num_mels: 80 # Number of mel basis.
30
+ fmin: 80 # Minimum freq in mel basis calculation.
31
+ fmax: 7600 # Maximum frequency in mel basis calculation.
32
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
33
+
34
+ ###########################################################
35
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
36
+ ###########################################################
37
+ generator_params:
38
+ in_channels: 1 # Number of input channels.
39
+ out_channels: 1 # Number of output channels.
40
+ kernel_size: 3 # Kernel size of dilated convolution.
41
+ layers: 30 # Number of residual block layers.
42
+ stacks: 3 # Number of stacks i.e., dilation cycles.
43
+ residual_channels: 64 # Number of channels in residual conv.
44
+ gate_channels: 128 # Number of channels in gated conv.
45
+ skip_channels: 64 # Number of channels in skip conv.
46
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
47
+ # Must be the same as num_mels.
48
+ aux_context_window: 2 # Context window size for auxiliary feature.
49
+ # If set to 2, previous 2 and future 2 frames will be considered.
50
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
51
+ use_weight_norm: true # Whether to use weight norm.
52
+ # If set to true, it will be applied to all of the conv layers.
53
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
54
+ upsample_params: # Upsampling network parameters.
55
+ upsample_scales: [4, 4, 4, 4] # Upsampling scales. Prodcut of these must be the same as hop size.
56
+ use_pitch_embed: false
57
+
58
+ ###########################################################
59
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
60
+ ###########################################################
61
+ discriminator_params:
62
+ in_channels: 1 # Number of input channels.
63
+ out_channels: 1 # Number of output channels.
64
+ kernel_size: 3 # Number of output channels.
65
+ layers: 10 # Number of conv layers.
66
+ conv_channels: 64 # Number of chnn layers.
67
+ bias: true # Whether to use bias parameter in conv.
68
+ use_weight_norm: true # Whether to use weight norm.
69
+ # If set to true, it will be applied to all of the conv layers.
70
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
71
+ nonlinear_activation_params: # Nonlinear function parameters
72
+ negative_slope: 0.2 # Alpha in LeakyReLU.
73
+
74
+ ###########################################################
75
+ # STFT LOSS SETTING #
76
+ ###########################################################
77
+ stft_loss_params:
78
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
79
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
80
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
81
+ window: "hann_window" # Window function for STFT-based loss
82
+ use_mel_loss: false
83
+
84
+ ###########################################################
85
+ # ADVERSARIAL LOSS SETTING #
86
+ ###########################################################
87
+ lambda_adv: 4.0 # Loss balancing coefficient.
88
+
89
+ ###########################################################
90
+ # OPTIMIZER & SCHEDULER SETTING #
91
+ ###########################################################
92
+ generator_optimizer_params:
93
+ lr: 0.0001 # Generator's learning rate.
94
+ eps: 1.0e-6 # Generator's epsilon.
95
+ weight_decay: 0.0 # Generator's weight decay coefficient.
96
+ generator_scheduler_params:
97
+ step_size: 200000 # Generator's scheduler step size.
98
+ gamma: 0.5 # Generator's scheduler gamma.
99
+ # At each step size, lr will be multiplied by this parameter.
100
+ generator_grad_norm: 10 # Generator's gradient norm.
101
+ discriminator_optimizer_params:
102
+ lr: 0.00005 # Discriminator's learning rate.
103
+ eps: 1.0e-6 # Discriminator's epsilon.
104
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
105
+ discriminator_scheduler_params:
106
+ step_size: 200000 # Discriminator's scheduler step size.
107
+ gamma: 0.5 # Discriminator's scheduler gamma.
108
+ # At each step size, lr will be multiplied by this parameter.
109
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
110
+ disc_start_steps: 40000 # Number of steps to start to train discriminator.
data/processed/ljspeech/dict.txt ADDED
@@ -0,0 +1,77 @@
1
+ ! !
2
+ , ,
3
+ . .
4
+ ; ;
5
+ <BOS> <BOS>
6
+ <EOS> <EOS>
7
+ ? ?
8
+ AA0 AA0
9
+ AA1 AA1
10
+ AA2 AA2
11
+ AE0 AE0
12
+ AE1 AE1
13
+ AE2 AE2
14
+ AH0 AH0
15
+ AH1 AH1
16
+ AH2 AH2
17
+ AO0 AO0
18
+ AO1 AO1
19
+ AO2 AO2
20
+ AW0 AW0
21
+ AW1 AW1
22
+ AW2 AW2
23
+ AY0 AY0
24
+ AY1 AY1
25
+ AY2 AY2
26
+ B B
27
+ CH CH
28
+ D D
29
+ DH DH
30
+ EH0 EH0
31
+ EH1 EH1
32
+ EH2 EH2
33
+ ER0 ER0
34
+ ER1 ER1
35
+ ER2 ER2
36
+ EY0 EY0
37
+ EY1 EY1
38
+ EY2 EY2
39
+ F F
40
+ G G
41
+ HH HH
42
+ IH0 IH0
43
+ IH1 IH1
44
+ IH2 IH2
45
+ IY0 IY0
46
+ IY1 IY1
47
+ IY2 IY2
48
+ JH JH
49
+ K K
50
+ L L
51
+ M M
52
+ N N
53
+ NG NG
54
+ OW0 OW0
55
+ OW1 OW1
56
+ OW2 OW2
57
+ OY0 OY0
58
+ OY1 OY1
59
+ OY2 OY2
60
+ P P
61
+ R R
62
+ S S
63
+ SH SH
64
+ T T
65
+ TH TH
66
+ UH0 UH0
67
+ UH1 UH1
68
+ UH2 UH2
69
+ UW0 UW0
70
+ UW1 UW1
71
+ UW2 UW2
72
+ V V
73
+ W W
74
+ Y Y
75
+ Z Z
76
+ ZH ZH
77
+ | |
data/processed/ljspeech/metadata_phone.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/processed/ljspeech/mfa_dict.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/processed/ljspeech/phone_set.json ADDED
@@ -0,0 +1 @@
+ ["!", ",", ".", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
data_gen/singing/binarize.py ADDED
@@ -0,0 +1,398 @@
1
+ import os
2
+ import random
3
+ from copy import deepcopy
4
+ import pandas as pd
5
+ import logging
6
+ from tqdm import tqdm
7
+ import json
8
+ import glob
9
+ import re
10
+ from resemblyzer import VoiceEncoder
11
+ import traceback
12
+ import numpy as np
13
+ import pretty_midi
14
+ import librosa
15
+ from scipy.interpolate import interp1d
16
+ import torch
17
+ from textgrid import TextGrid
18
+
19
+ from utils.hparams import hparams
20
+ from data_gen.tts.data_gen_utils import build_phone_encoder, get_pitch
21
+ from utils.pitch_utils import f0_to_coarse
22
+ from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
23
+ from data_gen.tts.binarizer_zh import ZhBinarizer
24
+ from data_gen.tts.txt_processors.zh_g2pM import ALL_YUNMU
25
+ from vocoders.base_vocoder import VOCODERS
26
+
27
+
28
+ class SingingBinarizer(BaseBinarizer):
29
+ def __init__(self, processed_data_dir=None):
30
+ if processed_data_dir is None:
31
+ processed_data_dir = hparams['processed_data_dir']
32
+ self.processed_data_dirs = processed_data_dir.split(",")
33
+ self.binarization_args = hparams['binarization_args']
34
+ self.pre_align_args = hparams['pre_align_args']
35
+ self.item2txt = {}
36
+ self.item2ph = {}
37
+ self.item2wavfn = {}
38
+ self.item2f0fn = {}
39
+ self.item2tgfn = {}
40
+ self.item2spk = {}
41
+
42
+ def split_train_test_set(self, item_names):
43
+ item_names = deepcopy(item_names)
44
+ test_item_names = [x for x in item_names if any([ts in x for ts in hparams['test_prefixes']])]
45
+ train_item_names = [x for x in item_names if x not in set(test_item_names)]
46
+ logging.info("train {}".format(len(train_item_names)))
47
+ logging.info("test {}".format(len(test_item_names)))
48
+ return train_item_names, test_item_names
49
+
50
+ def load_meta_data(self):
51
+ for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
52
+ wav_suffix = '_wf0.wav'
53
+ txt_suffix = '.txt'
54
+ ph_suffix = '_ph.txt'
55
+ tg_suffix = '.TextGrid'
56
+ all_wav_pieces = glob.glob(f'{processed_data_dir}/*/*{wav_suffix}')
57
+
58
+ for piece_path in all_wav_pieces:
59
+ item_name = raw_item_name = piece_path[len(processed_data_dir)+1:].replace('/', '-')[:-len(wav_suffix)]
60
+ if len(self.processed_data_dirs) > 1:
61
+ item_name = f'ds{ds_id}_{item_name}'
62
+ self.item2txt[item_name] = open(f'{piece_path.replace(wav_suffix, txt_suffix)}').readline()
63
+ self.item2ph[item_name] = open(f'{piece_path.replace(wav_suffix, ph_suffix)}').readline()
64
+ self.item2wavfn[item_name] = piece_path
65
+
66
+ self.item2spk[item_name] = re.split('-|#', piece_path.split('/')[-2])[0]
67
+ if len(self.processed_data_dirs) > 1:
68
+ self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
69
+ self.item2tgfn[item_name] = piece_path.replace(wav_suffix, tg_suffix)
70
+ print('spkers: ', set(self.item2spk.values()))
71
+ self.item_names = sorted(list(self.item2txt.keys()))
72
+ if self.binarization_args['shuffle']:
73
+ random.seed(1234)
74
+ random.shuffle(self.item_names)
75
+ self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)
76
+
77
+ @property
78
+ def train_item_names(self):
79
+ return self._train_item_names
80
+
81
+ @property
82
+ def valid_item_names(self):
83
+ return self._test_item_names
84
+
85
+ @property
86
+ def test_item_names(self):
87
+ return self._test_item_names
88
+
89
+ def process(self):
90
+ self.load_meta_data()
91
+ os.makedirs(hparams['binary_data_dir'], exist_ok=True)
92
+ self.spk_map = self.build_spk_map()
93
+ print("| spk_map: ", self.spk_map)
94
+ spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
95
+ json.dump(self.spk_map, open(spk_map_fn, 'w'))
96
+
97
+ self.phone_encoder = self._phone_encoder()
98
+ self.process_data('valid')
99
+ self.process_data('test')
100
+ self.process_data('train')
101
+
102
+ def _phone_encoder(self):
103
+ ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
104
+ ph_set = []
105
+ if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
106
+ for ph_sent in self.item2ph.values():
107
+ ph_set += ph_sent.split(' ')
108
+ ph_set = sorted(set(ph_set))
109
+ json.dump(ph_set, open(ph_set_fn, 'w'))
110
+ print("| Build phone set: ", ph_set)
111
+ else:
112
+ ph_set = json.load(open(ph_set_fn, 'r'))
113
+ print("| Load phone set: ", ph_set)
114
+ return build_phone_encoder(hparams['binary_data_dir'])
115
+
116
+ # @staticmethod
117
+ # def get_pitch(wav_fn, spec, res):
118
+ # wav_suffix = '_wf0.wav'
119
+ # f0_suffix = '_f0.npy'
120
+ # f0fn = wav_fn.replace(wav_suffix, f0_suffix)
121
+ # pitch_info = np.load(f0fn)
122
+ # f0 = [x[1] for x in pitch_info]
123
+ # spec_x_coor = np.arange(0, 1, 1 / len(spec))[:len(spec)]
124
+ # f0_x_coor = np.arange(0, 1, 1 / len(f0))[:len(f0)]
125
+ # f0 = interp1d(f0_x_coor, f0, 'nearest', fill_value='extrapolate')(spec_x_coor)[:len(spec)]
126
+ # # f0_x_coor = np.arange(0, 1, 1 / len(f0))
127
+ # # f0_x_coor[-1] = 1
128
+ # # f0 = interp1d(f0_x_coor, f0, 'nearest')(spec_x_coor)[:len(spec)]
129
+ # if sum(f0) == 0:
130
+ # raise BinarizationError("Empty f0")
131
+ # assert len(f0) == len(spec), (len(f0), len(spec))
132
+ # pitch_coarse = f0_to_coarse(f0)
133
+ #
134
+ # # vis f0
135
+ # # import matplotlib.pyplot as plt
136
+ # # from textgrid import TextGrid
137
+ # # tg_fn = wav_fn.replace(wav_suffix, '.TextGrid')
138
+ # # fig = plt.figure(figsize=(12, 6))
139
+ # # plt.pcolor(spec.T, vmin=-5, vmax=0)
140
+ # # ax = plt.gca()
141
+ # # ax2 = ax.twinx()
142
+ # # ax2.plot(f0, color='red')
143
+ # # ax2.set_ylim(0, 800)
144
+ # # itvs = TextGrid.fromFile(tg_fn)[0]
145
+ # # for itv in itvs:
146
+ # # x = itv.maxTime * hparams['audio_sample_rate'] / hparams['hop_size']
147
+ # # plt.vlines(x=x, ymin=0, ymax=80, color='black')
148
+ # # plt.text(x=x, y=20, s=itv.mark, color='black')
149
+ # # plt.savefig('tmp/20211229_singing_plots_test.png')
150
+ #
151
+ # res['f0'] = f0
152
+ # res['pitch'] = pitch_coarse
153
+
154
+ @classmethod
155
+ def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
156
+ if hparams['vocoder'] in VOCODERS:
157
+ wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
158
+ else:
159
+ wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
160
+ res = {
161
+ 'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
162
+ 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
163
+ }
164
+ try:
165
+ if binarization_args['with_f0']:
166
+ # cls.get_pitch(wav_fn, mel, res)
167
+ cls.get_pitch(wav, mel, res)
168
+ if binarization_args['with_txt']:
169
+ try:
170
+ # print(ph)
171
+ phone_encoded = res['phone'] = encoder.encode(ph)
172
+ except:
173
+ traceback.print_exc()
174
+ raise BinarizationError(f"Empty phoneme")
175
+ if binarization_args['with_align']:
176
+ cls.get_align(tg_fn, ph, mel, phone_encoded, res)
177
+ except BinarizationError as e:
178
+ print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
179
+ return None
180
+ return res
181
+
182
+
183
+ class MidiSingingBinarizer(SingingBinarizer):
184
+ item2midi = {}
185
+ item2midi_dur = {}
186
+ item2is_slur = {}
187
+ item2ph_durs = {}
188
+ item2wdb = {}
189
+
190
+ def load_meta_data(self):
191
+ for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
192
+ meta_midi = json.load(open(os.path.join(processed_data_dir, 'meta.json'))) # [list of dict]
193
+
194
+ for song_item in meta_midi:
195
+ item_name = raw_item_name = song_item['item_name']
196
+ if len(self.processed_data_dirs) > 1:
197
+ item_name = f'ds{ds_id}_{item_name}'
198
+ self.item2wavfn[item_name] = song_item['wav_fn']
199
+ self.item2txt[item_name] = song_item['txt']
200
+
201
+ self.item2ph[item_name] = ' '.join(song_item['phs'])
202
+ self.item2wdb[item_name] = [1 if x in ALL_YUNMU + ['AP', 'SP', '<SIL>'] else 0 for x in song_item['phs']]
203
+ self.item2ph_durs[item_name] = song_item['ph_dur']
204
+
205
+ self.item2midi[item_name] = song_item['notes']
206
+ self.item2midi_dur[item_name] = song_item['notes_dur']
207
+ self.item2is_slur[item_name] = song_item['is_slur']
208
+ self.item2spk[item_name] = 'pop-cs'
209
+ if len(self.processed_data_dirs) > 1:
210
+ self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
211
+
212
+ print('speakers: ', set(self.item2spk.values()))
213
+ self.item_names = sorted(list(self.item2txt.keys()))
214
+ if self.binarization_args['shuffle']:
215
+ random.seed(1234)
216
+ random.shuffle(self.item_names)
217
+ self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)
218
+
219
+ @staticmethod
220
+ def get_pitch(wav_fn, wav, spec, ph, res):
221
+ wav_suffix = '.wav'
222
+ # midi_suffix = '.mid'
223
+ wav_dir = 'wavs'
224
+ f0_dir = 'f0'
225
+
226
+ item_name = '/'.join(os.path.splitext(wav_fn)[0].split('/')[-2:]).replace('_wf0', '')
227
+ res['pitch_midi'] = np.asarray(MidiSingingBinarizer.item2midi[item_name])
228
+ res['midi_dur'] = np.asarray(MidiSingingBinarizer.item2midi_dur[item_name])
229
+ res['is_slur'] = np.asarray(MidiSingingBinarizer.item2is_slur[item_name])
230
+ res['word_boundary'] = np.asarray(MidiSingingBinarizer.item2wdb[item_name])
231
+ assert res['pitch_midi'].shape == res['midi_dur'].shape == res['is_slur'].shape, (
232
+ res['pitch_midi'].shape, res['midi_dur'].shape, res['is_slur'].shape)
233
+
234
+ # gt f0.
235
+ gt_f0, gt_pitch_coarse = get_pitch(wav, spec, hparams)
236
+ if sum(gt_f0) == 0:
237
+ raise BinarizationError("Empty **gt** f0")
238
+ res['f0'] = gt_f0
239
+ res['pitch'] = gt_pitch_coarse
240
+
241
+ @staticmethod
242
+ def get_align(ph_durs, mel, phone_encoded, res, hop_size=hparams['hop_size'], audio_sample_rate=hparams['audio_sample_rate']):
243
+ mel2ph = np.zeros([mel.shape[0]], int)
244
+ startTime = 0
245
+
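+ # Map per-phoneme durations (seconds) to mel frames: each frame gets the 1-based index
+ # of the phoneme whose time span covers it (0 is left for frames not assigned to any phoneme).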
246
+ for i_ph in range(len(ph_durs)):
247
+ start_frame = int(startTime * audio_sample_rate / hop_size + 0.5)
248
+ end_frame = int((startTime + ph_durs[i_ph]) * audio_sample_rate / hop_size + 0.5)
249
+ mel2ph[start_frame:end_frame] = i_ph + 1
250
+ startTime = startTime + ph_durs[i_ph]
251
+
252
+ # print('ph durs: ', ph_durs)
253
+ # print('mel2ph: ', mel2ph, len(mel2ph))
254
+ res['mel2ph'] = mel2ph
255
+ # res['dur'] = None
256
+
257
+ @classmethod
258
+ def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
259
+ if hparams['vocoder'] in VOCODERS:
260
+ wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
261
+ else:
262
+ wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
263
+ res = {
264
+ 'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
265
+ 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
266
+ }
267
+ try:
268
+ if binarization_args['with_f0']:
269
+ cls.get_pitch(wav_fn, wav, mel, ph, res)
270
+ if binarization_args['with_txt']:
271
+ try:
272
+ phone_encoded = res['phone'] = encoder.encode(ph)
273
+ except:
274
+ traceback.print_exc()
275
+ raise BinarizationError(f"Empty phoneme")
276
+ if binarization_args['with_align']:
277
+ cls.get_align(MidiSingingBinarizer.item2ph_durs[item_name], mel, phone_encoded, res)
278
+ except BinarizationError as e:
279
+ print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
280
+ return None
281
+ return res
282
+
283
+
284
+ class ZhSingingBinarizer(ZhBinarizer, SingingBinarizer):
285
+ pass
286
+
287
+
288
+ class OpencpopBinarizer(MidiSingingBinarizer):
289
+ item2midi = {}
290
+ item2midi_dur = {}
291
+ item2is_slur = {}
292
+ item2ph_durs = {}
293
+ item2wdb = {}
294
+
295
+ def split_train_test_set(self, item_names):
296
+ item_names = deepcopy(item_names)
297
+ test_item_names = [x for x in item_names if any([x.startswith(ts) for ts in hparams['test_prefixes']])]
298
+ train_item_names = [x for x in item_names if x not in set(test_item_names)]
299
+ logging.info("train {}".format(len(train_item_names)))
300
+ logging.info("test {}".format(len(test_item_names)))
301
+ return train_item_names, test_item_names
302
+
303
+ def load_meta_data(self):
304
+ raw_data_dir = hparams['raw_data_dir']
305
+ # meta_midi = json.load(open(os.path.join(raw_data_dir, 'meta.json'))) # [list of dict]
306
+ utterance_labels = open(os.path.join(raw_data_dir, 'transcriptions.txt')).readlines()
307
+
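+ # Each line of transcriptions.txt is expected to hold '|'-separated fields, in this order:
+ # item_name | text | phoneme sequence | note names | note durations | phoneme durations | is_slur flags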
308
+ for utterance_label in utterance_labels:
309
+ song_info = utterance_label.split('|')
310
+ item_name = raw_item_name = song_info[0]
311
+ self.item2wavfn[item_name] = f'{raw_data_dir}/wavs/{item_name}.wav'
312
+ self.item2txt[item_name] = song_info[1]
313
+
314
+ self.item2ph[item_name] = song_info[2]
315
+ # self.item2wdb[item_name] = list(np.nonzero([1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in song_info[2].split()])[0])
316
+ self.item2wdb[item_name] = [1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in song_info[2].split()]
317
+ self.item2ph_durs[item_name] = [float(x) for x in song_info[5].split(" ")]
318
+
319
+ self.item2midi[item_name] = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
320
+ for x in song_info[3].split(" ")]
321
+ self.item2midi_dur[item_name] = [float(x) for x in song_info[4].split(" ")]
322
+ self.item2is_slur[item_name] = [int(x) for x in song_info[6].split(" ")]
323
+ self.item2spk[item_name] = 'opencpop'
324
+
325
+ print('speakers: ', set(self.item2spk.values()))
326
+ self.item_names = sorted(list(self.item2txt.keys()))
327
+ if self.binarization_args['shuffle']:
328
+ random.seed(1234)
329
+ random.shuffle(self.item_names)
330
+ self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)
331
+
332
+ @staticmethod
333
+ def get_pitch(wav_fn, wav, spec, ph, res):
334
+ wav_suffix = '.wav'
335
+ # midi_suffix = '.mid'
336
+ wav_dir = 'wavs'
337
+ f0_dir = 'text_f0_align'
338
+
339
+ item_name = os.path.splitext(os.path.basename(wav_fn))[0]
340
+ res['pitch_midi'] = np.asarray(OpencpopBinarizer.item2midi[item_name])
341
+ res['midi_dur'] = np.asarray(OpencpopBinarizer.item2midi_dur[item_name])
342
+ res['is_slur'] = np.asarray(OpencpopBinarizer.item2is_slur[item_name])
343
+ res['word_boundary'] = np.asarray(OpencpopBinarizer.item2wdb[item_name])
344
+ assert res['pitch_midi'].shape == res['midi_dur'].shape == res['is_slur'].shape, (res['pitch_midi'].shape, res['midi_dur'].shape, res['is_slur'].shape)
345
+
346
+ # gt f0.
347
+ # f0 = None
348
+ # f0_suffix = '_f0.npy'
349
+ # f0fn = wav_fn.replace(wav_suffix, f0_suffix).replace(wav_dir, f0_dir)
350
+ # pitch_info = np.load(f0fn)
351
+ # f0 = [x[1] for x in pitch_info]
352
+ # spec_x_coor = np.arange(0, 1, 1 / len(spec))[:len(spec)]
353
+ #
354
+ # f0_x_coor = np.arange(0, 1, 1 / len(f0))[:len(f0)]
355
+ # f0 = interp1d(f0_x_coor, f0, 'nearest', fill_value='extrapolate')(spec_x_coor)[:len(spec)]
356
+ # if sum(f0) == 0:
357
+ # raise BinarizationError("Empty **gt** f0")
358
+ #
359
+ # pitch_coarse = f0_to_coarse(f0)
360
+ # res['f0'] = f0
361
+ # res['pitch'] = pitch_coarse
362
+
363
+ # gt f0.
364
+ gt_f0, gt_pitch_coarse = get_pitch(wav, spec, hparams)
365
+ if sum(gt_f0) == 0:
366
+ raise BinarizationError("Empty **gt** f0")
367
+ res['f0'] = gt_f0
368
+ res['pitch'] = gt_pitch_coarse
369
+
370
+ @classmethod
371
+ def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
372
+ if hparams['vocoder'] in VOCODERS:
373
+ wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
374
+ else:
375
+ wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
376
+ res = {
377
+ 'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
378
+ 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
379
+ }
380
+ try:
381
+ if binarization_args['with_f0']:
382
+ cls.get_pitch(wav_fn, wav, mel, ph, res)
383
+ if binarization_args['with_txt']:
384
+ try:
385
+ phone_encoded = res['phone'] = encoder.encode(ph)
386
+ except:
387
+ traceback.print_exc()
388
+ raise BinarizationError(f"Empty phoneme")
389
+ if binarization_args['with_align']:
390
+ cls.get_align(OpencpopBinarizer.item2ph_durs[item_name], mel, phone_encoded, res)
391
+ except BinarizationError as e:
392
+ print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
393
+ return None
394
+ return res
395
+
396
+
397
+ if __name__ == "__main__":
398
+ SingingBinarizer().process()
data_gen/tts/base_binarizer.py ADDED
@@ -0,0 +1,224 @@
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+
4
+ from utils.multiprocess_utils import chunked_multiprocess_run
5
+ import random
6
+ import traceback
7
+ import json
8
+ from resemblyzer import VoiceEncoder
9
+ from tqdm import tqdm
10
+ from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder
11
+ from utils.hparams import set_hparams, hparams
12
+ import numpy as np
13
+ from utils.indexed_datasets import IndexedDatasetBuilder
14
+ from vocoders.base_vocoder import VOCODERS
15
+ import pandas as pd
16
+
17
+
18
+ class BinarizationError(Exception):
19
+ pass
20
+
21
+
22
+ class BaseBinarizer:
23
+ def __init__(self, processed_data_dir=None):
24
+ if processed_data_dir is None:
25
+ processed_data_dir = hparams['processed_data_dir']
26
+ self.processed_data_dirs = processed_data_dir.split(",")
27
+ self.binarization_args = hparams['binarization_args']
28
+ self.pre_align_args = hparams['pre_align_args']
29
+ self.forced_align = self.pre_align_args['forced_align']
30
+ tg_dir = None
31
+ if self.forced_align == 'mfa':
32
+ tg_dir = 'mfa_outputs'
33
+ if self.forced_align == 'kaldi':
34
+ tg_dir = 'kaldi_outputs'
35
+ self.item2txt = {}
36
+ self.item2ph = {}
37
+ self.item2wavfn = {}
38
+ self.item2tgfn = {}
39
+ self.item2spk = {}
40
+ for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
41
+ self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
42
+ for r_idx, r in self.meta_df.iterrows():
43
+ item_name = raw_item_name = r['item_name']
44
+ if len(self.processed_data_dirs) > 1:
45
+ item_name = f'ds{ds_id}_{item_name}'
46
+ self.item2txt[item_name] = r['txt']
47
+ self.item2ph[item_name] = r['ph']
48
+ self.item2wavfn[item_name] = os.path.join(hparams['raw_data_dir'], 'wavs', os.path.basename(r['wav_fn']).split('_')[1])
49
+ self.item2spk[item_name] = r.get('spk', 'SPK1')
50
+ if len(self.processed_data_dirs) > 1:
51
+ self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
52
+ if tg_dir is not None:
53
+ self.item2tgfn[item_name] = f"{processed_data_dir}/{tg_dir}/{raw_item_name}.TextGrid"
54
+ self.item_names = sorted(list(self.item2txt.keys()))
55
+ if self.binarization_args['shuffle']:
56
+ random.seed(1234)
57
+ random.shuffle(self.item_names)
58
+
59
+ @property
60
+ def train_item_names(self):
61
+ return self.item_names[hparams['test_num']+hparams['valid_num']:]
62
+
63
+ @property
64
+ def valid_item_names(self):
65
+ return self.item_names[0: hparams['test_num']+hparams['valid_num']] #
66
+
67
+ @property
68
+ def test_item_names(self):
69
+ return self.item_names[0: hparams['test_num']] # Audios for MOS testing are in 'test_ids'
70
+
71
+ def build_spk_map(self):
72
+ spk_map = set()
73
+ for item_name in self.item_names:
74
+ spk_name = self.item2spk[item_name]
75
+ spk_map.add(spk_name)
76
+ spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
77
+ assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
78
+ return spk_map
79
+
80
+ def item_name2spk_id(self, item_name):
81
+ return self.spk_map[self.item2spk[item_name]]
82
+
83
+ def _phone_encoder(self):
84
+ ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
85
+ ph_set = []
86
+ if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
87
+ for processed_data_dir in self.processed_data_dirs:
88
+ ph_set += [x.split(' ')[0] for x in open(f'{processed_data_dir}/dict.txt').readlines()]
89
+ ph_set = sorted(set(ph_set))
90
+ json.dump(ph_set, open(ph_set_fn, 'w'))
91
+ else:
92
+ ph_set = json.load(open(ph_set_fn, 'r'))
93
+ print("| phone set: ", ph_set)
94
+ return build_phone_encoder(hparams['binary_data_dir'])
95
+
96
+ def meta_data(self, prefix):
97
+ if prefix == 'valid':
98
+ item_names = self.valid_item_names
99
+ elif prefix == 'test':
100
+ item_names = self.test_item_names
101
+ else:
102
+ item_names = self.train_item_names
103
+ for item_name in item_names:
104
+ ph = self.item2ph[item_name]
105
+ txt = self.item2txt[item_name]
106
+ tg_fn = self.item2tgfn.get(item_name)
107
+ wav_fn = self.item2wavfn[item_name]
108
+ spk_id = self.item_name2spk_id(item_name)
109
+ yield item_name, ph, txt, tg_fn, wav_fn, spk_id
110
+
111
+ def process(self):
112
+ os.makedirs(hparams['binary_data_dir'], exist_ok=True)
113
+ self.spk_map = self.build_spk_map()
114
+ print("| spk_map: ", self.spk_map)
115
+ spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
116
+ json.dump(self.spk_map, open(spk_map_fn, 'w'))
117
+
118
+ self.phone_encoder = self._phone_encoder()
119
+ self.process_data('valid')
120
+ self.process_data('test')
121
+ self.process_data('train')
122
+
123
+ def process_data(self, prefix):
124
+ data_dir = hparams['binary_data_dir']
125
+ args = []
126
+ builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
127
+ lengths = []
128
+ f0s = []
129
+ total_sec = 0
130
+ if self.binarization_args['with_spk_embed']:
131
+ voice_encoder = VoiceEncoder().cuda()
132
+
133
+ meta_data = list(self.meta_data(prefix))
134
+ for m in meta_data:
135
+ args.append(list(m) + [self.phone_encoder, self.binarization_args])
136
+ num_workers = int(os.getenv('N_PROC', os.cpu_count() // 3))
137
+ for f_id, (_, item) in enumerate(
138
+ zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
139
+ if item is None:
140
+ continue
141
+ item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
142
+ if self.binarization_args['with_spk_embed'] else None
143
+ if not self.binarization_args['with_wav'] and 'wav' in item:
144
+ print("del wav")
145
+ del item['wav']
146
+ builder.add_item(item)
147
+ lengths.append(item['len'])
148
+ total_sec += item['sec']
149
+ if item.get('f0') is not None:
150
+ f0s.append(item['f0'])
151
+ builder.finalize()
152
+ np.save(f'{data_dir}/{prefix}_lengths.npy', lengths)
153
+ if len(f0s) > 0:
154
+ f0s = np.concatenate(f0s, 0)
155
+ f0s = f0s[f0s != 0]
156
+ np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
157
+ print(f"| {prefix} total duration: {total_sec:.3f}s")
158
+
159
+ @classmethod
160
+ def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
161
+ if hparams['vocoder'] in VOCODERS:
162
+ wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
163
+ else:
164
+ wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
165
+ res = {
166
+ 'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
167
+ 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
168
+ }
169
+ try:
170
+ if binarization_args['with_f0']:
171
+ cls.get_pitch(wav, mel, res)
172
+ if binarization_args['with_f0cwt']:
173
+ cls.get_f0cwt(res['f0'], res)
174
+ if binarization_args['with_txt']:
175
+ try:
176
+ phone_encoded = res['phone'] = encoder.encode(ph)
177
+ except:
178
+ traceback.print_exc()
179
+ raise BinarizationError(f"Empty phoneme")
180
+ if binarization_args['with_align']:
181
+ cls.get_align(tg_fn, ph, mel, phone_encoded, res)
182
+ except BinarizationError as e:
183
+ print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
184
+ return None
185
+ return res
186
+
187
+ @staticmethod
188
+ def get_align(tg_fn, ph, mel, phone_encoded, res):
189
+ if tg_fn is not None and os.path.exists(tg_fn):
190
+ mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
191
+ else:
192
+ raise BinarizationError(f"Align not found")
193
+ if mel2ph.max() - 1 >= len(phone_encoded):
194
+ raise BinarizationError(
195
+ f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
196
+ res['mel2ph'] = mel2ph
197
+ res['dur'] = dur
198
+
199
+ @staticmethod
200
+ def get_pitch(wav, mel, res):
201
+ f0, pitch_coarse = get_pitch(wav, mel, hparams)
202
+ if sum(f0) == 0:
203
+ raise BinarizationError("Empty f0")
204
+ res['f0'] = f0
205
+ res['pitch'] = pitch_coarse
206
+
207
+ @staticmethod
208
+ def get_f0cwt(f0, res):
209
+ from utils.cwt import get_cont_lf0, get_lf0_cwt
210
+ uv, cont_lf0_lpf = get_cont_lf0(f0)
211
+ logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
212
+ cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
213
+ Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
214
+ if np.any(np.isnan(Wavelet_lf0)):
215
+ raise BinarizationError("NaN CWT")
216
+ res['cwt_spec'] = Wavelet_lf0
217
+ res['cwt_scales'] = scales
218
+ res['f0_mean'] = logf0s_mean_org
219
+ res['f0_std'] = logf0s_std_org
220
+
221
+
222
+ if __name__ == "__main__":
223
+ set_hparams()
224
+ BaseBinarizer().process()
data_gen/tts/bin/binarize.py ADDED
@@ -0,0 +1,20 @@
1
+ import os
2
+
3
+ os.environ["OMP_NUM_THREADS"] = "1"
4
+
5
+ import importlib
6
+ from utils.hparams import set_hparams, hparams
7
+
8
+
9
+ def binarize():
10
+ binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
11
+ pkg = ".".join(binarizer_cls.split(".")[:-1])
12
+ cls_name = binarizer_cls.split(".")[-1]
13
+ binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
14
+ print("| Binarizer: ", binarizer_cls)
15
+ binarizer_cls().process()
16
+
17
+
18
+ if __name__ == '__main__':
19
+ set_hparams()
20
+ binarize()
data_gen/tts/binarizer_zh.py ADDED
@@ -0,0 +1,59 @@
1
+ import os
2
+
3
+ os.environ["OMP_NUM_THREADS"] = "1"
4
+
5
+ from data_gen.tts.txt_processors.zh_g2pM import ALL_SHENMU
6
+ from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
7
+ from data_gen.tts.data_gen_utils import get_mel2ph
8
+ from utils.hparams import set_hparams, hparams
9
+ import numpy as np
10
+
11
+
12
+ class ZhBinarizer(BaseBinarizer):
13
+ @staticmethod
14
+ def get_align(tg_fn, ph, mel, phone_encoded, res):
15
+ if tg_fn is not None and os.path.exists(tg_fn):
16
+ _, dur = get_mel2ph(tg_fn, ph, mel, hparams)
17
+ else:
18
+ raise BinarizationError(f"Align not found")
19
+ ph_list = ph.split(" ")
20
+ assert len(dur) == len(ph_list)
21
+ mel2ph = []
22
+ # Allocate the duration of separator tokens to the preceding final (yunmu)
23
+ dur_cumsum = np.pad(np.cumsum(dur), [1, 0], mode='constant', constant_values=0)
24
+ for i in range(len(dur)):
25
+ p = ph_list[i]
26
+ if p[0] != '<' and not p[0].isalpha():
27
+ uv_ = res['f0'][dur_cumsum[i]:dur_cumsum[i + 1]] == 0
28
+ j = 0
29
+ while j < len(uv_) and not uv_[j]:
30
+ j += 1
31
+ dur[i - 1] += j
32
+ dur[i] -= j
33
+ if dur[i] < 100:
34
+ dur[i - 1] += dur[i]
35
+ dur[i] = 0
36
+ # Make the initial (shengmu) and the final (yunmu) equal in length
37
+ for i in range(len(dur)):
38
+ p = ph_list[i]
39
+ if p in ALL_SHENMU:
40
+ p_next = ph_list[i + 1]
41
+ if not (dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU):
42
+ print(f"assert dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU, "
43
+ f"dur[i]: {dur[i]}, p: {p}, p_next: {p_next}.")
44
+ continue
45
+ total = dur[i + 1] + dur[i]
46
+ dur[i] = total // 2
47
+ dur[i + 1] = total - dur[i]
48
+ for i in range(len(dur)):
49
+ mel2ph += [i + 1] * dur[i]
50
+ mel2ph = np.array(mel2ph)
51
+ if mel2ph.max() - 1 >= len(phone_encoded):
52
+ raise BinarizationError(f"| Align does not match: {(mel2ph.max() - 1, len(phone_encoded))}")
53
+ res['mel2ph'] = mel2ph
54
+ res['dur'] = dur
55
+
56
+
57
+ if __name__ == "__main__":
58
+ set_hparams()
59
+ ZhBinarizer().process()
data_gen/tts/data_gen_utils.py ADDED
@@ -0,0 +1,347 @@
1
+ import warnings
2
+
3
+ warnings.filterwarnings("ignore")
4
+
5
+ import parselmouth
6
+ import os
7
+ import torch
8
+ from skimage.transform import resize
9
+ from utils.text_encoder import TokenTextEncoder
10
+ from utils.pitch_utils import f0_to_coarse
11
+ import struct
12
+ import webrtcvad
13
+ from scipy.ndimage import binary_dilation  # scipy.ndimage.morphology is deprecated in recent SciPy
14
+ import librosa
15
+ import numpy as np
16
+ from utils import audio
17
+ import pyloudnorm as pyln
18
+ import re
19
+ import json
20
+ from collections import OrderedDict
21
+
22
+ PUNCS = '!,.?;:'
23
+
24
+ int16_max = (2 ** 15) - 1
25
+
26
+
27
+ def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12):
28
+ """
29
+ Ensures that segments without voice in the waveform remain no longer than a
30
+ threshold determined by the VAD parameters defined below.
31
+ :param path: path to the input audio file; the waveform is loaded (and optionally loudness-normalized) from it
32
+ :param vad_max_silence_length: Maximum number of consecutive silent frames a segment can have.
33
+ :return: the trimmed waveform (or the raw waveform if return_raw_wav), the boolean voice-activity mask, and the sample rate
34
+ """
35
+
36
+ ## Voice Activation Detection
37
+ # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
38
+ # This sets the granularity of the VAD. Should not need to be changed.
39
+ sampling_rate = 16000
40
+ wav_raw, sr = librosa.core.load(path, sr=sr)
41
+
42
+ if norm:
43
+ meter = pyln.Meter(sr) # create BS.1770 meter
44
+ loudness = meter.integrated_loudness(wav_raw)
45
+ wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0)
46
+ if np.abs(wav_raw).max() > 1.0:
47
+ wav_raw = wav_raw / np.abs(wav_raw).max()
48
+
49
+ wav = librosa.resample(wav_raw, sr, sampling_rate, res_type='kaiser_best')
50
+
51
+ vad_window_length = 30 # In milliseconds
52
+ # Number of frames to average together when performing the moving average smoothing.
53
+ # The larger this value, the larger the VAD variations must be to not get smoothed out.
54
+ vad_moving_average_width = 8
55
+
56
+ # Compute the voice detection window size
57
+ samples_per_window = (vad_window_length * sampling_rate) // 1000
58
+
59
+ # Trim the end of the audio to have a multiple of the window size
60
+ wav = wav[:len(wav) - (len(wav) % samples_per_window)]
61
+
62
+ # Convert the float waveform to 16-bit mono PCM
63
+ pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
64
+
65
+ # Perform voice activation detection
66
+ voice_flags = []
67
+ vad = webrtcvad.Vad(mode=3)
68
+ for window_start in range(0, len(wav), samples_per_window):
69
+ window_end = window_start + samples_per_window
70
+ voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
71
+ sample_rate=sampling_rate))
72
+ voice_flags = np.array(voice_flags)
73
+
74
+ # Smooth the voice detection with a moving average
75
+ def moving_average(array, width):
76
+ array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
77
+ ret = np.cumsum(array_padded, dtype=float)
78
+ ret[width:] = ret[width:] - ret[:-width]
79
+ return ret[width - 1:] / width
80
+
81
+ audio_mask = moving_average(voice_flags, vad_moving_average_width)
82
+ audio_mask = np.round(audio_mask).astype(bool)  # np.bool was removed in newer NumPy versions
83
+
84
+ # Dilate the voiced regions
85
+ audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
86
+ audio_mask = np.repeat(audio_mask, samples_per_window)
87
+ audio_mask = resize(audio_mask, (len(wav_raw),)) > 0
88
+ if return_raw_wav:
89
+ return wav_raw, audio_mask, sr
90
+ return wav_raw[audio_mask], audio_mask, sr
91
+
92
+
93
+ def process_utterance(wav_path,
94
+ fft_size=1024,
95
+ hop_size=256,
96
+ win_length=1024,
97
+ window="hann",
98
+ num_mels=80,
99
+ fmin=80,
100
+ fmax=7600,
101
+ eps=1e-6,
102
+ sample_rate=22050,
103
+ loud_norm=False,
104
+ min_level_db=-100,
105
+ return_linear=False,
106
+ trim_long_sil=False, vocoder='pwg'):
107
+ if isinstance(wav_path, str):
108
+ if trim_long_sil:
109
+ wav, _, _ = trim_long_silences(wav_path, sample_rate)
110
+ else:
111
+ wav, _ = librosa.core.load(wav_path, sr=sample_rate)
112
+ else:
113
+ wav = wav_path
114
+
115
+ if loud_norm:
116
+ meter = pyln.Meter(sample_rate) # create BS.1770 meter
117
+ loudness = meter.integrated_loudness(wav)
118
+ wav = pyln.normalize.loudness(wav, loudness, -22.0)
119
+ if np.abs(wav).max() > 1:
120
+ wav = wav / np.abs(wav).max()
121
+
122
+ # get amplitude spectrogram
123
+ x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
124
+ win_length=win_length, window=window, pad_mode="constant")
125
+ spc = np.abs(x_stft) # (n_bins, T)
126
+
127
+ # get mel basis
128
+ fmin = 0 if fmin == -1 else fmin
129
+ fmax = sample_rate / 2 if fmax == -1 else fmax
130
+ mel_basis = librosa.filters.mel(sample_rate, fft_size, num_mels, fmin, fmax)
131
+ mel = mel_basis @ spc
132
+
133
+ if vocoder == 'pwg':
134
+ mel = np.log10(np.maximum(eps, mel)) # (n_mel_bins, T)
135
+ else:
136
+ assert False, f'"{vocoder}" is not in ["pwg"].'
137
+
138
+ l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
139
+ wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
140
+ wav = wav[:mel.shape[1] * hop_size]
141
+
142
+ if not return_linear:
143
+ return wav, mel
144
+ else:
145
+ spc = audio.amp_to_db(spc)
146
+ spc = audio.normalize(spc, {'min_level_db': min_level_db})
147
+ return wav, mel, spc
148
+
149
+
150
+ def get_pitch(wav_data, mel, hparams):
151
+ """
152
+
153
+ :param wav_data: [T]
154
+ :param mel: [T, 80]
155
+ :param hparams:
156
+ :return:
157
+ """
158
+ time_step = hparams['hop_size'] / hparams['audio_sample_rate'] * 1000
159
+ f0_min = 80
160
+ f0_max = 750
161
+
162
+ if hparams['hop_size'] == 128:
163
+ pad_size = 4
164
+ elif hparams['hop_size'] == 256:
165
+ pad_size = 2
166
+ else:
167
+ assert False
168
+
169
+ f0 = parselmouth.Sound(wav_data, hparams['audio_sample_rate']).to_pitch_ac(
170
+ time_step=time_step / 1000, voicing_threshold=0.6,
171
+ pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
172
+ lpad = pad_size * 2
173
+ rpad = len(mel) - len(f0) - lpad
174
+ f0 = np.pad(f0, [[lpad, rpad]], mode='constant')
175
+ # mel and f0 are extracted by 2 different libraries. we should force them to have the same length.
176
+ # Attention: we find that newer versions of some libraries can cause ``rpad'' to be a negative value...
177
+ # Just to be sure, we recommend setting up the same environment as ours via requirements_auto.txt (with Anaconda)
178
+ delta_l = len(mel) - len(f0)
179
+ assert np.abs(delta_l) <= 8
180
+ if delta_l > 0:
181
+ f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
182
+ f0 = f0[:len(mel)]
183
+ pitch_coarse = f0_to_coarse(f0)
184
+ return f0, pitch_coarse
185
+
186
+
187
+ def remove_empty_lines(text):
188
+ """remove empty lines"""
189
+ assert (len(text) > 0)
190
+ assert (isinstance(text, list))
191
+ text = [t.strip() for t in text]
192
+ if "" in text:
193
+ text.remove("")
194
+ return text
195
+
196
+
197
+ class TextGrid(object):
198
+ def __init__(self, text):
199
+ text = remove_empty_lines(text)
200
+ self.text = text
201
+ self.line_count = 0
202
+ self._get_type()
203
+ self._get_time_intval()
204
+ self._get_size()
205
+ self.tier_list = []
206
+ self._get_item_list()
207
+
208
+ def _extract_pattern(self, pattern, inc):
209
+ """
210
+ Parameters
211
+ ----------
212
+ pattern : regex to extract pattern
213
+ inc : increment of line count after extraction
214
+ Returns
215
+ -------
216
+ group : extracted info
217
+ """
218
+ try:
219
+ group = re.match(pattern, self.text[self.line_count]).group(1)
220
+ self.line_count += inc
221
+ except AttributeError:
222
+ raise ValueError("File format error at line %d:%s" % (self.line_count, self.text[self.line_count]))
223
+ return group
224
+
225
+ def _get_type(self):
226
+ self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)
227
+
228
+ def _get_time_intval(self):
229
+ self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
230
+ self.xmax = self._extract_pattern(r"xmax = (.*)", 2)
231
+
232
+ def _get_size(self):
233
+ self.size = int(self._extract_pattern(r"size = (.*)", 2))
234
+
235
+ def _get_item_list(self):
236
+ """Only supports IntervalTier currently"""
237
+ for itemIdx in range(1, self.size + 1):
238
+ tier = OrderedDict()
239
+ item_list = []
240
+ tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
241
+ tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
242
+ if tier_class != "IntervalTier":
243
+ raise NotImplementedError("Only IntervalTier class is supported currently")
244
+ tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
245
+ tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
246
+ tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
247
+ tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
248
+ for i in range(int(tier_size)):
249
+ item = OrderedDict()
250
+ item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
251
+ item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
252
+ item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
253
+ item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
254
+ item_list.append(item)
255
+ tier["idx"] = tier_idx
256
+ tier["class"] = tier_class
257
+ tier["name"] = tier_name
258
+ tier["xmin"] = tier_xmin
259
+ tier["xmax"] = tier_xmax
260
+ tier["size"] = tier_size
261
+ tier["items"] = item_list
262
+ self.tier_list.append(tier)
263
+
264
+ def toJson(self):
265
+ _json = OrderedDict()
266
+ _json["file_type"] = self.file_type
267
+ _json["xmin"] = self.xmin
268
+ _json["xmax"] = self.xmax
269
+ _json["size"] = self.size
270
+ _json["tiers"] = self.tier_list
271
+ return json.dumps(_json, ensure_ascii=False, indent=2)
272
+
273
+
274
+ def get_mel2ph(tg_fn, ph, mel, hparams):
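+ # Align the forced-alignment TextGrid with the phoneme list: merge consecutive silence
+ # intervals, locate the start time of each phoneme, then expand these boundaries into a
+ # frame-level mel2ph map and per-phoneme durations (counted in frames).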
275
+ ph_list = ph.split(" ")
276
+ with open(tg_fn, "r") as f:
277
+ tg = f.readlines()
278
+ tg = remove_empty_lines(tg)
279
+ tg = TextGrid(tg)
280
+ tg = json.loads(tg.toJson())
281
+ split = np.ones(len(ph_list) + 1, float) * -1  # plain float: np.float was removed in newer NumPy
282
+ tg_idx = 0
283
+ ph_idx = 0
284
+ tg_align = [x for x in tg['tiers'][-1]['items']]
285
+ tg_align_ = []
286
+ for x in tg_align:
287
+ x['xmin'] = float(x['xmin'])
288
+ x['xmax'] = float(x['xmax'])
289
+ if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']:
290
+ x['text'] = ''
291
+ if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
292
+ tg_align_[-1]['xmax'] = x['xmax']
293
+ continue
294
+ tg_align_.append(x)
295
+ tg_align = tg_align_
296
+ tg_len = len([x for x in tg_align if x['text'] != ''])
297
+ ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
298
+ assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, tg_fn)
299
+ while tg_idx < len(tg_align) or ph_idx < len(ph_list):
300
+ if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
301
+ split[ph_idx] = 1e8
302
+ ph_idx += 1
303
+ continue
304
+ x = tg_align[tg_idx]
305
+ if x['text'] == '' and ph_idx == len(ph_list):
306
+ tg_idx += 1
307
+ continue
308
+ assert ph_idx < len(ph_list), (tg_len, ph_len, tg_align, ph_list, tg_fn)
309
+ ph = ph_list[ph_idx]
310
+ if x['text'] == '' and not is_sil_phoneme(ph):
311
+ assert False, (ph_list, tg_align)
312
+ if x['text'] != '' and is_sil_phoneme(ph):
313
+ ph_idx += 1
314
+ else:
315
+ assert (x['text'] == '' and is_sil_phoneme(ph)) \
316
+ or x['text'].lower() == ph.lower() \
317
+ or x['text'].lower() == 'sil', (x['text'], ph)
318
+ split[ph_idx] = x['xmin']
319
+ if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(ph_list[ph_idx - 1]):
320
+ split[ph_idx - 1] = split[ph_idx]
321
+ ph_idx += 1
322
+ tg_idx += 1
323
+ assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
324
+ assert ph_idx >= len(ph_list) - 1, (ph_idx, ph_list, len(ph_list), [x['text'] for x in tg_align], tg_fn)
325
+ mel2ph = np.zeros([mel.shape[0]], int)  # plain int: np.int was removed in newer NumPy
326
+ split[0] = 0
327
+ split[-1] = 1e8
328
+ for i in range(len(split) - 1):
329
+ assert split[i] != -1 and split[i] <= split[i + 1], (split[:-1],)
330
+ split = [int(s * hparams['audio_sample_rate'] / hparams['hop_size'] + 0.5) for s in split]
331
+ for ph_idx in range(len(ph_list)):
332
+ mel2ph[split[ph_idx]:split[ph_idx + 1]] = ph_idx + 1
333
+ mel2ph_torch = torch.from_numpy(mel2ph)
334
+ T_t = len(ph_list)
335
+ dur = mel2ph_torch.new_zeros([T_t + 1]).scatter_add(0, mel2ph_torch, torch.ones_like(mel2ph_torch))
336
+ dur = dur[1:].numpy()
337
+ return mel2ph, dur
338
+
339
+
340
+ def build_phone_encoder(data_dir):
341
+ phone_list_file = os.path.join(data_dir, 'phone_set.json')
342
+ phone_list = json.load(open(phone_list_file))
343
+ return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
344
+
345
+
346
+ def is_sil_phoneme(p):
347
+ return not p[0].isalpha()
data_gen/tts/txt_processors/base_text_processor.py ADDED
@@ -0,0 +1,8 @@
1
+ class BaseTxtProcessor:
2
+ @staticmethod
3
+ def sp_phonemes():
4
+ return ['|']
5
+
6
+ @classmethod
7
+ def process(cls, txt, pre_align_args):
8
+ raise NotImplementedError
data_gen/tts/txt_processors/en.py ADDED
@@ -0,0 +1,78 @@
1
+ import re
2
+ from data_gen.tts.data_gen_utils import PUNCS
3
+ from g2p_en import G2p
4
+ import unicodedata
5
+ from g2p_en.expand import normalize_numbers
6
+ from nltk import pos_tag
7
+ from nltk.tokenize import TweetTokenizer
8
+
9
+ from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor
10
+
11
+
12
+ class EnG2p(G2p):
13
+ word_tokenize = TweetTokenizer().tokenize
14
+
15
+ def __call__(self, text):
16
+ # preprocessing
17
+ words = EnG2p.word_tokenize(text)
18
+ tokens = pos_tag(words) # tuples of (word, tag)
19
+
20
+ # steps
21
+ prons = []
22
+ for word, pos in tokens:
23
+ if re.search("[a-z]", word) is None:
24
+ pron = [word]
25
+
26
+ elif word in self.homograph2features: # Check homograph
27
+ pron1, pron2, pos1 = self.homograph2features[word]
28
+ if pos.startswith(pos1):
29
+ pron = pron1
30
+ else:
31
+ pron = pron2
32
+ elif word in self.cmu: # lookup CMU dict
33
+ pron = self.cmu[word][0]
34
+ else: # predict for oov
35
+ pron = self.predict(word)
36
+
37
+ prons.extend(pron)
38
+ prons.extend([" "])
39
+
40
+ return prons[:-1]
41
+
42
+
43
+ class TxtProcessor(BaseTxtProcessor):
44
+ g2p = EnG2p()
45
+
46
+ @staticmethod
47
+ def preprocess_text(text):
48
+ text = normalize_numbers(text)
49
+ text = ''.join(char for char in unicodedata.normalize('NFD', text)
50
+ if unicodedata.category(char) != 'Mn') # Strip accents
51
+ text = text.lower()
52
+ text = re.sub("[\'\"()]+", "", text)
53
+ text = re.sub("[-]+", " ", text)
54
+ text = re.sub(f"[^ a-z{PUNCS}]", "", text)
55
+ text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text) # !! -> !
56
+ text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> !
57
+ text = text.replace("i.e.", "that is")
58
+ text = text.replace("i.e.", "that is")
59
+ text = text.replace("etc.", "etc")
60
+ text = re.sub(f"([{PUNCS}])", r" \1 ", text)
61
+ text = re.sub(rf"\s+", r" ", text)
62
+ return text
63
+
64
+ @classmethod
65
+ def process(cls, txt, pre_align_args):
66
+ txt = cls.preprocess_text(txt).strip()
67
+ phs = cls.g2p(txt)
68
+ phs_ = []
69
+ n_word_sep = 0
70
+ for p in phs:
71
+ if p.strip() == '':
72
+ phs_ += ['|']
73
+ n_word_sep += 1
74
+ else:
75
+ phs_ += p.split(" ")
76
+ phs = phs_
77
+ assert n_word_sep + 1 == len(txt.split(" ")), (phs, f"\"{txt}\"")
78
+ return phs, txt
data_gen/tts/txt_processors/zh.py ADDED
@@ -0,0 +1,41 @@
1
+ import re
2
+ from pypinyin import pinyin, Style
3
+ from data_gen.tts.data_gen_utils import PUNCS
4
+ from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor
5
+ from utils.text_norm import NSWNormalizer
6
+
7
+
8
+ class TxtProcessor(BaseTxtProcessor):
9
+ table = {ord(f): ord(t) for f, t in zip(
10
+ u':,。!?【】()%#@&1234567890',
11
+ u':,.!?[]()%#@&1234567890')}
12
+
13
+ @staticmethod
14
+ def preprocess_text(text):
15
+ text = text.translate(TxtProcessor.table)
16
+ text = NSWNormalizer(text).normalize(remove_punc=False)
17
+ text = re.sub("[\'\"()]+", "", text)
18
+ text = re.sub("[-]+", " ", text)
19
+ text = re.sub(f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", "", text)
20
+ text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> !
21
+ text = re.sub(f"([{PUNCS}])", r" \1 ", text)
22
+ text = re.sub(rf"\s+", r"", text)
23
+ return text
24
+
25
+ @classmethod
26
+ def process(cls, txt, pre_align_args):
27
+ txt = cls.preprocess_text(txt)
28
+ shengmu = pinyin(txt, style=Style.INITIALS) # https://blog.csdn.net/zhoulei124/article/details/89055403
29
+ yunmu_finals = pinyin(txt, style=Style.FINALS)
30
+ yunmu_tone3 = pinyin(txt, style=Style.FINALS_TONE3)
31
+ yunmu = [[t[0] + '5'] if t[0] == f[0] else t for f, t in zip(yunmu_finals, yunmu_tone3)] \
32
+ if pre_align_args['use_tone'] else yunmu_finals
33
+
34
+ assert len(shengmu) == len(yunmu)
35
+ phs = ["|"]
36
+ for a, b, c in zip(shengmu, yunmu, yunmu_finals):
37
+ if a[0] == c[0]:
38
+ phs += [a[0], "|"]
39
+ else:
40
+ phs += [a[0], b[0], "|"]
41
+ return phs, txt
data_gen/tts/txt_processors/zh_g2pM.py ADDED
@@ -0,0 +1,72 @@
1
+ import re
2
+ import jieba
3
+ from pypinyin import pinyin, Style
4
+ from data_gen.tts.data_gen_utils import PUNCS
5
+ from data_gen.tts.txt_processors import zh
6
+ from g2pM import G2pM
7
+
8
+ ALL_SHENMU = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j',
9
+ 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w']
10
+ ALL_YUNMU = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian',
11
+ 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong', 'ou',
12
+ 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn']
13
+
14
+
15
+ class TxtProcessor(zh.TxtProcessor):
16
+ model = G2pM()
17
+
18
+ @staticmethod
19
+ def sp_phonemes():
20
+ return ['|', '#']
21
+
22
+ @classmethod
23
+ def process(cls, txt, pre_align_args):
24
+ txt = cls.preprocess_text(txt)
25
+ ph_list = cls.model(txt, tone=pre_align_args['use_tone'], char_split=True)
26
+ seg_list = '#'.join(jieba.cut(txt))
27
+ assert len(ph_list) == len([s for s in seg_list if s != '#']), (ph_list, seg_list)
28
+
29
+ # Insert word-boundary markers '#'
30
+ ph_list_ = []
31
+ seg_idx = 0
32
+ for p in ph_list:
33
+ p = p.replace("u:", "v")
34
+ if seg_list[seg_idx] == '#':
35
+ ph_list_.append('#')
36
+ seg_idx += 1
37
+ else:
38
+ ph_list_.append("|")
39
+ seg_idx += 1
40
+ if re.findall('[\u4e00-\u9fff]', p):
41
+ if pre_align_args['use_tone']:
42
+ p = pinyin(p, style=Style.TONE3, strict=True)[0][0]
43
+ if p[-1] not in ['1', '2', '3', '4', '5']:
44
+ p = p + '5'
45
+ else:
46
+ p = pinyin(p, style=Style.NORMAL, strict=True)[0][0]
47
+
48
+ finished = False
49
+ if len([c.isalpha() for c in p]) > 1:
50
+ for shenmu in ALL_SHENMU:
51
+ if p.startswith(shenmu) and not p.lstrip(shenmu).isnumeric():
52
+ ph_list_ += [shenmu, p.lstrip(shenmu)]
53
+ finished = True
54
+ break
55
+ if not finished:
56
+ ph_list_.append(p)
57
+
58
+ ph_list = ph_list_
59
+
60
+ # Remove word-boundary markers around silence symbols, e.g. [..., '#', ',', '#', ...]
61
+ sil_phonemes = list(PUNCS) + TxtProcessor.sp_phonemes()
62
+ ph_list_ = []
63
+ for i in range(0, len(ph_list), 1):
64
+ if ph_list[i] != '#' or (ph_list[i - 1] not in sil_phonemes and ph_list[i + 1] not in sil_phonemes):
65
+ ph_list_.append(ph_list[i])
66
+ ph_list = ph_list_
67
+ return ph_list, txt
68
+
69
+
70
+ if __name__ == '__main__':
71
+ phs, txt = TxtProcessor.process('他来到了,网易杭研大厦', {'use_tone': True})
72
+ print(phs)
docs/README-SVS-opencpop-cascade.md ADDED
@@ -0,0 +1,111 @@
1
+ # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
3
+ [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
4
+ [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
5
+
6
+ ## DiffSinger (MIDI SVS | A version)
7
+ ### 0. Data Acquisition
8
+ For the Opencpop dataset: please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to grant you access to Opencpop.
9
+
10
+ The pipeline below is designed for the Opencpop dataset:
11
+
12
+ ### 1. Preparation
13
+
14
+ #### Data Preparation
15
+ a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/`
16
+
17
+ b) Run the following scripts to pack the dataset for training/inference.
18
+
19
+ ```sh
20
+ export PYTHONPATH=.
21
+ CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml
22
+
23
+ # `data/binary/opencpop-midi-dp` will be generated.
24
+ ```
25
+
26
+ #### Vocoder Preparation
27
+ We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism.
28
+ Please unzip this file into `checkpoints` before training your acoustic model.
29
+
30
+ (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory)
31
+
32
+ This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.
33
+
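+ For reference, a minimal shell sketch of this setup (assuming the downloaded zip and the optional ckpt sit in your current directory; adjust the paths to your own):
+
+ ```sh
+ mkdir -p checkpoints
+ unzip 0109_hifigan_bigpopcs_hop128.zip -d checkpoints/
+ # Optional: use the checkpoint with more training steps.
+ mv model_ckpt_steps_1512000.ckpt checkpoints/0109_hifigan_bigpopcs_hop128/
+ ```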
34
+ #### Exp Name Preparation
35
+ ```bash
36
+ export MY_FS_EXP_NAME=0302_opencpop_fs_midi
37
+ export MY_DS_EXP_NAME=0303_opencpop_ds58_midi
38
+ ```
39
+
40
+ ```
41
+ .
42
+ |--data
43
+ |--raw
44
+ |--opencpop
45
+ |--segments
46
+ |--transcriptions.txt
47
+ |--wavs
48
+ |--checkpoints
49
+ |--MY_FS_EXP_NAME (optional)
50
+ |--MY_DS_EXP_NAME (optional)
51
+ |--0109_hifigan_bigpopcs_hop128
52
+ |--model_ckpt_steps_1512000.ckpt
53
+ |--config.yaml
54
+ ```
55
+
56
+ ### 2. Training Example
57
+ First, you need a pre-trained FFT-Singer checkpoint. You can use the pre-trained model, or train FFT-Singer from scratch by running:
58
+ ```sh
59
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml --exp_name $MY_FS_EXP_NAME --reset
60
+ ```
61
+
62
+ Then, to train DiffSinger, run:
63
+
64
+ ```sh
65
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset
66
+ ```
67
+
68
+ Remember to adjust the "fs2_ckpt" parameter in `usr/configs/midi/cascade/opencs/ds60_rel.yaml` to fit your path.
69
+
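+ For example, the adjusted entry might look like this (hypothetical checkpoint name; use the path of your own FFT-Singer run):
+
+ ```yaml
+ fs2_ckpt: checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt
+ ```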
70
+ ### 3. Inference from packed test set
71
+ ```sh
72
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer
73
+ ```
74
+
75
+ We also provide:
76
+ - the pre-trained model of DiffSinger;
77
+ - the pre-trained model of FFT-Singer;
78
+
79
+ They can be found [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip).
80
+
81
+ Remember to put the pre-trained models in the `checkpoints` directory.
82
+
83
+ ### 4. Inference from raw inputs
84
+ ```sh
85
+ python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME
86
+ ```
87
+ Raw inputs:
88
+ ```
89
+ inp = {
90
+ 'text': '小酒窝长睫毛AP是你最美的记号',
91
+ 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
92
+ 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
93
+ 'input_type': 'word'
94
+ } # user input: Chinese characters
95
+ or,
96
+ inp = {
97
+ 'text': '小酒窝长睫毛AP是你最美的记号',
98
+ 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
99
+ 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
100
+ 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
101
+ 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
102
+ 'input_type': 'phoneme'
103
+ } # input like Opencpop dataset.
104
+ ```
105
+
106
+ ### 5. Some issues.
107
+ a) HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is an out-of-domain dataset (unseen speaker), which may degrade audio quality; we are considering fine-tuning this vocoder on the training set of Opencpop.
108
+
109
+ b) in this version of the code, we use a melody frontend ([lyric + MIDI] -> [F0 + ph_dur]) to predict the F0 contour and phoneme durations.
110
+
111
+ c) generated audio demos can be found in [MY_DS_EXP_NAME](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip).
docs/README-SVS-opencpop-e2e.md ADDED
@@ -0,0 +1,107 @@
1
+ # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
3
+ [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
4
+ [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
5
+ | [Interactive🤗 SVS](https://huggingface.co/spaces/Silentlin/DiffSinger)
6
+
7
+ Substantial update: We 1) **abandon** the explicit prediction of the F0 curve; 2) increase the receptive field of the denoiser; 3) make the linguistic encoder more robust.
8
+ **By doing so, 1) the synthesized recordings are more natural in terms of pitch; 2) the pipeline is simpler.**
9
+
10
+ In short, the dynamics of the F0 curve are now left to the generative model to capture, instead of being constrained with an MSE loss on log-domain F0 as before.
11
+
12
+ ## DiffSinger (MIDI SVS | B version)
13
+ ### 0. Data Acquisition
14
+ For the Opencpop dataset: please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to grant you access to Opencpop.
15
+
16
+ The pipeline below is designed for the Opencpop dataset:
17
+
18
+ ### 1. Preparation
19
+
20
+ #### Data Preparation
21
+ a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/`
22
+
23
+ b) Run the following scripts to pack the dataset for training/inference.
24
+
25
+ ```sh
26
+ export PYTHONPATH=.
27
+ CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml
28
+
29
+ # `data/binary/opencpop-midi-dp` will be generated.
30
+ ```
31
+
32
+ #### Vocoder Preparation
33
+ We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism.
34
+
35
+ Also, please unzip the pre-trained vocoder and [this pendant for the vocoder](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0102_xiaoma_pe.zip) into `checkpoints` before training your acoustic model.
36
+
37
+ (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory)
38
+
39
+ This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.
40
+
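+ For reference, a minimal shell sketch of this step (assuming both zips were downloaded into the current directory):
+
+ ```sh
+ mkdir -p checkpoints
+ unzip 0109_hifigan_bigpopcs_hop128.zip -d checkpoints/
+ unzip 0102_xiaoma_pe.zip -d checkpoints/
+ ```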
41
+ #### Exp Name Preparation
42
+ ```bash
43
+ export MY_DS_EXP_NAME=0228_opencpop_ds100_rel
44
+ ```
45
+
46
+ ```
47
+ .
48
+ |--data
49
+ |--raw
50
+ |--opencpop
51
+ |--segments
52
+ |--transcriptions.txt
53
+ |--wavs
54
+ |--checkpoints
55
+ |--MY_DS_EXP_NAME (optional)
56
+ |--0109_hifigan_bigpopcs_hop128 (vocoder)
57
+ |--model_ckpt_steps_1512000.ckpt
58
+ |--config.yaml
59
+ ```
60
+
61
+ ### 2. Training Example
62
+ ```sh
63
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset
64
+ ```
65
+
66
+ ### 3. Inference from packed test set
67
+ ```sh
68
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer
69
+ ```
70
+
71
+ We also provide:
72
+ - the pre-trained model of DiffSinger;
73
+
74
+ They can be found [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip).
75
+
76
+ Remember to put the pre-trained models in the `checkpoints` directory.
77
+
78
+ ### 4. Inference from raw inputs
79
+ ```sh
80
+ python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME
81
+ ```
82
+ Raw inputs:
83
+ ```
84
+ inp = {
85
+ 'text': '小酒窝长睫毛AP是你最美的记号',
86
+ 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
87
+ 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
88
+ 'input_type': 'word'
89
+ } # user input: Chinese characters
90
+ or,
91
+ inp = {
92
+ 'text': '小酒窝长睫毛AP是你最美的记号',
93
+ 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
94
+ 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
95
+ 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
96
+ 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
97
+ 'input_type': 'phoneme'
98
+ } # input like Opencpop dataset.
99
+ ```
100
+
101
+ ### 5. Some issues.
102
+ a) HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is an out-of-domain dataset (unseen speaker), which may degrade audio quality; we are considering fine-tuning this vocoder on the training set of Opencpop.
103
+
104
+ b) in this version of the code, we use a melody frontend ([lyric + MIDI] -> [ph_dur]) to predict phoneme durations; the F0 curve is predicted implicitly, together with the mel-spectrogram.
105
+
106
+ c) example [generated audio](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/demos_0221/DS/).
107
+ More generated audio demos can be found in [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip).
docs/README-SVS-popcs.md ADDED
@@ -0,0 +1,63 @@
1
+ ## DiffSinger (SVS version)
2
+
3
+ ### 0. Data Acquisition
4
+ - See in [apply_form](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md).
5
+ - Dataset [preview](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_preview.zip).
6
+
7
+ ### 1. Preparation
8
+ #### Data Preparation
9
+ a) Download and extract PopCS, then create a link to the dataset folder: `ln -s /xxx/popcs/ data/processed/popcs`
10
+
11
+ b) Run the following scripts to pack the dataset for training/inference.
12
+ ```sh
13
+ export PYTHONPATH=.
14
+ CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml
15
+ # `data/binary/popcs-pmf0` will be generated.
16
+ ```
17
+
18
+ #### Vocoder Preparation
19
+ We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism.
20
+ Please unzip this file into `checkpoints` before training your acoustic model.
21
+
22
+ (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory)
23
+
24
+ This singing vocoder is trained on ~70 hours of singing data, so it can be viewed as a universal vocoder.
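+ 
+ If you want to use this vocoder outside the provided training/inference tasks, here is a minimal loading sketch, following `build_vocoder`/`run_vocoder` in `inference/svs/base_svs_infer.py` (added later in this commit); the checkpoint directory name is the one from the zip above:
+ 
+ ```python
+ # Sketch: load the NSF HifiGAN-Singing vocoder from checkpoints/ and run it on a mel + F0.
+ import glob, re, torch
+ from modules.hifigan.hifigan import HifiGanGenerator
+ from utils.hparams import set_hparams
+ 
+ base_dir = 'checkpoints/0109_hifigan_bigpopcs_hop128'
+ config = set_hparams(f'{base_dir}/config.yaml', global_hparams=False)
+ ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'),
+               key=lambda x: int(re.findall(r'steps_(\d+)', x)[0]))[-1]
+ state = torch.load(ckpt, map_location='cpu')['state_dict']['model_gen']
+ vocoder = HifiGanGenerator(config)
+ vocoder.load_state_dict(state, strict=True)
+ vocoder.remove_weight_norm()
+ vocoder.eval()
+ 
+ # mel: [B, T, 80] mel-spectrogram, f0: [B, T] F0 curve in Hz
+ # wav = vocoder(mel.transpose(2, 1), f0).view(-1)
+ ```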
25
+
26
+ ### 2. Training Example
27
+ First, you need a pre-trained FFT-Singer checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), or train FFT-Singer from scratch by running:
28
+
29
+ ```sh
30
+ # First, train fft-singer;
31
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset
32
+ # Then, infer fft-singer;
33
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer
34
+ ```
35
+
36
+ Then, to train DiffSinger, run:
37
+ ```sh
38
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset
39
+ ```
40
+
41
+ Remember to adjust the "fs2_ckpt" parameter in `usr/configs/popcs_ds_beta6_offline.yaml` to point to your FFT-Singer checkpoint path.
42
+
43
+ ### 3. Inference Example
44
+ ```sh
45
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer
46
+ ```
47
+
48
+ We also provide:
49
+ - the pre-trained model of [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip);
50
+ - the pre-trained model of [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip) for the shallow diffusion mechanism in DiffSinger;
51
+
52
+ Remember to put the pre-trained models in the `checkpoints` directory.
53
+
54
+ *Note that:*
55
+
56
+ - *the original PWG vocoder used in the paper has been put into commercial use, so we provide this HifiGAN vocoder as a substitute.*
57
+ - *we assume the ground-truth F0 to be given as the pitch information, following [1][2][3]. If you want to conduct experiments on MIDI data, you need an external F0 predictor (like [MIDI-A-version](README-SVS-opencpop-cascade.md)) or joint prediction with spectrograms (like [MIDI-B-version](README-SVS-opencpop-e2e.md)).*
58
+
59
+ [1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.
60
+
61
+ [2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.
62
+
63
+ [3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020.
docs/README-SVS.md ADDED
@@ -0,0 +1,76 @@
1
+ # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
3
+ [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
4
+ [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
5
+ | [Interactive🤗 SVS](https://huggingface.co/spaces/Silentlin/DiffSinger)
6
+
7
+ ## DiffSinger (SVS)
8
+
9
+ ### PART1. [Run DiffSinger on PopCS](README-SVS-popcs.md)
10
+ In PART 1, we focus only on spectrum modeling (the acoustic model) and assume the ground-truth (GT) F0 is given as the pitch information, following [1][2][3]. If you want to conduct experiments with F0 prediction, please move to PART 2.
11
+
12
+ Thus, the pipeline of this part can be summarized as:
13
+
14
+ ```
15
+ [lyrics] -> [linguistic representation] (Frontend)
16
+ [linguistic representation] + [GT F0] + [GT phoneme duration] -> [mel-spectrogram] (Acoustic model)
17
+ [mel-spectrogram] + [GT F0] -> [waveform] (Vocoder)
18
+ ```
19
+
20
+
21
+ [1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.
22
+
23
+ [2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.
24
+
25
+ [3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020.
26
+
27
+ Click here for detailed instructions: [link](README-SVS-popcs.md).
28
+
29
+
30
+ ### PART2. [Run DiffSinger on Opencpop](README-SVS-opencpop-cascade.md)
31
+ Thanks to the [Opencpop team](https://wenet.org.cn/opencpop/) for releasing their SVS dataset with MIDI labels on **Jan. 20, 2022** (after we published our paper).
32
+
33
+ Since there are elaborately annotated MIDI labels, we are able to supplement the pipeline in PART 1 by adding a naive melody frontend.
34
+
35
+ #### 2.A
36
+ Thus, the pipeline of [2.A](README-SVS-opencpop-cascade.md) can be summarized as:
37
+
38
+ ```
39
+ [lyrics] + [MIDI] -> [linguistic representation (with MIDI information)] + [predicted F0] + [predicted phoneme duration] (Melody frontend)
40
+ [linguistic representation] + [predicted F0] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model)
41
+ [mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder)
42
+ ```
43
+
44
+ Click here for detailed instructions: [link](README-SVS-opencpop-cascade.md).
45
+
46
+ #### 2.B
47
+ In 2.A, we find that predicting F0 explicitly in the melody frontend produces many bad cases of unvoiced/voiced (uv/v) prediction. Therefore, we abandon explicit prediction of the F0 curve in the melody frontend and predict it jointly with the spectrogram.
48
+
49
+ Thus, the pipeline of [2.B](README-SVS-opencpop-e2e.md) can be summarized as:
50
+ ```
51
+ [lyrics] + [MIDI] -> [linguistic representation] + [predicted phoneme duration] (Melody frontend)
52
+ [linguistic representation (with MIDI information)] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model)
53
+ [mel-spectrogram] -> [predicted F0] (Pitch extractor)
54
+ [mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder)
55
+ ```
56
+
57
+ Click here for detailed instructions: [link](README-SVS-opencpop-e2e.md).
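+ 
+ For readers who prefer code, the 2.B chain above corresponds to the following sketch, adapted from `forward_model`/`run_vocoder` in `inference/svs/ds_e2e.py` and `inference/svs/base_svs_infer.py` (added later in this commit). Here `model`, `pe` and `vocoder` stand for the already-loaded GaussianDiffusion acoustic model, PitchExtractor and NSF HifiGAN, and `batch` is a preprocessed input batch:
+ 
+ ```python
+ import torch
+ 
+ def infer_2b(model, pe, vocoder, batch):
+     """Sketch of the 2.B inference chain: linguistic input -> mel -> F0 -> waveform."""
+     with torch.no_grad():
+         # [linguistic representation (with MIDI)] + [predicted phoneme duration] -> [mel-spectrogram]
+         output = model(batch['txt_tokens'], spk_id=batch.get('spk_ids'), ref_mels=None, infer=True,
+                        pitch_midi=batch['pitch_midi'], midi_dur=batch['midi_dur'],
+                        is_slur=batch['is_slur'])
+         mel_out = output['mel_out']  # [B, T, 80]
+         # [mel-spectrogram] -> [predicted F0] (pitch extractor)
+         f0_pred = pe(mel_out)['f0_denorm_pred']
+         # [mel-spectrogram] + [predicted F0] -> [waveform] (NSF vocoder)
+         wav = vocoder(mel_out.transpose(2, 1), f0_pred).view(-1)
+     return wav
+ ```
+ 
+ (For 2.A, the only difference is that the F0 fed to the vocoder is the explicitly predicted `output['f0_denorm']` from the acoustic model, as in `inference/svs/ds_cascade.py`.)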
58
+
59
+ ### FAQ
60
+ Q1: Why do I need F0 in Vocoders?
61
+
62
+ A1: See the vocoder parts of HiFiSinger, DiffSinger, or SingGAN; this is now common practice.
63
+
64
+ Q2: Why not run the MIDI version of SVS on the PopCS dataset? Or why not release MIDI labels for the PopCS dataset?
65
+
66
+ A2: Our laboratory has no funds to label the PopCS dataset, but there are funds for labeling another singing dataset, which is coming soon.
67
+
68
+ Q3: Why do I get the error `'HifiGAN' object has no attribute 'model'`?
69
+
70
+ A3: Please put the pretrained vocoders in your `checkpoints` directory.
71
+
72
+ Q4: How can I check whether GT information or predicted information is used during inference from the packed test set?
73
+
74
+ A4: Please see the code [here](https://github.com/MoonInTheRiver/DiffSinger/blob/55e2f46068af6e69940a9f8f02d306c24a940cab/tasks/tts/fs2.py#L343).
75
+
76
+ ...
docs/README-TTS.md ADDED
@@ -0,0 +1,69 @@
1
+ # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
3
+ [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
4
+ [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
5
+ | [Interactive🤗 TTS](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
6
+
7
+ ## DiffSpeech (TTS)
8
+ ### 1. Preparation
9
+
10
+ #### Data Preparation
11
+ a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/`
12
+
13
+ b) Download and unpack the [ground-truth durations](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar) extracted by [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/`
14
+
15
+ c) Run the following scripts to pack the dataset for training/inference.
16
+
17
+ ```sh
18
+ export PYTHONPATH=.
19
+ CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml
20
+
21
+ # `data/binary/ljspeech` will be generated.
22
+ ```
23
+
24
+ #### Vocoder Preparation
25
+ We provide the pre-trained model of the [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder.
26
+ Please unzip this file into `checkpoints` before training your acoustic model.
27
+
28
+ ### 2. Training Example
29
+
30
+ First, you need a pre-trained FastSpeech 2 checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), or train FastSpeech 2 from scratch by running:
31
+ ```sh
32
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset
33
+ ```
34
+ Then, to train DiffSpeech, run:
35
+ ```sh
36
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset
37
+ ```
38
+
39
+ Remember to adjust the "fs2_ckpt" parameter in `usr/configs/lj_ds_beta6.yaml` to point to your FastSpeech 2 checkpoint path.
40
+
41
+ ### 3. Inference Example
42
+
43
+ ```sh
44
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer
45
+ ```
46
+
47
+ We also provide:
48
+ - the pre-trained model of [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip);
49
+ - the individual pre-trained model of [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip) for the shallow diffusion mechanism in DiffSpeech;
50
+
51
+ Remember to put the pre-trained models in the `checkpoints` directory.
52
+
53
+ ## Mel Visualization
54
+ Along the vertical axis, DiffSpeech occupies mel bins [0-80] and FastSpeech 2 occupies mel bins [80-160].
55
+
56
+ <table style="width:100%">
57
+ <tr>
58
+ <th>DiffSpeech vs. FastSpeech 2</th>
59
+ </tr>
60
+ <tr>
61
+ <td><img src="resources/diffspeech-fs2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
62
+ </tr>
63
+ <tr>
64
+ <td><img src="resources/diffspeech-fs2-1.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
65
+ </tr>
66
+ <tr>
67
+ <td><img src="resources/diffspeech-fs2-2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
68
+ </tr>
69
+ </table>
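+ 
+ A rough sketch (not part of the original release) of how such a stacked comparison image can be produced, assuming `mel_diffspeech` and `mel_fs2` are `[T, 80]` mel-spectrograms taken from the two models' outputs:
+ 
+ ```python
+ import numpy as np
+ import matplotlib.pyplot as plt
+ 
+ # Placeholder arrays; in practice use the [T, 80] `mel_out` of DiffSpeech and FastSpeech 2.
+ mel_diffspeech = np.random.rand(400, 80)
+ mel_fs2 = np.random.rand(400, 80)
+ 
+ # Stack along the mel-bin axis: bins 0-80 -> DiffSpeech, bins 80-160 -> FastSpeech 2.
+ stacked = np.concatenate([mel_diffspeech, mel_fs2], axis=1)
+ 
+ plt.figure(figsize=(10, 4))
+ plt.imshow(stacked.T, origin='lower', aspect='auto')
+ plt.axhline(80, color='white', linewidth=0.8)  # boundary between the two models
+ plt.xlabel('frame')
+ plt.ylabel('mel bin (0-80: DiffSpeech, 80-160: FastSpeech 2)')
+ plt.tight_layout()
+ plt.savefig('diffspeech-fs2.png')
+ ```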
docs/README-zh.md ADDED
@@ -0,0 +1,212 @@
1
+ # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
3
+ [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
4
+ [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
5
+ | [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-blue)](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
6
+ | [English README](../README.md)
7
+
8
+ 本仓库包含了我们的AAAI-2022 [论文](https://arxiv.org/abs/2105.02446)中提出的DiffSpeech (用于语音合成) 与 DiffSinger (用于歌声合成) 的官方Pytorch实现。
9
+
10
+ <table style="width:100%">
11
+ <tr>
12
+ <th>DiffSinger/DiffSpeech训练阶段</th>
13
+ <th>DiffSinger/DiffSpeech推理阶段</th>
14
+ </tr>
15
+ <tr>
16
+ <td><img src="resources/model_a.png" alt="Training" height="300"></td>
17
+ <td><img src="resources/model_b.png" alt="Inference" height="300"></td>
18
+ </tr>
19
+ </table>
20
+
21
+ :tada: :tada: :tada: **一些重要更新**:
22
+ - Mar.2, 2022: [MIDI-新版](README-SVS-opencpop-e2e.md): 重大更新 :sparkles:
23
+ - Mar.1, 2022: [NeuralSVB](https://github.com/MoonInTheRiver/NeuralSVB), 为了歌声美化任务的代码,开源了 :sparkles: :sparkles: :sparkles: .
24
+ - Feb.13, 2022: [NATSpeech](https://github.com/NATSpeech/NATSpeech), 一个升级后的代码框架, 包含了DiffSpeech和我们NeurIPS-2021的工作[PortaSpeech](https://openreview.net/forum?id=xmJsuh8xlq) 已经开源! :sparkles: :sparkles: :sparkles:.
25
+ - Jan.29, 2022: 支持了[MIDI-旧版](README-SVS-opencpop-cascade.md) 版本的歌声合成系统.
26
+ - Jan.13, 2022: 支持了歌声合成系统, 开源了PopCS数据集.
27
+ - Dec.19, 2021: 支持了语音合成系统. [HuggingFace🤗 Demo](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
28
+
29
+ :rocket: **新闻**:
30
+ - Feb.24, 2022: 我们的新工作`NeuralSVB` 被 ACL-2022 接收 [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2202.13277). [音频演示](https://neuralsvb.github.io).
31
+ - Dec.01, 2021: DiffSinger被AAAI-2022接收.
32
+ - Sep.29, 2021: 我们的新工作`PortaSpeech: Portable and High-Quality Generative Text-to-Speech` 被NeurIPS-2021接收 [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2109.15166) .
33
+ - May.06, 2021: 我们把这篇DiffSinger提交到了公开论文网站: Arxiv [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446).
34
+
35
+ ## 安装依赖
36
+ ```sh
37
+ conda create -n your_env_name python=3.8
38
+ source activate your_env_name
39
+ pip install -r requirements_2080.txt (GPU 2080Ti, CUDA 10.2)
40
+ or pip install -r requirements_3090.txt (GPU 3090, CUDA 11.4)
41
+ ```
42
+
43
+ ## DiffSpeech (语音合成的版本)
44
+ ### 1. 准备工作
45
+
46
+ #### 数据准备
47
+ a) 下载并解压 [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), 创建软链接: `ln -s /xxx/LJSpeech-1.1/ data/raw/`
48
+
49
+ b) 下载并解压 [我们用MFA预处理好的对齐](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/`
50
+
51
+ c) 按照如下脚本给数据集打包,打包后的二进制文件用于后续的训练和推理.
52
+
53
+ ```sh
54
+ export PYTHONPATH=.
55
+ CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml
56
+
57
+ # `data/binary/ljspeech` will be generated.
58
+ ```
59
+
60
+ #### 声码器准备
61
+ 我们提供了[HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip)声码器的预训练模型.
62
+ 请在训练声学模型前,先把声码器文件解压到`checkpoints`里。
63
+
64
+ ### 2. 训练样例
65
+
66
+ 首先你需要一个预训练好的FastSpeech2存档点. 你可以用[我们预训练好的模型](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), 或者跑下面这个指令从零开始训练FastSpeech2:
67
+ ```sh
68
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset
69
+ ```
70
+ 然后为了训练DiffSpeech, 运行:
71
+ ```sh
72
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset
73
+ ```
74
+
75
+ 记得针对你的路径修改`usr/configs/lj_ds_beta6.yaml`里"fs2_ckpt"这个参数.
76
+
77
+ ### 3. 推理样例
78
+
79
+ ```sh
80
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer
81
+ ```
82
+
83
+ 我们也提供了:
84
+ - [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip)的预训练模型;
85
+ - [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip)的预训练模型, 这是为了DiffSpeech里的浅扩散机制;
86
+
87
+ 记得把预训练模型放在 `checkpoints` 目录.
88
+
89
+ ## DiffSinger (歌声合成的版本)
90
+
91
+ ### 0. 数据获取
92
+ - 见 [申请表](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md).
93
+ - 数据集 [预览](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_preview.zip).
94
+
95
+ ### 1. 准备工作
96
+ #### 数据准备
97
+ a) 下载并解压PopCS, 创建软链接: `ln -s /xxx/popcs/ data/processed/popcs`
98
+
99
+ b) 按照如下脚本给数据集打包,打包后的二进制文件用于后续的训练和推理.
100
+ ```sh
101
+ export PYTHONPATH=.
102
+ CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml
103
+ # `data/binary/popcs-pmf0` 会生成出来.
104
+ ```
105
+
106
+ #### 声码器准备
107
+ 我们提供了[HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip)的预训练模型, 它专门为了歌声合成系统设计, 采用了NSF的技术。
108
+ 请在训练声学模型前,先把声码器文件解压到`checkpoints`里。
109
+
110
+ (更新: 你也可以将我们提供的[训练更多步数的存档点](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt)放到声码器的文件夹里)
111
+
112
+ 这个声码器是在大约70小时的较大数据集上训练的, 可以被认为是一个通用声码器。
113
+
114
+ ### 2. 训练样例
115
+ 首先你需要一个预训练好的FFT-Singer. 你可以用[我们预训练好的模型](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), 或者用如下脚本从零训练FFT-Singer:
116
+
117
+ ```sh
118
+ # First, train fft-singer;
119
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset
120
+ # Then, infer fft-singer;
121
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer
122
+ ```
123
+
124
+ 然后, 为了训练DiffSinger, 运行:
125
+ ```sh
126
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset
127
+ ```
128
+
129
+ 记得针对你的路径修改`usr/configs/popcs_ds_beta6_offline.yaml`里"fs2_ckpt"这个参数.
130
+
131
+ ### 3. 推理样例
132
+ ```sh
133
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer
134
+ ```
135
+
136
+ 我们也提供了:
137
+ - [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip)的预训练模型;
138
+ - [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip)的预训练模型, 这是为了DiffSinger里的浅扩散机制;
139
+
140
+ 记得把预训练模型放在 `checkpoints` 目录.
141
+
142
+ *请注意:*
143
+
144
+ - *我们原始论文中的PWG版本声码器已投入商业使用,因此我们提供此HifiGAN版本声码器作为替代品。*
145
+
146
+ - *我们这篇论文假设提供真实的F0来进行实验,如[1][2][3]等前作所做的那样,重点在频谱建模上,而非F0曲线的预测。如果你想对MIDI数据进行实验,从MIDI和歌词预测F0曲线(显式或隐式),请查看文档[MIDI-old-version](README-SVS-opencpop-cascade.md) 或 [MIDI-new-version](README-SVS-opencpop-e2e.md)。目前已经支持的MIDI数据集有: Opencpop*
147
+
148
+ [1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.
149
+
150
+ [2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.
151
+
152
+ [3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020.
153
+
154
+ ## Tensorboard
155
+ ```sh
156
+ tensorboard --logdir_spec exp_name
157
+ ```
158
+ <table style="width:100%">
159
+ <tr>
160
+ <td><img src="resources/tfb.png" alt="Tensorboard" height="250"></td>
161
+ </tr>
162
+ </table>
163
+
164
+ ## Mel 可视化
165
+ 沿着纵轴, DiffSpeech: [0-80]; FastSpeech2: [80-160].
166
+
167
+ <table style="width:100%">
168
+ <tr>
169
+ <th>DiffSpeech vs. FastSpeech 2</th>
170
+ </tr>
171
+ <tr>
172
+ <td><img src="resources/diffspeech-fs2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
173
+ </tr>
174
+ <tr>
175
+ <td><img src="resources/diffspeech-fs2-1.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
176
+ </tr>
177
+ <tr>
178
+ <td><img src="resources/diffspeech-fs2-2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
179
+ </tr>
180
+ </table>
181
+
182
+ ## Audio Demos
183
+ 音频样本可以看我们的[样例页](https://diffsinger.github.io/).
184
+
185
+ 我们也放了部分由DiffSpeech+HifiGAN (标记为[P]) 和 GTmel+HifiGAN (标记为[G]) 生成的测试集音频样例在:[resources/demos_1213](../resources/demos_1213).
186
+
187
+ (对应这个预训练参数:[DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip))
188
+
189
+ ---
190
+ :rocket: :rocket: :rocket: **更新:**
191
+
192
+ 新生成的歌声样例在:[resources/demos_0112](../resources/demos_0112).
193
+
194
+ ## Citation
195
+ 如果本仓库对你的研究和工作有用,请引用以下论文:
196
+
197
+ @article{liu2021diffsinger,
198
+ title={Diffsinger: Singing voice synthesis via shallow diffusion mechanism},
199
+ author={Liu, Jinglin and Li, Chengxi and Ren, Yi and Chen, Feiyang and Liu, Peng and Zhao, Zhou},
200
+ journal={arXiv preprint arXiv:2105.02446},
201
+ volume={2},
202
+ year={2021}}
203
+
204
+
205
+ ## 鸣谢
206
+ 我们的代码基于如下仓库:
207
+ * [denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch)
208
+ * [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning)
209
+ * [ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN)
210
+ * [HifiGAN](https://github.com/jik876/hifi-gan)
211
+ * [espnet](https://github.com/espnet/espnet)
212
+ * [DiffWave](https://github.com/lmnt-com/diffwave)
inference/svs/base_svs_infer.py ADDED
@@ -0,0 +1,265 @@
1
+ import os
2
+
3
+ import torch
4
+ import numpy as np
5
+ from modules.hifigan.hifigan import HifiGanGenerator
6
+ from vocoders.hifigan import HifiGAN
7
+ from inference.svs.opencpop.map import cpop_pinyin2ph_func
8
+
9
+ from utils import load_ckpt
10
+ from utils.hparams import set_hparams, hparams
11
+ from utils.text_encoder import TokenTextEncoder
12
+ from pypinyin import pinyin, lazy_pinyin, Style
13
+ import librosa
14
+ import glob
15
+ import re
16
+
17
+
18
+ class BaseSVSInfer:
19
+ def __init__(self, hparams, device=None):
20
+ if device is None:
21
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
22
+ self.hparams = hparams
23
+ self.device = device
24
+
25
+ phone_list = ["AP", "SP", "a", "ai", "an", "ang", "ao", "b", "c", "ch", "d", "e", "ei", "en", "eng", "er", "f", "g",
26
+ "h", "i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "j", "k", "l", "m", "n", "o",
27
+ "ong", "ou", "p", "q", "r", "s", "sh", "t", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo", "v",
28
+ "van", "ve", "vn", "w", "x", "y", "z", "zh"]
29
+ self.ph_encoder = TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
30
+ self.pinyin2phs = cpop_pinyin2ph_func()
31
+ self.spk_map = {'opencpop': 0}
32
+
33
+ self.model = self.build_model()
34
+ self.model.eval()
35
+ self.model.to(self.device)
36
+ self.vocoder = self.build_vocoder()
37
+ self.vocoder.eval()
38
+ self.vocoder.to(self.device)
39
+
40
+ def build_model(self):
41
+ raise NotImplementedError
42
+
43
+ def forward_model(self, inp):
44
+ raise NotImplementedError
45
+
46
+ def build_vocoder(self):
47
+ base_dir = hparams['vocoder_ckpt']
48
+ config_path = f'{base_dir}/config.yaml'
49
+ ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'), key=
50
+ lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).ckpt', x)[0]))[-1]
51
+ print('| load HifiGAN: ', ckpt)
52
+ ckpt_dict = torch.load(ckpt, map_location="cpu")
53
+ config = set_hparams(config_path, global_hparams=False)
54
+ state = ckpt_dict["state_dict"]["model_gen"]
55
+ vocoder = HifiGanGenerator(config)
56
+ vocoder.load_state_dict(state, strict=True)
57
+ vocoder.remove_weight_norm()
58
+ vocoder = vocoder.eval().to(self.device)
59
+ return vocoder
60
+
61
+ def run_vocoder(self, c, **kwargs):
62
+ c = c.transpose(2, 1) # [B, 80, T]
63
+ f0 = kwargs.get('f0') # [B, T]
64
+ if f0 is not None and hparams.get('use_nsf'):
65
+ # f0 = torch.FloatTensor(f0).to(self.device)
66
+ y = self.vocoder(c, f0).view(-1)
67
+ else:
68
+ y = self.vocoder(c).view(-1)
69
+ # [T]
70
+ return y[None]
71
+
72
+ def preprocess_word_level_input(self, inp):
73
+ # Pypinyin can't solve polyphonic words
74
+ text_raw = inp['text'].replace('最长', '最常').replace('长睫毛', '常睫毛') \
75
+ .replace('那么长', '那么常').replace('多长', '多常') \
76
+ .replace('很长', '很常') # We hope someone could provide a better g2p module for us by opening pull requests.
77
+
78
+ # lyric
79
+ pinyins = lazy_pinyin(text_raw, strict=False)
80
+ ph_per_word_lst = [self.pinyin2phs[pinyin.strip()] for pinyin in pinyins if pinyin.strip() in self.pinyin2phs]
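+ # e.g. '小酒窝' -> lazy_pinyin -> ['xiao', 'jiu', 'wo'] -> pinyin2phs -> ['x iao', 'j iu', 'w o']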
81
+
82
+ # Note
83
+ note_per_word_lst = [x.strip() for x in inp['notes'].split('|') if x.strip() != '']
84
+ mididur_per_word_lst = [x.strip() for x in inp['notes_duration'].split('|') if x.strip() != '']
85
+
86
+ if len(note_per_word_lst) == len(ph_per_word_lst) == len(mididur_per_word_lst):
87
+ print('Pass word-notes check.')
88
+ else:
89
+ print('The number of words doesn\'t match the number of notes\' windows. ',
90
+ 'You should split the note(s) for each word by | mark.')
91
+ print(ph_per_word_lst, note_per_word_lst, mididur_per_word_lst)
92
+ print(len(ph_per_word_lst), len(note_per_word_lst), len(mididur_per_word_lst))
93
+ return None
94
+
95
+ note_lst = []
96
+ ph_lst = []
97
+ midi_dur_lst = []
98
+ is_slur = []
99
+ for idx, ph_per_word in enumerate(ph_per_word_lst):
100
+ # for phs in one word:
101
+ # single ph like ['ai'] or multiple phs like ['n', 'i']
102
+ ph_in_this_word = ph_per_word.split()
103
+
104
+ # for notes in one word:
105
+ # single note like ['D4'] or multiple notes like ['D4', 'E4'] which means a 'slur' here.
106
+ note_in_this_word = note_per_word_lst[idx].split()
107
+ midi_dur_in_this_word = mididur_per_word_lst[idx].split()
108
+ # process for the model input
109
+ # Step 1.
110
+ # Deal with note of 'not slur' case or the first note of 'slur' case
111
+ # j ie
112
+ # F#4/Gb4 F#4/Gb4
113
+ # 0 0
114
+ for ph in ph_in_this_word:
115
+ ph_lst.append(ph)
116
+ note_lst.append(note_in_this_word[0])
117
+ midi_dur_lst.append(midi_dur_in_this_word[0])
118
+ is_slur.append(0)
119
+ # step 2.
120
+ # Deal with the 2nd, 3rd... notes of 'slur' case
121
+ # j ie ie
122
+ # F#4/Gb4 F#4/Gb4 C#4/Db4
123
+ # 0 0 1
124
+ if len(note_in_this_word) > 1: # is_slur = True, we should repeat the YUNMU to match the 2nd, 3rd... notes.
125
+ for idx in range(1, len(note_in_this_word)):
126
+ ph_lst.append(ph_in_this_word[-1])
127
+ note_lst.append(note_in_this_word[idx])
128
+ midi_dur_lst.append(midi_dur_in_this_word[idx])
129
+ is_slur.append(1)
130
+ ph_seq = ' '.join(ph_lst)
131
+
132
+ if len(ph_lst) == len(note_lst) == len(midi_dur_lst):
133
+ print(len(ph_lst), len(note_lst), len(midi_dur_lst))
134
+ print('Pass word-notes check.')
135
+ else:
136
+ print('The number of words doesn\'t match the number of notes\' windows. ',
137
+ 'You should split the note(s) for each word by | mark.')
138
+ return None
139
+ return ph_seq, note_lst, midi_dur_lst, is_slur
140
+
141
+ def preprocess_phoneme_level_input(self, inp):
142
+ ph_seq = inp['ph_seq']
143
+ note_lst = inp['note_seq'].split()
144
+ midi_dur_lst = inp['note_dur_seq'].split()
145
+ is_slur = [float(x) for x in inp['is_slur_seq'].split()]
146
+ print(len(note_lst), len(ph_seq.split()), len(midi_dur_lst))
147
+ if len(note_lst) == len(ph_seq.split()) == len(midi_dur_lst):
148
+ print('Pass word-notes check.')
149
+ else:
150
+ print('The number of words doesn\'t match the number of notes\' windows. ',
151
+ 'You should split the note(s) for each word by | mark.')
152
+ return None
153
+ return ph_seq, note_lst, midi_dur_lst, is_slur
154
+
155
+ def preprocess_input(self, inp, input_type='word'):
156
+ """
157
+ :param inp: a dict describing one song segment. For word-level input: {'text': str, 'notes': str, 'notes_duration': str, 'input_type': 'word'}; for phoneme-level input (like transcriptions.txt in Opencpop): {'text': str, 'ph_seq': str, 'note_seq': str, 'note_dur_seq': str, 'is_slur_seq': str, 'input_type': 'phoneme'}. 'item_name' and 'spk_name' are optional.
+ :return: a preprocessed item dict for input_to_batch, or None if the input fails the checks.
160
+ """
161
+
162
+ item_name = inp.get('item_name', '<ITEM_NAME>')
163
+ spk_name = inp.get('spk_name', 'opencpop')
164
+
165
+ # single spk
166
+ spk_id = self.spk_map[spk_name]
167
+
168
+ # get ph seq, note lst, midi dur lst, is slur lst.
169
+ if input_type == 'word':
170
+ ret = self.preprocess_word_level_input(inp)
171
+ elif input_type == 'phoneme': # like transcriptions.txt in Opencpop dataset.
172
+ ret = self.preprocess_phoneme_level_input(inp)
173
+ else:
174
+ print('Invalid input type.')
175
+ return None
176
+
177
+ if ret:
178
+ ph_seq, note_lst, midi_dur_lst, is_slur = ret
179
+ else:
180
+ print('==========> Preprocessing of word-level or phoneme-level input failed.')
181
+ return None
182
+
183
+ # convert note lst to midi id; convert note dur lst to midi duration
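+ # e.g. 'C#4/Db4' -> keep the first spelling 'C#4' -> librosa.note_to_midi('C#4') == 61; 'rest' is mapped to midi id 0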
184
+ try:
185
+ midis = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
186
+ for x in note_lst]
187
+ midi_dur_lst = [float(x) for x in midi_dur_lst]
188
+ except Exception as e:
189
+ print(e)
190
+ print('Invalid Input Type.')
191
+ return None
192
+
193
+ ph_token = self.ph_encoder.encode(ph_seq)
194
+ item = {'item_name': item_name, 'text': inp['text'], 'ph': ph_seq, 'spk_id': spk_id,
195
+ 'ph_token': ph_token, 'pitch_midi': np.asarray(midis), 'midi_dur': np.asarray(midi_dur_lst),
196
+ 'is_slur': np.asarray(is_slur), }
197
+ item['ph_len'] = len(item['ph_token'])
198
+ return item
199
+
200
+ def input_to_batch(self, item):
201
+ item_names = [item['item_name']]
202
+ text = [item['text']]
203
+ ph = [item['ph']]
204
+ txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
205
+ txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
206
+ spk_ids = torch.LongTensor(item['spk_id'])[None, :].to(self.device)
207
+
208
+ pitch_midi = torch.LongTensor(item['pitch_midi'])[None, :hparams['max_frames']].to(self.device)
209
+ midi_dur = torch.FloatTensor(item['midi_dur'])[None, :hparams['max_frames']].to(self.device)
210
+ is_slur = torch.LongTensor(item['is_slur'])[None, :hparams['max_frames']].to(self.device)
211
+
212
+ batch = {
213
+ 'item_name': item_names,
214
+ 'text': text,
215
+ 'ph': ph,
216
+ 'txt_tokens': txt_tokens,
217
+ 'txt_lengths': txt_lengths,
218
+ 'spk_ids': spk_ids,
219
+ 'pitch_midi': pitch_midi,
220
+ 'midi_dur': midi_dur,
221
+ 'is_slur': is_slur
222
+ }
223
+ return batch
224
+
225
+ def postprocess_output(self, output):
226
+ return output
227
+
228
+ def infer_once(self, inp):
229
+ inp = self.preprocess_input(inp, input_type=inp['input_type'] if inp.get('input_type') else 'word')
230
+ output = self.forward_model(inp)
231
+ output = self.postprocess_output(output)
232
+ return output
233
+
234
+ @classmethod
235
+ def example_run(cls, inp):
236
+ from utils.audio import save_wav
237
+ set_hparams(print_hparams=False)
238
+ infer_ins = cls(hparams)
239
+ out = infer_ins.infer_once(inp)
240
+ os.makedirs('infer_out', exist_ok=True)
241
+ save_wav(out, f'infer_out/example_out.wav', hparams['audio_sample_rate'])
242
+
243
+
244
+ # if __name__ == '__main__':
245
+ # debug
246
+ # a = BaseSVSInfer(hparams)
247
+ # a.preprocess_input({'text': '你 说 你 不 SP 懂 为 何 在 这 时 牵 手 AP',
248
+ # 'notes': 'D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | rest | D#4/Eb4 | D4 | D4 | D4 | D#4/Eb4 | F4 | D#4/Eb4 | D4 | rest',
249
+ # 'notes_duration': '0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590'
250
+ # })
251
+
252
+ # b = {
253
+ # 'text': '小酒窝长睫毛AP是你最美的记号',
254
+ # 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
255
+ # 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340'
256
+ # }
257
+ # c = {
258
+ # 'text': '小酒窝长睫毛AP是你最美的记号',
259
+ # 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
260
+ # 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
261
+ # 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
262
+ # 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
263
+ # } # input like Opencpop dataset.
264
+ # a.preprocess_input(b)
265
+ # a.preprocess_input(c, input_type='phoneme')
inference/svs/ds_cascade.py ADDED
@@ -0,0 +1,56 @@
1
+ import torch
2
+ # from inference.tts.fs import FastSpeechInfer
3
+ # from modules.tts.fs2_orig import FastSpeech2Orig
4
+ from inference.svs.base_svs_infer import BaseSVSInfer
5
+ from utils import load_ckpt
6
+ from utils.hparams import hparams
7
+ from usr.diff.shallow_diffusion_tts import GaussianDiffusion
8
+ from usr.diffsinger_task import DIFF_DECODERS
9
+
10
+ class DiffSingerCascadeInfer(BaseSVSInfer):
11
+ def build_model(self):
12
+ model = GaussianDiffusion(
13
+ phone_encoder=self.ph_encoder,
14
+ out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
15
+ timesteps=hparams['timesteps'],
16
+ K_step=hparams['K_step'],
17
+ loss_type=hparams['diff_loss_type'],
18
+ spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
19
+ )
20
+ model.eval()
21
+ load_ckpt(model, hparams['work_dir'], 'model')
22
+ return model
23
+
24
+ def forward_model(self, inp):
25
+ sample = self.input_to_batch(inp)
26
+ txt_tokens = sample['txt_tokens'] # [B, T_t]
27
+ spk_id = sample.get('spk_ids')
28
+ with torch.no_grad():
29
+ output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
30
+ pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
31
+ is_slur=sample['is_slur'])
32
+ mel_out = output['mel_out'] # [B, T,80]
33
+ f0_pred = output['f0_denorm']
34
+ wav_out = self.run_vocoder(mel_out, f0=f0_pred)
35
+ wav_out = wav_out.cpu().numpy()
36
+ return wav_out[0]
37
+
38
+
39
+ if __name__ == '__main__':
40
+ inp = {
41
+ 'text': '小酒窝长睫毛AP是你最美的记号',
42
+ 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
43
+ 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
44
+ 'input_type': 'word'
45
+ } # user input: Chinese characters
46
+ c = {
47
+ 'text': '小酒窝长睫毛AP是你最美的记号',
48
+ 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
49
+ 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
50
+ 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
51
+ 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
52
+ 'input_type': 'phoneme'
53
+ } # input like Opencpop dataset.
54
+ DiffSingerCascadeInfer.example_run(inp)
55
+
56
+ # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
inference/svs/ds_e2e.py ADDED
@@ -0,0 +1,67 @@
1
+ import torch
2
+ # from inference.tts.fs import FastSpeechInfer
3
+ # from modules.tts.fs2_orig import FastSpeech2Orig
4
+ from inference.svs.base_svs_infer import BaseSVSInfer
5
+ from utils import load_ckpt
6
+ from utils.hparams import hparams
7
+ from usr.diff.shallow_diffusion_tts import GaussianDiffusion
8
+ from usr.diffsinger_task import DIFF_DECODERS
9
+ from modules.fastspeech.pe import PitchExtractor
10
+ import utils
11
+
12
+
13
+ class DiffSingerE2EInfer(BaseSVSInfer):
14
+ def build_model(self):
15
+ model = GaussianDiffusion(
16
+ phone_encoder=self.ph_encoder,
17
+ out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
18
+ timesteps=hparams['timesteps'],
19
+ K_step=hparams['K_step'],
20
+ loss_type=hparams['diff_loss_type'],
21
+ spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
22
+ )
23
+ model.eval()
24
+ load_ckpt(model, hparams['work_dir'], 'model')
25
+
26
+ if hparams.get('pe_enable') is not None and hparams['pe_enable']:
27
+ self.pe = PitchExtractor().to(self.device)
28
+ utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
29
+ self.pe.eval()
30
+ return model
31
+
32
+ def forward_model(self, inp):
33
+ sample = self.input_to_batch(inp)
34
+ txt_tokens = sample['txt_tokens'] # [B, T_t]
35
+ spk_id = sample.get('spk_ids')
36
+ with torch.no_grad():
37
+ output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
38
+ pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
39
+ is_slur=sample['is_slur'])
40
+ mel_out = output['mel_out'] # [B, T,80]
41
+ if hparams.get('pe_enable') is not None and hparams['pe_enable']:
42
+ f0_pred = self.pe(mel_out)['f0_denorm_pred'] # pe predict from Pred mel
43
+ else:
44
+ f0_pred = output['f0_denorm']
45
+ wav_out = self.run_vocoder(mel_out, f0=f0_pred)
46
+ wav_out = wav_out.cpu().numpy()
47
+ return wav_out[0]
48
+
49
+ if __name__ == '__main__':
50
+ inp = {
51
+ 'text': '小酒窝长睫毛AP是你最美的记号',
52
+ 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
53
+ 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
54
+ 'input_type': 'word'
55
+ } # user input: Chinese characters
56
+ inp = {
57
+ 'text': '小酒窝长睫毛AP是你最美的记号',
58
+ 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
59
+ 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
60
+ 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
61
+ 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
62
+ 'input_type': 'phoneme'
63
+ } # input like Opencpop dataset.
64
+ DiffSingerE2EInfer.example_run(inp)
65
+
66
+
67
+ # CUDA_VISIBLE_DEVICES=3 python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel