Commit b2c3cd9
Duplicate from Silentlin/DiffSinger

Co-authored-by: Jinglin Liu <[email protected]>

This view is limited to 50 files because the commit contains too many changes.
- .gitattributes +33 -0
- LICENSE +21 -0
- README.md +10 -0
- checkpoints/.gitattributes +1 -0
- checkpoints/.gitkeep +0 -0
- checkpoints/0102_xiaoma_pe/config.yaml +172 -0
- checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt +3 -0
- checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml +241 -0
- checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt +3 -0
- checkpoints/0228_opencpop_ds100_rel/config.yaml +342 -0
- checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt +3 -0
- checkpoints/0831_opencpop_ds1000/config.yaml +346 -0
- checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt +3 -0
- checkpoints/clean.py +12 -0
- checkpoints/cleaner.py +8 -0
- configs/config_base.yaml +42 -0
- configs/singing/base.yaml +42 -0
- configs/singing/fs2.yaml +3 -0
- configs/tts/base.yaml +95 -0
- configs/tts/base_zh.yaml +3 -0
- configs/tts/fs2.yaml +80 -0
- configs/tts/hifigan.yaml +21 -0
- configs/tts/lj/base_mel2wav.yaml +3 -0
- configs/tts/lj/base_text2mel.yaml +13 -0
- configs/tts/lj/fs2.yaml +3 -0
- configs/tts/lj/hifigan.yaml +3 -0
- configs/tts/lj/pwg.yaml +3 -0
- configs/tts/pwg.yaml +110 -0
- data/processed/ljspeech/dict.txt +77 -0
- data/processed/ljspeech/metadata_phone.csv +0 -0
- data/processed/ljspeech/mfa_dict.txt +0 -0
- data/processed/ljspeech/phone_set.json +1 -0
- data_gen/singing/binarize.py +398 -0
- data_gen/tts/base_binarizer.py +224 -0
- data_gen/tts/bin/binarize.py +20 -0
- data_gen/tts/binarizer_zh.py +59 -0
- data_gen/tts/data_gen_utils.py +347 -0
- data_gen/tts/txt_processors/base_text_processor.py +8 -0
- data_gen/tts/txt_processors/en.py +78 -0
- data_gen/tts/txt_processors/zh.py +41 -0
- data_gen/tts/txt_processors/zh_g2pM.py +72 -0
- docs/README-SVS-opencpop-cascade.md +111 -0
- docs/README-SVS-opencpop-e2e.md +107 -0
- docs/README-SVS-popcs.md +63 -0
- docs/README-SVS.md +76 -0
- docs/README-TTS.md +69 -0
- docs/README-zh.md +212 -0
- inference/svs/base_svs_infer.py +265 -0
- inference/svs/ds_cascade.py +56 -0
- inference/svs/ds_e2e.py +67 -0
.gitattributes
ADDED
@@ -0,0 +1,33 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+model_ckpt_steps* filter=lfs diff=lfs merge=lfs -text
+checkpoints/0831_opencpop_ds1000 filter=lfs diff=lfs merge=lfs -text
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Jinglin Liu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
ADDED
@@ -0,0 +1,10 @@
+---
+title: DiffSinger🎶 Diffusion for Singing Voice Synthesis
+emoji: 🎶
+colorFrom: purple
+colorTo: blue
+sdk: gradio
+app_file: inference/svs/gradio/infer.py
+pinned: false
+duplicated_from: Silentlin/DiffSinger
+---
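For reference, the block above is the standard Hugging Face Spaces front matter: `sdk: gradio` selects the runtime, `app_file` points at the Gradio entry script, and `duplicated_from` records the Space this one was copied from. The `title`, `emoji`, `colorFrom`, and `colorTo` fields only affect how the Space card is rendered.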
checkpoints/.gitattributes
ADDED
@@ -0,0 +1 @@
+model_ckpt_steps* filter=lfs diff=lfs merge=lfs -text
checkpoints/.gitkeep
ADDED
File without changes
checkpoints/0102_xiaoma_pe/config.yaml
ADDED
@@ -0,0 +1,172 @@
+accumulate_grad_batches: 1
+audio_num_mel_bins: 80
+audio_sample_rate: 24000
+base_config:
+- configs/tts/lj/fs2.yaml
+binarization_args:
+  shuffle: false
+  with_align: true
+  with_f0: true
+  with_f0cwt: true
+  with_spk_embed: true
+  with_txt: true
+  with_wav: false
+binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+binary_data_dir: data/binary/xiaoma1022_24k_128hop
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+decoder_type: fft
+dict_dir: ''
+dropout: 0.1
+ds_workers: 4
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+enc_ffn_kernel_size: 9
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 512
+fmax: 12000
+fmin: 30
+gen_dir_name: ''
+hidden_size: 256
+hop_size: 128
+infer: false
+lambda_commit: 0.25
+lambda_energy: 0.1
+lambda_f0: 1.0
+lambda_ph_dur: 1.0
+lambda_sent_dur: 1.0
+lambda_uv: 1.0
+lambda_word_dur: 1.0
+load_ckpt: ''
+log_interval: 100
+loud_norm: false
+lr: 2.0
+max_epochs: 1000
+max_eval_sentences: 1
+max_eval_tokens: 60000
+max_frames: 5000
+max_input_tokens: 1550
+max_sentences: 100000
+max_tokens: 20000
+max_updates: 60000
+mel_loss: l1
+mel_vmax: 1.5
+mel_vmin: -6
+min_level_db: -120
+norm_type: gn
+num_ckpt_keep: 3
+num_heads: 2
+num_sanity_val_steps: 5
+num_spk: 1
+num_test_samples: 20
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pitch_ar: false
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor_conv_layers: 2
+pitch_loss: l1
+pitch_norm: log
+pitch_type: frame
+pre_align_args:
+  allow_no_txt: false
+  denoise: false
+  forced_align: mfa
+  txt_processor: en
+  use_sox: false
+  use_tone: true
+pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
+predictor_dropout: 0.5
+predictor_grad: 0.1
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 2
+prenet_dropout: 0.5
+prenet_hidden_size: 256
+pretrain_fs_ckpt: ''
+processed_data_dir: data/processed/ljspeech
+profile_infer: false
+raw_data_dir: data/raw/LJSpeech-1.1
+ref_norm_layer: bn
+reset_phone_dict: true
+save_best: false
+save_ckpt: true
+save_codes:
+- configs
+- modules
+- tasks
+- utils
+- usr
+save_f0: false
+save_gt: false
+seed: 1234
+sort_by_len: true
+stop_token_weight: 5.0
+task_cls: tasks.tts.pe.PitchExtractionTask
+test_ids:
+- 68
+- 70
+- 74
+- 87
+- 110
+- 172
+- 190
+- 215
+- 231
+- 294
+- 316
+- 324
+- 402
+- 422
+- 485
+- 500
+- 505
+- 508
+- 509
+- 519
+test_input_dir: ''
+test_num: 523
+test_set_name: test
+train_set_name: train
+use_denoise: false
+use_energy_embed: false
+use_gt_dur: false
+use_gt_f0: false
+use_pitch_embed: true
+use_pos_embed: true
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+use_uv: true
+use_var_enc: false
+val_check_interval: 2000
+valid_num: 348
+valid_set_name: valid
+vocoder: pwg
+vocoder_ckpt: ''
+warmup_updates: 2000
+weight_decay: 0
+win_size: 512
+work_dir: checkpoints/0102_xiaoma_pe
checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53942abd8cb908b6d161e1ad7ff3d7d0dd6b204d5bf050613c9d00c56b185ceb
+size 13047222
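The checkpoint binaries themselves live in Git LFS; what the repository actually versions is a small pointer file in the three-line `key value` format shown above (spec version, SHA-256 object id, byte size). A minimal sketch of parsing such a pointer, assuming only that standard layout (the function name is ours, not part of the repo):

```python
# Minimal sketch: parse a Git LFS pointer file into a dict.
# Assumes the standard one "key value" pair per line, as shown above.
def read_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            key, _, value = line.strip().partition(' ')
            if key:
                fields[key] = value
    return fields

# e.g. {'version': 'https://git-lfs.github.com/spec/v1',
#       'oid': 'sha256:53942abd...', 'size': '13047222'}
```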
checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml
ADDED
@@ -0,0 +1,241 @@
+accumulate_grad_batches: 1
+adam_b1: 0.8
+adam_b2: 0.99
+amp: false
+audio_num_mel_bins: 80
+audio_sample_rate: 24000
+aux_context_window: 0
+#base_config:
+#- egs/egs_bases/singing/pwg.yaml
+#- egs/egs_bases/tts/vocoder/hifigan.yaml
+binarization_args:
+  reset_phone_dict: true
+  reset_word_dict: true
+  shuffle: false
+  trim_eos_bos: false
+  trim_sil: false
+  with_align: false
+  with_f0: true
+  with_f0cwt: false
+  with_linear: false
+  with_spk_embed: false
+  with_spk_id: true
+  with_txt: false
+  with_wav: true
+  with_word: false
+binarizer_cls: data_gen.tts.singing.binarize.SingingBinarizer
+binary_data_dir: data/binary/big_popcs_24k_hop128
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+clip_grad_value: 0
+datasets: []
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+dict_dir: ''
+disc_start_steps: 40000
+discriminator_grad_norm: 1
+discriminator_optimizer_params:
+  eps: 1.0e-06
+  lr: 0.0002
+  weight_decay: 0.0
+discriminator_params:
+  bias: true
+  conv_channels: 64
+  in_channels: 1
+  kernel_size: 3
+  layers: 10
+  nonlinear_activation: LeakyReLU
+  nonlinear_activation_params:
+    negative_slope: 0.2
+  out_channels: 1
+  use_weight_norm: true
+discriminator_scheduler_params:
+  gamma: 0.999
+  step_size: 600
+dropout: 0.1
+ds_workers: 1
+enc_ffn_kernel_size: 9
+enc_layers: 4
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 512
+fmax: 12000
+fmin: 30
+frames_multiple: 1
+gen_dir_name: ''
+generator_grad_norm: 10
+generator_optimizer_params:
+  eps: 1.0e-06
+  lr: 0.0002
+  weight_decay: 0.0
+generator_params:
+  aux_channels: 80
+  dropout: 0.0
+  gate_channels: 128
+  in_channels: 1
+  kernel_size: 3
+  layers: 30
+  out_channels: 1
+  residual_channels: 64
+  skip_channels: 64
+  stacks: 3
+  upsample_net: ConvInUpsampleNetwork
+  upsample_params:
+    upsample_scales:
+    - 2
+    - 4
+    - 4
+    - 4
+  use_nsf: false
+  use_pitch_embed: true
+  use_weight_norm: true
+generator_scheduler_params:
+  gamma: 0.999
+  step_size: 600
+griffin_lim_iters: 60
+hidden_size: 256
+hop_size: 128
+infer: false
+lambda_adv: 1.0
+lambda_cdisc: 4.0
+lambda_energy: 0.0
+lambda_f0: 0.0
+lambda_mel: 5.0
+lambda_mel_adv: 1.0
+lambda_ph_dur: 0.0
+lambda_sent_dur: 0.0
+lambda_uv: 0.0
+lambda_word_dur: 0.0
+load_ckpt: ''
+loud_norm: false
+lr: 2.0
+max_epochs: 1000
+max_frames: 2400
+max_input_tokens: 1550
+max_samples: 8192
+max_sentences: 20
+max_tokens: 24000
+max_updates: 3000000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6
+min_frames: 0
+min_level_db: -120
+num_ckpt_keep: 3
+num_heads: 2
+num_mels: 80
+num_sanity_val_steps: 5
+num_spk: 100
+num_test_samples: 0
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pitch_extractor: parselmouth
+pitch_type: frame
+pre_align_args:
+  allow_no_txt: false
+  denoise: false
+  sox_resample: true
+  sox_to_wav: false
+  trim_sil: false
+  txt_processor: zh
+  use_tone: false
+pre_align_cls: data_gen.tts.singing.pre_align.SingingPreAlign
+predictor_grad: 0.0
+print_nan_grads: false
+processed_data_dir: ''
+profile_infer: false
+raw_data_dir: ''
+ref_level_db: 20
+rename_tmux: true
+rerun_gen: true
+resblock: '1'
+resblock_dilation_sizes:
+- - 1
+  - 3
+  - 5
+- - 1
+  - 3
+  - 5
+- - 1
+  - 3
+  - 5
+resblock_kernel_sizes:
+- 3
+- 7
+- 11
+resume_from_checkpoint: 0
+save_best: true
+save_codes: []
+save_f0: true
+save_gt: true
+scheduler: rsqrt
+seed: 1234
+sort_by_len: true
+stft_loss_params:
+  fft_sizes:
+  - 1024
+  - 2048
+  - 512
+  hop_sizes:
+  - 120
+  - 240
+  - 50
+  win_lengths:
+  - 600
+  - 1200
+  - 240
+  window: hann_window
+task_cls: tasks.vocoder.hifigan.HifiGanTask
+tb_log_interval: 100
+test_ids: []
+test_input_dir: ''
+test_num: 50
+test_prefixes: []
+test_set_name: test
+train_set_name: train
+train_sets: ''
+upsample_initial_channel: 512
+upsample_kernel_sizes:
+- 16
+- 16
+- 4
+- 4
+upsample_rates:
+- 8
+- 4
+- 2
+- 2
+use_cdisc: false
+use_cond_disc: false
+use_fm_loss: false
+use_gt_dur: true
+use_gt_f0: true
+use_mel_loss: true
+use_ms_stft: false
+use_pitch_embed: true
+use_ref_enc: true
+use_spec_disc: false
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+val_check_interval: 2000
+valid_infer_interval: 10000
+valid_monitor_key: val_loss
+valid_monitor_mode: min
+valid_set_name: valid
+vocoder: pwg
+vocoder_ckpt: ''
+vocoder_denoise_c: 0.0
+warmup_updates: 8000
+weight_decay: 0
+win_length: null
+win_size: 512
+window: hann
+word_size: 3000
+work_dir: checkpoints/0109_hifigan_bigpopcs_hop128
checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cb68f3ce0c46ba0a8b6d49718f1fffdf5bd7bcab769a986fd2fd129835cc1d1
+size 55827436
checkpoints/0228_opencpop_ds100_rel/config.yaml
ADDED
@@ -0,0 +1,342 @@
+K_step: 100
+accumulate_grad_batches: 1
+audio_num_mel_bins: 80
+audio_sample_rate: 24000
+base_config:
+- usr/configs/popcs_ds_beta6.yaml
+- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
+binarization_args:
+  shuffle: false
+  with_align: true
+  with_f0: true
+  with_f0cwt: true
+  with_spk_embed: false
+  with_txt: true
+  with_wav: true
+binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
+binary_data_dir: data/binary/opencpop-midi-dp
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+content_cond_steps: []
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+datasets:
+- popcs
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+decay_steps: 50000
+decoder_type: fft
+dict_dir: ''
+diff_decoder_type: wavenet
+diff_loss_type: l1
+dilation_cycle_length: 4
+dropout: 0.1
+ds_workers: 4
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 5
+enc_ffn_kernel_size: 9
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 512
+fmax: 12000
+fmin: 30
+fs2_ckpt: ''
+gaussian_start: true
+gen_dir_name: ''
+gen_tgt_spk_id: -1
+hidden_size: 256
+hop_size: 128
+infer: false
+keep_bins: 80
+lambda_commit: 0.25
+lambda_energy: 0.0
+lambda_f0: 0.0
+lambda_ph_dur: 1.0
+lambda_sent_dur: 1.0
+lambda_uv: 0.0
+lambda_word_dur: 1.0
+load_ckpt: ''
+log_interval: 100
+loud_norm: false
+lr: 0.001
+max_beta: 0.06
+max_epochs: 1000
+max_eval_sentences: 1
+max_eval_tokens: 60000
+max_frames: 8000
+max_input_tokens: 1550
+max_sentences: 48
+max_tokens: 40000
+max_updates: 160000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6.0
+min_level_db: -120
+norm_type: gn
+num_ckpt_keep: 3
+num_heads: 2
+num_sanity_val_steps: 1
+num_spk: 1
+num_test_samples: 0
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pe_ckpt: checkpoints/0102_xiaoma_pe
+pe_enable: true
+pitch_ar: false
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor: parselmouth
+pitch_loss: l1
+pitch_norm: log
+pitch_type: frame
+pre_align_args:
+  allow_no_txt: false
+  denoise: false
+  forced_align: mfa
+  txt_processor: zh_g2pM
+  use_sox: true
+  use_tone: false
+pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
+predictor_dropout: 0.5
+predictor_grad: 0.1
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 5
+prenet_dropout: 0.5
+prenet_hidden_size: 256
+pretrain_fs_ckpt: ''
+processed_data_dir: data/processed/popcs
+profile_infer: false
+raw_data_dir: data/raw/popcs
+ref_norm_layer: bn
+rel_pos: true
+reset_phone_dict: true
+residual_channels: 256
+residual_layers: 20
+save_best: false
+save_ckpt: true
+save_codes:
+- configs
+- modules
+- tasks
+- utils
+- usr
+save_f0: true
+save_gt: false
+schedule_type: linear
+seed: 1234
+sort_by_len: true
+spec_max:
+- -0.79453
+- -0.81116
+- -0.61631
+- -0.30679
+- -0.13863
+- -0.050652
+- -0.11563
+- -0.10679
+- -0.091068
+- -0.062174
+- -0.075302
+- -0.072217
+- -0.063815
+- -0.073299
+- 0.007361
+- -0.072508
+- -0.050234
+- -0.16534
+- -0.26928
+- -0.20782
+- -0.20823
+- -0.11702
+- -0.070128
+- -0.065868
+- -0.012675
+- 0.0015121
+- -0.089902
+- -0.21392
+- -0.23789
+- -0.28922
+- -0.30405
+- -0.23029
+- -0.22088
+- -0.21542
+- -0.29367
+- -0.30137
+- -0.38281
+- -0.4359
+- -0.28681
+- -0.46855
+- -0.57485
+- -0.47022
+- -0.54266
+- -0.44848
+- -0.6412
+- -0.687
+- -0.6486
+- -0.76436
+- -0.49971
+- -0.71068
+- -0.69724
+- -0.61487
+- -0.55843
+- -0.69773
+- -0.57502
+- -0.70919
+- -0.82431
+- -0.84213
+- -0.90431
+- -0.8284
+- -0.77945
+- -0.82758
+- -0.87699
+- -1.0532
+- -1.0766
+- -1.1198
+- -1.0185
+- -0.98983
+- -1.0001
+- -1.0756
+- -1.0024
+- -1.0304
+- -1.0579
+- -1.0188
+- -1.05
+- -1.0842
+- -1.0923
+- -1.1223
+- -1.2381
+- -1.6467
+spec_min:
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+spk_cond_steps: []
+stop_token_weight: 5.0
+task_cls: usr.diffsinger_task.DiffSingerMIDITask
+test_ids: []
+test_input_dir: ''
+test_num: 0
+test_prefixes:
+- "popcs-\u8BF4\u6563\u5C31\u6563"
+- "popcs-\u9690\u5F62\u7684\u7FC5\u8180"
+test_set_name: test
+timesteps: 100
+train_set_name: train
+use_denoise: false
+use_energy_embed: false
+use_gt_dur: false
+use_gt_f0: false
+use_midi: true
+use_nsf: true
+use_pitch_embed: false
+use_pos_embed: true
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+use_uv: true
+use_var_enc: false
+val_check_interval: 2000
+valid_num: 0
+valid_set_name: valid
+vocoder: vocoders.hifigan.HifiGAN
+vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
+warmup_updates: 2000
+wav2spec_eps: 1e-6
+weight_decay: 0
+win_size: 512
+work_dir: checkpoints/0228_opencpop_ds100_rel
checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a8261f7415bb39eb80a19d4c27c0ea084f63af2fdf6b82e63fcbd9cd82fc90c
+size 170226367
checkpoints/0831_opencpop_ds1000/config.yaml
ADDED
@@ -0,0 +1,346 @@
+K_step: 1000
+accumulate_grad_batches: 1
+audio_num_mel_bins: 80
+audio_sample_rate: 24000
+base_config:
+- usr/configs/popcs_ds_beta6.yaml
+- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
+binarization_args:
+  shuffle: false
+  with_align: true
+  with_f0: true
+  with_f0cwt: true
+  with_spk_embed: false
+  with_txt: true
+  with_wav: true
+binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
+binary_data_dir: data/binary/opencpop-midi-dp
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+content_cond_steps: []
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+datasets:
+- opencpop
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+decay_steps: 50000
+decoder_type: fft
+dict_dir: ''
+diff_decoder_type: wavenet
+diff_loss_type: l1
+dilation_cycle_length: 4
+dropout: 0.1
+ds_workers: 4
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 5
+enc_ffn_kernel_size: 9
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 512
+fmax: 12000
+fmin: 30
+fs2_ckpt: ''
+gaussian_start: true
+gen_dir_name: ''
+gen_tgt_spk_id: -1
+hidden_size: 256
+hop_size: 128
+infer: false
+keep_bins: 80
+lambda_commit: 0.25
+lambda_energy: 0.0
+lambda_f0: 0.0
+lambda_ph_dur: 1.0
+lambda_sent_dur: 1.0
+lambda_uv: 0.0
+lambda_word_dur: 1.0
+load_ckpt: ''
+log_interval: 100
+loud_norm: false
+lr: 0.001
+max_beta: 0.02
+max_epochs: 1000
+max_eval_sentences: 1
+max_eval_tokens: 60000
+max_frames: 8000
+max_input_tokens: 1550
+max_sentences: 48
+max_tokens: 36000
+max_updates: 320000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6.0
+min_level_db: -120
+norm_type: gn
+num_ckpt_keep: 3
+num_heads: 2
+num_sanity_val_steps: 1
+num_spk: 1
+num_test_samples: 0
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pe_ckpt: checkpoints/0102_xiaoma_pe
+pe_enable: true
+pitch_ar: false
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor: parselmouth
+pitch_loss: l1
+pitch_norm: log
+pitch_type: frame
+pre_align_args:
+  allow_no_txt: false
+  denoise: false
+  forced_align: mfa
+  txt_processor: zh_g2pM
+  use_sox: true
+  use_tone: false
+pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
+predictor_dropout: 0.5
+predictor_grad: 0.1
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 5
+prenet_dropout: 0.5
+prenet_hidden_size: 256
+pretrain_fs_ckpt: ''
+processed_data_dir: xxx
+profile_infer: false
+raw_data_dir: data/raw/opencpop/segments
+ref_norm_layer: bn
+rel_pos: true
+reset_phone_dict: true
+residual_channels: 256
+residual_layers: 20
+save_best: false
+save_ckpt: true
+save_codes:
+- configs
+- modules
+- tasks
+- utils
+- usr
+save_f0: true
+save_gt: false
+schedule_type: linear
+seed: 1234
+sort_by_len: true
+spec_max:
+- -0.79453
+- -0.81116
+- -0.61631
+- -0.30679
+- -0.13863
+- -0.050652
+- -0.11563
+- -0.10679
+- -0.091068
+- -0.062174
+- -0.075302
+- -0.072217
+- -0.063815
+- -0.073299
+- 0.007361
+- -0.072508
+- -0.050234
+- -0.16534
+- -0.26928
+- -0.20782
+- -0.20823
+- -0.11702
+- -0.070128
+- -0.065868
+- -0.012675
+- 0.0015121
+- -0.089902
+- -0.21392
+- -0.23789
+- -0.28922
+- -0.30405
+- -0.23029
+- -0.22088
+- -0.21542
+- -0.29367
+- -0.30137
+- -0.38281
+- -0.4359
+- -0.28681
+- -0.46855
+- -0.57485
+- -0.47022
+- -0.54266
+- -0.44848
+- -0.6412
+- -0.687
+- -0.6486
+- -0.76436
+- -0.49971
+- -0.71068
+- -0.69724
+- -0.61487
+- -0.55843
+- -0.69773
+- -0.57502
+- -0.70919
+- -0.82431
+- -0.84213
+- -0.90431
+- -0.8284
+- -0.77945
+- -0.82758
+- -0.87699
+- -1.0532
+- -1.0766
+- -1.1198
+- -1.0185
+- -0.98983
+- -1.0001
+- -1.0756
+- -1.0024
+- -1.0304
+- -1.0579
+- -1.0188
+- -1.05
+- -1.0842
+- -1.0923
+- -1.1223
+- -1.2381
+- -1.6467
+spec_min:
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+- -6.0
+spk_cond_steps: []
+stop_token_weight: 5.0
+task_cls: usr.diffsinger_task.DiffSingerMIDITask
+test_ids: []
+test_input_dir: ''
+test_num: 0
+test_prefixes:
+- '2044'
+- '2086'
+- '2092'
+- '2093'
+- '2100'
+test_set_name: test
+timesteps: 1000
+train_set_name: train
+use_denoise: false
+use_energy_embed: false
+use_gt_dur: false
+use_gt_f0: false
+use_midi: true
+use_nsf: true
+use_pitch_embed: false
+use_pos_embed: true
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+use_uv: true
+use_var_enc: false
+val_check_interval: 2000
+valid_num: 0
+valid_set_name: valid
+vocoder: vocoders.hifigan.HifiGAN
+vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
+warmup_updates: 2000
+wav2spec_eps: 1e-6
+weight_decay: 0
+win_size: 512
+work_dir: checkpoints/0831_opencpop_ds1000
+pndm_speedup: 10
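Comparing the two DiffSinger configs above: `0228_opencpop_ds100_rel` is the 100-step variant (`K_step: 100`, `timesteps: 100`, `max_beta: 0.06`, 160k updates), while `0831_opencpop_ds1000` is the 1000-step variant (`K_step: 1000`, `timesteps: 1000`, `max_beta: 0.02`, 320k updates) and additionally sets `pndm_speedup: 10` for accelerated sampling. Both point at the same HiFi-GAN vocoder (`0109_hifigan_bigpopcs_hop128`) and pitch-extractor checkpoint (`0102_xiaoma_pe`).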
checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:954a31208ee6afb6240d09454bb204c4fbc63cf70e2586bed0ab29b1dc964c9e
+size 170269591
checkpoints/clean.py
ADDED
@@ -0,0 +1,12 @@
+import sys
+import torch
+
+if __name__ == '__main__':
+    ckpt_path = sys.argv[1]
+    checkpoint = torch.load(ckpt_path, map_location='cpu')
+    print(checkpoint['state_dict'].keys())
+    if 'model' in checkpoint['state_dict']:
+        checkpoint = {'state_dict': {'model': checkpoint['state_dict']['model']}}
+    else:
+        checkpoint = {'state_dict': {'model_gen': checkpoint['state_dict']['model_gen']}}
+    torch.save(checkpoint, ckpt_path, _use_new_zipfile_serialization=False)
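As written, `checkpoints/clean.py` takes a checkpoint path as its single argument and rewrites the file in place, keeping only the `model` weights (or `model_gen` for GAN vocoders) and discarding everything else in the checkpoint, e.g. `python checkpoints/clean.py checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt`. Note that it overwrites the original file, so keep a copy if the full training state is still needed.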
checkpoints/cleaner.py
ADDED
@@ -0,0 +1,8 @@
+import sys
+import torch
+
+if __name__ == '__main__':
+    ckpt_path = sys.argv[1]
+    checkpoint = torch.load(ckpt_path, map_location='cpu')
+    checkpoint = {'state_dict': checkpoint['state_dict']}
+    torch.save(checkpoint, ckpt_path, _use_new_zipfile_serialization=False)
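`cleaner.py` is a milder variant of `clean.py`: it keeps the entire `state_dict` (all sub-models) and only drops the other top-level keys, such as optimizer and scheduler state. Both scripts save with `_use_new_zipfile_serialization=False`, i.e. in PyTorch's legacy (pre-1.6, non-zipfile) format, so the shrunken checkpoints stay loadable by older PyTorch versions.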
configs/config_base.yaml
ADDED
@@ -0,0 +1,42 @@
+# task
+binary_data_dir: ''
+work_dir: '' # experiment directory.
+infer: false # infer
+seed: 1234
+debug: false
+save_codes:
+- configs
+- modules
+- tasks
+- utils
+- usr
+
+#############
+# dataset
+#############
+ds_workers: 1
+test_num: 100
+valid_num: 100
+endless_ds: false
+sort_by_len: true
+
+#########
+# train and eval
+#########
+load_ckpt: ''
+save_ckpt: true
+save_best: false
+num_ckpt_keep: 3
+clip_grad_norm: 0
+accumulate_grad_batches: 1
+log_interval: 100
+num_sanity_val_steps: 5 # steps of validation at the beginning
+check_val_every_n_epoch: 10
+val_check_interval: 2000
+max_epochs: 1000
+max_updates: 160000
+max_tokens: 31250
+max_sentences: 100000
+max_eval_tokens: -1
+max_eval_sentences: -1
+test_input_dir: ''
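Throughout these configs, a `base_config` key points at one or more parent YAML files whose values the child file then overrides (for example, `configs/tts/base.yaml` builds on `configs/config_base.yaml`, and the saved checkpoint configs above chain several files). The project's actual loader lives elsewhere in the repo; the sketch below is only an illustration of the pattern, with `load_config` as a hypothetical name and a shallow merge for brevity:

```python
# Illustrative sketch (not the repo's actual loader): resolve a config by
# recursively merging the files listed under `base_config`, with the child
# file's values taking precedence. Base paths are assumed relative to the
# repo root; a real loader would likely deep-merge nested dicts.
import yaml

def load_config(path: str) -> dict:
    with open(path, encoding='utf-8') as f:
        cfg = yaml.safe_load(f) or {}
    bases = cfg.pop('base_config', [])
    if isinstance(bases, str):
        bases = [bases]
    merged = {}
    for base in bases:
        merged.update(load_config(base))  # later bases override earlier ones
    merged.update(cfg)                    # the child overrides all bases
    return merged
```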
configs/singing/base.yaml
ADDED
@@ -0,0 +1,42 @@
+base_config:
+- configs/tts/base.yaml
+- configs/tts/base_zh.yaml
+
+
+datasets: []
+test_prefixes: []
+test_num: 0
+valid_num: 0
+
+pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
+binarizer_cls: data_gen.singing.binarize.SingingBinarizer
+pre_align_args:
+  use_tone: false # for ZH
+  forced_align: mfa
+  use_sox: true
+hop_size: 128 # Hop size.
+fft_size: 512 # FFT size.
+win_size: 512 # Window size.
+max_frames: 8000
+fmin: 50 # Minimum freq in mel basis calculation.
+fmax: 11025 # Maximum frequency in mel basis calculation.
+pitch_type: frame
+
+hidden_size: 256
+mel_loss: "ssim:0.5|l1:0.5"
+lambda_f0: 0.0
+lambda_uv: 0.0
+lambda_energy: 0.0
+lambda_ph_dur: 0.0
+lambda_sent_dur: 0.0
+lambda_word_dur: 0.0
+predictor_grad: 0.0
+use_spk_embed: true
+use_spk_id: false
+
+max_tokens: 20000
+max_updates: 400000
+num_spk: 100
+save_f0: true
+use_gt_dur: true
+use_gt_f0: true
configs/singing/fs2.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+- configs/tts/fs2.yaml
+- configs/singing/base.yaml
configs/tts/base.yaml
ADDED
@@ -0,0 +1,95 @@
+# task
+base_config: configs/config_base.yaml
+task_cls: ''
+#############
+# dataset
+#############
+raw_data_dir: ''
+processed_data_dir: ''
+binary_data_dir: ''
+dict_dir: ''
+pre_align_cls: ''
+binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+pre_align_args:
+  use_tone: true # for ZH
+  forced_align: mfa
+  use_sox: false
+  txt_processor: en
+  allow_no_txt: false
+  denoise: false
+binarization_args:
+  shuffle: false
+  with_txt: true
+  with_wav: false
+  with_align: true
+  with_spk_embed: true
+  with_f0: true
+  with_f0cwt: true
+
+loud_norm: false
+endless_ds: true
+reset_phone_dict: true
+
+test_num: 100
+valid_num: 100
+max_frames: 1550
+max_input_tokens: 1550
+audio_num_mel_bins: 80
+audio_sample_rate: 22050
+hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
+win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
+fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
+fmax: 7600 # To be increased/reduced depending on data.
+fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
+min_level_db: -100
+num_spk: 1
+mel_vmin: -6
+mel_vmax: 1.5
+ds_workers: 4
+
+#########
+# model
+#########
+dropout: 0.1
+enc_layers: 4
+dec_layers: 4
+hidden_size: 384
+num_heads: 2
+prenet_dropout: 0.5
+prenet_hidden_size: 256
+stop_token_weight: 5.0
+enc_ffn_kernel_size: 9
+dec_ffn_kernel_size: 9
+ffn_act: gelu
+ffn_padding: 'SAME'
+
+
+###########
+# optimization
+###########
+lr: 2.0
+warmup_updates: 8000
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+weight_decay: 0
+clip_grad_norm: 1
+
+
+###########
+# train and eval
+###########
+max_tokens: 30000
+max_sentences: 100000
+max_eval_sentences: 1
+max_eval_tokens: 60000
+train_set_name: 'train'
+valid_set_name: 'valid'
+test_set_name: 'test'
+vocoder: pwg
+vocoder_ckpt: ''
+profile_infer: false
+out_wav_norm: false
+save_gt: false
+save_f0: false
+gen_dir_name: ''
+use_denoise: false
configs/tts/base_zh.yaml
ADDED
@@ -0,0 +1,3 @@
+pre_align_args:
+  txt_processor: zh_g2pM
+binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer
configs/tts/fs2.yaml
ADDED
@@ -0,0 +1,80 @@
+base_config: configs/tts/base.yaml
+task_cls: tasks.tts.fs2.FastSpeech2Task
+
+# model
+hidden_size: 256
+dropout: 0.1
+encoder_type: fft # fft|tacotron|tacotron2|conformer
+encoder_K: 8 # for tacotron encoder
+decoder_type: fft # fft|rnn|conv|conformer
+use_pos_embed: true
+
+# duration
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 2
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+predictor_dropout: 0.5
+
+# pitch and energy
+use_pitch_embed: true
+pitch_type: ph # frame|ph|cwt
+use_uv: true
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_add_f0_loss: false
+cwt_std_scale: 0.8
+
+pitch_ar: false
+#pitch_embed_type: 0q
+pitch_loss: 'l1' # l1|l2|ssim
+pitch_norm: log
+use_energy_embed: false
+
+# reference encoder and speaker embedding
+use_spk_id: false
+use_split_spk_id: false
+use_spk_embed: false
+use_var_enc: false
+lambda_commit: 0.25
+ref_norm_layer: bn
+pitch_enc_hidden_stride_kernel:
+- 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
+- 0,2,5
+- 0,2,5
+dur_enc_hidden_stride_kernel:
+- 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
+- 0,2,3
+- 0,1,3
+
+
+# mel
+mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
+
+# loss lambda
+lambda_f0: 1.0
+lambda_uv: 1.0
+lambda_energy: 0.1
+lambda_ph_dur: 1.0
+lambda_sent_dur: 1.0
+lambda_word_dur: 1.0
+predictor_grad: 0.1
+
+# train and eval
+pretrain_fs_ckpt: ''
+warmup_updates: 2000
+max_tokens: 32000
+max_sentences: 100000
+max_eval_sentences: 1
+max_updates: 120000
+num_valid_plots: 5
+num_test_samples: 0
+test_ids: []
+use_gt_dur: false
+use_gt_f0: false
+
+# exp
+dur_loss: mse # huber|mol
+norm_type: gn
configs/tts/hifigan.yaml
ADDED
@@ -0,0 +1,21 @@
+base_config: configs/tts/pwg.yaml
+task_cls: tasks.vocoder.hifigan.HifiGanTask
+resblock: "1"
+adam_b1: 0.8
+adam_b2: 0.99
+upsample_rates: [ 8,8,2,2 ]
+upsample_kernel_sizes: [ 16,16,4,4 ]
+upsample_initial_channel: 128
+resblock_kernel_sizes: [ 3,7,11 ]
+resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ]
+
+lambda_mel: 45.0
+
+max_samples: 8192
+max_sentences: 16
+
+generator_params:
+  lr: 0.0002 # Generator's learning rate.
+  aux_context_window: 0 # Context window size for auxiliary feature.
+discriminator_optimizer_params:
+  lr: 0.0002 # Discriminator's learning rate.
configs/tts/lj/base_mel2wav.yaml
ADDED
@@ -0,0 +1,3 @@
+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/ljspeech'
+binary_data_dir: 'data/binary/ljspeech_wav'
configs/tts/lj/base_text2mel.yaml
ADDED
@@ -0,0 +1,13 @@
+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/ljspeech'
+binary_data_dir: 'data/binary/ljspeech'
+pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
+
+pitch_type: cwt
+mel_loss: l1
+num_test_samples: 20
+test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+            316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+use_energy_embed: false
+test_num: 523
+valid_num: 348
configs/tts/lj/fs2.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+- configs/tts/fs2.yaml
+- configs/tts/lj/base_text2mel.yaml
configs/tts/lj/hifigan.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+- configs/tts/hifigan.yaml
+- configs/tts/lj/base_mel2wav.yaml
configs/tts/lj/pwg.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+- configs/tts/pwg.yaml
+- configs/tts/lj/base_mel2wav.yaml
configs/tts/pwg.yaml
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config: configs/tts/base.yaml
|
2 |
+
task_cls: tasks.vocoder.pwg.PwgTask
|
3 |
+
|
4 |
+
binarization_args:
|
5 |
+
with_wav: true
|
6 |
+
with_spk_embed: false
|
7 |
+
with_align: false
|
8 |
+
test_input_dir: ''
|
9 |
+
|
10 |
+
###########
|
11 |
+
# train and eval
|
12 |
+
###########
|
13 |
+
max_samples: 25600
|
14 |
+
max_sentences: 5
|
15 |
+
max_eval_sentences: 1
|
16 |
+
max_updates: 1000000
|
17 |
+
val_check_interval: 2000
|
18 |
+
|
19 |
+
|
20 |
+
###########################################################
|
21 |
+
# FEATURE EXTRACTION SETTING #
|
22 |
+
###########################################################
|
23 |
+
sampling_rate: 22050 # Sampling rate.
|
24 |
+
fft_size: 1024 # FFT size.
|
25 |
+
hop_size: 256 # Hop size.
|
26 |
+
win_length: null # Window length.
|
27 |
+
# If set to null, it will be the same as fft_size.
|
28 |
+
window: "hann" # Window function.
|
29 |
+
num_mels: 80 # Number of mel basis.
|
30 |
+
fmin: 80 # Minimum freq in mel basis calculation.
|
31 |
+
fmax: 7600 # Maximum frequency in mel basis calculation.
|
32 |
+
format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of dilated convolution.
    layers: 30            # Number of residual block layers.
    stacks: 3             # Number of stacks i.e., dilation cycles.
    residual_channels: 64 # Number of channels in residual conv.
    gate_channels: 128    # Number of channels in gated conv.
    skip_channels: 64     # Number of channels in skip conv.
    aux_channels: 80      # Number of channels for auxiliary feature conv.
                          # Must be the same as num_mels.
    aux_context_window: 2 # Context window size for auxiliary feature.
                          # If set to 2, previous 2 and future 2 frames will be considered.
    dropout: 0.0          # Dropout rate. 0.0 means no dropout applied.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
    upsample_params:                      # Upsampling network parameters.
        upsample_scales: [4, 4, 4, 4]     # Upsampling scales. Product of these must be the same as hop size.
use_pitch_embed: false

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of conv layers.
    layers: 10            # Number of conv layers.
    conv_channels: 64     # Number of channels in conv layers.
    bias: true            # Whether to use bias parameter in conv.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
    nonlinear_activation_params:      # Nonlinear function parameters.
        negative_slope: 0.2           # Alpha in LeakyReLU.

###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT sizes for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop sizes for STFT-based loss.
    win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss.
    window: "hann_window"         # Window function for STFT-based loss.
use_mel_loss: false

###########################################################
#               ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_adv: 4.0 # Loss balancing coefficient.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_params:
    lr: 0.0001          # Generator's learning rate.
    eps: 1.0e-6         # Generator's epsilon.
    weight_decay: 0.0   # Generator's weight decay coefficient.
generator_scheduler_params:
    step_size: 200000   # Generator's scheduler step size.
    gamma: 0.5          # Generator's scheduler gamma.
                        # At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10 # Generator's gradient norm.
discriminator_optimizer_params:
    lr: 0.00005         # Discriminator's learning rate.
    eps: 1.0e-6         # Discriminator's epsilon.
    weight_decay: 0.0   # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    step_size: 200000   # Discriminator's scheduler step size.
    gamma: 0.5          # Discriminator's scheduler gamma.
                        # At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.
disc_start_steps: 40000    # Number of steps to start to train discriminator.
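
A quick sanity check of the constraint noted above: the product of upsample_scales must equal the vocoder hop size. The sketch below is not part of this commit; it loads this YAML with PyYAML and verifies the constraint. The fallback hop_size of 256 is an assumption, since in this repo the real value comes from the base config that pwg.yaml inherits through the hparams system.

import math
import yaml

# Load the vocoder config directly; 4 * 4 * 4 * 4 = 256 samples generated per mel frame.
with open("configs/tts/pwg.yaml") as f:
    cfg = yaml.safe_load(f)

scales = cfg["generator_params"]["upsample_params"]["upsample_scales"]
hop_size = cfg.get("hop_size", 256)  # assumed default; normally supplied by the parent config
assert math.prod(scales) == hop_size, (scales, hop_size)
print(f"upsample_scales {scales}: product {math.prod(scales)} == hop_size {hop_size}")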
data/processed/ljspeech/dict.txt
ADDED
@@ -0,0 +1,77 @@
! !
, ,
. .
; ;
<BOS> <BOS>
<EOS> <EOS>
? ?
AA0 AA0
AA1 AA1
AA2 AA2
AE0 AE0
AE1 AE1
AE2 AE2
AH0 AH0
AH1 AH1
AH2 AH2
AO0 AO0
AO1 AO1
AO2 AO2
AW0 AW0
AW1 AW1
AW2 AW2
AY0 AY0
AY1 AY1
AY2 AY2
B B
CH CH
D D
DH DH
EH0 EH0
EH1 EH1
EH2 EH2
ER0 ER0
ER1 ER1
ER2 ER2
EY0 EY0
EY1 EY1
EY2 EY2
F F
G G
HH HH
IH0 IH0
IH1 IH1
IH2 IH2
IY0 IY0
IY1 IY1
IY2 IY2
JH JH
K K
L L
M M
N N
NG NG
OW0 OW0
OW1 OW1
OW2 OW2
OY0 OY0
OY1 OY1
OY2 OY2
P P
R R
S S
SH SH
T T
TH TH
UH0 UH0
UH1 UH1
UH2 UH2
UW0 UW0
UW1 UW1
UW2 UW2
V V
W W
Y Y
Z Z
ZH ZH
| |
data/processed/ljspeech/metadata_phone.csv
ADDED
The diff for this file is too large to render.
data/processed/ljspeech/mfa_dict.txt
ADDED
The diff for this file is too large to render.
data/processed/ljspeech/phone_set.json
ADDED
@@ -0,0 +1 @@
["!", ",", ".", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
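
A hedged aside, not from the commit: a phone list like phone_set.json is what build_phone_encoder in data_gen/tts/data_gen_utils.py (added below) wraps in utils.text_encoder.TokenTextEncoder. The stand-alone sketch here only illustrates the basic phone-to-id idea; the real encoder may reserve extra ids (padding, OOV), so treat the mapping as illustrative.

import json

phones = json.load(open("data/processed/ljspeech/phone_set.json"))
phone2id = {p: i for i, p in enumerate(phones)}  # naive id assignment, for illustration only

ph_seq = "<BOS> HH AH0 L OW1 <EOS>".split()      # "hello" in ARPAbet, hand-written
print([phone2id[p] for p in ph_seq])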
data_gen/singing/binarize.py
ADDED
@@ -0,0 +1,398 @@
import os
import random
from copy import deepcopy
import pandas as pd
import logging
from tqdm import tqdm
import json
import glob
import re
from resemblyzer import VoiceEncoder
import traceback
import numpy as np
import pretty_midi
import librosa
from scipy.interpolate import interp1d
import torch
from textgrid import TextGrid

from utils.hparams import hparams
from data_gen.tts.data_gen_utils import build_phone_encoder, get_pitch
from utils.pitch_utils import f0_to_coarse
from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
from data_gen.tts.binarizer_zh import ZhBinarizer
from data_gen.tts.txt_processors.zh_g2pM import ALL_YUNMU
from vocoders.base_vocoder import VOCODERS


class SingingBinarizer(BaseBinarizer):
    def __init__(self, processed_data_dir=None):
        if processed_data_dir is None:
            processed_data_dir = hparams['processed_data_dir']
        self.processed_data_dirs = processed_data_dir.split(",")
        self.binarization_args = hparams['binarization_args']
        self.pre_align_args = hparams['pre_align_args']
        self.item2txt = {}
        self.item2ph = {}
        self.item2wavfn = {}
        self.item2f0fn = {}
        self.item2tgfn = {}
        self.item2spk = {}

    def split_train_test_set(self, item_names):
        item_names = deepcopy(item_names)
        test_item_names = [x for x in item_names if any([ts in x for ts in hparams['test_prefixes']])]
        train_item_names = [x for x in item_names if x not in set(test_item_names)]
        logging.info("train {}".format(len(train_item_names)))
        logging.info("test {}".format(len(test_item_names)))
        return train_item_names, test_item_names

    def load_meta_data(self):
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            wav_suffix = '_wf0.wav'
            txt_suffix = '.txt'
            ph_suffix = '_ph.txt'
            tg_suffix = '.TextGrid'
            all_wav_pieces = glob.glob(f'{processed_data_dir}/*/*{wav_suffix}')

            for piece_path in all_wav_pieces:
                item_name = raw_item_name = piece_path[len(processed_data_dir)+1:].replace('/', '-')[:-len(wav_suffix)]
                if len(self.processed_data_dirs) > 1:
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2txt[item_name] = open(f'{piece_path.replace(wav_suffix, txt_suffix)}').readline()
                self.item2ph[item_name] = open(f'{piece_path.replace(wav_suffix, ph_suffix)}').readline()
                self.item2wavfn[item_name] = piece_path

                self.item2spk[item_name] = re.split('-|#', piece_path.split('/')[-2])[0]
                if len(self.processed_data_dirs) > 1:
                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
                self.item2tgfn[item_name] = piece_path.replace(wav_suffix, tg_suffix)
        print('spkers: ', set(self.item2spk.values()))
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)
        self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)

    @property
    def train_item_names(self):
        return self._train_item_names

    @property
    def valid_item_names(self):
        return self._test_item_names

    @property
    def test_item_names(self):
        return self._test_item_names

    def process(self):
        self.load_meta_data()
        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
        self.spk_map = self.build_spk_map()
        print("| spk_map: ", self.spk_map)
        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
        json.dump(self.spk_map, open(spk_map_fn, 'w'))

        self.phone_encoder = self._phone_encoder()
        self.process_data('valid')
        self.process_data('test')
        self.process_data('train')

    def _phone_encoder(self):
        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
        ph_set = []
        if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            for ph_sent in self.item2ph.values():
                ph_set += ph_sent.split(' ')
            ph_set = sorted(set(ph_set))
            json.dump(ph_set, open(ph_set_fn, 'w'))
            print("| Build phone set: ", ph_set)
        else:
            ph_set = json.load(open(ph_set_fn, 'r'))
            print("| Load phone set: ", ph_set)
        return build_phone_encoder(hparams['binary_data_dir'])

    # @staticmethod
    # def get_pitch(wav_fn, spec, res):
    #     wav_suffix = '_wf0.wav'
    #     f0_suffix = '_f0.npy'
    #     f0fn = wav_fn.replace(wav_suffix, f0_suffix)
    #     pitch_info = np.load(f0fn)
    #     f0 = [x[1] for x in pitch_info]
    #     spec_x_coor = np.arange(0, 1, 1 / len(spec))[:len(spec)]
    #     f0_x_coor = np.arange(0, 1, 1 / len(f0))[:len(f0)]
    #     f0 = interp1d(f0_x_coor, f0, 'nearest', fill_value='extrapolate')(spec_x_coor)[:len(spec)]
    #     # f0_x_coor = np.arange(0, 1, 1 / len(f0))
    #     # f0_x_coor[-1] = 1
    #     # f0 = interp1d(f0_x_coor, f0, 'nearest')(spec_x_coor)[:len(spec)]
    #     if sum(f0) == 0:
    #         raise BinarizationError("Empty f0")
    #     assert len(f0) == len(spec), (len(f0), len(spec))
    #     pitch_coarse = f0_to_coarse(f0)
    #
    #     # vis f0
    #     # import matplotlib.pyplot as plt
    #     # from textgrid import TextGrid
    #     # tg_fn = wav_fn.replace(wav_suffix, '.TextGrid')
    #     # fig = plt.figure(figsize=(12, 6))
    #     # plt.pcolor(spec.T, vmin=-5, vmax=0)
    #     # ax = plt.gca()
    #     # ax2 = ax.twinx()
    #     # ax2.plot(f0, color='red')
    #     # ax2.set_ylim(0, 800)
    #     # itvs = TextGrid.fromFile(tg_fn)[0]
    #     # for itv in itvs:
    #     #     x = itv.maxTime * hparams['audio_sample_rate'] / hparams['hop_size']
    #     #     plt.vlines(x=x, ymin=0, ymax=80, color='black')
    #     #     plt.text(x=x, y=20, s=itv.mark, color='black')
    #     # plt.savefig('tmp/20211229_singing_plots_test.png')
    #
    #     res['f0'] = f0
    #     res['pitch'] = pitch_coarse

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                # cls.get_pitch(wav_fn, mel, res)
                cls.get_pitch(wav, mel, res)
            if binarization_args['with_txt']:
                try:
                    # print(ph)
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(tg_fn, ph, mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res


class MidiSingingBinarizer(SingingBinarizer):
    item2midi = {}
    item2midi_dur = {}
    item2is_slur = {}
    item2ph_durs = {}
    item2wdb = {}

    def load_meta_data(self):
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            meta_midi = json.load(open(os.path.join(processed_data_dir, 'meta.json')))  # [list of dict]

            for song_item in meta_midi:
                item_name = raw_item_name = song_item['item_name']
                if len(self.processed_data_dirs) > 1:
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2wavfn[item_name] = song_item['wav_fn']
                self.item2txt[item_name] = song_item['txt']

                self.item2ph[item_name] = ' '.join(song_item['phs'])
                self.item2wdb[item_name] = [1 if x in ALL_YUNMU + ['AP', 'SP', '<SIL>'] else 0 for x in song_item['phs']]
                self.item2ph_durs[item_name] = song_item['ph_dur']

                self.item2midi[item_name] = song_item['notes']
                self.item2midi_dur[item_name] = song_item['notes_dur']
                self.item2is_slur[item_name] = song_item['is_slur']
                self.item2spk[item_name] = 'pop-cs'
                if len(self.processed_data_dirs) > 1:
                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"

        print('spkers: ', set(self.item2spk.values()))
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)
        self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)

    @staticmethod
    def get_pitch(wav_fn, wav, spec, ph, res):
        wav_suffix = '.wav'
        # midi_suffix = '.mid'
        wav_dir = 'wavs'
        f0_dir = 'f0'

        item_name = '/'.join(os.path.splitext(wav_fn)[0].split('/')[-2:]).replace('_wf0', '')
        res['pitch_midi'] = np.asarray(MidiSingingBinarizer.item2midi[item_name])
        res['midi_dur'] = np.asarray(MidiSingingBinarizer.item2midi_dur[item_name])
        res['is_slur'] = np.asarray(MidiSingingBinarizer.item2is_slur[item_name])
        res['word_boundary'] = np.asarray(MidiSingingBinarizer.item2wdb[item_name])
        assert res['pitch_midi'].shape == res['midi_dur'].shape == res['is_slur'].shape, (
            res['pitch_midi'].shape, res['midi_dur'].shape, res['is_slur'].shape)

        # gt f0.
        gt_f0, gt_pitch_coarse = get_pitch(wav, spec, hparams)
        if sum(gt_f0) == 0:
            raise BinarizationError("Empty **gt** f0")
        res['f0'] = gt_f0
        res['pitch'] = gt_pitch_coarse

    @staticmethod
    def get_align(ph_durs, mel, phone_encoded, res, hop_size=hparams['hop_size'], audio_sample_rate=hparams['audio_sample_rate']):
        mel2ph = np.zeros([mel.shape[0]], int)
        startTime = 0

        for i_ph in range(len(ph_durs)):
            start_frame = int(startTime * audio_sample_rate / hop_size + 0.5)
            end_frame = int((startTime + ph_durs[i_ph]) * audio_sample_rate / hop_size + 0.5)
            mel2ph[start_frame:end_frame] = i_ph + 1
            startTime = startTime + ph_durs[i_ph]

        # print('ph durs: ', ph_durs)
        # print('mel2ph: ', mel2ph, len(mel2ph))
        res['mel2ph'] = mel2ph
        # res['dur'] = None

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(wav_fn, wav, mel, ph, res)
            if binarization_args['with_txt']:
                try:
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(MidiSingingBinarizer.item2ph_durs[item_name], mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res


class ZhSingingBinarizer(ZhBinarizer, SingingBinarizer):
    pass


class OpencpopBinarizer(MidiSingingBinarizer):
    item2midi = {}
    item2midi_dur = {}
    item2is_slur = {}
    item2ph_durs = {}
    item2wdb = {}

    def split_train_test_set(self, item_names):
        item_names = deepcopy(item_names)
        test_item_names = [x for x in item_names if any([x.startswith(ts) for ts in hparams['test_prefixes']])]
        train_item_names = [x for x in item_names if x not in set(test_item_names)]
        logging.info("train {}".format(len(train_item_names)))
        logging.info("test {}".format(len(test_item_names)))
        return train_item_names, test_item_names

    def load_meta_data(self):
        raw_data_dir = hparams['raw_data_dir']
        # meta_midi = json.load(open(os.path.join(raw_data_dir, 'meta.json')))  # [list of dict]
        utterance_labels = open(os.path.join(raw_data_dir, 'transcriptions.txt')).readlines()

        for utterance_label in utterance_labels:
            song_info = utterance_label.split('|')
            item_name = raw_item_name = song_info[0]
            self.item2wavfn[item_name] = f'{raw_data_dir}/wavs/{item_name}.wav'
            self.item2txt[item_name] = song_info[1]

            self.item2ph[item_name] = song_info[2]
            # self.item2wdb[item_name] = list(np.nonzero([1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in song_info[2].split()])[0])
            self.item2wdb[item_name] = [1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in song_info[2].split()]
            self.item2ph_durs[item_name] = [float(x) for x in song_info[5].split(" ")]

            self.item2midi[item_name] = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
                                         for x in song_info[3].split(" ")]
            self.item2midi_dur[item_name] = [float(x) for x in song_info[4].split(" ")]
            self.item2is_slur[item_name] = [int(x) for x in song_info[6].split(" ")]
            self.item2spk[item_name] = 'opencpop'

        print('spkers: ', set(self.item2spk.values()))
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)
        self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)

    @staticmethod
    def get_pitch(wav_fn, wav, spec, ph, res):
        wav_suffix = '.wav'
        # midi_suffix = '.mid'
        wav_dir = 'wavs'
        f0_dir = 'text_f0_align'

        item_name = os.path.splitext(os.path.basename(wav_fn))[0]
        res['pitch_midi'] = np.asarray(OpencpopBinarizer.item2midi[item_name])
        res['midi_dur'] = np.asarray(OpencpopBinarizer.item2midi_dur[item_name])
        res['is_slur'] = np.asarray(OpencpopBinarizer.item2is_slur[item_name])
        res['word_boundary'] = np.asarray(OpencpopBinarizer.item2wdb[item_name])
        assert res['pitch_midi'].shape == res['midi_dur'].shape == res['is_slur'].shape, (res['pitch_midi'].shape, res['midi_dur'].shape, res['is_slur'].shape)

        # gt f0.
        # f0 = None
        # f0_suffix = '_f0.npy'
        # f0fn = wav_fn.replace(wav_suffix, f0_suffix).replace(wav_dir, f0_dir)
        # pitch_info = np.load(f0fn)
        # f0 = [x[1] for x in pitch_info]
        # spec_x_coor = np.arange(0, 1, 1 / len(spec))[:len(spec)]
        #
        # f0_x_coor = np.arange(0, 1, 1 / len(f0))[:len(f0)]
        # f0 = interp1d(f0_x_coor, f0, 'nearest', fill_value='extrapolate')(spec_x_coor)[:len(spec)]
        # if sum(f0) == 0:
        #     raise BinarizationError("Empty **gt** f0")
        #
        # pitch_coarse = f0_to_coarse(f0)
        # res['f0'] = f0
        # res['pitch'] = pitch_coarse

        # gt f0.
        gt_f0, gt_pitch_coarse = get_pitch(wav, spec, hparams)
        if sum(gt_f0) == 0:
            raise BinarizationError("Empty **gt** f0")
        res['f0'] = gt_f0
        res['pitch'] = gt_pitch_coarse

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(wav_fn, wav, mel, ph, res)
            if binarization_args['with_txt']:
                try:
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(OpencpopBinarizer.item2ph_durs[item_name], mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res


if __name__ == "__main__":
    SingingBinarizer().process()
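
As a stand-alone illustration of the label format OpencpopBinarizer.load_meta_data parses (this snippet is not part of the commit): each line of transcriptions.txt is item_name|txt|phs|notes|notes_dur|ph_dur|is_slur. The line below is a made-up example in that layout; only the field structure is taken from the code above.

import librosa

line = ("2001000001|小酒窝|x iao j iu w o|"
        "C#4/Db4 C#4/Db4 F4 F4 D#4/Eb4 D#4/Eb4|"
        "0.4 0.4 0.6 0.6 0.5 0.5|0.1 0.3 0.2 0.4 0.1 0.4|0 0 0 0 0 0")
song_info = line.split('|')
midi = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
        for x in song_info[3].split(" ")]        # 'rest' maps to MIDI pitch 0
ph_durs = [float(x) for x in song_info[5].split(" ")]
is_slur = [int(x) for x in song_info[6].split(" ")]
print(song_info[0], midi, ph_durs, is_slur)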
data_gen/tts/base_binarizer.py
ADDED
@@ -0,0 +1,224 @@
import os
os.environ["OMP_NUM_THREADS"] = "1"

from utils.multiprocess_utils import chunked_multiprocess_run
import random
import traceback
import json
from resemblyzer import VoiceEncoder
from tqdm import tqdm
from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder
from utils.hparams import set_hparams, hparams
import numpy as np
from utils.indexed_datasets import IndexedDatasetBuilder
from vocoders.base_vocoder import VOCODERS
import pandas as pd


class BinarizationError(Exception):
    pass


class BaseBinarizer:
    def __init__(self, processed_data_dir=None):
        if processed_data_dir is None:
            processed_data_dir = hparams['processed_data_dir']
        self.processed_data_dirs = processed_data_dir.split(",")
        self.binarization_args = hparams['binarization_args']
        self.pre_align_args = hparams['pre_align_args']
        self.forced_align = self.pre_align_args['forced_align']
        tg_dir = None
        if self.forced_align == 'mfa':
            tg_dir = 'mfa_outputs'
        if self.forced_align == 'kaldi':
            tg_dir = 'kaldi_outputs'
        self.item2txt = {}
        self.item2ph = {}
        self.item2wavfn = {}
        self.item2tgfn = {}
        self.item2spk = {}
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
            for r_idx, r in self.meta_df.iterrows():
                item_name = raw_item_name = r['item_name']
                if len(self.processed_data_dirs) > 1:
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2txt[item_name] = r['txt']
                self.item2ph[item_name] = r['ph']
                self.item2wavfn[item_name] = os.path.join(hparams['raw_data_dir'], 'wavs', os.path.basename(r['wav_fn']).split('_')[1])
                self.item2spk[item_name] = r.get('spk', 'SPK1')
                if len(self.processed_data_dirs) > 1:
                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
                if tg_dir is not None:
                    self.item2tgfn[item_name] = f"{processed_data_dir}/{tg_dir}/{raw_item_name}.TextGrid"
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)

    @property
    def train_item_names(self):
        return self.item_names[hparams['test_num']+hparams['valid_num']:]

    @property
    def valid_item_names(self):
        return self.item_names[0: hparams['test_num']+hparams['valid_num']]

    @property
    def test_item_names(self):
        return self.item_names[0: hparams['test_num']]  # Audios for MOS testing are in 'test_ids'

    def build_spk_map(self):
        spk_map = set()
        for item_name in self.item_names:
            spk_name = self.item2spk[item_name]
            spk_map.add(spk_name)
        spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
        return spk_map

    def item_name2spk_id(self, item_name):
        return self.spk_map[self.item2spk[item_name]]

    def _phone_encoder(self):
        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
        ph_set = []
        if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            for processed_data_dir in self.processed_data_dirs:
                ph_set += [x.split(' ')[0] for x in open(f'{processed_data_dir}/dict.txt').readlines()]
            ph_set = sorted(set(ph_set))
            json.dump(ph_set, open(ph_set_fn, 'w'))
        else:
            ph_set = json.load(open(ph_set_fn, 'r'))
        print("| phone set: ", ph_set)
        return build_phone_encoder(hparams['binary_data_dir'])

    def meta_data(self, prefix):
        if prefix == 'valid':
            item_names = self.valid_item_names
        elif prefix == 'test':
            item_names = self.test_item_names
        else:
            item_names = self.train_item_names
        for item_name in item_names:
            ph = self.item2ph[item_name]
            txt = self.item2txt[item_name]
            tg_fn = self.item2tgfn.get(item_name)
            wav_fn = self.item2wavfn[item_name]
            spk_id = self.item_name2spk_id(item_name)
            yield item_name, ph, txt, tg_fn, wav_fn, spk_id

    def process(self):
        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
        self.spk_map = self.build_spk_map()
        print("| spk_map: ", self.spk_map)
        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
        json.dump(self.spk_map, open(spk_map_fn, 'w'))

        self.phone_encoder = self._phone_encoder()
        self.process_data('valid')
        self.process_data('test')
        self.process_data('train')

    def process_data(self, prefix):
        data_dir = hparams['binary_data_dir']
        args = []
        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
        lengths = []
        f0s = []
        total_sec = 0
        if self.binarization_args['with_spk_embed']:
            voice_encoder = VoiceEncoder().cuda()

        meta_data = list(self.meta_data(prefix))
        for m in meta_data:
            args.append(list(m) + [self.phone_encoder, self.binarization_args])
        num_workers = int(os.getenv('N_PROC', os.cpu_count() // 3))
        for f_id, (_, item) in enumerate(
                zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
            if item is None:
                continue
            item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
                if self.binarization_args['with_spk_embed'] else None
            if not self.binarization_args['with_wav'] and 'wav' in item:
                print("del wav")
                del item['wav']
            builder.add_item(item)
            lengths.append(item['len'])
            total_sec += item['sec']
            if item.get('f0') is not None:
                f0s.append(item['f0'])
        builder.finalize()
        np.save(f'{data_dir}/{prefix}_lengths.npy', lengths)
        if len(f0s) > 0:
            f0s = np.concatenate(f0s, 0)
            f0s = f0s[f0s != 0]
            np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
        print(f"| {prefix} total duration: {total_sec:.3f}s")

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(wav, mel, res)
                if binarization_args['with_f0cwt']:
                    cls.get_f0cwt(res['f0'], res)
            if binarization_args['with_txt']:
                try:
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(tg_fn, ph, mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res

    @staticmethod
    def get_align(tg_fn, ph, mel, phone_encoded, res):
        if tg_fn is not None and os.path.exists(tg_fn):
            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
        else:
            raise BinarizationError(f"Align not found")
        if mel2ph.max() - 1 >= len(phone_encoded):
            raise BinarizationError(
                f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
        res['mel2ph'] = mel2ph
        res['dur'] = dur

    @staticmethod
    def get_pitch(wav, mel, res):
        f0, pitch_coarse = get_pitch(wav, mel, hparams)
        if sum(f0) == 0:
            raise BinarizationError("Empty f0")
        res['f0'] = f0
        res['pitch'] = pitch_coarse

    @staticmethod
    def get_f0cwt(f0, res):
        from utils.cwt import get_cont_lf0, get_lf0_cwt
        uv, cont_lf0_lpf = get_cont_lf0(f0)
        logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
        cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
        Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
        if np.any(np.isnan(Wavelet_lf0)):
            raise BinarizationError("NaN CWT")
        res['cwt_spec'] = Wavelet_lf0
        res['cwt_scales'] = scales
        res['f0_mean'] = logf0s_mean_org
        res['f0_std'] = logf0s_std_org


if __name__ == "__main__":
    set_hparams()
    BaseBinarizer().process()
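
A small sketch, not part of the commit, of the split convention encoded by the three properties above: the first test_num items double as the MOS test set, the first test_num + valid_num items form the validation set, and everything after them is the training set, so the test set is a subset of the validation set. test_num=2 and valid_num=3 are illustrative values, not repo defaults.

item_names = [f"item{i:02d}" for i in range(10)]
test_num, valid_num = 2, 3

test_items = item_names[0:test_num]
valid_items = item_names[0:test_num + valid_num]
train_items = item_names[test_num + valid_num:]
assert set(test_items) <= set(valid_items)       # test is a subset of valid
assert not set(valid_items) & set(train_items)   # no leakage into training
print(test_items, valid_items, train_items)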
data_gen/tts/bin/binarize.py
ADDED
@@ -0,0 +1,20 @@
import os

os.environ["OMP_NUM_THREADS"] = "1"

import importlib
from utils.hparams import set_hparams, hparams


def binarize():
    binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
    pkg = ".".join(binarizer_cls.split(".")[:-1])
    cls_name = binarizer_cls.split(".")[-1]
    binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
    print("| Binarizer: ", binarizer_cls)
    binarizer_cls().process()


if __name__ == '__main__':
    set_hparams()
    binarize()
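
binarize() resolves the binarizer class from a dotted path at runtime. The snippet below (not from the commit) shows the same importlib pattern against a stdlib class so it runs anywhere; collections.OrderedDict is just a demo target, while the repo resolves paths like data_gen.tts.base_binarizer.BaseBinarizer from the binarizer_cls hparam.

import importlib

cls_path = "collections.OrderedDict"  # hypothetical stand-in for a binarizer class path
pkg, cls_name = cls_path.rsplit(".", 1)
cls = getattr(importlib.import_module(pkg), cls_name)
print(cls)  # <class 'collections.OrderedDict'>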
data_gen/tts/binarizer_zh.py
ADDED
@@ -0,0 +1,59 @@
import os

os.environ["OMP_NUM_THREADS"] = "1"

from data_gen.tts.txt_processors.zh_g2pM import ALL_SHENMU
from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
from data_gen.tts.data_gen_utils import get_mel2ph
from utils.hparams import set_hparams, hparams
import numpy as np


class ZhBinarizer(BaseBinarizer):
    @staticmethod
    def get_align(tg_fn, ph, mel, phone_encoded, res):
        if tg_fn is not None and os.path.exists(tg_fn):
            _, dur = get_mel2ph(tg_fn, ph, mel, hparams)
        else:
            raise BinarizationError(f"Align not found")
        ph_list = ph.split(" ")
        assert len(dur) == len(ph_list)
        mel2ph = []
        # Assign the duration of separators to the preceding final (韵母)
        dur_cumsum = np.pad(np.cumsum(dur), [1, 0], mode='constant', constant_values=0)
        for i in range(len(dur)):
            p = ph_list[i]
            if p[0] != '<' and not p[0].isalpha():
                uv_ = res['f0'][dur_cumsum[i]:dur_cumsum[i + 1]] == 0
                j = 0
                while j < len(uv_) and not uv_[j]:
                    j += 1
                dur[i - 1] += j
                dur[i] -= j
                if dur[i] < 100:
                    dur[i - 1] += dur[i]
                    dur[i] = 0
        # Give the initial (声母) and the final (韵母) equal durations
        for i in range(len(dur)):
            p = ph_list[i]
            if p in ALL_SHENMU:
                p_next = ph_list[i + 1]
                if not (dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU):
                    print(f"assert dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU, "
                          f"dur[i]: {dur[i]}, p: {p}, p_next: {p_next}.")
                    continue
                total = dur[i + 1] + dur[i]
                dur[i] = total // 2
                dur[i + 1] = total - dur[i]
        for i in range(len(dur)):
            mel2ph += [i + 1] * dur[i]
        mel2ph = np.array(mel2ph)
        if mel2ph.max() - 1 >= len(phone_encoded):
            raise BinarizationError(f"| Align does not match: {(mel2ph.max() - 1, len(phone_encoded))}")
        res['mel2ph'] = mel2ph
        res['dur'] = dur


if __name__ == "__main__":
    set_hparams()
    ZhBinarizer().process()
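
A toy check (not from the commit) of the equal-split rule in get_align above: an initial (shengmu) and the following final (yunmu) share their combined frame count as evenly as possible, with the final keeping the odd frame. The frame counts are illustrative.

dur = [7, 12]  # [shengmu frames, yunmu frames], made-up values
total = dur[0] + dur[1]
dur[0] = total // 2      # the initial gets the floor half
dur[1] = total - dur[0]  # the final keeps the remainder
assert dur[0] + dur[1] == total
print(dur)  # [9, 10]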
data_gen/tts/data_gen_utils.py
ADDED
@@ -0,0 +1,347 @@
import warnings

warnings.filterwarnings("ignore")

import parselmouth
import os
import torch
from skimage.transform import resize
from utils.text_encoder import TokenTextEncoder
from utils.pitch_utils import f0_to_coarse
import struct
import webrtcvad
from scipy.ndimage.morphology import binary_dilation
import librosa
import numpy as np
from utils import audio
import pyloudnorm as pyln
import re
import json
from collections import OrderedDict

PUNCS = '!,.?;:'

int16_max = (2 ** 15) - 1


def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params.py.
    :param wav: the raw waveform as a numpy array of floats
    :param vad_max_silence_length: Maximum number of consecutive silent frames a segment can have.
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """

    ## Voice Activation Detection
    # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
    # This sets the granularity of the VAD. Should not need to be changed.
    sampling_rate = 16000
    wav_raw, sr = librosa.core.load(path, sr=sr)

    if norm:
        meter = pyln.Meter(sr)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav_raw)
        wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0)
        if np.abs(wav_raw).max() > 1.0:
            wav_raw = wav_raw / np.abs(wav_raw).max()

    wav = librosa.resample(wav_raw, sr, sampling_rate, res_type='kaiser_best')

    vad_window_length = 30  # In milliseconds
    # Number of frames to average together when performing the moving average smoothing.
    # The larger this value, the larger the VAD variations must be to not get smoothed out.
    vad_moving_average_width = 8

    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(np.bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    audio_mask = resize(audio_mask, (len(wav_raw),)) > 0
    if return_raw_wav:
        return wav_raw, audio_mask, sr
    return wav_raw[audio_mask], audio_mask, sr


def process_utterance(wav_path,
                      fft_size=1024,
                      hop_size=256,
                      win_length=1024,
                      window="hann",
                      num_mels=80,
                      fmin=80,
                      fmax=7600,
                      eps=1e-6,
                      sample_rate=22050,
                      loud_norm=False,
                      min_level_db=-100,
                      return_linear=False,
                      trim_long_sil=False, vocoder='pwg'):
    if isinstance(wav_path, str):
        if trim_long_sil:
            wav, _, _ = trim_long_silences(wav_path, sample_rate)
        else:
            wav, _ = librosa.core.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    if loud_norm:
        meter = pyln.Meter(sample_rate)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav)
        wav = pyln.normalize.loudness(wav, loudness, -22.0)
        if np.abs(wav).max() > 1:
            wav = wav / np.abs(wav).max()

    # get amplitude spectrogram
    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window, pad_mode="constant")
    spc = np.abs(x_stft)  # (n_bins, T)

    # get mel basis
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    mel_basis = librosa.filters.mel(sample_rate, fft_size, num_mels, fmin, fmax)
    mel = mel_basis @ spc

    if vocoder == 'pwg':
        mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)
    else:
        assert False, f'"{vocoder}" is not in ["pwg"].'

    l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
    wav = wav[:mel.shape[1] * hop_size]

    if not return_linear:
        return wav, mel
    else:
        spc = audio.amp_to_db(spc)
        spc = audio.normalize(spc, {'min_level_db': min_level_db})
        return wav, mel, spc


def get_pitch(wav_data, mel, hparams):
    """
    :param wav_data: [T]
    :param mel: [T, 80]
    :param hparams:
    :return:
    """
    time_step = hparams['hop_size'] / hparams['audio_sample_rate'] * 1000
    f0_min = 80
    f0_max = 750

    if hparams['hop_size'] == 128:
        pad_size = 4
    elif hparams['hop_size'] == 256:
        pad_size = 2
    else:
        assert False

    f0 = parselmouth.Sound(wav_data, hparams['audio_sample_rate']).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
    lpad = pad_size * 2
    rpad = len(mel) - len(f0) - lpad
    f0 = np.pad(f0, [[lpad, rpad]], mode='constant')
    # mel and f0 are extracted by 2 different libraries. We should force them to have the same length.
    # Attention: we find that newer versions of some libraries could cause ``rpad'' to be a negative value...
    # Just to be sure, we recommend users to set up the same environments as ours in requirements_auto.txt (by Anaconda).
    delta_l = len(mel) - len(f0)
    assert np.abs(delta_l) <= 8
    if delta_l > 0:
        f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
    f0 = f0[:len(mel)]
    pitch_coarse = f0_to_coarse(f0)
    return f0, pitch_coarse


def remove_empty_lines(text):
    """remove empty lines"""
    assert (len(text) > 0)
    assert (isinstance(text, list))
    text = [t.strip() for t in text]
    if "" in text:
        text.remove("")
    return text


class TextGrid(object):
    def __init__(self, text):
        text = remove_empty_lines(text)
        self.text = text
        self.line_count = 0
        self._get_type()
        self._get_time_intval()
        self._get_size()
        self.tier_list = []
        self._get_item_list()

    def _extract_pattern(self, pattern, inc):
        """
        Parameters
        ----------
        pattern : regex to extract pattern
        inc : increment of line count after extraction
        Returns
        -------
        group : extracted info
        """
        try:
            group = re.match(pattern, self.text[self.line_count]).group(1)
            self.line_count += inc
        except AttributeError:
            raise ValueError("File format error at line %d:%s" % (self.line_count, self.text[self.line_count]))
        return group

    def _get_type(self):
        self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)

    def _get_time_intval(self):
        self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
        self.xmax = self._extract_pattern(r"xmax = (.*)", 2)

    def _get_size(self):
        self.size = int(self._extract_pattern(r"size = (.*)", 2))

    def _get_item_list(self):
        """Only supports IntervalTier currently"""
        for itemIdx in range(1, self.size + 1):
            tier = OrderedDict()
            item_list = []
            tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
            tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
            if tier_class != "IntervalTier":
                raise NotImplementedError("Only IntervalTier class is supported currently")
            tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
            tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
            tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
            tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
            for i in range(int(tier_size)):
                item = OrderedDict()
                item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
                item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
                item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
                item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
                item_list.append(item)
            tier["idx"] = tier_idx
            tier["class"] = tier_class
            tier["name"] = tier_name
            tier["xmin"] = tier_xmin
            tier["xmax"] = tier_xmax
            tier["size"] = tier_size
            tier["items"] = item_list
            self.tier_list.append(tier)

    def toJson(self):
        _json = OrderedDict()
        _json["file_type"] = self.file_type
        _json["xmin"] = self.xmin
        _json["xmax"] = self.xmax
        _json["size"] = self.size
        _json["tiers"] = self.tier_list
        return json.dumps(_json, ensure_ascii=False, indent=2)


def get_mel2ph(tg_fn, ph, mel, hparams):
    ph_list = ph.split(" ")
    with open(tg_fn, "r") as f:
        tg = f.readlines()
    tg = remove_empty_lines(tg)
    tg = TextGrid(tg)
    tg = json.loads(tg.toJson())
    split = np.ones(len(ph_list) + 1, np.float) * -1
    tg_idx = 0
    ph_idx = 0
    tg_align = [x for x in tg['tiers'][-1]['items']]
    tg_align_ = []
    for x in tg_align:
        x['xmin'] = float(x['xmin'])
        x['xmax'] = float(x['xmax'])
        if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']:
            x['text'] = ''
            if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
                tg_align_[-1]['xmax'] = x['xmax']
                continue
        tg_align_.append(x)
    tg_align = tg_align_
    tg_len = len([x for x in tg_align if x['text'] != ''])
    ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
    assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, tg_fn)
    while tg_idx < len(tg_align) or ph_idx < len(ph_list):
        if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
            split[ph_idx] = 1e8
            ph_idx += 1
            continue
        x = tg_align[tg_idx]
        if x['text'] == '' and ph_idx == len(ph_list):
            tg_idx += 1
            continue
        assert ph_idx < len(ph_list), (tg_len, ph_len, tg_align, ph_list, tg_fn)
        ph = ph_list[ph_idx]
        if x['text'] == '' and not is_sil_phoneme(ph):
            assert False, (ph_list, tg_align)
        if x['text'] != '' and is_sil_phoneme(ph):
            ph_idx += 1
        else:
            assert (x['text'] == '' and is_sil_phoneme(ph)) \
                   or x['text'].lower() == ph.lower() \
                   or x['text'].lower() == 'sil', (x['text'], ph)
            split[ph_idx] = x['xmin']
            if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(ph_list[ph_idx - 1]):
                split[ph_idx - 1] = split[ph_idx]
            ph_idx += 1
            tg_idx += 1
    assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
    assert ph_idx >= len(ph_list) - 1, (ph_idx, ph_list, len(ph_list), [x['text'] for x in tg_align], tg_fn)
    mel2ph = np.zeros([mel.shape[0]], np.int)
    split[0] = 0
    split[-1] = 1e8
    for i in range(len(split) - 1):
        assert split[i] != -1 and split[i] <= split[i + 1], (split[:-1],)
    split = [int(s * hparams['audio_sample_rate'] / hparams['hop_size'] + 0.5) for s in split]
    for ph_idx in range(len(ph_list)):
        mel2ph[split[ph_idx]:split[ph_idx + 1]] = ph_idx + 1
    mel2ph_torch = torch.from_numpy(mel2ph)
    T_t = len(ph_list)
    dur = mel2ph_torch.new_zeros([T_t + 1]).scatter_add(0, mel2ph_torch, torch.ones_like(mel2ph_torch))
    dur = dur[1:].numpy()
    return mel2ph, dur


def build_phone_encoder(data_dir):
    phone_list_file = os.path.join(data_dir, 'phone_set.json')
    phone_list = json.load(open(phone_list_file))
    return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')


def is_sil_phoneme(p):
    return not p[0].isalpha()
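
A self-contained check (not from the commit) of the scatter_add trick get_mel2ph uses to turn a frame-level alignment (mel2ph holds a 1-based phoneme index per mel frame, 0 meaning padding) into per-phoneme frame counts. The alignment below is made up.

import torch

mel2ph = torch.tensor([1, 1, 1, 2, 2, 3, 3, 3, 3, 0])
T_t = 3  # number of phonemes
dur = mel2ph.new_zeros([T_t + 1]).scatter_add(0, mel2ph, torch.ones_like(mel2ph))
print(dur[1:].tolist())  # [3, 2, 4]: frame counts for phonemes 1..3; slot 0 collects padding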
data_gen/tts/txt_processors/base_text_processor.py
ADDED
@@ -0,0 +1,8 @@
class BaseTxtProcessor:
    @staticmethod
    def sp_phonemes():
        return ['|']

    @classmethod
    def process(cls, txt, pre_align_args):
        raise NotImplementedError
data_gen/tts/txt_processors/en.py
ADDED
@@ -0,0 +1,78 @@
import re
from data_gen.tts.data_gen_utils import PUNCS
from g2p_en import G2p
import unicodedata
from g2p_en.expand import normalize_numbers
from nltk import pos_tag
from nltk.tokenize import TweetTokenizer

from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor


class EnG2p(G2p):
    word_tokenize = TweetTokenizer().tokenize

    def __call__(self, text):
        # preprocessing
        words = EnG2p.word_tokenize(text)
        tokens = pos_tag(words)  # tuples of (word, tag)

        # steps
        prons = []
        for word, pos in tokens:
            if re.search("[a-z]", word) is None:
                pron = [word]
            elif word in self.homograph2features:  # Check homograph
                pron1, pron2, pos1 = self.homograph2features[word]
                if pos.startswith(pos1):
                    pron = pron1
                else:
                    pron = pron2
            elif word in self.cmu:  # lookup CMU dict
                pron = self.cmu[word][0]
            else:  # predict for oov
                pron = self.predict(word)

            prons.extend(pron)
            prons.extend([" "])

        return prons[:-1]


class TxtProcessor(BaseTxtProcessor):
    g2p = EnG2p()

    @staticmethod
    def preprocess_text(text):
        text = normalize_numbers(text)
        text = ''.join(char for char in unicodedata.normalize('NFD', text)
                       if unicodedata.category(char) != 'Mn')  # Strip accents
        text = text.lower()
        text = re.sub("[\'\"()]+", "", text)
        text = re.sub("[-]+", " ", text)
        text = re.sub(f"[^ a-z{PUNCS}]", "", text)
        text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)  # ' ! ' -> '!'
        text = re.sub(f"([{PUNCS}])+", r"\1", text)  # !! -> !
        text = text.replace("i.e.", "that is")
        text = text.replace("etc.", "etc")
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)
        text = re.sub(rf"\s+", r" ", text)
        return text

    @classmethod
    def process(cls, txt, pre_align_args):
        txt = cls.preprocess_text(txt).strip()
        phs = cls.g2p(txt)
        phs_ = []
        n_word_sep = 0
        for p in phs:
            if p.strip() == '':
                phs_ += ['|']
                n_word_sep += 1
            else:
                phs_ += p.split(" ")
        phs = phs_
        assert n_word_sep + 1 == len(txt.split(" ")), (phs, f"\"{txt}\"")
        return phs, txt
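
A sketch (not from the commit) of the separator convention process() enforces: EnG2p yields a single space token between words, which is mapped to the '|' phoneme, and the assert checks there is exactly one separator per word gap. The phone sequence below is a hand-written stand-in for EnG2p output on "hello world".

phs = ['HH', 'AH0', 'L', 'OW1', ' ', 'W', 'ER1', 'L', 'D']
txt = "hello world"

phs_ = []
n_word_sep = 0
for p in phs:
    if p.strip() == '':
        phs_ += ['|']          # word gap -> '|'
        n_word_sep += 1
    else:
        phs_ += p.split(" ")
assert n_word_sep + 1 == len(txt.split(" "))
print(phs_)  # ['HH', 'AH0', 'L', 'OW1', '|', 'W', 'ER1', 'L', 'D']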
data_gen/tts/txt_processors/zh.py
ADDED
@@ -0,0 +1,41 @@
import re
from pypinyin import pinyin, Style
from data_gen.tts.data_gen_utils import PUNCS
from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor
from utils.text_norm import NSWNormalizer


class TxtProcessor(BaseTxtProcessor):
    table = {ord(f): ord(t) for f, t in zip(
        u':,。!?【】()%#@&1234567890',
        u':,.!?[]()%#@&1234567890')}

    @staticmethod
    def preprocess_text(text):
        text = text.translate(TxtProcessor.table)
        text = NSWNormalizer(text).normalize(remove_punc=False)
        text = re.sub("[\'\"()]+", "", text)
        text = re.sub("[-]+", " ", text)
        text = re.sub(f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", "", text)
        text = re.sub(f"([{PUNCS}])+", r"\1", text)  # !! -> !
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)
        text = re.sub(rf"\s+", r"", text)
        return text

    @classmethod
    def process(cls, txt, pre_align_args):
        txt = cls.preprocess_text(txt)
        shengmu = pinyin(txt, style=Style.INITIALS)  # https://blog.csdn.net/zhoulei124/article/details/89055403
        yunmu_finals = pinyin(txt, style=Style.FINALS)
        yunmu_tone3 = pinyin(txt, style=Style.FINALS_TONE3)
        yunmu = [[t[0] + '5'] if t[0] == f[0] else t for f, t in zip(yunmu_finals, yunmu_tone3)] \
            if pre_align_args['use_tone'] else yunmu_finals

        assert len(shengmu) == len(yunmu)
        phs = ["|"]
        for a, b, c in zip(shengmu, yunmu, yunmu_finals):
            if a[0] == c[0]:
                phs += [a[0], "|"]
            else:
                phs += [a[0], b[0], "|"]
        return phs, txt
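A minimal usage sketch for this pinyin-based processor (illustrative; assumes `pypinyin` is installed and tones are enabled):

```python
# Hypothetical quick check of the Chinese TxtProcessor above: each syllable is
# split into an initial (shengmu) and a tone-numbered final (yunmu), with '|'
# marking syllable boundaries.
from data_gen.tts.txt_processors.zh import TxtProcessor

phs, txt = TxtProcessor.process('你好', {'use_tone': True})
print(phs)  # e.g. ['|', 'n', 'i3', '|', 'h', 'ao3', '|']
```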
data_gen/tts/txt_processors/zh_g2pM.py
ADDED
@@ -0,0 +1,72 @@
import re
import jieba
from pypinyin import pinyin, Style
from data_gen.tts.data_gen_utils import PUNCS
from data_gen.tts.txt_processors import zh
from g2pM import G2pM

ALL_SHENMU = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j',
              'q', 'x', 'r', 'z', 'c', 's', 'y', 'w']
ALL_YUNMU = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian',
             'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong', 'ou',
             'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn']


class TxtProcessor(zh.TxtProcessor):
    model = G2pM()

    @staticmethod
    def sp_phonemes():
        return ['|', '#']

    @classmethod
    def process(cls, txt, pre_align_args):
        txt = cls.preprocess_text(txt)
        ph_list = cls.model(txt, tone=pre_align_args['use_tone'], char_split=True)
        seg_list = '#'.join(jieba.cut(txt))
        assert len(ph_list) == len([s for s in seg_list if s != '#']), (ph_list, seg_list)

        # Insert word boundaries '#' (and character boundaries '|' within a word)
        ph_list_ = []
        seg_idx = 0
        for p in ph_list:
            p = p.replace("u:", "v")
            if seg_list[seg_idx] == '#':
                ph_list_.append('#')
                seg_idx += 1
            else:
                ph_list_.append("|")
            seg_idx += 1
            if re.findall('[\u4e00-\u9fff]', p):
                if pre_align_args['use_tone']:
                    p = pinyin(p, style=Style.TONE3, strict=True)[0][0]
                    if p[-1] not in ['1', '2', '3', '4', '5']:
                        p = p + '5'
                else:
                    p = pinyin(p, style=Style.NORMAL, strict=True)[0][0]

            finished = False
            if len([c for c in p if c.isalpha()]) > 1:  # multi-letter pinyin: split into shengmu + yunmu
                for shenmu in ALL_SHENMU:
                    if p.startswith(shenmu) and not p.lstrip(shenmu).isnumeric():
                        ph_list_ += [shenmu, p.lstrip(shenmu)]
                        finished = True
                        break
            if not finished:
                ph_list_.append(p)

        ph_list = ph_list_

        # Remove word-boundary marks around silence symbols, e.g. [..., '#', ',', '#', ...]
        sil_phonemes = list(PUNCS) + TxtProcessor.sp_phonemes()
        ph_list_ = []
        for i in range(0, len(ph_list), 1):
            if ph_list[i] != '#' or (ph_list[i - 1] not in sil_phonemes and ph_list[i + 1] not in sil_phonemes):
                ph_list_.append(ph_list[i])
        ph_list = ph_list_
        return ph_list, txt


if __name__ == '__main__':
    phs, txt = TxtProcessor.process('他来到了,网易杭研大厦', {'use_tone': True})
    print(phs)
docs/README-SVS-opencpop-cascade.md
ADDED
@@ -0,0 +1,111 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[arXiv](https://arxiv.org/abs/2105.02446)
[GitHub](https://github.com/MoonInTheRiver/DiffSinger)
[Releases](https://github.com/MoonInTheRiver/DiffSinger/releases)

## DiffSinger (MIDI SVS | A version)
### 0. Data Acquirement
For the Opencpop dataset: please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to grant you access to Opencpop.

The pipeline below is designed for the Opencpop dataset:

### 1. Preparation

#### Data Preparation
a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/`

b) Run the following scripts to pack the dataset for training/inference.

```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml

# `data/binary/opencpop-midi-dp` will be generated.
```

#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip), which is specially designed for SVS with the NSF mechanism.
Please unzip this file into `checkpoints` before training your acoustic model.

(Update: you can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory.)

This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.

#### Exp Name Preparation
```bash
export MY_FS_EXP_NAME=0302_opencpop_fs_midi
export MY_DS_EXP_NAME=0303_opencpop_ds58_midi
```

```
.
|--data
    |--raw
        |--opencpop
            |--segments
                |--transcriptions.txt
                |--wavs
|--checkpoints
    |--MY_FS_EXP_NAME (optional)
    |--MY_DS_EXP_NAME (optional)
    |--0109_hifigan_bigpopcs_hop128
        |--model_ckpt_steps_1512000.ckpt
        |--config.yaml
```

### 2. Training Example
First, you need a pre-trained FFT-Singer checkpoint. You can use the pre-trained model, or train FFT-Singer from scratch by running:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml --exp_name $MY_FS_EXP_NAME --reset
```

Then, to train DiffSinger, run:

```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset
```

Remember to adjust the "fs2_ckpt" parameter in `usr/configs/midi/cascade/opencs/ds60_rel.yaml` to fit your path.

### 3. Inference from packed test set
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer
```

We also provide:
- the pre-trained model of DiffSinger;
- the pre-trained model of FFT-Singer.

They can be found [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip).

Remember to put the pre-trained models in the `checkpoints` directory.

### 4. Inference from raw inputs
```sh
python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME
```
Raw inputs:
```
inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
}  # user input: Chinese characters
or,
inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
        'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
        'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
        'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
        'input_type': 'phoneme'
}  # input like the Opencpop dataset.
```

### 5. Some issues
a) The HifiGAN-Singing vocoder is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is an out-of-domain dataset (unseen speaker), which may degrade audio quality; we are considering fine-tuning this vocoder on the training set of Opencpop.

b) In this version of the code, we use the melody frontend ([lyric + MIDI] -> [F0 + ph_dur]) to predict the F0 contour and phoneme durations.

c) Generated audio demos can be found in [MY_DS_EXP_NAME](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip).
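For programmatic use, the same raw-input dict can be fed to the cascade inference class directly. A minimal sketch follows (illustrative only; the `--config`/`--exp_name` arguments must still be supplied on the command line, since `example_run` reads them via `set_hparams`, and the truncated `notes`/`notes_duration` strings stand in for the full sequences above):

```python
# Minimal sketch mirroring the __main__ block of inference/svs/ds_cascade.py.
from inference.svs.ds_cascade import DiffSingerCascadeInfer

inp = {
    'text': '小酒窝长睫毛AP是你最美的记号',
    'notes': 'C#4/Db4 | F#4/Gb4 | ...',             # one '|'-separated group per character
    'notes_duration': '0.407140 | 0.376190 | ...',  # seconds, aligned with `notes`
    'input_type': 'word',
}
DiffSingerCascadeInfer.example_run(inp)  # writes infer_out/example_out.wav
```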
docs/README-SVS-opencpop-e2e.md
ADDED
@@ -0,0 +1,107 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[arXiv](https://arxiv.org/abs/2105.02446)
[GitHub](https://github.com/MoonInTheRiver/DiffSinger)
[Releases](https://github.com/MoonInTheRiver/DiffSinger/releases)
| [Interactive🤗 SVS](https://huggingface.co/spaces/Silentlin/DiffSinger)

Substantial update: we 1) **abandon** the explicit prediction of the F0 curve; 2) increase the receptive field of the denoiser; 3) make the linguistic encoder more robust.
**By doing so, 1) the synthesized recordings are more natural in terms of pitch; 2) the pipeline is simpler.**

In short, we let the generative model capture the dynamics of the F0 curve, instead of constraining log-domain F0 with an MSE loss as before.

## DiffSinger (MIDI SVS | B version)
### 0. Data Acquirement
For the Opencpop dataset: please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to grant you access to Opencpop.

The pipeline below is designed for the Opencpop dataset:

### 1. Preparation

#### Data Preparation
a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/`

b) Run the following scripts to pack the dataset for training/inference.

```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml

# `data/binary/opencpop-midi-dp` will be generated.
```

#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip), which is specially designed for SVS with the NSF mechanism.

Please unzip the pre-trained vocoder and [this pendant for the vocoder](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0102_xiaoma_pe.zip) into `checkpoints` before training your acoustic model.

(Update: you can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory.)

This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.

#### Exp Name Preparation
```bash
export MY_DS_EXP_NAME=0228_opencpop_ds100_rel
```

```
.
|--data
    |--raw
        |--opencpop
            |--segments
                |--transcriptions.txt
                |--wavs
|--checkpoints
    |--MY_DS_EXP_NAME (optional)
    |--0109_hifigan_bigpopcs_hop128 (vocoder)
        |--model_ckpt_steps_1512000.ckpt
        |--config.yaml
```

### 2. Training Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset
```

### 3. Inference from packed test set
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer
```

We also provide:
- the pre-trained model of DiffSinger.

It can be found [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip).

Remember to put the pre-trained models in the `checkpoints` directory.

### 4. Inference from raw inputs
```sh
python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME
```
Raw inputs:
```
inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
}  # user input: Chinese characters
or,
inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
        'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
        'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
        'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
        'input_type': 'phoneme'
}  # input like the Opencpop dataset.
```

### 5. Some issues
a) The HifiGAN-Singing vocoder is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is an out-of-domain dataset (unseen speaker), which may degrade audio quality; we are considering fine-tuning this vocoder on the training set of Opencpop.

b) In this version of the code, the melody frontend ([lyric + MIDI] -> [ph_dur]) only predicts phoneme durations; the F0 curve is predicted implicitly, together with the mel-spectrogram.

c) Example [generated audio](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/demos_0221/DS/).
More generated audio demos can be found in [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip).
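To make the relation between the word-level and phoneme-level inputs above concrete, here is a small standalone sketch of the slur-expansion rule (illustrative only; it mirrors the logic in `preprocess_word_level_input` of `inference/svs/base_svs_infer.py`, and `expand_word` is a hypothetical helper name):

```python
# Illustrative: expand one word's phonemes across its notes (slur handling).
# A word with phonemes ['j', 'ie'] sung over notes ['F#4/Gb4', 'C#4/Db4'] becomes
# phonemes j, ie, ie with is_slur flags 0, 0, 1 (the final is repeated per extra note).
def expand_word(phs, notes, durs):
    ph_lst, note_lst, dur_lst, is_slur = [], [], [], []
    for ph in phs:                      # every phoneme gets the word's first note
        ph_lst.append(ph)
        note_lst.append(notes[0])
        dur_lst.append(durs[0])
        is_slur.append(0)
    for i in range(1, len(notes)):      # extra notes repeat the final (yunmu)
        ph_lst.append(phs[-1])
        note_lst.append(notes[i])
        dur_lst.append(durs[i])
        is_slur.append(1)
    return ph_lst, note_lst, dur_lst, is_slur

print(expand_word(['j', 'ie'], ['F#4/Gb4', 'C#4/Db4'], ['0.3154', '0.2350']))
# (['j', 'ie', 'ie'], ['F#4/Gb4', 'F#4/Gb4', 'C#4/Db4'],
#  ['0.3154', '0.3154', '0.2350'], [0, 0, 1])
```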
docs/README-SVS-popcs.md
ADDED
@@ -0,0 +1,63 @@
## DiffSinger (SVS version)

### 0. Data Acquirement
- See the [apply_form](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md).
- Dataset [preview](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_preview.zip).

### 1. Preparation
#### Data Preparation
a) Download and extract PopCS, then create a link to the dataset folder: `ln -s /xxx/popcs/ data/processed/popcs`

b) Run the following scripts to pack the dataset for training/inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml
# `data/binary/popcs-pmf0` will be generated.
```

#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip), which is specially designed for SVS with the NSF mechanism.
Please unzip this file into `checkpoints` before training your acoustic model.

(Update: you can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory.)

This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.

### 2. Training Example
First, you need a pre-trained FFT-Singer checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), or train FFT-Singer from scratch by running:

```sh
# First, train fft-singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset
# Then, infer fft-singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer
```

Then, to train DiffSinger, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset
```

Remember to adjust the "fs2_ckpt" parameter in `usr/configs/popcs_ds_beta6_offline.yaml` to fit your path.

### 3. Inference Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer
```

We also provide:
- the pre-trained model of [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip);
- the pre-trained model of [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), for the shallow diffusion mechanism in DiffSinger.

Remember to put the pre-trained models in the `checkpoints` directory.

*Note that:*

- *The original PWG-based vocoder used in the paper has been put into commercial use, so we provide this HifiGAN-based vocoder as a substitute.*
- *We assume the ground-truth F0 to be given as the pitch information, following [1][2][3]. If you want to conduct experiments on MIDI data, you need an external F0 predictor (like [MIDI-A-version](README-SVS-opencpop-cascade.md)) or a joint prediction with spectrograms (like [MIDI-B-version](README-SVS-opencpop-e2e.md)).*

[1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.

[2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.

[3] DeepSinger: Singing Voice Synthesis with Data Mined From the Web. KDD 2020.
docs/README-SVS.md
ADDED
@@ -0,0 +1,76 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[arXiv](https://arxiv.org/abs/2105.02446)
[GitHub](https://github.com/MoonInTheRiver/DiffSinger)
[Releases](https://github.com/MoonInTheRiver/DiffSinger/releases)
| [Interactive🤗 SVS](https://huggingface.co/spaces/Silentlin/DiffSinger)

## DiffSinger (SVS)

### PART1. [Run DiffSinger on PopCS](README-SVS-popcs.md)
In PART1, we focus only on spectrum modeling (the acoustic model) and assume the ground-truth (GT) F0 to be given as the pitch information, following these papers [1][2][3]. If you want to conduct experiments with F0 prediction, please move on to PART2.

Thus, the pipeline of this part can be summarized as:

```
[lyrics] -> [linguistic representation] (Frontend)
[linguistic representation] + [GT F0] + [GT phoneme duration] -> [mel-spectrogram] (Acoustic model)
[mel-spectrogram] + [GT F0] -> [waveform] (Vocoder)
```

[1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.

[2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.

[3] DeepSinger: Singing Voice Synthesis with Data Mined From the Web. KDD 2020.

Click here for detailed instructions: [link](README-SVS-popcs.md).

### PART2. [Run DiffSinger on Opencpop](README-SVS-opencpop-cascade.md)
Thanks to the [Opencpop team](https://wenet.org.cn/opencpop/) for releasing their SVS dataset with MIDI labels on **Jan.20, 2022** (after we published our paper).

Since the MIDI labels are elaborately annotated, we are able to supplement the pipeline of PART1 with a naive melody frontend.

#### 2.A
Thus, the pipeline of [2.A](README-SVS-opencpop-cascade.md) can be summarized as:

```
[lyrics] + [MIDI] -> [linguistic representation (with MIDI information)] + [predicted F0] + [predicted phoneme duration] (Melody frontend)
[linguistic representation] + [predicted F0] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model)
[mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder)
```

Click here for detailed instructions: [link](README-SVS-opencpop-cascade.md).

#### 2.B
In 2.A, we found that predicting F0 explicitly in the melody frontend produces many bad cases of voiced/unvoiced (uv/v) prediction. We therefore abandon the explicit prediction of the F0 curve in the melody frontend and predict it jointly with the spectrogram.

Thus, the pipeline of [2.B](README-SVS-opencpop-e2e.md) can be summarized as:
```
[lyrics] + [MIDI] -> [linguistic representation] + [predicted phoneme duration] (Melody frontend)
[linguistic representation (with MIDI information)] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model)
[mel-spectrogram] -> [predicted F0] (Pitch extractor)
[mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder)
```

Click here for detailed instructions: [link](README-SVS-opencpop-e2e.md).

### FAQ
Q1: Why do I need F0 in the vocoders?

A1: See the vocoder parts of HiFiSinger, DiffSinger, or SingGAN; this is common practice now.

Q2: Why not run the MIDI version of SVS on the PopCS dataset? Or why not release MIDI labels for the PopCS dataset?

A2: Our laboratory has no funding to label the PopCS dataset, but we do have funding to label another singing dataset, which is coming soon.

Q3: Why "'HifiGAN' object has no attribute 'model'"?

A3: Please put the pre-trained vocoders in your `checkpoints` directory.

Q4: How can I check whether GT or predicted information is used during inference from the packed test set?

A4: Please see the code [here](https://github.com/MoonInTheRiver/DiffSinger/blob/55e2f46068af6e69940a9f8f02d306c24a940cab/tasks/tts/fs2.py#L343).

...
docs/README-TTS.md
ADDED
@@ -0,0 +1,69 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[arXiv](https://arxiv.org/abs/2105.02446)
[GitHub](https://github.com/MoonInTheRiver/DiffSinger)
[Releases](https://github.com/MoonInTheRiver/DiffSinger/releases)
| [Interactive🤗 TTS](https://huggingface.co/spaces/NATSpeech/DiffSpeech)

## DiffSpeech (TTS)
### 1. Preparation

#### Data Preparation
a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/`

b) Download and unzip the [ground-truth durations](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar) extracted by [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/`

c) Run the following scripts to pack the dataset for training/inference.

```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml

# `data/binary/ljspeech` will be generated.
```

#### Vocoder Preparation
We provide the pre-trained model of the [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder.
Please unzip this file into `checkpoints` before training your acoustic model.

### 2. Training Example

First, you need a pre-trained FastSpeech2 checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), or train FastSpeech2 from scratch by running:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset
```
Then, to train DiffSpeech, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset
```

Remember to adjust the "fs2_ckpt" parameter in `usr/configs/lj_ds_beta6.yaml` to fit your path.

### 3. Inference Example

```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer
```

We also provide:
- the pre-trained model of [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip);
- the individual pre-trained model of [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), for the shallow diffusion mechanism in DiffSpeech.

Remember to put the pre-trained models in the `checkpoints` directory.

## Mel Visualization
Along the vertical axis: DiffSpeech occupies bins [0-80]; FastSpeech 2 occupies bins [80-160].

<table style="width:100%">
    <tr>
        <th>DiffSpeech vs. FastSpeech 2</th>
    </tr>
    <tr>
        <td><img src="resources/diffspeech-fs2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
    </tr>
    <tr>
        <td><img src="resources/diffspeech-fs2-1.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
    </tr>
    <tr>
        <td><img src="resources/diffspeech-fs2-2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
    </tr>
</table>
docs/README-zh.md
ADDED
@@ -0,0 +1,212 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[arXiv](https://arxiv.org/abs/2105.02446)
[GitHub](https://github.com/MoonInTheRiver/DiffSinger)
[Releases](https://github.com/MoonInTheRiver/DiffSinger/releases)
| [HuggingFace🤗 Demo](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
| [English README](../README.md)

This repository contains the official PyTorch implementation of DiffSpeech (for TTS) and DiffSinger (for SVS), proposed in our AAAI-2022 [paper](https://arxiv.org/abs/2105.02446).

<table style="width:100%">
    <tr>
        <th>DiffSinger/DiffSpeech at the training stage</th>
        <th>DiffSinger/DiffSpeech at the inference stage</th>
    </tr>
    <tr>
        <td><img src="resources/model_a.png" alt="Training" height="300"></td>
        <td><img src="resources/model_b.png" alt="Inference" height="300"></td>
    </tr>
</table>

:tada: :tada: :tada: **Important updates**:
- Mar.2, 2022: [MIDI-new-version](README-SVS-opencpop-e2e.md): a substantial update :sparkles:
- Mar.1, 2022: [NeuralSVB](https://github.com/MoonInTheRiver/NeuralSVB), the code for the singing voice beautification task, is open-sourced :sparkles: :sparkles: :sparkles:
- Feb.13, 2022: [NATSpeech](https://github.com/NATSpeech/NATSpeech), an upgraded code framework that contains DiffSpeech and our NeurIPS-2021 work [PortaSpeech](https://openreview.net/forum?id=xmJsuh8xlq), is open-sourced! :sparkles: :sparkles: :sparkles:
- Jan.29, 2022: the [MIDI-old-version](README-SVS-opencpop-cascade.md) SVS system is supported.
- Jan.13, 2022: the SVS system is supported; the PopCS dataset is released.
- Dec.19, 2021: the TTS system is supported. [HuggingFace🤗 Demo](https://huggingface.co/spaces/NATSpeech/DiffSpeech)

:rocket: **News**:
- Feb.24, 2022: our new work `NeuralSVB` was accepted by ACL-2022 ([arXiv](https://arxiv.org/abs/2202.13277)). [Audio demos](https://neuralsvb.github.io).
- Dec.01, 2021: DiffSinger was accepted by AAAI-2022.
- Sep.29, 2021: our new work `PortaSpeech: Portable and High-Quality Generative Text-to-Speech` was accepted by NeurIPS-2021 ([arXiv](https://arxiv.org/abs/2109.15166)).
- May.06, 2021: we submitted the DiffSinger paper to arXiv ([arXiv](https://arxiv.org/abs/2105.02446)).

## Install Dependencies
```sh
conda create -n your_env_name python=3.8
source activate your_env_name
pip install -r requirements_2080.txt   (GPU 2080Ti, CUDA 10.2)
or pip install -r requirements_3090.txt   (GPU 3090, CUDA 11.4)
```

## DiffSpeech (TTS version)
### 1. Preparation

#### Data Preparation
a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/`

b) Download and extract [our MFA-preprocessed alignments](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/`

c) Run the following scripts to pack the dataset; the packed binary files are used for subsequent training and inference.

```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml

# `data/binary/ljspeech` will be generated.
```

#### Vocoder Preparation
We provide the pre-trained model of the [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder.
Please unzip the vocoder files into `checkpoints` before training your acoustic model.

### 2. Training Example

First, you need a pre-trained FastSpeech2 checkpoint. You can use [our pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), or train FastSpeech2 from scratch by running:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset
```
Then, to train DiffSpeech, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset
```

Remember to adjust the "fs2_ckpt" parameter in `usr/configs/lj_ds_beta6.yaml` to fit your path.

### 3. Inference Example

```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer
```

We also provide:
- the pre-trained model of [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip);
- the pre-trained model of [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), for the shallow diffusion mechanism in DiffSpeech.

Remember to put the pre-trained models in the `checkpoints` directory.

## DiffSinger (SVS version)

### 0. Data Acquirement
- See the [application form](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md).
- Dataset [preview](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_preview.zip).

### 1. Preparation
#### Data Preparation
a) Download and extract PopCS, then create a link to the dataset folder: `ln -s /xxx/popcs/ data/processed/popcs`

b) Run the following scripts to pack the dataset; the packed binary files are used for subsequent training and inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml
# `data/binary/popcs-pmf0` will be generated.
```

#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip), which is specially designed for SVS systems and adopts the NSF technique.
Please unzip the vocoder files into `checkpoints` before training your acoustic model.

(Update: you can also move [a checkpoint trained for more steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into the vocoder folder.)

This vocoder is trained on a relatively large dataset of about 70 hours of singing and can be regarded as a universal vocoder.

### 2. Training Example
First, you need a pre-trained FFT-Singer. You can use [our pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), or train FFT-Singer from scratch with the following scripts:

```sh
# First, train fft-singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset
# Then, infer fft-singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer
```

Then, to train DiffSinger, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset
```

Remember to adjust the "fs2_ckpt" parameter in `usr/configs/popcs_ds_beta6_offline.yaml` to fit your path.

### 3. Inference Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer
```

We also provide:
- the pre-trained model of [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip);
- the pre-trained model of [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), for the shallow diffusion mechanism in DiffSinger.

Remember to put the pre-trained models in the `checkpoints` directory.

*Note that:*

- *The PWG-based vocoder used in our original paper has been put into commercial use, so we provide this HifiGAN-based vocoder as a substitute.*
- *This paper assumes that the ground-truth F0 is provided for the experiments, as previous works [1][2][3] did; the focus is on spectrum modeling rather than on predicting the F0 curve. If you want to run experiments on MIDI data and predict the F0 curve (explicitly or implicitly) from MIDI and lyrics, please see [MIDI-old-version](README-SVS-opencpop-cascade.md) or [MIDI-new-version](README-SVS-opencpop-e2e.md). The currently supported MIDI dataset is Opencpop.*

[1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.

[2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.

[3] DeepSinger: Singing Voice Synthesis with Data Mined From the Web. KDD 2020.

## Tensorboard
```sh
tensorboard --logdir_spec exp_name
```
<table style="width:100%">
    <tr>
        <td><img src="resources/tfb.png" alt="Tensorboard" height="250"></td>
    </tr>
</table>

## Mel Visualization
Along the vertical axis: DiffSpeech occupies bins [0-80]; FastSpeech 2 occupies bins [80-160].

<table style="width:100%">
    <tr>
        <th>DiffSpeech vs. FastSpeech 2</th>
    </tr>
    <tr>
        <td><img src="resources/diffspeech-fs2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
    </tr>
    <tr>
        <td><img src="resources/diffspeech-fs2-1.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
    </tr>
    <tr>
        <td><img src="resources/diffspeech-fs2-2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
    </tr>
</table>

## Audio Demos
Audio samples can be found on our [demo page](https://diffsinger.github.io/).

We also put some test-set audio samples generated by DiffSpeech+HifiGAN (marked as [P]) and GTmel+HifiGAN (marked as [G]) in [resources/demos_1213](../resources/demos_1213).

(These correspond to the pre-trained checkpoint [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip).)

---
:rocket: :rocket: :rocket: **Update:**

Newly generated singing samples are in [resources/demos_0112](../resources/demos_0112).

## Citation
If this repository is useful for your research or work, please cite the following paper:

    @article{liu2021diffsinger,
      title={Diffsinger: Singing voice synthesis via shallow diffusion mechanism},
      author={Liu, Jinglin and Li, Chengxi and Ren, Yi and Chen, Feiyang and Liu, Peng and Zhao, Zhou},
      journal={arXiv preprint arXiv:2105.02446},
      volume={2},
      year={2021}}

## Acknowledgements
Our codes are based on the following repos:
* [denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch)
* [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning)
* [ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN)
* [HifiGAN](https://github.com/jik876/hifi-gan)
* [espnet](https://github.com/espnet/espnet)
* [DiffWave](https://github.com/lmnt-com/diffwave)
inference/svs/base_svs_infer.py
ADDED
@@ -0,0 +1,265 @@
import os

import torch
import numpy as np
from modules.hifigan.hifigan import HifiGanGenerator
from vocoders.hifigan import HifiGAN
from inference.svs.opencpop.map import cpop_pinyin2ph_func

from utils import load_ckpt
from utils.hparams import set_hparams, hparams
from utils.text_encoder import TokenTextEncoder
from pypinyin import pinyin, lazy_pinyin, Style
import librosa
import glob
import re


class BaseSVSInfer:
    def __init__(self, hparams, device=None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.hparams = hparams
        self.device = device

        phone_list = ["AP", "SP", "a", "ai", "an", "ang", "ao", "b", "c", "ch", "d", "e", "ei", "en", "eng", "er", "f", "g",
                      "h", "i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "j", "k", "l", "m", "n", "o",
                      "ong", "ou", "p", "q", "r", "s", "sh", "t", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo", "v",
                      "van", "ve", "vn", "w", "x", "y", "z", "zh"]
        self.ph_encoder = TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
        self.pinyin2phs = cpop_pinyin2ph_func()
        self.spk_map = {'opencpop': 0}

        self.model = self.build_model()
        self.model.eval()
        self.model.to(self.device)
        self.vocoder = self.build_vocoder()
        self.vocoder.eval()
        self.vocoder.to(self.device)

    def build_model(self):
        raise NotImplementedError

    def forward_model(self, inp):
        raise NotImplementedError

    def build_vocoder(self):
        base_dir = hparams['vocoder_ckpt']
        config_path = f'{base_dir}/config.yaml'
        ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'), key=
        lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).ckpt', x)[0]))[-1]
        print('| load HifiGAN: ', ckpt)
        ckpt_dict = torch.load(ckpt, map_location="cpu")
        config = set_hparams(config_path, global_hparams=False)
        state = ckpt_dict["state_dict"]["model_gen"]
        vocoder = HifiGanGenerator(config)
        vocoder.load_state_dict(state, strict=True)
        vocoder.remove_weight_norm()
        vocoder = vocoder.eval().to(self.device)
        return vocoder

    def run_vocoder(self, c, **kwargs):
        c = c.transpose(2, 1)  # [B, 80, T]
        f0 = kwargs.get('f0')  # [B, T]
        if f0 is not None and hparams.get('use_nsf'):
            # f0 = torch.FloatTensor(f0).to(self.device)
            y = self.vocoder(c, f0).view(-1)
        else:
            y = self.vocoder(c).view(-1)
        # [T]
        return y[None]

    def preprocess_word_level_input(self, inp):
        # Pypinyin can't solve polyphonic words
        text_raw = inp['text'].replace('最长', '最常').replace('长睫毛', '常睫毛') \
            .replace('那么长', '那么常').replace('多长', '多常') \
            .replace('很长', '很常')  # We hope someone could provide a better g2p module for us by opening pull requests.

        # lyric
        pinyins = lazy_pinyin(text_raw, strict=False)
        ph_per_word_lst = [self.pinyin2phs[pinyin.strip()] for pinyin in pinyins if pinyin.strip() in self.pinyin2phs]

        # Note
        note_per_word_lst = [x.strip() for x in inp['notes'].split('|') if x.strip() != '']
        mididur_per_word_lst = [x.strip() for x in inp['notes_duration'].split('|') if x.strip() != '']

        if len(note_per_word_lst) == len(ph_per_word_lst) == len(mididur_per_word_lst):
            print('Pass word-notes check.')
        else:
            print('The number of words doesn\'t match the number of note windows. ',
                  'You should split the note(s) for each word by the | mark.')
            print(ph_per_word_lst, note_per_word_lst, mididur_per_word_lst)
            print(len(ph_per_word_lst), len(note_per_word_lst), len(mididur_per_word_lst))
            return None

        note_lst = []
        ph_lst = []
        midi_dur_lst = []
        is_slur = []
        for idx, ph_per_word in enumerate(ph_per_word_lst):
            # for phs in one word:
            # a single ph like ['ai'] or multiple phs like ['n', 'i']
            ph_in_this_word = ph_per_word.split()

            # for notes in one word:
            # a single note like ['D4'] or multiple notes like ['D4', 'E4'], which means a 'slur' here.
            note_in_this_word = note_per_word_lst[idx].split()
            midi_dur_in_this_word = mididur_per_word_lst[idx].split()
            # process for the model input
            # Step 1.
            # Deal with a note of the 'not slur' case, or the first note of the 'slur' case
            # j        ie
            # F#4/Gb4  F#4/Gb4
            # 0        0
            for ph in ph_in_this_word:
                ph_lst.append(ph)
                note_lst.append(note_in_this_word[0])
                midi_dur_lst.append(midi_dur_in_this_word[0])
                is_slur.append(0)
            # Step 2.
            # Deal with the 2nd, 3rd, ... notes of the 'slur' case
            # j        ie       ie
            # F#4/Gb4  F#4/Gb4  C#4/Db4
            # 0        0        1
            if len(note_in_this_word) > 1:  # is_slur = True: repeat the yunmu to match the 2nd, 3rd, ... notes.
                for idx in range(1, len(note_in_this_word)):
                    ph_lst.append(ph_in_this_word[-1])
                    note_lst.append(note_in_this_word[idx])
                    midi_dur_lst.append(midi_dur_in_this_word[idx])
                    is_slur.append(1)
        ph_seq = ' '.join(ph_lst)

        if len(ph_lst) == len(note_lst) == len(midi_dur_lst):
            print(len(ph_lst), len(note_lst), len(midi_dur_lst))
            print('Pass word-notes check.')
        else:
            print('The number of words doesn\'t match the number of note windows. ',
                  'You should split the note(s) for each word by the | mark.')
            return None
        return ph_seq, note_lst, midi_dur_lst, is_slur

    def preprocess_phoneme_level_input(self, inp):
        ph_seq = inp['ph_seq']
        note_lst = inp['note_seq'].split()
        midi_dur_lst = inp['note_dur_seq'].split()
        is_slur = [float(x) for x in inp['is_slur_seq'].split()]
        print(len(note_lst), len(ph_seq.split()), len(midi_dur_lst))
        if len(note_lst) == len(ph_seq.split()) == len(midi_dur_lst):
            print('Pass word-notes check.')
        else:
            print('The number of words doesn\'t match the number of note windows. ',
                  'You should split the note(s) for each word by the | mark.')
            return None
        return ph_seq, note_lst, midi_dur_lst, is_slur

    def preprocess_input(self, inp, input_type='word'):
        """

        :param inp: {'text': str, 'item_name': (str, optional), 'spk_name': (str, optional)}
        :return:
        """

        item_name = inp.get('item_name', '<ITEM_NAME>')
        spk_name = inp.get('spk_name', 'opencpop')

        # single spk
        spk_id = self.spk_map[spk_name]

        # get ph seq, note lst, midi dur lst, is slur lst.
        if input_type == 'word':
            ret = self.preprocess_word_level_input(inp)
        elif input_type == 'phoneme':  # like transcriptions.txt in the Opencpop dataset.
            ret = self.preprocess_phoneme_level_input(inp)
        else:
            print('Invalid input type.')
            return None

        if ret:
            ph_seq, note_lst, midi_dur_lst, is_slur = ret
        else:
            print('==========> Preprocess_word_level or phone_level input wrong.')
            return None

        # convert note lst to midi ids; convert note dur lst to midi durations
        try:
            midis = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
                     for x in note_lst]
            midi_dur_lst = [float(x) for x in midi_dur_lst]
        except Exception as e:
            print(e)
            print('Invalid Input Type.')
            return None

        ph_token = self.ph_encoder.encode(ph_seq)
        item = {'item_name': item_name, 'text': inp['text'], 'ph': ph_seq, 'spk_id': spk_id,
                'ph_token': ph_token, 'pitch_midi': np.asarray(midis), 'midi_dur': np.asarray(midi_dur_lst),
                'is_slur': np.asarray(is_slur), }
        item['ph_len'] = len(item['ph_token'])
        return item

    def input_to_batch(self, item):
        item_names = [item['item_name']]
        text = [item['text']]
        ph = [item['ph']]
        txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
        txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
        spk_ids = torch.LongTensor(item['spk_id'])[None, :].to(self.device)

        pitch_midi = torch.LongTensor(item['pitch_midi'])[None, :hparams['max_frames']].to(self.device)
        midi_dur = torch.FloatTensor(item['midi_dur'])[None, :hparams['max_frames']].to(self.device)
        is_slur = torch.LongTensor(item['is_slur'])[None, :hparams['max_frames']].to(self.device)

        batch = {
            'item_name': item_names,
            'text': text,
            'ph': ph,
            'txt_tokens': txt_tokens,
            'txt_lengths': txt_lengths,
            'spk_ids': spk_ids,
            'pitch_midi': pitch_midi,
            'midi_dur': midi_dur,
            'is_slur': is_slur
        }
        return batch

    def postprocess_output(self, output):
        return output

    def infer_once(self, inp):
        inp = self.preprocess_input(inp, input_type=inp['input_type'] if inp.get('input_type') else 'word')
        output = self.forward_model(inp)
        output = self.postprocess_output(output)
        return output

    @classmethod
    def example_run(cls, inp):
        from utils.audio import save_wav
        set_hparams(print_hparams=False)
        infer_ins = cls(hparams)
        out = infer_ins.infer_once(inp)
        os.makedirs('infer_out', exist_ok=True)
        save_wav(out, f'infer_out/example_out.wav', hparams['audio_sample_rate'])


# if __name__ == '__main__':
    # debug
    # a = BaseSVSInfer(hparams)
    # a.preprocess_input({'text': '你 说 你 不 SP 懂 为 何 在 这 时 牵 手 AP',
    #                     'notes': 'D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | rest | D#4/Eb4 | D4 | D4 | D4 | D#4/Eb4 | F4 | D#4/Eb4 | D4 | rest',
    #                     'notes_duration': '0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590'
    #                     })

    # b = {
    #     'text': '小酒窝长睫毛AP是你最美的记号',
    #     'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
    #     'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340'
    # }
    # c = {
    #     'text': '小酒窝长睫毛AP是你最美的记号',
    #     'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
    #     'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
    #     'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
    #     'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
    # }  # input like the Opencpop dataset.
    # a.preprocess_input(b)
    # a.preprocess_input(c, input_type='phoneme')
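As a quick reference for the note-to-MIDI conversion in `preprocess_input` above, a minimal sketch (illustrative; only `librosa` is required, and the printed ids follow librosa's convention of C4 = 60):

```python
# Note names take the part before '/' (the enharmonic alias) and map to MIDI ids;
# 'rest' maps to 0, matching preprocess_input above.
import librosa

for name in ['C#4/Db4', 'A#4/Bb4', 'rest']:
    midi = librosa.note_to_midi(name.split('/')[0]) if name != 'rest' else 0
    print(name, '->', midi)  # C#4/Db4 -> 61, A#4/Bb4 -> 70, rest -> 0
```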
inference/svs/ds_cascade.py
ADDED
@@ -0,0 +1,56 @@
import torch
# from inference.tts.fs import FastSpeechInfer
# from modules.tts.fs2_orig import FastSpeech2Orig
from inference.svs.base_svs_infer import BaseSVSInfer
from utils import load_ckpt
from utils.hparams import hparams
from usr.diff.shallow_diffusion_tts import GaussianDiffusion
from usr.diffsinger_task import DIFF_DECODERS


class DiffSingerCascadeInfer(BaseSVSInfer):
    def build_model(self):
        # Build the shallow-diffusion decoder and load the trained checkpoint in eval mode.
        model = GaussianDiffusion(
            phone_encoder=self.ph_encoder,
            out_dims=hparams['audio_num_mel_bins'],
            denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
        model.eval()
        load_ckpt(model, hparams['work_dir'], 'model')
        return model

    def forward_model(self, inp):
        sample = self.input_to_batch(inp)
        txt_tokens = sample['txt_tokens']  # [B, T_t]
        spk_id = sample.get('spk_ids')
        with torch.no_grad():
            output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
                                pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
                                is_slur=sample['is_slur'])
            mel_out = output['mel_out']  # [B, T, 80]
            f0_pred = output['f0_denorm']  # cascade variant: f0 comes from the model itself
            wav_out = self.run_vocoder(mel_out, f0=f0_pred)
        wav_out = wav_out.cpu().numpy()
        return wav_out[0]


if __name__ == '__main__':
    inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
    }  # user input: Chinese characters
    c = {  # alternative phoneme-level input, unused here; kept as a format reference
        'text': '小酒窝长睫毛AP是你最美的记号',
        'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
        'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
        'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
        'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
        'input_type': 'phoneme'
    }  # input like Opencpop dataset.
    DiffSingerCascadeInfer.example_run(inp)

# CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
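Note on the word-level input format above: 'notes' and 'notes_duration' are grouped per syllable of 'text' (13 Chinese characters plus the breath token 'AP'), with groups separated by ' | '; a syllable sung across several notes (a slur) carries multiple space-separated entries in its group. A minimal standalone sanity check of that alignment (illustrative only, not part of the DiffSinger codebase):

# Sanity-check the word-level input: one note group and one duration group
# per syllable, and matching note/duration counts inside each group.
text = '小酒窝长睫毛AP是你最美的记号'
notes = 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4'
durs = '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340'

note_groups = [g.split() for g in notes.split(' | ')]
dur_groups = [g.split() for g in durs.split(' | ')]
assert len(note_groups) == len(dur_groups) == 14  # 13 characters + 'AP'
for ng, dg in zip(note_groups, dur_groups):
    assert len(ng) == len(dg)  # one duration per note within each syllable
print('word-level input is consistent')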
inference/svs/ds_e2e.py
ADDED
@@ -0,0 +1,67 @@
import torch
# from inference.tts.fs import FastSpeechInfer
# from modules.tts.fs2_orig import FastSpeech2Orig
from inference.svs.base_svs_infer import BaseSVSInfer
from utils import load_ckpt
from utils.hparams import hparams
from usr.diff.shallow_diffusion_tts import GaussianDiffusion
from usr.diffsinger_task import DIFF_DECODERS
from modules.fastspeech.pe import PitchExtractor
import utils


class DiffSingerE2EInfer(BaseSVSInfer):
    def build_model(self):
        model = GaussianDiffusion(
            phone_encoder=self.ph_encoder,
            out_dims=hparams['audio_num_mel_bins'],
            denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
        model.eval()
        load_ckpt(model, hparams['work_dir'], 'model')

        # Optionally load a standalone pitch extractor so f0 can be predicted
        # from the generated mel instead of taken from the model's own output.
        if hparams.get('pe_enable') is not None and hparams['pe_enable']:
            self.pe = PitchExtractor().to(self.device)
            utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
            self.pe.eval()
        return model

    def forward_model(self, inp):
        sample = self.input_to_batch(inp)
        txt_tokens = sample['txt_tokens']  # [B, T_t]
        spk_id = sample.get('spk_ids')
        with torch.no_grad():
            output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
                                pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
                                is_slur=sample['is_slur'])
            mel_out = output['mel_out']  # [B, T, 80]
            if hparams.get('pe_enable') is not None and hparams['pe_enable']:
                f0_pred = self.pe(mel_out)['f0_denorm_pred']  # pe predicts f0 from the predicted mel
            else:
                f0_pred = output['f0_denorm']
            wav_out = self.run_vocoder(mel_out, f0=f0_pred)
        wav_out = wav_out.cpu().numpy()
        return wav_out[0]


if __name__ == '__main__':
    inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
    }  # user input: Chinese characters
    inp = {  # this second assignment shadows the first, so the phoneme-level input is the one actually run
        'text': '小酒窝长睫毛AP是你最美的记号',
        'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
        'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
        'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
        'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
        'input_type': 'phoneme'
    }  # input like Opencpop dataset.
    DiffSingerE2EInfer.example_run(inp)

# CUDA_VISIBLE_DEVICES=3 python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel
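Note on the phoneme-level (Opencpop-style) input above: 'ph_seq', 'note_seq', 'note_dur_seq', and 'is_slur_seq' are parallel sequences with one entry per phoneme; judging from the sample data, an entry with is_slur 1 repeats the previous vowel so the syllable can carry an extra note. A standalone check of that alignment (illustrative only, not repo code):

# Verify the four phoneme-level sequences are aligned, then print them side by side.
ph_seq = 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao'
note_seq = 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4'
note_dur_seq = '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340'
is_slur_seq = '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'

phs, nts, durs, slurs = (s.split() for s in (ph_seq, note_seq, note_dur_seq, is_slur_seq))
assert len(phs) == len(nts) == len(durs) == len(slurs) == 29  # one entry per phoneme
for ph, nt, dur, sl in zip(phs, nts, durs, slurs):
    print(f'{ph:>3}  {nt:>8}  {dur}' + ('  (slur)' if sl == '1' else ''))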