kevinwang676 and Silentlin committed on
Commit
b2c3cd9
0 Parent(s):

Duplicate from Silentlin/DiffSinger


Co-authored-by: Jinglin Liu <[email protected]>

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +33 -0
  2. LICENSE +21 -0
  3. README.md +10 -0
  4. checkpoints/.gitattributes +1 -0
  5. checkpoints/.gitkeep +0 -0
  6. checkpoints/0102_xiaoma_pe/config.yaml +172 -0
  7. checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt +3 -0
  8. checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml +241 -0
  9. checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt +3 -0
  10. checkpoints/0228_opencpop_ds100_rel/config.yaml +342 -0
  11. checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt +3 -0
  12. checkpoints/0831_opencpop_ds1000/config.yaml +346 -0
  13. checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt +3 -0
  14. checkpoints/clean.py +12 -0
  15. checkpoints/cleaner.py +8 -0
  16. configs/config_base.yaml +42 -0
  17. configs/singing/base.yaml +42 -0
  18. configs/singing/fs2.yaml +3 -0
  19. configs/tts/base.yaml +95 -0
  20. configs/tts/base_zh.yaml +3 -0
  21. configs/tts/fs2.yaml +80 -0
  22. configs/tts/hifigan.yaml +21 -0
  23. configs/tts/lj/base_mel2wav.yaml +3 -0
  24. configs/tts/lj/base_text2mel.yaml +13 -0
  25. configs/tts/lj/fs2.yaml +3 -0
  26. configs/tts/lj/hifigan.yaml +3 -0
  27. configs/tts/lj/pwg.yaml +3 -0
  28. configs/tts/pwg.yaml +110 -0
  29. data/processed/ljspeech/dict.txt +77 -0
  30. data/processed/ljspeech/metadata_phone.csv +0 -0
  31. data/processed/ljspeech/mfa_dict.txt +0 -0
  32. data/processed/ljspeech/phone_set.json +1 -0
  33. data_gen/singing/binarize.py +398 -0
  34. data_gen/tts/base_binarizer.py +224 -0
  35. data_gen/tts/bin/binarize.py +20 -0
  36. data_gen/tts/binarizer_zh.py +59 -0
  37. data_gen/tts/data_gen_utils.py +347 -0
  38. data_gen/tts/txt_processors/base_text_processor.py +8 -0
  39. data_gen/tts/txt_processors/en.py +78 -0
  40. data_gen/tts/txt_processors/zh.py +41 -0
  41. data_gen/tts/txt_processors/zh_g2pM.py +72 -0
  42. docs/README-SVS-opencpop-cascade.md +111 -0
  43. docs/README-SVS-opencpop-e2e.md +107 -0
  44. docs/README-SVS-popcs.md +63 -0
  45. docs/README-SVS.md +76 -0
  46. docs/README-TTS.md +69 -0
  47. docs/README-zh.md +212 -0
  48. inference/svs/base_svs_infer.py +265 -0
  49. inference/svs/ds_cascade.py +56 -0
  50. inference/svs/ds_e2e.py +67 -0
.gitattributes ADDED
@@ -0,0 +1,33 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model_ckpt_steps* filter=lfs diff=lfs merge=lfs -text
+ checkpoints/0831_opencpop_ds1000 filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2021 Jinglin Liu
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: DiffSinger🎶 Diffusion for Singing Voice Synthesis
+ emoji: 🎶
+ colorFrom: purple
+ colorTo: blue
+ sdk: gradio
+ app_file: inference/svs/gradio/infer.py
+ pinned: false
+ duplicated_from: Silentlin/DiffSinger
+ ---
checkpoints/.gitattributes ADDED
@@ -0,0 +1 @@
+ model_ckpt_steps* filter=lfs diff=lfs merge=lfs -text
checkpoints/.gitkeep ADDED
File without changes
checkpoints/0102_xiaoma_pe/config.yaml ADDED
@@ -0,0 +1,172 @@
1
+ accumulate_grad_batches: 1
2
+ audio_num_mel_bins: 80
3
+ audio_sample_rate: 24000
4
+ base_config:
5
+ - configs/tts/lj/fs2.yaml
6
+ binarization_args:
7
+ shuffle: false
8
+ with_align: true
9
+ with_f0: true
10
+ with_f0cwt: true
11
+ with_spk_embed: true
12
+ with_txt: true
13
+ with_wav: false
14
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
15
+ binary_data_dir: data/binary/xiaoma1022_24k_128hop
16
+ check_val_every_n_epoch: 10
17
+ clip_grad_norm: 1
18
+ cwt_add_f0_loss: false
19
+ cwt_hidden_size: 128
20
+ cwt_layers: 2
21
+ cwt_loss: l1
22
+ cwt_std_scale: 0.8
23
+ debug: false
24
+ dec_ffn_kernel_size: 9
25
+ dec_layers: 4
26
+ decoder_type: fft
27
+ dict_dir: ''
28
+ dropout: 0.1
29
+ ds_workers: 4
30
+ dur_enc_hidden_stride_kernel:
31
+ - 0,2,3
32
+ - 0,2,3
33
+ - 0,1,3
34
+ dur_loss: mse
35
+ dur_predictor_kernel: 3
36
+ dur_predictor_layers: 2
37
+ enc_ffn_kernel_size: 9
38
+ enc_layers: 4
39
+ encoder_K: 8
40
+ encoder_type: fft
41
+ endless_ds: true
42
+ ffn_act: gelu
43
+ ffn_padding: SAME
44
+ fft_size: 512
45
+ fmax: 12000
46
+ fmin: 30
47
+ gen_dir_name: ''
48
+ hidden_size: 256
49
+ hop_size: 128
50
+ infer: false
51
+ lambda_commit: 0.25
52
+ lambda_energy: 0.1
53
+ lambda_f0: 1.0
54
+ lambda_ph_dur: 1.0
55
+ lambda_sent_dur: 1.0
56
+ lambda_uv: 1.0
57
+ lambda_word_dur: 1.0
58
+ load_ckpt: ''
59
+ log_interval: 100
60
+ loud_norm: false
61
+ lr: 2.0
62
+ max_epochs: 1000
63
+ max_eval_sentences: 1
64
+ max_eval_tokens: 60000
65
+ max_frames: 5000
66
+ max_input_tokens: 1550
67
+ max_sentences: 100000
68
+ max_tokens: 20000
69
+ max_updates: 60000
70
+ mel_loss: l1
71
+ mel_vmax: 1.5
72
+ mel_vmin: -6
73
+ min_level_db: -120
74
+ norm_type: gn
75
+ num_ckpt_keep: 3
76
+ num_heads: 2
77
+ num_sanity_val_steps: 5
78
+ num_spk: 1
79
+ num_test_samples: 20
80
+ num_valid_plots: 10
81
+ optimizer_adam_beta1: 0.9
82
+ optimizer_adam_beta2: 0.98
83
+ out_wav_norm: false
84
+ pitch_ar: false
85
+ pitch_enc_hidden_stride_kernel:
86
+ - 0,2,5
87
+ - 0,2,5
88
+ - 0,2,5
89
+ pitch_extractor_conv_layers: 2
90
+ pitch_loss: l1
91
+ pitch_norm: log
92
+ pitch_type: frame
93
+ pre_align_args:
94
+ allow_no_txt: false
95
+ denoise: false
96
+ forced_align: mfa
97
+ txt_processor: en
98
+ use_sox: false
99
+ use_tone: true
100
+ pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
101
+ predictor_dropout: 0.5
102
+ predictor_grad: 0.1
103
+ predictor_hidden: -1
104
+ predictor_kernel: 5
105
+ predictor_layers: 2
106
+ prenet_dropout: 0.5
107
+ prenet_hidden_size: 256
108
+ pretrain_fs_ckpt: ''
109
+ processed_data_dir: data/processed/ljspeech
110
+ profile_infer: false
111
+ raw_data_dir: data/raw/LJSpeech-1.1
112
+ ref_norm_layer: bn
113
+ reset_phone_dict: true
114
+ save_best: false
115
+ save_ckpt: true
116
+ save_codes:
117
+ - configs
118
+ - modules
119
+ - tasks
120
+ - utils
121
+ - usr
122
+ save_f0: false
123
+ save_gt: false
124
+ seed: 1234
125
+ sort_by_len: true
126
+ stop_token_weight: 5.0
127
+ task_cls: tasks.tts.pe.PitchExtractionTask
128
+ test_ids:
129
+ - 68
130
+ - 70
131
+ - 74
132
+ - 87
133
+ - 110
134
+ - 172
135
+ - 190
136
+ - 215
137
+ - 231
138
+ - 294
139
+ - 316
140
+ - 324
141
+ - 402
142
+ - 422
143
+ - 485
144
+ - 500
145
+ - 505
146
+ - 508
147
+ - 509
148
+ - 519
149
+ test_input_dir: ''
150
+ test_num: 523
151
+ test_set_name: test
152
+ train_set_name: train
153
+ use_denoise: false
154
+ use_energy_embed: false
155
+ use_gt_dur: false
156
+ use_gt_f0: false
157
+ use_pitch_embed: true
158
+ use_pos_embed: true
159
+ use_spk_embed: false
160
+ use_spk_id: false
161
+ use_split_spk_id: false
162
+ use_uv: true
163
+ use_var_enc: false
164
+ val_check_interval: 2000
165
+ valid_num: 348
166
+ valid_set_name: valid
167
+ vocoder: pwg
168
+ vocoder_ckpt: ''
169
+ warmup_updates: 2000
170
+ weight_decay: 0
171
+ win_size: 512
172
+ work_dir: checkpoints/0102_xiaoma_pe
checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53942abd8cb908b6d161e1ad7ff3d7d0dd6b204d5bf050613c9d00c56b185ceb
+ size 13047222
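The three lines above are a Git LFS pointer, not the checkpoint itself; the binary is materialized by `git lfs pull` according to the rules in `.gitattributes`. As a minimal illustration (not part of the commit), the sketch below parses such a pointer file and reports whether a local checkpoint still needs to be fetched; the helper name and logic are assumptions based only on the `version`/`oid`/`size` layout shown above.

```python
import sys

def read_lfs_pointer(path):
    """Return the pointer fields (version, oid, size), or None if the file is a real binary."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            first = f.readline()
            if not first.startswith('version https://git-lfs.github.com/spec/v1'):
                return None  # already the fetched binary, not a pointer
            fields = dict(line.strip().split(' ', 1) for line in f if line.strip())
            fields['version'] = first.split(' ', 1)[1].strip()
            return fields
    except UnicodeDecodeError:
        return None  # binary content cannot be a text pointer

if __name__ == '__main__':
    info = read_lfs_pointer(sys.argv[1])
    if info is None:
        print('real file: LFS object already fetched')
    else:
        print(f"pointer only: {info['oid']} ({info['size']} bytes) -- run `git lfs pull`")
```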
checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml ADDED
@@ -0,0 +1,241 @@
1
+ accumulate_grad_batches: 1
2
+ adam_b1: 0.8
3
+ adam_b2: 0.99
4
+ amp: false
5
+ audio_num_mel_bins: 80
6
+ audio_sample_rate: 24000
7
+ aux_context_window: 0
8
+ #base_config:
9
+ #- egs/egs_bases/singing/pwg.yaml
10
+ #- egs/egs_bases/tts/vocoder/hifigan.yaml
11
+ binarization_args:
12
+ reset_phone_dict: true
13
+ reset_word_dict: true
14
+ shuffle: false
15
+ trim_eos_bos: false
16
+ trim_sil: false
17
+ with_align: false
18
+ with_f0: true
19
+ with_f0cwt: false
20
+ with_linear: false
21
+ with_spk_embed: false
22
+ with_spk_id: true
23
+ with_txt: false
24
+ with_wav: true
25
+ with_word: false
26
+ binarizer_cls: data_gen.tts.singing.binarize.SingingBinarizer
27
+ binary_data_dir: data/binary/big_popcs_24k_hop128
28
+ check_val_every_n_epoch: 10
29
+ clip_grad_norm: 1
30
+ clip_grad_value: 0
31
+ datasets: []
32
+ debug: false
33
+ dec_ffn_kernel_size: 9
34
+ dec_layers: 4
35
+ dict_dir: ''
36
+ disc_start_steps: 40000
37
+ discriminator_grad_norm: 1
38
+ discriminator_optimizer_params:
39
+ eps: 1.0e-06
40
+ lr: 0.0002
41
+ weight_decay: 0.0
42
+ discriminator_params:
43
+ bias: true
44
+ conv_channels: 64
45
+ in_channels: 1
46
+ kernel_size: 3
47
+ layers: 10
48
+ nonlinear_activation: LeakyReLU
49
+ nonlinear_activation_params:
50
+ negative_slope: 0.2
51
+ out_channels: 1
52
+ use_weight_norm: true
53
+ discriminator_scheduler_params:
54
+ gamma: 0.999
55
+ step_size: 600
56
+ dropout: 0.1
57
+ ds_workers: 1
58
+ enc_ffn_kernel_size: 9
59
+ enc_layers: 4
60
+ endless_ds: true
61
+ ffn_act: gelu
62
+ ffn_padding: SAME
63
+ fft_size: 512
64
+ fmax: 12000
65
+ fmin: 30
66
+ frames_multiple: 1
67
+ gen_dir_name: ''
68
+ generator_grad_norm: 10
69
+ generator_optimizer_params:
70
+ eps: 1.0e-06
71
+ lr: 0.0002
72
+ weight_decay: 0.0
73
+ generator_params:
74
+ aux_channels: 80
75
+ dropout: 0.0
76
+ gate_channels: 128
77
+ in_channels: 1
78
+ kernel_size: 3
79
+ layers: 30
80
+ out_channels: 1
81
+ residual_channels: 64
82
+ skip_channels: 64
83
+ stacks: 3
84
+ upsample_net: ConvInUpsampleNetwork
85
+ upsample_params:
86
+ upsample_scales:
87
+ - 2
88
+ - 4
89
+ - 4
90
+ - 4
91
+ use_nsf: false
92
+ use_pitch_embed: true
93
+ use_weight_norm: true
94
+ generator_scheduler_params:
95
+ gamma: 0.999
96
+ step_size: 600
97
+ griffin_lim_iters: 60
98
+ hidden_size: 256
99
+ hop_size: 128
100
+ infer: false
101
+ lambda_adv: 1.0
102
+ lambda_cdisc: 4.0
103
+ lambda_energy: 0.0
104
+ lambda_f0: 0.0
105
+ lambda_mel: 5.0
106
+ lambda_mel_adv: 1.0
107
+ lambda_ph_dur: 0.0
108
+ lambda_sent_dur: 0.0
109
+ lambda_uv: 0.0
110
+ lambda_word_dur: 0.0
111
+ load_ckpt: ''
112
+ loud_norm: false
113
+ lr: 2.0
114
+ max_epochs: 1000
115
+ max_frames: 2400
116
+ max_input_tokens: 1550
117
+ max_samples: 8192
118
+ max_sentences: 20
119
+ max_tokens: 24000
120
+ max_updates: 3000000
121
+ max_valid_sentences: 1
122
+ max_valid_tokens: 60000
123
+ mel_loss: ssim:0.5|l1:0.5
124
+ mel_vmax: 1.5
125
+ mel_vmin: -6
126
+ min_frames: 0
127
+ min_level_db: -120
128
+ num_ckpt_keep: 3
129
+ num_heads: 2
130
+ num_mels: 80
131
+ num_sanity_val_steps: 5
132
+ num_spk: 100
133
+ num_test_samples: 0
134
+ num_valid_plots: 10
135
+ optimizer_adam_beta1: 0.9
136
+ optimizer_adam_beta2: 0.98
137
+ out_wav_norm: false
138
+ pitch_extractor: parselmouth
139
+ pitch_type: frame
140
+ pre_align_args:
141
+ allow_no_txt: false
142
+ denoise: false
143
+ sox_resample: true
144
+ sox_to_wav: false
145
+ trim_sil: false
146
+ txt_processor: zh
147
+ use_tone: false
148
+ pre_align_cls: data_gen.tts.singing.pre_align.SingingPreAlign
149
+ predictor_grad: 0.0
150
+ print_nan_grads: false
151
+ processed_data_dir: ''
152
+ profile_infer: false
153
+ raw_data_dir: ''
154
+ ref_level_db: 20
155
+ rename_tmux: true
156
+ rerun_gen: true
157
+ resblock: '1'
158
+ resblock_dilation_sizes:
159
+ - - 1
160
+ - 3
161
+ - 5
162
+ - - 1
163
+ - 3
164
+ - 5
165
+ - - 1
166
+ - 3
167
+ - 5
168
+ resblock_kernel_sizes:
169
+ - 3
170
+ - 7
171
+ - 11
172
+ resume_from_checkpoint: 0
173
+ save_best: true
174
+ save_codes: []
175
+ save_f0: true
176
+ save_gt: true
177
+ scheduler: rsqrt
178
+ seed: 1234
179
+ sort_by_len: true
180
+ stft_loss_params:
181
+ fft_sizes:
182
+ - 1024
183
+ - 2048
184
+ - 512
185
+ hop_sizes:
186
+ - 120
187
+ - 240
188
+ - 50
189
+ win_lengths:
190
+ - 600
191
+ - 1200
192
+ - 240
193
+ window: hann_window
194
+ task_cls: tasks.vocoder.hifigan.HifiGanTask
195
+ tb_log_interval: 100
196
+ test_ids: []
197
+ test_input_dir: ''
198
+ test_num: 50
199
+ test_prefixes: []
200
+ test_set_name: test
201
+ train_set_name: train
202
+ train_sets: ''
203
+ upsample_initial_channel: 512
204
+ upsample_kernel_sizes:
205
+ - 16
206
+ - 16
207
+ - 4
208
+ - 4
209
+ upsample_rates:
210
+ - 8
211
+ - 4
212
+ - 2
213
+ - 2
214
+ use_cdisc: false
215
+ use_cond_disc: false
216
+ use_fm_loss: false
217
+ use_gt_dur: true
218
+ use_gt_f0: true
219
+ use_mel_loss: true
220
+ use_ms_stft: false
221
+ use_pitch_embed: true
222
+ use_ref_enc: true
223
+ use_spec_disc: false
224
+ use_spk_embed: false
225
+ use_spk_id: false
226
+ use_split_spk_id: false
227
+ val_check_interval: 2000
228
+ valid_infer_interval: 10000
229
+ valid_monitor_key: val_loss
230
+ valid_monitor_mode: min
231
+ valid_set_name: valid
232
+ vocoder: pwg
233
+ vocoder_ckpt: ''
234
+ vocoder_denoise_c: 0.0
235
+ warmup_updates: 8000
236
+ weight_decay: 0
237
+ win_length: null
238
+ win_size: 512
239
+ window: hann
240
+ word_size: 3000
241
+ work_dir: checkpoints/0109_hifigan_bigpopcs_hop128
checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1cb68f3ce0c46ba0a8b6d49718f1fffdf5bd7bcab769a986fd2fd129835cc1d1
+ size 55827436
checkpoints/0228_opencpop_ds100_rel/config.yaml ADDED
@@ -0,0 +1,342 @@
1
+ K_step: 100
2
+ accumulate_grad_batches: 1
3
+ audio_num_mel_bins: 80
4
+ audio_sample_rate: 24000
5
+ base_config:
6
+ - usr/configs/popcs_ds_beta6.yaml
7
+ - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
8
+ binarization_args:
9
+ shuffle: false
10
+ with_align: true
11
+ with_f0: true
12
+ with_f0cwt: true
13
+ with_spk_embed: false
14
+ with_txt: true
15
+ with_wav: true
16
+ binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
17
+ binary_data_dir: data/binary/opencpop-midi-dp
18
+ check_val_every_n_epoch: 10
19
+ clip_grad_norm: 1
20
+ content_cond_steps: []
21
+ cwt_add_f0_loss: false
22
+ cwt_hidden_size: 128
23
+ cwt_layers: 2
24
+ cwt_loss: l1
25
+ cwt_std_scale: 0.8
26
+ datasets:
27
+ - popcs
28
+ debug: false
29
+ dec_ffn_kernel_size: 9
30
+ dec_layers: 4
31
+ decay_steps: 50000
32
+ decoder_type: fft
33
+ dict_dir: ''
34
+ diff_decoder_type: wavenet
35
+ diff_loss_type: l1
36
+ dilation_cycle_length: 4
37
+ dropout: 0.1
38
+ ds_workers: 4
39
+ dur_enc_hidden_stride_kernel:
40
+ - 0,2,3
41
+ - 0,2,3
42
+ - 0,1,3
43
+ dur_loss: mse
44
+ dur_predictor_kernel: 3
45
+ dur_predictor_layers: 5
46
+ enc_ffn_kernel_size: 9
47
+ enc_layers: 4
48
+ encoder_K: 8
49
+ encoder_type: fft
50
+ endless_ds: true
51
+ ffn_act: gelu
52
+ ffn_padding: SAME
53
+ fft_size: 512
54
+ fmax: 12000
55
+ fmin: 30
56
+ fs2_ckpt: ''
57
+ gaussian_start: true
58
+ gen_dir_name: ''
59
+ gen_tgt_spk_id: -1
60
+ hidden_size: 256
61
+ hop_size: 128
62
+ infer: false
63
+ keep_bins: 80
64
+ lambda_commit: 0.25
65
+ lambda_energy: 0.0
66
+ lambda_f0: 0.0
67
+ lambda_ph_dur: 1.0
68
+ lambda_sent_dur: 1.0
69
+ lambda_uv: 0.0
70
+ lambda_word_dur: 1.0
71
+ load_ckpt: ''
72
+ log_interval: 100
73
+ loud_norm: false
74
+ lr: 0.001
75
+ max_beta: 0.06
76
+ max_epochs: 1000
77
+ max_eval_sentences: 1
78
+ max_eval_tokens: 60000
79
+ max_frames: 8000
80
+ max_input_tokens: 1550
81
+ max_sentences: 48
82
+ max_tokens: 40000
83
+ max_updates: 160000
84
+ mel_loss: ssim:0.5|l1:0.5
85
+ mel_vmax: 1.5
86
+ mel_vmin: -6.0
87
+ min_level_db: -120
88
+ norm_type: gn
89
+ num_ckpt_keep: 3
90
+ num_heads: 2
91
+ num_sanity_val_steps: 1
92
+ num_spk: 1
93
+ num_test_samples: 0
94
+ num_valid_plots: 10
95
+ optimizer_adam_beta1: 0.9
96
+ optimizer_adam_beta2: 0.98
97
+ out_wav_norm: false
98
+ pe_ckpt: checkpoints/0102_xiaoma_pe
99
+ pe_enable: true
100
+ pitch_ar: false
101
+ pitch_enc_hidden_stride_kernel:
102
+ - 0,2,5
103
+ - 0,2,5
104
+ - 0,2,5
105
+ pitch_extractor: parselmouth
106
+ pitch_loss: l1
107
+ pitch_norm: log
108
+ pitch_type: frame
109
+ pre_align_args:
110
+ allow_no_txt: false
111
+ denoise: false
112
+ forced_align: mfa
113
+ txt_processor: zh_g2pM
114
+ use_sox: true
115
+ use_tone: false
116
+ pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
117
+ predictor_dropout: 0.5
118
+ predictor_grad: 0.1
119
+ predictor_hidden: -1
120
+ predictor_kernel: 5
121
+ predictor_layers: 5
122
+ prenet_dropout: 0.5
123
+ prenet_hidden_size: 256
124
+ pretrain_fs_ckpt: ''
125
+ processed_data_dir: data/processed/popcs
126
+ profile_infer: false
127
+ raw_data_dir: data/raw/popcs
128
+ ref_norm_layer: bn
129
+ rel_pos: true
130
+ reset_phone_dict: true
131
+ residual_channels: 256
132
+ residual_layers: 20
133
+ save_best: false
134
+ save_ckpt: true
135
+ save_codes:
136
+ - configs
137
+ - modules
138
+ - tasks
139
+ - utils
140
+ - usr
141
+ save_f0: true
142
+ save_gt: false
143
+ schedule_type: linear
144
+ seed: 1234
145
+ sort_by_len: true
146
+ spec_max:
147
+ - -0.79453
148
+ - -0.81116
149
+ - -0.61631
150
+ - -0.30679
151
+ - -0.13863
152
+ - -0.050652
153
+ - -0.11563
154
+ - -0.10679
155
+ - -0.091068
156
+ - -0.062174
157
+ - -0.075302
158
+ - -0.072217
159
+ - -0.063815
160
+ - -0.073299
161
+ - 0.007361
162
+ - -0.072508
163
+ - -0.050234
164
+ - -0.16534
165
+ - -0.26928
166
+ - -0.20782
167
+ - -0.20823
168
+ - -0.11702
169
+ - -0.070128
170
+ - -0.065868
171
+ - -0.012675
172
+ - 0.0015121
173
+ - -0.089902
174
+ - -0.21392
175
+ - -0.23789
176
+ - -0.28922
177
+ - -0.30405
178
+ - -0.23029
179
+ - -0.22088
180
+ - -0.21542
181
+ - -0.29367
182
+ - -0.30137
183
+ - -0.38281
184
+ - -0.4359
185
+ - -0.28681
186
+ - -0.46855
187
+ - -0.57485
188
+ - -0.47022
189
+ - -0.54266
190
+ - -0.44848
191
+ - -0.6412
192
+ - -0.687
193
+ - -0.6486
194
+ - -0.76436
195
+ - -0.49971
196
+ - -0.71068
197
+ - -0.69724
198
+ - -0.61487
199
+ - -0.55843
200
+ - -0.69773
201
+ - -0.57502
202
+ - -0.70919
203
+ - -0.82431
204
+ - -0.84213
205
+ - -0.90431
206
+ - -0.8284
207
+ - -0.77945
208
+ - -0.82758
209
+ - -0.87699
210
+ - -1.0532
211
+ - -1.0766
212
+ - -1.1198
213
+ - -1.0185
214
+ - -0.98983
215
+ - -1.0001
216
+ - -1.0756
217
+ - -1.0024
218
+ - -1.0304
219
+ - -1.0579
220
+ - -1.0188
221
+ - -1.05
222
+ - -1.0842
223
+ - -1.0923
224
+ - -1.1223
225
+ - -1.2381
226
+ - -1.6467
227
+ spec_min:
228
+ - -6.0
229
+ - -6.0
230
+ - -6.0
231
+ - -6.0
232
+ - -6.0
233
+ - -6.0
234
+ - -6.0
235
+ - -6.0
236
+ - -6.0
237
+ - -6.0
238
+ - -6.0
239
+ - -6.0
240
+ - -6.0
241
+ - -6.0
242
+ - -6.0
243
+ - -6.0
244
+ - -6.0
245
+ - -6.0
246
+ - -6.0
247
+ - -6.0
248
+ - -6.0
249
+ - -6.0
250
+ - -6.0
251
+ - -6.0
252
+ - -6.0
253
+ - -6.0
254
+ - -6.0
255
+ - -6.0
256
+ - -6.0
257
+ - -6.0
258
+ - -6.0
259
+ - -6.0
260
+ - -6.0
261
+ - -6.0
262
+ - -6.0
263
+ - -6.0
264
+ - -6.0
265
+ - -6.0
266
+ - -6.0
267
+ - -6.0
268
+ - -6.0
269
+ - -6.0
270
+ - -6.0
271
+ - -6.0
272
+ - -6.0
273
+ - -6.0
274
+ - -6.0
275
+ - -6.0
276
+ - -6.0
277
+ - -6.0
278
+ - -6.0
279
+ - -6.0
280
+ - -6.0
281
+ - -6.0
282
+ - -6.0
283
+ - -6.0
284
+ - -6.0
285
+ - -6.0
286
+ - -6.0
287
+ - -6.0
288
+ - -6.0
289
+ - -6.0
290
+ - -6.0
291
+ - -6.0
292
+ - -6.0
293
+ - -6.0
294
+ - -6.0
295
+ - -6.0
296
+ - -6.0
297
+ - -6.0
298
+ - -6.0
299
+ - -6.0
300
+ - -6.0
301
+ - -6.0
302
+ - -6.0
303
+ - -6.0
304
+ - -6.0
305
+ - -6.0
306
+ - -6.0
307
+ - -6.0
308
+ spk_cond_steps: []
309
+ stop_token_weight: 5.0
310
+ task_cls: usr.diffsinger_task.DiffSingerMIDITask
311
+ test_ids: []
312
+ test_input_dir: ''
313
+ test_num: 0
314
+ test_prefixes:
315
+ - "popcs-\u8BF4\u6563\u5C31\u6563"
316
+ - "popcs-\u9690\u5F62\u7684\u7FC5\u8180"
317
+ test_set_name: test
318
+ timesteps: 100
319
+ train_set_name: train
320
+ use_denoise: false
321
+ use_energy_embed: false
322
+ use_gt_dur: false
323
+ use_gt_f0: false
324
+ use_midi: true
325
+ use_nsf: true
326
+ use_pitch_embed: false
327
+ use_pos_embed: true
328
+ use_spk_embed: false
329
+ use_spk_id: false
330
+ use_split_spk_id: false
331
+ use_uv: true
332
+ use_var_enc: false
333
+ val_check_interval: 2000
334
+ valid_num: 0
335
+ valid_set_name: valid
336
+ vocoder: vocoders.hifigan.HifiGAN
337
+ vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
338
+ warmup_updates: 2000
339
+ wav2spec_eps: 1e-6
340
+ weight_decay: 0
341
+ win_size: 512
342
+ work_dir: checkpoints/0228_opencpop_ds100_rel
checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a8261f7415bb39eb80a19d4c27c0ea084f63af2fdf6b82e63fcbd9cd82fc90c
+ size 170226367
checkpoints/0831_opencpop_ds1000/config.yaml ADDED
@@ -0,0 +1,346 @@
1
+ K_step: 1000
2
+ accumulate_grad_batches: 1
3
+ audio_num_mel_bins: 80
4
+ audio_sample_rate: 24000
5
+ base_config:
6
+ - usr/configs/popcs_ds_beta6.yaml
7
+ - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
8
+ binarization_args:
9
+ shuffle: false
10
+ with_align: true
11
+ with_f0: true
12
+ with_f0cwt: true
13
+ with_spk_embed: false
14
+ with_txt: true
15
+ with_wav: true
16
+ binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
17
+ binary_data_dir: data/binary/opencpop-midi-dp
18
+ check_val_every_n_epoch: 10
19
+ clip_grad_norm: 1
20
+ content_cond_steps: []
21
+ cwt_add_f0_loss: false
22
+ cwt_hidden_size: 128
23
+ cwt_layers: 2
24
+ cwt_loss: l1
25
+ cwt_std_scale: 0.8
26
+ datasets:
27
+ - opencpop
28
+ debug: false
29
+ dec_ffn_kernel_size: 9
30
+ dec_layers: 4
31
+ decay_steps: 50000
32
+ decoder_type: fft
33
+ dict_dir: ''
34
+ diff_decoder_type: wavenet
35
+ diff_loss_type: l1
36
+ dilation_cycle_length: 4
37
+ dropout: 0.1
38
+ ds_workers: 4
39
+ dur_enc_hidden_stride_kernel:
40
+ - 0,2,3
41
+ - 0,2,3
42
+ - 0,1,3
43
+ dur_loss: mse
44
+ dur_predictor_kernel: 3
45
+ dur_predictor_layers: 5
46
+ enc_ffn_kernel_size: 9
47
+ enc_layers: 4
48
+ encoder_K: 8
49
+ encoder_type: fft
50
+ endless_ds: true
51
+ ffn_act: gelu
52
+ ffn_padding: SAME
53
+ fft_size: 512
54
+ fmax: 12000
55
+ fmin: 30
56
+ fs2_ckpt: ''
57
+ gaussian_start: true
58
+ gen_dir_name: ''
59
+ gen_tgt_spk_id: -1
60
+ hidden_size: 256
61
+ hop_size: 128
62
+ infer: false
63
+ keep_bins: 80
64
+ lambda_commit: 0.25
65
+ lambda_energy: 0.0
66
+ lambda_f0: 0.0
67
+ lambda_ph_dur: 1.0
68
+ lambda_sent_dur: 1.0
69
+ lambda_uv: 0.0
70
+ lambda_word_dur: 1.0
71
+ load_ckpt: ''
72
+ log_interval: 100
73
+ loud_norm: false
74
+ lr: 0.001
75
+ max_beta: 0.02
76
+ max_epochs: 1000
77
+ max_eval_sentences: 1
78
+ max_eval_tokens: 60000
79
+ max_frames: 8000
80
+ max_input_tokens: 1550
81
+ max_sentences: 48
82
+ max_tokens: 36000
83
+ max_updates: 320000
84
+ mel_loss: ssim:0.5|l1:0.5
85
+ mel_vmax: 1.5
86
+ mel_vmin: -6.0
87
+ min_level_db: -120
88
+ norm_type: gn
89
+ num_ckpt_keep: 3
90
+ num_heads: 2
91
+ num_sanity_val_steps: 1
92
+ num_spk: 1
93
+ num_test_samples: 0
94
+ num_valid_plots: 10
95
+ optimizer_adam_beta1: 0.9
96
+ optimizer_adam_beta2: 0.98
97
+ out_wav_norm: false
98
+ pe_ckpt: checkpoints/0102_xiaoma_pe
99
+ pe_enable: true
100
+ pitch_ar: false
101
+ pitch_enc_hidden_stride_kernel:
102
+ - 0,2,5
103
+ - 0,2,5
104
+ - 0,2,5
105
+ pitch_extractor: parselmouth
106
+ pitch_loss: l1
107
+ pitch_norm: log
108
+ pitch_type: frame
109
+ pre_align_args:
110
+ allow_no_txt: false
111
+ denoise: false
112
+ forced_align: mfa
113
+ txt_processor: zh_g2pM
114
+ use_sox: true
115
+ use_tone: false
116
+ pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
117
+ predictor_dropout: 0.5
118
+ predictor_grad: 0.1
119
+ predictor_hidden: -1
120
+ predictor_kernel: 5
121
+ predictor_layers: 5
122
+ prenet_dropout: 0.5
123
+ prenet_hidden_size: 256
124
+ pretrain_fs_ckpt: ''
125
+ processed_data_dir: xxx
126
+ profile_infer: false
127
+ raw_data_dir: data/raw/opencpop/segments
128
+ ref_norm_layer: bn
129
+ rel_pos: true
130
+ reset_phone_dict: true
131
+ residual_channels: 256
132
+ residual_layers: 20
133
+ save_best: false
134
+ save_ckpt: true
135
+ save_codes:
136
+ - configs
137
+ - modules
138
+ - tasks
139
+ - utils
140
+ - usr
141
+ save_f0: true
142
+ save_gt: false
143
+ schedule_type: linear
144
+ seed: 1234
145
+ sort_by_len: true
146
+ spec_max:
147
+ - -0.79453
148
+ - -0.81116
149
+ - -0.61631
150
+ - -0.30679
151
+ - -0.13863
152
+ - -0.050652
153
+ - -0.11563
154
+ - -0.10679
155
+ - -0.091068
156
+ - -0.062174
157
+ - -0.075302
158
+ - -0.072217
159
+ - -0.063815
160
+ - -0.073299
161
+ - 0.007361
162
+ - -0.072508
163
+ - -0.050234
164
+ - -0.16534
165
+ - -0.26928
166
+ - -0.20782
167
+ - -0.20823
168
+ - -0.11702
169
+ - -0.070128
170
+ - -0.065868
171
+ - -0.012675
172
+ - 0.0015121
173
+ - -0.089902
174
+ - -0.21392
175
+ - -0.23789
176
+ - -0.28922
177
+ - -0.30405
178
+ - -0.23029
179
+ - -0.22088
180
+ - -0.21542
181
+ - -0.29367
182
+ - -0.30137
183
+ - -0.38281
184
+ - -0.4359
185
+ - -0.28681
186
+ - -0.46855
187
+ - -0.57485
188
+ - -0.47022
189
+ - -0.54266
190
+ - -0.44848
191
+ - -0.6412
192
+ - -0.687
193
+ - -0.6486
194
+ - -0.76436
195
+ - -0.49971
196
+ - -0.71068
197
+ - -0.69724
198
+ - -0.61487
199
+ - -0.55843
200
+ - -0.69773
201
+ - -0.57502
202
+ - -0.70919
203
+ - -0.82431
204
+ - -0.84213
205
+ - -0.90431
206
+ - -0.8284
207
+ - -0.77945
208
+ - -0.82758
209
+ - -0.87699
210
+ - -1.0532
211
+ - -1.0766
212
+ - -1.1198
213
+ - -1.0185
214
+ - -0.98983
215
+ - -1.0001
216
+ - -1.0756
217
+ - -1.0024
218
+ - -1.0304
219
+ - -1.0579
220
+ - -1.0188
221
+ - -1.05
222
+ - -1.0842
223
+ - -1.0923
224
+ - -1.1223
225
+ - -1.2381
226
+ - -1.6467
227
+ spec_min:
228
+ - -6.0
229
+ - -6.0
230
+ - -6.0
231
+ - -6.0
232
+ - -6.0
233
+ - -6.0
234
+ - -6.0
235
+ - -6.0
236
+ - -6.0
237
+ - -6.0
238
+ - -6.0
239
+ - -6.0
240
+ - -6.0
241
+ - -6.0
242
+ - -6.0
243
+ - -6.0
244
+ - -6.0
245
+ - -6.0
246
+ - -6.0
247
+ - -6.0
248
+ - -6.0
249
+ - -6.0
250
+ - -6.0
251
+ - -6.0
252
+ - -6.0
253
+ - -6.0
254
+ - -6.0
255
+ - -6.0
256
+ - -6.0
257
+ - -6.0
258
+ - -6.0
259
+ - -6.0
260
+ - -6.0
261
+ - -6.0
262
+ - -6.0
263
+ - -6.0
264
+ - -6.0
265
+ - -6.0
266
+ - -6.0
267
+ - -6.0
268
+ - -6.0
269
+ - -6.0
270
+ - -6.0
271
+ - -6.0
272
+ - -6.0
273
+ - -6.0
274
+ - -6.0
275
+ - -6.0
276
+ - -6.0
277
+ - -6.0
278
+ - -6.0
279
+ - -6.0
280
+ - -6.0
281
+ - -6.0
282
+ - -6.0
283
+ - -6.0
284
+ - -6.0
285
+ - -6.0
286
+ - -6.0
287
+ - -6.0
288
+ - -6.0
289
+ - -6.0
290
+ - -6.0
291
+ - -6.0
292
+ - -6.0
293
+ - -6.0
294
+ - -6.0
295
+ - -6.0
296
+ - -6.0
297
+ - -6.0
298
+ - -6.0
299
+ - -6.0
300
+ - -6.0
301
+ - -6.0
302
+ - -6.0
303
+ - -6.0
304
+ - -6.0
305
+ - -6.0
306
+ - -6.0
307
+ - -6.0
308
+ spk_cond_steps: []
309
+ stop_token_weight: 5.0
310
+ task_cls: usr.diffsinger_task.DiffSingerMIDITask
311
+ test_ids: []
312
+ test_input_dir: ''
313
+ test_num: 0
314
+ test_prefixes:
315
+ - '2044'
316
+ - '2086'
317
+ - '2092'
318
+ - '2093'
319
+ - '2100'
320
+ test_set_name: test
321
+ timesteps: 1000
322
+ train_set_name: train
323
+ use_denoise: false
324
+ use_energy_embed: false
325
+ use_gt_dur: false
326
+ use_gt_f0: false
327
+ use_midi: true
328
+ use_nsf: true
329
+ use_pitch_embed: false
330
+ use_pos_embed: true
331
+ use_spk_embed: false
332
+ use_spk_id: false
333
+ use_split_spk_id: false
334
+ use_uv: true
335
+ use_var_enc: false
336
+ val_check_interval: 2000
337
+ valid_num: 0
338
+ valid_set_name: valid
339
+ vocoder: vocoders.hifigan.HifiGAN
340
+ vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
341
+ warmup_updates: 2000
342
+ wav2spec_eps: 1e-6
343
+ weight_decay: 0
344
+ win_size: 512
345
+ work_dir: checkpoints/0831_opencpop_ds1000
346
+ pndm_speedup: 10
checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:954a31208ee6afb6240d09454bb204c4fbc63cf70e2586bed0ab29b1dc964c9e
+ size 170269591
checkpoints/clean.py ADDED
@@ -0,0 +1,12 @@
+ import sys
+ import torch
+
+ if __name__ == '__main__':
+     ckpt_path = sys.argv[1]
+     checkpoint = torch.load(ckpt_path, map_location='cpu')
+     print(checkpoint['state_dict'].keys())
+     if 'model' in checkpoint['state_dict']:
+         checkpoint = {'state_dict': {'model': checkpoint['state_dict']['model']}}
+     else:
+         checkpoint = {'state_dict': {'model_gen': checkpoint['state_dict']['model_gen']}}
+     torch.save(checkpoint, ckpt_path, _use_new_zipfile_serialization=False)
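`clean.py` above slims a training checkpoint in place: it drops optimizer and scheduler state and keeps only the `model` weights (or `model_gen` for vocoder checkpoints). A hedged usage sketch, assuming the checkpoint layout produced by this repo's training tasks; the path below is one of the checkpoints added in this commit.

```python
# Shell invocation from the repo root:
#   python checkpoints/clean.py checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt
#
# Quick sanity check that the slimmed file still loads and holds only model weights:
import torch

ckpt = torch.load('checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt',
                  map_location='cpu')
assert set(ckpt.keys()) == {'state_dict'}                        # training state removed
assert set(ckpt['state_dict'].keys()) <= {'model', 'model_gen'}  # only model weights kept
weights = next(iter(ckpt['state_dict'].values()))
print(sum(p.numel() for p in weights.values()), 'parameters kept')
```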
checkpoints/cleaner.py ADDED
@@ -0,0 +1,8 @@
+ import sys
+ import torch
+
+ if __name__ == '__main__':
+     ckpt_path = sys.argv[1]
+     checkpoint = torch.load(ckpt_path, map_location='cpu')
+     checkpoint = {'state_dict': checkpoint['state_dict']}
+     torch.save(checkpoint, ckpt_path, _use_new_zipfile_serialization=False)
configs/config_base.yaml ADDED
@@ -0,0 +1,42 @@
1
+ # task
2
+ binary_data_dir: ''
3
+ work_dir: '' # experiment directory.
4
+ infer: false # infer
5
+ seed: 1234
6
+ debug: false
7
+ save_codes:
8
+ - configs
9
+ - modules
10
+ - tasks
11
+ - utils
12
+ - usr
13
+
14
+ #############
15
+ # dataset
16
+ #############
17
+ ds_workers: 1
18
+ test_num: 100
19
+ valid_num: 100
20
+ endless_ds: false
21
+ sort_by_len: true
22
+
23
+ #########
24
+ # train and eval
25
+ #########
26
+ load_ckpt: ''
27
+ save_ckpt: true
28
+ save_best: false
29
+ num_ckpt_keep: 3
30
+ clip_grad_norm: 0
31
+ accumulate_grad_batches: 1
32
+ log_interval: 100
33
+ num_sanity_val_steps: 5 # steps of validation at the beginning
34
+ check_val_every_n_epoch: 10
35
+ val_check_interval: 2000
36
+ max_epochs: 1000
37
+ max_updates: 160000
38
+ max_tokens: 31250
39
+ max_sentences: 100000
40
+ max_eval_tokens: -1
41
+ max_eval_sentences: -1
42
+ test_input_dir: ''
configs/singing/base.yaml ADDED
@@ -0,0 +1,42 @@
1
+ base_config:
2
+ - configs/tts/base.yaml
3
+ - configs/tts/base_zh.yaml
4
+
5
+
6
+ datasets: []
7
+ test_prefixes: []
8
+ test_num: 0
9
+ valid_num: 0
10
+
11
+ pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
12
+ binarizer_cls: data_gen.singing.binarize.SingingBinarizer
13
+ pre_align_args:
14
+ use_tone: false # for ZH
15
+ forced_align: mfa
16
+ use_sox: true
17
+ hop_size: 128 # Hop size.
18
+ fft_size: 512 # FFT size.
19
+ win_size: 512 # FFT size.
20
+ max_frames: 8000
21
+ fmin: 50 # Minimum freq in mel basis calculation.
22
+ fmax: 11025 # Maximum frequency in mel basis calculation.
23
+ pitch_type: frame
24
+
25
+ hidden_size: 256
26
+ mel_loss: "ssim:0.5|l1:0.5"
27
+ lambda_f0: 0.0
28
+ lambda_uv: 0.0
29
+ lambda_energy: 0.0
30
+ lambda_ph_dur: 0.0
31
+ lambda_sent_dur: 0.0
32
+ lambda_word_dur: 0.0
33
+ predictor_grad: 0.0
34
+ use_spk_embed: true
35
+ use_spk_id: false
36
+
37
+ max_tokens: 20000
38
+ max_updates: 400000
39
+ num_spk: 100
40
+ save_f0: true
41
+ use_gt_dur: true
42
+ use_gt_f0: true
configs/singing/fs2.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+ - configs/tts/fs2.yaml
+ - configs/singing/base.yaml
configs/tts/base.yaml ADDED
@@ -0,0 +1,95 @@
1
+ # task
2
+ base_config: configs/config_base.yaml
3
+ task_cls: ''
4
+ #############
5
+ # dataset
6
+ #############
7
+ raw_data_dir: ''
8
+ processed_data_dir: ''
9
+ binary_data_dir: ''
10
+ dict_dir: ''
11
+ pre_align_cls: ''
12
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
13
+ pre_align_args:
14
+ use_tone: true # for ZH
15
+ forced_align: mfa
16
+ use_sox: false
17
+ txt_processor: en
18
+ allow_no_txt: false
19
+ denoise: false
20
+ binarization_args:
21
+ shuffle: false
22
+ with_txt: true
23
+ with_wav: false
24
+ with_align: true
25
+ with_spk_embed: true
26
+ with_f0: true
27
+ with_f0cwt: true
28
+
29
+ loud_norm: false
30
+ endless_ds: true
31
+ reset_phone_dict: true
32
+
33
+ test_num: 100
34
+ valid_num: 100
35
+ max_frames: 1550
36
+ max_input_tokens: 1550
37
+ audio_num_mel_bins: 80
38
+ audio_sample_rate: 22050
39
+ hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
40
+ win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
41
+ fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
42
+ fmax: 7600 # To be increased/reduced depending on data.
43
+ fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
44
+ min_level_db: -100
45
+ num_spk: 1
46
+ mel_vmin: -6
47
+ mel_vmax: 1.5
48
+ ds_workers: 4
49
+
50
+ #########
51
+ # model
52
+ #########
53
+ dropout: 0.1
54
+ enc_layers: 4
55
+ dec_layers: 4
56
+ hidden_size: 384
57
+ num_heads: 2
58
+ prenet_dropout: 0.5
59
+ prenet_hidden_size: 256
60
+ stop_token_weight: 5.0
61
+ enc_ffn_kernel_size: 9
62
+ dec_ffn_kernel_size: 9
63
+ ffn_act: gelu
64
+ ffn_padding: 'SAME'
65
+
66
+
67
+ ###########
68
+ # optimization
69
+ ###########
70
+ lr: 2.0
71
+ warmup_updates: 8000
72
+ optimizer_adam_beta1: 0.9
73
+ optimizer_adam_beta2: 0.98
74
+ weight_decay: 0
75
+ clip_grad_norm: 1
76
+
77
+
78
+ ###########
79
+ # train and eval
80
+ ###########
81
+ max_tokens: 30000
82
+ max_sentences: 100000
83
+ max_eval_sentences: 1
84
+ max_eval_tokens: 60000
85
+ train_set_name: 'train'
86
+ valid_set_name: 'valid'
87
+ test_set_name: 'test'
88
+ vocoder: pwg
89
+ vocoder_ckpt: ''
90
+ profile_infer: false
91
+ out_wav_norm: false
92
+ save_gt: false
93
+ save_f0: false
94
+ gen_dir_name: ''
95
+ use_denoise: false
configs/tts/base_zh.yaml ADDED
@@ -0,0 +1,3 @@
+ pre_align_args:
+   txt_processor: zh_g2pM
+ binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer
configs/tts/fs2.yaml ADDED
@@ -0,0 +1,80 @@
1
+ base_config: configs/tts/base.yaml
2
+ task_cls: tasks.tts.fs2.FastSpeech2Task
3
+
4
+ # model
5
+ hidden_size: 256
6
+ dropout: 0.1
7
+ encoder_type: fft # fft|tacotron|tacotron2|conformer
8
+ encoder_K: 8 # for tacotron encoder
9
+ decoder_type: fft # fft|rnn|conv|conformer
10
+ use_pos_embed: true
11
+
12
+ # duration
13
+ predictor_hidden: -1
14
+ predictor_kernel: 5
15
+ predictor_layers: 2
16
+ dur_predictor_kernel: 3
17
+ dur_predictor_layers: 2
18
+ predictor_dropout: 0.5
19
+
20
+ # pitch and energy
21
+ use_pitch_embed: true
22
+ pitch_type: ph # frame|ph|cwt
23
+ use_uv: true
24
+ cwt_hidden_size: 128
25
+ cwt_layers: 2
26
+ cwt_loss: l1
27
+ cwt_add_f0_loss: false
28
+ cwt_std_scale: 0.8
29
+
30
+ pitch_ar: false
31
+ #pitch_embed_type: 0q
32
+ pitch_loss: 'l1' # l1|l2|ssim
33
+ pitch_norm: log
34
+ use_energy_embed: false
35
+
36
+ # reference encoder and speaker embedding
37
+ use_spk_id: false
38
+ use_split_spk_id: false
39
+ use_spk_embed: false
40
+ use_var_enc: false
41
+ lambda_commit: 0.25
42
+ ref_norm_layer: bn
43
+ pitch_enc_hidden_stride_kernel:
44
+ - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
45
+ - 0,2,5
46
+ - 0,2,5
47
+ dur_enc_hidden_stride_kernel:
48
+ - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
49
+ - 0,2,3
50
+ - 0,1,3
51
+
52
+
53
+ # mel
54
+ mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
55
+
56
+ # loss lambda
57
+ lambda_f0: 1.0
58
+ lambda_uv: 1.0
59
+ lambda_energy: 0.1
60
+ lambda_ph_dur: 1.0
61
+ lambda_sent_dur: 1.0
62
+ lambda_word_dur: 1.0
63
+ predictor_grad: 0.1
64
+
65
+ # train and eval
66
+ pretrain_fs_ckpt: ''
67
+ warmup_updates: 2000
68
+ max_tokens: 32000
69
+ max_sentences: 100000
70
+ max_eval_sentences: 1
71
+ max_updates: 120000
72
+ num_valid_plots: 5
73
+ num_test_samples: 0
74
+ test_ids: []
75
+ use_gt_dur: false
76
+ use_gt_f0: false
77
+
78
+ # exp
79
+ dur_loss: mse # huber|mol
80
+ norm_type: gn
configs/tts/hifigan.yaml ADDED
@@ -0,0 +1,21 @@
+ base_config: configs/tts/pwg.yaml
+ task_cls: tasks.vocoder.hifigan.HifiGanTask
+ resblock: "1"
+ adam_b1: 0.8
+ adam_b2: 0.99
+ upsample_rates: [ 8,8,2,2 ]
+ upsample_kernel_sizes: [ 16,16,4,4 ]
+ upsample_initial_channel: 128
+ resblock_kernel_sizes: [ 3,7,11 ]
+ resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ]
+
+ lambda_mel: 45.0
+
+ max_samples: 8192
+ max_sentences: 16
+
+ generator_params:
+   lr: 0.0002 # Generator's learning rate.
+   aux_context_window: 0 # Context window size for auxiliary feature.
+ discriminator_optimizer_params:
+   lr: 0.0002 # Discriminator's learning rate.
configs/tts/lj/base_mel2wav.yaml ADDED
@@ -0,0 +1,3 @@
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
+ processed_data_dir: 'data/processed/ljspeech'
+ binary_data_dir: 'data/binary/ljspeech_wav'
configs/tts/lj/base_text2mel.yaml ADDED
@@ -0,0 +1,13 @@
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
+ processed_data_dir: 'data/processed/ljspeech'
+ binary_data_dir: 'data/binary/ljspeech'
+ pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
+
+ pitch_type: cwt
+ mel_loss: l1
+ num_test_samples: 20
+ test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+             316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+ use_energy_embed: false
+ test_num: 523
+ valid_num: 348
configs/tts/lj/fs2.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+ - configs/tts/fs2.yaml
+ - configs/tts/lj/base_text2mel.yaml
configs/tts/lj/hifigan.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+ - configs/tts/hifigan.yaml
+ - configs/tts/lj/base_mel2wav.yaml
configs/tts/lj/pwg.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+ - configs/tts/pwg.yaml
+ - configs/tts/lj/base_mel2wav.yaml
configs/tts/pwg.yaml ADDED
@@ -0,0 +1,110 @@
1
+ base_config: configs/tts/base.yaml
2
+ task_cls: tasks.vocoder.pwg.PwgTask
3
+
4
+ binarization_args:
5
+ with_wav: true
6
+ with_spk_embed: false
7
+ with_align: false
8
+ test_input_dir: ''
9
+
10
+ ###########
11
+ # train and eval
12
+ ###########
13
+ max_samples: 25600
14
+ max_sentences: 5
15
+ max_eval_sentences: 1
16
+ max_updates: 1000000
17
+ val_check_interval: 2000
18
+
19
+
20
+ ###########################################################
21
+ # FEATURE EXTRACTION SETTING #
22
+ ###########################################################
23
+ sampling_rate: 22050 # Sampling rate.
24
+ fft_size: 1024 # FFT size.
25
+ hop_size: 256 # Hop size.
26
+ win_length: null # Window length.
27
+ # If set to null, it will be the same as fft_size.
28
+ window: "hann" # Window function.
29
+ num_mels: 80 # Number of mel basis.
30
+ fmin: 80 # Minimum freq in mel basis calculation.
31
+ fmax: 7600 # Maximum frequency in mel basis calculation.
32
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
33
+
34
+ ###########################################################
35
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
36
+ ###########################################################
37
+ generator_params:
38
+ in_channels: 1 # Number of input channels.
39
+ out_channels: 1 # Number of output channels.
40
+ kernel_size: 3 # Kernel size of dilated convolution.
41
+ layers: 30 # Number of residual block layers.
42
+ stacks: 3 # Number of stacks i.e., dilation cycles.
43
+ residual_channels: 64 # Number of channels in residual conv.
44
+ gate_channels: 128 # Number of channels in gated conv.
45
+ skip_channels: 64 # Number of channels in skip conv.
46
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
47
+ # Must be the same as num_mels.
48
+ aux_context_window: 2 # Context window size for auxiliary feature.
49
+ # If set to 2, previous 2 and future 2 frames will be considered.
50
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
51
+ use_weight_norm: true # Whether to use weight norm.
52
+ # If set to true, it will be applied to all of the conv layers.
53
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
54
+ upsample_params: # Upsampling network parameters.
55
+ upsample_scales: [4, 4, 4, 4] # Upsampling scales. Prodcut of these must be the same as hop size.
56
+ use_pitch_embed: false
57
+
58
+ ###########################################################
59
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
60
+ ###########################################################
61
+ discriminator_params:
62
+ in_channels: 1 # Number of input channels.
63
+ out_channels: 1 # Number of output channels.
64
+ kernel_size: 3 # Number of output channels.
65
+ layers: 10 # Number of conv layers.
66
+ conv_channels: 64 # Number of chnn layers.
67
+ bias: true # Whether to use bias parameter in conv.
68
+ use_weight_norm: true # Whether to use weight norm.
69
+ # If set to true, it will be applied to all of the conv layers.
70
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
71
+ nonlinear_activation_params: # Nonlinear function parameters
72
+ negative_slope: 0.2 # Alpha in LeakyReLU.
73
+
74
+ ###########################################################
75
+ # STFT LOSS SETTING #
76
+ ###########################################################
77
+ stft_loss_params:
78
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
79
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
80
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
81
+ window: "hann_window" # Window function for STFT-based loss
82
+ use_mel_loss: false
83
+
84
+ ###########################################################
85
+ # ADVERSARIAL LOSS SETTING #
86
+ ###########################################################
87
+ lambda_adv: 4.0 # Loss balancing coefficient.
88
+
89
+ ###########################################################
90
+ # OPTIMIZER & SCHEDULER SETTING #
91
+ ###########################################################
92
+ generator_optimizer_params:
93
+ lr: 0.0001 # Generator's learning rate.
94
+ eps: 1.0e-6 # Generator's epsilon.
95
+ weight_decay: 0.0 # Generator's weight decay coefficient.
96
+ generator_scheduler_params:
97
+ step_size: 200000 # Generator's scheduler step size.
98
+ gamma: 0.5 # Generator's scheduler gamma.
99
+ # At each step size, lr will be multiplied by this parameter.
100
+ generator_grad_norm: 10 # Generator's gradient norm.
101
+ discriminator_optimizer_params:
102
+ lr: 0.00005 # Discriminator's learning rate.
103
+ eps: 1.0e-6 # Discriminator's epsilon.
104
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
105
+ discriminator_scheduler_params:
106
+ step_size: 200000 # Discriminator's scheduler step size.
107
+ gamma: 0.5 # Discriminator's scheduler gamma.
108
+ # At each step size, lr will be multiplied by this parameter.
109
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
110
+ disc_start_steps: 40000 # Number of steps to start to train discriminator.
data/processed/ljspeech/dict.txt ADDED
@@ -0,0 +1,77 @@
1
+ ! !
2
+ , ,
3
+ . .
4
+ ; ;
5
+ <BOS> <BOS>
6
+ <EOS> <EOS>
7
+ ? ?
8
+ AA0 AA0
9
+ AA1 AA1
10
+ AA2 AA2
11
+ AE0 AE0
12
+ AE1 AE1
13
+ AE2 AE2
14
+ AH0 AH0
15
+ AH1 AH1
16
+ AH2 AH2
17
+ AO0 AO0
18
+ AO1 AO1
19
+ AO2 AO2
20
+ AW0 AW0
21
+ AW1 AW1
22
+ AW2 AW2
23
+ AY0 AY0
24
+ AY1 AY1
25
+ AY2 AY2
26
+ B B
27
+ CH CH
28
+ D D
29
+ DH DH
30
+ EH0 EH0
31
+ EH1 EH1
32
+ EH2 EH2
33
+ ER0 ER0
34
+ ER1 ER1
35
+ ER2 ER2
36
+ EY0 EY0
37
+ EY1 EY1
38
+ EY2 EY2
39
+ F F
40
+ G G
41
+ HH HH
42
+ IH0 IH0
43
+ IH1 IH1
44
+ IH2 IH2
45
+ IY0 IY0
46
+ IY1 IY1
47
+ IY2 IY2
48
+ JH JH
49
+ K K
50
+ L L
51
+ M M
52
+ N N
53
+ NG NG
54
+ OW0 OW0
55
+ OW1 OW1
56
+ OW2 OW2
57
+ OY0 OY0
58
+ OY1 OY1
59
+ OY2 OY2
60
+ P P
61
+ R R
62
+ S S
63
+ SH SH
64
+ T T
65
+ TH TH
66
+ UH0 UH0
67
+ UH1 UH1
68
+ UH2 UH2
69
+ UW0 UW0
70
+ UW1 UW1
71
+ UW2 UW2
72
+ V V
73
+ W W
74
+ Y Y
75
+ Z Z
76
+ ZH ZH
77
+ | |
data/processed/ljspeech/metadata_phone.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/processed/ljspeech/mfa_dict.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/processed/ljspeech/phone_set.json ADDED
@@ -0,0 +1 @@
+ ["!", ",", ".", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
data_gen/singing/binarize.py ADDED
@@ -0,0 +1,398 @@
1
+ import os
2
+ import random
3
+ from copy import deepcopy
4
+ import pandas as pd
5
+ import logging
6
+ from tqdm import tqdm
7
+ import json
8
+ import glob
9
+ import re
10
+ from resemblyzer import VoiceEncoder
11
+ import traceback
12
+ import numpy as np
13
+ import pretty_midi
14
+ import librosa
15
+ from scipy.interpolate import interp1d
16
+ import torch
17
+ from textgrid import TextGrid
18
+
19
+ from utils.hparams import hparams
20
+ from data_gen.tts.data_gen_utils import build_phone_encoder, get_pitch
21
+ from utils.pitch_utils import f0_to_coarse
22
+ from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
23
+ from data_gen.tts.binarizer_zh import ZhBinarizer
24
+ from data_gen.tts.txt_processors.zh_g2pM import ALL_YUNMU
25
+ from vocoders.base_vocoder import VOCODERS
26
+
27
+
28
+ class SingingBinarizer(BaseBinarizer):
29
+ def __init__(self, processed_data_dir=None):
30
+ if processed_data_dir is None:
31
+ processed_data_dir = hparams['processed_data_dir']
32
+ self.processed_data_dirs = processed_data_dir.split(",")
33
+ self.binarization_args = hparams['binarization_args']
34
+ self.pre_align_args = hparams['pre_align_args']
35
+ self.item2txt = {}
36
+ self.item2ph = {}
37
+ self.item2wavfn = {}
38
+ self.item2f0fn = {}
39
+ self.item2tgfn = {}
40
+ self.item2spk = {}
41
+
42
+ def split_train_test_set(self, item_names):
43
+ item_names = deepcopy(item_names)
44
+ test_item_names = [x for x in item_names if any([ts in x for ts in hparams['test_prefixes']])]
45
+ train_item_names = [x for x in item_names if x not in set(test_item_names)]
46
+ logging.info("train {}".format(len(train_item_names)))
47
+ logging.info("test {}".format(len(test_item_names)))
48
+ return train_item_names, test_item_names
49
+
50
+ def load_meta_data(self):
51
+ for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
52
+ wav_suffix = '_wf0.wav'
53
+ txt_suffix = '.txt'
54
+ ph_suffix = '_ph.txt'
55
+ tg_suffix = '.TextGrid'
56
+ all_wav_pieces = glob.glob(f'{processed_data_dir}/*/*{wav_suffix}')
57
+
58
+ for piece_path in all_wav_pieces:
59
+ item_name = raw_item_name = piece_path[len(processed_data_dir)+1:].replace('/', '-')[:-len(wav_suffix)]
60
+ if len(self.processed_data_dirs) > 1:
61
+ item_name = f'ds{ds_id}_{item_name}'
62
+ self.item2txt[item_name] = open(f'{piece_path.replace(wav_suffix, txt_suffix)}').readline()
63
+ self.item2ph[item_name] = open(f'{piece_path.replace(wav_suffix, ph_suffix)}').readline()
64
+ self.item2wavfn[item_name] = piece_path
65
+
66
+ self.item2spk[item_name] = re.split('-|#', piece_path.split('/')[-2])[0]
67
+ if len(self.processed_data_dirs) > 1:
68
+ self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
69
+ self.item2tgfn[item_name] = piece_path.replace(wav_suffix, tg_suffix)
70
+ print('spkers: ', set(self.item2spk.values()))
71
+ self.item_names = sorted(list(self.item2txt.keys()))
72
+ if self.binarization_args['shuffle']:
73
+ random.seed(1234)
74
+ random.shuffle(self.item_names)
75
+ self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)
76
+
77
+ @property
78
+ def train_item_names(self):
79
+ return self._train_item_names
80
+
81
+ @property
82
+ def valid_item_names(self):
83
+ return self._test_item_names
84
+
85
+ @property
86
+ def test_item_names(self):
87
+ return self._test_item_names
88
+
89
+ def process(self):
90
+ self.load_meta_data()
91
+ os.makedirs(hparams['binary_data_dir'], exist_ok=True)
92
+ self.spk_map = self.build_spk_map()
93
+ print("| spk_map: ", self.spk_map)
94
+ spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
95
+ json.dump(self.spk_map, open(spk_map_fn, 'w'))
96
+
97
+ self.phone_encoder = self._phone_encoder()
98
+ self.process_data('valid')
99
+ self.process_data('test')
100
+ self.process_data('train')
101
+
102
+ def _phone_encoder(self):
103
+ ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
104
+ ph_set = []
105
+ if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
106
+ for ph_sent in self.item2ph.values():
107
+ ph_set += ph_sent.split(' ')
108
+ ph_set = sorted(set(ph_set))
109
+ json.dump(ph_set, open(ph_set_fn, 'w'))
110
+ print("| Build phone set: ", ph_set)
111
+ else:
112
+ ph_set = json.load(open(ph_set_fn, 'r'))
113
+ print("| Load phone set: ", ph_set)
114
+ return build_phone_encoder(hparams['binary_data_dir'])
115
+
116
+ # @staticmethod
117
+ # def get_pitch(wav_fn, spec, res):
118
+ # wav_suffix = '_wf0.wav'
119
+ # f0_suffix = '_f0.npy'
120
+ # f0fn = wav_fn.replace(wav_suffix, f0_suffix)
121
+ # pitch_info = np.load(f0fn)
122
+ # f0 = [x[1] for x in pitch_info]
123
+ # spec_x_coor = np.arange(0, 1, 1 / len(spec))[:len(spec)]
124
+ # f0_x_coor = np.arange(0, 1, 1 / len(f0))[:len(f0)]
125
+ # f0 = interp1d(f0_x_coor, f0, 'nearest', fill_value='extrapolate')(spec_x_coor)[:len(spec)]
126
+ # # f0_x_coor = np.arange(0, 1, 1 / len(f0))
127
+ # # f0_x_coor[-1] = 1
128
+ # # f0 = interp1d(f0_x_coor, f0, 'nearest')(spec_x_coor)[:len(spec)]
129
+ # if sum(f0) == 0:
130
+ # raise BinarizationError("Empty f0")
131
+ # assert len(f0) == len(spec), (len(f0), len(spec))
132
+ # pitch_coarse = f0_to_coarse(f0)
133
+ #
134
+ # # vis f0
135
+ # # import matplotlib.pyplot as plt
136
+ # # from textgrid import TextGrid
137
+ # # tg_fn = wav_fn.replace(wav_suffix, '.TextGrid')
138
+ # # fig = plt.figure(figsize=(12, 6))
139
+ # # plt.pcolor(spec.T, vmin=-5, vmax=0)
140
+ # # ax = plt.gca()
141
+ # # ax2 = ax.twinx()
142
+ # # ax2.plot(f0, color='red')
143
+ # # ax2.set_ylim(0, 800)
144
+ # # itvs = TextGrid.fromFile(tg_fn)[0]
145
+ # # for itv in itvs:
146
+ # # x = itv.maxTime * hparams['audio_sample_rate'] / hparams['hop_size']
147
+ # # plt.vlines(x=x, ymin=0, ymax=80, color='black')
148
+ # # plt.text(x=x, y=20, s=itv.mark, color='black')
149
+ # # plt.savefig('tmp/20211229_singing_plots_test.png')
150
+ #
151
+ # res['f0'] = f0
152
+ # res['pitch'] = pitch_coarse
153
+
154
+ @classmethod
155
+ def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
156
+ if hparams['vocoder'] in VOCODERS:
157
+ wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
158
+ else:
159
+ wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
160
+ res = {
161
+ 'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
162
+ 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
163
+ }
164
+ try:
165
+ if binarization_args['with_f0']:
166
+ # cls.get_pitch(wav_fn, mel, res)
167
+ cls.get_pitch(wav, mel, res)
168
+ if binarization_args['with_txt']:
169
+ try:
170
+ # print(ph)
171
+ phone_encoded = res['phone'] = encoder.encode(ph)
172
+ except:
173
+ traceback.print_exc()
174
+ raise BinarizationError(f"Empty phoneme")
175
+ if binarization_args['with_align']:
176
+ cls.get_align(tg_fn, ph, mel, phone_encoded, res)
177
+ except BinarizationError as e:
178
+ print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
179
+ return None
180
+ return res
181
+
182
+
183
+ class MidiSingingBinarizer(SingingBinarizer):
184
+ item2midi = {}
185
+ item2midi_dur = {}
186
+ item2is_slur = {}
187
+ item2ph_durs = {}
188
+ item2wdb = {}
189
+
190
+ def load_meta_data(self):
191
+ for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
192
+ meta_midi = json.load(open(os.path.join(processed_data_dir, 'meta.json'))) # [list of dict]
193
+
194
+ for song_item in meta_midi:
195
+ item_name = raw_item_name = song_item['item_name']
196
+ if len(self.processed_data_dirs) > 1:
197
+ item_name = f'ds{ds_id}_{item_name}'
198
+ self.item2wavfn[item_name] = song_item['wav_fn']
199
+ self.item2txt[item_name] = song_item['txt']
200
+
201
+ self.item2ph[item_name] = ' '.join(song_item['phs'])
202
+ self.item2wdb[item_name] = [1 if x in ALL_YUNMU + ['AP', 'SP', '<SIL>'] else 0 for x in song_item['phs']]
203
+ self.item2ph_durs[item_name] = song_item['ph_dur']
204
+
205
+ self.item2midi[item_name] = song_item['notes']
206
+ self.item2midi_dur[item_name] = song_item['notes_dur']
207
+ self.item2is_slur[item_name] = song_item['is_slur']
208
+ self.item2spk[item_name] = 'pop-cs'
209
+ if len(self.processed_data_dirs) > 1:
210
+ self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
211
+
212
+ print('speakers: ', set(self.item2spk.values()))
213
+ self.item_names = sorted(list(self.item2txt.keys()))
214
+ if self.binarization_args['shuffle']:
215
+ random.seed(1234)
216
+ random.shuffle(self.item_names)
217
+ self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)
218
+
219
+ @staticmethod
220
+ def get_pitch(wav_fn, wav, spec, ph, res):
221
+ wav_suffix = '.wav'
222
+ # midi_suffix = '.mid'
223
+ wav_dir = 'wavs'
224
+ f0_dir = 'f0'
225
+
226
+ item_name = '/'.join(os.path.splitext(wav_fn)[0].split('/')[-2:]).replace('_wf0', '')
227
+ res['pitch_midi'] = np.asarray(MidiSingingBinarizer.item2midi[item_name])
228
+ res['midi_dur'] = np.asarray(MidiSingingBinarizer.item2midi_dur[item_name])
229
+ res['is_slur'] = np.asarray(MidiSingingBinarizer.item2is_slur[item_name])
230
+ res['word_boundary'] = np.asarray(MidiSingingBinarizer.item2wdb[item_name])
231
+ assert res['pitch_midi'].shape == res['midi_dur'].shape == res['is_slur'].shape, (
232
+ res['pitch_midi'].shape, res['midi_dur'].shape, res['is_slur'].shape)
233
+
234
+ # gt f0.
235
+ gt_f0, gt_pitch_coarse = get_pitch(wav, spec, hparams)
236
+ if sum(gt_f0) == 0:
237
+ raise BinarizationError("Empty **gt** f0")
238
+ res['f0'] = gt_f0
239
+ res['pitch'] = gt_pitch_coarse
240
+
241
+ @staticmethod
242
+ def get_align(ph_durs, mel, phone_encoded, res, hop_size=hparams['hop_size'], audio_sample_rate=hparams['audio_sample_rate']):
243
+ mel2ph = np.zeros([mel.shape[0]], int)
244
+ startTime = 0
245
+
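+ # Map per-phoneme durations (seconds) to mel frames: each frame gets the 1-based index
+ # of the phoneme whose time span covers it (0 is left for frames not assigned to any phoneme).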
246
+ for i_ph in range(len(ph_durs)):
247
+ start_frame = int(startTime * audio_sample_rate / hop_size + 0.5)
248
+ end_frame = int((startTime + ph_durs[i_ph]) * audio_sample_rate / hop_size + 0.5)
249
+ mel2ph[start_frame:end_frame] = i_ph + 1
250
+ startTime = startTime + ph_durs[i_ph]
251
+
252
+ # print('ph durs: ', ph_durs)
253
+ # print('mel2ph: ', mel2ph, len(mel2ph))
254
+ res['mel2ph'] = mel2ph
255
+ # res['dur'] = None
256
+
257
+ @classmethod
258
+ def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
259
+ if hparams['vocoder'] in VOCODERS:
260
+ wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
261
+ else:
262
+ wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
263
+ res = {
264
+ 'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
265
+ 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
266
+ }
267
+ try:
268
+ if binarization_args['with_f0']:
269
+ cls.get_pitch(wav_fn, wav, mel, ph, res)
270
+ if binarization_args['with_txt']:
271
+ try:
272
+ phone_encoded = res['phone'] = encoder.encode(ph)
273
+ except:
274
+ traceback.print_exc()
275
+ raise BinarizationError(f"Empty phoneme")
276
+ if binarization_args['with_align']:
277
+ cls.get_align(MidiSingingBinarizer.item2ph_durs[item_name], mel, phone_encoded, res)
278
+ except BinarizationError as e:
279
+ print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
280
+ return None
281
+ return res
282
+
283
+
284
+ class ZhSingingBinarizer(ZhBinarizer, SingingBinarizer):
285
+ pass
286
+
287
+
288
+ class OpencpopBinarizer(MidiSingingBinarizer):
289
+ item2midi = {}
290
+ item2midi_dur = {}
291
+ item2is_slur = {}
292
+ item2ph_durs = {}
293
+ item2wdb = {}
294
+
295
+ def split_train_test_set(self, item_names):
296
+ item_names = deepcopy(item_names)
297
+ test_item_names = [x for x in item_names if any([x.startswith(ts) for ts in hparams['test_prefixes']])]
298
+ train_item_names = [x for x in item_names if x not in set(test_item_names)]
299
+ logging.info("train {}".format(len(train_item_names)))
300
+ logging.info("test {}".format(len(test_item_names)))
301
+ return train_item_names, test_item_names
302
+
303
+ def load_meta_data(self):
304
+ raw_data_dir = hparams['raw_data_dir']
305
+ # meta_midi = json.load(open(os.path.join(raw_data_dir, 'meta.json'))) # [list of dict]
306
+ utterance_labels = open(os.path.join(raw_data_dir, 'transcriptions.txt')).readlines()
307
+
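+ # Each line of transcriptions.txt is expected to hold '|'-separated fields, in this order:
+ # item_name | text | phoneme sequence | note names | note durations | phoneme durations | is_slur flags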
308
+ for utterance_label in utterance_labels:
309
+ song_info = utterance_label.split('|')
310
+ item_name = raw_item_name = song_info[0]
311
+ self.item2wavfn[item_name] = f'{raw_data_dir}/wavs/{item_name}.wav'
312
+ self.item2txt[item_name] = song_info[1]
313
+
314
+ self.item2ph[item_name] = song_info[2]
315
+ # self.item2wdb[item_name] = list(np.nonzero([1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in song_info[2].split()])[0])
316
+ self.item2wdb[item_name] = [1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in song_info[2].split()]
317
+ self.item2ph_durs[item_name] = [float(x) for x in song_info[5].split(" ")]
318
+
319
+ self.item2midi[item_name] = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
320
+ for x in song_info[3].split(" ")]
321
+ self.item2midi_dur[item_name] = [float(x) for x in song_info[4].split(" ")]
322
+ self.item2is_slur[item_name] = [int(x) for x in song_info[6].split(" ")]
323
+ self.item2spk[item_name] = 'opencpop'
324
+
325
+ print('speakers: ', set(self.item2spk.values()))
326
+ self.item_names = sorted(list(self.item2txt.keys()))
327
+ if self.binarization_args['shuffle']:
328
+ random.seed(1234)
329
+ random.shuffle(self.item_names)
330
+ self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)
331
+
332
+ @staticmethod
333
+ def get_pitch(wav_fn, wav, spec, ph, res):
334
+ wav_suffix = '.wav'
335
+ # midi_suffix = '.mid'
336
+ wav_dir = 'wavs'
337
+ f0_dir = 'text_f0_align'
338
+
339
+ item_name = os.path.splitext(os.path.basename(wav_fn))[0]
340
+ res['pitch_midi'] = np.asarray(OpencpopBinarizer.item2midi[item_name])
341
+ res['midi_dur'] = np.asarray(OpencpopBinarizer.item2midi_dur[item_name])
342
+ res['is_slur'] = np.asarray(OpencpopBinarizer.item2is_slur[item_name])
343
+ res['word_boundary'] = np.asarray(OpencpopBinarizer.item2wdb[item_name])
344
+ assert res['pitch_midi'].shape == res['midi_dur'].shape == res['is_slur'].shape, (res['pitch_midi'].shape, res['midi_dur'].shape, res['is_slur'].shape)
345
+
346
+ # gt f0.
347
+ # f0 = None
348
+ # f0_suffix = '_f0.npy'
349
+ # f0fn = wav_fn.replace(wav_suffix, f0_suffix).replace(wav_dir, f0_dir)
350
+ # pitch_info = np.load(f0fn)
351
+ # f0 = [x[1] for x in pitch_info]
352
+ # spec_x_coor = np.arange(0, 1, 1 / len(spec))[:len(spec)]
353
+ #
354
+ # f0_x_coor = np.arange(0, 1, 1 / len(f0))[:len(f0)]
355
+ # f0 = interp1d(f0_x_coor, f0, 'nearest', fill_value='extrapolate')(spec_x_coor)[:len(spec)]
356
+ # if sum(f0) == 0:
357
+ # raise BinarizationError("Empty **gt** f0")
358
+ #
359
+ # pitch_coarse = f0_to_coarse(f0)
360
+ # res['f0'] = f0
361
+ # res['pitch'] = pitch_coarse
362
+
363
+ # gt f0.
364
+ gt_f0, gt_pitch_coarse = get_pitch(wav, spec, hparams)
365
+ if sum(gt_f0) == 0:
366
+ raise BinarizationError("Empty **gt** f0")
367
+ res['f0'] = gt_f0
368
+ res['pitch'] = gt_pitch_coarse
369
+
370
+ @classmethod
371
+ def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
372
+ if hparams['vocoder'] in VOCODERS:
373
+ wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
374
+ else:
375
+ wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
376
+ res = {
377
+ 'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
378
+ 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
379
+ }
380
+ try:
381
+ if binarization_args['with_f0']:
382
+ cls.get_pitch(wav_fn, wav, mel, ph, res)
383
+ if binarization_args['with_txt']:
384
+ try:
385
+ phone_encoded = res['phone'] = encoder.encode(ph)
386
+ except:
387
+ traceback.print_exc()
388
+ raise BinarizationError(f"Empty phoneme")
389
+ if binarization_args['with_align']:
390
+ cls.get_align(OpencpopBinarizer.item2ph_durs[item_name], mel, phone_encoded, res)
391
+ except BinarizationError as e:
392
+ print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
393
+ return None
394
+ return res
395
+
396
+
397
+ if __name__ == "__main__":
398
+ SingingBinarizer().process()
data_gen/tts/base_binarizer.py ADDED
@@ -0,0 +1,224 @@
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+
4
+ from utils.multiprocess_utils import chunked_multiprocess_run
5
+ import random
6
+ import traceback
7
+ import json
8
+ from resemblyzer import VoiceEncoder
9
+ from tqdm import tqdm
10
+ from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder
11
+ from utils.hparams import set_hparams, hparams
12
+ import numpy as np
13
+ from utils.indexed_datasets import IndexedDatasetBuilder
14
+ from vocoders.base_vocoder import VOCODERS
15
+ import pandas as pd
16
+
17
+
18
+ class BinarizationError(Exception):
19
+ pass
20
+
21
+
22
+ class BaseBinarizer:
23
+ def __init__(self, processed_data_dir=None):
24
+ if processed_data_dir is None:
25
+ processed_data_dir = hparams['processed_data_dir']
26
+ self.processed_data_dirs = processed_data_dir.split(",")
27
+ self.binarization_args = hparams['binarization_args']
28
+ self.pre_align_args = hparams['pre_align_args']
29
+ self.forced_align = self.pre_align_args['forced_align']
30
+ tg_dir = None
31
+ if self.forced_align == 'mfa':
32
+ tg_dir = 'mfa_outputs'
33
+ if self.forced_align == 'kaldi':
34
+ tg_dir = 'kaldi_outputs'
35
+ self.item2txt = {}
36
+ self.item2ph = {}
37
+ self.item2wavfn = {}
38
+ self.item2tgfn = {}
39
+ self.item2spk = {}
40
+ for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
41
+ self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
42
+ for r_idx, r in self.meta_df.iterrows():
43
+ item_name = raw_item_name = r['item_name']
44
+ if len(self.processed_data_dirs) > 1:
45
+ item_name = f'ds{ds_id}_{item_name}'
46
+ self.item2txt[item_name] = r['txt']
47
+ self.item2ph[item_name] = r['ph']
48
+ self.item2wavfn[item_name] = os.path.join(hparams['raw_data_dir'], 'wavs', os.path.basename(r['wav_fn']).split('_')[1])
49
+ self.item2spk[item_name] = r.get('spk', 'SPK1')
50
+ if len(self.processed_data_dirs) > 1:
51
+ self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
52
+ if tg_dir is not None:
53
+ self.item2tgfn[item_name] = f"{processed_data_dir}/{tg_dir}/{raw_item_name}.TextGrid"
54
+ self.item_names = sorted(list(self.item2txt.keys()))
55
+ if self.binarization_args['shuffle']:
56
+ random.seed(1234)
57
+ random.shuffle(self.item_names)
58
+
59
+ @property
60
+ def train_item_names(self):
61
+ return self.item_names[hparams['test_num']+hparams['valid_num']:]
62
+
63
+ @property
64
+ def valid_item_names(self):
65
+ return self.item_names[0: hparams['test_num']+hparams['valid_num']] #
66
+
67
+ @property
68
+ def test_item_names(self):
69
+ return self.item_names[0: hparams['test_num']] # Audios for MOS testing are in 'test_ids'
70
+
71
+ def build_spk_map(self):
72
+ spk_map = set()
73
+ for item_name in self.item_names:
74
+ spk_name = self.item2spk[item_name]
75
+ spk_map.add(spk_name)
76
+ spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
77
+ assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
78
+ return spk_map
79
+
80
+ def item_name2spk_id(self, item_name):
81
+ return self.spk_map[self.item2spk[item_name]]
82
+
83
+ def _phone_encoder(self):
84
+ ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
85
+ ph_set = []
86
+ if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
87
+ for processed_data_dir in self.processed_data_dirs:
88
+ ph_set += [x.split(' ')[0] for x in open(f'{processed_data_dir}/dict.txt').readlines()]
89
+ ph_set = sorted(set(ph_set))
90
+ json.dump(ph_set, open(ph_set_fn, 'w'))
91
+ else:
92
+ ph_set = json.load(open(ph_set_fn, 'r'))
93
+ print("| phone set: ", ph_set)
94
+ return build_phone_encoder(hparams['binary_data_dir'])
95
+
96
+ def meta_data(self, prefix):
97
+ if prefix == 'valid':
98
+ item_names = self.valid_item_names
99
+ elif prefix == 'test':
100
+ item_names = self.test_item_names
101
+ else:
102
+ item_names = self.train_item_names
103
+ for item_name in item_names:
104
+ ph = self.item2ph[item_name]
105
+ txt = self.item2txt[item_name]
106
+ tg_fn = self.item2tgfn.get(item_name)
107
+ wav_fn = self.item2wavfn[item_name]
108
+ spk_id = self.item_name2spk_id(item_name)
109
+ yield item_name, ph, txt, tg_fn, wav_fn, spk_id
110
+
111
+ def process(self):
112
+ os.makedirs(hparams['binary_data_dir'], exist_ok=True)
113
+ self.spk_map = self.build_spk_map()
114
+ print("| spk_map: ", self.spk_map)
115
+ spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
116
+ json.dump(self.spk_map, open(spk_map_fn, 'w'))
117
+
118
+ self.phone_encoder = self._phone_encoder()
119
+ self.process_data('valid')
120
+ self.process_data('test')
121
+ self.process_data('train')
122
+
123
+ def process_data(self, prefix):
124
+ data_dir = hparams['binary_data_dir']
125
+ args = []
126
+ builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
127
+ lengths = []
128
+ f0s = []
129
+ total_sec = 0
130
+ if self.binarization_args['with_spk_embed']:
131
+ voice_encoder = VoiceEncoder().cuda()
132
+
133
+ meta_data = list(self.meta_data(prefix))
134
+ for m in meta_data:
135
+ args.append(list(m) + [self.phone_encoder, self.binarization_args])
136
+ num_workers = int(os.getenv('N_PROC', os.cpu_count() // 3))
137
+ for f_id, (_, item) in enumerate(
138
+ zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
139
+ if item is None:
140
+ continue
141
+ item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
142
+ if self.binarization_args['with_spk_embed'] else None
143
+ if not self.binarization_args['with_wav'] and 'wav' in item:
144
+ print("del wav")
145
+ del item['wav']
146
+ builder.add_item(item)
147
+ lengths.append(item['len'])
148
+ total_sec += item['sec']
149
+ if item.get('f0') is not None:
150
+ f0s.append(item['f0'])
151
+ builder.finalize()
152
+ np.save(f'{data_dir}/{prefix}_lengths.npy', lengths)
153
+ if len(f0s) > 0:
154
+ f0s = np.concatenate(f0s, 0)
155
+ f0s = f0s[f0s != 0]
156
+ np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
157
+ print(f"| {prefix} total duration: {total_sec:.3f}s")
158
+
159
+ @classmethod
160
+ def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
161
+ if hparams['vocoder'] in VOCODERS:
162
+ wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
163
+ else:
164
+ wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
165
+ res = {
166
+ 'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
167
+ 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
168
+ }
169
+ try:
170
+ if binarization_args['with_f0']:
171
+ cls.get_pitch(wav, mel, res)
172
+ if binarization_args['with_f0cwt']:
173
+ cls.get_f0cwt(res['f0'], res)
174
+ if binarization_args['with_txt']:
175
+ try:
176
+ phone_encoded = res['phone'] = encoder.encode(ph)
177
+ except:
178
+ traceback.print_exc()
179
+ raise BinarizationError(f"Empty phoneme")
180
+ if binarization_args['with_align']:
181
+ cls.get_align(tg_fn, ph, mel, phone_encoded, res)
182
+ except BinarizationError as e:
183
+ print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
184
+ return None
185
+ return res
186
+
187
+ @staticmethod
188
+ def get_align(tg_fn, ph, mel, phone_encoded, res):
189
+ if tg_fn is not None and os.path.exists(tg_fn):
190
+ mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
191
+ else:
192
+ raise BinarizationError(f"Align not found")
193
+ if mel2ph.max() - 1 >= len(phone_encoded):
194
+ raise BinarizationError(
195
+ f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
196
+ res['mel2ph'] = mel2ph
197
+ res['dur'] = dur
198
+
199
+ @staticmethod
200
+ def get_pitch(wav, mel, res):
201
+ f0, pitch_coarse = get_pitch(wav, mel, hparams)
202
+ if sum(f0) == 0:
203
+ raise BinarizationError("Empty f0")
204
+ res['f0'] = f0
205
+ res['pitch'] = pitch_coarse
206
+
207
+ @staticmethod
208
+ def get_f0cwt(f0, res):
209
+ from utils.cwt import get_cont_lf0, get_lf0_cwt
210
+ uv, cont_lf0_lpf = get_cont_lf0(f0)
211
+ logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
212
+ cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
213
+ Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
214
+ if np.any(np.isnan(Wavelet_lf0)):
215
+ raise BinarizationError("NaN CWT")
216
+ res['cwt_spec'] = Wavelet_lf0
217
+ res['cwt_scales'] = scales
218
+ res['f0_mean'] = logf0s_mean_org
219
+ res['f0_std'] = logf0s_std_org
220
+
221
+
222
+ if __name__ == "__main__":
223
+ set_hparams()
224
+ BaseBinarizer().process()
data_gen/tts/bin/binarize.py ADDED
@@ -0,0 +1,20 @@
1
+ import os
2
+
3
+ os.environ["OMP_NUM_THREADS"] = "1"
4
+
5
+ import importlib
6
+ from utils.hparams import set_hparams, hparams
7
+
8
+
9
+ def binarize():
10
+ binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
11
+ pkg = ".".join(binarizer_cls.split(".")[:-1])
12
+ cls_name = binarizer_cls.split(".")[-1]
13
+ binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
14
+ print("| Binarizer: ", binarizer_cls)
15
+ binarizer_cls().process()
16
+
17
+
18
+ if __name__ == '__main__':
19
+ set_hparams()
20
+ binarize()
data_gen/tts/binarizer_zh.py ADDED
@@ -0,0 +1,59 @@
1
+ import os
2
+
3
+ os.environ["OMP_NUM_THREADS"] = "1"
4
+
5
+ from data_gen.tts.txt_processors.zh_g2pM import ALL_SHENMU
6
+ from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
7
+ from data_gen.tts.data_gen_utils import get_mel2ph
8
+ from utils.hparams import set_hparams, hparams
9
+ import numpy as np
10
+
11
+
12
+ class ZhBinarizer(BaseBinarizer):
13
+ @staticmethod
14
+ def get_align(tg_fn, ph, mel, phone_encoded, res):
15
+ if tg_fn is not None and os.path.exists(tg_fn):
16
+ _, dur = get_mel2ph(tg_fn, ph, mel, hparams)
17
+ else:
18
+ raise BinarizationError(f"Align not found")
19
+ ph_list = ph.split(" ")
20
+ assert len(dur) == len(ph_list)
21
+ mel2ph = []
22
+ # Allocate the duration of separator tokens to the preceding final (yunmu)
23
+ dur_cumsum = np.pad(np.cumsum(dur), [1, 0], mode='constant', constant_values=0)
24
+ for i in range(len(dur)):
25
+ p = ph_list[i]
26
+ if p[0] != '<' and not p[0].isalpha():
27
+ uv_ = res['f0'][dur_cumsum[i]:dur_cumsum[i + 1]] == 0
28
+ j = 0
29
+ while j < len(uv_) and not uv_[j]:
30
+ j += 1
31
+ dur[i - 1] += j
32
+ dur[i] -= j
33
+ if dur[i] < 100:
34
+ dur[i - 1] += dur[i]
35
+ dur[i] = 0
36
+ # Make the initial (shengmu) and the final (yunmu) equal in length
37
+ for i in range(len(dur)):
38
+ p = ph_list[i]
39
+ if p in ALL_SHENMU:
40
+ p_next = ph_list[i + 1]
41
+ if not (dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU):
42
+ print(f"assert dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU, "
43
+ f"dur[i]: {dur[i]}, p: {p}, p_next: {p_next}.")
44
+ continue
45
+ total = dur[i + 1] + dur[i]
46
+ dur[i] = total // 2
47
+ dur[i + 1] = total - dur[i]
48
+ for i in range(len(dur)):
49
+ mel2ph += [i + 1] * dur[i]
50
+ mel2ph = np.array(mel2ph)
51
+ if mel2ph.max() - 1 >= len(phone_encoded):
52
+ raise BinarizationError(f"| Align does not match: {(mel2ph.max() - 1, len(phone_encoded))}")
53
+ res['mel2ph'] = mel2ph
54
+ res['dur'] = dur
55
+
56
+
57
+ if __name__ == "__main__":
58
+ set_hparams()
59
+ ZhBinarizer().process()
data_gen/tts/data_gen_utils.py ADDED
@@ -0,0 +1,347 @@
1
+ import warnings
2
+
3
+ warnings.filterwarnings("ignore")
4
+
5
+ import parselmouth
6
+ import os
7
+ import torch
8
+ from skimage.transform import resize
9
+ from utils.text_encoder import TokenTextEncoder
10
+ from utils.pitch_utils import f0_to_coarse
11
+ import struct
12
+ import webrtcvad
13
+ from scipy.ndimage import binary_dilation  # scipy.ndimage.morphology is deprecated in recent SciPy
14
+ import librosa
15
+ import numpy as np
16
+ from utils import audio
17
+ import pyloudnorm as pyln
18
+ import re
19
+ import json
20
+ from collections import OrderedDict
21
+
22
+ PUNCS = '!,.?;:'
23
+
24
+ int16_max = (2 ** 15) - 1
25
+
26
+
27
+ def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12):
28
+ """
29
+ Ensures that segments without voice in the waveform remain no longer than a
30
+ threshold determined by the VAD parameters defined below.
31
+ :param path: path to the input audio file; the waveform is loaded (and optionally loudness-normalized) from it
32
+ :param vad_max_silence_length: Maximum number of consecutive silent frames a segment can have.
33
+ :return: the trimmed waveform (or the raw waveform if return_raw_wav), the boolean voice-activity mask, and the sample rate
34
+ """
35
+
36
+ ## Voice Activation Detection
37
+ # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
38
+ # This sets the granularity of the VAD. Should not need to be changed.
39
+ sampling_rate = 16000
40
+ wav_raw, sr = librosa.core.load(path, sr=sr)
41
+
42
+ if norm:
43
+ meter = pyln.Meter(sr) # create BS.1770 meter
44
+ loudness = meter.integrated_loudness(wav_raw)
45
+ wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0)
46
+ if np.abs(wav_raw).max() > 1.0:
47
+ wav_raw = wav_raw / np.abs(wav_raw).max()
48
+
49
+ wav = librosa.resample(wav_raw, sr, sampling_rate, res_type='kaiser_best')
50
+
51
+ vad_window_length = 30 # In milliseconds
52
+ # Number of frames to average together when performing the moving average smoothing.
53
+ # The larger this value, the larger the VAD variations must be to not get smoothed out.
54
+ vad_moving_average_width = 8
55
+
56
+ # Compute the voice detection window size
57
+ samples_per_window = (vad_window_length * sampling_rate) // 1000
58
+
59
+ # Trim the end of the audio to have a multiple of the window size
60
+ wav = wav[:len(wav) - (len(wav) % samples_per_window)]
61
+
62
+ # Convert the float waveform to 16-bit mono PCM
63
+ pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
64
+
65
+ # Perform voice activation detection
66
+ voice_flags = []
67
+ vad = webrtcvad.Vad(mode=3)
68
+ for window_start in range(0, len(wav), samples_per_window):
69
+ window_end = window_start + samples_per_window
70
+ voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
71
+ sample_rate=sampling_rate))
72
+ voice_flags = np.array(voice_flags)
73
+
74
+ # Smooth the voice detection with a moving average
75
+ def moving_average(array, width):
76
+ array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
77
+ ret = np.cumsum(array_padded, dtype=float)
78
+ ret[width:] = ret[width:] - ret[:-width]
79
+ return ret[width - 1:] / width
80
+
81
+ audio_mask = moving_average(voice_flags, vad_moving_average_width)
82
+ audio_mask = np.round(audio_mask).astype(bool)  # np.bool was removed in newer NumPy versions
83
+
84
+ # Dilate the voiced regions
85
+ audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
86
+ audio_mask = np.repeat(audio_mask, samples_per_window)
87
+ audio_mask = resize(audio_mask, (len(wav_raw),)) > 0
88
+ if return_raw_wav:
89
+ return wav_raw, audio_mask, sr
90
+ return wav_raw[audio_mask], audio_mask, sr
91
+
92
+
93
+ def process_utterance(wav_path,
94
+ fft_size=1024,
95
+ hop_size=256,
96
+ win_length=1024,
97
+ window="hann",
98
+ num_mels=80,
99
+ fmin=80,
100
+ fmax=7600,
101
+ eps=1e-6,
102
+ sample_rate=22050,
103
+ loud_norm=False,
104
+ min_level_db=-100,
105
+ return_linear=False,
106
+ trim_long_sil=False, vocoder='pwg'):
107
+ if isinstance(wav_path, str):
108
+ if trim_long_sil:
109
+ wav, _, _ = trim_long_silences(wav_path, sample_rate)
110
+ else:
111
+ wav, _ = librosa.core.load(wav_path, sr=sample_rate)
112
+ else:
113
+ wav = wav_path
114
+
115
+ if loud_norm:
116
+ meter = pyln.Meter(sample_rate) # create BS.1770 meter
117
+ loudness = meter.integrated_loudness(wav)
118
+ wav = pyln.normalize.loudness(wav, loudness, -22.0)
119
+ if np.abs(wav).max() > 1:
120
+ wav = wav / np.abs(wav).max()
121
+
122
+ # get amplitude spectrogram
123
+ x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
124
+ win_length=win_length, window=window, pad_mode="constant")
125
+ spc = np.abs(x_stft) # (n_bins, T)
126
+
127
+ # get mel basis
128
+ fmin = 0 if fmin == -1 else fmin
129
+ fmax = sample_rate / 2 if fmax == -1 else fmax
130
+ mel_basis = librosa.filters.mel(sample_rate, fft_size, num_mels, fmin, fmax)
131
+ mel = mel_basis @ spc
132
+
133
+ if vocoder == 'pwg':
134
+ mel = np.log10(np.maximum(eps, mel)) # (n_mel_bins, T)
135
+ else:
136
+ assert False, f'"{vocoder}" is not in ["pwg"].'
137
+
138
+ l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
139
+ wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
140
+ wav = wav[:mel.shape[1] * hop_size]
141
+
142
+ if not return_linear:
143
+ return wav, mel
144
+ else:
145
+ spc = audio.amp_to_db(spc)
146
+ spc = audio.normalize(spc, {'min_level_db': min_level_db})
147
+ return wav, mel, spc
148
+
149
+
150
+ def get_pitch(wav_data, mel, hparams):
151
+ """
152
+
153
+ :param wav_data: [T]
154
+ :param mel: [T, 80]
155
+ :param hparams:
156
+ :return:
157
+ """
158
+ time_step = hparams['hop_size'] / hparams['audio_sample_rate'] * 1000
159
+ f0_min = 80
160
+ f0_max = 750
161
+
162
+ if hparams['hop_size'] == 128:
163
+ pad_size = 4
164
+ elif hparams['hop_size'] == 256:
165
+ pad_size = 2
166
+ else:
167
+ assert False
168
+
169
+ f0 = parselmouth.Sound(wav_data, hparams['audio_sample_rate']).to_pitch_ac(
170
+ time_step=time_step / 1000, voicing_threshold=0.6,
171
+ pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
172
+ lpad = pad_size * 2
173
+ rpad = len(mel) - len(f0) - lpad
174
+ f0 = np.pad(f0, [[lpad, rpad]], mode='constant')
175
+ # mel and f0 are extracted by 2 different libraries. we should force them to have the same length.
176
+ # Attention: we find that newer versions of some libraries can cause ``rpad'' to be a negative value...
177
+ # Just to be sure, we recommend setting up the same environment as ours via requirements_auto.txt (with Anaconda)
178
+ delta_l = len(mel) - len(f0)
179
+ assert np.abs(delta_l) <= 8
180
+ if delta_l > 0:
181
+ f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
182
+ f0 = f0[:len(mel)]
183
+ pitch_coarse = f0_to_coarse(f0)
184
+ return f0, pitch_coarse
185
+
186
+
187
+ def remove_empty_lines(text):
188
+ """remove empty lines"""
189
+ assert (len(text) > 0)
190
+ assert (isinstance(text, list))
191
+ text = [t.strip() for t in text]
192
+ if "" in text:
193
+ text.remove("")
194
+ return text
195
+
196
+
197
+ class TextGrid(object):
198
+ def __init__(self, text):
199
+ text = remove_empty_lines(text)
200
+ self.text = text
201
+ self.line_count = 0
202
+ self._get_type()
203
+ self._get_time_intval()
204
+ self._get_size()
205
+ self.tier_list = []
206
+ self._get_item_list()
207
+
208
+ def _extract_pattern(self, pattern, inc):
209
+ """
210
+ Parameters
211
+ ----------
212
+ pattern : regex to extract pattern
213
+ inc : increment of line count after extraction
214
+ Returns
215
+ -------
216
+ group : extracted info
217
+ """
218
+ try:
219
+ group = re.match(pattern, self.text[self.line_count]).group(1)
220
+ self.line_count += inc
221
+ except AttributeError:
222
+ raise ValueError("File format error at line %d:%s" % (self.line_count, self.text[self.line_count]))
223
+ return group
224
+
225
+ def _get_type(self):
226
+ self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)
227
+
228
+ def _get_time_intval(self):
229
+ self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
230
+ self.xmax = self._extract_pattern(r"xmax = (.*)", 2)
231
+
232
+ def _get_size(self):
233
+ self.size = int(self._extract_pattern(r"size = (.*)", 2))
234
+
235
+ def _get_item_list(self):
236
+ """Only supports IntervalTier currently"""
237
+ for itemIdx in range(1, self.size + 1):
238
+ tier = OrderedDict()
239
+ item_list = []
240
+ tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
241
+ tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
242
+ if tier_class != "IntervalTier":
243
+ raise NotImplementedError("Only IntervalTier class is supported currently")
244
+ tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
245
+ tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
246
+ tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
247
+ tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
248
+ for i in range(int(tier_size)):
249
+ item = OrderedDict()
250
+ item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
251
+ item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
252
+ item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
253
+ item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
254
+ item_list.append(item)
255
+ tier["idx"] = tier_idx
256
+ tier["class"] = tier_class
257
+ tier["name"] = tier_name
258
+ tier["xmin"] = tier_xmin
259
+ tier["xmax"] = tier_xmax
260
+ tier["size"] = tier_size
261
+ tier["items"] = item_list
262
+ self.tier_list.append(tier)
263
+
264
+ def toJson(self):
265
+ _json = OrderedDict()
266
+ _json["file_type"] = self.file_type
267
+ _json["xmin"] = self.xmin
268
+ _json["xmax"] = self.xmax
269
+ _json["size"] = self.size
270
+ _json["tiers"] = self.tier_list
271
+ return json.dumps(_json, ensure_ascii=False, indent=2)
272
+
273
+
274
+ def get_mel2ph(tg_fn, ph, mel, hparams):
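+ # Align the forced-alignment TextGrid with the phoneme list: merge consecutive silence
+ # intervals, locate the start time of each phoneme, then expand these boundaries into a
+ # frame-level mel2ph map and per-phoneme durations (counted in frames).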
275
+ ph_list = ph.split(" ")
276
+ with open(tg_fn, "r") as f:
277
+ tg = f.readlines()
278
+ tg = remove_empty_lines(tg)
279
+ tg = TextGrid(tg)
280
+ tg = json.loads(tg.toJson())
281
+ split = np.ones(len(ph_list) + 1, float) * -1  # plain float: np.float was removed in newer NumPy
282
+ tg_idx = 0
283
+ ph_idx = 0
284
+ tg_align = [x for x in tg['tiers'][-1]['items']]
285
+ tg_align_ = []
286
+ for x in tg_align:
287
+ x['xmin'] = float(x['xmin'])
288
+ x['xmax'] = float(x['xmax'])
289
+ if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']:
290
+ x['text'] = ''
291
+ if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
292
+ tg_align_[-1]['xmax'] = x['xmax']
293
+ continue
294
+ tg_align_.append(x)
295
+ tg_align = tg_align_
296
+ tg_len = len([x for x in tg_align if x['text'] != ''])
297
+ ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
298
+ assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, tg_fn)
299
+ while tg_idx < len(tg_align) or ph_idx < len(ph_list):
300
+ if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
301
+ split[ph_idx] = 1e8
302
+ ph_idx += 1
303
+ continue
304
+ x = tg_align[tg_idx]
305
+ if x['text'] == '' and ph_idx == len(ph_list):
306
+ tg_idx += 1
307
+ continue
308
+ assert ph_idx < len(ph_list), (tg_len, ph_len, tg_align, ph_list, tg_fn)
309
+ ph = ph_list[ph_idx]
310
+ if x['text'] == '' and not is_sil_phoneme(ph):
311
+ assert False, (ph_list, tg_align)
312
+ if x['text'] != '' and is_sil_phoneme(ph):
313
+ ph_idx += 1
314
+ else:
315
+ assert (x['text'] == '' and is_sil_phoneme(ph)) \
316
+ or x['text'].lower() == ph.lower() \
317
+ or x['text'].lower() == 'sil', (x['text'], ph)
318
+ split[ph_idx] = x['xmin']
319
+ if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(ph_list[ph_idx - 1]):
320
+ split[ph_idx - 1] = split[ph_idx]
321
+ ph_idx += 1
322
+ tg_idx += 1
323
+ assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
324
+ assert ph_idx >= len(ph_list) - 1, (ph_idx, ph_list, len(ph_list), [x['text'] for x in tg_align], tg_fn)
325
+ mel2ph = np.zeros([mel.shape[0]], int)  # plain int: np.int was removed in newer NumPy
326
+ split[0] = 0
327
+ split[-1] = 1e8
328
+ for i in range(len(split) - 1):
329
+ assert split[i] != -1 and split[i] <= split[i + 1], (split[:-1],)
330
+ split = [int(s * hparams['audio_sample_rate'] / hparams['hop_size'] + 0.5) for s in split]
331
+ for ph_idx in range(len(ph_list)):
332
+ mel2ph[split[ph_idx]:split[ph_idx + 1]] = ph_idx + 1
333
+ mel2ph_torch = torch.from_numpy(mel2ph)
334
+ T_t = len(ph_list)
335
+ dur = mel2ph_torch.new_zeros([T_t + 1]).scatter_add(0, mel2ph_torch, torch.ones_like(mel2ph_torch))
336
+ dur = dur[1:].numpy()
337
+ return mel2ph, dur
338
+
339
+
340
+ def build_phone_encoder(data_dir):
341
+ phone_list_file = os.path.join(data_dir, 'phone_set.json')
342
+ phone_list = json.load(open(phone_list_file))
343
+ return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
344
+
345
+
346
+ def is_sil_phoneme(p):
347
+ return not p[0].isalpha()
data_gen/tts/txt_processors/base_text_processor.py ADDED
@@ -0,0 +1,8 @@
1
+ class BaseTxtProcessor:
2
+ @staticmethod
3
+ def sp_phonemes():
4
+ return ['|']
5
+
6
+ @classmethod
7
+ def process(cls, txt, pre_align_args):
8
+ raise NotImplementedError
data_gen/tts/txt_processors/en.py ADDED
@@ -0,0 +1,78 @@
1
+ import re
2
+ from data_gen.tts.data_gen_utils import PUNCS
3
+ from g2p_en import G2p
4
+ import unicodedata
5
+ from g2p_en.expand import normalize_numbers
6
+ from nltk import pos_tag
7
+ from nltk.tokenize import TweetTokenizer
8
+
9
+ from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor
10
+
11
+
12
+ class EnG2p(G2p):
13
+ word_tokenize = TweetTokenizer().tokenize
14
+
15
+ def __call__(self, text):
16
+ # preprocessing
17
+ words = EnG2p.word_tokenize(text)
18
+ tokens = pos_tag(words) # tuples of (word, tag)
19
+
20
+ # steps
21
+ prons = []
22
+ for word, pos in tokens:
23
+ if re.search("[a-z]", word) is None:
24
+ pron = [word]
25
+
26
+ elif word in self.homograph2features: # Check homograph
27
+ pron1, pron2, pos1 = self.homograph2features[word]
28
+ if pos.startswith(pos1):
29
+ pron = pron1
30
+ else:
31
+ pron = pron2
32
+ elif word in self.cmu: # lookup CMU dict
33
+ pron = self.cmu[word][0]
34
+ else: # predict for oov
35
+ pron = self.predict(word)
36
+
37
+ prons.extend(pron)
38
+ prons.extend([" "])
39
+
40
+ return prons[:-1]
41
+
42
+
43
+ class TxtProcessor(BaseTxtProcessor):
44
+ g2p = EnG2p()
45
+
46
+ @staticmethod
47
+ def preprocess_text(text):
48
+ text = normalize_numbers(text)
49
+ text = ''.join(char for char in unicodedata.normalize('NFD', text)
50
+ if unicodedata.category(char) != 'Mn') # Strip accents
51
+ text = text.lower()
52
+ text = re.sub("[\'\"()]+", "", text)
53
+ text = re.sub("[-]+", " ", text)
54
+ text = re.sub(f"[^ a-z{PUNCS}]", "", text)
55
+ text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text) # !! -> !
56
+ text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> !
57
+ text = text.replace("i.e.", "that is")
58
+ text = text.replace("i.e.", "that is")
59
+ text = text.replace("etc.", "etc")
60
+ text = re.sub(f"([{PUNCS}])", r" \1 ", text)
61
+ text = re.sub(rf"\s+", r" ", text)
62
+ return text
63
+
64
+ @classmethod
65
+ def process(cls, txt, pre_align_args):
66
+ txt = cls.preprocess_text(txt).strip()
67
+ phs = cls.g2p(txt)
68
+ phs_ = []
69
+ n_word_sep = 0
70
+ for p in phs:
71
+ if p.strip() == '':
72
+ phs_ += ['|']
73
+ n_word_sep += 1
74
+ else:
75
+ phs_ += p.split(" ")
76
+ phs = phs_
77
+ assert n_word_sep + 1 == len(txt.split(" ")), (phs, f"\"{txt}\"")
78
+ return phs, txt
data_gen/tts/txt_processors/zh.py ADDED
@@ -0,0 +1,41 @@
1
+ import re
2
+ from pypinyin import pinyin, Style
3
+ from data_gen.tts.data_gen_utils import PUNCS
4
+ from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor
5
+ from utils.text_norm import NSWNormalizer
6
+
7
+
8
+ class TxtProcessor(BaseTxtProcessor):
9
+ table = {ord(f): ord(t) for f, t in zip(
10
+ u':,。!?【】()%#@&1234567890',
11
+ u':,.!?[]()%#@&1234567890')}
12
+
13
+ @staticmethod
14
+ def preprocess_text(text):
15
+ text = text.translate(TxtProcessor.table)
16
+ text = NSWNormalizer(text).normalize(remove_punc=False)
17
+ text = re.sub("[\'\"()]+", "", text)
18
+ text = re.sub("[-]+", " ", text)
19
+ text = re.sub(f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", "", text)
20
+ text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> !
21
+ text = re.sub(f"([{PUNCS}])", r" \1 ", text)
22
+ text = re.sub(rf"\s+", r"", text)
23
+ return text
24
+
25
+ @classmethod
26
+ def process(cls, txt, pre_align_args):
27
+ txt = cls.preprocess_text(txt)
28
+ shengmu = pinyin(txt, style=Style.INITIALS) # https://blog.csdn.net/zhoulei124/article/details/89055403
29
+ yunmu_finals = pinyin(txt, style=Style.FINALS)
30
+ yunmu_tone3 = pinyin(txt, style=Style.FINALS_TONE3)
31
+ yunmu = [[t[0] + '5'] if t[0] == f[0] else t for f, t in zip(yunmu_finals, yunmu_tone3)] \
32
+ if pre_align_args['use_tone'] else yunmu_finals
33
+
34
+ assert len(shengmu) == len(yunmu)
35
+ phs = ["|"]
36
+ for a, b, c in zip(shengmu, yunmu, yunmu_finals):
37
+ if a[0] == c[0]:
38
+ phs += [a[0], "|"]
39
+ else:
40
+ phs += [a[0], b[0], "|"]
41
+ return phs, txt
data_gen/tts/txt_processors/zh_g2pM.py ADDED
@@ -0,0 +1,72 @@
1
+ import re
2
+ import jieba
3
+ from pypinyin import pinyin, Style
4
+ from data_gen.tts.data_gen_utils import PUNCS
5
+ from data_gen.tts.txt_processors import zh
6
+ from g2pM import G2pM
7
+
8
+ ALL_SHENMU = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j',
9
+ 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w']
10
+ ALL_YUNMU = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian',
11
+ 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong', 'ou',
12
+ 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn']
13
+
14
+
15
+ class TxtProcessor(zh.TxtProcessor):
16
+ model = G2pM()
17
+
18
+ @staticmethod
19
+ def sp_phonemes():
20
+ return ['|', '#']
21
+
22
+ @classmethod
23
+ def process(cls, txt, pre_align_args):
24
+ txt = cls.preprocess_text(txt)
25
+ ph_list = cls.model(txt, tone=pre_align_args['use_tone'], char_split=True)
26
+ seg_list = '#'.join(jieba.cut(txt))
27
+ assert len(ph_list) == len([s for s in seg_list if s != '#']), (ph_list, seg_list)
28
+
29
+ # Insert word-boundary markers '#'
30
+ ph_list_ = []
31
+ seg_idx = 0
32
+ for p in ph_list:
33
+ p = p.replace("u:", "v")
34
+ if seg_list[seg_idx] == '#':
35
+ ph_list_.append('#')
36
+ seg_idx += 1
37
+ else:
38
+ ph_list_.append("|")
39
+ seg_idx += 1
40
+ if re.findall('[\u4e00-\u9fff]', p):
41
+ if pre_align_args['use_tone']:
42
+ p = pinyin(p, style=Style.TONE3, strict=True)[0][0]
43
+ if p[-1] not in ['1', '2', '3', '4', '5']:
44
+ p = p + '5'
45
+ else:
46
+ p = pinyin(p, style=Style.NORMAL, strict=True)[0][0]
47
+
48
+ finished = False
49
+ if len([c.isalpha() for c in p]) > 1:
50
+ for shenmu in ALL_SHENMU:
51
+ if p.startswith(shenmu) and not p.lstrip(shenmu).isnumeric():
52
+ ph_list_ += [shenmu, p.lstrip(shenmu)]
53
+ finished = True
54
+ break
55
+ if not finished:
56
+ ph_list_.append(p)
57
+
58
+ ph_list = ph_list_
59
+
60
+ # Remove word-boundary markers around silence symbols, e.g. [..., '#', ',', '#', ...]
61
+ sil_phonemes = list(PUNCS) + TxtProcessor.sp_phonemes()
62
+ ph_list_ = []
63
+ for i in range(0, len(ph_list), 1):
64
+ if ph_list[i] != '#' or (ph_list[i - 1] not in sil_phonemes and ph_list[i + 1] not in sil_phonemes):
65
+ ph_list_.append(ph_list[i])
66
+ ph_list = ph_list_
67
+ return ph_list, txt
68
+
69
+
70
+ if __name__ == '__main__':
71
+ phs, txt = TxtProcessor.process('他来到了,网易杭研大厦', {'use_tone': True})
72
+ print(phs)
docs/README-SVS-opencpop-cascade.md ADDED
@@ -0,0 +1,111 @@
1
+ # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
3
+ [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
4
+ [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
5
+
6
+ ## DiffSinger (MIDI SVS | A version)
7
+ ### 0. Data Acquisition
8
+ For the Opencpop dataset: please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to grant you access to Opencpop.
9
+
10
+ The pipeline below is designed for the Opencpop dataset:
11
+
12
+ ### 1. Preparation
13
+
14
+ #### Data Preparation
15
+ a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/`
16
+
17
+ b) Run the following scripts to pack the dataset for training/inference.
18
+
19
+ ```sh
20
+ export PYTHONPATH=.
21
+ CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml
22
+
23
+ # `data/binary/opencpop-midi-dp` will be generated.
24
+ ```
25
+
26
+ #### Vocoder Preparation
27
+ We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism.
28
+ Please unzip this file into `checkpoints` before training your acoustic model.
29
+
30
+ (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory)
31
+
32
+ This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.
33
+
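+ For reference, a minimal shell sketch of this setup (assuming the downloaded zip and the optional ckpt sit in your current directory; adjust the paths to your own):
+
+ ```sh
+ mkdir -p checkpoints
+ unzip 0109_hifigan_bigpopcs_hop128.zip -d checkpoints/
+ # Optional: use the checkpoint with more training steps.
+ mv model_ckpt_steps_1512000.ckpt checkpoints/0109_hifigan_bigpopcs_hop128/
+ ```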
34
+ #### Exp Name Preparation
35
+ ```bash
36
+ export MY_FS_EXP_NAME=0302_opencpop_fs_midi
37
+ export MY_DS_EXP_NAME=0303_opencpop_ds58_midi
38
+ ```
39
+
40
+ ```
41
+ .
42
+ |--data
43
+ |--raw
44
+ |--opencpop
45
+ |--segments
46
+ |--transcriptions.txt
47
+ |--wavs
48
+ |--checkpoints
49
+ |--MY_FS_EXP_NAME (optional)
50
+ |--MY_DS_EXP_NAME (optional)
51
+ |--0109_hifigan_bigpopcs_hop128
52
+ |--model_ckpt_steps_1512000.ckpt
53
+ |--config.yaml
54
+ ```
55
+
56
+ ### 2. Training Example
57
+ First, you need a pre-trained FFT-Singer checkpoint. You can use the pre-trained model, or train FFT-Singer from scratch by running:
58
+ ```sh
59
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml --exp_name $MY_FS_EXP_NAME --reset
60
+ ```
61
+
62
+ Then, to train DiffSinger, run:
63
+
64
+ ```sh
65
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset
66
+ ```
67
+
68
+ Remember to adjust the "fs2_ckpt" parameter in `usr/configs/midi/cascade/opencs/ds60_rel.yaml` to fit your path.
69
+
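+ For example, the adjusted entry might look like this (hypothetical checkpoint name; use the path of your own FFT-Singer run):
+
+ ```yaml
+ fs2_ckpt: checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt
+ ```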
70
+ ### 3. Inference from packed test set
71
+ ```sh
72
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer
73
+ ```
74
+
75
+ We also provide:
76
+ - the pre-trained model of DiffSinger;
77
+ - the pre-trained model of FFT-Singer;
78
+
79
+ They can be found [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip).
80
+
81
+ Remember to put the pre-trained models in the `checkpoints` directory.
82
+
83
+ ### 4. Inference from raw inputs
84
+ ```sh
85
+ python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME
86
+ ```
87
+ Raw inputs:
88
+ ```
89
+ inp = {
90
+ 'text': '小酒窝长睫毛AP是你最美的记号',
91
+ 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
92
+ 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
93
+ 'input_type': 'word'
94
+ } # user input: Chinese characters
95
+ or,
96
+ inp = {
97
+ 'text': '小酒窝长睫毛AP是你最美的记号',
98
+ 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
99
+ 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
100
+ 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
101
+ 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
102
+ 'input_type': 'phoneme'
103
+ } # input like Opencpop dataset.
104
+ ```
105
+
106
+ ### 5. Some issues.
107
+ a) HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is an out-of-domain dataset (unseen speaker), which may degrade audio quality; we are considering fine-tuning this vocoder on the training set of Opencpop.
108
+
109
+ b) in this version of the code, we use a melody frontend ([lyric + MIDI] -> [F0 + ph_dur]) to predict the F0 contour and phoneme durations.
110
+
111
+ c) generated audio demos can be found in [MY_DS_EXP_NAME](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip).
docs/README-SVS-opencpop-e2e.md ADDED
@@ -0,0 +1,107 @@
1
+ # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
3
+ [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
4
+ [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
5
+ | [Interactive🤗 SVS](https://huggingface.co/spaces/Silentlin/DiffSinger)
6
+
7
+ Substantial update: We 1) **abandon** the explicit prediction of the F0 curve; 2) increase the receptive field of the denoiser; 3) make the linguistic encoder more robust.
8
+ **By doing so, 1) the synthesized recordings are more natural in terms of pitch; 2) the pipeline is simpler.**
9
+
10
+ In short, the dynamics of the F0 curve are now left to the generative model to capture, instead of being constrained with an MSE loss on log-domain F0 as before.
11
+
12
+ ## DiffSinger (MIDI SVS | B version)
13
+ ### 0. Data Acquisition
14
+ For the Opencpop dataset: please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to grant you access to Opencpop.
15
+
16
+ The pipeline below is designed for the Opencpop dataset:
17
+
18
+ ### 1. Preparation
19
+
20
+ #### Data Preparation
21
+ a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/`
22
+
23
+ b) Run the following scripts to pack the dataset for training/inference.
24
+
25
+ ```sh
26
+ export PYTHONPATH=.
27
+ CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml
28
+
29
+ # `data/binary/opencpop-midi-dp` will be generated.
30
+ ```
31
+
32
+ #### Vocoder Preparation
33
+ We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism.
34
+
35
+ Also, please unzip the pre-trained vocoder and [this pendant for the vocoder](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0102_xiaoma_pe.zip) into `checkpoints` before training your acoustic model.
36
+
37
+ (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory)
38
+
39
+ This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.
40
+
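+ For reference, a minimal shell sketch of this step (assuming both zips were downloaded into the current directory):
+
+ ```sh
+ mkdir -p checkpoints
+ unzip 0109_hifigan_bigpopcs_hop128.zip -d checkpoints/
+ unzip 0102_xiaoma_pe.zip -d checkpoints/
+ ```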
41
+ #### Exp Name Preparation
42
+ ```bash
43
+ export MY_DS_EXP_NAME=0228_opencpop_ds100_rel
44
+ ```
45
+
46
+ ```
47
+ .
48
+ |--data
49
+ |--raw
50
+ |--opencpop
51
+ |--segments
52
+ |--transcriptions.txt
53
+ |--wavs
54
+ |--checkpoints
55
+ |--MY_DS_EXP_NAME (optional)
56
+ |--0109_hifigan_bigpopcs_hop128 (vocoder)
57
+ |--model_ckpt_steps_1512000.ckpt
58
+ |--config.yaml
59
+ ```
60
+
61
+ ### 2. Training Example
62
+ ```sh
63
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset
64
+ ```
65
+
66
+ ### 3. Inference from packed test set
67
+ ```sh
68
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer
69
+ ```
70
+
71
+ We also provide:
72
+ - the pre-trained model of DiffSinger;
73
+
74
+ They can be found [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip).
75
+
76
+ Remember to put the pre-trained models in the `checkpoints` directory.
77
+
78
+ ### 4. Inference from raw inputs
79
+ ```sh
80
+ python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME
81
+ ```
82
+ Raw inputs:
83
+ ```
84
+ inp = {
85
+ 'text': '小酒窝长睫毛AP是你最美的记号',
86
+ 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
87
+ 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
88
+ 'input_type': 'word'
89
+ } # user input: Chinese characters
90
+ or,
91
+ inp = {
92
+ 'text': '小酒窝长睫毛AP是你最美的记号',
93
+ 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
94
+ 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
95
+ 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
96
+ 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
97
+ 'input_type': 'phoneme'
98
+ } # input like Opencpop dataset.
99
+ ```
100
+
101
+ ### 5. Some issues.
102
+ a) HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is an out-of-domain dataset (unseen speaker), which may degrade audio quality; we are considering fine-tuning this vocoder on the training set of Opencpop.
103
+
104
+ b) in this version of the code, we use a melody frontend ([lyric + MIDI] -> [ph_dur]) to predict phoneme durations; the F0 curve is predicted implicitly, together with the mel-spectrogram.
105
+
106
+ c) example [generated audio](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/demos_0221/DS/).
107
+ More generated audio demos can be found in [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip).
docs/README-SVS-popcs.md ADDED
@@ -0,0 +1,63 @@
1
+ ## DiffSinger (SVS version)
2
+
3
+ ### 0. Data Acquisition
4
+ - See in [apply_form](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md).
5
+ - Dataset [preview](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_preview.zip).
6
+
7
+ ### 1. Preparation
8
+ #### Data Preparation
9
+ a) Download and extract PopCS, then create a link to the dataset folder: `ln -s /xxx/popcs/ data/processed/popcs`
10
+
11
+ b) Run the following scripts to pack the dataset for training/inference.
12
+ ```sh
13
+ export PYTHONPATH=.
14
+ CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml
15
+ # `data/binary/popcs-pmf0` will be generated.
16
+ ```
17
+
18
+ #### Vocoder Preparation
19
+ We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism.
20
+ Please unzip this file into `checkpoints` before training your acoustic model.
21
+
22
+ (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory)
23
+
24
+ This singing vocoder is trained on ~70 hours of singing data, so it can be viewed as a universal vocoder.
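+ 
+ If you want to use this vocoder outside the provided training/inference tasks, here is a minimal loading sketch, following `build_vocoder`/`run_vocoder` in `inference/svs/base_svs_infer.py` (added later in this commit); the checkpoint directory name is the one from the zip above:
+ 
+ ```python
+ # Sketch: load the NSF HifiGAN-Singing vocoder from checkpoints/ and run it on a mel + F0.
+ import glob, re, torch
+ from modules.hifigan.hifigan import HifiGanGenerator
+ from utils.hparams import set_hparams
+ 
+ base_dir = 'checkpoints/0109_hifigan_bigpopcs_hop128'
+ config = set_hparams(f'{base_dir}/config.yaml', global_hparams=False)
+ ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'),
+               key=lambda x: int(re.findall(r'steps_(\d+)', x)[0]))[-1]
+ state = torch.load(ckpt, map_location='cpu')['state_dict']['model_gen']
+ vocoder = HifiGanGenerator(config)
+ vocoder.load_state_dict(state, strict=True)
+ vocoder.remove_weight_norm()
+ vocoder.eval()
+ 
+ # mel: [B, T, 80] mel-spectrogram, f0: [B, T] F0 curve in Hz
+ # wav = vocoder(mel.transpose(2, 1), f0).view(-1)
+ ```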
25
+
26
+ ### 2. Training Example
27
+ First, you need a pre-trained FFT-Singer checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), or train FFT-Singer from scratch by running:
28
+
29
+ ```sh
30
+ # First, train fft-singer;
31
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset
32
+ # Then, infer fft-singer;
33
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer
34
+ ```
35
+
36
+ Then, to train DiffSinger, run:
37
+ ```sh
38
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset
39
+ ```
40
+
41
+ Remember to adjust the "fs2_ckpt" parameter in `usr/configs/popcs_ds_beta6_offline.yaml` to point to your FFT-Singer checkpoint path.
42
+
43
+ ### 3. Inference Example
44
+ ```sh
45
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer
46
+ ```
47
+
48
+ We also provide:
49
+ - the pre-trained model of [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip);
50
+ - the pre-trained model of [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip) for the shallow diffusion mechanism in DiffSinger;
51
+
52
+ Remember to put the pre-trained models in the `checkpoints` directory.
53
+
54
+ *Note that:*
55
+
56
+ - *the original PWG vocoder used in the paper has been put into commercial use, so we provide this HifiGAN vocoder as a substitute.*
57
+ - *we assume the ground-truth F0 to be given as the pitch information, following [1][2][3]. If you want to conduct experiments on MIDI data, you need an external F0 predictor (like [MIDI-A-version](README-SVS-opencpop-cascade.md)) or joint prediction with spectrograms (like [MIDI-B-version](README-SVS-opencpop-e2e.md)).*
58
+
59
+ [1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.
60
+
61
+ [2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.
62
+
63
+ [3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020.
docs/README-SVS.md ADDED
@@ -0,0 +1,76 @@
1
+ # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
3
+ [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
4
+ [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
5
+ | [Interactive🤗 SVS](https://huggingface.co/spaces/Silentlin/DiffSinger)
6
+
7
+ ## DiffSinger (SVS)
8
+
9
+ ### PART1. [Run DiffSinger on PopCS](README-SVS-popcs.md)
10
+ In PART 1, we focus only on spectrum modeling (the acoustic model) and assume the ground-truth (GT) F0 is given as the pitch information, following [1][2][3]. If you want to conduct experiments with F0 prediction, please move to PART 2.
11
+
12
+ Thus, the pipeline of this part can be summarized as:
13
+
14
+ ```
15
+ [lyrics] -> [linguistic representation] (Frontend)
16
+ [linguistic representation] + [GT F0] + [GT phoneme duration] -> [mel-spectrogram] (Acoustic model)
17
+ [mel-spectrogram] + [GT F0] -> [waveform] (Vocoder)
18
+ ```
19
+
20
+
21
+ [1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.
22
+
23
+ [2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.
24
+
25
+ [3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020.
26
+
27
+ Click here for detailed instructions: [link](README-SVS-popcs.md).
28
+
29
+
30
+ ### PART2. [Run DiffSinger on Opencpop](README-SVS-opencpop-cascade.md)
31
+ Thanks to the [Opencpop team](https://wenet.org.cn/opencpop/) for releasing their SVS dataset with MIDI labels on **Jan. 20, 2022** (after we published our paper).
32
+
33
+ Since there are elaborately annotated MIDI labels, we are able to supplement the pipeline in PART 1 by adding a naive melody frontend.
34
+
35
+ #### 2.A
36
+ Thus, the pipeline of [2.A](README-SVS-opencpop-cascade.md) can be summarized as:
37
+
38
+ ```
39
+ [lyrics] + [MIDI] -> [linguistic representation (with MIDI information)] + [predicted F0] + [predicted phoneme duration] (Melody frontend)
40
+ [linguistic representation] + [predicted F0] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model)
41
+ [mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder)
42
+ ```
43
+
44
+ Click here for detailed instructions: [link](README-SVS-opencpop-cascade.md).
45
+
46
+ #### 2.B
47
+ In 2.A, we find that predicting F0 explicitly in the melody frontend produces many bad cases of unvoiced/voiced (uv/v) prediction. Therefore, we abandon explicit prediction of the F0 curve in the melody frontend and predict it jointly with the spectrogram.
48
+
49
+ Thus, the pipeline of [2.B](README-SVS-opencpop-e2e.md) can be summarized as:
50
+ ```
51
+ [lyrics] + [MIDI] -> [linguistic representation] + [predicted phoneme duration] (Melody frontend)
52
+ [linguistic representation (with MIDI information)] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model)
53
+ [mel-spectrogram] -> [predicted F0] (Pitch extractor)
54
+ [mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder)
55
+ ```
56
+
57
+ Click here for detailed instructions: [link](README-SVS-opencpop-e2e.md).
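+ 
+ For readers who prefer code, the 2.B chain above corresponds to the following sketch, adapted from `forward_model`/`run_vocoder` in `inference/svs/ds_e2e.py` and `inference/svs/base_svs_infer.py` (added later in this commit). Here `model`, `pe` and `vocoder` stand for the already-loaded GaussianDiffusion acoustic model, PitchExtractor and NSF HifiGAN, and `batch` is a preprocessed input batch:
+ 
+ ```python
+ import torch
+ 
+ def infer_2b(model, pe, vocoder, batch):
+     """Sketch of the 2.B inference chain: linguistic input -> mel -> F0 -> waveform."""
+     with torch.no_grad():
+         # [linguistic representation (with MIDI)] + [predicted phoneme duration] -> [mel-spectrogram]
+         output = model(batch['txt_tokens'], spk_id=batch.get('spk_ids'), ref_mels=None, infer=True,
+                        pitch_midi=batch['pitch_midi'], midi_dur=batch['midi_dur'],
+                        is_slur=batch['is_slur'])
+         mel_out = output['mel_out']  # [B, T, 80]
+         # [mel-spectrogram] -> [predicted F0] (pitch extractor)
+         f0_pred = pe(mel_out)['f0_denorm_pred']
+         # [mel-spectrogram] + [predicted F0] -> [waveform] (NSF vocoder)
+         wav = vocoder(mel_out.transpose(2, 1), f0_pred).view(-1)
+     return wav
+ ```
+ 
+ (For 2.A, the only difference is that the F0 fed to the vocoder is the explicitly predicted `output['f0_denorm']` from the acoustic model, as in `inference/svs/ds_cascade.py`.)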
58
+
59
+ ### FAQ
60
+ Q1: Why do I need F0 in Vocoders?
61
+
62
+ A1: See the vocoder parts of HiFiSinger, DiffSinger, or SingGAN; this is now common practice.
63
+
64
+ Q2: Why not run the MIDI version of SVS on the PopCS dataset? Or why not release MIDI labels for the PopCS dataset?
65
+
66
+ A2: Our laboratory has no funds to label the PopCS dataset, but there are funds for labeling another singing dataset, which is coming soon.
67
+
68
+ Q3: Why do I get the error `'HifiGAN' object has no attribute 'model'`?
69
+
70
+ A3: Please put the pretrained vocoders in your `checkpoints` directory.
71
+
72
+ Q4: How can I check whether GT information or predicted information is used during inference from the packed test set?
73
+
74
+ A4: Please see the code [here](https://github.com/MoonInTheRiver/DiffSinger/blob/55e2f46068af6e69940a9f8f02d306c24a940cab/tasks/tts/fs2.py#L343).
75
+
76
+ ...
docs/README-TTS.md ADDED
@@ -0,0 +1,69 @@
1
+ # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
3
+ [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
4
+ [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
5
+ | [Interactive🤗 TTS](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
6
+
7
+ ## DiffSpeech (TTS)
8
+ ### 1. Preparation
9
+
10
+ #### Data Preparation
11
+ a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/`
12
+
13
+ b) Download and unpack the [ground-truth durations](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar) extracted by [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/`
14
+
15
+ c) Run the following scripts to pack the dataset for training/inference.
16
+
17
+ ```sh
18
+ export PYTHONPATH=.
19
+ CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml
20
+
21
+ # `data/binary/ljspeech` will be generated.
22
+ ```
23
+
24
+ #### Vocoder Preparation
25
+ We provide the pre-trained model of the [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder.
26
+ Please unzip this file into `checkpoints` before training your acoustic model.
27
+
28
+ ### 2. Training Example
29
+
30
+ First, you need a pre-trained FastSpeech 2 checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), or train FastSpeech 2 from scratch by running:
31
+ ```sh
32
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset
33
+ ```
34
+ Then, to train DiffSpeech, run:
35
+ ```sh
36
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset
37
+ ```
38
+
39
+ Remember to adjust the "fs2_ckpt" parameter in `usr/configs/lj_ds_beta6.yaml` to point to your FastSpeech 2 checkpoint path.
40
+
41
+ ### 3. Inference Example
42
+
43
+ ```sh
44
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer
45
+ ```
46
+
47
+ We also provide:
48
+ - the pre-trained model of [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip);
49
+ - the individual pre-trained model of [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip) for the shallow diffusion mechanism in DiffSpeech;
50
+
51
+ Remember to put the pre-trained models in the `checkpoints` directory.
52
+
53
+ ## Mel Visualization
54
+ Along the vertical axis, DiffSpeech occupies mel bins [0-80] and FastSpeech 2 occupies mel bins [80-160].
55
+
56
+ <table style="width:100%">
57
+ <tr>
58
+ <th>DiffSpeech vs. FastSpeech 2</th>
59
+ </tr>
60
+ <tr>
61
+ <td><img src="resources/diffspeech-fs2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
62
+ </tr>
63
+ <tr>
64
+ <td><img src="resources/diffspeech-fs2-1.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
65
+ </tr>
66
+ <tr>
67
+ <td><img src="resources/diffspeech-fs2-2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
68
+ </tr>
69
+ </table>
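+ 
+ A rough sketch (not part of the original release) of how such a stacked comparison image can be produced, assuming `mel_diffspeech` and `mel_fs2` are `[T, 80]` mel-spectrograms taken from the two models' outputs:
+ 
+ ```python
+ import numpy as np
+ import matplotlib.pyplot as plt
+ 
+ # Placeholder arrays; in practice use the [T, 80] `mel_out` of DiffSpeech and FastSpeech 2.
+ mel_diffspeech = np.random.rand(400, 80)
+ mel_fs2 = np.random.rand(400, 80)
+ 
+ # Stack along the mel-bin axis: bins 0-80 -> DiffSpeech, bins 80-160 -> FastSpeech 2.
+ stacked = np.concatenate([mel_diffspeech, mel_fs2], axis=1)
+ 
+ plt.figure(figsize=(10, 4))
+ plt.imshow(stacked.T, origin='lower', aspect='auto')
+ plt.axhline(80, color='white', linewidth=0.8)  # boundary between the two models
+ plt.xlabel('frame')
+ plt.ylabel('mel bin (0-80: DiffSpeech, 80-160: FastSpeech 2)')
+ plt.tight_layout()
+ plt.savefig('diffspeech-fs2.png')
+ ```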
docs/README-zh.md ADDED
@@ -0,0 +1,212 @@
1
+ # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
3
+ [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
4
+ [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
5
+ | [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-blue)](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
6
+ | [English README](../README.md)
7
+
8
+ 本仓库包含了我们的AAAI-2022 [论文](https://arxiv.org/abs/2105.02446)中提出的DiffSpeech (用于语音合成) 与 DiffSinger (用于歌声合成) 的官方Pytorch实现。
9
+
10
+ <table style="width:100%">
11
+ <tr>
12
+ <th>DiffSinger/DiffSpeech训练阶段</th>
13
+ <th>DiffSinger/DiffSpeech推理阶段</th>
14
+ </tr>
15
+ <tr>
16
+ <td><img src="resources/model_a.png" alt="Training" height="300"></td>
17
+ <td><img src="resources/model_b.png" alt="Inference" height="300"></td>
18
+ </tr>
19
+ </table>
20
+
21
+ :tada: :tada: :tada: **一些重要更新**:
22
+ - Mar.2, 2022: [MIDI-新版](README-SVS-opencpop-e2e.md): 重大更新 :sparkles:
23
+ - Mar.1, 2022: [NeuralSVB](https://github.com/MoonInTheRiver/NeuralSVB), 为了歌声美化任务的代码,开源了 :sparkles: :sparkles: :sparkles: .
24
+ - Feb.13, 2022: [NATSpeech](https://github.com/NATSpeech/NATSpeech), 一个升级后的代码框架, 包含了DiffSpeech和我们NeurIPS-2021的工作[PortaSpeech](https://openreview.net/forum?id=xmJsuh8xlq) 已经开源! :sparkles: :sparkles: :sparkles:.
25
+ - Jan.29, 2022: 支持了[MIDI-旧版](README-SVS-opencpop-cascade.md) 版本的歌声合成系统.
26
+ - Jan.13, 2022: 支持了歌声合成系统, 开源了PopCS数据集.
27
+ - Dec.19, 2021: 支持了语音合成系统. [HuggingFace🤗 Demo](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
28
+
29
+ :rocket: **新闻**:
30
+ - Feb.24, 2022: 我们的新工作`NeuralSVB` 被 ACL-2022 接收 [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2202.13277). [音频演示](https://neuralsvb.github.io).
31
+ - Dec.01, 2021: DiffSinger被AAAI-2022接收.
32
+ - Sep.29, 2021: 我们的新工作`PortaSpeech: Portable and High-Quality Generative Text-to-Speech` 被NeurIPS-2021接收 [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2109.15166) .
33
+ - May.06, 2021: 我们把这篇DiffSinger提交到了公开论文网站: Arxiv [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446).
34
+
35
+ ## 安装依赖
36
+ ```sh
37
+ conda create -n your_env_name python=3.8
38
+ source activate your_env_name
39
+ pip install -r requirements_2080.txt (GPU 2080Ti, CUDA 10.2)
40
+ or pip install -r requirements_3090.txt (GPU 3090, CUDA 11.4)
41
+ ```
42
+
43
+ ## DiffSpeech (语音合成的版本)
44
+ ### 1. 准备工作
45
+
46
+ #### 数据准备
47
+ a) 下载并解压 [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), 创建软链接: `ln -s /xxx/LJSpeech-1.1/ data/raw/`
48
+
49
+ b) 下载并解压 [我们用MFA预处理好的对齐](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/`
50
+
51
+ c) 按照如下脚本给数据集打包,打包后的二进制文件用于后续的训练和推理.
52
+
53
+ ```sh
54
+ export PYTHONPATH=.
55
+ CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml
56
+
57
+ # `data/binary/ljspeech` will be generated.
58
+ ```
59
+
60
+ #### 声码器准备
61
+ 我们提供了[HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip)声码器的预训练模型.
62
+ 请在训练声学模型前,先把声码器文件解压到`checkpoints`里。
63
+
64
+ ### 2. 训练样例
65
+
66
+ 首先你需要一个预训练好的FastSpeech2存档点. 你可以用[我们预训练好的模型](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), 或者跑下面这个指令从零开始训练FastSpeech2:
67
+ ```sh
68
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset
69
+ ```
70
+ 然后为了训练DiffSpeech, 运行:
71
+ ```sh
72
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset
73
+ ```
74
+
75
+ 记得针对你的路径修改`usr/configs/lj_ds_beta6.yaml`里"fs2_ckpt"这个参数.
76
+
77
+ ### 3. 推理样例
78
+
79
+ ```sh
80
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer
81
+ ```
82
+
83
+ 我们也提供了:
84
+ - [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip)的预训练模型;
85
+ - [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip)的预训练模型, 这是为了DiffSpeech里的浅扩散机制;
86
+
87
+ 记得把预训练模型放在 `checkpoints` 目录.
88
+
89
+ ## DiffSinger (歌声合成的版本)
90
+
91
+ ### 0. 数据获取
92
+ - 见 [申请表](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md).
93
+ - 数据集 [预览](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_preview.zip).
94
+
95
+ ### 1. 准备工作
96
+ #### 数据准备
97
+ a) 下载并解压PopCS, 创建软链接: `ln -s /xxx/popcs/ data/processed/popcs`
98
+
99
+ b) 按照如下脚本给数据集打包,打包后的二进制文件用于后续的训练和推理.
100
+ ```sh
101
+ export PYTHONPATH=.
102
+ CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml
103
+ # `data/binary/popcs-pmf0` 会生成出来.
104
+ ```
105
+
106
+ #### 声码器准备
107
+ 我们提供了[HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip)的预训练模型, 它专门为了歌声合成系统设计, 采用了NSF的技术。
108
+ 请在训练声学模型前,先把声码器文件解压到`checkpoints`里。
109
+
110
+ (更新: 你也可以将我们提供的[训练更多步数的存档点](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt)放到声码器的文件夹里)
111
+
112
+ 这个声码器是在大约70小时的较大数据集上训练的, 可以被认为是一个通用声码器。
113
+
114
+ ### 2. 训练样例
115
+ 首先你需要一个预训练好的FFT-Singer. 你可以用[我们预训练好的模型](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), 或者用如下脚本从零训练FFT-Singer:
116
+
117
+ ```sh
118
+ # First, train fft-singer;
119
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset
120
+ # Then, infer fft-singer;
121
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer
122
+ ```
123
+
124
+ 然后, 为了训练DiffSinger, 运行:
125
+ ```sh
126
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset
127
+ ```
128
+
129
+ 记得针对你的路径修改`usr/configs/popcs_ds_beta6_offline.yaml`里"fs2_ckpt"这个参数.
130
+
131
+ ### 3. 推理样例
132
+ ```sh
133
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer
134
+ ```
135
+
136
+ 我们也提供了:
137
+ - [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip)的预训练模型;
138
+ - [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip)的预训练模型, 这是为了DiffSinger里的浅扩散机制;
139
+
140
+ 记得把预训练模型放在 `checkpoints` 目录.
141
+
142
+ *请注意:*
143
+
144
+ - *我们原始论文中的PWG版本声码器已投入商业使用,因此我们提供此HifiGAN版本声码器作为替代品。*
145
+
146
+ - *我们这篇论文假设提供真实的F0来进行实验,如[1][2][3]等前作所做的那样,重点在频谱建模上,而非F0曲线的预测。如果你想对MIDI数据进行实验,从MIDI和歌词预测F0曲线(显式或隐式),请查看文档[MIDI-old-version](README-SVS-opencpop-cascade.md) 或 [MIDI-new-version](README-SVS-opencpop-e2e.md)。目前已经支持的MIDI数据集有: Opencpop*
147
+
148
+ [1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.
149
+
150
+ [2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.
151
+
152
+ [3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020.
153
+
154
+ ## Tensorboard
155
+ ```sh
156
+ tensorboard --logdir_spec exp_name
157
+ ```
158
+ <table style="width:100%">
159
+ <tr>
160
+ <td><img src="resources/tfb.png" alt="Tensorboard" height="250"></td>
161
+ </tr>
162
+ </table>
163
+
164
+ ## Mel 可视化
165
+ 沿着纵轴, DiffSpeech: [0-80]; FastSpeech2: [80-160].
166
+
167
+ <table style="width:100%">
168
+ <tr>
169
+ <th>DiffSpeech vs. FastSpeech 2</th>
170
+ </tr>
171
+ <tr>
172
+ <td><img src="resources/diffspeech-fs2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
173
+ </tr>
174
+ <tr>
175
+ <td><img src="resources/diffspeech-fs2-1.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
176
+ </tr>
177
+ <tr>
178
+ <td><img src="resources/diffspeech-fs2-2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
179
+ </tr>
180
+ </table>
181
+
182
+ ## Audio Demos
183
+ 音频样本可以看我们的[样例页](https://diffsinger.github.io/).
184
+
185
+ 我们也放了部分由DiffSpeech+HifiGAN (标记为[P]) 和 GTmel+HifiGAN (标记为[G]) 生成的测试集音频样例在:[resources/demos_1213](../resources/demos_1213).
186
+
187
+ (对应这个预训练参数:[DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip))
188
+
189
+ ---
190
+ :rocket: :rocket: :rocket: **更新:**
191
+
192
+ 新生成的歌声样例在:[resources/demos_0112](../resources/demos_0112).
193
+
194
+ ## Citation
195
+ 如果本仓库对你的研究和工作有用,请引用以下论文:
196
+
197
+ @article{liu2021diffsinger,
198
+ title={Diffsinger: Singing voice synthesis via shallow diffusion mechanism},
199
+ author={Liu, Jinglin and Li, Chengxi and Ren, Yi and Chen, Feiyang and Liu, Peng and Zhao, Zhou},
200
+ journal={arXiv preprint arXiv:2105.02446},
201
+ volume={2},
202
+ year={2021}}
203
+
204
+
205
+ ## 鸣谢
206
+ 我们的代码基于如下仓库:
207
+ * [denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch)
208
+ * [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning)
209
+ * [ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN)
210
+ * [HifiGAN](https://github.com/jik876/hifi-gan)
211
+ * [espnet](https://github.com/espnet/espnet)
212
+ * [DiffWave](https://github.com/lmnt-com/diffwave)
inference/svs/base_svs_infer.py ADDED
@@ -0,0 +1,265 @@
1
+ import os
2
+
3
+ import torch
4
+ import numpy as np
5
+ from modules.hifigan.hifigan import HifiGanGenerator
6
+ from vocoders.hifigan import HifiGAN
7
+ from inference.svs.opencpop.map import cpop_pinyin2ph_func
8
+
9
+ from utils import load_ckpt
10
+ from utils.hparams import set_hparams, hparams
11
+ from utils.text_encoder import TokenTextEncoder
12
+ from pypinyin import pinyin, lazy_pinyin, Style
13
+ import librosa
14
+ import glob
15
+ import re
16
+
17
+
18
+ class BaseSVSInfer:
19
+ def __init__(self, hparams, device=None):
20
+ if device is None:
21
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
22
+ self.hparams = hparams
23
+ self.device = device
24
+
25
+ phone_list = ["AP", "SP", "a", "ai", "an", "ang", "ao", "b", "c", "ch", "d", "e", "ei", "en", "eng", "er", "f", "g",
26
+ "h", "i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "j", "k", "l", "m", "n", "o",
27
+ "ong", "ou", "p", "q", "r", "s", "sh", "t", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo", "v",
28
+ "van", "ve", "vn", "w", "x", "y", "z", "zh"]
29
+ self.ph_encoder = TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
30
+ self.pinyin2phs = cpop_pinyin2ph_func()
31
+ self.spk_map = {'opencpop': 0}
32
+
33
+ self.model = self.build_model()
34
+ self.model.eval()
35
+ self.model.to(self.device)
36
+ self.vocoder = self.build_vocoder()
37
+ self.vocoder.eval()
38
+ self.vocoder.to(self.device)
39
+
40
+ def build_model(self):
41
+ raise NotImplementedError
42
+
43
+ def forward_model(self, inp):
44
+ raise NotImplementedError
45
+
46
+ def build_vocoder(self):
47
+ base_dir = hparams['vocoder_ckpt']
48
+ config_path = f'{base_dir}/config.yaml'
49
+ ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'), key=
50
+ lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).ckpt', x)[0]))[-1]
51
+ print('| load HifiGAN: ', ckpt)
52
+ ckpt_dict = torch.load(ckpt, map_location="cpu")
53
+ config = set_hparams(config_path, global_hparams=False)
54
+ state = ckpt_dict["state_dict"]["model_gen"]
55
+ vocoder = HifiGanGenerator(config)
56
+ vocoder.load_state_dict(state, strict=True)
57
+ vocoder.remove_weight_norm()
58
+ vocoder = vocoder.eval().to(self.device)
59
+ return vocoder
60
+
61
+ def run_vocoder(self, c, **kwargs):
62
+ c = c.transpose(2, 1) # [B, 80, T]
63
+ f0 = kwargs.get('f0') # [B, T]
64
+ if f0 is not None and hparams.get('use_nsf'):
65
+ # f0 = torch.FloatTensor(f0).to(self.device)
66
+ y = self.vocoder(c, f0).view(-1)
67
+ else:
68
+ y = self.vocoder(c).view(-1)
69
+ # [T]
70
+ return y[None]
71
+
72
+ def preprocess_word_level_input(self, inp):
73
+ # Pypinyin can't solve polyphonic words
74
+ text_raw = inp['text'].replace('最长', '最常').replace('长睫毛', '常睫毛') \
75
+ .replace('那么长', '那么常').replace('多长', '多常') \
76
+ .replace('很长', '很常') # We hope someone could provide a better g2p module for us by opening pull requests.
77
+
78
+ # lyric
79
+ pinyins = lazy_pinyin(text_raw, strict=False)
80
+ ph_per_word_lst = [self.pinyin2phs[pinyin.strip()] for pinyin in pinyins if pinyin.strip() in self.pinyin2phs]
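+ # e.g. '小酒窝' -> lazy_pinyin -> ['xiao', 'jiu', 'wo'] -> pinyin2phs -> ['x iao', 'j iu', 'w o']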
81
+
82
+ # Note
83
+ note_per_word_lst = [x.strip() for x in inp['notes'].split('|') if x.strip() != '']
84
+ mididur_per_word_lst = [x.strip() for x in inp['notes_duration'].split('|') if x.strip() != '']
85
+
86
+ if len(note_per_word_lst) == len(ph_per_word_lst) == len(mididur_per_word_lst):
87
+ print('Pass word-notes check.')
88
+ else:
89
+ print('The number of words doesn\'t match the number of notes\' windows. ',
90
+ 'You should split the note(s) for each word by | mark.')
91
+ print(ph_per_word_lst, note_per_word_lst, mididur_per_word_lst)
92
+ print(len(ph_per_word_lst), len(note_per_word_lst), len(mididur_per_word_lst))
93
+ return None
94
+
95
+ note_lst = []
96
+ ph_lst = []
97
+ midi_dur_lst = []
98
+ is_slur = []
99
+ for idx, ph_per_word in enumerate(ph_per_word_lst):
100
+ # for phs in one word:
101
+ # single ph like ['ai'] or multiple phs like ['n', 'i']
102
+ ph_in_this_word = ph_per_word.split()
103
+
104
+ # for notes in one word:
105
+ # single note like ['D4'] or multiple notes like ['D4', 'E4'] which means a 'slur' here.
106
+ note_in_this_word = note_per_word_lst[idx].split()
107
+ midi_dur_in_this_word = mididur_per_word_lst[idx].split()
108
+ # process for the model input
109
+ # Step 1.
110
+ # Deal with note of 'not slur' case or the first note of 'slur' case
111
+ # j ie
112
+ # F#4/Gb4 F#4/Gb4
113
+ # 0 0
114
+ for ph in ph_in_this_word:
115
+ ph_lst.append(ph)
116
+ note_lst.append(note_in_this_word[0])
117
+ midi_dur_lst.append(midi_dur_in_this_word[0])
118
+ is_slur.append(0)
119
+ # step 2.
120
+ # Deal with the 2nd, 3rd... notes of 'slur' case
121
+ # j ie ie
122
+ # F#4/Gb4 F#4/Gb4 C#4/Db4
123
+ # 0 0 1
124
+ if len(note_in_this_word) > 1: # is_slur = True, we should repeat the YUNMU to match the 2nd, 3rd... notes.
125
+ for idx in range(1, len(note_in_this_word)):
126
+ ph_lst.append(ph_in_this_word[-1])
127
+ note_lst.append(note_in_this_word[idx])
128
+ midi_dur_lst.append(midi_dur_in_this_word[idx])
129
+ is_slur.append(1)
130
+ ph_seq = ' '.join(ph_lst)
131
+
132
+ if len(ph_lst) == len(note_lst) == len(midi_dur_lst):
133
+ print(len(ph_lst), len(note_lst), len(midi_dur_lst))
134
+ print('Pass word-notes check.')
135
+ else:
136
+ print('The number of words doesn\'t match the number of notes\' windows. ',
137
+ 'You should split the note(s) for each word by | mark.')
138
+ return None
139
+ return ph_seq, note_lst, midi_dur_lst, is_slur
140
+
141
+ def preprocess_phoneme_level_input(self, inp):
142
+ ph_seq = inp['ph_seq']
143
+ note_lst = inp['note_seq'].split()
144
+ midi_dur_lst = inp['note_dur_seq'].split()
145
+ is_slur = [float(x) for x in inp['is_slur_seq'].split()]
146
+ print(len(note_lst), len(ph_seq.split()), len(midi_dur_lst))
147
+ if len(note_lst) == len(ph_seq.split()) == len(midi_dur_lst):
148
+ print('Pass word-notes check.')
149
+ else:
150
+ print('The number of words doesn\'t match the number of notes\' windows. ',
151
+ 'You should split the note(s) for each word by | mark.')
152
+ return None
153
+ return ph_seq, note_lst, midi_dur_lst, is_slur
154
+
155
+ def preprocess_input(self, inp, input_type='word'):
156
+ """
157
+ :param inp: a dict describing one song segment. For word-level input: {'text': str, 'notes': str, 'notes_duration': str, 'input_type': 'word'}; for phoneme-level input (like transcriptions.txt in Opencpop): {'text': str, 'ph_seq': str, 'note_seq': str, 'note_dur_seq': str, 'is_slur_seq': str, 'input_type': 'phoneme'}. 'item_name' and 'spk_name' are optional.
+ :return: a preprocessed item dict for input_to_batch, or None if the input fails the checks.
160
+ """
161
+
162
+ item_name = inp.get('item_name', '<ITEM_NAME>')
163
+ spk_name = inp.get('spk_name', 'opencpop')
164
+
165
+ # single spk
166
+ spk_id = self.spk_map[spk_name]
167
+
168
+ # get ph seq, note lst, midi dur lst, is slur lst.
169
+ if input_type == 'word':
170
+ ret = self.preprocess_word_level_input(inp)
171
+ elif input_type == 'phoneme': # like transcriptions.txt in Opencpop dataset.
172
+ ret = self.preprocess_phoneme_level_input(inp)
173
+ else:
174
+ print('Invalid input type.')
175
+ return None
176
+
177
+ if ret:
178
+ ph_seq, note_lst, midi_dur_lst, is_slur = ret
179
+ else:
180
+ print('==========> Preprocessing of word-level or phoneme-level input failed.')
181
+ return None
182
+
183
+ # convert note lst to midi id; convert note dur lst to midi duration
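+ # e.g. 'C#4/Db4' -> keep the first spelling 'C#4' -> librosa.note_to_midi('C#4') == 61; 'rest' is mapped to midi id 0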
184
+ try:
185
+ midis = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
186
+ for x in note_lst]
187
+ midi_dur_lst = [float(x) for x in midi_dur_lst]
188
+ except Exception as e:
189
+ print(e)
190
+ print('Invalid Input Type.')
191
+ return None
192
+
193
+ ph_token = self.ph_encoder.encode(ph_seq)
194
+ item = {'item_name': item_name, 'text': inp['text'], 'ph': ph_seq, 'spk_id': spk_id,
195
+ 'ph_token': ph_token, 'pitch_midi': np.asarray(midis), 'midi_dur': np.asarray(midi_dur_lst),
196
+ 'is_slur': np.asarray(is_slur), }
197
+ item['ph_len'] = len(item['ph_token'])
198
+ return item
199
+
200
+ def input_to_batch(self, item):
201
+ item_names = [item['item_name']]
202
+ text = [item['text']]
203
+ ph = [item['ph']]
204
+ txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
205
+ txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
206
+ spk_ids = torch.LongTensor(item['spk_id'])[None, :].to(self.device)
207
+
208
+ pitch_midi = torch.LongTensor(item['pitch_midi'])[None, :hparams['max_frames']].to(self.device)
209
+ midi_dur = torch.FloatTensor(item['midi_dur'])[None, :hparams['max_frames']].to(self.device)
210
+ is_slur = torch.LongTensor(item['is_slur'])[None, :hparams['max_frames']].to(self.device)
211
+
212
+ batch = {
213
+ 'item_name': item_names,
214
+ 'text': text,
215
+ 'ph': ph,
216
+ 'txt_tokens': txt_tokens,
217
+ 'txt_lengths': txt_lengths,
218
+ 'spk_ids': spk_ids,
219
+ 'pitch_midi': pitch_midi,
220
+ 'midi_dur': midi_dur,
221
+ 'is_slur': is_slur
222
+ }
223
+ return batch
224
+
225
+ def postprocess_output(self, output):
226
+ return output
227
+
228
+ def infer_once(self, inp):
229
+ inp = self.preprocess_input(inp, input_type=inp['input_type'] if inp.get('input_type') else 'word')
230
+ output = self.forward_model(inp)
231
+ output = self.postprocess_output(output)
232
+ return output
233
+
234
+ @classmethod
235
+ def example_run(cls, inp):
236
+ from utils.audio import save_wav
237
+ set_hparams(print_hparams=False)
238
+ infer_ins = cls(hparams)
239
+ out = infer_ins.infer_once(inp)
240
+ os.makedirs('infer_out', exist_ok=True)
241
+ save_wav(out, f'infer_out/example_out.wav', hparams['audio_sample_rate'])
242
+
243
+
244
+ # if __name__ == '__main__':
245
+ # debug
246
+ # a = BaseSVSInfer(hparams)
247
+ # a.preprocess_input({'text': '你 说 你 不 SP 懂 为 何 在 这 时 牵 手 AP',
248
+ # 'notes': 'D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | rest | D#4/Eb4 | D4 | D4 | D4 | D#4/Eb4 | F4 | D#4/Eb4 | D4 | rest',
249
+ # 'notes_duration': '0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590'
250
+ # })
251
+
252
+ # b = {
253
+ # 'text': '小酒窝长睫毛AP是你最美的记号',
254
+ # 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
255
+ # 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340'
256
+ # }
257
+ # c = {
258
+ # 'text': '小酒窝长睫毛AP是你最美的记号',
259
+ # 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
260
+ # 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
261
+ # 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
262
+ # 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
263
+ # } # input like Opencpop dataset.
264
+ # a.preprocess_input(b)
265
+ # a.preprocess_input(c, input_type='phoneme')
inference/svs/ds_cascade.py ADDED
@@ -0,0 +1,56 @@
1
+ import torch
2
+ # from inference.tts.fs import FastSpeechInfer
3
+ # from modules.tts.fs2_orig import FastSpeech2Orig
4
+ from inference.svs.base_svs_infer import BaseSVSInfer
5
+ from utils import load_ckpt
6
+ from utils.hparams import hparams
7
+ from usr.diff.shallow_diffusion_tts import GaussianDiffusion
8
+ from usr.diffsinger_task import DIFF_DECODERS
9
+
10
+ class DiffSingerCascadeInfer(BaseSVSInfer):
11
+ def build_model(self):
12
+ model = GaussianDiffusion(
13
+ phone_encoder=self.ph_encoder,
14
+ out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
15
+ timesteps=hparams['timesteps'],
16
+ K_step=hparams['K_step'],
17
+ loss_type=hparams['diff_loss_type'],
18
+ spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
19
+ )
20
+ model.eval()
21
+ load_ckpt(model, hparams['work_dir'], 'model')
22
+ return model
23
+
24
+ def forward_model(self, inp):
25
+ sample = self.input_to_batch(inp)
26
+ txt_tokens = sample['txt_tokens'] # [B, T_t]
27
+ spk_id = sample.get('spk_ids')
28
+ with torch.no_grad():
29
+ output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
30
+ pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
31
+ is_slur=sample['is_slur'])
32
+ mel_out = output['mel_out'] # [B, T,80]
33
+ f0_pred = output['f0_denorm']
34
+ wav_out = self.run_vocoder(mel_out, f0=f0_pred)
35
+ wav_out = wav_out.cpu().numpy()
36
+ return wav_out[0]
37
+
38
+
39
+ if __name__ == '__main__':
40
+ inp = {
41
+ 'text': '小酒窝长睫毛AP是你最美的记号',
42
+ 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
43
+ 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
44
+ 'input_type': 'word'
45
+ } # user input: Chinese characters
46
+ c = {
47
+ 'text': '小酒窝长睫毛AP是你最美的记号',
48
+ 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
49
+ 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
50
+ 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
51
+ 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
52
+ 'input_type': 'phoneme'
53
+ } # input like Opencpop dataset.
54
+ DiffSingerCascadeInfer.example_run(inp)
55
+
56
+ # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
inference/svs/ds_e2e.py ADDED
@@ -0,0 +1,67 @@
1
+ import torch
2
+ # from inference.tts.fs import FastSpeechInfer
3
+ # from modules.tts.fs2_orig import FastSpeech2Orig
4
+ from inference.svs.base_svs_infer import BaseSVSInfer
5
+ from utils import load_ckpt
6
+ from utils.hparams import hparams
7
+ from usr.diff.shallow_diffusion_tts import GaussianDiffusion
8
+ from usr.diffsinger_task import DIFF_DECODERS
9
+ from modules.fastspeech.pe import PitchExtractor
10
+ import utils
11
+
12
+
13
+ class DiffSingerE2EInfer(BaseSVSInfer):
14
+ def build_model(self):
15
+ model = GaussianDiffusion(
16
+ phone_encoder=self.ph_encoder,
17
+ out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
18
+ timesteps=hparams['timesteps'],
19
+ K_step=hparams['K_step'],
20
+ loss_type=hparams['diff_loss_type'],
21
+ spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
22
+ )
23
+ model.eval()
24
+ load_ckpt(model, hparams['work_dir'], 'model')
25
+
26
+ if hparams.get('pe_enable') is not None and hparams['pe_enable']:
27
+ self.pe = PitchExtractor().to(self.device)
28
+ utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
29
+ self.pe.eval()
30
+ return model
31
+
32
+ def forward_model(self, inp):
33
+ sample = self.input_to_batch(inp)
34
+ txt_tokens = sample['txt_tokens'] # [B, T_t]
35
+ spk_id = sample.get('spk_ids')
36
+ with torch.no_grad():
37
+ output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
38
+ pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
39
+ is_slur=sample['is_slur'])
40
+ mel_out = output['mel_out'] # [B, T,80]
41
+ if hparams.get('pe_enable') is not None and hparams['pe_enable']:
42
+ f0_pred = self.pe(mel_out)['f0_denorm_pred'] # pe predict from Pred mel
43
+ else:
44
+ f0_pred = output['f0_denorm']
45
+ wav_out = self.run_vocoder(mel_out, f0=f0_pred)
46
+ wav_out = wav_out.cpu().numpy()
47
+ return wav_out[0]
48
+
49
+ if __name__ == '__main__':
50
+ inp = {
51
+ 'text': '小酒窝长睫毛AP是你最美的记号',
52
+ 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
53
+ 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
54
+ 'input_type': 'word'
55
+ } # user input: Chinese characters
56
+ inp = {
57
+ 'text': '小酒窝长睫毛AP是你最美的记号',
58
+ 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
59
+ 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
60
+ 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
61
+ 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
62
+ 'input_type': 'phoneme'
63
+ } # input like Opencpop dataset.
64
+ DiffSingerE2EInfer.example_run(inp)
65
+
66
+
67
+ # CUDA_VISIBLE_DEVICES=3 python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel