add backend inference and interface output
- .gitignore +3 -0
- app.py +52 -13
- ckpts/svc/vocalist_l1_contentvec+whisper/args.json +2 -1
- config/audioldm.json +92 -0
- config/autoencoderkl.json +69 -0
- config/base.json +220 -0
- config/comosvc.json +216 -0
- config/diffusion.json +227 -0
- config/fs2.json +117 -0
- config/transformer.json +180 -0
- config/tts.json +23 -0
- config/valle.json +52 -0
- config/vits.json +101 -0
- config/vocoder.json +84 -0
- egs/vocoder/README.md +23 -0
- egs/vocoder/diffusion/README.md +0 -0
- egs/vocoder/diffusion/exp_config_base.json +0 -0
- egs/vocoder/gan/README.md +224 -0
- egs/vocoder/gan/_template/run.sh +143 -0
- egs/vocoder/gan/apnet/exp_config.json +45 -0
- egs/vocoder/gan/apnet/run.sh +143 -0
- egs/vocoder/gan/bigvgan/exp_config.json +66 -0
- egs/vocoder/gan/bigvgan/run.sh +143 -0
- egs/vocoder/gan/bigvgan_large/exp_config.json +70 -0
- egs/vocoder/gan/bigvgan_large/run.sh +143 -0
- egs/vocoder/gan/exp_config_base.json +111 -0
- egs/vocoder/gan/hifigan/exp_config.json +59 -0
- egs/vocoder/gan/hifigan/run.sh +143 -0
- egs/vocoder/gan/melgan/exp_config.json +34 -0
- egs/vocoder/gan/melgan/run.sh +143 -0
- egs/vocoder/gan/nsfhifigan/exp_config.json +83 -0
- egs/vocoder/gan/nsfhifigan/run.sh +143 -0
- egs/vocoder/gan/tfr_enhanced_hifigan/README.md +185 -0
- egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json +118 -0
- egs/vocoder/gan/tfr_enhanced_hifigan/run.sh +145 -0
- inference.py +6 -2
- modules/__init__.py +0 -0
- modules/activation_functions/__init__.py +7 -0
- modules/activation_functions/gated_activation_unit.py +61 -0
- modules/activation_functions/snake.py +122 -0
- modules/anti_aliasing/__init__.py +8 -0
- modules/anti_aliasing/act.py +35 -0
- modules/anti_aliasing/filter.py +99 -0
- modules/anti_aliasing/resample.py +64 -0
- modules/base/base_module.py +75 -0
- modules/diffusion/__init__.py +7 -0
- modules/diffusion/bidilconv/bidilated_conv.py +102 -0
- modules/diffusion/bidilconv/residual_block.py +73 -0
- modules/diffusion/karras/karras_diffusion.py +979 -0
- modules/diffusion/karras/random_utils.py +177 -0
.gitignore CHANGED
@@ -1,6 +1,9 @@
 __pycache__
 flagged
 result
+source_audios
+ckpts/svc/vocalist_l1_contentvec+whisper/data
+!ckpts/svc/vocalist_l1_contentvec+whisper/data/vocalist_l1
 
 # Developing mode
 _*.sh
app.py CHANGED
@@ -1,5 +1,11 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
 
+import gradio as gr
+import os
+import inference
 
 SUPPORTED_TARGET_SINGERS = {
     "Adele": "vocalist_l1_Adele",
@@ -21,33 +27,58 @@ SUPPORTED_TARGET_SINGERS = {
 
 
 def svc_inference(
+    source_audio_path,
     target_singer,
-    key_shift_mode="auto",
+    key_shift_mode="Auto Shift",
     key_shift_num=0,
+    diffusion_steps=1000,
 ):
+    #### Prepare source audio file ####
+    print("source_audio_path: {}".format(source_audio_path))
+    audio_file = source_audio_path.split("/")[-1]
+    audio_name = audio_file.split(".")[0]
+    source_audio_dir = source_audio_path.replace(audio_file, "")
+
+    ### Target Singer ###
+    target_singer = SUPPORTED_TARGET_SINGERS[target_singer]
+
+    ### Inference ###
+    if key_shift_mode == "Auto Shift":
+        key_shift = "autoshift"
+    else:
+        key_shift = key_shift_num
+
+    args_list = ["--config", "ckpts/svc/vocalist_l1_contentvec+whisper/args.json"]
+    args_list += ["--acoustics_dir", "ckpts/svc/vocalist_l1_contentvec+whisper"]
+    args_list += ["--vocoder_dir", "pretrained/bigvgan"]
+    args_list += ["--target_singer", target_singer]
+    args_list += ["--trans_key", str(key_shift)]
+    args_list += ["--diffusion_inference_steps", str(diffusion_steps)]
+    args_list += ["--source", source_audio_dir]
+    args_list += ["--output_dir", "result"]
+    args_list += ["--log_level", "debug"]
+
+    os.environ["WORK_DIR"] = "./"
+    inference.main(args_list)
+
+    ### Display ###
+    result_file = os.path.join(
+        "result/{}/{}_{}.wav".format(audio_name, audio_name, target_singer)
+    )
+    return result_file
 
 
 demo_inputs = [
     gr.Audio(
         sources=["upload", "microphone"],
         label="Upload (or record) a song you want to listen",
+        type="filepath",
     ),
     gr.Radio(
         choices=list(SUPPORTED_TARGET_SINGERS.keys()),
         label="Target Singer",
         value="Jian Li 李健",
     ),
-    gr.Slider(
-        1,
-        1000,
-        value=1000,
-        step=1,
-        label="Diffusion Inference Steps",
-        info="As the step number increases, the synthesis quality will be better while the inference speed will be lower",
-    ),
     gr.Radio(
         choices=["Auto Shift", "Key Shift"],
         value="Auto Shift",
@@ -62,6 +93,14 @@ demo_inputs = [
         label="Key Shift Values",
         info='How many semitones you want to transpose. This parameter will work only if you choose "Key Shift"',
     ),
+    gr.Slider(
+        1,
+        1000,
+        value=1000,
+        step=1,
+        label="Diffusion Inference Steps",
+        info="As the step number increases, the synthesis quality will be better while the inference speed will be lower",
+    ),
 ]
 
 demo_outputs = gr.Audio(label="")
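Note: the new svc_inference above is just a thin wrapper that forwards CLI-style flags to inference.main. For reference, the same conversion can be run headlessly with the identical argument list; this is a minimal sketch using only the flags visible in the diff, and the paths assume this Space's checkpoint layout:

# Headless sketch mirroring the args_list that app.py builds above.
# Paths (ckpts/..., pretrained/bigvgan, source_audios/) assume the Space's layout.
import os
import inference

os.environ["WORK_DIR"] = "./"
inference.main([
    "--config", "ckpts/svc/vocalist_l1_contentvec+whisper/args.json",
    "--acoustics_dir", "ckpts/svc/vocalist_l1_contentvec+whisper",
    "--vocoder_dir", "pretrained/bigvgan",
    "--target_singer", "vocalist_l1_Adele",
    "--trans_key", "autoshift",            # or a semitone offset such as "2"
    "--diffusion_inference_steps", "1000",
    "--source", "source_audios/",          # directory containing the input wav
    "--output_dir", "result",
    "--log_level", "debug",
])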
ckpts/svc/vocalist_l1_contentvec+whisper/args.json CHANGED
@@ -1,5 +1,5 @@
 {
+    "task_type": "svc",
     "dataset": [
         "vocalist_l1",
     ],
@@ -195,6 +195,7 @@
     "whisper_frameshift": 0.01,
     "whisper_model": "medium",
     "whisper_model_path": "pretrained/whisper/medium.pt",
+    "whisper_sample_rate": 16000,
     "win_size": 1024,
     },
     "supported_model_type": [
config/audioldm.json ADDED
{
    "base_config": "config/base.json",
    "model_type": "AudioLDM",
    "task_type": "tta",
    "dataset": [
        "AudioCaps"
    ],
    "preprocess": {
        // feature used for model training
        "use_spkid": false,
        "use_uv": false,
        "use_frame_pitch": false,
        "use_phone_pitch": false,
        "use_frame_energy": false,
        "use_phone_energy": false,
        "use_mel": false,
        "use_audio": false,
        "use_label": false,
        "use_one_hot": false,
        "cond_mask_prob": 0.1
    },
    // model
    "model": {
        "audioldm": {
            "image_size": 32,
            "in_channels": 4,
            "out_channels": 4,
            "model_channels": 256,
            "attention_resolutions": [4, 2, 1],
            "num_res_blocks": 2,
            "channel_mult": [1, 2, 4],
            "num_heads": 8,
            "use_spatial_transformer": true,
            "transformer_depth": 1,
            "context_dim": 768,
            "use_checkpoint": true,
            "legacy": false
        },
        "autoencoderkl": {
            "ch": 128,
            "ch_mult": [1, 1, 2, 2, 4],
            "num_res_blocks": 2,
            "in_channels": 1,
            "z_channels": 4,
            "out_ch": 1,
            "double_z": true
        },
        "noise_scheduler": {
            "num_train_timesteps": 1000,
            "beta_start": 0.00085,
            "beta_end": 0.012,
            "beta_schedule": "scaled_linear",
            "clip_sample": false,
            "steps_offset": 1,
            "set_alpha_to_one": false,
            "skip_prk_steps": true,
            "prediction_type": "epsilon"
        }
    },
    // train
    "train": {
        "lronPlateau": {
            "factor": 0.9,
            "patience": 100,
            "min_lr": 4.0e-5,
            "verbose": true
        },
        "adam": {
            "lr": 5.0e-5,
            "betas": [0.9, 0.999],
            "weight_decay": 1.0e-2,
            "eps": 1.0e-8
        }
    }
}
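The noise_scheduler block uses the same key names and scaled-linear beta values as diffusers-style scheduler configs (the schedule popularized by Stable-Diffusion-like latent diffusion). A hedged sketch of that mapping, assuming diffusers is the consumer of these keys (Amphion's own loader may wire this differently; from_config ignores keys a given scheduler does not define):

# Sketch only: build a diffusers scheduler from the "noise_scheduler" block above.
from diffusers import PNDMScheduler

noise_cfg = {
    "num_train_timesteps": 1000,
    "beta_start": 0.00085,
    "beta_end": 0.012,
    "beta_schedule": "scaled_linear",
    "clip_sample": False,
    "steps_offset": 1,
    "set_alpha_to_one": False,
    "skip_prk_steps": True,
    "prediction_type": "epsilon",
}
scheduler = PNDMScheduler.from_config(noise_cfg)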
config/autoencoderkl.json ADDED
{
    "base_config": "config/base.json",
    "model_type": "AutoencoderKL",
    "task_type": "tta",
    "dataset": [
        "AudioCaps"
    ],
    "preprocess": {
        // feature used for model training
        "use_spkid": false,
        "use_uv": false,
        "use_frame_pitch": false,
        "use_phone_pitch": false,
        "use_frame_energy": false,
        "use_phone_energy": false,
        "use_mel": false,
        "use_audio": false,
        "use_label": false,
        "use_one_hot": false
    },
    // model
    "model": {
        "autoencoderkl": {
            "ch": 128,
            "ch_mult": [1, 1, 2, 2, 4],
            "num_res_blocks": 2,
            "in_channels": 1,
            "z_channels": 4,
            "out_ch": 1,
            "double_z": true
        },
        "loss": {
            "kl_weight": 1e-8,
            "disc_weight": 0.5,
            "disc_factor": 1.0,
            "logvar_init": 0.0,
            "min_adapt_d_weight": 0.0,
            "max_adapt_d_weight": 10.0,
            "disc_start": 50001,
            "disc_in_channels": 1,
            "disc_num_layers": 3,
            "use_actnorm": false
        }
    },
    // train
    "train": {
        "lronPlateau": {
            "factor": 0.9,
            "patience": 100,
            "min_lr": 4.0e-5,
            "verbose": true
        },
        "adam": {
            "lr": 4.0e-4,
            "betas": [0.9, 0.999],
            "weight_decay": 1.0e-2,
            "eps": 1.0e-8
        }
    }
}
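The loss block mirrors a VQGAN/LDM-style adversarial KL autoencoder: the discriminator only contributes after disc_start steps, and its weight is adapted and clamped to [min_adapt_d_weight, max_adapt_d_weight]. A sketch of that standard adaptive weighting under those assumptions — the helper below is illustrative, not Amphion's code:

import torch

def adaptive_disc_weight(rec_loss, g_loss, last_layer,
                         disc_weight=0.5, min_w=0.0, max_w=10.0):
    # Standard VQGAN trick (illustrative): balance generator-adversarial
    # gradients against reconstruction gradients at the decoder's last layer.
    rec_grads = torch.autograd.grad(rec_loss, last_layer, retain_graph=True)[0]
    g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
    w = torch.norm(rec_grads) / (torch.norm(g_grads) + 1e-4)
    # Clamp to the config's min/max_adapt_d_weight, then scale by disc_weight.
    return torch.clamp(w, min_w, max_w).detach() * disc_weight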
config/base.json ADDED
{
    "supported_model_type": [
        "GANVocoder",
        "Fastspeech2",
        "DiffSVC",
        "Transformer",
        "EDM",
        "CD"
    ],
    "task_type": "",
    "dataset": [],
    "use_custom_dataset": false,
    "preprocess": {
        "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
        // trim audio silence
        "data_augment": false,
        "trim_silence": false,
        "num_silent_frames": 8,
        "trim_fft_size": 512, // fft size used in trimming
        "trim_hop_size": 128, // hop size used in trimming
        "trim_top_db": 30, // top db used in trimming, sensitive to each dataset
        // acoustic features
        "extract_mel": false,
        "mel_extract_mode": "",
        "extract_linear_spec": false,
        "extract_mcep": false,
        "extract_pitch": false,
        "extract_acoustic_token": false,
        "pitch_remove_outlier": false,
        "extract_uv": false,
        "pitch_norm": false,
        "extract_audio": false,
        "extract_label": false,
        "pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
        "extract_energy": false,
        "energy_remove_outlier": false,
        "energy_norm": false,
        "energy_extract_mode": "from_mel",
        "extract_duration": false,
        "extract_amplitude_phase": false,
        "mel_min_max_norm": false,
        // linguistic features
        "extract_phone": false,
        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
        // content features
        "extract_whisper_feature": false,
        "extract_contentvec_feature": false,
        "extract_mert_feature": false,
        "extract_wenet_feature": false,
        // Settings for data preprocessing
        "n_mel": 80,
        "win_size": 480,
        "hop_size": 120,
        "sample_rate": 24000,
        "n_fft": 1024,
        "fmin": 0,
        "fmax": 12000,
        "min_level_db": -115,
        "ref_level_db": 20,
        "bits": 8,
        // Directory names of processed data or extracted features
        "processed_dir": "processed_data",
        "trimmed_wav_dir": "trimmed_wavs", // directory name of silence-trimmed wav
        "raw_data": "raw_data",
        "phone_dir": "phones",
        "wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
        "audio_dir": "audios",
        "log_amplitude_dir": "log_amplitudes",
        "phase_dir": "phases",
        "real_dir": "reals",
        "imaginary_dir": "imaginarys",
        "label_dir": "labels",
        "linear_dir": "linears",
        "mel_dir": "mels", // directory name of extracted mel features
        "mcep_dir": "mcep", // directory name of extracted mcep features
        "dur_dir": "durs",
        "symbols_dict": "symbols.dict",
        "lab_dir": "labs", // directory name of extracted label features
        "wenet_dir": "wenet", // directory name of extracted wenet features
        "contentvec_dir": "contentvec", // directory name of extracted contentvec features
        "pitch_dir": "pitches", // directory name of extracted pitch features
        "energy_dir": "energys", // directory name of extracted energy features
        "phone_pitch_dir": "phone_pitches", // directory name of extracted phone-level pitch features
        "phone_energy_dir": "phone_energys", // directory name of extracted phone-level energy features
        "uv_dir": "uvs", // directory name of extracted unvoiced features
        "duration_dir": "duration", // ground-truth duration file
        "phone_seq_file": "phone_seq_file", // phoneme sequence file
        "file_lst": "file.lst",
        "train_file": "train.json", // training set; this json file contains detailed information about the dataset, including dataset name, utterance id, and duration of the utterance
        "valid_file": "valid.json", // validation set
        "spk2id": "spk2id.json", // used for multi-speaker dataset
        "utt2spk": "utt2spk", // used for multi-speaker dataset
        "emo2id": "emo2id.json", // used for multi-emotion dataset
        "utt2emo": "utt2emo", // used for multi-emotion dataset
        // Features used for model training
        "use_text": false,
        "use_phone": false,
        "use_phn_seq": false,
        "use_lab": false,
        "use_linear": false,
        "use_mel": false,
        "use_min_max_norm_mel": false,
        "use_wav": false,
        "use_phone_pitch": false,
        "use_log_scale_pitch": false,
        "use_phone_energy": false,
        "use_phone_duration": false,
        "use_log_scale_energy": false,
        "use_wenet": false,
        "use_dur": false,
        "use_spkid": false, // true: use speaker id for multi-speaker dataset
        "use_emoid": false, // true: use emotion id for multi-emotion dataset
        "use_frame_pitch": false,
        "use_uv": false,
        "use_frame_energy": false,
        "use_frame_duration": false,
        "use_audio": false,
        "use_label": false,
        "use_one_hot": false,
        "use_amplitude_phase": false,
        "data_augment": false,
        "align_mel_duration": false
    },
    "train": {
        "ddp": true,
        "random_seed": 970227,
        "batch_size": 16,
        "max_steps": 1000000,
        // Trackers
        "tracker": [
            "tensorboard"
            // "wandb",
            // "cometml",
            // "mlflow",
        ],
        "max_epoch": -1, // -1 means no limit
        "save_checkpoint_stride": [5, 20], // unit is epoch
        "keep_last": [3, -1], // -1 means infinite; a single number will broadcast
        "run_eval": [false, true], // a single number will broadcast
        // Fix the random seed
        "random_seed": 10086,
        // Optimizer
        "optimizer": "AdamW",
        "adamw": {
            "lr": 4.0e-4 // nn model lr
        },
        // LR Scheduler
        "scheduler": "ReduceLROnPlateau",
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 10, // unit is epoch
            "min_lr": 1.0e-4
        },
        // Batchsampler
        "sampler": {
            "holistic_shuffle": true,
            "drop_last": true
        },
        // Dataloader
        "dataloader": {
            "num_worker": 32,
            "pin_memory": true
        },
        "gradient_accumulation_step": 1,
        "total_training_steps": 50000,
        "save_summary_steps": 500,
        "save_checkpoints_steps": 10000,
        "valid_interval": 10000,
        "keep_checkpoint_max": 5,
        "multi_speaker_training": false, // true: train a multi-speaker model; false: train a single-speaker model
        "max_epoch": -1, // -1 means no limit
        "save_checkpoint_stride": [5, 20], // unit is epoch
        "keep_last": [3, -1], // -1 means infinite
        "run_eval": [false, true],
        // Batchsampler
        "sampler": {
            "holistic_shuffle": true,
            "drop_last": true
        },
        // Dataloader
        "dataloader": {
            "num_worker": 32,
            "pin_memory": true
        },
        // Trackers
        "tracker": [
            "tensorboard"
            // "wandb",
            // "cometml",
            // "mlflow",
        ],
    },
}
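Every other config in this commit points back here through "base_config", and the files carry // comments and trailing commas, so they are not strict JSON. A minimal sketch of how such configs are typically resolved (JSON5 parsing plus a recursive child-wins merge); Amphion's real loader may differ in details:

import json5  # pip install json5 -- tolerates // comments and trailing commas

def load_config(path):
    # Parse the file, then recursively merge it onto its "base_config".
    with open(path) as f:
        cfg = json5.load(f)
    if "base_config" in cfg:
        base = load_config(cfg.pop("base_config"))
        cfg = _merge(base, cfg)
    return cfg

def _merge(base, override):
    # Child values win; nested dicts merge key by key.
    for k, v in override.items():
        if isinstance(v, dict) and isinstance(base.get(k), dict):
            base[k] = _merge(base[k], v)
        else:
            base[k] = v
    return base

# e.g. load_config("config/comosvc.json") resolves through config/base.json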
config/comosvc.json ADDED
{
    "base_config": "config/base.json",
    "model_type": "DiffComoSVC",
    "task_type": "svc",
    "use_custom_dataset": false,
    "preprocess": {
        // data augmentations
        "use_pitch_shift": false,
        "use_formant_shift": false,
        "use_time_stretch": false,
        "use_equalizer": false,
        // acoustic features
        "extract_mel": true,
        "mel_min_max_norm": true,
        "extract_pitch": true,
        "pitch_extractor": "parselmouth",
        "extract_uv": true,
        "extract_energy": true,
        // content features
        "extract_whisper_feature": false,
        "whisper_sample_rate": 16000,
        "extract_contentvec_feature": false,
        "contentvec_sample_rate": 16000,
        "extract_wenet_feature": false,
        "wenet_sample_rate": 16000,
        "extract_mert_feature": false,
        "mert_sample_rate": 16000,
        // Default config for whisper
        "whisper_frameshift": 0.01,
        "whisper_downsample_rate": 2,
        // Default config for content vector
        "contentvec_frameshift": 0.02,
        // Default config for mert
        "mert_model": "m-a-p/MERT-v1-330M",
        "mert_feature_layer": -1,
        "mert_hop_size": 320, // 24k
        "mert_frameshit": 0.01333, // 10ms
        "wenet_frameshift": 0.01,
        "wenet_downsample_rate": 4, // wenetspeech is 4, gigaspeech is 6
        // Default config
        "n_mel": 100,
        "win_size": 1024, // todo
        "hop_size": 256,
        "sample_rate": 24000,
        "n_fft": 1024, // todo
        "fmin": 0,
        "fmax": 12000, // todo
        "f0_min": 50, // ~C2
        "f0_max": 1100, // ~C6(1100), ~G5(800)
        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_label": true,
        "is_mu_law": true,
        "bits": 8,
        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "contentvec_dir": "contentvec",
        "wenet_dir": "wenet",
        "mert_dir": "mert",
        // Extract content features using dataloader
        "pin_memory": true,
        "num_workers": 8,
        "content_feature_batch_size": 16,
        // Features used for model training
        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_uv": true,
        "use_frame_energy": true,
        "use_log_scale_pitch": false,
        "use_log_scale_energy": false,
        "use_spkid": true,
        // Meta file
        "train_file": "train.json",
        "valid_file": "test.json",
        "spk2id": "singers.json",
        "utt2spk": "utt2singer"
    },
    "model": {
        "teacher_model_path": "[Your Teacher Model Path].bin",
        "condition_encoder": {
            "merge_mode": "add",
            "input_melody_dim": 1,
            "use_log_f0": true,
            "n_bins_melody": 256, // quantization (0 for no quantization)
            "output_melody_dim": 384,
            "input_loudness_dim": 1,
            "use_log_loudness": true,
            "n_bins_loudness": 256,
            "output_loudness_dim": 384,
            "use_whisper": false,
            "use_contentvec": false,
            "use_wenet": false,
            "use_mert": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "mert_dim": 256,
            "wenet_dim": 512,
            "content_encoder_dim": 384,
            "output_singer_dim": 384,
            "singer_table_size": 512,
            "output_content_dim": 384,
            "use_spkid": true
        },
        "comosvc": {
            "distill": false,
            // conformer encoder
            "input_dim": 384,
            "output_dim": 100,
            "n_heads": 2,
            "n_layers": 6,
            "filter_channels": 512,
            "dropout": 0.1,
            // karras diffusion
            "P_mean": -1.2,
            "P_std": 1.2,
            "sigma_data": 0.5,
            "sigma_min": 0.002,
            "sigma_max": 80,
            "rho": 7,
            "n_timesteps": 40,
        },
        "diffusion": {
            // Diffusion steps encoder
            "step_encoder": {
                "dim_raw_embedding": 128,
                "dim_hidden_layer": 512,
                "activation": "SiLU",
                "num_layer": 2,
                "max_period": 10000
            },
            // Diffusion decoder
            "model_type": "bidilconv", // bidilconv, unet2d, TODO: unet1d
            "bidilconv": {
                "base_channel": 384,
                "n_res_block": 20,
                "conv_kernel_size": 3,
                "dilation_cycle_length": 4, // specially, 1 means no dilation
                "conditioner_size": 100
            }
        },
    },
    "train": {
        // Basic settings
        "fast_steps": 0,
        "batch_size": 32,
        "gradient_accumulation_step": 1,
        "max_epoch": -1, // -1 means no limit
        "save_checkpoint_stride": [10, 100], // unit is epoch
        "keep_last": [3, -1], // -1 means infinite; a single number will broadcast
        "run_eval": [false, true], // a single number will broadcast
        // Fix the random seed
        "random_seed": 10086,
        // Batchsampler
        "sampler": {
            "holistic_shuffle": true,
            "drop_last": true
        },
        // Dataloader
        "dataloader": {
            "num_worker": 32,
            "pin_memory": true
        },
        // Trackers
        "tracker": [
            "tensorboard"
            // "wandb",
            // "cometml",
            // "mlflow",
        ],
        // Optimizer
        "optimizer": "AdamW",
        "adamw": {
            "lr": 4.0e-4 // nn model lr
        },
        // LR Scheduler
        "scheduler": "ReduceLROnPlateau",
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 10, // unit is epoch
            "min_lr": 1.0e-4
        }
    },
    "inference": {
        "comosvc": {
            "inference_steps": 40
        }
    }
}
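The comosvc block's sigma_min, sigma_max, rho, and n_timesteps are the usual Karras et al. (EDM) noise-schedule parameters. A sketch of the standard schedule those values imply — not necessarily the exact code in modules/diffusion/karras/karras_diffusion.py:

import numpy as np

def karras_sigmas(n=40, sigma_min=0.002, sigma_max=80.0, rho=7.0):
    # EDM schedule: interpolate linearly in sigma^(1/rho) space,
    # then raise back to the rho-th power.
    ramp = np.linspace(0, 1, n)
    return (sigma_max ** (1 / rho)
            + ramp * (sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho

sigmas = karras_sigmas()  # 40 noise levels from 80.0 down to 0.002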
config/diffusion.json ADDED
{
    // FIXME: THESE ARE LEGACY
    "base_config": "config/base.json",
    "model_type": "diffusion",
    "task_type": "svc",
    "use_custom_dataset": false,
    "preprocess": {
        // data augmentations
        "use_pitch_shift": false,
        "use_formant_shift": false,
        "use_time_stretch": false,
        "use_equalizer": false,
        // acoustic features
        "extract_mel": true,
        "mel_min_max_norm": true,
        "extract_pitch": true,
        "pitch_extractor": "parselmouth",
        "extract_uv": true,
        "extract_energy": true,
        // content features
        "extract_whisper_feature": false,
        "whisper_sample_rate": 16000,
        "extract_contentvec_feature": false,
        "contentvec_sample_rate": 16000,
        "extract_wenet_feature": false,
        "wenet_sample_rate": 16000,
        "extract_mert_feature": false,
        "mert_sample_rate": 16000,
        // Default config for whisper
        "whisper_frameshift": 0.01,
        "whisper_downsample_rate": 2,
        // Default config for content vector
        "contentvec_frameshift": 0.02,
        // Default config for mert
        "mert_model": "m-a-p/MERT-v1-330M",
        "mert_feature_layer": -1,
        "mert_hop_size": 320, // 24k
        "mert_frameshit": 0.01333, // 10ms
        "wenet_frameshift": 0.01,
        "wenet_downsample_rate": 4, // wenetspeech is 4, gigaspeech is 6
        // Default config
        "n_mel": 100,
        "win_size": 1024, // todo
        "hop_size": 256,
        "sample_rate": 24000,
        "n_fft": 1024, // todo
        "fmin": 0,
        "fmax": 12000, // todo
        "f0_min": 50, // ~C2
        "f0_max": 1100, // ~C6(1100), ~G5(800)
        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_label": true,
        "is_mu_law": true,
        "bits": 8,
        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "contentvec_dir": "contentvec",
        "wenet_dir": "wenet",
        "mert_dir": "mert",
        // Extract content features using dataloader
        "pin_memory": true,
        "num_workers": 8,
        "content_feature_batch_size": 16,
        // Features used for model training
        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_uv": true,
        "use_frame_energy": true,
        "use_log_scale_pitch": false,
        "use_log_scale_energy": false,
        "use_spkid": true,
        // Meta file
        "train_file": "train.json",
        "valid_file": "test.json",
        "spk2id": "singers.json",
        "utt2spk": "utt2singer"
    },
    "model": {
        "condition_encoder": {
            "merge_mode": "add",
            "input_melody_dim": 1,
            "use_log_f0": true,
            "n_bins_melody": 256, // quantization (0 for no quantization)
            "output_melody_dim": 384,
            "input_loudness_dim": 1,
            "use_log_loudness": true,
            "n_bins_loudness": 256,
            "output_loudness_dim": 384,
            "use_whisper": false,
            "use_contentvec": false,
            "use_wenet": false,
            "use_mert": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "mert_dim": 256,
            "wenet_dim": 512,
            "content_encoder_dim": 384,
            "output_singer_dim": 384,
            "singer_table_size": 512,
            "output_content_dim": 384,
            "use_spkid": true
        },
        // FIXME: FOLLOWING ARE NEW!!
        "diffusion": {
            "scheduler": "ddpm",
            "scheduler_settings": {
                "num_train_timesteps": 1000,
                "beta_start": 1.0e-4,
                "beta_end": 0.02,
                "beta_schedule": "linear"
            },
            // Diffusion steps encoder
            "step_encoder": {
                "dim_raw_embedding": 128,
                "dim_hidden_layer": 512,
                "activation": "SiLU",
                "num_layer": 2,
                "max_period": 10000
            },
            // Diffusion decoder
            "model_type": "bidilconv", // bidilconv, unet2d, TODO: unet1d
            "bidilconv": {
                "base_channel": 384,
                "n_res_block": 20,
                "conv_kernel_size": 3,
                "dilation_cycle_length": 4, // specially, 1 means no dilation
                "conditioner_size": 384
            },
            "unet2d": {
                "in_channels": 1,
                "out_channels": 1,
                "down_block_types": [
                    "CrossAttnDownBlock2D",
                    "CrossAttnDownBlock2D",
                    "CrossAttnDownBlock2D",
                    "DownBlock2D"
                ],
                "mid_block_type": "UNetMidBlock2DCrossAttn",
                "up_block_types": [
                    "UpBlock2D",
                    "CrossAttnUpBlock2D",
                    "CrossAttnUpBlock2D",
                    "CrossAttnUpBlock2D"
                ],
                "only_cross_attention": false
            }
        }
    },
    // FIXME: FOLLOWING ARE NEW!!
    "train": {
        // Basic settings
        "batch_size": 64,
        "gradient_accumulation_step": 1,
        "max_epoch": -1, // -1 means no limit
        "save_checkpoint_stride": [5, 20], // unit is epoch
        "keep_last": [3, -1], // -1 means infinite; a single number will broadcast
        "run_eval": [false, true], // a single number will broadcast
        // Fix the random seed
        "random_seed": 10086,
        // Batchsampler
        "sampler": {
            "holistic_shuffle": true,
            "drop_last": true
        },
        // Dataloader
        "dataloader": {
            "num_worker": 32,
            "pin_memory": true
        },
        // Trackers
        "tracker": [
            "tensorboard"
            // "wandb",
            // "cometml",
            // "mlflow",
        ],
        // Optimizer
        "optimizer": "AdamW",
        "adamw": {
            "lr": 4.0e-4 // nn model lr
        },
        // LR Scheduler
        "scheduler": "ReduceLROnPlateau",
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 10, // unit is epoch
            "min_lr": 1.0e-4
        }
    },
    "inference": {
        "diffusion": {
            "scheduler": "pndm",
            "scheduler_settings": {
                "num_inference_timesteps": 1000
            }
        }
    }
}
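The scheduler/scheduler_settings pairs map naturally onto diffusers-style scheduler constructors: "ddpm" with linear betas for training, "pndm" for the 1000-step inference pass (this matches the Gradio demo's "Diffusion Inference Steps" default of 1000). A hedged sketch of that mapping, assuming diffusers as the backend:

from diffusers import DDPMScheduler, PNDMScheduler

# Training-time scheduler, from "model.diffusion.scheduler_settings" above.
train_sched = DDPMScheduler(
    num_train_timesteps=1000,
    beta_start=1.0e-4,
    beta_end=0.02,
    beta_schedule="linear",
)

# Inference-time scheduler, from "inference.diffusion" above.
infer_sched = PNDMScheduler(num_train_timesteps=1000)
infer_sched.set_timesteps(num_inference_steps=1000)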
config/fs2.json ADDED
{
    "base_config": "config/tts.json",
    "model_type": "FastSpeech2",
    "task_type": "tts",
    "dataset": ["LJSpeech"],
    "preprocess": {
        // acoustic features
        "extract_audio": true,
        "extract_mel": true,
        "mel_extract_mode": "taco",
        "mel_min_max_norm": false,
        "extract_pitch": true,
        "extract_uv": false,
        "pitch_extractor": "dio",
        "extract_energy": true,
        "energy_extract_mode": "from_tacotron_stft",
        "extract_duration": true,
        "use_phone": true,
        "pitch_norm": true,
        "energy_norm": true,
        "pitch_remove_outlier": true,
        "energy_remove_outlier": true,

        // Default config
        "n_mel": 80,
        "win_size": 1024, // todo
        "hop_size": 256,
        "sample_rate": 22050,
        "n_fft": 1024, // todo
        "fmin": 0,
        "fmax": 8000, // todo
        "raw_data": "raw_data",
        "text_cleaners": ["english_cleaners"],
        "f0_min": 71, // ~C2
        "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_label": true,
        "is_mu_law": true,
        "bits": 8,

        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "content_vector_dir": "content_vector",
        "wenet_dir": "wenet",
        "mert_dir": "mert",
        "spk2id": "spk2id.json",
        "utt2spk": "utt2spk",

        // Features used for model training
        "use_mel": true,
        "use_min_max_norm_mel": false,
        "use_frame_pitch": false,
        "use_frame_energy": false,
        "use_phone_pitch": true,
        "use_phone_energy": true,
        "use_log_scale_pitch": false,
        "use_log_scale_energy": false,
        "use_spkid": false,
        "align_mel_duration": true,
        "text_cleaners": ["english_cleaners"]
    },
    "model": {
        // Settings for transformer
        "transformer": {
            "encoder_layer": 4,
            "encoder_head": 2,
            "encoder_hidden": 256,
            "decoder_layer": 6,
            "decoder_head": 2,
            "decoder_hidden": 256,
            "conv_filter_size": 1024,
            "conv_kernel_size": [9, 1],
            "encoder_dropout": 0.2,
            "decoder_dropout": 0.2
        },

        // Settings for variance_predictor
        "variance_predictor": {
            "filter_size": 256,
            "kernel_size": 3,
            "dropout": 0.5
        },
        "variance_embedding": {
            "pitch_quantization": "linear", // support 'linear' or 'log'; 'log' is allowed only if the pitch values are not normalized during preprocessing
            "energy_quantization": "linear", // support 'linear' or 'log'; 'log' is allowed only if the energy values are not normalized during preprocessing
            "n_bins": 256
        },
        "max_seq_len": 1000
    },
    "train": {
        "batch_size": 16,
        "sort_sample": true,
        "drop_last": true,
        "group_size": 4,
        "grad_clip_thresh": 1.0,
        "dataloader": {
            "num_worker": 8,
            "pin_memory": true
        },
        "lr_scheduler": {
            "num_warmup": 4000
        },
        // LR Scheduler
        "scheduler": "NoamLR",
        // Optimizer
        "optimizer": "Adam",
        "adam": {
            "lr": 0.0625,
            "betas": [0.9, 0.98],
            "eps": 0.000000001,
            "weight_decay": 0.0
        },
    }
}
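variance_embedding controls how FastSpeech2's variance adaptor buckets pitch/energy values into n_bins embedding indices; "linear" spaces the bin edges uniformly, while "log" is only valid when values were not normalized during preprocessing (normalized values can be negative). A sketch of the usual bucketing, where the value range stands in for dataset statistics computed at preprocess time:

import torch
import torch.nn as nn

n_bins = 256
# Placeholder range; the real min/max come from preprocessing statistics.
bins = torch.linspace(50.0, 1100.0, n_bins - 1)   # "linear" quantization
pitch_embedding = nn.Embedding(n_bins, 256)        # 256 = encoder_hidden above

pitch = torch.tensor([220.0, 440.0, 880.0])        # predicted/ground-truth pitch
emb = pitch_embedding(torch.bucketize(pitch, bins))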
config/transformer.json ADDED
{
    "base_config": "config/base.json",
    "model_type": "Transformer",
    "task_type": "svc",
    "use_custom_dataset": false,
    "preprocess": {
        // data augmentations
        "use_pitch_shift": false,
        "use_formant_shift": false,
        "use_time_stretch": false,
        "use_equalizer": false,
        // acoustic features
        "extract_mel": true,
        "mel_min_max_norm": true,
        "extract_pitch": true,
        "pitch_extractor": "parselmouth",
        "extract_uv": true,
        "extract_energy": true,
        // content features
        "extract_whisper_feature": false,
        "whisper_sample_rate": 16000,
        "extract_contentvec_feature": false,
        "contentvec_sample_rate": 16000,
        "extract_wenet_feature": false,
        "wenet_sample_rate": 16000,
        "extract_mert_feature": false,
        "mert_sample_rate": 16000,
        // Default config for whisper
        "whisper_frameshift": 0.01,
        "whisper_downsample_rate": 2,
        // Default config for content vector
        "contentvec_frameshift": 0.02,
        // Default config for mert
        "mert_model": "m-a-p/MERT-v1-330M",
        "mert_feature_layer": -1,
        "mert_hop_size": 320, // 24k
        "mert_frameshit": 0.01333, // 10ms
        "wenet_frameshift": 0.01,
        "wenet_downsample_rate": 4, // wenetspeech is 4, gigaspeech is 6
        // Default config
        "n_mel": 100,
        "win_size": 1024, // todo
        "hop_size": 256,
        "sample_rate": 24000,
        "n_fft": 1024, // todo
        "fmin": 0,
        "fmax": 12000, // todo
        "f0_min": 50, // ~C2
        "f0_max": 1100, // ~C6(1100), ~G5(800)
        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_label": true,
        "is_mu_law": true,
        "bits": 8,
        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "contentvec_dir": "contentvec",
        "wenet_dir": "wenet",
        "mert_dir": "mert",
        // Extract content features using dataloader
        "pin_memory": true,
        "num_workers": 8,
        "content_feature_batch_size": 16,
        // Features used for model training
        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_uv": true,
        "use_frame_energy": true,
        "use_log_scale_pitch": false,
        "use_log_scale_energy": false,
        "use_spkid": true,
        // Meta file
        "train_file": "train.json",
        "valid_file": "test.json",
        "spk2id": "singers.json",
        "utt2spk": "utt2singer"
    },
    "model": {
        "condition_encoder": {
            "merge_mode": "add",
            "input_melody_dim": 1,
            "use_log_f0": true,
            "n_bins_melody": 256, // quantization (0 for no quantization)
            "output_melody_dim": 384,
            "input_loudness_dim": 1,
            "use_log_loudness": true,
            "n_bins_loudness": 256,
            "output_loudness_dim": 384,
            "use_whisper": false,
            "use_contentvec": true,
            "use_wenet": false,
            "use_mert": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "mert_dim": 256,
            "wenet_dim": 512,
            "content_encoder_dim": 384,
            "output_singer_dim": 384,
            "singer_table_size": 512,
            "output_content_dim": 384,
            "use_spkid": true
        },
        "transformer": {
            "type": "conformer", // 'conformer' or 'transformer'
            "input_dim": 384,
            "output_dim": 100,
            "n_heads": 2,
            "n_layers": 6,
            "filter_channels": 512,
            "dropout": 0.1,
        }
    },
    "train": {
        // Basic settings
        "batch_size": 64,
        "gradient_accumulation_step": 1,
        "max_epoch": -1, // -1 means no limit
        "save_checkpoint_stride": [10, 100], // unit is epoch
        "keep_last": [3, -1], // -1 means infinite; a single number will broadcast
        "run_eval": [false, true], // a single number will broadcast
        // Fix the random seed
        "random_seed": 10086,
        // Batchsampler
        "sampler": {
            "holistic_shuffle": true,
            "drop_last": true
        },
        // Dataloader
        "dataloader": {
            "num_worker": 32,
            "pin_memory": true
        },
        // Trackers
        "tracker": [
            "tensorboard"
            // "wandb",
            // "cometml",
            // "mlflow",
        ],
        // Optimizer
        "optimizer": "AdamW",
        "adamw": {
            "lr": 4.0e-4 // nn model lr
        },
        // LR Scheduler
        "scheduler": "ReduceLROnPlateau",
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 10, // unit is epoch
            "min_lr": 1.0e-4
        }
    }
}
config/tts.json ADDED
{
    "base_config": "config/base.json",
    "supported_model_type": [
        "Fastspeech2",
        "VITS",
        "VALLE",
    ],
    "task_type": "tts",
    "preprocess": {
        "language": "en-us",
        // linguistic features
        "extract_phone": true,
        "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
        // Directory names of processed data or extracted features
        "phone_dir": "phones",
        "use_phone": true,
    },
    "model": {
        "text_token_num": 512,
    }
}
config/valle.json ADDED
{
    "base_config": "config/tts.json",
    "model_type": "VALLE",
    "task_type": "tts",
    "dataset": [
        "libritts"
    ],
    "preprocess": {
        "extract_phone": true,
        "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals, or lexicon
        "extract_acoustic_token": true,
        "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac (todo)
        "acoustic_token_dir": "acoutic_tokens",
        "use_text": false,
        "use_phone": true,
        "use_acoustic_token": true,
        "symbols_dict": "symbols.dict",
        "min_duration": 0.5, // duration lower bound; filter out audio shorter than min_duration
        "max_duration": 14, // duration upper bound; filter out audio longer than max_duration
        "sampling_rate": 24000,
    },
    "model": {
        "text_token_num": 512,
        "audio_token_num": 1024,
        "decoder_dim": 1024, // embedding dimension of the decoder model
        "nhead": 16, // number of attention heads in the decoder layers
        "num_decoder_layers": 12, // number of decoder layers
        "norm_first": true, // pre or post normalization
        "add_prenet": false, // whether to add a PreNet after the inputs
        "prefix_mode": 0, // how to prefix the VALL-E NAR decoder -- 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
        "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
        "nar_scale_factor": 1, // model scale factor, assigned different meanings in different models
        "prepend_bos": false, // whether to prepend <BOS> to the acoustic tokens fed to the AR decoder
        "num_quantizers": 8, // number of audio quantization layers
        // "scaling_xformers": false, // apply reworked Conformer scaling on Transformers
    },
    "train": {
        "ddp": false,
        "train_stage": 1, // 0: train all modules; for VALL-E, 1: AR decoder, 2: NAR decoder(s)
        "max_epoch": 20,
        "optimizer": "ScaledAdam",
        "scheduler": "Eden",
        "warmup_steps": 200, // number of steps that affects how rapidly the learning rate decreases
        "base_lr": 0.05, // base learning rate
        "valid_interval": 1000,
        "log_epoch_step": 1000,
        "save_checkpoint_stride": [1, 1]
    }
}
config/vits.json
ADDED
|
@@ -0,0 +1,101 @@
{
    "base_config": "config/tts.json",
    "model_type": "VITS",
    "task_type": "tts",
    "preprocess": {
        "extract_phone": true,
        "extract_mel": true,
        "n_mel": 80,
        "fmin": 0,
        "fmax": null,
        "extract_linear_spec": true,
        "extract_audio": true,
        "use_linear": true,
        "use_mel": true,
        "use_audio": true,
        "use_text": false,
        "use_phone": true,
        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
        "n_fft": 1024,
        "win_size": 1024,
        "hop_size": 256,
        "segment_size": 8192,
        "text_cleaners": ["english_cleaners"]
    },
    "model": {
        "text_token_num": 512,
        "inter_channels": 192,
        "hidden_channels": 192,
        "filter_channels": 768,
        "n_heads": 2,
        "n_layers": 6,
        "kernel_size": 3,
        "p_dropout": 0.1,
        "resblock": "1",
        "resblock_kernel_sizes": [3, 7, 11],
        "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        "upsample_rates": [8, 8, 2, 2],
        "upsample_initial_channel": 512,
        "upsample_kernel_sizes": [16, 16, 4, 4],
        "n_layers_q": 3,
        "use_spectral_norm": false,
        "n_speakers": 0, // number of speakers; set automatically when n_speakers is 0 and multi_speaker_training is true
        "gin_channels": 256,
        "use_sdp": true
    },
    "train": {
        "fp16_run": true,
        "learning_rate": 2e-4,
        "betas": [0.8, 0.99],
        "eps": 1e-9,
        "batch_size": 16,
        "lr_decay": 0.999875,
        // "segment_size": 8192,
        "init_lr_ratio": 1,
        "warmup_epochs": 0,
        "c_mel": 45,
        "c_kl": 1.0,
        "AdamW": {
            "betas": [0.8, 0.99],
            "eps": 1e-9
        }
    }
}
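One internal consistency worth noting: VITS trains its waveform decoder on random audio slices, and with `hop_size` 256 the `segment_size` of 8192 samples corresponds to exactly 32 spectrogram frames.

```python
# Sanity check: the training segment must span a whole number of frames.
segment_size, hop_size = 8192, 256
assert segment_size % hop_size == 0
print(segment_size // hop_size)  # 32 mel/linear frames per training segment
```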
config/vocoder.json
ADDED
@@ -0,0 +1,84 @@
{
    "base_config": "config/base.json",
    "dataset": [
        "LJSpeech",
        "LibriTTS",
        "opencpop",
        "m4singer",
        "svcc",
        "svcceval",
        "pjs",
        "opensinger",
        "popbutfy",
        "nus48e",
        "popcs",
        "kising",
        "csd",
        "opera",
        "vctk",
        "lijian",
        "cdmusiceval"
    ],
    "task_type": "vocoder",
    "preprocess": {
        // Acoustic features
        "extract_mel": true,
        "extract_pitch": false,
        "extract_uv": false,
        "extract_audio": true,
        "extract_label": false,
        "extract_one_hot": false,
        "extract_amplitude_phase": false,
        "pitch_extractor": "parselmouth",
        // Settings for data preprocessing
        "n_mel": 100,
        "win_size": 1024,
        "hop_size": 256,
        "sample_rate": 24000,
        "n_fft": 1024,
        "fmin": 0,
        "fmax": 12000,
        "f0_min": 50,
        "f0_max": 1100,
        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_mu_law": false,
        "bits": 8,
        "cut_mel_frame": 32,
        // Directory names of processed data or extracted features
        "spk2id": "singers.json",
        // Features used for model training
        "use_mel": true,
        "use_frame_pitch": false,
        "use_uv": false,
        "use_audio": true,
        "use_label": false,
        "use_one_hot": false,
        "train_file": "train.json",
        "valid_file": "test.json"
    },
    "train": {
        "random_seed": 114514,
        "batch_size": 64,
        "gradient_accumulation_step": 1,
        "max_epoch": 1000000,
        "save_checkpoint_stride": [20],
        "run_eval": [true],
        "sampler": {
            "holistic_shuffle": true,
            "drop_last": true
        },
        "dataloader": {
            "num_worker": 4,
            "pin_memory": true
        },
        "tracker": ["tensorboard"]
    }
}
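`config/vocoder.json` itself inherits from `config/base.json` via `base_config`, and the recipe configs further below extend it in turn. A minimal sketch of how such a chain could be resolved, under two assumptions: the commented JSON parses with the third-party `json5` package, and child keys override base keys by deep merge. Amphion's actual loader may differ.

```python
import json5  # pip install json5; tolerates the // comments in these configs
from pathlib import Path

def deep_update(dst: dict, src: dict) -> None:
    """Recursively overlay src onto dst; child values win."""
    for key, value in src.items():
        if isinstance(value, dict) and isinstance(dst.get(key), dict):
            deep_update(dst[key], value)
        else:
            dst[key] = value

def load_config(path: str) -> dict:
    """Resolve a config and its "base_config" ancestors into one dict (sketch)."""
    cfg = json5.loads(Path(path).read_text())
    base_path = cfg.pop("base_config", None)
    if base_path is None:
        return cfg
    merged = load_config(base_path)
    deep_update(merged, cfg)
    return merged
```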
egs/vocoder/README.md
ADDED
|
@@ -0,0 +1,23 @@
# Amphion Vocoder Recipe

## Quick Start

We provide a [**beginner recipe**](gan/tfr_enhanced_hifigan/README.md) to demonstrate how to train a high-quality HiFi-GAN speech vocoder. Notably, it is also an official implementation of our paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". Some demos can be found [here](https://vocodexelysium.github.io/MS-SB-CQTD/).

## Supported Models

A neural vocoder generates audible waveforms from acoustic representations and is one of the key components of current audio generation systems. So far, Amphion supports a range of widely used vocoders, grouped by type:

- **GAN-based vocoders**, for which we provide [**a unified recipe**](gan/README.md):
  - [MelGAN](https://arxiv.org/abs/1910.06711)
  - [HiFi-GAN](https://arxiv.org/abs/2010.05646)
  - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
  - [BigVGAN](https://arxiv.org/abs/2206.04658)
  - [APNet](https://arxiv.org/abs/2305.07952)
- **Flow-based vocoders** (👨‍💻 developing):
  - [WaveGlow](https://arxiv.org/abs/1811.00002)
- **Diffusion-based vocoders** (👨‍💻 developing):
  - [DiffWave](https://arxiv.org/abs/2009.09761)
- **Auto-regressive vocoders** (👨‍💻 developing):
  - [WaveNet](https://arxiv.org/abs/1609.03499)
  - [WaveRNN](https://arxiv.org/abs/1802.08435v1)
egs/vocoder/diffusion/README.md
ADDED
File without changes
egs/vocoder/diffusion/exp_config_base.json
ADDED
File without changes
egs/vocoder/gan/README.md
ADDED
|
@@ -0,0 +1,224 @@
# Amphion GAN-based Vocoder Recipe

## Supported Model Architectures

A GAN-based vocoder consists of a generator and multiple discriminators, as illustrated below:

<br>
<div align="center">
<img src="../../../imgs/vocoder/gan/pipeline.png" width="40%">
</div>
<br>

So far, the Amphion GAN-based vocoder supports the following generators and discriminators:

- **Generators**
  - [MelGAN](https://arxiv.org/abs/1910.06711)
  - [HiFi-GAN](https://arxiv.org/abs/2010.05646)
  - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
  - [BigVGAN](https://arxiv.org/abs/2206.04658)
  - [APNet](https://arxiv.org/abs/2305.07952)
- **Discriminators**
  - [Multi-Scale Discriminator](https://arxiv.org/abs/2010.05646)
  - [Multi-Period Discriminator](https://arxiv.org/abs/2010.05646)
  - [Multi-Resolution Discriminator](https://arxiv.org/abs/2011.09631)
  - [Multi-Scale Short-Time Fourier Transform Discriminator](https://arxiv.org/abs/2210.13438)
  - [**Multi-Scale Constant-Q Transform Discriminator (ours)**](https://arxiv.org/abs/2311.14957)

You can pair any vocoder architecture with any dataset you want. There are four steps in total:

1. Data preparation
2. Feature extraction
3. Training
4. Inference

> **NOTE:** You need to run every command of this recipe from the `Amphion` root path:
> ```bash
> cd Amphion
> ```

## 1. Data Preparation

You can train the vocoder on any dataset. Amphion's supported open-source datasets are detailed [here](../../../datasets/README.md).

### Configuration

Specify the dataset paths in `exp_config_base.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
"dataset": [
    "csd",
    "kising",
    "m4singer",
    "nus48e",
    "opencpop",
    "opensinger",
    "opera",
    "pjs",
    "popbutfy",
    "popcs",
    "ljspeech",
    "vctk",
    "libritts",
],
"dataset_path": {
    // TODO: Fill in your dataset path
    "csd": "[dataset path]",
    "kising": "[dataset path]",
    "m4singer": "[dataset path]",
    "nus48e": "[dataset path]",
    "opencpop": "[dataset path]",
    "opensinger": "[dataset path]",
    "opera": "[dataset path]",
    "pjs": "[dataset path]",
    "popbutfy": "[dataset path]",
    "popcs": "[dataset path]",
    "ljspeech": "[dataset path]",
    "vctk": "[dataset path]",
    "libritts": "[dataset path]",
},
```

## 2. Feature Extraction

The needed features are specified in each vocoder's own directory, so no modification is required here.

### Configuration

Specify the dataset path and the output path for saving the processed data and the trained model in `exp_config_base.json`:

```json
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
"log_dir": "ckpts/vocoder",
"preprocess": {
    // TODO: Fill in the output data path. The default value is "Amphion/data"
    "processed_dir": "data",
    ...
},
```

### Run

Run `run.sh` as the preprocess stage (set `--stage 1`):

```bash
sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 1
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g. by specifying `--gpu "1"`.

## 3. Training

### Configuration

We provide default hyperparameters in `exp_config_base.json`. They work on a single 24 GB NVIDIA GPU; adjust them for your own machines.

```json
"train": {
    "batch_size": 16,
    "max_epoch": 1000000,
    "save_checkpoint_stride": [20],
    "adamw": {
        "lr": 2.0e-4,
        "adam_b1": 0.8,
        "adam_b2": 0.99
    },
    "exponential_lr": {
        "lr_decay": 0.999
    },
}
```

You can also choose any number of preferred discriminators for training in `exp_config_base.json`:

```json
"discriminators": [
    "msd",
    "mpd",
    "msstftd",
    "mssbcqtd",
],
```

### Run

Run `run.sh` as the training stage (set `--stage 2`) and specify an experiment name. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.

```bash
sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g. by specifying `--gpu "0,1,2,3"`.


## 4. Inference

### Run

Run `run.sh` as the inference stage (set `--stage 3`). We provide three inference modes: `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`.

```bash
sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
    --infer_mode [Your chosen inference mode] \
    --infer_datasets [Datasets you want to infer from; needed for infer_from_dataset] \
    --infer_feature_dir [Path to your predicted acoustic features; needed for infer_from_feature] \
    --infer_audio_dir [Path to your audio files; needed for infer_from_audio] \
    --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
    --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result
```

#### a. Inference from Dataset

Run `run.sh` with the specified datasets. Here is an example:

```bash
sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
    --infer_mode infer_from_dataset \
    --infer_datasets "libritts vctk ljspeech" \
    --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
    --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result
```

#### b. Inference from Features

If you want to run inference from your generated acoustic features, first arrange them into the following structure (a sketch for producing such mel files follows this README):

```plaintext
 ┣ {infer_feature_dir}
 ┃ ┣ mels
 ┃ ┃ ┣ sample1.npy
 ┃ ┃ ┣ sample2.npy
 ┃ ┣ f0s (required if you use NSF-HiFiGAN)
 ┃ ┃ ┣ sample1.npy
 ┃ ┃ ┣ sample2.npy
```

Then run `run.sh` with the specified folder path. Here is an example:

```bash
sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
    --infer_mode infer_from_feature \
    --infer_feature_dir [Your path to your predicted acoustic features] \
    --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
    --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result
```

#### c. Inference from Audios

If you want to run inference from audio for a quick analysis-synthesis check, first arrange your audio files into the following structure:

```plaintext
 ┣ audios
 ┃ ┣ sample1.wav
 ┃ ┣ sample2.wav
```

Then run `run.sh` with the specified folder path. Here is an example:

```bash
sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
    --infer_mode infer_from_audio \
    --infer_audio_dir [Your path to your audio files] \
    --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
    --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result
```
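For `infer_from_feature`, the predicted mels must match the settings the vocoder was trained with. Below is a minimal sketch for extracting such mels with `librosa`, assuming the settings in `exp_config_base.json`/`config/vocoder.json` (sr 24000, n_fft 1024, win 1024, hop 256, 100 mel bins, fmin 0, fmax 12000). The log/normalization step here is an assumption and may differ from Amphion's internal extractor, and NSF-HiFiGAN would additionally need per-frame F0 `.npy` files.

```python
import numpy as np
import librosa

# Mel settings assumed from config/vocoder.json; the log scaling below is an
# assumption and may not match Amphion's own feature extractor exactly.
y, _ = librosa.load("audios/sample1.wav", sr=24000)
mel = librosa.feature.melspectrogram(
    y=y, sr=24000, n_fft=1024, win_length=1024, hop_length=256,
    n_mels=100, fmin=0, fmax=12000,
)
log_mel = np.log(np.clip(mel, 1e-5, None))
np.save("mels/sample1.npy", log_mel)
```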
egs/vocoder/gan/_template/run.sh
ADDED
@@ -0,0 +1,143 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
eval set -- "$options"

while true; do
  case $1 in
    # Experimental Configuration File
    -c | --config) shift; exp_config=$1 ; shift ;;
    # Experimental Name
    -n | --name) shift; exp_name=$1 ; shift ;;
    # Running Stage
    -s | --stage) shift; running_stage=$1 ; shift ;;
    # Visible GPU machines. The default value is "0".
    --gpu) shift; gpu=$1 ; shift ;;

    # [Only for Training] Resume configuration
    --resume) shift; resume=$1 ; shift ;;
    # [Only for Training] The specific checkpoint path that you want to resume from.
    --checkpoint) shift; checkpoint=$1 ; shift ;;
    # [Only for Training] `resume` for loading everything (including model weights, optimizer, scheduler, and random states); `finetune` for loading only the model weights.
    --resume_type) shift; resume_type=$1 ; shift ;;

    # [Only for Inference] The inference mode
    --infer_mode) shift; infer_mode=$1 ; shift ;;
    # [Only for Inference] The datasets to run inference on
    --infer_datasets) shift; infer_datasets=$1 ; shift ;;
    # [Only for Inference] The feature dir for inference
    --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
    # [Only for Inference] The audio dir for inference
    --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
    # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;

    --) shift ; break ;;
    *) echo "Invalid option: $1"; exit 1 ;;
  esac
done


### Value check ###
if [ -z "$running_stage" ]; then
    echo "[Error] Please specify the running stage"
    exit 1
fi

if [ -z "$exp_config" ]; then
    exp_config="${exp_dir}"/exp_config.json
fi
echo "Experimental Configuration File: $exp_config"

if [ -z "$gpu" ]; then
    gpu="0"
fi

######## Features Extraction ###########
if [ $running_stage -eq 1 ]; then
    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
        --config $exp_config \
        --num_workers 8
fi

######## Training ###########
if [ $running_stage -eq 2 ]; then
    if [ -z "$exp_name" ]; then
        echo "[Error] Please specify the experiment name"
        exit 1
    fi
    echo "Experimental Name: $exp_name"

    if [ "$resume" = true ]; then
        echo "Automatically resuming from the experimental dir..."
        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume
    else
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --checkpoint "$checkpoint" \
            --resume_type "$resume_type"
    fi
fi

######## Inference/Conversion ###########
if [ $running_stage -eq 3 ]; then
    if [ -z "$infer_expt_dir" ]; then
        echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
        exit 1
    fi

    if [ -z "$infer_output_dir" ]; then
        infer_output_dir="$infer_expt_dir/result"
    fi

    if [ $infer_mode = "infer_from_dataset" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --infer_datasets $infer_datasets \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

    if [ $infer_mode = "infer_from_feature" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --feature_folder $infer_feature_dir \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

    if [ $infer_mode = "infer_from_audio" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --audio_folder $infer_audio_dir \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

fi
egs/vocoder/gan/apnet/exp_config.json
ADDED
@@ -0,0 +1,45 @@
{
    "base_config": "egs/vocoder/gan/exp_config_base.json",
    "preprocess": {
        // Acoustic features
        "extract_mel": true,
        "extract_audio": true,
        "extract_amplitude_phase": true,

        // Features used for model training
        "use_mel": true,
        "use_audio": true,
        "use_amplitude_phase": true
    },
    "model": {
        "generator": "apnet",
        "apnet": {
            "ASP_channel": 512,
            "ASP_resblock_kernel_sizes": [3, 7, 11],
            "ASP_resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
            "ASP_input_conv_kernel_size": 7,
            "ASP_output_conv_kernel_size": 7,

            "PSP_channel": 512,
            "PSP_resblock_kernel_sizes": [3, 7, 11],
            "PSP_resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
            "PSP_input_conv_kernel_size": 7,
            "PSP_output_R_conv_kernel_size": 7,
            "PSP_output_I_conv_kernel_size": 7
        }
    },
    "train": {
        "criterions": [
            "feature",
            "discriminator",
            "generator",
            "mel",
            "phase",
            "amplitude",
            "consistency"
        ]
    },
    "inference": {
        "batch_size": 1
    }
}
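The `ASP_*` and `PSP_*` groups above configure APNet's two parallel branches, an amplitude spectrum predictor and a phase spectrum predictor, which is why the training criterions add `amplitude`, `phase`, and `consistency` terms on top of the usual GAN losses.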
egs/vocoder/gan/apnet/run.sh
ADDED
@@ -0,0 +1,143 @@
(Contents identical to egs/vocoder/gan/_template/run.sh above.)
egs/vocoder/gan/bigvgan/exp_config.json
ADDED
@@ -0,0 +1,66 @@
{
    "base_config": "egs/vocoder/gan/exp_config_base.json",
    "preprocess": {
        // Acoustic features
        "extract_mel": true,
        "extract_audio": true,

        // Features used for model training
        "use_mel": true,
        "use_audio": true
    },
    "model": {
        "generator": "bigvgan",
        "bigvgan": {
            "resblock": "1",
            "activation": "snakebeta",
            "snake_logscale": true,
            "upsample_rates": [8, 8, 2, 2],
            "upsample_kernel_sizes": [16, 16, 4, 4],
            "upsample_initial_channel": 512,
            "resblock_kernel_sizes": [3, 7, 11],
            "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
        }
    },
    "train": {
        "criterions": [
            "feature",
            "discriminator",
            "generator",
            "mel"
        ]
    },
    "inference": {
        "batch_size": 1
    }
}
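A useful cross-check between the generator and preprocessing configs: the product of `upsample_rates` must equal the preprocessing `hop_size` (256 here), since the generator upsamples one mel frame into one hop of waveform samples. A quick sketch over the rates used in these recipes:

```python
import math

# Each generator's upsample_rates must multiply out to hop_size = 256.
hop_size = 256
for name, rates in {
    "hifigan": [8, 8, 4],
    "bigvgan": [8, 8, 2, 2],
    "bigvgan_large": [4, 4, 2, 2, 2, 2],
}.items():
    assert math.prod(rates) == hop_size, name
```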
egs/vocoder/gan/bigvgan/run.sh
ADDED
|
@@ -0,0 +1,143 @@
@@ -0,0 +1,143 @@
(Contents identical to egs/vocoder/gan/_template/run.sh above.)
egs/vocoder/gan/bigvgan_large/exp_config.json
ADDED
|
@@ -0,0 +1,70 @@
@@ -0,0 +1,70 @@
{
    "base_config": "egs/vocoder/gan/exp_config_base.json",
    "preprocess": {
        // Acoustic features
        "extract_mel": true,
        "extract_audio": true,

        // Features used for model training
        "use_mel": true,
        "use_audio": true
    },
    "model": {
        "generator": "bigvgan",
        "bigvgan": {
            "resblock": "1",
            "activation": "snakebeta",
            "snake_logscale": true,
            "upsample_rates": [4, 4, 2, 2, 2, 2],
            "upsample_kernel_sizes": [8, 8, 4, 4, 4, 4],
            "upsample_initial_channel": 1536,
            "resblock_kernel_sizes": [3, 7, 11],
            "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
        }
    },
    "train": {
        "criterions": [
            "feature",
            "discriminator",
            "generator",
            "mel"
        ]
    },
    "inference": {
        "batch_size": 1
    }
}
egs/vocoder/gan/bigvgan_large/run.sh
ADDED
@@ -0,0 +1,143 @@
(Contents identical to egs/vocoder/gan/_template/run.sh above.)
egs/vocoder/gan/exp_config_base.json
ADDED
|
@@ -0,0 +1,111 @@
{
    "base_config": "config/vocoder.json",
    "model_type": "GANVocoder",
    // TODO: Choose your needed datasets
    "dataset": [
        "csd",
        "kising",
        "m4singer",
        "nus48e",
        "opencpop",
        "opensinger",
        "opera",
        "pjs",
        "popbutfy",
        "popcs",
        "ljspeech",
        "vctk",
        "libritts"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "csd": "[dataset path]",
        "kising": "[dataset path]",
        "m4singer": "[dataset path]",
        "nus48e": "[dataset path]",
        "opencpop": "[dataset path]",
        "opensinger": "[dataset path]",
        "opera": "[dataset path]",
        "pjs": "[dataset path]",
        "popbutfy": "[dataset path]",
        "popcs": "[dataset path]",
        "ljspeech": "[dataset path]",
        "vctk": "[dataset path]",
        "libritts": "[dataset path]"
    },
    // TODO: Fill in the output log path
    "log_dir": "ckpts/vocoder",
    "preprocess": {
        // Acoustic features
        "extract_mel": true,
        "extract_audio": true,
        "extract_pitch": false,
        "extract_uv": false,
        "pitch_extractor": "parselmouth",

        // Features used for model training
        "use_mel": true,
        "use_frame_pitch": false,
        "use_uv": false,
        "use_audio": true,

        // TODO: Fill in the output data path
        "processed_dir": "data/",
        "n_mel": 100,
        "sample_rate": 24000
    },
    "model": {
        // TODO: Choose your needed discriminators
        "discriminators": [
            "msd",
            "mpd",
            "msstftd",
            "mssbcqtd"
        ],
        "mpd": {
            "mpd_reshapes": [2, 3, 5, 7, 11],
            "use_spectral_norm": false,
            "discriminator_channel_mult_factor": 1
        },
        "mrd": {
            "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
            "use_spectral_norm": false,
            "discriminator_channel_mult_factor": 1,
            "mrd_override": false
        },
        "msstftd": {
            "filters": 32
        },
        "mssbcqtd": {
            "hop_lengths": [512, 256, 256],
            "filters": 32,
            "max_filters": 1024,
            "filters_scale": 1,
            "dilations": [1, 2, 4],
            "in_channels": 1,
            "out_channels": 1,
            "n_octaves": [9, 9, 9],
            "bins_per_octaves": [24, 36, 48]
        }
    },
    "train": {
        // TODO: Choose a suitable batch size, training epoch, and save stride
        "batch_size": 32,
        "max_epoch": 1000000,
        "save_checkpoint_stride": [20],
        "adamw": {
            "lr": 2.0e-4,
            "adam_b1": 0.8,
            "adam_b2": 0.99
        },
        "exponential_lr": {
            "lr_decay": 0.999
        }
    }
}
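With a resolver like the `load_config` sketch shown earlier (a hypothetical helper, not Amphion's actual API), the recipe configs below flatten through this base config:

```python
# Hypothetical usage of the load_config sketch from earlier; the chain is
# hifigan/exp_config.json -> egs/vocoder/gan/exp_config_base.json
#   -> config/vocoder.json -> config/base.json
cfg = load_config("egs/vocoder/gan/hifigan/exp_config.json")
print(cfg["model"]["generator"])         # "hifigan"
print(cfg["preprocess"]["sample_rate"])  # 24000, inherited from config/vocoder.json
```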
egs/vocoder/gan/hifigan/exp_config.json
ADDED
|
@@ -0,0 +1,59 @@
{
    "base_config": "egs/vocoder/gan/exp_config_base.json",
    "preprocess": {
        // Acoustic features
        "extract_mel": true,
        "extract_audio": true,

        // Features used for model training
        "use_mel": true,
        "use_audio": true
    },
    "model": {
        "generator": "hifigan",
        "hifigan": {
            "resblock": "2",
            "upsample_rates": [8, 8, 4],
            "upsample_kernel_sizes": [16, 16, 8],
            "upsample_initial_channel": 256,
            "resblock_kernel_sizes": [3, 5, 7],
            "resblock_dilation_sizes": [[1, 2], [2, 6], [3, 12]]
        }
    },
    "train": {
        "criterions": [
            "feature",
            "discriminator",
            "generator",
            "mel"
        ]
    },
    "inference": {
        "batch_size": 1
    }
}
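This generator matches the compact HiFi-GAN V3 configuration from the original paper (the lighter `ResBlock2` with `upsample_initial_channel` 256), trading some fidelity for a much smaller, faster model.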
egs/vocoder/gan/hifigan/run.sh
ADDED
@@ -0,0 +1,143 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
eval set -- "$options"

while true; do
    case $1 in
        # Experimental Configuration File
        -c | --config) shift; exp_config=$1 ; shift ;;
        # Experimental Name
        -n | --name) shift; exp_name=$1 ; shift ;;
        # Running Stage
        -s | --stage) shift; running_stage=$1 ; shift ;;
        # Visible GPU machines. The default value is "0".
        --gpu) shift; gpu=$1 ; shift ;;

        # [Only for Training] Resume configuration
        --resume) shift; resume=$1 ; shift ;;
        # [Only for Training] The specific checkpoint path that you want to resume from.
        --checkpoint) shift; checkpoint=$1 ; shift ;;
        # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
        --resume_type) shift; resume_type=$1 ; shift ;;

        # [Only for Inference] The inference mode
        --infer_mode) shift; infer_mode=$1 ; shift ;;
        # [Only for Inference] The datasets to run inference on
        --infer_datasets) shift; infer_datasets=$1 ; shift ;;
        # [Only for Inference] The feature dir for inference
        --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
        # [Only for Inference] The audio dir for inference
        --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
        # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
        --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
        # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
        --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;

        --) shift ; break ;;
        *) echo "Invalid option: $1"; exit 1 ;;
    esac
done


### Value check ###
if [ -z "$running_stage" ]; then
    echo "[Error] Please specify the running stage"
    exit 1
fi

if [ -z "$exp_config" ]; then
    exp_config="${exp_dir}"/exp_config.json
fi
echo "Experimental Configuration File: $exp_config"

if [ -z "$gpu" ]; then
    gpu="0"
fi

######## Features Extraction ###########
if [ $running_stage -eq 1 ]; then
    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
        --config $exp_config \
        --num_workers 8
fi

######## Training ###########
if [ $running_stage -eq 2 ]; then
    if [ -z "$exp_name" ]; then
        echo "[Error] Please specify the experiment name"
        exit 1
    fi
    echo "Experimental Name: $exp_name"

    if [ "$resume" = true ]; then
        echo "Automatically resume from the experimental dir..."
        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume
    else
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --checkpoint "$checkpoint" \
            --resume_type "$resume_type"
    fi
fi

######## Inference/Conversion ###########
if [ $running_stage -eq 3 ]; then
    if [ -z "$infer_expt_dir" ]; then
        echo "[Error] Please specify the experiment directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
        exit 1
    fi

    if [ -z "$infer_output_dir" ]; then
        infer_output_dir="$infer_expt_dir/result"
    fi

    if [ $infer_mode = "infer_from_dataset" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --infer_datasets $infer_datasets \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

    if [ $infer_mode = "infer_from_feature" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --feature_folder $infer_feature_dir \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

    if [ $infer_mode = "infer_from_audio" ]; then
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
            --config $exp_config \
            --infer_mode $infer_mode \
            --audio_folder $infer_audio_dir \
            --vocoder_dir $infer_expt_dir \
            --output_dir $infer_output_dir \
            --log_level debug
    fi

fi
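Mirroring the usage shown in the tfr_enhanced_hifigan README further below, the three stages of this script chain together as follows (`[YourExptName]` and the audio folder are placeholders):

```bash
# Run from the Amphion root path
sh egs/vocoder/gan/hifigan/run.sh --stage 1                         # feature extraction
sh egs/vocoder/gan/hifigan/run.sh --stage 2 --name [YourExptName]   # training
sh egs/vocoder/gan/hifigan/run.sh --stage 3 \
    --infer_mode infer_from_audio \
    --infer_audio_dir [Your path to your audio files] \
    --infer_expt_dir ckpts/vocoder/[YourExptName]                   # inference
```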
egs/vocoder/gan/melgan/exp_config.json
ADDED
@@ -0,0 +1,34 @@
{
    "base_config": "egs/vocoder/gan/exp_config_base.json",
    "preprocess": {
        // acoustic features
        "extract_mel": true,
        "extract_audio": true,

        // Features used for model training
        "use_mel": true,
        "use_audio": true
    },
    "model": {
        "generator": "melgan",
        "melgan": {
            "ratios": [8, 8, 2, 2],
            "ngf": 32,
            "n_residual_layers": 3,
            "num_D": 3,
            "ndf": 16,
            "n_layers": 4,
            "downsampling_factor": 4
        },
    },
    "train": {
        "criterions": [
            "feature",
            "discriminator",
            "generator",
        ]
    },
    "inference": {
        "batch_size": 1,
    }
}
egs/vocoder/gan/melgan/run.sh
ADDED
@@ -0,0 +1,143 @@
(This script is identical, line for line, to egs/vocoder/gan/hifigan/run.sh above; see that listing for the full contents.)
egs/vocoder/gan/nsfhifigan/exp_config.json
ADDED
@@ -0,0 +1,83 @@
{
    "base_config": "egs/vocoder/gan/exp_config_base.json",
    "preprocess": {
        // acoustic features
        "extract_mel": true,
        "extract_audio": true,
        "extract_pitch": true,

        // Features used for model training
        "use_mel": true,
        "use_audio": true,
        "use_frame_pitch": true
    },
    "model": {
        "generator": "nsfhifigan",
        "nsfhifigan": {
            "resblock": "1",
            "harmonic_num": 8,
            "upsample_rates": [8, 4, 2, 2, 2],
            "upsample_kernel_sizes": [16, 8, 4, 4, 4],
            "upsample_initial_channel": 768,
            "resblock_kernel_sizes": [3, 7, 11],
            "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
        },
        "mpd": {
            "mpd_reshapes": [2, 3, 5, 7, 11, 17, 23, 37],
            "use_spectral_norm": false,
            "discriminator_channel_multi": 1
        }
    },
    "train": {
        "criterions": [
            "feature",
            "discriminator",
            "generator",
            "mel",
        ]
    },
    "inference": {
        "batch_size": 1,
    }
}
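`extract_pitch`/`use_frame_pitch` are enabled here because the NSF generator is driven by an explicit harmonic source built from F0, and `harmonic_num: 8` means the fundamental plus eight harmonics. A sketch of the standard NSF-style source signal (illustrative of the idea, not Amphion's exact source module):

```python
import numpy as np

def harmonic_source(f0, sr=24000, harmonic_num=8):
    """f0: sample-level F0 contour in Hz (frame pitch upsampled to sr)."""
    phase = 2 * np.pi * np.cumsum(f0 / sr)             # instantaneous phase
    harmonics = [np.sin((k + 1) * phase) for k in range(harmonic_num + 1)]
    source = np.stack(harmonics).sum(0) / (harmonic_num + 1)
    return np.where(f0 > 0, source, 0.0)               # silence when unvoiced

f0 = np.full(24000, 220.0)                             # one second of A3
excitation = harmonic_source(f0)                       # fed to the generator
```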
egs/vocoder/gan/nsfhifigan/run.sh
ADDED
@@ -0,0 +1,143 @@
(This script is identical, line for line, to egs/vocoder/gan/hifigan/run.sh above; see that listing for the full contents.)
egs/vocoder/gan/tfr_enhanced_hifigan/README.md
ADDED
@@ -0,0 +1,185 @@
# Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder

[](https://arxiv.org/abs/2311.14957)
[](https://vocodexelysium.github.io/MS-SB-CQTD/)

<br>
<div align="center">
<img src="../../../../imgs/vocoder/gan/MSSBCQTD.png" width="80%">
</div>
<br>

This is the official implementation of the paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". In this recipe, we will illustrate how to train a high-quality HiFi-GAN on LibriTTS, VCTK, and LJSpeech by utilizing multiple time-frequency-representation-based discriminators.

There are four stages in total:

1. Data preparation
2. Feature extraction
3. Training
4. Inference

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```

## 1. Data Preparation

### Dataset Download

By default, we utilize three datasets for training: LibriTTS, VCTK, and LJSpeech. How to download them is detailed [here](../../../datasets/README.md).

### Configuration

Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
"dataset": [
    "ljspeech",
    "vctk",
    "libritts",
],
"dataset_path": {
    // TODO: Fill in your dataset path
    "ljspeech": "[LJSpeech dataset path]",
    "vctk": "[VCTK dataset path]",
    "libritts": "[LibriTTS dataset path]",
},
```

## 2. Features Extraction

For HiFi-GAN, only the mel-spectrogram and the output audio are needed for training.

### Configuration

Specify the dataset path and the output path for saving the processed data and the trained model in `exp_config.json`:

```json
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
"log_dir": "ckpts/vocoder",
"preprocess": {
    // TODO: Fill in the output data path. The default value is "Amphion/data"
    "processed_dir": "data",
    ...
},
```

### Run

Run `run.sh` as the preprocessing stage (set `--stage 1`).

```bash
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 1
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "1"`.

## 3. Training

### Configuration

We provide the default hyperparameters in `exp_config.json`. They work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.

```json
"train": {
    "batch_size": 32,
    ...
}
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.

```bash
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "0,1,2,3"`.

## 4. Inference

### Pretrained Vocoder Download

We trained a HiFi-GAN checkpoint with around 685 hours of speech data. The final pretrained checkpoint is released [here](../../../../pretrained/hifigan/README.md).

### Run

Run `run.sh` as the inference stage (set `--stage 3`). We provide three different inference modes: `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`.

```bash
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
    --infer_mode [Your chosen inference mode] \
    --infer_datasets [Datasets you want to infer from, needed for infer_from_dataset] \
    --infer_feature_dir [Your path to your predicted acoustic features, needed for infer_from_feature] \
    --infer_audio_dir [Your path to your audio files, needed for infer_from_audio] \
    --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
    --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result
```

#### a. Inference from Dataset

Run `run.sh` with the specified datasets; here is an example.

```bash
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
    --infer_mode infer_from_dataset \
    --infer_datasets "libritts vctk ljspeech" \
    --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
    --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result
```

#### b. Inference from Features

If you want to infer from your generated acoustic features, you should first organize them into the following structure:

```plaintext
 ┣ {infer_feature_dir}
 ┃ ┣ mels
 ┃ ┃ ┣ sample1.npy
 ┃ ┃ ┣ sample2.npy
```

Then run `run.sh` with the specified folder directory; here is an example.

```bash
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
    --infer_mode infer_from_feature \
    --infer_feature_dir [Your path to your predicted acoustic features] \
    --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
    --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result
```

#### c. Inference from Audios

If you want to infer from audios for quick analysis-synthesis, you should first organize them into the following structure:

```plaintext
 ┣ audios
 ┃ ┣ sample1.wav
 ┃ ┣ sample2.wav
```

Then run `run.sh` with the specified folder directory; here is an example.

```bash
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
    --infer_mode infer_from_audio \
    --infer_audio_dir [Your path to your audio files] \
    --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
    --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result
```

## Citations

```bibtex
@misc{gu2023cqt,
      title={Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder},
      author={Yicheng Gu and Xueyao Zhang and Liumeng Xue and Zhizheng Wu},
      year={2023},
      eprint={2311.14957},
      archivePrefix={arXiv},
      primaryClass={cs.SD}
}
```
egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json
ADDED
@@ -0,0 +1,118 @@
{
    "base_config": "egs/vocoder/gan/exp_config_base.json",
    "model_type": "GANVocoder",
    "dataset": [
        "ljspeech",
        "vctk",
        "libritts",
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "ljspeech": "[dataset path]",
        "vctk": "[dataset path]",
        "libritts": "[dataset path]",
    },
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
    "log_dir": "ckpts/vocoder",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        // acoustic features
        "extract_mel": true,
        "extract_audio": true,
        "extract_pitch": false,
        "extract_uv": false,
        "extract_amplitude_phase": false,
        "pitch_extractor": "parselmouth",
        // Features used for model training
        "use_mel": true,
        "use_frame_pitch": false,
        "use_uv": false,
        "use_audio": true,
        "n_mel": 100,
        "sample_rate": 24000
    },
    "model": {
        "generator": "hifigan",
        "discriminators": [
            "msd",
            "mpd",
            "mssbcqtd",
            "msstftd",
        ],
        "hifigan": {
            "resblock": "1",
            "upsample_rates": [8, 4, 2, 2, 2],
            "upsample_kernel_sizes": [16, 8, 4, 4, 4],
            "upsample_initial_channel": 768,
            "resblock_kernel_sizes": [3, 5, 7],
            "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
        },
        "mpd": {
            "mpd_reshapes": [2, 3, 5, 7, 11, 17, 23, 37],
            "use_spectral_norm": false,
            "discriminator_channel_multi": 1
        }
    },
    "train": {
        "batch_size": 16,
        "adamw": {
            "lr": 2.0e-4,
            "adam_b1": 0.8,
            "adam_b2": 0.99
        },
        "exponential_lr": {
            "lr_decay": 0.999
        },
        "criterions": [
            "feature",
            "discriminator",
            "generator",
            "mel",
        ]
    },
    "inference": {
        "batch_size": 1,
    }
}
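The `discriminators` list is what makes this recipe "TFR-enhanced": the adversarial loss becomes an ensemble over MSD, MPD, MS-STFT, and the paper's MS-SB-CQT discriminator. A sketch of how such per-discriminator terms are typically aggregated (assumed LSGAN-style objective, not necessarily Amphion's exact trainer code):

```python
import torch

def ensemble_d_loss(d_outputs):
    """d_outputs: {name: (scores_real, scores_fake)} per discriminator."""
    total = torch.tensor(0.0)
    for name, (real, fake) in d_outputs.items():
        # LSGAN-style terms, as commonly used with HiFi-GAN discriminators
        total = total + torch.mean((real - 1) ** 2) + torch.mean(fake ** 2)
    return total

scores = {name: (torch.rand(4), torch.rand(4))
          for name in ["msd", "mpd", "mssbcqtd", "msstftd"]}
print(ensemble_d_loss(scores))
```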
egs/vocoder/gan/tfr_enhanced_hifigan/run.sh
ADDED
@@ -0,0 +1,145 @@
(Identical to egs/vocoder/gan/hifigan/run.sh above, except that stage 3 additionally runs `echo $infer_datasets` before the inference branches, which accounts for the two extra lines.)
inference.py
CHANGED
@@ -208,9 +208,9 @@ def build_parser():
     return parser
 
 
-def main():
+def main(args_list=None):
     ### Parse arguments and config
-    args = build_parser().parse_args()
+    args = build_parser().parse_args(args_list)
     cfg = load_config(args.config)
 
     # CUDA settings
@@ -256,3 +256,7 @@ def main():
     else:
         ### Infer from dataset
         infer(args, cfg, infer_type="from_dataset")
+
+
+if __name__ == "__main__":
+    main()
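The new `args_list` parameter lets the Gradio backend invoke inference in-process instead of shelling out, while `parse_args(None)` falls back to `sys.argv`, so the `main()` CLI entry point keeps working. A minimal sketch of a programmatic call (the flag names below are illustrative assumptions; consult `build_parser()` for the real options):

```python
from inference import main

# Hypothetical flags for illustration only
main([
    "--config", "ckpts/svc/vocalist_l1_contentvec+whisper/args.json",
    "--output_dir", "result",
])
```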
modules/__init__.py
ADDED
File without changes
modules/activation_functions/__init__.py
ADDED
@@ -0,0 +1,7 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from .gated_activation_unit import GaU
from .snake import Snake, SnakeBeta
modules/activation_functions/gated_activation_unit.py
ADDED
@@ -0,0 +1,61 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn

from modules.general.utils import Conv1d


class GaU(nn.Module):
    r"""Gated Activation Unit (GaU) proposed in `Gated Activation Units for Neural
    Networks <https://arxiv.org/pdf/1606.05328.pdf>`_.

    Args:
        channels: number of input channels.
        kernel_size: kernel size of the convolution.
        dilation: dilation rate of the convolution.
        d_context: dimension of the context tensor, or None if no context is used.
    """

    def __init__(
        self,
        channels: int,
        kernel_size: int = 3,
        dilation: int = 1,
        d_context: int = None,
    ):
        super().__init__()

        self.context = d_context

        self.conv = Conv1d(
            channels,
            channels * 2,
            kernel_size,
            dilation=dilation,
            padding=dilation * (kernel_size - 1) // 2,
        )

        if self.context:
            self.context_proj = Conv1d(d_context, channels * 2, 1)

    def forward(self, x: torch.Tensor, context: torch.Tensor = None):
        r"""Calculate forward propagation.

        Args:
            x: input tensor with shape [B, C, T].
            context: context tensor with shape [B, ``d_context``, T], defaults to None.
        """

        h = self.conv(x)

        if self.context:
            h = h + self.context_proj(context)

        h1, h2 = h.chunk(2, 1)
        h = torch.tanh(h1) * torch.sigmoid(h2)

        return h
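A quick shape check for `GaU` (assuming the `Conv1d` wrapper from `modules/general/utils.py` behaves like `torch.nn.Conv1d`): the doubled channel count is split into a tanh branch and a sigmoid gate, so the output keeps the input shape.

```python
import torch
from modules.activation_functions import GaU

gau = GaU(channels=64, kernel_size=3, dilation=2, d_context=16)
x = torch.randn(8, 64, 100)     # [B, C, T]
ctx = torch.randn(8, 16, 100)   # [B, d_context, T]
y = gau(x, ctx)
print(y.shape)                  # torch.Size([8, 64, 100])
```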
modules/activation_functions/snake.py
ADDED
@@ -0,0 +1,122 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
from torch import nn, pow, sin
from torch.nn import Parameter


class Snake(nn.Module):
    r"""Implementation of a sine-based periodic activation function.
    Alpha is initialized to 1 by default; higher values mean higher frequency.
    It will be trained along with the rest of your model.

    Args:
        in_features: shape of the input
        alpha: trainable parameter

    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input

    References:
        This activation function is from this paper by Liu Ziyin, Tilman Hartwig,
        Masahito Ueda: https://arxiv.org/abs/2006.08195

    Examples:
        >>> a1 = Snake(256)
        >>> x = torch.randn(1, 256, 20)
        >>> x = a1(x)
    """

    def __init__(
        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
    ):
        super(Snake, self).__init__()
        self.in_features = in_features

        # initialize alpha
        self.alpha_logscale = alpha_logscale
        if self.alpha_logscale:  # log scale alphas initialized to zeros
            self.alpha = Parameter(torch.zeros(in_features) * alpha)
        else:  # linear scale alphas initialized to ones
            self.alpha = Parameter(torch.ones(in_features) * alpha)

        self.alpha.requires_grad = alpha_trainable

        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        r"""Forward pass of the function. Applies the function to the input elementwise.
        Snake := x + 1/a * sin^2(ax)
        """

        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
        if self.alpha_logscale:
            alpha = torch.exp(alpha)
        x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)

        return x


class SnakeBeta(nn.Module):
    r"""A modified Snake function which uses separate parameters for the magnitude
    of the periodic components. Alpha is initialized to 1 by default; higher values
    mean higher frequency. Beta is initialized to 1 by default; higher values mean
    higher magnitude. Both will be trained along with the rest of your model.

    Args:
        in_features: shape of the input
        alpha: trainable parameter that controls frequency
        beta: trainable parameter that controls magnitude

    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input

    References:
        This activation function is a modified version based on this paper by Liu Ziyin,
        Tilman Hartwig, Masahito Ueda: https://arxiv.org/abs/2006.08195

    Examples:
        >>> a1 = SnakeBeta(256)
        >>> x = torch.randn(1, 256, 20)
        >>> x = a1(x)
    """

    def __init__(
        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
    ):
        super(SnakeBeta, self).__init__()
        self.in_features = in_features

        # initialize alpha
        self.alpha_logscale = alpha_logscale
        if self.alpha_logscale:  # log scale alphas initialized to zeros
            self.alpha = Parameter(torch.zeros(in_features) * alpha)
            self.beta = Parameter(torch.zeros(in_features) * alpha)
        else:  # linear scale alphas initialized to ones
            self.alpha = Parameter(torch.ones(in_features) * alpha)
            self.beta = Parameter(torch.ones(in_features) * alpha)

        self.alpha.requires_grad = alpha_trainable
        self.beta.requires_grad = alpha_trainable

        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        r"""Forward pass of the function. Applies the function to the input elementwise.
        SnakeBeta := x + 1/b * sin^2(xa)
        """

        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
        beta = self.beta.unsqueeze(0).unsqueeze(-1)
        if self.alpha_logscale:
            alpha = torch.exp(alpha)
            beta = torch.exp(beta)
        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)

        return x
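Numerically, `Snake(x) = x + sin²(αx)/α` stays close to the identity while injecting a periodic bias, which is why it suits waveform generators better than ReLU. A small sanity check of the formula against the module (a sketch; it assumes the `modules` package is on `PYTHONPATH`):

```python
import torch
from modules.activation_functions import Snake

x = torch.linspace(-3, 3, 7)
alpha = 1.0
manual = x + torch.sin(alpha * x) ** 2 / alpha          # the formula by hand

snake = Snake(in_features=1)                            # one channel, alpha = 1
module = snake(x.view(1, 1, -1)).flatten()              # module expects [B, C, T]
print(torch.allclose(manual, module, atol=1e-6))        # True
```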
modules/anti_aliasing/__init__.py
ADDED
@@ -0,0 +1,8 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from .act import *
from .filter import *
from .resample import *
modules/anti_aliasing/act.py
ADDED
@@ -0,0 +1,35 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch.nn as nn

from .resample import *

# This code is adopted from BigVGAN under the MIT License
# https://github.com/NVIDIA/BigVGAN


class Activation1d(nn.Module):
    def __init__(
        self,
        activation,
        up_ratio: int = 2,
        down_ratio: int = 2,
        up_kernel_size: int = 12,
        down_kernel_size: int = 12,
    ):
        super().__init__()
        self.up_ratio = up_ratio
        self.down_ratio = down_ratio
        self.act = activation
        self.upsample = UpSample1d(up_ratio, up_kernel_size)
        self.downsample = DownSample1d(down_ratio, down_kernel_size)

    # x: [B, C, T]
    def forward(self, x):
        x = self.upsample(x)
        x = self.act(x)
        x = self.downsample(x)

        return x
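`Activation1d` is the BigVGAN recipe for applying a nonlinearity without aliasing: upsample 2x, apply the activation at the higher rate, then low-pass and downsample back. A sketch combining it with `Snake`, the pairing BigVGAN itself uses (shapes assume the resampling layers preserve length, as they do for the default ratios):

```python
import torch
from modules.activation_functions import Snake
from modules.anti_aliasing import Activation1d

act = Activation1d(activation=Snake(in_features=64))
x = torch.randn(2, 64, 256)
y = act(x)             # internally runs at 2x rate, then low-passes back down
print(y.shape)         # torch.Size([2, 64, 256])
```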
modules/anti_aliasing/filter.py
ADDED
@@ -0,0 +1,99 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+if "sinc" in dir(torch):
+    sinc = torch.sinc
+else:
+    # This code is adopted from adefossez's julius.core.sinc under the MIT License
+    # https://adefossez.github.io/julius/julius/core.html
+    def sinc(x: torch.Tensor):
+        """
+        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
+        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
+        """
+        return torch.where(
+            x == 0,
+            torch.tensor(1.0, device=x.device, dtype=x.dtype),
+            torch.sin(math.pi * x) / math.pi / x,
+        )
+
+
+# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
+# https://adefossez.github.io/julius/julius/lowpass.html
+def kaiser_sinc_filter1d(
+    cutoff, half_width, kernel_size
+):  # return filter [1,1,kernel_size]
+    even = kernel_size % 2 == 0
+    half_size = kernel_size // 2
+
+    # For kaiser window
+    delta_f = 4 * half_width
+    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+    if A > 50.0:
+        beta = 0.1102 * (A - 8.7)
+    elif A >= 21.0:
+        beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
+    else:
+        beta = 0.0
+    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
+
+    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
+    if even:
+        time = torch.arange(-half_size, half_size) + 0.5
+    else:
+        time = torch.arange(kernel_size) - half_size
+    if cutoff == 0:
+        filter_ = torch.zeros_like(time)
+    else:
+        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
+        # Normalize filter to have sum = 1, otherwise we will have a small leakage
+        # of the constant component in the input signal.
+        filter_ /= filter_.sum()
+    filter = filter_.view(1, 1, kernel_size)
+
+    return filter
+
+
+class LowPassFilter1d(nn.Module):
+    def __init__(
+        self,
+        cutoff=0.5,
+        half_width=0.6,
+        stride: int = 1,
+        padding: bool = True,
+        padding_mode: str = "replicate",
+        kernel_size: int = 12,
+    ):
+        # kernel_size should be even number for stylegan3 setup,
+        # in this implementation, odd number is also possible.
+        super().__init__()
+        if cutoff < -0.0:
+            raise ValueError("Minimum cutoff must be larger than zero.")
+        if cutoff > 0.5:
+            raise ValueError("A cutoff above 0.5 does not make sense.")
+        self.kernel_size = kernel_size
+        self.even = kernel_size % 2 == 0
+        self.pad_left = kernel_size // 2 - int(self.even)
+        self.pad_right = kernel_size // 2
+        self.stride = stride
+        self.padding = padding
+        self.padding_mode = padding_mode
+        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
+        self.register_buffer("filter", filter)
+
+    # input [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+
+        if self.padding:
+            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
+        out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
+
+        return out
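
For reference, a short sketch (not part of the commit) of how `kaiser_sinc_filter1d` and `LowPassFilter1d` behave. Cutoff and half-width are fractions of the sampling rate, so `cutoff=0.25` with `stride=2` implements a standard anti-aliased 2x decimator.

import torch
from modules.anti_aliasing.filter import LowPassFilter1d, kaiser_sinc_filter1d

taps = kaiser_sinc_filter1d(cutoff=0.25, half_width=0.3, kernel_size=12)
print(taps.shape)            # torch.Size([1, 1, 12]); taps sum to 1

lpf = LowPassFilter1d(cutoff=0.25, half_width=0.3, stride=2, kernel_size=12)
x = torch.randn(2, 1, 1024)  # [B, C, T]
y = lpf(x)                   # [2, 1, 512] -- stride 2 halves T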
modules/anti_aliasing/resample.py
ADDED
@@ -0,0 +1,64 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+#################### Anti-aliasing ####################
+
+import torch.nn as nn
+from torch.nn import functional as F
+
+from .filter import *
+
+# This code is adopted from BigVGAN under the MIT License
+# https://github.com/NVIDIA/BigVGAN
+
+class UpSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = (
+            int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        )
+        self.stride = ratio
+        self.pad = self.kernel_size // ratio - 1
+        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+        self.pad_right = (
+            self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+        )
+        filter = kaiser_sinc_filter1d(
+            cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size
+        )
+        self.register_buffer("filter", filter)
+
+    # x: [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+
+        x = F.pad(x, (self.pad, self.pad), mode="replicate")
+        x = self.ratio * F.conv_transpose1d(
+            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
+        )
+        x = x[..., self.pad_left : -self.pad_right]
+
+        return x
+
+
+class DownSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = (
+            int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        )
+        self.lowpass = LowPassFilter1d(
+            cutoff=0.5 / ratio,
+            half_width=0.6 / ratio,
+            stride=ratio,
+            kernel_size=self.kernel_size,
+        )
+
+    def forward(self, x):
+        xx = self.lowpass(x)
+
+        return xx
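
A round-trip sketch (not part of the commit): `UpSample1d` and `DownSample1d` with the same ratio are shape-inverse, which is what lets `Activation1d` preserve sequence length.

import torch
from modules.anti_aliasing.resample import UpSample1d, DownSample1d

up = UpSample1d(ratio=2)     # kernel_size defaults to int(6 * 2 // 2) * 2 = 12
down = DownSample1d(ratio=2)
x = torch.randn(4, 32, 500)
x_up = up(x)                 # [4, 32, 1000]
x_rt = down(x_up)            # [4, 32, 500] -- original T restored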
modules/base/base_module.py
ADDED
@@ -0,0 +1,75 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        x = x.transpose(1, -1)
+        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)
+
+
+class ConvReluNorm(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels,
+        out_channels,
+        kernel_size,
+        n_layers,
+        p_dropout,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        assert n_layers > 1, "Number of layers should be larger than 0."
+
+        self.conv_layers = nn.ModuleList()
+        self.norm_layers = nn.ModuleList()
+        self.conv_layers.append(
+            nn.Conv1d(
+                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
+            )
+        )
+        self.norm_layers.append(LayerNorm(hidden_channels))
+        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
+        for _ in range(n_layers - 1):
+            self.conv_layers.append(
+                nn.Conv1d(
+                    hidden_channels,
+                    hidden_channels,
+                    kernel_size,
+                    padding=kernel_size // 2,
+                )
+            )
+            self.norm_layers.append(LayerNorm(hidden_channels))
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask):
+        x_org = x
+        for i in range(self.n_layers):
+            x = self.conv_layers[i](x * x_mask)
+            x = self.norm_layers[i](x)
+            x = self.relu_drop(x)
+        x = x_org + self.proj(x)
+        return x * x_mask
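
A usage sketch for `ConvReluNorm` (not part of the commit); channel counts are illustrative. Note the residual `x_org + self.proj(x)` requires `in_channels == out_channels`, and since `proj` is zero-initialized the module starts out as an identity up to masking.

import torch
from modules.base.base_module import ConvReluNorm

net = ConvReluNorm(
    in_channels=192, hidden_channels=192, out_channels=192,
    kernel_size=5, n_layers=3, p_dropout=0.1,
)
x = torch.randn(2, 192, 100)    # [B, C, T]
x_mask = torch.ones(2, 1, 100)  # 1 = valid frame, 0 = padding
y = net(x, x_mask)              # [2, 192, 100]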
modules/diffusion/__init__.py
ADDED
@@ -0,0 +1,7 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .bidilconv.bidilated_conv import BiDilConv
+from .unet.unet import UNet
modules/diffusion/bidilconv/bidilated_conv.py
ADDED
@@ -0,0 +1,102 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch.nn as nn
+
+from modules.general.utils import Conv1d, zero_module
+from .residual_block import ResidualBlock
+
+
+class BiDilConv(nn.Module):
+    r"""Dilated CNN architecture with residual connections, default diffusion decoder.
+
+    Args:
+        input_channel: The number of input channels.
+        base_channel: The number of base channels.
+        n_res_block: The number of residual blocks.
+        conv_kernel_size: The kernel size of convolutional layers.
+        dilation_cycle_length: The cycle length of dilation.
+        conditioner_size: The size of conditioner.
+    """
+
+    def __init__(
+        self,
+        input_channel,
+        base_channel,
+        n_res_block,
+        conv_kernel_size,
+        dilation_cycle_length,
+        conditioner_size,
+        output_channel: int = -1,
+    ):
+        super().__init__()
+
+        self.input_channel = input_channel
+        self.base_channel = base_channel
+        self.n_res_block = n_res_block
+        self.conv_kernel_size = conv_kernel_size
+        self.dilation_cycle_length = dilation_cycle_length
+        self.conditioner_size = conditioner_size
+        self.output_channel = output_channel if output_channel > 0 else input_channel
+
+        self.input = nn.Sequential(
+            Conv1d(
+                input_channel,
+                base_channel,
+                1,
+            ),
+            nn.ReLU(),
+        )
+
+        self.residual_blocks = nn.ModuleList(
+            [
+                ResidualBlock(
+                    channels=base_channel,
+                    kernel_size=conv_kernel_size,
+                    dilation=2 ** (i % dilation_cycle_length),
+                    d_context=conditioner_size,
+                )
+                for i in range(n_res_block)
+            ]
+        )
+
+        self.out_proj = nn.Sequential(
+            Conv1d(
+                base_channel,
+                base_channel,
+                1,
+            ),
+            nn.ReLU(),
+            zero_module(
+                Conv1d(
+                    base_channel,
+                    self.output_channel,
+                    1,
+                ),
+            ),
+        )
+
+    def forward(self, x, y, context=None):
+        """
+        Args:
+            x: Noisy mel-spectrogram [B x ``n_mel`` x L]
+            y: FILM embeddings with the shape of (B, ``base_channel``)
+            context: Context with the shape of [B x ``d_context`` x L], default to None.
+        """
+
+        h = self.input(x)
+
+        skip = None
+        for i in range(self.n_res_block):
+            h, skip_connection = self.residual_blocks[i](h, y, context)
+            skip = skip_connection if skip is None else skip_connection + skip
+
+        out = skip / math.sqrt(self.n_res_block)
+
+        out = self.out_proj(out)
+
+        return out
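
An illustrative instantiation of `BiDilConv` (not part of the commit). `Conv1d`/`zero_module` come from `modules/general/utils` and `GaU` from `modules/activation_functions`, both added elsewhere in this commit; the shapes below assume those modules preserve [B, C, T].

import torch
from modules.diffusion.bidilconv.bidilated_conv import BiDilConv

decoder = BiDilConv(
    input_channel=100,        # e.g. n_mel
    base_channel=256,
    n_res_block=20,
    conv_kernel_size=3,
    dilation_cycle_length=4,  # dilations cycle 1, 2, 4, 8, 1, 2, ...
    conditioner_size=256,     # d_context
)
x = torch.randn(8, 100, 200)        # noisy mel [B, n_mel, L]
y = torch.randn(8, 256)             # FiLM embedding [B, base_channel]
context = torch.randn(8, 256, 200)  # content condition [B, d_context, L]
out = decoder(x, y, context)        # [8, 100, 200]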
modules/diffusion/bidilconv/residual_block.py
ADDED
@@ -0,0 +1,73 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn as nn
+
+from modules.activation_functions import GaU
+from modules.general.utils import Conv1d
+
+
+class ResidualBlock(nn.Module):
+    r"""Residual block with dilated convolution, main portion of ``BiDilConv``.
+
+    Args:
+        channels: The number of channels of input and output.
+        kernel_size: The kernel size of dilated convolution.
+        dilation: The dilation rate of dilated convolution.
+        d_context: The dimension of content encoder output, None if don't use context.
+    """
+
+    def __init__(
+        self,
+        channels: int = 256,
+        kernel_size: int = 3,
+        dilation: int = 1,
+        d_context: int = None,
+    ):
+        super().__init__()
+
+        self.context = d_context
+
+        self.gau = GaU(
+            channels,
+            kernel_size,
+            dilation,
+            d_context,
+        )
+
+        self.out_proj = Conv1d(
+            channels,
+            channels * 2,
+            1,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        y_emb: torch.Tensor,
+        context: torch.Tensor = None,
+    ):
+        """
+        Args:
+            x: Latent representation inherited from previous residual block
+                with the shape of [B x C x T].
+            y_emb: Embeddings with the shape of [B x C], which will be FILM on the x.
+            context: Context with the shape of [B x ``d_context`` x T], default to None.
+        """
+
+        h = x + y_emb[..., None]
+
+        if self.context:
+            h = self.gau(h, context)
+        else:
+            h = self.gau(h)
+
+        h = self.out_proj(h)
+        res, skip = h.chunk(2, 1)
+
+        return (res + x) / math.sqrt(2.0), skip
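
A shape sketch for a single `ResidualBlock` (not part of the commit; output shapes assume `GaU` preserves [B, C, T]). The 1/sqrt(2) scaling on the residual path keeps activation variance roughly constant as blocks are stacked, and the skip output is what `BiDilConv` accumulates.

import torch
from modules.diffusion.bidilconv.residual_block import ResidualBlock

block = ResidualBlock(channels=256, kernel_size=3, dilation=2, d_context=256)
x = torch.randn(8, 256, 200)
y_emb = torch.randn(8, 256)       # broadcast-added over time (FiLM-style)
ctx = torch.randn(8, 256, 200)
res, skip = block(x, y_emb, ctx)  # both [8, 256, 200]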
modules/diffusion/karras/karras_diffusion.py
ADDED
@@ -0,0 +1,979 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Based on: https://github.com/crowsonkb/k-diffusion
+"""
+import random
+
+import numpy as np
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+
+# from piq import LPIPS
+from utils.ssim import SSIM
+
+from modules.diffusion.karras.random_utils import get_generator
+
+
+def mean_flat(tensor):
+    """
+    Take the mean over all non-batch dimensions.
+    """
+    return tensor.mean(dim=list(range(1, len(tensor.shape))))
+
+
+def append_dims(x, target_dims):
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(
+            f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
+        )
+    return x[(...,) + (None,) * dims_to_append]
+
+
+def append_zero(x):
+    return th.cat([x, x.new_zeros([1])])
+
+
+def get_weightings(weight_schedule, snrs, sigma_data):
+    if weight_schedule == "snr":
+        weightings = snrs
+    elif weight_schedule == "snr+1":
+        weightings = snrs + 1
+    elif weight_schedule == "karras":
+        weightings = snrs + 1.0 / sigma_data**2
+    elif weight_schedule == "truncated-snr":
+        weightings = th.clamp(snrs, min=1.0)
+    elif weight_schedule == "uniform":
+        weightings = th.ones_like(snrs)
+    else:
+        raise NotImplementedError()
+    return weightings
+
+
+class KarrasDenoiser:
+    def __init__(
+        self,
+        sigma_data: float = 0.5,
+        sigma_max=80.0,
+        sigma_min=0.002,
+        rho=7.0,
+        weight_schedule="karras",
+        distillation=False,
+        loss_norm="l2",
+    ):
+        self.sigma_data = sigma_data
+        self.sigma_max = sigma_max
+        self.sigma_min = sigma_min
+        self.weight_schedule = weight_schedule
+        self.distillation = distillation
+        self.loss_norm = loss_norm
+        # if loss_norm == "lpips":
+        #     self.lpips_loss = LPIPS(replace_pooling=True, reduction="none")
+        if loss_norm == "ssim":
+            self.ssim_loss = SSIM()
+        self.rho = rho
+        self.num_timesteps = 40
+
+    def get_snr(self, sigmas):
+        return sigmas**-2
+
+    def get_sigmas(self, sigmas):
+        return sigmas
+
+    def get_scalings(self, sigma):
+        c_skip = self.sigma_data**2 / (sigma**2 + self.sigma_data**2)
+        c_out = sigma * self.sigma_data / (sigma**2 + self.sigma_data**2) ** 0.5
+        c_in = 1 / (sigma**2 + self.sigma_data**2) ** 0.5
+        return c_skip, c_out, c_in
+
+    def get_scalings_for_boundary_condition(self, sigma):
+        c_skip = self.sigma_data**2 / (
+            (sigma - self.sigma_min) ** 2 + self.sigma_data**2
+        )
+        c_out = (
+            (sigma - self.sigma_min)
+            * self.sigma_data
+            / (sigma**2 + self.sigma_data**2) ** 0.5
+        )
+        c_in = 1 / (sigma**2 + self.sigma_data**2) ** 0.5
+        return c_skip, c_out, c_in
+
+    def training_losses(self, model, x_start, sigmas, condition=None, noise=None):
+        if noise is None:
+            noise = th.randn_like(x_start)
+
+        terms = {}
+
+        dims = x_start.ndim
+        x_t = x_start + noise * append_dims(sigmas, dims)
+        model_output, denoised = self.denoise(model, x_t, sigmas, condition)
+
+        snrs = self.get_snr(sigmas)
+        weights = append_dims(
+            get_weightings(self.weight_schedule, snrs, self.sigma_data), dims
+        )
+        # terms["xs_mse"] = mean_flat((denoised - x_start) ** 2)
+        terms["mse"] = mean_flat(weights * (denoised - x_start) ** 2)
+        # terms["mae"] = mean_flat(weights * th.abs(denoised - x_start))
+        # terms["mse"] = nn.MSELoss(reduction="none")(denoised, x_start)
+
+        # if "vb" in terms:
+        #     terms["loss"] = terms["mse"] + terms["vb"]
+        # else:
+        terms["loss"] = terms["mse"]
+
+        return terms
+
+    def consistency_losses(
+        self,
+        model,
+        x_start,
+        num_scales,
+        # model_kwargs=None,
+        condition=None,
+        target_model=None,
+        teacher_model=None,
+        teacher_diffusion=None,
+        noise=None,
+    ):
+        if noise is None:
+            noise = th.randn_like(x_start)
+
+        dims = x_start.ndim
+
+        def denoise_fn(x, t):
+            return self.denoise(model, x, t, condition)[1]
+
+        if target_model:
+
+            @th.no_grad()
+            def target_denoise_fn(x, t):
+                return self.denoise(target_model, x, t, condition)[1]
+
+        else:
+            raise NotImplementedError("Must have a target model")
+
+        if teacher_model:
+
+            @th.no_grad()
+            def teacher_denoise_fn(x, t):
+                return teacher_diffusion.denoise(teacher_model, x, t, condition)[1]
+
+        @th.no_grad()
+        def heun_solver(samples, t, next_t, x0):
+            x = samples
+            if teacher_model is None:
+                denoiser = x0
+            else:
+                denoiser = teacher_denoise_fn(x, t)
+
+            d = (x - denoiser) / append_dims(t, dims)
+            samples = x + d * append_dims(next_t - t, dims)
+            if teacher_model is None:
+                denoiser = x0
+            else:
+                denoiser = teacher_denoise_fn(samples, next_t)
+
+            next_d = (samples - denoiser) / append_dims(next_t, dims)
+            samples = x + (d + next_d) * append_dims((next_t - t) / 2, dims)
+
+            return samples
+
+        @th.no_grad()
+        def euler_solver(samples, t, next_t, x0):
+            x = samples
+            if teacher_model is None:
+                denoiser = x0
+            else:
+                denoiser = teacher_denoise_fn(x, t)
+            d = (x - denoiser) / append_dims(t, dims)
+            samples = x + d * append_dims(next_t - t, dims)
+
+            return samples
+
+        indices = th.randint(
+            0, num_scales - 1, (x_start.shape[0],), device=x_start.device
+        )
+
+        t = self.sigma_max ** (1 / self.rho) + indices / (num_scales - 1) * (
+            self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
+        )
+        t = t**self.rho
+
+        t2 = self.sigma_max ** (1 / self.rho) + (indices + 1) / (num_scales - 1) * (
+            self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
+        )
+        t2 = t2**self.rho
+
+        x_t = x_start + noise * append_dims(t, dims)
+
+        dropout_state = th.get_rng_state()
+        distiller = denoise_fn(x_t, t)
+
+        if teacher_model is None:
+            x_t2 = euler_solver(x_t, t, t2, x_start).detach()
+        else:
+            x_t2 = heun_solver(x_t, t, t2, x_start).detach()
+
+        th.set_rng_state(dropout_state)
+        distiller_target = target_denoise_fn(x_t2, t2)
+        distiller_target = distiller_target.detach()
+
+        snrs = self.get_snr(t)
+        weights = get_weightings(self.weight_schedule, snrs, self.sigma_data)
+        if self.loss_norm == "l1":
+            diffs = th.abs(distiller - distiller_target)
+            loss = mean_flat(diffs) * weights
+        elif self.loss_norm == "l2":
+            # diffs = (distiller - distiller_target) ** 2
+            loss = F.mse_loss(distiller, distiller_target)
+            # loss = mean_flat(diffs) * weights
+        elif self.loss_norm == "ssim":
+            loss = self.ssim_loss(distiller, distiller_target) * weights
+        # elif self.loss_norm == "l2-32":
+        #     distiller = F.interpolate(distiller, size=32, mode="bilinear")
+        #     distiller_target = F.interpolate(
+        #         distiller_target,
+        #         size=32,
+        #         mode="bilinear",
+        #     )
+        #     diffs = (distiller - distiller_target) ** 2
+        #     loss = mean_flat(diffs) * weights
+        # elif self.loss_norm == "lpips":
+        #     if x_start.shape[-1] < 256:
+        #         distiller = F.interpolate(distiller, size=224, mode="bilinear")
+        #         distiller_target = F.interpolate(
+        #             distiller_target, size=224, mode="bilinear"
+        #         )
+
+        #     loss = (
+        #         self.lpips_loss(
+        #             (distiller + 1) / 2.0,
+        #             (distiller_target + 1) / 2.0,
+        #         )
+        #         * weights
+        #     )
+        else:
+            raise ValueError(f"Unknown loss norm {self.loss_norm}")
+
+        terms = {}
+        terms["loss"] = loss
+
+        return terms
+
+    # def progdist_losses(
+    #     self,
+    #     model,
+    #     x_start,
+    #     num_scales,
+    #     model_kwargs=None,
+    #     teacher_model=None,
+    #     teacher_diffusion=None,
+    #     noise=None,
+    # ):
+    #     if model_kwargs is None:
+    #         model_kwargs = {}
+    #     if noise is None:
+    #         noise = th.randn_like(x_start)
+
+    #     dims = x_start.ndim
+
+    #     def denoise_fn(x, t):
+    #         return self.denoise(model, x, t, **model_kwargs)[1]
+
+    #     @th.no_grad()
+    #     def teacher_denoise_fn(x, t):
+    #         return teacher_diffusion.denoise(teacher_model, x, t, **model_kwargs)[1]
+
+    #     @th.no_grad()
+    #     def euler_solver(samples, t, next_t):
+    #         x = samples
+    #         denoiser = teacher_denoise_fn(x, t)
+    #         d = (x - denoiser) / append_dims(t, dims)
+    #         samples = x + d * append_dims(next_t - t, dims)
+
+    #         return samples
+
+    #     @th.no_grad()
+    #     def euler_to_denoiser(x_t, t, x_next_t, next_t):
+    #         denoiser = x_t - append_dims(t, dims) * (x_next_t - x_t) / append_dims(
+    #             next_t - t, dims
+    #         )
+    #         return denoiser
+
+    #     indices = th.randint(0, num_scales, (x_start.shape[0],), device=x_start.device)
+
+    #     t = self.sigma_max ** (1 / self.rho) + indices / num_scales * (
+    #         self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
+    #     )
+    #     t = t**self.rho
+
+    #     t2 = self.sigma_max ** (1 / self.rho) + (indices + 0.5) / num_scales * (
+    #         self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
+    #     )
+    #     t2 = t2**self.rho
+
+    #     t3 = self.sigma_max ** (1 / self.rho) + (indices + 1) / num_scales * (
+    #         self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
+    #     )
+    #     t3 = t3**self.rho
+
+    #     x_t = x_start + noise * append_dims(t, dims)
+
+    #     denoised_x = denoise_fn(x_t, t)
+
+    #     x_t2 = euler_solver(x_t, t, t2).detach()
+    #     x_t3 = euler_solver(x_t2, t2, t3).detach()
+
+    #     target_x = euler_to_denoiser(x_t, t, x_t3, t3).detach()
+
+    #     snrs = self.get_snr(t)
+    #     weights = get_weightings(self.weight_schedule, snrs, self.sigma_data)
+    #     if self.loss_norm == "l1":
+    #         diffs = th.abs(denoised_x - target_x)
+    #         loss = mean_flat(diffs) * weights
+    #     elif self.loss_norm == "l2":
+    #         diffs = (denoised_x - target_x) ** 2
+    #         loss = mean_flat(diffs) * weights
+    #     elif self.loss_norm == "lpips":
+    #         if x_start.shape[-1] < 256:
+    #             denoised_x = F.interpolate(denoised_x, size=224, mode="bilinear")
+    #             target_x = F.interpolate(target_x, size=224, mode="bilinear")
+    #         loss = (
+    #             self.lpips_loss(
+    #                 (denoised_x + 1) / 2.0,
+    #                 (target_x + 1) / 2.0,
+    #             )
+    #             * weights
+    #         )
+    #     else:
+    #         raise ValueError(f"Unknown loss norm {self.loss_norm}")
+
+    #     terms = {}
+    #     terms["loss"] = loss
+
+    #     return terms
+
+    def denoise(self, model, x_t, sigmas, condition):
+        if not self.distillation:
+            c_skip, c_out, c_in = [
+                append_dims(x, x_t.ndim) for x in self.get_scalings(sigmas)
+            ]
+        else:
+            c_skip, c_out, c_in = [
+                append_dims(x, x_t.ndim)
+                for x in self.get_scalings_for_boundary_condition(sigmas)
+            ]
+        rescaled_t = 1000 * 0.25 * th.log(sigmas + 1e-44)
+        # rescaled_t = rescaled_t[:, None]
+        model_output = model(c_in * x_t, rescaled_t, condition)
+        denoised = c_out * model_output + c_skip * x_t
+        return model_output, denoised
+
+
+def karras_sample(
+    diffusion,
+    model,
+    shape,
+    steps,
+    clip_denoised=True,
+    progress=True,
+    callback=None,
+    # model_kwargs=None,
+    condition=None,
+    device=None,
+    sigma_min=0.002,
+    sigma_max=80,  # higher for highres?
+    rho=7.0,
+    sampler="heun",
+    s_churn=0.0,
+    s_tmin=0.0,
+    s_tmax=float("inf"),
+    s_noise=1.0,
+    generator=None,
+    ts=None,
+):
+    if generator is None:
+        generator = get_generator("dummy")
+
+    if sampler == "progdist":
+        sigmas = get_sigmas_karras(steps + 1, sigma_min, sigma_max, rho, device=device)
+    else:
+        sigmas = get_sigmas_karras(steps, sigma_min, sigma_max, rho, device=device)
+    th.manual_seed(42)
+    x_T = generator.randn(*shape, device=device) * sigma_max
+    sigmas = sigmas.unsqueeze(-1)
+    sample_fn = {
+        "heun": sample_heun,
+        "dpm": sample_dpm,
+        "ancestral": sample_euler_ancestral,
+        "onestep": sample_onestep,
+        "progdist": sample_progdist,
+        "euler": sample_euler,
+        "multistep": stochastic_iterative_sampler,
+    }[sampler]
+
+    if sampler in ["heun", "dpm"]:
+        sampler_args = dict(
+            s_churn=s_churn, s_tmin=s_tmin, s_tmax=s_tmax, s_noise=s_noise
+        )
+    elif sampler == "multistep":
+        sampler_args = dict(
+            ts=ts, t_min=sigma_min, t_max=sigma_max, rho=diffusion.rho, steps=steps
+        )
+    else:
+        sampler_args = {}
+
+    def denoiser(x_t, sigma):
+        _, denoised = diffusion.denoise(model, x_t, sigma, condition)
+        if clip_denoised:
+            denoised = denoised.clamp(-1, 1)
+        return denoised
+
+    x_0 = sample_fn(
+        denoiser,
+        x_T,
+        sigmas,
+        generator,
+        progress=progress,
+        callback=callback,
+        **sampler_args,
+    )
+    return x_0.clamp(-1, 1)
+
+
+def get_sigmas_karras(n, sigma_min, sigma_max, rho=7.0, device="cpu"):
+    """Constructs the noise schedule of Karras et al. (2022)."""
+    ramp = th.linspace(0, 1, n)
+    min_inv_rho = sigma_min ** (1 / rho)
+    max_inv_rho = sigma_max ** (1 / rho)
+    sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+    return append_zero(sigmas).to(device)
+
+
+def to_d(x, sigma, denoised):
+    """Converts a denoiser output to a Karras ODE derivative."""
+    return (x - denoised) / append_dims(sigma, x.ndim)
+
+
+def get_ancestral_step(sigma_from, sigma_to):
+    """Calculates the noise level (sigma_down) to step down to and the amount
+    of noise to add (sigma_up) when doing an ancestral sampling step."""
+    sigma_up = (
+        sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2
+    ) ** 0.5
+    sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
+    return sigma_down, sigma_up
+
+
+@th.no_grad()
+def sample_euler_ancestral(model, x, sigmas, generator, progress=False, callback=None):
+    """Ancestral sampling with Euler method steps."""
+    s_in = x.new_ones([x.shape[0]])
+    indices = range(len(sigmas) - 1)
+    if progress:
+        from tqdm.auto import tqdm
+
+        indices = tqdm(indices)
+
+    for i in indices:
+        denoised = model(x, sigmas[i] * s_in)
+        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1])
+        if callback is not None:
+            callback(
+                {
+                    "x": x,
+                    "i": i,
+                    "sigma": sigmas[i],
+                    "sigma_hat": sigmas[i],
+                    "denoised": denoised,
+                }
+            )
+        d = to_d(x, sigmas[i], denoised)
+        # Euler method
+        dt = sigma_down - sigmas[i]
+        x = x + d * dt
+        x = x + generator.randn_like(x) * sigma_up
+    return x
+
+
+@th.no_grad()
+def sample_midpoint_ancestral(model, x, ts, generator, progress=False, callback=None):
+    """Ancestral sampling with midpoint method steps."""
+    s_in = x.new_ones([x.shape[0]])
+    step_size = 1 / len(ts)
+    if progress:
+        from tqdm.auto import tqdm
+
+        ts = tqdm(ts)
+
+    for tn in ts:
+        dn = model(x, tn * s_in)
+        dn_2 = model(x + (step_size / 2) * dn, (tn + step_size / 2) * s_in)
+        x = x + step_size * dn_2
+        if callback is not None:
+            callback({"x": x, "tn": tn, "dn": dn, "dn_2": dn_2})
+    return x
+
+
+@th.no_grad()
+def sample_heun(
+    denoiser,
+    x,
+    sigmas,
+    generator,
+    progress=False,
+    callback=None,
+    s_churn=0.0,
+    s_tmin=0.0,
+    s_tmax=float("inf"),
+    s_noise=1.0,
+):
+    """Implements Algorithm 2 (Heun steps) from Karras et al. (2022)."""
+    s_in = x.new_ones([x.shape[0]])
+    indices = range(len(sigmas) - 1)
+    if progress:
+        from tqdm.auto import tqdm
+
+        indices = tqdm(indices)
+
+    for i in indices:
+        gamma = (
+            min(s_churn / (len(sigmas) - 1), 2**0.5 - 1)
+            if s_tmin <= sigmas[i] <= s_tmax
+            else 0.0
+        )
+        eps = generator.randn_like(x) * s_noise
+        sigma_hat = sigmas[i] * (gamma + 1)
+        if gamma > 0:
+            x = x + eps * (sigma_hat**2 - sigmas[i] ** 2) ** 0.5
+        denoised = denoiser(x, sigma_hat * s_in)
+        d = to_d(x, sigma_hat, denoised)
+        if callback is not None:
+            callback(
+                {
+                    "x": x,
+                    "i": i,
+                    "sigma": sigmas[i],
+                    "sigma_hat": sigma_hat,
+                    "denoised": denoised,
+                }
+            )
+        dt = sigmas[i + 1] - sigma_hat
+        if sigmas[i + 1] == 0:
+            # Euler method
+            x = x + d * dt
+        else:
+            # Heun's method
+            x_2 = x + d * dt
+            denoised_2 = denoiser(x_2, sigmas[i + 1] * s_in)
+            d_2 = to_d(x_2, sigmas[i + 1], denoised_2)
+            d_prime = (d + d_2) / 2
+            x = x + d_prime * dt
+    return x
+
+
+@th.no_grad()
+def sample_euler(
+    denoiser,
+    x,
+    sigmas,
+    generator,
+    progress=False,
+    callback=None,
+):
+    """Implements Algorithm 2 (Heun steps) from Karras et al. (2022)."""
+    s_in = x.new_ones([x.shape[0]])
+    indices = range(len(sigmas) - 1)
+    if progress:
+        from tqdm.auto import tqdm
+
+        indices = tqdm(indices)
+
+    for i in indices:
+        sigma = sigmas[i]
+        denoised = denoiser(x, sigma * s_in)
+        d = to_d(x, sigma, denoised)
+        if callback is not None:
+            callback(
+                {
+                    "x": x,
+                    "i": i,
+                    "sigma": sigmas[i],
+                    "denoised": denoised,
+                }
+            )
+        dt = sigmas[i + 1] - sigma
+        x = x + d * dt
+    return x
+
+
+@th.no_grad()
+def sample_dpm(
+    denoiser,
+    x,
+    sigmas,
+    generator,
+    progress=False,
+    callback=None,
+    s_churn=0.0,
+    s_tmin=0.0,
+    s_tmax=float("inf"),
+    s_noise=1.0,
+):
+    """A sampler inspired by DPM-Solver-2 and Algorithm 2 from Karras et al. (2022)."""
+    s_in = x.new_ones([x.shape[0]])
+    indices = range(len(sigmas) - 1)
+    if progress:
+        from tqdm.auto import tqdm
+
+        indices = tqdm(indices)
+
+    for i in indices:
+        gamma = (
+            min(s_churn / (len(sigmas) - 1), 2**0.5 - 1)
+            if s_tmin <= sigmas[i] <= s_tmax
+            else 0.0
+        )
+        eps = generator.randn_like(x) * s_noise
+        sigma_hat = sigmas[i] * (gamma + 1)
+        if gamma > 0:
+            x = x + eps * (sigma_hat**2 - sigmas[i] ** 2) ** 0.5
+        denoised = denoiser(x, sigma_hat * s_in)
+        d = to_d(x, sigma_hat, denoised)
+        if callback is not None:
+            callback(
+                {
+                    "x": x,
+                    "i": i,
+                    "sigma": sigmas[i],
+                    "sigma_hat": sigma_hat,
+                    "denoised": denoised,
+                }
+            )
+        # Midpoint method, where the midpoint is chosen according to a rho=3 Karras schedule
+        sigma_mid = ((sigma_hat ** (1 / 3) + sigmas[i + 1] ** (1 / 3)) / 2) ** 3
+        dt_1 = sigma_mid - sigma_hat
+        dt_2 = sigmas[i + 1] - sigma_hat
+        x_2 = x + d * dt_1
+        denoised_2 = denoiser(x_2, sigma_mid * s_in)
+        d_2 = to_d(x_2, sigma_mid, denoised_2)
+        x = x + d_2 * dt_2
+    return x
+
+
+@th.no_grad()
+def sample_onestep(
+    distiller,
+    x,
+    sigmas,
+    generator=None,
+    progress=False,
+    callback=None,
+):
+    """Single-step generation from a distilled model."""
+    s_in = x.new_ones([x.shape[0]])
+    return distiller(x, sigmas[0] * s_in)
+
+
+@th.no_grad()
+def stochastic_iterative_sampler(
+    distiller,
+    x,
+    sigmas,
+    generator,
+    ts,
+    progress=False,
+    callback=None,
+    t_min=0.002,
+    t_max=80.0,
+    rho=7.0,
+    steps=40,
+):
+    t_max_rho = t_max ** (1 / rho)
+    t_min_rho = t_min ** (1 / rho)
+    s_in = x.new_ones([x.shape[0]])
+
+    for i in range(len(ts) - 1):
+        t = (t_max_rho + ts[i] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+        x0 = distiller(x, t * s_in)
+        next_t = (t_max_rho + ts[i + 1] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+        next_t = np.clip(next_t, t_min, t_max)
+        x = x0 + generator.randn_like(x) * np.sqrt(next_t**2 - t_min**2)
+
+    return x
+
+
+@th.no_grad()
+def sample_progdist(
+    denoiser,
+    x,
+    sigmas,
+    generator=None,
+    progress=False,
+    callback=None,
+):
+    s_in = x.new_ones([x.shape[0]])
+    sigmas = sigmas[:-1]  # skip the zero sigma
+
+    indices = range(len(sigmas) - 1)
+    if progress:
+        from tqdm.auto import tqdm
+
+        indices = tqdm(indices)
+
+    for i in indices:
+        sigma = sigmas[i]
+        denoised = denoiser(x, sigma * s_in)
+        d = to_d(x, sigma, denoised)
+        if callback is not None:
+            callback(
+                {
+                    "x": x,
+                    "i": i,
+                    "sigma": sigma,
+                    "denoised": denoised,
+                }
+            )
+        dt = sigmas[i + 1] - sigma
+        x = x + d * dt
+
+    return x
+
+
+# @th.no_grad()
+# def iterative_colorization(
+#     distiller,
+#     images,
+#     x,
+#     ts,
+#     t_min=0.002,
+#     t_max=80.0,
+#     rho=7.0,
+#     steps=40,
+#     generator=None,
+# ):
+#     def obtain_orthogonal_matrix():
+#         vector = np.asarray([0.2989, 0.5870, 0.1140])
+#         vector = vector / np.linalg.norm(vector)
+#         matrix = np.eye(3)
+#         matrix[:, 0] = vector
+#         matrix = np.linalg.qr(matrix)[0]
+#         if np.sum(matrix[:, 0]) < 0:
+#             matrix = -matrix
+#         return matrix
+
+#     Q = th.from_numpy(obtain_orthogonal_matrix()).to(dist_util.dev()).to(th.float32)
+#     mask = th.zeros(*x.shape[1:], device=dist_util.dev())
+#     mask[0, ...] = 1.0
+
+#     def replacement(x0, x1):
+#         x0 = th.einsum("bchw,cd->bdhw", x0, Q)
+#         x1 = th.einsum("bchw,cd->bdhw", x1, Q)
+
+#         x_mix = x0 * mask + x1 * (1.0 - mask)
+#         x_mix = th.einsum("bdhw,cd->bchw", x_mix, Q)
+#         return x_mix
+
+#     t_max_rho = t_max ** (1 / rho)
+#     t_min_rho = t_min ** (1 / rho)
+#     s_in = x.new_ones([x.shape[0]])
+#     images = replacement(images, th.zeros_like(images))
+
+#     for i in range(len(ts) - 1):
+#         t = (t_max_rho + ts[i] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+#         x0 = distiller(x, t * s_in)
+#         x0 = th.clamp(x0, -1.0, 1.0)
+#         x0 = replacement(images, x0)
+#         next_t = (t_max_rho + ts[i + 1] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+#         next_t = np.clip(next_t, t_min, t_max)
+#         x = x0 + generator.randn_like(x) * np.sqrt(next_t**2 - t_min**2)

+#     return x, images


+# @th.no_grad()
+# def iterative_inpainting(
+#     distiller,
+#     images,
+#     x,
+#     ts,
+#     t_min=0.002,
+#     t_max=80.0,
+#     rho=7.0,
+#     steps=40,
+#     generator=None,
+# ):
+#     from PIL import Image, ImageDraw, ImageFont

+#     image_size = x.shape[-1]

+#     # create a blank image with a white background
+#     img = Image.new("RGB", (image_size, image_size), color="white")

+#     # get a drawing context for the image
+#     draw = ImageDraw.Draw(img)

+#     # load a font
+#     font = ImageFont.truetype("arial.ttf", 250)

+#     # draw the letter "C" in black
+#     draw.text((50, 0), "S", font=font, fill=(0, 0, 0))

+#     # convert the image to a numpy array
+#     img_np = np.array(img)
+#     img_np = img_np.transpose(2, 0, 1)
+#     img_th = th.from_numpy(img_np).to(dist_util.dev())

+#     mask = th.zeros(*x.shape, device=dist_util.dev())
+#     mask = mask.reshape(-1, 7, 3, image_size, image_size)

+#     mask[::2, :, img_th > 0.5] = 1.0
+#     mask[1::2, :, img_th < 0.5] = 1.0
+#     mask = mask.reshape(-1, 3, image_size, image_size)

+#     def replacement(x0, x1):
+#         x_mix = x0 * mask + x1 * (1 - mask)
+#         return x_mix

+#     t_max_rho = t_max ** (1 / rho)
+#     t_min_rho = t_min ** (1 / rho)
+#     s_in = x.new_ones([x.shape[0]])
+#     images = replacement(images, -th.ones_like(images))

+#     for i in range(len(ts) - 1):
+#         t = (t_max_rho + ts[i] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+#         x0 = distiller(x, t * s_in)
+#         x0 = th.clamp(x0, -1.0, 1.0)
+#         x0 = replacement(images, x0)
+#         next_t = (t_max_rho + ts[i + 1] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+#         next_t = np.clip(next_t, t_min, t_max)
+#         x = x0 + generator.randn_like(x) * np.sqrt(next_t**2 - t_min**2)

+#     return x, images


+# @th.no_grad()
+# def iterative_superres(
+#     distiller,
+#     images,
+#     x,
+#     ts,
+#     t_min=0.002,
+#     t_max=80.0,
+#     rho=7.0,
+#     steps=40,
+#     generator=None,
+# ):
+#     patch_size = 8

+#     def obtain_orthogonal_matrix():
+#         vector = np.asarray([1] * patch_size**2)
+#         vector = vector / np.linalg.norm(vector)
+#         matrix = np.eye(patch_size**2)
+#         matrix[:, 0] = vector
+#         matrix = np.linalg.qr(matrix)[0]
+#         if np.sum(matrix[:, 0]) < 0:
+#             matrix = -matrix
+#         return matrix

+#     Q = th.from_numpy(obtain_orthogonal_matrix()).to(dist_util.dev()).to(th.float32)

+#     image_size = x.shape[-1]

+#     def replacement(x0, x1):
+#         x0_flatten = (
+#             x0.reshape(-1, 3, image_size, image_size)
+#             .reshape(
+#                 -1,
+#                 3,
+#                 image_size // patch_size,
+#                 patch_size,
+#                 image_size // patch_size,
+#                 patch_size,
+#             )
+#             .permute(0, 1, 2, 4, 3, 5)
+#             .reshape(-1, 3, image_size**2 // patch_size**2, patch_size**2)
+#         )
+#         x1_flatten = (
+#             x1.reshape(-1, 3, image_size, image_size)
+#             .reshape(
+#                 -1,
+#                 3,
+#                 image_size // patch_size,
+#                 patch_size,
+#                 image_size // patch_size,
+#                 patch_size,
+#             )
+#             .permute(0, 1, 2, 4, 3, 5)
+#             .reshape(-1, 3, image_size**2 // patch_size**2, patch_size**2)
+#         )
+#         x0 = th.einsum("bcnd,de->bcne", x0_flatten, Q)
+#         x1 = th.einsum("bcnd,de->bcne", x1_flatten, Q)
+#         x_mix = x0.new_zeros(x0.shape)
+#         x_mix[..., 0] = x0[..., 0]
+#         x_mix[..., 1:] = x1[..., 1:]
+#         x_mix = th.einsum("bcne,de->bcnd", x_mix, Q)
+#         x_mix = (
+#             x_mix.reshape(
+#                 -1,
+#                 3,
+#                 image_size // patch_size,
+#                 image_size // patch_size,
+#                 patch_size,
+#                 patch_size,
+#             )
+#             .permute(0, 1, 2, 4, 3, 5)
+#             .reshape(-1, 3, image_size, image_size)
+#         )
+#         return x_mix

+#     def average_image_patches(x):
+#         x_flatten = (
+#             x.reshape(-1, 3, image_size, image_size)
+#             .reshape(
+#                 -1,
+#                 3,
+#                 image_size // patch_size,
+#                 patch_size,
+#                 image_size // patch_size,
+#                 patch_size,
+#             )
+#             .permute(0, 1, 2, 4, 3, 5)
+#             .reshape(-1, 3, image_size**2 // patch_size**2, patch_size**2)
+#         )
+#         x_flatten[..., :] = x_flatten.mean(dim=-1, keepdim=True)
+#         return (
+#             x_flatten.reshape(
+#                 -1,
+#                 3,
+#                 image_size // patch_size,
+#                 image_size // patch_size,
+#                 patch_size,
+#                 patch_size,
+#             )
+#             .permute(0, 1, 2, 4, 3, 5)
+#             .reshape(-1, 3, image_size, image_size)
+#         )

+#     t_max_rho = t_max ** (1 / rho)
+#     t_min_rho = t_min ** (1 / rho)
+#     s_in = x.new_ones([x.shape[0]])
+#     images = average_image_patches(images)

+#     for i in range(len(ts) - 1):
+#         t = (t_max_rho + ts[i] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+#         x0 = distiller(x, t * s_in)
+#         x0 = th.clamp(x0, -1.0, 1.0)
+#         x0 = replacement(images, x0)
+#         next_t = (t_max_rho + ts[i + 1] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
+#         next_t = np.clip(next_t, t_min, t_max)
+#         x = x0 + generator.randn_like(x) * np.sqrt(next_t**2 - t_min**2)

+#     return x, images
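
A quick sketch of the Karras schedule and scaling functions defined above (not part of the commit). With rho=7 the sigmas are spaced densely near sigma_min, and the c_skip/c_out blend makes the denoiser collapse to the identity as sigma -> 0.

import torch as th
from modules.diffusion.karras.karras_diffusion import (
    KarrasDenoiser,
    get_sigmas_karras,
)

sigmas = get_sigmas_karras(n=10, sigma_min=0.002, sigma_max=80.0, rho=7.0)
print(sigmas)  # 10 descending sigmas plus a trailing 0.0 from append_zero

diffusion = KarrasDenoiser(sigma_data=0.5)
c_skip, c_out, c_in = diffusion.get_scalings(th.tensor([0.002, 80.0]))
print(c_skip)  # close to 1.0 at sigma=0.002, close to 0.0 at sigma=80.0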
modules/diffusion/karras/random_utils.py
ADDED
@@ -0,0 +1,177 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch as th
+
+
+def get_generator(generator, num_samples=0, seed=0):
+    if generator == "dummy":
+        return DummyGenerator()
+    elif generator == "determ":
+        return DeterministicGenerator(num_samples, seed)
+    elif generator == "determ-indiv":
+        return DeterministicIndividualGenerator(num_samples, seed)
+    else:
+        raise NotImplementedError
+
+
+class DummyGenerator:
+    def randn(self, *args, **kwargs):
+        return th.randn(*args, **kwargs)
+
+    def randint(self, *args, **kwargs):
+        return th.randint(*args, **kwargs)
+
+    def randn_like(self, *args, **kwargs):
+        return th.randn_like(*args, **kwargs)
+
+
+class DeterministicGenerator:
+    """
+    RNG that deterministically samples num_samples samples, independent of batch_size or mpi_machines.
+    Uses a single rng, samples num_samples-sized randomness, and subsamples the current indices.
+    """
+
+    def __init__(self, num_samples, seed=0):
+        print("Warning: Distributed not initialised, using single rank")
+        self.rank = 0
+        self.world_size = 1
+        self.num_samples = num_samples
+        self.done_samples = 0
+        self.seed = seed
+        self.rng_cpu = th.Generator()
+        if th.cuda.is_available():
+            self.rng_cuda = th.Generator(th.device("cuda"))  # fixed: the original called dist_util.dev(), but dist_util is never imported here
+        self.set_seed(seed)
+
+    def get_global_size_and_indices(self, size):
+        global_size = (self.num_samples, *size[1:])
+        indices = th.arange(
+            self.done_samples + self.rank,
+            self.done_samples + self.world_size * int(size[0]),
+            self.world_size,
+        )
+        indices = th.clamp(indices, 0, self.num_samples - 1)
+        assert (
+            len(indices) == size[0]
+        ), f"rank={self.rank}, ws={self.world_size}, l={len(indices)}, bs={size[0]}"
+        return global_size, indices
+
+    def get_generator(self, device):
+        return self.rng_cpu if th.device(device).type == "cpu" else self.rng_cuda
+
+    def randn(self, *size, dtype=th.float, device="cpu"):
+        global_size, indices = self.get_global_size_and_indices(size)
+        generator = self.get_generator(device)
+        return th.randn(*global_size, generator=generator, dtype=dtype, device=device)[
+            indices
+        ]
+
+    def randint(self, low, high, size, dtype=th.long, device="cpu"):
+        global_size, indices = self.get_global_size_and_indices(size)
+        generator = self.get_generator(device)
+        return th.randint(
+            low, high, generator=generator, size=global_size, dtype=dtype, device=device
+        )[indices]
+
+    def randn_like(self, tensor):
+        size, dtype, device = tensor.size(), tensor.dtype, tensor.device
+        return self.randn(*size, dtype=dtype, device=device)
+
+    def set_done_samples(self, done_samples):
+        self.done_samples = done_samples
+        self.set_seed(self.seed)
+
+    def get_seed(self):
+        return self.seed
+
+    def set_seed(self, seed):
+        self.rng_cpu.manual_seed(seed)
+        if th.cuda.is_available():
+            self.rng_cuda.manual_seed(seed)
+
+
+class DeterministicIndividualGenerator:
+    """
+    RNG that deterministically samples num_samples samples, independent of batch_size or mpi_machines.
+    Uses a separate rng for each sample to reduce memory usage.
+    """
+
+    def __init__(self, num_samples, seed=0):
+        print("Warning: Distributed not initialised, using single rank")
+        self.rank = 0
+        self.world_size = 1
+        self.num_samples = num_samples
+        self.done_samples = 0
+        self.seed = seed
+        self.rng_cpu = [th.Generator() for _ in range(num_samples)]
+        if th.cuda.is_available():
+            self.rng_cuda = [th.Generator(th.device("cuda")) for _ in range(num_samples)]  # fixed: dist_util is never imported here
+        self.set_seed(seed)
+
+    def get_size_and_indices(self, size):
+        indices = th.arange(
+            self.done_samples + self.rank,
+            self.done_samples + self.world_size * int(size[0]),
+            self.world_size,
+        )
+        indices = th.clamp(indices, 0, self.num_samples - 1)
+        assert (
+            len(indices) == size[0]
+        ), f"rank={self.rank}, ws={self.world_size}, l={len(indices)}, bs={size[0]}"
+        return (1, *size[1:]), indices
+
+    def get_generator(self, device):
+        return self.rng_cpu if th.device(device).type == "cpu" else self.rng_cuda
+
+    def randn(self, *size, dtype=th.float, device="cpu"):
+        size, indices = self.get_size_and_indices(size)
+        generator = self.get_generator(device)
+        return th.cat(
+            [
+                th.randn(*size, generator=generator[i], dtype=dtype, device=device)
+                for i in indices
+            ],
+            dim=0,
+        )
+
+    def randint(self, low, high, size, dtype=th.long, device="cpu"):
+        size, indices = self.get_size_and_indices(size)
+        generator = self.get_generator(device)
+        return th.cat(
+            [
+                th.randint(
+                    low,
+                    high,
+                    generator=generator[i],
+                    size=size,
+                    dtype=dtype,
+                    device=device,
+                )
+                for i in indices
+            ],
+            dim=0,
+        )
+
+    def randn_like(self, tensor):
+        size, dtype, device = tensor.size(), tensor.dtype, tensor.device
+        return self.randn(*size, dtype=dtype, device=device)
+
+    def set_done_samples(self, done_samples):
+        self.done_samples = done_samples
+
+    def get_seed(self):
+        return self.seed
+
+    def set_seed(self, seed):
+        [
+            rng_cpu.manual_seed(i + self.num_samples * seed)
+            for i, rng_cpu in enumerate(self.rng_cpu)
+        ]
+        if th.cuda.is_available():
+            [
+                rng_cuda.manual_seed(i + self.num_samples * seed)
+                for i, rng_cuda in enumerate(self.rng_cuda)
+            ]
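A short usage sketch of the generators above (the sample count and tensor shapes are illustrative, and it assumes the repo root is on PYTHONPATH): because DeterministicGenerator draws num_samples worth of randomness from a single seeded rng and subsamples the current indices, the values assigned to a given sample index do not depend on how sampling is batched.

import torch as th
from modules.diffusion.karras.random_utils import get_generator

gen = get_generator("determ", num_samples=4, seed=0)

# Draw all four samples in one batch of 4 ...
batch_all = gen.randn(4, 3)

# ... or in two batches of 2, advancing done_samples in between
# (set_done_samples also re-seeds the underlying rng).
gen.set_done_samples(0)
first = gen.randn(2, 3)
gen.set_done_samples(2)
second = gen.randn(2, 3)

# Per-index values are identical regardless of batching.
assert th.equal(batch_all, th.cat([first, second], dim=0))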