---
model_name: pretrain

model:
  dim: 1024
  depth: 24
  heads: 16
  ff_mult: 4
  text_dim: 512

  conv_layers: 0

  text_num_embeds: 200
  mel_dim: 100
  t5_dim: 1024
  clap_dim: 512

  use_checkpoint: false
  qk_norm: true
  skip: true

mel:
  target_sample_rate: 24000
  n_mel_channels: 100
  hop_length: 256

opt:
  learning_rate: 2.0e-04
  beta1: 0.9
  beta2: 0.999
  weight_decay: 0.01
  adam_epsilon: 1.0e-08
  grad_clip: 1.0
  batch_size: 64
  accumulation_steps: 1

  # NOTE(review): nested under `opt` following the original blank-line
  # grouping — confirm this path against the consumer's config schema.
  drop_spk: 0.1
  drop_text: 0.5

lr_scheduler:
  warmup_steps: 5000
  decay_steps: 150000
  end_factor: 1.0e-02

data:
  trainset:
    dataset_dir: ""
    clap_emb_dir: "./data/clap_embs/"
    t5_folder_name: "t5"
    phn_folder_name: "g2p"
    manifest_name: "manifest"
    json_name: "jsons"
    dynamic_batching: true
    text_pad_token: -1
    audio_pad_token: 0.0
    split: "train_PT"
    sr: 24000
    norm_audio: false

  valset:
    dataset_dir: ""
    clap_emb_dir: "./data/clap_embs/"
    t5_folder_name: "t5"
    phn_folder_name: "g2p"
    manifest_name: "manifest"
    json_name: "jsons"
    dynamic_batching: true
    text_pad_token: -1
    audio_pad_token: 0.0
    split: "validation_PT"
    sr: 24000
    norm_audio: false