update F5 and E2 config
src/f5_tts/config/E2TTS_Base_train.yaml
CHANGED
@@ -3,36 +3,36 @@ hydra:
     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 datasets:
-  name: Emilia_ZH_EN
+  name: Emilia_ZH_EN # dataset name
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
   batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
 
 optim:
-  epochs: 15
-  learning_rate: 7.5e-5
+  epochs: 15 # max epochs
+  learning_rate: 7.5e-5 # learning rate
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0
+  max_grad_norm: 1.0 # gradient clipping
 
 model:
-  name:
-  tokenizer:
+  name: E2TTS_Base # model name
+  tokenizer: pinyin # tokenizer type
   tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
   arch:
-    dim: 1024
-    depth: 24
-    heads: 16
-    ff_mult: 4
+    dim: 1024 # model dimension
+    depth: 24 # number of transformer layers
+    heads: 16 # number of transformer heads
+    ff_mult: 4 # ff layer expansion
   mel_spec:
-    target_sample_rate: 24000
-    n_mel_channels: 100
-    hop_length: 256
-    win_length: 1024
-    n_fft: 1024
+    target_sample_rate: 24000 # target sample rate
+    n_mel_channels: 100 # mel channel
+    hop_length: 256 # hop length
+    win_length: 1024 # window length
+    n_fft: 1024 # fft length
     mel_spec_type: vocos # 'vocos' or 'bigvgan'
-    is_local_vocoder: False
-    local_vocoder_path: None
+    is_local_vocoder: False # use local vocoder or not
+    local_vocoder_path: None # path to local vocoder
 
 ckpts:
   save_per_updates: 50000 # save checkpoint per steps
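A quick numeric restatement of the batch-size and warmup comments above, using the mel settings from this config (hop_length 256 at 24 kHz). This is a back-of-the-envelope Python sketch, not code from the repo:

```python
# Rough batch-size arithmetic for the values in E2TTS_Base_train.yaml.
# Only restates the comments in the config numerically.

target_sample_rate = 24000   # Hz
hop_length = 256             # samples per mel frame
batch_size_per_gpu = 38400   # frames, since batch_size_type is "frame"
num_gpus = 8
grad_accumulation_steps = 1

frames_per_second = target_sample_rate / hop_length              # 93.75 frames/s
seconds_per_gpu_batch = batch_size_per_gpu / frames_per_second   # 409.6 s of audio per GPU per step
total_frames_per_step = batch_size_per_gpu * num_gpus            # 307200, as in the comment

# "updates = steps / grad_accumulation_steps": with grad_accumulation_steps = 1,
# every training step is an optimizer update, so num_warmup_updates = 20000 warmup steps.
warmup_steps = 20000
warmup_updates = warmup_steps / grad_accumulation_steps

print(frames_per_second, seconds_per_gpu_batch, total_frames_per_step, warmup_updates)
```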
src/f5_tts/config/E2TTS_Small_train.yaml
ADDED
@@ -0,0 +1,40 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame # "frame" or "sample"
+  max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000 # warmup steps
+  grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+
+model:
+  name: E2TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 768
+    depth: 20
+    heads: 12
+    ff_mult: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos # 'vocos' or 'bigvgan'
+    is_local_vocoder: False
+    local_vocoder_path: None
+
+ckpts:
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
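The new E2TTS_Small_train.yaml is meant to track E2TTS_Base_train.yaml except for the model name and the smaller architecture (dim 768, depth 20, heads 12 vs 1024/24/16). One way to verify that only those keys differ is to flatten both files and diff them, e.g. with OmegaConf (the config library Hydra builds on). The sketch below assumes it is run from the repository root and leaves ${...} interpolations unresolved:

```python
# Sketch: flatten two YAML configs and list keys whose values differ.
from omegaconf import OmegaConf


def flatten(node, prefix=""):
    """Flatten a nested dict into {"a.b.c": value} pairs."""
    out = {}
    for key, value in node.items():
        path = f"{prefix}{key}"
        if isinstance(value, dict):
            out.update(flatten(value, prefix=f"{path}."))
        else:
            out[path] = value
    return out


base = OmegaConf.to_container(OmegaConf.load("src/f5_tts/config/E2TTS_Base_train.yaml"), resolve=False)
small = OmegaConf.to_container(OmegaConf.load("src/f5_tts/config/E2TTS_Small_train.yaml"), resolve=False)

flat_base, flat_small = flatten(base), flatten(small)
for key in sorted(set(flat_base) | set(flat_small)):
    if flat_base.get(key) != flat_small.get(key):
        print(f"{key}: {flat_base.get(key)} -> {flat_small.get(key)}")

# Expected per this commit: only model.name and the arch sizes
# (model.arch.dim, model.arch.depth, model.arch.heads) should differ.
```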
src/f5_tts/config/F5TTS_Base_train.yaml
CHANGED
@@ -3,38 +3,38 @@ hydra:
     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 datasets:
-  name: Emilia_ZH_EN
+  name: Emilia_ZH_EN # dataset name
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
   batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
 
 optim:
-  epochs: 15
-  learning_rate: 7.5e-5
+  epochs: 15 # max epochs
+  learning_rate: 7.5e-5 # learning rate
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0
+  max_grad_norm: 1.0 # gradient clipping
 
 model:
-  name:
-  tokenizer:
+  name: F5TTS_Base # model name
+  tokenizer: pinyin # tokenizer type
   tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
   arch:
-    dim: 1024
-    depth: 22
-    heads: 16
-    ff_mult: 2
-    text_dim: 512
-    conv_layers: 4
+    dim: 1024 # model dim
+    depth: 22 # model depth
+    heads: 16 # model heads
+    ff_mult: 2 # feedforward expansion
+    text_dim: 512 # text encoder dim
+    conv_layers: 4 # convolution layers
   mel_spec:
-    target_sample_rate: 24000
-    n_mel_channels: 100
-    hop_length: 256
-    win_length: 1024
-    n_fft: 1024
+    target_sample_rate: 24000 # target sample rate
+    n_mel_channels: 100 # mel channel
+    hop_length: 256 # hop length
+    win_length: 1024 # window length
+    n_fft: 1024 # fft length
     mel_spec_type: vocos # 'vocos' or 'bigvgan'
-    is_local_vocoder: False
-    local_vocoder_path: None
+    is_local_vocoder: False # use local vocoder or not
+    local_vocoder_path: None # local vocoder path
 
 ckpts:
   save_per_updates: 50000 # save checkpoint per steps
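The tokenizer_path comment only takes effect when model.tokenizer is set to 'custom'. A minimal sketch of switching this config to a custom vocab file is below; the vocab path and the output file name are placeholders, and plain OmegaConf is used here rather than the repo's own training entry point:

```python
# Sketch: point F5TTS_Base_train.yaml at a custom tokenizer vocab.
from omegaconf import OmegaConf

cfg = OmegaConf.load("src/f5_tts/config/F5TTS_Base_train.yaml")

cfg.model.tokenizer = "custom"                        # instead of the default pinyin
cfg.model.tokenizer_path = "/path/to/your/vocab.txt"  # placeholder path, per the config comment

# Save under a new (hypothetical) name alongside the other configs.
OmegaConf.save(cfg, "src/f5_tts/config/F5TTS_Custom_train.yaml")
```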
src/f5_tts/config/F5TTS_Small_train.yaml
ADDED
@@ -0,0 +1,42 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame # "frame" or "sample"
+  max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000 # warmup steps
+  grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+
+model:
+  name: F5TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 768
+    depth: 18
+    heads: 12
+    ff_mult: 2
+    text_dim: 512
+    conv_layers: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos # 'vocos' or 'bigvgan'
+    is_local_vocoder: False
+    local_vocoder_path: None
+
+ckpts:
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
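For a rough sense of scale, the sketch below compares the F5TTS_Small and F5TTS_Base backbones using a crude per-layer estimate (attention ≈ 4·dim², feed-forward ≈ 2·ff_mult·dim²). It ignores the text encoder, conv layers, embeddings, and any architecture-specific modules, so only the Base/Small ratio is meaningful; this is not how the repo counts parameters:

```python
# Sketch: crude relative size estimate from the arch blocks in
# F5TTS_Small_train.yaml vs F5TTS_Base_train.yaml.
from omegaconf import OmegaConf


def rough_backbone_params(arch):
    # Standard transformer-block weight count only: attention + feed-forward.
    per_layer = 4 * arch.dim * arch.dim + 2 * arch.ff_mult * arch.dim * arch.dim
    return arch.depth * per_layer


base = OmegaConf.load("src/f5_tts/config/F5TTS_Base_train.yaml")
small = OmegaConf.load("src/f5_tts/config/F5TTS_Small_train.yaml")

base_est = rough_backbone_params(base.model.arch)    # dim 1024, depth 22, ff_mult 2
small_est = rough_backbone_params(small.model.arch)  # dim 768, depth 18, ff_mult 2

print(f"base ~{base_est / 1e6:.0f}M, small ~{small_est / 1e6:.0f}M, ratio ~{base_est / small_est:.2f}x")
```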