zkniu committed
Commit 6b27dbe · 1 Parent(s): 4dd981f

update F5 and E2 config

src/f5_tts/config/E2TTS_Base_train.yaml CHANGED
@@ -3,36 +3,36 @@ hydra:
     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 datasets:
-  name: Emilia_ZH_EN
+  name: Emilia_ZH_EN # dataset name
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
   batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
 
 optim:
-  epochs: 15
-  learning_rate: 7.5e-5
+  epochs: 15 # max epochs
+  learning_rate: 7.5e-5 # learning rate
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0
+  max_grad_norm: 1.0 # gradient clipping
 
 model:
-  name: E2TTS
-  tokenizer: char
+  name: E2TTS_Base # model name
+  tokenizer: pinyin # tokenizer type
   tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
   arch:
-    dim: 1024
-    depth: 24
-    heads: 16
-    ff_mult: 4
+    dim: 1024 # model dimension
+    depth: 24 # number of transformer layers
+    heads: 16 # number of transformer heads
+    ff_mult: 4 # ff layer expansion
   mel_spec:
-    target_sample_rate: 24000
-    n_mel_channels: 100
-    hop_length: 256
-    win_length: 1024
-    n_fft: 1024
+    target_sample_rate: 24000 # target sample rate
+    n_mel_channels: 100 # mel channel
+    hop_length: 256 # hop length
+    win_length: 1024 # window length
+    n_fft: 1024 # fft length
     mel_spec_type: vocos # 'vocos' or 'bigvgan'
-  is_local_vocoder: False
-  local_vocoder_path: None
+  is_local_vocoder: False # use local vocoder or not
+  local_vocoder_path: None # path to local vocoder
 
 ckpts:
   save_per_updates: 50000 # save checkpoint per steps
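
For reference, the interpolated run/checkpoint path in these configs is built from the model name, mel-spec type, tokenizer, and dataset name. Below is a minimal sketch of how that interpolation resolves; it is an illustration only, using OmegaConf with a hand-registered `now` resolver (Hydra registers its own `now` resolver when it actually runs), not code from this repo.

```python
# Minimal sketch (illustration only): resolve the interpolated checkpoint
# directory used in these configs. Hydra provides the "now" resolver at
# runtime; here we mimic it with OmegaConf.register_new_resolver.
from datetime import datetime
from omegaconf import OmegaConf

OmegaConf.register_new_resolver("now", lambda fmt: datetime.now().strftime(fmt))

cfg = OmegaConf.create({
    "datasets": {"name": "Emilia_ZH_EN"},
    "model": {
        "name": "E2TTS_Base",
        "tokenizer": "pinyin",
        "mel_spec": {"mel_spec_type": "vocos"},
    },
    "ckpts": {
        "save_dir": "ckpts/${model.name}_${model.mel_spec.mel_spec_type}_"
                    "${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}",
    },
})

# Resolves to e.g. ckpts/E2TTS_Base_vocos_pinyin_Emilia_ZH_EN/<date>/<time>
print(cfg.ckpts.save_dir)
```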
src/f5_tts/config/E2TTS_Small_train.yaml ADDED
@@ -0,0 +1,40 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame # "frame" or "sample"
+  max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000 # warmup steps
+  grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+
+model:
+  name: E2TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 768
+    depth: 20
+    heads: 12
+    ff_mult: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos # 'vocos' or 'bigvgan'
+  is_local_vocoder: False
+  local_vocoder_path: None
+
+ckpts:
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
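
The Small configs keep the same frame-based batching as the Base ones. As a quick sanity check of the numbers in the comments above (illustrative arithmetic only, not code from the repo):

```python
# Back-of-the-envelope check of the frame-based batch settings in these configs.
num_gpus = 8                   # per the config comment
batch_size_per_gpu = 38400     # mel frames per GPU per batch
grad_accumulation_steps = 1
hop_length = 256
target_sample_rate = 24000

frames_per_update = num_gpus * batch_size_per_gpu * grad_accumulation_steps
seconds_per_gpu_batch = batch_size_per_gpu * hop_length / target_sample_rate

print(frames_per_update)                # 307200, matching "8 * 38400 = 307200"
print(round(seconds_per_gpu_batch, 1))  # 409.6 seconds of audio worth of frames per GPU batch
```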
src/f5_tts/config/F5TTS_Base_train.yaml CHANGED
@@ -3,38 +3,38 @@ hydra:
     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 datasets:
-  name: Emilia_ZH_EN
+  name: Emilia_ZH_EN # dataset name
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
   batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
 
 optim:
-  epochs: 15
-  learning_rate: 7.5e-5
+  epochs: 15 # max epochs
+  learning_rate: 7.5e-5 # learning rate
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0
+  max_grad_norm: 1.0 # gradient clipping
 
 model:
-  name: F5TTS
-  tokenizer: char
+  name: F5TTS_Base # model name
+  tokenizer: pinyin # tokenizer type
   tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
   arch:
-    dim: 1024
-    depth: 22
-    heads: 16
-    ff_mult: 2
-    text_dim: 512
-    conv_layers: 4
+    dim: 1024 # model dim
+    depth: 22 # model depth
+    heads: 16 # model heads
+    ff_mult: 2 # feedforward expansion
+    text_dim: 512 # text encoder dim
+    conv_layers: 4 # convolution layers
   mel_spec:
-    target_sample_rate: 24000
-    n_mel_channels: 100
-    hop_length: 256
-    win_length: 1024
-    n_fft: 1024
+    target_sample_rate: 24000 # target sample rate
+    n_mel_channels: 100 # mel channel
+    hop_length: 256 # hop length
+    win_length: 1024 # window length
+    n_fft: 1024 # fft length
     mel_spec_type: vocos # 'vocos' or 'bigvgan'
-  is_local_vocoder: False
-  local_vocoder_path: None
+  is_local_vocoder: False # use local vocoder or not
+  local_vocoder_path: None # local vocoder path
 
 ckpts:
   save_per_updates: 50000 # save checkpoint per steps
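
The mel_spec block is identical across all four configs (24 kHz audio, 100 mel bins, 1024-point FFT/window, hop 256). Below is a hedged sketch of what these numbers control, using torchaudio's generic MelSpectrogram; the repo's own vocos/bigvgan-compatible feature extraction differs in scaling and normalization, so this only illustrates the frame/bin bookkeeping.

```python
# Illustration only: what the mel_spec parameters mean in terms of output shape.
# Not the repo's actual mel extraction pipeline.
import torch
import torchaudio

mel = torchaudio.transforms.MelSpectrogram(
    sample_rate=24000,  # target_sample_rate
    n_fft=1024,         # n_fft
    win_length=1024,    # win_length
    hop_length=256,     # hop_length
    n_mels=100,         # n_mel_channels
)

wav = torch.randn(1, 24000)  # one second of dummy audio at 24 kHz
spec = mel(wav)
print(spec.shape)            # torch.Size([1, 100, 94]): 100 mel bins, ~24000/256 frames
```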
src/f5_tts/config/F5TTS_Small_train.yaml ADDED
@@ -0,0 +1,42 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame # "frame" or "sample"
+  max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000 # warmup steps
+  grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+
+model:
+  name: F5TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 768
+    depth: 18
+    heads: 12
+    ff_mult: 2
+    text_dim: 512
+    conv_layers: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos # 'vocos' or 'bigvgan'
+  is_local_vocoder: False
+  local_vocoder_path: None
+
+ckpts:
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
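
A small usage sketch for the newly added Small configs: reading the committed file with OmegaConf to inspect the architecture. This is my illustration only; the project's training script consumes these files through Hydra (selected by config name), not like this.

```python
# Hedged sketch: load the newly added config directly and inspect the
# F5TTS_Small architecture block. Assumes the repo root as working directory.
from omegaconf import OmegaConf

cfg = OmegaConf.load("src/f5_tts/config/F5TTS_Small_train.yaml")
print(cfg.model.name)                            # F5TTS_Small
print(OmegaConf.to_container(cfg.model.arch))
# {'dim': 768, 'depth': 18, 'heads': 12, 'ff_mult': 2, 'text_dim': 512, 'conv_layers': 4}
```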