update F5 and E2 config
src/f5_tts/config/E2TTS_Base_train.yaml
CHANGED
@@ -3,36 +3,36 @@ hydra:
     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 datasets:
-  name: Emilia_ZH_EN
+  name: Emilia_ZH_EN # dataset name
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
   batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
 
 optim:
-  epochs: 15
-  learning_rate: 7.5e-5
+  epochs: 15 # max epochs
+  learning_rate: 7.5e-5 # learning rate
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0
+  max_grad_norm: 1.0 # gradient clipping
 
 model:
-  name:
-  tokenizer:
+  name: E2TTS_Base # model name
+  tokenizer: pinyin # tokenizer type
   tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
   arch:
-    dim: 1024
-    depth: 24
-    heads: 16
-    ff_mult: 4
+    dim: 1024 # model dimension
+    depth: 24 # number of transformer layers
+    heads: 16 # number of transformer heads
+    ff_mult: 4 # ff layer expansion
   mel_spec:
-    target_sample_rate: 24000
-    n_mel_channels: 100
-    hop_length: 256
-    win_length: 1024
-    n_fft: 1024
+    target_sample_rate: 24000 # target sample rate
+    n_mel_channels: 100 # mel channel
+    hop_length: 256 # hop length
+    win_length: 1024 # window length
+    n_fft: 1024 # fft length
     mel_spec_type: vocos # 'vocos' or 'bigvgan'
-    is_local_vocoder: False
-    local_vocoder_path: None
+    is_local_vocoder: False # use local vocoder or not
+    local_vocoder_path: None # path to local vocoder
 
 ckpts:
   save_per_updates: 50000 # save checkpoint per steps
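A quick numeric restatement of the batch-size and warmup comments above, using the mel settings from this config (hop_length 256 at 24 kHz). This is a back-of-the-envelope Python sketch, not code from the repo:

```python
# Rough batch-size arithmetic for the values in E2TTS_Base_train.yaml.
# Only restates the comments in the config numerically.

target_sample_rate = 24000   # Hz
hop_length = 256             # samples per mel frame
batch_size_per_gpu = 38400   # frames, since batch_size_type is "frame"
num_gpus = 8
grad_accumulation_steps = 1

frames_per_second = target_sample_rate / hop_length              # 93.75 frames/s
seconds_per_gpu_batch = batch_size_per_gpu / frames_per_second   # 409.6 s of audio per GPU per step
total_frames_per_step = batch_size_per_gpu * num_gpus            # 307200, as in the comment

# "updates = steps / grad_accumulation_steps": with grad_accumulation_steps = 1,
# every training step is an optimizer update, so num_warmup_updates = 20000 warmup steps.
warmup_steps = 20000
warmup_updates = warmup_steps / grad_accumulation_steps

print(frames_per_second, seconds_per_gpu_batch, total_frames_per_step, warmup_updates)
```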
src/f5_tts/config/E2TTS_Small_train.yaml
ADDED
@@ -0,0 +1,40 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame # "frame" or "sample"
+  max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000 # warmup steps
+  grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+
+model:
+  name: E2TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 768
+    depth: 20
+    heads: 12
+    ff_mult: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos # 'vocos' or 'bigvgan'
+    is_local_vocoder: False
+    local_vocoder_path: None
+
+ckpts:
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
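The new E2TTS_Small_train.yaml is meant to track E2TTS_Base_train.yaml except for the model name and the smaller architecture (dim 768, depth 20, heads 12 vs 1024/24/16). One way to verify that only those keys differ is to flatten both files and diff them, e.g. with OmegaConf (the config library Hydra builds on). The sketch below assumes it is run from the repository root and leaves ${...} interpolations unresolved:

```python
# Sketch: flatten two YAML configs and list keys whose values differ.
from omegaconf import OmegaConf


def flatten(node, prefix=""):
    """Flatten a nested dict into {"a.b.c": value} pairs."""
    out = {}
    for key, value in node.items():
        path = f"{prefix}{key}"
        if isinstance(value, dict):
            out.update(flatten(value, prefix=f"{path}."))
        else:
            out[path] = value
    return out


base = OmegaConf.to_container(OmegaConf.load("src/f5_tts/config/E2TTS_Base_train.yaml"), resolve=False)
small = OmegaConf.to_container(OmegaConf.load("src/f5_tts/config/E2TTS_Small_train.yaml"), resolve=False)

flat_base, flat_small = flatten(base), flatten(small)
for key in sorted(set(flat_base) | set(flat_small)):
    if flat_base.get(key) != flat_small.get(key):
        print(f"{key}: {flat_base.get(key)} -> {flat_small.get(key)}")

# Expected per this commit: only model.name and the arch sizes
# (model.arch.dim, model.arch.depth, model.arch.heads) should differ.
```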
src/f5_tts/config/F5TTS_Base_train.yaml
CHANGED
@@ -3,38 +3,38 @@ hydra:
     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 datasets:
-  name: Emilia_ZH_EN
+  name: Emilia_ZH_EN # dataset name
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
   batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
 
 optim:
-  epochs: 15
-  learning_rate: 7.5e-5
+  epochs: 15 # max epochs
+  learning_rate: 7.5e-5 # learning rate
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0
+  max_grad_norm: 1.0 # gradient clipping
 
 model:
-  name:
-  tokenizer:
+  name: F5TTS_Base # model name
+  tokenizer: pinyin # tokenizer type
   tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
   arch:
-    dim: 1024
-    depth: 22
-    heads: 16
-    ff_mult: 2
-    text_dim: 512
-    conv_layers: 4
+    dim: 1024 # model dim
+    depth: 22 # model depth
+    heads: 16 # model heads
+    ff_mult: 2 # feedforward expansion
+    text_dim: 512 # text encoder dim
+    conv_layers: 4 # convolution layers
   mel_spec:
-    target_sample_rate: 24000
-    n_mel_channels: 100
-    hop_length: 256
-    win_length: 1024
-    n_fft: 1024
+    target_sample_rate: 24000 # target sample rate
+    n_mel_channels: 100 # mel channel
+    hop_length: 256 # hop length
+    win_length: 1024 # window length
+    n_fft: 1024 # fft length
     mel_spec_type: vocos # 'vocos' or 'bigvgan'
-    is_local_vocoder: False
-    local_vocoder_path: None
+    is_local_vocoder: False # use local vocoder or not
+    local_vocoder_path: None # local vocoder path
 
 ckpts:
   save_per_updates: 50000 # save checkpoint per steps
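The tokenizer_path comment only takes effect when model.tokenizer is set to 'custom'. A minimal sketch of switching this config to a custom vocab file is below; the vocab path and the output file name are placeholders, and plain OmegaConf is used here rather than the repo's own training entry point:

```python
# Sketch: point F5TTS_Base_train.yaml at a custom tokenizer vocab.
from omegaconf import OmegaConf

cfg = OmegaConf.load("src/f5_tts/config/F5TTS_Base_train.yaml")

cfg.model.tokenizer = "custom"                        # instead of the default pinyin
cfg.model.tokenizer_path = "/path/to/your/vocab.txt"  # placeholder path, per the config comment

# Save under a new (hypothetical) name alongside the other configs.
OmegaConf.save(cfg, "src/f5_tts/config/F5TTS_Custom_train.yaml")
```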
src/f5_tts/config/F5TTS_Small_train.yaml
ADDED
@@ -0,0 +1,42 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame # "frame" or "sample"
+  max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000 # warmup steps
+  grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+
+model:
+  name: F5TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 768
+    depth: 18
+    heads: 12
+    ff_mult: 2
+    text_dim: 512
+    conv_layers: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos # 'vocos' or 'bigvgan'
+    is_local_vocoder: False
+    local_vocoder_path: None
+
+ckpts:
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
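For a rough sense of scale, the sketch below compares the F5TTS_Small and F5TTS_Base backbones using a crude per-layer estimate (attention ≈ 4·dim², feed-forward ≈ 2·ff_mult·dim²). It ignores the text encoder, conv layers, embeddings, and any architecture-specific modules, so only the Base/Small ratio is meaningful; this is not how the repo counts parameters:

```python
# Sketch: crude relative size estimate from the arch blocks in
# F5TTS_Small_train.yaml vs F5TTS_Base_train.yaml.
from omegaconf import OmegaConf


def rough_backbone_params(arch):
    # Standard transformer-block weight count only: attention + feed-forward.
    per_layer = 4 * arch.dim * arch.dim + 2 * arch.ff_mult * arch.dim * arch.dim
    return arch.depth * per_layer


base = OmegaConf.load("src/f5_tts/config/F5TTS_Base_train.yaml")
small = OmegaConf.load("src/f5_tts/config/F5TTS_Small_train.yaml")

base_est = rough_backbone_params(base.model.arch)    # dim 1024, depth 22, ff_mult 2
small_est = rough_backbone_params(small.model.arch)  # dim 768, depth 18, ff_mult 2

print(f"base ~{base_est / 1e6:.0f}M, small ~{small_est / 1e6:.0f}M, ratio ~{base_est / small_est:.2f}x")
```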