Spaces:

zhouzhou363
/

f5-tts

Configuration error

App Files Files Community

SWivid committed on Oct 24, 2024

Commit

a846ae6

1 Parent(s): ba4b04b

finish train dependencies

Browse files

Files changed (9) hide show

README.md +2 -64
src/f5_tts/model/dataset.py +9 -7
src/f5_tts/train/README.md +68 -0
src/f5_tts/train/datasets/prepare_csv_wavs.py +8 -6
src/f5_tts/train/datasets/prepare_emilia.py +14 -11
src/f5_tts/train/datasets/prepare_wenetspeech4tts.py +14 -8
src/f5_tts/train/finetune_cli.py +6 -5
src/f5_tts/train/finetune_gradio.py +167 -25
src/f5_tts/train/train.py +5 -1

README.md CHANGED Viewed

@@ -65,70 +65,6 @@ pre-commit run --all-files
 Note: Some model components have linting exceptions for E722 to accommodate tensor notation
-## Prepare Dataset
-Example data processing scripts for Emilia and Wenetspeech4TTS, and you may tailor your own one along with a Dataset class in `f5_tts/model/dataset.py`.
-```bash
-# switch to the main directory
-cd f5_tts
-# prepare custom dataset up to your need
-# download corresponding dataset first, and fill in the path in scripts
-# Prepare the Emilia dataset
-python scripts/prepare_emilia.py
-# Prepare the Wenetspeech4TTS dataset
-python scripts/prepare_wenetspeech4tts.py
-# https://github.com/SWivid/F5-TTS/discussions/57#discussioncomment-10959029
-python scripts/prepare_csv_wavs.py
-```
-## Training & Finetuning
-Once your datasets are prepared, you can start the training process.
-```bash
-# switch to the main directory
-cd f5_tts
-# setup accelerate config, e.g. use multi-gpu ddp, fp16
-# will be to: ~/.cache/huggingface/accelerate/default_config.yaml
-accelerate config
-accelerate launch train.py
-```
-An initial guidance on Finetuning [#57](https://github.com/SWivid/F5-TTS/discussions/57).
-Gradio UI finetuning with `f5_tts/finetune_gradio.py` see [#143](https://github.com/SWivid/F5-TTS/discussions/143).
-### Wandb Logging
-By default, the training script does NOT use logging (assuming you didn't manually log in using `wandb login`).
-To turn on wandb logging, you can either:
-1. Manually login with `wandb login`: Learn more [here](https://docs.wandb.ai/ref/cli/wandb-login)
-2. Automatically login programmatically by setting an environment variable: Get an API KEY at https://wandb.ai/site/ and set the environment variable as follows:
-On Mac & Linux:
-```
-export WANDB_API_KEY=<YOUR WANDB API KEY>
-```
-On Windows:
-```
-set WANDB_API_KEY=<YOUR WANDB API KEY>
-```
-Moreover, if you couldn't access Wandb and want to log metrics offline, you can the environment variable as follows:
-```
-export WANDB_MODE=offline
-```
 ## Inference
 ```python
@@ -215,6 +151,8 @@ To test speech editing capabilities, use the following command.
 python f5_tts/speech_edit.py
 ```
 ## [Evaluation](src/f5_tts/eval/README.md)
 ## Acknowledgements

 Note: Some model components have linting exceptions for E722 to accommodate tensor notation
 ## Inference
 ```python
 python f5_tts/speech_edit.py
 ```
+## [Training](src/f5_tts/train/README.md)
 ## [Evaluation](src/f5_tts/eval/README.md)
 ## Acknowledgements

src/f5_tts/model/dataset.py CHANGED Viewed

@@ -1,14 +1,15 @@
 import json
 import random
 from tqdm import tqdm
 import torch
 import torch.nn.functional as F
-from torch.utils.data import Dataset, Sampler
 import torchaudio
 from datasets import load_from_disk
 from datasets import Dataset as Dataset_
-from torch import nn
 from f5_tts.model.modules import MelSpec
 from f5_tts.model.utils import default
@@ -221,16 +222,17 @@ def load_dataset(
     print("Loading dataset ...")
     if dataset_type == "CustomDataset":
         if audio_type == "raw":
             try:
-                train_dataset = load_from_disk(f"data/{dataset_name}_{tokenizer}/raw")
             except:  # noqa: E722
-                train_dataset = Dataset_.from_file(f"data/{dataset_name}_{tokenizer}/raw.arrow")
             preprocessed_mel = False
         elif audio_type == "mel":
-            train_dataset = Dataset_.from_file(f"data/{dataset_name}_{tokenizer}/mel.arrow")
             preprocessed_mel = True
-        with open(f"data/{dataset_name}_{tokenizer}/duration.json", "r", encoding="utf-8") as f:
             data_dict = json.load(f)
         durations = data_dict["duration"]
         train_dataset = CustomDataset(
@@ -261,7 +263,7 @@ def load_dataset(
         )
         pre, post = dataset_name.split("_")
         train_dataset = HFDataset(
-            load_dataset(f"{pre}/{pre}", split=f"train.{post}", cache_dir="./data"),
         )
     return train_dataset

 import json
 import random
+from importlib.resources import files
 from tqdm import tqdm
 import torch
 import torch.nn.functional as F
 import torchaudio
+from torch import nn
+from torch.utils.data import Dataset, Sampler
 from datasets import load_from_disk
 from datasets import Dataset as Dataset_
 from f5_tts.model.modules import MelSpec
 from f5_tts.model.utils import default
     print("Loading dataset ...")
     if dataset_type == "CustomDataset":
+        rel_data_path = str(files("f5_tts").joinpath(f"../../data/{dataset_name}_{tokenizer}"))
         if audio_type == "raw":
             try:
+                train_dataset = load_from_disk(f"{rel_data_path}/raw")
             except:  # noqa: E722
+                train_dataset = Dataset_.from_file(f"{rel_data_path}/raw.arrow")
             preprocessed_mel = False
         elif audio_type == "mel":
+            train_dataset = Dataset_.from_file(f"{rel_data_path}/mel.arrow")
             preprocessed_mel = True
+        with open(f"{rel_data_path}/duration.json", "r", encoding="utf-8") as f:
             data_dict = json.load(f)
         durations = data_dict["duration"]
         train_dataset = CustomDataset(
         )
         pre, post = dataset_name.split("_")
         train_dataset = HFDataset(
+            load_dataset(f"{pre}/{pre}", split=f"train.{post}", cache_dir=str(files("f5_tts").joinpath("../../data"))),
         )
     return train_dataset

src/f5_tts/train/README.md ADDED Viewed

	@@ -0,0 +1,68 @@

+## Prepare Dataset
+Example data processing scripts are provided for Emilia and Wenetspeech4TTS; you may tailor your own script, along with a Dataset class in `src/f5_tts/model/dataset.py`.
+### 1. Datasets used for pretrained models
+Download corresponding dataset first, and fill in the path in scripts.
+```bash
+# Prepare the Emilia dataset
+python src/f5_tts/train/datasets/prepare_emilia.py
+# Prepare the Wenetspeech4TTS dataset
+python src/f5_tts/train/datasets/prepare_wenetspeech4tts.py
+```
+### 2. Create custom dataset with metadata.csv
+For usage guidance, see [#57 here](https://github.com/SWivid/F5-TTS/discussions/57#discussioncomment-10959029).
+```bash
+python src/f5_tts/train/datasets/prepare_csv_wavs.py
+```
+## Training & Finetuning
+Once your datasets are prepared, you can start the training process.
+### 1. Training script used for pretrained model
+```bash
+# setup accelerate config, e.g. use multi-gpu ddp, fp16
+# will be saved to: ~/.cache/huggingface/accelerate/default_config.yaml
+accelerate config
+accelerate launch src/f5_tts/train/train.py
+```
+### 2. Finetuning practice
+Discussion board for Finetuning [#57](https://github.com/SWivid/F5-TTS/discussions/57).
+Gradio UI training/finetuning with `src/f5_tts/train/finetune_gradio.py` see [#143](https://github.com/SWivid/F5-TTS/discussions/143).
+### 3. Wandb Logging
+The `wandb/` dir will be created under the path where you run the training/finetuning scripts.
+By default, the training script does NOT use logging (assuming you didn't manually log in using `wandb login`).
+To turn on wandb logging, you can either:
+1. Manually login with `wandb login`: Learn more [here](https://docs.wandb.ai/ref/cli/wandb-login)
+2. Automatically login programmatically by setting an environment variable: Get an API KEY at https://wandb.ai/site/ and set the environment variable as follows:
+On Mac & Linux:
+```
+export WANDB_API_KEY=<YOUR WANDB API KEY>
+```
+On Windows:
+```
+set WANDB_API_KEY=<YOUR WANDB API KEY>
+```
+Moreover, if you can't access Wandb and want to log metrics offline, you can set the environment variable as follows:
+```
+export WANDB_MODE=offline
+```

src/f5_tts/train/datasets/prepare_csv_wavs.py CHANGED Viewed

@@ -1,14 +1,15 @@
-import sys
 import os
 sys.path.append(os.getcwd())
-from pathlib import Path
 import json
 import shutil
-import argparse
-import csv
 import torchaudio
 from tqdm import tqdm
 from datasets.arrow_writer import ArrowWriter
@@ -17,7 +18,8 @@ from f5_tts.model.utils import (
     convert_char_to_pinyin,
 )
-PRETRAINED_VOCAB_PATH = Path(__file__).parent.parent / "data/Emilia_ZH_EN_pinyin/vocab.txt"
 def is_csv_wavs_format(input_dataset_dir):
@@ -80,7 +82,7 @@ def save_prepped_dataset(out_dir, result, duration_list, text_vocab_set, is_fine
     print(f"\nSaving to {out_dir} ...")
     # dataset = Dataset.from_dict({"audio_path": audio_path_list, "text": text_list, "duration": duration_list})  # oom
-    # dataset.save_to_disk(f"data/{dataset_name}/raw", max_shard_size="2GB")
     raw_arrow_path = out_dir / "raw.arrow"
     with ArrowWriter(path=raw_arrow_path.as_posix(), writer_batch_size=1) as writer:
         for line in tqdm(result, desc="Writing to raw.arrow ..."):

 import os
+import sys
 sys.path.append(os.getcwd())
+import argparse
+import csv
 import json
 import shutil
+from importlib.resources import files
+from pathlib import Path
 import torchaudio
 from tqdm import tqdm
 from datasets.arrow_writer import ArrowWriter
     convert_char_to_pinyin,
 )
+PRETRAINED_VOCAB_PATH = files("f5_tts").joinpath("../../data/Emilia_ZH_EN_pinyin/vocab.txt")
 def is_csv_wavs_format(input_dataset_dir):
     print(f"\nSaving to {out_dir} ...")
     # dataset = Dataset.from_dict({"audio_path": audio_path_list, "text": text_list, "duration": duration_list})  # oom
+    # dataset.save_to_disk(f"{out_dir}/raw", max_shard_size="2GB")
     raw_arrow_path = out_dir / "raw.arrow"
     with ArrowWriter(path=raw_arrow_path.as_posix(), writer_batch_size=1) as writer:
         for line in tqdm(result, desc="Writing to raw.arrow ..."):

src/f5_tts/train/datasets/prepare_emilia.py CHANGED Viewed

@@ -4,15 +4,16 @@
 # generate audio text map for Emilia ZH & EN
 # evaluate for vocab size
-import sys
 import os
 sys.path.append(os.getcwd())
-from pathlib import Path
 import json
-from tqdm import tqdm
 from concurrent.futures import ProcessPoolExecutor
 from datasets.arrow_writer import ArrowWriter
@@ -173,24 +174,25 @@ def main():
     executor.shutdown()
     # save preprocessed dataset to disk
-    if not os.path.exists(f"data/{dataset_name}"):
-        os.makedirs(f"data/{dataset_name}")
-    print(f"\nSaving to data/{dataset_name} ...")
     # dataset = Dataset.from_dict({"audio_path": audio_path_list, "text": text_list, "duration": duration_list})  # oom
-    # dataset.save_to_disk(f"data/{dataset_name}/raw", max_shard_size="2GB")
-    with ArrowWriter(path=f"data/{dataset_name}/raw.arrow") as writer:
         for line in tqdm(result, desc="Writing to raw.arrow ..."):
             writer.write(line)
     # dup a json separately saving duration in case for DynamicBatchSampler ease
-    with open(f"data/{dataset_name}/duration.json", "w", encoding="utf-8") as f:
         json.dump({"duration": duration_list}, f, ensure_ascii=False)
     # vocab map, i.e. tokenizer
     # add alphabets and symbols (optional, if plan to ft on de/fr etc.)
     # if tokenizer == "pinyin":
     #     text_vocab_set.update([chr(i) for i in range(32, 127)] + [chr(i) for i in range(192, 256)])
-    with open(f"data/{dataset_name}/vocab.txt", "w") as f:
         for vocab in sorted(text_vocab_set):
             f.write(vocab + "\n")
@@ -212,7 +214,8 @@ if __name__ == "__main__":
     langs = ["ZH", "EN"]
     dataset_dir = "<SOME_PATH>/Emilia_Dataset/raw"
     dataset_name = f"Emilia_{'_'.join(langs)}_{tokenizer}"
-    print(f"\nPrepare for {dataset_name}\n")
     main()

 # generate audio text map for Emilia ZH & EN
 # evaluate for vocab size
 import os
+import sys
 sys.path.append(os.getcwd())
 import json
 from concurrent.futures import ProcessPoolExecutor
+from importlib.resources import files
+from pathlib import Path
+from tqdm import tqdm
 from datasets.arrow_writer import ArrowWriter
     executor.shutdown()
     # save preprocessed dataset to disk
+    if not os.path.exists(f"{save_dir}"):
+        os.makedirs(f"{save_dir}")
+    print(f"\nSaving to {save_dir} ...")
     # dataset = Dataset.from_dict({"audio_path": audio_path_list, "text": text_list, "duration": duration_list})  # oom
+    # dataset.save_to_disk(f"{save_dir}/raw", max_shard_size="2GB")
+    with ArrowWriter(path=f"{save_dir}/raw.arrow") as writer:
         for line in tqdm(result, desc="Writing to raw.arrow ..."):
             writer.write(line)
     # dup a json separately saving duration in case for DynamicBatchSampler ease
+    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
         json.dump({"duration": duration_list}, f, ensure_ascii=False)
     # vocab map, i.e. tokenizer
     # add alphabets and symbols (optional, if plan to ft on de/fr etc.)
     # if tokenizer == "pinyin":
     #     text_vocab_set.update([chr(i) for i in range(32, 127)] + [chr(i) for i in range(192, 256)])
+    with open(f"{save_dir}/vocab.txt", "w") as f:
         for vocab in sorted(text_vocab_set):
             f.write(vocab + "\n")
     langs = ["ZH", "EN"]
     dataset_dir = "<SOME_PATH>/Emilia_Dataset/raw"
     dataset_name = f"Emilia_{'_'.join(langs)}_{tokenizer}"
+    save_dir = str(files("f5_tts").joinpath("../../")) + f"/data/{dataset_name}"
+    print(f"\nPrepare for {dataset_name}, will save to {save_dir}\n")
     main()

src/f5_tts/train/datasets/prepare_wenetspeech4tts.py CHANGED Viewed

@@ -1,14 +1,15 @@
 # generate audio text map for WenetSpeech4TTS
 # evaluate for vocab size
-import sys
 import os
 sys.path.append(os.getcwd())
 import json
-from tqdm import tqdm
 from concurrent.futures import ProcessPoolExecutor
 import torchaudio
 from datasets import Dataset
@@ -66,11 +67,11 @@ def main():
     if not os.path.exists("data"):
         os.makedirs("data")
-    print(f"\nSaving to data/{dataset_name}_{tokenizer} ...")
     dataset = Dataset.from_dict({"audio_path": audio_path_list, "text": text_list, "duration": duration_list})
-    dataset.save_to_disk(f"data/{dataset_name}_{tokenizer}/raw", max_shard_size="2GB")  # arrow format
-    with open(f"data/{dataset_name}_{tokenizer}/duration.json", "w", encoding="utf-8") as f:
         json.dump(
             {"duration": duration_list}, f, ensure_ascii=False
         )  # dup a json separately saving duration in case for DynamicBatchSampler ease
@@ -84,7 +85,7 @@ def main():
     if tokenizer == "pinyin":
         text_vocab_set.update([chr(i) for i in range(32, 127)] + [chr(i) for i in range(192, 256)])
-    with open(f"data/{dataset_name}_{tokenizer}/vocab.txt", "w") as f:
         for vocab in sorted(text_vocab_set):
             f.write(vocab + "\n")
     print(f"\nFor {dataset_name}, sample count: {len(text_list)}")
@@ -98,13 +99,18 @@ if __name__ == "__main__":
     polyphone = True
     dataset_choice = 1  # 1: Premium, 2: Standard, 3: Basic
-    dataset_name = ["WenetSpeech4TTS_Premium", "WenetSpeech4TTS_Standard", "WenetSpeech4TTS_Basic"][dataset_choice - 1]
     dataset_paths = [
         "<SOME_PATH>/WenetSpeech4TTS/Basic",
         "<SOME_PATH>/WenetSpeech4TTS/Standard",
         "<SOME_PATH>/WenetSpeech4TTS/Premium",
     ][-dataset_choice:]
-    print(f"\nChoose Dataset: {dataset_name}\n")
     main()

 # generate audio text map for WenetSpeech4TTS
 # evaluate for vocab size
 import os
+import sys
 sys.path.append(os.getcwd())
 import json
 from concurrent.futures import ProcessPoolExecutor
+from importlib.resources import files
+from tqdm import tqdm
 import torchaudio
 from datasets import Dataset
     if not os.path.exists("data"):
         os.makedirs("data")
+    print(f"\nSaving to {save_dir} ...")
     dataset = Dataset.from_dict({"audio_path": audio_path_list, "text": text_list, "duration": duration_list})
+    dataset.save_to_disk(f"{save_dir}/raw", max_shard_size="2GB")  # arrow format
+    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
         json.dump(
             {"duration": duration_list}, f, ensure_ascii=False
         )  # dup a json separately saving duration in case for DynamicBatchSampler ease
     if tokenizer == "pinyin":
         text_vocab_set.update([chr(i) for i in range(32, 127)] + [chr(i) for i in range(192, 256)])
+    with open(f"{save_dir}/vocab.txt", "w") as f:
         for vocab in sorted(text_vocab_set):
             f.write(vocab + "\n")
     print(f"\nFor {dataset_name}, sample count: {len(text_list)}")
     polyphone = True
     dataset_choice = 1  # 1: Premium, 2: Standard, 3: Basic
+    dataset_name = (
+        ["WenetSpeech4TTS_Premium", "WenetSpeech4TTS_Standard", "WenetSpeech4TTS_Basic"][dataset_choice - 1]
+        + "_"
+        + tokenizer
+    )
     dataset_paths = [
         "<SOME_PATH>/WenetSpeech4TTS/Basic",
         "<SOME_PATH>/WenetSpeech4TTS/Standard",
         "<SOME_PATH>/WenetSpeech4TTS/Premium",
     ][-dataset_choice:]
+    save_dir = str(files("f5_tts").joinpath("../../")) + f"/data/{dataset_name}"
+    print(f"\nChoose Dataset: {dataset_name}, will save to {save_dir}\n")
     main()

src/f5_tts/train/finetune_cli.py CHANGED Viewed

@@ -7,6 +7,7 @@ from f5_tts.model import CFM, UNetT, DiT, Trainer
 from f5_tts.model.utils import get_tokenizer
 from f5_tts.model.dataset import load_dataset
 # -------------------------- Dataset Settings --------------------------- #
 target_sample_rate = 24000
 n_mel_channels = 100
@@ -20,9 +21,9 @@ def parse_args():
     # batch_size_per_gpu = 2000 settting for gpu 16GB
     # batch_size_per_gpu = 3200 settting for gpu 24GB
-    # num_warmup_updates 10000 sample = 500
-    # change save_per_updates , last_per_steps what you need ,
     parser = argparse.ArgumentParser(description="Train CFM Model")
@@ -39,9 +40,9 @@ def parse_args():
     parser.add_argument("--grad_accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
     parser.add_argument("--max_grad_norm", type=float, default=1.0, help="Max gradient norm for clipping")
     parser.add_argument("--epochs", type=int, default=10, help="Number of training epochs")
-    parser.add_argument("--num_warmup_updates", type=int, default=500, help="Warmup steps")
     parser.add_argument("--save_per_updates", type=int, default=10000, help="Save checkpoint every X steps")
-    parser.add_argument("--last_per_steps", type=int, default=20000, help="Save last checkpoint every X steps")
     parser.add_argument("--finetune", type=bool, default=True, help="Use Finetune")
     parser.add_argument("--pretrain", type=str, default=None, help="Use pretrain model for finetune")
     parser.add_argument(
@@ -126,7 +127,7 @@ def main():
         max_samples=args.max_samples,
         grad_accumulation_steps=args.grad_accumulation_steps,
         max_grad_norm=args.max_grad_norm,
-        wandb_project="CFM-TTS",
         wandb_run_name=args.exp_name,
         wandb_resume_id=wandb_resume_id,
         last_per_steps=args.last_per_steps,

 from f5_tts.model.utils import get_tokenizer
 from f5_tts.model.dataset import load_dataset
 # -------------------------- Dataset Settings --------------------------- #
 target_sample_rate = 24000
 n_mel_channels = 100
     # batch_size_per_gpu = 2000 setting for gpu 16GB
     # batch_size_per_gpu = 3200 setting for gpu 24GB
+    # num_warmup_updates = 300 for 5000 samples (about 10 hours)
+    # change save_per_updates and last_per_steps to the values you need
     parser = argparse.ArgumentParser(description="Train CFM Model")
     parser.add_argument("--grad_accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
     parser.add_argument("--max_grad_norm", type=float, default=1.0, help="Max gradient norm for clipping")
     parser.add_argument("--epochs", type=int, default=10, help="Number of training epochs")
+    parser.add_argument("--num_warmup_updates", type=int, default=300, help="Warmup steps")
     parser.add_argument("--save_per_updates", type=int, default=10000, help="Save checkpoint every X steps")
+    parser.add_argument("--last_per_steps", type=int, default=50000, help="Save last checkpoint every X steps")
     parser.add_argument("--finetune", type=bool, default=True, help="Use Finetune")
     parser.add_argument("--pretrain", type=str, default=None, help="Use pretrain model for finetune")
     parser.add_argument(
         max_samples=args.max_samples,
         grad_accumulation_steps=args.grad_accumulation_steps,
         max_grad_norm=args.max_grad_norm,
+        wandb_project=args.dataset_name,
         wandb_run_name=args.exp_name,
         wandb_resume_id=wandb_resume_id,
         last_per_steps=args.last_per_steps,

src/f5_tts/train/finetune_gradio.py CHANGED Viewed

@@ -251,6 +251,7 @@ def start_training(
     file_checkpoint_train="",
     tokenizer_type="pinyin",
     tokenizer_file="",
 ):
     global training_process, tts_api
@@ -282,9 +283,24 @@ def start_training(
     yield "start train", gr.update(interactive=False), gr.update(interactive=False)
     # Command to run the training script with the specified arguments
     dataset_name = dataset_name.replace("_pinyin", "").replace("_char", "")
     cmd = (
-        f"accelerate launch finetune-cli.py --exp_name {exp_name} "
         f"--learning_rate {learning_rate} "
         f"--batch_size_per_gpu {batch_size_per_gpu} "
         f"--batch_size_type {batch_size_type} "
@@ -305,7 +321,8 @@ def start_training(
     if tokenizer_file != "":
         cmd += f" --tokenizer_path {tokenizer_file}"
-        cmd += f" --tokenizer {tokenizer_type} "
     print(cmd)
@@ -466,7 +483,7 @@ def format_seconds_to_hms(seconds):
     return "{:02d}:{:02d}:{:02d}".format(hours, minutes, int(seconds))
-def create_metadata(name_project, progress=gr.Progress()):
     path_project = os.path.join(path_data, name_project)
     path_project_wavs = os.path.join(path_project, "wavs")
     file_metadata = os.path.join(path_project, "metadata.csv")
@@ -475,7 +492,7 @@ def create_metadata(name_project, progress=gr.Progress()):
     file_vocab = os.path.join(path_project, "vocab.txt")
     if not os.path.isfile(file_metadata):
-        return "The file was not found in " + file_metadata
     with open(file_metadata, "r", encoding="utf-8-sig") as f:
         data = f.read()
@@ -488,6 +505,7 @@ def create_metadata(name_project, progress=gr.Progress()):
     lenght = 0
     result = []
     error_files = []
     for line in progress.tqdm(data.split("\n"), total=count):
         sp_line = line.split("|")
         if len(sp_line) != 2:
@@ -497,29 +515,38 @@ def create_metadata(name_project, progress=gr.Progress()):
         file_audio = os.path.join(path_project_wavs, name_audio + ".wav")
         if not os.path.isfile(file_audio):
-            error_files.append(file_audio)
             continue
-        duraction = get_audio_duration(file_audio)
-        if duraction < 2 and duraction > 15:
             continue
         if len(text) < 4:
             continue
         text = clear_text(text)
         text = convert_char_to_pinyin([text], polyphone=True)[0]
         audio_path_list.append(file_audio)
-        duration_list.append(duraction)
         text_list.append(text)
-        result.append({"audio_path": file_audio, "text": text, "duration": duraction})
-        lenght += duraction
     if duration_list == []:
-        error_files_text = "\n".join(error_files)
-        return f"Error: No audio files found in the specified path : \n{error_files_text}"
     min_second = round(min(duration_list), 2)
     max_second = round(max(duration_list), 2)
@@ -531,17 +558,35 @@ def create_metadata(name_project, progress=gr.Progress()):
     with open(file_duration, "w") as f:
         json.dump({"duration": duration_list}, f, ensure_ascii=False)
-    file_vocab_finetune = "data/Emilia_ZH_EN_pinyin/vocab.txt"
-    if not os.path.isfile(file_vocab_finetune):
-        return "Error: Vocabulary file 'Emilia_ZH_EN_pinyin' not found!"
-    shutil.copy2(file_vocab_finetune, file_vocab)
     if error_files != []:
-        error_text = "error files\n" + "\n".join(error_files)
     else:
         error_text = ""
-    return f"prepare complete \nsamples : {len(text_list)}\ntime data : {format_seconds_to_hms(lenght)}\nmin sec : {min_second}\nmax sec : {max_second}\nfile_arrow : {file_raw}\n{error_text}"
 def check_user(value):
@@ -579,10 +624,21 @@ def calculate_train(
     samples = len(duration_list)
     hours = sum(duration_list) / 3600
     if torch.cuda.is_available():
-        gpu_properties = torch.cuda.get_device_properties(0)
-        total_memory = gpu_properties.total_memory / (1024**3)
     elif torch.backends.mps.is_available():
         total_memory = psutil.virtual_memory().available / (1024**3)
     if batch_size_type == "frame":
@@ -619,7 +675,7 @@ def calculate_train(
     wanted_max_updates = 1000000
     # train params
-    gpus = 1
     frames_per_gpu = batch_size_per_gpu  # 8 * 38400 = 307200
     grad_accum = 1
@@ -816,6 +872,73 @@ def get_checkpoints_project(project_name, is_gradio=True):
     return files_checkpoints, selelect_checkpoint
 with gr.Blocks() as app:
     gr.Markdown(
         """
@@ -904,10 +1027,13 @@ for tutorial and updates check here (https://github.com/SWivid/F5-TTS/discussion
      ```"""
             )
             bt_prepare = bt_create = gr.Button("prepare")
             txt_info_prepare = gr.Text(label="info", value="")
-            bt_prepare.click(fn=create_metadata, inputs=[cm_project], outputs=[txt_info_prepare])
             random_sample_prepare = gr.Button("random sample")
@@ -928,7 +1054,7 @@ for tutorial and updates check here (https://github.com/SWivid/F5-TTS/discussion
             with gr.Row():
                 ch_finetune = bt_create = gr.Checkbox(label="finetune", value=True)
                 tokenizer_file = gr.Textbox(label="Tokenizer File", value="")
-                file_checkpoint_train = gr.Textbox(label="Checkpoint", value="")
             with gr.Row():
                 exp_name = gr.Radio(label="Model", choices=["F5TTS_Base", "E2TTS_Base"], value="F5TTS_Base")
@@ -951,6 +1077,7 @@ for tutorial and updates check here (https://github.com/SWivid/F5-TTS/discussion
                 last_per_steps = gr.Number(label="Last per Steps", value=50)
             with gr.Row():
                 start_button = gr.Button("Start Training")
                 stop_button = gr.Button("Stop Training", interactive=False)
@@ -974,6 +1101,7 @@ for tutorial and updates check here (https://github.com/SWivid/F5-TTS/discussion
                     file_checkpoint_train,
                     tokenizer_type,
                     tokenizer_file,
                 ],
                 outputs=[txt_info_train, start_button, stop_button],
             )
@@ -1019,7 +1147,7 @@ for tutorial and updates check here (https://github.com/SWivid/F5-TTS/discussion
                 outputs=[txt_info_reduse],
             )
-        with gr.TabItem("vocab check experiment"):
             check_button = gr.Button("check vocab")
             txt_info_check = gr.Text(label="info", value="")
             check_button.click(fn=vocab_check, inputs=[cm_project], outputs=[txt_info_check])
@@ -1060,6 +1188,20 @@ for tutorial and updates check here (https://github.com/SWivid/F5-TTS/discussion
             bt_checkpoint_refresh.click(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
             cm_project.change(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
 @click.command()
 @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")

     file_checkpoint_train="",
     tokenizer_type="pinyin",
     tokenizer_file="",
+    mixed_precision="fp16",
 ):
     global training_process, tts_api
     yield "start train", gr.update(interactive=False), gr.update(interactive=False)
     # Command to run the training script with the specified arguments
+    if tokenizer_file == "":
+        if dataset_name.endswith("_pinyin"):
+            tokenizer_type = "pinyin"
+        elif dataset_name.endswith("_char"):
+            tokenizer_type = "char"
+    else:
+        tokenizer_type = "custom"
     dataset_name = dataset_name.replace("_pinyin", "").replace("_char", "")
+    if mixed_precision != "none":
+        fp16 = f"--mixed_precision={mixed_precision}"
+    else:
+        fp16 = ""
     cmd = (
+        f"accelerate launch {fp16} finetune-cli.py --exp_name {exp_name} "
         f"--learning_rate {learning_rate} "
         f"--batch_size_per_gpu {batch_size_per_gpu} "
         f"--batch_size_type {batch_size_type} "
     if tokenizer_file != "":
         cmd += f" --tokenizer_path {tokenizer_file}"
+    cmd += f" --tokenizer {tokenizer_type} "
     print(cmd)
     return "{:02d}:{:02d}:{:02d}".format(hours, minutes, int(seconds))
+def create_metadata(name_project, ch_tokenizer, progress=gr.Progress()):
     path_project = os.path.join(path_data, name_project)
     path_project_wavs = os.path.join(path_project, "wavs")
     file_metadata = os.path.join(path_project, "metadata.csv")
     file_vocab = os.path.join(path_project, "vocab.txt")
     if not os.path.isfile(file_metadata):
+        return "The file was not found in " + file_metadata, ""
     with open(file_metadata, "r", encoding="utf-8-sig") as f:
         data = f.read()
     lenght = 0
     result = []
     error_files = []
+    text_vocab_set = set()
     for line in progress.tqdm(data.split("\n"), total=count):
         sp_line = line.split("|")
         if len(sp_line) != 2:
         file_audio = os.path.join(path_project_wavs, name_audio + ".wav")
         if not os.path.isfile(file_audio):
+            error_files.append([file_audio, "error path"])
             continue
+        try:
+            duration = get_audio_duration(file_audio)
+        except Exception as e:
+            error_files.append([file_audio, "duration"])
+            print(f"Error processing {file_audio}: {e}")
+            continue
+        if duration < 1 or duration > 25:
+            error_files.append([file_audio, "duration < 1 or > 25 "])
             continue
         if len(text) < 4:
+            error_files.append([file_audio, "very small text len 3"])
             continue
         text = clear_text(text)
         text = convert_char_to_pinyin([text], polyphone=True)[0]
         audio_path_list.append(file_audio)
+        duration_list.append(duration)
         text_list.append(text)
+        result.append({"audio_path": file_audio, "text": text, "duration": duration})
+        if ch_tokenizer:
+            text_vocab_set.update(list(text))
+        lenght += duration
     if duration_list == []:
+        return f"Error: No audio files found in the specified path : {path_project_wavs}", ""
     min_second = round(min(duration_list), 2)
     max_second = round(max(duration_list), 2)
     with open(file_duration, "w") as f:
         json.dump({"duration": duration_list}, f, ensure_ascii=False)
+    new_vocal = ""
+    if not ch_tokenizer:
+        file_vocab_finetune = "data/Emilia_ZH_EN_pinyin/vocab.txt"
+        if not os.path.isfile(file_vocab_finetune):
+            return "Error: Vocabulary file 'Emilia_ZH_EN_pinyin' not found!", ""
+        shutil.copy2(file_vocab_finetune, file_vocab)
+        with open(file_vocab, "r", encoding="utf-8-sig") as f:
+            vocab_char_map = {}
+            for i, char in enumerate(f):
+                vocab_char_map[char[:-1]] = i
+        vocab_size = len(vocab_char_map)
+    else:
+        with open(file_vocab, "w", encoding="utf-8-sig") as f:
+            for vocab in sorted(text_vocab_set):
+                f.write(vocab + "\n")
+                new_vocal += vocab + "\n"
+        vocab_size = len(text_vocab_set)
     if error_files != []:
+        error_text = "\n".join([" = ".join(item) for item in error_files])
     else:
         error_text = ""
+    return (
+        f"prepare complete \nsamples : {len(text_list)}\ntime data : {format_seconds_to_hms(lenght)}\nmin sec : {min_second}\nmax sec : {max_second}\nfile_arrow : {file_raw}\nvocab : {vocab_size}\n{error_text}",
+        new_vocal,
+    )
 def check_user(value):
     samples = len(duration_list)
     hours = sum(duration_list) / 3600
+    # if torch.cuda.is_available():
+    # gpu_properties = torch.cuda.get_device_properties(0)
+    # total_memory = gpu_properties.total_memory / (1024**3)
+    # elif torch.backends.mps.is_available():
+    # total_memory = psutil.virtual_memory().available / (1024**3)
     if torch.cuda.is_available():
+        gpu_count = torch.cuda.device_count()
+        total_memory = 0
+        for i in range(gpu_count):
+            gpu_properties = torch.cuda.get_device_properties(i)
+            total_memory += gpu_properties.total_memory / (1024**3)  # in GB
     elif torch.backends.mps.is_available():
+        gpu_count = 1
         total_memory = psutil.virtual_memory().available / (1024**3)
     if batch_size_type == "frame":
     wanted_max_updates = 1000000
     # train params
+    gpus = gpu_count
     frames_per_gpu = batch_size_per_gpu  # 8 * 38400 = 307200
     grad_accum = 1
     return files_checkpoints, selelect_checkpoint
+def get_gpu_stats():
+    gpu_stats = ""
+    if torch.cuda.is_available():
+        gpu_count = torch.cuda.device_count()
+        for i in range(gpu_count):
+            gpu_name = torch.cuda.get_device_name(i)
+            gpu_properties = torch.cuda.get_device_properties(i)
+            total_memory = gpu_properties.total_memory / (1024**3)  # in GB
+            allocated_memory = torch.cuda.memory_allocated(i) / (1024**2)  # in MB
+            reserved_memory = torch.cuda.memory_reserved(i) / (1024**2)  # in MB
+            gpu_stats += (
+                f"GPU {i} Name: {gpu_name}\n"
+                f"Total GPU memory (GPU {i}): {total_memory:.2f} GB\n"
+                f"Allocated GPU memory (GPU {i}): {allocated_memory:.2f} MB\n"
+                f"Reserved GPU memory (GPU {i}): {reserved_memory:.2f} MB\n\n"
+            )
+    elif torch.backends.mps.is_available():
+        gpu_count = 1
+        gpu_stats += "MPS GPU\n"
+        total_memory = psutil.virtual_memory().total / (
+            1024**3
+        )  # Total system memory (MPS doesn't have its own memory)
+        allocated_memory = 0
+        reserved_memory = 0
+        gpu_stats += (
+            f"Total system memory: {total_memory:.2f} GB\n"
+            f"Allocated GPU memory (MPS): {allocated_memory:.2f} MB\n"
+            f"Reserved GPU memory (MPS): {reserved_memory:.2f} MB\n"
+        )
+    else:
+        gpu_stats = "No GPU available"
+    return gpu_stats
+def get_cpu_stats():
+    cpu_usage = psutil.cpu_percent(interval=1)
+    memory_info = psutil.virtual_memory()
+    memory_used = memory_info.used / (1024**2)
+    memory_total = memory_info.total / (1024**2)
+    memory_percent = memory_info.percent
+    pid = os.getpid()
+    process = psutil.Process(pid)
+    nice_value = process.nice()
+    cpu_stats = (
+        f"CPU Usage: {cpu_usage:.2f}%\n"
+        f"System Memory: {memory_used:.2f} MB used / {memory_total:.2f} MB total ({memory_percent}% used)\n"
+        f"Process Priority (Nice value): {nice_value}"
+    )
+    return cpu_stats
+def get_combined_stats():
+    gpu_stats = get_gpu_stats()
+    cpu_stats = get_cpu_stats()
+    combined_stats = f"### GPU Stats\n{gpu_stats}\n\n### CPU Stats\n{cpu_stats}"
+    return combined_stats
 with gr.Blocks() as app:
     gr.Markdown(
         """
      ```"""
             )
+            ch_tokenizern = gr.Checkbox(label="create vocabulary from dataset", value=False)
             bt_prepare = bt_create = gr.Button("prepare")
             txt_info_prepare = gr.Text(label="info", value="")
+            txt_vocab_prepare = gr.Text(label="vocab", value="")
+            bt_prepare.click(
+                fn=create_metadata, inputs=[cm_project, ch_tokenizern], outputs=[txt_info_prepare, txt_vocab_prepare]
+            )
             random_sample_prepare = gr.Button("random sample")
             with gr.Row():
                 ch_finetune = bt_create = gr.Checkbox(label="finetune", value=True)
                 tokenizer_file = gr.Textbox(label="Tokenizer File", value="")
+                file_checkpoint_train = gr.Textbox(label="Pretrain Model", value="")
             with gr.Row():
                 exp_name = gr.Radio(label="Model", choices=["F5TTS_Base", "E2TTS_Base"], value="F5TTS_Base")
                 last_per_steps = gr.Number(label="Last per Steps", value=50)
             with gr.Row():
+                mixed_precision = gr.Radio(label="mixed_precision", choices=["none", "fp16", "bf16"], value="none")
                 start_button = gr.Button("Start Training")
                 stop_button = gr.Button("Stop Training", interactive=False)
                     file_checkpoint_train,
                     tokenizer_type,
                     tokenizer_file,
+                    mixed_precision,
                 ],
                 outputs=[txt_info_train, start_button, stop_button],
             )
                 outputs=[txt_info_reduse],
             )
+        with gr.TabItem("vocab check"):
             check_button = gr.Button("check vocab")
             txt_info_check = gr.Text(label="info", value="")
             check_button.click(fn=vocab_check, inputs=[cm_project], outputs=[txt_info_check])
             bt_checkpoint_refresh.click(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
             cm_project.change(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
+        with gr.TabItem("system info"):
+            output_box = gr.Textbox(label="GPU and CPU Information", lines=20)
+            def update_stats():
+                return get_combined_stats()
+            update_button = gr.Button("Update Stats")
+            update_button.click(fn=update_stats, outputs=output_box)
+            def auto_update():
+                yield gr.update(value=update_stats())
+            gr.update(fn=auto_update, inputs=[], outputs=output_box)
 @click.command()
 @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")

src/f5_tts/train/train.py CHANGED Viewed

@@ -1,3 +1,7 @@
 from f5_tts.model import CFM, UNetT, DiT, Trainer
 from f5_tts.model.utils import get_tokenizer
 from f5_tts.model.dataset import load_dataset
@@ -69,7 +73,7 @@ def main():
         learning_rate,
         num_warmup_updates=num_warmup_updates,
         save_per_updates=save_per_updates,
-        checkpoint_path=f"ckpts/{exp_name}",
         batch_size=batch_size_per_gpu,
         batch_size_type=batch_size_type,
         max_samples=max_samples,

+# training script.
+from importlib.resources import files
 from f5_tts.model import CFM, UNetT, DiT, Trainer
 from f5_tts.model.utils import get_tokenizer
 from f5_tts.model.dataset import load_dataset
         learning_rate,
         num_warmup_updates=num_warmup_updates,
         save_per_updates=save_per_updates,
+        checkpoint_path=str(files("f5_tts").joinpath(f"../../ckpts/{exp_name}")),
         batch_size=batch_size_per_gpu,
         batch_size_type=batch_size_type,
         max_samples=max_samples,