zkniu committed on
Commit 846b661 · 1 Parent(s): 6523beb

update prepare ljspeech script

src/f5_tts/train/README.md CHANGED
@@ -16,6 +16,9 @@ python src/f5_tts/train/datasets/prepare_wenetspeech4tts.py
 
 # Prepare the LibriTTS dataset
 python src/f5_tts/train/datasets/prepare_libritts.py
+
+# Prepare the LJSpeech dataset
+python src/f5_tts/train/datasets/prepare_ljspeech.py
 ```
 
 ### 2. Create custom dataset with metadata.csv
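
The new command assumes the stock LJSpeech-1.1 layout: a pipe-separated metadata.csv (utterance id | raw text | normalized text) alongside a wavs/ folder, roughly:

```
LJSpeech-1.1/
├── metadata.csv        # LJ001-0001|<raw text>|<normalized text>
└── wavs/
    ├── LJ001-0001.wav
    └── ...
```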
src/f5_tts/train/datasets/prepare_ljspeech.py ADDED
@@ -0,0 +1,64 @@
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+import json
+from importlib.resources import files
+from pathlib import Path
+from tqdm import tqdm
+import soundfile as sf
+from datasets.arrow_writer import ArrowWriter
+
+
+def main():
+    result = []
+    duration_list = []
+    text_vocab_set = set()
+
+    with open(meta_info, "r") as f:
+        lines = f.readlines()
+        for line in tqdm(lines):
+            uttr, text, norm_text = line.strip().split("|")
+            wav_path = Path(dataset_dir) / "wavs" / f"{uttr}.wav"
+            duration = sf.info(wav_path).duration
+            if duration < 0.4 or duration > 30:
+                continue
+            result.append({"audio_path": str(wav_path), "text": norm_text, "duration": duration})
+            duration_list.append(duration)
+            text_vocab_set.update(list(norm_text))
+
+    # save preprocessed dataset to disk
+    if not os.path.exists(f"{save_dir}"):
+        os.makedirs(f"{save_dir}")
+    print(f"\nSaving to {save_dir} ...")
+
+    with ArrowWriter(path=f"{save_dir}/raw.arrow") as writer:
+        for line in tqdm(result, desc="Writing to raw.arrow ..."):
+            writer.write(line)
+
+    # also dump durations to a separate json, for DynamicBatchSampler convenience
+    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
+        json.dump({"duration": duration_list}, f, ensure_ascii=False)
+
+    # vocab map, i.e. tokenizer
+    # add alphabets and symbols (optional, if planning to fine-tune on de/fr etc.)
+    with open(f"{save_dir}/vocab.txt", "w") as f:
+        for vocab in sorted(text_vocab_set):
+            f.write(vocab + "\n")
+
+    print(f"\nFor {dataset_name}, sample count: {len(result)}")
+    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}")
+    print(f"For {dataset_name}, total {sum(duration_list)/3600:.2f} hours")
+
+
+if __name__ == "__main__":
+    tokenizer = "char"  # "pinyin" | "char"
+
+    dataset_dir = "<SOME_PATH>/LJSpeech-1.1"
+    dataset_name = f"LJSpeech_{tokenizer}"
+    meta_info = os.path.join(dataset_dir, "metadata.csv")
+    save_dir = str(files("f5_tts").joinpath("../../")) + f"/data/{dataset_name}"
+    print(f"\nPrepare for {dataset_name}, will save to {save_dir}\n")
+
+    main()
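
After the script finishes, the three artifacts it writes (raw.arrow, duration.json, vocab.txt) can be spot-checked with a few lines of Python; a minimal sketch, assuming the default save location data/LJSpeech_char under the repo root:

```python
import json

from datasets import Dataset

data_dir = "data/LJSpeech_char"  # adjust to the save_dir printed by the script

# raw.arrow holds one row per kept utterance: audio_path, text, duration
ds = Dataset.from_file(f"{data_dir}/raw.arrow")
print(ds[0])

# duration.json mirrors the duration column, so batching by length needs no audio I/O
with open(f"{data_dir}/duration.json", "r", encoding="utf-8") as f:
    durations = json.load(f)["duration"]
print(f"{len(durations)} utterances, {sum(durations) / 3600:.2f} hours")

# vocab.txt has one character per line, sorted
with open(f"{data_dir}/vocab.txt", "r", encoding="utf-8") as f:
    vocab = [line.rstrip("\n") for line in f]
print(f"vocab size: {len(vocab)}")
```

raw.arrow is what the training dataloader consumes, while duration.json exists so the DynamicBatchSampler can bucket utterances by length without reopening the audio files.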