Spaces:
Configuration error
Configuration error
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
| import os, glob | |
def get_wav_and_text_filelist(data_root, data_type, subsample=1):
    """Collect utterance ids and first-line transcripts for one dataset subset.

    Globs ``<data_root>/<data_type>/**/**/*.wav``. ``glob`` is called without
    ``recursive=True``, so each ``**`` matches exactly one directory level and
    only wav files two directories below the subset are found. The matches are
    sorted, every ``subsample``-th one is kept, and the first line of the
    sibling ``.normalized.txt`` file is read for each kept wav.

    Args:
        data_root: Directory that contains the ``data_type`` sub-directory.
            A trailing path separator is accepted.
        data_type: Name of the subset directory under ``data_root``.
        subsample: Slice step applied to the sorted wav list; 1 keeps all.

    Returns:
        A ``(wav_list, txt_list)`` tuple of equal length. ``wav_list`` holds
        paths relative to ``data_root`` with the ``.wav`` extension removed;
        ``txt_list`` holds the first line of each transcript file without its
        newline.

    Raises:
        OSError: If a transcript file cannot be opened (e.g. it is missing).
        ValueError: If ``subsample`` is 0 (invalid slice step).
    """
    pattern = os.path.join(data_root, data_type, "**/**/*.wav")
    # relpath strips exactly the data_root prefix; a string replace would also
    # hit later occurrences and mis-handle a data_root ending in a separator.
    rel_wav_paths = sorted(
        os.path.relpath(path, data_root) for path in glob.glob(pattern)
    )
    rel_wav_paths = rel_wav_paths[::subsample]

    wav_list = []
    txt_list = []
    for rel_wav_path in rel_wav_paths:
        # splitext removes only the final extension, so a ".wav" substring in
        # a directory name is left untouched.
        utterance_id = os.path.splitext(rel_wav_path)[0]
        txt_path = os.path.join(data_root, utterance_id + ".normalized.txt")
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(txt_path, "r", encoding="utf-8") as f_txt:
            txt_list.append(f_txt.readline().strip("\n"))
        wav_list.append(utterance_id)
    return wav_list, txt_list
def write_filelist(output_path, wav_list, txt_list):
    """Write a filelist with one ``<wav entry>|<text>`` line per utterance.

    Args:
        output_path: File to create or overwrite.
        wav_list: Sequence of wav entries (strings), one per utterance.
        txt_list: Sequence of transcripts (strings) aligned with ``wav_list``.
            Entries beyond ``len(wav_list)`` are ignored.

    Raises:
        IndexError: If ``txt_list`` has fewer entries than ``wav_list``. The
            check runs before the file is opened, so no truncated filelist is
            left behind.
    """
    if len(txt_list) < len(wav_list):
        raise IndexError(
            f"txt_list has {len(txt_list)} entries but wav_list has {len(wav_list)}"
        )
    # Explicit UTF-8 so the output does not depend on the platform locale.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(
            wav + "|" + txt + "\n" for wav, txt in zip(wav_list, txt_list)
        )
def _require_data_path(data_root, data_type):
    """Raise FileNotFoundError unless ``<data_root>/<data_type>`` exists."""
    data_path = os.path.join(data_root, data_type)
    # An explicit raise instead of ``assert``: asserts are stripped under
    # ``python -O``, and a missing directory would then yield empty filelists.
    if not os.path.exists(data_path):
        raise FileNotFoundError(
            f"path {data_path} not found. make sure the path is accessible by creating the symbolic link using the following command: "
            f"ln -s /path/to/your/{data_path} {data_path}"
        )


if __name__ == "__main__":
    data_root = "filelists/LibriTTS"

    # Dev and test sets. Subsample each set to get ~100 utterances.
    data_type_list = ["dev-clean", "dev-other", "test-clean", "test-other"]
    subsample_list = [50, 50, 50, 50]
    for data_type, subsample in zip(data_type_list, subsample_list):
        print(f"processing {data_type}")
        _require_data_path(data_root, data_type)
        wav_list, txt_list = get_wav_and_text_filelist(data_root, data_type, subsample)
        write_filelist(os.path.join(data_root, data_type + ".txt"), wav_list, txt_list)

    # Training and seen speaker validation datasets (libritts-full):
    # train-clean-100 + train-clean-360 + train-other-500
    wav_list_train, txt_list_train = [], []
    for data_type in ["train-clean-100", "train-clean-360", "train-other-500"]:
        print(f"processing {data_type}")
        _require_data_path(data_root, data_type)
        wav_list, txt_list = get_wav_and_text_filelist(data_root, data_type)
        wav_list_train.extend(wav_list)
        txt_list_train.extend(txt_list)

    # Split the training set so that the seen speaker validation set contains
    # ~100 utterances: every subsample_val-th entry moves to validation.
    subsample_val = 3000
    wav_list_val = wav_list_train[::subsample_val]
    txt_list_val = txt_list_train[::subsample_val]
    # Drop the validation entries so the train and val filelists are disjoint.
    del wav_list_train[::subsample_val]
    del txt_list_train[::subsample_val]

    write_filelist(
        os.path.join(data_root, "train-full.txt"), wav_list_train, txt_list_train
    )
    write_filelist(os.path.join(data_root, "val-full.txt"), wav_list_val, txt_list_val)

    print("done")