# ***************************************************************************** # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of the NVIDIA CORPORATION nor the # names of its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#
# *****************************************************************************

import argparse
import time
from pathlib import Path

import torch
import tqdm
import dllogger as DLLogger
from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
from torch.utils.data import DataLoader

from fastpitch.data_function import TTSCollate, TTSDataset


def parse_args(parser):
    """Register the pre-processing command-line options on `parser`.

    Despite its name, this function does not parse anything: it only adds
    argument definitions and hands the parser back so the caller can run
    the actual parsing.

    Args:
        parser: the `argparse.ArgumentParser` to extend.

    Returns:
        The same parser object, with all options added.
    """
    parser.add_argument('-d', '--dataset-path', type=str,
                        default='./', help='Path to dataset')
    parser.add_argument('--wav-text-filelists', required=True, nargs='+',
                        type=str, help='Files with audio paths and text')
    parser.add_argument('--extract-mels', action='store_true',
                        help='Calculate spectrograms from .wav files')
    parser.add_argument('--extract-pitch', action='store_true',
                        help='Extract pitch')
    parser.add_argument('--save-alignment-priors', action='store_true',
                        help='Pre-calculate diagonal matrices of alignment of text to audio')
    parser.add_argument('--log-file', type=str, default='preproc_log.json',
                        help='Filename for logging')
    parser.add_argument('--n-speakers', type=int, default=1)
    parser.add_argument('--n-languages', type=int, default=1)

    # Mel extraction
    parser.add_argument('--max-wav-value', default=32768.0, type=float,
                        help='Maximum audiowave value')
    parser.add_argument('--sampling-rate', default=22050, type=int,
                        help='Sampling rate')
    parser.add_argument('--filter-length', default=1024, type=int,
                        help='Filter length')
    parser.add_argument('--hop-length', default=256, type=int,
                        help='Hop (stride) length')
    parser.add_argument('--win-length', default=1024, type=int,
                        help='Window length')
    parser.add_argument('--mel-fmin', default=0.0, type=float,
                        help='Minimum mel frequency')
    parser.add_argument('--mel-fmax', default=8000.0, type=float,
                        help='Maximum mel frequency')
    parser.add_argument('--n-mel-channels', type=int, default=80)

    # Pitch extraction
    parser.add_argument('--f0-method', default='pyin', type=str,
                        choices=['pyin'], help='F0 estimation method')
    # The help texts below used to be copy-pasted from --f0-method.
    parser.add_argument('--pitch-mean', default=214.0, type=float,
                        help='Pitch (F0) mean passed to TTSDataset')
    parser.add_argument('--pitch-std', default=65.0, type=float,
                        help='Pitch (F0) standard deviation passed to TTSDataset')

    # Performance
    parser.add_argument('-b', '--batch-size', default=1, type=int)
    parser.add_argument('--n-workers', type=int, default=16)
    return parser


def _save_tensor(tensor, dataset_path, subdir, src_path):
    """Save `tensor` to <dataset_path>/<subdir>/<name of src_path with a .pt suffix>."""
    out_name = Path(src_path).with_suffix('.pt').name
    torch.save(tensor, Path(dataset_path, subdir, out_name))


def main():
    """Pre-compute and store per-utterance features for the given filelists.

    Depending on the command-line flags, mel-spectrograms, pitch and
    alignment priors produced by `TTSDataset` are written as `.pt` files
    into the `mels`, `pitch` and `alignment_priors` sub-directories of the
    dataset path.

    Raises:
        ValueError: if unknown command-line options are given, or if two
            entries of one filelist would map to the same output file name.
    """
    parser = argparse.ArgumentParser(description='FastPitch Data Pre-processing')
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()
    if unk_args:
        raise ValueError(f'Invalid options {unk_args}')

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
                                              Path(args.dataset_path, args.log_file)),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.flush()

    # Output sub-directory -> whether the corresponding feature was requested.
    outputs = {
        'mels': args.extract_mels,
        'pitch': args.extract_pitch,
        'alignment_priors': args.save_alignment_priors,
    }
    for subdir, enabled in outputs.items():
        if enabled:
            # parents=False: the dataset directory itself must already exist.
            Path(args.dataset_path, subdir).mkdir(parents=False, exist_ok=True)

    for filelist in args.wav_text_filelists:

        print(f'Processing {filelist}...')

        dataset = TTSDataset(
            args.dataset_path,
            filelist,
            text_cleaners=['basic_cleaners'],
            n_mel_channels=args.n_mel_channels,
            p_arpabet=0.0,
            n_speakers=args.n_speakers,
            n_languages=args.n_languages,
            load_mel_from_disk=False,
            load_pitch_from_disk=False,
            pitch_mean=args.pitch_mean,
            pitch_std=args.pitch_std,
            max_wav_value=args.max_wav_value,
            sampling_rate=args.sampling_rate,
            filter_length=args.filter_length,
            hop_length=args.hop_length,
            win_length=args.win_length,
            mel_fmin=args.mel_fmin,
            mel_fmax=args.mel_fmax,
            betabinomial_online_dir=None,
            pitch_online_dir=None,
            pitch_online_method=args.f0_method)

        data_loader = DataLoader(
            dataset,
            batch_size=args.batch_size,
            shuffle=False,
            sampler=None,
            num_workers=args.n_workers,
            collate_fn=TTSCollate(),
            pin_memory=False,
            drop_last=False)

        # Output names seen so far in this filelist (reset for every filelist).
        seen_outputs = set()

        for batch in tqdm.tqdm(data_loader):
            _, input_lens, mels, mel_lens, _, pitch, _, _, _, attn_prior, fpaths = batch

            # Ensure filenames are unique. The check is done on the name the
            # output is actually stored under (directory dropped, suffix
            # replaced by .pt), so that e.g. `a.wav` and `a.flac` cannot
            # silently overwrite each other's features.
            for p in fpaths:
                fname = Path(p).name
                out_name = Path(p).with_suffix('.pt').name
                if out_name in seen_outputs:
                    raise ValueError(f'Filename is not unique: {fname}')
                seen_outputs.add(out_name)

            if args.extract_mels:
                for j, mel in enumerate(mels):
                    # Keep only the first mel_lens[j] positions of the second dim.
                    _save_tensor(mel[:, :mel_lens[j]],
                                 args.dataset_path, 'mels', fpaths[j])

            if args.extract_pitch:
                for j, p in enumerate(pitch):
                    # NOTE(review): this slices the FIRST dimension of `p`;
                    # confirm that this is the time axis of the pitch tensor.
                    _save_tensor(p[:mel_lens[j]],
                                 args.dataset_path, 'pitch', fpaths[j])

            if args.save_alignment_priors:
                for j, prior in enumerate(attn_prior):
                    _save_tensor(prior[:mel_lens[j], :input_lens[j]],
                                 args.dataset_path, 'alignment_priors', fpaths[j])


if __name__ == '__main__':
    main()