Spaces:
Running
Running
| # Copyright (c) 2023 Amphion. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| import faulthandler | |
| faulthandler.enable() | |
| import os | |
| import argparse | |
| import json | |
| import pyworld as pw | |
| from multiprocessing import cpu_count | |
| from utils.util import load_config | |
| from preprocessors.processor import preprocess_dataset, prepare_align | |
| from preprocessors.metadata import cal_metadata | |
| from processors import acoustic_extractor, content_extractor, data_augment, phone_extractor | |
| def extract_acoustic_features(dataset, output_path, cfg, num_workers=1): | |
| """Extract acoustic features of utterances in the dataset | |
| Args: | |
| dataset (str): name of dataset, e.g. opencpop | |
| output_path (str): directory that stores train, test and feature files of datasets | |
| cfg (dict): dictionary that stores configurations | |
| n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1. | |
| """ | |
| # types = ["train", "test"] if "eval" not in dataset else ["test"] | |
| types = list() | |
| types.append((cfg.preprocess.train_file).split('.')[0]) | |
| types.append((cfg.preprocess.valid_file).split('.')[0]) | |
| if 'test' not in types: | |
| types.append('test') | |
| if "eval" in dataset: | |
| types = ["test"] | |
| print('types: ', types) | |
| metadata = [] | |
| for dataset_type in types: | |
| dataset_output = os.path.join(output_path, dataset) | |
| dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) | |
| with open(dataset_file, "r") as f: | |
| metadata.extend(json.load(f)) | |
| if num_workers > 1: | |
| acoustic_extractor.extract_utt_acoustic_features_parallel( | |
| metadata, dataset_output, cfg, num_workers=num_workers | |
| ) | |
| else: | |
| acoustic_extractor.extract_utt_acoustic_features_serial( | |
| metadata, dataset_output, cfg | |
| ) | |
def extract_content_features(dataset, output_path, cfg, num_workers=1):
    """Extract content features of utterances in the dataset

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files of datasets
        cfg (dict): dictionary that stores configurations
        num_workers (int, optional): num of dataloader workers. Defaults to 1.
    """
    # Process the train/valid splits (file stems from cfg) plus "test";
    # evaluation-only datasets use just the "test" split.
    types = [
        cfg.preprocess.train_file.split(".")[0],
        cfg.preprocess.valid_file.split(".")[0],
    ]
    if "test" not in types:
        types.append("test")
    if "eval" in dataset:
        types = ["test"]

    # Collect utterance metadata from every split's json file.
    dataset_output = os.path.join(output_path, dataset)
    metadata = []
    for dataset_type in types:
        dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
        with open(dataset_file, "r") as f:
            metadata.extend(json.load(f))

    content_extractor.extract_utt_content_features_dataloader(
        cfg, metadata, num_workers
    )
def extract_phonme_sequences(dataset, output_path, cfg):
    """Extract phoneme features of utterances in the dataset

    NOTE(review): "phonme" is a historical typo for "phoneme"; the name is
    kept unchanged for backward compatibility with existing callers.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files of datasets
        cfg (dict): dictionary that stores configurations
    """
    # Process the train/valid splits (file stems from cfg) plus "test";
    # evaluation-only datasets use just the "test" split.
    types = [
        cfg.preprocess.train_file.split(".")[0],
        cfg.preprocess.valid_file.split(".")[0],
    ]
    if "test" not in types:
        types.append("test")
    if "eval" in dataset:
        types = ["test"]

    # Collect utterance metadata from every split's json file.
    dataset_output = os.path.join(output_path, dataset)
    metadata = []
    for dataset_type in types:
        dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
        with open(dataset_file, "r") as f:
            metadata.extend(json.load(f))

    phone_extractor.extract_utt_phone_sequence(cfg, metadata)
def preprocess(cfg, args):
    """Preprocess raw data of single or multiple datasets (in cfg.dataset)

    Args:
        cfg (dict): dictionary that stores configurations
        args (ArgumentParser): specify the configuration file and num_workers
    """
    # Specify the output root path to save the processed data
    output_path = cfg.preprocess.processed_dir
    os.makedirs(output_path, exist_ok=True)

    # NOTE(review): the block below is deliberately disabled via a bare string
    # literal — dataset splitting, MFA alignment, augmentation and metadata
    # dumping are all skipped. Kept verbatim.
    '''
    ## Split train and test sets
    for dataset in cfg.dataset:
        print("Preprocess {}...".format(dataset))

        if args.prepare_alignment:
            ## Prepare alignment with MFA
            print("Prepare alignment {}...".format(dataset))
            prepare_align(
                dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
            )

        preprocess_dataset(
            dataset,
            cfg.dataset_path[dataset],
            output_path,
            cfg.preprocess,
            is_custom_dataset=cfg.use_custom_dataset,
        )

    # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
    try:
        assert isinstance(
            cfg.preprocess.data_augment, list
        ), "Please provide a list of datasets need to be augmented."
        if len(cfg.preprocess.data_augment) > 0:
            new_datasets_list = []
            for dataset in cfg.preprocess.data_augment:
                new_datasets = data_augment.augment_dataset(cfg, dataset)
                new_datasets_list.extend(new_datasets)
            cfg.dataset.extend(new_datasets_list)
            print("Augmentation datasets: ", cfg.dataset)
    except:
        print("No Data Augmentation.")

    # Dump metadata of datasets (singers, train/test durations, etc.)
    cal_metadata(cfg)
    '''

    ## Prepare the acoustic features
    for dataset in cfg.dataset:
        # Skip augmented datasets which do not need to extract acoustic features
        # We will copy acoustic features from the original dataset later
        if (
            "pitch_shift" in dataset
            or "formant_shift" in dataset
            # Fixed: was `"equalizer" in dataset in dataset`, a chained
            # comparison whose second clause (`dataset in dataset`) is
            # vacuously true for any string.
            or "equalizer" in dataset
        ):
            continue
        print(
            "Extracting acoustic features for {} using {} workers ...".format(
                dataset, args.num_workers
            )
        )
        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
        # Calculate the statistics of acoustic features
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
        if cfg.preprocess.extract_energy:
            acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)
        if cfg.preprocess.pitch_norm:
            acoustic_extractor.normalize(dataset, cfg.preprocess.pitch_dir, cfg)
        if cfg.preprocess.energy_norm:
            acoustic_extractor.normalize(dataset, cfg.preprocess.energy_dir, cfg)

    # Copy acoustic features for augmented datasets by creating soft-links
    for dataset in cfg.dataset:
        if "pitch_shift" in dataset:
            src_dataset = dataset.replace("_pitch_shift", "")
        elif "formant_shift" in dataset:
            src_dataset = dataset.replace("_formant_shift", "")
        elif "equalizer" in dataset:
            src_dataset = dataset.replace("_equalizer", "")
        else:
            continue
        src_dataset_dir = os.path.join(output_path, src_dataset)
        dataset_dir = os.path.join(output_path, dataset)
        metadata = []
        for split in ["train", "test"] if "eval" not in dataset else ["test"]:
            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
            with open(metadata_file_path, "r") as f:
                metadata.extend(json.load(f))
        print("Copying acoustic features for {}...".format(dataset))
        acoustic_extractor.copy_acoustic_features(
            metadata, dataset_dir, src_dataset_dir, cfg
        )
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)

    # Prepare the content features
    for dataset in cfg.dataset:
        print("Extracting content features for {}...".format(dataset))
        extract_content_features(dataset, output_path, cfg, args.num_workers)

    # Prepare the phoneme sequences
    if cfg.preprocess.extract_phone:
        for dataset in cfg.dataset:
            print("Extracting phoneme sequence for {}...".format(dataset))
            extract_phonme_sequences(dataset, output_path, cfg)
def main():
    """Parse CLI arguments, load the configuration, and run preprocessing."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", default="config.json", help="json files for configurations."
    )
    parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
    # Fixed: `type=bool` treats any non-empty string (even "False") as True;
    # a store_true flag gives the intended off-by-default boolean.
    parser.add_argument("--prepare_alignment", action="store_true")
    args = parser.parse_args()

    cfg = load_config(args.config)
    preprocess(cfg, args)
# Script entry point.
if __name__ == "__main__":
    main()