import argparse
import os
import sys
sys.path.append('rtvc/')
from pathlib import Path

import spacy
import matplotlib.pyplot as plt
import librosa
import numpy as np
import soundfile as sf
import torch
import noisereduce as nr

from rtvc.encoder import inference as encoder
from rtvc.encoder.params_data import *
from rtvc.synthesizer.inference import Synthesizer_infer
from rtvc.utils.argutils import print_args
from rtvc.utils.default_models import ensure_default_models
from rtvc.vocoder import inference as vocoder
from rtvc.speed_changer.fixSpeed import *
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--run_id", type=str, default="default", help=\
        "Name for this model. By default, training outputs will be stored to saved_models/<run_id>/. If a model state "
        "from the same run ID was previously saved, the training will restart from there. Pass -f to overwrite saved "
        "states and restart from scratch.")
    parser.add_argument("-m", "--models_dir", type=Path, default="rtvc/saved_models",
                        help="Directory containing all saved models")
    parser.add_argument("--weight", type=float, default=1,
                        help="Weight of the input audio for the voice filter")
    parser.add_argument("--griffin_lim",
                        action="store_true",
                        help="If set, reconstruct waveforms with Griffin-Lim instead of the neural vocoder")
    parser.add_argument("--cpu", action="store_true", help=\
        "If True, processing is done on CPU, even when a GPU is available.")
    parser.add_argument("--no_sound", action="store_true", help=\
        "If True, audio won't be played.")
    parser.add_argument("--seed", type=int, default=None, help=\
        "Optional random number seed value to make toolbox deterministic.")
    args = parser.parse_args()
    arg_dict = vars(args)
    print_args(args, parser)
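    # Example invocation (a sketch; the script filename and argument values are
    # illustrative, not taken from the original source):
    #     python <this_script>.py --run_id default --weight 0.8 --cpu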
    # Hide GPUs from PyTorch to force CPU processing
    if arg_dict.pop("cpu"):
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    print("Running a test of your configuration...\n")

    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        ## Print some environment information (for debugging purposes)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGB total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")
    ## Load the models one by one.
    if not args.griffin_lim:
        print("Preparing the encoder, the synthesizer and the vocoder...")
    else:
        print("Preparing the encoder and the synthesizer...")
    ensure_default_models(args.run_id, args.models_dir)
    encoder.load_model(list(args.models_dir.glob(f"{args.run_id}/encoder.pt"))[0])
    synthesizer = Synthesizer_infer(list(args.models_dir.glob(f"{args.run_id}/synthesizer.pt"))[0])
    if not args.griffin_lim:
        vocoder.load_model(list(args.models_dir.glob(f"{args.run_id}/vocoder.pt"))[0])

    nlp = spacy.load('en_core_web_sm')
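    # Note: the spaCy pipeline loaded above is presumably used further along in the
    # pipeline (e.g. to segment the text to be synthesized into sentences); it is
    # not referenced in this snippet.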
    weight = arg_dict["weight"]  # weight of the user's own voice in the voice-beautification filter
    amp = 1
    directory = "input_audios"
    pathlist = Path(directory).rglob('*.*')
    for path in pathlist:
        path = str(path)
        print(path)
        # Computing the embedding.
        # First, we load the wav using the function that the speaker encoder provides. This is
        # important: there is preprocessing that must be applied.
        # The following two methods are equivalent:
        # - Directly load from the filepath:
        #       preprocessed_wav = encoder.preprocess_wav(in_fpath)
        # - If the wav is already loaded, pass the samples along with their sampling rate:
        #       preprocessed_wav = encoder.preprocess_wav(wav, sampling_rate)
        in_fpath = Path(path.replace("\"", "").replace("\'", ""))
        fpath_without_ext = os.path.splitext(str(in_fpath))[0]
        speaker_name = os.path.normpath(fpath_without_ext).split(os.sep)[-1]
        # Convert the input to wav. Apart from m4a, which does not work without conversion,
        # it is safer to re-encode even wav inputs: some wav files carry no bitrate attribute
        # and fail in this code, so re-encoding assigns them that attribute.
        is_wav_file, wav, wav_path = TransFormat(in_fpath, 'wav')
        if not is_wav_file:
            os.remove(wav_path)  # remove intermediate wav files
        preprocessed_wav = encoder.preprocess_wav(wav)
        print("Loaded input audio file successfully")
        # Then we derive the embedding. There are many functions and parameters that the
        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
        # only use this function (with its default parameters):
        embed = encoder.embed_utterance(preprocessed_wav)
        embed[embed < set_zero_thres] = 0  # zero out noise components (set_zero_thres comes from params_data)
        if not os.path.exists("embeds"):
            os.mkdir("embeds")
        np.save(f"embeds/{speaker_name}.npy", embed)
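# --- Usage sketch (not part of the original script) ---
# A minimal, hedged example of how a saved embedding could be consumed for
# synthesis. The input text, output filename, and `sample_rate` attribute are
# assumptions based on RTVC-style interfaces; check Synthesizer_infer and the
# vocoder module for the exact API.
#
#     embed = np.load("embeds/<speaker_name>.npy")
#     specs = synthesizer.synthesize_spectrograms(["Hello world."], [embed])
#     wav_out = vocoder.infer_waveform(specs[0])
#     sf.write("output.wav", wav_out, synthesizer.sample_rate)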