diff --git a/README.md b/README.md
index 9265ca3b3388721794dec02baf941c09fc5975f7..c9921fa16fd4a56470c9cfdc45135b93d0787837 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,15 @@
 ---
-title: 6L TTS
-emoji: 🐨
+title: Multi Sami
+emoji: 🔥
 colorFrom: green
-colorTo: yellow
+colorTo: pink
 sdk: gradio
-sdk_version: 5.34.2
+sdk_version: 5.15.0
 app_file: app.py
 pinned: false
 license: cc-by-nc-nd-4.0
-short_description: Multilingual TTS for Sámi languages
+#license: cc-by-4.0
+short_description: Multilingual, multi-speaker Sámi TTS
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d81743371cfe82f497aad6efb56bac173b67acf
--- /dev/null
+++ b/app.py
@@ -0,0 +1,73 @@
+import gradio as gr
+import syn_hifigan as syn
+#import syn_k_univnet_multi as syn
+import os, tempfile
+
+languages = {"South Sámi": 0,
+             "North Sámi": 1,
+             "Lule Sámi": 2}
+
+speakers = {#"aj0": 0,
+    "Aanna - sma": 1,
+    "Máhtte": 2,
+    "Siggá - smj": 3,
+    "Biret - sme": 5,
+    #"lo": 6,
+    "Sunná": 7,
+    "Abmut - smj": 8,
+    "Nihkol - smj": 9
+}
+public = True
+
+tempdir = tempfile.gettempdir()
+
+tts = syn.Synthesizer()
+
+
+
+def speak(text, language, speaker, l_weight, s_weight, pace, postfilter):  # pitch_shift, pitch_std
+
+
+
+    # text frontend not implemented...
+    text = text.replace("...", "…")
+    print(speakers[speaker])
+    audio = tts.speak(text, output_file=f'{tempdir}/tmp', lang=languages[language],
+                      spkr=speakers[speaker], l_weight=l_weight, s_weight=s_weight,
+                      pace=pace, clarity=postfilter)
+
+    if not public:
+        try:
+            os.system("play " + tempdir + "/tmp.wav &")
+        except Exception:
+            pass
+
+    return (22050, audio)
+
+
+
+controls = []
+controls.append(gr.Textbox(label="text", value="Suohtas duinna deaivvadit."))
+controls.append(gr.Dropdown(list(languages.keys()), label="language", value="North Sámi"))
+controls.append(gr.Dropdown(list(speakers.keys()), label="speaker", value="Sunná"))
+
+controls.append(gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1, label="language weight"))
+controls.append(gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1, label="speaker weight"))
+
+controls.append(gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1.0, label="speech rate"))
+controls.append(gr.Slider(minimum=0., maximum=2, step=0.05, value=1.0, label="post-processing"))
+
+
+
+
+tts_gui = gr.Interface(
+    fn=speak,
+    inputs=controls,
+    outputs=gr.Audio(label="output"),
+    live=False
+
+)
+
+
+if __name__ == "__main__":
+    tts_gui.launch(share=public)
diff --git a/common/audio_processing.py b/common/audio_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..fea24d74402cea10aa84f82b1b2d2ec66e0bc1f7
--- /dev/null
+++ b/common/audio_processing.py
@@ -0,0 +1,120 @@
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +import librosa.util as librosa_util +import numpy as np +import torch +from scipy.signal import get_window + + +def window_sumsquare(window, n_frames, hop_length=200, win_length=800, + n_fft=800, dtype=np.float32, norm=None): + """ + # from librosa 0.6 + Compute the sum-square envelope of a window function at a given hop length. + + This is used to estimate modulation effects induced by windowing + observations in short-time fourier transforms. + + Parameters + ---------- + window : string, tuple, number, callable, or list-like + Window specification, as in `get_window` + + n_frames : int > 0 + The number of analysis frames + + hop_length : int > 0 + The number of samples to advance between frames + + win_length : [optional] + The length of the window function. By default, this matches `n_fft`. + + n_fft : int > 0 + The length of each analysis frame. 
+ + dtype : np.dtype + The data type of the output + + Returns + ------- + wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` + The sum-squared envelope of the window function + """ + if win_length is None: + win_length = n_fft + + n = n_fft + hop_length * (n_frames - 1) + x = np.zeros(n, dtype=dtype) + + # Compute the squared window at the desired length + win_sq = get_window(window, win_length, fftbins=True) + win_sq = librosa_util.normalize(win_sq, norm=norm)**2 + win_sq = librosa_util.pad_center(win_sq, size=n_fft) + + # Fill the envelope + for i in range(n_frames): + sample = i * hop_length + x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] + return x + + +def griffin_lim(magnitudes, stft_fn, n_iters=30): + """ + PARAMS + ------ + magnitudes: spectrogram magnitudes + stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods + """ + + angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) + angles = angles.astype(np.float32) + angles = torch.autograd.Variable(torch.from_numpy(angles)) + signal = stft_fn.inverse(magnitudes, angles).squeeze(1) + + for i in range(n_iters): + _, angles = stft_fn.transform(signal) + signal = stft_fn.inverse(magnitudes, angles).squeeze(1) + return signal + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C diff --git a/common/env.py b/common/env.py new file mode 100644 index 0000000000000000000000000000000000000000..649340b21c1d70124584fd2da2e8a692be1857f1 --- /dev/null +++ b/common/env.py @@ -0,0 +1,25 @@ +import os +import shutil +from collections import defaultdict + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +class DefaultAttrDict(defaultdict): + def __init__(self, *args, **kwargs): + super(DefaultAttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + def __getattr__(self, item): + return self[item] + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) diff --git a/common/filter_warnings.py b/common/filter_warnings.py new file mode 100644 index 0000000000000000000000000000000000000000..fd2abc2c0b813110d6b87adb150f8bd4e4fe6998 --- /dev/null +++ b/common/filter_warnings.py @@ -0,0 +1,33 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mutes known and unrelated PyTorch warnings. + +The warnings module keeps a list of filters. Importing it as late as possible +prevents its filters from being overriden. 
+""" + +import warnings + + +# NGC 22.04-py3 container (PyTorch 1.12.0a0+bd13bc6) +warnings.filterwarnings( + "ignore", + message='positional arguments and argument "destination" are deprecated.' + ' nn.Module.state_dict will not accept them in the future.') + +# 22.08-py3 container +warnings.filterwarnings( + "ignore", + message="is_namedtuple is deprecated, please use the python checks") diff --git a/common/gpu_affinity.py b/common/gpu_affinity.py new file mode 100644 index 0000000000000000000000000000000000000000..191444047dd467b13d9610616351340a9d6049f3 --- /dev/null +++ b/common/gpu_affinity.py @@ -0,0 +1,156 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import math +import os +import pathlib +import re + +import pynvml + +pynvml.nvmlInit() + + +def systemGetDriverVersion(): + return pynvml.nvmlSystemGetDriverVersion() + + +def deviceGetCount(): + return pynvml.nvmlDeviceGetCount() + + +class device: + # assume nvml returns list of 64 bit ints + _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) + + def __init__(self, device_idx): + super().__init__() + self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) + + def getName(self): + return pynvml.nvmlDeviceGetName(self.handle) + + def getCpuAffinity(self): + affinity_string = '' + for j in pynvml.nvmlDeviceGetCpuAffinity( + self.handle, device._nvml_affinity_elements + ): + # assume nvml returns list of 64 bit ints + affinity_string = '{:064b}'.format(j) + affinity_string + affinity_list = [int(x) for x in affinity_string] + affinity_list.reverse() # so core 0 is in 0th element of list + + ret = [i for i, e in enumerate(affinity_list) if e != 0] + return ret + + +def set_socket_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity) + + +def set_single_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity[:1]) + + +def set_single_unique_affinity(gpu_id, nproc_per_node): + devices = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in devices] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + affinities = [] + assigned = [] + + for socket_affinity in socket_affinities: + for core in socket_affinity: + if core not in assigned: + affinities.append([core]) + assigned.append(core) + break + os.sched_setaffinity(0, affinities[gpu_id]) + + +def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): + device_ids = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in device_ids] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in 
enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + socket_affinities_to_device_ids = collections.defaultdict(list) + + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) + + for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): + devices_per_group = len(device_ids) + cores_per_device = len(socket_affinity) // devices_per_group + for group_id, device_id in enumerate(device_ids): + if device_id == gpu_id: + if mode == 'interleaved': + affinity = list(socket_affinity[group_id::devices_per_group]) + elif mode == 'continuous': + affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) + else: + raise RuntimeError('Unknown set_socket_unique_affinity mode') + + # reintroduce siblings + affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] + os.sched_setaffinity(0, affinity) + + +def get_thread_siblings_list(): + path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' + thread_siblings_list = [] + pattern = re.compile(r'(\d+)\D(\d+)') + for fname in pathlib.Path(path[0]).glob(path[1:]): + with open(fname) as f: + content = f.read().strip() + res = pattern.findall(content) + if res: + pair = tuple(map(int, res[0])) + thread_siblings_list.append(pair) + return thread_siblings_list + + +def set_affinity(gpu_id, nproc_per_node, mode='socket'): + if mode == 'socket': + set_socket_affinity(gpu_id) + elif mode == 'single': + set_single_affinity(gpu_id) + elif mode == 'single_unique': + set_single_unique_affinity(gpu_id, nproc_per_node) + elif mode == 'socket_unique_interleaved': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') + elif mode == 'socket_unique_continuous': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') + else: + raise RuntimeError('Unknown affinity mode') + + affinity = os.sched_getaffinity(0) + return affinity diff --git a/common/layers.py b/common/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..706123ada6217230e17ac441260c1f1cb7ed6e0a --- /dev/null +++ b/common/layers.py @@ -0,0 +1,134 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import torch
+import torch.nn.functional as F
+from librosa.filters import mel as librosa_mel_fn
+
+from common.audio_processing import (dynamic_range_compression,
+                                     dynamic_range_decompression)
+from common.stft import STFT
+
+
+class LinearNorm(torch.nn.Module):
+    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
+        super(LinearNorm, self).__init__()
+        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
+
+        torch.nn.init.xavier_uniform_(
+            self.linear_layer.weight,
+            gain=torch.nn.init.calculate_gain(w_init_gain))
+
+    def forward(self, x):
+        return self.linear_layer(x)
+
+
+class ConvNorm(torch.nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
+                 padding=None, dilation=1, bias=True, w_init_gain='linear',
+                 batch_norm=False):
+        super(ConvNorm, self).__init__()
+        if padding is None:
+            assert(kernel_size % 2 == 1)
+            padding = int(dilation * (kernel_size - 1) / 2)
+
+        self.conv = torch.nn.Conv1d(in_channels, out_channels,
+                                    kernel_size=kernel_size, stride=stride,
+                                    padding=padding, dilation=dilation,
+                                    bias=bias)
+        self.norm = torch.nn.BatchNorm1d(out_channels) if batch_norm else None
+
+        torch.nn.init.xavier_uniform_(
+            self.conv.weight,
+            gain=torch.nn.init.calculate_gain(w_init_gain))
+
+    def forward(self, signal):
+        if self.norm is None:
+            return self.conv(signal)
+        else:
+            return self.norm(self.conv(signal))
+
+
+class ConvReLUNorm(torch.nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=1, dropout=0.0):
+        super(ConvReLUNorm, self).__init__()
+        self.conv = torch.nn.Conv1d(in_channels, out_channels,
+                                    kernel_size=kernel_size,
+                                    padding=(kernel_size // 2))
+        self.norm = torch.nn.LayerNorm(out_channels)
+        self.dropout = torch.nn.Dropout(dropout)
+
+    def forward(self, signal):
+        out = F.relu(self.conv(signal))
+        out = self.norm(out.transpose(1, 2)).transpose(1, 2).to(signal.dtype)
+        return self.dropout(out)
+
+
+class TacotronSTFT(torch.nn.Module):
+    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
+                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
+                 mel_fmax=8000.0):
+        super(TacotronSTFT, self).__init__()
+        self.n_mel_channels = n_mel_channels
+        self.sampling_rate = sampling_rate
+        self.stft_fn = STFT(filter_length, hop_length, win_length)
+        mel_basis = librosa_mel_fn(
+            sr=sampling_rate,
+            n_fft=filter_length,
+            n_mels=n_mel_channels,
+            fmin=mel_fmin,
+            fmax=mel_fmax
+        )
+        mel_basis = torch.from_numpy(mel_basis).float()
+        self.register_buffer('mel_basis', mel_basis)
+
+    def spectral_normalize(self, magnitudes):
+        output = dynamic_range_compression(magnitudes)
+        return output
+
+    def spectral_de_normalize(self, magnitudes):
+        output = dynamic_range_decompression(magnitudes)
+        return output
+
+    def mel_spectrogram(self, y):
+        """Computes mel-spectrograms from a batch of waves
+        PARAMS
+        ------
+        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
+
+        RETURNS
+        -------
+
mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) + """ + assert(torch.min(y.data) >= -1) + assert(torch.max(y.data) <= 1) + + magnitudes, phases = self.stft_fn.transform(y) + magnitudes = magnitudes.data + mel_output = torch.matmul(self.mel_basis, magnitudes) + mel_output = self.spectral_normalize(mel_output) + return mel_output diff --git a/common/repeated_dataloader.py b/common/repeated_dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..0cc947ef0efd9defc5802ee02bf5b7e4e9a831ac --- /dev/null +++ b/common/repeated_dataloader.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data pipeline elements which wrap the data N times + +A RepeatedDataLoader resets its iterator less frequently. This saves time +on multi-GPU platforms and is invisible to the training loop. + +NOTE: Repeating puts a block of (len(dataset) * repeats) int64s into RAM. +Do not use more repeats than necessary (e.g., 10**6 to simulate infinity). +""" + +import itertools + +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + + +class RepeatedDataLoader(DataLoader): + def __init__(self, repeats, *args, **kwargs): + self.repeats = repeats + super().__init__(*args, **kwargs) + + def __iter__(self): + if self._iterator is None or self.repeats_done >= self.repeats: + self.repeats_done = 1 + return super().__iter__() + else: + self.repeats_done += 1 + return self._iterator + + +class RepeatedDistributedSampler(DistributedSampler): + def __init__(self, repeats, *args, **kwargs): + self.repeats = repeats + assert self.repeats <= 10000, "Too many repeats overload RAM." + super().__init__(*args, **kwargs) + + def __iter__(self): + # Draw indices for `self.repeats` epochs forward + start_epoch = self.epoch + iters = [] + for r in range(self.repeats): + self.set_epoch(start_epoch + r) + iters.append(super().__iter__()) + self.set_epoch(start_epoch) + + return itertools.chain.from_iterable(iters) diff --git a/common/stft.py b/common/stft.py new file mode 100644 index 0000000000000000000000000000000000000000..9426e17a40850fb79e288b9ee465755486c7876d --- /dev/null +++ b/common/stft.py @@ -0,0 +1,140 @@ +""" +BSD 3-Clause License + +Copyright (c) 2017, Prem Seetharaman +All rights reserved. + +* Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +import torch +import numpy as np +import torch.nn.functional as F +from torch.autograd import Variable +from scipy.signal import get_window +from librosa.util import pad_center, tiny +from common.audio_processing import window_sumsquare + + +class STFT(torch.nn.Module): + """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" + def __init__(self, filter_length=800, hop_length=200, win_length=800, + window='hann', device="cpu"): + super(STFT, self).__init__() + self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length + self.window = window + self.forward_transform = None + scale = self.filter_length / self.hop_length + fourier_basis = np.fft.fft(np.eye(self.filter_length)) + + cutoff = int((self.filter_length / 2 + 1)) + fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), + np.imag(fourier_basis[:cutoff, :])]) + + forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) + inverse_basis = torch.FloatTensor( + np.linalg.pinv(scale * fourier_basis).T[:, None, :].copy()) + + if window is not None: + assert(filter_length >= win_length) + # get window and zero center pad it to filter_length + fft_window = get_window(window, win_length, fftbins=True) + fft_window = pad_center(fft_window, size=filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis *= fft_window + + self.register_buffer('forward_basis', forward_basis.float().to(device)) + self.register_buffer('inverse_basis', inverse_basis.float().to(device)) + + def transform(self, input_data): + num_batches = input_data.size(0) + num_samples = input_data.size(1) + + self.num_samples = num_samples + + # similar to librosa, reflect-pad the input + input_data = input_data.view(num_batches, 1, num_samples) + input_data = F.pad( + input_data.unsqueeze(1), + (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), + mode='reflect') + input_data = input_data.squeeze(1) + # print(self.forward_basis.device) + forward_transform = F.conv1d( + input_data, + Variable(self.forward_basis, requires_grad=False), + stride=self.hop_length, + padding=0) + + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + + magnitude = torch.sqrt(real_part**2 + imag_part**2) + phase = torch.autograd.Variable( + torch.atan2(imag_part.data, real_part.data)) + + return magnitude, phase + + def inverse(self, magnitude, phase): + recombine_magnitude_phase = torch.cat( + 
[magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) + + with torch.no_grad(): + inverse_transform = F.conv_transpose1d( + recombine_magnitude_phase, self.inverse_basis, + stride=self.hop_length, padding=0) + + if self.window is not None: + window_sum = window_sumsquare( + self.window, magnitude.size(-1), hop_length=self.hop_length, + win_length=self.win_length, n_fft=self.filter_length, + dtype=np.float32) + # remove modulation effects + approx_nonzero_indices = torch.from_numpy( + np.where(window_sum > tiny(window_sum))[0]) + window_sum = torch.autograd.Variable( + torch.from_numpy(window_sum), requires_grad=False) + window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum + inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] + + # scale by hop ratio + inverse_transform *= float(self.filter_length) / self.hop_length + + inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] + inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] + + return inverse_transform + + def forward(self, input_data): + self.magnitude, self.phase = self.transform(input_data) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction diff --git a/common/tb_dllogger.py b/common/tb_dllogger.py new file mode 100644 index 0000000000000000000000000000000000000000..e73137c4c0bed4a5cef91fd75a719530b0b09452 --- /dev/null +++ b/common/tb_dllogger.py @@ -0,0 +1,172 @@ +import atexit +import glob +import re +from itertools import product +from pathlib import Path + +import dllogger +import torch +import numpy as np +from dllogger import StdOutBackend, JSONStreamBackend, Verbosity +from torch.utils.tensorboard import SummaryWriter + + +tb_loggers = {} + + +class TBLogger: + """ + xyz_dummies: stretch the screen with empty plots so the legend would + always fit for other plots + """ + def __init__(self, enabled, log_dir, name, interval=1, dummies=True): + self.enabled = enabled + self.interval = interval + self.cache = {} + if self.enabled: + self.summary_writer = SummaryWriter( + log_dir=Path(log_dir, name), flush_secs=120, max_queue=200) + atexit.register(self.summary_writer.close) + if dummies: + for key in ('_', '✕'): + self.summary_writer.add_scalar(key, 0.0, 1) + + def log(self, step, data): + for k, v in data.items(): + self.log_value(step, k, v.item() if type(v) is torch.Tensor else v) + + def log_value(self, step, key, val, stat='mean'): + if self.enabled: + if key not in self.cache: + self.cache[key] = [] + self.cache[key].append(val) + if len(self.cache[key]) == self.interval: + agg_val = getattr(np, stat)(self.cache[key]) + self.summary_writer.add_scalar(key, agg_val, step) + del self.cache[key] + + def log_grads(self, step, model): + if self.enabled: + norms = [p.grad.norm().item() for p in model.parameters() + if p.grad is not None] + for stat in ('max', 'min', 'mean'): + self.log_value(step, f'grad_{stat}', getattr(np, stat)(norms), + stat=stat) + + +def unique_log_fpath(fpath): + + if not Path(fpath).is_file(): + return fpath + + # Avoid overwriting old logs + saved = [re.search('\.(\d+)$', f) for f in glob.glob(f'{fpath}.*')] + saved = [0] + [int(m.group(1)) for m in saved if m is not None] + return f'{fpath}.{max(saved) + 1}' + + +def stdout_step_format(step): + if isinstance(step, str): + return step + fields = [] + if len(step) > 0: + fields.append("epoch {:>4}".format(step[0])) + if len(step) > 1: + fields.append("iter {:>3}".format(step[1])) + if len(step) > 2: + fields[-1] += 
"/{}".format(step[2]) + return " | ".join(fields) + + +def stdout_metric_format(metric, metadata, value): + name = metadata.get("name", metric + " : ") + unit = metadata.get("unit", None) + format = f'{{{metadata.get("format", "")}}}' + fields = [name, format.format(value) if value is not None else value, unit] + fields = [f for f in fields if f is not None] + return "| " + " ".join(fields) + + +def init(log_fpath, log_dir, enabled=True, tb_subsets=[], **tb_kw): + + if enabled: + backends = [JSONStreamBackend(Verbosity.DEFAULT, + unique_log_fpath(log_fpath)), + StdOutBackend(Verbosity.VERBOSE, + step_format=stdout_step_format, + metric_format=stdout_metric_format)] + else: + backends = [] + + dllogger.init(backends=backends) + dllogger.metadata("train_lrate", {"name": "lrate", "unit": None, "format": ":>3.2e"}) + + for id_, pref in [('train', ''), ('train_avg', 'avg train '), + ('val', ' avg val '), ('val_ema', ' EMA val ')]: + + dllogger.metadata(f"{id_}_loss", + {"name": f"{pref}loss", "unit": None, "format": ":>5.2f"}) + dllogger.metadata(f"{id_}_mel_loss", + {"name": f"{pref}mel loss", "unit": None, "format": ":>5.2f"}) + + dllogger.metadata(f"{id_}_kl_loss", + {"name": f"{pref}kl loss", "unit": None, "format": ":>5.5f"}) + dllogger.metadata(f"{id_}_kl_weight", + {"name": f"{pref}kl weight", "unit": None, "format": ":>5.5f"}) + + dllogger.metadata(f"{id_}_frames/s", + {"name": None, "unit": "frames/s", "format": ":>10.2f"}) + dllogger.metadata(f"{id_}_took", + {"name": "took", "unit": "s", "format": ":>3.2f"}) + + global tb_loggers + tb_loggers = {s: TBLogger(enabled, log_dir, name=s, **tb_kw) + for s in tb_subsets} + + +def init_inference_metadata(batch_size=None): + + modalities = [('latency', 's', ':>10.5f'), ('RTF', 'x', ':>10.2f'), + ('frames/s', 'frames/s', ':>10.2f'), ('samples/s', 'samples/s', ':>10.2f'), + ('letters/s', 'letters/s', ':>10.2f'), ('tokens/s', 'tokens/s', ':>10.2f')] + + if batch_size is not None: + modalities.append((f'RTF@{batch_size}', 'x', ':>10.2f')) + + percs = ['', 'avg', '90%', '95%', '99%'] + models = ['', 'fastpitch', 'waveglow', 'hifigan'] + + for perc, model, (mod, unit, fmt) in product(percs, models, modalities): + name = f'{perc} {model} {mod}'.strip().replace(' ', ' ') + dllogger.metadata(name.replace(' ', '_'), + {'name': f'{name: <26}', 'unit': unit, 'format': fmt}) + + +def log(step, tb_total_steps=None, data={}, subset='train'): + if tb_total_steps is not None: + tb_loggers[subset].log(tb_total_steps, data) + + if subset != '': + data = {f'{subset}_{key}': v for key, v in data.items()} + dllogger.log(step, data=data) + + +def log_grads_tb(tb_total_steps, grads, tb_subset='train'): + tb_loggers[tb_subset].log_grads(tb_total_steps, grads) + + +def parameters(data, verbosity=0, tb_subset=None): + for k, v in data.items(): + dllogger.log(step="PARAMETER", data={k: v}, verbosity=verbosity) + + if tb_subset is not None and tb_loggers[tb_subset].enabled: + tb_data = {k: v for k, v in data.items() + if type(v) in (str, bool, int, float)} + tb_loggers[tb_subset].summary_writer.add_hparams(tb_data, {}) + + +def flush(): + dllogger.flush() + for tbl in tb_loggers.values(): + if tbl.enabled: + tbl.summary_writer.flush() diff --git a/common/text/LICENSE b/common/text/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..8ac1abf2e69e605de57d5a9ddaad3d83764d7a2a --- /dev/null +++ b/common/text/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2017 Keith Ito + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this 
software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/common/text/__init__.py b/common/text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..962fcef8d987d4ef4d4112d70501f0d23f5c3743 --- /dev/null +++ b/common/text/__init__.py @@ -0,0 +1,3 @@ +from .cmudict import CMUDict + +cmudict = CMUDict() diff --git a/common/text/__pycache__/__init__.cpython-37.pyc b/common/text/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12beed0c168f0ecb16cc17f344f399997061ae88 Binary files /dev/null and b/common/text/__pycache__/__init__.cpython-37.pyc differ diff --git a/common/text/__pycache__/__init__.cpython-38.pyc b/common/text/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ddbf86d373b827bf65b7ed2a7cfb14e1380cb835 Binary files /dev/null and b/common/text/__pycache__/__init__.cpython-38.pyc differ diff --git a/common/text/__pycache__/__init__.cpython-39.pyc b/common/text/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dfec7b84a943440fefd6a10ec5d5c7c71625980 Binary files /dev/null and b/common/text/__pycache__/__init__.cpython-39.pyc differ diff --git a/common/text/__pycache__/abbreviations.cpython-37.pyc b/common/text/__pycache__/abbreviations.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ff9192b4a49deb551f8ad5cd2db659bd4fef35f Binary files /dev/null and b/common/text/__pycache__/abbreviations.cpython-37.pyc differ diff --git a/common/text/__pycache__/abbreviations.cpython-38.pyc b/common/text/__pycache__/abbreviations.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4870a76bf021b1afa5c7c931d5b9a5cf3aef1537 Binary files /dev/null and b/common/text/__pycache__/abbreviations.cpython-38.pyc differ diff --git a/common/text/__pycache__/abbreviations.cpython-39.pyc b/common/text/__pycache__/abbreviations.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97932b7f85e7ecbb4bd66eb9b2772ea7f2deafc5 Binary files /dev/null and b/common/text/__pycache__/abbreviations.cpython-39.pyc differ diff --git a/common/text/__pycache__/acronyms.cpython-37.pyc b/common/text/__pycache__/acronyms.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e49c99117e5038682289495965c0bc5981230ca8 Binary files /dev/null and b/common/text/__pycache__/acronyms.cpython-37.pyc differ diff --git a/common/text/__pycache__/acronyms.cpython-38.pyc 
b/common/text/__pycache__/acronyms.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11c9d75df497060507d034899eae6ef3fdaa54d7 Binary files /dev/null and b/common/text/__pycache__/acronyms.cpython-38.pyc differ diff --git a/common/text/__pycache__/acronyms.cpython-39.pyc b/common/text/__pycache__/acronyms.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6fc561e8a05298fc59761af6238133e50d164f04 Binary files /dev/null and b/common/text/__pycache__/acronyms.cpython-39.pyc differ diff --git a/common/text/__pycache__/cleaners.cpython-37.pyc b/common/text/__pycache__/cleaners.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f176e61fc51a092bab771b36e444f6dc5930f4f7 Binary files /dev/null and b/common/text/__pycache__/cleaners.cpython-37.pyc differ diff --git a/common/text/__pycache__/cleaners.cpython-38.pyc b/common/text/__pycache__/cleaners.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33367c777c83ff81eff9aeac117cdf4e3b40dc42 Binary files /dev/null and b/common/text/__pycache__/cleaners.cpython-38.pyc differ diff --git a/common/text/__pycache__/cleaners.cpython-39.pyc b/common/text/__pycache__/cleaners.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80cee6fd6f86a4e385eb6f47cd2785f22c5668b1 Binary files /dev/null and b/common/text/__pycache__/cleaners.cpython-39.pyc differ diff --git a/common/text/__pycache__/cmudict.cpython-37.pyc b/common/text/__pycache__/cmudict.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e41a9797f9fcea630ce632943fb08c06d0d73d72 Binary files /dev/null and b/common/text/__pycache__/cmudict.cpython-37.pyc differ diff --git a/common/text/__pycache__/cmudict.cpython-38.pyc b/common/text/__pycache__/cmudict.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ff78b3fd5adf09934a57076560e42447f4e3997 Binary files /dev/null and b/common/text/__pycache__/cmudict.cpython-38.pyc differ diff --git a/common/text/__pycache__/cmudict.cpython-39.pyc b/common/text/__pycache__/cmudict.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab94af37abe1bcbae0f671964a41c0ccb203dedb Binary files /dev/null and b/common/text/__pycache__/cmudict.cpython-39.pyc differ diff --git a/common/text/__pycache__/datestime.cpython-37.pyc b/common/text/__pycache__/datestime.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ceb22d5754761318ec04933397c5471e967e06b Binary files /dev/null and b/common/text/__pycache__/datestime.cpython-37.pyc differ diff --git a/common/text/__pycache__/datestime.cpython-38.pyc b/common/text/__pycache__/datestime.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df35a2d4d008a4318704a6f623972b569e6249e7 Binary files /dev/null and b/common/text/__pycache__/datestime.cpython-38.pyc differ diff --git a/common/text/__pycache__/datestime.cpython-39.pyc b/common/text/__pycache__/datestime.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13989da02c239cf5a23194c3ddac8af8f1e6461e Binary files /dev/null and b/common/text/__pycache__/datestime.cpython-39.pyc differ diff --git a/common/text/__pycache__/letters_and_numbers.cpython-37.pyc b/common/text/__pycache__/letters_and_numbers.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eafef327e332742405917e008ea81adb8e635519 Binary files 
/dev/null and b/common/text/__pycache__/letters_and_numbers.cpython-37.pyc differ diff --git a/common/text/__pycache__/letters_and_numbers.cpython-38.pyc b/common/text/__pycache__/letters_and_numbers.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ec2dbd0b0be8b0e38b0d7ccc0406bda8c676783 Binary files /dev/null and b/common/text/__pycache__/letters_and_numbers.cpython-38.pyc differ diff --git a/common/text/__pycache__/letters_and_numbers.cpython-39.pyc b/common/text/__pycache__/letters_and_numbers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da1f960a8ca7014cbd6109406fccd003863635c1 Binary files /dev/null and b/common/text/__pycache__/letters_and_numbers.cpython-39.pyc differ diff --git a/common/text/__pycache__/numerical.cpython-37.pyc b/common/text/__pycache__/numerical.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c999d6336b201a7e9f5b8ca72d0476549d11d784 Binary files /dev/null and b/common/text/__pycache__/numerical.cpython-37.pyc differ diff --git a/common/text/__pycache__/numerical.cpython-38.pyc b/common/text/__pycache__/numerical.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97e5db5cb8fe314be9bd8b3e8c4f221fd10d111c Binary files /dev/null and b/common/text/__pycache__/numerical.cpython-38.pyc differ diff --git a/common/text/__pycache__/numerical.cpython-39.pyc b/common/text/__pycache__/numerical.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d150b4502a6cc68553e148e1c1843df9523d651 Binary files /dev/null and b/common/text/__pycache__/numerical.cpython-39.pyc differ diff --git a/common/text/__pycache__/symbols.cpython-37.pyc b/common/text/__pycache__/symbols.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e261a2e7bc4a3e014a026565604807128954f617 Binary files /dev/null and b/common/text/__pycache__/symbols.cpython-37.pyc differ diff --git a/common/text/__pycache__/symbols.cpython-38.pyc b/common/text/__pycache__/symbols.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f4435dbc43e6f9dc88e09e9fbadbdf51fec0eba Binary files /dev/null and b/common/text/__pycache__/symbols.cpython-38.pyc differ diff --git a/common/text/__pycache__/symbols.cpython-39.pyc b/common/text/__pycache__/symbols.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..228c1f3232deb7da54ad9d1f78a00ff6b0133492 Binary files /dev/null and b/common/text/__pycache__/symbols.cpython-39.pyc differ diff --git a/common/text/__pycache__/text_processing.cpython-37.pyc b/common/text/__pycache__/text_processing.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4961140081e035b916a0231db0fcd45284429179 Binary files /dev/null and b/common/text/__pycache__/text_processing.cpython-37.pyc differ diff --git a/common/text/__pycache__/text_processing.cpython-38.pyc b/common/text/__pycache__/text_processing.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d6022c0378b82da892b5871988babf96476f86b Binary files /dev/null and b/common/text/__pycache__/text_processing.cpython-38.pyc differ diff --git a/common/text/__pycache__/text_processing.cpython-39.pyc b/common/text/__pycache__/text_processing.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..778ae0ce61ddb87fec16299458a42a23c7fe7d51 Binary files /dev/null and 
b/common/text/__pycache__/text_processing.cpython-39.pyc differ diff --git a/common/text/abbreviations.py b/common/text/abbreviations.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5ca94917e9fc50c56ea459656616147bff346c --- /dev/null +++ b/common/text/abbreviations.py @@ -0,0 +1,67 @@ +import re + +_no_period_re = re.compile(r'(No[.])(?=[ ]?[0-9])') +_percent_re = re.compile(r'([ ]?[%])') +_half_re = re.compile('([0-9]½)|(½)') +_url_re = re.compile(r'([a-zA-Z])\.(com|gov|org)') + + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'misess'), + ('ms', 'miss'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), + ('sen', 'senator'), + ('etc', 'et cetera'), +]] + + +def _expand_no_period(m): + word = m.group(0) + if word[0] == 'N': + return 'Number' + return 'number' + + +def _expand_percent(m): + return ' percent' + + +def _expand_half(m): + word = m.group(1) + if word is None: + return 'half' + return word[0] + ' and a half' + + +def _expand_urls(m): + return f'{m.group(1)} dot {m.group(2)}' + + +def normalize_abbreviations(text): + text = re.sub(_no_period_re, _expand_no_period, text) + text = re.sub(_percent_re, _expand_percent, text) + text = re.sub(_half_re, _expand_half, text) + text = re.sub('&', ' and ', text) + text = re.sub('@', ' at ', text) + text = re.sub(_url_re, _expand_urls, text) + + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text diff --git a/common/text/acronyms.py b/common/text/acronyms.py new file mode 100644 index 0000000000000000000000000000000000000000..ba147584097eff46a943fafa850e92107bfeb146 --- /dev/null +++ b/common/text/acronyms.py @@ -0,0 +1,109 @@ +import re +from . import cmudict + +_letter_to_arpabet = { + 'A': 'EY1', + 'B': 'B IY1', + 'C': 'S IY1', + 'D': 'D IY1', + 'E': 'IY1', + 'F': 'EH1 F', + 'G': 'JH IY1', + 'H': 'EY1 CH', + 'I': 'AY1', + 'J': 'JH EY1', + 'K': 'K EY1', + 'L': 'EH1 L', + 'M': 'EH1 M', + 'N': 'EH1 N', + 'O': 'OW1', + 'P': 'P IY1', + 'Q': 'K Y UW1', + 'R': 'AA1 R', + 'S': 'EH1 S', + 'T': 'T IY1', + 'U': 'Y UW1', + 'V': 'V IY1', + 'X': 'EH1 K S', + 'Y': 'W AY1', + 'W': 'D AH1 B AH0 L Y UW0', + 'Z': 'Z IY1', + 's': 'Z' +} + +# Acronyms that should not be expanded +hardcoded_acronyms = [ + 'BMW', 'MVD', 'WDSU', 'GOP', 'UK', 'AI', 'GPS', 'BP', 'FBI', 'HD', + 'CES', 'LRA', 'PC', 'NBA', 'BBL', 'OS', 'IRS', 'SAC', 'UV', 'CEO', 'TV', + 'CNN', 'MSS', 'GSA', 'USSR', 'DNA', 'PRS', 'TSA', 'US', 'GPU', 'USA', + 'FPCC', 'CIA'] + +# Words and acronyms that should be read as regular words, e.g., NATO, HAPPY, etc. 
+uppercase_whiteliset = [] + +acronyms_exceptions = { + 'NVIDIA': 'N.VIDIA', +} + +non_uppercase_exceptions = { + 'email': 'e-mail', +} + +# must ignore roman numerals +_acronym_re = re.compile(r'([a-z]*[A-Z][A-Z]+)s?\.?') +_non_uppercase_re = re.compile(r'\b({})\b'.format('|'.join(non_uppercase_exceptions.keys())), re.IGNORECASE) + + +def _expand_acronyms_to_arpa(m, add_spaces=True): + acronym = m.group(0) + + # remove dots if they exist + acronym = re.sub('\.', '', acronym) + + acronym = "".join(acronym.split()) + arpabet = cmudict.lookup(acronym) + + if arpabet is None: + acronym = list(acronym) + arpabet = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym] + # temporary fix + if arpabet[-1] == '{Z}' and len(arpabet) > 1: + arpabet[-2] = arpabet[-2][:-1] + ' ' + arpabet[-1][1:] + del arpabet[-1] + + arpabet = ' '.join(arpabet) + elif len(arpabet) == 1: + arpabet = "{" + arpabet[0] + "}" + else: + arpabet = acronym + + return arpabet + + +def normalize_acronyms(text): + text = re.sub(_acronym_re, _expand_acronyms_to_arpa, text) + return text + + +def expand_acronyms(m): + text = m.group(1) + if text in acronyms_exceptions: + text = acronyms_exceptions[text] + elif text in uppercase_whiteliset: + text = text + else: + text = '.'.join(text) + '.' + + if 's' in m.group(0): + text = text + '\'s' + + if text[-1] != '.' and m.group(0)[-1] == '.': + return text + '.' + else: + return text + + +def spell_acronyms(text): + text = re.sub(_non_uppercase_re, lambda m: non_uppercase_exceptions[m.group(0).lower()], text) + text = re.sub(_acronym_re, expand_acronyms, text) + return text diff --git a/common/text/cleaners.py b/common/text/cleaners.py new file mode 100644 index 0000000000000000000000000000000000000000..700a96d6ab8c6eefcffc4c3c7c77e1be9d62c25d --- /dev/null +++ b/common/text/cleaners.py @@ -0,0 +1,102 @@ +""" adapted from https://github.com/keithito/tacotron """ + +''' +Cleaners are transformations that run over the input text at both training and eval time. + +Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" +hyperparameter. Some cleaners are English-specific. You'll typically want to use: + 1. "english_cleaners" for English text + 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using + the Unidecode library (https://pypi.python.org/pypi/Unidecode) + 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update + the symbols in symbols.py to match your data). 
+''' + +import re +from .abbreviations import normalize_abbreviations +from .acronyms import normalize_acronyms, spell_acronyms +from .datestime import normalize_datestime +from .letters_and_numbers import normalize_letters_and_numbers +from .numerical import normalize_numbers +from .unidecoder import unidecoder + + +# Regular expression matching whitespace: +_whitespace_re = re.compile(r'\s+') + + +def expand_abbreviations(text): + return normalize_abbreviations(text) + + +def expand_numbers(text): + return normalize_numbers(text) + + +def expand_acronyms(text): + return normalize_acronyms(text) + + +def expand_datestime(text): + return normalize_datestime(text) + + +def expand_letters_and_numbers(text): + return normalize_letters_and_numbers(text) + + +def lowercase(text): + return text.lower() + + +def collapse_whitespace(text): + return re.sub(_whitespace_re, ' ', text) + + +def separate_acronyms(text): + text = re.sub(r"([0-9]+)([a-zA-Z]+)", r"\1 \2", text) + text = re.sub(r"([a-zA-Z]+)([0-9]+)", r"\1 \2", text) + return text + + +def convert_to_ascii(text): + return unidecoder(text) + + +def basic_cleaners(text): + '''Basic pipeline that collapses whitespace without transliteration.''' + # text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def transliteration_cleaners(text): + '''Pipeline for non-English text that transliterates to ASCII.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners(text): + '''Pipeline for English text, with number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners_v2(text): + text = convert_to_ascii(text) + text = expand_datestime(text) + text = expand_letters_and_numbers(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = spell_acronyms(text) + text = lowercase(text) + text = collapse_whitespace(text) + # compatibility with basic_english symbol set + text = re.sub(r'/+', ' ', text) + return text diff --git a/common/text/cmudict.py b/common/text/cmudict.py new file mode 100644 index 0000000000000000000000000000000000000000..c021967d61a13a89cc02fcdc8e838b5545a6b4a9 --- /dev/null +++ b/common/text/cmudict.py @@ -0,0 +1,98 @@ +""" from https://github.com/keithito/tacotron """ + +import re +import sys +import urllib.request +from pathlib import Path + + +valid_symbols = [ + 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', + 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', + 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', + 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', + 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', + 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', + 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' +] + +_valid_symbol_set = set(valid_symbols) + + +class CMUDict: + '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' + def __init__(self, file_or_path=None, heteronyms_path=None, keep_ambiguous=True): + self._entries = {} + self.heteronyms = [] + if file_or_path is not None: + self.initialize(file_or_path, heteronyms_path, keep_ambiguous) + + def initialize(self, file_or_path, heteronyms_path, keep_ambiguous=True): + if isinstance(file_or_path, str): + if not Path(file_or_path).exists(): + print("CMUdict missing. Downloading to data/cmudict/.") + self.download() + + with open(file_or_path, encoding='latin-1') as f: + entries = _parse_cmudict(f) + + else: + entries = _parse_cmudict(file_or_path) + + if not keep_ambiguous: + entries = {word: pron for word, pron in entries.items() if len(pron) == 1} + self._entries = entries + + if heteronyms_path is not None: + with open(heteronyms_path, encoding='utf-8') as f: + self.heteronyms = [l.rstrip() for l in f] + + def __len__(self): + if len(self._entries) == 0: + raise ValueError("CMUDict not initialized") + return len(self._entries) + + def lookup(self, word): + '''Returns list of ARPAbet pronunciations of the given word.''' + if len(self._entries) == 0: + raise ValueError("CMUDict not initialized") + return self._entries.get(word.upper()) + + def download(self): + url = 'https://github.com/Alexir/CMUdict/raw/master/cmudict-0.7b' + try: + Path('cmudict').mkdir(parents=False, exist_ok=True) + urllib.request.urlretrieve(url, filename='cmudict/cmudict-0.7b') + except: + print("Automatic download of CMUdict failed. Try manually with:") + print() + print(" bash scripts/download_cmudict.sh") + print() + print("and re-run the script.") + sys.exit(0) + + +_alt_re = re.compile(r'\([0-9]+\)') + + +def _parse_cmudict(file): + cmudict = {} + for line in file: + if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): + parts = line.split(' ') + word = re.sub(_alt_re, '', parts[0]) + pronunciation = _get_pronunciation(parts[1]) + if pronunciation: + if word in cmudict: + cmudict[word].append(pronunciation) + else: + cmudict[word] = [pronunciation] + return cmudict + + +def _get_pronunciation(s): + parts = s.strip().split(' ') + for part in parts: + if part not in _valid_symbol_set: + return None + return ' '.join(parts) diff --git a/common/text/datestime.py b/common/text/datestime.py new file mode 100644 index 0000000000000000000000000000000000000000..614039fc4d64df79c7950e6ba72285010d8c9c82 --- /dev/null +++ b/common/text/datestime.py @@ -0,0 +1,22 @@ +import re +_ampm_re = re.compile( + r'([0-9]|0[0-9]|1[0-9]|2[0-3]):?([0-5][0-9])?\s*([AaPp][Mm]\b)') + + +def _expand_ampm(m): + matches = list(m.groups(0)) + txt = matches[0] + txt = txt if int(matches[1]) == 0 else txt + ' ' + matches[1] + + if matches[2][0].lower() == 'a': + txt += ' a.m.' + elif matches[2][0].lower() == 'p': + txt += ' p.m.' 
+ + return txt + + +def normalize_datestime(text): + text = re.sub(_ampm_re, _expand_ampm, text) + #text = re.sub(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])?", r"\1 \2", text) + return text diff --git a/common/text/letters_and_numbers.py b/common/text/letters_and_numbers.py new file mode 100644 index 0000000000000000000000000000000000000000..4e584c963274f5baf04417026de3d75f81962b32 --- /dev/null +++ b/common/text/letters_and_numbers.py @@ -0,0 +1,90 @@ +import re +_letters_and_numbers_re = re.compile( + r"((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9']*)", re.IGNORECASE) + +_hardware_re = re.compile( + '([0-9]+(?:[.,][0-9]+)?)(?:\s?)(tb|gb|mb|kb|ghz|mhz|khz|hz|mm)', re.IGNORECASE) +_hardware_key = {'tb': 'terabyte', + 'gb': 'gigabyte', + 'mb': 'megabyte', + 'kb': 'kilobyte', + 'ghz': 'gigahertz', + 'mhz': 'megahertz', + 'khz': 'kilohertz', + 'hz': 'hertz', + 'mm': 'millimeter', + 'cm': 'centimeter', + 'km': 'kilometer'} + +_dimension_re = re.compile( + r'\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b|\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b') +_dimension_key = {'m': 'meter', + 'in': 'inch', + 'inch': 'inch'} + + + + +def _expand_letters_and_numbers(m): + text = re.split(r'(\d+)', m.group(0)) + + # remove trailing space + if text[-1] == '': + text = text[:-1] + elif text[0] == '': + text = text[1:] + + # if not like 1920s, or AK47's , 20th, 1st, 2nd, 3rd, etc... + if text[-1] in ("'s", "s", "th", "nd", "st", "rd") and text[-2].isdigit(): + text[-2] = text[-2] + text[-1] + text = text[:-1] + + # for combining digits 2 by 2 + new_text = [] + for i in range(len(text)): + string = text[i] + if string.isdigit() and len(string) < 5: + # heuristics + if len(string) > 2 and string[-2] == '0': + if string[-1] == '0': + string = [string] + else: + string = [string[:-2], string[-2], string[-1]] + elif len(string) % 2 == 0: + string = [string[i:i+2] for i in range(0, len(string), 2)] + elif len(string) > 2: + string = [string[0]] + [string[i:i+2] for i in range(1, len(string), 2)] + new_text.extend(string) + else: + new_text.append(string) + + text = new_text + text = " ".join(text) + return text + + +def _expand_hardware(m): + quantity, measure = m.groups(0) + measure = _hardware_key[measure.lower()] + if measure[-1] != 'z' and float(quantity.replace(',', '')) > 1: + return "{} {}s".format(quantity, measure) + return "{} {}".format(quantity, measure) + + +def _expand_dimension(m): + text = "".join([x for x in m.groups(0) if x != 0]) + text = text.replace(' x ', ' by ') + text = text.replace('x', ' by ') + if text.endswith(tuple(_dimension_key.keys())): + if text[-2].isdigit(): + text = "{} {}".format(text[:-1], _dimension_key[text[-1:]]) + elif text[-3].isdigit(): + text = "{} {}".format(text[:-2], _dimension_key[text[-2:]]) + return text + + +def normalize_letters_and_numbers(text): + text = re.sub(_hardware_re, _expand_hardware, text) + text = re.sub(_dimension_re, _expand_dimension, text) + text = re.sub(_letters_and_numbers_re, _expand_letters_and_numbers, text) + return text diff --git a/common/text/numerical.py b/common/text/numerical.py new file mode 100644 index 0000000000000000000000000000000000000000..1fe4265f8d07484ecbb08ad7b536a452f15e7b62 --- /dev/null +++ b/common/text/numerical.py @@ -0,0 +1,153 @@ +""" adapted from https://github.com/keithito/tacotron """ + +import inflect +import re +_magnitudes = ['trillion', 'billion', 'million', 'thousand', 'hundred', 'm', 'b', 't'] +_magnitudes_key = {'m': 'million', 'b': 'billion', 't': 
'trillion'} +_measurements = '(f|c|k|d|m)' +_measurements_key = {'f': 'fahrenheit', + 'c': 'celsius', + 'k': 'thousand', + 'm': 'meters'} +_currency_key = {'$': 'dollar', '£': 'pound', '€': 'euro', '₩': 'won'} +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_currency_re = re.compile(r'([\$€£₩])([0-9\.\,]*[0-9]+)(?:[ ]?({})(?=[^a-zA-Z]|$))?'.format("|".join(_magnitudes)), re.IGNORECASE) +_measurement_re = re.compile(r'([0-9\.\,]*[0-9]+(\s)?{}\b)'.format(_measurements), re.IGNORECASE) +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +# _range_re = re.compile(r'(?<=[0-9])+(-)(?=[0-9])+.*?') +_roman_re = re.compile(r'\b(?=[MDCLXVI]+\b)M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{2,3})\b') # avoid I +_multiply_re = re.compile(r'(\b[0-9]+)(x)([0-9]+)') +_number_re = re.compile(r"[0-9]+'s|[0-9]+s|[0-9]+") + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_currency(m): + currency = _currency_key[m.group(1)] + quantity = m.group(2) + magnitude = m.group(3) + + # remove commas from quantity to be able to convert to numerical + quantity = quantity.replace(',', '') + + # check for million, billion, etc... + if magnitude is not None and magnitude.lower() in _magnitudes: + if len(magnitude) == 1: + magnitude = _magnitudes_key[magnitude.lower()] + return "{} {} {}".format(_expand_hundreds(quantity), magnitude, currency+'s') + + parts = quantity.split('.') + if len(parts) > 2: + return quantity + " " + currency + "s" # Unexpected format + + dollars = int(parts[0]) if parts[0] else 0 + + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = currency if dollars == 1 else currency+'s' + cent_unit = 'cent' if cents == 1 else 'cents' + return "{} {}, {} {}".format( + _expand_hundreds(dollars), dollar_unit, + _inflect.number_to_words(cents), cent_unit) + elif dollars: + dollar_unit = currency if dollars == 1 else currency+'s' + return "{} {}".format(_expand_hundreds(dollars), dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return "{} {}".format(_inflect.number_to_words(cents), cent_unit) + else: + return 'zero' + ' ' + currency + 's' + + +def _expand_hundreds(text): + number = float(text) + if 1000 < number < 10000 and (number % 100 == 0) and (number % 1000 != 0): + return _inflect.number_to_words(int(number / 100)) + " hundred" + else: + return _inflect.number_to_words(text) + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_measurement(m): + _, number, measurement = re.split('(\d+(?:\.\d+)?)', m.group(0)) + number = _inflect.number_to_words(number) + measurement = "".join(measurement.split()) + measurement = _measurements_key[measurement.lower()] + return "{} {}".format(number, measurement) + + +def _expand_range(m): + return ' to ' + + +def _expand_multiply(m): + left = m.group(1) + right = m.group(3) + return "{} by {}".format(left, right) + + +def _expand_roman(m): + # from https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python + roman_numerals = {'I':1, 'V':5, 'X':10, 'L':50, 'C':100, 'D':500, 'M':1000} + result = 0 + num = m.group(0) + for i, c in enumerate(num): + if (i+1) == len(num) or roman_numerals[c] >= roman_numerals[num[i+1]]: + result += roman_numerals[c] + else: + result -= roman_numerals[c] + return str(result) + + +def _expand_number(m): + 
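# Spell out a bare integer; values between 1001 and 2999 are read like years (e.g. 1995 -> "nineteen ninety five"), and a trailing "s"/"'s" is re-attached afterwards. +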
_, number, suffix = re.split(r"(\d+(?:'?\d+)?)", m.group(0)) + number = int(number) + if number > 1000 < 10000 and (number % 100 == 0) and (number % 1000 != 0): + text = _inflect.number_to_words(number // 100) + " hundred" + elif number > 1000 and number < 3000: + if number == 2000: + text = 'two thousand' + elif number > 2000 and number < 2010: + text = 'two thousand ' + _inflect.number_to_words(number % 100) + elif number % 100 == 0: + text = _inflect.number_to_words(number // 100) + ' hundred' + else: + number = _inflect.number_to_words(number, andword='', zero='oh', group=2).replace(', ', ' ') + number = re.sub(r'-', ' ', number) + text = number + else: + number = _inflect.number_to_words(number, andword='and') + number = re.sub(r'-', ' ', number) + number = re.sub(r',', '', number) + text = number + + if suffix in ("'s", "s"): + if text[-1] == 'y': + text = text[:-1] + 'ies' + else: + text = text + suffix + + return text + + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_currency_re, _expand_currency, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + # text = re.sub(_range_re, _expand_range, text) + # text = re.sub(_measurement_re, _expand_measurement, text) + text = re.sub(_roman_re, _expand_roman, text) + text = re.sub(_multiply_re, _expand_multiply, text) + text = re.sub(_number_re, _expand_number, text) + return text diff --git a/common/text/symbols.py b/common/text/symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..595e1b113c2ec0dce3b95f6f826b23a062ba932c --- /dev/null +++ b/common/text/symbols.py @@ -0,0 +1,81 @@ +""" from https://github.com/keithito/tacotron """ + +''' +Defines the set of symbols used in text input to the model. + +The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' +from .cmudict import valid_symbols + + +# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): +_arpabet = ['@' + s for s in valid_symbols] + + +def get_symbols(symbol_set='english_basic'): + if symbol_set == 'english_basic': + _pad = '_' + _punctuation = '!\'(),.:;? ' + _special = '-' + _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + elif symbol_set == 'english_basic_lowercase': + _pad = '_' + _punctuation = '!\'"(),.:;? ' + _special = '-' + _letters = 'abcdefghijklmnopqrstuvwxyz' + symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + elif symbol_set == 'english_expanded': + _punctuation = '!\'",.:;? ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + _accented = 'áçéêëñöøćž' + _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + symbols = list(_punctuation + _math + _special + _accented + _letters) + _arpabet + elif symbol_set == 'smj_expanded': + _punctuation = '!\'",.:;?- ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + # _accented = 'áçéêëñöøćžđšŧ' #also north sámi letters... + _accented = 'áçéêëñöø' #also north sámi letters... 
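+    # Note: these Sámi symbol sets define no explicit pad character, so get_pad_idx() further down returns index 0, i.e. the leading '!' of _punctuation.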
+ # _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + _letters = 'AÁÆÅÄBCDEFGHIJKLMNŊŃÑOØÖPQRSTŦUVWXYZaáæåäbcdefghijklmnŋńñoøöpqrstuvwxyz' ########################## Ŧ ######################## + # symbols = list(_punctuation + _math + _special + _accented + _letters) #+ _arpabet + symbols = list(_punctuation + _letters) + _arpabet + elif symbol_set == 'sme_expanded': + _punctuation = '!\'",.:;?- ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + _accented = 'áçéêëńñöøćčžđšŧ' #also north sámi letters... + # _accented = 'áçéêëñöø' #also north sámi letters... + # _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + _letters = 'AÁÆÅÄBCČDĐEFGHIJKLMNŊOØÖPQRSŠTŦUVWXYZŽaáæåäbcčdđefghijklmnŋoøöpqrsštŧuvwxyzž' + # symbols = list(_punctuation + _math + _special + _accented + _letters) #+ _arpabet + symbols = list(_punctuation + _letters) + _arpabet + elif symbol_set == 'sma_expanded': + _punctuation = '!\'",.:;?- ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + _accented = 'áäæçéêëïńñöøćčžđšŧ' #also north sámi letters... + # _accented = 'áçéêëñöø' #also north sámi letters... + # _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + _letters = 'AÆÅBCDEFGHIÏJKLMNOØÖPQRSTUVWXYZaæåbcdefghiïjklmnoøöpqrstuvwxyz' + # symbols = list(_punctuation + _math + _special + _accented + _letters) #+ _arpabet + symbols = list(_punctuation + _letters) + _arpabet + elif symbol_set == 'all_sami': + _punctuation = '!\'",.:;?- ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + _accented = 'áäæçéêëïńñöøćčžđšŧ' + _letters = 'AÁÆÅÄBCČDĐEFGHIÏJKLMNŊŃÑOØÖPQRSŠTŦUVWXYZŽaáæåäbcčdđefghiïjklmnŋńñoøöpqrsštŧuvwxyzž' + symbols = list(_punctuation + _letters)# + _arpabet + else: + raise Exception("{} symbol set does not exist".format(symbol_set)) + + return symbols + + +def get_pad_idx(symbol_set='english_basic'): + if symbol_set in {'english_basic', 'english_basic_lowercase', 'smj_expanded', 'sme_expanded', 'sma_expanded', 'all_sami'}: + return 0 + else: + raise Exception("{} symbol set not used yet".format(symbol_set)) diff --git a/common/text/text_processing.py b/common/text/text_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..873aeb8aac9d9a24a351efbf3ea19373f53a5bae --- /dev/null +++ b/common/text/text_processing.py @@ -0,0 +1,164 @@ +""" adapted from https://github.com/keithito/tacotron """ +import re +import numpy as np +from . import cleaners +from .symbols import get_symbols +from . 
import cmudict +from .numerical import _currency_re, _expand_currency + + +######### +# REGEX # +######### + +# Regular expression matching text enclosed in curly braces for encoding +_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') + +# Regular expression matching words and not words +_words_re = re.compile(r"([a-zA-ZÀ-ž]+['][a-zA-ZÀ-ž]{1,2}|[a-zA-ZÀ-ž]+)|([{][^}]+[}]|[^a-zA-ZÀ-ž{}]+)") + +# Regular expression separating words enclosed in curly braces for cleaning +_arpa_re = re.compile(r'{[^}]+}|\S+') + + +class TextProcessing(object): + def __init__(self, symbol_set, cleaner_names, p_arpabet=0.0, + handle_arpabet='word', handle_arpabet_ambiguous='ignore', + expand_currency=True): + self.symbols = get_symbols(symbol_set) + self.cleaner_names = cleaner_names + + # Mappings from symbol to numeric ID and vice versa: + self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)} + self.id_to_symbol = {i: s for i, s in enumerate(self.symbols)} + self.expand_currency = expand_currency + + # cmudict + self.p_arpabet = p_arpabet + self.handle_arpabet = handle_arpabet + self.handle_arpabet_ambiguous = handle_arpabet_ambiguous + + + def text_to_sequence(self, text): + sequence = [] + + # Check for curly braces and treat their contents as ARPAbet: + while len(text): + m = _curly_re.match(text) + if not m: + sequence += self.symbols_to_sequence(text) + break + sequence += self.symbols_to_sequence(m.group(1)) + sequence += self.arpabet_to_sequence(m.group(2)) + text = m.group(3) + + return sequence + + def sequence_to_text(self, sequence): + result = '' + for symbol_id in sequence: + if symbol_id in self.id_to_symbol: + s = self.id_to_symbol[symbol_id] + # Enclose ARPAbet back in curly braces: + if len(s) > 1 and s[0] == '@': + s = '{%s}' % s[1:] + result += s + return result.replace('}{', ' ') + + def clean_text(self, text): + for name in self.cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception('Unknown cleaner: %s' % name) + text = cleaner(text) + + return text + + def symbols_to_sequence(self, symbols): + return [self.symbol_to_id[s] for s in symbols if s in self.symbol_to_id] + + def arpabet_to_sequence(self, text): + return self.symbols_to_sequence(['@' + s for s in text.split()]) + + def get_arpabet(self, word): + arpabet_suffix = '' + + if word.lower() in cmudict.heteronyms: + return word + + if len(word) > 2 and word.endswith("'s"): + arpabet = cmudict.lookup(word) + if arpabet is None: + arpabet = self.get_arpabet(word[:-2]) + arpabet_suffix = ' Z' + elif len(word) > 1 and word.endswith("s"): + arpabet = cmudict.lookup(word) + if arpabet is None: + arpabet = self.get_arpabet(word[:-1]) + arpabet_suffix = ' Z' + else: + arpabet = cmudict.lookup(word) + + if arpabet is None: + return word + elif arpabet[0] == '{': + arpabet = [arpabet[1:-1]] + + # XXX arpabet might not be a list here + if type(arpabet) is not list: + return word + + if len(arpabet) > 1: + if self.handle_arpabet_ambiguous == 'first': + arpabet = arpabet[0] + elif self.handle_arpabet_ambiguous == 'random': + arpabet = np.random.choice(arpabet) + elif self.handle_arpabet_ambiguous == 'ignore': + return word + else: + arpabet = arpabet[0] + + arpabet = "{" + arpabet + arpabet_suffix + "}" + + return arpabet + + def encode_text(self, text, return_all=False): + if self.expand_currency: + text = re.sub(_currency_re, _expand_currency, text) + text_clean = [self.clean_text(split) if split[0] != '{' else split + for split in _arpa_re.findall(text)] + text_clean = ' '.join(text_clean) + text_clean = 
cleaners.collapse_whitespace(text_clean) + text = text_clean + """ + text_arpabet = '' + if self.p_arpabet > 0: + if self.handle_arpabet == 'sentence': + if np.random.uniform() < self.p_arpabet: + words = _words_re.findall(text) + text_arpabet = [ + self.get_arpabet(word[0]) + if (word[0] != '') else word[1] + for word in words] + text_arpabet = ''.join(text_arpabet) + text = text_arpabet + elif self.handle_arpabet == 'word': + words = _words_re.findall(text) + text_arpabet = [ + word[1] if word[0] == '' else ( + self.get_arpabet(word[0]) + if np.random.uniform() < self.p_arpabet + else word[0]) + for word in words] + text_arpabet = ''.join(text_arpabet) + text = text_arpabet + elif self.handle_arpabet != '': + raise Exception("{} handle_arpabet is not supported".format( + self.handle_arpabet)) + """ + text_encoded = self.text_to_sequence(text) + + if return_all: + return text_encoded, text_clean, text_arpabet + # print(text_clean, text_encoded) + return text_encoded diff --git a/common/text/unidecoder/__init__.py b/common/text/unidecoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e2e76516561849f9e140a8610d9ada6e832cad64 --- /dev/null +++ b/common/text/unidecoder/__init__.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
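+# Best-effort ASCII transliteration: unidecoder() replaces non-ASCII characters using the replacement and homoglyph tables in this package, e.g. unidecoder('Æìñ') -> 'AEin'.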
+ +import io +import warnings + +from .homoglyphs import homoglyphs +from .replacements import replacements + + +_replacements = {uni: asc for uni, asc in replacements} +_homoglyphs = {g: asc for asc, glyphs in homoglyphs.items() for g in glyphs} + + +def unidecoder(s, homoglyphs=False): + """Transliterate unicode + + Args: + s (str): unicode string + homoglyphs (bool): prioritize translating to homoglyphs + """ + warned = False # Once per utterance + ret = '' + for u in s: + if ord(u) < 127: + a = u + elif homoglyphs: + a = _homoglyphs.get(u, _replacements.get(u, None)) + else: + a = _replacements.get(u, _homoglyphs.get(u, None)) + + if a is None: + if not warned: + warnings.warn(f'Unexpected character {u}: ' + 'please revise your text cleaning rules.', + stacklevel=10**6) + warned = True + else: + ret += a + + return ret diff --git a/common/text/unidecoder/__pycache__/__init__.cpython-37.pyc b/common/text/unidecoder/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49650c64dbfcd45a3a65773fd63853fd7606f844 Binary files /dev/null and b/common/text/unidecoder/__pycache__/__init__.cpython-37.pyc differ diff --git a/common/text/unidecoder/__pycache__/__init__.cpython-38.pyc b/common/text/unidecoder/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b74bec8cd75d1b1ac6e97315eaa10197e6fa0c7 Binary files /dev/null and b/common/text/unidecoder/__pycache__/__init__.cpython-38.pyc differ diff --git a/common/text/unidecoder/__pycache__/__init__.cpython-39.pyc b/common/text/unidecoder/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..baca350bb74d722e481ac138a05d5ca3687ac7cb Binary files /dev/null and b/common/text/unidecoder/__pycache__/__init__.cpython-39.pyc differ diff --git a/common/text/unidecoder/__pycache__/homoglyphs.cpython-37.pyc b/common/text/unidecoder/__pycache__/homoglyphs.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7b7e3e5764c2fb4164e75c733d7caa8c0e4edd0 Binary files /dev/null and b/common/text/unidecoder/__pycache__/homoglyphs.cpython-37.pyc differ diff --git a/common/text/unidecoder/__pycache__/homoglyphs.cpython-38.pyc b/common/text/unidecoder/__pycache__/homoglyphs.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2aa32cdf7e93f1d34f7e03d68ad73e61a41aac9a Binary files /dev/null and b/common/text/unidecoder/__pycache__/homoglyphs.cpython-38.pyc differ diff --git a/common/text/unidecoder/__pycache__/homoglyphs.cpython-39.pyc b/common/text/unidecoder/__pycache__/homoglyphs.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8729af4b21c4884a235a3d5aede14608bb99980 Binary files /dev/null and b/common/text/unidecoder/__pycache__/homoglyphs.cpython-39.pyc differ diff --git a/common/text/unidecoder/__pycache__/replacements.cpython-37.pyc b/common/text/unidecoder/__pycache__/replacements.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdd4f1cdbf368bc27621831b1d35a8aeba403662 Binary files /dev/null and b/common/text/unidecoder/__pycache__/replacements.cpython-37.pyc differ diff --git a/common/text/unidecoder/__pycache__/replacements.cpython-38.pyc b/common/text/unidecoder/__pycache__/replacements.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..532606f3f6866eb0d54952d5ce8dab46c591db46 Binary files /dev/null and 
b/common/text/unidecoder/__pycache__/replacements.cpython-38.pyc differ diff --git a/common/text/unidecoder/__pycache__/replacements.cpython-39.pyc b/common/text/unidecoder/__pycache__/replacements.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8d17e87fe038e1922d3c0bb5752d5ed879acfad Binary files /dev/null and b/common/text/unidecoder/__pycache__/replacements.cpython-39.pyc differ diff --git a/common/text/unidecoder/homoglyphs.py b/common/text/unidecoder/homoglyphs.py new file mode 100644 index 0000000000000000000000000000000000000000..e701be3ee7de83b52110890d554ecbe9ad1fdd70 --- /dev/null +++ b/common/text/unidecoder/homoglyphs.py @@ -0,0 +1,129 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The MIT License (MIT) +# +# Copyright (c) 2015 Rob Dawson +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
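+# homoglyphs: maps each ASCII character to visually similar Unicode characters; unidecoder() inverts this table to fold look-alike glyphs back to ASCII.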
+ +# +# Based on: +# https://github.com/codebox/homoglyph/blob/master/raw_data/chars.txt +# + +homoglyphs = { + ' ': ['\xa0', '\u1680', '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200a', '\u2028', '\u2029', '\u202f', '\u205f'], + '!': ['ǃ', 'ⵑ', '!'], + '$': ['$'], + '%': ['%'], + '&': ['ꝸ', '&'], + "'": ['´', 'ʹ', 'ʻ', 'ʼ', 'ʽ', 'ʾ', 'ˈ', 'ˊ', 'ˋ', '˴', 'ʹ', '΄', '՚', '՝', 'י', '׳', 'ߴ', 'ߵ', 'ᑊ', 'ᛌ', '᾽', '᾿', '`', '´', '῾', '‘', '’', '‛', '′', '‵', 'ꞌ', ''', '`', '𖽑', '𖽒'], + '"': ['¨', 'ʺ', '˝', 'ˮ', '״', '“', '”', '‟', '❝', '❞', '⠐', '⹂'], + '(': ['❨', '❲', '〔', '﴾', '(', '['], + ')': ['❩', '❳', '〕', '﴿', ')', ']'], + '*': ['٭', '⁎', '∗', '*', '𐌟'], + '+': ['᛭', '➕', '+', '𐊛'], + ',': ['¸', '؍', '٫', '‚', 'ꓹ', ','], + '-': ['˗', '۔', '‐', '‑', '‒', '–', '⁃', '−', '➖', 'Ⲻ', '﹘'], + '.': ['٠', '۰', '܁', '܂', '․', 'ꓸ', '꘎', '.', '𐩐', '𝅭'], + '/': ['᜵', '⁁', '⁄', '∕', '╱', '⟋', '⧸', 'Ⳇ', '⼃', '〳', 'ノ', '㇓', '丿', '/', '𝈺'], + '2': ['Ƨ', 'Ϩ', 'ᒿ', 'Ꙅ', 'ꛯ', 'Ꝛ', '2', '𝟐', '𝟚', '𝟤', '𝟮', '𝟸', '\U0001fbf2'], + '3': ['Ʒ', 'Ȝ', 'З', 'Ӡ', 'Ⳍ', 'Ꝫ', 'Ɜ', '3', '𑣊', '𖼻', '𝈆', '𝟑', '𝟛', '𝟥', '𝟯', '𝟹', '\U0001fbf3'], + '4': ['Ꮞ', '4', '𑢯', '𝟒', '𝟜', '𝟦', '𝟰', '𝟺', '\U0001fbf4'], + '5': ['Ƽ', '5', '𑢻', '𝟓', '𝟝', '𝟧', '𝟱', '𝟻', '\U0001fbf5'], + '6': ['б', 'Ꮾ', 'Ⳓ', '6', '𑣕', '𝟔', '𝟞', '𝟨', '𝟲', '𝟼', '\U0001fbf6'], + '7': ['7', '𐓒', '𑣆', '𝈒', '𝟕', '𝟟', '𝟩', '𝟳', '𝟽', '\U0001fbf7'], + '8': ['Ȣ', 'ȣ', '৪', '੪', 'ଃ', '8', '𐌚', '𝟖', '𝟠', '𝟪', '𝟴', '𝟾', '𞣋', '\U0001fbf8'], + '9': ['৭', '੧', '୨', '൭', 'Ⳋ', 'Ꝯ', '9', '𑢬', '𑣌', '𑣖', '𝟗', '𝟡', '𝟫', '𝟵', '𝟿', '\U0001fbf9'], + ':': ['ː', '˸', '։', '׃', '܃', '܄', 'ः', 'ઃ', '᛬', '᠃', '᠉', '⁚', '∶', 'ꓽ', '꞉', '︰', ':'], + ';': [';', ';'], + '<': ['˂', 'ᐸ', 'ᚲ', '‹', '❮', '<', '𝈶'], + '=': ['᐀', '⹀', '゠', '꓿', '='], + '>': ['˃', 'ᐳ', '›', '❯', '>', '𖼿', '𝈷'], + '?': ['Ɂ', 'ʔ', 'ॽ', 'Ꭾ', 'ꛫ', '?'], + '@': ['@'], + 'A': ['Α', 'А', 'Ꭺ', 'ᗅ', 'ᴀ', 'ꓮ', 'ꭺ', 'A', '𐊠', '𖽀', '𝐀', '𝐴', '𝑨', '𝒜', '𝓐', '𝔄', '𝔸', '𝕬', '𝖠', '𝗔', '𝘈', '𝘼', '𝙰', '𝚨', '𝛢', '𝜜', '𝝖', '𝞐'], + 'B': ['ʙ', 'Β', 'В', 'в', 'Ᏼ', 'ᏼ', 'ᗷ', 'ᛒ', 'ℬ', 'ꓐ', 'Ꞵ', 'B', '𐊂', '𐊡', '𐌁', '𝐁', '𝐵', '𝑩', '𝓑', '𝔅', '𝔹', '𝕭', '𝖡', '𝗕', '𝘉', '𝘽', '𝙱', '𝚩', '𝛣', '𝜝', '𝝗', '𝞑'], + 'C': ['Ϲ', 'С', 'Ꮯ', 'ᑕ', 'ℂ', 'ℭ', 'Ⅽ', '⊂', 'Ⲥ', '⸦', 'ꓚ', 'C', '𐊢', '𐌂', '𐐕', '𐔜', '𑣩', '𑣲', '𝐂', '𝐶', '𝑪', '𝒞', '𝓒', '𝕮', '𝖢', '𝗖', '𝘊', '𝘾', '𝙲', '🝌'], + 'D': ['Ꭰ', 'ᗞ', 'ᗪ', 'ᴅ', 'ⅅ', 'Ⅾ', 'ꓓ', 'ꭰ', 'D', '𝐃', '𝐷', '𝑫', '𝒟', '𝓓', '𝔇', '𝔻', '𝕯', '𝖣', '𝗗', '𝘋', '𝘿', '𝙳'], + 'E': ['Ε', 'Е', 'Ꭼ', 'ᴇ', 'ℰ', '⋿', 'ⴹ', 'ꓰ', 'ꭼ', 'E', '𐊆', '𑢦', '𑢮', '𝐄', '𝐸', '𝑬', '𝓔', '𝔈', '𝔼', '𝕰', '𝖤', '𝗘', '𝘌', '𝙀', '𝙴', '𝚬', '𝛦', '𝜠', '𝝚', '𝞔'], + 'F': ['Ϝ', 'ᖴ', 'ℱ', 'ꓝ', 'Ꞙ', 'F', '𐊇', '𐊥', '𐔥', '𑢢', '𑣂', '𝈓', '𝐅', '𝐹', '𝑭', '𝓕', '𝔉', '𝔽', '𝕱', '𝖥', '𝗙', '𝘍', '𝙁', '𝙵', '𝟊'], + 'G': ['ɢ', 'Ԍ', 'ԍ', 'Ꮐ', 'Ᏻ', 'ᏻ', 'ꓖ', 'ꮐ', 'G', '𝐆', '𝐺', '𝑮', '𝒢', '𝓖', '𝔊', '𝔾', '𝕲', '𝖦', '𝗚', '𝘎', '𝙂', '𝙶'], + 'H': ['ʜ', 'Η', 'Н', 'н', 'Ꮋ', 'ᕼ', 'ℋ', 'ℌ', 'ℍ', 'Ⲏ', 'ꓧ', 'ꮋ', 'H', '𐋏', '𝐇', '𝐻', '𝑯', '𝓗', '𝕳', '𝖧', '𝗛', '𝘏', '𝙃', '𝙷', '𝚮', '𝛨', '𝜢', '𝝜', '𝞖'], + 'J': ['Ϳ', 'Ј', 'Ꭻ', 'ᒍ', 'ᴊ', 'ꓙ', 'Ʝ', 'ꭻ', 'J', '𝐉', '𝐽', '𝑱', '𝒥', '𝓙', '𝔍', '𝕁', '𝕵', '𝖩', '𝗝', '𝘑', '𝙅', '𝙹'], + 'K': ['Κ', 'К', 'Ꮶ', 'ᛕ', 'K', 'Ⲕ', 'ꓗ', 'K', '𐔘', '𝐊', '𝐾', '𝑲', '𝒦', '𝓚', '𝔎', '𝕂', '𝕶', '𝖪', '𝗞', '𝘒', '𝙆', '𝙺', '𝚱', '𝛫', '𝜥', '𝝟', '𝞙'], + 'L': ['ʟ', 'Ꮮ', 'ᒪ', 'ℒ', 'Ⅼ', 'Ⳑ', 'ⳑ', 'ꓡ', 'ꮮ', 'L', '𐐛', '𐑃', '𐔦', '𑢣', '𑢲', '𖼖', '𝈪', '𝐋', '𝐿', '𝑳', '𝓛', '𝔏', '𝕃', '𝕷', '𝖫', '𝗟', '𝘓', '𝙇', '𝙻'], + 'M': ['Μ', 'Ϻ', 'М', 'Ꮇ', 'ᗰ', 'ᛖ', 'ℳ', 'Ⅿ', 
'Ⲙ', 'ꓟ', 'M', '𐊰', '𐌑', '𝐌', '𝑀', '𝑴', '𝓜', '𝔐', '𝕄', '𝕸', '𝖬', '𝗠', '𝘔', '𝙈', '𝙼', '𝚳', '𝛭', '𝜧', '𝝡', '𝞛'], + 'N': ['ɴ', 'Ν', 'ℕ', 'Ⲛ', 'ꓠ', 'N', '𐔓', '𝐍', '𝑁', '𝑵', '𝒩', '𝓝', '𝔑', '𝕹', '𝖭', '𝗡', '𝘕', '𝙉', '𝙽', '𝚴', '𝛮', '𝜨', '𝝢', '𝞜'], + 'P': ['Ρ', 'Р', 'Ꮲ', 'ᑭ', 'ᴘ', 'ᴩ', 'ℙ', 'Ⲣ', 'ꓑ', 'ꮲ', 'P', '𐊕', '𝐏', '𝑃', '𝑷', '𝒫', '𝓟', '𝔓', '𝕻', '𝖯', '𝗣', '𝘗', '𝙋', '𝙿', '𝚸', '𝛲', '𝜬', '𝝦', '𝞠'], + 'Q': ['ℚ', 'ⵕ', 'Q', '𝐐', '𝑄', '𝑸', '𝒬', '𝓠', '𝔔', '𝕼', '𝖰', '𝗤', '𝘘', '𝙌', '𝚀'], + 'R': ['Ʀ', 'ʀ', 'Ꭱ', 'Ꮢ', 'ᖇ', 'ᚱ', 'ℛ', 'ℜ', 'ℝ', 'ꓣ', 'ꭱ', 'ꮢ', 'R', '𐒴', '𖼵', '𝈖', '𝐑', '𝑅', '𝑹', '𝓡', '𝕽', '𝖱', '𝗥', '𝘙', '𝙍', '𝚁'], + 'S': ['Ѕ', 'Տ', 'Ꮥ', 'Ꮪ', 'ꓢ', 'S', '𐊖', '𐐠', '𖼺', '𝐒', '𝑆', '𝑺', '𝒮', '𝓢', '𝔖', '𝕊', '𝕾', '𝖲', '𝗦', '𝘚', '𝙎', '𝚂'], + 'T': ['Τ', 'τ', 'Т', 'т', 'Ꭲ', 'ᴛ', '⊤', '⟙', 'Ⲧ', 'ꓔ', 'ꭲ', 'T', '𐊗', '𐊱', '𐌕', '𑢼', '𖼊', '𝐓', '𝑇', '𝑻', '𝒯', '𝓣', '𝔗', '𝕋', '𝕿', '𝖳', '𝗧', '𝘛', '𝙏', '𝚃', '𝚻', '𝛕', '𝛵', '𝜏', '𝜯', '𝝉', '𝝩', '𝞃', '𝞣', '𝞽', '🝨'], + 'U': ['Ս', 'ሀ', 'ᑌ', '∪', '⋃', 'ꓴ', 'U', '𐓎', '𑢸', '𖽂', '𝐔', '𝑈', '𝑼', '𝒰', '𝓤', '𝔘', '𝕌', '𝖀', '𝖴', '𝗨', '𝘜', '𝙐', '𝚄'], + 'V': ['Ѵ', '٧', '۷', 'Ꮩ', 'ᐯ', 'Ⅴ', 'ⴸ', 'ꓦ', 'ꛟ', 'V', '𐔝', '𑢠', '𖼈', '𝈍', '𝐕', '𝑉', '𝑽', '𝒱', '𝓥', '𝔙', '𝕍', '𝖁', '𝖵', '𝗩', '𝘝', '𝙑', '𝚅'], + 'W': ['Ԝ', 'Ꮃ', 'Ꮤ', 'ꓪ', 'W', '𑣦', '𑣯', '𝐖', '𝑊', '𝑾', '𝒲', '𝓦', '𝔚', '𝕎', '𝖂', '𝖶', '𝗪', '𝘞', '𝙒', '𝚆'], + 'X': ['Χ', 'Х', '᙭', 'ᚷ', 'Ⅹ', '╳', 'Ⲭ', 'ⵝ', 'ꓫ', 'Ꭓ', 'X', '𐊐', '𐊴', '𐌗', '𐌢', '𐔧', '𑣬', '𝐗', '𝑋', '𝑿', '𝒳', '𝓧', '𝔛', '𝕏', '𝖃', '𝖷', '𝗫', '𝘟', '𝙓', '𝚇', '𝚾', '𝛸', '𝜲', '𝝬', '𝞦'], + 'Y': ['Υ', 'ϒ', 'У', 'Ү', 'Ꭹ', 'Ꮍ', 'Ⲩ', 'ꓬ', 'Y', '𐊲', '𑢤', '𖽃', '𝐘', '𝑌', '𝒀', '𝒴', '𝓨', '𝔜', '𝕐', '𝖄', '𝖸', '𝗬', '𝘠', '𝙔', '𝚈', '𝚼', '𝛶', '𝜰', '𝝪', '𝞤'], + 'Z': ['Ζ', 'Ꮓ', 'ℤ', 'ℨ', 'ꓜ', 'Z', '𐋵', '𑢩', '𑣥', '𝐙', '𝑍', '𝒁', '𝒵', '𝓩', '𝖅', '𝖹', '𝗭', '𝘡', '𝙕', '𝚉', '𝚭', '𝛧', '𝜡', '𝝛', '𝞕'], + '\\': ['∖', '⟍', '⧵', '⧹', '⼂', '㇔', '丶', '﹨', '\', '𝈏', '𝈻'], + '^': ['˄', 'ˆ'], + '_': ['ߺ', '﹍', '﹎', '﹏', '_'], + 'a': ['ɑ', 'α', 'а', '⍺', 'a', '𝐚', '𝑎', '𝒂', '𝒶', '𝓪', '𝔞', '𝕒', '𝖆', '𝖺', '𝗮', '𝘢', '𝙖', '𝚊', '𝛂', '𝛼', '𝜶', '𝝰', '𝞪'], + 'b': ['Ƅ', 'Ь', 'Ꮟ', 'ᑲ', 'ᖯ', 'b', '𝐛', '𝑏', '𝒃', '𝒷', '𝓫', '𝔟', '𝕓', '𝖇', '𝖻', '𝗯', '𝘣', '𝙗', '𝚋'], + 'c': ['ϲ', 'с', 'ᴄ', 'ⅽ', 'ⲥ', 'ꮯ', 'c', '𐐽', '𝐜', '𝑐', '𝒄', '𝒸', '𝓬', '𝔠', '𝕔', '𝖈', '𝖼', '𝗰', '𝘤', '𝙘', '𝚌'], + 'd': ['ԁ', 'Ꮷ', 'ᑯ', 'ⅆ', 'ⅾ', 'ꓒ', 'd', '𝐝', '𝑑', '𝒅', '𝒹', '𝓭', '𝔡', '𝕕', '𝖉', '𝖽', '𝗱', '𝘥', '𝙙', '𝚍'], + 'e': ['е', 'ҽ', '℮', 'ℯ', 'ⅇ', 'ꬲ', 'e', '𝐞', '𝑒', '𝒆', '𝓮', '𝔢', '𝕖', '𝖊', '𝖾', '𝗲', '𝘦', '𝙚', '𝚎'], + 'f': ['ſ', 'ϝ', 'ք', 'ẝ', 'ꞙ', 'ꬵ', 'f', '𝐟', '𝑓', '𝒇', '𝒻', '𝓯', '𝔣', '𝕗', '𝖋', '𝖿', '𝗳', '𝘧', '𝙛', '𝚏', '𝟋'], + 'g': ['ƍ', 'ɡ', 'ց', 'ᶃ', 'ℊ', 'g', '𝐠', '𝑔', '𝒈', '𝓰', '𝔤', '𝕘', '𝖌', '𝗀', '𝗴', '𝘨', '𝙜', '𝚐'], + 'h': ['һ', 'հ', 'Ꮒ', 'ℎ', 'h', '𝐡', '𝒉', '𝒽', '𝓱', '𝔥', '𝕙', '𝖍', '𝗁', '𝗵', '𝘩', '𝙝', '𝚑'], + 'i': ['ı', 'ɩ', 'ɪ', '˛', 'ͺ', 'ι', 'і', 'ӏ', 'Ꭵ', 'ι', 'ℹ', 'ⅈ', 'ⅰ', '⍳', 'ꙇ', 'ꭵ', 'i', '𑣃', '𝐢', '𝑖', '𝒊', '𝒾', '𝓲', '𝔦', '𝕚', '𝖎', '𝗂', '𝗶', '𝘪', '𝙞', '𝚒', '𝚤', '𝛊', '𝜄', '𝜾', '𝝸', '𝞲'], + 'j': ['ϳ', 'ј', 'ⅉ', 'j', '𝐣', '𝑗', '𝒋', '𝒿', '𝓳', '𝔧', '𝕛', '𝖏', '𝗃', '𝗷', '𝘫', '𝙟', '𝚓'], + 'k': ['k', '𝐤', '𝑘', '𝒌', '𝓀', '𝓴', '𝔨', '𝕜', '𝖐', '𝗄', '𝗸', '𝘬', '𝙠', '𝚔'], + 'l': ['Ɩ', 'ǀ', 'Ι', 'І', 'Ӏ', '׀', 'ו', 'ן', 'ا', '١', '۱', 'ߊ', 'ᛁ', 'ℐ', 'ℑ', 'ℓ', 'Ⅰ', 'ⅼ', '∣', '⏽', 'Ⲓ', 'ⵏ', 'ꓲ', 'ﺍ', 'ﺎ', '1', 'I', 'l', '│', '𐊊', '𐌉', '𐌠', '𖼨', '𝐈', '𝐥', '𝐼', '𝑙', '𝑰', '𝒍', '𝓁', '𝓘', '𝓵', '𝔩', '𝕀', '𝕝', '𝕴', '𝖑', '𝖨', '𝗅', '𝗜', '𝗹', '𝘐', '𝘭', '𝙄', '𝙡', '𝙸', '𝚕', '𝚰', '𝛪', '𝜤', '𝝞', '𝞘', '𝟏', '𝟙', '𝟣', '𝟭', '𝟷', '𞣇', '𞸀', '𞺀', '\U0001fbf1'], + 'm': ['m'], + 'n': ['ո', 
'ռ', 'n', '𝐧', '𝑛', '𝒏', '𝓃', '𝓷', '𝔫', '𝕟', '𝖓', '𝗇', '𝗻', '𝘯', '𝙣', '𝚗'], + 'o': ['Ο', 'ο', 'σ', 'О', 'о', 'Օ', 'օ', 'ס', 'ه', '٥', 'ھ', 'ہ', 'ە', '۵', '߀', '०', '০', '੦', '૦', 'ଠ', '୦', '௦', 'ం', '౦', 'ಂ', '೦', 'ം', 'ഠ', '൦', 'ං', '๐', '໐', 'ဝ', '၀', 'ჿ', 'ዐ', 'ᴏ', 'ᴑ', 'ℴ', 'Ⲟ', 'ⲟ', 'ⵔ', '〇', 'ꓳ', 'ꬽ', 'ﮦ', 'ﮧ', 'ﮨ', 'ﮩ', 'ﮪ', 'ﮫ', 'ﮬ', 'ﮭ', 'ﻩ', 'ﻪ', 'ﻫ', 'ﻬ', '0', 'O', 'o', '𐊒', '𐊫', '𐐄', '𐐬', '𐓂', '𐓪', '𐔖', '𑓐', '𑢵', '𑣈', '𑣗', '𑣠', '𝐎', '𝐨', '𝑂', '𝑜', '𝑶', '𝒐', '𝒪', '𝓞', '𝓸', '𝔒', '𝔬', '𝕆', '𝕠', '𝕺', '𝖔', '𝖮', '𝗈', '𝗢', '𝗼', '𝘖', '𝘰', '𝙊', '𝙤', '𝙾', '𝚘', '𝚶', '𝛐', '𝛔', '𝛰', '𝜊', '𝜎', '𝜪', '𝝄', '𝝈', '𝝤', '𝝾', '𝞂', '𝞞', '𝞸', '𝞼', '𝟎', '𝟘', '𝟢', '𝟬', '𝟶', '𞸤', '𞹤', '𞺄', '\U0001fbf0'], + 'p': ['ρ', 'ϱ', 'р', '⍴', 'ⲣ', 'p', '𝐩', '𝑝', '𝒑', '𝓅', '𝓹', '𝔭', '𝕡', '𝖕', '𝗉', '𝗽', '𝘱', '𝙥', '𝚙', '𝛒', '𝛠', '𝜌', '𝜚', '𝝆', '𝝔', '𝞀', '𝞎', '𝞺', '𝟈'], + 'q': ['ԛ', 'գ', 'զ', 'q', '𝐪', '𝑞', '𝒒', '𝓆', '𝓺', '𝔮', '𝕢', '𝖖', '𝗊', '𝗾', '𝘲', '𝙦', '𝚚'], + 'r': ['г', 'ᴦ', 'ⲅ', 'ꭇ', 'ꭈ', 'ꮁ', 'r', '𝐫', '𝑟', '𝒓', '𝓇', '𝓻', '𝔯', '𝕣', '𝖗', '𝗋', '𝗿', '𝘳', '𝙧', '𝚛'], + 's': ['ƽ', 'ѕ', 'ꜱ', 'ꮪ', 's', '𐑈', '𑣁', '𝐬', '𝑠', '𝒔', '𝓈', '𝓼', '𝔰', '𝕤', '𝖘', '𝗌', '𝘀', '𝘴', '𝙨', '𝚜'], + 't': ['t', '𝐭', '𝑡', '𝒕', '𝓉', '𝓽', '𝔱', '𝕥', '𝖙', '𝗍', '𝘁', '𝘵', '𝙩', '𝚝'], + 'u': ['ʋ', 'υ', 'ս', 'ᴜ', 'ꞟ', 'ꭎ', 'ꭒ', 'u', '𐓶', '𑣘', '𝐮', '𝑢', '𝒖', '𝓊', '𝓾', '𝔲', '𝕦', '𝖚', '𝗎', '𝘂', '𝘶', '𝙪', '𝚞', '𝛖', '𝜐', '𝝊', '𝞄', '𝞾'], + 'v': ['ν', 'ѵ', 'ט', 'ᴠ', 'ⅴ', '∨', '⋁', 'ꮩ', 'v', '𑜆', '𑣀', '𝐯', '𝑣', '𝒗', '𝓋', '𝓿', '𝔳', '𝕧', '𝖛', '𝗏', '𝘃', '𝘷', '𝙫', '𝚟', '𝛎', '𝜈', '𝝂', '𝝼', '𝞶'], + 'w': ['ɯ', 'ѡ', 'ԝ', 'ա', 'ᴡ', 'ꮃ', 'w', '𑜊', '𑜎', '𑜏', '𝐰', '𝑤', '𝒘', '𝓌', '𝔀', '𝔴', '𝕨', '𝖜', '𝗐', '𝘄', '𝘸', '𝙬', '𝚠'], + 'x': ['×', 'х', 'ᕁ', 'ᕽ', '᙮', 'ⅹ', '⤫', '⤬', '⨯', 'x', '𝐱', '𝑥', '𝒙', '𝓍', '𝔁', '𝔵', '𝕩', '𝖝', '𝗑', '𝘅', '𝘹', '𝙭', '𝚡'], + 'y': ['ɣ', 'ʏ', 'γ', 'у', 'ү', 'ყ', 'ᶌ', 'ỿ', 'ℽ', 'ꭚ', 'y', '𑣜', '𝐲', '𝑦', '𝒚', '𝓎', '𝔂', '𝔶', '𝕪', '𝖞', '𝗒', '𝘆', '𝘺', '𝙮', '𝚢', '𝛄', '𝛾', '𝜸', '𝝲', '𝞬'], + 'z': ['ᴢ', 'ꮓ', 'z', '𑣄', '𝐳', '𝑧', '𝒛', '𝓏', '𝔃', '𝔷', '𝕫', '𝖟', '𝗓', '𝘇', '𝘻', '𝙯', '𝚣'], + '{': ['❴', '{', '𝄔'], + '}': ['❵', '}'], + '~': ['˜', '῀', '⁓', '∼'], +} diff --git a/common/text/unidecoder/replacements.py b/common/text/unidecoder/replacements.py new file mode 100644 index 0000000000000000000000000000000000000000..f01825cc6e4f02771343b1ccec09ee918dac6c01 --- /dev/null +++ b/common/text/unidecoder/replacements.py @@ -0,0 +1,2085 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
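+# replacements: ordered [character, ASCII approximation] pairs from which unidecoder() builds its lookup table.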
+ +# MIT License +# +# Copyright (c) Sindre Sorhus (https://sindresorhus.com) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# +# Based on: +# https://github.com/sindresorhus/transliterate/blob/main/replacements.js +# + +replacements = [ + # German umlauts + ['ß', 'ss'], + ['ẞ', 'Ss'], + ['ä', 'ae'], + ['Ä', 'Ae'], + ['ö', 'oe'], + ['Ö', 'Oe'], + ['ü', 'ue'], + ['Ü', 'Ue'], + + # Latin + ['À', 'A'], + ['Á', 'A'], + ['Â', 'A'], + ['Ã', 'A'], + ['Ä', 'Ae'], + ['Å', 'A'], + ['Æ', 'AE'], + ['Ç', 'C'], + ['È', 'E'], + ['É', 'E'], + ['Ê', 'E'], + ['Ë', 'E'], + ['Ì', 'I'], + ['Í', 'I'], + ['Î', 'I'], + ['Ï', 'I'], + ['Ð', 'D'], + ['Ñ', 'N'], + ['Ò', 'O'], + ['Ó', 'O'], + ['Ô', 'O'], + ['Õ', 'O'], + ['Ö', 'Oe'], + ['Ő', 'O'], + ['Ø', 'O'], + ['Ù', 'U'], + ['Ú', 'U'], + ['Û', 'U'], + ['Ü', 'Ue'], + ['Ű', 'U'], + ['Ý', 'Y'], + ['Þ', 'TH'], + ['ß', 'ss'], + ['à', 'a'], + ['á', 'a'], + ['â', 'a'], + ['ã', 'a'], + ['ä', 'ae'], + ['å', 'a'], + ['æ', 'ae'], + ['ç', 'c'], + ['è', 'e'], + ['é', 'e'], + ['ê', 'e'], + ['ë', 'e'], + ['ì', 'i'], + ['í', 'i'], + ['î', 'i'], + ['ï', 'i'], + ['ð', 'd'], + ['ñ', 'n'], + ['ò', 'o'], + ['ó', 'o'], + ['ô', 'o'], + ['õ', 'o'], + ['ö', 'oe'], + ['ő', 'o'], + ['ø', 'o'], + ['ù', 'u'], + ['ú', 'u'], + ['û', 'u'], + ['ü', 'ue'], + ['ű', 'u'], + ['ý', 'y'], + ['þ', 'th'], + ['ÿ', 'y'], + ['ẞ', 'SS'], + + # Vietnamese + ['à', 'a'], + ['À', 'A'], + ['á', 'a'], + ['Á', 'A'], + ['â', 'a'], + ['Â', 'A'], + ['ã', 'a'], + ['Ã', 'A'], + ['è', 'e'], + ['È', 'E'], + ['é', 'e'], + ['É', 'E'], + ['ê', 'e'], + ['Ê', 'E'], + ['ì', 'i'], + ['Ì', 'I'], + ['í', 'i'], + ['Í', 'I'], + ['ò', 'o'], + ['Ò', 'O'], + ['ó', 'o'], + ['Ó', 'O'], + ['ô', 'o'], + ['Ô', 'O'], + ['õ', 'o'], + ['Õ', 'O'], + ['ù', 'u'], + ['Ù', 'U'], + ['ú', 'u'], + ['Ú', 'U'], + ['ý', 'y'], + ['Ý', 'Y'], + ['ă', 'a'], + ['Ă', 'A'], + ['Đ', 'D'], + ['đ', 'd'], + ['ĩ', 'i'], + ['Ĩ', 'I'], + ['ũ', 'u'], + ['Ũ', 'U'], + ['ơ', 'o'], + ['Ơ', 'O'], + ['ư', 'u'], + ['Ư', 'U'], + ['ạ', 'a'], + ['Ạ', 'A'], + ['ả', 'a'], + ['Ả', 'A'], + ['ấ', 'a'], + ['Ấ', 'A'], + ['ầ', 'a'], + ['Ầ', 'A'], + ['ẩ', 'a'], + ['Ẩ', 'A'], + ['ẫ', 'a'], + ['Ẫ', 'A'], + ['ậ', 'a'], + ['Ậ', 'A'], + ['ắ', 'a'], + ['Ắ', 'A'], + ['ằ', 'a'], + ['Ằ', 'A'], + ['ẳ', 'a'], + ['Ẳ', 'A'], + ['ẵ', 'a'], + ['Ẵ', 'A'], + ['ặ', 'a'], + ['Ặ', 'A'], + ['ẹ', 'e'], + ['Ẹ', 'E'], + ['ẻ', 'e'], + ['Ẻ', 'E'], + ['ẽ', 'e'], + ['Ẽ', 'E'], + ['ế', 'e'], + ['Ế', 'E'], + ['ề', 'e'], + ['Ề', 'E'], + ['ể', 'e'], + ['Ể', 'E'], + ['ễ', 'e'], + ['Ễ', 'E'], + ['ệ', 'e'], + 
['Ệ', 'E'], + ['ỉ', 'i'], + ['Ỉ', 'I'], + ['ị', 'i'], + ['Ị', 'I'], + ['ọ', 'o'], + ['Ọ', 'O'], + ['ỏ', 'o'], + ['Ỏ', 'O'], + ['ố', 'o'], + ['Ố', 'O'], + ['ồ', 'o'], + ['Ồ', 'O'], + ['ổ', 'o'], + ['Ổ', 'O'], + ['ỗ', 'o'], + ['Ỗ', 'O'], + ['ộ', 'o'], + ['Ộ', 'O'], + ['ớ', 'o'], + ['Ớ', 'O'], + ['ờ', 'o'], + ['Ờ', 'O'], + ['ở', 'o'], + ['Ở', 'O'], + ['ỡ', 'o'], + ['Ỡ', 'O'], + ['ợ', 'o'], + ['Ợ', 'O'], + ['ụ', 'u'], + ['Ụ', 'U'], + ['ủ', 'u'], + ['Ủ', 'U'], + ['ứ', 'u'], + ['Ứ', 'U'], + ['ừ', 'u'], + ['Ừ', 'U'], + ['ử', 'u'], + ['Ử', 'U'], + ['ữ', 'u'], + ['Ữ', 'U'], + ['ự', 'u'], + ['Ự', 'U'], + ['ỳ', 'y'], + ['Ỳ', 'Y'], + ['ỵ', 'y'], + ['Ỵ', 'Y'], + ['ỷ', 'y'], + ['Ỷ', 'Y'], + ['ỹ', 'y'], + ['Ỹ', 'Y'], + + # Arabic + ['ء', 'e'], + ['آ', 'a'], + ['أ', 'a'], + ['ؤ', 'w'], + ['إ', 'i'], + ['ئ', 'y'], + ['ا', 'a'], + ['ب', 'b'], + ['ة', 't'], + ['ت', 't'], + ['ث', 'th'], + ['ج', 'j'], + ['ح', 'h'], + ['خ', 'kh'], + ['د', 'd'], + ['ذ', 'dh'], + ['ر', 'r'], + ['ز', 'z'], + ['س', 's'], + ['ش', 'sh'], + ['ص', 's'], + ['ض', 'd'], + ['ط', 't'], + ['ظ', 'z'], + ['ع', 'e'], + ['غ', 'gh'], + ['ـ', '_'], + ['ف', 'f'], + ['ق', 'q'], + ['ك', 'k'], + ['ل', 'l'], + ['م', 'm'], + ['ن', 'n'], + ['ه', 'h'], + ['و', 'w'], + ['ى', 'a'], + ['ي', 'y'], + ['َ‎', 'a'], + ['ُ', 'u'], + ['ِ‎', 'i'], + ['٠', '0'], + ['١', '1'], + ['٢', '2'], + ['٣', '3'], + ['٤', '4'], + ['٥', '5'], + ['٦', '6'], + ['٧', '7'], + ['٨', '8'], + ['٩', '9'], + + # Persian / Farsi + ['چ', 'ch'], + ['ک', 'k'], + ['گ', 'g'], + ['پ', 'p'], + ['ژ', 'zh'], + ['ی', 'y'], + ['۰', '0'], + ['۱', '1'], + ['۲', '2'], + ['۳', '3'], + ['۴', '4'], + ['۵', '5'], + ['۶', '6'], + ['۷', '7'], + ['۸', '8'], + ['۹', '9'], + + # Pashto + ['ټ', 'p'], + ['ځ', 'z'], + ['څ', 'c'], + ['ډ', 'd'], + ['ﺫ', 'd'], + ['ﺭ', 'r'], + ['ړ', 'r'], + ['ﺯ', 'z'], + ['ږ', 'g'], + ['ښ', 'x'], + ['ګ', 'g'], + ['ڼ', 'n'], + ['ۀ', 'e'], + ['ې', 'e'], + ['ۍ', 'ai'], + + # Urdu + ['ٹ', 't'], + ['ڈ', 'd'], + ['ڑ', 'r'], + ['ں', 'n'], + ['ہ', 'h'], + ['ھ', 'h'], + ['ے', 'e'], + + # Russian + ['А', 'A'], + ['а', 'a'], + ['Б', 'B'], + ['б', 'b'], + ['В', 'V'], + ['в', 'v'], + ['Г', 'G'], + ['г', 'g'], + ['Д', 'D'], + ['д', 'd'], + ['ъе', 'ye'], + ['Ъе', 'Ye'], + ['ъЕ', 'yE'], + ['ЪЕ', 'YE'], + ['Е', 'E'], + ['е', 'e'], + ['Ё', 'Yo'], + ['ё', 'yo'], + ['Ж', 'Zh'], + ['ж', 'zh'], + ['З', 'Z'], + ['з', 'z'], + ['И', 'I'], + ['и', 'i'], + ['ый', 'iy'], + ['Ый', 'Iy'], + ['ЫЙ', 'IY'], + ['ыЙ', 'iY'], + ['Й', 'Y'], + ['й', 'y'], + ['К', 'K'], + ['к', 'k'], + ['Л', 'L'], + ['л', 'l'], + ['М', 'M'], + ['м', 'm'], + ['Н', 'N'], + ['н', 'n'], + ['О', 'O'], + ['о', 'o'], + ['П', 'P'], + ['п', 'p'], + ['Р', 'R'], + ['р', 'r'], + ['С', 'S'], + ['с', 's'], + ['Т', 'T'], + ['т', 't'], + ['У', 'U'], + ['у', 'u'], + ['Ф', 'F'], + ['ф', 'f'], + ['Х', 'Kh'], + ['х', 'kh'], + ['Ц', 'Ts'], + ['ц', 'ts'], + ['Ч', 'Ch'], + ['ч', 'ch'], + ['Ш', 'Sh'], + ['ш', 'sh'], + ['Щ', 'Sch'], + ['щ', 'sch'], + ['Ъ', ''], + ['ъ', ''], + ['Ы', 'Y'], + ['ы', 'y'], + ['Ь', ''], + ['ь', ''], + ['Э', 'E'], + ['э', 'e'], + ['Ю', 'Yu'], + ['ю', 'yu'], + ['Я', 'Ya'], + ['я', 'ya'], + + # Romanian + ['ă', 'a'], + ['Ă', 'A'], + ['ș', 's'], + ['Ș', 'S'], + ['ț', 't'], + ['Ț', 'T'], + ['ţ', 't'], + ['Ţ', 'T'], + + # Turkish + ['ş', 's'], + ['Ş', 'S'], + ['ç', 'c'], + ['Ç', 'C'], + ['ğ', 'g'], + ['Ğ', 'G'], + ['ı', 'i'], + ['İ', 'I'], + + # Armenian + ['ա', 'a'], + ['Ա', 'A'], + ['բ', 'b'], + ['Բ', 'B'], + ['գ', 'g'], + ['Գ', 'G'], + ['դ', 'd'], + ['Դ', 'D'], + ['ե', 'ye'], + ['Ե', 'Ye'], + ['զ', 'z'], + ['Զ', 'Z'], + ['է', 'e'], + ['Է', 
'E'], + ['ը', 'y'], + ['Ը', 'Y'], + ['թ', 't'], + ['Թ', 'T'], + ['ժ', 'zh'], + ['Ժ', 'Zh'], + ['ի', 'i'], + ['Ի', 'I'], + ['լ', 'l'], + ['Լ', 'L'], + ['խ', 'kh'], + ['Խ', 'Kh'], + ['ծ', 'ts'], + ['Ծ', 'Ts'], + ['կ', 'k'], + ['Կ', 'K'], + ['հ', 'h'], + ['Հ', 'H'], + ['ձ', 'dz'], + ['Ձ', 'Dz'], + ['ղ', 'gh'], + ['Ղ', 'Gh'], + ['ճ', 'tch'], + ['Ճ', 'Tch'], + ['մ', 'm'], + ['Մ', 'M'], + ['յ', 'y'], + ['Յ', 'Y'], + ['ն', 'n'], + ['Ն', 'N'], + ['շ', 'sh'], + ['Շ', 'Sh'], + ['ո', 'vo'], + ['Ո', 'Vo'], + ['չ', 'ch'], + ['Չ', 'Ch'], + ['պ', 'p'], + ['Պ', 'P'], + ['ջ', 'j'], + ['Ջ', 'J'], + ['ռ', 'r'], + ['Ռ', 'R'], + ['ս', 's'], + ['Ս', 'S'], + ['վ', 'v'], + ['Վ', 'V'], + ['տ', 't'], + ['Տ', 'T'], + ['ր', 'r'], + ['Ր', 'R'], + ['ց', 'c'], + ['Ց', 'C'], + ['ու', 'u'], + ['ՈՒ', 'U'], + ['Ու', 'U'], + ['փ', 'p'], + ['Փ', 'P'], + ['ք', 'q'], + ['Ք', 'Q'], + ['օ', 'o'], + ['Օ', 'O'], + ['ֆ', 'f'], + ['Ֆ', 'F'], + ['և', 'yev'], + + # Georgian + ['ა', 'a'], + ['ბ', 'b'], + ['გ', 'g'], + ['დ', 'd'], + ['ე', 'e'], + ['ვ', 'v'], + ['ზ', 'z'], + ['თ', 't'], + ['ი', 'i'], + ['კ', 'k'], + ['ლ', 'l'], + ['მ', 'm'], + ['ნ', 'n'], + ['ო', 'o'], + ['პ', 'p'], + ['ჟ', 'zh'], + ['რ', 'r'], + ['ს', 's'], + ['ტ', 't'], + ['უ', 'u'], + ['ფ', 'ph'], + ['ქ', 'q'], + ['ღ', 'gh'], + ['ყ', 'k'], + ['შ', 'sh'], + ['ჩ', 'ch'], + ['ც', 'ts'], + ['ძ', 'dz'], + ['წ', 'ts'], + ['ჭ', 'tch'], + ['ხ', 'kh'], + ['ჯ', 'j'], + ['ჰ', 'h'], + + # Czech + ['č', 'c'], + ['ď', 'd'], + ['ě', 'e'], + ['ň', 'n'], + ['ř', 'r'], + ['š', 's'], + ['ť', 't'], + ['ů', 'u'], + ['ž', 'z'], + ['Č', 'C'], + ['Ď', 'D'], + ['Ě', 'E'], + ['Ň', 'N'], + ['Ř', 'R'], + ['Š', 'S'], + ['Ť', 'T'], + ['Ů', 'U'], + ['Ž', 'Z'], + + # Dhivehi + ['ހ', 'h'], + ['ށ', 'sh'], + ['ނ', 'n'], + ['ރ', 'r'], + ['ބ', 'b'], + ['ޅ', 'lh'], + ['ކ', 'k'], + ['އ', 'a'], + ['ވ', 'v'], + ['މ', 'm'], + ['ފ', 'f'], + ['ދ', 'dh'], + ['ތ', 'th'], + ['ލ', 'l'], + ['ގ', 'g'], + ['ޏ', 'gn'], + ['ސ', 's'], + ['ޑ', 'd'], + ['ޒ', 'z'], + ['ޓ', 't'], + ['ޔ', 'y'], + ['ޕ', 'p'], + ['ޖ', 'j'], + ['ޗ', 'ch'], + ['ޘ', 'tt'], + ['ޙ', 'hh'], + ['ޚ', 'kh'], + ['ޛ', 'th'], + ['ޜ', 'z'], + ['ޝ', 'sh'], + ['ޞ', 's'], + ['ޟ', 'd'], + ['ޠ', 't'], + ['ޡ', 'z'], + ['ޢ', 'a'], + ['ޣ', 'gh'], + ['ޤ', 'q'], + ['ޥ', 'w'], + ['ަ', 'a'], + ['ާ', 'aa'], + ['ި', 'i'], + ['ީ', 'ee'], + ['ު', 'u'], + ['ޫ', 'oo'], + ['ެ', 'e'], + ['ޭ', 'ey'], + ['ޮ', 'o'], + ['ޯ', 'oa'], + ['ް', ''], + + # Greek + ['α', 'a'], + ['β', 'v'], + ['γ', 'g'], + ['δ', 'd'], + ['ε', 'e'], + ['ζ', 'z'], + ['η', 'i'], + ['θ', 'th'], + ['ι', 'i'], + ['κ', 'k'], + ['λ', 'l'], + ['μ', 'm'], + ['ν', 'n'], + ['ξ', 'ks'], + ['ο', 'o'], + ['π', 'p'], + ['ρ', 'r'], + ['σ', 's'], + ['τ', 't'], + ['υ', 'y'], + ['φ', 'f'], + ['χ', 'x'], + ['ψ', 'ps'], + ['ω', 'o'], + ['ά', 'a'], + ['έ', 'e'], + ['ί', 'i'], + ['ό', 'o'], + ['ύ', 'y'], + ['ή', 'i'], + ['ώ', 'o'], + ['ς', 's'], + ['ϊ', 'i'], + ['ΰ', 'y'], + ['ϋ', 'y'], + ['ΐ', 'i'], + ['Α', 'A'], + ['Β', 'B'], + ['Γ', 'G'], + ['Δ', 'D'], + ['Ε', 'E'], + ['Ζ', 'Z'], + ['Η', 'I'], + ['Θ', 'TH'], + ['Ι', 'I'], + ['Κ', 'K'], + ['Λ', 'L'], + ['Μ', 'M'], + ['Ν', 'N'], + ['Ξ', 'KS'], + ['Ο', 'O'], + ['Π', 'P'], + ['Ρ', 'R'], + ['Σ', 'S'], + ['Τ', 'T'], + ['Υ', 'Y'], + ['Φ', 'F'], + ['Χ', 'X'], + ['Ψ', 'PS'], + ['Ω', 'O'], + ['Ά', 'A'], + ['Έ', 'E'], + ['Ί', 'I'], + ['Ό', 'O'], + ['Ύ', 'Y'], + ['Ή', 'I'], + ['Ώ', 'O'], + ['Ϊ', 'I'], + ['Ϋ', 'Y'], + + # Disabled as it conflicts with German and Latin. 
+ # Hungarian + # ['ä', 'a'], + # ['Ä', 'A'], + # ['ö', 'o'], + # ['Ö', 'O'], + # ['ü', 'u'], + # ['Ü', 'U'], + # ['ű', 'u'], + # ['Ű', 'U'], + + # Latvian + ['ā', 'a'], + ['ē', 'e'], + ['ģ', 'g'], + ['ī', 'i'], + ['ķ', 'k'], + ['ļ', 'l'], + ['ņ', 'n'], + ['ū', 'u'], + ['Ā', 'A'], + ['Ē', 'E'], + ['Ģ', 'G'], + ['Ī', 'I'], + ['Ķ', 'K'], + ['Ļ', 'L'], + ['Ņ', 'N'], + ['Ū', 'U'], + ['č', 'c'], + ['š', 's'], + ['ž', 'z'], + ['Č', 'C'], + ['Š', 'S'], + ['Ž', 'Z'], + + # Lithuanian + ['ą', 'a'], + ['č', 'c'], + ['ę', 'e'], + ['ė', 'e'], + ['į', 'i'], + ['š', 's'], + ['ų', 'u'], + ['ū', 'u'], + ['ž', 'z'], + ['Ą', 'A'], + ['Č', 'C'], + ['Ę', 'E'], + ['Ė', 'E'], + ['Į', 'I'], + ['Š', 'S'], + ['Ų', 'U'], + ['Ū', 'U'], + + # Macedonian + ['Ќ', 'Kj'], + ['ќ', 'kj'], + ['Љ', 'Lj'], + ['љ', 'lj'], + ['Њ', 'Nj'], + ['њ', 'nj'], + ['Тс', 'Ts'], + ['тс', 'ts'], + + # Polish + ['ą', 'a'], + ['ć', 'c'], + ['ę', 'e'], + ['ł', 'l'], + ['ń', 'n'], + ['ś', 's'], + ['ź', 'z'], + ['ż', 'z'], + ['Ą', 'A'], + ['Ć', 'C'], + ['Ę', 'E'], + ['Ł', 'L'], + ['Ń', 'N'], + ['Ś', 'S'], + ['Ź', 'Z'], + ['Ż', 'Z'], + + # Disabled as it conflicts with Vietnamese. + # Serbian + # ['љ', 'lj'], + # ['њ', 'nj'], + # ['Љ', 'Lj'], + # ['Њ', 'Nj'], + # ['đ', 'dj'], + # ['Đ', 'Dj'], + # ['ђ', 'dj'], + # ['ј', 'j'], + # ['ћ', 'c'], + # ['џ', 'dz'], + # ['Ђ', 'Dj'], + # ['Ј', 'j'], + # ['Ћ', 'C'], + # ['Џ', 'Dz'], + + # Disabled as it conflicts with German and Latin. + # Slovak + # ['ä', 'a'], + # ['Ä', 'A'], + # ['ľ', 'l'], + # ['ĺ', 'l'], + # ['ŕ', 'r'], + # ['Ľ', 'L'], + # ['Ĺ', 'L'], + # ['Ŕ', 'R'], + + # Disabled as it conflicts with German and Latin. + # Swedish + # ['å', 'o'], + # ['Å', 'o'], + # ['ä', 'a'], + # ['Ä', 'A'], + # ['ë', 'e'], + # ['Ë', 'E'], + # ['ö', 'o'], + # ['Ö', 'O'], + + # Ukrainian + ['Є', 'Ye'], + ['І', 'I'], + ['Ї', 'Yi'], + ['Ґ', 'G'], + ['є', 'ye'], + ['і', 'i'], + ['ї', 'yi'], + ['ґ', 'g'], + + # Dutch + ['IJ', 'IJ'], + ['ij', 'ij'], + + # Danish + # ['Æ', 'Ae'], + # ['Ø', 'Oe'], + # ['Å', 'Aa'], + # ['æ', 'ae'], + # ['ø', 'oe'], + # ['å', 'aa'] + + # Currencies + ['¢', 'c'], + ['¥', 'Y'], + ['߿', 'b'], + ['৳', 't'], + ['૱', 'Bo'], + ['฿', 'B'], + ['₠', 'CE'], + ['₡', 'C'], + ['₢', 'Cr'], + ['₣', 'F'], + ['₥', 'm'], + ['₦', 'N'], + ['₧', 'Pt'], + ['₨', 'Rs'], + ['₩', 'W'], + ['₫', 's'], + ['€', 'E'], + ['₭', 'K'], + ['₮', 'T'], + ['₯', 'Dp'], + ['₰', 'S'], + ['₱', 'P'], + ['₲', 'G'], + ['₳', 'A'], + ['₴', 'S'], + ['₵', 'C'], + ['₶', 'tt'], + ['₷', 'S'], + ['₸', 'T'], + ['₹', 'R'], + ['₺', 'L'], + ['₽', 'P'], + ['₿', 'B'], + ['﹩', '$'], + ['¢', 'c'], + ['¥', 'Y'], + ['₩', 'W'], + + # Latin + ['𝐀', 'A'], + ['𝐁', 'B'], + ['𝐂', 'C'], + ['𝐃', 'D'], + ['𝐄', 'E'], + ['𝐅', 'F'], + ['𝐆', 'G'], + ['𝐇', 'H'], + ['𝐈', 'I'], + ['𝐉', 'J'], + ['𝐊', 'K'], + ['𝐋', 'L'], + ['𝐌', 'M'], + ['𝐍', 'N'], + ['𝐎', 'O'], + ['𝐏', 'P'], + ['𝐐', 'Q'], + ['𝐑', 'R'], + ['𝐒', 'S'], + ['𝐓', 'T'], + ['𝐔', 'U'], + ['𝐕', 'V'], + ['𝐖', 'W'], + ['𝐗', 'X'], + ['𝐘', 'Y'], + ['𝐙', 'Z'], + ['𝐚', 'a'], + ['𝐛', 'b'], + ['𝐜', 'c'], + ['𝐝', 'd'], + ['𝐞', 'e'], + ['𝐟', 'f'], + ['𝐠', 'g'], + ['𝐡', 'h'], + ['𝐢', 'i'], + ['𝐣', 'j'], + ['𝐤', 'k'], + ['𝐥', 'l'], + ['𝐦', 'm'], + ['𝐧', 'n'], + ['𝐨', 'o'], + ['𝐩', 'p'], + ['𝐪', 'q'], + ['𝐫', 'r'], + ['𝐬', 's'], + ['𝐭', 't'], + ['𝐮', 'u'], + ['𝐯', 'v'], + ['𝐰', 'w'], + ['𝐱', 'x'], + ['𝐲', 'y'], + ['𝐳', 'z'], + ['𝐴', 'A'], + ['𝐵', 'B'], + ['𝐶', 'C'], + ['𝐷', 'D'], + ['𝐸', 'E'], + ['𝐹', 'F'], + ['𝐺', 'G'], + ['𝐻', 'H'], + ['𝐼', 'I'], + ['𝐽', 'J'], + ['𝐾', 'K'], + ['𝐿', 'L'], + ['𝑀', 'M'], + ['𝑁', 'N'], + ['𝑂', 
'O'], + ['𝑃', 'P'], + ['𝑄', 'Q'], + ['𝑅', 'R'], + ['𝑆', 'S'], + ['𝑇', 'T'], + ['𝑈', 'U'], + ['𝑉', 'V'], + ['𝑊', 'W'], + ['𝑋', 'X'], + ['𝑌', 'Y'], + ['𝑍', 'Z'], + ['𝑎', 'a'], + ['𝑏', 'b'], + ['𝑐', 'c'], + ['𝑑', 'd'], + ['𝑒', 'e'], + ['𝑓', 'f'], + ['𝑔', 'g'], + ['𝑖', 'i'], + ['𝑗', 'j'], + ['𝑘', 'k'], + ['𝑙', 'l'], + ['𝑚', 'm'], + ['𝑛', 'n'], + ['𝑜', 'o'], + ['𝑝', 'p'], + ['𝑞', 'q'], + ['𝑟', 'r'], + ['𝑠', 's'], + ['𝑡', 't'], + ['𝑢', 'u'], + ['𝑣', 'v'], + ['𝑤', 'w'], + ['𝑥', 'x'], + ['𝑦', 'y'], + ['𝑧', 'z'], + ['𝑨', 'A'], + ['𝑩', 'B'], + ['𝑪', 'C'], + ['𝑫', 'D'], + ['𝑬', 'E'], + ['𝑭', 'F'], + ['𝑮', 'G'], + ['𝑯', 'H'], + ['𝑰', 'I'], + ['𝑱', 'J'], + ['𝑲', 'K'], + ['𝑳', 'L'], + ['𝑴', 'M'], + ['𝑵', 'N'], + ['𝑶', 'O'], + ['𝑷', 'P'], + ['𝑸', 'Q'], + ['𝑹', 'R'], + ['𝑺', 'S'], + ['𝑻', 'T'], + ['𝑼', 'U'], + ['𝑽', 'V'], + ['𝑾', 'W'], + ['𝑿', 'X'], + ['𝒀', 'Y'], + ['𝒁', 'Z'], + ['𝒂', 'a'], + ['𝒃', 'b'], + ['𝒄', 'c'], + ['𝒅', 'd'], + ['𝒆', 'e'], + ['𝒇', 'f'], + ['𝒈', 'g'], + ['𝒉', 'h'], + ['𝒊', 'i'], + ['𝒋', 'j'], + ['𝒌', 'k'], + ['𝒍', 'l'], + ['𝒎', 'm'], + ['𝒏', 'n'], + ['𝒐', 'o'], + ['𝒑', 'p'], + ['𝒒', 'q'], + ['𝒓', 'r'], + ['𝒔', 's'], + ['𝒕', 't'], + ['𝒖', 'u'], + ['𝒗', 'v'], + ['𝒘', 'w'], + ['𝒙', 'x'], + ['𝒚', 'y'], + ['𝒛', 'z'], + ['𝒜', 'A'], + ['𝒞', 'C'], + ['𝒟', 'D'], + ['𝒢', 'g'], + ['𝒥', 'J'], + ['𝒦', 'K'], + ['𝒩', 'N'], + ['𝒪', 'O'], + ['𝒫', 'P'], + ['𝒬', 'Q'], + ['𝒮', 'S'], + ['𝒯', 'T'], + ['𝒰', 'U'], + ['𝒱', 'V'], + ['𝒲', 'W'], + ['𝒳', 'X'], + ['𝒴', 'Y'], + ['𝒵', 'Z'], + ['𝒶', 'a'], + ['𝒷', 'b'], + ['𝒸', 'c'], + ['𝒹', 'd'], + ['𝒻', 'f'], + ['𝒽', 'h'], + ['𝒾', 'i'], + ['𝒿', 'j'], + ['𝓀', 'h'], + ['𝓁', 'l'], + ['𝓂', 'm'], + ['𝓃', 'n'], + ['𝓅', 'p'], + ['𝓆', 'q'], + ['𝓇', 'r'], + ['𝓈', 's'], + ['𝓉', 't'], + ['𝓊', 'u'], + ['𝓋', 'v'], + ['𝓌', 'w'], + ['𝓍', 'x'], + ['𝓎', 'y'], + ['𝓏', 'z'], + ['𝓐', 'A'], + ['𝓑', 'B'], + ['𝓒', 'C'], + ['𝓓', 'D'], + ['𝓔', 'E'], + ['𝓕', 'F'], + ['𝓖', 'G'], + ['𝓗', 'H'], + ['𝓘', 'I'], + ['𝓙', 'J'], + ['𝓚', 'K'], + ['𝓛', 'L'], + ['𝓜', 'M'], + ['𝓝', 'N'], + ['𝓞', 'O'], + ['𝓟', 'P'], + ['𝓠', 'Q'], + ['𝓡', 'R'], + ['𝓢', 'S'], + ['𝓣', 'T'], + ['𝓤', 'U'], + ['𝓥', 'V'], + ['𝓦', 'W'], + ['𝓧', 'X'], + ['𝓨', 'Y'], + ['𝓩', 'Z'], + ['𝓪', 'a'], + ['𝓫', 'b'], + ['𝓬', 'c'], + ['𝓭', 'd'], + ['𝓮', 'e'], + ['𝓯', 'f'], + ['𝓰', 'g'], + ['𝓱', 'h'], + ['𝓲', 'i'], + ['𝓳', 'j'], + ['𝓴', 'k'], + ['𝓵', 'l'], + ['𝓶', 'm'], + ['𝓷', 'n'], + ['𝓸', 'o'], + ['𝓹', 'p'], + ['𝓺', 'q'], + ['𝓻', 'r'], + ['𝓼', 's'], + ['𝓽', 't'], + ['𝓾', 'u'], + ['𝓿', 'v'], + ['𝔀', 'w'], + ['𝔁', 'x'], + ['𝔂', 'y'], + ['𝔃', 'z'], + ['𝔄', 'A'], + ['𝔅', 'B'], + ['𝔇', 'D'], + ['𝔈', 'E'], + ['𝔉', 'F'], + ['𝔊', 'G'], + ['𝔍', 'J'], + ['𝔎', 'K'], + ['𝔏', 'L'], + ['𝔐', 'M'], + ['𝔑', 'N'], + ['𝔒', 'O'], + ['𝔓', 'P'], + ['𝔔', 'Q'], + ['𝔖', 'S'], + ['𝔗', 'T'], + ['𝔘', 'U'], + ['𝔙', 'V'], + ['𝔚', 'W'], + ['𝔛', 'X'], + ['𝔜', 'Y'], + ['𝔞', 'a'], + ['𝔟', 'b'], + ['𝔠', 'c'], + ['𝔡', 'd'], + ['𝔢', 'e'], + ['𝔣', 'f'], + ['𝔤', 'g'], + ['𝔥', 'h'], + ['𝔦', 'i'], + ['𝔧', 'j'], + ['𝔨', 'k'], + ['𝔩', 'l'], + ['𝔪', 'm'], + ['𝔫', 'n'], + ['𝔬', 'o'], + ['𝔭', 'p'], + ['𝔮', 'q'], + ['𝔯', 'r'], + ['𝔰', 's'], + ['𝔱', 't'], + ['𝔲', 'u'], + ['𝔳', 'v'], + ['𝔴', 'w'], + ['𝔵', 'x'], + ['𝔶', 'y'], + ['𝔷', 'z'], + ['𝔸', 'A'], + ['𝔹', 'B'], + ['𝔻', 'D'], + ['𝔼', 'E'], + ['𝔽', 'F'], + ['𝔾', 'G'], + ['𝕀', 'I'], + ['𝕁', 'J'], + ['𝕂', 'K'], + ['𝕃', 'L'], + ['𝕄', 'M'], + ['𝕆', 'N'], + ['𝕊', 'S'], + ['𝕋', 'T'], + ['𝕌', 'U'], + ['𝕍', 'V'], + ['𝕎', 'W'], + ['𝕏', 'X'], + ['𝕐', 'Y'], + ['𝕒', 'a'], + ['𝕓', 'b'], + ['𝕔', 'c'], + ['𝕕', 'd'], + ['𝕖', 'e'], + ['𝕗', 'f'], + 
['𝕘', 'g'], + ['𝕙', 'h'], + ['𝕚', 'i'], + ['𝕛', 'j'], + ['𝕜', 'k'], + ['𝕝', 'l'], + ['𝕞', 'm'], + ['𝕟', 'n'], + ['𝕠', 'o'], + ['𝕡', 'p'], + ['𝕢', 'q'], + ['𝕣', 'r'], + ['𝕤', 's'], + ['𝕥', 't'], + ['𝕦', 'u'], + ['𝕧', 'v'], + ['𝕨', 'w'], + ['𝕩', 'x'], + ['𝕪', 'y'], + ['𝕫', 'z'], + ['𝕬', 'A'], + ['𝕭', 'B'], + ['𝕮', 'C'], + ['𝕯', 'D'], + ['𝕰', 'E'], + ['𝕱', 'F'], + ['𝕲', 'G'], + ['𝕳', 'H'], + ['𝕴', 'I'], + ['𝕵', 'J'], + ['𝕶', 'K'], + ['𝕷', 'L'], + ['𝕸', 'M'], + ['𝕹', 'N'], + ['𝕺', 'O'], + ['𝕻', 'P'], + ['𝕼', 'Q'], + ['𝕽', 'R'], + ['𝕾', 'S'], + ['𝕿', 'T'], + ['𝖀', 'U'], + ['𝖁', 'V'], + ['𝖂', 'W'], + ['𝖃', 'X'], + ['𝖄', 'Y'], + ['𝖅', 'Z'], + ['𝖆', 'a'], + ['𝖇', 'b'], + ['𝖈', 'c'], + ['𝖉', 'd'], + ['𝖊', 'e'], + ['𝖋', 'f'], + ['𝖌', 'g'], + ['𝖍', 'h'], + ['𝖎', 'i'], + ['𝖏', 'j'], + ['𝖐', 'k'], + ['𝖑', 'l'], + ['𝖒', 'm'], + ['𝖓', 'n'], + ['𝖔', 'o'], + ['𝖕', 'p'], + ['𝖖', 'q'], + ['𝖗', 'r'], + ['𝖘', 's'], + ['𝖙', 't'], + ['𝖚', 'u'], + ['𝖛', 'v'], + ['𝖜', 'w'], + ['𝖝', 'x'], + ['𝖞', 'y'], + ['𝖟', 'z'], + ['𝖠', 'A'], + ['𝖡', 'B'], + ['𝖢', 'C'], + ['𝖣', 'D'], + ['𝖤', 'E'], + ['𝖥', 'F'], + ['𝖦', 'G'], + ['𝖧', 'H'], + ['𝖨', 'I'], + ['𝖩', 'J'], + ['𝖪', 'K'], + ['𝖫', 'L'], + ['𝖬', 'M'], + ['𝖭', 'N'], + ['𝖮', 'O'], + ['𝖯', 'P'], + ['𝖰', 'Q'], + ['𝖱', 'R'], + ['𝖲', 'S'], + ['𝖳', 'T'], + ['𝖴', 'U'], + ['𝖵', 'V'], + ['𝖶', 'W'], + ['𝖷', 'X'], + ['𝖸', 'Y'], + ['𝖹', 'Z'], + ['𝖺', 'a'], + ['𝖻', 'b'], + ['𝖼', 'c'], + ['𝖽', 'd'], + ['𝖾', 'e'], + ['𝖿', 'f'], + ['𝗀', 'g'], + ['𝗁', 'h'], + ['𝗂', 'i'], + ['𝗃', 'j'], + ['𝗄', 'k'], + ['𝗅', 'l'], + ['𝗆', 'm'], + ['𝗇', 'n'], + ['𝗈', 'o'], + ['𝗉', 'p'], + ['𝗊', 'q'], + ['𝗋', 'r'], + ['𝗌', 's'], + ['𝗍', 't'], + ['𝗎', 'u'], + ['𝗏', 'v'], + ['𝗐', 'w'], + ['𝗑', 'x'], + ['𝗒', 'y'], + ['𝗓', 'z'], + ['𝗔', 'A'], + ['𝗕', 'B'], + ['𝗖', 'C'], + ['𝗗', 'D'], + ['𝗘', 'E'], + ['𝗙', 'F'], + ['𝗚', 'G'], + ['𝗛', 'H'], + ['𝗜', 'I'], + ['𝗝', 'J'], + ['𝗞', 'K'], + ['𝗟', 'L'], + ['𝗠', 'M'], + ['𝗡', 'N'], + ['𝗢', 'O'], + ['𝗣', 'P'], + ['𝗤', 'Q'], + ['𝗥', 'R'], + ['𝗦', 'S'], + ['𝗧', 'T'], + ['𝗨', 'U'], + ['𝗩', 'V'], + ['𝗪', 'W'], + ['𝗫', 'X'], + ['𝗬', 'Y'], + ['𝗭', 'Z'], + ['𝗮', 'a'], + ['𝗯', 'b'], + ['𝗰', 'c'], + ['𝗱', 'd'], + ['𝗲', 'e'], + ['𝗳', 'f'], + ['𝗴', 'g'], + ['𝗵', 'h'], + ['𝗶', 'i'], + ['𝗷', 'j'], + ['𝗸', 'k'], + ['𝗹', 'l'], + ['𝗺', 'm'], + ['𝗻', 'n'], + ['𝗼', 'o'], + ['𝗽', 'p'], + ['𝗾', 'q'], + ['𝗿', 'r'], + ['𝘀', 's'], + ['𝘁', 't'], + ['𝘂', 'u'], + ['𝘃', 'v'], + ['𝘄', 'w'], + ['𝘅', 'x'], + ['𝘆', 'y'], + ['𝘇', 'z'], + ['𝘈', 'A'], + ['𝘉', 'B'], + ['𝘊', 'C'], + ['𝘋', 'D'], + ['𝘌', 'E'], + ['𝘍', 'F'], + ['𝘎', 'G'], + ['𝘏', 'H'], + ['𝘐', 'I'], + ['𝘑', 'J'], + ['𝘒', 'K'], + ['𝘓', 'L'], + ['𝘔', 'M'], + ['𝘕', 'N'], + ['𝘖', 'O'], + ['𝘗', 'P'], + ['𝘘', 'Q'], + ['𝘙', 'R'], + ['𝘚', 'S'], + ['𝘛', 'T'], + ['𝘜', 'U'], + ['𝘝', 'V'], + ['𝘞', 'W'], + ['𝘟', 'X'], + ['𝘠', 'Y'], + ['𝘡', 'Z'], + ['𝘢', 'a'], + ['𝘣', 'b'], + ['𝘤', 'c'], + ['𝘥', 'd'], + ['𝘦', 'e'], + ['𝘧', 'f'], + ['𝘨', 'g'], + ['𝘩', 'h'], + ['𝘪', 'i'], + ['𝘫', 'j'], + ['𝘬', 'k'], + ['𝘭', 'l'], + ['𝘮', 'm'], + ['𝘯', 'n'], + ['𝘰', 'o'], + ['𝘱', 'p'], + ['𝘲', 'q'], + ['𝘳', 'r'], + ['𝘴', 's'], + ['𝘵', 't'], + ['𝘶', 'u'], + ['𝘷', 'v'], + ['𝘸', 'w'], + ['𝘹', 'x'], + ['𝘺', 'y'], + ['𝘻', 'z'], + ['𝘼', 'A'], + ['𝘽', 'B'], + ['𝘾', 'C'], + ['𝘿', 'D'], + ['𝙀', 'E'], + ['𝙁', 'F'], + ['𝙂', 'G'], + ['𝙃', 'H'], + ['𝙄', 'I'], + ['𝙅', 'J'], + ['𝙆', 'K'], + ['𝙇', 'L'], + ['𝙈', 'M'], + ['𝙉', 'N'], + ['𝙊', 'O'], + ['𝙋', 'P'], + ['𝙌', 'Q'], + ['𝙍', 'R'], + ['𝙎', 'S'], + ['𝙏', 'T'], + ['𝙐', 'U'], + ['𝙑', 'V'], + ['𝙒', 'W'], + ['𝙓', 'X'], + ['𝙔', 'Y'], + ['𝙕', 'Z'], 
+ ['𝙖', 'a'], + ['𝙗', 'b'], + ['𝙘', 'c'], + ['𝙙', 'd'], + ['𝙚', 'e'], + ['𝙛', 'f'], + ['𝙜', 'g'], + ['𝙝', 'h'], + ['𝙞', 'i'], + ['𝙟', 'j'], + ['𝙠', 'k'], + ['𝙡', 'l'], + ['𝙢', 'm'], + ['𝙣', 'n'], + ['𝙤', 'o'], + ['𝙥', 'p'], + ['𝙦', 'q'], + ['𝙧', 'r'], + ['𝙨', 's'], + ['𝙩', 't'], + ['𝙪', 'u'], + ['𝙫', 'v'], + ['𝙬', 'w'], + ['𝙭', 'x'], + ['𝙮', 'y'], + ['𝙯', 'z'], + ['𝙰', 'A'], + ['𝙱', 'B'], + ['𝙲', 'C'], + ['𝙳', 'D'], + ['𝙴', 'E'], + ['𝙵', 'F'], + ['𝙶', 'G'], + ['𝙷', 'H'], + ['𝙸', 'I'], + ['𝙹', 'J'], + ['𝙺', 'K'], + ['𝙻', 'L'], + ['𝙼', 'M'], + ['𝙽', 'N'], + ['𝙾', 'O'], + ['𝙿', 'P'], + ['𝚀', 'Q'], + ['𝚁', 'R'], + ['𝚂', 'S'], + ['𝚃', 'T'], + ['𝚄', 'U'], + ['𝚅', 'V'], + ['𝚆', 'W'], + ['𝚇', 'X'], + ['𝚈', 'Y'], + ['𝚉', 'Z'], + ['𝚊', 'a'], + ['𝚋', 'b'], + ['𝚌', 'c'], + ['𝚍', 'd'], + ['𝚎', 'e'], + ['𝚏', 'f'], + ['𝚐', 'g'], + ['𝚑', 'h'], + ['𝚒', 'i'], + ['𝚓', 'j'], + ['𝚔', 'k'], + ['𝚕', 'l'], + ['𝚖', 'm'], + ['𝚗', 'n'], + ['𝚘', 'o'], + ['𝚙', 'p'], + ['𝚚', 'q'], + ['𝚛', 'r'], + ['𝚜', 's'], + ['𝚝', 't'], + ['𝚞', 'u'], + ['𝚟', 'v'], + ['𝚠', 'w'], + ['𝚡', 'x'], + ['𝚢', 'y'], + ['𝚣', 'z'], + + # Dotless letters + ['𝚤', 'l'], + ['𝚥', 'j'], + + # Greek + ['𝛢', 'A'], + ['𝛣', 'B'], + ['𝛤', 'G'], + ['𝛥', 'D'], + ['𝛦', 'E'], + ['𝛧', 'Z'], + ['𝛨', 'I'], + ['𝛩', 'TH'], + ['𝛪', 'I'], + ['𝛫', 'K'], + ['𝛬', 'L'], + ['𝛭', 'M'], + ['𝛮', 'N'], + ['𝛯', 'KS'], + ['𝛰', 'O'], + ['𝛱', 'P'], + ['𝛲', 'R'], + ['𝛳', 'TH'], + ['𝛴', 'S'], + ['𝛵', 'T'], + ['𝛶', 'Y'], + ['𝛷', 'F'], + ['𝛸', 'x'], + ['𝛹', 'PS'], + ['𝛺', 'O'], + ['𝛻', 'D'], + ['𝛼', 'a'], + ['𝛽', 'b'], + ['𝛾', 'g'], + ['𝛿', 'd'], + ['𝜀', 'e'], + ['𝜁', 'z'], + ['𝜂', 'i'], + ['𝜃', 'th'], + ['𝜄', 'i'], + ['𝜅', 'k'], + ['𝜆', 'l'], + ['𝜇', 'm'], + ['𝜈', 'n'], + ['𝜉', 'ks'], + ['𝜊', 'o'], + ['𝜋', 'p'], + ['𝜌', 'r'], + ['𝜍', 's'], + ['𝜎', 's'], + ['𝜏', 't'], + ['𝜐', 'y'], + ['𝜑', 'f'], + ['𝜒', 'x'], + ['𝜓', 'ps'], + ['𝜔', 'o'], + ['𝜕', 'd'], + ['𝜖', 'E'], + ['𝜗', 'TH'], + ['𝜘', 'K'], + ['𝜙', 'f'], + ['𝜚', 'r'], + ['𝜛', 'p'], + ['𝜜', 'A'], + ['𝜝', 'V'], + ['𝜞', 'G'], + ['𝜟', 'D'], + ['𝜠', 'E'], + ['𝜡', 'Z'], + ['𝜢', 'I'], + ['𝜣', 'TH'], + ['𝜤', 'I'], + ['𝜥', 'K'], + ['𝜦', 'L'], + ['𝜧', 'M'], + ['𝜨', 'N'], + ['𝜩', 'KS'], + ['𝜪', 'O'], + ['𝜫', 'P'], + ['𝜬', 'S'], + ['𝜭', 'TH'], + ['𝜮', 'S'], + ['𝜯', 'T'], + ['𝜰', 'Y'], + ['𝜱', 'F'], + ['𝜲', 'X'], + ['𝜳', 'PS'], + ['𝜴', 'O'], + ['𝜵', 'D'], + ['𝜶', 'a'], + ['𝜷', 'v'], + ['𝜸', 'g'], + ['𝜹', 'd'], + ['𝜺', 'e'], + ['𝜻', 'z'], + ['𝜼', 'i'], + ['𝜽', 'th'], + ['𝜾', 'i'], + ['𝜿', 'k'], + ['𝝀', 'l'], + ['𝝁', 'm'], + ['𝝂', 'n'], + ['𝝃', 'ks'], + ['𝝄', 'o'], + ['𝝅', 'p'], + ['𝝆', 'r'], + ['𝝇', 's'], + ['𝝈', 's'], + ['𝝉', 't'], + ['𝝊', 'y'], + ['𝝋', 'f'], + ['𝝌', 'x'], + ['𝝍', 'ps'], + ['𝝎', 'o'], + ['𝝏', 'a'], + ['𝝐', 'e'], + ['𝝑', 'i'], + ['𝝒', 'k'], + ['𝝓', 'f'], + ['𝝔', 'r'], + ['𝝕', 'p'], + ['𝝖', 'A'], + ['𝝗', 'B'], + ['𝝘', 'G'], + ['𝝙', 'D'], + ['𝝚', 'E'], + ['𝝛', 'Z'], + ['𝝜', 'I'], + ['𝝝', 'TH'], + ['𝝞', 'I'], + ['𝝟', 'K'], + ['𝝠', 'L'], + ['𝝡', 'M'], + ['𝝢', 'N'], + ['𝝣', 'KS'], + ['𝝤', 'O'], + ['𝝥', 'P'], + ['𝝦', 'R'], + ['𝝧', 'TH'], + ['𝝨', 'S'], + ['𝝩', 'T'], + ['𝝪', 'Y'], + ['𝝫', 'F'], + ['𝝬', 'X'], + ['𝝭', 'PS'], + ['𝝮', 'O'], + ['𝝯', 'D'], + ['𝝰', 'a'], + ['𝝱', 'v'], + ['𝝲', 'g'], + ['𝝳', 'd'], + ['𝝴', 'e'], + ['𝝵', 'z'], + ['𝝶', 'i'], + ['𝝷', 'th'], + ['𝝸', 'i'], + ['𝝹', 'k'], + ['𝝺', 'l'], + ['𝝻', 'm'], + ['𝝼', 'n'], + ['𝝽', 'ks'], + ['𝝾', 'o'], + ['𝝿', 'p'], + ['𝞀', 'r'], + ['𝞁', 's'], + ['𝞂', 's'], + ['𝞃', 't'], + ['𝞄', 'y'], + ['𝞅', 'f'], + ['𝞆', 'x'], + ['𝞇', 'ps'], + ['𝞈', 'o'], + ['𝞉', 'a'], + ['𝞊', 'e'], + ['𝞋', 
'i'], + ['𝞌', 'k'], + ['𝞍', 'f'], + ['𝞎', 'r'], + ['𝞏', 'p'], + ['𝞐', 'A'], + ['𝞑', 'V'], + ['𝞒', 'G'], + ['𝞓', 'D'], + ['𝞔', 'E'], + ['𝞕', 'Z'], + ['𝞖', 'I'], + ['𝞗', 'TH'], + ['𝞘', 'I'], + ['𝞙', 'K'], + ['𝞚', 'L'], + ['𝞛', 'M'], + ['𝞜', 'N'], + ['𝞝', 'KS'], + ['𝞞', 'O'], + ['𝞟', 'P'], + ['𝞠', 'S'], + ['𝞡', 'TH'], + ['𝞢', 'S'], + ['𝞣', 'T'], + ['𝞤', 'Y'], + ['𝞥', 'F'], + ['𝞦', 'X'], + ['𝞧', 'PS'], + ['𝞨', 'O'], + ['𝞩', 'D'], + ['𝞪', 'av'], + ['𝞫', 'g'], + ['𝞬', 'd'], + ['𝞭', 'e'], + ['𝞮', 'z'], + ['𝞯', 'i'], + ['𝞰', 'i'], + ['𝞱', 'th'], + ['𝞲', 'i'], + ['𝞳', 'k'], + ['𝞴', 'l'], + ['𝞵', 'm'], + ['𝞶', 'n'], + ['𝞷', 'ks'], + ['𝞸', 'o'], + ['𝞹', 'p'], + ['𝞺', 'r'], + ['𝞻', 's'], + ['𝞼', 's'], + ['𝞽', 't'], + ['𝞾', 'y'], + ['𝞿', 'f'], + ['𝟀', 'x'], + ['𝟁', 'ps'], + ['𝟂', 'o'], + ['𝟃', 'a'], + ['𝟄', 'e'], + ['𝟅', 'i'], + ['𝟆', 'k'], + ['𝟇', 'f'], + ['𝟈', 'r'], + ['𝟉', 'p'], + ['𝟊', 'F'], + ['𝟋', 'f'], + ['⒜', '(a)'], + ['⒝', '(b)'], + ['⒞', '(c)'], + ['⒟', '(d)'], + ['⒠', '(e)'], + ['⒡', '(f)'], + ['⒢', '(g)'], + ['⒣', '(h)'], + ['⒤', '(i)'], + ['⒥', '(j)'], + ['⒦', '(k)'], + ['⒧', '(l)'], + ['⒨', '(m)'], + ['⒩', '(n)'], + ['⒪', '(o)'], + ['⒫', '(p)'], + ['⒬', '(q)'], + ['⒭', '(r)'], + ['⒮', '(s)'], + ['⒯', '(t)'], + ['⒰', '(u)'], + ['⒱', '(v)'], + ['⒲', '(w)'], + ['⒳', '(x)'], + ['⒴', '(y)'], + ['⒵', '(z)'], + ['Ⓐ', '(A)'], + ['Ⓑ', '(B)'], + ['Ⓒ', '(C)'], + ['Ⓓ', '(D)'], + ['Ⓔ', '(E)'], + ['Ⓕ', '(F)'], + ['Ⓖ', '(G)'], + ['Ⓗ', '(H)'], + ['Ⓘ', '(I)'], + ['Ⓙ', '(J)'], + ['Ⓚ', '(K)'], + ['Ⓛ', '(L)'], + ['Ⓝ', '(N)'], + ['Ⓞ', '(O)'], + ['Ⓟ', '(P)'], + ['Ⓠ', '(Q)'], + ['Ⓡ', '(R)'], + ['Ⓢ', '(S)'], + ['Ⓣ', '(T)'], + ['Ⓤ', '(U)'], + ['Ⓥ', '(V)'], + ['Ⓦ', '(W)'], + ['Ⓧ', '(X)'], + ['Ⓨ', '(Y)'], + ['Ⓩ', '(Z)'], + ['ⓐ', '(a)'], + ['ⓑ', '(b)'], + ['ⓒ', '(b)'], + ['ⓓ', '(c)'], + ['ⓔ', '(e)'], + ['ⓕ', '(f)'], + ['ⓖ', '(g)'], + ['ⓗ', '(h)'], + ['ⓘ', '(i)'], + ['ⓙ', '(j)'], + ['ⓚ', '(k)'], + ['ⓛ', '(l)'], + ['ⓜ', '(m)'], + ['ⓝ', '(n)'], + ['ⓞ', '(o)'], + ['ⓟ', '(p)'], + ['ⓠ', '(q)'], + ['ⓡ', '(r)'], + ['ⓢ', '(s)'], + ['ⓣ', '(t)'], + ['ⓤ', '(u)'], + ['ⓥ', '(v)'], + ['ⓦ', '(w)'], + ['ⓧ', '(x)'], + ['ⓨ', '(y)'], + ['ⓩ', '(z)'], + + # Numbers + ['𝟎', '0'], + ['𝟏', '1'], + ['𝟐', '2'], + ['𝟑', '3'], + ['𝟒', '4'], + ['𝟓', '5'], + ['𝟔', '6'], + ['𝟕', '7'], + ['𝟖', '8'], + ['𝟗', '9'], + ['𝟘', '0'], + ['𝟙', '1'], + ['𝟚', '2'], + ['𝟛', '3'], + ['𝟜', '4'], + ['𝟝', '5'], + ['𝟞', '6'], + ['𝟟', '7'], + ['𝟠', '8'], + ['𝟡', '9'], + ['𝟢', '0'], + ['𝟣', '1'], + ['𝟤', '2'], + ['𝟥', '3'], + ['𝟦', '4'], + ['𝟧', '5'], + ['𝟨', '6'], + ['𝟩', '7'], + ['𝟪', '8'], + ['𝟫', '9'], + ['𝟬', '0'], + ['𝟭', '1'], + ['𝟮', '2'], + ['𝟯', '3'], + ['𝟰', '4'], + ['𝟱', '5'], + ['𝟲', '6'], + ['𝟳', '7'], + ['𝟴', '8'], + ['𝟵', '9'], + ['𝟶', '0'], + ['𝟷', '1'], + ['𝟸', '2'], + ['𝟹', '3'], + ['𝟺', '4'], + ['𝟻', '5'], + ['𝟼', '6'], + ['𝟽', '7'], + ['𝟾', '8'], + ['𝟿', '9'], + ['①', '1'], + ['②', '2'], + ['③', '3'], + ['④', '4'], + ['⑤', '5'], + ['⑥', '6'], + ['⑦', '7'], + ['⑧', '8'], + ['⑨', '9'], + ['⑩', '10'], + ['⑪', '11'], + ['⑫', '12'], + ['⑬', '13'], + ['⑭', '14'], + ['⑮', '15'], + ['⑯', '16'], + ['⑰', '17'], + ['⑱', '18'], + ['⑲', '19'], + ['⑳', '20'], + ['⑴', '1'], + ['⑵', '2'], + ['⑶', '3'], + ['⑷', '4'], + ['⑸', '5'], + ['⑹', '6'], + ['⑺', '7'], + ['⑻', '8'], + ['⑼', '9'], + ['⑽', '10'], + ['⑾', '11'], + ['⑿', '12'], + ['⒀', '13'], + ['⒁', '14'], + ['⒂', '15'], + ['⒃', '16'], + ['⒄', '17'], + ['⒅', '18'], + ['⒆', '19'], + ['⒇', '20'], + ['⒈', '1.'], + ['⒉', '2.'], + ['⒊', '3.'], + ['⒋', '4.'], + ['⒌', '5.'], + ['⒍', '6.'], + ['⒎', '7.'], + ['⒏', 
'8.'], + ['⒐', '9.'], + ['⒑', '10.'], + ['⒒', '11.'], + ['⒓', '12.'], + ['⒔', '13.'], + ['⒕', '14.'], + ['⒖', '15.'], + ['⒗', '16.'], + ['⒘', '17.'], + ['⒙', '18.'], + ['⒚', '19.'], + ['⒛', '20.'], + ['⓪', '0'], + ['⓫', '11'], + ['⓬', '12'], + ['⓭', '13'], + ['⓮', '14'], + ['⓯', '15'], + ['⓰', '16'], + ['⓱', '17'], + ['⓲', '18'], + ['⓳', '19'], + ['⓴', '20'], + ['⓵', '1'], + ['⓶', '2'], + ['⓷', '3'], + ['⓸', '4'], + ['⓹', '5'], + ['⓺', '6'], + ['⓻', '7'], + ['⓼', '8'], + ['⓽', '9'], + ['⓾', '10'], + ['⓿', '0'], + + # Punctuation + ['🙰', '&'], + ['🙱', '&'], + ['🙲', '&'], + ['🙳', '&'], + ['🙴', '&'], + ['🙵', '&'], + ['🙶', '"'], + ['🙷', '"'], + ['🙸', '"'], + ['‽', '?!'], + ['🙹', '?!'], + ['🙺', '?!'], + ['🙻', '?!'], + ['🙼', '/'], + ['🙽', '\\'], + + # Alchemy + ['🜇', 'AR'], + ['🜈', 'V'], + ['🜉', 'V'], + ['🜆', 'VR'], + ['🜅', 'VF'], + ['🜩', '2'], + ['🜪', '5'], + ['🝡', 'f'], + ['🝢', 'W'], + ['🝣', 'U'], + ['🝧', 'V'], + ['🝨', 'T'], + ['🝪', 'V'], + ['🝫', 'MB'], + ['🝬', 'VB'], + ['🝲', '3B'], + ['🝳', '3B'], + + # Emojis + ['💯', '100'], + ['🔙', 'BACK'], + ['🔚', 'END'], + ['🔛', 'ON!'], + ['🔜', 'SOON'], + ['🔝', 'TOP'], + ['🔞', '18'], + ['🔤', 'abc'], + ['🔠', 'ABCD'], + ['🔡', 'abcd'], + ['🔢', '1234'], + ['🔣', 'T&@%'], + ['#️⃣', '#'], + ['*️⃣', '*'], + ['0️⃣', '0'], + ['1️⃣', '1'], + ['2️⃣', '2'], + ['3️⃣', '3'], + ['4️⃣', '4'], + ['5️⃣', '5'], + ['6️⃣', '6'], + ['7️⃣', '7'], + ['8️⃣', '8'], + ['9️⃣', '9'], + ['🔟', '10'], + ['🅰️', 'A'], + ['🅱️', 'B'], + ['🆎', 'AB'], + ['🆑', 'CL'], + ['🅾️', 'O'], + ['🅿', 'P'], + ['🆘', 'SOS'], + ['🅲', 'C'], + ['🅳', 'D'], + ['🅴', 'E'], + ['🅵', 'F'], + ['🅶', 'G'], + ['🅷', 'H'], + ['🅸', 'I'], + ['🅹', 'J'], + ['🅺', 'K'], + ['🅻', 'L'], + ['🅼', 'M'], + ['🅽', 'N'], + ['🆀', 'Q'], + ['🆁', 'R'], + ['🆂', 'S'], + ['🆃', 'T'], + ['🆄', 'U'], + ['🆅', 'V'], + ['🆆', 'W'], + ['🆇', 'X'], + ['🆈', 'Y'], + ['🆉', 'Z'], +] diff --git a/common/utils.py b/common/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2dc2e7e558031a0c604f18541960ae051910d54e --- /dev/null +++ b/common/utils.py @@ -0,0 +1,293 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MIT License +# +# Copyright (c) 2020 Jungil Kong +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# The following functions/classes were based on code from https://github.com/jik876/hifi-gan: +# init_weights, get_padding, AttrDict + +import ctypes +import glob +import os +import re +import shutil +import warnings +from collections import defaultdict, OrderedDict +from pathlib import Path +from typing import Optional + +import librosa +import numpy as np + +import torch +import torch.distributed as dist +from scipy.io.wavfile import read + + +def mask_from_lens(lens, max_len: Optional[int] = None): + if max_len is None: + max_len = lens.max() + ids = torch.arange(0, max_len, device=lens.device, dtype=lens.dtype) + mask = torch.lt(ids, lens.unsqueeze(1)) + return mask + + +def load_wav(full_path, torch_tensor=False): + import soundfile # flac + data, sampling_rate = soundfile.read(full_path, dtype='int16') + if torch_tensor: + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + else: + return data, sampling_rate + + +def load_wav_to_torch(full_path, force_sampling_rate=None): + if force_sampling_rate is not None: + data, sampling_rate = librosa.load(full_path, sr=force_sampling_rate) + else: + sampling_rate, data = read(full_path) + + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(dataset_path, fnames, has_speakers=False, split="|"): + def split_line(root, line): + parts = line.strip().split(split) + if has_speakers: + #ANT: is this ok? + paths, non_paths = parts[:2], parts[2:] + #paths, non_paths = parts[:-2], parts[-2:] + else: + paths, non_paths = parts[:-1], parts[-1:] + return tuple(str(Path(root, p)) for p in paths) + tuple(non_paths) + + fpaths_and_text = [] + for fname in fnames: + with open(fname, encoding='utf-8') as f: + fpaths_and_text += [split_line(dataset_path, line) for line in f] + return fpaths_and_text + + +def to_gpu(x): + x = x.contiguous() + return x.cuda(non_blocking=True) if torch.cuda.is_available() else x + + +def l2_promote(): + _libcudart = ctypes.CDLL('libcudart.so') + # Set device limit on the current device + # cudaLimitMaxL2FetchGranularity = 0x05 + pValue = ctypes.cast((ctypes.c_int*1)(), ctypes.POINTER(ctypes.c_int)) + _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128)) + _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05)) + assert pValue.contents.value == 128 + + +def prepare_tmp(path): + if path is None: + return + p = Path(path) + if p.is_dir(): + warnings.warn(f'{p} exists. 
Removing...') + shutil.rmtree(p, ignore_errors=True) + p.mkdir(parents=False, exist_ok=False) + + +def print_once(*msg): + if not dist.is_initialized() or dist.get_rank() == 0: + print(*msg) + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +class DefaultAttrDict(defaultdict): + def __init__(self, *args, **kwargs): + super(DefaultAttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + def __getattr__(self, item): + return self[item] + + +class BenchmarkStats: + """ Tracks statistics used for benchmarking. """ + def __init__(self): + self.num_frames = [] + self.losses = [] + self.mel_losses = [] + self.took = [] + + def update(self, num_frames, losses, mel_losses, took): + self.num_frames.append(num_frames) + self.losses.append(losses) + self.mel_losses.append(mel_losses) + self.took.append(took) + + def get(self, n_epochs): + frames_s = sum(self.num_frames[-n_epochs:]) / sum(self.took[-n_epochs:]) + return {'frames/s': frames_s, + 'loss': np.mean(self.losses[-n_epochs:]), + 'mel_loss': np.mean(self.mel_losses[-n_epochs:]), + 'took': np.mean(self.took[-n_epochs:]), + 'benchmark_epochs_num': n_epochs} + + def __len__(self): + return len(self.losses) + + +class Checkpointer: + + def __init__(self, save_dir, keep_milestones=[]): + self.save_dir = save_dir + self.keep_milestones = keep_milestones + + find = lambda name: [ + (int(re.search("_(\d+).pt", fn).group(1)), fn) + for fn in glob.glob(f"{save_dir}/{name}_checkpoint_*.pt")] + + tracked = sorted(find("FastPitch"), key=lambda t: t[0]) + self.tracked = OrderedDict(tracked) + + def last_checkpoint(self, output): + + def corrupted(fpath): + try: + torch.load(fpath, map_location="cpu") + return False + except: + warnings.warn(f"Cannot load {fpath}") + return True + + saved = sorted( + glob.glob(f"{output}/FastPitch_checkpoint_*.pt"), + key=lambda f: int(re.search("_(\d+).pt", f).group(1))) + + if len(saved) >= 1 and not corrupted(saved[-1]): + return saved[-1] + elif len(saved) >= 2: + return saved[-2] + else: + return None + + def maybe_load(self, model, optimizer, scaler, train_state, args, + ema_model=None): + + assert args.checkpoint_path is None or args.resume is False, ( + "Specify a single checkpoint source") + + fpath = None + if args.checkpoint_path is not None: + fpath = args.checkpoint_path + self.tracked = OrderedDict() # Do not track/delete prev ckpts + elif args.resume: + fpath = self.last_checkpoint(args.output) + + if fpath is None: + return + + print_once(f"Loading model and optimizer state from {fpath}") + ckpt = torch.load(fpath, map_location="cpu") + train_state["epoch"] = ckpt["epoch"] + 1 + train_state["total_iter"] = ckpt["iteration"] + + no_pref = lambda sd: {re.sub("^module.", "", k): v for k, v in sd.items()} + unwrap = lambda m: getattr(m, "module", m) + + unwrap(model).load_state_dict(no_pref(ckpt["state_dict"])) + + if ema_model is not None: + unwrap(ema_model).load_state_dict(no_pref(ckpt["ema_state_dict"])) + + optimizer.load_state_dict(ckpt["optimizer"]) + + if "scaler" in ckpt: + scaler.load_state_dict(ckpt["scaler"]) + else: + warnings.warn("AMP scaler state missing from the checkpoint.") + + def maybe_save(self, args, model, ema_model, optimizer, scaler, epoch, + 
total_iter, config): + + intermediate = (args.epochs_per_checkpoint > 0 + and epoch % args.epochs_per_checkpoint == 0) + final = epoch == args.epochs + + if not intermediate and not final and epoch not in self.keep_milestones: + return + + rank = 0 + if dist.is_initialized(): + dist.barrier() + rank = dist.get_rank() + + if rank != 0: + return + + unwrap = lambda m: getattr(m, "module", m) + ckpt = {"epoch": epoch, + "iteration": total_iter, + "config": config, + "train_setup": args.__dict__, + "state_dict": unwrap(model).state_dict(), + "optimizer": optimizer.state_dict(), + "scaler": scaler.state_dict()} + if ema_model is not None: + ckpt["ema_state_dict"] = unwrap(ema_model).state_dict() + + fpath = Path(args.output, f"FastPitch_checkpoint_{epoch}.pt") + print(f"Saving model and optimizer state at epoch {epoch} to {fpath}") + torch.save(ckpt, fpath) + + # Remove old checkpoints; keep milestones and the last two + self.tracked[epoch] = fpath + for epoch in set(list(self.tracked)[:-2]) - set(self.keep_milestones): + try: + os.remove(self.tracked[epoch]) + except: + pass + del self.tracked[epoch] diff --git a/fastpitch/alignment.py b/fastpitch/alignment.py new file mode 100644 index 0000000000000000000000000000000000000000..6671f6c0c4702583ce5657348ab7b41cddc394f7 --- /dev/null +++ b/fastpitch/alignment.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
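# ---------------------------------------------------------------------------
# A minimal standalone sketch of how the monotonic alignment search (MAS)
# defined below is typically driven. This is an illustration, not part of the
# original sources: it assumes the repo layout used in this diff (so
# `fastpitch.alignment` is importable), and the toy attention map is made up.
import numpy as np
from fastpitch.alignment import mas_width1

# Toy soft attention: 6 mel frames x 3 text tokens, peaked roughly along the
# diagonal the way a partially converged aligner would be.
soft_attn = np.array([[0.8, 0.1, 0.1],
                      [0.7, 0.2, 0.1],
                      [0.2, 0.7, 0.1],
                      [0.1, 0.8, 0.1],
                      [0.1, 0.3, 0.6],
                      [0.1, 0.1, 0.8]], dtype=np.float32)

hard_attn = mas_width1(np.log(soft_attn))     # binary (mel x text) monotonic path
durations = hard_attn.sum(axis=0)             # frames assigned per token: [2. 2. 2.]
assert hard_attn.sum() == soft_attn.shape[0]  # exactly one text token per mel frame
# ---------------------------------------------------------------------------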
+ +import numpy as np +from numba import jit, prange + + +@jit(nopython=True) +def mas(log_attn_map, width=1): + # assumes mel x text + opt = np.zeros_like(log_attn_map) + log_attn_map = log_attn_map.copy() + log_attn_map[0, 1:] = -np.inf + log_p = np.zeros_like(log_attn_map) + log_p[0, :] = log_attn_map[0, :] + prev_ind = np.zeros_like(log_attn_map, dtype=np.int64) + for i in range(1, log_attn_map.shape[0]): + for j in range(log_attn_map.shape[1]): # for each text dim + prev_j = np.arange(max(0, j-width), j+1) + prev_log = np.array([log_p[i-1, prev_idx] for prev_idx in prev_j]) + + ind = np.argmax(prev_log) + log_p[i, j] = log_attn_map[i, j] + prev_log[ind] + prev_ind[i, j] = prev_j[ind] + + # now backtrack + curr_text_idx = log_attn_map.shape[1]-1 + for i in range(log_attn_map.shape[0]-1, -1, -1): + opt[i, curr_text_idx] = 1 + curr_text_idx = prev_ind[i, curr_text_idx] + opt[0, curr_text_idx] = 1 + return opt + + +@jit(nopython=True) +def mas_width1(log_attn_map): + """mas with hardcoded width=1""" + # assumes mel x text + neg_inf = log_attn_map.dtype.type(-np.inf) + log_p = log_attn_map.copy() + log_p[0, 1:] = neg_inf + for i in range(1, log_p.shape[0]): + prev_log1 = neg_inf + for j in range(log_p.shape[1]): + prev_log2 = log_p[i-1, j] + log_p[i, j] += max(prev_log1, prev_log2) + prev_log1 = prev_log2 + + # now backtrack + opt = np.zeros_like(log_p) + one = opt.dtype.type(1) + j = log_p.shape[1]-1 + for i in range(log_p.shape[0]-1, 0, -1): + opt[i, j] = one + if log_p[i-1, j-1] >= log_p[i-1, j]: + j -= 1 + if j == 0: + opt[1:i, j] = one + break + opt[0, j] = one + return opt + + +@jit(nopython=True, parallel=True) +def b_mas(b_log_attn_map, in_lens, out_lens, width=1): + assert width == 1 + attn_out = np.zeros_like(b_log_attn_map) + + for b in prange(b_log_attn_map.shape[0]): + out = mas_width1(b_log_attn_map[b, 0, :out_lens[b], :in_lens[b]]) + attn_out[b, 0, :out_lens[b], :in_lens[b]] = out + return attn_out diff --git a/fastpitch/arg_parser.py b/fastpitch/arg_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..4e5b1376413efb23cb1a67d0b76c31e93df304a0 --- /dev/null +++ b/fastpitch/arg_parser.py @@ -0,0 +1,130 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +import argparse + + +def parse_fastpitch_args(parent, add_help=False): + """ + Parse commandline arguments. + """ + parser = argparse.ArgumentParser(parents=[parent], add_help=add_help, + allow_abbrev=False) + io = parser.add_argument_group('io parameters') + io.add_argument('--n-mel-channels', default=80, type=int, + help='Number of bins in mel-spectrograms') + io.add_argument('--max-seq-len', default=2048, type=int, + help='') + + symbols = parser.add_argument_group('symbols parameters') + symbols.add_argument('--n-symbols', default=148, type=int, + help='Number of symbols in dictionary') + symbols.add_argument('--padding-idx', default=0, type=int, + help='Index of padding symbol in dictionary') + symbols.add_argument('--symbols-embedding-dim', default=384, type=int, + help='Input embedding dimension') + + in_fft = parser.add_argument_group('input FFT parameters') + in_fft.add_argument('--in-fft-n-layers', default=6, type=int, + help='Number of FFT blocks') + in_fft.add_argument('--in-fft-n-heads', default=1, type=int, + help='Number of attention heads') + in_fft.add_argument('--in-fft-d-head', default=64, type=int, + help='Dim of attention heads') + in_fft.add_argument('--in-fft-conv1d-kernel-size', default=3, type=int, + help='Conv-1D kernel size') + in_fft.add_argument('--in-fft-conv1d-filter-size', default=1536, type=int, + help='Conv-1D filter size') + in_fft.add_argument('--in-fft-output-size', default=384, type=int, + help='Output dim') + in_fft.add_argument('--p-in-fft-dropout', default=0.1, type=float, + help='Dropout probability') + in_fft.add_argument('--p-in-fft-dropatt', default=0.1, type=float, + help='Multi-head attention dropout') + in_fft.add_argument('--p-in-fft-dropemb', default=0.0, type=float, + help='Dropout added to word+positional embeddings') + + out_fft = parser.add_argument_group('output FFT parameters') + out_fft.add_argument('--out-fft-n-layers', default=6, type=int, + help='Number of FFT blocks') + out_fft.add_argument('--out-fft-n-heads', default=1, type=int, + help='Number of attention heads') + out_fft.add_argument('--out-fft-d-head', default=64, type=int, + help='Dim of attention head') + out_fft.add_argument('--out-fft-conv1d-kernel-size', default=3, type=int, + help='Conv-1D kernel size') + out_fft.add_argument('--out-fft-conv1d-filter-size', default=1536, type=int, + help='Conv-1D filter size') + out_fft.add_argument('--out-fft-output-size', default=384, type=int, + help='Output dim') + out_fft.add_argument('--p-out-fft-dropout', default=0.1, type=float, + help='Dropout probability for out_fft') + out_fft.add_argument('--p-out-fft-dropatt', default=0.1, type=float, + help='Multi-head attention dropout') + out_fft.add_argument('--p-out-fft-dropemb', default=0.0, type=float, + help='Dropout added to word+positional embeddings') + + dur_pred = parser.add_argument_group('duration predictor parameters') + dur_pred.add_argument('--dur-predictor-kernel-size', 
default=3, type=int, + help='Duration predictor conv-1D kernel size') + dur_pred.add_argument('--dur-predictor-filter-size', default=256, type=int, + help='Duration predictor conv-1D filter size') + dur_pred.add_argument('--p-dur-predictor-dropout', default=0.1, type=float, + help='Dropout probability for duration predictor') + dur_pred.add_argument('--dur-predictor-n-layers', default=2, type=int, + help='Number of conv-1D layers') + + pitch_pred = parser.add_argument_group('pitch predictor parameters') + pitch_pred.add_argument('--pitch-predictor-kernel-size', default=3, type=int, + help='Pitch predictor conv-1D kernel size') + pitch_pred.add_argument('--pitch-predictor-filter-size', default=256, type=int, + help='Pitch predictor conv-1D filter size') + pitch_pred.add_argument('--p-pitch-predictor-dropout', default=0.1, type=float, + help='Pitch probability for pitch predictor') + pitch_pred.add_argument('--pitch-predictor-n-layers', default=2, type=int, + help='Number of conv-1D layers') + + energy_pred = parser.add_argument_group('energy predictor parameters') + energy_pred.add_argument('--energy-conditioning', action='store_true') + energy_pred.add_argument('--energy-predictor-kernel-size', default=3, type=int, + help='Pitch predictor conv-1D kernel size') + energy_pred.add_argument('--energy-predictor-filter-size', default=256, type=int, + help='Pitch predictor conv-1D filter size') + energy_pred.add_argument('--p-energy-predictor-dropout', default=0.1, type=float, + help='Pitch probability for energy predictor') + energy_pred.add_argument('--energy-predictor-n-layers', default=2, type=int, + help='Number of conv-1D layers') + + cond = parser.add_argument_group('conditioning parameters') + cond.add_argument('--pitch-embedding-kernel-size', default=3, type=int, + help='Pitch embedding conv-1D kernel size') + cond.add_argument('--energy-embedding-kernel-size', default=3, type=int, + help='Pitch embedding conv-1D kernel size') + cond.add_argument('--speaker-emb-weight', type=float, default=1.0, + help='Scale speaker embedding') + + return parser diff --git a/fastpitch/attention.py b/fastpitch/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..59a7397d637216aed4f0eaa03942a83a1e9a1190 --- /dev/null +++ b/fastpitch/attention.py @@ -0,0 +1,220 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
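# ---------------------------------------------------------------------------
# A minimal standalone sketch showing how `parse_fastpitch_args` above can be
# composed with a top-level CLI, roughly as a training entry point would do.
# This is an illustration, not part of the original sources; the `--output`
# option is an assumed example flag.
import argparse
from fastpitch.arg_parser import parse_fastpitch_args

# The parent parser holds non-model options; add_help=False avoids a clash
# with the child parser's own -h/--help.
parent = argparse.ArgumentParser(description='FastPitch training',
                                 add_help=False, allow_abbrev=False)
parent.add_argument('-o', '--output', type=str, default='./output',
                    help='Directory for checkpoints (assumed example flag)')

parser = parse_fastpitch_args(parent, add_help=True)
args = parser.parse_args(['--n-symbols', '148', '--energy-conditioning'])

print(args.output, args.n_symbols, args.in_fft_n_layers, args.energy_conditioning)
# -> ./output 148 6 True (unspecified values fall back to the defaults above)
# ---------------------------------------------------------------------------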
+ +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +class ConvNorm(torch.nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, + padding=None, dilation=1, bias=True, w_init_gain='linear'): + super(ConvNorm, self).__init__() + if padding is None: + assert(kernel_size % 2 == 1) + padding = int(dilation * (kernel_size - 1) / 2) + + self.conv = torch.nn.Conv1d(in_channels, out_channels, + kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, + bias=bias) + + torch.nn.init.xavier_uniform_( + self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, signal): + conv_signal = self.conv(signal) + return conv_signal + + +class Invertible1x1ConvLUS(torch.nn.Module): + def __init__(self, c): + super(Invertible1x1ConvLUS, self).__init__() + # Sample a random orthonormal matrix to initialize weights + W, _ = torch.linalg.qr(torch.randn(c, c)) + # Ensure determinant is 1.0 not -1.0 + if torch.det(W) < 0: + W[:, 0] = -1*W[:, 0] + p, lower, upper = torch.lu_unpack(*torch.lu(W)) + + self.register_buffer('p', p) + # diagonals of lower will always be 1s anyway + lower = torch.tril(lower, -1) + lower_diag = torch.diag(torch.eye(c, c)) + self.register_buffer('lower_diag', lower_diag) + self.lower = nn.Parameter(lower) + self.upper_diag = nn.Parameter(torch.diag(upper)) + self.upper = nn.Parameter(torch.triu(upper, 1)) + + def forward(self, z, reverse=False): + U = torch.triu(self.upper, 1) + torch.diag(self.upper_diag) + L = torch.tril(self.lower, -1) + torch.diag(self.lower_diag) + W = torch.mm(self.p, torch.mm(L, U)) + if reverse: + if not hasattr(self, 'W_inverse'): + # Reverse computation + W_inverse = W.float().inverse() + if z.type() == 'torch.cuda.HalfTensor': + W_inverse = W_inverse.half() + + self.W_inverse = W_inverse[..., None] + z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) + return z + else: + W = W[..., None] + z = F.conv1d(z, W, bias=None, stride=1, padding=0) + log_det_W = torch.sum(torch.log(torch.abs(self.upper_diag))) + return z, log_det_W + + +class ConvAttention(torch.nn.Module): + def __init__(self, n_mel_channels=80, n_speaker_dim=128, + n_text_channels=512, n_att_channels=80, temperature=1.0, + n_mel_convs=2, align_query_enc_type='3xconv', + use_query_proj=True): + super(ConvAttention, self).__init__() + self.temperature = temperature + self.att_scaling_factor = np.sqrt(n_att_channels) + self.softmax = torch.nn.Softmax(dim=3) + self.log_softmax = torch.nn.LogSoftmax(dim=3) + self.query_proj = Invertible1x1ConvLUS(n_mel_channels) + self.attn_proj = torch.nn.Conv2d(n_att_channels, 1, kernel_size=1) + self.align_query_enc_type = align_query_enc_type + self.use_query_proj = bool(use_query_proj) + + self.key_proj = nn.Sequential( + ConvNorm(n_text_channels, + n_text_channels * 2, + kernel_size=3, + bias=True, + w_init_gain='relu'), + torch.nn.ReLU(), + ConvNorm(n_text_channels * 2, + n_att_channels, + kernel_size=1, + bias=True)) + + self.align_query_enc_type = align_query_enc_type + + if align_query_enc_type == "inv_conv": + self.query_proj = Invertible1x1ConvLUS(n_mel_channels) + elif align_query_enc_type == "3xconv": + self.query_proj = nn.Sequential( + ConvNorm(n_mel_channels, + n_mel_channels * 2, + kernel_size=3, + bias=True, + w_init_gain='relu'), + torch.nn.ReLU(), + ConvNorm(n_mel_channels * 2, + n_mel_channels, + kernel_size=1, + bias=True), + torch.nn.ReLU(), + ConvNorm(n_mel_channels, + n_att_channels, + kernel_size=1, + 
bias=True)) + else: + raise ValueError("Unknown query encoder type specified") + + def run_padded_sequence(self, sorted_idx, unsort_idx, lens, padded_data, + recurrent_model): + """Sorts input data by previded ordering (and un-ordering) and runs the + packed data through the recurrent model + + Args: + sorted_idx (torch.tensor): 1D sorting index + unsort_idx (torch.tensor): 1D unsorting index (inverse of sorted_idx) + lens: lengths of input data (sorted in descending order) + padded_data (torch.tensor): input sequences (padded) + recurrent_model (nn.Module): recurrent model to run data through + Returns: + hidden_vectors (torch.tensor): outputs of the RNN, in the original, + unsorted, ordering + """ + + # sort the data by decreasing length using provided index + # we assume batch index is in dim=1 + padded_data = padded_data[:, sorted_idx] + padded_data = nn.utils.rnn.pack_padded_sequence(padded_data, lens) + hidden_vectors = recurrent_model(padded_data)[0] + hidden_vectors, _ = nn.utils.rnn.pad_packed_sequence(hidden_vectors) + # unsort the results at dim=1 and return + hidden_vectors = hidden_vectors[:, unsort_idx] + return hidden_vectors + + def encode_query(self, query, query_lens): + query = query.permute(2, 0, 1) # seq_len, batch, feature dim + lens, ids = torch.sort(query_lens, descending=True) + original_ids = [0] * lens.size(0) + for i in range(len(ids)): + original_ids[ids[i]] = i + + query_encoded = self.run_padded_sequence(ids, original_ids, lens, + query, self.query_lstm) + query_encoded = query_encoded.permute(1, 2, 0) + return query_encoded + + def forward(self, queries, keys, query_lens, mask=None, key_lens=None, + keys_encoded=None, attn_prior=None): + """Attention mechanism for flowtron parallel + Unlike in Flowtron, we have no restrictions such as causality etc, + since we only need this during training. + + Args: + queries (torch.tensor): B x C x T1 tensor + (probably going to be mel data) + keys (torch.tensor): B x C2 x T2 tensor (text data) + query_lens: lengths for sorting the queries in descending order + mask (torch.tensor): uint8 binary mask for variable length entries + (should be in the T2 domain) + Output: + attn (torch.tensor): B x 1 x T1 x T2 attention mask. 
+ Final dim T2 should sum to 1 + """ + keys_enc = self.key_proj(keys) # B x n_attn_dims x T2 + + # Beware can only do this since query_dim = attn_dim = n_mel_channels + if self.use_query_proj: + if self.align_query_enc_type == "inv_conv": + queries_enc, log_det_W = self.query_proj(queries) + elif self.align_query_enc_type == "3xconv": + queries_enc = self.query_proj(queries) + log_det_W = 0.0 + else: + queries_enc, log_det_W = self.query_proj(queries) + else: + queries_enc, log_det_W = queries, 0.0 + + # different ways of computing attn, + # one is isotopic gaussians (per phoneme) + # Simplistic Gaussian Isotopic Attention + + # B x n_attn_dims x T1 x T2 + attn = (queries_enc[:, :, :, None] - keys_enc[:, :, None]) ** 2 + # compute log likelihood from a gaussian + attn = -0.0005 * attn.sum(1, keepdim=True) + if attn_prior is not None: + attn = self.log_softmax(attn) + torch.log(attn_prior[:, None]+1e-8) + + attn_logprob = attn.clone() + + if mask is not None: + attn.data.masked_fill_(mask.permute(0, 2, 1).unsqueeze(2), + -float("inf")) + + attn = self.softmax(attn) # Softmax along T2 + return attn, attn_logprob diff --git a/fastpitch/attn_loss_function.py b/fastpitch/attn_loss_function.py new file mode 100644 index 0000000000000000000000000000000000000000..52748f21c3aedecb6e085a913e4a21fc91168b6b --- /dev/null +++ b/fastpitch/attn_loss_function.py @@ -0,0 +1,71 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
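# ---------------------------------------------------------------------------
# A minimal standalone shape sketch tying the aligner above to the CTC
# alignment loss defined below: ConvAttention turns a (mel, text) pair into a
# soft B x 1 x T_mel x T_text attention plus raw scores, and AttentionCTCLoss
# pushes those scores towards a monotonic alignment. Illustration only, not
# part of the original sources; the batch sizes and lengths are made-up values.
import torch
from fastpitch.attention import ConvAttention
from fastpitch.attn_loss_function import AttentionCTCLoss

B, n_mel, d_text, T_mel, T_text = 2, 80, 384, 120, 17
aligner = ConvAttention(n_mel_channels=n_mel, n_text_channels=d_text)
attn_ctc = AttentionCTCLoss()

mels = torch.randn(B, n_mel, T_mel)         # queries: B x n_mel x T_mel
text_emb = torch.randn(B, d_text, T_text)   # keys:    B x d_text x T_text
in_lens = torch.tensor([T_text, 15])        # text length per utterance
out_lens = torch.tensor([T_mel, 100])       # mel length per utterance

attn_soft, attn_logprob = aligner(mels, text_emb, out_lens, mask=None)
# attn_soft / attn_logprob: B x 1 x T_mel x T_text, softmax over the text axis
loss = attn_ctc(attn_logprob, in_lens, out_lens)
print(attn_soft.shape, float(loss))
# ---------------------------------------------------------------------------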
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class AttentionCTCLoss(torch.nn.Module): + def __init__(self, blank_logprob=-1): + super(AttentionCTCLoss, self).__init__() + self.log_softmax = torch.nn.LogSoftmax(dim=-1) + self.blank_logprob = blank_logprob + self.CTCLoss = nn.CTCLoss(zero_infinity=True) + + def forward(self, attn_logprob, in_lens, out_lens): + key_lens = in_lens + query_lens = out_lens + max_key_len = attn_logprob.size(-1) + + # Reorder input to [query_len, batch_size, key_len] + attn_logprob = attn_logprob.squeeze(1) + attn_logprob = attn_logprob.permute(1, 0, 2) + + # Add blank label + attn_logprob = F.pad( + input=attn_logprob, + pad=(1, 0, 0, 0, 0, 0), + value=self.blank_logprob) + + # Convert to log probabilities + # Note: Mask out probs beyond key_len + key_inds = torch.arange( + max_key_len+1, + device=attn_logprob.device, + dtype=torch.long) + attn_logprob.masked_fill_( + key_inds.view(1,1,-1) > key_lens.view(1,-1,1), # key_inds >= key_lens+1 + -float("inf")) + attn_logprob = self.log_softmax(attn_logprob) + + # Target sequences + target_seqs = key_inds[1:].unsqueeze(0) + target_seqs = target_seqs.repeat(key_lens.numel(), 1) + + # Evaluate CTC loss + cost = self.CTCLoss( + attn_logprob, target_seqs, + input_lengths=query_lens, target_lengths=key_lens) + return cost + + +class AttentionBinarizationLoss(torch.nn.Module): + def __init__(self): + super(AttentionBinarizationLoss, self).__init__() + + def forward(self, hard_attention, soft_attention, eps=1e-12): + log_sum = torch.log(torch.clamp(soft_attention[hard_attention == 1], + min=eps)).sum() + return -log_sum / hard_attention.sum() diff --git a/fastpitch/data_function.py b/fastpitch/data_function.py new file mode 100644 index 0000000000000000000000000000000000000000..3c947433f2529ede99cb53355c535ecb9d594068 --- /dev/null +++ b/fastpitch/data_function.py @@ -0,0 +1,464 @@ +# ***************************************************************************** +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ***************************************************************************** + +import functools +import json +import re +from pathlib import Path + +import librosa +import numpy as np +import torch +import torch.nn.functional as F +from scipy import ndimage +from scipy.stats import betabinom + +import common.layers as layers +from common.text.text_processing import TextProcessing +from common.utils import load_wav_to_torch, load_filepaths_and_text, to_gpu + + +class BetaBinomialInterpolator: + """Interpolates alignment prior matrices to save computation. + + Calculating beta-binomial priors is costly. Instead cache popular sizes + and use img interpolation to get priors faster. + """ + def __init__(self, round_mel_len_to=100, round_text_len_to=20): + self.round_mel_len_to = round_mel_len_to + self.round_text_len_to = round_text_len_to + self.bank = functools.lru_cache(beta_binomial_prior_distribution) + + def round(self, val, to): + return max(1, int(np.round((val + 1) / to))) * to + + def __call__(self, w, h): + bw = self.round(w, to=self.round_mel_len_to) + bh = self.round(h, to=self.round_text_len_to) + ret = ndimage.zoom(self.bank(bw, bh).T, zoom=(w / bw, h / bh), order=1) + assert ret.shape[0] == w, ret.shape + assert ret.shape[1] == h, ret.shape + return ret + + +def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling=1.0): + P = phoneme_count + M = mel_count + x = np.arange(0, P) + mel_text_probs = [] + for i in range(1, M+1): + a, b = scaling * i, scaling * (M + 1 - i) + rv = betabinom(P, a, b) + mel_i_prob = rv.pmf(x) + mel_text_probs.append(mel_i_prob) + return torch.tensor(np.array(mel_text_probs)) + + +def estimate_pitch(wav, mel_len, method='pyin', normalize_mean=None, + normalize_std=None, n_formants=1): + + if type(normalize_mean) is float or type(normalize_mean) is list: + normalize_mean = torch.tensor(normalize_mean) + + if type(normalize_std) is float or type(normalize_std) is list: + normalize_std = torch.tensor(normalize_std) + + if method == 'pyin': + + snd, sr = librosa.load(wav) + pitch_mel, voiced_flag, voiced_probs = librosa.pyin( + # snd, fmin=librosa.note_to_hz('C2'), ######################### + snd, fmin=60, ###################### + # fmax=librosa.note_to_hz('C7'), frame_length=1024) + fmax=400, frame_length=1024) + assert np.abs(mel_len - pitch_mel.shape[0]) <= 1.0 + + pitch_mel = np.where(np.isnan(pitch_mel), 0.0, pitch_mel) + pitch_mel = torch.from_numpy(pitch_mel).unsqueeze(0) + pitch_mel = F.pad(pitch_mel, (0, mel_len - pitch_mel.size(1))) + + if n_formants > 1: + raise NotImplementedError + + else: + raise ValueError + + pitch_mel = pitch_mel.float() + + if normalize_mean is not None: + assert normalize_std is not None + pitch_mel = normalize_pitch(pitch_mel, normalize_mean, normalize_std) + + return pitch_mel + + +def normalize_pitch(pitch, mean, std): + zeros = (pitch == 0.0) + pitch -= mean[:, None] + pitch /= std[:, None] + pitch[zeros] = 0.0 + return pitch + + +class TTSDataset(torch.utils.data.Dataset): + """ + 1) loads audio,text pairs + 2) normalizes text and converts them to sequences of one-hot vectors + 3) computes mel-spectrograms from audio files. 
+ """ + def __init__(self, + dataset_path, + audiopaths_and_text, + text_cleaners, + n_mel_channels, + symbol_set='smj_expanded', + p_arpabet=1.0, + n_speakers=1, + n_languages=1, #ANT: added + load_mel_from_disk=True, + load_pitch_from_disk=True, + pitch_mean=214.72203, # LJSpeech defaults + pitch_std=65.72038, + max_wav_value=None, + sampling_rate=None, + filter_length=None, + hop_length=None, + win_length=None, + mel_fmin=None, + mel_fmax=None, + prepend_space_to_text=False, + append_space_to_text=False, + pitch_online_dir=None, + betabinomial_online_dir=None, + use_betabinomial_interpolator=True, + pitch_online_method='pyin', + **ignored): + # print(prepend_space_to_text, append_space_to_text) + # Expect a list of filenames + if type(audiopaths_and_text) is str: + audiopaths_and_text = [audiopaths_and_text] + + self.dataset_path = dataset_path + #ANT: do we need to add language to common_utils.load_filepaths_and_text, probably + self.audiopaths_and_text = load_filepaths_and_text( + dataset_path, audiopaths_and_text, + has_speakers=(n_speakers > 1)) + self.load_mel_from_disk = load_mel_from_disk + if not load_mel_from_disk: + self.max_wav_value = max_wav_value + self.sampling_rate = sampling_rate + self.stft = layers.TacotronSTFT( + filter_length, hop_length, win_length, + n_mel_channels, sampling_rate, mel_fmin, mel_fmax) + self.load_pitch_from_disk = load_pitch_from_disk + + self.prepend_space_to_text = prepend_space_to_text + self.append_space_to_text = append_space_to_text + + assert p_arpabet == 0.0 or p_arpabet == 1.0, ( + 'Only 0.0 and 1.0 p_arpabet is currently supported. ' + 'Variable probability breaks caching of betabinomial matrices.') + + self.tp = TextProcessing(symbol_set, text_cleaners, p_arpabet=p_arpabet) + self.n_speakers = n_speakers + # ANT: added languages, must add to config and probably train.py too + self.n_languages = n_languages + self.pitch_tmp_dir = pitch_online_dir + self.f0_method = pitch_online_method + self.betabinomial_tmp_dir = betabinomial_online_dir + self.use_betabinomial_interpolator = use_betabinomial_interpolator + + if use_betabinomial_interpolator: + self.betabinomial_interpolator = BetaBinomialInterpolator() + # ANT: added language here + expected_columns = (2 + int(load_pitch_from_disk) + (n_speakers > 1) + (n_languages > 1)) + assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None) + """ + if len(self.audiopaths_and_text[0]) < expected_columns: + raise ValueError(f'Expected {expected_columns} columns in audiopaths file. 
' + 'The format is |[|][|]') + """ + if len(self.audiopaths_and_text[0]) > expected_columns: + print('WARNING: Audiopaths file has more columns than expected') + + to_tensor = lambda x: torch.Tensor([x]) if type(x) is float else x + self.pitch_mean = to_tensor(pitch_mean) + self.pitch_std = to_tensor(pitch_std) + + def __getitem__(self, index): + # Separate filename and text + # ANT: added language, assume that if language is present, speaker labels are too + # print(self.n_speakers, self.n_languages) ############################ + if self.n_speakers > 1 and self.n_languages > 1: + audiopath, *extra, text, speaker, language = self.audiopaths_and_text[index] + speaker = int(speaker) + language = int(language) + # print("spkr", speaker, "lang",language) ############################ + + elif self.n_speakers >1: + audiopath, *extra, text, speaker = self.audiopaths_and_text[index] + speaker = int(speaker) + # print(speaker) ############################ + else: + audiopath, *extra, text = self.audiopaths_and_text[index] + speaker = None + language = None + + mel = self.get_mel(audiopath) + text = self.get_text(text) + # print(text) + pitch = self.get_pitch(index, mel.size(-1)) + ## ANT: if external pitch extraction is used, n_frames may be one off due to rounding differences + if pitch.size(-1) != mel.size(-1): ############################ + print(pitch.shape, mel.shape, audiopath) ############################ + if pitch.size(-1) < mel.size(-1): + mel = mel[:, :pitch.size(-1)] + else: + pitch = pitch[:,:mel.size(-1)] #### + + energy = torch.norm(mel.float(), dim=0, p=2) + attn_prior = self.get_prior(index, mel.shape[1], text.shape[0]) + + + assert pitch.size(-1) == mel.size(-1) + + # No higher formants? + if len(pitch.size()) == 1: + pitch = pitch[None, :] + + + return (text, mel, len(text), pitch, energy, speaker, language, attn_prior, + audiopath) + + def __len__(self): + return len(self.audiopaths_and_text) + + def get_mel(self, filename): + if not self.load_mel_from_disk: + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.stft.sampling_rate: + print(filename) + raise ValueError("{} SR doesn't match target {} SR".format( + sampling_rate, self.stft.sampling_rate)) + audio_norm = audio / self.max_wav_value + audio_norm = audio_norm.unsqueeze(0) + audio_norm = torch.autograd.Variable(audio_norm, + requires_grad=False) + melspec = self.stft.mel_spectrogram(audio_norm) + melspec = torch.squeeze(melspec, 0) + else: + raise Exception(filename) + melspec = torch.load(filename) + assert melspec.size(0) == self.stft.n_mel_channels, ( + 'Mel dimension mismatch: given {}, expected {}'.format( + melspec.size(0), self.stft.n_mel_channels)) + +################ Plotting mels ######################################## + """ + import matplotlib.pyplot as plt + # plt.imshow(melspec.detach().cpu().T,aspect="auto") + fig, ax1 = plt.subplots(ncols=1) + pos = ax1.imshow(melspec.cpu().numpy().T,aspect="auto") + fig.colorbar(pos, ax=ax1) + plt.show() + """ +####################################################################### + + return melspec + + def get_text(self, text): + text = self.tp.encode_text(text) + space = [self.tp.encode_text("A A")[1]] + + if self.prepend_space_to_text: + text = space + text + print("prepending") + if self.append_space_to_text: + text = text + space + print("appending") + return torch.LongTensor(text) + + def get_prior(self, index, mel_len, text_len): + + if self.use_betabinomial_interpolator: + return torch.from_numpy(self.betabinomial_interpolator(mel_len, 
+ text_len)) + + if self.betabinomial_tmp_dir is not None: + audiopath, *_ = self.audiopaths_and_text[index] + fname = Path(audiopath).relative_to(self.dataset_path) + fname = fname.with_suffix('.pt') + cached_fpath = Path(self.betabinomial_tmp_dir, fname) + + if cached_fpath.is_file(): + return torch.load(cached_fpath) + + attn_prior = beta_binomial_prior_distribution(text_len, mel_len) + + if self.betabinomial_tmp_dir is not None: + cached_fpath.parent.mkdir(parents=True, exist_ok=True) + torch.save(attn_prior, cached_fpath) + + return attn_prior + + def get_pitch(self, index, mel_len=None): + audiopath, *fields = self.audiopaths_and_text[index] + + # ANT: spk is not used but I'll let it be + if self.n_speakers > 1 and self.n_languages > 1: + spk = spk = int(fields[-2]) + elif self.n_speakers > 1: + spk = int(fields[-1]) + else: + spk = 0 + + if self.load_pitch_from_disk: + pitchpath = fields[0] + pitch = torch.load(pitchpath) + if self.pitch_mean is not None: + assert self.pitch_std is not None + pitch = normalize_pitch(pitch, self.pitch_mean, self.pitch_std) + return pitch + + if self.pitch_tmp_dir is not None: + fname = Path(audiopath).relative_to(self.dataset_path) + fname_method = fname.with_suffix('.pt') + cached_fpath = Path(self.pitch_tmp_dir, fname_method) + if cached_fpath.is_file(): + return torch.load(cached_fpath) + + # No luck so far - calculate + wav = audiopath + if not wav.endswith('.wav'): + wav = re.sub('/mels/', '/wavs/', wav) + wav = re.sub('.pt$', '.wav', wav) + + pitch_mel = estimate_pitch(wav, mel_len, self.f0_method, + self.pitch_mean, self.pitch_std) + + if self.pitch_tmp_dir is not None and not cached_fpath.is_file(): + cached_fpath.parent.mkdir(parents=True, exist_ok=True) + torch.save(pitch_mel, cached_fpath) + + return pitch_mel + + +class TTSCollate: + """Zero-pads model inputs and targets based on number of frames per step""" + + def __call__(self, batch): + """Collate training batch from normalized text and mel-spec""" + # Right zero-pad all one-hot text sequences to max input length + input_lengths, ids_sorted_decreasing = torch.sort( + torch.LongTensor([len(x[0]) for x in batch]), + dim=0, descending=True) + max_input_len = input_lengths[0] + + text_padded = torch.LongTensor(len(batch), max_input_len) + text_padded.zero_() + for i in range(len(ids_sorted_decreasing)): + text = batch[ids_sorted_decreasing[i]][0] + text_padded[i, :text.size(0)] = text + + # Right zero-pad mel-spec + num_mels = batch[0][1].size(0) + max_target_len = max([x[1].size(1) for x in batch]) + + # Include mel padded and gate padded + mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len) + mel_padded.zero_() + output_lengths = torch.LongTensor(len(batch)) + for i in range(len(ids_sorted_decreasing)): + mel = batch[ids_sorted_decreasing[i]][1] + mel_padded[i, :, :mel.size(1)] = mel + output_lengths[i] = mel.size(1) + + n_formants = batch[0][3].shape[0] + pitch_padded = torch.zeros(mel_padded.size(0), n_formants, + mel_padded.size(2), dtype=batch[0][3].dtype) + energy_padded = torch.zeros_like(pitch_padded[:, 0, :]) + + for i in range(len(ids_sorted_decreasing)): + pitch = batch[ids_sorted_decreasing[i]][3] + energy = batch[ids_sorted_decreasing[i]][4] + pitch_padded[i, :, :pitch.shape[1]] = pitch + energy_padded[i, :energy.shape[0]] = energy + + if batch[0][5] is not None: + speaker = torch.zeros_like(input_lengths) + for i in range(len(ids_sorted_decreasing)): + speaker[i] = batch[ids_sorted_decreasing[i]][5] + else: + speaker = None + #ANT: added language here and 
increased the attn_prior and audiopaths index by 1 + if batch[0][6] is not None: + language = torch.zeros_like(input_lengths) + for i in range(len(ids_sorted_decreasing)): + language[i] = batch[ids_sorted_decreasing[i]][6] + else: + language = None + attn_prior_padded = torch.zeros(len(batch), max_target_len, + max_input_len) + attn_prior_padded.zero_() + for i in range(len(ids_sorted_decreasing)): + prior = batch[ids_sorted_decreasing[i]][7] + attn_prior_padded[i, :prior.size(0), :prior.size(1)] = prior + + # Count number of items - characters in text + len_x = [x[2] for x in batch] + len_x = torch.Tensor(len_x) + + audiopaths = [batch[i][8] for i in ids_sorted_decreasing] + + return (text_padded, input_lengths, mel_padded, output_lengths, len_x, + pitch_padded, energy_padded, speaker, language, attn_prior_padded, + audiopaths) + + +def batch_to_gpu(batch): + # ANT: added language here too + (text_padded, input_lengths, mel_padded, output_lengths, len_x, + pitch_padded, energy_padded, speaker, language, attn_prior, audiopaths) = batch + + text_padded = to_gpu(text_padded).long() + input_lengths = to_gpu(input_lengths).long() + mel_padded = to_gpu(mel_padded).float() + output_lengths = to_gpu(output_lengths).long() + pitch_padded = to_gpu(pitch_padded).float() + energy_padded = to_gpu(energy_padded).float() + attn_prior = to_gpu(attn_prior).float() + if speaker is not None: + speaker = to_gpu(speaker).long() + if language is not None: + language = to_gpu(language).long() + # Alignments act as both inputs and targets - pass shallow copies + x = [text_padded, input_lengths, mel_padded, output_lengths, + pitch_padded, energy_padded, speaker, language, attn_prior, audiopaths] + y = [mel_padded, input_lengths, output_lengths] + len_x = torch.sum(output_lengths) + # print(output_lengths) + return (x, y, len_x) diff --git a/fastpitch/loss_function.py b/fastpitch/loss_function.py new file mode 100644 index 0000000000000000000000000000000000000000..0cd3775e56723226c2a13f08fb0146d0ee49a033 --- /dev/null +++ b/fastpitch/loss_function.py @@ -0,0 +1,112 @@ +# ***************************************************************************** +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +import torch +import torch.nn.functional as F +from torch import nn + +from common.utils import mask_from_lens +from fastpitch.attn_loss_function import AttentionCTCLoss + + +class FastPitchLoss(nn.Module): + def __init__(self, dur_predictor_loss_scale=1.0, + pitch_predictor_loss_scale=1.0, attn_loss_scale=1.0, + energy_predictor_loss_scale=0.1): + super(FastPitchLoss, self).__init__() + self.dur_predictor_loss_scale = dur_predictor_loss_scale + self.pitch_predictor_loss_scale = pitch_predictor_loss_scale + self.energy_predictor_loss_scale = energy_predictor_loss_scale + self.attn_loss_scale = attn_loss_scale + self.attn_ctc_loss = AttentionCTCLoss() + + def forward(self, model_out, targets, is_training=True, meta_agg='mean'): + (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, + energy_pred, energy_tgt, attn_soft, attn_hard, attn_dur, + attn_logprob) = model_out + + (mel_tgt, in_lens, out_lens) = targets + + dur_tgt = attn_dur + dur_lens = in_lens + + mel_tgt.requires_grad = False + # (B,H,T) => (B,T,H) + mel_tgt = mel_tgt.transpose(1, 2) + + dur_mask = mask_from_lens(dur_lens, max_len=dur_tgt.size(1)) + log_dur_tgt = torch.log(dur_tgt.float() + 1) + loss_fn = F.mse_loss + dur_pred_loss = loss_fn(log_dur_pred, log_dur_tgt, reduction='none') + dur_pred_loss = (dur_pred_loss * dur_mask).sum() / dur_mask.sum() + + ldiff = mel_tgt.size(1) - mel_out.size(1) + mel_out = F.pad(mel_out, (0, 0, 0, ldiff, 0, 0), value=0.0) + mel_mask = mel_tgt.ne(0).float() + loss_fn = F.mse_loss + mel_loss = loss_fn(mel_out, mel_tgt, reduction='none') + mel_loss = (mel_loss * mel_mask).sum() / mel_mask.sum() + + ldiff = pitch_tgt.size(2) - pitch_pred.size(2) + pitch_pred = F.pad(pitch_pred, (0, ldiff, 0, 0, 0, 0), value=0.0) + pitch_loss = F.mse_loss(pitch_tgt, pitch_pred, reduction='none') + pitch_loss = (pitch_loss * dur_mask.unsqueeze(1)).sum() / dur_mask.sum() + + if energy_pred is not None: + energy_pred = F.pad(energy_pred, (0, ldiff, 0, 0), value=0.0) + energy_loss = F.mse_loss(energy_tgt, energy_pred, reduction='none') + energy_loss = (energy_loss * dur_mask).sum() / dur_mask.sum() + else: + energy_loss = 0 + + # Attention loss + attn_loss = self.attn_ctc_loss(attn_logprob, in_lens, out_lens) + + loss = (mel_loss + + dur_pred_loss * self.dur_predictor_loss_scale + + pitch_loss * self.pitch_predictor_loss_scale + + energy_loss * self.energy_predictor_loss_scale + + attn_loss * self.attn_loss_scale) + + meta = { + 'loss': loss.clone().detach(), + 'mel_loss': mel_loss.clone().detach(), + 'duration_predictor_loss': dur_pred_loss.clone().detach(), + 'pitch_loss': pitch_loss.clone().detach(), + 'attn_loss': attn_loss.clone().detach(), + 'dur_error': (torch.abs(dur_pred - dur_tgt).sum() + / dur_mask.sum()).detach(), + } + + if energy_pred is not None: + meta['energy_loss'] = energy_loss.clone().detach() + + assert meta_agg in ('sum', 'mean') + if meta_agg == 'sum': + bsz = mel_out.size(0) + meta = {k: v * 
bsz for k, v in meta.items()} + return loss, meta diff --git a/fastpitch/model.py b/fastpitch/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c54afee6cc45df7f6bb3250ce6f40e7336ffafef --- /dev/null +++ b/fastpitch/model.py @@ -0,0 +1,419 @@ +# ***************************************************************************** +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ***************************************************************************** + +from typing import Optional + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from common import filter_warnings +from common.layers import ConvReLUNorm +from common.utils import mask_from_lens +from fastpitch.alignment import b_mas, mas_width1 +from fastpitch.attention import ConvAttention +from fastpitch.transformer import FFTransformer + + +def regulate_len(durations, enc_out, pace: float = 1.0, + mel_max_len: Optional[int] = None): + """If target=None, then predicted durations are applied""" + dtype = enc_out.dtype + reps = durations.float() / pace + reps = (reps + 0.5).long() + dec_lens = reps.sum(dim=1) + + max_len = dec_lens.max() + reps_cumsum = torch.cumsum(F.pad(reps, (1, 0, 0, 0), value=0.0), + dim=1)[:, None, :] + reps_cumsum = reps_cumsum.to(dtype) + + range_ = torch.arange(max_len, device=enc_out.device)[None, :, None] + mult = ((reps_cumsum[:, :, :-1] <= range_) & + (reps_cumsum[:, :, 1:] > range_)) + mult = mult.to(dtype) + enc_rep = torch.matmul(mult, enc_out) + + if mel_max_len is not None: + enc_rep = enc_rep[:, :mel_max_len] + dec_lens = torch.clamp_max(dec_lens, mel_max_len) + return enc_rep, dec_lens + + +def average_pitch(pitch, durs): + durs_cums_ends = torch.cumsum(durs, dim=1).long() + durs_cums_starts = F.pad(durs_cums_ends[:, :-1], (1, 0)) + pitch_nonzero_cums = F.pad(torch.cumsum(pitch != 0.0, dim=2), (1, 0)) + pitch_cums = F.pad(torch.cumsum(pitch, dim=2), (1, 0)) + + bs, l = durs_cums_ends.size() + n_formants = pitch.size(1) + dcs = durs_cums_starts[:, None, :].expand(bs, n_formants, l) + dce = durs_cums_ends[:, None, :].expand(bs, n_formants, l) + + pitch_sums = (torch.gather(pitch_cums, 2, dce) + - torch.gather(pitch_cums, 2, dcs)).float() + pitch_nelems = (torch.gather(pitch_nonzero_cums, 2, dce) + - torch.gather(pitch_nonzero_cums, 2, dcs)).float() + + pitch_avg = torch.where(pitch_nelems == 0.0, pitch_nelems, + pitch_sums / pitch_nelems) + return pitch_avg + + +class TemporalPredictor(nn.Module): + """Predicts a single float per each temporal location""" + + def __init__(self, input_size, filter_size, kernel_size, dropout, + n_layers=2, n_predictions=1): + super(TemporalPredictor, self).__init__() + + self.layers = nn.Sequential(*[ + ConvReLUNorm(input_size if i == 0 else filter_size, filter_size, + kernel_size=kernel_size, dropout=dropout) + for i in range(n_layers)] + ) + self.n_predictions = n_predictions + self.fc = nn.Linear(filter_size, self.n_predictions, bias=True) + + def forward(self, enc_out, enc_out_mask): + out = enc_out * enc_out_mask + out = self.layers(out.transpose(1, 2)).transpose(1, 2) + out = self.fc(out) * enc_out_mask + return out + + +class FastPitch(nn.Module): + def __init__(self, n_mel_channels, n_symbols, padding_idx, + symbols_embedding_dim, in_fft_n_layers, in_fft_n_heads, + in_fft_d_head, + in_fft_conv1d_kernel_size, in_fft_conv1d_filter_size, + in_fft_output_size, + p_in_fft_dropout, p_in_fft_dropatt, p_in_fft_dropemb, + out_fft_n_layers, out_fft_n_heads, out_fft_d_head, + out_fft_conv1d_kernel_size, out_fft_conv1d_filter_size, + out_fft_output_size, + p_out_fft_dropout, p_out_fft_dropatt, p_out_fft_dropemb, + dur_predictor_kernel_size, dur_predictor_filter_size, + p_dur_predictor_dropout, dur_predictor_n_layers, + pitch_predictor_kernel_size, pitch_predictor_filter_size, + p_pitch_predictor_dropout, pitch_predictor_n_layers, + pitch_embedding_kernel_size, + energy_conditioning, + 
energy_predictor_kernel_size, energy_predictor_filter_size, + p_energy_predictor_dropout, energy_predictor_n_layers, + energy_embedding_kernel_size, + n_speakers, speaker_emb_weight, n_languages, pitch_conditioning_formants=1): + super(FastPitch, self).__init__() + + self.encoder = FFTransformer( + n_layer=in_fft_n_layers, n_head=in_fft_n_heads, + d_model=symbols_embedding_dim, + d_head=in_fft_d_head, + d_inner=in_fft_conv1d_filter_size, + kernel_size=in_fft_conv1d_kernel_size, + dropout=p_in_fft_dropout, + dropatt=p_in_fft_dropatt, + dropemb=p_in_fft_dropemb, + embed_input=True, + d_embed=symbols_embedding_dim, + n_embed=n_symbols, + padding_idx=padding_idx) + + if n_speakers > 1: + print(n_speakers, "### Is the number of speakers in this model ###") ################################################ + self.speaker_emb = nn.Embedding(n_speakers, symbols_embedding_dim) + else: + self.speaker_emb = None + + self.speaker_emb_weight = speaker_emb_weight + + #ANT: added language embedding + if n_languages > 1: + print(n_languages, "### Is the number of languages in this model ###") ################################################ + self.language_emb = nn.Embedding(n_languages, symbols_embedding_dim) + else: + self.language_emb = None + + + self.duration_predictor = TemporalPredictor( + in_fft_output_size, + filter_size=dur_predictor_filter_size, + kernel_size=dur_predictor_kernel_size, + dropout=p_dur_predictor_dropout, n_layers=dur_predictor_n_layers + ) + + self.decoder = FFTransformer( + n_layer=out_fft_n_layers, n_head=out_fft_n_heads, + d_model=symbols_embedding_dim, + d_head=out_fft_d_head, + d_inner=out_fft_conv1d_filter_size, + kernel_size=out_fft_conv1d_kernel_size, + dropout=p_out_fft_dropout, + dropatt=p_out_fft_dropatt, + dropemb=p_out_fft_dropemb, + embed_input=False, + d_embed=symbols_embedding_dim + ) + + self.pitch_predictor = TemporalPredictor( + in_fft_output_size, + filter_size=pitch_predictor_filter_size, + kernel_size=pitch_predictor_kernel_size, + dropout=p_pitch_predictor_dropout, n_layers=pitch_predictor_n_layers, + n_predictions=pitch_conditioning_formants + ) + + self.pitch_emb = nn.Conv1d( + pitch_conditioning_formants, symbols_embedding_dim, + kernel_size=pitch_embedding_kernel_size, + padding=int((pitch_embedding_kernel_size - 1) / 2)) + + # Store values precomputed for training data within the model + self.register_buffer('pitch_mean', torch.zeros(1)) + self.register_buffer('pitch_std', torch.zeros(1)) + + self.energy_conditioning = energy_conditioning + if energy_conditioning: + self.energy_predictor = TemporalPredictor( + in_fft_output_size, + filter_size=energy_predictor_filter_size, + kernel_size=energy_predictor_kernel_size, + dropout=p_energy_predictor_dropout, + n_layers=energy_predictor_n_layers, + n_predictions=1 + ) + + self.energy_emb = nn.Conv1d( + 1, symbols_embedding_dim, + kernel_size=energy_embedding_kernel_size, + padding=int((energy_embedding_kernel_size - 1) / 2)) + + self.proj = nn.Linear(out_fft_output_size, n_mel_channels, bias=True) + + self.attention = ConvAttention( + n_mel_channels, 0, symbols_embedding_dim, + use_query_proj=True, align_query_enc_type='3xconv') + + def binarize_attention(self, attn, in_lens, out_lens): + """For training purposes only. Binarizes attention with MAS. + These will no longer recieve a gradient. 
+ + Args: + attn: B x 1 x max_mel_len x max_text_len + """ + b_size = attn.shape[0] + with torch.no_grad(): + attn_out_cpu = np.zeros(attn.data.shape, dtype=np.float32) + log_attn_cpu = torch.log(attn.data).to(device='cpu', dtype=torch.float32) + log_attn_cpu = log_attn_cpu.numpy() + out_lens_cpu = out_lens.cpu() + in_lens_cpu = in_lens.cpu() + for ind in range(b_size): + hard_attn = mas_width1( + log_attn_cpu[ind, 0, :out_lens_cpu[ind], :in_lens_cpu[ind]]) + attn_out_cpu[ind, 0, :out_lens_cpu[ind], :in_lens_cpu[ind]] = hard_attn + attn_out = torch.tensor( + attn_out_cpu, device=attn.get_device(), dtype=attn.dtype) + return attn_out + + def binarize_attention_parallel(self, attn, in_lens, out_lens): + """For training purposes only. Binarizes attention with MAS. + These will no longer recieve a gradient. + + Args: + attn: B x 1 x max_mel_len x max_text_len + """ + with torch.no_grad(): + log_attn_cpu = torch.log(attn.data).cpu().numpy() + attn_out = b_mas(log_attn_cpu, in_lens.cpu().numpy(), + out_lens.cpu().numpy(), width=1) + return torch.from_numpy(attn_out).to(attn.get_device()) + + def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75): + #ANT: added language + (inputs, input_lens, mel_tgt, mel_lens, pitch_dense, energy_dense, + speaker, language, attn_prior, audiopaths) = inputs + + text_max_len = inputs.size(1) + mel_max_len = mel_tgt.size(2) + + # Calculate speaker embedding + conditionings = [] + if self.speaker_emb is None: + spk_emb = 0 + else: + spk_emb = self.speaker_emb(speaker).unsqueeze(1) + spk_emb.mul_(self.speaker_emb_weight) + conditionings.append(spk_emb) + # ANT: added language + if self.language_emb is None: + language_emb = 0 + else: + language_emb = self.language_emb(language).unsqueeze(1) + conditionings.append(language_emb) + + + # Input FFT + #enc_out, enc_mask = self.encoder(inputs, conditioning=[]) + enc_out, enc_mask = self.encoder(inputs, conditioning=conditionings) + + # Predict durations + log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1) + dur_pred = torch.clamp(torch.exp(log_dur_pred) - 1, 0, max_duration) + + # Predict pitch + pitch_pred = self.pitch_predictor(enc_out, enc_mask).permute(0, 2, 1) + + # Alignment + text_emb = self.encoder.word_emb(inputs) + + # make sure to do the alignments before folding + attn_mask = mask_from_lens(input_lens, max_len=text_max_len) + attn_mask = attn_mask[..., None] == 0 + # attn_mask should be 1 for unused timesteps in the text_enc_w_spkvec tensor + + attn_soft, attn_logprob = self.attention( + mel_tgt, text_emb.permute(0, 2, 1), mel_lens, attn_mask, + key_lens=input_lens, keys_encoded=enc_out, attn_prior=attn_prior) + + attn_hard = self.binarize_attention(attn_soft, input_lens, mel_lens) + + # Viterbi --> durations + attn_hard_dur = attn_hard.sum(2)[:, 0, :] + dur_tgt = attn_hard_dur + + if not torch.all(torch.eq(dur_tgt.sum(dim=1), mel_lens)): + print(audiopaths,input_lens,dur_tgt.sum(dim=1), mel_lens) + + assert torch.all(torch.eq(dur_tgt.sum(dim=1), mel_lens)) + + # Average pitch over characters + pitch_tgt = average_pitch(pitch_dense, dur_tgt) + + if use_gt_pitch and pitch_tgt is not None: + pitch_emb = self.pitch_emb(pitch_tgt) + else: + pitch_emb = self.pitch_emb(pitch_pred) + enc_out = enc_out + pitch_emb.transpose(1, 2) + + # Predict energy + if self.energy_conditioning: + energy_pred = self.energy_predictor(enc_out, enc_mask).squeeze(-1) + + # Average energy over characters + energy_tgt = average_pitch(energy_dense.unsqueeze(1), dur_tgt) + energy_tgt = torch.log(1.0 + 
energy_tgt) + + energy_emb = self.energy_emb(energy_tgt) + energy_tgt = energy_tgt.squeeze(1) + enc_out = enc_out + energy_emb.transpose(1, 2) + else: + energy_pred = None + energy_tgt = None + + len_regulated, dec_lens = regulate_len( + dur_tgt, enc_out, pace, mel_max_len) + + # Output FFT + dec_out, dec_mask = self.decoder(len_regulated, dec_lens) + mel_out = self.proj(dec_out) + return (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, + pitch_tgt, energy_pred, energy_tgt, attn_soft, attn_hard, + attn_hard_dur, attn_logprob) + + def infer(self, inputs, pace=1.0, dur_tgt=None, pitch_tgt=None, + energy_tgt=None, pitch_transform=None, max_duration=75, + speaker=0, language=0, speaker_weight=1.0, language_weight=1.0): + + if self.speaker_emb is None: + spk_emb = 0 + else: + print("using speaker embeddings") + speaker = (torch.ones(inputs.size(0)).long().to(inputs.device) + * speaker) + spk_emb = self.speaker_emb(speaker).unsqueeze(1) + print("spkr weight", speaker_weight) + spk_emb = spk_emb *speaker_weight + # ANT: added language + if self.language_emb is None: + language_emb = 0 + else: + print("using language embeddings") + language = (torch.ones(inputs.size(0)).long().to(inputs.device) + * language) + language_emb = self.language_emb(language).unsqueeze(1) + language_emb = language_emb * language_weight + # Input FFT + enc_out, enc_mask = self.encoder(inputs, conditioning=[spk_emb, language_emb]) + + # Predict durations + log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1) + dur_pred = torch.clamp(torch.exp(log_dur_pred) - 1, 0, max_duration) + + # Pitch over chars + pitch_pred = self.pitch_predictor(enc_out, enc_mask).permute(0, 2, 1) + + if pitch_transform is not None: + if self.pitch_std[0] == 0.0: + # XXX LJSpeech-1.1 defaults + mean, std = 218.14, 67.24 + else: + mean, std = self.pitch_mean[0], self.pitch_std[0] + pitch_pred = pitch_transform(pitch_pred, enc_mask.sum(dim=(1,2)), + mean, std) + if pitch_tgt is None: + pitch_emb = self.pitch_emb(pitch_pred).transpose(1, 2) + else: + pitch_emb = self.pitch_emb(pitch_tgt).transpose(1, 2) + + enc_out = enc_out + pitch_emb + + # Predict energy + if self.energy_conditioning: + + if energy_tgt is None: + energy_pred = self.energy_predictor(enc_out, enc_mask).squeeze(-1) + energy_emb = self.energy_emb(energy_pred.unsqueeze(1)).transpose(1, 2) + else: + energy_emb = self.energy_emb(energy_tgt).transpose(1, 2) + + enc_out = enc_out + energy_emb + else: + energy_pred = None + + len_regulated, dec_lens = regulate_len( + dur_pred if dur_tgt is None else dur_tgt, + enc_out, pace, mel_max_len=None) + + dec_out, dec_mask = self.decoder(len_regulated, dec_lens) + mel_out = self.proj(dec_out) + # mel_lens = dec_mask.squeeze(2).sum(axis=1).long() + mel_out = mel_out.permute(0, 2, 1) # For inference.py + return mel_out, dec_lens, dur_pred, pitch_pred, energy_pred diff --git a/fastpitch/model_jit.py b/fastpitch/model_jit.py new file mode 100644 index 0000000000000000000000000000000000000000..0e1d437007664bce75be36aaf6cea257459f934f --- /dev/null +++ b/fastpitch/model_jit.py @@ -0,0 +1,216 @@ +# ***************************************************************************** +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +from typing import Optional + +import torch +from torch import nn as nn + +from common import filter_warnings +from fastpitch.model import TemporalPredictor +from fastpitch.transformer_jit import FFTransformer + + +def regulate_len(durations, enc_out, pace: float = 1.0, + mel_max_len: Optional[int] = None): + """If target=None, then predicted durations are applied""" + reps = torch.round(durations.float() / pace).long() + dec_lens = reps.sum(dim=1) + + max_len = dec_lens.max() + bsz, _, hid = enc_out.size() + + reps_padded = torch.cat([reps, (max_len - dec_lens)[:, None]], dim=1) + pad_vec = torch.zeros(bsz, 1, hid, dtype=enc_out.dtype, + device=enc_out.device) + + enc_rep = torch.cat([enc_out, pad_vec], dim=1) + enc_rep = torch.repeat_interleave( + enc_rep.view(-1, hid), reps_padded.view(-1), dim=0 + ).view(bsz, -1, hid) + + if mel_max_len is not None: + enc_rep = enc_rep[:, :mel_max_len] + dec_lens = torch.clamp_max(dec_lens, mel_max_len) + return enc_rep, dec_lens + + +class FastPitchJIT(nn.Module): + __constants__ = ['energy_conditioning'] + def __init__(self, n_mel_channels, n_symbols, padding_idx, + symbols_embedding_dim, in_fft_n_layers, in_fft_n_heads, + in_fft_d_head, + in_fft_conv1d_kernel_size, in_fft_conv1d_filter_size, + in_fft_output_size, + p_in_fft_dropout, p_in_fft_dropatt, p_in_fft_dropemb, + out_fft_n_layers, out_fft_n_heads, out_fft_d_head, + out_fft_conv1d_kernel_size, out_fft_conv1d_filter_size, + out_fft_output_size, + p_out_fft_dropout, p_out_fft_dropatt, p_out_fft_dropemb, + dur_predictor_kernel_size, dur_predictor_filter_size, + p_dur_predictor_dropout, dur_predictor_n_layers, + pitch_predictor_kernel_size, pitch_predictor_filter_size, + p_pitch_predictor_dropout, pitch_predictor_n_layers, + pitch_embedding_kernel_size, + energy_conditioning, + energy_predictor_kernel_size, energy_predictor_filter_size, + p_energy_predictor_dropout, energy_predictor_n_layers, + energy_embedding_kernel_size, + n_speakers, speaker_emb_weight, pitch_conditioning_formants=1): + super(FastPitchJIT, self).__init__() + + self.encoder = FFTransformer( + n_layer=in_fft_n_layers, n_head=in_fft_n_heads, + d_model=symbols_embedding_dim, + d_head=in_fft_d_head, + d_inner=in_fft_conv1d_filter_size, + kernel_size=in_fft_conv1d_kernel_size, + 
dropout=p_in_fft_dropout, + dropatt=p_in_fft_dropatt, + dropemb=p_in_fft_dropemb, + embed_input=True, + d_embed=symbols_embedding_dim, + n_embed=n_symbols, + padding_idx=padding_idx) + + if n_speakers > 1: + self.speaker_emb = nn.Embedding(n_speakers, symbols_embedding_dim) + else: + self.speaker_emb = None + self.speaker_emb_weight = speaker_emb_weight + + self.duration_predictor = TemporalPredictor( + in_fft_output_size, + filter_size=dur_predictor_filter_size, + kernel_size=dur_predictor_kernel_size, + dropout=p_dur_predictor_dropout, n_layers=dur_predictor_n_layers + ) + + self.decoder = FFTransformer( + n_layer=out_fft_n_layers, n_head=out_fft_n_heads, + d_model=symbols_embedding_dim, + d_head=out_fft_d_head, + d_inner=out_fft_conv1d_filter_size, + kernel_size=out_fft_conv1d_kernel_size, + dropout=p_out_fft_dropout, + dropatt=p_out_fft_dropatt, + dropemb=p_out_fft_dropemb, + embed_input=False, + d_embed=symbols_embedding_dim + ) + + self.pitch_predictor = TemporalPredictor( + in_fft_output_size, + filter_size=pitch_predictor_filter_size, + kernel_size=pitch_predictor_kernel_size, + dropout=p_pitch_predictor_dropout, n_layers=pitch_predictor_n_layers, + n_predictions=pitch_conditioning_formants + ) + + self.pitch_emb = nn.Conv1d( + pitch_conditioning_formants, symbols_embedding_dim, + kernel_size=pitch_embedding_kernel_size, + padding=int((pitch_embedding_kernel_size - 1) / 2)) + + # Store values precomputed for training data within the model + self.register_buffer('pitch_mean', torch.zeros(1)) + self.register_buffer('pitch_std', torch.zeros(1)) + + self.energy_conditioning = energy_conditioning + if energy_conditioning: + self.energy_predictor = TemporalPredictor( + in_fft_output_size, + filter_size=energy_predictor_filter_size, + kernel_size=energy_predictor_kernel_size, + dropout=p_energy_predictor_dropout, + n_layers=energy_predictor_n_layers, + n_predictions=1 + ) + + self.energy_emb = nn.Conv1d( + 1, symbols_embedding_dim, + kernel_size=energy_embedding_kernel_size, + padding=int((energy_embedding_kernel_size - 1) / 2)) + + self.proj = nn.Linear(out_fft_output_size, n_mel_channels, bias=True) + + # skip self.attention (used only in training) + + def infer(self, inputs, pace: float = 1.0, + dur_tgt: Optional[torch.Tensor] = None, + pitch_tgt: Optional[torch.Tensor] = None, + energy_tgt: Optional[torch.Tensor] = None, + speaker: int = 0): + + if self.speaker_emb is None: + spk_emb = None + else: + speaker = (torch.ones(inputs.size(0)).long().to(inputs.device) + * speaker) + spk_emb = self.speaker_emb(speaker).unsqueeze(1) + spk_emb.mul_(self.speaker_emb_weight) + + # Input FFT + enc_out, enc_mask = self.encoder(inputs, conditioning=spk_emb) + + # Predict durations + log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1) + dur_pred = torch.clamp(torch.exp(log_dur_pred) - 1, 0, 100.0) + + # Pitch over chars + pitch_pred = self.pitch_predictor(enc_out, enc_mask).permute(0, 2, 1) + + if pitch_tgt is None: + pitch_emb = self.pitch_emb(pitch_pred).transpose(1, 2) + else: + pitch_emb = self.pitch_emb(pitch_tgt).transpose(1, 2) + + enc_out = enc_out + pitch_emb + + # Predict energy + if self.energy_conditioning: + + if energy_tgt is None: + energy_pred = self.energy_predictor(enc_out, enc_mask).squeeze(-1) + energy_emb = self.energy_emb(energy_pred.unsqueeze(1)).transpose(1, 2) + else: + energy_pred = None + energy_emb = self.energy_emb(energy_tgt).transpose(1, 2) + + enc_out = enc_out + energy_emb + else: + energy_pred = None + + len_regulated, dec_lens = regulate_len( + 
dur_pred if dur_tgt is None else dur_tgt, + enc_out, pace, mel_max_len=None) + + dec_out, dec_mask = self.decoder(len_regulated, dec_lens) + mel_out = self.proj(dec_out) + # mel_lens = dec_mask.squeeze(2).sum(axis=1).long() + mel_out = mel_out.permute(0, 2, 1) # For inference.py + return mel_out, dec_lens, dur_pred, pitch_pred, energy_pred diff --git a/fastpitch/pitch_transform.py b/fastpitch/pitch_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..4fbe0c0dc20836fd105d6e30d964be1a88388950 --- /dev/null +++ b/fastpitch/pitch_transform.py @@ -0,0 +1,47 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + + +def pitch_transform_custom(pitch, pitch_lens): + """Apply a custom pitch transformation to predicted pitch values. + + This sample modification linearly increases the pitch throughout + the utterance from 0.5 of predicted pitch to 1.5 of predicted pitch. + In other words, it starts low and ends high. + + PARAMS + ------ + pitch: torch.Tensor (bs, max_len) + Predicted pitch values for each lexical unit, padded to max_len (in Hz). + pitch_lens: torch.Tensor (bs, max_len) + Number of lexical units in each utterance. + + RETURNS + ------- + pitch: torch.Tensor + Modified pitch (in Hz). + """ + + weights = torch.arange(pitch.size(1), dtype=torch.float32, device=pitch.device) + + # The weights increase linearly from 0.0 to 1.0 in every i-th row + # in the range (0, pitch_lens[i]) + weights = weights.unsqueeze(0) / pitch_lens.unsqueeze(1) + + # Shift the range from (0.0, 1.0) to (0.5, 1.5) + weights += 0.5 + + return pitch * weights diff --git a/fastpitch/transformer.py b/fastpitch/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..31b11670897ff2052c5095f8b6e3cd08b3fb1002 --- /dev/null +++ b/fastpitch/transformer.py @@ -0,0 +1,218 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
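The `FastPitch.infer` path above exposes `pace`, `speaker`, `language`, `speaker_weight`, `language_weight` and an optional `pitch_transform` callable, which is invoked as `pitch_transform(pitch_pred, enc_mask.sum(dim=(1,2)), mean, std)`. Below is a minimal sketch of wiring a custom transform into that hook; `model` (a loaded FastPitch instance) and `text_ids` (a batch of encoded input symbols) are assumed to exist, and the constant shift is purely illustrative.

```python
import torch

def pitch_transform_shift(pitch, pitch_lens, mean, std):
    # Matches the four-argument call made inside FastPitch.infer.
    # pitch: (bs, n_formants, text_len) predicted pitch; its units depend on how
    # pitch was normalized at training time, so the +0.5 here is only a demo.
    return pitch + 0.5

with torch.no_grad():
    mel, dec_lens, dur_pred, pitch_pred, energy_pred = model.infer(
        text_ids,                       # assumed: LongTensor of symbol ids, (bs, text_len)
        pace=1.0,
        speaker=1, speaker_weight=1.0,
        language=0, language_weight=1.0,
        pitch_transform=pitch_transform_shift)
```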
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from common.utils import mask_from_lens + + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + self.demb = demb + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.matmul(torch.unsqueeze(pos_seq, -1), + torch.unsqueeze(self.inv_freq, 0)) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=1) + if bsz is not None: + return pos_emb[None, :, :].expand(bsz, -1, -1) + else: + return pos_emb[None, :, :] + + +class PositionwiseConvFF(nn.Module): + def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False): + super(PositionwiseConvFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Conv1d(d_model, d_inner, kernel_size, 1, (kernel_size // 2)), + nn.ReLU(), + # nn.Dropout(dropout), # worse convergence + nn.Conv1d(d_inner, d_model, kernel_size, 1, (kernel_size // 2)), + nn.Dropout(dropout), + ) + self.layer_norm = nn.LayerNorm(d_model) + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + return self._forward(inp) + + def _forward(self, inp): + if self.pre_lnorm: + # layer normalization + positionwise feed-forward + core_out = inp.transpose(1, 2) + core_out = self.CoreNet(self.layer_norm(core_out).to(inp.dtype)) + core_out = core_out.transpose(1, 2) + + # residual connection + output = core_out + inp + else: + # positionwise feed-forward + core_out = inp.transpose(1, 2) + core_out = self.CoreNet(core_out) + core_out = core_out.transpose(1, 2) + + # residual connection + layer normalization + output = self.layer_norm(inp + core_out).to(inp.dtype) + + return output + + +class MultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0.1, + pre_lnorm=False): + super(MultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.scale = 1 / (d_head ** 0.5) + self.pre_lnorm = pre_lnorm + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head) + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + self.layer_norm = nn.LayerNorm(d_model) + + def forward(self, inp, attn_mask=None): + return self._forward(inp, attn_mask) + + def _forward(self, inp, attn_mask=None): + residual = inp + + if self.pre_lnorm: + # layer normalization + inp = self.layer_norm(inp) + + n_head, d_head = self.n_head, self.d_head + + head_q, head_k, head_v = torch.chunk(self.qkv_net(inp), 3, dim=2) + head_q = head_q.view(inp.size(0), inp.size(1), n_head, d_head) + head_k = head_k.view(inp.size(0), inp.size(1), n_head, d_head) + head_v = head_v.view(inp.size(0), inp.size(1), n_head, d_head) + + q = head_q.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head) + k = head_k.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head) + v = head_v.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head) + + attn_score = torch.bmm(q, k.transpose(1, 2)) + attn_score.mul_(self.scale) + + if attn_mask is not None: + attn_mask = attn_mask.unsqueeze(1).to(attn_score.dtype) + attn_mask = attn_mask.repeat(n_head, attn_mask.size(2), 1) + attn_score.masked_fill_(attn_mask.to(torch.bool), -float('inf')) + + attn_prob = F.softmax(attn_score, dim=2) + attn_prob = self.dropatt(attn_prob) + attn_vec = torch.bmm(attn_prob, v) + + 
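+        # attn_vec is (n_head * bsz, seq_len, d_head) with the head index varying
+        # slowest (q, k and v were built with permute(2, 0, 1, 3) above); the
+        # view/permute below folds the heads back into a single
+        # (bsz, seq_len, n_head * d_head) tensor for the output projection.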
attn_vec = attn_vec.view(n_head, inp.size(0), inp.size(1), d_head) + attn_vec = attn_vec.permute(1, 2, 0, 3).contiguous().view( + inp.size(0), inp.size(1), n_head * d_head) + + # linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + # residual connection + output = residual + attn_out + else: + # residual connection + layer normalization + output = self.layer_norm(residual + attn_out) + + output = output.to(attn_out.dtype) + + return output + + +class TransformerLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, kernel_size, dropout, + **kwargs): + super(TransformerLayer, self).__init__() + + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseConvFF(d_model, d_inner, kernel_size, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, mask=None): + output = self.dec_attn(dec_inp, attn_mask=~mask.squeeze(2)) + output *= mask + output = self.pos_ff(output) + output *= mask + return output + + +class FFTransformer(nn.Module): + def __init__(self, n_layer, n_head, d_model, d_head, d_inner, kernel_size, + dropout, dropatt, dropemb=0.0, embed_input=True, + n_embed=None, d_embed=None, padding_idx=0, pre_lnorm=False): + super(FFTransformer, self).__init__() + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + self.padding_idx = padding_idx + + if embed_input: + self.word_emb = nn.Embedding(n_embed, d_embed or d_model, + padding_idx=self.padding_idx) + else: + self.word_emb = None + + self.pos_emb = PositionalEmbedding(self.d_model) + self.drop = nn.Dropout(dropemb) + self.layers = nn.ModuleList() + + for _ in range(n_layer): + self.layers.append( + TransformerLayer( + n_head, d_model, d_head, d_inner, kernel_size, dropout, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + # ANT: change conditioning to a list of conditionings + + def forward(self, dec_inp, seq_lens=None, conditioning=[]): + if self.word_emb is None: + inp = dec_inp + mask = mask_from_lens(seq_lens).unsqueeze(2) + else: + inp = self.word_emb(dec_inp) + # [bsz x L x 1] + mask = (dec_inp != self.padding_idx).unsqueeze(2) + + pos_seq = torch.arange(inp.size(1), device=inp.device).to(inp.dtype) + pos_emb = self.pos_emb(pos_seq) * mask + out = inp + pos_emb + # out = self.drop(inp+pos_emb) + # ANT: is this ok?, used to be out = self.drop(inp+pos_emb+c) + # should dropout be applied multiple times? + for c in conditioning: + out = out + c + out = self.drop(out) + for layer in self.layers: + out = layer(out, mask=mask) + + # out = self.drop(out) + return out, mask diff --git a/fastpitch/transformer_jit.py b/fastpitch/transformer_jit.py new file mode 100644 index 0000000000000000000000000000000000000000..7b0bb559425fee79b61a68cfe7392bdb8f089f06 --- /dev/null +++ b/fastpitch/transformer_jit.py @@ -0,0 +1,255 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
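The modified `FFTransformer.forward` above (see the `# ANT` comments) accepts a list of conditioning tensors rather than a single one: speaker and language embeddings of shape `(batch, 1, d_model)` are each broadcast-added over the time axis, and dropout is applied once after all of them have been summed in; the JIT variant that follows still takes a single optional tensor. A standalone sketch of that broadcast, with arbitrary sizes and random tensors standing in for the real embeddings:

```python
import torch
import torch.nn.functional as F

bsz, seq_len, d_model = 2, 7, 384              # arbitrary demo sizes
inp = torch.randn(bsz, seq_len, d_model)       # symbol embeddings
pos_emb = torch.randn(bsz, seq_len, d_model)   # (masked) positional embeddings
spk_emb = torch.randn(bsz, 1, d_model)         # speaker embedding * speaker weight
lang_emb = torch.randn(bsz, 1, d_model)        # language embedding * language weight

out = inp + pos_emb
for c in (spk_emb, lang_emb):                  # mirrors `for c in conditioning`
    out = out + c                              # broadcasts over the time axis
out = F.dropout(out, p=0.1, training=True)     # dropout applied once, after the sum

print(out.shape)                               # torch.Size([2, 7, 384])
```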
+ +from typing import List, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from common.utils import mask_from_lens + + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + self.demb = demb + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz: Optional[int] = None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=1) + if bsz is not None: + return pos_emb[None, :, :].expand(bsz, -1, -1) + else: + return pos_emb[None, :, :] + + +class PositionwiseFF(nn.Module): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False): + super(PositionwiseFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Linear(d_model, d_inner), nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(d_inner, d_model), + nn.Dropout(dropout), + ) + + self.layer_norm = nn.LayerNorm(d_model) + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + # layer normalization + positionwise feed-forward + core_out = self.CoreNet(self.layer_norm(inp)) + + # residual connection + output = core_out + inp + else: + # positionwise feed-forward + core_out = self.CoreNet(inp) + + # residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + + +class PositionwiseConvFF(nn.Module): + def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False): + super(PositionwiseConvFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Conv1d(d_model, d_inner, kernel_size, 1, (kernel_size // 2)), + nn.ReLU(), + # nn.Dropout(dropout), # worse convergence + nn.Conv1d(d_inner, d_model, kernel_size, 1, (kernel_size // 2)), + nn.Dropout(dropout), + ) + self.layer_norm = nn.LayerNorm(d_model) + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + # layer normalization + positionwise feed-forward + core_out = inp.transpose(1, 2) + core_out = self.CoreNet(self.layer_norm(core_out)) + core_out = core_out.transpose(1, 2) + + # residual connection + output = core_out + inp + else: + # positionwise feed-forward + core_out = inp.transpose(1, 2) + core_out = self.CoreNet(core_out) + core_out = core_out.transpose(1, 2) + + # residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + + +class MultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0.1, + pre_lnorm=False): + super(MultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.scale = 1 / (d_head ** 0.5) + self.dropout = dropout + self.pre_lnorm = pre_lnorm + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head) + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + self.layer_norm = nn.LayerNorm(d_model) + + + def forward(self, inp, attn_mask: Optional[torch.Tensor] = None): + residual = inp + + if self.pre_lnorm: + # layer normalization + inp = self.layer_norm(inp) + + n_head, d_head = self.n_head, self.d_head + + head_q, head_k, head_v = torch.chunk(self.qkv_net(inp), 3, dim=-1) + head_q = head_q.view(inp.size(0), inp.size(1), n_head, d_head) + head_k = 
head_k.view(inp.size(0), inp.size(1), n_head, d_head) + head_v = head_v.view(inp.size(0), inp.size(1), n_head, d_head) + + q = head_q.permute(0, 2, 1, 3).reshape(-1, inp.size(1), d_head) + k = head_k.permute(0, 2, 1, 3).reshape(-1, inp.size(1), d_head) + v = head_v.permute(0, 2, 1, 3).reshape(-1, inp.size(1), d_head) + + attn_score = torch.bmm(q, k.transpose(1, 2)) + attn_score.mul_(self.scale) + + if attn_mask is not None: + attn_mask = attn_mask.unsqueeze(1) + attn_mask = attn_mask.repeat(n_head, attn_mask.size(2), 1) + attn_score.masked_fill_(attn_mask, -float('inf')) + + attn_prob = F.softmax(attn_score, dim=2) + attn_prob = self.dropatt(attn_prob) + attn_vec = torch.bmm(attn_prob, v) + + attn_vec = attn_vec.view(n_head, inp.size(0), inp.size(1), d_head) + attn_vec = attn_vec.permute(1, 2, 0, 3).contiguous().view( + inp.size(0), inp.size(1), n_head * d_head) + + # linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + # residual connection + output = residual + attn_out + else: + # residual connection + layer normalization + + # XXX Running TorchScript on 20.02 and 20.03 containers crashes here + # XXX Works well with 20.01-py3 container. + # XXX dirty fix is: + # XXX output = self.layer_norm(residual + attn_out).half() + output = self.layer_norm(residual + attn_out) + + return output + + +class TransformerLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, kernel_size, dropout, + **kwargs): + super(TransformerLayer, self).__init__() + + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseConvFF(d_model, d_inner, kernel_size, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, mask): + output = self.dec_attn(dec_inp, attn_mask=~mask.squeeze(2)) + output *= mask + output = self.pos_ff(output) + output *= mask + return output + + +class FFTransformer(nn.Module): + def __init__(self, n_layer, n_head, d_model, d_head, d_inner, kernel_size, + dropout, dropatt, dropemb=0.0, embed_input=True, + n_embed=None, d_embed=None, padding_idx=0, pre_lnorm=False): + super(FFTransformer, self).__init__() + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + self.padding_idx = padding_idx + self.n_embed = n_embed + + self.embed_input = embed_input + if embed_input: + print(padding_idx) ######################################### + self.word_emb = nn.Embedding(n_embed, d_embed or d_model, + padding_idx=self.padding_idx) + else: + self.word_emb = nn.Identity() + + self.pos_emb = PositionalEmbedding(self.d_model) + self.drop = nn.Dropout(dropemb) + self.layers = nn.ModuleList() + + for _ in range(n_layer): + self.layers.append( + TransformerLayer( + n_head, d_model, d_head, d_inner, kernel_size, dropout, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + + def forward(self, dec_inp, seq_lens: Optional[torch.Tensor] = None, + conditioning: Optional[torch.Tensor] = None): + if not self.embed_input: + inp = dec_inp + assert seq_lens is not None + mask = mask_from_lens(seq_lens).unsqueeze(2) + else: + inp = self.word_emb(dec_inp) + # [bsz x L x 1] + mask = (dec_inp != self.padding_idx).unsqueeze(2) + + pos_seq = torch.arange(inp.size(1), device=inp.device, dtype=inp.dtype) + pos_emb = self.pos_emb(pos_seq) * mask + if conditioning is not None: + out = self.drop(inp + pos_emb + conditioning) + else: + out = self.drop(inp + pos_emb) + + for layer in self.layers: + out = layer(out, mask=mask) + + # out = self.drop(out) + return out, mask diff --git 
a/hifigan/__pycache__/arg_parser.cpython-39.pyc b/hifigan/__pycache__/arg_parser.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30d60733b739a3d51b381cfcfb1fbde4f4d22c98 Binary files /dev/null and b/hifigan/__pycache__/arg_parser.cpython-39.pyc differ diff --git a/hifigan/__pycache__/data_function.cpython-39.pyc b/hifigan/__pycache__/data_function.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81b34ee26aca207e17396bfe3908a50f9217d028 Binary files /dev/null and b/hifigan/__pycache__/data_function.cpython-39.pyc differ diff --git a/hifigan/__pycache__/models.cpython-37.pyc b/hifigan/__pycache__/models.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90138726308c705bfb2845270ec9bbc604c32620 Binary files /dev/null and b/hifigan/__pycache__/models.cpython-37.pyc differ diff --git a/hifigan/__pycache__/models.cpython-38.pyc b/hifigan/__pycache__/models.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff5527348d384a5657e2940f99cce747cb9a3388 Binary files /dev/null and b/hifigan/__pycache__/models.cpython-38.pyc differ diff --git a/hifigan/__pycache__/models.cpython-39.pyc b/hifigan/__pycache__/models.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34d94cb6aded7a8adb8c09249c3ab9d317fb7b40 Binary files /dev/null and b/hifigan/__pycache__/models.cpython-39.pyc differ diff --git a/hifigan/arg_parser.py b/hifigan/arg_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..784389ad89dfa02e19830b27a300072b52ed6b30 --- /dev/null +++ b/hifigan/arg_parser.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from ast import literal_eval + + +def parse_hifigan_args(parent, add_help=False): + """ + Parse model specific commandline arguments. 
+ """ + parser = argparse.ArgumentParser(parents=[parent], add_help=add_help, + allow_abbrev=False) + hfg = parser.add_argument_group('HiFi-GAN generator parameters') + hfg.add_argument('--upsample_rates', default=[8, 8, 2, 2], + type=literal_eval_arg, + help='Upsample rates') + hfg.add_argument('--upsample_kernel_sizes', default=[16, 16, 4, 4], + type=literal_eval_arg, + help='Upsample kernel sizes') + hfg.add_argument('--upsample_initial_channel', default=512, type=int, + help='Upsample initial channel') + hfg.add_argument('--resblock', default='1', type=str, + help='Resblock module version') + hfg.add_argument('--resblock_kernel_sizes', default=[3, 7, 11], + type=literal_eval_arg, + help='Resblock kernel sizes') + hfg.add_argument('--resblock_dilation_sizes', type=literal_eval_arg, + default=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + help='Resblock dilation sizes'), + + hfg = parser.add_argument_group('HiFi-GAN discriminator parameters') + hfg.add_argument('--mpd_periods', default=[2, 3, 5, 7, 11], + type=literal_eval_arg, + help='Periods of MultiPeriodDiscriminator') + hfg.add_argument('--concat_fwd', action='store_true', + help='Faster Discriminators (requires more GPU memory)') + hfg.add_argument('--hifigan-config', type=str, default=None, required=False, + help='Path to a HiFi-GAN config .json' + ' (if provided, overrides model architecture flags)') + return parser + + +def literal_eval_arg(val): + try: + return literal_eval(val) + except SyntaxError as e: # Argparse does not handle SyntaxError + raise ValueError(str(e)) from e diff --git a/hifigan/data_function.py b/hifigan/data_function.py new file mode 100644 index 0000000000000000000000000000000000000000..ac77c7bd67f6356adda1a29e8a2e2cdbd01e72ed --- /dev/null +++ b/hifigan/data_function.py @@ -0,0 +1,220 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MIT License +# +# Copyright (c) 2020 Jungil Kong +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# The following functions/classes were based on code from https://github.com/jik876/hifi-gan: +# mel_spectrogram, MelDataset + +import math +import os + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn +from librosa.util import normalize +from numpy import random +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from common.audio_processing import dynamic_range_compression +from common.utils import load_filepaths_and_text, load_wav + +MAX_WAV_VALUE = 32768.0 + +mel_basis = {} +hann_window = {} + + +def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, + fmin, fmax, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global mel_basis, hann_window + fmax_key = f'{fmax}_{y.device}' + if fmax_key not in mel_basis: + mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel_basis[fmax_key] = torch.from_numpy(mel).float().to(y.device) + hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) + + pad = int((n_fft-hop_size)/2) + y = F.pad(y.unsqueeze(1), (pad, pad), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, + window=hann_window[str(y.device)], center=center, + pad_mode='reflect', normalized=False, onesided=True, + return_complex=True) + + spec = torch.view_as_real(spec) + spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) + spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) + spec = dynamic_range_compression(spec) # spectral normalize + return spec + + +class MelDataset(torch.utils.data.Dataset): + def __init__(self, training_files, segment_size, n_fft, num_mels, + hop_size, win_size, sampling_rate, fmin, fmax, split=True, + device=None, fmax_loss=None, fine_tuning=False, + base_mels_path=None, repeat=1, deterministic=False, + max_wav_value=MAX_WAV_VALUE): + + self.audio_files = training_files + self.segment_size = segment_size + self.sampling_rate = sampling_rate + self.split = split + self.n_fft = n_fft + self.num_mels = num_mels + self.hop_size = hop_size + self.win_size = win_size + self.fmin = fmin + self.fmax = fmax + self.fmax_loss = fmax_loss + self.max_wav_value = max_wav_value + self.fine_tuning = fine_tuning + self.base_mels_path = base_mels_path + self.repeat = repeat + self.deterministic = deterministic + self.rng = random.default_rng() + + def __getitem__(self, index): + if index >= len(self): + raise IndexError('Dataset index out of range') + rng = random.default_rng(index) if self.deterministic else self.rng + index = index % len(self.audio_files) # collapse **after** setting seed + filename = self.audio_files[index] + audio, sampling_rate = load_wav(filename) + audio = audio / self.max_wav_value + if not self.fine_tuning: + audio = normalize(audio) * 0.95 + if sampling_rate != self.sampling_rate: + raise ValueError("{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate)) + + audio = torch.FloatTensor(audio) + audio = audio.unsqueeze(0) + + if not self.fine_tuning: + if self.split: + if audio.size(1) >= self.segment_size: + max_audio_start = audio.size(1) - self.segment_size + audio_start = rng.integers(0, max_audio_start) + audio = audio[:, audio_start:audio_start+self.segment_size] + else: + audio = F.pad(audio, (0, self.segment_size - audio.size(1))) + + mel = mel_spectrogram(audio, 
self.n_fft, self.num_mels, + self.sampling_rate, self.hop_size, + self.win_size, self.fmin, self.fmax, + center=False) + else: + mel = np.load( + os.path.join(self.base_mels_path, + os.path.splitext(os.path.split(filename)[-1])[0] + '.npy')) + mel = torch.from_numpy(mel).float() + + if len(mel.shape) < 3: + mel = mel.unsqueeze(0) + + if self.split: + frames_per_seg = math.ceil(self.segment_size / self.hop_size) + + if audio.size(1) >= self.segment_size: + mel_start = rng.integers(0, mel.size(2) - frames_per_seg - 1) + mel = mel[:, :, mel_start:mel_start + frames_per_seg] + a = mel_start * self.hop_size + b = (mel_start + frames_per_seg) * self.hop_size + audio = audio[:, a:b] + else: + mel = F.pad(mel, (0, frames_per_seg - mel.size(2))) + audio = F.pad(audio, (0, self.segment_size - audio.size(1))) + + mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels, + self.sampling_rate, self.hop_size, + self.win_size, self.fmin, self.fmax_loss, + center=False) + return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) + + def __len__(self): + return len(self.audio_files) * self.repeat + + +def get_data_loader(args, distributed_run, train=True, batch_size=None, + val_kwargs=None): + + filelists = args.training_files if train else args.validation_files + files = load_filepaths_and_text(args.dataset_path, filelists) + files = list(zip(*files))[0] + + dataset_kw = { + 'segment_size': args.segment_size, + 'n_fft': args.filter_length, + 'num_mels': args.num_mels, + 'hop_size': args.hop_length, + 'win_size': args.win_length, + 'sampling_rate': args.sampling_rate, + 'fmin': args.mel_fmin, + 'fmax': args.mel_fmax, + 'fmax_loss': args.mel_fmax_loss, + 'max_wav_value': args.max_wav_value, + 'fine_tuning': args.fine_tuning, + 'base_mels_path': args.input_mels_dir, + 'deterministic': not train + } + + if train: + dataset = MelDataset(files, **dataset_kw) + sampler = DistributedSampler(dataset) if distributed_run else None + else: + dataset_kw.update(val_kwargs or {}) + dataset = MelDataset(files, **dataset_kw) + sampler = (DistributedSampler(dataset, shuffle=False) + if distributed_run else None) + + loader = DataLoader(dataset, + # NOTE On DGX-1 and DGX A100 =1 is optimal + num_workers=args.num_workers if train else 1, + shuffle=(train and not distributed_run), + sampler=sampler, + batch_size=batch_size or args.batch_size, + pin_memory=True, + persistent_workers=True, + drop_last=train) + return loader diff --git a/hifigan/denoiser.py b/hifigan/denoiser.py new file mode 100644 index 0000000000000000000000000000000000000000..fd407c14725cd652d11d1d0771a5684af764af07 --- /dev/null +++ b/hifigan/denoiser.py @@ -0,0 +1,38 @@ +import torch +from .stft import STFT + + +class Denoiser(torch.nn.Module): + """ Removes model bias from audio produced with hifigan """ + + def __init__(self, hifigan, filter_length=1024, n_overlap=4, + win_length=1024, mode='zeros'): + super(Denoiser, self).__init__() + self.stft = STFT(filter_length=filter_length, + hop_length=int(filter_length/n_overlap), + win_length=win_length).cuda() + if mode == 'zeros': + mel_input = torch.zeros( + (1, 80, 88), + dtype=hifigan.ups[0].weight.dtype, + device=hifigan.ups[0].weight.device) + elif mode == 'normal': + mel_input = torch.randn( + (1, 80, 88), + dtype=hifigan.upsample.weight.dtype, + device=hifigan.upsample.weight.device) + else: + raise Exception("Mode {} if not supported".format(mode)) + + with torch.no_grad(): + bias_audio = hifigan(mel_input).float()[0] + bias_spec, _ = self.stft.transform(bias_audio) + + 
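+        # Keep only the first analysis frame of the bias magnitude spectrum as a
+        # per-frequency-bin bias template; register_buffer stores it on the module
+        # so it follows .to()/.cuda() and is saved in the state dict without
+        # becoming a trainable parameter.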
self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) + + def forward(self, audio, strength=0.1): + audio_spec, audio_angles = self.stft.transform(audio.cuda().float()) + audio_spec_denoised = audio_spec - self.bias_spec * strength + audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) + audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) + return audio_denoised diff --git a/hifigan/logging.py b/hifigan/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..763b45aa6f062d9f295403475ae7f6ce81fbd143 --- /dev/null +++ b/hifigan/logging.py @@ -0,0 +1,231 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from collections import OrderedDict +from copy import copy +from pathlib import Path + +import dllogger +import numpy as np +import torch.distributed as dist +import torch +from dllogger import StdOutBackend, JSONStreamBackend, Verbosity + +from common import tb_dllogger +from common.tb_dllogger import (stdout_metric_format, stdout_step_format, + unique_log_fpath, TBLogger) + + +def init_logger(output_dir, log_file, ema_decay=0.0): + + local_rank = 0 if not dist.is_initialized() else dist.get_rank() + + print('logger init', local_rank) + + if local_rank == 0: + Path(output_dir).mkdir(parents=False, exist_ok=True) + log_fpath = log_file or Path(output_dir, 'nvlog.json') + + dllogger.init(backends=[ + JSONStreamBackend(Verbosity.DEFAULT, unique_log_fpath(log_fpath)), + StdOutBackend(Verbosity.VERBOSE, step_format=stdout_step_format, + metric_format=stdout_metric_format)]) + + init_train_metadata() + else: + dllogger.init(backends=[]) + + tb_train = ['train'] + tb_val = ['val'] + tb_ema = [k + '_ema' for k in tb_val] if ema_decay > 0.0 else [] + + tb_dllogger.tb_loggers = { + s: TBLogger(enabled=(local_rank == 0), log_dir=output_dir, name=s) + for s in tb_train + tb_val + tb_ema} + + +def init_train_metadata(): + + dllogger.metadata("train_lrate_gen", + {"name": "g lr", "unit": None, "format": ":>3.2e"}) + dllogger.metadata("train_lrate_discrim", + {"name": "d lr", "unit": None, "format": ":>3.2e"}) + dllogger.metadata("train_avg_lrate_gen", + {"name": "avg g lr", "unit": None, "format": ":>3.2e"}) + dllogger.metadata("train_avg_lrate_discrim", + {"name": "avg d lr", "unit": None, "format": ":>3.2e"}) + + for id_, pref in [('train', ''), ('train_avg', 'avg train '), + ('val', ' avg val '), ('val_ema', ' EMA val ')]: + + dllogger.metadata(f"{id_}_loss_gen", + {"name": f"{pref}g loss", "unit": None, "format": ":>6.3f"}) + dllogger.metadata(f"{id_}_loss_discrim", + {"name": f"{pref}d loss", "unit": None, "format": ":>6.3f"}) + dllogger.metadata(f"{id_}_loss_mel", + {"name": f"{pref}mel loss", "unit": None, "format": ":>6.3f"}) + + dllogger.metadata(f"{id_}_frames/s", + {"name": None, "unit": "frames/s", "format": ":>8.2f"}) + dllogger.metadata(f"{id_}_took", + {"name": "took", "unit": "s", "format": ":>3.2f"}) + + +def init_infer_metadata(): + raise 
NotImplementedError + + # modalities = [('latency', 's', ':>10.5f'), ('RTF', 'x', ':>10.2f'), + # ('frames/s', None, ':>10.2f'), ('samples/s', None, ':>10.2f'), + # ('letters/s', None, ':>10.2f')] + + # for perc in ['', 'avg', '90%', '95%', '99%']: + # for model in ['fastpitch', 'waveglow', '']: + # for mod, unit, format in modalities: + + # name = f'{perc} {model} {mod}'.strip().replace(' ', ' ') + + # dllogger.metadata( + # name.replace(' ', '_'), + # {'name': f'{name: <26}', 'unit': unit, 'format': format}) + + +class defaultdict(OrderedDict): + """A simple, ordered defaultdict.""" + + def __init__(self, type_, *args, **kwargs): + self.type_ = type_ + super().__init__(*args, **kwargs) + + def __getitem__(self, key): + if key not in self: + self.__setitem__(key, self.type_()) + return super().__getitem__(key) + + def __copy__(self): + return defaultdict(self.type_, self) + + +class Metrics(dict): + + def __init__(self, scopes=['train', 'train_avg'], + dll_keys=['loss_gen', 'loss_discrim', 'loss_mel', + 'frames/s', 'took', 'lrate_gen', 'lrate_discrim'], + benchmark_epochs=0): + super().__init__() + + self.dll_keys = dll_keys + self.metrics = {scope: defaultdict(float) for scope in scopes} + self.metric_counts = {scope: defaultdict(int) for scope in scopes} + self.start_time = {scope: None for scope in scopes} + self.benchmark_epochs = benchmark_epochs + if benchmark_epochs > 0: + self.metrics['train_benchmark'] = defaultdict(list) + + def __setitem__(self, key, val): + extract = lambda t: t.item() if type(t) is torch.Tensor else t + + if type(val) is dict: + for k, v in val.items(): + super().__setitem__(k, extract(v)) + else: + super().__setitem__(key, extract(val)) + + def __getitem__(self, key): + if key not in self: + self.__setitem__(key, 0.0) + return super().__getitem__(key) + + def start_accumulating(self, step, start_timer=True, scope='train'): + del step # unused + self.clear() + self.metrics[scope].clear() + self.metric_counts[scope].clear() + if start_timer: + self.start_time[scope] = time.time() + + def accumulate(self, scopes=['train', 'train_avg']): + for scope in scopes: + for k, v in self.items(): + self.metrics[scope][k] += v + self.metric_counts[scope][k] += 1 + + self.clear() + + def finish_accumulating(self, stop_timer=True, scope='train'): + + metr = self.metrics[scope] + counts = self.metric_counts[scope] + + for k, v in metr.items(): + metr[k] = v / counts[k] + + if stop_timer: + took = time.time() - self.start_time[scope] + if 'frames' in metr: + metr['frames/s'] = metr.pop('frames') * counts['frames'] / took + metr['took'] = took + + def start_iter(self, iter, start_timer=True): + self.start_accumulating(iter, start_timer, 'train') + + def start_epoch(self, epoch, start_timer=True): + self.start_accumulating(epoch, start_timer, 'train_avg') + + def start_val(self, start_timer=True): + self.start_accumulating(None, start_timer, 'val') + + def finish_iter(self, stop_timer=True): + self.finish_accumulating(stop_timer, 'train') + + def finish_epoch(self, stop_timer=True): + self.finish_accumulating(stop_timer, 'train_avg') + + metr = self.metrics['train_benchmark'] + for k in ('took', 'frames/s', 'loss_gen', 'loss_discrim', 'loss_mel'): + metr[k].append(self.metrics['train_avg'][k]) + + if len(metr[k]) > self.benchmark_epochs: + metr[k].pop(0) + + def finish_val(self, stop_timer=True): + self.finish_accumulating(stop_timer, 'val') + + def get_metrics(self, scope='train', target='dll'): + + if scope == 'train_benchmark': + metr = self.metrics[scope] + ret = {'train_' 
+ k: np.mean(v) for k, v in metr.items()} + ret['benchmark_epochs_num'] = len(list(metr.values())[0]) + return ret + + ret = copy(self.metrics[scope]) + + if scope == 'train': + ret.update(self) + + if target == 'dll': + ret = {f'{scope}_{k}': v + for k, v in ret.items() if k in self.dll_keys} + + elif target == 'tb': + # Rename keys so they would group nicely inside TensorBoard + + def split_key(k): + pos = k.rfind('_') + return k[:pos] + '/' + k[pos+1:] if pos >= 0 else k + + ret = {split_key(k): v for k, v in ret.items()} + + return ret diff --git a/hifigan/metrics.py b/hifigan/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..da053b478413b6006717ec0766aed389b2871e59 --- /dev/null +++ b/hifigan/metrics.py @@ -0,0 +1,151 @@ +import timer +from collections import defaultdict + + +class Metrics(defaultdict): + + # TODO Where to measure - gpu:0 or all gpus? + + def __init__(self, tb_keys=[], benchmark_epochs=10): + super().__init__(float) + + # dll_tb_keys=['loss_gen', 'loss_discrim', 'loss_mel', 'took']: + + self.tb_keys = tb_keys #_ = {'dll': dll_keys, 'tb': tb_keys, 'dll+tb': dll_tb_keys} + self.iter_start_time = None + self.iter_metrics = defaultdict(float) + self.epoch_start_time = None + self.epoch_metrics = defaultdict(float) + self.benchmark_epochs = benchmark_epochs + + def start_epoch(self, epoch, start_timer=True): + self.epoch = epoch + if start_timer: + self.epoch_start_time = time.time() + + def start_iter(self, iter, start_timer=True): + self.iter = iter + self.accum_steps = 0 + self.step_metrics.clear() + if start_timer: + self.iter_start_time = time.time() + + def update_iter(self, ...): + # do stuff + pass + + def accumulate(self, scope='step'): + tgt = {'step': self.step_metrics, 'epoch': self.epoch_metrics}[scope] + + for k, v in self.items(): + tgt[k] += v + + self.clear() + + def update_iter(self, metrics={}, stop_timer=True): + + is not self.started_iter: + return + + self.accumulate(metrics) + self.accumulate(self.iter_metrics, scope='epoch') + + if stop_timer: + self.iter_metrics['took'] = time.time() - self.iter_time_start + + def update_epoch(self, stop_timer=True): + + # tb_total_steps=None, + # subset='train_avg', + # data=OrderedDict([ + # ('loss', epoch_loss[-1]), + # ('mel_loss', epoch_mel_loss[-1]), + # ('frames/s', epoch_num_frames[-1] / epoch_time[-1]), + # ('took', epoch_time[-1])]), + # ) + + if stop_timer: + self.['epoch_time'] = time.time() - self.epoch_time_start + + + if steps % args.stdout_interval == 0: + # with torch.no_grad(): + # mel_error = F.l1_loss(y_mel, y_g_hat_mel).item() + + took = time.time() - self.start_b + + + self.sws['train'].add_scalar("gen_loss_total", loss_gen_all.item(), steps) + self.sws['train'].add_scalar("mel_spec_error", mel_error.item(), steps) + + for key, val in meta.items(): + + sw_name = 'train' + for name_ in keys_mpd + keys_msd: + if name_ in key: + sw_name = 'train_' + name_ + + key = key.replace('loss_', 'loss/') + key = re.sub('mpd\d+', 'mpd-msd', key) + key = re.sub('msd\d+', 'mpd-msd', key) + + self.sws[sw_name].add_scalar(key, val / h.batch_size, steps) + + def iter_metrics(self, target='dll+tb'): + return {self.iter_metrics[k] for k in self.keys_[target]} + + def foo + +Steps : 40, Gen Loss Total : 57.993, Mel-Spec. 
Error : 47.374, s/b : 1.013 + + logger.log((epoch, epoch_iter, num_iters), + tb_total_steps=total_iter, + subset='train', + data=OrderedDict([ + ('loss', iter_loss), + ('mel_loss', iter_mel_loss), + ('frames/s', iter_num_frames / iter_time), + ('took', iter_time), + ('lrate', optimizer.param_groups[0]['lr'])]), + ) + + + +class Meter: + def __init__(self, sink_type, scope, downstream=None, end_points=None, verbosity=dllogger.Verbosity.DEFAULT): + self.verbosity = verbosity + self.sink_type = sink_type + self.scope = scope + self.downstream = downstream + + self.end_points = end_points or [] + + def start(self): + ds = None if self.downstream is None else self.downstream.sink + end_pt_fn = lambda x: list(map(lambda f: f(x), self.end_points)) # call all endpoint functions + self.sink = self.sink_type(end_pt_fn, ds) + + def end(self): + self.sink.close() + + def send(self, data): + self.sink.send(data) + + def meters(self): + if self.downstream is not None: + downstream_meters = self.downstream.meters() + else: + downstream_meters = [] + return [self] + downstream_meters + + def add_end_point(self, new_endpoint): + self.end_points.append(new_endpoint) + + def __or__(self, other): + """for easy chaining of meters""" + if self.downstream is None: + self.downstream = other + else: + self.downstream | other + + return self diff --git a/hifigan/models.py b/hifigan/models.py new file mode 100644 index 0000000000000000000000000000000000000000..238e978915b47a57d9a3f8757dc6726c19ee1679 --- /dev/null +++ b/hifigan/models.py @@ -0,0 +1,457 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MIT License +# +# Copyright (c) 2020 Jungil Kong +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
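A minimal inference sketch for the Generator defined in this file, useful as a shape check. The config below uses the standard HiFi-GAN "v1" hyper-parameters as an assumption; in this repo the actual values come from the vocoder checkpoint's saved config (see models.py), and the randomly initialised weights here produce noise, not speech:

import torch
from hifigan.models import Generator

# Standard HiFi-GAN "v1" settings (assumed for illustration; the real values
# are stored in the checkpoint config).
config = dict(
    upsample_rates=[8, 8, 2, 2],                 # total upsampling 8*8*2*2 = 256
    upsample_kernel_sizes=[16, 16, 4, 4],
    upsample_initial_channel=512,
    resblock='1',
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
)
gen = Generator(config).eval()
gen.remove_weight_norm()                         # fold weight norm for inference
mel = torch.randn(1, 80, 200)                    # (batch, n_mel_channels, frames)
with torch.no_grad():
    audio = gen(mel)                             # (1, 1, 200 * 256) samples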
+ +# The following functions/classes were based on code from https://github.com/jik876/hifi-gan: +# ResBlock1, ResBlock2, Generator, DiscriminatorP, DiscriminatorS, MultiScaleDiscriminator, +# MultiPeriodDiscriminator, feature_loss, discriminator_loss, generator_loss, +# init_weights, get_padding + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from common.stft import STFT +from common.utils import AttrDict, init_weights, get_padding + +LRELU_SLOPE = 0.1 + + +class NoAMPConv1d(Conv1d): + def __init__(self, *args, no_amp=False, **kwargs): + super().__init__(*args, **kwargs) + self.no_amp = no_amp + + def _cast(self, x, dtype): + if isinstance(x, (list, tuple)): + return [self._cast(t, dtype) for t in x] + else: + return x.to(dtype) + + def forward(self, *args): + if not self.no_amp: + return super().forward(*args) + + with torch.cuda.amp.autocast(enabled=False): + return self._cast( + super().forward(*self._cast(args, torch.float)), args[0].dtype) + + +class ResBlock1(nn.Module): + __constants__ = ['lrelu_slope'] + + def __init__(self, conf, channels, kernel_size=3, dilation=(1, 3, 5)): + super().__init__() + self.conf = conf + self.lrelu_slope = LRELU_SLOPE + + ch, ks = channels, kernel_size + self.convs1 = nn.Sequential(*[ + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, dilation[0]), dilation[0])), + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, dilation[1]), dilation[1])), + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, dilation[2]), dilation[2])), + ]) + + self.convs2 = nn.Sequential(*[ + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, 1))), + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, 1))), + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, 1))), + ]) + self.convs1.apply(init_weights) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, self.lrelu_slope) + xt = c1(xt) + xt = F.leaky_relu(xt, self.lrelu_slope) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(nn.Module): + __constants__ = ['lrelu_slope'] + + def __init__(self, conf, channels, kernel_size=3, dilation=(1, 3)): + super().__init__() + self.conf = conf + + ch, ks = channels, kernel_size + self.convs = nn.ModuleList([ + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(kernel_size, dilation[0]), dilation[0])), + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(kernel_size, dilation[1]), dilation[1])), + ]) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, self.lrelu_slope) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Generator(nn.Module): + __constants__ = ['lrelu_slope', 'num_kernels', 'num_upsamples'] + + def __init__(self, conf): + super().__init__() + conf = AttrDict(conf) + self.conf = conf + self.num_kernels = len(conf.resblock_kernel_sizes) + self.num_upsamples = len(conf.upsample_rates) + + self.conv_pre = weight_norm( + Conv1d(80, conf.upsample_initial_channel, 7, 1, padding=3)) + + self.lrelu_slope = LRELU_SLOPE + + resblock = ResBlock1 if conf.resblock == '1' else ResBlock2 + + self.ups = [] + for i, (u, k) in enumerate(zip(conf.upsample_rates, + conf.upsample_kernel_sizes)): + 
self.ups.append(weight_norm( + ConvTranspose1d(conf.upsample_initial_channel // (2 ** i), + conf.upsample_initial_channel // (2 ** (i + 1)), + k, u, padding=(k-u)//2))) + + self.ups = nn.Sequential(*self.ups) + + self.resblocks = [] + for i in range(len(self.ups)): + resblock_list = [] + + ch = conf.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate(zip(conf.resblock_kernel_sizes, + conf.resblock_dilation_sizes)): + resblock_list.append(resblock(conf, ch, k, d)) + resblock_list = nn.Sequential(*resblock_list) + self.resblocks.append(resblock_list) + self.resblocks = nn.Sequential(*self.resblocks) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def load_state_dict(self, state_dict, strict=True): + # Fallback for old checkpoints (pre-ONNX fix) + new_sd = {} + for k, v in state_dict.items(): + new_k = k + if 'resblocks' in k: + parts = k.split(".") + # only do this is the checkpoint type is older + if len(parts) == 5: + layer = int(parts[1]) + new_layer = f"{layer//3}.{layer%3}" + new_k = f"resblocks.{new_layer}.{'.'.join(parts[2:])}" + new_sd[new_k] = v + + # Fix for conv1d/conv2d/NHWC + curr_sd = self.state_dict() + for key in new_sd: + len_diff = len(new_sd[key].size()) - len(curr_sd[key].size()) + if len_diff == -1: + new_sd[key] = new_sd[key].unsqueeze(-1) + elif len_diff == 1: + new_sd[key] = new_sd[key].squeeze(-1) + + super().load_state_dict(new_sd, strict=strict) + + def forward(self, x): + x = self.conv_pre(x) + + for upsample_layer, resblock_group in zip(self.ups, self.resblocks): + x = F.leaky_relu(x, self.lrelu_slope) + x = upsample_layer(x) + xs = 0 + for resblock in resblock_group: + xs += resblock(x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + print('HiFi-GAN: Removing weight norm.') + for l in self.ups: + remove_weight_norm(l) + for group in self.resblocks: + for block in group: + block.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class Denoiser(nn.Module): + """ Removes model bias from audio produced with hifigan """ + + def __init__(self, hifigan, filter_length=1024, n_overlap=4, + win_length=1024, mode='zeros', device="cpu", **infer_kw): + super().__init__() + self.stft = STFT(filter_length=filter_length, + hop_length=int(filter_length/n_overlap), + #win_length=win_length).cuda() # was like this + win_length=win_length, device=device) + + for name, p in hifigan.named_parameters(): + if name.endswith('.weight'): + dtype = p.dtype + device = p.device + break + + mel_init = {'zeros': torch.zeros, 'normal': torch.randn}[mode] + mel_input = mel_init((1, 80, 88), dtype=dtype, device=device) + + with torch.no_grad(): + bias_audio = hifigan(mel_input, **infer_kw).float() + if len(bias_audio.size()) > 2: + bias_audio = bias_audio.squeeze(0) + elif len(bias_audio.size()) < 2: + bias_audio = bias_audio.unsqueeze(0) + assert len(bias_audio.size()) == 2 + + bias_spec, _ = self.stft.transform(bias_audio) + + self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) + + def forward(self, audio, strength=0.1): + audio_spec, audio_angles = self.stft.transform(audio.float()) + audio_spec_denoised = audio_spec - self.bias_spec * strength + audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) + audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) + return audio_denoised + + +class 
DiscriminatorP(nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super().__init__() + self.period = period + norm_f = spectral_norm if use_spectral_norm else weight_norm + + ks = kernel_size + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (ks, 1), (stride, 1), (get_padding(5, 1), 0))), + norm_f(Conv2d(32, 128, (ks, 1), (stride, 1), (get_padding(5, 1), 0))), + norm_f(Conv2d(128, 512, (ks, 1), (stride, 1), (get_padding(5, 1), 0))), + norm_f(Conv2d(512, 1024, (ks, 1), (stride, 1), (get_padding(5, 1), 0))), + norm_f(Conv2d(1024, 1024, (ks, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + def share_params_of(self, dp): + assert len(self.convs) == len(dp.convs) + for c1, c2 in zip(self.convs, dp.convs): + c1.weight = c2.weight + c1.bias = c2.bias + + +class MultiPeriodDiscriminator(nn.Module): + def __init__(self, periods, concat_fwd=False): + super().__init__() + layers = [DiscriminatorP(p) for p in periods] + self.discriminators = nn.ModuleList(layers) + self.concat_fwd = concat_fwd + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if self.concat_fwd: + y_ds, fmaps = d(concat_discr_input(y, y_hat)) + y_d_r, y_d_g, fmap_r, fmap_g = split_discr_output(y_ds, fmaps) + else: + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(nn.Module): + def __init__(self, use_spectral_norm=False, no_amp_grouped_conv=False): + super().__init__() + norm_f = spectral_norm if use_spectral_norm else weight_norm + self.convs = nn.ModuleList([ + norm_f(Conv1d(1, 128, 15, 1, padding=7)), + norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), + norm_f(NoAMPConv1d(128, 256, 41, 2, groups=16, padding=20, no_amp=no_amp_grouped_conv)), + norm_f(NoAMPConv1d(256, 512, 41, 4, groups=16, padding=20, no_amp=no_amp_grouped_conv)), + norm_f(NoAMPConv1d(512, 1024, 41, 4, groups=16, padding=20, no_amp=no_amp_grouped_conv)), + norm_f(NoAMPConv1d(1024, 1024, 41, 1, groups=16, padding=20, no_amp=no_amp_grouped_conv)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + for l in self.convs: + # x = l(x.unsqueeze(-1)).squeeze(-1) + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + return x, fmap + + +class MultiScaleDiscriminator(nn.Module): + def __init__(self, no_amp_grouped_conv=False, concat_fwd=False): + super().__init__() + self.discriminators = nn.ModuleList([ + DiscriminatorS(use_spectral_norm=True, no_amp_grouped_conv=no_amp_grouped_conv), + DiscriminatorS(no_amp_grouped_conv=no_amp_grouped_conv), + DiscriminatorS(no_amp_grouped_conv=no_amp_grouped_conv), + ]) + self.meanpools = nn.ModuleList([ + AvgPool1d(4, 2, padding=1), + AvgPool1d(4, 2, padding=1) + ]) + 
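+ # Scale 0 (spectral norm) sees the raw waveform; the two AvgPool1d layers
+ # above feed 2x- and 4x-downsampled audio to the remaining discriminators.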
self.concat_fwd = concat_fwd + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if self.concat_fwd: + ys = concat_discr_input(y, y_hat) + if i != 0: + ys = self.meanpools[i-1](ys) + y_ds, fmaps = d(ys) + y_d_r, y_d_g, fmap_r, fmap_g = split_discr_output(y_ds, fmaps) + else: + if i != 0: + y = self.meanpools[i-1](y) + y_hat = self.meanpools[i-1](y_hat) + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def concat_discr_input(y, y_hat): + return torch.cat((y, y_hat), dim=0) + + +def split_discr_output(y_ds, fmaps): + y_d_r, y_d_g = torch.chunk(y_ds, 2, dim=0) + fmap_r, fmap_g = zip(*(torch.chunk(f, 2, dim=0) for f in fmaps)) + return y_d_r, y_d_g, fmap_r, fmap_g + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss*2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1-dr)**2) + g_loss = torch.mean(dg**2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + + for dg in disc_outputs: + l = torch.mean((1-dg)**2) + gen_losses.append(l) + loss += l + + return loss, gen_losses diff --git a/hifigan/models_ch_last_.py b/hifigan/models_ch_last_.py new file mode 100644 index 0000000000000000000000000000000000000000..1c8f7c6ca22dd2c6efc24afd3458311bfbc10a2e --- /dev/null +++ b/hifigan/models_ch_last_.py @@ -0,0 +1,378 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d, ConvTranspose2d, AvgPool2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm + +from common.utils import init_weights, get_padding, print_once + +LRELU_SLOPE = 0.1 + + +class ResBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.ModuleList([ + weight_norm(Conv2d(channels, channels, (kernel_size, 1), 1, dilation=(dilation[0], 1), + padding=(get_padding(kernel_size, dilation[0]), 0))), + weight_norm(Conv2d(channels, channels, (kernel_size, 1), 1, dilation=(dilation[1], 1), + padding=(get_padding(kernel_size, dilation[1]), 0))), + weight_norm(Conv2d(channels, channels, (kernel_size, 1), 1, dilation=(dilation[2], 1), + padding=(get_padding(kernel_size, dilation[2]), 0))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv2d(channels, channels, (kernel_size, 1), 1, dilation=1, + padding=(get_padding(kernel_size, 1), 0))), + weight_norm(Conv2d(channels, channels, (kernel_size, 1), 1, dilation=1, + padding=(get_padding(kernel_size, 1), 0))), + weight_norm(Conv2d(channels, channels, (kernel_size, 1), 1, dilation=1, + padding=(get_padding(kernel_size, 1), 0))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in 
self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.h = h + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Generator(torch.nn.Module): + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + self.conv_pre = weight_norm(Conv2d(80, h.upsample_initial_channel, (7,1), (1,1), padding=(3,0))) + assert h.resblock == '1', 'Only ResBlock1 currently supported for NHWC' + resblock = ResBlock1 if h.resblock == '1' else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append(weight_norm( + # ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), + # k, u, padding=(k-u)//2))) + ConvTranspose2d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), + (k, 1), (u, 1), padding=((k-u)//2, 0)))) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel//(2**(i+1)) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = weight_norm(Conv2d(ch, 1, (7,1), (1,1), padding=(3,0))) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + x = x.unsqueeze(-1).to(memory_format=torch.channels_last) + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + # x = self.ups[i](x.unsqueeze(-1)).squeeze(-1) + x = self.ups[i](x) + xs = 0 + for j in range(self.num_kernels): + xs += self.resblocks[i*self.num_kernels+j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + x = x.squeeze(-1) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + 
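+ # Channels-last variant: x arrives as (batch, 1, samples, 1) so that the
+ # (kernel_size, 1) Conv2d kernels can emulate the original Conv1d layers.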
+ # 1d to 2d + b, c, t, unit = x.shape + assert unit == 1 + + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, 0, 0, n_pad), "reflect") + t = t + n_pad + # print_once('x pre channels last:', x.is_contiguous(memory_format=torch.channels_last)) + x = x.view(b, c, t // self.period, self.period) + # print_once('x post channels last:', x.is_contiguous(memory_format=torch.channels_last)) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + # x = torch.flatten(x, 1, -1) + + return x, fmap + + def share_params_of(self, dp): + assert len(self.convs) == len(dp.convs) + for c1, c2 in zip(self.convs, dp.convs): + c1.weight = c2.weight + c1.bias = c2.bias + + +class DiscriminatorPConv1d(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorPConv1d, self).__init__() + self.period = period + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0), dilation=(period, 1))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0), dilation=(period, 1))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0), dilation=(period, 1))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0), dilation=(period, 1))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0), dilation=(period, 1))), + ]) + # self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1, dilation=period)) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0), dilation=(period, 1))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t, unit = x.shape + assert unit == 1 + # if t % self.period != 0: # pad first + # n_pad = self.period - (t % self.period) + # x = F.pad(x, (0, n_pad), "reflect") + # t = t + n_pad + # x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + def share_params_of(self, dp): + assert len(self.convs) == len(dp.convs) + for c1, c2 in zip(self.convs, dp.convs): + c1.weight = c2.weight + c1.bias = c2.bias + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, periods, use_conv1d=False, shared=False): + super(MultiPeriodDiscriminator, self).__init__() + print('MPD PERIODS:', periods) + if use_conv1d: + print('Constructing dilated MPD') + layers = [DiscriminatorPConv1d(p) for p in periods] + else: + layers = [DiscriminatorP(p) for p in periods] + + if shared: + print('MPD HAS SHARED PARAMS') + for l in layers[1:]: + l.share_params_of(layers[0]) + + self.discriminators = nn.ModuleList(layers) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False, amp_groups=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + # self.convs = nn.ModuleList([ + # norm_f(Conv1d(1, 128, 
15, 1, padding=7)), + # norm_f(Conv1d(128, 128, 41, 2, groups=1 if amp_groups else 4, padding=20)), # was: groups=4 + # norm_f(Conv1d(128, 256, 41, 2, groups=1 if amp_groups else 16, padding=20)), # was: groups=16 + # norm_f(Conv1d(256, 512, 41, 4, groups=1 if amp_groups else 16, padding=20)), # was: groups=16 + # norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), + # norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), + # norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + # ]) + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 128, (15,1), (1,1), padding=(7 , 0))), + norm_f(Conv2d(128, 128, (41,1), (2,1), groups=1 if amp_groups else 4, padding=(20, 0))), # was: groups=4 + norm_f(Conv2d(128, 256, (41,1), (2,1), groups=1 if amp_groups else 16, padding=(20, 0))), # was: groups=16 + norm_f(Conv2d(256, 512, (41,1), (4,1), groups=1 if amp_groups else 16, padding=(20, 0))), # was: groups=16 + norm_f(Conv2d(512, 1024, (41,1), (4,1), groups=16 , padding=(20, 0))), + norm_f(Conv2d(1024, 1024, (41,1), (1,1), groups=16 , padding=(20, 0))), + norm_f(Conv2d(1024, 1024, ( 5,1), (1,1), padding=(2 , 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3,1), (1,1), padding=(1,0))) + + def forward(self, x): + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + # x = x.squeeze(-1) + # x = torch.flatten(x, 1, -1) + return x, fmap + + +class MultiScaleDiscriminator(torch.nn.Module): + def __init__(self, amp_groups=False): + super(MultiScaleDiscriminator, self).__init__() + if amp_groups: + print('MSD: AMP groups') + self.discriminators = nn.ModuleList([ + DiscriminatorS(use_spectral_norm=True, amp_groups=amp_groups), + DiscriminatorS(amp_groups=amp_groups), + DiscriminatorS(amp_groups=amp_groups), + ]) + self.meanpools = nn.ModuleList([ + AvgPool2d((4, 1), (2, 1), padding=(1, 0)), + AvgPool2d((4, 1), (2, 1), padding=(1, 0)) + ]) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if i != 0: + y = self.meanpools[i-1](y) + y_hat = self.meanpools[i-1](y_hat) + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g, keys=[]): + loss = 0 + meta = {} + assert len(keys) == len(fmap_r) + + for key, dr, dg in zip(keys, fmap_r, fmap_g): + + k = 'loss_gen_feat_' + key + meta[k] = 0 + + for rl, gl in zip(dr, dg): + # loss += torch.mean(torch.abs(rl - gl)) + diff = torch.mean(torch.abs(rl - gl)) + loss += diff + meta[k] += diff.item() + + return loss*2, meta + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs, keys=[]): + loss = 0 + r_losses = [] + g_losses = [] + meta = {} + assert len(keys) == len(disc_real_outputs) + + for key, dr, dg in zip(keys, disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1-dr)**2) + g_loss = torch.mean(dg**2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + meta['loss_disc_real_' + key] = r_loss.item() + meta['loss_disc_gen_' + key] = g_loss.item() + + return loss, r_losses, g_losses, meta + + +def generator_loss(disc_outputs, keys=[]): + loss = 0 + gen_losses = [] + meta = {} + assert len(keys) == len(disc_outputs) + + for key, dg in zip(keys, disc_outputs): + l = torch.mean((1-dg)**2) + gen_losses.append(l) + loss += l + meta['loss_gen_' + key] = l.item() + + 
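+ # Least-squares GAN generator term: mean((1 - D(G(s)))^2) per discriminator,
+ # summed into `loss`; per-discriminator values are also reported via `meta`.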
return loss, gen_losses, meta + diff --git a/models.py b/models.py new file mode 100644 index 0000000000000000000000000000000000000000..79ec744528259ec588ef27d2902a611668103848 --- /dev/null +++ b/models.py @@ -0,0 +1,359 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import json +import re +import sys + +import torch + +from common.text.symbols import get_symbols, get_pad_idx +from common.utils import DefaultAttrDict, AttrDict +from fastpitch.model import FastPitch +from fastpitch.model_jit import FastPitchJIT +from hifigan.models import Generator + +try: + from waveglow.model import WaveGlow + from waveglow import model as glow + from waveglow.denoiser import Denoiser + sys.modules['glow'] = glow +except ImportError: + print("WARNING: Couldn't import WaveGlow") + + +def parse_model_args(model_name, parser, add_help=False): + if model_name == 'FastPitch': + from fastpitch import arg_parser + return arg_parser.parse_fastpitch_args(parser, add_help) + + elif model_name == 'HiFi-GAN': + from hifigan import arg_parser + return arg_parser.parse_hifigan_args(parser, add_help) + + elif model_name == 'WaveGlow': + from waveglow.arg_parser import parse_waveglow_args + return parse_waveglow_args(parser, add_help) + + else: + raise NotImplementedError(model_name) + + +def get_model(model_name, model_config, device, bn_uniform_init=False, + forward_is_infer=False, jitable=False): + """Chooses a model based on name""" + del bn_uniform_init # unused (old name: uniform_initialize_bn_weight) + + if model_name == 'FastPitch': + if jitable: + model = FastPitchJIT(**model_config) + else: + model = FastPitch(**model_config) + + elif model_name == 'HiFi-GAN': + model = Generator(model_config) + + elif model_name == 'WaveGlow': + model = WaveGlow(**model_config) + + else: + raise NotImplementedError(model_name) + + if forward_is_infer and hasattr(model, 'infer'): + model.forward = model.infer + + return model.to(device) + + +def get_model_config(model_name, args, ckpt_config=None): + """ Get config needed to instantiate the model """ + + # Mark keys missing in `args` with an object (None is ambiguous) + _missing = object() + args = DefaultAttrDict(lambda: _missing, vars(args)) + + # `ckpt_config` is loaded from the checkpoint and has the priority + # `model_config` is based on args and fills empty slots in `ckpt_config` + if model_name == 'FastPitch': + print(get_symbols(args.symbol_set)) ############################ + model_config = dict( + # io + n_mel_channels=args.n_mel_channels, + # symbols + n_symbols=(len(get_symbols(args.symbol_set)) + if args.symbol_set is not _missing else _missing), + padding_idx=(get_pad_idx(args.symbol_set) + if args.symbol_set is not _missing else _missing), + symbols_embedding_dim=args.symbols_embedding_dim, + # input FFT + in_fft_n_layers=args.in_fft_n_layers, + in_fft_n_heads=args.in_fft_n_heads, + in_fft_d_head=args.in_fft_d_head, + in_fft_conv1d_kernel_size=args.in_fft_conv1d_kernel_size, 
+ in_fft_conv1d_filter_size=args.in_fft_conv1d_filter_size, + in_fft_output_size=args.in_fft_output_size, + p_in_fft_dropout=args.p_in_fft_dropout, + p_in_fft_dropatt=args.p_in_fft_dropatt, + p_in_fft_dropemb=args.p_in_fft_dropemb, + # output FFT + out_fft_n_layers=args.out_fft_n_layers, + out_fft_n_heads=args.out_fft_n_heads, + out_fft_d_head=args.out_fft_d_head, + out_fft_conv1d_kernel_size=args.out_fft_conv1d_kernel_size, + out_fft_conv1d_filter_size=args.out_fft_conv1d_filter_size, + out_fft_output_size=args.out_fft_output_size, + p_out_fft_dropout=args.p_out_fft_dropout, + p_out_fft_dropatt=args.p_out_fft_dropatt, + p_out_fft_dropemb=args.p_out_fft_dropemb, + # duration predictor + dur_predictor_kernel_size=args.dur_predictor_kernel_size, + dur_predictor_filter_size=args.dur_predictor_filter_size, + p_dur_predictor_dropout=args.p_dur_predictor_dropout, + dur_predictor_n_layers=args.dur_predictor_n_layers, + # pitch predictor + pitch_predictor_kernel_size=args.pitch_predictor_kernel_size, + pitch_predictor_filter_size=args.pitch_predictor_filter_size, + p_pitch_predictor_dropout=args.p_pitch_predictor_dropout, + pitch_predictor_n_layers=args.pitch_predictor_n_layers, + # pitch conditioning + pitch_embedding_kernel_size=args.pitch_embedding_kernel_size, + # speakers parameters + n_speakers=args.n_speakers, + speaker_emb_weight=args.speaker_emb_weight, + n_languages=args.n_languages, + # energy predictor + energy_predictor_kernel_size=args.energy_predictor_kernel_size, + energy_predictor_filter_size=args.energy_predictor_filter_size, + p_energy_predictor_dropout=args.p_energy_predictor_dropout, + energy_predictor_n_layers=args.energy_predictor_n_layers, + # energy conditioning + energy_conditioning=args.energy_conditioning, + energy_embedding_kernel_size=args.energy_embedding_kernel_size, + ) + elif model_name == 'HiFi-GAN': + if args.hifigan_config is not None: + assert ckpt_config is None, ( + "Supplied --hifigan-config, but the checkpoint has a config. 
" + "Drop the flag or remove the config from the checkpoint file.") + print(f'HiFi-GAN: Reading model config from {args.hifigan_config}') + with open(args.hifigan_config) as f: + args = AttrDict(json.load(f)) + + model_config = dict( + # generator architecture + upsample_rates=args.upsample_rates, + upsample_kernel_sizes=args.upsample_kernel_sizes, + upsample_initial_channel=args.upsample_initial_channel, + resblock=args.resblock, + resblock_kernel_sizes=args.resblock_kernel_sizes, + resblock_dilation_sizes=args.resblock_dilation_sizes, + ) + elif model_name == 'WaveGlow': + model_config = dict( + n_mel_channels=args.n_mel_channels, + n_flows=args.flows, + n_group=args.groups, + n_early_every=args.early_every, + n_early_size=args.early_size, + WN_config=dict( + n_layers=args.wn_layers, + kernel_size=args.wn_kernel_size, + n_channels=args.wn_channels + ) + ) + else: + raise NotImplementedError(model_name) + + # Start with ckpt_config, and fill missing keys from model_config + final_config = {} if ckpt_config is None else ckpt_config.copy() + missing_keys = set(model_config.keys()) - set(final_config.keys()) + final_config.update({k: model_config[k] for k in missing_keys}) + + # If there was a ckpt_config, it should have had all args + if ckpt_config is not None and len(missing_keys) > 0: + print(f'WARNING: Keys {missing_keys} missing from the loaded config; ' + 'using args instead.') + # NOTE: useful to debug the assertion error + #for k, v in final_config.items(): + # if v is _missing: + # print(k) + assert all(v is not _missing for v in final_config.values()) ########################################## + return final_config + + +def get_model_train_setup(model_name, args): + """ Dump train setup for documentation purposes """ + if model_name == 'FastPitch': + return dict() + elif model_name == 'HiFi-GAN': + return dict( + # audio + segment_size=args.segment_size, + filter_length=args.filter_length, + num_mels=args.num_mels, + hop_length=args.hop_length, + win_length=args.win_length, + sampling_rate=args.sampling_rate, + mel_fmin=args.mel_fmin, + mel_fmax=args.mel_fmax, + mel_fmax_loss=args.mel_fmax_loss, + max_wav_value=args.max_wav_value, + # other + seed=args.seed, + # optimization + base_lr=args.learning_rate, + lr_decay=args.lr_decay, + epochs_all=args.epochs, + ) + elif model_name == 'WaveGlow': + return dict() + else: + raise NotImplementedError(model_name) + + +def load_model_from_ckpt(checkpoint_data, model, key='state_dict'): + + if key is None: + return checkpoint_data['model'], None + + sd = checkpoint_data[key] + sd = {re.sub('^module\.', '', k): v for k, v in sd.items()} + status = model.load_state_dict(sd, strict=False) + return model, status + + +def load_and_setup_model(model_name, parser, checkpoint, amp, device, + unk_args=[], forward_is_infer=False, jitable=False): + if checkpoint is not None: + #ckpt_data = torch.load(checkpoint) + ckpt_data = torch.load(checkpoint, map_location=device) + print(f'{model_name}: Loading {checkpoint}...') + ckpt_config = ckpt_data.get('config') + if ckpt_config is None: + print(f'{model_name}: No model config in the checkpoint; using args.') + else: + print(f'{model_name}: Found model config saved in the checkpoint.') + else: + ckpt_config = None + ckpt_data = {} + + model_parser = parse_model_args(model_name, parser, add_help=False) + model_args, model_unk_args = model_parser.parse_known_args() + unk_args[:] = list(set(unk_args) & set(model_unk_args)) + + model_config = get_model_config(model_name, model_args, ckpt_config) + + model = 
get_model(model_name, model_config, device, + forward_is_infer=forward_is_infer, + jitable=jitable) + + if checkpoint is not None: + key = 'generator' if model_name == 'HiFi-GAN' else 'state_dict' + model, status = load_model_from_ckpt(ckpt_data, model, key) + + missing = [] if status is None else status.missing_keys + unexpected = [] if status is None else status.unexpected_keys + + # Attention is only used during training, we won't miss it + if model_name == 'FastPitch': + missing = [k for k in missing if not k.startswith('attention.')] + unexpected = [k for k in unexpected if not k.startswith('attention.')] + + assert len(missing) == 0 and len(unexpected) == 0, ( + f'Mismatched keys when loading parameters. Missing: {missing}, ' + f'unexpected: {unexpected}.') + + if model_name == "WaveGlow": + for k, m in model.named_modules(): + m._non_persistent_buffers_set = set() # pytorch 1.6.0 compatability + model = model.remove_weightnorm(model) + + elif model_name == 'HiFi-GAN': + assert model_args.hifigan_config is not None or ckpt_config is not None, ( + 'Use a HiFi-GAN checkpoint from NVIDIA DeepLearningExamples with ' + 'saved config or supply --hifigan-config .') + model.remove_weight_norm() + + if amp: + model.half() + + model.eval() + return model.to(device), model_config, ckpt_data.get('train_setup', {}) + + +def load_and_setup_ts_model(model_name, checkpoint, amp, device=None): + print(f'{model_name}: Loading TorchScript checkpoint {checkpoint}...') + model = torch.jit.load(checkpoint).eval() + if device is not None: + model = model.to(device) + + if amp: + model.half() + elif next(model.parameters()).dtype == torch.float16: + raise ValueError('Trying to load FP32 model,' + 'TS checkpoint is in FP16 precision.') + return model + + +def convert_ts_to_trt(model_name, ts_model, parser, amp, unk_args=[]): + trt_parser = _parse_trt_compilation_args(model_name, parser, add_help=False) + trt_args, trt_unk_args = trt_parser.parse_known_args() + unk_args[:] = list(set(unk_args) & set(trt_unk_args)) + + if model_name == 'HiFi-GAN': + return _convert_ts_to_trt_hifigan( + ts_model, amp, trt_args.trt_min_opt_max_batch, + trt_args.trt_min_opt_max_hifigan_length) + else: + raise NotImplementedError + + +def _parse_trt_compilation_args(model_name, parent, add_help=False): + """ + Parse model and inference specific commandline arguments. 
+ """ + parser = argparse.ArgumentParser(parents=[parent], add_help=add_help, + allow_abbrev=False) + trt = parser.add_argument_group(f'{model_name} Torch-TensorRT compilation parameters') + trt.add_argument('--trt-min-opt-max-batch', nargs=3, type=int, + default=(1, 8, 16), + help='Torch-TensorRT min, optimal and max batch size') + if model_name == 'HiFi-GAN': + trt.add_argument('--trt-min-opt-max-hifigan-length', nargs=3, type=int, + default=(100, 800, 1200), + help='Torch-TensorRT min, optimal and max audio length (in frames)') + return parser + + +def _convert_ts_to_trt_hifigan(ts_model, amp, trt_min_opt_max_batch, + trt_min_opt_max_hifigan_length, num_mels=80): + import torch_tensorrt + trt_dtype = torch.half if amp else torch.float + print(f'Torch TensorRT: compiling HiFi-GAN for dtype {trt_dtype}.') + min_shp, opt_shp, max_shp = zip(trt_min_opt_max_batch, + (num_mels,) * 3, + trt_min_opt_max_hifigan_length) + compile_settings = { + "inputs": [torch_tensorrt.Input( + min_shape=min_shp, + opt_shape=opt_shp, + max_shape=max_shp, + dtype=trt_dtype, + )], + "enabled_precisions": {trt_dtype}, + "require_full_compilation": True, + } + trt_model = torch_tensorrt.compile(ts_model, **compile_settings) + print('Torch TensorRT: compilation successful.') + return trt_model diff --git a/pretrained_models/hifigan/hifigan__pyt_ckpt_mode-finetune_ds-ljs22khz_21.08.0_amp.zip b/pretrained_models/hifigan/hifigan__pyt_ckpt_mode-finetune_ds-ljs22khz_21.08.0_amp.zip new file mode 100644 index 0000000000000000000000000000000000000000..a60321254ad44c9a59ca743adcdb620053dac18f --- /dev/null +++ b/pretrained_models/hifigan/hifigan__pyt_ckpt_mode-finetune_ds-ljs22khz_21.08.0_amp.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00af1cfbc043da27906c1b535115c6d46c385f033353aefa08b3bd31c5a82acb +size 51879246 diff --git a/pretrained_models/hifigan/hifigan_gen_checkpoint_10000_ft.pt b/pretrained_models/hifigan/hifigan_gen_checkpoint_10000_ft.pt new file mode 100644 index 0000000000000000000000000000000000000000..9aeed2cf22c5af1e96b1ff88d6c61886498b2cba --- /dev/null +++ b/pretrained_models/hifigan/hifigan_gen_checkpoint_10000_ft.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23d8023b48c13e5cdaf8758077c6bc3567d14813f5327f63aa1d26f579c6e9a3 +size 55819501 diff --git a/pretrained_models/hifigan/hifigan_gen_checkpoint_6500.pt b/pretrained_models/hifigan/hifigan_gen_checkpoint_6500.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cc70a4bf11a91f3368d6998e9d2a34f9bcda273 --- /dev/null +++ b/pretrained_models/hifigan/hifigan_gen_checkpoint_6500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a6092979c190aa784fe84ccdba7582e99ff50a2e9c8db8a5c227afe5e21cd26 +size 55824685 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5867ca5f58a6574f4a90481e26c323a9835b89c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +torch +torchvision +torchaudio +inflect +librosa==0.9.0 +matplotlib +numpy +pynvml==11.0.0 +scipy +tensorboardX==2.0 +git+https://github.com/NVIDIA/dllogger@v1.0.0#egg=dllogger +gradio==5.15 +pydantic==2.10.6 diff --git a/symbols.py b/symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..fb7e52dde5461ac6720aae8b2a65cd7107c720e5 --- /dev/null +++ b/symbols.py @@ -0,0 +1,64 @@ +""" from https://github.com/keithito/tacotron """ + +''' +Defines the set of symbols used in text input to the model. 
+ +The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' +from .cmudict import valid_symbols + + +# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): +_arpabet = ['@' + s for s in valid_symbols] + + +def get_symbols(symbol_set='english_basic'): + if symbol_set == 'english_basic': + _pad = '_' + _punctuation = '!\'(),.:;? ' + _special = '-' + _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + elif symbol_set == 'english_basic_lowercase': + _pad = '_' + _punctuation = '!\'"(),.:;? ' + _special = '-' + _letters = 'abcdefghijklmnopqrstuvwxyz' + symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + elif symbol_set == 'english_expanded': + _punctuation = '!\'",.:;? ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + _accented = 'áçéêëñöøćž' + _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + symbols = list(_punctuation + _math + _special + _accented + _letters) + _arpabet + elif symbol_set == 'smj_expanded': + _punctuation = '!\'",.:;?- ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + # _accented = 'áçéêëñöøćžđšŧ' #also north sámi letters... + _accented = 'áçéêëñöø' #also north sámi letters... + # _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + _letters = 'AÁÆÅÄBCDEFGHIJKLMNŊŃÑOØÖPQRSTŦUVWXYZaáæåäbcdefghijklmnŋńñoøöpqrstuvwxyz' + # symbols = list(_punctuation + _math + _special + _accented + _letters) #+ _arpabet + symbols = list(_punctuation + _letters) + _arpabet + elif symbol_set == 'sme_expanded': + _punctuation = '!\'",.:;?- ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + _accented = 'áçéêëńñöøćčžđšŧ' #also north sámi letters... + # _accented = 'áçéêëñöø' #also north sámi letters... 
+ # _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + _letters = 'AÁÆÅÄBCČDĐEFGHIJKLMNŊOØÖPQRSŠTŦUVWXYZŽaáæåäbcčdđefghijklmnŋoøöpqrsštŧuvwxyzž' + # symbols = list(_punctuation + _math + _special + _accented + _letters) #+ _arpabet + symbols = list(_punctuation + _letters) + _arpabet + else: + raise Exception("{} symbol set does not exist".format(symbol_set)) + + return symbols + + +def get_pad_idx(symbol_set='english_basic'): + if symbol_set in {'english_basic', 'english_basic_lowercase', 'smj_expanded', 'sme_expanded'}: + return 0 + else: + raise Exception("{} symbol set not used yet".format(symbol_set)) diff --git a/syn_hifigan.py b/syn_hifigan.py new file mode 100644 index 0000000000000000000000000000000000000000..384945dc0e0e003f289ef625d5e2289613b1f8f7 --- /dev/null +++ b/syn_hifigan.py @@ -0,0 +1,293 @@ +import argparse + +import models +import time +import sys +import warnings +#from pathlib import Path + + + +import torch +import numpy as np +from scipy.stats import norm +from scipy.io.wavfile import write +from torch.nn.utils.rnn import pad_sequence +#import style_controller +from common.utils import load_wav_to_torch + + +from common import utils, layers + +from common.text.text_processing import TextProcessing + + +import os +#os.environ["CUDA_VISIBLE_DEVICES"]="" +#device = "cuda:0" +device = "cpu" + +vocoder = "hifigan" +SHARPEN = True +from hifigan.data_function import MAX_WAV_VALUE, mel_spectrogram +from hifigan.models import Denoiser +import json +from scipy import ndimage + +import os + +def parse_args(parser): + """ + Parse commandline arguments. + """ + parser.add_argument('-i', '--input', type=str, required=False, + help='Full path to the input text (phareses separated by newlines)') + parser.add_argument('-o', '--output', default=None, + help='Output folder to save audio (file per phrase)') + parser.add_argument('--log-file', type=str, default=None, + help='Path to a DLLogger log file') + parser.add_argument('--save-mels', action='store_true', help='') + parser.add_argument('--cuda', action='store_true', + help='Run inference on a GPU using CUDA') + + parser.add_argument('--cudnn-benchmark', action='store_true', + help='Enable cudnn benchmark mode') + + #parser.add_argument('--fastpitch', type=str, default='output_smj_sander/FastPitch_checkpoint_660.pt', + #help='Full path to the generator checkpoint file (skip to use ground truth mels)') ######### + + parser.add_argument('--fastpitch', type=str, default='output_multilang/FastPitch_checkpoint_200.pt', + help='Full path to the generator checkpoint file (skip to use ground truth mels)') ######### + + parser.add_argument('-d', '--denoising-strength', default=0.01, type=float, + help='WaveGlow denoising') + parser.add_argument('-sr', '--sampling-rate', default=22050, type=int, + help='Sampling rate') + parser.add_argument('--stft-hop-length', type=int, default=256, + help='STFT hop length for estimating audio length from mel size') + parser.add_argument('--amp', action='store_true',default=False, + help='Inference with AMP') + parser.add_argument('-bs', '--batch-size', type=int, default=1) + + parser.add_argument('--ema', action='store_true', + help='Use EMA averaged model (if saved in checkpoints)') + + parser.add_argument('--speaker', type=int, default=0, + help='Speaker ID for a multi-speaker model') + parser.add_argument('--language', type=int, default=0, + help='Language ID for a multilingual model') + parser.add_argument('--p-arpabet', type=float, default=0.0, help='') ################ + + + 
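+ # The symbol set and text cleaners below should match those used to train
+ # the FastPitch checkpoint, since token indices depend on the symbol set.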
text_processing = parser.add_argument_group('Text processing parameters') + text_processing.add_argument('--text-cleaners', nargs='*', + default=['basic_cleaners'], type=str, + help='Type of text cleaners for input text') + text_processing.add_argument('--symbol-set', type=str, default='all_sami', ################# + help='Define symbol set for input text') + + cond = parser.add_argument_group('conditioning on additional attributes') + + cond.add_argument('--n-speakers', type=int, default=10, + help='Number of speakers in the model.') + cond.add_argument('--n-languages', type=int, default=3, + help='Number of languages in the model.') + + return parser + + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + +def load_model_from_ckpt(checkpoint_path, ema, model): + + + checkpoint_data = torch.load(checkpoint_path,map_location = device) + status = '' + + if 'state_dict' in checkpoint_data: + sd = checkpoint_data['state_dict'] + if ema and 'ema_state_dict' in checkpoint_data: + sd = checkpoint_data['ema_state_dict'] + status += ' (EMA)' + elif ema and not 'ema_state_dict' in checkpoint_data: + print(f'WARNING: EMA weights missing for {checkpoint_data}') + + if any(key.startswith('module.') for key in sd): + sd = {k.replace('module.', ''): v for k,v in sd.items()} + status += ' ' + str(model.load_state_dict(sd, strict=False)) + else: + model = checkpoint_data['model'] + print(f'Loaded {checkpoint_path}{status}') + + return model + +def load_and_setup_model(model_name, parser, checkpoint, amp, device, + unk_args=[], forward_is_infer=False, ema=True, + jitable=False): + + + model_parser = models.parse_model_args(model_name, parser, add_help=False) + model_args, model_unk_args = model_parser.parse_known_args() + unk_args[:] = list(set(unk_args) & set(model_unk_args)) + + setattr(model_args, "energy_conditioning",True) + model_config = models.get_model_config(model_name, model_args) + # print(model_config) + model = models.get_model(model_name, model_config, device, + forward_is_infer=forward_is_infer, + jitable=jitable) + + if checkpoint is not None: + model = load_model_from_ckpt(checkpoint, ema, model) + + amp = False + if amp: + model.half() + model.eval() + + return model.to(device) + +class Synthesizer: + + def _load_pyt_or_ts_model(self, model_name, ckpt_path, format = 'pyt'): + + if format == 'ts': + + model = models.load_and_setup_ts_model(model_name, ckpt_path, + False, device) + model_train_setup = {} + return model, model_train_setup + + is_ts_based_infer = False + model, _, model_train_setup = models.load_and_setup_model( + model_name, self.parser, ckpt_path, False, device, + unk_args=self.unk_args, forward_is_infer=True, jitable=is_ts_based_infer) + + if is_ts_based_infer: + model = torch.jit.script(model) + return model, model_train_setup + + + + def __init__(self): + parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference', + allow_abbrev=False) + self.parser = parse_args(parser) + + self.args, self.unk_args = self.parser.parse_known_args() + self.generator = load_and_setup_model( + 'FastPitch', parser, self.args.fastpitch, self.args.amp, device, + unk_args=self.unk_args, forward_is_infer=True, ema=self.args.ema, + jitable=False) + + + self.hifigan_model = "pretrained_models/hifigan/hifigan_gen_checkpoint_10000_ft.pt" # Better with Sander! 
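+ # An older 6500-step HiFi-GAN checkpoint and a NeMo UnivNet alternative are
+ # kept commented out below; this class vocodes with the fine-tuned HiFi-GAN.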
+ + #self.hifigan_model = "pretrained_models/hifigan/hifigan_gen_checkpoint_6500.pt" + #self.vocoder = UnivNetModel.from_pretrained(model_name="tts_en_libritts_univnet") + self.vocoder, voc_train_setup= self._load_pyt_or_ts_model('HiFi-GAN', self.hifigan_model) + self.denoiser = Denoiser(self.vocoder,device=device) #, win_length=self.args.win_length).to(device) + self.tp = TextProcessing(self.args.symbol_set, self.args.text_cleaners, p_arpabet=0.0) + + + + def unsharp_mask(self, img, radius=1, amount=1): + blurred = ndimage.gaussian_filter(img, radius) + sharpened = img + amount * ( img - blurred) + return sharpened + + def speak(self, text, output_file="/tmp/tmp", spkr=0, lang=0, l_weight=1, s_weight=1, pace=0.95, clarity=1): + + text = self.tp.encode_text(text) + #text = [9]+self.tp.encode_text(text)+[9] + text = torch.LongTensor([text]).to(device) + #probs = surprisals + for p in [0]: + + with torch.no_grad(): + print(s_weight, l_weight) + mel, mel_lens, *_ = self.generator(text, pace, max_duration=15, speaker=spkr, language=lang, speaker_weight=s_weight, language_weight=l_weight) #, ref_vector=embedding, speaker=speaker_i) #, **gen_kw, speaker 0 = bad audio, speaker 1 = better audio + + if SHARPEN: + + mel_np = mel.float().data.cpu().numpy()[0] + tgt_min = -11 + tgt_max = 1.25 + #print(np.min(mel_np), np.max(mel_np)) + mel_np = self.unsharp_mask(mel_np, radius = 0.5, amount=0.5) + mel_np = self.unsharp_mask(mel_np, radius = 3, amount=.05) + # mel_np = self.unsharp_mask(mel_np, radius = 7, amount=0.05) + + for i in range(0, 80): + mel_np[i,:]+=(i-30)*clarity*0.02 + mel_np = (mel_np-np.min(mel_np))/ (np.max(mel_np)-np.min(mel_np)) * (tgt_max - tgt_min) + tgt_min + mel[0] = torch.from_numpy(mel_np).float().to(device) + """ + mel_np = mel.float().data.cpu().numpy()[0] + blurred_f = ndimage.gaussian_filter(mel_np, 1.0) #3 + alpha = 0.2 #0.3 ta + mel_np = mel_np + alpha * (mel_np - blurred_f) + blurred_f = ndimage.gaussian_filter(mel_np, 3.0) #3 + alpha = 0.1 # 0.1 ta + sharpened = mel_np + alpha * (mel_np - blurred_f) + + for i in range(0,80): + sharpened[i, :]+=(i-40)*0.01 #0.01 ta + mel[0] = torch.from_numpy(sharpened).float().to(device) + + """ + with torch.no_grad(): + + y_g_hat = self.vocoder(mel).float() ########### + #y_g_hat = self.denoiser(y_g_hat.squeeze(1), strength=0.01) #[:, 0] + audio = y_g_hat.squeeze() + # normalize volume + audio = audio/torch.max(torch.abs(audio))*0.95*32768 + audio = audio.cpu().numpy().astype('int16') + + + write(output_file+".wav", 22050, audio) + + os.system("play -q "+output_file+".wav") + return audio + + +if __name__ == '__main__': + syn = Synthesizer() + hifigan = syn.hifigan_model + hifigan_n = hifigan.replace(".pt", "") + fastpitch = syn.args.fastpitch + fastpitch_n = fastpitch.replace(".pt", "") + print(hifigan_n + " " + fastpitch_n) + + hifigan_n_short = hifigan_n.split("/") + hifigan_n_shorter = hifigan_n_short[2].split("_") + hifigan_n_shortest = hifigan_n_shorter[3] + + fastpitch_n_short = fastpitch_n.split("/") + fastpitch_n_shorter = fastpitch_n_short[1].split("_") + fastpitch_n_shortest = fastpitch_n_shorter[2] + + #syn.speak("Gå lij riek mælggadav vádtsám, de bådij vijmak tjáppa vuobmáj.") + i = 0 + spkr = 1 + lang = 1 + while (1==1): + + text = input(">") + text1 = text.split(" ") + syn.speak(text, output_file="/tmp/tmp.wav", spkr=6, lang=1) + syn.speak(text, output_file="/tmp/tmp.wav", spkr=7, lang=1) + continue + for s in range(1,10): + for l in range(3): ## + print("speaker", s, "language", l) ## + syn.speak(text, 
output_file="/tmp/"+str(i)+"_"+text1[0]+"_"+str(s)+"_"+str(l)+"_FP_"+fastpitch_n_shortest+"univnet", spkr=s, lang=l) + #syn.speak(text, output_file="/home/hiovain/DeepLearningExamples/PyTorch/SpeechSynthesis/FastPitchMulti/inf_output_multi/"+str(i)+"_"+text1[0]+"_"+str(s)+"_"+str(l)+"_FP_"+fastpitch_n_shortest+"univnet", spkr=s, lang=l) + i += 1 + + diff --git a/syn_k_univnet_multi.py b/syn_k_univnet_multi.py new file mode 100644 index 0000000000000000000000000000000000000000..f4803d4590ed4792e14043ca11c8ff9b3698d503 --- /dev/null +++ b/syn_k_univnet_multi.py @@ -0,0 +1,275 @@ +import argparse + +import models +import time +import sys +import warnings +#from pathlib import Path + +from nemo.collections.tts.models import UnivNetModel + +import torch +import numpy as np +from scipy.stats import norm +from scipy.io.wavfile import write +from torch.nn.utils.rnn import pad_sequence +#import style_controller +from common.utils import load_wav_to_torch + + +from common import utils, layers + +from common.text.text_processing import TextProcessing + + +import os +#os.environ["CUDA_VISIBLE_DEVICES"]="" +device = "cuda:0" +#device = "cpu" +vocoder = "univnet" +vocoder1 = "hifigan" +SHARPEN = True + +from hifigan.data_function import MAX_WAV_VALUE, mel_spectrogram +from hifigan.models import Denoiser +import json +from scipy import ndimage + +import os + +def parse_args(parser): + """ + Parse commandline arguments. + """ + parser.add_argument('-i', '--input', type=str, required=False, + help='Full path to the input text (phareses separated by newlines)') + parser.add_argument('-o', '--output', default=None, + help='Output folder to save audio (file per phrase)') + parser.add_argument('--log-file', type=str, default=None, + help='Path to a DLLogger log file') + parser.add_argument('--save-mels', action='store_true', help='') + parser.add_argument('--cuda', action='store_true', + help='Run inference on a GPU using CUDA') + + parser.add_argument('--cudnn-benchmark', action='store_true', + help='Enable cudnn benchmark mode') + + #parser.add_argument('--fastpitch', type=str, default='output_smj_sander/FastPitch_checkpoint_660.pt', + #help='Full path to the generator checkpoint file (skip to use ground truth mels)') ######### + + parser.add_argument('--fastpitch', type=str, default='output_multilang/FastPitch_checkpoint_200.pt', + help='Full path to the generator checkpoint file (skip to use ground truth mels)') ######### + + parser.add_argument('-d', '--denoising-strength', default=0.01, type=float, + help='WaveGlow denoising') + parser.add_argument('-sr', '--sampling-rate', default=22050, type=int, + help='Sampling rate') + parser.add_argument('--stft-hop-length', type=int, default=256, + help='STFT hop length for estimating audio length from mel size') + parser.add_argument('--amp', action='store_true',default=False, + help='Inference with AMP') + parser.add_argument('-bs', '--batch-size', type=int, default=1) + + parser.add_argument('--ema', action='store_true', + help='Use EMA averaged model (if saved in checkpoints)') + + parser.add_argument('--speaker', type=int, default=0, + help='Speaker ID for a multi-speaker model') + parser.add_argument('--language', type=int, default=0, + help='Language ID for a multilingual model') + parser.add_argument('--p-arpabet', type=float, default=0.0, help='') ################ + + + text_processing = parser.add_argument_group('Text processing parameters') + text_processing.add_argument('--text-cleaners', nargs='*', + default=['basic_cleaners'], type=str, + help='Type 
of text cleaners for input text') + text_processing.add_argument('--symbol-set', type=str, default='all_sami', ################# + help='Define symbol set for input text') + + cond = parser.add_argument_group('conditioning on additional attributes') + + cond.add_argument('--n-speakers', type=int, default=10, + help='Number of speakers in the model.') + cond.add_argument('--n-languages', type=int, default=3, + help='Number of languages in the model.') + + return parser + + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + +def load_model_from_ckpt(checkpoint_path, ema, model): + + + checkpoint_data = torch.load(checkpoint_path,map_location = device) + status = '' + + if 'state_dict' in checkpoint_data: + sd = checkpoint_data['state_dict'] + if ema and 'ema_state_dict' in checkpoint_data: + sd = checkpoint_data['ema_state_dict'] + status += ' (EMA)' + elif ema and not 'ema_state_dict' in checkpoint_data: + print(f'WARNING: EMA weights missing for {checkpoint_data}') + + if any(key.startswith('module.') for key in sd): + sd = {k.replace('module.', ''): v for k,v in sd.items()} + status += ' ' + str(model.load_state_dict(sd, strict=False)) + else: + model = checkpoint_data['model'] + print(f'Loaded {checkpoint_path}{status}') + + return model + +def load_and_setup_model(model_name, parser, checkpoint, amp, device, + unk_args=[], forward_is_infer=False, ema=True, + jitable=False): + + model_parser = models.parse_model_args(model_name, parser, add_help=False) + model_args, model_unk_args = model_parser.parse_known_args() + unk_args[:] = list(set(unk_args) & set(model_unk_args)) + + setattr(model_args, "energy_conditioning",True) + model_config = models.get_model_config(model_name, model_args) + # print(model_config) + model = models.get_model(model_name, model_config, device, + forward_is_infer=forward_is_infer, + jitable=jitable) + + if checkpoint is not None: + model = load_model_from_ckpt(checkpoint, ema, model) + + amp = False + if amp: + model.half() + model.eval() + + return model.to(device) + +class Synthesizer: + + def _load_pyt_or_ts_model(self, model_name, ckpt_path, format = 'pyt'): + if format == 'ts': + + model = models.load_and_setup_ts_model(model_name, ckpt_path, + False, device) + model_train_setup = {} + return model, model_train_setup + + is_ts_based_infer = False + model, _, model_train_setup = models.load_and_setup_model( + model_name, self.parser, ckpt_path, False, device, + unk_args=self.unk_args, forward_is_infer=True, jitable=is_ts_based_infer) + + if is_ts_based_infer: + model = torch.jit.script(model) + return model, model_train_setup + + + + def __init__(self): + parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference', + allow_abbrev=False) + self.parser = parse_args(parser) + + self.args, self.unk_args = self.parser.parse_known_args() + self.generator = load_and_setup_model( + 'FastPitch', parser, self.args.fastpitch, self.args.amp, device, + unk_args=self.unk_args, forward_is_infer=True, ema=self.args.ema, + jitable=False) + + + self.hifigan_model = "pretrained_models/hifigan/hifigan_gen_checkpoint_10000_ft.pt" # Better with Sander! 
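+        # NOTE: speak() vocodes with the NeMo UnivNet model (self.vocoder) loaded below; the fine-tuned HiFi-GAN checkpoint (self.vocoder1) is loaded as an alternative vocoder and is used to build the Denoiser.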
+ #self.hifigan_model = "pretrained_models/hifigan/hifigan_gen_checkpoint_6500.pt" + self.vocoder = UnivNetModel.from_pretrained(model_name="tts_en_libritts_univnet") + self.vocoder1, voc_train_setup= self._load_pyt_or_ts_model('HiFi-GAN', self.hifigan_model) + self.denoiser = Denoiser(self.vocoder1,device=device) #, win_length=self.args.win_length).to(device) + self.tp = TextProcessing(self.args.symbol_set, self.args.text_cleaners, p_arpabet=0.0) + + def unsharp_mask(self, img, radius=1, amount=1): + blurred = ndimage.gaussian_filter(img, radius) + sharpened = img + amount * ( img - blurred) + return sharpened + + # + def speak(self, text, output_file="/tmp/tmp", lang=0, spkr=0, l_weight=1, s_weight=1, pace=0.95,clarity=1): + + text = self.tp.encode_text(text) + #text = [9]+self.tp.encode_text(text)+[9] + text = torch.LongTensor([text]).to(device) + + for p in [0]: + + + with torch.no_grad(): + + mel, mel_lens, *_ = self.generator(text, pace=pace, max_duration=15, speaker=spkr, language=lang, speaker_weight=s_weight, language_weight=l_weight) #, ref_vector=embedding, speaker=speaker_i) #, **gen_kw, speaker 0 = bad audio, speaker 1 = better audio + if SHARPEN: + + mel_np = mel.float().data.cpu().numpy()[0] + tgt_min = -11 + tgt_max = 1.5 + #print(np.min(mel_np), np.max(mel_np)) + mel_np = self.unsharp_mask(mel_np, radius = 0.5, amount=1) + mel_np = self.unsharp_mask(mel_np, radius = 3, amount=.05) + # mel_np = self.unsharp_mask(mel_np, radius = 7, amount=0.05) + + for i in range(0, 80): + mel_np[i,:]+=(i-30)*clarity + mel_np = (mel_np-np.min(mel_np))/ (np.max(mel_np)-np.min(mel_np)) * (tgt_max - tgt_min) + tgt_min + mel[0] = torch.from_numpy(mel_np).float().to(device) + + + + with torch.no_grad(): + y_g_hat = self.vocoder(spec=mel).float() + #y_g_hat = self.vocoder1(mel).float() ########### + y_g_hat = self.denoiser(y_g_hat.squeeze(1), strength=0.01) #[:, 0] + audio = y_g_hat.squeeze() + # normalize volume + audio = audio/torch.max(torch.abs(audio))*0.95*32768 + audio = audio.cpu().numpy().astype('int16') + + + write(output_file+".wav", 22050, audio) + # ANT: Remove playing from here so GUI doesn't play twice + #os.system("play -q "+output_file+".wav") + return audio + + +if __name__ == '__main__': + syn = Synthesizer() + hifigan = syn.hifigan_model + hifigan_n = hifigan.replace(".pt", "") + fastpitch = syn.args.fastpitch + fastpitch_n = fastpitch.replace(".pt", "") + print(hifigan_n + " " + fastpitch_n) + + hifigan_n_short = hifigan_n.split("/") + hifigan_n_shorter = hifigan_n_short[2].split("_") + hifigan_n_shortest = hifigan_n_shorter[3] + + fastpitch_n_short = fastpitch_n.split("/") + fastpitch_n_shorter = fastpitch_n_short[1].split("_") + fastpitch_n_shortest = fastpitch_n_shorter[2] + + #syn.speak("Gå lij riek mælggadav vádtsám, de bådij vijmak tjáppa vuobmáj.") + i = 0 + spkr = 1 + lang = 1 + while True: + + text = input(">") + text1 = text.split(" ") + for s in range(1,10): + for l in range(3): ## + print("speaker", s, "language", l) ## + syn.speak(text, output_file="/home/hiovain/DeepLearningExamples/PyTorch/SpeechSynthesis/FastPitchMulti/inf_output_multi/"+str(i)+"_"+text1[0]+"_"+str(s)+"_"+str(l)+"_FP_"+fastpitch_n_shortest+"univnet", spkr=s, lang=l) + i += 1 + + diff --git a/waveglow/arg_parser.py b/waveglow/arg_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..e46a537396222fa1c4faff2706a4c651d11aa012 --- /dev/null +++ b/waveglow/arg_parser.py @@ -0,0 +1,65 @@ +# ***************************************************************************** +#
Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +import argparse + +def parse_waveglow_args(parent, add_help=False): + """ + Parse commandline arguments. + """ + parser = argparse.ArgumentParser(parents=[parent], add_help=add_help, allow_abbrev=False) + + # misc parameters + parser.add_argument('--n-mel-channels', default=80, type=int, + help='Number of bins in mel-spectrograms') + + # glow parameters + parser.add_argument('--flows', default=12, type=int, + help='Number of steps of flow') + parser.add_argument('--groups', default=8, type=int, + help='Number of samples in a group processed by the steps of flow') + parser.add_argument('--early-every', default=4, type=int, + help='Determines how often (i.e., after how many coupling layers) \ + a number of channels (defined by --early-size parameter) are output\ + to the loss function') + parser.add_argument('--early-size', default=2, type=int, + help='Number of channels output to the loss function') + parser.add_argument('--sigma', default=1.0, type=float, + help='Standard deviation used for sampling from Gaussian') + parser.add_argument('--segment-length', default=4000, type=int, + help='Segment length (audio samples) processed per iteration') + + # wavenet parameters + wavenet = parser.add_argument_group('WaveNet parameters') + wavenet.add_argument('--wn-kernel-size', default=3, type=int, + help='Kernel size for dilated convolution in the affine coupling layer (WN)') + wavenet.add_argument('--wn-channels', default=512, type=int, + help='Number of channels in WN') + wavenet.add_argument('--wn-layers', default=8, type=int, + help='Number of layers in WN') + + return parser diff --git a/waveglow/data_function.py b/waveglow/data_function.py new file mode 100644 index 0000000000000000000000000000000000000000..583a50205b7459bfe87c898583aadd70e5d402b5 --- /dev/null +++ b/waveglow/data_function.py @@ -0,0 +1,100 @@ +# ***************************************************************************** +# Copyright
(c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# *****************************************************************************\ + +import torch +import random +import common.layers as layers +from common.utils import load_wav_to_torch, load_filepaths_and_text, to_gpu + + +class MelAudioLoader(torch.utils.data.Dataset): + """ + 1) loads audio,text pairs + 2) computes mel-spectrograms from audio files. 
+ """ + + def __init__(self, + dataset_path, + audiopaths_and_text, + segment_length, + n_mel_channels, + max_wav_value, + sampling_rate, + filter_length, + hop_length, + win_length, + mel_fmin, + mel_fmax, + args): + self.audiopaths_and_text = load_filepaths_and_text(dataset_path, audiopaths_and_text) + self.max_wav_value = max_wav_value + self.sampling_rate = sampling_rate + self.stft = layers.TacotronSTFT( + filter_length, hop_length, win_length, + n_mel_channels, sampling_rate, mel_fmin, + mel_fmax) + self.segment_length = segment_length + random.seed(1234) + random.shuffle(self.audiopaths_and_text) + + def get_mel_audio_pair(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + + if sampling_rate != self.stft.sampling_rate: + raise ValueError("{} {} SR doesn't match target {} SR".format( + sampling_rate, self.stft.sampling_rate)) + + # Take segment + if audio.size(0) >= self.segment_length: + max_audio_start = audio.size(0) - self.segment_length + audio_start = random.randint(0, max_audio_start) + audio = audio[audio_start:audio_start+self.segment_length] + else: + audio = torch.nn.functional.pad( + audio, (0, self.segment_length - audio.size(0)), 'constant').data + + audio = audio / self.max_wav_value + audio_norm = audio.unsqueeze(0) + audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) + melspec = self.stft.mel_spectrogram(audio_norm) + melspec = melspec.squeeze(0) + + return (melspec, audio, len(audio)) + + def __getitem__(self, index): + return self.get_mel_audio_pair(self.audiopaths_and_text[index][0]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +def batch_to_gpu(batch): + x, y, len_y = batch + x = to_gpu(x).float() + y = to_gpu(y).float() + len_y = to_gpu(torch.sum(len_y)) + return ((x, y), y, len_y) diff --git a/waveglow/denoiser.py b/waveglow/denoiser.py new file mode 100644 index 0000000000000000000000000000000000000000..824e1086a7004bdd44b1824738d8a525aa04bd2f --- /dev/null +++ b/waveglow/denoiser.py @@ -0,0 +1,61 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +import torch +from common.layers import STFT + + +class Denoiser(torch.nn.Module): + """ Removes model bias from audio produced with waveglow """ + + def __init__(self, waveglow, filter_length=1024, n_overlap=4, + win_length=1024, mode='zeros'): + super(Denoiser, self).__init__() + device = waveglow.upsample.weight.device + dtype = waveglow.upsample.weight.dtype + self.stft = STFT(filter_length=filter_length, + hop_length=int(filter_length/n_overlap), + win_length=win_length).to(device) + if mode == 'zeros': + mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device) + elif mode == 'normal': + mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device) + else: + raise Exception("Mode {} is not supported".format(mode)) + + with torch.no_grad(): + bias_audio = waveglow.infer(mel_input, sigma=0.0).float() + bias_spec, _ = self.stft.transform(bias_audio) + + self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) + + def forward(self, audio, strength=0.1): + audio_spec, audio_angles = self.stft.transform(audio) + audio_spec_denoised = audio_spec - self.bias_spec * strength + audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) + audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) + return audio_denoised diff --git a/waveglow/loss_function.py b/waveglow/loss_function.py new file mode 100644 index 0000000000000000000000000000000000000000..1ff86e8814203d4e2c187a76171107d4e367d55d --- /dev/null +++ b/waveglow/loss_function.py @@ -0,0 +1,49 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +import torch + +class WaveGlowLoss(torch.nn.Module): + def __init__(self, sigma=1.0): + super(WaveGlowLoss, self).__init__() + self.sigma = sigma + + def forward(self, model_output, clean_audio): + # clean_audio is unused; + z, log_s_list, log_det_W_list = model_output + for i, log_s in enumerate(log_s_list): + if i == 0: + log_s_total = torch.sum(log_s) + log_det_W_total = log_det_W_list[i] + else: + log_s_total = log_s_total + torch.sum(log_s) + log_det_W_total += log_det_W_list[i] + + loss = torch.sum( + z * z) / (2 * self.sigma * self.sigma) - log_s_total - log_det_W_total # noqa: E501 + meta = {} + return loss / (z.size(0) * z.size(1) * z.size(2)), meta diff --git a/waveglow/model.py b/waveglow/model.py new file mode 100644 index 0000000000000000000000000000000000000000..770564ff69488c324bd883ccb97d2f09b7e103d2 --- /dev/null +++ b/waveglow/model.py @@ -0,0 +1,342 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ***************************************************************************** +import torch +from torch.autograd import Variable +import torch.nn.functional as F + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +class Invertible1x1Conv(torch.nn.Module): + """ + The layer outputs both the convolution, and the log determinant + of its weight matrix. If reverse=True it does convolution with + inverse + """ + + def __init__(self, c): + super(Invertible1x1Conv, self).__init__() + self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, + bias=False) + + # Sample a random orthonormal matrix to initialize weights + W = torch.qr(torch.FloatTensor(c, c).normal_())[0] + + # Ensure determinant is 1.0 not -1.0 + if torch.det(W) < 0: + W[:, 0] = -1 * W[:, 0] + W = W.view(c, c, 1) + W = W.contiguous() + self.conv.weight.data = W + + def forward(self, z): + # shape + batch_size, group_size, n_of_groups = z.size() + + W = self.conv.weight.squeeze() + + # Forward computation + log_det_W = batch_size * n_of_groups * torch.logdet(W.unsqueeze(0).float()).squeeze() + z = self.conv(z) + return z, log_det_W + + + def infer(self, z): + # shape + batch_size, group_size, n_of_groups = z.size() + + W = self.conv.weight.squeeze() + + if not hasattr(self, 'W_inverse'): + # Reverse computation + W_inverse = W.float().inverse() + W_inverse = Variable(W_inverse[..., None]) + if z.type() == 'torch.cuda.HalfTensor' or z.type() == 'torch.HalfTensor': + W_inverse = W_inverse.half() + self.W_inverse = W_inverse + z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) + return z + + +class WN(torch.nn.Module): + """ + This is the WaveNet like layer for the affine coupling. The primary + difference from WaveNet is the convolutions need not be causal. There is + also no dilation size reset. The dilation only doubles on each layer + """ + + def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, + kernel_size): + super(WN, self).__init__() + assert(kernel_size % 2 == 1) + assert(n_channels % 2 == 0) + self.n_layers = n_layers + self.n_channels = n_channels + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.cond_layers = torch.nn.ModuleList() + + start = torch.nn.Conv1d(n_in_channels, n_channels, 1) + start = torch.nn.utils.weight_norm(start, name='weight') + self.start = start + + # Initializing last layer to 0 makes the affine coupling layers + # do nothing at first. 
This helps with training stability + end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1) + end.weight.data.zero_() + end.bias.data.zero_() + self.end = end + + for i in range(n_layers): + dilation = 2 ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d(n_channels, 2 * n_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1) + cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + self.cond_layers.append(cond_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * n_channels + else: + res_skip_channels = n_channels + res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm( + res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, forward_input): + audio, spect = forward_input + audio = self.start(audio) + + for i in range(self.n_layers): + acts = fused_add_tanh_sigmoid_multiply( + self.in_layers[i](audio), + self.cond_layers[i](spect), + torch.IntTensor([self.n_channels])) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + audio = res_skip_acts[:, :self.n_channels, :] + audio + skip_acts = res_skip_acts[:, self.n_channels:, :] + else: + skip_acts = res_skip_acts + + if i == 0: + output = skip_acts + else: + output = skip_acts + output + return self.end(output) + + +class WaveGlow(torch.nn.Module): + def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, + n_early_size, WN_config): + super(WaveGlow, self).__init__() + + self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, + n_mel_channels, + 1024, stride=256) + assert(n_group % 2 == 0) + self.n_flows = n_flows + self.n_group = n_group + self.n_early_every = n_early_every + self.n_early_size = n_early_size + self.WN = torch.nn.ModuleList() + self.convinv = torch.nn.ModuleList() + + n_half = int(n_group / 2) + + # Set up layers with the right sizes based on how many dimensions + # have been output already + n_remaining_channels = n_group + for k in range(n_flows): + if k % self.n_early_every == 0 and k > 0: + n_half = n_half - int(self.n_early_size / 2) + n_remaining_channels = n_remaining_channels - self.n_early_size + self.convinv.append(Invertible1x1Conv(n_remaining_channels)) + self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config)) + self.n_remaining_channels = n_remaining_channels + + def forward(self, forward_input): + """ + forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames + forward_input[1] = audio: batch x time + """ + spect, audio = forward_input + + # Upsample spectrogram to size of audio + spect = self.upsample(spect) + assert(spect.size(2) >= audio.size(1)) + if spect.size(2) > audio.size(1): + spect = spect[:, :, :audio.size(1)] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) + spect = spect.permute(0, 2, 1) + + audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) + output_audio = [] + log_s_list = [] + log_det_W_list = [] + + for k in range(self.n_flows): + if k % self.n_early_every == 0 and k > 0: + output_audio.append(audio[:, :self.n_early_size, :]) + audio = audio[:, self.n_early_size:, :] + + audio, log_det_W = self.convinv[k](audio) + log_det_W_list.append(log_det_W) + + n_half = 
int(audio.size(1) / 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + log_s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = torch.exp(log_s) * audio_1 + b + log_s_list.append(log_s) + + audio = torch.cat([audio_0, audio_1], 1) + + output_audio.append(audio) + return torch.cat(output_audio, 1), log_s_list, log_det_W_list + + def infer(self, spect, sigma=1.0): + + spect = self.upsample(spect) + # trim conv artifacts. maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) + spect = spect.permute(0, 2, 1) + + audio = torch.randn(spect.size(0), + self.n_remaining_channels, + spect.size(2), device=spect.device).to(spect.dtype) + + audio = torch.autograd.Variable(sigma * audio) + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1) / 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b) / torch.exp(s) + audio = torch.cat([audio_0, audio_1], 1) + + audio = self.convinv[k].infer(audio) + + if k % self.n_early_every == 0 and k > 0: + z = torch.randn(spect.size(0), self.n_early_size, spect.size( + 2), device=spect.device).to(spect.dtype) + audio = torch.cat((sigma * z, audio), 1) + + audio = audio.permute( + 0, 2, 1).contiguous().view( + audio.size(0), -1).data + return audio + + + def infer_onnx(self, spect, z, sigma=0.9): + + spect = self.upsample(spect) + # trim conv artifacts. maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + + length_spect_group = spect.size(2)//8 + mel_dim = 80 + batch_size = spect.size(0) + + spect = spect.view((batch_size, mel_dim, length_spect_group, self.n_group)) + spect = spect.permute(0, 2, 1, 3) + spect = spect.contiguous() + spect = spect.view((batch_size, length_spect_group, self.n_group*mel_dim)) + spect = spect.permute(0, 2, 1) + spect = spect.contiguous() + + audio = z[:, :self.n_remaining_channels, :] + z = z[:, self.n_remaining_channels:self.n_group, :] + audio = sigma*audio + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1) // 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:(n_half+n_half), :] + + output = self.WN[k]((audio_0, spect)) + s = output[:, n_half:(n_half+n_half), :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b) / torch.exp(s) + audio = torch.cat([audio_0, audio_1], 1) + audio = self.convinv[k].infer(audio) + + if k % self.n_early_every == 0 and k > 0: + audio = torch.cat((z[:, :self.n_early_size, :], audio), 1) + z = z[:, self.n_early_size:self.n_group, :] + + audio = audio.permute(0,2,1).contiguous().view(batch_size, (length_spect_group * self.n_group)) + + return audio + + + @staticmethod + def remove_weightnorm(model): + waveglow = model + for WN in waveglow.WN: + WN.start = torch.nn.utils.remove_weight_norm(WN.start) + WN.in_layers = remove(WN.in_layers) + WN.cond_layers = remove(WN.cond_layers) + WN.res_skip_layers = remove(WN.res_skip_layers) + return waveglow + + +def remove(conv_list): + new_conv_list = torch.nn.ModuleList() + for old_conv in conv_list: + old_conv = torch.nn.utils.remove_weight_norm(old_conv) + new_conv_list.append(old_conv) + 
return new_conv_list
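
The WaveGlow modules above (model, arg parser, data loader, denoiser, loss) are carried over from the NVIDIA codebase and are not wired into the synthesis path in syn_k_univnet_multi.py, which vocodes with UnivNet and HiFi-GAN instead. For reference, the sketch below shows how these pieces would fit together on their own. It is a minimal, untested sketch: it assumes the waveglow directory is importable as a package, uses the defaults from parse_waveglow_args for the WN configuration, and feeds a placeholder all-zero mel-spectrogram; a real run would first load trained WaveGlow weights where indicated.

```python
# Sketch only: wires together waveglow/model.py, waveglow/denoiser.py and the
# defaults from waveglow/arg_parser.py. With untrained weights the output is noise.
import torch

from waveglow.model import WaveGlow
from waveglow.denoiser import Denoiser

# WN_config mirrors the --wn-layers / --wn-channels / --wn-kernel-size defaults above.
model = WaveGlow(n_mel_channels=80, n_flows=12, n_group=8,
                 n_early_every=4, n_early_size=2,
                 WN_config=dict(n_layers=8, n_channels=512, kernel_size=3))

# ... load trained WaveGlow weights into `model` here ...

model = WaveGlow.remove_weightnorm(model).eval()

denoiser = Denoiser(model)       # estimates the model's bias spectrum from an all-zero mel
mel = torch.zeros(1, 80, 200)    # placeholder mel-spectrogram: batch x n_mel_channels x frames

with torch.no_grad():
    audio = model.infer(mel, sigma=0.9)      # (batch, samples) at the training sampling rate
    audio = denoiser(audio, strength=0.01)   # subtract the bias spectrum (same 0.01 strength
                                             # that Synthesizer.speak() uses with its denoiser)
```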