diff --git a/README.md b/README.md
index 9265ca3b3388721794dec02baf941c09fc5975f7..c9921fa16fd4a56470c9cfdc45135b93d0787837 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,15 @@
 ---
-title: 6L TTS
-emoji: 🐨
+title: Multi Sami
+emoji: 🔥
 colorFrom: green
-colorTo: yellow
+colorTo: pink
 sdk: gradio
-sdk_version: 5.34.2
+sdk_version: 5.15.0
 app_file: app.py
 pinned: false
 license: cc-by-nc-nd-4.0
-short_description: Multilingual TTS for Sámi languages
+#license: cc-by-4.0
+short_description: Multilingual, multi-speaker Sámi TTS
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d81743371cfe82f497aad6efb56bac173b67acf
--- /dev/null
+++ b/app.py
@@ -0,0 +1,73 @@
+import gradio as gr
+import syn_hifigan as syn
+#import syn_k_univnet_multi as syn
+import os, tempfile
+
+languages = {"South Sámi": 0,
+             "North Sámi": 1,
+             "Lule Sámi": 2}
+
+speakers = {#"aj0": 0,
+    "Aanna - sma": 1,
+    "Máhtte": 2,
+    "Siggá - smj": 3,
+    "Biret - sme": 5,
+    #"lo": 6,
+    "Sunná": 7,
+    "Abmut - smj": 8,
+    "Nihkol - smj": 9
+}
+public = True
+
+tempdir = tempfile.gettempdir()
+
+tts = syn.Synthesizer()
+
+
+
+def speak(text, language, speaker, l_weight, s_weight, pace, postfilter):  # pitch_shift, pitch_std
+
+
+
+    # text frontend not implemented...
+    text = text.replace("...", "…")
+    print(speakers[speaker])
+    audio = tts.speak(text, output_file=f'{tempdir}/tmp', lang=languages[language],
+                      spkr=speakers[speaker], l_weight=l_weight, s_weight=s_weight,
+                      pace=pace, clarity=postfilter)
+
+    if not public:
+        try:
+            os.system("play " + tempdir + "/tmp.wav &")
+        except Exception:
+            pass
+
+    return (22050, audio)
+
+
+
+controls = []
+controls.append(gr.Textbox(label="text", value="Suohtas duinna deaivvadit."))
+controls.append(gr.Dropdown(list(languages.keys()), label="language", value="North Sámi"))
+controls.append(gr.Dropdown(list(speakers.keys()), label="speaker", value="Sunná"))
+
+controls.append(gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1, label="language weight"))
+controls.append(gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1, label="speaker weight"))
+
+controls.append(gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1.0, label="speech rate"))
+controls.append(gr.Slider(minimum=0., maximum=2, step=0.05, value=1.0, label="post-processing"))
+
+
+
+
+tts_gui = gr.Interface(
+    fn=speak,
+    inputs=controls,
+    outputs=gr.Audio(label="output"),
+    live=False
+
+)
+
+
+if __name__ == "__main__":
+    tts_gui.launch(share=public)
diff --git a/common/audio_processing.py b/common/audio_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..fea24d74402cea10aa84f82b1b2d2ec66e0bc1f7
--- /dev/null
+++ b/common/audio_processing.py
@@ -0,0 +1,120 @@
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +import librosa.util as librosa_util +import numpy as np +import torch +from scipy.signal import get_window + + +def window_sumsquare(window, n_frames, hop_length=200, win_length=800, + n_fft=800, dtype=np.float32, norm=None): + """ + # from librosa 0.6 + Compute the sum-square envelope of a window function at a given hop length. + + This is used to estimate modulation effects induced by windowing + observations in short-time fourier transforms. + + Parameters + ---------- + window : string, tuple, number, callable, or list-like + Window specification, as in `get_window` + + n_frames : int > 0 + The number of analysis frames + + hop_length : int > 0 + The number of samples to advance between frames + + win_length : [optional] + The length of the window function. By default, this matches `n_fft`. + + n_fft : int > 0 + The length of each analysis frame. 
+ + dtype : np.dtype + The data type of the output + + Returns + ------- + wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` + The sum-squared envelope of the window function + """ + if win_length is None: + win_length = n_fft + + n = n_fft + hop_length * (n_frames - 1) + x = np.zeros(n, dtype=dtype) + + # Compute the squared window at the desired length + win_sq = get_window(window, win_length, fftbins=True) + win_sq = librosa_util.normalize(win_sq, norm=norm)**2 + win_sq = librosa_util.pad_center(win_sq, size=n_fft) + + # Fill the envelope + for i in range(n_frames): + sample = i * hop_length + x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] + return x + + +def griffin_lim(magnitudes, stft_fn, n_iters=30): + """ + PARAMS + ------ + magnitudes: spectrogram magnitudes + stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods + """ + + angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) + angles = angles.astype(np.float32) + angles = torch.autograd.Variable(torch.from_numpy(angles)) + signal = stft_fn.inverse(magnitudes, angles).squeeze(1) + + for i in range(n_iters): + _, angles = stft_fn.transform(signal) + signal = stft_fn.inverse(magnitudes, angles).squeeze(1) + return signal + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C diff --git a/common/env.py b/common/env.py new file mode 100644 index 0000000000000000000000000000000000000000..649340b21c1d70124584fd2da2e8a692be1857f1 --- /dev/null +++ b/common/env.py @@ -0,0 +1,25 @@ +import os +import shutil +from collections import defaultdict + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +class DefaultAttrDict(defaultdict): + def __init__(self, *args, **kwargs): + super(DefaultAttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + def __getattr__(self, item): + return self[item] + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) diff --git a/common/filter_warnings.py b/common/filter_warnings.py new file mode 100644 index 0000000000000000000000000000000000000000..fd2abc2c0b813110d6b87adb150f8bd4e4fe6998 --- /dev/null +++ b/common/filter_warnings.py @@ -0,0 +1,33 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mutes known and unrelated PyTorch warnings. + +The warnings module keeps a list of filters. Importing it as late as possible +prevents its filters from being overriden. 
+""" + +import warnings + + +# NGC 22.04-py3 container (PyTorch 1.12.0a0+bd13bc6) +warnings.filterwarnings( + "ignore", + message='positional arguments and argument "destination" are deprecated.' + ' nn.Module.state_dict will not accept them in the future.') + +# 22.08-py3 container +warnings.filterwarnings( + "ignore", + message="is_namedtuple is deprecated, please use the python checks") diff --git a/common/gpu_affinity.py b/common/gpu_affinity.py new file mode 100644 index 0000000000000000000000000000000000000000..191444047dd467b13d9610616351340a9d6049f3 --- /dev/null +++ b/common/gpu_affinity.py @@ -0,0 +1,156 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import math +import os +import pathlib +import re + +import pynvml + +pynvml.nvmlInit() + + +def systemGetDriverVersion(): + return pynvml.nvmlSystemGetDriverVersion() + + +def deviceGetCount(): + return pynvml.nvmlDeviceGetCount() + + +class device: + # assume nvml returns list of 64 bit ints + _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) + + def __init__(self, device_idx): + super().__init__() + self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) + + def getName(self): + return pynvml.nvmlDeviceGetName(self.handle) + + def getCpuAffinity(self): + affinity_string = '' + for j in pynvml.nvmlDeviceGetCpuAffinity( + self.handle, device._nvml_affinity_elements + ): + # assume nvml returns list of 64 bit ints + affinity_string = '{:064b}'.format(j) + affinity_string + affinity_list = [int(x) for x in affinity_string] + affinity_list.reverse() # so core 0 is in 0th element of list + + ret = [i for i, e in enumerate(affinity_list) if e != 0] + return ret + + +def set_socket_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity) + + +def set_single_affinity(gpu_id): + dev = device(gpu_id) + affinity = dev.getCpuAffinity() + os.sched_setaffinity(0, affinity[:1]) + + +def set_single_unique_affinity(gpu_id, nproc_per_node): + devices = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in devices] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + affinities = [] + assigned = [] + + for socket_affinity in socket_affinities: + for core in socket_affinity: + if core not in assigned: + affinities.append([core]) + assigned.append(core) + break + os.sched_setaffinity(0, affinities[gpu_id]) + + +def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): + device_ids = [device(i) for i in range(nproc_per_node)] + socket_affinities = [dev.getCpuAffinity() for dev in device_ids] + + siblings_list = get_thread_siblings_list() + siblings_dict = dict(siblings_list) + + # remove siblings + for idx, socket_affinity in 
enumerate(socket_affinities): + socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) + + socket_affinities_to_device_ids = collections.defaultdict(list) + + for idx, socket_affinity in enumerate(socket_affinities): + socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) + + for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): + devices_per_group = len(device_ids) + cores_per_device = len(socket_affinity) // devices_per_group + for group_id, device_id in enumerate(device_ids): + if device_id == gpu_id: + if mode == 'interleaved': + affinity = list(socket_affinity[group_id::devices_per_group]) + elif mode == 'continuous': + affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) + else: + raise RuntimeError('Unknown set_socket_unique_affinity mode') + + # reintroduce siblings + affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] + os.sched_setaffinity(0, affinity) + + +def get_thread_siblings_list(): + path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' + thread_siblings_list = [] + pattern = re.compile(r'(\d+)\D(\d+)') + for fname in pathlib.Path(path[0]).glob(path[1:]): + with open(fname) as f: + content = f.read().strip() + res = pattern.findall(content) + if res: + pair = tuple(map(int, res[0])) + thread_siblings_list.append(pair) + return thread_siblings_list + + +def set_affinity(gpu_id, nproc_per_node, mode='socket'): + if mode == 'socket': + set_socket_affinity(gpu_id) + elif mode == 'single': + set_single_affinity(gpu_id) + elif mode == 'single_unique': + set_single_unique_affinity(gpu_id, nproc_per_node) + elif mode == 'socket_unique_interleaved': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') + elif mode == 'socket_unique_continuous': + set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') + else: + raise RuntimeError('Unknown affinity mode') + + affinity = os.sched_getaffinity(0) + return affinity diff --git a/common/layers.py b/common/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..706123ada6217230e17ac441260c1f1cb7ed6e0a --- /dev/null +++ b/common/layers.py @@ -0,0 +1,134 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import torch
+import torch.nn.functional as F
+from librosa.filters import mel as librosa_mel_fn
+
+from common.audio_processing import (dynamic_range_compression,
+                                     dynamic_range_decompression)
+from common.stft import STFT
+
+
+class LinearNorm(torch.nn.Module):
+    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
+        super(LinearNorm, self).__init__()
+        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
+
+        torch.nn.init.xavier_uniform_(
+            self.linear_layer.weight,
+            gain=torch.nn.init.calculate_gain(w_init_gain))
+
+    def forward(self, x):
+        return self.linear_layer(x)
+
+
+class ConvNorm(torch.nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
+                 padding=None, dilation=1, bias=True, w_init_gain='linear',
+                 batch_norm=False):
+        super(ConvNorm, self).__init__()
+        if padding is None:
+            assert(kernel_size % 2 == 1)
+            padding = int(dilation * (kernel_size - 1) / 2)
+
+        self.conv = torch.nn.Conv1d(in_channels, out_channels,
+                                    kernel_size=kernel_size, stride=stride,
+                                    padding=padding, dilation=dilation,
+                                    bias=bias)
+        self.norm = torch.nn.BatchNorm1d(out_channels) if batch_norm else None
+
+        torch.nn.init.xavier_uniform_(
+            self.conv.weight,
+            gain=torch.nn.init.calculate_gain(w_init_gain))
+
+    def forward(self, signal):
+        if self.norm is None:
+            return self.conv(signal)
+        else:
+            return self.norm(self.conv(signal))
+
+
+class ConvReLUNorm(torch.nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=1, dropout=0.0):
+        super(ConvReLUNorm, self).__init__()
+        self.conv = torch.nn.Conv1d(in_channels, out_channels,
+                                    kernel_size=kernel_size,
+                                    padding=(kernel_size // 2))
+        self.norm = torch.nn.LayerNorm(out_channels)
+        self.dropout = torch.nn.Dropout(dropout)
+
+    def forward(self, signal):
+        out = F.relu(self.conv(signal))
+        out = self.norm(out.transpose(1, 2)).transpose(1, 2).to(signal.dtype)
+        return self.dropout(out)
+
+
+class TacotronSTFT(torch.nn.Module):
+    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
+                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
+                 mel_fmax=8000.0):
+        super(TacotronSTFT, self).__init__()
+        self.n_mel_channels = n_mel_channels
+        self.sampling_rate = sampling_rate
+        self.stft_fn = STFT(filter_length, hop_length, win_length)
+        mel_basis = librosa_mel_fn(
+            sr=sampling_rate,
+            n_fft=filter_length,
+            n_mels=n_mel_channels,
+            fmin=mel_fmin,
+            fmax=mel_fmax
+        )
+        mel_basis = torch.from_numpy(mel_basis).float()
+        self.register_buffer('mel_basis', mel_basis)
+
+    def spectral_normalize(self, magnitudes):
+        output = dynamic_range_compression(magnitudes)
+        return output
+
+    def spectral_de_normalize(self, magnitudes):
+        output = dynamic_range_decompression(magnitudes)
+        return output
+
+    def mel_spectrogram(self, y):
+        """Computes mel-spectrograms from a batch of waves
+        PARAMS
+        ------
+        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
+
+        RETURNS
+        -------
+
mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) + """ + assert(torch.min(y.data) >= -1) + assert(torch.max(y.data) <= 1) + + magnitudes, phases = self.stft_fn.transform(y) + magnitudes = magnitudes.data + mel_output = torch.matmul(self.mel_basis, magnitudes) + mel_output = self.spectral_normalize(mel_output) + return mel_output diff --git a/common/repeated_dataloader.py b/common/repeated_dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..0cc947ef0efd9defc5802ee02bf5b7e4e9a831ac --- /dev/null +++ b/common/repeated_dataloader.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data pipeline elements which wrap the data N times + +A RepeatedDataLoader resets its iterator less frequently. This saves time +on multi-GPU platforms and is invisible to the training loop. + +NOTE: Repeating puts a block of (len(dataset) * repeats) int64s into RAM. +Do not use more repeats than necessary (e.g., 10**6 to simulate infinity). +""" + +import itertools + +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + + +class RepeatedDataLoader(DataLoader): + def __init__(self, repeats, *args, **kwargs): + self.repeats = repeats + super().__init__(*args, **kwargs) + + def __iter__(self): + if self._iterator is None or self.repeats_done >= self.repeats: + self.repeats_done = 1 + return super().__iter__() + else: + self.repeats_done += 1 + return self._iterator + + +class RepeatedDistributedSampler(DistributedSampler): + def __init__(self, repeats, *args, **kwargs): + self.repeats = repeats + assert self.repeats <= 10000, "Too many repeats overload RAM." + super().__init__(*args, **kwargs) + + def __iter__(self): + # Draw indices for `self.repeats` epochs forward + start_epoch = self.epoch + iters = [] + for r in range(self.repeats): + self.set_epoch(start_epoch + r) + iters.append(super().__iter__()) + self.set_epoch(start_epoch) + + return itertools.chain.from_iterable(iters) diff --git a/common/stft.py b/common/stft.py new file mode 100644 index 0000000000000000000000000000000000000000..9426e17a40850fb79e288b9ee465755486c7876d --- /dev/null +++ b/common/stft.py @@ -0,0 +1,140 @@ +""" +BSD 3-Clause License + +Copyright (c) 2017, Prem Seetharaman +All rights reserved. + +* Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +import torch +import numpy as np +import torch.nn.functional as F +from torch.autograd import Variable +from scipy.signal import get_window +from librosa.util import pad_center, tiny +from common.audio_processing import window_sumsquare + + +class STFT(torch.nn.Module): + """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" + def __init__(self, filter_length=800, hop_length=200, win_length=800, + window='hann', device="cpu"): + super(STFT, self).__init__() + self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length + self.window = window + self.forward_transform = None + scale = self.filter_length / self.hop_length + fourier_basis = np.fft.fft(np.eye(self.filter_length)) + + cutoff = int((self.filter_length / 2 + 1)) + fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), + np.imag(fourier_basis[:cutoff, :])]) + + forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) + inverse_basis = torch.FloatTensor( + np.linalg.pinv(scale * fourier_basis).T[:, None, :].copy()) + + if window is not None: + assert(filter_length >= win_length) + # get window and zero center pad it to filter_length + fft_window = get_window(window, win_length, fftbins=True) + fft_window = pad_center(fft_window, size=filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis *= fft_window + + self.register_buffer('forward_basis', forward_basis.float().to(device)) + self.register_buffer('inverse_basis', inverse_basis.float().to(device)) + + def transform(self, input_data): + num_batches = input_data.size(0) + num_samples = input_data.size(1) + + self.num_samples = num_samples + + # similar to librosa, reflect-pad the input + input_data = input_data.view(num_batches, 1, num_samples) + input_data = F.pad( + input_data.unsqueeze(1), + (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), + mode='reflect') + input_data = input_data.squeeze(1) + # print(self.forward_basis.device) + forward_transform = F.conv1d( + input_data, + Variable(self.forward_basis, requires_grad=False), + stride=self.hop_length, + padding=0) + + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + + magnitude = torch.sqrt(real_part**2 + imag_part**2) + phase = torch.autograd.Variable( + torch.atan2(imag_part.data, real_part.data)) + + return magnitude, phase + + def inverse(self, magnitude, phase): + recombine_magnitude_phase = torch.cat( + 
[magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) + + with torch.no_grad(): + inverse_transform = F.conv_transpose1d( + recombine_magnitude_phase, self.inverse_basis, + stride=self.hop_length, padding=0) + + if self.window is not None: + window_sum = window_sumsquare( + self.window, magnitude.size(-1), hop_length=self.hop_length, + win_length=self.win_length, n_fft=self.filter_length, + dtype=np.float32) + # remove modulation effects + approx_nonzero_indices = torch.from_numpy( + np.where(window_sum > tiny(window_sum))[0]) + window_sum = torch.autograd.Variable( + torch.from_numpy(window_sum), requires_grad=False) + window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum + inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] + + # scale by hop ratio + inverse_transform *= float(self.filter_length) / self.hop_length + + inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] + inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] + + return inverse_transform + + def forward(self, input_data): + self.magnitude, self.phase = self.transform(input_data) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction diff --git a/common/tb_dllogger.py b/common/tb_dllogger.py new file mode 100644 index 0000000000000000000000000000000000000000..e73137c4c0bed4a5cef91fd75a719530b0b09452 --- /dev/null +++ b/common/tb_dllogger.py @@ -0,0 +1,172 @@ +import atexit +import glob +import re +from itertools import product +from pathlib import Path + +import dllogger +import torch +import numpy as np +from dllogger import StdOutBackend, JSONStreamBackend, Verbosity +from torch.utils.tensorboard import SummaryWriter + + +tb_loggers = {} + + +class TBLogger: + """ + xyz_dummies: stretch the screen with empty plots so the legend would + always fit for other plots + """ + def __init__(self, enabled, log_dir, name, interval=1, dummies=True): + self.enabled = enabled + self.interval = interval + self.cache = {} + if self.enabled: + self.summary_writer = SummaryWriter( + log_dir=Path(log_dir, name), flush_secs=120, max_queue=200) + atexit.register(self.summary_writer.close) + if dummies: + for key in ('_', '✕'): + self.summary_writer.add_scalar(key, 0.0, 1) + + def log(self, step, data): + for k, v in data.items(): + self.log_value(step, k, v.item() if type(v) is torch.Tensor else v) + + def log_value(self, step, key, val, stat='mean'): + if self.enabled: + if key not in self.cache: + self.cache[key] = [] + self.cache[key].append(val) + if len(self.cache[key]) == self.interval: + agg_val = getattr(np, stat)(self.cache[key]) + self.summary_writer.add_scalar(key, agg_val, step) + del self.cache[key] + + def log_grads(self, step, model): + if self.enabled: + norms = [p.grad.norm().item() for p in model.parameters() + if p.grad is not None] + for stat in ('max', 'min', 'mean'): + self.log_value(step, f'grad_{stat}', getattr(np, stat)(norms), + stat=stat) + + +def unique_log_fpath(fpath): + + if not Path(fpath).is_file(): + return fpath + + # Avoid overwriting old logs + saved = [re.search('\.(\d+)$', f) for f in glob.glob(f'{fpath}.*')] + saved = [0] + [int(m.group(1)) for m in saved if m is not None] + return f'{fpath}.{max(saved) + 1}' + + +def stdout_step_format(step): + if isinstance(step, str): + return step + fields = [] + if len(step) > 0: + fields.append("epoch {:>4}".format(step[0])) + if len(step) > 1: + fields.append("iter {:>3}".format(step[1])) + if len(step) > 2: + fields[-1] += 
"/{}".format(step[2]) + return " | ".join(fields) + + +def stdout_metric_format(metric, metadata, value): + name = metadata.get("name", metric + " : ") + unit = metadata.get("unit", None) + format = f'{{{metadata.get("format", "")}}}' + fields = [name, format.format(value) if value is not None else value, unit] + fields = [f for f in fields if f is not None] + return "| " + " ".join(fields) + + +def init(log_fpath, log_dir, enabled=True, tb_subsets=[], **tb_kw): + + if enabled: + backends = [JSONStreamBackend(Verbosity.DEFAULT, + unique_log_fpath(log_fpath)), + StdOutBackend(Verbosity.VERBOSE, + step_format=stdout_step_format, + metric_format=stdout_metric_format)] + else: + backends = [] + + dllogger.init(backends=backends) + dllogger.metadata("train_lrate", {"name": "lrate", "unit": None, "format": ":>3.2e"}) + + for id_, pref in [('train', ''), ('train_avg', 'avg train '), + ('val', ' avg val '), ('val_ema', ' EMA val ')]: + + dllogger.metadata(f"{id_}_loss", + {"name": f"{pref}loss", "unit": None, "format": ":>5.2f"}) + dllogger.metadata(f"{id_}_mel_loss", + {"name": f"{pref}mel loss", "unit": None, "format": ":>5.2f"}) + + dllogger.metadata(f"{id_}_kl_loss", + {"name": f"{pref}kl loss", "unit": None, "format": ":>5.5f"}) + dllogger.metadata(f"{id_}_kl_weight", + {"name": f"{pref}kl weight", "unit": None, "format": ":>5.5f"}) + + dllogger.metadata(f"{id_}_frames/s", + {"name": None, "unit": "frames/s", "format": ":>10.2f"}) + dllogger.metadata(f"{id_}_took", + {"name": "took", "unit": "s", "format": ":>3.2f"}) + + global tb_loggers + tb_loggers = {s: TBLogger(enabled, log_dir, name=s, **tb_kw) + for s in tb_subsets} + + +def init_inference_metadata(batch_size=None): + + modalities = [('latency', 's', ':>10.5f'), ('RTF', 'x', ':>10.2f'), + ('frames/s', 'frames/s', ':>10.2f'), ('samples/s', 'samples/s', ':>10.2f'), + ('letters/s', 'letters/s', ':>10.2f'), ('tokens/s', 'tokens/s', ':>10.2f')] + + if batch_size is not None: + modalities.append((f'RTF@{batch_size}', 'x', ':>10.2f')) + + percs = ['', 'avg', '90%', '95%', '99%'] + models = ['', 'fastpitch', 'waveglow', 'hifigan'] + + for perc, model, (mod, unit, fmt) in product(percs, models, modalities): + name = f'{perc} {model} {mod}'.strip().replace(' ', ' ') + dllogger.metadata(name.replace(' ', '_'), + {'name': f'{name: <26}', 'unit': unit, 'format': fmt}) + + +def log(step, tb_total_steps=None, data={}, subset='train'): + if tb_total_steps is not None: + tb_loggers[subset].log(tb_total_steps, data) + + if subset != '': + data = {f'{subset}_{key}': v for key, v in data.items()} + dllogger.log(step, data=data) + + +def log_grads_tb(tb_total_steps, grads, tb_subset='train'): + tb_loggers[tb_subset].log_grads(tb_total_steps, grads) + + +def parameters(data, verbosity=0, tb_subset=None): + for k, v in data.items(): + dllogger.log(step="PARAMETER", data={k: v}, verbosity=verbosity) + + if tb_subset is not None and tb_loggers[tb_subset].enabled: + tb_data = {k: v for k, v in data.items() + if type(v) in (str, bool, int, float)} + tb_loggers[tb_subset].summary_writer.add_hparams(tb_data, {}) + + +def flush(): + dllogger.flush() + for tbl in tb_loggers.values(): + if tbl.enabled: + tbl.summary_writer.flush() diff --git a/common/text/LICENSE b/common/text/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..8ac1abf2e69e605de57d5a9ddaad3d83764d7a2a --- /dev/null +++ b/common/text/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2017 Keith Ito + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this 
software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/common/text/__init__.py b/common/text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..962fcef8d987d4ef4d4112d70501f0d23f5c3743 --- /dev/null +++ b/common/text/__init__.py @@ -0,0 +1,3 @@ +from .cmudict import CMUDict + +cmudict = CMUDict() diff --git a/common/text/__pycache__/__init__.cpython-37.pyc b/common/text/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12beed0c168f0ecb16cc17f344f399997061ae88 Binary files /dev/null and b/common/text/__pycache__/__init__.cpython-37.pyc differ diff --git a/common/text/__pycache__/__init__.cpython-38.pyc b/common/text/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ddbf86d373b827bf65b7ed2a7cfb14e1380cb835 Binary files /dev/null and b/common/text/__pycache__/__init__.cpython-38.pyc differ diff --git a/common/text/__pycache__/__init__.cpython-39.pyc b/common/text/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dfec7b84a943440fefd6a10ec5d5c7c71625980 Binary files /dev/null and b/common/text/__pycache__/__init__.cpython-39.pyc differ diff --git a/common/text/__pycache__/abbreviations.cpython-37.pyc b/common/text/__pycache__/abbreviations.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ff9192b4a49deb551f8ad5cd2db659bd4fef35f Binary files /dev/null and b/common/text/__pycache__/abbreviations.cpython-37.pyc differ diff --git a/common/text/__pycache__/abbreviations.cpython-38.pyc b/common/text/__pycache__/abbreviations.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4870a76bf021b1afa5c7c931d5b9a5cf3aef1537 Binary files /dev/null and b/common/text/__pycache__/abbreviations.cpython-38.pyc differ diff --git a/common/text/__pycache__/abbreviations.cpython-39.pyc b/common/text/__pycache__/abbreviations.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97932b7f85e7ecbb4bd66eb9b2772ea7f2deafc5 Binary files /dev/null and b/common/text/__pycache__/abbreviations.cpython-39.pyc differ diff --git a/common/text/__pycache__/acronyms.cpython-37.pyc b/common/text/__pycache__/acronyms.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e49c99117e5038682289495965c0bc5981230ca8 Binary files /dev/null and b/common/text/__pycache__/acronyms.cpython-37.pyc differ diff --git a/common/text/__pycache__/acronyms.cpython-38.pyc 
b/common/text/__pycache__/acronyms.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11c9d75df497060507d034899eae6ef3fdaa54d7 Binary files /dev/null and b/common/text/__pycache__/acronyms.cpython-38.pyc differ diff --git a/common/text/__pycache__/acronyms.cpython-39.pyc b/common/text/__pycache__/acronyms.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6fc561e8a05298fc59761af6238133e50d164f04 Binary files /dev/null and b/common/text/__pycache__/acronyms.cpython-39.pyc differ diff --git a/common/text/__pycache__/cleaners.cpython-37.pyc b/common/text/__pycache__/cleaners.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f176e61fc51a092bab771b36e444f6dc5930f4f7 Binary files /dev/null and b/common/text/__pycache__/cleaners.cpython-37.pyc differ diff --git a/common/text/__pycache__/cleaners.cpython-38.pyc b/common/text/__pycache__/cleaners.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33367c777c83ff81eff9aeac117cdf4e3b40dc42 Binary files /dev/null and b/common/text/__pycache__/cleaners.cpython-38.pyc differ diff --git a/common/text/__pycache__/cleaners.cpython-39.pyc b/common/text/__pycache__/cleaners.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80cee6fd6f86a4e385eb6f47cd2785f22c5668b1 Binary files /dev/null and b/common/text/__pycache__/cleaners.cpython-39.pyc differ diff --git a/common/text/__pycache__/cmudict.cpython-37.pyc b/common/text/__pycache__/cmudict.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e41a9797f9fcea630ce632943fb08c06d0d73d72 Binary files /dev/null and b/common/text/__pycache__/cmudict.cpython-37.pyc differ diff --git a/common/text/__pycache__/cmudict.cpython-38.pyc b/common/text/__pycache__/cmudict.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ff78b3fd5adf09934a57076560e42447f4e3997 Binary files /dev/null and b/common/text/__pycache__/cmudict.cpython-38.pyc differ diff --git a/common/text/__pycache__/cmudict.cpython-39.pyc b/common/text/__pycache__/cmudict.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab94af37abe1bcbae0f671964a41c0ccb203dedb Binary files /dev/null and b/common/text/__pycache__/cmudict.cpython-39.pyc differ diff --git a/common/text/__pycache__/datestime.cpython-37.pyc b/common/text/__pycache__/datestime.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ceb22d5754761318ec04933397c5471e967e06b Binary files /dev/null and b/common/text/__pycache__/datestime.cpython-37.pyc differ diff --git a/common/text/__pycache__/datestime.cpython-38.pyc b/common/text/__pycache__/datestime.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df35a2d4d008a4318704a6f623972b569e6249e7 Binary files /dev/null and b/common/text/__pycache__/datestime.cpython-38.pyc differ diff --git a/common/text/__pycache__/datestime.cpython-39.pyc b/common/text/__pycache__/datestime.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13989da02c239cf5a23194c3ddac8af8f1e6461e Binary files /dev/null and b/common/text/__pycache__/datestime.cpython-39.pyc differ diff --git a/common/text/__pycache__/letters_and_numbers.cpython-37.pyc b/common/text/__pycache__/letters_and_numbers.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eafef327e332742405917e008ea81adb8e635519 Binary files 
/dev/null and b/common/text/__pycache__/letters_and_numbers.cpython-37.pyc differ diff --git a/common/text/__pycache__/letters_and_numbers.cpython-38.pyc b/common/text/__pycache__/letters_and_numbers.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ec2dbd0b0be8b0e38b0d7ccc0406bda8c676783 Binary files /dev/null and b/common/text/__pycache__/letters_and_numbers.cpython-38.pyc differ diff --git a/common/text/__pycache__/letters_and_numbers.cpython-39.pyc b/common/text/__pycache__/letters_and_numbers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da1f960a8ca7014cbd6109406fccd003863635c1 Binary files /dev/null and b/common/text/__pycache__/letters_and_numbers.cpython-39.pyc differ diff --git a/common/text/__pycache__/numerical.cpython-37.pyc b/common/text/__pycache__/numerical.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c999d6336b201a7e9f5b8ca72d0476549d11d784 Binary files /dev/null and b/common/text/__pycache__/numerical.cpython-37.pyc differ diff --git a/common/text/__pycache__/numerical.cpython-38.pyc b/common/text/__pycache__/numerical.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97e5db5cb8fe314be9bd8b3e8c4f221fd10d111c Binary files /dev/null and b/common/text/__pycache__/numerical.cpython-38.pyc differ diff --git a/common/text/__pycache__/numerical.cpython-39.pyc b/common/text/__pycache__/numerical.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d150b4502a6cc68553e148e1c1843df9523d651 Binary files /dev/null and b/common/text/__pycache__/numerical.cpython-39.pyc differ diff --git a/common/text/__pycache__/symbols.cpython-37.pyc b/common/text/__pycache__/symbols.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e261a2e7bc4a3e014a026565604807128954f617 Binary files /dev/null and b/common/text/__pycache__/symbols.cpython-37.pyc differ diff --git a/common/text/__pycache__/symbols.cpython-38.pyc b/common/text/__pycache__/symbols.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f4435dbc43e6f9dc88e09e9fbadbdf51fec0eba Binary files /dev/null and b/common/text/__pycache__/symbols.cpython-38.pyc differ diff --git a/common/text/__pycache__/symbols.cpython-39.pyc b/common/text/__pycache__/symbols.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..228c1f3232deb7da54ad9d1f78a00ff6b0133492 Binary files /dev/null and b/common/text/__pycache__/symbols.cpython-39.pyc differ diff --git a/common/text/__pycache__/text_processing.cpython-37.pyc b/common/text/__pycache__/text_processing.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4961140081e035b916a0231db0fcd45284429179 Binary files /dev/null and b/common/text/__pycache__/text_processing.cpython-37.pyc differ diff --git a/common/text/__pycache__/text_processing.cpython-38.pyc b/common/text/__pycache__/text_processing.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d6022c0378b82da892b5871988babf96476f86b Binary files /dev/null and b/common/text/__pycache__/text_processing.cpython-38.pyc differ diff --git a/common/text/__pycache__/text_processing.cpython-39.pyc b/common/text/__pycache__/text_processing.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..778ae0ce61ddb87fec16299458a42a23c7fe7d51 Binary files /dev/null and 
b/common/text/__pycache__/text_processing.cpython-39.pyc differ diff --git a/common/text/abbreviations.py b/common/text/abbreviations.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5ca94917e9fc50c56ea459656616147bff346c --- /dev/null +++ b/common/text/abbreviations.py @@ -0,0 +1,67 @@ +import re + +_no_period_re = re.compile(r'(No[.])(?=[ ]?[0-9])') +_percent_re = re.compile(r'([ ]?[%])') +_half_re = re.compile('([0-9]½)|(½)') +_url_re = re.compile(r'([a-zA-Z])\.(com|gov|org)') + + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'misess'), + ('ms', 'miss'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), + ('sen', 'senator'), + ('etc', 'et cetera'), +]] + + +def _expand_no_period(m): + word = m.group(0) + if word[0] == 'N': + return 'Number' + return 'number' + + +def _expand_percent(m): + return ' percent' + + +def _expand_half(m): + word = m.group(1) + if word is None: + return 'half' + return word[0] + ' and a half' + + +def _expand_urls(m): + return f'{m.group(1)} dot {m.group(2)}' + + +def normalize_abbreviations(text): + text = re.sub(_no_period_re, _expand_no_period, text) + text = re.sub(_percent_re, _expand_percent, text) + text = re.sub(_half_re, _expand_half, text) + text = re.sub('&', ' and ', text) + text = re.sub('@', ' at ', text) + text = re.sub(_url_re, _expand_urls, text) + + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text diff --git a/common/text/acronyms.py b/common/text/acronyms.py new file mode 100644 index 0000000000000000000000000000000000000000..ba147584097eff46a943fafa850e92107bfeb146 --- /dev/null +++ b/common/text/acronyms.py @@ -0,0 +1,109 @@ +import re +from . import cmudict + +_letter_to_arpabet = { + 'A': 'EY1', + 'B': 'B IY1', + 'C': 'S IY1', + 'D': 'D IY1', + 'E': 'IY1', + 'F': 'EH1 F', + 'G': 'JH IY1', + 'H': 'EY1 CH', + 'I': 'AY1', + 'J': 'JH EY1', + 'K': 'K EY1', + 'L': 'EH1 L', + 'M': 'EH1 M', + 'N': 'EH1 N', + 'O': 'OW1', + 'P': 'P IY1', + 'Q': 'K Y UW1', + 'R': 'AA1 R', + 'S': 'EH1 S', + 'T': 'T IY1', + 'U': 'Y UW1', + 'V': 'V IY1', + 'X': 'EH1 K S', + 'Y': 'W AY1', + 'W': 'D AH1 B AH0 L Y UW0', + 'Z': 'Z IY1', + 's': 'Z' +} + +# Acronyms that should not be expanded +hardcoded_acronyms = [ + 'BMW', 'MVD', 'WDSU', 'GOP', 'UK', 'AI', 'GPS', 'BP', 'FBI', 'HD', + 'CES', 'LRA', 'PC', 'NBA', 'BBL', 'OS', 'IRS', 'SAC', 'UV', 'CEO', 'TV', + 'CNN', 'MSS', 'GSA', 'USSR', 'DNA', 'PRS', 'TSA', 'US', 'GPU', 'USA', + 'FPCC', 'CIA'] + +# Words and acronyms that should be read as regular words, e.g., NATO, HAPPY, etc. 
+uppercase_whiteliset = [] + +acronyms_exceptions = { + 'NVIDIA': 'N.VIDIA', +} + +non_uppercase_exceptions = { + 'email': 'e-mail', +} + +# must ignore roman numerals +_acronym_re = re.compile(r'([a-z]*[A-Z][A-Z]+)s?\.?') +_non_uppercase_re = re.compile(r'\b({})\b'.format('|'.join(non_uppercase_exceptions.keys())), re.IGNORECASE) + + +def _expand_acronyms_to_arpa(m, add_spaces=True): + acronym = m.group(0) + + # remove dots if they exist + acronym = re.sub('\.', '', acronym) + + acronym = "".join(acronym.split()) + arpabet = cmudict.lookup(acronym) + + if arpabet is None: + acronym = list(acronym) + arpabet = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym] + # temporary fix + if arpabet[-1] == '{Z}' and len(arpabet) > 1: + arpabet[-2] = arpabet[-2][:-1] + ' ' + arpabet[-1][1:] + del arpabet[-1] + + arpabet = ' '.join(arpabet) + elif len(arpabet) == 1: + arpabet = "{" + arpabet[0] + "}" + else: + arpabet = acronym + + return arpabet + + +def normalize_acronyms(text): + text = re.sub(_acronym_re, _expand_acronyms_to_arpa, text) + return text + + +def expand_acronyms(m): + text = m.group(1) + if text in acronyms_exceptions: + text = acronyms_exceptions[text] + elif text in uppercase_whiteliset: + text = text + else: + text = '.'.join(text) + '.' + + if 's' in m.group(0): + text = text + '\'s' + + if text[-1] != '.' and m.group(0)[-1] == '.': + return text + '.' + else: + return text + + +def spell_acronyms(text): + text = re.sub(_non_uppercase_re, lambda m: non_uppercase_exceptions[m.group(0).lower()], text) + text = re.sub(_acronym_re, expand_acronyms, text) + return text diff --git a/common/text/cleaners.py b/common/text/cleaners.py new file mode 100644 index 0000000000000000000000000000000000000000..700a96d6ab8c6eefcffc4c3c7c77e1be9d62c25d --- /dev/null +++ b/common/text/cleaners.py @@ -0,0 +1,102 @@ +""" adapted from https://github.com/keithito/tacotron """ + +''' +Cleaners are transformations that run over the input text at both training and eval time. + +Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" +hyperparameter. Some cleaners are English-specific. You'll typically want to use: + 1. "english_cleaners" for English text + 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using + the Unidecode library (https://pypi.python.org/pypi/Unidecode) + 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update + the symbols in symbols.py to match your data). 
+''' + +import re +from .abbreviations import normalize_abbreviations +from .acronyms import normalize_acronyms, spell_acronyms +from .datestime import normalize_datestime +from .letters_and_numbers import normalize_letters_and_numbers +from .numerical import normalize_numbers +from .unidecoder import unidecoder + + +# Regular expression matching whitespace: +_whitespace_re = re.compile(r'\s+') + + +def expand_abbreviations(text): + return normalize_abbreviations(text) + + +def expand_numbers(text): + return normalize_numbers(text) + + +def expand_acronyms(text): + return normalize_acronyms(text) + + +def expand_datestime(text): + return normalize_datestime(text) + + +def expand_letters_and_numbers(text): + return normalize_letters_and_numbers(text) + + +def lowercase(text): + return text.lower() + + +def collapse_whitespace(text): + return re.sub(_whitespace_re, ' ', text) + + +def separate_acronyms(text): + text = re.sub(r"([0-9]+)([a-zA-Z]+)", r"\1 \2", text) + text = re.sub(r"([a-zA-Z]+)([0-9]+)", r"\1 \2", text) + return text + + +def convert_to_ascii(text): + return unidecoder(text) + + +def basic_cleaners(text): + '''Basic pipeline that collapses whitespace without transliteration.''' + # text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def transliteration_cleaners(text): + '''Pipeline for non-English text that transliterates to ASCII.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners(text): + '''Pipeline for English text, with number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners_v2(text): + text = convert_to_ascii(text) + text = expand_datestime(text) + text = expand_letters_and_numbers(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = spell_acronyms(text) + text = lowercase(text) + text = collapse_whitespace(text) + # compatibility with basic_english symbol set + text = re.sub(r'/+', ' ', text) + return text diff --git a/common/text/cmudict.py b/common/text/cmudict.py new file mode 100644 index 0000000000000000000000000000000000000000..c021967d61a13a89cc02fcdc8e838b5545a6b4a9 --- /dev/null +++ b/common/text/cmudict.py @@ -0,0 +1,98 @@ +""" from https://github.com/keithito/tacotron """ + +import re +import sys +import urllib.request +from pathlib import Path + + +valid_symbols = [ + 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', + 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', + 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', + 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', + 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', + 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', + 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' +] + +_valid_symbol_set = set(valid_symbols) + + +class CMUDict: + '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' + def __init__(self, file_or_path=None, heteronyms_path=None, keep_ambiguous=True): + self._entries = {} + self.heteronyms = [] + if file_or_path is not None: + self.initialize(file_or_path, heteronyms_path, keep_ambiguous) + + def initialize(self, file_or_path, heteronyms_path, keep_ambiguous=True): + if isinstance(file_or_path, str): + if not Path(file_or_path).exists(): + print("CMUdict missing. Downloading to data/cmudict/.") + self.download() + + with open(file_or_path, encoding='latin-1') as f: + entries = _parse_cmudict(f) + + else: + entries = _parse_cmudict(file_or_path) + + if not keep_ambiguous: + entries = {word: pron for word, pron in entries.items() if len(pron) == 1} + self._entries = entries + + if heteronyms_path is not None: + with open(heteronyms_path, encoding='utf-8') as f: + self.heteronyms = [l.rstrip() for l in f] + + def __len__(self): + if len(self._entries) == 0: + raise ValueError("CMUDict not initialized") + return len(self._entries) + + def lookup(self, word): + '''Returns list of ARPAbet pronunciations of the given word.''' + if len(self._entries) == 0: + raise ValueError("CMUDict not initialized") + return self._entries.get(word.upper()) + + def download(self): + url = 'https://github.com/Alexir/CMUdict/raw/master/cmudict-0.7b' + try: + Path('cmudict').mkdir(parents=False, exist_ok=True) + urllib.request.urlretrieve(url, filename='cmudict/cmudict-0.7b') + except: + print("Automatic download of CMUdict failed. Try manually with:") + print() + print(" bash scripts/download_cmudict.sh") + print() + print("and re-run the script.") + sys.exit(0) + + +_alt_re = re.compile(r'\([0-9]+\)') + + +def _parse_cmudict(file): + cmudict = {} + for line in file: + if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): + parts = line.split(' ') + word = re.sub(_alt_re, '', parts[0]) + pronunciation = _get_pronunciation(parts[1]) + if pronunciation: + if word in cmudict: + cmudict[word].append(pronunciation) + else: + cmudict[word] = [pronunciation] + return cmudict + + +def _get_pronunciation(s): + parts = s.strip().split(' ') + for part in parts: + if part not in _valid_symbol_set: + return None + return ' '.join(parts) diff --git a/common/text/datestime.py b/common/text/datestime.py new file mode 100644 index 0000000000000000000000000000000000000000..614039fc4d64df79c7950e6ba72285010d8c9c82 --- /dev/null +++ b/common/text/datestime.py @@ -0,0 +1,22 @@ +import re +_ampm_re = re.compile( + r'([0-9]|0[0-9]|1[0-9]|2[0-3]):?([0-5][0-9])?\s*([AaPp][Mm]\b)') + + +def _expand_ampm(m): + matches = list(m.groups(0)) + txt = matches[0] + txt = txt if int(matches[1]) == 0 else txt + ' ' + matches[1] + + if matches[2][0].lower() == 'a': + txt += ' a.m.' + elif matches[2][0].lower() == 'p': + txt += ' p.m.' 
+ + return txt + + +def normalize_datestime(text): + text = re.sub(_ampm_re, _expand_ampm, text) + #text = re.sub(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])?", r"\1 \2", text) + return text diff --git a/common/text/letters_and_numbers.py b/common/text/letters_and_numbers.py new file mode 100644 index 0000000000000000000000000000000000000000..4e584c963274f5baf04417026de3d75f81962b32 --- /dev/null +++ b/common/text/letters_and_numbers.py @@ -0,0 +1,90 @@ +import re +_letters_and_numbers_re = re.compile( + r"((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9']*)", re.IGNORECASE) + +_hardware_re = re.compile( + '([0-9]+(?:[.,][0-9]+)?)(?:\s?)(tb|gb|mb|kb|ghz|mhz|khz|hz|mm)', re.IGNORECASE) +_hardware_key = {'tb': 'terabyte', + 'gb': 'gigabyte', + 'mb': 'megabyte', + 'kb': 'kilobyte', + 'ghz': 'gigahertz', + 'mhz': 'megahertz', + 'khz': 'kilohertz', + 'hz': 'hertz', + 'mm': 'millimeter', + 'cm': 'centimeter', + 'km': 'kilometer'} + +_dimension_re = re.compile( + r'\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b|\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b') +_dimension_key = {'m': 'meter', + 'in': 'inch', + 'inch': 'inch'} + + + + +def _expand_letters_and_numbers(m): + text = re.split(r'(\d+)', m.group(0)) + + # remove trailing space + if text[-1] == '': + text = text[:-1] + elif text[0] == '': + text = text[1:] + + # if not like 1920s, or AK47's , 20th, 1st, 2nd, 3rd, etc... + if text[-1] in ("'s", "s", "th", "nd", "st", "rd") and text[-2].isdigit(): + text[-2] = text[-2] + text[-1] + text = text[:-1] + + # for combining digits 2 by 2 + new_text = [] + for i in range(len(text)): + string = text[i] + if string.isdigit() and len(string) < 5: + # heuristics + if len(string) > 2 and string[-2] == '0': + if string[-1] == '0': + string = [string] + else: + string = [string[:-2], string[-2], string[-1]] + elif len(string) % 2 == 0: + string = [string[i:i+2] for i in range(0, len(string), 2)] + elif len(string) > 2: + string = [string[0]] + [string[i:i+2] for i in range(1, len(string), 2)] + new_text.extend(string) + else: + new_text.append(string) + + text = new_text + text = " ".join(text) + return text + + +def _expand_hardware(m): + quantity, measure = m.groups(0) + measure = _hardware_key[measure.lower()] + if measure[-1] != 'z' and float(quantity.replace(',', '')) > 1: + return "{} {}s".format(quantity, measure) + return "{} {}".format(quantity, measure) + + +def _expand_dimension(m): + text = "".join([x for x in m.groups(0) if x != 0]) + text = text.replace(' x ', ' by ') + text = text.replace('x', ' by ') + if text.endswith(tuple(_dimension_key.keys())): + if text[-2].isdigit(): + text = "{} {}".format(text[:-1], _dimension_key[text[-1:]]) + elif text[-3].isdigit(): + text = "{} {}".format(text[:-2], _dimension_key[text[-2:]]) + return text + + +def normalize_letters_and_numbers(text): + text = re.sub(_hardware_re, _expand_hardware, text) + text = re.sub(_dimension_re, _expand_dimension, text) + text = re.sub(_letters_and_numbers_re, _expand_letters_and_numbers, text) + return text diff --git a/common/text/numerical.py b/common/text/numerical.py new file mode 100644 index 0000000000000000000000000000000000000000..1fe4265f8d07484ecbb08ad7b536a452f15e7b62 --- /dev/null +++ b/common/text/numerical.py @@ -0,0 +1,153 @@ +""" adapted from https://github.com/keithito/tacotron """ + +import inflect +import re +_magnitudes = ['trillion', 'billion', 'million', 'thousand', 'hundred', 'm', 'b', 't'] +_magnitudes_key = {'m': 'million', 'b': 'billion', 't': 
'trillion'} +_measurements = '(f|c|k|d|m)' +_measurements_key = {'f': 'fahrenheit', + 'c': 'celsius', + 'k': 'thousand', + 'm': 'meters'} +_currency_key = {'$': 'dollar', '£': 'pound', '€': 'euro', '₩': 'won'} +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_currency_re = re.compile(r'([\$€£₩])([0-9\.\,]*[0-9]+)(?:[ ]?({})(?=[^a-zA-Z]|$))?'.format("|".join(_magnitudes)), re.IGNORECASE) +_measurement_re = re.compile(r'([0-9\.\,]*[0-9]+(\s)?{}\b)'.format(_measurements), re.IGNORECASE) +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +# _range_re = re.compile(r'(?<=[0-9])+(-)(?=[0-9])+.*?') +_roman_re = re.compile(r'\b(?=[MDCLXVI]+\b)M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{2,3})\b') # avoid I +_multiply_re = re.compile(r'(\b[0-9]+)(x)([0-9]+)') +_number_re = re.compile(r"[0-9]+'s|[0-9]+s|[0-9]+") + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_currency(m): + currency = _currency_key[m.group(1)] + quantity = m.group(2) + magnitude = m.group(3) + + # remove commas from quantity to be able to convert to numerical + quantity = quantity.replace(',', '') + + # check for million, billion, etc... + if magnitude is not None and magnitude.lower() in _magnitudes: + if len(magnitude) == 1: + magnitude = _magnitudes_key[magnitude.lower()] + return "{} {} {}".format(_expand_hundreds(quantity), magnitude, currency+'s') + + parts = quantity.split('.') + if len(parts) > 2: + return quantity + " " + currency + "s" # Unexpected format + + dollars = int(parts[0]) if parts[0] else 0 + + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = currency if dollars == 1 else currency+'s' + cent_unit = 'cent' if cents == 1 else 'cents' + return "{} {}, {} {}".format( + _expand_hundreds(dollars), dollar_unit, + _inflect.number_to_words(cents), cent_unit) + elif dollars: + dollar_unit = currency if dollars == 1 else currency+'s' + return "{} {}".format(_expand_hundreds(dollars), dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return "{} {}".format(_inflect.number_to_words(cents), cent_unit) + else: + return 'zero' + ' ' + currency + 's' + + +def _expand_hundreds(text): + number = float(text) + if 1000 < number < 10000 and (number % 100 == 0) and (number % 1000 != 0): + return _inflect.number_to_words(int(number / 100)) + " hundred" + else: + return _inflect.number_to_words(text) + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_measurement(m): + _, number, measurement = re.split('(\d+(?:\.\d+)?)', m.group(0)) + number = _inflect.number_to_words(number) + measurement = "".join(measurement.split()) + measurement = _measurements_key[measurement.lower()] + return "{} {}".format(number, measurement) + + +def _expand_range(m): + return ' to ' + + +def _expand_multiply(m): + left = m.group(1) + right = m.group(3) + return "{} by {}".format(left, right) + + +def _expand_roman(m): + # from https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python + roman_numerals = {'I':1, 'V':5, 'X':10, 'L':50, 'C':100, 'D':500, 'M':1000} + result = 0 + num = m.group(0) + for i, c in enumerate(num): + if (i+1) == len(num) or roman_numerals[c] >= roman_numerals[num[i+1]]: + result += roman_numerals[c] + else: + result -= roman_numerals[c] + return str(result) + + +def _expand_number(m): + 
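# Spell out a bare integer; values between 1001 and 2999 are read like years (e.g. 1995 -> "nineteen ninety five"), and a trailing "s"/"'s" is re-attached afterwards. +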
_, number, suffix = re.split(r"(\d+(?:'?\d+)?)", m.group(0)) + number = int(number) + if number > 1000 < 10000 and (number % 100 == 0) and (number % 1000 != 0): + text = _inflect.number_to_words(number // 100) + " hundred" + elif number > 1000 and number < 3000: + if number == 2000: + text = 'two thousand' + elif number > 2000 and number < 2010: + text = 'two thousand ' + _inflect.number_to_words(number % 100) + elif number % 100 == 0: + text = _inflect.number_to_words(number // 100) + ' hundred' + else: + number = _inflect.number_to_words(number, andword='', zero='oh', group=2).replace(', ', ' ') + number = re.sub(r'-', ' ', number) + text = number + else: + number = _inflect.number_to_words(number, andword='and') + number = re.sub(r'-', ' ', number) + number = re.sub(r',', '', number) + text = number + + if suffix in ("'s", "s"): + if text[-1] == 'y': + text = text[:-1] + 'ies' + else: + text = text + suffix + + return text + + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_currency_re, _expand_currency, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + # text = re.sub(_range_re, _expand_range, text) + # text = re.sub(_measurement_re, _expand_measurement, text) + text = re.sub(_roman_re, _expand_roman, text) + text = re.sub(_multiply_re, _expand_multiply, text) + text = re.sub(_number_re, _expand_number, text) + return text diff --git a/common/text/symbols.py b/common/text/symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..595e1b113c2ec0dce3b95f6f826b23a062ba932c --- /dev/null +++ b/common/text/symbols.py @@ -0,0 +1,81 @@ +""" from https://github.com/keithito/tacotron """ + +''' +Defines the set of symbols used in text input to the model. + +The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' +from .cmudict import valid_symbols + + +# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): +_arpabet = ['@' + s for s in valid_symbols] + + +def get_symbols(symbol_set='english_basic'): + if symbol_set == 'english_basic': + _pad = '_' + _punctuation = '!\'(),.:;? ' + _special = '-' + _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + elif symbol_set == 'english_basic_lowercase': + _pad = '_' + _punctuation = '!\'"(),.:;? ' + _special = '-' + _letters = 'abcdefghijklmnopqrstuvwxyz' + symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + elif symbol_set == 'english_expanded': + _punctuation = '!\'",.:;? ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + _accented = 'áçéêëñöøćž' + _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + symbols = list(_punctuation + _math + _special + _accented + _letters) + _arpabet + elif symbol_set == 'smj_expanded': + _punctuation = '!\'",.:;?- ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + # _accented = 'áçéêëñöøćžđšŧ' #also north sámi letters... + _accented = 'áçéêëñöø' #also north sámi letters... 
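+    # Note: these Sámi symbol sets define no explicit pad character, so get_pad_idx() further down returns index 0, i.e. the leading '!' of _punctuation.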
+ # _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + _letters = 'AÁÆÅÄBCDEFGHIJKLMNŊŃÑOØÖPQRSTŦUVWXYZaáæåäbcdefghijklmnŋńñoøöpqrstuvwxyz' ########################## Ŧ ######################## + # symbols = list(_punctuation + _math + _special + _accented + _letters) #+ _arpabet + symbols = list(_punctuation + _letters) + _arpabet + elif symbol_set == 'sme_expanded': + _punctuation = '!\'",.:;?- ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + _accented = 'áçéêëńñöøćčžđšŧ' #also north sámi letters... + # _accented = 'áçéêëñöø' #also north sámi letters... + # _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + _letters = 'AÁÆÅÄBCČDĐEFGHIJKLMNŊOØÖPQRSŠTŦUVWXYZŽaáæåäbcčdđefghijklmnŋoøöpqrsštŧuvwxyzž' + # symbols = list(_punctuation + _math + _special + _accented + _letters) #+ _arpabet + symbols = list(_punctuation + _letters) + _arpabet + elif symbol_set == 'sma_expanded': + _punctuation = '!\'",.:;?- ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + _accented = 'áäæçéêëïńñöøćčžđšŧ' #also north sámi letters... + # _accented = 'áçéêëñöø' #also north sámi letters... + # _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + _letters = 'AÆÅBCDEFGHIÏJKLMNOØÖPQRSTUVWXYZaæåbcdefghiïjklmnoøöpqrstuvwxyz' + # symbols = list(_punctuation + _math + _special + _accented + _letters) #+ _arpabet + symbols = list(_punctuation + _letters) + _arpabet + elif symbol_set == 'all_sami': + _punctuation = '!\'",.:;?- ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + _accented = 'áäæçéêëïńñöøćčžđšŧ' + _letters = 'AÁÆÅÄBCČDĐEFGHIÏJKLMNŊŃÑOØÖPQRSŠTŦUVWXYZŽaáæåäbcčdđefghiïjklmnŋńñoøöpqrsštŧuvwxyzž' + symbols = list(_punctuation + _letters)# + _arpabet + else: + raise Exception("{} symbol set does not exist".format(symbol_set)) + + return symbols + + +def get_pad_idx(symbol_set='english_basic'): + if symbol_set in {'english_basic', 'english_basic_lowercase', 'smj_expanded', 'sme_expanded', 'sma_expanded', 'all_sami'}: + return 0 + else: + raise Exception("{} symbol set not used yet".format(symbol_set)) diff --git a/common/text/text_processing.py b/common/text/text_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..873aeb8aac9d9a24a351efbf3ea19373f53a5bae --- /dev/null +++ b/common/text/text_processing.py @@ -0,0 +1,164 @@ +""" adapted from https://github.com/keithito/tacotron """ +import re +import numpy as np +from . import cleaners +from .symbols import get_symbols +from . 
import cmudict +from .numerical import _currency_re, _expand_currency + + +######### +# REGEX # +######### + +# Regular expression matching text enclosed in curly braces for encoding +_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') + +# Regular expression matching words and not words +_words_re = re.compile(r"([a-zA-ZÀ-ž]+['][a-zA-ZÀ-ž]{1,2}|[a-zA-ZÀ-ž]+)|([{][^}]+[}]|[^a-zA-ZÀ-ž{}]+)") + +# Regular expression separating words enclosed in curly braces for cleaning +_arpa_re = re.compile(r'{[^}]+}|\S+') + + +class TextProcessing(object): + def __init__(self, symbol_set, cleaner_names, p_arpabet=0.0, + handle_arpabet='word', handle_arpabet_ambiguous='ignore', + expand_currency=True): + self.symbols = get_symbols(symbol_set) + self.cleaner_names = cleaner_names + + # Mappings from symbol to numeric ID and vice versa: + self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)} + self.id_to_symbol = {i: s for i, s in enumerate(self.symbols)} + self.expand_currency = expand_currency + + # cmudict + self.p_arpabet = p_arpabet + self.handle_arpabet = handle_arpabet + self.handle_arpabet_ambiguous = handle_arpabet_ambiguous + + + def text_to_sequence(self, text): + sequence = [] + + # Check for curly braces and treat their contents as ARPAbet: + while len(text): + m = _curly_re.match(text) + if not m: + sequence += self.symbols_to_sequence(text) + break + sequence += self.symbols_to_sequence(m.group(1)) + sequence += self.arpabet_to_sequence(m.group(2)) + text = m.group(3) + + return sequence + + def sequence_to_text(self, sequence): + result = '' + for symbol_id in sequence: + if symbol_id in self.id_to_symbol: + s = self.id_to_symbol[symbol_id] + # Enclose ARPAbet back in curly braces: + if len(s) > 1 and s[0] == '@': + s = '{%s}' % s[1:] + result += s + return result.replace('}{', ' ') + + def clean_text(self, text): + for name in self.cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception('Unknown cleaner: %s' % name) + text = cleaner(text) + + return text + + def symbols_to_sequence(self, symbols): + return [self.symbol_to_id[s] for s in symbols if s in self.symbol_to_id] + + def arpabet_to_sequence(self, text): + return self.symbols_to_sequence(['@' + s for s in text.split()]) + + def get_arpabet(self, word): + arpabet_suffix = '' + + if word.lower() in cmudict.heteronyms: + return word + + if len(word) > 2 and word.endswith("'s"): + arpabet = cmudict.lookup(word) + if arpabet is None: + arpabet = self.get_arpabet(word[:-2]) + arpabet_suffix = ' Z' + elif len(word) > 1 and word.endswith("s"): + arpabet = cmudict.lookup(word) + if arpabet is None: + arpabet = self.get_arpabet(word[:-1]) + arpabet_suffix = ' Z' + else: + arpabet = cmudict.lookup(word) + + if arpabet is None: + return word + elif arpabet[0] == '{': + arpabet = [arpabet[1:-1]] + + # XXX arpabet might not be a list here + if type(arpabet) is not list: + return word + + if len(arpabet) > 1: + if self.handle_arpabet_ambiguous == 'first': + arpabet = arpabet[0] + elif self.handle_arpabet_ambiguous == 'random': + arpabet = np.random.choice(arpabet) + elif self.handle_arpabet_ambiguous == 'ignore': + return word + else: + arpabet = arpabet[0] + + arpabet = "{" + arpabet + arpabet_suffix + "}" + + return arpabet + + def encode_text(self, text, return_all=False): + if self.expand_currency: + text = re.sub(_currency_re, _expand_currency, text) + text_clean = [self.clean_text(split) if split[0] != '{' else split + for split in _arpa_re.findall(text)] + text_clean = ' '.join(text_clean) + text_clean = 
cleaners.collapse_whitespace(text_clean) + text = text_clean + """ + text_arpabet = '' + if self.p_arpabet > 0: + if self.handle_arpabet == 'sentence': + if np.random.uniform() < self.p_arpabet: + words = _words_re.findall(text) + text_arpabet = [ + self.get_arpabet(word[0]) + if (word[0] != '') else word[1] + for word in words] + text_arpabet = ''.join(text_arpabet) + text = text_arpabet + elif self.handle_arpabet == 'word': + words = _words_re.findall(text) + text_arpabet = [ + word[1] if word[0] == '' else ( + self.get_arpabet(word[0]) + if np.random.uniform() < self.p_arpabet + else word[0]) + for word in words] + text_arpabet = ''.join(text_arpabet) + text = text_arpabet + elif self.handle_arpabet != '': + raise Exception("{} handle_arpabet is not supported".format( + self.handle_arpabet)) + """ + text_encoded = self.text_to_sequence(text) + + if return_all: + return text_encoded, text_clean, text_arpabet + # print(text_clean, text_encoded) + return text_encoded diff --git a/common/text/unidecoder/__init__.py b/common/text/unidecoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e2e76516561849f9e140a8610d9ada6e832cad64 --- /dev/null +++ b/common/text/unidecoder/__init__.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
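+# Best-effort ASCII transliteration: unidecoder() replaces non-ASCII characters using the replacement and homoglyph tables in this package, e.g. unidecoder('Æìñ') -> 'AEin'.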
+ +import io +import warnings + +from .homoglyphs import homoglyphs +from .replacements import replacements + + +_replacements = {uni: asc for uni, asc in replacements} +_homoglyphs = {g: asc for asc, glyphs in homoglyphs.items() for g in glyphs} + + +def unidecoder(s, homoglyphs=False): + """Transliterate unicode + + Args: + s (str): unicode string + homoglyphs (bool): prioritize translating to homoglyphs + """ + warned = False # Once per utterance + ret = '' + for u in s: + if ord(u) < 127: + a = u + elif homoglyphs: + a = _homoglyphs.get(u, _replacements.get(u, None)) + else: + a = _replacements.get(u, _homoglyphs.get(u, None)) + + if a is None: + if not warned: + warnings.warn(f'Unexpected character {u}: ' + 'please revise your text cleaning rules.', + stacklevel=10**6) + warned = True + else: + ret += a + + return ret diff --git a/common/text/unidecoder/__pycache__/__init__.cpython-37.pyc b/common/text/unidecoder/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49650c64dbfcd45a3a65773fd63853fd7606f844 Binary files /dev/null and b/common/text/unidecoder/__pycache__/__init__.cpython-37.pyc differ diff --git a/common/text/unidecoder/__pycache__/__init__.cpython-38.pyc b/common/text/unidecoder/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b74bec8cd75d1b1ac6e97315eaa10197e6fa0c7 Binary files /dev/null and b/common/text/unidecoder/__pycache__/__init__.cpython-38.pyc differ diff --git a/common/text/unidecoder/__pycache__/__init__.cpython-39.pyc b/common/text/unidecoder/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..baca350bb74d722e481ac138a05d5ca3687ac7cb Binary files /dev/null and b/common/text/unidecoder/__pycache__/__init__.cpython-39.pyc differ diff --git a/common/text/unidecoder/__pycache__/homoglyphs.cpython-37.pyc b/common/text/unidecoder/__pycache__/homoglyphs.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7b7e3e5764c2fb4164e75c733d7caa8c0e4edd0 Binary files /dev/null and b/common/text/unidecoder/__pycache__/homoglyphs.cpython-37.pyc differ diff --git a/common/text/unidecoder/__pycache__/homoglyphs.cpython-38.pyc b/common/text/unidecoder/__pycache__/homoglyphs.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2aa32cdf7e93f1d34f7e03d68ad73e61a41aac9a Binary files /dev/null and b/common/text/unidecoder/__pycache__/homoglyphs.cpython-38.pyc differ diff --git a/common/text/unidecoder/__pycache__/homoglyphs.cpython-39.pyc b/common/text/unidecoder/__pycache__/homoglyphs.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8729af4b21c4884a235a3d5aede14608bb99980 Binary files /dev/null and b/common/text/unidecoder/__pycache__/homoglyphs.cpython-39.pyc differ diff --git a/common/text/unidecoder/__pycache__/replacements.cpython-37.pyc b/common/text/unidecoder/__pycache__/replacements.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdd4f1cdbf368bc27621831b1d35a8aeba403662 Binary files /dev/null and b/common/text/unidecoder/__pycache__/replacements.cpython-37.pyc differ diff --git a/common/text/unidecoder/__pycache__/replacements.cpython-38.pyc b/common/text/unidecoder/__pycache__/replacements.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..532606f3f6866eb0d54952d5ce8dab46c591db46 Binary files /dev/null and 
b/common/text/unidecoder/__pycache__/replacements.cpython-38.pyc differ diff --git a/common/text/unidecoder/__pycache__/replacements.cpython-39.pyc b/common/text/unidecoder/__pycache__/replacements.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8d17e87fe038e1922d3c0bb5752d5ed879acfad Binary files /dev/null and b/common/text/unidecoder/__pycache__/replacements.cpython-39.pyc differ diff --git a/common/text/unidecoder/homoglyphs.py b/common/text/unidecoder/homoglyphs.py new file mode 100644 index 0000000000000000000000000000000000000000..e701be3ee7de83b52110890d554ecbe9ad1fdd70 --- /dev/null +++ b/common/text/unidecoder/homoglyphs.py @@ -0,0 +1,129 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The MIT License (MIT) +# +# Copyright (c) 2015 Rob Dawson +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
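+# homoglyphs: maps each ASCII character to visually similar Unicode characters; unidecoder() inverts this table to fold look-alike glyphs back to ASCII.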
+ +# +# Based on: +# https://github.com/codebox/homoglyph/blob/master/raw_data/chars.txt +# + +homoglyphs = { + ' ': ['\xa0', '\u1680', '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200a', '\u2028', '\u2029', '\u202f', '\u205f'], + '!': ['ǃ', 'ⵑ', '!'], + '$': ['$'], + '%': ['%'], + '&': ['ꝸ', '&'], + "'": ['´', 'ʹ', 'ʻ', 'ʼ', 'ʽ', 'ʾ', 'ˈ', 'ˊ', 'ˋ', '˴', 'ʹ', '΄', '՚', '՝', 'י', '׳', 'ߴ', 'ߵ', 'ᑊ', 'ᛌ', '᾽', '᾿', '`', '´', '῾', '‘', '’', '‛', '′', '‵', 'ꞌ', ''', '`', '𖽑', '𖽒'], + '"': ['¨', 'ʺ', '˝', 'ˮ', '״', '“', '”', '‟', '❝', '❞', '⠐', '⹂'], + '(': ['❨', '❲', '〔', '﴾', '(', '['], + ')': ['❩', '❳', '〕', '﴿', ')', ']'], + '*': ['٭', '⁎', '∗', '*', '𐌟'], + '+': ['᛭', '➕', '+', '𐊛'], + ',': ['¸', '؍', '٫', '‚', 'ꓹ', ','], + '-': ['˗', '۔', '‐', '‑', '‒', '–', '⁃', '−', '➖', 'Ⲻ', '﹘'], + '.': ['٠', '۰', '܁', '܂', '․', 'ꓸ', '꘎', '.', '𐩐', '𝅭'], + '/': ['᜵', '⁁', '⁄', '∕', '╱', '⟋', '⧸', 'Ⳇ', '⼃', '〳', 'ノ', '㇓', '丿', '/', '𝈺'], + '2': ['Ƨ', 'Ϩ', 'ᒿ', 'Ꙅ', 'ꛯ', 'Ꝛ', '2', '𝟐', '𝟚', '𝟤', '𝟮', '𝟸', '\U0001fbf2'], + '3': ['Ʒ', 'Ȝ', 'З', 'Ӡ', 'Ⳍ', 'Ꝫ', 'Ɜ', '3', '𑣊', '𖼻', '𝈆', '𝟑', '𝟛', '𝟥', '𝟯', '𝟹', '\U0001fbf3'], + '4': ['Ꮞ', '4', '𑢯', '𝟒', '𝟜', '𝟦', '𝟰', '𝟺', '\U0001fbf4'], + '5': ['Ƽ', '5', '𑢻', '𝟓', '𝟝', '𝟧', '𝟱', '𝟻', '\U0001fbf5'], + '6': ['б', 'Ꮾ', 'Ⳓ', '6', '𑣕', '𝟔', '𝟞', '𝟨', '𝟲', '𝟼', '\U0001fbf6'], + '7': ['7', '𐓒', '𑣆', '𝈒', '𝟕', '𝟟', '𝟩', '𝟳', '𝟽', '\U0001fbf7'], + '8': ['Ȣ', 'ȣ', '৪', '੪', 'ଃ', '8', '𐌚', '𝟖', '𝟠', '𝟪', '𝟴', '𝟾', '𞣋', '\U0001fbf8'], + '9': ['৭', '੧', '୨', '൭', 'Ⳋ', 'Ꝯ', '9', '𑢬', '𑣌', '𑣖', '𝟗', '𝟡', '𝟫', '𝟵', '𝟿', '\U0001fbf9'], + ':': ['ː', '˸', '։', '׃', '܃', '܄', 'ः', 'ઃ', '᛬', '᠃', '᠉', '⁚', '∶', 'ꓽ', '꞉', '︰', ':'], + ';': [';', ';'], + '<': ['˂', 'ᐸ', 'ᚲ', '‹', '❮', '<', '𝈶'], + '=': ['᐀', '⹀', '゠', '꓿', '='], + '>': ['˃', 'ᐳ', '›', '❯', '>', '𖼿', '𝈷'], + '?': ['Ɂ', 'ʔ', 'ॽ', 'Ꭾ', 'ꛫ', '?'], + '@': ['@'], + 'A': ['Α', 'А', 'Ꭺ', 'ᗅ', 'ᴀ', 'ꓮ', 'ꭺ', 'A', '𐊠', '𖽀', '𝐀', '𝐴', '𝑨', '𝒜', '𝓐', '𝔄', '𝔸', '𝕬', '𝖠', '𝗔', '𝘈', '𝘼', '𝙰', '𝚨', '𝛢', '𝜜', '𝝖', '𝞐'], + 'B': ['ʙ', 'Β', 'В', 'в', 'Ᏼ', 'ᏼ', 'ᗷ', 'ᛒ', 'ℬ', 'ꓐ', 'Ꞵ', 'B', '𐊂', '𐊡', '𐌁', '𝐁', '𝐵', '𝑩', '𝓑', '𝔅', '𝔹', '𝕭', '𝖡', '𝗕', '𝘉', '𝘽', '𝙱', '𝚩', '𝛣', '𝜝', '𝝗', '𝞑'], + 'C': ['Ϲ', 'С', 'Ꮯ', 'ᑕ', 'ℂ', 'ℭ', 'Ⅽ', '⊂', 'Ⲥ', '⸦', 'ꓚ', 'C', '𐊢', '𐌂', '𐐕', '𐔜', '𑣩', '𑣲', '𝐂', '𝐶', '𝑪', '𝒞', '𝓒', '𝕮', '𝖢', '𝗖', '𝘊', '𝘾', '𝙲', '🝌'], + 'D': ['Ꭰ', 'ᗞ', 'ᗪ', 'ᴅ', 'ⅅ', 'Ⅾ', 'ꓓ', 'ꭰ', 'D', '𝐃', '𝐷', '𝑫', '𝒟', '𝓓', '𝔇', '𝔻', '𝕯', '𝖣', '𝗗', '𝘋', '𝘿', '𝙳'], + 'E': ['Ε', 'Е', 'Ꭼ', 'ᴇ', 'ℰ', '⋿', 'ⴹ', 'ꓰ', 'ꭼ', 'E', '𐊆', '𑢦', '𑢮', '𝐄', '𝐸', '𝑬', '𝓔', '𝔈', '𝔼', '𝕰', '𝖤', '𝗘', '𝘌', '𝙀', '𝙴', '𝚬', '𝛦', '𝜠', '𝝚', '𝞔'], + 'F': ['Ϝ', 'ᖴ', 'ℱ', 'ꓝ', 'Ꞙ', 'F', '𐊇', '𐊥', '𐔥', '𑢢', '𑣂', '𝈓', '𝐅', '𝐹', '𝑭', '𝓕', '𝔉', '𝔽', '𝕱', '𝖥', '𝗙', '𝘍', '𝙁', '𝙵', '𝟊'], + 'G': ['ɢ', 'Ԍ', 'ԍ', 'Ꮐ', 'Ᏻ', 'ᏻ', 'ꓖ', 'ꮐ', 'G', '𝐆', '𝐺', '𝑮', '𝒢', '𝓖', '𝔊', '𝔾', '𝕲', '𝖦', '𝗚', '𝘎', '𝙂', '𝙶'], + 'H': ['ʜ', 'Η', 'Н', 'н', 'Ꮋ', 'ᕼ', 'ℋ', 'ℌ', 'ℍ', 'Ⲏ', 'ꓧ', 'ꮋ', 'H', '𐋏', '𝐇', '𝐻', '𝑯', '𝓗', '𝕳', '𝖧', '𝗛', '𝘏', '𝙃', '𝙷', '𝚮', '𝛨', '𝜢', '𝝜', '𝞖'], + 'J': ['Ϳ', 'Ј', 'Ꭻ', 'ᒍ', 'ᴊ', 'ꓙ', 'Ʝ', 'ꭻ', 'J', '𝐉', '𝐽', '𝑱', '𝒥', '𝓙', '𝔍', '𝕁', '𝕵', '𝖩', '𝗝', '𝘑', '𝙅', '𝙹'], + 'K': ['Κ', 'К', 'Ꮶ', 'ᛕ', 'K', 'Ⲕ', 'ꓗ', 'K', '𐔘', '𝐊', '𝐾', '𝑲', '𝒦', '𝓚', '𝔎', '𝕂', '𝕶', '𝖪', '𝗞', '𝘒', '𝙆', '𝙺', '𝚱', '𝛫', '𝜥', '𝝟', '𝞙'], + 'L': ['ʟ', 'Ꮮ', 'ᒪ', 'ℒ', 'Ⅼ', 'Ⳑ', 'ⳑ', 'ꓡ', 'ꮮ', 'L', '𐐛', '𐑃', '𐔦', '𑢣', '𑢲', '𖼖', '𝈪', '𝐋', '𝐿', '𝑳', '𝓛', '𝔏', '𝕃', '𝕷', '𝖫', '𝗟', '𝘓', '𝙇', '𝙻'], + 'M': ['Μ', 'Ϻ', 'М', 'Ꮇ', 'ᗰ', 'ᛖ', 'ℳ', 'Ⅿ', 
'Ⲙ', 'ꓟ', 'M', '𐊰', '𐌑', '𝐌', '𝑀', '𝑴', '𝓜', '𝔐', '𝕄', '𝕸', '𝖬', '𝗠', '𝘔', '𝙈', '𝙼', '𝚳', '𝛭', '𝜧', '𝝡', '𝞛'], + 'N': ['ɴ', 'Ν', 'ℕ', 'Ⲛ', 'ꓠ', 'N', '𐔓', '𝐍', '𝑁', '𝑵', '𝒩', '𝓝', '𝔑', '𝕹', '𝖭', '𝗡', '𝘕', '𝙉', '𝙽', '𝚴', '𝛮', '𝜨', '𝝢', '𝞜'], + 'P': ['Ρ', 'Р', 'Ꮲ', 'ᑭ', 'ᴘ', 'ᴩ', 'ℙ', 'Ⲣ', 'ꓑ', 'ꮲ', 'P', '𐊕', '𝐏', '𝑃', '𝑷', '𝒫', '𝓟', '𝔓', '𝕻', '𝖯', '𝗣', '𝘗', '𝙋', '𝙿', '𝚸', '𝛲', '𝜬', '𝝦', '𝞠'], + 'Q': ['ℚ', 'ⵕ', 'Q', '𝐐', '𝑄', '𝑸', '𝒬', '𝓠', '𝔔', '𝕼', '𝖰', '𝗤', '𝘘', '𝙌', '𝚀'], + 'R': ['Ʀ', 'ʀ', 'Ꭱ', 'Ꮢ', 'ᖇ', 'ᚱ', 'ℛ', 'ℜ', 'ℝ', 'ꓣ', 'ꭱ', 'ꮢ', 'R', '𐒴', '𖼵', '𝈖', '𝐑', '𝑅', '𝑹', '𝓡', '𝕽', '𝖱', '𝗥', '𝘙', '𝙍', '𝚁'], + 'S': ['Ѕ', 'Տ', 'Ꮥ', 'Ꮪ', 'ꓢ', 'S', '𐊖', '𐐠', '𖼺', '𝐒', '𝑆', '𝑺', '𝒮', '𝓢', '𝔖', '𝕊', '𝕾', '𝖲', '𝗦', '𝘚', '𝙎', '𝚂'], + 'T': ['Τ', 'τ', 'Т', 'т', 'Ꭲ', 'ᴛ', '⊤', '⟙', 'Ⲧ', 'ꓔ', 'ꭲ', 'T', '𐊗', '𐊱', '𐌕', '𑢼', '𖼊', '𝐓', '𝑇', '𝑻', '𝒯', '𝓣', '𝔗', '𝕋', '𝕿', '𝖳', '𝗧', '𝘛', '𝙏', '𝚃', '𝚻', '𝛕', '𝛵', '𝜏', '𝜯', '𝝉', '𝝩', '𝞃', '𝞣', '𝞽', '🝨'], + 'U': ['Ս', 'ሀ', 'ᑌ', '∪', '⋃', 'ꓴ', 'U', '𐓎', '𑢸', '𖽂', '𝐔', '𝑈', '𝑼', '𝒰', '𝓤', '𝔘', '𝕌', '𝖀', '𝖴', '𝗨', '𝘜', '𝙐', '𝚄'], + 'V': ['Ѵ', '٧', '۷', 'Ꮩ', 'ᐯ', 'Ⅴ', 'ⴸ', 'ꓦ', 'ꛟ', 'V', '𐔝', '𑢠', '𖼈', '𝈍', '𝐕', '𝑉', '𝑽', '𝒱', '𝓥', '𝔙', '𝕍', '𝖁', '𝖵', '𝗩', '𝘝', '𝙑', '𝚅'], + 'W': ['Ԝ', 'Ꮃ', 'Ꮤ', 'ꓪ', 'W', '𑣦', '𑣯', '𝐖', '𝑊', '𝑾', '𝒲', '𝓦', '𝔚', '𝕎', '𝖂', '𝖶', '𝗪', '𝘞', '𝙒', '𝚆'], + 'X': ['Χ', 'Х', '᙭', 'ᚷ', 'Ⅹ', '╳', 'Ⲭ', 'ⵝ', 'ꓫ', 'Ꭓ', 'X', '𐊐', '𐊴', '𐌗', '𐌢', '𐔧', '𑣬', '𝐗', '𝑋', '𝑿', '𝒳', '𝓧', '𝔛', '𝕏', '𝖃', '𝖷', '𝗫', '𝘟', '𝙓', '𝚇', '𝚾', '𝛸', '𝜲', '𝝬', '𝞦'], + 'Y': ['Υ', 'ϒ', 'У', 'Ү', 'Ꭹ', 'Ꮍ', 'Ⲩ', 'ꓬ', 'Y', '𐊲', '𑢤', '𖽃', '𝐘', '𝑌', '𝒀', '𝒴', '𝓨', '𝔜', '𝕐', '𝖄', '𝖸', '𝗬', '𝘠', '𝙔', '𝚈', '𝚼', '𝛶', '𝜰', '𝝪', '𝞤'], + 'Z': ['Ζ', 'Ꮓ', 'ℤ', 'ℨ', 'ꓜ', 'Z', '𐋵', '𑢩', '𑣥', '𝐙', '𝑍', '𝒁', '𝒵', '𝓩', '𝖅', '𝖹', '𝗭', '𝘡', '𝙕', '𝚉', '𝚭', '𝛧', '𝜡', '𝝛', '𝞕'], + '\\': ['∖', '⟍', '⧵', '⧹', '⼂', '㇔', '丶', '﹨', '\', '𝈏', '𝈻'], + '^': ['˄', 'ˆ'], + '_': ['ߺ', '﹍', '﹎', '﹏', '_'], + 'a': ['ɑ', 'α', 'а', '⍺', 'a', '𝐚', '𝑎', '𝒂', '𝒶', '𝓪', '𝔞', '𝕒', '𝖆', '𝖺', '𝗮', '𝘢', '𝙖', '𝚊', '𝛂', '𝛼', '𝜶', '𝝰', '𝞪'], + 'b': ['Ƅ', 'Ь', 'Ꮟ', 'ᑲ', 'ᖯ', 'b', '𝐛', '𝑏', '𝒃', '𝒷', '𝓫', '𝔟', '𝕓', '𝖇', '𝖻', '𝗯', '𝘣', '𝙗', '𝚋'], + 'c': ['ϲ', 'с', 'ᴄ', 'ⅽ', 'ⲥ', 'ꮯ', 'c', '𐐽', '𝐜', '𝑐', '𝒄', '𝒸', '𝓬', '𝔠', '𝕔', '𝖈', '𝖼', '𝗰', '𝘤', '𝙘', '𝚌'], + 'd': ['ԁ', 'Ꮷ', 'ᑯ', 'ⅆ', 'ⅾ', 'ꓒ', 'd', '𝐝', '𝑑', '𝒅', '𝒹', '𝓭', '𝔡', '𝕕', '𝖉', '𝖽', '𝗱', '𝘥', '𝙙', '𝚍'], + 'e': ['е', 'ҽ', '℮', 'ℯ', 'ⅇ', 'ꬲ', 'e', '𝐞', '𝑒', '𝒆', '𝓮', '𝔢', '𝕖', '𝖊', '𝖾', '𝗲', '𝘦', '𝙚', '𝚎'], + 'f': ['ſ', 'ϝ', 'ք', 'ẝ', 'ꞙ', 'ꬵ', 'f', '𝐟', '𝑓', '𝒇', '𝒻', '𝓯', '𝔣', '𝕗', '𝖋', '𝖿', '𝗳', '𝘧', '𝙛', '𝚏', '𝟋'], + 'g': ['ƍ', 'ɡ', 'ց', 'ᶃ', 'ℊ', 'g', '𝐠', '𝑔', '𝒈', '𝓰', '𝔤', '𝕘', '𝖌', '𝗀', '𝗴', '𝘨', '𝙜', '𝚐'], + 'h': ['һ', 'հ', 'Ꮒ', 'ℎ', 'h', '𝐡', '𝒉', '𝒽', '𝓱', '𝔥', '𝕙', '𝖍', '𝗁', '𝗵', '𝘩', '𝙝', '𝚑'], + 'i': ['ı', 'ɩ', 'ɪ', '˛', 'ͺ', 'ι', 'і', 'ӏ', 'Ꭵ', 'ι', 'ℹ', 'ⅈ', 'ⅰ', '⍳', 'ꙇ', 'ꭵ', 'i', '𑣃', '𝐢', '𝑖', '𝒊', '𝒾', '𝓲', '𝔦', '𝕚', '𝖎', '𝗂', '𝗶', '𝘪', '𝙞', '𝚒', '𝚤', '𝛊', '𝜄', '𝜾', '𝝸', '𝞲'], + 'j': ['ϳ', 'ј', 'ⅉ', 'j', '𝐣', '𝑗', '𝒋', '𝒿', '𝓳', '𝔧', '𝕛', '𝖏', '𝗃', '𝗷', '𝘫', '𝙟', '𝚓'], + 'k': ['k', '𝐤', '𝑘', '𝒌', '𝓀', '𝓴', '𝔨', '𝕜', '𝖐', '𝗄', '𝗸', '𝘬', '𝙠', '𝚔'], + 'l': ['Ɩ', 'ǀ', 'Ι', 'І', 'Ӏ', '׀', 'ו', 'ן', 'ا', '١', '۱', 'ߊ', 'ᛁ', 'ℐ', 'ℑ', 'ℓ', 'Ⅰ', 'ⅼ', '∣', '⏽', 'Ⲓ', 'ⵏ', 'ꓲ', 'ﺍ', 'ﺎ', '1', 'I', 'l', '│', '𐊊', '𐌉', '𐌠', '𖼨', '𝐈', '𝐥', '𝐼', '𝑙', '𝑰', '𝒍', '𝓁', '𝓘', '𝓵', '𝔩', '𝕀', '𝕝', '𝕴', '𝖑', '𝖨', '𝗅', '𝗜', '𝗹', '𝘐', '𝘭', '𝙄', '𝙡', '𝙸', '𝚕', '𝚰', '𝛪', '𝜤', '𝝞', '𝞘', '𝟏', '𝟙', '𝟣', '𝟭', '𝟷', '𞣇', '𞸀', '𞺀', '\U0001fbf1'], + 'm': ['m'], + 'n': ['ո', 
'ռ', 'n', '𝐧', '𝑛', '𝒏', '𝓃', '𝓷', '𝔫', '𝕟', '𝖓', '𝗇', '𝗻', '𝘯', '𝙣', '𝚗'], + 'o': ['Ο', 'ο', 'σ', 'О', 'о', 'Օ', 'օ', 'ס', 'ه', '٥', 'ھ', 'ہ', 'ە', '۵', '߀', '०', '০', '੦', '૦', 'ଠ', '୦', '௦', 'ం', '౦', 'ಂ', '೦', 'ം', 'ഠ', '൦', 'ං', '๐', '໐', 'ဝ', '၀', 'ჿ', 'ዐ', 'ᴏ', 'ᴑ', 'ℴ', 'Ⲟ', 'ⲟ', 'ⵔ', '〇', 'ꓳ', 'ꬽ', 'ﮦ', 'ﮧ', 'ﮨ', 'ﮩ', 'ﮪ', 'ﮫ', 'ﮬ', 'ﮭ', 'ﻩ', 'ﻪ', 'ﻫ', 'ﻬ', '0', 'O', 'o', '𐊒', '𐊫', '𐐄', '𐐬', '𐓂', '𐓪', '𐔖', '𑓐', '𑢵', '𑣈', '𑣗', '𑣠', '𝐎', '𝐨', '𝑂', '𝑜', '𝑶', '𝒐', '𝒪', '𝓞', '𝓸', '𝔒', '𝔬', '𝕆', '𝕠', '𝕺', '𝖔', '𝖮', '𝗈', '𝗢', '𝗼', '𝘖', '𝘰', '𝙊', '𝙤', '𝙾', '𝚘', '𝚶', '𝛐', '𝛔', '𝛰', '𝜊', '𝜎', '𝜪', '𝝄', '𝝈', '𝝤', '𝝾', '𝞂', '𝞞', '𝞸', '𝞼', '𝟎', '𝟘', '𝟢', '𝟬', '𝟶', '𞸤', '𞹤', '𞺄', '\U0001fbf0'], + 'p': ['ρ', 'ϱ', 'р', '⍴', 'ⲣ', 'p', '𝐩', '𝑝', '𝒑', '𝓅', '𝓹', '𝔭', '𝕡', '𝖕', '𝗉', '𝗽', '𝘱', '𝙥', '𝚙', '𝛒', '𝛠', '𝜌', '𝜚', '𝝆', '𝝔', '𝞀', '𝞎', '𝞺', '𝟈'], + 'q': ['ԛ', 'գ', 'զ', 'q', '𝐪', '𝑞', '𝒒', '𝓆', '𝓺', '𝔮', '𝕢', '𝖖', '𝗊', '𝗾', '𝘲', '𝙦', '𝚚'], + 'r': ['г', 'ᴦ', 'ⲅ', 'ꭇ', 'ꭈ', 'ꮁ', 'r', '𝐫', '𝑟', '𝒓', '𝓇', '𝓻', '𝔯', '𝕣', '𝖗', '𝗋', '𝗿', '𝘳', '𝙧', '𝚛'], + 's': ['ƽ', 'ѕ', 'ꜱ', 'ꮪ', 's', '𐑈', '𑣁', '𝐬', '𝑠', '𝒔', '𝓈', '𝓼', '𝔰', '𝕤', '𝖘', '𝗌', '𝘀', '𝘴', '𝙨', '𝚜'], + 't': ['t', '𝐭', '𝑡', '𝒕', '𝓉', '𝓽', '𝔱', '𝕥', '𝖙', '𝗍', '𝘁', '𝘵', '𝙩', '𝚝'], + 'u': ['ʋ', 'υ', 'ս', 'ᴜ', 'ꞟ', 'ꭎ', 'ꭒ', 'u', '𐓶', '𑣘', '𝐮', '𝑢', '𝒖', '𝓊', '𝓾', '𝔲', '𝕦', '𝖚', '𝗎', '𝘂', '𝘶', '𝙪', '𝚞', '𝛖', '𝜐', '𝝊', '𝞄', '𝞾'], + 'v': ['ν', 'ѵ', 'ט', 'ᴠ', 'ⅴ', '∨', '⋁', 'ꮩ', 'v', '𑜆', '𑣀', '𝐯', '𝑣', '𝒗', '𝓋', '𝓿', '𝔳', '𝕧', '𝖛', '𝗏', '𝘃', '𝘷', '𝙫', '𝚟', '𝛎', '𝜈', '𝝂', '𝝼', '𝞶'], + 'w': ['ɯ', 'ѡ', 'ԝ', 'ա', 'ᴡ', 'ꮃ', 'w', '𑜊', '𑜎', '𑜏', '𝐰', '𝑤', '𝒘', '𝓌', '𝔀', '𝔴', '𝕨', '𝖜', '𝗐', '𝘄', '𝘸', '𝙬', '𝚠'], + 'x': ['×', 'х', 'ᕁ', 'ᕽ', '᙮', 'ⅹ', '⤫', '⤬', '⨯', 'x', '𝐱', '𝑥', '𝒙', '𝓍', '𝔁', '𝔵', '𝕩', '𝖝', '𝗑', '𝘅', '𝘹', '𝙭', '𝚡'], + 'y': ['ɣ', 'ʏ', 'γ', 'у', 'ү', 'ყ', 'ᶌ', 'ỿ', 'ℽ', 'ꭚ', 'y', '𑣜', '𝐲', '𝑦', '𝒚', '𝓎', '𝔂', '𝔶', '𝕪', '𝖞', '𝗒', '𝘆', '𝘺', '𝙮', '𝚢', '𝛄', '𝛾', '𝜸', '𝝲', '𝞬'], + 'z': ['ᴢ', 'ꮓ', 'z', '𑣄', '𝐳', '𝑧', '𝒛', '𝓏', '𝔃', '𝔷', '𝕫', '𝖟', '𝗓', '𝘇', '𝘻', '𝙯', '𝚣'], + '{': ['❴', '{', '𝄔'], + '}': ['❵', '}'], + '~': ['˜', '῀', '⁓', '∼'], +} diff --git a/common/text/unidecoder/replacements.py b/common/text/unidecoder/replacements.py new file mode 100644 index 0000000000000000000000000000000000000000..f01825cc6e4f02771343b1ccec09ee918dac6c01 --- /dev/null +++ b/common/text/unidecoder/replacements.py @@ -0,0 +1,2085 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
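+# replacements: ordered [character, ASCII approximation] pairs from which unidecoder() builds its lookup table.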
+ +# MIT License +# +# Copyright (c) Sindre Sorhus (https://sindresorhus.com) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# +# Based on: +# https://github.com/sindresorhus/transliterate/blob/main/replacements.js +# + +replacements = [ + # German umlauts + ['ß', 'ss'], + ['ẞ', 'Ss'], + ['ä', 'ae'], + ['Ä', 'Ae'], + ['ö', 'oe'], + ['Ö', 'Oe'], + ['ü', 'ue'], + ['Ü', 'Ue'], + + # Latin + ['À', 'A'], + ['Á', 'A'], + ['Â', 'A'], + ['Ã', 'A'], + ['Ä', 'Ae'], + ['Å', 'A'], + ['Æ', 'AE'], + ['Ç', 'C'], + ['È', 'E'], + ['É', 'E'], + ['Ê', 'E'], + ['Ë', 'E'], + ['Ì', 'I'], + ['Í', 'I'], + ['Î', 'I'], + ['Ï', 'I'], + ['Ð', 'D'], + ['Ñ', 'N'], + ['Ò', 'O'], + ['Ó', 'O'], + ['Ô', 'O'], + ['Õ', 'O'], + ['Ö', 'Oe'], + ['Ő', 'O'], + ['Ø', 'O'], + ['Ù', 'U'], + ['Ú', 'U'], + ['Û', 'U'], + ['Ü', 'Ue'], + ['Ű', 'U'], + ['Ý', 'Y'], + ['Þ', 'TH'], + ['ß', 'ss'], + ['à', 'a'], + ['á', 'a'], + ['â', 'a'], + ['ã', 'a'], + ['ä', 'ae'], + ['å', 'a'], + ['æ', 'ae'], + ['ç', 'c'], + ['è', 'e'], + ['é', 'e'], + ['ê', 'e'], + ['ë', 'e'], + ['ì', 'i'], + ['í', 'i'], + ['î', 'i'], + ['ï', 'i'], + ['ð', 'd'], + ['ñ', 'n'], + ['ò', 'o'], + ['ó', 'o'], + ['ô', 'o'], + ['õ', 'o'], + ['ö', 'oe'], + ['ő', 'o'], + ['ø', 'o'], + ['ù', 'u'], + ['ú', 'u'], + ['û', 'u'], + ['ü', 'ue'], + ['ű', 'u'], + ['ý', 'y'], + ['þ', 'th'], + ['ÿ', 'y'], + ['ẞ', 'SS'], + + # Vietnamese + ['à', 'a'], + ['À', 'A'], + ['á', 'a'], + ['Á', 'A'], + ['â', 'a'], + ['Â', 'A'], + ['ã', 'a'], + ['Ã', 'A'], + ['è', 'e'], + ['È', 'E'], + ['é', 'e'], + ['É', 'E'], + ['ê', 'e'], + ['Ê', 'E'], + ['ì', 'i'], + ['Ì', 'I'], + ['í', 'i'], + ['Í', 'I'], + ['ò', 'o'], + ['Ò', 'O'], + ['ó', 'o'], + ['Ó', 'O'], + ['ô', 'o'], + ['Ô', 'O'], + ['õ', 'o'], + ['Õ', 'O'], + ['ù', 'u'], + ['Ù', 'U'], + ['ú', 'u'], + ['Ú', 'U'], + ['ý', 'y'], + ['Ý', 'Y'], + ['ă', 'a'], + ['Ă', 'A'], + ['Đ', 'D'], + ['đ', 'd'], + ['ĩ', 'i'], + ['Ĩ', 'I'], + ['ũ', 'u'], + ['Ũ', 'U'], + ['ơ', 'o'], + ['Ơ', 'O'], + ['ư', 'u'], + ['Ư', 'U'], + ['ạ', 'a'], + ['Ạ', 'A'], + ['ả', 'a'], + ['Ả', 'A'], + ['ấ', 'a'], + ['Ấ', 'A'], + ['ầ', 'a'], + ['Ầ', 'A'], + ['ẩ', 'a'], + ['Ẩ', 'A'], + ['ẫ', 'a'], + ['Ẫ', 'A'], + ['ậ', 'a'], + ['Ậ', 'A'], + ['ắ', 'a'], + ['Ắ', 'A'], + ['ằ', 'a'], + ['Ằ', 'A'], + ['ẳ', 'a'], + ['Ẳ', 'A'], + ['ẵ', 'a'], + ['Ẵ', 'A'], + ['ặ', 'a'], + ['Ặ', 'A'], + ['ẹ', 'e'], + ['Ẹ', 'E'], + ['ẻ', 'e'], + ['Ẻ', 'E'], + ['ẽ', 'e'], + ['Ẽ', 'E'], + ['ế', 'e'], + ['Ế', 'E'], + ['ề', 'e'], + ['Ề', 'E'], + ['ể', 'e'], + ['Ể', 'E'], + ['ễ', 'e'], + ['Ễ', 'E'], + ['ệ', 'e'], + 
['Ệ', 'E'], + ['ỉ', 'i'], + ['Ỉ', 'I'], + ['ị', 'i'], + ['Ị', 'I'], + ['ọ', 'o'], + ['Ọ', 'O'], + ['ỏ', 'o'], + ['Ỏ', 'O'], + ['ố', 'o'], + ['Ố', 'O'], + ['ồ', 'o'], + ['Ồ', 'O'], + ['ổ', 'o'], + ['Ổ', 'O'], + ['ỗ', 'o'], + ['Ỗ', 'O'], + ['ộ', 'o'], + ['Ộ', 'O'], + ['ớ', 'o'], + ['Ớ', 'O'], + ['ờ', 'o'], + ['Ờ', 'O'], + ['ở', 'o'], + ['Ở', 'O'], + ['ỡ', 'o'], + ['Ỡ', 'O'], + ['ợ', 'o'], + ['Ợ', 'O'], + ['ụ', 'u'], + ['Ụ', 'U'], + ['ủ', 'u'], + ['Ủ', 'U'], + ['ứ', 'u'], + ['Ứ', 'U'], + ['ừ', 'u'], + ['Ừ', 'U'], + ['ử', 'u'], + ['Ử', 'U'], + ['ữ', 'u'], + ['Ữ', 'U'], + ['ự', 'u'], + ['Ự', 'U'], + ['ỳ', 'y'], + ['Ỳ', 'Y'], + ['ỵ', 'y'], + ['Ỵ', 'Y'], + ['ỷ', 'y'], + ['Ỷ', 'Y'], + ['ỹ', 'y'], + ['Ỹ', 'Y'], + + # Arabic + ['ء', 'e'], + ['آ', 'a'], + ['أ', 'a'], + ['ؤ', 'w'], + ['إ', 'i'], + ['ئ', 'y'], + ['ا', 'a'], + ['ب', 'b'], + ['ة', 't'], + ['ت', 't'], + ['ث', 'th'], + ['ج', 'j'], + ['ح', 'h'], + ['خ', 'kh'], + ['د', 'd'], + ['ذ', 'dh'], + ['ر', 'r'], + ['ز', 'z'], + ['س', 's'], + ['ش', 'sh'], + ['ص', 's'], + ['ض', 'd'], + ['ط', 't'], + ['ظ', 'z'], + ['ع', 'e'], + ['غ', 'gh'], + ['ـ', '_'], + ['ف', 'f'], + ['ق', 'q'], + ['ك', 'k'], + ['ل', 'l'], + ['م', 'm'], + ['ن', 'n'], + ['ه', 'h'], + ['و', 'w'], + ['ى', 'a'], + ['ي', 'y'], + ['َ‎', 'a'], + ['ُ', 'u'], + ['ِ‎', 'i'], + ['٠', '0'], + ['١', '1'], + ['٢', '2'], + ['٣', '3'], + ['٤', '4'], + ['٥', '5'], + ['٦', '6'], + ['٧', '7'], + ['٨', '8'], + ['٩', '9'], + + # Persian / Farsi + ['چ', 'ch'], + ['ک', 'k'], + ['گ', 'g'], + ['پ', 'p'], + ['ژ', 'zh'], + ['ی', 'y'], + ['۰', '0'], + ['۱', '1'], + ['۲', '2'], + ['۳', '3'], + ['۴', '4'], + ['۵', '5'], + ['۶', '6'], + ['۷', '7'], + ['۸', '8'], + ['۹', '9'], + + # Pashto + ['ټ', 'p'], + ['ځ', 'z'], + ['څ', 'c'], + ['ډ', 'd'], + ['ﺫ', 'd'], + ['ﺭ', 'r'], + ['ړ', 'r'], + ['ﺯ', 'z'], + ['ږ', 'g'], + ['ښ', 'x'], + ['ګ', 'g'], + ['ڼ', 'n'], + ['ۀ', 'e'], + ['ې', 'e'], + ['ۍ', 'ai'], + + # Urdu + ['ٹ', 't'], + ['ڈ', 'd'], + ['ڑ', 'r'], + ['ں', 'n'], + ['ہ', 'h'], + ['ھ', 'h'], + ['ے', 'e'], + + # Russian + ['А', 'A'], + ['а', 'a'], + ['Б', 'B'], + ['б', 'b'], + ['В', 'V'], + ['в', 'v'], + ['Г', 'G'], + ['г', 'g'], + ['Д', 'D'], + ['д', 'd'], + ['ъе', 'ye'], + ['Ъе', 'Ye'], + ['ъЕ', 'yE'], + ['ЪЕ', 'YE'], + ['Е', 'E'], + ['е', 'e'], + ['Ё', 'Yo'], + ['ё', 'yo'], + ['Ж', 'Zh'], + ['ж', 'zh'], + ['З', 'Z'], + ['з', 'z'], + ['И', 'I'], + ['и', 'i'], + ['ый', 'iy'], + ['Ый', 'Iy'], + ['ЫЙ', 'IY'], + ['ыЙ', 'iY'], + ['Й', 'Y'], + ['й', 'y'], + ['К', 'K'], + ['к', 'k'], + ['Л', 'L'], + ['л', 'l'], + ['М', 'M'], + ['м', 'm'], + ['Н', 'N'], + ['н', 'n'], + ['О', 'O'], + ['о', 'o'], + ['П', 'P'], + ['п', 'p'], + ['Р', 'R'], + ['р', 'r'], + ['С', 'S'], + ['с', 's'], + ['Т', 'T'], + ['т', 't'], + ['У', 'U'], + ['у', 'u'], + ['Ф', 'F'], + ['ф', 'f'], + ['Х', 'Kh'], + ['х', 'kh'], + ['Ц', 'Ts'], + ['ц', 'ts'], + ['Ч', 'Ch'], + ['ч', 'ch'], + ['Ш', 'Sh'], + ['ш', 'sh'], + ['Щ', 'Sch'], + ['щ', 'sch'], + ['Ъ', ''], + ['ъ', ''], + ['Ы', 'Y'], + ['ы', 'y'], + ['Ь', ''], + ['ь', ''], + ['Э', 'E'], + ['э', 'e'], + ['Ю', 'Yu'], + ['ю', 'yu'], + ['Я', 'Ya'], + ['я', 'ya'], + + # Romanian + ['ă', 'a'], + ['Ă', 'A'], + ['ș', 's'], + ['Ș', 'S'], + ['ț', 't'], + ['Ț', 'T'], + ['ţ', 't'], + ['Ţ', 'T'], + + # Turkish + ['ş', 's'], + ['Ş', 'S'], + ['ç', 'c'], + ['Ç', 'C'], + ['ğ', 'g'], + ['Ğ', 'G'], + ['ı', 'i'], + ['İ', 'I'], + + # Armenian + ['ա', 'a'], + ['Ա', 'A'], + ['բ', 'b'], + ['Բ', 'B'], + ['գ', 'g'], + ['Գ', 'G'], + ['դ', 'd'], + ['Դ', 'D'], + ['ե', 'ye'], + ['Ե', 'Ye'], + ['զ', 'z'], + ['Զ', 'Z'], + ['է', 'e'], + ['Է', 
'E'], + ['ը', 'y'], + ['Ը', 'Y'], + ['թ', 't'], + ['Թ', 'T'], + ['ժ', 'zh'], + ['Ժ', 'Zh'], + ['ի', 'i'], + ['Ի', 'I'], + ['լ', 'l'], + ['Լ', 'L'], + ['խ', 'kh'], + ['Խ', 'Kh'], + ['ծ', 'ts'], + ['Ծ', 'Ts'], + ['կ', 'k'], + ['Կ', 'K'], + ['հ', 'h'], + ['Հ', 'H'], + ['ձ', 'dz'], + ['Ձ', 'Dz'], + ['ղ', 'gh'], + ['Ղ', 'Gh'], + ['ճ', 'tch'], + ['Ճ', 'Tch'], + ['մ', 'm'], + ['Մ', 'M'], + ['յ', 'y'], + ['Յ', 'Y'], + ['ն', 'n'], + ['Ն', 'N'], + ['շ', 'sh'], + ['Շ', 'Sh'], + ['ո', 'vo'], + ['Ո', 'Vo'], + ['չ', 'ch'], + ['Չ', 'Ch'], + ['պ', 'p'], + ['Պ', 'P'], + ['ջ', 'j'], + ['Ջ', 'J'], + ['ռ', 'r'], + ['Ռ', 'R'], + ['ս', 's'], + ['Ս', 'S'], + ['վ', 'v'], + ['Վ', 'V'], + ['տ', 't'], + ['Տ', 'T'], + ['ր', 'r'], + ['Ր', 'R'], + ['ց', 'c'], + ['Ց', 'C'], + ['ու', 'u'], + ['ՈՒ', 'U'], + ['Ու', 'U'], + ['փ', 'p'], + ['Փ', 'P'], + ['ք', 'q'], + ['Ք', 'Q'], + ['օ', 'o'], + ['Օ', 'O'], + ['ֆ', 'f'], + ['Ֆ', 'F'], + ['և', 'yev'], + + # Georgian + ['ა', 'a'], + ['ბ', 'b'], + ['გ', 'g'], + ['დ', 'd'], + ['ე', 'e'], + ['ვ', 'v'], + ['ზ', 'z'], + ['თ', 't'], + ['ი', 'i'], + ['კ', 'k'], + ['ლ', 'l'], + ['მ', 'm'], + ['ნ', 'n'], + ['ო', 'o'], + ['პ', 'p'], + ['ჟ', 'zh'], + ['რ', 'r'], + ['ს', 's'], + ['ტ', 't'], + ['უ', 'u'], + ['ფ', 'ph'], + ['ქ', 'q'], + ['ღ', 'gh'], + ['ყ', 'k'], + ['შ', 'sh'], + ['ჩ', 'ch'], + ['ც', 'ts'], + ['ძ', 'dz'], + ['წ', 'ts'], + ['ჭ', 'tch'], + ['ხ', 'kh'], + ['ჯ', 'j'], + ['ჰ', 'h'], + + # Czech + ['č', 'c'], + ['ď', 'd'], + ['ě', 'e'], + ['ň', 'n'], + ['ř', 'r'], + ['š', 's'], + ['ť', 't'], + ['ů', 'u'], + ['ž', 'z'], + ['Č', 'C'], + ['Ď', 'D'], + ['Ě', 'E'], + ['Ň', 'N'], + ['Ř', 'R'], + ['Š', 'S'], + ['Ť', 'T'], + ['Ů', 'U'], + ['Ž', 'Z'], + + # Dhivehi + ['ހ', 'h'], + ['ށ', 'sh'], + ['ނ', 'n'], + ['ރ', 'r'], + ['ބ', 'b'], + ['ޅ', 'lh'], + ['ކ', 'k'], + ['އ', 'a'], + ['ވ', 'v'], + ['މ', 'm'], + ['ފ', 'f'], + ['ދ', 'dh'], + ['ތ', 'th'], + ['ލ', 'l'], + ['ގ', 'g'], + ['ޏ', 'gn'], + ['ސ', 's'], + ['ޑ', 'd'], + ['ޒ', 'z'], + ['ޓ', 't'], + ['ޔ', 'y'], + ['ޕ', 'p'], + ['ޖ', 'j'], + ['ޗ', 'ch'], + ['ޘ', 'tt'], + ['ޙ', 'hh'], + ['ޚ', 'kh'], + ['ޛ', 'th'], + ['ޜ', 'z'], + ['ޝ', 'sh'], + ['ޞ', 's'], + ['ޟ', 'd'], + ['ޠ', 't'], + ['ޡ', 'z'], + ['ޢ', 'a'], + ['ޣ', 'gh'], + ['ޤ', 'q'], + ['ޥ', 'w'], + ['ަ', 'a'], + ['ާ', 'aa'], + ['ި', 'i'], + ['ީ', 'ee'], + ['ު', 'u'], + ['ޫ', 'oo'], + ['ެ', 'e'], + ['ޭ', 'ey'], + ['ޮ', 'o'], + ['ޯ', 'oa'], + ['ް', ''], + + # Greek + ['α', 'a'], + ['β', 'v'], + ['γ', 'g'], + ['δ', 'd'], + ['ε', 'e'], + ['ζ', 'z'], + ['η', 'i'], + ['θ', 'th'], + ['ι', 'i'], + ['κ', 'k'], + ['λ', 'l'], + ['μ', 'm'], + ['ν', 'n'], + ['ξ', 'ks'], + ['ο', 'o'], + ['π', 'p'], + ['ρ', 'r'], + ['σ', 's'], + ['τ', 't'], + ['υ', 'y'], + ['φ', 'f'], + ['χ', 'x'], + ['ψ', 'ps'], + ['ω', 'o'], + ['ά', 'a'], + ['έ', 'e'], + ['ί', 'i'], + ['ό', 'o'], + ['ύ', 'y'], + ['ή', 'i'], + ['ώ', 'o'], + ['ς', 's'], + ['ϊ', 'i'], + ['ΰ', 'y'], + ['ϋ', 'y'], + ['ΐ', 'i'], + ['Α', 'A'], + ['Β', 'B'], + ['Γ', 'G'], + ['Δ', 'D'], + ['Ε', 'E'], + ['Ζ', 'Z'], + ['Η', 'I'], + ['Θ', 'TH'], + ['Ι', 'I'], + ['Κ', 'K'], + ['Λ', 'L'], + ['Μ', 'M'], + ['Ν', 'N'], + ['Ξ', 'KS'], + ['Ο', 'O'], + ['Π', 'P'], + ['Ρ', 'R'], + ['Σ', 'S'], + ['Τ', 'T'], + ['Υ', 'Y'], + ['Φ', 'F'], + ['Χ', 'X'], + ['Ψ', 'PS'], + ['Ω', 'O'], + ['Ά', 'A'], + ['Έ', 'E'], + ['Ί', 'I'], + ['Ό', 'O'], + ['Ύ', 'Y'], + ['Ή', 'I'], + ['Ώ', 'O'], + ['Ϊ', 'I'], + ['Ϋ', 'Y'], + + # Disabled as it conflicts with German and Latin. 
+ # Hungarian + # ['ä', 'a'], + # ['Ä', 'A'], + # ['ö', 'o'], + # ['Ö', 'O'], + # ['ü', 'u'], + # ['Ü', 'U'], + # ['ű', 'u'], + # ['Ű', 'U'], + + # Latvian + ['ā', 'a'], + ['ē', 'e'], + ['ģ', 'g'], + ['ī', 'i'], + ['ķ', 'k'], + ['ļ', 'l'], + ['ņ', 'n'], + ['ū', 'u'], + ['Ā', 'A'], + ['Ē', 'E'], + ['Ģ', 'G'], + ['Ī', 'I'], + ['Ķ', 'K'], + ['Ļ', 'L'], + ['Ņ', 'N'], + ['Ū', 'U'], + ['č', 'c'], + ['š', 's'], + ['ž', 'z'], + ['Č', 'C'], + ['Š', 'S'], + ['Ž', 'Z'], + + # Lithuanian + ['ą', 'a'], + ['č', 'c'], + ['ę', 'e'], + ['ė', 'e'], + ['į', 'i'], + ['š', 's'], + ['ų', 'u'], + ['ū', 'u'], + ['ž', 'z'], + ['Ą', 'A'], + ['Č', 'C'], + ['Ę', 'E'], + ['Ė', 'E'], + ['Į', 'I'], + ['Š', 'S'], + ['Ų', 'U'], + ['Ū', 'U'], + + # Macedonian + ['Ќ', 'Kj'], + ['ќ', 'kj'], + ['Љ', 'Lj'], + ['љ', 'lj'], + ['Њ', 'Nj'], + ['њ', 'nj'], + ['Тс', 'Ts'], + ['тс', 'ts'], + + # Polish + ['ą', 'a'], + ['ć', 'c'], + ['ę', 'e'], + ['ł', 'l'], + ['ń', 'n'], + ['ś', 's'], + ['ź', 'z'], + ['ż', 'z'], + ['Ą', 'A'], + ['Ć', 'C'], + ['Ę', 'E'], + ['Ł', 'L'], + ['Ń', 'N'], + ['Ś', 'S'], + ['Ź', 'Z'], + ['Ż', 'Z'], + + # Disabled as it conflicts with Vietnamese. + # Serbian + # ['љ', 'lj'], + # ['њ', 'nj'], + # ['Љ', 'Lj'], + # ['Њ', 'Nj'], + # ['đ', 'dj'], + # ['Đ', 'Dj'], + # ['ђ', 'dj'], + # ['ј', 'j'], + # ['ћ', 'c'], + # ['џ', 'dz'], + # ['Ђ', 'Dj'], + # ['Ј', 'j'], + # ['Ћ', 'C'], + # ['Џ', 'Dz'], + + # Disabled as it conflicts with German and Latin. + # Slovak + # ['ä', 'a'], + # ['Ä', 'A'], + # ['ľ', 'l'], + # ['ĺ', 'l'], + # ['ŕ', 'r'], + # ['Ľ', 'L'], + # ['Ĺ', 'L'], + # ['Ŕ', 'R'], + + # Disabled as it conflicts with German and Latin. + # Swedish + # ['å', 'o'], + # ['Å', 'o'], + # ['ä', 'a'], + # ['Ä', 'A'], + # ['ë', 'e'], + # ['Ë', 'E'], + # ['ö', 'o'], + # ['Ö', 'O'], + + # Ukrainian + ['Є', 'Ye'], + ['І', 'I'], + ['Ї', 'Yi'], + ['Ґ', 'G'], + ['є', 'ye'], + ['і', 'i'], + ['ї', 'yi'], + ['ґ', 'g'], + + # Dutch + ['IJ', 'IJ'], + ['ij', 'ij'], + + # Danish + # ['Æ', 'Ae'], + # ['Ø', 'Oe'], + # ['Å', 'Aa'], + # ['æ', 'ae'], + # ['ø', 'oe'], + # ['å', 'aa'] + + # Currencies + ['¢', 'c'], + ['¥', 'Y'], + ['߿', 'b'], + ['৳', 't'], + ['૱', 'Bo'], + ['฿', 'B'], + ['₠', 'CE'], + ['₡', 'C'], + ['₢', 'Cr'], + ['₣', 'F'], + ['₥', 'm'], + ['₦', 'N'], + ['₧', 'Pt'], + ['₨', 'Rs'], + ['₩', 'W'], + ['₫', 's'], + ['€', 'E'], + ['₭', 'K'], + ['₮', 'T'], + ['₯', 'Dp'], + ['₰', 'S'], + ['₱', 'P'], + ['₲', 'G'], + ['₳', 'A'], + ['₴', 'S'], + ['₵', 'C'], + ['₶', 'tt'], + ['₷', 'S'], + ['₸', 'T'], + ['₹', 'R'], + ['₺', 'L'], + ['₽', 'P'], + ['₿', 'B'], + ['﹩', '$'], + ['¢', 'c'], + ['¥', 'Y'], + ['₩', 'W'], + + # Latin + ['𝐀', 'A'], + ['𝐁', 'B'], + ['𝐂', 'C'], + ['𝐃', 'D'], + ['𝐄', 'E'], + ['𝐅', 'F'], + ['𝐆', 'G'], + ['𝐇', 'H'], + ['𝐈', 'I'], + ['𝐉', 'J'], + ['𝐊', 'K'], + ['𝐋', 'L'], + ['𝐌', 'M'], + ['𝐍', 'N'], + ['𝐎', 'O'], + ['𝐏', 'P'], + ['𝐐', 'Q'], + ['𝐑', 'R'], + ['𝐒', 'S'], + ['𝐓', 'T'], + ['𝐔', 'U'], + ['𝐕', 'V'], + ['𝐖', 'W'], + ['𝐗', 'X'], + ['𝐘', 'Y'], + ['𝐙', 'Z'], + ['𝐚', 'a'], + ['𝐛', 'b'], + ['𝐜', 'c'], + ['𝐝', 'd'], + ['𝐞', 'e'], + ['𝐟', 'f'], + ['𝐠', 'g'], + ['𝐡', 'h'], + ['𝐢', 'i'], + ['𝐣', 'j'], + ['𝐤', 'k'], + ['𝐥', 'l'], + ['𝐦', 'm'], + ['𝐧', 'n'], + ['𝐨', 'o'], + ['𝐩', 'p'], + ['𝐪', 'q'], + ['𝐫', 'r'], + ['𝐬', 's'], + ['𝐭', 't'], + ['𝐮', 'u'], + ['𝐯', 'v'], + ['𝐰', 'w'], + ['𝐱', 'x'], + ['𝐲', 'y'], + ['𝐳', 'z'], + ['𝐴', 'A'], + ['𝐵', 'B'], + ['𝐶', 'C'], + ['𝐷', 'D'], + ['𝐸', 'E'], + ['𝐹', 'F'], + ['𝐺', 'G'], + ['𝐻', 'H'], + ['𝐼', 'I'], + ['𝐽', 'J'], + ['𝐾', 'K'], + ['𝐿', 'L'], + ['𝑀', 'M'], + ['𝑁', 'N'], + ['𝑂', 
'O'], + ['𝑃', 'P'], + ['𝑄', 'Q'], + ['𝑅', 'R'], + ['𝑆', 'S'], + ['𝑇', 'T'], + ['𝑈', 'U'], + ['𝑉', 'V'], + ['𝑊', 'W'], + ['𝑋', 'X'], + ['𝑌', 'Y'], + ['𝑍', 'Z'], + ['𝑎', 'a'], + ['𝑏', 'b'], + ['𝑐', 'c'], + ['𝑑', 'd'], + ['𝑒', 'e'], + ['𝑓', 'f'], + ['𝑔', 'g'], + ['𝑖', 'i'], + ['𝑗', 'j'], + ['𝑘', 'k'], + ['𝑙', 'l'], + ['𝑚', 'm'], + ['𝑛', 'n'], + ['𝑜', 'o'], + ['𝑝', 'p'], + ['𝑞', 'q'], + ['𝑟', 'r'], + ['𝑠', 's'], + ['𝑡', 't'], + ['𝑢', 'u'], + ['𝑣', 'v'], + ['𝑤', 'w'], + ['𝑥', 'x'], + ['𝑦', 'y'], + ['𝑧', 'z'], + ['𝑨', 'A'], + ['𝑩', 'B'], + ['𝑪', 'C'], + ['𝑫', 'D'], + ['𝑬', 'E'], + ['𝑭', 'F'], + ['𝑮', 'G'], + ['𝑯', 'H'], + ['𝑰', 'I'], + ['𝑱', 'J'], + ['𝑲', 'K'], + ['𝑳', 'L'], + ['𝑴', 'M'], + ['𝑵', 'N'], + ['𝑶', 'O'], + ['𝑷', 'P'], + ['𝑸', 'Q'], + ['𝑹', 'R'], + ['𝑺', 'S'], + ['𝑻', 'T'], + ['𝑼', 'U'], + ['𝑽', 'V'], + ['𝑾', 'W'], + ['𝑿', 'X'], + ['𝒀', 'Y'], + ['𝒁', 'Z'], + ['𝒂', 'a'], + ['𝒃', 'b'], + ['𝒄', 'c'], + ['𝒅', 'd'], + ['𝒆', 'e'], + ['𝒇', 'f'], + ['𝒈', 'g'], + ['𝒉', 'h'], + ['𝒊', 'i'], + ['𝒋', 'j'], + ['𝒌', 'k'], + ['𝒍', 'l'], + ['𝒎', 'm'], + ['𝒏', 'n'], + ['𝒐', 'o'], + ['𝒑', 'p'], + ['𝒒', 'q'], + ['𝒓', 'r'], + ['𝒔', 's'], + ['𝒕', 't'], + ['𝒖', 'u'], + ['𝒗', 'v'], + ['𝒘', 'w'], + ['𝒙', 'x'], + ['𝒚', 'y'], + ['𝒛', 'z'], + ['𝒜', 'A'], + ['𝒞', 'C'], + ['𝒟', 'D'], + ['𝒢', 'g'], + ['𝒥', 'J'], + ['𝒦', 'K'], + ['𝒩', 'N'], + ['𝒪', 'O'], + ['𝒫', 'P'], + ['𝒬', 'Q'], + ['𝒮', 'S'], + ['𝒯', 'T'], + ['𝒰', 'U'], + ['𝒱', 'V'], + ['𝒲', 'W'], + ['𝒳', 'X'], + ['𝒴', 'Y'], + ['𝒵', 'Z'], + ['𝒶', 'a'], + ['𝒷', 'b'], + ['𝒸', 'c'], + ['𝒹', 'd'], + ['𝒻', 'f'], + ['𝒽', 'h'], + ['𝒾', 'i'], + ['𝒿', 'j'], + ['𝓀', 'h'], + ['𝓁', 'l'], + ['𝓂', 'm'], + ['𝓃', 'n'], + ['𝓅', 'p'], + ['𝓆', 'q'], + ['𝓇', 'r'], + ['𝓈', 's'], + ['𝓉', 't'], + ['𝓊', 'u'], + ['𝓋', 'v'], + ['𝓌', 'w'], + ['𝓍', 'x'], + ['𝓎', 'y'], + ['𝓏', 'z'], + ['𝓐', 'A'], + ['𝓑', 'B'], + ['𝓒', 'C'], + ['𝓓', 'D'], + ['𝓔', 'E'], + ['𝓕', 'F'], + ['𝓖', 'G'], + ['𝓗', 'H'], + ['𝓘', 'I'], + ['𝓙', 'J'], + ['𝓚', 'K'], + ['𝓛', 'L'], + ['𝓜', 'M'], + ['𝓝', 'N'], + ['𝓞', 'O'], + ['𝓟', 'P'], + ['𝓠', 'Q'], + ['𝓡', 'R'], + ['𝓢', 'S'], + ['𝓣', 'T'], + ['𝓤', 'U'], + ['𝓥', 'V'], + ['𝓦', 'W'], + ['𝓧', 'X'], + ['𝓨', 'Y'], + ['𝓩', 'Z'], + ['𝓪', 'a'], + ['𝓫', 'b'], + ['𝓬', 'c'], + ['𝓭', 'd'], + ['𝓮', 'e'], + ['𝓯', 'f'], + ['𝓰', 'g'], + ['𝓱', 'h'], + ['𝓲', 'i'], + ['𝓳', 'j'], + ['𝓴', 'k'], + ['𝓵', 'l'], + ['𝓶', 'm'], + ['𝓷', 'n'], + ['𝓸', 'o'], + ['𝓹', 'p'], + ['𝓺', 'q'], + ['𝓻', 'r'], + ['𝓼', 's'], + ['𝓽', 't'], + ['𝓾', 'u'], + ['𝓿', 'v'], + ['𝔀', 'w'], + ['𝔁', 'x'], + ['𝔂', 'y'], + ['𝔃', 'z'], + ['𝔄', 'A'], + ['𝔅', 'B'], + ['𝔇', 'D'], + ['𝔈', 'E'], + ['𝔉', 'F'], + ['𝔊', 'G'], + ['𝔍', 'J'], + ['𝔎', 'K'], + ['𝔏', 'L'], + ['𝔐', 'M'], + ['𝔑', 'N'], + ['𝔒', 'O'], + ['𝔓', 'P'], + ['𝔔', 'Q'], + ['𝔖', 'S'], + ['𝔗', 'T'], + ['𝔘', 'U'], + ['𝔙', 'V'], + ['𝔚', 'W'], + ['𝔛', 'X'], + ['𝔜', 'Y'], + ['𝔞', 'a'], + ['𝔟', 'b'], + ['𝔠', 'c'], + ['𝔡', 'd'], + ['𝔢', 'e'], + ['𝔣', 'f'], + ['𝔤', 'g'], + ['𝔥', 'h'], + ['𝔦', 'i'], + ['𝔧', 'j'], + ['𝔨', 'k'], + ['𝔩', 'l'], + ['𝔪', 'm'], + ['𝔫', 'n'], + ['𝔬', 'o'], + ['𝔭', 'p'], + ['𝔮', 'q'], + ['𝔯', 'r'], + ['𝔰', 's'], + ['𝔱', 't'], + ['𝔲', 'u'], + ['𝔳', 'v'], + ['𝔴', 'w'], + ['𝔵', 'x'], + ['𝔶', 'y'], + ['𝔷', 'z'], + ['𝔸', 'A'], + ['𝔹', 'B'], + ['𝔻', 'D'], + ['𝔼', 'E'], + ['𝔽', 'F'], + ['𝔾', 'G'], + ['𝕀', 'I'], + ['𝕁', 'J'], + ['𝕂', 'K'], + ['𝕃', 'L'], + ['𝕄', 'M'], + ['𝕆', 'N'], + ['𝕊', 'S'], + ['𝕋', 'T'], + ['𝕌', 'U'], + ['𝕍', 'V'], + ['𝕎', 'W'], + ['𝕏', 'X'], + ['𝕐', 'Y'], + ['𝕒', 'a'], + ['𝕓', 'b'], + ['𝕔', 'c'], + ['𝕕', 'd'], + ['𝕖', 'e'], + ['𝕗', 'f'], + 
['𝕘', 'g'], + ['𝕙', 'h'], + ['𝕚', 'i'], + ['𝕛', 'j'], + ['𝕜', 'k'], + ['𝕝', 'l'], + ['𝕞', 'm'], + ['𝕟', 'n'], + ['𝕠', 'o'], + ['𝕡', 'p'], + ['𝕢', 'q'], + ['𝕣', 'r'], + ['𝕤', 's'], + ['𝕥', 't'], + ['𝕦', 'u'], + ['𝕧', 'v'], + ['𝕨', 'w'], + ['𝕩', 'x'], + ['𝕪', 'y'], + ['𝕫', 'z'], + ['𝕬', 'A'], + ['𝕭', 'B'], + ['𝕮', 'C'], + ['𝕯', 'D'], + ['𝕰', 'E'], + ['𝕱', 'F'], + ['𝕲', 'G'], + ['𝕳', 'H'], + ['𝕴', 'I'], + ['𝕵', 'J'], + ['𝕶', 'K'], + ['𝕷', 'L'], + ['𝕸', 'M'], + ['𝕹', 'N'], + ['𝕺', 'O'], + ['𝕻', 'P'], + ['𝕼', 'Q'], + ['𝕽', 'R'], + ['𝕾', 'S'], + ['𝕿', 'T'], + ['𝖀', 'U'], + ['𝖁', 'V'], + ['𝖂', 'W'], + ['𝖃', 'X'], + ['𝖄', 'Y'], + ['𝖅', 'Z'], + ['𝖆', 'a'], + ['𝖇', 'b'], + ['𝖈', 'c'], + ['𝖉', 'd'], + ['𝖊', 'e'], + ['𝖋', 'f'], + ['𝖌', 'g'], + ['𝖍', 'h'], + ['𝖎', 'i'], + ['𝖏', 'j'], + ['𝖐', 'k'], + ['𝖑', 'l'], + ['𝖒', 'm'], + ['𝖓', 'n'], + ['𝖔', 'o'], + ['𝖕', 'p'], + ['𝖖', 'q'], + ['𝖗', 'r'], + ['𝖘', 's'], + ['𝖙', 't'], + ['𝖚', 'u'], + ['𝖛', 'v'], + ['𝖜', 'w'], + ['𝖝', 'x'], + ['𝖞', 'y'], + ['𝖟', 'z'], + ['𝖠', 'A'], + ['𝖡', 'B'], + ['𝖢', 'C'], + ['𝖣', 'D'], + ['𝖤', 'E'], + ['𝖥', 'F'], + ['𝖦', 'G'], + ['𝖧', 'H'], + ['𝖨', 'I'], + ['𝖩', 'J'], + ['𝖪', 'K'], + ['𝖫', 'L'], + ['𝖬', 'M'], + ['𝖭', 'N'], + ['𝖮', 'O'], + ['𝖯', 'P'], + ['𝖰', 'Q'], + ['𝖱', 'R'], + ['𝖲', 'S'], + ['𝖳', 'T'], + ['𝖴', 'U'], + ['𝖵', 'V'], + ['𝖶', 'W'], + ['𝖷', 'X'], + ['𝖸', 'Y'], + ['𝖹', 'Z'], + ['𝖺', 'a'], + ['𝖻', 'b'], + ['𝖼', 'c'], + ['𝖽', 'd'], + ['𝖾', 'e'], + ['𝖿', 'f'], + ['𝗀', 'g'], + ['𝗁', 'h'], + ['𝗂', 'i'], + ['𝗃', 'j'], + ['𝗄', 'k'], + ['𝗅', 'l'], + ['𝗆', 'm'], + ['𝗇', 'n'], + ['𝗈', 'o'], + ['𝗉', 'p'], + ['𝗊', 'q'], + ['𝗋', 'r'], + ['𝗌', 's'], + ['𝗍', 't'], + ['𝗎', 'u'], + ['𝗏', 'v'], + ['𝗐', 'w'], + ['𝗑', 'x'], + ['𝗒', 'y'], + ['𝗓', 'z'], + ['𝗔', 'A'], + ['𝗕', 'B'], + ['𝗖', 'C'], + ['𝗗', 'D'], + ['𝗘', 'E'], + ['𝗙', 'F'], + ['𝗚', 'G'], + ['𝗛', 'H'], + ['𝗜', 'I'], + ['𝗝', 'J'], + ['𝗞', 'K'], + ['𝗟', 'L'], + ['𝗠', 'M'], + ['𝗡', 'N'], + ['𝗢', 'O'], + ['𝗣', 'P'], + ['𝗤', 'Q'], + ['𝗥', 'R'], + ['𝗦', 'S'], + ['𝗧', 'T'], + ['𝗨', 'U'], + ['𝗩', 'V'], + ['𝗪', 'W'], + ['𝗫', 'X'], + ['𝗬', 'Y'], + ['𝗭', 'Z'], + ['𝗮', 'a'], + ['𝗯', 'b'], + ['𝗰', 'c'], + ['𝗱', 'd'], + ['𝗲', 'e'], + ['𝗳', 'f'], + ['𝗴', 'g'], + ['𝗵', 'h'], + ['𝗶', 'i'], + ['𝗷', 'j'], + ['𝗸', 'k'], + ['𝗹', 'l'], + ['𝗺', 'm'], + ['𝗻', 'n'], + ['𝗼', 'o'], + ['𝗽', 'p'], + ['𝗾', 'q'], + ['𝗿', 'r'], + ['𝘀', 's'], + ['𝘁', 't'], + ['𝘂', 'u'], + ['𝘃', 'v'], + ['𝘄', 'w'], + ['𝘅', 'x'], + ['𝘆', 'y'], + ['𝘇', 'z'], + ['𝘈', 'A'], + ['𝘉', 'B'], + ['𝘊', 'C'], + ['𝘋', 'D'], + ['𝘌', 'E'], + ['𝘍', 'F'], + ['𝘎', 'G'], + ['𝘏', 'H'], + ['𝘐', 'I'], + ['𝘑', 'J'], + ['𝘒', 'K'], + ['𝘓', 'L'], + ['𝘔', 'M'], + ['𝘕', 'N'], + ['𝘖', 'O'], + ['𝘗', 'P'], + ['𝘘', 'Q'], + ['𝘙', 'R'], + ['𝘚', 'S'], + ['𝘛', 'T'], + ['𝘜', 'U'], + ['𝘝', 'V'], + ['𝘞', 'W'], + ['𝘟', 'X'], + ['𝘠', 'Y'], + ['𝘡', 'Z'], + ['𝘢', 'a'], + ['𝘣', 'b'], + ['𝘤', 'c'], + ['𝘥', 'd'], + ['𝘦', 'e'], + ['𝘧', 'f'], + ['𝘨', 'g'], + ['𝘩', 'h'], + ['𝘪', 'i'], + ['𝘫', 'j'], + ['𝘬', 'k'], + ['𝘭', 'l'], + ['𝘮', 'm'], + ['𝘯', 'n'], + ['𝘰', 'o'], + ['𝘱', 'p'], + ['𝘲', 'q'], + ['𝘳', 'r'], + ['𝘴', 's'], + ['𝘵', 't'], + ['𝘶', 'u'], + ['𝘷', 'v'], + ['𝘸', 'w'], + ['𝘹', 'x'], + ['𝘺', 'y'], + ['𝘻', 'z'], + ['𝘼', 'A'], + ['𝘽', 'B'], + ['𝘾', 'C'], + ['𝘿', 'D'], + ['𝙀', 'E'], + ['𝙁', 'F'], + ['𝙂', 'G'], + ['𝙃', 'H'], + ['𝙄', 'I'], + ['𝙅', 'J'], + ['𝙆', 'K'], + ['𝙇', 'L'], + ['𝙈', 'M'], + ['𝙉', 'N'], + ['𝙊', 'O'], + ['𝙋', 'P'], + ['𝙌', 'Q'], + ['𝙍', 'R'], + ['𝙎', 'S'], + ['𝙏', 'T'], + ['𝙐', 'U'], + ['𝙑', 'V'], + ['𝙒', 'W'], + ['𝙓', 'X'], + ['𝙔', 'Y'], + ['𝙕', 'Z'], 
+ ['𝙖', 'a'], + ['𝙗', 'b'], + ['𝙘', 'c'], + ['𝙙', 'd'], + ['𝙚', 'e'], + ['𝙛', 'f'], + ['𝙜', 'g'], + ['𝙝', 'h'], + ['𝙞', 'i'], + ['𝙟', 'j'], + ['𝙠', 'k'], + ['𝙡', 'l'], + ['𝙢', 'm'], + ['𝙣', 'n'], + ['𝙤', 'o'], + ['𝙥', 'p'], + ['𝙦', 'q'], + ['𝙧', 'r'], + ['𝙨', 's'], + ['𝙩', 't'], + ['𝙪', 'u'], + ['𝙫', 'v'], + ['𝙬', 'w'], + ['𝙭', 'x'], + ['𝙮', 'y'], + ['𝙯', 'z'], + ['𝙰', 'A'], + ['𝙱', 'B'], + ['𝙲', 'C'], + ['𝙳', 'D'], + ['𝙴', 'E'], + ['𝙵', 'F'], + ['𝙶', 'G'], + ['𝙷', 'H'], + ['𝙸', 'I'], + ['𝙹', 'J'], + ['𝙺', 'K'], + ['𝙻', 'L'], + ['𝙼', 'M'], + ['𝙽', 'N'], + ['𝙾', 'O'], + ['𝙿', 'P'], + ['𝚀', 'Q'], + ['𝚁', 'R'], + ['𝚂', 'S'], + ['𝚃', 'T'], + ['𝚄', 'U'], + ['𝚅', 'V'], + ['𝚆', 'W'], + ['𝚇', 'X'], + ['𝚈', 'Y'], + ['𝚉', 'Z'], + ['𝚊', 'a'], + ['𝚋', 'b'], + ['𝚌', 'c'], + ['𝚍', 'd'], + ['𝚎', 'e'], + ['𝚏', 'f'], + ['𝚐', 'g'], + ['𝚑', 'h'], + ['𝚒', 'i'], + ['𝚓', 'j'], + ['𝚔', 'k'], + ['𝚕', 'l'], + ['𝚖', 'm'], + ['𝚗', 'n'], + ['𝚘', 'o'], + ['𝚙', 'p'], + ['𝚚', 'q'], + ['𝚛', 'r'], + ['𝚜', 's'], + ['𝚝', 't'], + ['𝚞', 'u'], + ['𝚟', 'v'], + ['𝚠', 'w'], + ['𝚡', 'x'], + ['𝚢', 'y'], + ['𝚣', 'z'], + + # Dotless letters + ['𝚤', 'l'], + ['𝚥', 'j'], + + # Greek + ['𝛢', 'A'], + ['𝛣', 'B'], + ['𝛤', 'G'], + ['𝛥', 'D'], + ['𝛦', 'E'], + ['𝛧', 'Z'], + ['𝛨', 'I'], + ['𝛩', 'TH'], + ['𝛪', 'I'], + ['𝛫', 'K'], + ['𝛬', 'L'], + ['𝛭', 'M'], + ['𝛮', 'N'], + ['𝛯', 'KS'], + ['𝛰', 'O'], + ['𝛱', 'P'], + ['𝛲', 'R'], + ['𝛳', 'TH'], + ['𝛴', 'S'], + ['𝛵', 'T'], + ['𝛶', 'Y'], + ['𝛷', 'F'], + ['𝛸', 'x'], + ['𝛹', 'PS'], + ['𝛺', 'O'], + ['𝛻', 'D'], + ['𝛼', 'a'], + ['𝛽', 'b'], + ['𝛾', 'g'], + ['𝛿', 'd'], + ['𝜀', 'e'], + ['𝜁', 'z'], + ['𝜂', 'i'], + ['𝜃', 'th'], + ['𝜄', 'i'], + ['𝜅', 'k'], + ['𝜆', 'l'], + ['𝜇', 'm'], + ['𝜈', 'n'], + ['𝜉', 'ks'], + ['𝜊', 'o'], + ['𝜋', 'p'], + ['𝜌', 'r'], + ['𝜍', 's'], + ['𝜎', 's'], + ['𝜏', 't'], + ['𝜐', 'y'], + ['𝜑', 'f'], + ['𝜒', 'x'], + ['𝜓', 'ps'], + ['𝜔', 'o'], + ['𝜕', 'd'], + ['𝜖', 'E'], + ['𝜗', 'TH'], + ['𝜘', 'K'], + ['𝜙', 'f'], + ['𝜚', 'r'], + ['𝜛', 'p'], + ['𝜜', 'A'], + ['𝜝', 'V'], + ['𝜞', 'G'], + ['𝜟', 'D'], + ['𝜠', 'E'], + ['𝜡', 'Z'], + ['𝜢', 'I'], + ['𝜣', 'TH'], + ['𝜤', 'I'], + ['𝜥', 'K'], + ['𝜦', 'L'], + ['𝜧', 'M'], + ['𝜨', 'N'], + ['𝜩', 'KS'], + ['𝜪', 'O'], + ['𝜫', 'P'], + ['𝜬', 'S'], + ['𝜭', 'TH'], + ['𝜮', 'S'], + ['𝜯', 'T'], + ['𝜰', 'Y'], + ['𝜱', 'F'], + ['𝜲', 'X'], + ['𝜳', 'PS'], + ['𝜴', 'O'], + ['𝜵', 'D'], + ['𝜶', 'a'], + ['𝜷', 'v'], + ['𝜸', 'g'], + ['𝜹', 'd'], + ['𝜺', 'e'], + ['𝜻', 'z'], + ['𝜼', 'i'], + ['𝜽', 'th'], + ['𝜾', 'i'], + ['𝜿', 'k'], + ['𝝀', 'l'], + ['𝝁', 'm'], + ['𝝂', 'n'], + ['𝝃', 'ks'], + ['𝝄', 'o'], + ['𝝅', 'p'], + ['𝝆', 'r'], + ['𝝇', 's'], + ['𝝈', 's'], + ['𝝉', 't'], + ['𝝊', 'y'], + ['𝝋', 'f'], + ['𝝌', 'x'], + ['𝝍', 'ps'], + ['𝝎', 'o'], + ['𝝏', 'a'], + ['𝝐', 'e'], + ['𝝑', 'i'], + ['𝝒', 'k'], + ['𝝓', 'f'], + ['𝝔', 'r'], + ['𝝕', 'p'], + ['𝝖', 'A'], + ['𝝗', 'B'], + ['𝝘', 'G'], + ['𝝙', 'D'], + ['𝝚', 'E'], + ['𝝛', 'Z'], + ['𝝜', 'I'], + ['𝝝', 'TH'], + ['𝝞', 'I'], + ['𝝟', 'K'], + ['𝝠', 'L'], + ['𝝡', 'M'], + ['𝝢', 'N'], + ['𝝣', 'KS'], + ['𝝤', 'O'], + ['𝝥', 'P'], + ['𝝦', 'R'], + ['𝝧', 'TH'], + ['𝝨', 'S'], + ['𝝩', 'T'], + ['𝝪', 'Y'], + ['𝝫', 'F'], + ['𝝬', 'X'], + ['𝝭', 'PS'], + ['𝝮', 'O'], + ['𝝯', 'D'], + ['𝝰', 'a'], + ['𝝱', 'v'], + ['𝝲', 'g'], + ['𝝳', 'd'], + ['𝝴', 'e'], + ['𝝵', 'z'], + ['𝝶', 'i'], + ['𝝷', 'th'], + ['𝝸', 'i'], + ['𝝹', 'k'], + ['𝝺', 'l'], + ['𝝻', 'm'], + ['𝝼', 'n'], + ['𝝽', 'ks'], + ['𝝾', 'o'], + ['𝝿', 'p'], + ['𝞀', 'r'], + ['𝞁', 's'], + ['𝞂', 's'], + ['𝞃', 't'], + ['𝞄', 'y'], + ['𝞅', 'f'], + ['𝞆', 'x'], + ['𝞇', 'ps'], + ['𝞈', 'o'], + ['𝞉', 'a'], + ['𝞊', 'e'], + ['𝞋', 
'i'], + ['𝞌', 'k'], + ['𝞍', 'f'], + ['𝞎', 'r'], + ['𝞏', 'p'], + ['𝞐', 'A'], + ['𝞑', 'V'], + ['𝞒', 'G'], + ['𝞓', 'D'], + ['𝞔', 'E'], + ['𝞕', 'Z'], + ['𝞖', 'I'], + ['𝞗', 'TH'], + ['𝞘', 'I'], + ['𝞙', 'K'], + ['𝞚', 'L'], + ['𝞛', 'M'], + ['𝞜', 'N'], + ['𝞝', 'KS'], + ['𝞞', 'O'], + ['𝞟', 'P'], + ['𝞠', 'S'], + ['𝞡', 'TH'], + ['𝞢', 'S'], + ['𝞣', 'T'], + ['𝞤', 'Y'], + ['𝞥', 'F'], + ['𝞦', 'X'], + ['𝞧', 'PS'], + ['𝞨', 'O'], + ['𝞩', 'D'], + ['𝞪', 'av'], + ['𝞫', 'g'], + ['𝞬', 'd'], + ['𝞭', 'e'], + ['𝞮', 'z'], + ['𝞯', 'i'], + ['𝞰', 'i'], + ['𝞱', 'th'], + ['𝞲', 'i'], + ['𝞳', 'k'], + ['𝞴', 'l'], + ['𝞵', 'm'], + ['𝞶', 'n'], + ['𝞷', 'ks'], + ['𝞸', 'o'], + ['𝞹', 'p'], + ['𝞺', 'r'], + ['𝞻', 's'], + ['𝞼', 's'], + ['𝞽', 't'], + ['𝞾', 'y'], + ['𝞿', 'f'], + ['𝟀', 'x'], + ['𝟁', 'ps'], + ['𝟂', 'o'], + ['𝟃', 'a'], + ['𝟄', 'e'], + ['𝟅', 'i'], + ['𝟆', 'k'], + ['𝟇', 'f'], + ['𝟈', 'r'], + ['𝟉', 'p'], + ['𝟊', 'F'], + ['𝟋', 'f'], + ['⒜', '(a)'], + ['⒝', '(b)'], + ['⒞', '(c)'], + ['⒟', '(d)'], + ['⒠', '(e)'], + ['⒡', '(f)'], + ['⒢', '(g)'], + ['⒣', '(h)'], + ['⒤', '(i)'], + ['⒥', '(j)'], + ['⒦', '(k)'], + ['⒧', '(l)'], + ['⒨', '(m)'], + ['⒩', '(n)'], + ['⒪', '(o)'], + ['⒫', '(p)'], + ['⒬', '(q)'], + ['⒭', '(r)'], + ['⒮', '(s)'], + ['⒯', '(t)'], + ['⒰', '(u)'], + ['⒱', '(v)'], + ['⒲', '(w)'], + ['⒳', '(x)'], + ['⒴', '(y)'], + ['⒵', '(z)'], + ['Ⓐ', '(A)'], + ['Ⓑ', '(B)'], + ['Ⓒ', '(C)'], + ['Ⓓ', '(D)'], + ['Ⓔ', '(E)'], + ['Ⓕ', '(F)'], + ['Ⓖ', '(G)'], + ['Ⓗ', '(H)'], + ['Ⓘ', '(I)'], + ['Ⓙ', '(J)'], + ['Ⓚ', '(K)'], + ['Ⓛ', '(L)'], + ['Ⓝ', '(N)'], + ['Ⓞ', '(O)'], + ['Ⓟ', '(P)'], + ['Ⓠ', '(Q)'], + ['Ⓡ', '(R)'], + ['Ⓢ', '(S)'], + ['Ⓣ', '(T)'], + ['Ⓤ', '(U)'], + ['Ⓥ', '(V)'], + ['Ⓦ', '(W)'], + ['Ⓧ', '(X)'], + ['Ⓨ', '(Y)'], + ['Ⓩ', '(Z)'], + ['ⓐ', '(a)'], + ['ⓑ', '(b)'], + ['ⓒ', '(b)'], + ['ⓓ', '(c)'], + ['ⓔ', '(e)'], + ['ⓕ', '(f)'], + ['ⓖ', '(g)'], + ['ⓗ', '(h)'], + ['ⓘ', '(i)'], + ['ⓙ', '(j)'], + ['ⓚ', '(k)'], + ['ⓛ', '(l)'], + ['ⓜ', '(m)'], + ['ⓝ', '(n)'], + ['ⓞ', '(o)'], + ['ⓟ', '(p)'], + ['ⓠ', '(q)'], + ['ⓡ', '(r)'], + ['ⓢ', '(s)'], + ['ⓣ', '(t)'], + ['ⓤ', '(u)'], + ['ⓥ', '(v)'], + ['ⓦ', '(w)'], + ['ⓧ', '(x)'], + ['ⓨ', '(y)'], + ['ⓩ', '(z)'], + + # Numbers + ['𝟎', '0'], + ['𝟏', '1'], + ['𝟐', '2'], + ['𝟑', '3'], + ['𝟒', '4'], + ['𝟓', '5'], + ['𝟔', '6'], + ['𝟕', '7'], + ['𝟖', '8'], + ['𝟗', '9'], + ['𝟘', '0'], + ['𝟙', '1'], + ['𝟚', '2'], + ['𝟛', '3'], + ['𝟜', '4'], + ['𝟝', '5'], + ['𝟞', '6'], + ['𝟟', '7'], + ['𝟠', '8'], + ['𝟡', '9'], + ['𝟢', '0'], + ['𝟣', '1'], + ['𝟤', '2'], + ['𝟥', '3'], + ['𝟦', '4'], + ['𝟧', '5'], + ['𝟨', '6'], + ['𝟩', '7'], + ['𝟪', '8'], + ['𝟫', '9'], + ['𝟬', '0'], + ['𝟭', '1'], + ['𝟮', '2'], + ['𝟯', '3'], + ['𝟰', '4'], + ['𝟱', '5'], + ['𝟲', '6'], + ['𝟳', '7'], + ['𝟴', '8'], + ['𝟵', '9'], + ['𝟶', '0'], + ['𝟷', '1'], + ['𝟸', '2'], + ['𝟹', '3'], + ['𝟺', '4'], + ['𝟻', '5'], + ['𝟼', '6'], + ['𝟽', '7'], + ['𝟾', '8'], + ['𝟿', '9'], + ['①', '1'], + ['②', '2'], + ['③', '3'], + ['④', '4'], + ['⑤', '5'], + ['⑥', '6'], + ['⑦', '7'], + ['⑧', '8'], + ['⑨', '9'], + ['⑩', '10'], + ['⑪', '11'], + ['⑫', '12'], + ['⑬', '13'], + ['⑭', '14'], + ['⑮', '15'], + ['⑯', '16'], + ['⑰', '17'], + ['⑱', '18'], + ['⑲', '19'], + ['⑳', '20'], + ['⑴', '1'], + ['⑵', '2'], + ['⑶', '3'], + ['⑷', '4'], + ['⑸', '5'], + ['⑹', '6'], + ['⑺', '7'], + ['⑻', '8'], + ['⑼', '9'], + ['⑽', '10'], + ['⑾', '11'], + ['⑿', '12'], + ['⒀', '13'], + ['⒁', '14'], + ['⒂', '15'], + ['⒃', '16'], + ['⒄', '17'], + ['⒅', '18'], + ['⒆', '19'], + ['⒇', '20'], + ['⒈', '1.'], + ['⒉', '2.'], + ['⒊', '3.'], + ['⒋', '4.'], + ['⒌', '5.'], + ['⒍', '6.'], + ['⒎', '7.'], + ['⒏', 
'8.'], + ['⒐', '9.'], + ['⒑', '10.'], + ['⒒', '11.'], + ['⒓', '12.'], + ['⒔', '13.'], + ['⒕', '14.'], + ['⒖', '15.'], + ['⒗', '16.'], + ['⒘', '17.'], + ['⒙', '18.'], + ['⒚', '19.'], + ['⒛', '20.'], + ['⓪', '0'], + ['⓫', '11'], + ['⓬', '12'], + ['⓭', '13'], + ['⓮', '14'], + ['⓯', '15'], + ['⓰', '16'], + ['⓱', '17'], + ['⓲', '18'], + ['⓳', '19'], + ['⓴', '20'], + ['⓵', '1'], + ['⓶', '2'], + ['⓷', '3'], + ['⓸', '4'], + ['⓹', '5'], + ['⓺', '6'], + ['⓻', '7'], + ['⓼', '8'], + ['⓽', '9'], + ['⓾', '10'], + ['⓿', '0'], + + # Punctuation + ['🙰', '&'], + ['🙱', '&'], + ['🙲', '&'], + ['🙳', '&'], + ['🙴', '&'], + ['🙵', '&'], + ['🙶', '"'], + ['🙷', '"'], + ['🙸', '"'], + ['‽', '?!'], + ['🙹', '?!'], + ['🙺', '?!'], + ['🙻', '?!'], + ['🙼', '/'], + ['🙽', '\\'], + + # Alchemy + ['🜇', 'AR'], + ['🜈', 'V'], + ['🜉', 'V'], + ['🜆', 'VR'], + ['🜅', 'VF'], + ['🜩', '2'], + ['🜪', '5'], + ['🝡', 'f'], + ['🝢', 'W'], + ['🝣', 'U'], + ['🝧', 'V'], + ['🝨', 'T'], + ['🝪', 'V'], + ['🝫', 'MB'], + ['🝬', 'VB'], + ['🝲', '3B'], + ['🝳', '3B'], + + # Emojis + ['💯', '100'], + ['🔙', 'BACK'], + ['🔚', 'END'], + ['🔛', 'ON!'], + ['🔜', 'SOON'], + ['🔝', 'TOP'], + ['🔞', '18'], + ['🔤', 'abc'], + ['🔠', 'ABCD'], + ['🔡', 'abcd'], + ['🔢', '1234'], + ['🔣', 'T&@%'], + ['#️⃣', '#'], + ['*️⃣', '*'], + ['0️⃣', '0'], + ['1️⃣', '1'], + ['2️⃣', '2'], + ['3️⃣', '3'], + ['4️⃣', '4'], + ['5️⃣', '5'], + ['6️⃣', '6'], + ['7️⃣', '7'], + ['8️⃣', '8'], + ['9️⃣', '9'], + ['🔟', '10'], + ['🅰️', 'A'], + ['🅱️', 'B'], + ['🆎', 'AB'], + ['🆑', 'CL'], + ['🅾️', 'O'], + ['🅿', 'P'], + ['🆘', 'SOS'], + ['🅲', 'C'], + ['🅳', 'D'], + ['🅴', 'E'], + ['🅵', 'F'], + ['🅶', 'G'], + ['🅷', 'H'], + ['🅸', 'I'], + ['🅹', 'J'], + ['🅺', 'K'], + ['🅻', 'L'], + ['🅼', 'M'], + ['🅽', 'N'], + ['🆀', 'Q'], + ['🆁', 'R'], + ['🆂', 'S'], + ['🆃', 'T'], + ['🆄', 'U'], + ['🆅', 'V'], + ['🆆', 'W'], + ['🆇', 'X'], + ['🆈', 'Y'], + ['🆉', 'Z'], +] diff --git a/common/utils.py b/common/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2dc2e7e558031a0c604f18541960ae051910d54e --- /dev/null +++ b/common/utils.py @@ -0,0 +1,293 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MIT License +# +# Copyright (c) 2020 Jungil Kong +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# The following functions/classes were based on code from https://github.com/jik876/hifi-gan: +# init_weights, get_padding, AttrDict + +import ctypes +import glob +import os +import re +import shutil +import warnings +from collections import defaultdict, OrderedDict +from pathlib import Path +from typing import Optional + +import librosa +import numpy as np + +import torch +import torch.distributed as dist +from scipy.io.wavfile import read + + +def mask_from_lens(lens, max_len: Optional[int] = None): + if max_len is None: + max_len = lens.max() + ids = torch.arange(0, max_len, device=lens.device, dtype=lens.dtype) + mask = torch.lt(ids, lens.unsqueeze(1)) + return mask + + +def load_wav(full_path, torch_tensor=False): + import soundfile # flac + data, sampling_rate = soundfile.read(full_path, dtype='int16') + if torch_tensor: + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + else: + return data, sampling_rate + + +def load_wav_to_torch(full_path, force_sampling_rate=None): + if force_sampling_rate is not None: + data, sampling_rate = librosa.load(full_path, sr=force_sampling_rate) + else: + sampling_rate, data = read(full_path) + + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(dataset_path, fnames, has_speakers=False, split="|"): + def split_line(root, line): + parts = line.strip().split(split) + if has_speakers: + #ANT: is this ok? + paths, non_paths = parts[:2], parts[2:] + #paths, non_paths = parts[:-2], parts[-2:] + else: + paths, non_paths = parts[:-1], parts[-1:] + return tuple(str(Path(root, p)) for p in paths) + tuple(non_paths) + + fpaths_and_text = [] + for fname in fnames: + with open(fname, encoding='utf-8') as f: + fpaths_and_text += [split_line(dataset_path, line) for line in f] + return fpaths_and_text + + +def to_gpu(x): + x = x.contiguous() + return x.cuda(non_blocking=True) if torch.cuda.is_available() else x + + +def l2_promote(): + _libcudart = ctypes.CDLL('libcudart.so') + # Set device limit on the current device + # cudaLimitMaxL2FetchGranularity = 0x05 + pValue = ctypes.cast((ctypes.c_int*1)(), ctypes.POINTER(ctypes.c_int)) + _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128)) + _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05)) + assert pValue.contents.value == 128 + + +def prepare_tmp(path): + if path is None: + return + p = Path(path) + if p.is_dir(): + warnings.warn(f'{p} exists. 
Removing...') + shutil.rmtree(p, ignore_errors=True) + p.mkdir(parents=False, exist_ok=False) + + +def print_once(*msg): + if not dist.is_initialized() or dist.get_rank() == 0: + print(*msg) + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +class DefaultAttrDict(defaultdict): + def __init__(self, *args, **kwargs): + super(DefaultAttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + def __getattr__(self, item): + return self[item] + + +class BenchmarkStats: + """ Tracks statistics used for benchmarking. """ + def __init__(self): + self.num_frames = [] + self.losses = [] + self.mel_losses = [] + self.took = [] + + def update(self, num_frames, losses, mel_losses, took): + self.num_frames.append(num_frames) + self.losses.append(losses) + self.mel_losses.append(mel_losses) + self.took.append(took) + + def get(self, n_epochs): + frames_s = sum(self.num_frames[-n_epochs:]) / sum(self.took[-n_epochs:]) + return {'frames/s': frames_s, + 'loss': np.mean(self.losses[-n_epochs:]), + 'mel_loss': np.mean(self.mel_losses[-n_epochs:]), + 'took': np.mean(self.took[-n_epochs:]), + 'benchmark_epochs_num': n_epochs} + + def __len__(self): + return len(self.losses) + + +class Checkpointer: + + def __init__(self, save_dir, keep_milestones=[]): + self.save_dir = save_dir + self.keep_milestones = keep_milestones + + find = lambda name: [ + (int(re.search("_(\d+).pt", fn).group(1)), fn) + for fn in glob.glob(f"{save_dir}/{name}_checkpoint_*.pt")] + + tracked = sorted(find("FastPitch"), key=lambda t: t[0]) + self.tracked = OrderedDict(tracked) + + def last_checkpoint(self, output): + + def corrupted(fpath): + try: + torch.load(fpath, map_location="cpu") + return False + except: + warnings.warn(f"Cannot load {fpath}") + return True + + saved = sorted( + glob.glob(f"{output}/FastPitch_checkpoint_*.pt"), + key=lambda f: int(re.search("_(\d+).pt", f).group(1))) + + if len(saved) >= 1 and not corrupted(saved[-1]): + return saved[-1] + elif len(saved) >= 2: + return saved[-2] + else: + return None + + def maybe_load(self, model, optimizer, scaler, train_state, args, + ema_model=None): + + assert args.checkpoint_path is None or args.resume is False, ( + "Specify a single checkpoint source") + + fpath = None + if args.checkpoint_path is not None: + fpath = args.checkpoint_path + self.tracked = OrderedDict() # Do not track/delete prev ckpts + elif args.resume: + fpath = self.last_checkpoint(args.output) + + if fpath is None: + return + + print_once(f"Loading model and optimizer state from {fpath}") + ckpt = torch.load(fpath, map_location="cpu") + train_state["epoch"] = ckpt["epoch"] + 1 + train_state["total_iter"] = ckpt["iteration"] + + no_pref = lambda sd: {re.sub("^module.", "", k): v for k, v in sd.items()} + unwrap = lambda m: getattr(m, "module", m) + + unwrap(model).load_state_dict(no_pref(ckpt["state_dict"])) + + if ema_model is not None: + unwrap(ema_model).load_state_dict(no_pref(ckpt["ema_state_dict"])) + + optimizer.load_state_dict(ckpt["optimizer"]) + + if "scaler" in ckpt: + scaler.load_state_dict(ckpt["scaler"]) + else: + warnings.warn("AMP scaler state missing from the checkpoint.") + + def maybe_save(self, args, model, ema_model, optimizer, scaler, epoch, + 
total_iter, config): + + intermediate = (args.epochs_per_checkpoint > 0 + and epoch % args.epochs_per_checkpoint == 0) + final = epoch == args.epochs + + if not intermediate and not final and epoch not in self.keep_milestones: + return + + rank = 0 + if dist.is_initialized(): + dist.barrier() + rank = dist.get_rank() + + if rank != 0: + return + + unwrap = lambda m: getattr(m, "module", m) + ckpt = {"epoch": epoch, + "iteration": total_iter, + "config": config, + "train_setup": args.__dict__, + "state_dict": unwrap(model).state_dict(), + "optimizer": optimizer.state_dict(), + "scaler": scaler.state_dict()} + if ema_model is not None: + ckpt["ema_state_dict"] = unwrap(ema_model).state_dict() + + fpath = Path(args.output, f"FastPitch_checkpoint_{epoch}.pt") + print(f"Saving model and optimizer state at epoch {epoch} to {fpath}") + torch.save(ckpt, fpath) + + # Remove old checkpoints; keep milestones and the last two + self.tracked[epoch] = fpath + for epoch in set(list(self.tracked)[:-2]) - set(self.keep_milestones): + try: + os.remove(self.tracked[epoch]) + except: + pass + del self.tracked[epoch] diff --git a/fastpitch/alignment.py b/fastpitch/alignment.py new file mode 100644 index 0000000000000000000000000000000000000000..6671f6c0c4702583ce5657348ab7b41cddc394f7 --- /dev/null +++ b/fastpitch/alignment.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
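# ---------------------------------------------------------------------------
# A minimal standalone sketch of how the monotonic alignment search (MAS)
# defined below is typically driven. This is an illustration, not part of the
# original sources: it assumes the repo layout used in this diff (so
# `fastpitch.alignment` is importable), and the toy attention map is made up.
import numpy as np
from fastpitch.alignment import mas_width1

# Toy soft attention: 6 mel frames x 3 text tokens, peaked roughly along the
# diagonal the way a partially converged aligner would be.
soft_attn = np.array([[0.8, 0.1, 0.1],
                      [0.7, 0.2, 0.1],
                      [0.2, 0.7, 0.1],
                      [0.1, 0.8, 0.1],
                      [0.1, 0.3, 0.6],
                      [0.1, 0.1, 0.8]], dtype=np.float32)

hard_attn = mas_width1(np.log(soft_attn))     # binary (mel x text) monotonic path
durations = hard_attn.sum(axis=0)             # frames assigned per token: [2. 2. 2.]
assert hard_attn.sum() == soft_attn.shape[0]  # exactly one text token per mel frame
# ---------------------------------------------------------------------------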
+ +import numpy as np +from numba import jit, prange + + +@jit(nopython=True) +def mas(log_attn_map, width=1): + # assumes mel x text + opt = np.zeros_like(log_attn_map) + log_attn_map = log_attn_map.copy() + log_attn_map[0, 1:] = -np.inf + log_p = np.zeros_like(log_attn_map) + log_p[0, :] = log_attn_map[0, :] + prev_ind = np.zeros_like(log_attn_map, dtype=np.int64) + for i in range(1, log_attn_map.shape[0]): + for j in range(log_attn_map.shape[1]): # for each text dim + prev_j = np.arange(max(0, j-width), j+1) + prev_log = np.array([log_p[i-1, prev_idx] for prev_idx in prev_j]) + + ind = np.argmax(prev_log) + log_p[i, j] = log_attn_map[i, j] + prev_log[ind] + prev_ind[i, j] = prev_j[ind] + + # now backtrack + curr_text_idx = log_attn_map.shape[1]-1 + for i in range(log_attn_map.shape[0]-1, -1, -1): + opt[i, curr_text_idx] = 1 + curr_text_idx = prev_ind[i, curr_text_idx] + opt[0, curr_text_idx] = 1 + return opt + + +@jit(nopython=True) +def mas_width1(log_attn_map): + """mas with hardcoded width=1""" + # assumes mel x text + neg_inf = log_attn_map.dtype.type(-np.inf) + log_p = log_attn_map.copy() + log_p[0, 1:] = neg_inf + for i in range(1, log_p.shape[0]): + prev_log1 = neg_inf + for j in range(log_p.shape[1]): + prev_log2 = log_p[i-1, j] + log_p[i, j] += max(prev_log1, prev_log2) + prev_log1 = prev_log2 + + # now backtrack + opt = np.zeros_like(log_p) + one = opt.dtype.type(1) + j = log_p.shape[1]-1 + for i in range(log_p.shape[0]-1, 0, -1): + opt[i, j] = one + if log_p[i-1, j-1] >= log_p[i-1, j]: + j -= 1 + if j == 0: + opt[1:i, j] = one + break + opt[0, j] = one + return opt + + +@jit(nopython=True, parallel=True) +def b_mas(b_log_attn_map, in_lens, out_lens, width=1): + assert width == 1 + attn_out = np.zeros_like(b_log_attn_map) + + for b in prange(b_log_attn_map.shape[0]): + out = mas_width1(b_log_attn_map[b, 0, :out_lens[b], :in_lens[b]]) + attn_out[b, 0, :out_lens[b], :in_lens[b]] = out + return attn_out diff --git a/fastpitch/arg_parser.py b/fastpitch/arg_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..4e5b1376413efb23cb1a67d0b76c31e93df304a0 --- /dev/null +++ b/fastpitch/arg_parser.py @@ -0,0 +1,130 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +import argparse + + +def parse_fastpitch_args(parent, add_help=False): + """ + Parse commandline arguments. + """ + parser = argparse.ArgumentParser(parents=[parent], add_help=add_help, + allow_abbrev=False) + io = parser.add_argument_group('io parameters') + io.add_argument('--n-mel-channels', default=80, type=int, + help='Number of bins in mel-spectrograms') + io.add_argument('--max-seq-len', default=2048, type=int, + help='') + + symbols = parser.add_argument_group('symbols parameters') + symbols.add_argument('--n-symbols', default=148, type=int, + help='Number of symbols in dictionary') + symbols.add_argument('--padding-idx', default=0, type=int, + help='Index of padding symbol in dictionary') + symbols.add_argument('--symbols-embedding-dim', default=384, type=int, + help='Input embedding dimension') + + in_fft = parser.add_argument_group('input FFT parameters') + in_fft.add_argument('--in-fft-n-layers', default=6, type=int, + help='Number of FFT blocks') + in_fft.add_argument('--in-fft-n-heads', default=1, type=int, + help='Number of attention heads') + in_fft.add_argument('--in-fft-d-head', default=64, type=int, + help='Dim of attention heads') + in_fft.add_argument('--in-fft-conv1d-kernel-size', default=3, type=int, + help='Conv-1D kernel size') + in_fft.add_argument('--in-fft-conv1d-filter-size', default=1536, type=int, + help='Conv-1D filter size') + in_fft.add_argument('--in-fft-output-size', default=384, type=int, + help='Output dim') + in_fft.add_argument('--p-in-fft-dropout', default=0.1, type=float, + help='Dropout probability') + in_fft.add_argument('--p-in-fft-dropatt', default=0.1, type=float, + help='Multi-head attention dropout') + in_fft.add_argument('--p-in-fft-dropemb', default=0.0, type=float, + help='Dropout added to word+positional embeddings') + + out_fft = parser.add_argument_group('output FFT parameters') + out_fft.add_argument('--out-fft-n-layers', default=6, type=int, + help='Number of FFT blocks') + out_fft.add_argument('--out-fft-n-heads', default=1, type=int, + help='Number of attention heads') + out_fft.add_argument('--out-fft-d-head', default=64, type=int, + help='Dim of attention head') + out_fft.add_argument('--out-fft-conv1d-kernel-size', default=3, type=int, + help='Conv-1D kernel size') + out_fft.add_argument('--out-fft-conv1d-filter-size', default=1536, type=int, + help='Conv-1D filter size') + out_fft.add_argument('--out-fft-output-size', default=384, type=int, + help='Output dim') + out_fft.add_argument('--p-out-fft-dropout', default=0.1, type=float, + help='Dropout probability for out_fft') + out_fft.add_argument('--p-out-fft-dropatt', default=0.1, type=float, + help='Multi-head attention dropout') + out_fft.add_argument('--p-out-fft-dropemb', default=0.0, type=float, + help='Dropout added to word+positional embeddings') + + dur_pred = parser.add_argument_group('duration predictor parameters') + dur_pred.add_argument('--dur-predictor-kernel-size', 
default=3, type=int, + help='Duration predictor conv-1D kernel size') + dur_pred.add_argument('--dur-predictor-filter-size', default=256, type=int, + help='Duration predictor conv-1D filter size') + dur_pred.add_argument('--p-dur-predictor-dropout', default=0.1, type=float, + help='Dropout probability for duration predictor') + dur_pred.add_argument('--dur-predictor-n-layers', default=2, type=int, + help='Number of conv-1D layers') + + pitch_pred = parser.add_argument_group('pitch predictor parameters') + pitch_pred.add_argument('--pitch-predictor-kernel-size', default=3, type=int, + help='Pitch predictor conv-1D kernel size') + pitch_pred.add_argument('--pitch-predictor-filter-size', default=256, type=int, + help='Pitch predictor conv-1D filter size') + pitch_pred.add_argument('--p-pitch-predictor-dropout', default=0.1, type=float, + help='Pitch probability for pitch predictor') + pitch_pred.add_argument('--pitch-predictor-n-layers', default=2, type=int, + help='Number of conv-1D layers') + + energy_pred = parser.add_argument_group('energy predictor parameters') + energy_pred.add_argument('--energy-conditioning', action='store_true') + energy_pred.add_argument('--energy-predictor-kernel-size', default=3, type=int, + help='Pitch predictor conv-1D kernel size') + energy_pred.add_argument('--energy-predictor-filter-size', default=256, type=int, + help='Pitch predictor conv-1D filter size') + energy_pred.add_argument('--p-energy-predictor-dropout', default=0.1, type=float, + help='Pitch probability for energy predictor') + energy_pred.add_argument('--energy-predictor-n-layers', default=2, type=int, + help='Number of conv-1D layers') + + cond = parser.add_argument_group('conditioning parameters') + cond.add_argument('--pitch-embedding-kernel-size', default=3, type=int, + help='Pitch embedding conv-1D kernel size') + cond.add_argument('--energy-embedding-kernel-size', default=3, type=int, + help='Pitch embedding conv-1D kernel size') + cond.add_argument('--speaker-emb-weight', type=float, default=1.0, + help='Scale speaker embedding') + + return parser diff --git a/fastpitch/attention.py b/fastpitch/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..59a7397d637216aed4f0eaa03942a83a1e9a1190 --- /dev/null +++ b/fastpitch/attention.py @@ -0,0 +1,220 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
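# ---------------------------------------------------------------------------
# A minimal standalone sketch showing how `parse_fastpitch_args` above can be
# composed with a top-level CLI, roughly as a training entry point would do.
# This is an illustration, not part of the original sources; the `--output`
# option is an assumed example flag.
import argparse
from fastpitch.arg_parser import parse_fastpitch_args

# The parent parser holds non-model options; add_help=False avoids a clash
# with the child parser's own -h/--help.
parent = argparse.ArgumentParser(description='FastPitch training',
                                 add_help=False, allow_abbrev=False)
parent.add_argument('-o', '--output', type=str, default='./output',
                    help='Directory for checkpoints (assumed example flag)')

parser = parse_fastpitch_args(parent, add_help=True)
args = parser.parse_args(['--n-symbols', '148', '--energy-conditioning'])

print(args.output, args.n_symbols, args.in_fft_n_layers, args.energy_conditioning)
# -> ./output 148 6 True (unspecified values fall back to the defaults above)
# ---------------------------------------------------------------------------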
+ +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +class ConvNorm(torch.nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, + padding=None, dilation=1, bias=True, w_init_gain='linear'): + super(ConvNorm, self).__init__() + if padding is None: + assert(kernel_size % 2 == 1) + padding = int(dilation * (kernel_size - 1) / 2) + + self.conv = torch.nn.Conv1d(in_channels, out_channels, + kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, + bias=bias) + + torch.nn.init.xavier_uniform_( + self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, signal): + conv_signal = self.conv(signal) + return conv_signal + + +class Invertible1x1ConvLUS(torch.nn.Module): + def __init__(self, c): + super(Invertible1x1ConvLUS, self).__init__() + # Sample a random orthonormal matrix to initialize weights + W, _ = torch.linalg.qr(torch.randn(c, c)) + # Ensure determinant is 1.0 not -1.0 + if torch.det(W) < 0: + W[:, 0] = -1*W[:, 0] + p, lower, upper = torch.lu_unpack(*torch.lu(W)) + + self.register_buffer('p', p) + # diagonals of lower will always be 1s anyway + lower = torch.tril(lower, -1) + lower_diag = torch.diag(torch.eye(c, c)) + self.register_buffer('lower_diag', lower_diag) + self.lower = nn.Parameter(lower) + self.upper_diag = nn.Parameter(torch.diag(upper)) + self.upper = nn.Parameter(torch.triu(upper, 1)) + + def forward(self, z, reverse=False): + U = torch.triu(self.upper, 1) + torch.diag(self.upper_diag) + L = torch.tril(self.lower, -1) + torch.diag(self.lower_diag) + W = torch.mm(self.p, torch.mm(L, U)) + if reverse: + if not hasattr(self, 'W_inverse'): + # Reverse computation + W_inverse = W.float().inverse() + if z.type() == 'torch.cuda.HalfTensor': + W_inverse = W_inverse.half() + + self.W_inverse = W_inverse[..., None] + z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) + return z + else: + W = W[..., None] + z = F.conv1d(z, W, bias=None, stride=1, padding=0) + log_det_W = torch.sum(torch.log(torch.abs(self.upper_diag))) + return z, log_det_W + + +class ConvAttention(torch.nn.Module): + def __init__(self, n_mel_channels=80, n_speaker_dim=128, + n_text_channels=512, n_att_channels=80, temperature=1.0, + n_mel_convs=2, align_query_enc_type='3xconv', + use_query_proj=True): + super(ConvAttention, self).__init__() + self.temperature = temperature + self.att_scaling_factor = np.sqrt(n_att_channels) + self.softmax = torch.nn.Softmax(dim=3) + self.log_softmax = torch.nn.LogSoftmax(dim=3) + self.query_proj = Invertible1x1ConvLUS(n_mel_channels) + self.attn_proj = torch.nn.Conv2d(n_att_channels, 1, kernel_size=1) + self.align_query_enc_type = align_query_enc_type + self.use_query_proj = bool(use_query_proj) + + self.key_proj = nn.Sequential( + ConvNorm(n_text_channels, + n_text_channels * 2, + kernel_size=3, + bias=True, + w_init_gain='relu'), + torch.nn.ReLU(), + ConvNorm(n_text_channels * 2, + n_att_channels, + kernel_size=1, + bias=True)) + + self.align_query_enc_type = align_query_enc_type + + if align_query_enc_type == "inv_conv": + self.query_proj = Invertible1x1ConvLUS(n_mel_channels) + elif align_query_enc_type == "3xconv": + self.query_proj = nn.Sequential( + ConvNorm(n_mel_channels, + n_mel_channels * 2, + kernel_size=3, + bias=True, + w_init_gain='relu'), + torch.nn.ReLU(), + ConvNorm(n_mel_channels * 2, + n_mel_channels, + kernel_size=1, + bias=True), + torch.nn.ReLU(), + ConvNorm(n_mel_channels, + n_att_channels, + kernel_size=1, + 
bias=True)) + else: + raise ValueError("Unknown query encoder type specified") + + def run_padded_sequence(self, sorted_idx, unsort_idx, lens, padded_data, + recurrent_model): + """Sorts input data by previded ordering (and un-ordering) and runs the + packed data through the recurrent model + + Args: + sorted_idx (torch.tensor): 1D sorting index + unsort_idx (torch.tensor): 1D unsorting index (inverse of sorted_idx) + lens: lengths of input data (sorted in descending order) + padded_data (torch.tensor): input sequences (padded) + recurrent_model (nn.Module): recurrent model to run data through + Returns: + hidden_vectors (torch.tensor): outputs of the RNN, in the original, + unsorted, ordering + """ + + # sort the data by decreasing length using provided index + # we assume batch index is in dim=1 + padded_data = padded_data[:, sorted_idx] + padded_data = nn.utils.rnn.pack_padded_sequence(padded_data, lens) + hidden_vectors = recurrent_model(padded_data)[0] + hidden_vectors, _ = nn.utils.rnn.pad_packed_sequence(hidden_vectors) + # unsort the results at dim=1 and return + hidden_vectors = hidden_vectors[:, unsort_idx] + return hidden_vectors + + def encode_query(self, query, query_lens): + query = query.permute(2, 0, 1) # seq_len, batch, feature dim + lens, ids = torch.sort(query_lens, descending=True) + original_ids = [0] * lens.size(0) + for i in range(len(ids)): + original_ids[ids[i]] = i + + query_encoded = self.run_padded_sequence(ids, original_ids, lens, + query, self.query_lstm) + query_encoded = query_encoded.permute(1, 2, 0) + return query_encoded + + def forward(self, queries, keys, query_lens, mask=None, key_lens=None, + keys_encoded=None, attn_prior=None): + """Attention mechanism for flowtron parallel + Unlike in Flowtron, we have no restrictions such as causality etc, + since we only need this during training. + + Args: + queries (torch.tensor): B x C x T1 tensor + (probably going to be mel data) + keys (torch.tensor): B x C2 x T2 tensor (text data) + query_lens: lengths for sorting the queries in descending order + mask (torch.tensor): uint8 binary mask for variable length entries + (should be in the T2 domain) + Output: + attn (torch.tensor): B x 1 x T1 x T2 attention mask. 
+ Final dim T2 should sum to 1 + """ + keys_enc = self.key_proj(keys) # B x n_attn_dims x T2 + + # Beware can only do this since query_dim = attn_dim = n_mel_channels + if self.use_query_proj: + if self.align_query_enc_type == "inv_conv": + queries_enc, log_det_W = self.query_proj(queries) + elif self.align_query_enc_type == "3xconv": + queries_enc = self.query_proj(queries) + log_det_W = 0.0 + else: + queries_enc, log_det_W = self.query_proj(queries) + else: + queries_enc, log_det_W = queries, 0.0 + + # different ways of computing attn, + # one is isotopic gaussians (per phoneme) + # Simplistic Gaussian Isotopic Attention + + # B x n_attn_dims x T1 x T2 + attn = (queries_enc[:, :, :, None] - keys_enc[:, :, None]) ** 2 + # compute log likelihood from a gaussian + attn = -0.0005 * attn.sum(1, keepdim=True) + if attn_prior is not None: + attn = self.log_softmax(attn) + torch.log(attn_prior[:, None]+1e-8) + + attn_logprob = attn.clone() + + if mask is not None: + attn.data.masked_fill_(mask.permute(0, 2, 1).unsqueeze(2), + -float("inf")) + + attn = self.softmax(attn) # Softmax along T2 + return attn, attn_logprob diff --git a/fastpitch/attn_loss_function.py b/fastpitch/attn_loss_function.py new file mode 100644 index 0000000000000000000000000000000000000000..52748f21c3aedecb6e085a913e4a21fc91168b6b --- /dev/null +++ b/fastpitch/attn_loss_function.py @@ -0,0 +1,71 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
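# ---------------------------------------------------------------------------
# A minimal standalone shape sketch tying the aligner above to the CTC
# alignment loss defined below: ConvAttention turns a (mel, text) pair into a
# soft B x 1 x T_mel x T_text attention plus raw scores, and AttentionCTCLoss
# pushes those scores towards a monotonic alignment. Illustration only, not
# part of the original sources; the batch sizes and lengths are made-up values.
import torch
from fastpitch.attention import ConvAttention
from fastpitch.attn_loss_function import AttentionCTCLoss

B, n_mel, d_text, T_mel, T_text = 2, 80, 384, 120, 17
aligner = ConvAttention(n_mel_channels=n_mel, n_text_channels=d_text)
attn_ctc = AttentionCTCLoss()

mels = torch.randn(B, n_mel, T_mel)         # queries: B x n_mel x T_mel
text_emb = torch.randn(B, d_text, T_text)   # keys:    B x d_text x T_text
in_lens = torch.tensor([T_text, 15])        # text length per utterance
out_lens = torch.tensor([T_mel, 100])       # mel length per utterance

attn_soft, attn_logprob = aligner(mels, text_emb, out_lens, mask=None)
# attn_soft / attn_logprob: B x 1 x T_mel x T_text, softmax over the text axis
loss = attn_ctc(attn_logprob, in_lens, out_lens)
print(attn_soft.shape, float(loss))
# ---------------------------------------------------------------------------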
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class AttentionCTCLoss(torch.nn.Module): + def __init__(self, blank_logprob=-1): + super(AttentionCTCLoss, self).__init__() + self.log_softmax = torch.nn.LogSoftmax(dim=-1) + self.blank_logprob = blank_logprob + self.CTCLoss = nn.CTCLoss(zero_infinity=True) + + def forward(self, attn_logprob, in_lens, out_lens): + key_lens = in_lens + query_lens = out_lens + max_key_len = attn_logprob.size(-1) + + # Reorder input to [query_len, batch_size, key_len] + attn_logprob = attn_logprob.squeeze(1) + attn_logprob = attn_logprob.permute(1, 0, 2) + + # Add blank label + attn_logprob = F.pad( + input=attn_logprob, + pad=(1, 0, 0, 0, 0, 0), + value=self.blank_logprob) + + # Convert to log probabilities + # Note: Mask out probs beyond key_len + key_inds = torch.arange( + max_key_len+1, + device=attn_logprob.device, + dtype=torch.long) + attn_logprob.masked_fill_( + key_inds.view(1,1,-1) > key_lens.view(1,-1,1), # key_inds >= key_lens+1 + -float("inf")) + attn_logprob = self.log_softmax(attn_logprob) + + # Target sequences + target_seqs = key_inds[1:].unsqueeze(0) + target_seqs = target_seqs.repeat(key_lens.numel(), 1) + + # Evaluate CTC loss + cost = self.CTCLoss( + attn_logprob, target_seqs, + input_lengths=query_lens, target_lengths=key_lens) + return cost + + +class AttentionBinarizationLoss(torch.nn.Module): + def __init__(self): + super(AttentionBinarizationLoss, self).__init__() + + def forward(self, hard_attention, soft_attention, eps=1e-12): + log_sum = torch.log(torch.clamp(soft_attention[hard_attention == 1], + min=eps)).sum() + return -log_sum / hard_attention.sum() diff --git a/fastpitch/data_function.py b/fastpitch/data_function.py new file mode 100644 index 0000000000000000000000000000000000000000..3c947433f2529ede99cb53355c535ecb9d594068 --- /dev/null +++ b/fastpitch/data_function.py @@ -0,0 +1,464 @@ +# ***************************************************************************** +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ***************************************************************************** + +import functools +import json +import re +from pathlib import Path + +import librosa +import numpy as np +import torch +import torch.nn.functional as F +from scipy import ndimage +from scipy.stats import betabinom + +import common.layers as layers +from common.text.text_processing import TextProcessing +from common.utils import load_wav_to_torch, load_filepaths_and_text, to_gpu + + +class BetaBinomialInterpolator: + """Interpolates alignment prior matrices to save computation. + + Calculating beta-binomial priors is costly. Instead cache popular sizes + and use img interpolation to get priors faster. + """ + def __init__(self, round_mel_len_to=100, round_text_len_to=20): + self.round_mel_len_to = round_mel_len_to + self.round_text_len_to = round_text_len_to + self.bank = functools.lru_cache(beta_binomial_prior_distribution) + + def round(self, val, to): + return max(1, int(np.round((val + 1) / to))) * to + + def __call__(self, w, h): + bw = self.round(w, to=self.round_mel_len_to) + bh = self.round(h, to=self.round_text_len_to) + ret = ndimage.zoom(self.bank(bw, bh).T, zoom=(w / bw, h / bh), order=1) + assert ret.shape[0] == w, ret.shape + assert ret.shape[1] == h, ret.shape + return ret + + +def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling=1.0): + P = phoneme_count + M = mel_count + x = np.arange(0, P) + mel_text_probs = [] + for i in range(1, M+1): + a, b = scaling * i, scaling * (M + 1 - i) + rv = betabinom(P, a, b) + mel_i_prob = rv.pmf(x) + mel_text_probs.append(mel_i_prob) + return torch.tensor(np.array(mel_text_probs)) + + +def estimate_pitch(wav, mel_len, method='pyin', normalize_mean=None, + normalize_std=None, n_formants=1): + + if type(normalize_mean) is float or type(normalize_mean) is list: + normalize_mean = torch.tensor(normalize_mean) + + if type(normalize_std) is float or type(normalize_std) is list: + normalize_std = torch.tensor(normalize_std) + + if method == 'pyin': + + snd, sr = librosa.load(wav) + pitch_mel, voiced_flag, voiced_probs = librosa.pyin( + # snd, fmin=librosa.note_to_hz('C2'), ######################### + snd, fmin=60, ###################### + # fmax=librosa.note_to_hz('C7'), frame_length=1024) + fmax=400, frame_length=1024) + assert np.abs(mel_len - pitch_mel.shape[0]) <= 1.0 + + pitch_mel = np.where(np.isnan(pitch_mel), 0.0, pitch_mel) + pitch_mel = torch.from_numpy(pitch_mel).unsqueeze(0) + pitch_mel = F.pad(pitch_mel, (0, mel_len - pitch_mel.size(1))) + + if n_formants > 1: + raise NotImplementedError + + else: + raise ValueError + + pitch_mel = pitch_mel.float() + + if normalize_mean is not None: + assert normalize_std is not None + pitch_mel = normalize_pitch(pitch_mel, normalize_mean, normalize_std) + + return pitch_mel + + +def normalize_pitch(pitch, mean, std): + zeros = (pitch == 0.0) + pitch -= mean[:, None] + pitch /= std[:, None] + pitch[zeros] = 0.0 + return pitch + + +class TTSDataset(torch.utils.data.Dataset): + """ + 1) loads audio,text pairs + 2) normalizes text and converts them to sequences of one-hot vectors + 3) computes mel-spectrograms from audio files. 
+ """ + def __init__(self, + dataset_path, + audiopaths_and_text, + text_cleaners, + n_mel_channels, + symbol_set='smj_expanded', + p_arpabet=1.0, + n_speakers=1, + n_languages=1, #ANT: added + load_mel_from_disk=True, + load_pitch_from_disk=True, + pitch_mean=214.72203, # LJSpeech defaults + pitch_std=65.72038, + max_wav_value=None, + sampling_rate=None, + filter_length=None, + hop_length=None, + win_length=None, + mel_fmin=None, + mel_fmax=None, + prepend_space_to_text=False, + append_space_to_text=False, + pitch_online_dir=None, + betabinomial_online_dir=None, + use_betabinomial_interpolator=True, + pitch_online_method='pyin', + **ignored): + # print(prepend_space_to_text, append_space_to_text) + # Expect a list of filenames + if type(audiopaths_and_text) is str: + audiopaths_and_text = [audiopaths_and_text] + + self.dataset_path = dataset_path + #ANT: do we need to add language to common_utils.load_filepaths_and_text, probably + self.audiopaths_and_text = load_filepaths_and_text( + dataset_path, audiopaths_and_text, + has_speakers=(n_speakers > 1)) + self.load_mel_from_disk = load_mel_from_disk + if not load_mel_from_disk: + self.max_wav_value = max_wav_value + self.sampling_rate = sampling_rate + self.stft = layers.TacotronSTFT( + filter_length, hop_length, win_length, + n_mel_channels, sampling_rate, mel_fmin, mel_fmax) + self.load_pitch_from_disk = load_pitch_from_disk + + self.prepend_space_to_text = prepend_space_to_text + self.append_space_to_text = append_space_to_text + + assert p_arpabet == 0.0 or p_arpabet == 1.0, ( + 'Only 0.0 and 1.0 p_arpabet is currently supported. ' + 'Variable probability breaks caching of betabinomial matrices.') + + self.tp = TextProcessing(symbol_set, text_cleaners, p_arpabet=p_arpabet) + self.n_speakers = n_speakers + # ANT: added languages, must add to config and probably train.py too + self.n_languages = n_languages + self.pitch_tmp_dir = pitch_online_dir + self.f0_method = pitch_online_method + self.betabinomial_tmp_dir = betabinomial_online_dir + self.use_betabinomial_interpolator = use_betabinomial_interpolator + + if use_betabinomial_interpolator: + self.betabinomial_interpolator = BetaBinomialInterpolator() + # ANT: added language here + expected_columns = (2 + int(load_pitch_from_disk) + (n_speakers > 1) + (n_languages > 1)) + assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None) + """ + if len(self.audiopaths_and_text[0]) < expected_columns: + raise ValueError(f'Expected {expected_columns} columns in audiopaths file. 
' + 'The format is |[|][|]') + """ + if len(self.audiopaths_and_text[0]) > expected_columns: + print('WARNING: Audiopaths file has more columns than expected') + + to_tensor = lambda x: torch.Tensor([x]) if type(x) is float else x + self.pitch_mean = to_tensor(pitch_mean) + self.pitch_std = to_tensor(pitch_std) + + def __getitem__(self, index): + # Separate filename and text + # ANT: added language, assume that if language is present, speaker labels are too + # print(self.n_speakers, self.n_languages) ############################ + if self.n_speakers > 1 and self.n_languages > 1: + audiopath, *extra, text, speaker, language = self.audiopaths_and_text[index] + speaker = int(speaker) + language = int(language) + # print("spkr", speaker, "lang",language) ############################ + + elif self.n_speakers >1: + audiopath, *extra, text, speaker = self.audiopaths_and_text[index] + speaker = int(speaker) + # print(speaker) ############################ + else: + audiopath, *extra, text = self.audiopaths_and_text[index] + speaker = None + language = None + + mel = self.get_mel(audiopath) + text = self.get_text(text) + # print(text) + pitch = self.get_pitch(index, mel.size(-1)) + ## ANT: if external pitch extraction is used, n_frames may be one off due to rounding differences + if pitch.size(-1) != mel.size(-1): ############################ + print(pitch.shape, mel.shape, audiopath) ############################ + if pitch.size(-1) < mel.size(-1): + mel = mel[:, :pitch.size(-1)] + else: + pitch = pitch[:,:mel.size(-1)] #### + + energy = torch.norm(mel.float(), dim=0, p=2) + attn_prior = self.get_prior(index, mel.shape[1], text.shape[0]) + + + assert pitch.size(-1) == mel.size(-1) + + # No higher formants? + if len(pitch.size()) == 1: + pitch = pitch[None, :] + + + return (text, mel, len(text), pitch, energy, speaker, language, attn_prior, + audiopath) + + def __len__(self): + return len(self.audiopaths_and_text) + + def get_mel(self, filename): + if not self.load_mel_from_disk: + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.stft.sampling_rate: + print(filename) + raise ValueError("{} SR doesn't match target {} SR".format( + sampling_rate, self.stft.sampling_rate)) + audio_norm = audio / self.max_wav_value + audio_norm = audio_norm.unsqueeze(0) + audio_norm = torch.autograd.Variable(audio_norm, + requires_grad=False) + melspec = self.stft.mel_spectrogram(audio_norm) + melspec = torch.squeeze(melspec, 0) + else: + raise Exception(filename) + melspec = torch.load(filename) + assert melspec.size(0) == self.stft.n_mel_channels, ( + 'Mel dimension mismatch: given {}, expected {}'.format( + melspec.size(0), self.stft.n_mel_channels)) + +################ Plotting mels ######################################## + """ + import matplotlib.pyplot as plt + # plt.imshow(melspec.detach().cpu().T,aspect="auto") + fig, ax1 = plt.subplots(ncols=1) + pos = ax1.imshow(melspec.cpu().numpy().T,aspect="auto") + fig.colorbar(pos, ax=ax1) + plt.show() + """ +####################################################################### + + return melspec + + def get_text(self, text): + text = self.tp.encode_text(text) + space = [self.tp.encode_text("A A")[1]] + + if self.prepend_space_to_text: + text = space + text + print("prepending") + if self.append_space_to_text: + text = text + space + print("appending") + return torch.LongTensor(text) + + def get_prior(self, index, mel_len, text_len): + + if self.use_betabinomial_interpolator: + return torch.from_numpy(self.betabinomial_interpolator(mel_len, 
+ text_len)) + + if self.betabinomial_tmp_dir is not None: + audiopath, *_ = self.audiopaths_and_text[index] + fname = Path(audiopath).relative_to(self.dataset_path) + fname = fname.with_suffix('.pt') + cached_fpath = Path(self.betabinomial_tmp_dir, fname) + + if cached_fpath.is_file(): + return torch.load(cached_fpath) + + attn_prior = beta_binomial_prior_distribution(text_len, mel_len) + + if self.betabinomial_tmp_dir is not None: + cached_fpath.parent.mkdir(parents=True, exist_ok=True) + torch.save(attn_prior, cached_fpath) + + return attn_prior + + def get_pitch(self, index, mel_len=None): + audiopath, *fields = self.audiopaths_and_text[index] + + # ANT: spk is not used but I'll let it be + if self.n_speakers > 1 and self.n_languages > 1: + spk = spk = int(fields[-2]) + elif self.n_speakers > 1: + spk = int(fields[-1]) + else: + spk = 0 + + if self.load_pitch_from_disk: + pitchpath = fields[0] + pitch = torch.load(pitchpath) + if self.pitch_mean is not None: + assert self.pitch_std is not None + pitch = normalize_pitch(pitch, self.pitch_mean, self.pitch_std) + return pitch + + if self.pitch_tmp_dir is not None: + fname = Path(audiopath).relative_to(self.dataset_path) + fname_method = fname.with_suffix('.pt') + cached_fpath = Path(self.pitch_tmp_dir, fname_method) + if cached_fpath.is_file(): + return torch.load(cached_fpath) + + # No luck so far - calculate + wav = audiopath + if not wav.endswith('.wav'): + wav = re.sub('/mels/', '/wavs/', wav) + wav = re.sub('.pt$', '.wav', wav) + + pitch_mel = estimate_pitch(wav, mel_len, self.f0_method, + self.pitch_mean, self.pitch_std) + + if self.pitch_tmp_dir is not None and not cached_fpath.is_file(): + cached_fpath.parent.mkdir(parents=True, exist_ok=True) + torch.save(pitch_mel, cached_fpath) + + return pitch_mel + + +class TTSCollate: + """Zero-pads model inputs and targets based on number of frames per step""" + + def __call__(self, batch): + """Collate training batch from normalized text and mel-spec""" + # Right zero-pad all one-hot text sequences to max input length + input_lengths, ids_sorted_decreasing = torch.sort( + torch.LongTensor([len(x[0]) for x in batch]), + dim=0, descending=True) + max_input_len = input_lengths[0] + + text_padded = torch.LongTensor(len(batch), max_input_len) + text_padded.zero_() + for i in range(len(ids_sorted_decreasing)): + text = batch[ids_sorted_decreasing[i]][0] + text_padded[i, :text.size(0)] = text + + # Right zero-pad mel-spec + num_mels = batch[0][1].size(0) + max_target_len = max([x[1].size(1) for x in batch]) + + # Include mel padded and gate padded + mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len) + mel_padded.zero_() + output_lengths = torch.LongTensor(len(batch)) + for i in range(len(ids_sorted_decreasing)): + mel = batch[ids_sorted_decreasing[i]][1] + mel_padded[i, :, :mel.size(1)] = mel + output_lengths[i] = mel.size(1) + + n_formants = batch[0][3].shape[0] + pitch_padded = torch.zeros(mel_padded.size(0), n_formants, + mel_padded.size(2), dtype=batch[0][3].dtype) + energy_padded = torch.zeros_like(pitch_padded[:, 0, :]) + + for i in range(len(ids_sorted_decreasing)): + pitch = batch[ids_sorted_decreasing[i]][3] + energy = batch[ids_sorted_decreasing[i]][4] + pitch_padded[i, :, :pitch.shape[1]] = pitch + energy_padded[i, :energy.shape[0]] = energy + + if batch[0][5] is not None: + speaker = torch.zeros_like(input_lengths) + for i in range(len(ids_sorted_decreasing)): + speaker[i] = batch[ids_sorted_decreasing[i]][5] + else: + speaker = None + #ANT: added language here and 
increased the attn_prior and audiopaths index by 1 + if batch[0][6] is not None: + language = torch.zeros_like(input_lengths) + for i in range(len(ids_sorted_decreasing)): + language[i] = batch[ids_sorted_decreasing[i]][6] + else: + language = None + attn_prior_padded = torch.zeros(len(batch), max_target_len, + max_input_len) + attn_prior_padded.zero_() + for i in range(len(ids_sorted_decreasing)): + prior = batch[ids_sorted_decreasing[i]][7] + attn_prior_padded[i, :prior.size(0), :prior.size(1)] = prior + + # Count number of items - characters in text + len_x = [x[2] for x in batch] + len_x = torch.Tensor(len_x) + + audiopaths = [batch[i][8] for i in ids_sorted_decreasing] + + return (text_padded, input_lengths, mel_padded, output_lengths, len_x, + pitch_padded, energy_padded, speaker, language, attn_prior_padded, + audiopaths) + + +def batch_to_gpu(batch): + # ANT: added language here too + (text_padded, input_lengths, mel_padded, output_lengths, len_x, + pitch_padded, energy_padded, speaker, language, attn_prior, audiopaths) = batch + + text_padded = to_gpu(text_padded).long() + input_lengths = to_gpu(input_lengths).long() + mel_padded = to_gpu(mel_padded).float() + output_lengths = to_gpu(output_lengths).long() + pitch_padded = to_gpu(pitch_padded).float() + energy_padded = to_gpu(energy_padded).float() + attn_prior = to_gpu(attn_prior).float() + if speaker is not None: + speaker = to_gpu(speaker).long() + if language is not None: + language = to_gpu(language).long() + # Alignments act as both inputs and targets - pass shallow copies + x = [text_padded, input_lengths, mel_padded, output_lengths, + pitch_padded, energy_padded, speaker, language, attn_prior, audiopaths] + y = [mel_padded, input_lengths, output_lengths] + len_x = torch.sum(output_lengths) + # print(output_lengths) + return (x, y, len_x) diff --git a/fastpitch/loss_function.py b/fastpitch/loss_function.py new file mode 100644 index 0000000000000000000000000000000000000000..0cd3775e56723226c2a13f08fb0146d0ee49a033 --- /dev/null +++ b/fastpitch/loss_function.py @@ -0,0 +1,112 @@ +# ***************************************************************************** +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +import torch +import torch.nn.functional as F +from torch import nn + +from common.utils import mask_from_lens +from fastpitch.attn_loss_function import AttentionCTCLoss + + +class FastPitchLoss(nn.Module): + def __init__(self, dur_predictor_loss_scale=1.0, + pitch_predictor_loss_scale=1.0, attn_loss_scale=1.0, + energy_predictor_loss_scale=0.1): + super(FastPitchLoss, self).__init__() + self.dur_predictor_loss_scale = dur_predictor_loss_scale + self.pitch_predictor_loss_scale = pitch_predictor_loss_scale + self.energy_predictor_loss_scale = energy_predictor_loss_scale + self.attn_loss_scale = attn_loss_scale + self.attn_ctc_loss = AttentionCTCLoss() + + def forward(self, model_out, targets, is_training=True, meta_agg='mean'): + (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, + energy_pred, energy_tgt, attn_soft, attn_hard, attn_dur, + attn_logprob) = model_out + + (mel_tgt, in_lens, out_lens) = targets + + dur_tgt = attn_dur + dur_lens = in_lens + + mel_tgt.requires_grad = False + # (B,H,T) => (B,T,H) + mel_tgt = mel_tgt.transpose(1, 2) + + dur_mask = mask_from_lens(dur_lens, max_len=dur_tgt.size(1)) + log_dur_tgt = torch.log(dur_tgt.float() + 1) + loss_fn = F.mse_loss + dur_pred_loss = loss_fn(log_dur_pred, log_dur_tgt, reduction='none') + dur_pred_loss = (dur_pred_loss * dur_mask).sum() / dur_mask.sum() + + ldiff = mel_tgt.size(1) - mel_out.size(1) + mel_out = F.pad(mel_out, (0, 0, 0, ldiff, 0, 0), value=0.0) + mel_mask = mel_tgt.ne(0).float() + loss_fn = F.mse_loss + mel_loss = loss_fn(mel_out, mel_tgt, reduction='none') + mel_loss = (mel_loss * mel_mask).sum() / mel_mask.sum() + + ldiff = pitch_tgt.size(2) - pitch_pred.size(2) + pitch_pred = F.pad(pitch_pred, (0, ldiff, 0, 0, 0, 0), value=0.0) + pitch_loss = F.mse_loss(pitch_tgt, pitch_pred, reduction='none') + pitch_loss = (pitch_loss * dur_mask.unsqueeze(1)).sum() / dur_mask.sum() + + if energy_pred is not None: + energy_pred = F.pad(energy_pred, (0, ldiff, 0, 0), value=0.0) + energy_loss = F.mse_loss(energy_tgt, energy_pred, reduction='none') + energy_loss = (energy_loss * dur_mask).sum() / dur_mask.sum() + else: + energy_loss = 0 + + # Attention loss + attn_loss = self.attn_ctc_loss(attn_logprob, in_lens, out_lens) + + loss = (mel_loss + + dur_pred_loss * self.dur_predictor_loss_scale + + pitch_loss * self.pitch_predictor_loss_scale + + energy_loss * self.energy_predictor_loss_scale + + attn_loss * self.attn_loss_scale) + + meta = { + 'loss': loss.clone().detach(), + 'mel_loss': mel_loss.clone().detach(), + 'duration_predictor_loss': dur_pred_loss.clone().detach(), + 'pitch_loss': pitch_loss.clone().detach(), + 'attn_loss': attn_loss.clone().detach(), + 'dur_error': (torch.abs(dur_pred - dur_tgt).sum() + / dur_mask.sum()).detach(), + } + + if energy_pred is not None: + meta['energy_loss'] = energy_loss.clone().detach() + + assert meta_agg in ('sum', 'mean') + if meta_agg == 'sum': + bsz = mel_out.size(0) + meta = {k: v * 
bsz for k, v in meta.items()} + return loss, meta diff --git a/fastpitch/model.py b/fastpitch/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c54afee6cc45df7f6bb3250ce6f40e7336ffafef --- /dev/null +++ b/fastpitch/model.py @@ -0,0 +1,419 @@ +# ***************************************************************************** +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ***************************************************************************** + +from typing import Optional + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from common import filter_warnings +from common.layers import ConvReLUNorm +from common.utils import mask_from_lens +from fastpitch.alignment import b_mas, mas_width1 +from fastpitch.attention import ConvAttention +from fastpitch.transformer import FFTransformer + + +def regulate_len(durations, enc_out, pace: float = 1.0, + mel_max_len: Optional[int] = None): + """If target=None, then predicted durations are applied""" + dtype = enc_out.dtype + reps = durations.float() / pace + reps = (reps + 0.5).long() + dec_lens = reps.sum(dim=1) + + max_len = dec_lens.max() + reps_cumsum = torch.cumsum(F.pad(reps, (1, 0, 0, 0), value=0.0), + dim=1)[:, None, :] + reps_cumsum = reps_cumsum.to(dtype) + + range_ = torch.arange(max_len, device=enc_out.device)[None, :, None] + mult = ((reps_cumsum[:, :, :-1] <= range_) & + (reps_cumsum[:, :, 1:] > range_)) + mult = mult.to(dtype) + enc_rep = torch.matmul(mult, enc_out) + + if mel_max_len is not None: + enc_rep = enc_rep[:, :mel_max_len] + dec_lens = torch.clamp_max(dec_lens, mel_max_len) + return enc_rep, dec_lens + + +def average_pitch(pitch, durs): + durs_cums_ends = torch.cumsum(durs, dim=1).long() + durs_cums_starts = F.pad(durs_cums_ends[:, :-1], (1, 0)) + pitch_nonzero_cums = F.pad(torch.cumsum(pitch != 0.0, dim=2), (1, 0)) + pitch_cums = F.pad(torch.cumsum(pitch, dim=2), (1, 0)) + + bs, l = durs_cums_ends.size() + n_formants = pitch.size(1) + dcs = durs_cums_starts[:, None, :].expand(bs, n_formants, l) + dce = durs_cums_ends[:, None, :].expand(bs, n_formants, l) + + pitch_sums = (torch.gather(pitch_cums, 2, dce) + - torch.gather(pitch_cums, 2, dcs)).float() + pitch_nelems = (torch.gather(pitch_nonzero_cums, 2, dce) + - torch.gather(pitch_nonzero_cums, 2, dcs)).float() + + pitch_avg = torch.where(pitch_nelems == 0.0, pitch_nelems, + pitch_sums / pitch_nelems) + return pitch_avg + + +class TemporalPredictor(nn.Module): + """Predicts a single float per each temporal location""" + + def __init__(self, input_size, filter_size, kernel_size, dropout, + n_layers=2, n_predictions=1): + super(TemporalPredictor, self).__init__() + + self.layers = nn.Sequential(*[ + ConvReLUNorm(input_size if i == 0 else filter_size, filter_size, + kernel_size=kernel_size, dropout=dropout) + for i in range(n_layers)] + ) + self.n_predictions = n_predictions + self.fc = nn.Linear(filter_size, self.n_predictions, bias=True) + + def forward(self, enc_out, enc_out_mask): + out = enc_out * enc_out_mask + out = self.layers(out.transpose(1, 2)).transpose(1, 2) + out = self.fc(out) * enc_out_mask + return out + + +class FastPitch(nn.Module): + def __init__(self, n_mel_channels, n_symbols, padding_idx, + symbols_embedding_dim, in_fft_n_layers, in_fft_n_heads, + in_fft_d_head, + in_fft_conv1d_kernel_size, in_fft_conv1d_filter_size, + in_fft_output_size, + p_in_fft_dropout, p_in_fft_dropatt, p_in_fft_dropemb, + out_fft_n_layers, out_fft_n_heads, out_fft_d_head, + out_fft_conv1d_kernel_size, out_fft_conv1d_filter_size, + out_fft_output_size, + p_out_fft_dropout, p_out_fft_dropatt, p_out_fft_dropemb, + dur_predictor_kernel_size, dur_predictor_filter_size, + p_dur_predictor_dropout, dur_predictor_n_layers, + pitch_predictor_kernel_size, pitch_predictor_filter_size, + p_pitch_predictor_dropout, pitch_predictor_n_layers, + pitch_embedding_kernel_size, + energy_conditioning, + 
energy_predictor_kernel_size, energy_predictor_filter_size, + p_energy_predictor_dropout, energy_predictor_n_layers, + energy_embedding_kernel_size, + n_speakers, speaker_emb_weight, n_languages, pitch_conditioning_formants=1): + super(FastPitch, self).__init__() + + self.encoder = FFTransformer( + n_layer=in_fft_n_layers, n_head=in_fft_n_heads, + d_model=symbols_embedding_dim, + d_head=in_fft_d_head, + d_inner=in_fft_conv1d_filter_size, + kernel_size=in_fft_conv1d_kernel_size, + dropout=p_in_fft_dropout, + dropatt=p_in_fft_dropatt, + dropemb=p_in_fft_dropemb, + embed_input=True, + d_embed=symbols_embedding_dim, + n_embed=n_symbols, + padding_idx=padding_idx) + + if n_speakers > 1: + print(n_speakers, "### Is the number of speakers in this model ###") ################################################ + self.speaker_emb = nn.Embedding(n_speakers, symbols_embedding_dim) + else: + self.speaker_emb = None + + self.speaker_emb_weight = speaker_emb_weight + + #ANT: added language embedding + if n_languages > 1: + print(n_languages, "### Is the number of languages in this model ###") ################################################ + self.language_emb = nn.Embedding(n_languages, symbols_embedding_dim) + else: + self.language_emb = None + + + self.duration_predictor = TemporalPredictor( + in_fft_output_size, + filter_size=dur_predictor_filter_size, + kernel_size=dur_predictor_kernel_size, + dropout=p_dur_predictor_dropout, n_layers=dur_predictor_n_layers + ) + + self.decoder = FFTransformer( + n_layer=out_fft_n_layers, n_head=out_fft_n_heads, + d_model=symbols_embedding_dim, + d_head=out_fft_d_head, + d_inner=out_fft_conv1d_filter_size, + kernel_size=out_fft_conv1d_kernel_size, + dropout=p_out_fft_dropout, + dropatt=p_out_fft_dropatt, + dropemb=p_out_fft_dropemb, + embed_input=False, + d_embed=symbols_embedding_dim + ) + + self.pitch_predictor = TemporalPredictor( + in_fft_output_size, + filter_size=pitch_predictor_filter_size, + kernel_size=pitch_predictor_kernel_size, + dropout=p_pitch_predictor_dropout, n_layers=pitch_predictor_n_layers, + n_predictions=pitch_conditioning_formants + ) + + self.pitch_emb = nn.Conv1d( + pitch_conditioning_formants, symbols_embedding_dim, + kernel_size=pitch_embedding_kernel_size, + padding=int((pitch_embedding_kernel_size - 1) / 2)) + + # Store values precomputed for training data within the model + self.register_buffer('pitch_mean', torch.zeros(1)) + self.register_buffer('pitch_std', torch.zeros(1)) + + self.energy_conditioning = energy_conditioning + if energy_conditioning: + self.energy_predictor = TemporalPredictor( + in_fft_output_size, + filter_size=energy_predictor_filter_size, + kernel_size=energy_predictor_kernel_size, + dropout=p_energy_predictor_dropout, + n_layers=energy_predictor_n_layers, + n_predictions=1 + ) + + self.energy_emb = nn.Conv1d( + 1, symbols_embedding_dim, + kernel_size=energy_embedding_kernel_size, + padding=int((energy_embedding_kernel_size - 1) / 2)) + + self.proj = nn.Linear(out_fft_output_size, n_mel_channels, bias=True) + + self.attention = ConvAttention( + n_mel_channels, 0, symbols_embedding_dim, + use_query_proj=True, align_query_enc_type='3xconv') + + def binarize_attention(self, attn, in_lens, out_lens): + """For training purposes only. Binarizes attention with MAS. + These will no longer recieve a gradient. 
+ + Args: + attn: B x 1 x max_mel_len x max_text_len + """ + b_size = attn.shape[0] + with torch.no_grad(): + attn_out_cpu = np.zeros(attn.data.shape, dtype=np.float32) + log_attn_cpu = torch.log(attn.data).to(device='cpu', dtype=torch.float32) + log_attn_cpu = log_attn_cpu.numpy() + out_lens_cpu = out_lens.cpu() + in_lens_cpu = in_lens.cpu() + for ind in range(b_size): + hard_attn = mas_width1( + log_attn_cpu[ind, 0, :out_lens_cpu[ind], :in_lens_cpu[ind]]) + attn_out_cpu[ind, 0, :out_lens_cpu[ind], :in_lens_cpu[ind]] = hard_attn + attn_out = torch.tensor( + attn_out_cpu, device=attn.get_device(), dtype=attn.dtype) + return attn_out + + def binarize_attention_parallel(self, attn, in_lens, out_lens): + """For training purposes only. Binarizes attention with MAS. + These will no longer recieve a gradient. + + Args: + attn: B x 1 x max_mel_len x max_text_len + """ + with torch.no_grad(): + log_attn_cpu = torch.log(attn.data).cpu().numpy() + attn_out = b_mas(log_attn_cpu, in_lens.cpu().numpy(), + out_lens.cpu().numpy(), width=1) + return torch.from_numpy(attn_out).to(attn.get_device()) + + def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75): + #ANT: added language + (inputs, input_lens, mel_tgt, mel_lens, pitch_dense, energy_dense, + speaker, language, attn_prior, audiopaths) = inputs + + text_max_len = inputs.size(1) + mel_max_len = mel_tgt.size(2) + + # Calculate speaker embedding + conditionings = [] + if self.speaker_emb is None: + spk_emb = 0 + else: + spk_emb = self.speaker_emb(speaker).unsqueeze(1) + spk_emb.mul_(self.speaker_emb_weight) + conditionings.append(spk_emb) + # ANT: added language + if self.language_emb is None: + language_emb = 0 + else: + language_emb = self.language_emb(language).unsqueeze(1) + conditionings.append(language_emb) + + + # Input FFT + #enc_out, enc_mask = self.encoder(inputs, conditioning=[]) + enc_out, enc_mask = self.encoder(inputs, conditioning=conditionings) + + # Predict durations + log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1) + dur_pred = torch.clamp(torch.exp(log_dur_pred) - 1, 0, max_duration) + + # Predict pitch + pitch_pred = self.pitch_predictor(enc_out, enc_mask).permute(0, 2, 1) + + # Alignment + text_emb = self.encoder.word_emb(inputs) + + # make sure to do the alignments before folding + attn_mask = mask_from_lens(input_lens, max_len=text_max_len) + attn_mask = attn_mask[..., None] == 0 + # attn_mask should be 1 for unused timesteps in the text_enc_w_spkvec tensor + + attn_soft, attn_logprob = self.attention( + mel_tgt, text_emb.permute(0, 2, 1), mel_lens, attn_mask, + key_lens=input_lens, keys_encoded=enc_out, attn_prior=attn_prior) + + attn_hard = self.binarize_attention(attn_soft, input_lens, mel_lens) + + # Viterbi --> durations + attn_hard_dur = attn_hard.sum(2)[:, 0, :] + dur_tgt = attn_hard_dur + + if not torch.all(torch.eq(dur_tgt.sum(dim=1), mel_lens)): + print(audiopaths,input_lens,dur_tgt.sum(dim=1), mel_lens) + + assert torch.all(torch.eq(dur_tgt.sum(dim=1), mel_lens)) + + # Average pitch over characters + pitch_tgt = average_pitch(pitch_dense, dur_tgt) + + if use_gt_pitch and pitch_tgt is not None: + pitch_emb = self.pitch_emb(pitch_tgt) + else: + pitch_emb = self.pitch_emb(pitch_pred) + enc_out = enc_out + pitch_emb.transpose(1, 2) + + # Predict energy + if self.energy_conditioning: + energy_pred = self.energy_predictor(enc_out, enc_mask).squeeze(-1) + + # Average energy over characters + energy_tgt = average_pitch(energy_dense.unsqueeze(1), dur_tgt) + energy_tgt = torch.log(1.0 + 
energy_tgt) + + energy_emb = self.energy_emb(energy_tgt) + energy_tgt = energy_tgt.squeeze(1) + enc_out = enc_out + energy_emb.transpose(1, 2) + else: + energy_pred = None + energy_tgt = None + + len_regulated, dec_lens = regulate_len( + dur_tgt, enc_out, pace, mel_max_len) + + # Output FFT + dec_out, dec_mask = self.decoder(len_regulated, dec_lens) + mel_out = self.proj(dec_out) + return (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, + pitch_tgt, energy_pred, energy_tgt, attn_soft, attn_hard, + attn_hard_dur, attn_logprob) + + def infer(self, inputs, pace=1.0, dur_tgt=None, pitch_tgt=None, + energy_tgt=None, pitch_transform=None, max_duration=75, + speaker=0, language=0, speaker_weight=1.0, language_weight=1.0): + + if self.speaker_emb is None: + spk_emb = 0 + else: + print("using speaker embeddings") + speaker = (torch.ones(inputs.size(0)).long().to(inputs.device) + * speaker) + spk_emb = self.speaker_emb(speaker).unsqueeze(1) + print("spkr weight", speaker_weight) + spk_emb = spk_emb *speaker_weight + # ANT: added language + if self.language_emb is None: + language_emb = 0 + else: + print("using language embeddings") + language = (torch.ones(inputs.size(0)).long().to(inputs.device) + * language) + language_emb = self.language_emb(language).unsqueeze(1) + language_emb = language_emb * language_weight + # Input FFT + enc_out, enc_mask = self.encoder(inputs, conditioning=[spk_emb, language_emb]) + + # Predict durations + log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1) + dur_pred = torch.clamp(torch.exp(log_dur_pred) - 1, 0, max_duration) + + # Pitch over chars + pitch_pred = self.pitch_predictor(enc_out, enc_mask).permute(0, 2, 1) + + if pitch_transform is not None: + if self.pitch_std[0] == 0.0: + # XXX LJSpeech-1.1 defaults + mean, std = 218.14, 67.24 + else: + mean, std = self.pitch_mean[0], self.pitch_std[0] + pitch_pred = pitch_transform(pitch_pred, enc_mask.sum(dim=(1,2)), + mean, std) + if pitch_tgt is None: + pitch_emb = self.pitch_emb(pitch_pred).transpose(1, 2) + else: + pitch_emb = self.pitch_emb(pitch_tgt).transpose(1, 2) + + enc_out = enc_out + pitch_emb + + # Predict energy + if self.energy_conditioning: + + if energy_tgt is None: + energy_pred = self.energy_predictor(enc_out, enc_mask).squeeze(-1) + energy_emb = self.energy_emb(energy_pred.unsqueeze(1)).transpose(1, 2) + else: + energy_emb = self.energy_emb(energy_tgt).transpose(1, 2) + + enc_out = enc_out + energy_emb + else: + energy_pred = None + + len_regulated, dec_lens = regulate_len( + dur_pred if dur_tgt is None else dur_tgt, + enc_out, pace, mel_max_len=None) + + dec_out, dec_mask = self.decoder(len_regulated, dec_lens) + mel_out = self.proj(dec_out) + # mel_lens = dec_mask.squeeze(2).sum(axis=1).long() + mel_out = mel_out.permute(0, 2, 1) # For inference.py + return mel_out, dec_lens, dur_pred, pitch_pred, energy_pred diff --git a/fastpitch/model_jit.py b/fastpitch/model_jit.py new file mode 100644 index 0000000000000000000000000000000000000000..0e1d437007664bce75be36aaf6cea257459f934f --- /dev/null +++ b/fastpitch/model_jit.py @@ -0,0 +1,216 @@ +# ***************************************************************************** +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +from typing import Optional + +import torch +from torch import nn as nn + +from common import filter_warnings +from fastpitch.model import TemporalPredictor +from fastpitch.transformer_jit import FFTransformer + + +def regulate_len(durations, enc_out, pace: float = 1.0, + mel_max_len: Optional[int] = None): + """If target=None, then predicted durations are applied""" + reps = torch.round(durations.float() / pace).long() + dec_lens = reps.sum(dim=1) + + max_len = dec_lens.max() + bsz, _, hid = enc_out.size() + + reps_padded = torch.cat([reps, (max_len - dec_lens)[:, None]], dim=1) + pad_vec = torch.zeros(bsz, 1, hid, dtype=enc_out.dtype, + device=enc_out.device) + + enc_rep = torch.cat([enc_out, pad_vec], dim=1) + enc_rep = torch.repeat_interleave( + enc_rep.view(-1, hid), reps_padded.view(-1), dim=0 + ).view(bsz, -1, hid) + + if mel_max_len is not None: + enc_rep = enc_rep[:, :mel_max_len] + dec_lens = torch.clamp_max(dec_lens, mel_max_len) + return enc_rep, dec_lens + + +class FastPitchJIT(nn.Module): + __constants__ = ['energy_conditioning'] + def __init__(self, n_mel_channels, n_symbols, padding_idx, + symbols_embedding_dim, in_fft_n_layers, in_fft_n_heads, + in_fft_d_head, + in_fft_conv1d_kernel_size, in_fft_conv1d_filter_size, + in_fft_output_size, + p_in_fft_dropout, p_in_fft_dropatt, p_in_fft_dropemb, + out_fft_n_layers, out_fft_n_heads, out_fft_d_head, + out_fft_conv1d_kernel_size, out_fft_conv1d_filter_size, + out_fft_output_size, + p_out_fft_dropout, p_out_fft_dropatt, p_out_fft_dropemb, + dur_predictor_kernel_size, dur_predictor_filter_size, + p_dur_predictor_dropout, dur_predictor_n_layers, + pitch_predictor_kernel_size, pitch_predictor_filter_size, + p_pitch_predictor_dropout, pitch_predictor_n_layers, + pitch_embedding_kernel_size, + energy_conditioning, + energy_predictor_kernel_size, energy_predictor_filter_size, + p_energy_predictor_dropout, energy_predictor_n_layers, + energy_embedding_kernel_size, + n_speakers, speaker_emb_weight, pitch_conditioning_formants=1): + super(FastPitchJIT, self).__init__() + + self.encoder = FFTransformer( + n_layer=in_fft_n_layers, n_head=in_fft_n_heads, + d_model=symbols_embedding_dim, + d_head=in_fft_d_head, + d_inner=in_fft_conv1d_filter_size, + kernel_size=in_fft_conv1d_kernel_size, + 
dropout=p_in_fft_dropout, + dropatt=p_in_fft_dropatt, + dropemb=p_in_fft_dropemb, + embed_input=True, + d_embed=symbols_embedding_dim, + n_embed=n_symbols, + padding_idx=padding_idx) + + if n_speakers > 1: + self.speaker_emb = nn.Embedding(n_speakers, symbols_embedding_dim) + else: + self.speaker_emb = None + self.speaker_emb_weight = speaker_emb_weight + + self.duration_predictor = TemporalPredictor( + in_fft_output_size, + filter_size=dur_predictor_filter_size, + kernel_size=dur_predictor_kernel_size, + dropout=p_dur_predictor_dropout, n_layers=dur_predictor_n_layers + ) + + self.decoder = FFTransformer( + n_layer=out_fft_n_layers, n_head=out_fft_n_heads, + d_model=symbols_embedding_dim, + d_head=out_fft_d_head, + d_inner=out_fft_conv1d_filter_size, + kernel_size=out_fft_conv1d_kernel_size, + dropout=p_out_fft_dropout, + dropatt=p_out_fft_dropatt, + dropemb=p_out_fft_dropemb, + embed_input=False, + d_embed=symbols_embedding_dim + ) + + self.pitch_predictor = TemporalPredictor( + in_fft_output_size, + filter_size=pitch_predictor_filter_size, + kernel_size=pitch_predictor_kernel_size, + dropout=p_pitch_predictor_dropout, n_layers=pitch_predictor_n_layers, + n_predictions=pitch_conditioning_formants + ) + + self.pitch_emb = nn.Conv1d( + pitch_conditioning_formants, symbols_embedding_dim, + kernel_size=pitch_embedding_kernel_size, + padding=int((pitch_embedding_kernel_size - 1) / 2)) + + # Store values precomputed for training data within the model + self.register_buffer('pitch_mean', torch.zeros(1)) + self.register_buffer('pitch_std', torch.zeros(1)) + + self.energy_conditioning = energy_conditioning + if energy_conditioning: + self.energy_predictor = TemporalPredictor( + in_fft_output_size, + filter_size=energy_predictor_filter_size, + kernel_size=energy_predictor_kernel_size, + dropout=p_energy_predictor_dropout, + n_layers=energy_predictor_n_layers, + n_predictions=1 + ) + + self.energy_emb = nn.Conv1d( + 1, symbols_embedding_dim, + kernel_size=energy_embedding_kernel_size, + padding=int((energy_embedding_kernel_size - 1) / 2)) + + self.proj = nn.Linear(out_fft_output_size, n_mel_channels, bias=True) + + # skip self.attention (used only in training) + + def infer(self, inputs, pace: float = 1.0, + dur_tgt: Optional[torch.Tensor] = None, + pitch_tgt: Optional[torch.Tensor] = None, + energy_tgt: Optional[torch.Tensor] = None, + speaker: int = 0): + + if self.speaker_emb is None: + spk_emb = None + else: + speaker = (torch.ones(inputs.size(0)).long().to(inputs.device) + * speaker) + spk_emb = self.speaker_emb(speaker).unsqueeze(1) + spk_emb.mul_(self.speaker_emb_weight) + + # Input FFT + enc_out, enc_mask = self.encoder(inputs, conditioning=spk_emb) + + # Predict durations + log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1) + dur_pred = torch.clamp(torch.exp(log_dur_pred) - 1, 0, 100.0) + + # Pitch over chars + pitch_pred = self.pitch_predictor(enc_out, enc_mask).permute(0, 2, 1) + + if pitch_tgt is None: + pitch_emb = self.pitch_emb(pitch_pred).transpose(1, 2) + else: + pitch_emb = self.pitch_emb(pitch_tgt).transpose(1, 2) + + enc_out = enc_out + pitch_emb + + # Predict energy + if self.energy_conditioning: + + if energy_tgt is None: + energy_pred = self.energy_predictor(enc_out, enc_mask).squeeze(-1) + energy_emb = self.energy_emb(energy_pred.unsqueeze(1)).transpose(1, 2) + else: + energy_pred = None + energy_emb = self.energy_emb(energy_tgt).transpose(1, 2) + + enc_out = enc_out + energy_emb + else: + energy_pred = None + + len_regulated, dec_lens = regulate_len( + 
dur_pred if dur_tgt is None else dur_tgt, + enc_out, pace, mel_max_len=None) + + dec_out, dec_mask = self.decoder(len_regulated, dec_lens) + mel_out = self.proj(dec_out) + # mel_lens = dec_mask.squeeze(2).sum(axis=1).long() + mel_out = mel_out.permute(0, 2, 1) # For inference.py + return mel_out, dec_lens, dur_pred, pitch_pred, energy_pred diff --git a/fastpitch/pitch_transform.py b/fastpitch/pitch_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..4fbe0c0dc20836fd105d6e30d964be1a88388950 --- /dev/null +++ b/fastpitch/pitch_transform.py @@ -0,0 +1,47 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + + +def pitch_transform_custom(pitch, pitch_lens): + """Apply a custom pitch transformation to predicted pitch values. + + This sample modification linearly increases the pitch throughout + the utterance from 0.5 of predicted pitch to 1.5 of predicted pitch. + In other words, it starts low and ends high. + + PARAMS + ------ + pitch: torch.Tensor (bs, max_len) + Predicted pitch values for each lexical unit, padded to max_len (in Hz). + pitch_lens: torch.Tensor (bs, max_len) + Number of lexical units in each utterance. + + RETURNS + ------- + pitch: torch.Tensor + Modified pitch (in Hz). + """ + + weights = torch.arange(pitch.size(1), dtype=torch.float32, device=pitch.device) + + # The weights increase linearly from 0.0 to 1.0 in every i-th row + # in the range (0, pitch_lens[i]) + weights = weights.unsqueeze(0) / pitch_lens.unsqueeze(1) + + # Shift the range from (0.0, 1.0) to (0.5, 1.5) + weights += 0.5 + + return pitch * weights diff --git a/fastpitch/transformer.py b/fastpitch/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..31b11670897ff2052c5095f8b6e3cd08b3fb1002 --- /dev/null +++ b/fastpitch/transformer.py @@ -0,0 +1,218 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
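The `FastPitch.infer` path above exposes `pace`, `speaker`, `language`, `speaker_weight`, `language_weight` and an optional `pitch_transform` callable, which is invoked as `pitch_transform(pitch_pred, enc_mask.sum(dim=(1,2)), mean, std)`. Below is a minimal sketch of wiring a custom transform into that hook; `model` (a loaded FastPitch instance) and `text_ids` (a batch of encoded input symbols) are assumed to exist, and the constant shift is purely illustrative.

```python
import torch

def pitch_transform_shift(pitch, pitch_lens, mean, std):
    # Matches the four-argument call made inside FastPitch.infer.
    # pitch: (bs, n_formants, text_len) predicted pitch; its units depend on how
    # pitch was normalized at training time, so the +0.5 here is only a demo.
    return pitch + 0.5

with torch.no_grad():
    mel, dec_lens, dur_pred, pitch_pred, energy_pred = model.infer(
        text_ids,                       # assumed: LongTensor of symbol ids, (bs, text_len)
        pace=1.0,
        speaker=1, speaker_weight=1.0,
        language=0, language_weight=1.0,
        pitch_transform=pitch_transform_shift)
```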
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from common.utils import mask_from_lens + + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + self.demb = demb + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.matmul(torch.unsqueeze(pos_seq, -1), + torch.unsqueeze(self.inv_freq, 0)) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=1) + if bsz is not None: + return pos_emb[None, :, :].expand(bsz, -1, -1) + else: + return pos_emb[None, :, :] + + +class PositionwiseConvFF(nn.Module): + def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False): + super(PositionwiseConvFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Conv1d(d_model, d_inner, kernel_size, 1, (kernel_size // 2)), + nn.ReLU(), + # nn.Dropout(dropout), # worse convergence + nn.Conv1d(d_inner, d_model, kernel_size, 1, (kernel_size // 2)), + nn.Dropout(dropout), + ) + self.layer_norm = nn.LayerNorm(d_model) + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + return self._forward(inp) + + def _forward(self, inp): + if self.pre_lnorm: + # layer normalization + positionwise feed-forward + core_out = inp.transpose(1, 2) + core_out = self.CoreNet(self.layer_norm(core_out).to(inp.dtype)) + core_out = core_out.transpose(1, 2) + + # residual connection + output = core_out + inp + else: + # positionwise feed-forward + core_out = inp.transpose(1, 2) + core_out = self.CoreNet(core_out) + core_out = core_out.transpose(1, 2) + + # residual connection + layer normalization + output = self.layer_norm(inp + core_out).to(inp.dtype) + + return output + + +class MultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0.1, + pre_lnorm=False): + super(MultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.scale = 1 / (d_head ** 0.5) + self.pre_lnorm = pre_lnorm + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head) + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + self.layer_norm = nn.LayerNorm(d_model) + + def forward(self, inp, attn_mask=None): + return self._forward(inp, attn_mask) + + def _forward(self, inp, attn_mask=None): + residual = inp + + if self.pre_lnorm: + # layer normalization + inp = self.layer_norm(inp) + + n_head, d_head = self.n_head, self.d_head + + head_q, head_k, head_v = torch.chunk(self.qkv_net(inp), 3, dim=2) + head_q = head_q.view(inp.size(0), inp.size(1), n_head, d_head) + head_k = head_k.view(inp.size(0), inp.size(1), n_head, d_head) + head_v = head_v.view(inp.size(0), inp.size(1), n_head, d_head) + + q = head_q.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head) + k = head_k.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head) + v = head_v.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head) + + attn_score = torch.bmm(q, k.transpose(1, 2)) + attn_score.mul_(self.scale) + + if attn_mask is not None: + attn_mask = attn_mask.unsqueeze(1).to(attn_score.dtype) + attn_mask = attn_mask.repeat(n_head, attn_mask.size(2), 1) + attn_score.masked_fill_(attn_mask.to(torch.bool), -float('inf')) + + attn_prob = F.softmax(attn_score, dim=2) + attn_prob = self.dropatt(attn_prob) + attn_vec = torch.bmm(attn_prob, v) + + 
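+        # attn_vec is (n_head * bsz, seq_len, d_head) with the head index varying
+        # slowest (q, k and v were built with permute(2, 0, 1, 3) above); the
+        # view/permute below folds the heads back into a single
+        # (bsz, seq_len, n_head * d_head) tensor for the output projection.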
attn_vec = attn_vec.view(n_head, inp.size(0), inp.size(1), d_head) + attn_vec = attn_vec.permute(1, 2, 0, 3).contiguous().view( + inp.size(0), inp.size(1), n_head * d_head) + + # linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + # residual connection + output = residual + attn_out + else: + # residual connection + layer normalization + output = self.layer_norm(residual + attn_out) + + output = output.to(attn_out.dtype) + + return output + + +class TransformerLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, kernel_size, dropout, + **kwargs): + super(TransformerLayer, self).__init__() + + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseConvFF(d_model, d_inner, kernel_size, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, mask=None): + output = self.dec_attn(dec_inp, attn_mask=~mask.squeeze(2)) + output *= mask + output = self.pos_ff(output) + output *= mask + return output + + +class FFTransformer(nn.Module): + def __init__(self, n_layer, n_head, d_model, d_head, d_inner, kernel_size, + dropout, dropatt, dropemb=0.0, embed_input=True, + n_embed=None, d_embed=None, padding_idx=0, pre_lnorm=False): + super(FFTransformer, self).__init__() + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + self.padding_idx = padding_idx + + if embed_input: + self.word_emb = nn.Embedding(n_embed, d_embed or d_model, + padding_idx=self.padding_idx) + else: + self.word_emb = None + + self.pos_emb = PositionalEmbedding(self.d_model) + self.drop = nn.Dropout(dropemb) + self.layers = nn.ModuleList() + + for _ in range(n_layer): + self.layers.append( + TransformerLayer( + n_head, d_model, d_head, d_inner, kernel_size, dropout, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + # ANT: change conditioning to a list of conditionings + + def forward(self, dec_inp, seq_lens=None, conditioning=[]): + if self.word_emb is None: + inp = dec_inp + mask = mask_from_lens(seq_lens).unsqueeze(2) + else: + inp = self.word_emb(dec_inp) + # [bsz x L x 1] + mask = (dec_inp != self.padding_idx).unsqueeze(2) + + pos_seq = torch.arange(inp.size(1), device=inp.device).to(inp.dtype) + pos_emb = self.pos_emb(pos_seq) * mask + out = inp + pos_emb + # out = self.drop(inp+pos_emb) + # ANT: is this ok?, used to be out = self.drop(inp+pos_emb+c) + # should dropout be applied multiple times? + for c in conditioning: + out = out + c + out = self.drop(out) + for layer in self.layers: + out = layer(out, mask=mask) + + # out = self.drop(out) + return out, mask diff --git a/fastpitch/transformer_jit.py b/fastpitch/transformer_jit.py new file mode 100644 index 0000000000000000000000000000000000000000..7b0bb559425fee79b61a68cfe7392bdb8f089f06 --- /dev/null +++ b/fastpitch/transformer_jit.py @@ -0,0 +1,255 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
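The modified `FFTransformer.forward` above (see the `# ANT` comments) accepts a list of conditioning tensors rather than a single one: speaker and language embeddings of shape `(batch, 1, d_model)` are each broadcast-added over the time axis, and dropout is applied once after all of them have been summed in; the JIT variant that follows still takes a single optional tensor. A standalone sketch of that broadcast, with arbitrary sizes and random tensors standing in for the real embeddings:

```python
import torch
import torch.nn.functional as F

bsz, seq_len, d_model = 2, 7, 384              # arbitrary demo sizes
inp = torch.randn(bsz, seq_len, d_model)       # symbol embeddings
pos_emb = torch.randn(bsz, seq_len, d_model)   # (masked) positional embeddings
spk_emb = torch.randn(bsz, 1, d_model)         # speaker embedding * speaker weight
lang_emb = torch.randn(bsz, 1, d_model)        # language embedding * language weight

out = inp + pos_emb
for c in (spk_emb, lang_emb):                  # mirrors `for c in conditioning`
    out = out + c                              # broadcasts over the time axis
out = F.dropout(out, p=0.1, training=True)     # dropout applied once, after the sum

print(out.shape)                               # torch.Size([2, 7, 384])
```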
+ +from typing import List, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from common.utils import mask_from_lens + + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + self.demb = demb + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz: Optional[int] = None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=1) + if bsz is not None: + return pos_emb[None, :, :].expand(bsz, -1, -1) + else: + return pos_emb[None, :, :] + + +class PositionwiseFF(nn.Module): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False): + super(PositionwiseFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Linear(d_model, d_inner), nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(d_inner, d_model), + nn.Dropout(dropout), + ) + + self.layer_norm = nn.LayerNorm(d_model) + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + # layer normalization + positionwise feed-forward + core_out = self.CoreNet(self.layer_norm(inp)) + + # residual connection + output = core_out + inp + else: + # positionwise feed-forward + core_out = self.CoreNet(inp) + + # residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + + +class PositionwiseConvFF(nn.Module): + def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False): + super(PositionwiseConvFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Conv1d(d_model, d_inner, kernel_size, 1, (kernel_size // 2)), + nn.ReLU(), + # nn.Dropout(dropout), # worse convergence + nn.Conv1d(d_inner, d_model, kernel_size, 1, (kernel_size // 2)), + nn.Dropout(dropout), + ) + self.layer_norm = nn.LayerNorm(d_model) + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + # layer normalization + positionwise feed-forward + core_out = inp.transpose(1, 2) + core_out = self.CoreNet(self.layer_norm(core_out)) + core_out = core_out.transpose(1, 2) + + # residual connection + output = core_out + inp + else: + # positionwise feed-forward + core_out = inp.transpose(1, 2) + core_out = self.CoreNet(core_out) + core_out = core_out.transpose(1, 2) + + # residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + + +class MultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0.1, + pre_lnorm=False): + super(MultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.scale = 1 / (d_head ** 0.5) + self.dropout = dropout + self.pre_lnorm = pre_lnorm + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head) + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + self.layer_norm = nn.LayerNorm(d_model) + + + def forward(self, inp, attn_mask: Optional[torch.Tensor] = None): + residual = inp + + if self.pre_lnorm: + # layer normalization + inp = self.layer_norm(inp) + + n_head, d_head = self.n_head, self.d_head + + head_q, head_k, head_v = torch.chunk(self.qkv_net(inp), 3, dim=-1) + head_q = head_q.view(inp.size(0), inp.size(1), n_head, d_head) + head_k = 
head_k.view(inp.size(0), inp.size(1), n_head, d_head) + head_v = head_v.view(inp.size(0), inp.size(1), n_head, d_head) + + q = head_q.permute(0, 2, 1, 3).reshape(-1, inp.size(1), d_head) + k = head_k.permute(0, 2, 1, 3).reshape(-1, inp.size(1), d_head) + v = head_v.permute(0, 2, 1, 3).reshape(-1, inp.size(1), d_head) + + attn_score = torch.bmm(q, k.transpose(1, 2)) + attn_score.mul_(self.scale) + + if attn_mask is not None: + attn_mask = attn_mask.unsqueeze(1) + attn_mask = attn_mask.repeat(n_head, attn_mask.size(2), 1) + attn_score.masked_fill_(attn_mask, -float('inf')) + + attn_prob = F.softmax(attn_score, dim=2) + attn_prob = self.dropatt(attn_prob) + attn_vec = torch.bmm(attn_prob, v) + + attn_vec = attn_vec.view(n_head, inp.size(0), inp.size(1), d_head) + attn_vec = attn_vec.permute(1, 2, 0, 3).contiguous().view( + inp.size(0), inp.size(1), n_head * d_head) + + # linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + # residual connection + output = residual + attn_out + else: + # residual connection + layer normalization + + # XXX Running TorchScript on 20.02 and 20.03 containers crashes here + # XXX Works well with 20.01-py3 container. + # XXX dirty fix is: + # XXX output = self.layer_norm(residual + attn_out).half() + output = self.layer_norm(residual + attn_out) + + return output + + +class TransformerLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, kernel_size, dropout, + **kwargs): + super(TransformerLayer, self).__init__() + + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseConvFF(d_model, d_inner, kernel_size, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, mask): + output = self.dec_attn(dec_inp, attn_mask=~mask.squeeze(2)) + output *= mask + output = self.pos_ff(output) + output *= mask + return output + + +class FFTransformer(nn.Module): + def __init__(self, n_layer, n_head, d_model, d_head, d_inner, kernel_size, + dropout, dropatt, dropemb=0.0, embed_input=True, + n_embed=None, d_embed=None, padding_idx=0, pre_lnorm=False): + super(FFTransformer, self).__init__() + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + self.padding_idx = padding_idx + self.n_embed = n_embed + + self.embed_input = embed_input + if embed_input: + print(padding_idx) ######################################### + self.word_emb = nn.Embedding(n_embed, d_embed or d_model, + padding_idx=self.padding_idx) + else: + self.word_emb = nn.Identity() + + self.pos_emb = PositionalEmbedding(self.d_model) + self.drop = nn.Dropout(dropemb) + self.layers = nn.ModuleList() + + for _ in range(n_layer): + self.layers.append( + TransformerLayer( + n_head, d_model, d_head, d_inner, kernel_size, dropout, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + + def forward(self, dec_inp, seq_lens: Optional[torch.Tensor] = None, + conditioning: Optional[torch.Tensor] = None): + if not self.embed_input: + inp = dec_inp + assert seq_lens is not None + mask = mask_from_lens(seq_lens).unsqueeze(2) + else: + inp = self.word_emb(dec_inp) + # [bsz x L x 1] + mask = (dec_inp != self.padding_idx).unsqueeze(2) + + pos_seq = torch.arange(inp.size(1), device=inp.device, dtype=inp.dtype) + pos_emb = self.pos_emb(pos_seq) * mask + if conditioning is not None: + out = self.drop(inp + pos_emb + conditioning) + else: + out = self.drop(inp + pos_emb) + + for layer in self.layers: + out = layer(out, mask=mask) + + # out = self.drop(out) + return out, mask diff --git 
a/hifigan/__pycache__/arg_parser.cpython-39.pyc b/hifigan/__pycache__/arg_parser.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30d60733b739a3d51b381cfcfb1fbde4f4d22c98 Binary files /dev/null and b/hifigan/__pycache__/arg_parser.cpython-39.pyc differ diff --git a/hifigan/__pycache__/data_function.cpython-39.pyc b/hifigan/__pycache__/data_function.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81b34ee26aca207e17396bfe3908a50f9217d028 Binary files /dev/null and b/hifigan/__pycache__/data_function.cpython-39.pyc differ diff --git a/hifigan/__pycache__/models.cpython-37.pyc b/hifigan/__pycache__/models.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90138726308c705bfb2845270ec9bbc604c32620 Binary files /dev/null and b/hifigan/__pycache__/models.cpython-37.pyc differ diff --git a/hifigan/__pycache__/models.cpython-38.pyc b/hifigan/__pycache__/models.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff5527348d384a5657e2940f99cce747cb9a3388 Binary files /dev/null and b/hifigan/__pycache__/models.cpython-38.pyc differ diff --git a/hifigan/__pycache__/models.cpython-39.pyc b/hifigan/__pycache__/models.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34d94cb6aded7a8adb8c09249c3ab9d317fb7b40 Binary files /dev/null and b/hifigan/__pycache__/models.cpython-39.pyc differ diff --git a/hifigan/arg_parser.py b/hifigan/arg_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..784389ad89dfa02e19830b27a300072b52ed6b30 --- /dev/null +++ b/hifigan/arg_parser.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from ast import literal_eval + + +def parse_hifigan_args(parent, add_help=False): + """ + Parse model specific commandline arguments. 
+ """ + parser = argparse.ArgumentParser(parents=[parent], add_help=add_help, + allow_abbrev=False) + hfg = parser.add_argument_group('HiFi-GAN generator parameters') + hfg.add_argument('--upsample_rates', default=[8, 8, 2, 2], + type=literal_eval_arg, + help='Upsample rates') + hfg.add_argument('--upsample_kernel_sizes', default=[16, 16, 4, 4], + type=literal_eval_arg, + help='Upsample kernel sizes') + hfg.add_argument('--upsample_initial_channel', default=512, type=int, + help='Upsample initial channel') + hfg.add_argument('--resblock', default='1', type=str, + help='Resblock module version') + hfg.add_argument('--resblock_kernel_sizes', default=[3, 7, 11], + type=literal_eval_arg, + help='Resblock kernel sizes') + hfg.add_argument('--resblock_dilation_sizes', type=literal_eval_arg, + default=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + help='Resblock dilation sizes'), + + hfg = parser.add_argument_group('HiFi-GAN discriminator parameters') + hfg.add_argument('--mpd_periods', default=[2, 3, 5, 7, 11], + type=literal_eval_arg, + help='Periods of MultiPeriodDiscriminator') + hfg.add_argument('--concat_fwd', action='store_true', + help='Faster Discriminators (requires more GPU memory)') + hfg.add_argument('--hifigan-config', type=str, default=None, required=False, + help='Path to a HiFi-GAN config .json' + ' (if provided, overrides model architecture flags)') + return parser + + +def literal_eval_arg(val): + try: + return literal_eval(val) + except SyntaxError as e: # Argparse does not handle SyntaxError + raise ValueError(str(e)) from e diff --git a/hifigan/data_function.py b/hifigan/data_function.py new file mode 100644 index 0000000000000000000000000000000000000000..ac77c7bd67f6356adda1a29e8a2e2cdbd01e72ed --- /dev/null +++ b/hifigan/data_function.py @@ -0,0 +1,220 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MIT License +# +# Copyright (c) 2020 Jungil Kong +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# The following functions/classes were based on code from https://github.com/jik876/hifi-gan: +# mel_spectrogram, MelDataset + +import math +import os + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn +from librosa.util import normalize +from numpy import random +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from common.audio_processing import dynamic_range_compression +from common.utils import load_filepaths_and_text, load_wav + +MAX_WAV_VALUE = 32768.0 + +mel_basis = {} +hann_window = {} + + +def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, + fmin, fmax, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global mel_basis, hann_window + fmax_key = f'{fmax}_{y.device}' + if fmax_key not in mel_basis: + mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel_basis[fmax_key] = torch.from_numpy(mel).float().to(y.device) + hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) + + pad = int((n_fft-hop_size)/2) + y = F.pad(y.unsqueeze(1), (pad, pad), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, + window=hann_window[str(y.device)], center=center, + pad_mode='reflect', normalized=False, onesided=True, + return_complex=True) + + spec = torch.view_as_real(spec) + spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) + spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) + spec = dynamic_range_compression(spec) # spectral normalize + return spec + + +class MelDataset(torch.utils.data.Dataset): + def __init__(self, training_files, segment_size, n_fft, num_mels, + hop_size, win_size, sampling_rate, fmin, fmax, split=True, + device=None, fmax_loss=None, fine_tuning=False, + base_mels_path=None, repeat=1, deterministic=False, + max_wav_value=MAX_WAV_VALUE): + + self.audio_files = training_files + self.segment_size = segment_size + self.sampling_rate = sampling_rate + self.split = split + self.n_fft = n_fft + self.num_mels = num_mels + self.hop_size = hop_size + self.win_size = win_size + self.fmin = fmin + self.fmax = fmax + self.fmax_loss = fmax_loss + self.max_wav_value = max_wav_value + self.fine_tuning = fine_tuning + self.base_mels_path = base_mels_path + self.repeat = repeat + self.deterministic = deterministic + self.rng = random.default_rng() + + def __getitem__(self, index): + if index >= len(self): + raise IndexError('Dataset index out of range') + rng = random.default_rng(index) if self.deterministic else self.rng + index = index % len(self.audio_files) # collapse **after** setting seed + filename = self.audio_files[index] + audio, sampling_rate = load_wav(filename) + audio = audio / self.max_wav_value + if not self.fine_tuning: + audio = normalize(audio) * 0.95 + if sampling_rate != self.sampling_rate: + raise ValueError("{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate)) + + audio = torch.FloatTensor(audio) + audio = audio.unsqueeze(0) + + if not self.fine_tuning: + if self.split: + if audio.size(1) >= self.segment_size: + max_audio_start = audio.size(1) - self.segment_size + audio_start = rng.integers(0, max_audio_start) + audio = audio[:, audio_start:audio_start+self.segment_size] + else: + audio = F.pad(audio, (0, self.segment_size - audio.size(1))) + + mel = mel_spectrogram(audio, 
self.n_fft, self.num_mels, + self.sampling_rate, self.hop_size, + self.win_size, self.fmin, self.fmax, + center=False) + else: + mel = np.load( + os.path.join(self.base_mels_path, + os.path.splitext(os.path.split(filename)[-1])[0] + '.npy')) + mel = torch.from_numpy(mel).float() + + if len(mel.shape) < 3: + mel = mel.unsqueeze(0) + + if self.split: + frames_per_seg = math.ceil(self.segment_size / self.hop_size) + + if audio.size(1) >= self.segment_size: + mel_start = rng.integers(0, mel.size(2) - frames_per_seg - 1) + mel = mel[:, :, mel_start:mel_start + frames_per_seg] + a = mel_start * self.hop_size + b = (mel_start + frames_per_seg) * self.hop_size + audio = audio[:, a:b] + else: + mel = F.pad(mel, (0, frames_per_seg - mel.size(2))) + audio = F.pad(audio, (0, self.segment_size - audio.size(1))) + + mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels, + self.sampling_rate, self.hop_size, + self.win_size, self.fmin, self.fmax_loss, + center=False) + return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) + + def __len__(self): + return len(self.audio_files) * self.repeat + + +def get_data_loader(args, distributed_run, train=True, batch_size=None, + val_kwargs=None): + + filelists = args.training_files if train else args.validation_files + files = load_filepaths_and_text(args.dataset_path, filelists) + files = list(zip(*files))[0] + + dataset_kw = { + 'segment_size': args.segment_size, + 'n_fft': args.filter_length, + 'num_mels': args.num_mels, + 'hop_size': args.hop_length, + 'win_size': args.win_length, + 'sampling_rate': args.sampling_rate, + 'fmin': args.mel_fmin, + 'fmax': args.mel_fmax, + 'fmax_loss': args.mel_fmax_loss, + 'max_wav_value': args.max_wav_value, + 'fine_tuning': args.fine_tuning, + 'base_mels_path': args.input_mels_dir, + 'deterministic': not train + } + + if train: + dataset = MelDataset(files, **dataset_kw) + sampler = DistributedSampler(dataset) if distributed_run else None + else: + dataset_kw.update(val_kwargs or {}) + dataset = MelDataset(files, **dataset_kw) + sampler = (DistributedSampler(dataset, shuffle=False) + if distributed_run else None) + + loader = DataLoader(dataset, + # NOTE On DGX-1 and DGX A100 =1 is optimal + num_workers=args.num_workers if train else 1, + shuffle=(train and not distributed_run), + sampler=sampler, + batch_size=batch_size or args.batch_size, + pin_memory=True, + persistent_workers=True, + drop_last=train) + return loader diff --git a/hifigan/denoiser.py b/hifigan/denoiser.py new file mode 100644 index 0000000000000000000000000000000000000000..fd407c14725cd652d11d1d0771a5684af764af07 --- /dev/null +++ b/hifigan/denoiser.py @@ -0,0 +1,38 @@ +import torch +from .stft import STFT + + +class Denoiser(torch.nn.Module): + """ Removes model bias from audio produced with hifigan """ + + def __init__(self, hifigan, filter_length=1024, n_overlap=4, + win_length=1024, mode='zeros'): + super(Denoiser, self).__init__() + self.stft = STFT(filter_length=filter_length, + hop_length=int(filter_length/n_overlap), + win_length=win_length).cuda() + if mode == 'zeros': + mel_input = torch.zeros( + (1, 80, 88), + dtype=hifigan.ups[0].weight.dtype, + device=hifigan.ups[0].weight.device) + elif mode == 'normal': + mel_input = torch.randn( + (1, 80, 88), + dtype=hifigan.upsample.weight.dtype, + device=hifigan.upsample.weight.device) + else: + raise Exception("Mode {} if not supported".format(mode)) + + with torch.no_grad(): + bias_audio = hifigan(mel_input).float()[0] + bias_spec, _ = self.stft.transform(bias_audio) + + 
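+        # Keep only the first analysis frame of the bias magnitude spectrum as a
+        # per-frequency-bin bias template; register_buffer stores it on the module
+        # so it follows .to()/.cuda() and is saved in the state dict without
+        # becoming a trainable parameter.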
self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) + + def forward(self, audio, strength=0.1): + audio_spec, audio_angles = self.stft.transform(audio.cuda().float()) + audio_spec_denoised = audio_spec - self.bias_spec * strength + audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) + audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) + return audio_denoised diff --git a/hifigan/logging.py b/hifigan/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..763b45aa6f062d9f295403475ae7f6ce81fbd143 --- /dev/null +++ b/hifigan/logging.py @@ -0,0 +1,231 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from collections import OrderedDict +from copy import copy +from pathlib import Path + +import dllogger +import numpy as np +import torch.distributed as dist +import torch +from dllogger import StdOutBackend, JSONStreamBackend, Verbosity + +from common import tb_dllogger +from common.tb_dllogger import (stdout_metric_format, stdout_step_format, + unique_log_fpath, TBLogger) + + +def init_logger(output_dir, log_file, ema_decay=0.0): + + local_rank = 0 if not dist.is_initialized() else dist.get_rank() + + print('logger init', local_rank) + + if local_rank == 0: + Path(output_dir).mkdir(parents=False, exist_ok=True) + log_fpath = log_file or Path(output_dir, 'nvlog.json') + + dllogger.init(backends=[ + JSONStreamBackend(Verbosity.DEFAULT, unique_log_fpath(log_fpath)), + StdOutBackend(Verbosity.VERBOSE, step_format=stdout_step_format, + metric_format=stdout_metric_format)]) + + init_train_metadata() + else: + dllogger.init(backends=[]) + + tb_train = ['train'] + tb_val = ['val'] + tb_ema = [k + '_ema' for k in tb_val] if ema_decay > 0.0 else [] + + tb_dllogger.tb_loggers = { + s: TBLogger(enabled=(local_rank == 0), log_dir=output_dir, name=s) + for s in tb_train + tb_val + tb_ema} + + +def init_train_metadata(): + + dllogger.metadata("train_lrate_gen", + {"name": "g lr", "unit": None, "format": ":>3.2e"}) + dllogger.metadata("train_lrate_discrim", + {"name": "d lr", "unit": None, "format": ":>3.2e"}) + dllogger.metadata("train_avg_lrate_gen", + {"name": "avg g lr", "unit": None, "format": ":>3.2e"}) + dllogger.metadata("train_avg_lrate_discrim", + {"name": "avg d lr", "unit": None, "format": ":>3.2e"}) + + for id_, pref in [('train', ''), ('train_avg', 'avg train '), + ('val', ' avg val '), ('val_ema', ' EMA val ')]: + + dllogger.metadata(f"{id_}_loss_gen", + {"name": f"{pref}g loss", "unit": None, "format": ":>6.3f"}) + dllogger.metadata(f"{id_}_loss_discrim", + {"name": f"{pref}d loss", "unit": None, "format": ":>6.3f"}) + dllogger.metadata(f"{id_}_loss_mel", + {"name": f"{pref}mel loss", "unit": None, "format": ":>6.3f"}) + + dllogger.metadata(f"{id_}_frames/s", + {"name": None, "unit": "frames/s", "format": ":>8.2f"}) + dllogger.metadata(f"{id_}_took", + {"name": "took", "unit": "s", "format": ":>3.2f"}) + + +def init_infer_metadata(): + raise 
NotImplementedError + + # modalities = [('latency', 's', ':>10.5f'), ('RTF', 'x', ':>10.2f'), + # ('frames/s', None, ':>10.2f'), ('samples/s', None, ':>10.2f'), + # ('letters/s', None, ':>10.2f')] + + # for perc in ['', 'avg', '90%', '95%', '99%']: + # for model in ['fastpitch', 'waveglow', '']: + # for mod, unit, format in modalities: + + # name = f'{perc} {model} {mod}'.strip().replace(' ', ' ') + + # dllogger.metadata( + # name.replace(' ', '_'), + # {'name': f'{name: <26}', 'unit': unit, 'format': format}) + + +class defaultdict(OrderedDict): + """A simple, ordered defaultdict.""" + + def __init__(self, type_, *args, **kwargs): + self.type_ = type_ + super().__init__(*args, **kwargs) + + def __getitem__(self, key): + if key not in self: + self.__setitem__(key, self.type_()) + return super().__getitem__(key) + + def __copy__(self): + return defaultdict(self.type_, self) + + +class Metrics(dict): + + def __init__(self, scopes=['train', 'train_avg'], + dll_keys=['loss_gen', 'loss_discrim', 'loss_mel', + 'frames/s', 'took', 'lrate_gen', 'lrate_discrim'], + benchmark_epochs=0): + super().__init__() + + self.dll_keys = dll_keys + self.metrics = {scope: defaultdict(float) for scope in scopes} + self.metric_counts = {scope: defaultdict(int) for scope in scopes} + self.start_time = {scope: None for scope in scopes} + self.benchmark_epochs = benchmark_epochs + if benchmark_epochs > 0: + self.metrics['train_benchmark'] = defaultdict(list) + + def __setitem__(self, key, val): + extract = lambda t: t.item() if type(t) is torch.Tensor else t + + if type(val) is dict: + for k, v in val.items(): + super().__setitem__(k, extract(v)) + else: + super().__setitem__(key, extract(val)) + + def __getitem__(self, key): + if key not in self: + self.__setitem__(key, 0.0) + return super().__getitem__(key) + + def start_accumulating(self, step, start_timer=True, scope='train'): + del step # unused + self.clear() + self.metrics[scope].clear() + self.metric_counts[scope].clear() + if start_timer: + self.start_time[scope] = time.time() + + def accumulate(self, scopes=['train', 'train_avg']): + for scope in scopes: + for k, v in self.items(): + self.metrics[scope][k] += v + self.metric_counts[scope][k] += 1 + + self.clear() + + def finish_accumulating(self, stop_timer=True, scope='train'): + + metr = self.metrics[scope] + counts = self.metric_counts[scope] + + for k, v in metr.items(): + metr[k] = v / counts[k] + + if stop_timer: + took = time.time() - self.start_time[scope] + if 'frames' in metr: + metr['frames/s'] = metr.pop('frames') * counts['frames'] / took + metr['took'] = took + + def start_iter(self, iter, start_timer=True): + self.start_accumulating(iter, start_timer, 'train') + + def start_epoch(self, epoch, start_timer=True): + self.start_accumulating(epoch, start_timer, 'train_avg') + + def start_val(self, start_timer=True): + self.start_accumulating(None, start_timer, 'val') + + def finish_iter(self, stop_timer=True): + self.finish_accumulating(stop_timer, 'train') + + def finish_epoch(self, stop_timer=True): + self.finish_accumulating(stop_timer, 'train_avg') + + metr = self.metrics['train_benchmark'] + for k in ('took', 'frames/s', 'loss_gen', 'loss_discrim', 'loss_mel'): + metr[k].append(self.metrics['train_avg'][k]) + + if len(metr[k]) > self.benchmark_epochs: + metr[k].pop(0) + + def finish_val(self, stop_timer=True): + self.finish_accumulating(stop_timer, 'val') + + def get_metrics(self, scope='train', target='dll'): + + if scope == 'train_benchmark': + metr = self.metrics[scope] + ret = {'train_' 
+ k: np.mean(v) for k, v in metr.items()} + ret['benchmark_epochs_num'] = len(list(metr.values())[0]) + return ret + + ret = copy(self.metrics[scope]) + + if scope == 'train': + ret.update(self) + + if target == 'dll': + ret = {f'{scope}_{k}': v + for k, v in ret.items() if k in self.dll_keys} + + elif target == 'tb': + # Rename keys so they would group nicely inside TensorBoard + + def split_key(k): + pos = k.rfind('_') + return k[:pos] + '/' + k[pos+1:] if pos >= 0 else k + + ret = {split_key(k): v for k, v in ret.items()} + + return ret diff --git a/hifigan/metrics.py b/hifigan/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..da053b478413b6006717ec0766aed389b2871e59 --- /dev/null +++ b/hifigan/metrics.py @@ -0,0 +1,151 @@ +import timer +from collections import defaultdict + + +class Metrics(defaultdict): + + # TODO Where to measure - gpu:0 or all gpus? + + def __init__(self, tb_keys=[], benchmark_epochs=10): + super().__init__(float) + + # dll_tb_keys=['loss_gen', 'loss_discrim', 'loss_mel', 'took']: + + self.tb_keys = tb_keys #_ = {'dll': dll_keys, 'tb': tb_keys, 'dll+tb': dll_tb_keys} + self.iter_start_time = None + self.iter_metrics = defaultdict(float) + self.epoch_start_time = None + self.epoch_metrics = defaultdict(float) + self.benchmark_epochs = benchmark_epochs + + def start_epoch(self, epoch, start_timer=True): + self.epoch = epoch + if start_timer: + self.epoch_start_time = time.time() + + def start_iter(self, iter, start_timer=True): + self.iter = iter + self.accum_steps = 0 + self.step_metrics.clear() + if start_timer: + self.iter_start_time = time.time() + + def update_iter(self, ...): + # do stuff + pass + + def accumulate(self, scope='step'): + tgt = {'step': self.step_metrics, 'epoch': self.epoch_metrics}[scope] + + for k, v in self.items(): + tgt[k] += v + + self.clear() + + def update_iter(self, metrics={}, stop_timer=True): + + is not self.started_iter: + return + + self.accumulate(metrics) + self.accumulate(self.iter_metrics, scope='epoch') + + if stop_timer: + self.iter_metrics['took'] = time.time() - self.iter_time_start + + def update_epoch(self, stop_timer=True): + + # tb_total_steps=None, + # subset='train_avg', + # data=OrderedDict([ + # ('loss', epoch_loss[-1]), + # ('mel_loss', epoch_mel_loss[-1]), + # ('frames/s', epoch_num_frames[-1] / epoch_time[-1]), + # ('took', epoch_time[-1])]), + # ) + + if stop_timer: + self.['epoch_time'] = time.time() - self.epoch_time_start + + + if steps % args.stdout_interval == 0: + # with torch.no_grad(): + # mel_error = F.l1_loss(y_mel, y_g_hat_mel).item() + + took = time.time() - self.start_b + + + self.sws['train'].add_scalar("gen_loss_total", loss_gen_all.item(), steps) + self.sws['train'].add_scalar("mel_spec_error", mel_error.item(), steps) + + for key, val in meta.items(): + + sw_name = 'train' + for name_ in keys_mpd + keys_msd: + if name_ in key: + sw_name = 'train_' + name_ + + key = key.replace('loss_', 'loss/') + key = re.sub('mpd\d+', 'mpd-msd', key) + key = re.sub('msd\d+', 'mpd-msd', key) + + self.sws[sw_name].add_scalar(key, val / h.batch_size, steps) + + def iter_metrics(self, target='dll+tb'): + return {self.iter_metrics[k] for k in self.keys_[target]} + + def foo + +Steps : 40, Gen Loss Total : 57.993, Mel-Spec. 
Error : 47.374, s/b : 1.013 + + logger.log((epoch, epoch_iter, num_iters), + tb_total_steps=total_iter, + subset='train', + data=OrderedDict([ + ('loss', iter_loss), + ('mel_loss', iter_mel_loss), + ('frames/s', iter_num_frames / iter_time), + ('took', iter_time), + ('lrate', optimizer.param_groups[0]['lr'])]), + ) + + + +class Meter: + def __init__(self, sink_type, scope, downstream=None, end_points=None, verbosity=dllogger.Verbosity.DEFAULT): + self.verbosity = verbosity + self.sink_type = sink_type + self.scope = scope + self.downstream = downstream + + self.end_points = end_points or [] + + def start(self): + ds = None if self.downstream is None else self.downstream.sink + end_pt_fn = lambda x: list(map(lambda f: f(x), self.end_points)) # call all endpoint functions + self.sink = self.sink_type(end_pt_fn, ds) + + def end(self): + self.sink.close() + + def send(self, data): + self.sink.send(data) + + def meters(self): + if self.downstream is not None: + downstream_meters = self.downstream.meters() + else: + downstream_meters = [] + return [self] + downstream_meters + + def add_end_point(self, new_endpoint): + self.end_points.append(new_endpoint) + + def __or__(self, other): + """for easy chaining of meters""" + if self.downstream is None: + self.downstream = other + else: + self.downstream | other + + return self diff --git a/hifigan/models.py b/hifigan/models.py new file mode 100644 index 0000000000000000000000000000000000000000..238e978915b47a57d9a3f8757dc6726c19ee1679 --- /dev/null +++ b/hifigan/models.py @@ -0,0 +1,457 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MIT License +# +# Copyright (c) 2020 Jungil Kong +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
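A minimal inference sketch for the Generator defined in this file, useful as a shape check. The config below uses the standard HiFi-GAN "v1" hyper-parameters as an assumption; in this repo the actual values come from the vocoder checkpoint's saved config (see models.py), and the randomly initialised weights here produce noise, not speech:

import torch
from hifigan.models import Generator

# Standard HiFi-GAN "v1" settings (assumed for illustration; the real values
# are stored in the checkpoint config).
config = dict(
    upsample_rates=[8, 8, 2, 2],                 # total upsampling 8*8*2*2 = 256
    upsample_kernel_sizes=[16, 16, 4, 4],
    upsample_initial_channel=512,
    resblock='1',
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
)
gen = Generator(config).eval()
gen.remove_weight_norm()                         # fold weight norm for inference
mel = torch.randn(1, 80, 200)                    # (batch, n_mel_channels, frames)
with torch.no_grad():
    audio = gen(mel)                             # (1, 1, 200 * 256) samples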
+ +# The following functions/classes were based on code from https://github.com/jik876/hifi-gan: +# ResBlock1, ResBlock2, Generator, DiscriminatorP, DiscriminatorS, MultiScaleDiscriminator, +# MultiPeriodDiscriminator, feature_loss, discriminator_loss, generator_loss, +# init_weights, get_padding + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from common.stft import STFT +from common.utils import AttrDict, init_weights, get_padding + +LRELU_SLOPE = 0.1 + + +class NoAMPConv1d(Conv1d): + def __init__(self, *args, no_amp=False, **kwargs): + super().__init__(*args, **kwargs) + self.no_amp = no_amp + + def _cast(self, x, dtype): + if isinstance(x, (list, tuple)): + return [self._cast(t, dtype) for t in x] + else: + return x.to(dtype) + + def forward(self, *args): + if not self.no_amp: + return super().forward(*args) + + with torch.cuda.amp.autocast(enabled=False): + return self._cast( + super().forward(*self._cast(args, torch.float)), args[0].dtype) + + +class ResBlock1(nn.Module): + __constants__ = ['lrelu_slope'] + + def __init__(self, conf, channels, kernel_size=3, dilation=(1, 3, 5)): + super().__init__() + self.conf = conf + self.lrelu_slope = LRELU_SLOPE + + ch, ks = channels, kernel_size + self.convs1 = nn.Sequential(*[ + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, dilation[0]), dilation[0])), + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, dilation[1]), dilation[1])), + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, dilation[2]), dilation[2])), + ]) + + self.convs2 = nn.Sequential(*[ + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, 1))), + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, 1))), + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, 1))), + ]) + self.convs1.apply(init_weights) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, self.lrelu_slope) + xt = c1(xt) + xt = F.leaky_relu(xt, self.lrelu_slope) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(nn.Module): + __constants__ = ['lrelu_slope'] + + def __init__(self, conf, channels, kernel_size=3, dilation=(1, 3)): + super().__init__() + self.conf = conf + + ch, ks = channels, kernel_size + self.convs = nn.ModuleList([ + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(kernel_size, dilation[0]), dilation[0])), + weight_norm(Conv1d(ch, ch, ks, 1, get_padding(kernel_size, dilation[1]), dilation[1])), + ]) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, self.lrelu_slope) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Generator(nn.Module): + __constants__ = ['lrelu_slope', 'num_kernels', 'num_upsamples'] + + def __init__(self, conf): + super().__init__() + conf = AttrDict(conf) + self.conf = conf + self.num_kernels = len(conf.resblock_kernel_sizes) + self.num_upsamples = len(conf.upsample_rates) + + self.conv_pre = weight_norm( + Conv1d(80, conf.upsample_initial_channel, 7, 1, padding=3)) + + self.lrelu_slope = LRELU_SLOPE + + resblock = ResBlock1 if conf.resblock == '1' else ResBlock2 + + self.ups = [] + for i, (u, k) in enumerate(zip(conf.upsample_rates, + conf.upsample_kernel_sizes)): + 
self.ups.append(weight_norm( + ConvTranspose1d(conf.upsample_initial_channel // (2 ** i), + conf.upsample_initial_channel // (2 ** (i + 1)), + k, u, padding=(k-u)//2))) + + self.ups = nn.Sequential(*self.ups) + + self.resblocks = [] + for i in range(len(self.ups)): + resblock_list = [] + + ch = conf.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate(zip(conf.resblock_kernel_sizes, + conf.resblock_dilation_sizes)): + resblock_list.append(resblock(conf, ch, k, d)) + resblock_list = nn.Sequential(*resblock_list) + self.resblocks.append(resblock_list) + self.resblocks = nn.Sequential(*self.resblocks) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def load_state_dict(self, state_dict, strict=True): + # Fallback for old checkpoints (pre-ONNX fix) + new_sd = {} + for k, v in state_dict.items(): + new_k = k + if 'resblocks' in k: + parts = k.split(".") + # only do this is the checkpoint type is older + if len(parts) == 5: + layer = int(parts[1]) + new_layer = f"{layer//3}.{layer%3}" + new_k = f"resblocks.{new_layer}.{'.'.join(parts[2:])}" + new_sd[new_k] = v + + # Fix for conv1d/conv2d/NHWC + curr_sd = self.state_dict() + for key in new_sd: + len_diff = len(new_sd[key].size()) - len(curr_sd[key].size()) + if len_diff == -1: + new_sd[key] = new_sd[key].unsqueeze(-1) + elif len_diff == 1: + new_sd[key] = new_sd[key].squeeze(-1) + + super().load_state_dict(new_sd, strict=strict) + + def forward(self, x): + x = self.conv_pre(x) + + for upsample_layer, resblock_group in zip(self.ups, self.resblocks): + x = F.leaky_relu(x, self.lrelu_slope) + x = upsample_layer(x) + xs = 0 + for resblock in resblock_group: + xs += resblock(x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + print('HiFi-GAN: Removing weight norm.') + for l in self.ups: + remove_weight_norm(l) + for group in self.resblocks: + for block in group: + block.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class Denoiser(nn.Module): + """ Removes model bias from audio produced with hifigan """ + + def __init__(self, hifigan, filter_length=1024, n_overlap=4, + win_length=1024, mode='zeros', device="cpu", **infer_kw): + super().__init__() + self.stft = STFT(filter_length=filter_length, + hop_length=int(filter_length/n_overlap), + #win_length=win_length).cuda() # was like this + win_length=win_length, device=device) + + for name, p in hifigan.named_parameters(): + if name.endswith('.weight'): + dtype = p.dtype + device = p.device + break + + mel_init = {'zeros': torch.zeros, 'normal': torch.randn}[mode] + mel_input = mel_init((1, 80, 88), dtype=dtype, device=device) + + with torch.no_grad(): + bias_audio = hifigan(mel_input, **infer_kw).float() + if len(bias_audio.size()) > 2: + bias_audio = bias_audio.squeeze(0) + elif len(bias_audio.size()) < 2: + bias_audio = bias_audio.unsqueeze(0) + assert len(bias_audio.size()) == 2 + + bias_spec, _ = self.stft.transform(bias_audio) + + self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) + + def forward(self, audio, strength=0.1): + audio_spec, audio_angles = self.stft.transform(audio.float()) + audio_spec_denoised = audio_spec - self.bias_spec * strength + audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) + audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) + return audio_denoised + + +class 
DiscriminatorP(nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super().__init__() + self.period = period + norm_f = spectral_norm if use_spectral_norm else weight_norm + + ks = kernel_size + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (ks, 1), (stride, 1), (get_padding(5, 1), 0))), + norm_f(Conv2d(32, 128, (ks, 1), (stride, 1), (get_padding(5, 1), 0))), + norm_f(Conv2d(128, 512, (ks, 1), (stride, 1), (get_padding(5, 1), 0))), + norm_f(Conv2d(512, 1024, (ks, 1), (stride, 1), (get_padding(5, 1), 0))), + norm_f(Conv2d(1024, 1024, (ks, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + def share_params_of(self, dp): + assert len(self.convs) == len(dp.convs) + for c1, c2 in zip(self.convs, dp.convs): + c1.weight = c2.weight + c1.bias = c2.bias + + +class MultiPeriodDiscriminator(nn.Module): + def __init__(self, periods, concat_fwd=False): + super().__init__() + layers = [DiscriminatorP(p) for p in periods] + self.discriminators = nn.ModuleList(layers) + self.concat_fwd = concat_fwd + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if self.concat_fwd: + y_ds, fmaps = d(concat_discr_input(y, y_hat)) + y_d_r, y_d_g, fmap_r, fmap_g = split_discr_output(y_ds, fmaps) + else: + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(nn.Module): + def __init__(self, use_spectral_norm=False, no_amp_grouped_conv=False): + super().__init__() + norm_f = spectral_norm if use_spectral_norm else weight_norm + self.convs = nn.ModuleList([ + norm_f(Conv1d(1, 128, 15, 1, padding=7)), + norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), + norm_f(NoAMPConv1d(128, 256, 41, 2, groups=16, padding=20, no_amp=no_amp_grouped_conv)), + norm_f(NoAMPConv1d(256, 512, 41, 4, groups=16, padding=20, no_amp=no_amp_grouped_conv)), + norm_f(NoAMPConv1d(512, 1024, 41, 4, groups=16, padding=20, no_amp=no_amp_grouped_conv)), + norm_f(NoAMPConv1d(1024, 1024, 41, 1, groups=16, padding=20, no_amp=no_amp_grouped_conv)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + for l in self.convs: + # x = l(x.unsqueeze(-1)).squeeze(-1) + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + return x, fmap + + +class MultiScaleDiscriminator(nn.Module): + def __init__(self, no_amp_grouped_conv=False, concat_fwd=False): + super().__init__() + self.discriminators = nn.ModuleList([ + DiscriminatorS(use_spectral_norm=True, no_amp_grouped_conv=no_amp_grouped_conv), + DiscriminatorS(no_amp_grouped_conv=no_amp_grouped_conv), + DiscriminatorS(no_amp_grouped_conv=no_amp_grouped_conv), + ]) + self.meanpools = nn.ModuleList([ + AvgPool1d(4, 2, padding=1), + AvgPool1d(4, 2, padding=1) + ]) + 
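+ # Scale 0 (spectral norm) sees the raw waveform; the two AvgPool1d layers
+ # above feed 2x- and 4x-downsampled audio to the remaining discriminators.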
self.concat_fwd = concat_fwd + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if self.concat_fwd: + ys = concat_discr_input(y, y_hat) + if i != 0: + ys = self.meanpools[i-1](ys) + y_ds, fmaps = d(ys) + y_d_r, y_d_g, fmap_r, fmap_g = split_discr_output(y_ds, fmaps) + else: + if i != 0: + y = self.meanpools[i-1](y) + y_hat = self.meanpools[i-1](y_hat) + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def concat_discr_input(y, y_hat): + return torch.cat((y, y_hat), dim=0) + + +def split_discr_output(y_ds, fmaps): + y_d_r, y_d_g = torch.chunk(y_ds, 2, dim=0) + fmap_r, fmap_g = zip(*(torch.chunk(f, 2, dim=0) for f in fmaps)) + return y_d_r, y_d_g, fmap_r, fmap_g + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss*2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1-dr)**2) + g_loss = torch.mean(dg**2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + + for dg in disc_outputs: + l = torch.mean((1-dg)**2) + gen_losses.append(l) + loss += l + + return loss, gen_losses diff --git a/hifigan/models_ch_last_.py b/hifigan/models_ch_last_.py new file mode 100644 index 0000000000000000000000000000000000000000..1c8f7c6ca22dd2c6efc24afd3458311bfbc10a2e --- /dev/null +++ b/hifigan/models_ch_last_.py @@ -0,0 +1,378 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d, ConvTranspose2d, AvgPool2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm + +from common.utils import init_weights, get_padding, print_once + +LRELU_SLOPE = 0.1 + + +class ResBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.ModuleList([ + weight_norm(Conv2d(channels, channels, (kernel_size, 1), 1, dilation=(dilation[0], 1), + padding=(get_padding(kernel_size, dilation[0]), 0))), + weight_norm(Conv2d(channels, channels, (kernel_size, 1), 1, dilation=(dilation[1], 1), + padding=(get_padding(kernel_size, dilation[1]), 0))), + weight_norm(Conv2d(channels, channels, (kernel_size, 1), 1, dilation=(dilation[2], 1), + padding=(get_padding(kernel_size, dilation[2]), 0))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv2d(channels, channels, (kernel_size, 1), 1, dilation=1, + padding=(get_padding(kernel_size, 1), 0))), + weight_norm(Conv2d(channels, channels, (kernel_size, 1), 1, dilation=1, + padding=(get_padding(kernel_size, 1), 0))), + weight_norm(Conv2d(channels, channels, (kernel_size, 1), 1, dilation=1, + padding=(get_padding(kernel_size, 1), 0))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in 
self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.h = h + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Generator(torch.nn.Module): + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + self.conv_pre = weight_norm(Conv2d(80, h.upsample_initial_channel, (7,1), (1,1), padding=(3,0))) + assert h.resblock == '1', 'Only ResBlock1 currently supported for NHWC' + resblock = ResBlock1 if h.resblock == '1' else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append(weight_norm( + # ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), + # k, u, padding=(k-u)//2))) + ConvTranspose2d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), + (k, 1), (u, 1), padding=((k-u)//2, 0)))) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel//(2**(i+1)) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = weight_norm(Conv2d(ch, 1, (7,1), (1,1), padding=(3,0))) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + x = x.unsqueeze(-1).to(memory_format=torch.channels_last) + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + # x = self.ups[i](x.unsqueeze(-1)).squeeze(-1) + x = self.ups[i](x) + xs = 0 + for j in range(self.num_kernels): + xs += self.resblocks[i*self.num_kernels+j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + x = x.squeeze(-1) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + 
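+ # Channels-last variant: x arrives as (batch, 1, samples, 1) so that the
+ # (kernel_size, 1) Conv2d kernels can emulate the original Conv1d layers.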
+ # 1d to 2d + b, c, t, unit = x.shape + assert unit == 1 + + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, 0, 0, n_pad), "reflect") + t = t + n_pad + # print_once('x pre channels last:', x.is_contiguous(memory_format=torch.channels_last)) + x = x.view(b, c, t // self.period, self.period) + # print_once('x post channels last:', x.is_contiguous(memory_format=torch.channels_last)) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + # x = torch.flatten(x, 1, -1) + + return x, fmap + + def share_params_of(self, dp): + assert len(self.convs) == len(dp.convs) + for c1, c2 in zip(self.convs, dp.convs): + c1.weight = c2.weight + c1.bias = c2.bias + + +class DiscriminatorPConv1d(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorPConv1d, self).__init__() + self.period = period + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0), dilation=(period, 1))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0), dilation=(period, 1))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0), dilation=(period, 1))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0), dilation=(period, 1))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0), dilation=(period, 1))), + ]) + # self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1, dilation=period)) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0), dilation=(period, 1))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t, unit = x.shape + assert unit == 1 + # if t % self.period != 0: # pad first + # n_pad = self.period - (t % self.period) + # x = F.pad(x, (0, n_pad), "reflect") + # t = t + n_pad + # x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + def share_params_of(self, dp): + assert len(self.convs) == len(dp.convs) + for c1, c2 in zip(self.convs, dp.convs): + c1.weight = c2.weight + c1.bias = c2.bias + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, periods, use_conv1d=False, shared=False): + super(MultiPeriodDiscriminator, self).__init__() + print('MPD PERIODS:', periods) + if use_conv1d: + print('Constructing dilated MPD') + layers = [DiscriminatorPConv1d(p) for p in periods] + else: + layers = [DiscriminatorP(p) for p in periods] + + if shared: + print('MPD HAS SHARED PARAMS') + for l in layers[1:]: + l.share_params_of(layers[0]) + + self.discriminators = nn.ModuleList(layers) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False, amp_groups=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + # self.convs = nn.ModuleList([ + # norm_f(Conv1d(1, 128, 
15, 1, padding=7)), + # norm_f(Conv1d(128, 128, 41, 2, groups=1 if amp_groups else 4, padding=20)), # was: groups=4 + # norm_f(Conv1d(128, 256, 41, 2, groups=1 if amp_groups else 16, padding=20)), # was: groups=16 + # norm_f(Conv1d(256, 512, 41, 4, groups=1 if amp_groups else 16, padding=20)), # was: groups=16 + # norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), + # norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), + # norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + # ]) + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 128, (15,1), (1,1), padding=(7 , 0))), + norm_f(Conv2d(128, 128, (41,1), (2,1), groups=1 if amp_groups else 4, padding=(20, 0))), # was: groups=4 + norm_f(Conv2d(128, 256, (41,1), (2,1), groups=1 if amp_groups else 16, padding=(20, 0))), # was: groups=16 + norm_f(Conv2d(256, 512, (41,1), (4,1), groups=1 if amp_groups else 16, padding=(20, 0))), # was: groups=16 + norm_f(Conv2d(512, 1024, (41,1), (4,1), groups=16 , padding=(20, 0))), + norm_f(Conv2d(1024, 1024, (41,1), (1,1), groups=16 , padding=(20, 0))), + norm_f(Conv2d(1024, 1024, ( 5,1), (1,1), padding=(2 , 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3,1), (1,1), padding=(1,0))) + + def forward(self, x): + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + # x = x.squeeze(-1) + # x = torch.flatten(x, 1, -1) + return x, fmap + + +class MultiScaleDiscriminator(torch.nn.Module): + def __init__(self, amp_groups=False): + super(MultiScaleDiscriminator, self).__init__() + if amp_groups: + print('MSD: AMP groups') + self.discriminators = nn.ModuleList([ + DiscriminatorS(use_spectral_norm=True, amp_groups=amp_groups), + DiscriminatorS(amp_groups=amp_groups), + DiscriminatorS(amp_groups=amp_groups), + ]) + self.meanpools = nn.ModuleList([ + AvgPool2d((4, 1), (2, 1), padding=(1, 0)), + AvgPool2d((4, 1), (2, 1), padding=(1, 0)) + ]) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if i != 0: + y = self.meanpools[i-1](y) + y_hat = self.meanpools[i-1](y_hat) + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g, keys=[]): + loss = 0 + meta = {} + assert len(keys) == len(fmap_r) + + for key, dr, dg in zip(keys, fmap_r, fmap_g): + + k = 'loss_gen_feat_' + key + meta[k] = 0 + + for rl, gl in zip(dr, dg): + # loss += torch.mean(torch.abs(rl - gl)) + diff = torch.mean(torch.abs(rl - gl)) + loss += diff + meta[k] += diff.item() + + return loss*2, meta + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs, keys=[]): + loss = 0 + r_losses = [] + g_losses = [] + meta = {} + assert len(keys) == len(disc_real_outputs) + + for key, dr, dg in zip(keys, disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1-dr)**2) + g_loss = torch.mean(dg**2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + meta['loss_disc_real_' + key] = r_loss.item() + meta['loss_disc_gen_' + key] = g_loss.item() + + return loss, r_losses, g_losses, meta + + +def generator_loss(disc_outputs, keys=[]): + loss = 0 + gen_losses = [] + meta = {} + assert len(keys) == len(disc_outputs) + + for key, dg in zip(keys, disc_outputs): + l = torch.mean((1-dg)**2) + gen_losses.append(l) + loss += l + meta['loss_gen_' + key] = l.item() + + 
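+ # Least-squares GAN generator term: mean((1 - D(G(s)))^2) per discriminator,
+ # summed into `loss`; per-discriminator values are also reported via `meta`.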
return loss, gen_losses, meta + diff --git a/models.py b/models.py new file mode 100644 index 0000000000000000000000000000000000000000..79ec744528259ec588ef27d2902a611668103848 --- /dev/null +++ b/models.py @@ -0,0 +1,359 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import json +import re +import sys + +import torch + +from common.text.symbols import get_symbols, get_pad_idx +from common.utils import DefaultAttrDict, AttrDict +from fastpitch.model import FastPitch +from fastpitch.model_jit import FastPitchJIT +from hifigan.models import Generator + +try: + from waveglow.model import WaveGlow + from waveglow import model as glow + from waveglow.denoiser import Denoiser + sys.modules['glow'] = glow +except ImportError: + print("WARNING: Couldn't import WaveGlow") + + +def parse_model_args(model_name, parser, add_help=False): + if model_name == 'FastPitch': + from fastpitch import arg_parser + return arg_parser.parse_fastpitch_args(parser, add_help) + + elif model_name == 'HiFi-GAN': + from hifigan import arg_parser + return arg_parser.parse_hifigan_args(parser, add_help) + + elif model_name == 'WaveGlow': + from waveglow.arg_parser import parse_waveglow_args + return parse_waveglow_args(parser, add_help) + + else: + raise NotImplementedError(model_name) + + +def get_model(model_name, model_config, device, bn_uniform_init=False, + forward_is_infer=False, jitable=False): + """Chooses a model based on name""" + del bn_uniform_init # unused (old name: uniform_initialize_bn_weight) + + if model_name == 'FastPitch': + if jitable: + model = FastPitchJIT(**model_config) + else: + model = FastPitch(**model_config) + + elif model_name == 'HiFi-GAN': + model = Generator(model_config) + + elif model_name == 'WaveGlow': + model = WaveGlow(**model_config) + + else: + raise NotImplementedError(model_name) + + if forward_is_infer and hasattr(model, 'infer'): + model.forward = model.infer + + return model.to(device) + + +def get_model_config(model_name, args, ckpt_config=None): + """ Get config needed to instantiate the model """ + + # Mark keys missing in `args` with an object (None is ambiguous) + _missing = object() + args = DefaultAttrDict(lambda: _missing, vars(args)) + + # `ckpt_config` is loaded from the checkpoint and has the priority + # `model_config` is based on args and fills empty slots in `ckpt_config` + if model_name == 'FastPitch': + print(get_symbols(args.symbol_set)) ############################ + model_config = dict( + # io + n_mel_channels=args.n_mel_channels, + # symbols + n_symbols=(len(get_symbols(args.symbol_set)) + if args.symbol_set is not _missing else _missing), + padding_idx=(get_pad_idx(args.symbol_set) + if args.symbol_set is not _missing else _missing), + symbols_embedding_dim=args.symbols_embedding_dim, + # input FFT + in_fft_n_layers=args.in_fft_n_layers, + in_fft_n_heads=args.in_fft_n_heads, + in_fft_d_head=args.in_fft_d_head, + in_fft_conv1d_kernel_size=args.in_fft_conv1d_kernel_size, 
+ in_fft_conv1d_filter_size=args.in_fft_conv1d_filter_size, + in_fft_output_size=args.in_fft_output_size, + p_in_fft_dropout=args.p_in_fft_dropout, + p_in_fft_dropatt=args.p_in_fft_dropatt, + p_in_fft_dropemb=args.p_in_fft_dropemb, + # output FFT + out_fft_n_layers=args.out_fft_n_layers, + out_fft_n_heads=args.out_fft_n_heads, + out_fft_d_head=args.out_fft_d_head, + out_fft_conv1d_kernel_size=args.out_fft_conv1d_kernel_size, + out_fft_conv1d_filter_size=args.out_fft_conv1d_filter_size, + out_fft_output_size=args.out_fft_output_size, + p_out_fft_dropout=args.p_out_fft_dropout, + p_out_fft_dropatt=args.p_out_fft_dropatt, + p_out_fft_dropemb=args.p_out_fft_dropemb, + # duration predictor + dur_predictor_kernel_size=args.dur_predictor_kernel_size, + dur_predictor_filter_size=args.dur_predictor_filter_size, + p_dur_predictor_dropout=args.p_dur_predictor_dropout, + dur_predictor_n_layers=args.dur_predictor_n_layers, + # pitch predictor + pitch_predictor_kernel_size=args.pitch_predictor_kernel_size, + pitch_predictor_filter_size=args.pitch_predictor_filter_size, + p_pitch_predictor_dropout=args.p_pitch_predictor_dropout, + pitch_predictor_n_layers=args.pitch_predictor_n_layers, + # pitch conditioning + pitch_embedding_kernel_size=args.pitch_embedding_kernel_size, + # speakers parameters + n_speakers=args.n_speakers, + speaker_emb_weight=args.speaker_emb_weight, + n_languages=args.n_languages, + # energy predictor + energy_predictor_kernel_size=args.energy_predictor_kernel_size, + energy_predictor_filter_size=args.energy_predictor_filter_size, + p_energy_predictor_dropout=args.p_energy_predictor_dropout, + energy_predictor_n_layers=args.energy_predictor_n_layers, + # energy conditioning + energy_conditioning=args.energy_conditioning, + energy_embedding_kernel_size=args.energy_embedding_kernel_size, + ) + elif model_name == 'HiFi-GAN': + if args.hifigan_config is not None: + assert ckpt_config is None, ( + "Supplied --hifigan-config, but the checkpoint has a config. 
" + "Drop the flag or remove the config from the checkpoint file.") + print(f'HiFi-GAN: Reading model config from {args.hifigan_config}') + with open(args.hifigan_config) as f: + args = AttrDict(json.load(f)) + + model_config = dict( + # generator architecture + upsample_rates=args.upsample_rates, + upsample_kernel_sizes=args.upsample_kernel_sizes, + upsample_initial_channel=args.upsample_initial_channel, + resblock=args.resblock, + resblock_kernel_sizes=args.resblock_kernel_sizes, + resblock_dilation_sizes=args.resblock_dilation_sizes, + ) + elif model_name == 'WaveGlow': + model_config = dict( + n_mel_channels=args.n_mel_channels, + n_flows=args.flows, + n_group=args.groups, + n_early_every=args.early_every, + n_early_size=args.early_size, + WN_config=dict( + n_layers=args.wn_layers, + kernel_size=args.wn_kernel_size, + n_channels=args.wn_channels + ) + ) + else: + raise NotImplementedError(model_name) + + # Start with ckpt_config, and fill missing keys from model_config + final_config = {} if ckpt_config is None else ckpt_config.copy() + missing_keys = set(model_config.keys()) - set(final_config.keys()) + final_config.update({k: model_config[k] for k in missing_keys}) + + # If there was a ckpt_config, it should have had all args + if ckpt_config is not None and len(missing_keys) > 0: + print(f'WARNING: Keys {missing_keys} missing from the loaded config; ' + 'using args instead.') + # NOTE: useful to debug the assertion error + #for k, v in final_config.items(): + # if v is _missing: + # print(k) + assert all(v is not _missing for v in final_config.values()) ########################################## + return final_config + + +def get_model_train_setup(model_name, args): + """ Dump train setup for documentation purposes """ + if model_name == 'FastPitch': + return dict() + elif model_name == 'HiFi-GAN': + return dict( + # audio + segment_size=args.segment_size, + filter_length=args.filter_length, + num_mels=args.num_mels, + hop_length=args.hop_length, + win_length=args.win_length, + sampling_rate=args.sampling_rate, + mel_fmin=args.mel_fmin, + mel_fmax=args.mel_fmax, + mel_fmax_loss=args.mel_fmax_loss, + max_wav_value=args.max_wav_value, + # other + seed=args.seed, + # optimization + base_lr=args.learning_rate, + lr_decay=args.lr_decay, + epochs_all=args.epochs, + ) + elif model_name == 'WaveGlow': + return dict() + else: + raise NotImplementedError(model_name) + + +def load_model_from_ckpt(checkpoint_data, model, key='state_dict'): + + if key is None: + return checkpoint_data['model'], None + + sd = checkpoint_data[key] + sd = {re.sub('^module\.', '', k): v for k, v in sd.items()} + status = model.load_state_dict(sd, strict=False) + return model, status + + +def load_and_setup_model(model_name, parser, checkpoint, amp, device, + unk_args=[], forward_is_infer=False, jitable=False): + if checkpoint is not None: + #ckpt_data = torch.load(checkpoint) + ckpt_data = torch.load(checkpoint, map_location=device) + print(f'{model_name}: Loading {checkpoint}...') + ckpt_config = ckpt_data.get('config') + if ckpt_config is None: + print(f'{model_name}: No model config in the checkpoint; using args.') + else: + print(f'{model_name}: Found model config saved in the checkpoint.') + else: + ckpt_config = None + ckpt_data = {} + + model_parser = parse_model_args(model_name, parser, add_help=False) + model_args, model_unk_args = model_parser.parse_known_args() + unk_args[:] = list(set(unk_args) & set(model_unk_args)) + + model_config = get_model_config(model_name, model_args, ckpt_config) + + model = 
get_model(model_name, model_config, device, + forward_is_infer=forward_is_infer, + jitable=jitable) + + if checkpoint is not None: + key = 'generator' if model_name == 'HiFi-GAN' else 'state_dict' + model, status = load_model_from_ckpt(ckpt_data, model, key) + + missing = [] if status is None else status.missing_keys + unexpected = [] if status is None else status.unexpected_keys + + # Attention is only used during training, we won't miss it + if model_name == 'FastPitch': + missing = [k for k in missing if not k.startswith('attention.')] + unexpected = [k for k in unexpected if not k.startswith('attention.')] + + assert len(missing) == 0 and len(unexpected) == 0, ( + f'Mismatched keys when loading parameters. Missing: {missing}, ' + f'unexpected: {unexpected}.') + + if model_name == "WaveGlow": + for k, m in model.named_modules(): + m._non_persistent_buffers_set = set() # pytorch 1.6.0 compatability + model = model.remove_weightnorm(model) + + elif model_name == 'HiFi-GAN': + assert model_args.hifigan_config is not None or ckpt_config is not None, ( + 'Use a HiFi-GAN checkpoint from NVIDIA DeepLearningExamples with ' + 'saved config or supply --hifigan-config .') + model.remove_weight_norm() + + if amp: + model.half() + + model.eval() + return model.to(device), model_config, ckpt_data.get('train_setup', {}) + + +def load_and_setup_ts_model(model_name, checkpoint, amp, device=None): + print(f'{model_name}: Loading TorchScript checkpoint {checkpoint}...') + model = torch.jit.load(checkpoint).eval() + if device is not None: + model = model.to(device) + + if amp: + model.half() + elif next(model.parameters()).dtype == torch.float16: + raise ValueError('Trying to load FP32 model,' + 'TS checkpoint is in FP16 precision.') + return model + + +def convert_ts_to_trt(model_name, ts_model, parser, amp, unk_args=[]): + trt_parser = _parse_trt_compilation_args(model_name, parser, add_help=False) + trt_args, trt_unk_args = trt_parser.parse_known_args() + unk_args[:] = list(set(unk_args) & set(trt_unk_args)) + + if model_name == 'HiFi-GAN': + return _convert_ts_to_trt_hifigan( + ts_model, amp, trt_args.trt_min_opt_max_batch, + trt_args.trt_min_opt_max_hifigan_length) + else: + raise NotImplementedError + + +def _parse_trt_compilation_args(model_name, parent, add_help=False): + """ + Parse model and inference specific commandline arguments. 
+ """ + parser = argparse.ArgumentParser(parents=[parent], add_help=add_help, + allow_abbrev=False) + trt = parser.add_argument_group(f'{model_name} Torch-TensorRT compilation parameters') + trt.add_argument('--trt-min-opt-max-batch', nargs=3, type=int, + default=(1, 8, 16), + help='Torch-TensorRT min, optimal and max batch size') + if model_name == 'HiFi-GAN': + trt.add_argument('--trt-min-opt-max-hifigan-length', nargs=3, type=int, + default=(100, 800, 1200), + help='Torch-TensorRT min, optimal and max audio length (in frames)') + return parser + + +def _convert_ts_to_trt_hifigan(ts_model, amp, trt_min_opt_max_batch, + trt_min_opt_max_hifigan_length, num_mels=80): + import torch_tensorrt + trt_dtype = torch.half if amp else torch.float + print(f'Torch TensorRT: compiling HiFi-GAN for dtype {trt_dtype}.') + min_shp, opt_shp, max_shp = zip(trt_min_opt_max_batch, + (num_mels,) * 3, + trt_min_opt_max_hifigan_length) + compile_settings = { + "inputs": [torch_tensorrt.Input( + min_shape=min_shp, + opt_shape=opt_shp, + max_shape=max_shp, + dtype=trt_dtype, + )], + "enabled_precisions": {trt_dtype}, + "require_full_compilation": True, + } + trt_model = torch_tensorrt.compile(ts_model, **compile_settings) + print('Torch TensorRT: compilation successful.') + return trt_model diff --git a/pretrained_models/hifigan/hifigan__pyt_ckpt_mode-finetune_ds-ljs22khz_21.08.0_amp.zip b/pretrained_models/hifigan/hifigan__pyt_ckpt_mode-finetune_ds-ljs22khz_21.08.0_amp.zip new file mode 100644 index 0000000000000000000000000000000000000000..a60321254ad44c9a59ca743adcdb620053dac18f --- /dev/null +++ b/pretrained_models/hifigan/hifigan__pyt_ckpt_mode-finetune_ds-ljs22khz_21.08.0_amp.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00af1cfbc043da27906c1b535115c6d46c385f033353aefa08b3bd31c5a82acb +size 51879246 diff --git a/pretrained_models/hifigan/hifigan_gen_checkpoint_10000_ft.pt b/pretrained_models/hifigan/hifigan_gen_checkpoint_10000_ft.pt new file mode 100644 index 0000000000000000000000000000000000000000..9aeed2cf22c5af1e96b1ff88d6c61886498b2cba --- /dev/null +++ b/pretrained_models/hifigan/hifigan_gen_checkpoint_10000_ft.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23d8023b48c13e5cdaf8758077c6bc3567d14813f5327f63aa1d26f579c6e9a3 +size 55819501 diff --git a/pretrained_models/hifigan/hifigan_gen_checkpoint_6500.pt b/pretrained_models/hifigan/hifigan_gen_checkpoint_6500.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cc70a4bf11a91f3368d6998e9d2a34f9bcda273 --- /dev/null +++ b/pretrained_models/hifigan/hifigan_gen_checkpoint_6500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a6092979c190aa784fe84ccdba7582e99ff50a2e9c8db8a5c227afe5e21cd26 +size 55824685 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5867ca5f58a6574f4a90481e26c323a9835b89c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +torch +torchvision +torchaudio +inflect +librosa==0.9.0 +matplotlib +numpy +pynvml==11.0.0 +scipy +tensorboardX==2.0 +git+https://github.com/NVIDIA/dllogger@v1.0.0#egg=dllogger +gradio==5.15 +pydantic==2.10.6 diff --git a/symbols.py b/symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..fb7e52dde5461ac6720aae8b2a65cd7107c720e5 --- /dev/null +++ b/symbols.py @@ -0,0 +1,64 @@ +""" from https://github.com/keithito/tacotron """ + +''' +Defines the set of symbols used in text input to the model. 
+ +The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' +from .cmudict import valid_symbols + + +# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): +_arpabet = ['@' + s for s in valid_symbols] + + +def get_symbols(symbol_set='english_basic'): + if symbol_set == 'english_basic': + _pad = '_' + _punctuation = '!\'(),.:;? ' + _special = '-' + _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + elif symbol_set == 'english_basic_lowercase': + _pad = '_' + _punctuation = '!\'"(),.:;? ' + _special = '-' + _letters = 'abcdefghijklmnopqrstuvwxyz' + symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + elif symbol_set == 'english_expanded': + _punctuation = '!\'",.:;? ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + _accented = 'áçéêëñöøćž' + _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + symbols = list(_punctuation + _math + _special + _accented + _letters) + _arpabet + elif symbol_set == 'smj_expanded': + _punctuation = '!\'",.:;?- ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + # _accented = 'áçéêëñöøćžđšŧ' #also north sámi letters... + _accented = 'áçéêëñöø' #also north sámi letters... + # _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + _letters = 'AÁÆÅÄBCDEFGHIJKLMNŊŃÑOØÖPQRSTŦUVWXYZaáæåäbcdefghijklmnŋńñoøöpqrstuvwxyz' + # symbols = list(_punctuation + _math + _special + _accented + _letters) #+ _arpabet + symbols = list(_punctuation + _letters) + _arpabet + elif symbol_set == 'sme_expanded': + _punctuation = '!\'",.:;?- ' + _math = '#%&*+-/[]()' + _special = '_@©°½—₩€$' + _accented = 'áçéêëńñöøćčžđšŧ' #also north sámi letters... + # _accented = 'áçéêëñöø' #also north sámi letters... 
+ # _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + _letters = 'AÁÆÅÄBCČDĐEFGHIJKLMNŊOØÖPQRSŠTŦUVWXYZŽaáæåäbcčdđefghijklmnŋoøöpqrsštŧuvwxyzž' + # symbols = list(_punctuation + _math + _special + _accented + _letters) #+ _arpabet + symbols = list(_punctuation + _letters) + _arpabet + else: + raise Exception("{} symbol set does not exist".format(symbol_set)) + + return symbols + + +def get_pad_idx(symbol_set='english_basic'): + if symbol_set in {'english_basic', 'english_basic_lowercase', 'smj_expanded', 'sme_expanded'}: + return 0 + else: + raise Exception("{} symbol set not used yet".format(symbol_set)) diff --git a/syn_hifigan.py b/syn_hifigan.py new file mode 100644 index 0000000000000000000000000000000000000000..384945dc0e0e003f289ef625d5e2289613b1f8f7 --- /dev/null +++ b/syn_hifigan.py @@ -0,0 +1,293 @@ +import argparse + +import models +import time +import sys +import warnings +#from pathlib import Path + + + +import torch +import numpy as np +from scipy.stats import norm +from scipy.io.wavfile import write +from torch.nn.utils.rnn import pad_sequence +#import style_controller +from common.utils import load_wav_to_torch + + +from common import utils, layers + +from common.text.text_processing import TextProcessing + + +import os +#os.environ["CUDA_VISIBLE_DEVICES"]="" +#device = "cuda:0" +device = "cpu" + +vocoder = "hifigan" +SHARPEN = True +from hifigan.data_function import MAX_WAV_VALUE, mel_spectrogram +from hifigan.models import Denoiser +import json +from scipy import ndimage + +import os + +def parse_args(parser): + """ + Parse commandline arguments. + """ + parser.add_argument('-i', '--input', type=str, required=False, + help='Full path to the input text (phareses separated by newlines)') + parser.add_argument('-o', '--output', default=None, + help='Output folder to save audio (file per phrase)') + parser.add_argument('--log-file', type=str, default=None, + help='Path to a DLLogger log file') + parser.add_argument('--save-mels', action='store_true', help='') + parser.add_argument('--cuda', action='store_true', + help='Run inference on a GPU using CUDA') + + parser.add_argument('--cudnn-benchmark', action='store_true', + help='Enable cudnn benchmark mode') + + #parser.add_argument('--fastpitch', type=str, default='output_smj_sander/FastPitch_checkpoint_660.pt', + #help='Full path to the generator checkpoint file (skip to use ground truth mels)') ######### + + parser.add_argument('--fastpitch', type=str, default='output_multilang/FastPitch_checkpoint_200.pt', + help='Full path to the generator checkpoint file (skip to use ground truth mels)') ######### + + parser.add_argument('-d', '--denoising-strength', default=0.01, type=float, + help='WaveGlow denoising') + parser.add_argument('-sr', '--sampling-rate', default=22050, type=int, + help='Sampling rate') + parser.add_argument('--stft-hop-length', type=int, default=256, + help='STFT hop length for estimating audio length from mel size') + parser.add_argument('--amp', action='store_true',default=False, + help='Inference with AMP') + parser.add_argument('-bs', '--batch-size', type=int, default=1) + + parser.add_argument('--ema', action='store_true', + help='Use EMA averaged model (if saved in checkpoints)') + + parser.add_argument('--speaker', type=int, default=0, + help='Speaker ID for a multi-speaker model') + parser.add_argument('--language', type=int, default=0, + help='Language ID for a multilingual model') + parser.add_argument('--p-arpabet', type=float, default=0.0, help='') ################ + + + 
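+ # The symbol set and text cleaners below should match those used to train
+ # the FastPitch checkpoint, since token indices depend on the symbol set.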
text_processing = parser.add_argument_group('Text processing parameters') + text_processing.add_argument('--text-cleaners', nargs='*', + default=['basic_cleaners'], type=str, + help='Type of text cleaners for input text') + text_processing.add_argument('--symbol-set', type=str, default='all_sami', ################# + help='Define symbol set for input text') + + cond = parser.add_argument_group('conditioning on additional attributes') + + cond.add_argument('--n-speakers', type=int, default=10, + help='Number of speakers in the model.') + cond.add_argument('--n-languages', type=int, default=3, + help='Number of languages in the model.') + + return parser + + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + +def load_model_from_ckpt(checkpoint_path, ema, model): + + + checkpoint_data = torch.load(checkpoint_path,map_location = device) + status = '' + + if 'state_dict' in checkpoint_data: + sd = checkpoint_data['state_dict'] + if ema and 'ema_state_dict' in checkpoint_data: + sd = checkpoint_data['ema_state_dict'] + status += ' (EMA)' + elif ema and not 'ema_state_dict' in checkpoint_data: + print(f'WARNING: EMA weights missing for {checkpoint_data}') + + if any(key.startswith('module.') for key in sd): + sd = {k.replace('module.', ''): v for k,v in sd.items()} + status += ' ' + str(model.load_state_dict(sd, strict=False)) + else: + model = checkpoint_data['model'] + print(f'Loaded {checkpoint_path}{status}') + + return model + +def load_and_setup_model(model_name, parser, checkpoint, amp, device, + unk_args=[], forward_is_infer=False, ema=True, + jitable=False): + + + model_parser = models.parse_model_args(model_name, parser, add_help=False) + model_args, model_unk_args = model_parser.parse_known_args() + unk_args[:] = list(set(unk_args) & set(model_unk_args)) + + setattr(model_args, "energy_conditioning",True) + model_config = models.get_model_config(model_name, model_args) + # print(model_config) + model = models.get_model(model_name, model_config, device, + forward_is_infer=forward_is_infer, + jitable=jitable) + + if checkpoint is not None: + model = load_model_from_ckpt(checkpoint, ema, model) + + amp = False + if amp: + model.half() + model.eval() + + return model.to(device) + +class Synthesizer: + + def _load_pyt_or_ts_model(self, model_name, ckpt_path, format = 'pyt'): + + if format == 'ts': + + model = models.load_and_setup_ts_model(model_name, ckpt_path, + False, device) + model_train_setup = {} + return model, model_train_setup + + is_ts_based_infer = False + model, _, model_train_setup = models.load_and_setup_model( + model_name, self.parser, ckpt_path, False, device, + unk_args=self.unk_args, forward_is_infer=True, jitable=is_ts_based_infer) + + if is_ts_based_infer: + model = torch.jit.script(model) + return model, model_train_setup + + + + def __init__(self): + parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference', + allow_abbrev=False) + self.parser = parse_args(parser) + + self.args, self.unk_args = self.parser.parse_known_args() + self.generator = load_and_setup_model( + 'FastPitch', parser, self.args.fastpitch, self.args.amp, device, + unk_args=self.unk_args, forward_is_infer=True, ema=self.args.ema, + jitable=False) + + + self.hifigan_model = "pretrained_models/hifigan/hifigan_gen_checkpoint_10000_ft.pt" # Better with Sander! 
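+ # An older 6500-step HiFi-GAN checkpoint and a NeMo UnivNet alternative are
+ # kept commented out below; this class vocodes with the fine-tuned HiFi-GAN.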
+ + #self.hifigan_model = "pretrained_models/hifigan/hifigan_gen_checkpoint_6500.pt" + #self.vocoder = UnivNetModel.from_pretrained(model_name="tts_en_libritts_univnet") + self.vocoder, voc_train_setup= self._load_pyt_or_ts_model('HiFi-GAN', self.hifigan_model) + self.denoiser = Denoiser(self.vocoder,device=device) #, win_length=self.args.win_length).to(device) + self.tp = TextProcessing(self.args.symbol_set, self.args.text_cleaners, p_arpabet=0.0) + + + + def unsharp_mask(self, img, radius=1, amount=1): + blurred = ndimage.gaussian_filter(img, radius) + sharpened = img + amount * ( img - blurred) + return sharpened + + def speak(self, text, output_file="/tmp/tmp", spkr=0, lang=0, l_weight=1, s_weight=1, pace=0.95, clarity=1): + + text = self.tp.encode_text(text) + #text = [9]+self.tp.encode_text(text)+[9] + text = torch.LongTensor([text]).to(device) + #probs = surprisals + for p in [0]: + + with torch.no_grad(): + print(s_weight, l_weight) + mel, mel_lens, *_ = self.generator(text, pace, max_duration=15, speaker=spkr, language=lang, speaker_weight=s_weight, language_weight=l_weight) #, ref_vector=embedding, speaker=speaker_i) #, **gen_kw, speaker 0 = bad audio, speaker 1 = better audio + + if SHARPEN: + + mel_np = mel.float().data.cpu().numpy()[0] + tgt_min = -11 + tgt_max = 1.25 + #print(np.min(mel_np), np.max(mel_np)) + mel_np = self.unsharp_mask(mel_np, radius = 0.5, amount=0.5) + mel_np = self.unsharp_mask(mel_np, radius = 3, amount=.05) + # mel_np = self.unsharp_mask(mel_np, radius = 7, amount=0.05) + + for i in range(0, 80): + mel_np[i,:]+=(i-30)*clarity*0.02 + mel_np = (mel_np-np.min(mel_np))/ (np.max(mel_np)-np.min(mel_np)) * (tgt_max - tgt_min) + tgt_min + mel[0] = torch.from_numpy(mel_np).float().to(device) + """ + mel_np = mel.float().data.cpu().numpy()[0] + blurred_f = ndimage.gaussian_filter(mel_np, 1.0) #3 + alpha = 0.2 #0.3 ta + mel_np = mel_np + alpha * (mel_np - blurred_f) + blurred_f = ndimage.gaussian_filter(mel_np, 3.0) #3 + alpha = 0.1 # 0.1 ta + sharpened = mel_np + alpha * (mel_np - blurred_f) + + for i in range(0,80): + sharpened[i, :]+=(i-40)*0.01 #0.01 ta + mel[0] = torch.from_numpy(sharpened).float().to(device) + + """ + with torch.no_grad(): + + y_g_hat = self.vocoder(mel).float() ########### + #y_g_hat = self.denoiser(y_g_hat.squeeze(1), strength=0.01) #[:, 0] + audio = y_g_hat.squeeze() + # normalize volume + audio = audio/torch.max(torch.abs(audio))*0.95*32768 + audio = audio.cpu().numpy().astype('int16') + + + write(output_file+".wav", 22050, audio) + + os.system("play -q "+output_file+".wav") + return audio + + +if __name__ == '__main__': + syn = Synthesizer() + hifigan = syn.hifigan_model + hifigan_n = hifigan.replace(".pt", "") + fastpitch = syn.args.fastpitch + fastpitch_n = fastpitch.replace(".pt", "") + print(hifigan_n + " " + fastpitch_n) + + hifigan_n_short = hifigan_n.split("/") + hifigan_n_shorter = hifigan_n_short[2].split("_") + hifigan_n_shortest = hifigan_n_shorter[3] + + fastpitch_n_short = fastpitch_n.split("/") + fastpitch_n_shorter = fastpitch_n_short[1].split("_") + fastpitch_n_shortest = fastpitch_n_shorter[2] + + #syn.speak("Gå lij riek mælggadav vádtsám, de bådij vijmak tjáppa vuobmáj.") + i = 0 + spkr = 1 + lang = 1 + while (1==1): + + text = input(">") + text1 = text.split(" ") + syn.speak(text, output_file="/tmp/tmp.wav", spkr=6, lang=1) + syn.speak(text, output_file="/tmp/tmp.wav", spkr=7, lang=1) + continue + for s in range(1,10): + for l in range(3): ## + print("speaker", s, "language", l) ## + syn.speak(text, 
output_file="/tmp/"+str(i)+"_"+text1[0]+"_"+str(s)+"_"+str(l)+"_FP_"+fastpitch_n_shortest+"univnet", spkr=s, lang=l) + #syn.speak(text, output_file="/home/hiovain/DeepLearningExamples/PyTorch/SpeechSynthesis/FastPitchMulti/inf_output_multi/"+str(i)+"_"+text1[0]+"_"+str(s)+"_"+str(l)+"_FP_"+fastpitch_n_shortest+"univnet", spkr=s, lang=l) + i += 1 + + diff --git a/syn_k_univnet_multi.py b/syn_k_univnet_multi.py new file mode 100644 index 0000000000000000000000000000000000000000..f4803d4590ed4792e14043ca11c8ff9b3698d503 --- /dev/null +++ b/syn_k_univnet_multi.py @@ -0,0 +1,275 @@ +import argparse + +import models +import time +import sys +import warnings +#from pathlib import Path + +from nemo.collections.tts.models import UnivNetModel + +import torch +import numpy as np +from scipy.stats import norm +from scipy.io.wavfile import write +from torch.nn.utils.rnn import pad_sequence +#import style_controller +from common.utils import load_wav_to_torch + + +from common import utils, layers + +from common.text.text_processing import TextProcessing + + +import os +#os.environ["CUDA_VISIBLE_DEVICES"]="" +device = "cuda:0" +#device = "cpu" +vocoder = "univnet" +vocoder1 = "hifigan" +SHARPEN = True + +from hifigan.data_function import MAX_WAV_VALUE, mel_spectrogram +from hifigan.models import Denoiser +import json +from scipy import ndimage + +import os + +def parse_args(parser): + """ + Parse commandline arguments. + """ + parser.add_argument('-i', '--input', type=str, required=False, + help='Full path to the input text (phareses separated by newlines)') + parser.add_argument('-o', '--output', default=None, + help='Output folder to save audio (file per phrase)') + parser.add_argument('--log-file', type=str, default=None, + help='Path to a DLLogger log file') + parser.add_argument('--save-mels', action='store_true', help='') + parser.add_argument('--cuda', action='store_true', + help='Run inference on a GPU using CUDA') + + parser.add_argument('--cudnn-benchmark', action='store_true', + help='Enable cudnn benchmark mode') + + #parser.add_argument('--fastpitch', type=str, default='output_smj_sander/FastPitch_checkpoint_660.pt', + #help='Full path to the generator checkpoint file (skip to use ground truth mels)') ######### + + parser.add_argument('--fastpitch', type=str, default='output_multilang/FastPitch_checkpoint_200.pt', + help='Full path to the generator checkpoint file (skip to use ground truth mels)') ######### + + parser.add_argument('-d', '--denoising-strength', default=0.01, type=float, + help='WaveGlow denoising') + parser.add_argument('-sr', '--sampling-rate', default=22050, type=int, + help='Sampling rate') + parser.add_argument('--stft-hop-length', type=int, default=256, + help='STFT hop length for estimating audio length from mel size') + parser.add_argument('--amp', action='store_true',default=False, + help='Inference with AMP') + parser.add_argument('-bs', '--batch-size', type=int, default=1) + + parser.add_argument('--ema', action='store_true', + help='Use EMA averaged model (if saved in checkpoints)') + + parser.add_argument('--speaker', type=int, default=0, + help='Speaker ID for a multi-speaker model') + parser.add_argument('--language', type=int, default=0, + help='Language ID for a multilingual model') + parser.add_argument('--p-arpabet', type=float, default=0.0, help='') ################ + + + text_processing = parser.add_argument_group('Text processing parameters') + text_processing.add_argument('--text-cleaners', nargs='*', + default=['basic_cleaners'], type=str, + help='Type 
of text cleaners for input text') + text_processing.add_argument('--symbol-set', type=str, default='all_sami', ################# + help='Define symbol set for input text') + + cond = parser.add_argument_group('conditioning on additional attributes') + + cond.add_argument('--n-speakers', type=int, default=10, + help='Number of speakers in the model.') + cond.add_argument('--n-languages', type=int, default=3, + help='Number of languages in the model.') + + return parser + + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + +def load_model_from_ckpt(checkpoint_path, ema, model): + + + checkpoint_data = torch.load(checkpoint_path,map_location = device) + status = '' + + if 'state_dict' in checkpoint_data: + sd = checkpoint_data['state_dict'] + if ema and 'ema_state_dict' in checkpoint_data: + sd = checkpoint_data['ema_state_dict'] + status += ' (EMA)' + elif ema and not 'ema_state_dict' in checkpoint_data: + print(f'WARNING: EMA weights missing for {checkpoint_data}') + + if any(key.startswith('module.') for key in sd): + sd = {k.replace('module.', ''): v for k,v in sd.items()} + status += ' ' + str(model.load_state_dict(sd, strict=False)) + else: + model = checkpoint_data['model'] + print(f'Loaded {checkpoint_path}{status}') + + return model + +def load_and_setup_model(model_name, parser, checkpoint, amp, device, + unk_args=[], forward_is_infer=False, ema=True, + jitable=False): + + model_parser = models.parse_model_args(model_name, parser, add_help=False) + model_args, model_unk_args = model_parser.parse_known_args() + unk_args[:] = list(set(unk_args) & set(model_unk_args)) + + setattr(model_args, "energy_conditioning",True) + model_config = models.get_model_config(model_name, model_args) + # print(model_config) + model = models.get_model(model_name, model_config, device, + forward_is_infer=forward_is_infer, + jitable=jitable) + + if checkpoint is not None: + model = load_model_from_ckpt(checkpoint, ema, model) + + amp = False + if amp: + model.half() + model.eval() + + return model.to(device) + +class Synthesizer: + + def _load_pyt_or_ts_model(self, model_name, ckpt_path, format = 'pyt'): + if format == 'ts': + + model = models.load_and_setup_ts_model(model_name, ckpt_path, + False, device) + model_train_setup = {} + return model, model_train_setup + + is_ts_based_infer = False + model, _, model_train_setup = models.load_and_setup_model( + model_name, self.parser, ckpt_path, False, device, + unk_args=self.unk_args, forward_is_infer=True, jitable=is_ts_based_infer) + + if is_ts_based_infer: + model = torch.jit.script(model) + return model, model_train_setup + + + + def __init__(self): + parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference', + allow_abbrev=False) + self.parser = parse_args(parser) + + self.args, self.unk_args = self.parser.parse_known_args() + self.generator = load_and_setup_model( + 'FastPitch', parser, self.args.fastpitch, self.args.amp, device, + unk_args=self.unk_args, forward_is_infer=True, ema=self.args.ema, + jitable=False) + + + self.hifigan_model = "pretrained_models/hifigan/hifigan_gen_checkpoint_10000_ft.pt" # Better with Sander! 
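+        # NOTE: speak() vocodes with the NeMo UnivNet model (self.vocoder) loaded below; the fine-tuned HiFi-GAN checkpoint (self.vocoder1) is loaded as an alternative vocoder and is used to build the Denoiser.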
+ #self.hifigan_model = "pretrained_models/hifigan/hifigan_gen_checkpoint_6500.pt" + self.vocoder = UnivNetModel.from_pretrained(model_name="tts_en_libritts_univnet") + self.vocoder1, voc_train_setup= self._load_pyt_or_ts_model('HiFi-GAN', self.hifigan_model) + self.denoiser = Denoiser(self.vocoder1,device=device) #, win_length=self.args.win_length).to(device) + self.tp = TextProcessing(self.args.symbol_set, self.args.text_cleaners, p_arpabet=0.0) + + def unsharp_mask(self, img, radius=1, amount=1): + blurred = ndimage.gaussian_filter(img, radius) + sharpened = img + amount * ( img - blurred) + return sharpened + + # + def speak(self, text, output_file="/tmp/tmp", lang=0, spkr=0, l_weight=1, s_weight=1, pace=0.95,clarity=1): + + text = self.tp.encode_text(text) + #text = [9]+self.tp.encode_text(text)+[9] + text = torch.LongTensor([text]).to(device) + + for p in [0]: + + + with torch.no_grad(): + + mel, mel_lens, *_ = self.generator(text, pace=pace, max_duration=15, speaker=spkr, language=lang, speaker_weight=s_weight, language_weight=l_weight) #, ref_vector=embedding, speaker=speaker_i) #, **gen_kw, speaker 0 = bad audio, speaker 1 = better audio + if SHARPEN: + + mel_np = mel.float().data.cpu().numpy()[0] + tgt_min = -11 + tgt_max = 1.5 + #print(np.min(mel_np), np.max(mel_np)) + mel_np = self.unsharp_mask(mel_np, radius = 0.5, amount=1) + mel_np = self.unsharp_mask(mel_np, radius = 3, amount=.05) + # mel_np = self.unsharp_mask(mel_np, radius = 7, amount=0.05) + + for i in range(0, 80): + mel_np[i,:]+=(i-30)*clarity + mel_np = (mel_np-np.min(mel_np))/ (np.max(mel_np)-np.min(mel_np)) * (tgt_max - tgt_min) + tgt_min + mel[0] = torch.from_numpy(mel_np).float().to(device) + + + + with torch.no_grad(): + y_g_hat = self.vocoder(spec=mel).float() + #y_g_hat = self.vocoder1(mel).float() ########### + y_g_hat = self.denoiser(y_g_hat.squeeze(1), strength=0.01) #[:, 0] + audio = y_g_hat.squeeze() + # normalize volume + audio = audio/torch.max(torch.abs(audio))*0.95*32768 + audio = audio.cpu().numpy().astype('int16') + + + write(output_file+".wav", 22050, audio) + # ANT: Remove playing from here so GUI doesn't play twice + #os.system("play -q "+output_file+".wav") + return audio + + +if __name__ == '__main__': + syn = Synthesizer() + hifigan = syn.hifigan_model + hifigan_n = hifigan.replace(".pt", "") + fastpitch = syn.args.fastpitch + fastpitch_n = fastpitch.replace(".pt", "") + print(hifigan_n + " " + fastpitch_n) + + hifigan_n_short = hifigan_n.split("/") + hifigan_n_shorter = hifigan_n_short[2].split("_") + hifigan_n_shortest = hifigan_n_shorter[3] + + fastpitch_n_short = fastpitch_n.split("/") + fastpitch_n_shorter = fastpitch_n_short[1].split("_") + fastpitch_n_shortest = fastpitch_n_shorter[2] + + #syn.speak("Gå lij riek mælggadav vádtsám, de bådij vijmak tjáppa vuobmáj.") + i = 0 + spkr = 1 + lang = 1 + while True: + + text = input(">") + text1 = text.split(" ") + for s in range(1,10): + for l in range(3): ## + print("speaker", s, "language", l) ## + syn.speak(text, output_file="/home/hiovain/DeepLearningExamples/PyTorch/SpeechSynthesis/FastPitchMulti/inf_output_multi/"+str(i)+"_"+text1[0]+"_"+str(s)+"_"+str(l)+"_FP_"+fastpitch_n_shortest+"univnet", spkr=s, lang=l) + i += 1 + + diff --git a/waveglow/arg_parser.py b/waveglow/arg_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..e46a537396222fa1c4faff2706a4c651d11aa012 --- /dev/null +++ b/waveglow/arg_parser.py @@ -0,0 +1,65 @@ +# ***************************************************************************** +#
Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +import argparse + +def parse_waveglow_args(parent, add_help=False): + """ + Parse commandline arguments. + """ + parser = argparse.ArgumentParser(parents=[parent], add_help=add_help, allow_abbrev=False) + + # misc parameters + parser.add_argument('--n-mel-channels', default=80, type=int, + help='Number of bins in mel-spectrograms') + + # glow parameters + parser.add_argument('--flows', default=12, type=int, + help='Number of steps of flow') + parser.add_argument('--groups', default=8, type=int, + help='Number of samples in a group processed by the steps of flow') + parser.add_argument('--early-every', default=4, type=int, + help='Determines how often (i.e., after how many coupling layers) \ + a number of channels (defined by --early-size parameter) are output\ + to the loss function') + parser.add_argument('--early-size', default=2, type=int, + help='Number of channels output to the loss function') + parser.add_argument('--sigma', default=1.0, type=float, + help='Standard deviation used for sampling from Gaussian') + parser.add_argument('--segment-length', default=4000, type=int, + help='Segment length (audio samples) processed per iteration') + + # wavenet parameters + wavenet = parser.add_argument_group('WaveNet parameters') + wavenet.add_argument('--wn-kernel-size', default=3, type=int, + help='Kernel size for dilated convolution in the affine coupling layer (WN)') + wavenet.add_argument('--wn-channels', default=512, type=int, + help='Number of channels in WN') + wavenet.add_argument('--wn-layers', default=8, type=int, + help='Number of layers in WN') + + return parser diff --git a/waveglow/data_function.py b/waveglow/data_function.py new file mode 100644 index 0000000000000000000000000000000000000000..583a50205b7459bfe87c898583aadd70e5d402b5 --- /dev/null +++ b/waveglow/data_function.py @@ -0,0 +1,100 @@ +# ***************************************************************************** +# Copyright
(c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# *****************************************************************************\ + +import torch +import random +import common.layers as layers +from common.utils import load_wav_to_torch, load_filepaths_and_text, to_gpu + + +class MelAudioLoader(torch.utils.data.Dataset): + """ + 1) loads audio,text pairs + 2) computes mel-spectrograms from audio files. 
+ """ + + def __init__(self, + dataset_path, + audiopaths_and_text, + segment_length, + n_mel_channels, + max_wav_value, + sampling_rate, + filter_length, + hop_length, + win_length, + mel_fmin, + mel_fmax, + args): + self.audiopaths_and_text = load_filepaths_and_text(dataset_path, audiopaths_and_text) + self.max_wav_value = max_wav_value + self.sampling_rate = sampling_rate + self.stft = layers.TacotronSTFT( + filter_length, hop_length, win_length, + n_mel_channels, sampling_rate, mel_fmin, + mel_fmax) + self.segment_length = segment_length + random.seed(1234) + random.shuffle(self.audiopaths_and_text) + + def get_mel_audio_pair(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + + if sampling_rate != self.stft.sampling_rate: + raise ValueError("{} {} SR doesn't match target {} SR".format( + sampling_rate, self.stft.sampling_rate)) + + # Take segment + if audio.size(0) >= self.segment_length: + max_audio_start = audio.size(0) - self.segment_length + audio_start = random.randint(0, max_audio_start) + audio = audio[audio_start:audio_start+self.segment_length] + else: + audio = torch.nn.functional.pad( + audio, (0, self.segment_length - audio.size(0)), 'constant').data + + audio = audio / self.max_wav_value + audio_norm = audio.unsqueeze(0) + audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) + melspec = self.stft.mel_spectrogram(audio_norm) + melspec = melspec.squeeze(0) + + return (melspec, audio, len(audio)) + + def __getitem__(self, index): + return self.get_mel_audio_pair(self.audiopaths_and_text[index][0]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +def batch_to_gpu(batch): + x, y, len_y = batch + x = to_gpu(x).float() + y = to_gpu(y).float() + len_y = to_gpu(torch.sum(len_y)) + return ((x, y), y, len_y) diff --git a/waveglow/denoiser.py b/waveglow/denoiser.py new file mode 100644 index 0000000000000000000000000000000000000000..824e1086a7004bdd44b1824738d8a525aa04bd2f --- /dev/null +++ b/waveglow/denoiser.py @@ -0,0 +1,61 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +import torch +from common.layers import STFT + + +class Denoiser(torch.nn.Module): + """ Removes model bias from audio produced with waveglow """ + + def __init__(self, waveglow, filter_length=1024, n_overlap=4, + win_length=1024, mode='zeros'): + super(Denoiser, self).__init__() + device = waveglow.upsample.weight.device + dtype = waveglow.upsample.weight.dtype + self.stft = STFT(filter_length=filter_length, + hop_length=int(filter_length/n_overlap), + win_length=win_length).to(device) + if mode == 'zeros': + mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device) + elif mode == 'normal': + mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device) + else: + raise Exception("Mode {} is not supported".format(mode)) + + with torch.no_grad(): + bias_audio = waveglow.infer(mel_input, sigma=0.0).float() + bias_spec, _ = self.stft.transform(bias_audio) + + self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) + + def forward(self, audio, strength=0.1): + audio_spec, audio_angles = self.stft.transform(audio) + audio_spec_denoised = audio_spec - self.bias_spec * strength + audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) + audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) + return audio_denoised diff --git a/waveglow/loss_function.py b/waveglow/loss_function.py new file mode 100644 index 0000000000000000000000000000000000000000..1ff86e8814203d4e2c187a76171107d4e367d55d --- /dev/null +++ b/waveglow/loss_function.py @@ -0,0 +1,49 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** + +import torch + +class WaveGlowLoss(torch.nn.Module): + def __init__(self, sigma=1.0): + super(WaveGlowLoss, self).__init__() + self.sigma = sigma + + def forward(self, model_output, clean_audio): + # clean_audio is unused; + z, log_s_list, log_det_W_list = model_output + for i, log_s in enumerate(log_s_list): + if i == 0: + log_s_total = torch.sum(log_s) + log_det_W_total = log_det_W_list[i] + else: + log_s_total = log_s_total + torch.sum(log_s) + log_det_W_total += log_det_W_list[i] + + loss = torch.sum( + z * z) / (2 * self.sigma * self.sigma) - log_s_total - log_det_W_total # noqa: E501 + meta = {} + return loss / (z.size(0) * z.size(1) * z.size(2)), meta diff --git a/waveglow/model.py b/waveglow/model.py new file mode 100644 index 0000000000000000000000000000000000000000..770564ff69488c324bd883ccb97d2f09b7e103d2 --- /dev/null +++ b/waveglow/model.py @@ -0,0 +1,342 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ***************************************************************************** +import torch +from torch.autograd import Variable +import torch.nn.functional as F + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +class Invertible1x1Conv(torch.nn.Module): + """ + The layer outputs both the convolution, and the log determinant + of its weight matrix. If reverse=True it does convolution with + inverse + """ + + def __init__(self, c): + super(Invertible1x1Conv, self).__init__() + self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, + bias=False) + + # Sample a random orthonormal matrix to initialize weights + W = torch.qr(torch.FloatTensor(c, c).normal_())[0] + + # Ensure determinant is 1.0 not -1.0 + if torch.det(W) < 0: + W[:, 0] = -1 * W[:, 0] + W = W.view(c, c, 1) + W = W.contiguous() + self.conv.weight.data = W + + def forward(self, z): + # shape + batch_size, group_size, n_of_groups = z.size() + + W = self.conv.weight.squeeze() + + # Forward computation + log_det_W = batch_size * n_of_groups * torch.logdet(W.unsqueeze(0).float()).squeeze() + z = self.conv(z) + return z, log_det_W + + + def infer(self, z): + # shape + batch_size, group_size, n_of_groups = z.size() + + W = self.conv.weight.squeeze() + + if not hasattr(self, 'W_inverse'): + # Reverse computation + W_inverse = W.float().inverse() + W_inverse = Variable(W_inverse[..., None]) + if z.type() == 'torch.cuda.HalfTensor' or z.type() == 'torch.HalfTensor': + W_inverse = W_inverse.half() + self.W_inverse = W_inverse + z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) + return z + + +class WN(torch.nn.Module): + """ + This is the WaveNet like layer for the affine coupling. The primary + difference from WaveNet is the convolutions need not be causal. There is + also no dilation size reset. The dilation only doubles on each layer + """ + + def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, + kernel_size): + super(WN, self).__init__() + assert(kernel_size % 2 == 1) + assert(n_channels % 2 == 0) + self.n_layers = n_layers + self.n_channels = n_channels + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.cond_layers = torch.nn.ModuleList() + + start = torch.nn.Conv1d(n_in_channels, n_channels, 1) + start = torch.nn.utils.weight_norm(start, name='weight') + self.start = start + + # Initializing last layer to 0 makes the affine coupling layers + # do nothing at first. 
This helps with training stability + end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1) + end.weight.data.zero_() + end.bias.data.zero_() + self.end = end + + for i in range(n_layers): + dilation = 2 ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d(n_channels, 2 * n_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1) + cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + self.cond_layers.append(cond_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * n_channels + else: + res_skip_channels = n_channels + res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm( + res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, forward_input): + audio, spect = forward_input + audio = self.start(audio) + + for i in range(self.n_layers): + acts = fused_add_tanh_sigmoid_multiply( + self.in_layers[i](audio), + self.cond_layers[i](spect), + torch.IntTensor([self.n_channels])) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + audio = res_skip_acts[:, :self.n_channels, :] + audio + skip_acts = res_skip_acts[:, self.n_channels:, :] + else: + skip_acts = res_skip_acts + + if i == 0: + output = skip_acts + else: + output = skip_acts + output + return self.end(output) + + +class WaveGlow(torch.nn.Module): + def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, + n_early_size, WN_config): + super(WaveGlow, self).__init__() + + self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, + n_mel_channels, + 1024, stride=256) + assert(n_group % 2 == 0) + self.n_flows = n_flows + self.n_group = n_group + self.n_early_every = n_early_every + self.n_early_size = n_early_size + self.WN = torch.nn.ModuleList() + self.convinv = torch.nn.ModuleList() + + n_half = int(n_group / 2) + + # Set up layers with the right sizes based on how many dimensions + # have been output already + n_remaining_channels = n_group + for k in range(n_flows): + if k % self.n_early_every == 0 and k > 0: + n_half = n_half - int(self.n_early_size / 2) + n_remaining_channels = n_remaining_channels - self.n_early_size + self.convinv.append(Invertible1x1Conv(n_remaining_channels)) + self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config)) + self.n_remaining_channels = n_remaining_channels + + def forward(self, forward_input): + """ + forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames + forward_input[1] = audio: batch x time + """ + spect, audio = forward_input + + # Upsample spectrogram to size of audio + spect = self.upsample(spect) + assert(spect.size(2) >= audio.size(1)) + if spect.size(2) > audio.size(1): + spect = spect[:, :, :audio.size(1)] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) + spect = spect.permute(0, 2, 1) + + audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) + output_audio = [] + log_s_list = [] + log_det_W_list = [] + + for k in range(self.n_flows): + if k % self.n_early_every == 0 and k > 0: + output_audio.append(audio[:, :self.n_early_size, :]) + audio = audio[:, self.n_early_size:, :] + + audio, log_det_W = self.convinv[k](audio) + log_det_W_list.append(log_det_W) + + n_half = 
int(audio.size(1) / 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + log_s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = torch.exp(log_s) * audio_1 + b + log_s_list.append(log_s) + + audio = torch.cat([audio_0, audio_1], 1) + + output_audio.append(audio) + return torch.cat(output_audio, 1), log_s_list, log_det_W_list + + def infer(self, spect, sigma=1.0): + + spect = self.upsample(spect) + # trim conv artifacts. maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) + spect = spect.permute(0, 2, 1) + + audio = torch.randn(spect.size(0), + self.n_remaining_channels, + spect.size(2), device=spect.device).to(spect.dtype) + + audio = torch.autograd.Variable(sigma * audio) + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1) / 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b) / torch.exp(s) + audio = torch.cat([audio_0, audio_1], 1) + + audio = self.convinv[k].infer(audio) + + if k % self.n_early_every == 0 and k > 0: + z = torch.randn(spect.size(0), self.n_early_size, spect.size( + 2), device=spect.device).to(spect.dtype) + audio = torch.cat((sigma * z, audio), 1) + + audio = audio.permute( + 0, 2, 1).contiguous().view( + audio.size(0), -1).data + return audio + + + def infer_onnx(self, spect, z, sigma=0.9): + + spect = self.upsample(spect) + # trim conv artifacts. maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + + length_spect_group = spect.size(2)//8 + mel_dim = 80 + batch_size = spect.size(0) + + spect = spect.view((batch_size, mel_dim, length_spect_group, self.n_group)) + spect = spect.permute(0, 2, 1, 3) + spect = spect.contiguous() + spect = spect.view((batch_size, length_spect_group, self.n_group*mel_dim)) + spect = spect.permute(0, 2, 1) + spect = spect.contiguous() + + audio = z[:, :self.n_remaining_channels, :] + z = z[:, self.n_remaining_channels:self.n_group, :] + audio = sigma*audio + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1) // 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:(n_half+n_half), :] + + output = self.WN[k]((audio_0, spect)) + s = output[:, n_half:(n_half+n_half), :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b) / torch.exp(s) + audio = torch.cat([audio_0, audio_1], 1) + audio = self.convinv[k].infer(audio) + + if k % self.n_early_every == 0 and k > 0: + audio = torch.cat((z[:, :self.n_early_size, :], audio), 1) + z = z[:, self.n_early_size:self.n_group, :] + + audio = audio.permute(0,2,1).contiguous().view(batch_size, (length_spect_group * self.n_group)) + + return audio + + + @staticmethod + def remove_weightnorm(model): + waveglow = model + for WN in waveglow.WN: + WN.start = torch.nn.utils.remove_weight_norm(WN.start) + WN.in_layers = remove(WN.in_layers) + WN.cond_layers = remove(WN.cond_layers) + WN.res_skip_layers = remove(WN.res_skip_layers) + return waveglow + + +def remove(conv_list): + new_conv_list = torch.nn.ModuleList() + for old_conv in conv_list: + old_conv = torch.nn.utils.remove_weight_norm(old_conv) + new_conv_list.append(old_conv) + 
return new_conv_list
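
The WaveGlow modules above (model, arg parser, data loader, denoiser, loss) are carried over from the NVIDIA codebase and are not wired into the synthesis path in syn_k_univnet_multi.py, which vocodes with UnivNet and HiFi-GAN instead. For reference, the sketch below shows how these pieces would fit together on their own. It is a minimal, untested sketch: it assumes the waveglow directory is importable as a package, uses the defaults from parse_waveglow_args for the WN configuration, and feeds a placeholder all-zero mel-spectrogram; a real run would first load trained WaveGlow weights where indicated.

```python
# Sketch only: wires together waveglow/model.py, waveglow/denoiser.py and the
# defaults from waveglow/arg_parser.py. With untrained weights the output is noise.
import torch

from waveglow.model import WaveGlow
from waveglow.denoiser import Denoiser

# WN_config mirrors the --wn-layers / --wn-channels / --wn-kernel-size defaults above.
model = WaveGlow(n_mel_channels=80, n_flows=12, n_group=8,
                 n_early_every=4, n_early_size=2,
                 WN_config=dict(n_layers=8, n_channels=512, kernel_size=3))

# ... load trained WaveGlow weights into `model` here ...

model = WaveGlow.remove_weightnorm(model).eval()

denoiser = Denoiser(model)       # estimates the model's bias spectrum from an all-zero mel
mel = torch.zeros(1, 80, 200)    # placeholder mel-spectrogram: batch x n_mel_channels x frames

with torch.no_grad():
    audio = model.infer(mel, sigma=0.9)      # (batch, samples) at the training sampling rate
    audio = denoiser(audio, strength=0.01)   # subtract the bias spectrum (same 0.01 strength
                                             # that Synthesizer.speak() uses with its denoiser)
```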