diff --git a/chatterbox/src/chatterbox/__init__.py b/chatterbox/src/chatterbox/__init__.py
deleted file mode 100644
index c8aa565d6cf00b8eaf2b7896ea751bb8091fc77a..0000000000000000000000000000000000000000
--- a/chatterbox/src/chatterbox/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .tts import ChatterboxTTS
-from .vc import ChatterboxVC
diff --git a/chatterbox/src/chatterbox/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/chatterbox/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index 99ad5263eb11bd1403f596d0d01121b66bbce883..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc b/chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc
deleted file mode 100644
index 4bbc4ca60d6faf3959346ac2c9a738bc9c9fded1..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/__pycache__/vc.cpython-311.pyc b/chatterbox/src/chatterbox/__pycache__/vc.cpython-311.pyc
deleted file mode 100644
index 6e7d5c9fd63ba404f2a4fc368af509070b145821..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/__pycache__/vc.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index a3d7d93498769edbe98d626fa8ad88d44639f807..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-311.pyc b/chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-311.pyc
deleted file mode 100644
index 3de9b10493a0ef2f07787995507b5ef310cd096d..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index b54c01f2d7f935f4c2e413f3a2d5e1c4439dc55c..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index 21576c393a72f8860c71b605a8314a624f632b79..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/models/t3/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/chatterbox/models/t3/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index 96e484e0e352ee35b2d98f07435f5591a3abe760..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/models/t3/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/models/t3/inference/__pycache__/alignment_stream_analyzer.cpython-311.pyc b/chatterbox/src/chatterbox/models/t3/inference/__pycache__/alignment_stream_analyzer.cpython-311.pyc
deleted file mode 100644
index 06221cbf74e637a11be179968dcc18d835a605ff..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/models/t3/inference/__pycache__/alignment_stream_analyzer.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc b/chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc
deleted file mode 100644
index 412b8ae8e77b5bcd98547cfdccd3601964f9807a..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/models/tokenizers/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/chatterbox/models/tokenizers/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index 4a545f058927aaf45dc9ae1e32467853890bf3e0..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/models/tokenizers/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/models/voice_encoder/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/chatterbox/models/voice_encoder/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index 791d0a135f25bc8c8e8447d947b691776ba4d16c..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/models/voice_encoder/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/models/voice_encoder/__pycache__/config.cpython-311.pyc b/chatterbox/src/chatterbox/models/voice_encoder/__pycache__/config.cpython-311.pyc
deleted file mode 100644
index 5da073fa41d48b50ccc6fd0c351624a50ab53ad9..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/models/voice_encoder/__pycache__/config.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/chatterbox/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc b/chatterbox/src/chatterbox/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc
deleted file mode 100644
index 4f119c67e8e25df6680801cb454f0af6b19c5f5d..0000000000000000000000000000000000000000
Binary files a/chatterbox/src/chatterbox/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc and /dev/null differ
diff --git a/chatterbox/src/orator/__init__.py b/chatterbox/src/orator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b13b095b627341de9c4a1e63ca91b180910ee3
--- /dev/null
+++ b/chatterbox/src/orator/__init__.py
@@ -0,0 +1 @@
+from .tts import OratorTTS
\ No newline at end of file
diff --git a/chatterbox/src/orator/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/orator/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8594ea162f363a12ba2f7288bffb183797bc1698
Binary files /dev/null and b/chatterbox/src/orator/__pycache__/__init__.cpython-311.pyc differ
diff --git a/chatterbox/src/orator/__pycache__/tts.cpython-311.pyc b/chatterbox/src/orator/__pycache__/tts.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9fee5d6451758b8ab24076847fe65c45fe55254b
Binary files /dev/null and b/chatterbox/src/orator/__pycache__/tts.cpython-311.pyc differ
diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/__init__.py b/chatterbox/src/orator/model_checkpoints.py
similarity index 100%
rename from chatterbox/src/chatterbox/models/s3gen/transformer/__init__.py
rename to chatterbox/src/orator/model_checkpoints.py
diff --git a/chatterbox/src/orator/models/bigvgan/__pycache__/activations.cpython-311.pyc b/chatterbox/src/orator/models/bigvgan/__pycache__/activations.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..abdfafb4abc0f4c125638ebe9d4f456039bf68fc
Binary files /dev/null and b/chatterbox/src/orator/models/bigvgan/__pycache__/activations.cpython-311.pyc differ
diff --git a/chatterbox/src/orator/models/bigvgan/__pycache__/bigvgan.cpython-311.pyc b/chatterbox/src/orator/models/bigvgan/__pycache__/bigvgan.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9bc39b3811393b032fad3f25eb8c822ad831b6c0
Binary files /dev/null and b/chatterbox/src/orator/models/bigvgan/__pycache__/bigvgan.cpython-311.pyc differ
diff --git a/chatterbox/src/orator/models/bigvgan/activations.py b/chatterbox/src/orator/models/bigvgan/activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..30a3c85145eb147e61331f9dbd5d2b3650146851
--- /dev/null
+++ b/chatterbox/src/orator/models/bigvgan/activations.py
@@ -0,0 +1,120 @@
+# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
+# LICENSE is in incl_licenses directory.
+
+import torch
+from torch import nn, sin, pow
+from torch.nn import Parameter
+
+
+class Snake(nn.Module):
+    '''
+    Implementation of a sine-based periodic activation function
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter
+    References:
+        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = Snake(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    '''
+    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+        '''
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha: trainable parameter
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            alpha will be trained along with the rest of your model.
+        '''
+        super(Snake, self).__init__()
+        self.in_features = in_features
+
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+        else:  # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+
+        self.alpha.requires_grad = alpha_trainable
+
+        self.no_div_by_zero = 0.000000001
+
+    def forward(self, x):
+        '''
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        Snake := x + 1/a * sin^2 (xa)
+        '''
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+        x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+
+        return x
+
+
+class SnakeBeta(nn.Module):
+    '''
+    A modified Snake function which uses separate parameters for the magnitude of the periodic components
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter that controls frequency
+        - beta - trainable parameter that controls magnitude
+    References:
+        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = SnakeBeta(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    '''
+    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+        '''
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha - trainable parameter that controls frequency
+            - beta - trainable parameter that controls magnitude
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            beta is initialized to 1 by default, higher values = higher-magnitude.
+            alpha will be trained along with the rest of your model.
+        '''
+        super(SnakeBeta, self).__init__()
+        self.in_features = in_features
+
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+            self.beta = Parameter(torch.zeros(in_features) * alpha)
+        else:  # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+            self.beta = Parameter(torch.ones(in_features) * alpha)
+
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+
+        self.no_div_by_zero = 0.000000001
+
+    def forward(self, x):
+        '''
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        SnakeBeta := x + 1/b * sin^2 (xa)
+        '''
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+            beta = torch.exp(beta)
+        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+
+        return x
diff --git a/chatterbox/src/orator/models/bigvgan/alias_free_torch/__init__.py b/chatterbox/src/orator/models/bigvgan/alias_free_torch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f756ed83f87f9839e457b240f60469bc187707d
--- /dev/null
+++ b/chatterbox/src/orator/models/bigvgan/alias_free_torch/__init__.py
@@ -0,0 +1,6 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+# LICENSE is in incl_licenses directory.
+
+from .filter import *
+from .resample import *
+from .act import *
diff --git a/chatterbox/src/orator/models/bigvgan/alias_free_torch/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/orator/models/bigvgan/alias_free_torch/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fdf57d13c2b1e94a2c20321d6fcab00ee86ba913
Binary files /dev/null and b/chatterbox/src/orator/models/bigvgan/alias_free_torch/__pycache__/__init__.cpython-311.pyc differ
diff --git a/chatterbox/src/orator/models/bigvgan/alias_free_torch/__pycache__/act.cpython-311.pyc b/chatterbox/src/orator/models/bigvgan/alias_free_torch/__pycache__/act.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e4a139e20f899bddacf05d467861e2857286268
Binary files /dev/null and b/chatterbox/src/orator/models/bigvgan/alias_free_torch/__pycache__/act.cpython-311.pyc differ
diff --git a/chatterbox/src/orator/models/bigvgan/alias_free_torch/__pycache__/filter.cpython-311.pyc b/chatterbox/src/orator/models/bigvgan/alias_free_torch/__pycache__/filter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6416602224f887fa03f8bce27fc952f8f6ff23a
Binary files /dev/null and b/chatterbox/src/orator/models/bigvgan/alias_free_torch/__pycache__/filter.cpython-311.pyc differ
diff --git a/chatterbox/src/orator/models/bigvgan/alias_free_torch/__pycache__/resample.cpython-311.pyc b/chatterbox/src/orator/models/bigvgan/alias_free_torch/__pycache__/resample.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af56e62e2e4bffcd9444f653101a91af4241494b
Binary files /dev/null and b/chatterbox/src/orator/models/bigvgan/alias_free_torch/__pycache__/resample.cpython-311.pyc differ
diff --git a/chatterbox/src/orator/models/bigvgan/alias_free_torch/act.py b/chatterbox/src/orator/models/bigvgan/alias_free_torch/act.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef231b01506f01c2b66d2dc4f3f0891219b3b41a
--- /dev/null
+++ b/chatterbox/src/orator/models/bigvgan/alias_free_torch/act.py
@@ -0,0 +1,28 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+# LICENSE is in incl_licenses directory.
+
+import torch.nn as nn
+
+from .resample import UpSample1d, DownSample1d
+
+
+class Activation1d(nn.Module):
+    def __init__(self,
+                 activation,
+                 up_ratio: int = 2,
+                 down_ratio: int = 2,
+                 up_kernel_size: int = 12,
+                 down_kernel_size: int = 12):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
+    # x: [B, C, T]
+    def forward(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+        return x
diff --git a/chatterbox/src/orator/models/bigvgan/alias_free_torch/filter.py b/chatterbox/src/orator/models/bigvgan/alias_free_torch/filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..066dce8eef9f31a868554f08efbef7c3f4422b7b
--- /dev/null
+++ b/chatterbox/src/orator/models/bigvgan/alias_free_torch/filter.py
@@ -0,0 +1,95 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+# LICENSE is in incl_licenses directory.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+if 'sinc' in dir(torch):
+    sinc = torch.sinc
+else:
+    # This code is adapted from adefossez's julius.core.sinc under the MIT License
+    # https://adefossez.github.io/julius/julius/core.html
+    # LICENSE is in incl_licenses directory.
+    def sinc(x: torch.Tensor):
+        """
+        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
+        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
+        """
+        return torch.where(x == 0,
+                           torch.tensor(1., device=x.device, dtype=x.dtype),
+                           torch.sin(math.pi * x) / math.pi / x)
+
+
+# This code is adapted from adefossez's julius.lowpass.LowPassFilters under the MIT License
+# https://adefossez.github.io/julius/julius/lowpass.html
+# LICENSE is in incl_licenses directory.
+def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):  # return filter [1,1,kernel_size]
+    even = (kernel_size % 2 == 0)
+    half_size = kernel_size // 2
+
+    # For Kaiser window
+    delta_f = 4 * half_width
+    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+    if A > 50.:
+        beta = 0.1102 * (A - 8.7)
+    elif A >= 21.:
+        beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.)
+    else:
+        beta = 0.
+    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
+
+    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
+    if even:
+        time = (torch.arange(-half_size, half_size) + 0.5)
+    else:
+        time = torch.arange(kernel_size) - half_size
+    if cutoff == 0:
+        filter_ = torch.zeros_like(time)
+    else:
+        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
+        # Normalize filter to have sum = 1, otherwise we will have a small leakage
+        # of the constant component in the input signal.
+        filter_ /= filter_.sum()
+    filter = filter_.view(1, 1, kernel_size)
+
+    return filter
+
+
+class LowPassFilter1d(nn.Module):
+    def __init__(self,
+                 cutoff=0.5,
+                 half_width=0.6,
+                 stride: int = 1,
+                 padding: bool = True,
+                 padding_mode: str = 'replicate',
+                 kernel_size: int = 12):
+        # kernel_size should be an even number for the stylegan3 setup;
+        # in this implementation, an odd number is also possible.
+        super().__init__()
+        if cutoff < 0.:
+            raise ValueError("Minimum cutoff must be non-negative.")
+        if cutoff > 0.5:
+            raise ValueError("A cutoff above 0.5 does not make sense.")
+        self.kernel_size = kernel_size
+        self.even = (kernel_size % 2 == 0)
+        self.pad_left = kernel_size // 2 - int(self.even)
+        self.pad_right = kernel_size // 2
+        self.stride = stride
+        self.padding = padding
+        self.padding_mode = padding_mode
+        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
+        self.register_buffer("filter", filter)
+
+    # input [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+
+        if self.padding:
+            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
+        out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
+
+        return out
diff --git a/chatterbox/src/orator/models/bigvgan/alias_free_torch/resample.py b/chatterbox/src/orator/models/bigvgan/alias_free_torch/resample.py
new file mode 100644
index 0000000000000000000000000000000000000000..73670db9735504a51231fbe93cb812f722fb74ae
--- /dev/null
+++ b/chatterbox/src/orator/models/bigvgan/alias_free_torch/resample.py
@@ -0,0 +1,55 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+# LICENSE is in incl_licenses directory.
+
+import torch.nn as nn
+from torch.nn import functional as F
+
+from .filter import LowPassFilter1d
+from .filter import kaiser_sinc_filter1d
+
+
+class UpSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        self.stride = ratio
+        self.pad = self.kernel_size // ratio - 1
+        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+        self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+        filter = kaiser_sinc_filter1d(
+            cutoff=0.5 / ratio,
+            half_width=0.6 / ratio,
+            kernel_size=self.kernel_size
+        )
+        self.register_buffer("filter", filter)
+
+    # x: [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+
+        x = F.pad(x, (self.pad, self.pad), mode='replicate')
+        x = self.ratio * F.conv_transpose1d(
+            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
+        )
+        x = x[..., self.pad_left:-self.pad_right]
+
+        return x
+
+
+class DownSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        self.lowpass = LowPassFilter1d(
+            cutoff=0.5 / ratio,
+            half_width=0.6 / ratio,
+            stride=ratio,
+            kernel_size=self.kernel_size
+        )
+
+    def forward(self, x):
+        xx = self.lowpass(x)
+
+        return xx
diff --git a/chatterbox/src/orator/models/bigvgan/bigvgan.py b/chatterbox/src/orator/models/bigvgan/bigvgan.py
new file mode 100644
index 0000000000000000000000000000000000000000..356142106f6c91b0cd4c8db4ec28e04811e8e1ef
--- /dev/null
+++ b/chatterbox/src/orator/models/bigvgan/bigvgan.py
@@ -0,0 +1,214 @@
+# Copyright (c) 2022 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+# LICENSE is in incl_licenses directory.
+
+import logging
+import torch
+import torch.nn as nn
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils import weight_norm, remove_weight_norm
+from torch.nn.utils.weight_norm import WeightNorm
+
+from .activations import SnakeBeta
+from .alias_free_torch import *
+
+
+
+LRELU_SLOPE = 0.1
+
+logger = logging.getLogger(__name__)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size*dilation - dilation)/2)
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+class AMPBlock1(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(AMPBlock1, self).__init__()
+
+        self.convs1 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                               padding=get_padding(kernel_size, dilation[2])))
+        ])
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)))
+        ])
+        self.convs2.apply(init_weights)
+
+        self.num_layers = len(self.convs1) + len(self.convs2)  # total number of conv layers
+
+        self.activations = nn.ModuleList([
+            Activation1d(activation=SnakeBeta(channels, alpha_logscale=True))
+            for _ in range(self.num_layers)
+        ])
+
+    def forward(self, x):
+        acts1, acts2 = self.activations[::2], self.activations[1::2]
+        for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
+            xt = a1(x)
+            xt = c1(xt)
+            xt = a2(xt)
+            xt = c2(xt)
+            x = xt + x
+
+        return x
+
+    def set_weight_norm(self, enabled: bool):
+        weight_norm_fn = weight_norm if enabled else remove_weight_norm
+        for l in self.convs1:
+            weight_norm_fn(l)
+        for l in self.convs2:
+            weight_norm_fn(l)
+
+
+class BigVGAN(nn.Module):
+    # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks.
+
+    # We've got a model in prod that has the wrong hparams for this. It's simpler to add this check than to
+    # redistribute the model.
+    ignore_state_dict_unexpected = ("cond_layer.*",)
+
+    def __init__(self):
+        super().__init__()
+
+        input_dims = 80
+
+        upsample_rates = [10, 8, 4, 2]
+        upsample_kernel_sizes = [x * 2 for x in upsample_rates]
+        upsample_initial_channel = 1024
+
+        resblock_kernel_sizes = [3, 7, 11]
+        resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+
+        # pre conv
+        self.conv_pre = weight_norm(Conv1d(input_dims, upsample_initial_channel, 7, 1, padding=3))
+        self.cond_layer = None
+
+        # transposed conv-based upsamplers. does not apply anti-aliasing
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(nn.ModuleList([
+                weight_norm(ConvTranspose1d(upsample_initial_channel // (2 ** i),
+                                            upsample_initial_channel // (2 ** (i + 1)),
+                                            k, u, padding=(k - u) // 2))
+            ]))
+
+        # residual blocks using anti-aliased multi-periodicity composition modules (AMP)
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(AMPBlock1(ch, k, d))
+
+        # post conv
+        activation_post = SnakeBeta(ch, alpha_logscale=True)
+        self.activation_post = Activation1d(activation=activation_post)
+        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+
+        # weight initialization
+        for i in range(len(self.ups)):
+            self.ups[i].apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+    def forward(self, x) -> torch.Tensor:
+        """
+        Args
+        ----
+        x: torch.Tensor of shape [B, C, T]
+        """
+        with torch.inference_mode():
+
+            x = self.conv_pre(x)
+
+            for i in range(self.num_upsamples):
+                # upsampling
+                for i_up in range(len(self.ups[i])):
+                    x = self.ups[i][i_up](x)
+                # AMP blocks
+                xs = None
+                for j in range(self.num_kernels):
+                    if xs is None:
+                        xs = self.resblocks[i * self.num_kernels + j](x)
+                    else:
+                        xs += self.resblocks[i * self.num_kernels + j](x)
+                x = xs / self.num_kernels
+
+            # post conv
+            x = self.activation_post(x)
+            x = self.conv_post(x)
+
+            # Bound the output to [-1, 1]
+            x = torch.tanh(x)
+
+        return x
+
+    @property
+    def weight_norm_enabled(self) -> bool:
+        return any(
+            isinstance(hook, WeightNorm) and hook.name == "weight"
+            for k, hook in self.conv_pre._forward_pre_hooks.items()
+        )
+
+    def set_weight_norm(self, enabled: bool):
+        """
+        N.B.: weight norm modifies the state dict, causing incompatibilities. Conventions:
+        - BigVGAN runs with weight norm for training, without for inference (done automatically by instantiate())
+        - All checkpoints are saved with weight norm (allows resuming training)
+        """
+        if enabled != self.weight_norm_enabled:
+            weight_norm_fn = weight_norm if enabled else remove_weight_norm
+            logger.debug(f"{'Applying' if enabled else 'Removing'} weight norm...")
+
+            for l in self.ups:
+                for l_i in l:
+                    weight_norm_fn(l_i)
+            for l in self.resblocks:
+                l.set_weight_norm(enabled)
+            weight_norm_fn(self.conv_pre)
+            weight_norm_fn(self.conv_post)
+
+    def train_mode(self):
+        self.train()
+        self.set_weight_norm(enabled=True)
+
+    def inference_mode(self):
+        self.eval()
+        self.set_weight_norm(enabled=False)
+
+
+if __name__ == '__main__':
+    import sys
+    import soundfile as sf
+    model = BigVGAN()
+
+    state_dict = torch.load("bigvgan32k.pt")
+    msg = model.load_state_dict(state_dict)
+    model.eval()
+    model.set_weight_norm(enabled=False)
+
+    print(msg)
+    mels = torch.load("mels.pt")
+    with torch.inference_mode():
+        y = model(mels.cpu())
+
+    for i, wav in enumerate(y):
+        wav = wav.view(-1).detach().numpy()
+        sf.write(f"bigvgan_test{i}.flac", wav, samplerate=32_000, format="FLAC")
diff --git a/chatterbox/src/chatterbox/models/s3gen/__init__.py b/chatterbox/src/orator/models/s3gen/__init__.py
similarity index 100%
rename from chatterbox/src/chatterbox/models/s3gen/__init__.py
rename to chatterbox/src/orator/models/s3gen/__init__.py
diff --git a/chatterbox/src/orator/models/s3gen/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88f5061c35b85e71863ad378316c847fa62a8098
Binary files /dev/null and b/chatterbox/src/orator/models/s3gen/__pycache__/__init__.cpython-311.pyc differ
diff --git a/chatterbox/src/orator/models/s3gen/__pycache__/const.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/__pycache__/const.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..672a9ede71d20bb648c0aab8763d53b9febe8419
Binary files /dev/null and b/chatterbox/src/orator/models/s3gen/__pycache__/const.cpython-311.pyc differ
diff --git a/chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/__pycache__/decoder.cpython-311.pyc
similarity index 94%
rename from chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-311.pyc
rename to chatterbox/src/orator/models/s3gen/__pycache__/decoder.cpython-311.pyc
index 5e701bf377262d87ddba74bd24fa03cd7686d509..29795369200d8523c3fab0df7b6ac3aa039bd11f 100644
Binary files a/chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/__pycache__/decoder.cpython-311.pyc differ
diff --git a/chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc
similarity index 91%
rename from chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc
rename to chatterbox/src/orator/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc
index fd73014e5b78d03e6969798838842f199251d849..895be0183fd8cdcf048e83a67240111dd69cb560 100644
Binary files a/chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc differ
diff --git
a/chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/__pycache__/flow.cpython-311.pyc similarity index 97% rename from chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/__pycache__/flow.cpython-311.pyc index 879a98004b4eaf5820b2633125801fe5c515fe3d..aaf91646a4f6565053ec61efdf463e9644f80f47 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/__pycache__/flow.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/__pycache__/flow_matching.cpython-311.pyc similarity index 98% rename from chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/__pycache__/flow_matching.cpython-311.pyc index c3645b02e56bfcac71736e4c8ef955a2a8beb9f1..5fc5d3678bcecf22eb08f3a791c4bf3e6c73a206 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/__pycache__/flow_matching.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/__pycache__/hifigan.cpython-311.pyc similarity index 98% rename from chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/__pycache__/hifigan.cpython-311.pyc index 328bd2ed37a93ad7fa48d4828f4a760d751d56cb..6251caed029bb870899fe3df134a92c91db64b8b 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/__pycache__/hifigan.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/__pycache__/s3gen.cpython-311.pyc similarity index 85% rename from chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/__pycache__/s3gen.cpython-311.pyc index b080467f33435838d82b9c4c34811ec05cfc2fc3..3b2350c12e4e99d1ed1a08f71a677cd9e3e64a19 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/__pycache__/s3gen.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/__pycache__/xvector.cpython-311.pyc similarity index 98% rename from chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/__pycache__/xvector.cpython-311.pyc index 06021ffe2d09b7335c2175b7154c6f9ca1662e03..6e75c087040de062e0fa3b9b9428f566be50e18d 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/__pycache__/xvector.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/const.py b/chatterbox/src/orator/models/s3gen/const.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/const.py rename to chatterbox/src/orator/models/s3gen/const.py diff --git a/chatterbox/src/chatterbox/models/s3gen/decoder.py b/chatterbox/src/orator/models/s3gen/decoder.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/decoder.py rename to 
chatterbox/src/orator/models/s3gen/decoder.py diff --git a/chatterbox/src/chatterbox/models/s3gen/f0_predictor.py b/chatterbox/src/orator/models/s3gen/f0_predictor.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/f0_predictor.py rename to chatterbox/src/orator/models/s3gen/f0_predictor.py diff --git a/chatterbox/src/chatterbox/models/s3gen/flow.py b/chatterbox/src/orator/models/s3gen/flow.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/flow.py rename to chatterbox/src/orator/models/s3gen/flow.py diff --git a/chatterbox/src/chatterbox/models/s3gen/flow_matching.py b/chatterbox/src/orator/models/s3gen/flow_matching.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/flow_matching.py rename to chatterbox/src/orator/models/s3gen/flow_matching.py diff --git a/chatterbox/src/chatterbox/models/s3gen/hifigan.py b/chatterbox/src/orator/models/s3gen/hifigan.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/hifigan.py rename to chatterbox/src/orator/models/s3gen/hifigan.py diff --git a/chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc similarity index 97% rename from chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc index 3a5e1329dfb117a8952dc604167ccb837f286557..373a13ae0c375f07787bb316a2d8af87f378ce66 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc similarity index 96% rename from chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc index f1a843deeb409e56e0d835e895dc0878cfc15c63..e536d592d56b8e4dc8c5189038911326be7d1cfd 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc similarity index 96% rename from chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc index 72d877a95889c065706aeacb7ca0cde0bf16e214..3d5180852c8d04652cbabeca757b461e27d01475 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/matcha/decoder.py b/chatterbox/src/orator/models/s3gen/matcha/decoder.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/matcha/decoder.py rename to chatterbox/src/orator/models/s3gen/matcha/decoder.py diff --git a/chatterbox/src/chatterbox/models/s3gen/matcha/flow_matching.py b/chatterbox/src/orator/models/s3gen/matcha/flow_matching.py similarity 
index 100%
rename from chatterbox/src/chatterbox/models/s3gen/matcha/flow_matching.py
rename to chatterbox/src/orator/models/s3gen/matcha/flow_matching.py
diff --git a/chatterbox/src/chatterbox/models/s3gen/matcha/text_encoder.py b/chatterbox/src/orator/models/s3gen/matcha/text_encoder.py
similarity index 100%
rename from chatterbox/src/chatterbox/models/s3gen/matcha/text_encoder.py
rename to chatterbox/src/orator/models/s3gen/matcha/text_encoder.py
diff --git a/chatterbox/src/chatterbox/models/s3gen/matcha/transformer.py b/chatterbox/src/orator/models/s3gen/matcha/transformer.py
similarity index 100%
rename from chatterbox/src/chatterbox/models/s3gen/matcha/transformer.py
rename to chatterbox/src/orator/models/s3gen/matcha/transformer.py
diff --git a/chatterbox/src/chatterbox/models/s3gen/s3gen.py b/chatterbox/src/orator/models/s3gen/s3gen.py
similarity index 99%
rename from chatterbox/src/chatterbox/models/s3gen/s3gen.py
rename to chatterbox/src/orator/models/s3gen/s3gen.py
index 97b7c0bd40ad6cd258ca3c4bd4ae752c78f28b19..f04471c8b92d4d4dcedeecd94f738ed1a8f10b9d 100644
--- a/chatterbox/src/chatterbox/models/s3gen/s3gen.py
+++ b/chatterbox/src/orator/models/s3gen/s3gen.py
@@ -294,9 +294,8 @@ class S3Token2Wav(S3Token2Mel):
         # pre-computed ref embedding (prod API)
         ref_dict: Optional[dict] = None,
         cache_source: torch.Tensor = None,  # NOTE: this arg is for streaming, it can probably be removed here
-        finalize: bool = True,
     ):
-        output_mels = self.flow_inference(speech_tokens, ref_wav=ref_wav, ref_sr=ref_sr, ref_dict=ref_dict, finalize=finalize)
+        output_mels = self.flow_inference(speech_tokens, ref_wav=ref_wav, ref_sr=ref_sr, ref_dict=ref_dict)
         output_wavs, output_sources = self.hift_inference(output_mels, cache_source)
 
         # NOTE: ad-hoc method to reduce "spillover" from the reference clip.
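A quick orientation for the hunk above: with `finalize` gone from `S3Token2Wav.inference`, callers pass speech tokens plus either a raw reference clip (`ref_wav`/`ref_sr`) or a pre-computed `ref_dict`, and the flag is no longer forwarded to `flow_inference`. A minimal sketch of the updated call follows; the model construction, token values, and sample rate are illustrative assumptions, not anything shown in this diff:

import torch
from orator.models.s3gen import S3Token2Wav  # import path assumed from the renamed package layout

model = S3Token2Wav()  # hypothetical; real usage would load trained weights first
speech_tokens = torch.randint(0, 100, (1, 200))  # placeholder S3 speech-token ids
ref_wav = torch.randn(1, 24_000)                 # placeholder 1 s reference clip (sample rate assumed)
with torch.inference_mode():
    out = model.inference(speech_tokens, ref_wav=ref_wav, ref_sr=24_000)  # no finalize kwarg anymore

The return value is not visible in this hunk (internally it is computed from `hift_inference`'s `output_wavs`/`output_sources`), so `out` is left unpacked here.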
diff --git a/chatterbox/src/orator/models/s3gen/transformer/__init__.py b/chatterbox/src/orator/models/s3gen/transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/chatterbox/src/orator/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c7e9096456a5b215a610226e1da38c102f7bbfc Binary files /dev/null and b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc similarity index 93% rename from chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc index 8f848b9907877010c24348013797d3e97753c8fb..4a5a3bd3092eb9573a5d20952dbedd864c2655b8 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc similarity index 93% rename from chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc index 7d9e4dc70a3561f2301a2ba8b377c724fa4be8cf..9e01aa5f75862f99299f319f52b2e82367b4a47b 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc similarity index 94% rename from chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc index 8795dbf9016971e5f195d218af9b888c7901acbb..b47b3e98dbfe5bdc0c8a42658d163fea720ada03 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc similarity index 94% rename from chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc index de37602db521e7cfd6a17fd8d0ee3c5e414bc153..d5d20c7f0ef144113c41f31ad3f3f0b4a1b27bb6 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc differ diff --git 
a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc similarity index 97% rename from chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc index 10235b5da58816ae1da187e5a063f8b4d9d72cfb..2139d256ab5aaa4ca37209dedec6ba9409b646c3 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc similarity index 90% rename from chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc index 07669f23137a230571a3d4dba8ede1a98dac389b..218dab46968816f426581e08b0f831541befb63c 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc similarity index 99% rename from chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc index 4d38fbe5b378fdf466b38c943af2597aa8b05081..7eaf8de2e1a5b43dbbefa4e81eee8b1c2b6772e6 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc similarity index 97% rename from chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc index 9caff02c02acd742bfda0fcc2545570118c95806..a0907d3745ab61d6424a600471679e23e4f34558 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/activation.py b/chatterbox/src/orator/models/s3gen/transformer/activation.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/transformer/activation.py rename to chatterbox/src/orator/models/s3gen/transformer/activation.py diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/attention.py b/chatterbox/src/orator/models/s3gen/transformer/attention.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/transformer/attention.py rename to 
chatterbox/src/orator/models/s3gen/transformer/attention.py diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/convolution.py b/chatterbox/src/orator/models/s3gen/transformer/convolution.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/transformer/convolution.py rename to chatterbox/src/orator/models/s3gen/transformer/convolution.py diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/embedding.py b/chatterbox/src/orator/models/s3gen/transformer/embedding.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/transformer/embedding.py rename to chatterbox/src/orator/models/s3gen/transformer/embedding.py diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/encoder_layer.py b/chatterbox/src/orator/models/s3gen/transformer/encoder_layer.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/transformer/encoder_layer.py rename to chatterbox/src/orator/models/s3gen/transformer/encoder_layer.py diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/positionwise_feed_forward.py b/chatterbox/src/orator/models/s3gen/transformer/positionwise_feed_forward.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/transformer/positionwise_feed_forward.py rename to chatterbox/src/orator/models/s3gen/transformer/positionwise_feed_forward.py diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/subsampling.py b/chatterbox/src/orator/models/s3gen/transformer/subsampling.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/transformer/subsampling.py rename to chatterbox/src/orator/models/s3gen/transformer/subsampling.py diff --git a/chatterbox/src/chatterbox/models/s3gen/transformer/upsample_encoder.py b/chatterbox/src/orator/models/s3gen/transformer/upsample_encoder.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/transformer/upsample_encoder.py rename to chatterbox/src/orator/models/s3gen/transformer/upsample_encoder.py diff --git a/chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc similarity index 67% rename from chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc index 939d19a5c95c1f7ef0cef0186ce04f3ef2aed5d4..e7d832f2454549011d5a4cdec48a9c7f924ce34c 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mask.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/utils/__pycache__/mask.cpython-311.pyc similarity index 96% rename from chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mask.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/utils/__pycache__/mask.cpython-311.pyc index f373f74f3e44a9122fdebf9778d018952b41df5b..d2cfc900413b9c3e584855e9226bb3380bd9c2b4 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mask.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/utils/__pycache__/mask.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mel.cpython-311.pyc b/chatterbox/src/orator/models/s3gen/utils/__pycache__/mel.cpython-311.pyc similarity index 92% rename from 
chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mel.cpython-311.pyc rename to chatterbox/src/orator/models/s3gen/utils/__pycache__/mel.cpython-311.pyc index ce5a93aa98a343d3d1d95cd18430a01a18344ef8..b6a5958dfa10225bf5c28d6efe00ef531fc7bb44 100644 Binary files a/chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mel.cpython-311.pyc and b/chatterbox/src/orator/models/s3gen/utils/__pycache__/mel.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3gen/utils/class_utils.py b/chatterbox/src/orator/models/s3gen/utils/class_utils.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/utils/class_utils.py rename to chatterbox/src/orator/models/s3gen/utils/class_utils.py diff --git a/chatterbox/src/chatterbox/models/s3gen/utils/mask.py b/chatterbox/src/orator/models/s3gen/utils/mask.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/utils/mask.py rename to chatterbox/src/orator/models/s3gen/utils/mask.py diff --git a/chatterbox/src/chatterbox/models/s3gen/utils/mel.py b/chatterbox/src/orator/models/s3gen/utils/mel.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/utils/mel.py rename to chatterbox/src/orator/models/s3gen/utils/mel.py diff --git a/chatterbox/src/chatterbox/models/s3gen/xvector.py b/chatterbox/src/orator/models/s3gen/xvector.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3gen/xvector.py rename to chatterbox/src/orator/models/s3gen/xvector.py diff --git a/chatterbox/src/chatterbox/models/s3tokenizer/__init__.py b/chatterbox/src/orator/models/s3tokenizer/__init__.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3tokenizer/__init__.py rename to chatterbox/src/orator/models/s3tokenizer/__init__.py diff --git a/chatterbox/src/orator/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/orator/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..255a81bf59d934c1949073746742e34e26aa3c27 Binary files /dev/null and b/chatterbox/src/orator/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc b/chatterbox/src/orator/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc similarity index 97% rename from chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc rename to chatterbox/src/orator/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc index da1ed4d56a5a1d22a67e42a2a1ba04f55c325ee9..68cf1b224c42e7ce00561e1a54312f897d3445cc 100644 Binary files a/chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc and b/chatterbox/src/orator/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/s3tokenizer/s3tokenizer.py b/chatterbox/src/orator/models/s3tokenizer/s3tokenizer.py similarity index 100% rename from chatterbox/src/chatterbox/models/s3tokenizer/s3tokenizer.py rename to chatterbox/src/orator/models/s3tokenizer/s3tokenizer.py diff --git a/chatterbox/src/chatterbox/models/t3/__init__.py b/chatterbox/src/orator/models/t3/__init__.py similarity index 100% rename from chatterbox/src/chatterbox/models/t3/__init__.py rename to chatterbox/src/orator/models/t3/__init__.py diff --git a/chatterbox/src/orator/models/t3/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/orator/models/t3/__pycache__/__init__.cpython-311.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..e247c27c21bc95394fa4ba3f05a163036a4e51bf Binary files /dev/null and b/chatterbox/src/orator/models/t3/__pycache__/__init__.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/t3/__pycache__/llama_configs.cpython-311.pyc b/chatterbox/src/orator/models/t3/__pycache__/llama_configs.cpython-311.pyc similarity index 84% rename from chatterbox/src/chatterbox/models/t3/__pycache__/llama_configs.cpython-311.pyc rename to chatterbox/src/orator/models/t3/__pycache__/llama_configs.cpython-311.pyc index 2d6a50dd84e27275a22504110f26bfdf7e8099f9..b35f1fa530b6b5e4081cb274aeb654787567a924 100644 Binary files a/chatterbox/src/chatterbox/models/t3/__pycache__/llama_configs.cpython-311.pyc and b/chatterbox/src/orator/models/t3/__pycache__/llama_configs.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/t3/__pycache__/t3.cpython-311.pyc b/chatterbox/src/orator/models/t3/__pycache__/t3.cpython-311.pyc similarity index 55% rename from chatterbox/src/chatterbox/models/t3/__pycache__/t3.cpython-311.pyc rename to chatterbox/src/orator/models/t3/__pycache__/t3.cpython-311.pyc index aea80eecaa5e6a52e982c95459ad9092bf5e6f3c..9e24ddd71d0cbb1d7bce1e8d136658fd167b3a33 100644 Binary files a/chatterbox/src/chatterbox/models/t3/__pycache__/t3.cpython-311.pyc and b/chatterbox/src/orator/models/t3/__pycache__/t3.cpython-311.pyc differ diff --git a/chatterbox/src/orator/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc b/chatterbox/src/orator/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2afa71a963d361037cfe1bce9c2628c2b7e4272 Binary files /dev/null and b/chatterbox/src/orator/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/t3/inference/alignment_stream_analyzer.py b/chatterbox/src/orator/models/t3/inference/alignment_stream_analyzer.py similarity index 100% rename from chatterbox/src/chatterbox/models/t3/inference/alignment_stream_analyzer.py rename to chatterbox/src/orator/models/t3/inference/alignment_stream_analyzer.py diff --git a/chatterbox/src/chatterbox/models/t3/inference/t3_hf_backend.py b/chatterbox/src/orator/models/t3/inference/t3_hf_backend.py similarity index 92% rename from chatterbox/src/chatterbox/models/t3/inference/t3_hf_backend.py rename to chatterbox/src/orator/models/t3/inference/t3_hf_backend.py index 6130722ce967b5d82dc0ca29390fb21748358424..8d2b175093074e9b8a566ce02a807de9804160a0 100644 --- a/chatterbox/src/chatterbox/models/t3/inference/t3_hf_backend.py +++ b/chatterbox/src/orator/models/t3/inference/t3_hf_backend.py @@ -23,14 +23,14 @@ class T3HuggingfaceBackend(LlamaPreTrainedModel, GenerationMixin): speech_head, latents_queue=None, logits_queue=None, - alignment_stream_analyzer: 'AlignmentStreamAnalyzer'=None, ): super().__init__(config) self.model = llama self.speech_enc = speech_enc self.speech_head = speech_head + self.latents_queue = latents_queue + self.logits_queue = logits_queue self._added_cond = False - self.alignment_stream_analyzer = alignment_stream_analyzer @torch.inference_mode() def prepare_inputs_for_generation( @@ -101,12 +101,12 @@ class T3HuggingfaceBackend(LlamaPreTrainedModel, GenerationMixin): return_dict=True, ) hidden_states = tfmr_out.hidden_states[-1] # (B, seq, dim) + if self.latents_queue is not None: + self.latents_queue.put(hidden_states) logits = self.speech_head(hidden_states) - assert inputs_embeds.size(0) == 1 - 
- # NOTE: hallucination handler may modify logits to force emit an EOS token - logits = self.alignment_stream_analyzer.step(logits) + if self.logits_queue is not None: + self.logits_queue.put(logits) return CausalLMOutputWithCrossAttentions( logits=logits, diff --git a/chatterbox/src/chatterbox/models/t3/llama_configs.py b/chatterbox/src/orator/models/t3/llama_configs.py similarity index 100% rename from chatterbox/src/chatterbox/models/t3/llama_configs.py rename to chatterbox/src/orator/models/t3/llama_configs.py diff --git a/chatterbox/src/chatterbox/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc b/chatterbox/src/orator/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc similarity index 90% rename from chatterbox/src/chatterbox/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc rename to chatterbox/src/orator/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc index a6ff1b0a834044aa78a2f2da404a694d6342f1fe..cb7d6252fd9be8b2d77abbda8ce567a03afcde0b 100644 Binary files a/chatterbox/src/chatterbox/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc and b/chatterbox/src/orator/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc b/chatterbox/src/orator/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc similarity index 82% rename from chatterbox/src/chatterbox/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc rename to chatterbox/src/orator/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc index 8e0cf937c1e994a79e9234d1ee8abb7d67cc8d45..9b04a7e5021ba0247d540342a5c8ec1a9436149a 100644 Binary files a/chatterbox/src/chatterbox/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc and b/chatterbox/src/orator/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/t3/modules/__pycache__/perceiver.cpython-311.pyc b/chatterbox/src/orator/models/t3/modules/__pycache__/perceiver.cpython-311.pyc similarity index 79% rename from chatterbox/src/chatterbox/models/t3/modules/__pycache__/perceiver.cpython-311.pyc rename to chatterbox/src/orator/models/t3/modules/__pycache__/perceiver.cpython-311.pyc index d22f01242efbbd7e047eb0543671ed972d5e9954..e458b05d8d2bb30591b4a900458b4eaabe3ba92d 100644 Binary files a/chatterbox/src/chatterbox/models/t3/modules/__pycache__/perceiver.cpython-311.pyc and b/chatterbox/src/orator/models/t3/modules/__pycache__/perceiver.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/t3/modules/__pycache__/t3_config.cpython-311.pyc b/chatterbox/src/orator/models/t3/modules/__pycache__/t3_config.cpython-311.pyc similarity index 84% rename from chatterbox/src/chatterbox/models/t3/modules/__pycache__/t3_config.cpython-311.pyc rename to chatterbox/src/orator/models/t3/modules/__pycache__/t3_config.cpython-311.pyc index ce09feda6547f5c107e1a376b3898b84b2ffbfb3..20efe2507bc50ce6de730d0d301a7abc255ca974 100644 Binary files a/chatterbox/src/chatterbox/models/t3/modules/__pycache__/t3_config.cpython-311.pyc and b/chatterbox/src/orator/models/t3/modules/__pycache__/t3_config.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/t3/modules/cond_enc.py b/chatterbox/src/orator/models/t3/modules/cond_enc.py similarity index 100% rename from chatterbox/src/chatterbox/models/t3/modules/cond_enc.py rename to chatterbox/src/orator/models/t3/modules/cond_enc.py diff --git a/chatterbox/src/chatterbox/models/t3/modules/learned_pos_emb.py 
b/chatterbox/src/orator/models/t3/modules/learned_pos_emb.py similarity index 100% rename from chatterbox/src/chatterbox/models/t3/modules/learned_pos_emb.py rename to chatterbox/src/orator/models/t3/modules/learned_pos_emb.py diff --git a/chatterbox/src/chatterbox/models/t3/modules/perceiver.py b/chatterbox/src/orator/models/t3/modules/perceiver.py similarity index 98% rename from chatterbox/src/chatterbox/models/t3/modules/perceiver.py rename to chatterbox/src/orator/models/t3/modules/perceiver.py index be9c5b863ce43ab43c0124a8ae0fa125b0da9673..eaa4b87c65832d380741c332c0a8288a4e8a9854 100644 --- a/chatterbox/src/chatterbox/models/t3/modules/perceiver.py +++ b/chatterbox/src/orator/models/t3/modules/perceiver.py @@ -1,6 +1,3 @@ -# Copyright (c) 2025 Resemble AI -# Author: Manmay Nakhashi -# MIT License import math import torch @@ -171,7 +168,6 @@ class AttentionBlock2(nn.Module): class Perceiver(nn.Module): - """Inspired by https://arxiv.org/abs/2103.03206""" def __init__(self, pre_attention_query_token=32, pre_attention_query_size=1024, embedding_dim=1024, num_attn_heads=4): """ Initialize the perceiver module. diff --git a/chatterbox/src/chatterbox/models/t3/modules/t3_config.py b/chatterbox/src/orator/models/t3/modules/t3_config.py similarity index 100% rename from chatterbox/src/chatterbox/models/t3/modules/t3_config.py rename to chatterbox/src/orator/models/t3/modules/t3_config.py diff --git a/chatterbox/src/chatterbox/models/t3/t3.py b/chatterbox/src/orator/models/t3/t3.py similarity index 94% rename from chatterbox/src/chatterbox/models/t3/t3.py rename to chatterbox/src/orator/models/t3/t3.py index d1af8a1c07293748f6f7874d76f3aabfabde1633..39978dfa8588f8d7bbcf2ea639c739119503708e 100644 --- a/chatterbox/src/chatterbox/models/t3/t3.py +++ b/chatterbox/src/orator/models/t3/t3.py @@ -1,5 +1,3 @@ -# Copyright (c) 2025 Resemble AI -# MIT License import logging from typing import Union, Optional, List @@ -12,9 +10,8 @@ from .modules.learned_pos_emb import LearnedPositionEmbeddings from .modules.cond_enc import T3CondEnc, T3Cond from .modules.t3_config import T3Config -from .llama_configs import LLAMA_CONFIGS from .inference.t3_hf_backend import T3HuggingfaceBackend -from .inference.alignment_stream_analyzer import AlignmentStreamAnalyzer +from .llama_configs import LLAMA_CONFIGS logger = logging.getLogger(__name__) @@ -224,6 +221,9 @@ class T3(nn.Module): """ Args: text_tokens: a 1D (unbatched) or 2D (batched) tensor. + tokens_queue: if a ReferenceQueue is provided, tokens will be streamed to it during generation. + latents_queue: if a ReferenceQueue is provided, latents will be streamed to it during generation. + logits_queue: if a ReferenceQueue is provided, logits will be streamed to it during generation. """ # Validate / sanitize inputs assert prepend_prompt_speech_tokens is None, "not implemented" @@ -235,7 +235,7 @@ class T3(nn.Module): initial_speech_tokens = self.hp.start_speech_token * torch.ones_like(text_tokens[:, :1]) # Prepare custom input embeds - embeds, len_cond = self.prepare_input_embeds( + embeds, _ = self.prepare_input_embeds( t3_cond=t3_cond, text_tokens=text_tokens, speech_tokens=initial_speech_tokens, @@ -249,19 +249,11 @@ class T3(nn.Module): # TODO? synchronize the expensive compile function # with self.compile_lock: if not self.compiled: - alignment_stream_analyzer = AlignmentStreamAnalyzer( - self.tfmr, - None, - text_tokens_slice=(len_cond, len_cond + text_tokens.size(-1)), - alignment_layer_idx=9, # TODO: hparam or something? 
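# --- Illustrative sketch (not part of the diff) -----------------------------
# The perceiver.py hunk earlier in this diff only strips the license header
# and the "Inspired by https://arxiv.org/abs/2103.03206" docstring; the class
# itself remains a Perceiver-style resampler: a fixed bank of learned queries
# cross-attends over a variable-length feature sequence and returns a
# fixed-length summary. This standalone toy version shows the idea; it is not
# the project's actual Perceiver implementation.
import torch
from torch import nn

class TinyPerceiverResampler(nn.Module):
    def __init__(self, dim=1024, n_queries=32, n_heads=4):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(1, n_queries, dim) * 0.02)
        self.attn = nn.MultiheadAttention(dim, n_heads, batch_first=True)

    def forward(self, x):  # x: (B, T, dim) with arbitrary T
        q = self.queries.expand(x.size(0), -1, -1)
        out, _ = self.attn(q, x, x)  # cross-attention: learned queries attend to x
        return out  # (B, n_queries, dim)

summary = TinyPerceiverResampler()(torch.randn(2, 173, 1024))  # -> (2, 32, 1024)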
- eos_idx=self.hp.stop_speech_token, - ) patched_model = T3HuggingfaceBackend( config=self.cfg, llama=self.tfmr, speech_enc=self.speech_emb, speech_head=self.speech_head, - alignment_stream_analyzer=alignment_stream_analyzer, ) self.patched_model = patched_model self.compiled = True diff --git a/chatterbox/src/chatterbox/models/tokenizers/__init__.py b/chatterbox/src/orator/models/tokenizers/__init__.py similarity index 100% rename from chatterbox/src/chatterbox/models/tokenizers/__init__.py rename to chatterbox/src/orator/models/tokenizers/__init__.py diff --git a/chatterbox/src/orator/models/tokenizers/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/orator/models/tokenizers/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd5ac112494eb3e7738603889ad9a6bee5fdd797 Binary files /dev/null and b/chatterbox/src/orator/models/tokenizers/__pycache__/__init__.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc b/chatterbox/src/orator/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc similarity index 84% rename from chatterbox/src/chatterbox/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc rename to chatterbox/src/orator/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc index bcec1c1dcfdb1d46a845dfa717e20b0e08cc91ab..07fc2f136c46382845e665b149a87899838a8006 100644 Binary files a/chatterbox/src/chatterbox/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc and b/chatterbox/src/orator/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/tokenizers/tokenizer.py b/chatterbox/src/orator/models/tokenizers/tokenizer.py similarity index 100% rename from chatterbox/src/chatterbox/models/tokenizers/tokenizer.py rename to chatterbox/src/orator/models/tokenizers/tokenizer.py diff --git a/chatterbox/src/chatterbox/models/voice_encoder/__init__.py b/chatterbox/src/orator/models/voice_encoder/__init__.py similarity index 100% rename from chatterbox/src/chatterbox/models/voice_encoder/__init__.py rename to chatterbox/src/orator/models/voice_encoder/__init__.py diff --git a/chatterbox/src/orator/models/voice_encoder/__pycache__/__init__.cpython-311.pyc b/chatterbox/src/orator/models/voice_encoder/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27bbeb63c80ebbbbf6322aa2e826f35413a644e8 Binary files /dev/null and b/chatterbox/src/orator/models/voice_encoder/__pycache__/__init__.cpython-311.pyc differ diff --git a/chatterbox/src/orator/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc b/chatterbox/src/orator/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97cb25025e63e3e20f23f9ca50da46a6c0c7f4f0 Binary files /dev/null and b/chatterbox/src/orator/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc differ diff --git a/chatterbox/src/chatterbox/models/voice_encoder/config.py b/chatterbox/src/orator/models/voice_encoder/config.py similarity index 100% rename from chatterbox/src/chatterbox/models/voice_encoder/config.py rename to chatterbox/src/orator/models/voice_encoder/config.py diff --git a/chatterbox/src/chatterbox/models/voice_encoder/melspec.py b/chatterbox/src/orator/models/voice_encoder/melspec.py similarity index 100% rename from chatterbox/src/chatterbox/models/voice_encoder/melspec.py rename to 
chatterbox/src/orator/models/voice_encoder/melspec.py diff --git a/chatterbox/src/chatterbox/models/voice_encoder/voice_encoder.py b/chatterbox/src/orator/models/voice_encoder/voice_encoder.py similarity index 84% rename from chatterbox/src/chatterbox/models/voice_encoder/voice_encoder.py rename to chatterbox/src/orator/models/voice_encoder/voice_encoder.py index d986f17fd6afab59364863b5e92fd56eec21236b..afa10ba40731fadc621d0415ec43cf07ec931f54 100644 --- a/chatterbox/src/chatterbox/models/voice_encoder/voice_encoder.py +++ b/chatterbox/src/orator/models/voice_encoder/voice_encoder.py @@ -1,54 +1,37 @@ # Adapted from https://github.com/CorentinJ/Real-Time-Voice-Cloning # MIT License + from typing import List, Union, Optional import numpy as np from numpy.lib.stride_tricks import as_strided import librosa +from librosa import resample import torch import torch.nn.functional as F from torch import nn, Tensor -from .config import VoiceEncConfig -from .melspec import melspectrogram - - -def pack(arrays, seq_len: int=None, pad_value=0): - """ - Given a list of length B of array-like objects of shapes (Ti, ...), packs them in a single tensor of - shape (B, T, ...) by padding each individual array on the right. - - :param arrays: a list of array-like objects of matching shapes except for the first axis. - :param seq_len: the value of T. It must be the maximum of the lengths Ti of the arrays at - minimum. Will default to that value if None. - :param pad_value: the value to pad the arrays with. - :return: a (B, T, ...) tensor - """ - if seq_len is None: - seq_len = max(len(array) for array in arrays) - else: - assert seq_len >= max(len(array) for array in arrays) - - # Convert lists to np.array - if isinstance(arrays[0], list): - arrays = [np.array(array) for array in arrays] - - # Convert to tensor and handle device - device = None - if isinstance(arrays[0], torch.Tensor): - tensors = arrays - device = tensors[0].device - else: - tensors = [torch.as_tensor(array) for array in arrays] - - # Fill the packed tensor with the array data - packed_shape = (len(tensors), seq_len, *tensors[0].shape[1:]) - packed_tensor = torch.full(packed_shape, pad_value, dtype=tensors[0].dtype, device=device) - - for i, tensor in enumerate(tensors): - packed_tensor[i, :tensor.size(0)] = tensor - - return packed_tensor +from orator.transforms.spectrogram import melspectrogram +from orator.transforms.syn_transforms import pack + + +class VoiceEncConfig: + num_mels = 40 + sample_rate = 16000 + speaker_embed_size = 256 + ve_hidden_size = 256 + flatten_lstm_params = False + n_fft = 400 + hop_size = 160 + win_size = 400 + fmax = 8000 + fmin = 0 + preemphasis = 0. + mel_power = 2.0 + mel_type = "amp" + normalized_mels = False + ve_partial_frames = 160 + ve_final_relu = True def get_num_wins( @@ -259,7 +242,7 @@ class VoiceEncoder(nn.Module): """ if sample_rate != self.hp.sample_rate: wavs = [ - librosa.resample(wav, orig_sr=sample_rate, target_sr=self.hp.sample_rate, res_type="kaiser_fast") + resample(wav, orig_sr=sample_rate, target_sr=self.hp.sample_rate, res_type="kaiser_fast") for wav in wavs ] @@ -270,5 +253,4 @@ class VoiceEncoder(nn.Module): kwargs["rate"] = 1.3 # Resemble's default value. 
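# --- Illustrative sketch (not part of the diff) -----------------------------
# The voice_encoder.py hunk above inlines VoiceEncConfig and pulls mel
# extraction from the new orator/transforms/spectrogram.py (added further
# down in this diff). A minimal stand-in config is sketched here. One caveat:
# the inlined class defines no `stft_magnitude_min`, yet spectrogram.py's
# _amp_to_db/_normalize read it; those paths are dormant under the defaults
# (mel_type="amp", normalized_mels=False), so the value below is an assumed
# placeholder, not something taken from the diff.
class TinyMelConfig:
    num_mels = 40
    sample_rate = 16000
    n_fft = 400
    hop_size = 160
    win_size = 400
    fmin = 0
    fmax = 8000
    preemphasis = 0.
    mel_power = 2.0
    mel_type = "amp"
    normalized_mels = False
    stft_magnitude_min = 1e-4  # assumed floor, only used on the "db"/normalized paths

# Usage inside the repo (melspectrogram comes from the file added below):
# from orator.transforms.spectrogram import melspectrogram
# mel = melspectrogram(wav, TinyMelConfig())  # -> (num_mels, T)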
mels = [melspectrogram(w, self.hp).T for w in wavs] - return self.embeds_from_mels(mels, as_spk=as_spk, batch_size=batch_size, **kwargs) diff --git a/chatterbox/src/chatterbox/models/voice_encoder/__pycache__/melspec.cpython-311.pyc b/chatterbox/src/orator/transforms/__pycache__/spectrogram.cpython-311.pyc similarity index 84% rename from chatterbox/src/chatterbox/models/voice_encoder/__pycache__/melspec.cpython-311.pyc rename to chatterbox/src/orator/transforms/__pycache__/spectrogram.cpython-311.pyc index 8946c2bdd9c1243e1cb395e390f0b58f0ddf6a96..286dd85c3ce49de47aafd595300ef5e950d80a15 100644 Binary files a/chatterbox/src/chatterbox/models/voice_encoder/__pycache__/melspec.cpython-311.pyc and b/chatterbox/src/orator/transforms/__pycache__/spectrogram.cpython-311.pyc differ diff --git a/chatterbox/src/orator/transforms/__pycache__/syn_transforms.cpython-311.pyc b/chatterbox/src/orator/transforms/__pycache__/syn_transforms.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db4b5cbab3609a5c94d78da74982e3ccded20b4e Binary files /dev/null and b/chatterbox/src/orator/transforms/__pycache__/syn_transforms.cpython-311.pyc differ diff --git a/chatterbox/src/orator/transforms/spectrogram.py b/chatterbox/src/orator/transforms/spectrogram.py new file mode 100644 index 0000000000000000000000000000000000000000..69147fc8c591c9364ff829a157af0ea3fcbd5770 --- /dev/null +++ b/chatterbox/src/orator/transforms/spectrogram.py @@ -0,0 +1,78 @@ +from functools import lru_cache + +from scipy import signal +import numpy as np +import librosa + + +@lru_cache() +def mel_basis(hp): + assert hp.fmax <= hp.sample_rate // 2 + return librosa.filters.mel( + sr=hp.sample_rate, + n_fft=hp.n_fft, + n_mels=hp.num_mels, + fmin=hp.fmin, + fmax=hp.fmax) # -> (nmel, nfreq) + + +def preemphasis(wav, hp): + assert hp.preemphasis != 0 + wav = signal.lfilter([1, -hp.preemphasis], [1], wav) + wav = np.clip(wav, -1, 1) + return wav + + +def melspectrogram(wav, hp, pad=True): + # Run through pre-emphasis + if hp.preemphasis > 0: + wav = preemphasis(wav, hp) + assert np.abs(wav).max() - 1 < 1e-07 + + # Do the stft + spec_complex = _stft(wav, hp, pad=pad) + + # Get the magnitudes + spec_magnitudes = np.abs(spec_complex) + + if hp.mel_power != 1.0: + spec_magnitudes **= hp.mel_power + + # Get the mel and convert magnitudes->db + mel = np.dot(mel_basis(hp), spec_magnitudes) + if hp.mel_type == "db": + mel = _amp_to_db(mel, hp) + + # Normalise the mel from db to 0,1 + if hp.normalized_mels: + mel = _normalize(mel, hp).astype(np.float32) + + assert not pad or mel.shape[1] == 1 + len(wav) // hp.hop_size # Sanity check + return mel # (M, T) + + +def _stft(y, hp, pad=True): + # NOTE: since librosa 0.8, pad_mode defaults to "constant"; "reflect" is kept here for + # historical consistency and to match the streaming version + return librosa.stft( + y, + n_fft=hp.n_fft, + hop_length=hp.hop_size, + win_length=hp.win_size, + center=pad, + pad_mode="reflect", + ) + + +def _amp_to_db(x, hp): + return 20 * np.log10(np.maximum(hp.stft_magnitude_min, x)) + + +def _db_to_amp(x): + return np.power(10.0, x * 0.05) + + +def _normalize(s, hp, headroom_db=15): + min_level_db = 20 * np.log10(hp.stft_magnitude_min) + s = (s - min_level_db) / (-min_level_db + headroom_db) + return s diff --git a/chatterbox/src/orator/transforms/syn_transforms.py b/chatterbox/src/orator/transforms/syn_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..13ce597ae05503ef618b2de9b6c7b833f94409cb --- /dev/null +++
b/chatterbox/src/orator/transforms/syn_transforms.py @@ -0,0 +1,46 @@ +# Common transformations used by synthesizers +import logging + +import numpy as np +import torch + + +logger = logging.getLogger(__name__) + + +def pack(arrays, seq_len: int=None, pad_value=0): + """ + Given a list of length B of array-like objects of shapes (Ti, ...), packs them in a single tensor of + shape (B, T, ...) by padding each individual array on the right. + + :param arrays: a list of array-like objects of matching shapes except for the first axis. + :param seq_len: the value of T. It must be the maximum of the lengths Ti of the arrays at + minimum. Will default to that value if None. + :param pad_value: the value to pad the arrays with. + :return: a (B, T, ...) tensor + """ + if seq_len is None: + seq_len = max(len(array) for array in arrays) + else: + assert seq_len >= max(len(array) for array in arrays) + + # Convert lists to np.array + if isinstance(arrays[0], list): + arrays = [np.array(array) for array in arrays] + + # Convert to tensor and handle device + device = None + if isinstance(arrays[0], torch.Tensor): + tensors = arrays + device = tensors[0].device + else: + tensors = [torch.as_tensor(array) for array in arrays] + + # Fill the packed tensor with the array data + packed_shape = (len(tensors), seq_len, *tensors[0].shape[1:]) + packed_tensor = torch.full(packed_shape, pad_value, dtype=tensors[0].dtype, device=device) + + for i, tensor in enumerate(tensors): + packed_tensor[i, :tensor.size(0)] = tensor + + return packed_tensor diff --git a/chatterbox/src/orator/transforms/webrtc.py b/chatterbox/src/orator/transforms/webrtc.py new file mode 100644 index 0000000000000000000000000000000000000000..c3d3abf97f27ac7f5c51ea7228920a0c522ed934 --- /dev/null +++ b/chatterbox/src/orator/transforms/webrtc.py @@ -0,0 +1,181 @@ +from itertools import groupby + +import numpy as np +import webrtcvad as _webrtcvad + +from orator.transforms.vad.vad_stream import VADStream +from orator.transforms.wav_encoding import encode_pcm16 + +# The sample rate the algo can operate at +_WEBRTC_SAMPLE_RATES = np.array([8000, 16000, 32000, 48000]) +# The algo operates with window sizes of 10, 20 and 30ms +_WEBRTC_WINDOW_SIZES_MS = (10, 20, 30) +# Greatest common divisor and lowest common multiple of the above +_WEBRTC_WINDOW_SIZES_MS_GCD = 10 +_WEBRTC_WINDOW_SIZES_MS_LCM = 60 + + +class WebRTCVADStream(VADStream): + def __init__(self, sample_rate: int, aggressiveness=2, dilation_ms=40, min_voiced_region_ms=125): + """ + :param sample_rate: sample rate of the wavs that will be passed + :param aggressiveness: parameter for controlling the aggressiveness of the VAD algo. Possible values are 1, + 2 and 3. Higher means fewer regions will be detected as voiced. + :param dilation_ms: pass a value greater than 0 to include regions directly preceding or succeeding voiced + regions. Voiced regions will be expanded left and right by this value, in milliseconds. + N.B.: this is a best effort parameter. When the output is requested as fast as the input is produced, + it's impossible to foresee an upcoming voiced region. In that case, the dilation on the left of that region + may not appear. + :param min_voiced_region_ms: to exclude regions detected as speech that are considered too short, pass a value + greater than 0. Voiced regions shorter than this value (prior to dilation) will be set as unvoiced. + N.B.: this is also a best effort parameter.
A region may be too short, but because VAD has not finished + being computed at the end of that region, it won't be removed as it could potentially be large enough. + """ + webrtc_sr = int(_WEBRTC_SAMPLE_RATES[np.argmin(np.abs(_WEBRTC_SAMPLE_RATES - sample_rate))]) + lcm_win_size = (_WEBRTC_WINDOW_SIZES_MS_LCM * webrtc_sr) // 1000 + self._gcd_win_size = (_WEBRTC_WINDOW_SIZES_MS_GCD * webrtc_sr) // 1000 + + # webrtcvad.Vad is stateful, predictions will be impacted if a new instance is created halfway through an + # audio. This is why we create them now. + self._detectors = {win_size: _webrtcvad.Vad(mode=aggressiveness) for win_size in _WEBRTC_WINDOW_SIZES_MS} + + super().__init__(sample_rate, webrtc_sr, lcm_win_size, dilation_ms, min_voiced_region_ms) + + def _wav_vad(self, wav: np.ndarray) -> np.ndarray: + pcm = encode_pcm16(wav) + + # Perform the VAD by ensembling the different window sizes + win_vad = np.zeros(len(wav) // self._gcd_win_size, dtype=np.int32) + for sub_win_size_ms in _WEBRTC_WINDOW_SIZES_MS: + detector = self._detectors[sub_win_size_ms] + sub_win_size_pcm = (2 * sub_win_size_ms * self.vad_sr) // 1000 + factor = sub_win_size_ms // _WEBRTC_WINDOW_SIZES_MS_GCD + + for i, win_start in enumerate(range(0, len(pcm), sub_win_size_pcm)): + win_i_vad = detector.is_speech(pcm[win_start:win_start + sub_win_size_pcm], self.vad_sr) + win_vad[i * factor:(i + 1) * factor] += win_i_vad + win_vad = win_vad > (len(_WEBRTC_WINDOW_SIZES_MS) // 2) + + # Convert the output to regions + regions = np.diff(win_vad, prepend=0, append=0).nonzero()[0].reshape(-1, 2) + regions = regions * (len(wav) // len(win_vad)) + + return regions + + +def webrtc_vad(wav: np.ndarray, source_sr: int, aggressiveness=2, dilation_ms=40, min_voiced_region_ms=125): + """ + Performs Voice Activity Detection on a single audio. See WebRTCVADStream for more details. + + :return vad: a boolean numpy array of length equal to that of the wav, True where speech was detected + """ + vad_stream = WebRTCVADStream(source_sr, aggressiveness, dilation_ms, min_voiced_region_ms) + vad_stream.feed(wav, close_input=True) + if vad_stream.can_step(): + return vad_stream.step(len(wav)) + else: + return np.zeros_like(wav, dtype=bool) + + +def split_on_silence( + wav, sr, vad, thresholds_ms=[500, 300, 200, 100, 50], min_dur_s=1.5, max_split_dur_s=20, max_dur_s=30, +): + """ + Split a wav into chunks, splitting on silence when the length of the silence exceeds a threshold.
+ Args: + wav: 1d-array + sr: sample rate + thresholds_ms: min length of silence to split on, clips are recursively split using values from this list until + the resulting chunks are all within the min / max duration bounds + min_dur_s: minimum duration of a chunk in seconds + max_split_dur_s: segments above this length continue to be split with smaller thresholds + max_dur_s: maximum duration of a chunk in seconds + """ + assert isinstance(wav, np.ndarray) and wav.ndim == 1 + + # unpack silence length thresholds + thresh_ms, next_thresh_ms = (thresholds_ms + [0, 0])[:2] + if thresh_ms <= 0: + return [wav] + + # convert thresholds to samples + max_split_dur_s = min(max_split_dur_s, max_dur_s) + thresh = int(thresh_ms * sr / 1000) + min_len = int(min_dur_s * sr) + max_split_len = int(max_split_dur_s * sr) + max_len = int(max_dur_s * sr) + wav_len = len(wav) + + # detect regions of silence using groupby + sil_regions = [] + for is_voiced, idxs in groupby(range(wav_len), key=vad.__getitem__): + idxs = list(idxs) + i = idxs[0] + j = idxs[-1] + j += 1 + n = j - i + mid = (i + j) // 2 + + # record split point if this is a long silence region + if (not is_voiced) and n > thresh: + sil_regions += [( + min(mid, i + (0 if i == 0 else thresh // 2)), + max(mid, j - (0 if j == wav_len else thresh // 2)), + )] + + # invert silence regions to get voiced regions + ptr = 0 + voiced_regions = [] + for i, j in sil_regions: + if i > 0: + voiced_regions += [(ptr, i)] + ptr = j + if ptr < wav_len: + voiced_regions += [(ptr, wav_len)] + + # split the waveform into chunks using the detected content bounds and silence split points + chunks = [] + for i, j in voiced_regions: + chunk = wav[i:j] + chunklen = len(chunk) + + # chunk is within bounds + if chunklen < max_split_len: + chunks += [chunk] + + # chunk is too long, attempt to split it recursively using threshold list + elif next_thresh_ms > 0: + chunks += split_on_silence( + chunk, sr, vad[i:j], thresholds_ms=thresholds_ms[1:], + min_dur_s=min_dur_s, max_dur_s=max_dur_s, + ) + + # NOTE: keeping chunks longer than `max_len` here, filtering is done below + else: + chunks += [chunk] + + # merge short chunks + merged_chunks = [] + for chunk in chunks: + chunklen = len(chunk) + + # chunk is too short, add it to the previous chunk if possible + if chunklen == 0: + continue + + elif chunklen < min_len: + # NOTE: ignore the edge case where this would make the previous chunk too long, by just dropping this chunk + if len(merged_chunks) > 0 and len(merged_chunks[-1]) + chunklen < max_len: + merged_chunks[-1] = np.concatenate([merged_chunks[-1], chunk]) + + elif chunklen < max_len: + merged_chunks += [chunk] + + else: + # TODO: keep long chunks as well? one benefit is to keep the adjacent ordering of chunks, for + building paragraph-level datasets. However, this should rarely drop any clips, so it's probably okay.
+ # merged_chunks += [chunk] + pass + chunks = merged_chunks + + return chunks diff --git a/chatterbox/src/chatterbox/tts.py b/chatterbox/src/orator/tts.py similarity index 70% rename from chatterbox/src/chatterbox/tts.py rename to chatterbox/src/orator/tts.py index 33fb6e3bfd6b74495d908f5c775cc2861eda69d6..6b2ba965bebba7660e32c47aa5a83509bd5de1f9 100644 --- a/chatterbox/src/chatterbox/tts.py +++ b/chatterbox/src/orator/tts.py @@ -14,60 +14,7 @@ from .models.voice_encoder import VoiceEncoder from .models.t3.modules.cond_enc import T3Cond -REPO_ID = "ResembleAI/chatterbox" - - -def change_pace(speech_tokens: torch.Tensor, pace: float): - """ - :param speech_tokens: Tensor of shape (L,) - :param pace: float, pace (default: 1) - """ - L = len(speech_tokens) - speech_tokens = F.interpolate(speech_tokens.view(1, 1, -1).float(), size=int(L / pace), mode="nearest") - speech_tokens = speech_tokens.view(-1).long() - return speech_tokens - - -def punc_norm(text: str) -> str: - """ - Quick cleanup func for punctuation from LLMs or - containing chars not seen often in the dataset - """ - if len(text) == 0: - return "You need to add some text for me to talk." - - # Capitalise first letter - if text[0].islower(): - text = text[0].upper() + text[1:] - - # Remove multiple space chars - text = " ".join(text.split()) - - # Replace uncommon/llm punc - punc_to_replace = [ - ("...", ", "), - ("…", ", "), - (":", ","), - (" - ", ", "), - (";", ", "), - ("—", "-"), - ("–", "-"), - (" ,", ","), - ("“", "\""), - ("”", "\""), - ("‘", "'"), - ("’", "'"), - ] - for old_char_sequence, new_char in punc_to_replace: - text = text.replace(old_char_sequence, new_char) - - # Add full stop if no ending punc - text = text.rstrip(" ") - sentence_enders = {".", "!", "?", "-", ","} - if not any(text.endswith(p) for p in sentence_enders): - text += "." 
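# --- Illustrative sketch (not part of the diff) -----------------------------
# End-to-end use of the new webrtc.py helpers, with the signatures exactly as
# added above: webrtc_vad() returns a per-sample boolean mask and
# split_on_silence() cuts on long silences while keeping chunks between
# min_dur_s and max_dur_s. This assumes the VADStream/encode_pcm16 modules
# that webrtc.py imports (not part of this diff) are importable, and
# "clip.wav" is a hypothetical input file.
import librosa
from orator.transforms.webrtc import webrtc_vad, split_on_silence

def chunk_on_silence(path: str, sr: int = 16000):
    wav, _ = librosa.load(path, sr=sr, mono=True)   # mono float32 in [-1, 1]
    vad = webrtc_vad(wav, sr, aggressiveness=2)     # bool mask, one flag per sample
    return split_on_silence(wav, sr, vad, min_dur_s=1.5, max_dur_s=30)

chunks = chunk_on_silence("clip.wav")
print([round(len(c) / 16000, 2) for c in chunks])   # chunk durations in seconds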
- - return text +REPO_ID = "ResembleAI/Orator" @dataclass @@ -110,7 +57,7 @@ class Conditionals: return cls(T3Cond(**kwargs['t3']), kwargs['gen']) -class ChatterboxTTS: +class OratorTTS: ENC_COND_LEN = 6 * S3_SR DEC_COND_LEN = 10 * S3GEN_SR @@ -132,7 +79,7 @@ class ChatterboxTTS: self.conds = conds @classmethod - def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS': + def from_local(cls, ckpt_dir, device) -> 'OratorTTS': ckpt_dir = Path(ckpt_dir) ve = VoiceEncoder() @@ -164,13 +111,13 @@ class ChatterboxTTS: return cls(t3, s3gen, ve, tokenizer, device, conds=conds) @classmethod - def from_pretrained(cls, device) -> 'ChatterboxTTS': + def from_pretrained(cls, device) -> 'OratorTTS': for fpath in ["ve.pt", "t3.pt", "s3gen.pt", "tokenizer.json", "conds.pt"]: local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath) return cls.from_local(Path(local_path).parent, device) - def prepare_conditionals(self, wav_fpath, exaggeration=0.5): + def prepare_conditionals(self, wav_fpath, emotion_adv=0.5): ## Load reference wav s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR) @@ -186,13 +133,13 @@ class ChatterboxTTS: t3_cond_prompt_tokens = torch.atleast_2d(t3_cond_prompt_tokens).to(self.device) # # Voice-encoder speaker embedding - ve_embed = torch.from_numpy(self.ve.embeds_from_wavs([s3_ref_wav], sample_rate=S3_SR)) + ve_embed = torch.from_numpy(self.ve.embeds_from_wavs([s3_ref_wav], sample_rate=S3GEN_SR)) ve_embed = ve_embed.mean(axis=0, keepdim=True).to(self.device) t3_cond = T3Cond( speaker_emb=ve_embed, cond_prompt_speech_tokens=t3_cond_prompt_tokens, - emotion_adv=exaggeration * torch.ones(1, 1, 1), + emotion_adv=emotion_adv * torch.ones(1, 1, 1), ).to(device=self.device) self.conds = Conditionals(t3_cond, s3gen_ref_dict) @@ -200,26 +147,22 @@ class ChatterboxTTS: self, text, audio_prompt_path=None, - exaggeration=0.5, - pace=1, - temperature=0.8, + emotion_adv=0.5 ): if audio_prompt_path: - self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration) + self.prepare_conditionals(audio_prompt_path, emotion_adv=emotion_adv) else: assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`" - # Update exaggeration if needed - if exaggeration != self.conds.t3.emotion_adv[0, 0, 0]: + # Update emotion_adv if needed + if emotion_adv != self.conds.t3.emotion_adv[0, 0, 0]: _cond: T3Cond = self.conds.t3 self.conds.t3 = T3Cond( speaker_emb=_cond.speaker_emb, cond_prompt_speech_tokens=_cond.cond_prompt_speech_tokens, - emotion_adv=exaggeration * torch.ones(1, 1, 1), + emotion_adv=emotion_adv * torch.ones(1, 1, 1), ).to(device=self.device) - # Norm and tokenize text - text = punc_norm(text) text_tokens = self.tokenizer.text_to_tokens(text).to(self.device) sot = self.t3.hp.start_text_token @@ -232,15 +175,12 @@ class ChatterboxTTS: t3_cond=self.conds.t3, text_tokens=text_tokens, max_new_tokens=1000, # TODO: use the value in config - temperature=temperature, ) # TODO: output becomes 1D speech_tokens = drop_invalid_tokens(speech_tokens) speech_tokens = speech_tokens.to(self.device) - speech_tokens = change_pace(speech_tokens, pace=pace) - wav, _ = self.s3gen.inference( speech_tokens=speech_tokens, ref_dict=self.conds.gen, diff --git a/chatterbox/src/chatterbox/vc.py b/chatterbox/src/orator/vc.py similarity index 82% rename from chatterbox/src/chatterbox/vc.py rename to chatterbox/src/orator/vc.py index ea5ec21e25f671188e439f33307e5890421ff4af..39d6ede5fe745231bacf57f4366a55aab6fcad51 100644 --- a/chatterbox/src/chatterbox/vc.py +++ 
b/chatterbox/src/orator/vc.py @@ -8,10 +8,10 @@ from .models.s3tokenizer import S3_SR from .models.s3gen import S3GEN_SR, S3Gen -REPO_ID = "ResembleAI/chatterbox" +REPO_ID = "ResembleAI/Orator" -class ChatterboxVC: +class OratorVC: ENC_COND_LEN = 6 * S3_SR DEC_COND_LEN = 10 * S3GEN_SR @@ -33,7 +33,7 @@ class ChatterboxVC: } @classmethod - def from_local(cls, ckpt_dir, device) -> 'ChatterboxVC': + def from_local(cls, ckpt_dir, device) -> 'OratorVC': ckpt_dir = Path(ckpt_dir) ref_dict = None if (builtin_voice := ckpt_dir / "conds.pt").exists(): @@ -49,7 +49,7 @@ class ChatterboxVC: return cls(s3gen, device, ref_dict=ref_dict) @classmethod - def from_pretrained(cls, device) -> 'ChatterboxVC': + def from_pretrained(cls, device) -> 'OratorVC': for fpath in ["s3gen.pt", "conds.pt"]: local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath) @@ -73,8 +73,11 @@ class ChatterboxVC: assert self.ref_dict is not None, "Please `prepare_conditionals` first or specify `target_voice_path`" with torch.inference_mode(): - audio_16, _ = librosa.load(audio, sr=S3_SR) - audio_16 = torch.from_numpy(audio_16).float().to(self.device)[None, ] + if isinstance(audio, str): + audio_16, _ = librosa.load(audio, sr=S3_SR) + audio_16 = torch.from_numpy(audio_16).float().to(self.device)[None, ] + else: + raise NotImplementedError() s3_tokens, _ = self.s3gen.tokenizer(audio_16) wav, _ = self.s3gen.inference(
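# --- Illustrative sketch (not part of the diff) -----------------------------
# The renamed public API, as the tts.py and vc.py hunks above define it:
# OratorTTS.from_pretrained(device) fetches ve/t3/s3gen/tokenizer/conds from
# REPO_ID ("ResembleAI/Orator"), and generate() now takes `emotion_adv` in
# place of exaggeration/pace/temperature. Whether that Hugging Face repo
# actually hosts checkpoints is outside this diff; file names here are
# hypothetical.
from orator import OratorTTS

model = OratorTTS.from_pretrained(device="cuda")
wav = model.generate(
    "Hello from the renamed package.",
    audio_prompt_path="reference.wav",  # hypothetical reference clip
    emotion_adv=0.5,
)
# `wav` comes out of s3gen.inference(); save or play it as needed.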