Spaces:
Running
on
Zero
Running
on
Zero
| import math | |
| import torch | |
| from torch import nn | |
| import audiosr.latent_diffusion.modules.phoneme_encoder.commons as commons | |
| import audiosr.latent_diffusion.modules.phoneme_encoder.attentions as attentions | |
class TextEncoder(nn.Module):
    """Encode a batch of token ids into hidden features plus projected stats.

    Pipeline: embedding lookup -> ``attentions.Encoder`` -> 1x1 ``Conv1d``
    producing ``2 * out_channels`` channels, which are then split into two
    ``out_channels``-wide tensors.

    Args:
        n_vocab: Number of rows in the embedding table.
        out_channels: Channel width of each of the two projected outputs.
        hidden_channels: Embedding size and encoder channel width.
        filter_channels: Forwarded to ``attentions.Encoder``.
        n_heads: Forwarded to ``attentions.Encoder``.
        n_layers: Forwarded to ``attentions.Encoder``.
        kernel_size: Forwarded to ``attentions.Encoder``.
        p_dropout: Forwarded to ``attentions.Encoder``.
    """

    def __init__(
        self,
        n_vocab,
        out_channels=192,
        hidden_channels=192,
        filter_channels=768,
        n_heads=2,
        n_layers=6,
        kernel_size=3,
        p_dropout=0.1,
    ):
        super().__init__()

        # Hyper-parameters are kept on the instance; forward() reads
        # hidden_channels and out_channels.
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout

        # Embedding weights are re-drawn from N(0, hidden_channels ** -0.5);
        # forward() multiplies the looked-up vectors by sqrt(hidden_channels).
        embedding = nn.Embedding(n_vocab, hidden_channels)
        nn.init.normal_(embedding.weight, mean=0.0, std=hidden_channels**-0.5)
        self.emb = embedding

        self.encoder = attentions.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )

        # Pointwise (kernel size 1) projection to twice the output width so
        # the result can be split into two halves in forward().
        self.proj = nn.Conv1d(hidden_channels, 2 * out_channels, kernel_size=1)

    def forward(self, x, x_lengths):
        """Run the encoder on a padded batch.

        Args:
            x: Token ids fed to the embedding table; the embedded result is
                treated as ``[b, t, h]``.
            x_lengths: Per-sequence lengths passed to
                ``commons.sequence_mask`` (presumably the number of valid
                steps in each row -- confirm against that helper).

        Returns:
            A 4-tuple ``(x, m, logs, x_mask)``:
            the encoder output ``[b, h, t]``, the two ``out_channels``-wide
            halves of the masked projection (named ``m`` and ``logs``;
            presumably a mean and a log-scale -- confirm with the caller),
            and the mask of shape ``[b, 1, t]`` cast to the feature dtype.
        """
        token_vectors = self.emb(x)
        scaled = token_vectors * math.sqrt(self.hidden_channels)  # [b, t, h]
        features = scaled.transpose(1, -1)  # [b, h, t]

        # Build the mask over the time axis and add a singleton channel axis
        # so it broadcasts against [b, h, t] tensors.
        step_mask = commons.sequence_mask(x_lengths, features.size(2))
        x_mask = step_mask.unsqueeze(1).to(features.dtype)

        encoded = self.encoder(features * x_mask, x_mask)

        # Project, mask again, then cut the channel axis into chunks of
        # out_channels (two chunks, since proj emits 2 * out_channels).
        stats = self.proj(encoded) * x_mask
        m, logs = stats.split(self.out_channels, dim=1)

        return encoded, m, logs, x_mask