Spaces:
Runtime error
Runtime error
| import numpy as np | |
| import torch | |
| from torch import nn | |
| from torchaudio.transforms import MelSpectrogram as TorchMelSpectrogram | |
| from .hparams import HParams | |
class MelSpectrogram(nn.Module):
    """Normalized log-mel spectrogram extractor backed by torchaudio.

    ``forward`` chains: optional pre-emphasis -> magnitude mel spectrogram
    -> decibels (floored at ``hp.stft_magnitude_min``) -> linear rescaling
    (see ``_normalize``).
    """

    def __init__(self, hp: HParams):
        """
        Torch implementation of Resemble's mel extraction.
        Note that the values are NOT identical to librosa's implementation
        due to floating point precision.

        Args:
            hp: hyper-parameters. The fields read here are ``wav_rate``,
                ``n_fft``, ``win_size``, ``hop_size``, ``num_mels``,
                ``stft_magnitude_min`` and ``preemphasis``.
        """
        super().__init__()
        self.hp = hp

        mel_kwargs = dict(
            sample_rate=hp.wav_rate,
            n_fft=hp.n_fft,
            win_length=hp.win_size,
            hop_length=hp.hop_size,
            f_min=0,
            # Upper band edge is half the sample rate.
            f_max=hp.wav_rate // 2,
            n_mels=hp.num_mels,
            # power=1 yields a magnitude (not power) spectrogram.
            power=1,
            normalized=False,
            # NOTE: Following librosa's default.
            pad_mode="constant",
            norm="slaney",
            mel_scale="slaney",
        )
        self.melspec = TorchMelSpectrogram(**mel_kwargs)

        # Magnitude floor, stored as a buffer so it follows the module
        # across devices and is included in the state dict.
        self.register_buffer("stft_magnitude_min", torch.FloatTensor([hp.stft_magnitude_min]))

        # Cached scalars used by forward() and _normalize().
        self.hop_size = hp.hop_size
        self.preemphasis = hp.preemphasis
        # The magnitude floor expressed in decibels.
        self.min_level_db = 20 * np.log10(hp.stft_magnitude_min)

    def forward(self, wav, pad=True):
        """
        Compute the normalized log-mel spectrogram of ``wav``.

        Args:
            wav: [B, T]
            pad: when true, assert that the number of output frames equals
                ``1 + T // hop_size``. It does not alter the computation.

        Returns:
            The normalized mel spectrogram, moved back to the device that
            ``wav`` arrived on. Presumably shaped [B, num_mels, frames],
            following torchaudio's output layout -- TODO confirm.
        """
        origin_device = wav.device

        if wav.is_mps:
            # MPS inputs are processed on CPU. NOTE: this moves the whole
            # module to CPU as a side effect, and it is not moved back.
            wav = wav.cpu()
            self.to(wav.device)

        coef = self.preemphasis
        if coef > 0:
            # Pre-emphasis filter: y[t] = x[t] - coef * x[t - 1], x[-1] = 0.
            shifted = torch.nn.functional.pad(wav, [1, 0], value=0)
            wav = shifted[..., 1:] - coef * shifted[..., :-1]

        normed = self._normalize(self._amp_to_db(self.melspec(wav)))

        if pad:
            expected_frames = 1 + wav.shape[-1] // self.hop_size
            assert normed.shape[-1] == expected_frames  # Sanity check

        return normed.to(origin_device)

    def _normalize(self, s, headroom_db=15):
        """Rescale decibels linearly: ``min_level_db`` -> 0, ``headroom_db`` -> 1."""
        floor_db = self.min_level_db
        return (s - floor_db) / (headroom_db - floor_db)

    def _amp_to_db(self, x):
        """Convert amplitudes to decibels, flooring at ``hp.stft_magnitude_min``."""
        floored = torch.clamp(x, min=self.hp.stft_magnitude_min)
        return 20 * torch.log10(floored)