junseok committed
Commit 2216a22 · Parent: 09bc42b

first commit

Files changed (5)
  1. app.py +47 -0
  2. predict.py +119 -0
  3. requirements.txt +7 -0
  4. score.py +102 -0
  5. ssl_ecapa_model.py +314 -0
app.py ADDED
@@ -0,0 +1,47 @@
+ from score import load_model
+ from predict import loadWav
+ import torch
+ import torch.nn.functional as F
+ import gradio as gr
+
+ model = load_model("wavlm_ecapa.model")
+ model.eval()
+
+ def calc_voxsim(inp_path, ref_path):
+     inp_wavs, inp_wav = loadWav(inp_path)
+     ref_wavs, ref_wav = loadWav(ref_path)
+
+     inp_wavs = torch.FloatTensor(inp_wavs)
+     inp_wav = torch.FloatTensor(inp_wav)
+     ref_wavs = torch.FloatTensor(ref_wavs)
+     ref_wav = torch.FloatTensor(ref_wav)
+
+     with torch.no_grad():
+         input_emb_1 = F.normalize(model.forward(inp_wavs), p=2, dim=1)
+         input_emb_2 = F.normalize(model.forward(inp_wav), p=2, dim=1)
+         ref_emb_1 = F.normalize(model.forward(ref_wavs), p=2, dim=1)
+         ref_emb_2 = F.normalize(model.forward(ref_wav), p=2, dim=1)
+
+     # Average the crop-level and full-utterance cosine similarities.
+     score_1 = torch.mean(torch.matmul(input_emb_1, ref_emb_1.T))
+     score_2 = torch.mean(torch.matmul(input_emb_2, ref_emb_2.T))
+     score = (score_1 + score_2) / 2
+     return score.detach().cpu().numpy()
+
+ description = """
+ Voice similarity demo using a WavLM-ECAPA model trained on the VoxSim dataset.
+ The demo only accepts .wav files; a 16 kHz sampling rate works best.
+
+ The paper is available [here](https://arxiv.org/abs/2407.18505).
+ """
+
+ iface = gr.Interface(
+     fn=calc_voxsim,
+     inputs=[
+         # Pass file paths so loadWav can read the uploads with librosa.
+         gr.Audio(type="filepath", label="Input Audio"),
+         gr.Audio(type="filepath", label="Reference Audio"),
+     ],
+     outputs="text",
+     title="Voice Similarity with VoxSim",
+     description=description,
+     allow_flagging="never",
+ )
+
+ iface.launch()
predict.py ADDED
@@ -0,0 +1,119 @@
+ import argparse
+ import pathlib
+ import tqdm
+ from torch.utils.data import Dataset, DataLoader
+ import librosa
+ import numpy
+ from score import Score
+ import torch
+
+ import warnings
+ warnings.filterwarnings("ignore")
+
+
+ def get_arg():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--bs", required=False, default=None, type=int)
+     parser.add_argument("--mode", required=True, choices=["predict_file", "predict_dir"], type=str)
+     parser.add_argument("--ckpt_path", required=False, default="wavlm_ecapa.model", type=pathlib.Path)
+     parser.add_argument("--inp_dir", required=False, default=None, type=pathlib.Path)
+     parser.add_argument("--ref_dir", required=False, default=None, type=pathlib.Path)
+     parser.add_argument("--inp_path", required=False, default=None, type=pathlib.Path)
+     parser.add_argument("--ref_path", required=False, default=None, type=pathlib.Path)
+     parser.add_argument("--out_path", required=True, type=pathlib.Path)
+     parser.add_argument("--num_workers", required=False, default=0, type=int)
+     return parser.parse_args()
+
+
+ def loadWav(filename, max_frames: int = 400):
+
+     # Maximum audio length in samples (10 ms hops at 16 kHz plus one window)
+     max_audio = max_frames * 160 + 240
+
+     # Read wav file and convert to torch tensor
+     audio, sr = librosa.load(filename, sr=16000)
+     audio_org = audio.copy()
+
+     audiosize = audio.shape[0]
+
+     if audiosize <= max_audio:
+         shortage = max_audio - audiosize + 1
+         audio = numpy.pad(audio, (0, shortage), 'wrap')
+         audiosize = audio.shape[0]
+
+     # 10 evenly spaced crops of max_audio samples each
+     startframe = numpy.linspace(0, audiosize - max_audio, num=10)
+
+     feats = []
+     for asf in startframe:
+         feats.append(audio[int(asf):int(asf) + max_audio])
+
+     feat = numpy.stack(feats, axis=0).astype(numpy.float32)
+
+     return torch.FloatTensor(feat), torch.FloatTensor(numpy.stack([audio_org], axis=0).astype(numpy.float32))
+
+
+ class AudioDataset(Dataset):
+     def __init__(self, inp_dir_path: pathlib.Path, ref_dir_path: pathlib.Path, max_frames: int = 400):
+         self.inp_wavlist = list(inp_dir_path.glob("*.wav"))
+         self.ref_wavlist = list(ref_dir_path.glob("*.wav"))
+         assert len(self.inp_wavlist) == len(self.ref_wavlist)
+         self.inp_wavlist.sort()
+         self.ref_wavlist.sort()
+         _, self.sr = librosa.load(self.inp_wavlist[0], sr=None)
+         self.max_audio = max_frames * 160 + 240
+
+     def __len__(self):
+         return len(self.inp_wavlist)
+
+     def __getitem__(self, idx):
+         inp_wavs, inp_wav = loadWav(self.inp_wavlist[idx])
+         ref_wavs, ref_wav = loadWav(self.ref_wavlist[idx])
+         return inp_wavs, inp_wav, ref_wavs, ref_wav
+
+
+ def main():
+     args = get_arg()
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     if args.mode == "predict_file":
+         assert args.inp_path is not None
+         assert args.ref_path is not None
+         assert args.inp_dir is None
+         assert args.ref_dir is None
+         assert args.inp_path.exists()
+         assert args.inp_path.is_file()
+         assert args.ref_path.exists()
+         assert args.ref_path.is_file()
+         inp_wavs, inp_wav = loadWav(args.inp_path)
+         ref_wavs, ref_wav = loadWav(args.ref_path)
+         scorer = Score(ckpt_path=args.ckpt_path, device=device)
+         score = scorer.score(inp_wavs, inp_wav, ref_wavs, ref_wav)
+         print("Voxsim score: ", score[0])
+         with open(args.out_path, "w") as fw:
+             fw.write(str(score[0]))
+     else:
+         assert args.inp_dir is not None, "inp_dir is required when mode is predict_dir."
+         assert args.ref_dir is not None, "ref_dir is required when mode is predict_dir."
+         assert args.bs is not None, "bs is required when mode is predict_dir."
+         assert args.inp_path is None, "inp_path should be None"
+         assert args.ref_path is None, "ref_path should be None"
+         assert args.inp_dir.exists()
+         assert args.ref_dir.exists()
+         assert args.inp_dir.is_dir()
+         assert args.ref_dir.is_dir()
+         dataset = AudioDataset(args.inp_dir, args.ref_dir)
+         loader = DataLoader(
+             dataset,
+             batch_size=args.bs,
+             shuffle=False,
+             num_workers=args.num_workers)
+         scorer = Score(ckpt_path=args.ckpt_path, device=device)
+         # Truncate the output file before appending batch results.
+         with open(args.out_path, 'w'):
+             pass
+         for batch in tqdm.tqdm(loader):
+             # Each batch is a tuple of (inp_wavs, inp_wav, ref_wavs, ref_wav) tensors;
+             # Score.score moves them to the right device itself.
+             inp_wavs, inp_wav, ref_wavs, ref_wav = batch
+             scores = scorer.score(inp_wavs, inp_wav, ref_wavs, ref_wav)
+             with open(args.out_path, 'a') as fw:
+                 for s in scores:
+                     fw.write(str(s) + "\n")
+         print("Saved to ", args.out_path)
+
+
+ if __name__ == "__main__":
+     main()
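
predict.py exposes two modes through argparse: `predict_file` scores a single pair of files, and `predict_dir` scores two directories of matched .wav files in batches. The shell equivalent of the single-file mode would be roughly `python predict.py --mode predict_file --inp_path input.wav --ref_path reference.wav --out_path voxsim_score.txt`. A minimal hedged sketch of driving the same mode from Python, assuming two hypothetical local files `input.wav` and `reference.wav` exist and the default checkpoint can be fetched:

```python
# Sketch only: set sys.argv and call predict.main(); file names are hypothetical placeholders.
import sys
import predict

sys.argv = [
    "predict.py",
    "--mode", "predict_file",
    "--inp_path", "input.wav",         # hypothetical input utterance
    "--ref_path", "reference.wav",     # hypothetical reference utterance
    "--out_path", "voxsim_score.txt",  # the score is written here as plain text
]
predict.main()
```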
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ numpy
+ librosa
+ torch
+ torchaudio
+ tqdm
+ s3prl
+ huggingface_hub
score.py ADDED
@@ -0,0 +1,102 @@
+ import os
+ import torch
+ import torch.nn.functional as F
+ from ssl_ecapa_model import SSL_ECAPA_TDNN
+ from huggingface_hub import hf_hub_download
+
+
+ def load_model(ckpt_path):
+     model = SSL_ECAPA_TDNN(feat_dim=1024, emb_dim=256, feat_type='wavlm_large')
+     load_parameters(model, ckpt_path)
+     return model
+
+
+ def load_parameters(model, ckpt_path):
+     model_state = model.state_dict()
+     if not os.path.isfile(ckpt_path):
+         print("Downloading model from Hugging Face Hub...")
+         new_ckpt_path = hf_hub_download(repo_id="junseok520/voxsim-models", filename=str(ckpt_path), local_dir="./")
+         ckpt_path = new_ckpt_path
+     loaded_state = torch.load(ckpt_path, map_location='cpu', weights_only=True)
+
+     # Copy every '__S__.'-prefixed tensor from the checkpoint into the model.
+     for name, param in loaded_state.items():
+         if name.startswith('__S__.'):
+             if name[6:] in model_state:
+                 model_state[name[6:]].copy_(param)
+             else:
+                 print("{} is not in the model.".format(name[6:]))
+
+
+ class Score:
+     """Predicts a voice-similarity score for each pair of audio clips."""
+
+     def __init__(
+             self,
+             ckpt_path: str = "wavlm_ecapa.model",
+             device: str = "cuda"):
+         """
+         Args:
+             ckpt_path: path to a pretrained checkpoint of the VoxSim evaluator.
+             device: device the model runs on, e.g. "cpu" or "cuda".
+         """
+         print(f"Using device: {device}")
+         self.device = device
+         self.model = load_model(ckpt_path).to(self.device)
+         self.model.eval()
+
+     def score(self, inp_wavs: torch.Tensor, inp_wav: torch.Tensor, ref_wavs: torch.Tensor, ref_wav: torch.Tensor):
+         """
+         Args:
+             inp_wavs, ref_wavs: stacks of fixed-length crops, shape (10, T) for a single
+                 pair or (B, 10, T) for a batch.
+             inp_wav, ref_wav: full-length waveforms, shape (1, T') or (B, 1, T').
+         Returns:
+             A numpy array with one similarity score per pair.
+         """
+         if len(inp_wavs.shape) == 2:
+             bs = 1
+         elif len(inp_wavs.shape) == 3:
+             bs = inp_wavs.shape[0]
+         else:
+             raise ValueError('Dimension of input tensor needs to be <= 3.')
+
+         inp_wavs = inp_wavs.reshape(-1, inp_wavs.shape[-1]).to(self.device)
+         inp_wav = inp_wav.reshape(-1, inp_wav.shape[-1]).to(self.device)
+         ref_wavs = ref_wavs.reshape(-1, ref_wavs.shape[-1]).to(self.device)
+         ref_wav = ref_wav.reshape(-1, ref_wav.shape[-1]).to(self.device)
+
+         with torch.no_grad():
+             input_emb_1 = F.normalize(self.model.forward(inp_wavs), p=2, dim=1).detach()
+             input_emb_2 = F.normalize(self.model.forward(inp_wav), p=2, dim=1).detach()
+             ref_emb_1 = F.normalize(self.model.forward(ref_wavs), p=2, dim=1).detach()
+             ref_emb_2 = F.normalize(self.model.forward(ref_wav), p=2, dim=1).detach()
+
+         emb_size = input_emb_1.shape[-1]
+         input_emb_1 = input_emb_1.reshape(bs, -1, emb_size)
+         input_emb_2 = input_emb_2.reshape(bs, -1, emb_size)
+         ref_emb_1 = ref_emb_1.reshape(bs, -1, emb_size)
+         ref_emb_2 = ref_emb_2.reshape(bs, -1, emb_size)
+
+         # Mean pairwise cosine similarity between crop embeddings, averaged with the
+         # full-utterance similarity.
+         score_1 = torch.mean(torch.bmm(input_emb_1, ref_emb_1.transpose(1, 2)), dim=(1, 2))
+         score_2 = torch.mean(torch.bmm(input_emb_2, ref_emb_2.transpose(1, 2)), dim=(1, 2))
+         score = (score_1 + score_2) / 2
+         score = score.detach().cpu().numpy()
+
+         return score
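
As a rough illustration of the shapes Score expects (not part of the commit): loadWav returns a (10, T) stack of fixed-length crops plus a (1, T') full-length waveform per file, and Score.score consumes one such quadruple, or 3-D batched versions of it. A minimal hedged sketch, assuming the `wavlm_ecapa.model` checkpoint can be fetched from the Hub and that `a.wav` and `b.wav` are hypothetical 16 kHz mono files:

```python
# Sketch only: score one pair of utterances directly with the Score wrapper.
import torch
from predict import loadWav
from score import Score

device = "cuda" if torch.cuda.is_available() else "cpu"
scorer = Score(ckpt_path="wavlm_ecapa.model", device=device)

inp_wavs, inp_wav = loadWav("a.wav")  # (10, T) crops and (1, T') full waveform
ref_wavs, ref_wav = loadWav("b.wav")

score = scorer.score(inp_wavs, inp_wav, ref_wavs, ref_wav)
print(score)  # numpy array with one similarity value
```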
ssl_ecapa_model.py ADDED
@@ -0,0 +1,314 @@
+ # part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torchaudio.transforms as trans
+
+ urls = {
+     'hubert_large_ll60k': "https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k.pt",
+     'xls_r_300m': "https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_300m.pt",
+     'unispeech_sat': "https://huggingface.co/s3prl/converted_ckpts/resolve/main/unispeech_sat_large.pt",
+     'wavlm_base_plus': "https://huggingface.co/s3prl/converted_ckpts/resolve/main/wavlm_base_plus.pt",
+     'wavlm_large': "https://huggingface.co/s3prl/converted_ckpts/resolve/main/wavlm_large.pt",
+ }
+
+
+ ''' Res2Conv1d + BatchNorm1d + ReLU
+ '''
+
+
+ class Res2Conv1dReluBn(nn.Module):
+     '''
+     in_channels == out_channels == channels
+     '''
+
+     def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True, scale=4):
+         super().__init__()
+         assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
+         self.scale = scale
+         self.width = channels // scale
+         self.nums = scale if scale == 1 else scale - 1
+
+         self.convs = []
+         self.bns = []
+         for i in range(self.nums):
+             self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
+             self.bns.append(nn.BatchNorm1d(self.width))
+         self.convs = nn.ModuleList(self.convs)
+         self.bns = nn.ModuleList(self.bns)
+
+     def forward(self, x):
+         out = []
+         spx = torch.split(x, self.width, 1)
+         for i in range(self.nums):
+             if i == 0:
+                 sp = spx[i]
+             else:
+                 sp = sp + spx[i]
+             # Order: conv -> relu -> bn
+             sp = self.convs[i](sp)
+             sp = self.bns[i](F.relu(sp))
+             out.append(sp)
+         if self.scale != 1:
+             out.append(spx[self.nums])
+         out = torch.cat(out, dim=1)
+
+         return out
+
+
+ ''' Conv1d + BatchNorm1d + ReLU
+ '''
+
+
+ class Conv1dReluBn(nn.Module):
+     def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True):
+         super().__init__()
+         self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
+         self.bn = nn.BatchNorm1d(out_channels)
+
+     def forward(self, x):
+         return self.bn(F.relu(self.conv(x)))
+
+
+ ''' The SE connection for the 1D case.
+ '''
+
+
+ class SE_Connect(nn.Module):
+     def __init__(self, channels, se_bottleneck_dim=128):
+         super().__init__()
+         self.linear1 = nn.Linear(channels, se_bottleneck_dim)
+         self.linear2 = nn.Linear(se_bottleneck_dim, channels)
+
+     def forward(self, x):
+         out = x.mean(dim=2)
+         out = F.relu(self.linear1(out))
+         out = torch.sigmoid(self.linear2(out))
+         out = x * out.unsqueeze(2)
+
+         return out
+
+
+ ''' SE-Res2Block of the ECAPA-TDNN architecture.
+ '''
+
+
+ # def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
+ #     return nn.Sequential(
+ #         Conv1dReluBn(channels, 512, kernel_size=1, stride=1, padding=0),
+ #         Res2Conv1dReluBn(512, kernel_size, stride, padding, dilation, scale=scale),
+ #         Conv1dReluBn(512, channels, kernel_size=1, stride=1, padding=0),
+ #         SE_Connect(channels)
+ #     )
+
+
+ class SE_Res2Block(nn.Module):
+     def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, scale, se_bottleneck_dim):
+         super().__init__()
+         self.Conv1dReluBn1 = Conv1dReluBn(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+         self.Res2Conv1dReluBn = Res2Conv1dReluBn(out_channels, kernel_size, stride, padding, dilation, scale=scale)
+         self.Conv1dReluBn2 = Conv1dReluBn(out_channels, out_channels, kernel_size=1, stride=1, padding=0)
+         self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)
+
+         self.shortcut = None
+         if in_channels != out_channels:
+             self.shortcut = nn.Conv1d(
+                 in_channels=in_channels,
+                 out_channels=out_channels,
+                 kernel_size=1,
+             )
+
+     def forward(self, x):
+         residual = x
+         if self.shortcut:
+             residual = self.shortcut(x)
+
+         x = self.Conv1dReluBn1(x)
+         x = self.Res2Conv1dReluBn(x)
+         x = self.Conv1dReluBn2(x)
+         x = self.SE_Connect(x)
+
+         return x + residual
+
+
+ ''' Attentive weighted mean and standard deviation pooling.
+ '''
+
+
+ class AttentiveStatsPool(nn.Module):
+     def __init__(self, in_dim, attention_channels=128, global_context_att=False):
+         super().__init__()
+         self.global_context_att = global_context_att
+
+         # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
+         if global_context_att:
+             self.linear1 = nn.Conv1d(in_dim * 3, attention_channels, kernel_size=1)  # equals W and b in the paper
+         else:
+             self.linear1 = nn.Conv1d(in_dim, attention_channels, kernel_size=1)  # equals W and b in the paper
+         self.linear2 = nn.Conv1d(attention_channels, in_dim, kernel_size=1)  # equals V and k in the paper
+
+     def forward(self, x):
+
+         if self.global_context_att:
+             context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
+             context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
+             x_in = torch.cat((x, context_mean, context_std), dim=1)
+         else:
+             x_in = x
+
+         # DON'T use ReLU here! In experiments, ReLU was hard to converge.
+         alpha = torch.tanh(self.linear1(x_in))
+         # alpha = F.relu(self.linear1(x_in))
+         alpha = torch.softmax(self.linear2(alpha), dim=2)
+         mean = torch.sum(alpha * x, dim=2)
+         residuals = torch.sum(alpha * (x ** 2), dim=2) - mean ** 2
+         std = torch.sqrt(residuals.clamp(min=1e-9))
+         return torch.cat([mean, std], dim=1)
+
+
+ class SSL_ECAPA_TDNN(nn.Module):
+     def __init__(self, feat_dim=80, channels=512, emb_dim=192, global_context_att=False,
+                  feat_type='fbank', sr=16000, feature_selection="hidden_states", update_extract=False, initial_model="", **kwargs):
+         super().__init__()
+
+         self.feat_type = feat_type
+         self.feature_selection = feature_selection
+         self.update_extract = update_extract
+         self.sr = sr
+
+         if feat_type == "fbank" or feat_type == "mfcc":
+             self.update_extract = False
+
+         win_len = int(sr * 0.025)
+         hop_len = int(sr * 0.01)
+
+         if feat_type == 'fbank':
+             self.feature_extract = trans.MelSpectrogram(sample_rate=sr, n_fft=512, win_length=win_len,
+                                                         hop_length=hop_len, f_min=0.0, f_max=sr // 2,
+                                                         pad=0, n_mels=feat_dim)
+         elif feat_type == 'mfcc':
+             melkwargs = {
+                 'n_fft': 512,
+                 'win_length': win_len,
+                 'hop_length': hop_len,
+                 'f_min': 0.0,
+                 'f_max': sr // 2,
+                 'pad': 0
+             }
+             self.feature_extract = trans.MFCC(sample_rate=sr, n_mfcc=feat_dim, log_mels=False,
+                                               melkwargs=melkwargs)
+         else:
+             # Self-supervised front-end (e.g. wavlm_large) loaded through s3prl.
+             self.feature_extract = torch.hub.load('s3prl/s3prl', feat_type)
+
+             if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(self.feature_extract.model.encoder.layers[23].self_attn, "fp32_attention"):
+                 self.feature_extract.model.encoder.layers[23].self_attn.fp32_attention = False
+             if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(self.feature_extract.model.encoder.layers[11].self_attn, "fp32_attention"):
+                 self.feature_extract.model.encoder.layers[11].self_attn.fp32_attention = False
+
+         self.feat_num = self.get_feat_num()
+         self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))
+         # self.feature_weight = nn.Parameter(torch.zeros(7))
+
+         if feat_type != 'fbank' and feat_type != 'mfcc':
+             freeze_list = ['final_proj', 'label_embs_concat', 'mask_emb', 'project_q', 'quantizer']
+             for name, param in self.feature_extract.named_parameters():
+                 for freeze_val in freeze_list:
+                     if freeze_val in name:
+                         param.requires_grad = False
+                         break
+
+         if not self.update_extract:
+             for param in self.feature_extract.parameters():
+                 param.requires_grad = False
+
+         self.instance_norm = nn.InstanceNorm1d(feat_dim)
+         # self.channels = [channels] * 4 + [channels * 3]
+         self.channels = [channels] * 4 + [1536]
+
+         self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
+         self.layer2 = SE_Res2Block(self.channels[0], self.channels[1], kernel_size=3, stride=1, padding=2, dilation=2, scale=8, se_bottleneck_dim=128)
+         self.layer3 = SE_Res2Block(self.channels[1], self.channels[2], kernel_size=3, stride=1, padding=3, dilation=3, scale=8, se_bottleneck_dim=128)
+         self.layer4 = SE_Res2Block(self.channels[2], self.channels[3], kernel_size=3, stride=1, padding=4, dilation=4, scale=8, se_bottleneck_dim=128)
+
+         # self.conv = nn.Conv1d(self.channels[-1], self.channels[-1], kernel_size=1)
+         cat_channels = channels * 3
+         self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
+         self.pooling = AttentiveStatsPool(self.channels[-1], attention_channels=128, global_context_att=global_context_att)
+         self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
+         self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)
+
+     def get_feat_num(self):
+         # Run one dummy waveform through the front-end to count the hidden-state layers.
+         self.feature_extract.eval()
+         wav = [torch.randn(self.sr).to(next(self.feature_extract.parameters()).device)]
+         with torch.no_grad():
+             features = self.feature_extract(wav)
+         select_feature = features[self.feature_selection]
+         if isinstance(select_feature, (list, tuple)):
+             return len(select_feature)
+         else:
+             return 1
+
+     def get_feat(self, x):
+         if self.update_extract:
+             x = self.feature_extract([sample for sample in x])
+         else:
+             with torch.no_grad():
+                 if self.feat_type == 'fbank' or self.feat_type == 'mfcc':
+                     x = self.feature_extract(x) + 1e-6  # B x feat_dim x time_len
+                 else:
+                     x = self.feature_extract([sample for sample in x])
+
+         if self.feat_type == 'fbank':
+             x = x.log()
+
+         if self.feat_type != "fbank" and self.feat_type != "mfcc":
+             x = x[self.feature_selection]
+             # x = x[1:8]
+             # x = x[2]
+             if isinstance(x, (list, tuple)):
+                 x = torch.stack(x, dim=0)
+             else:
+                 x = x.unsqueeze(0)
+             # Learned softmax weighting over the SSL hidden layers.
+             norm_weights = F.softmax(self.feature_weight, dim=-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
+             # norm_weights = F.softmax(self.feature_weight[1:8], dim=-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
+             x = (norm_weights * x).sum(dim=0)
+             x = torch.transpose(x, 1, 2) + 1e-6
+
+         x = self.instance_norm(x)
+         return x
+
+     def forward(self, x):
+         x = self.get_feat(x)
+
+         out1 = self.layer1(x)
+         out2 = self.layer2(out1)
+         out3 = self.layer3(out2)
+         out4 = self.layer4(out3)
+
+         out = torch.cat([out2, out3, out4], dim=1)
+         out = F.relu(self.conv(out))
+         out = self.bn(self.pooling(out))
+         out = self.linear(out)
+
+         return out
+
+
+ def ECAPA_TDNN_SMALL(feat_dim, emb_dim=256, feat_type='fbank', sr=16000, feature_selection="hidden_states", update_extract=False):
+     return SSL_ECAPA_TDNN(feat_dim=feat_dim, channels=512, emb_dim=emb_dim,
+                           feat_type=feat_type, sr=sr, feature_selection=feature_selection, update_extract=update_extract)
+
+
+ def wavlm_ecapa():
+     return SSL_ECAPA_TDNN(feat_dim=1024, emb_dim=256, feat_type='wavlm_large')
+
+
+ if __name__ == '__main__':
+     x = torch.zeros(2, 32000)
+     model = SSL_ECAPA_TDNN(feat_dim=1024, emb_dim=256, feat_type='wavlm_large',
+                            feature_selection="hidden_states", update_extract=False)
+     out = model(x)
+     # print(model)
+     print(out.shape)