Spaces:

Bredvige
/

RVC

No application file

App Files Files Community

_Noxty committed on Jan 12

Commit

8a42f74

verified ·

1 Parent(s): 6dd95c2

Upload 4 files

Browse files

Files changed (4) hide show

libs/audio.py +60 -0
libs/rmvpe.py +670 -0
libs/rtrvc.py +461 -0
libs/slicer2.py +260 -0

libs/audio.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import platform, os
+import ffmpeg
+import numpy as np
+import av
+from io import BytesIO
+import traceback
+import re
def wav2(i, o, format):
    """Transcode the first audio stream of ``i`` into ``o``.

    Args:
        i: Input path or file-like object, opened with ``av.open(i, "rb")``.
        o: Output path or file-like object, opened with ``av.open(o, "wb")``.
        format: Requested output format. ``"m4a"`` is written into an ``mp4``
            container; ``"ogg"`` is encoded with ``libvorbis`` and ``"mp4"``
            (including the remapped ``"m4a"``) with ``aac``. Any other value
            is used both as the container format and as the encoder name.

    Both containers are closed even when decoding or encoding raises.
    """
    container_format = "mp4" if format == "m4a" else format
    codec_name = {"ogg": "libvorbis", "mp4": "aac"}.get(
        container_format, container_format
    )

    inp = av.open(i, "rb")
    try:
        out = av.open(o, "wb", format=container_format)
        try:
            ostream = out.add_stream(codec_name)
            for frame in inp.decode(audio=0):
                for p in ostream.encode(frame):
                    out.mux(p)
            # Passing None flushes whatever the encoder still buffers.
            for p in ostream.encode(None):
                out.mux(p)
        finally:
            out.close()
    finally:
        inp.close()
def load_audio(file, sr):
    """Decode an audio file into a flat float32 NumPy array.

    The file is decoded by an ``ffmpeg`` subprocess that down-mixes to one
    channel (``ac=1``) and resamples to ``sr`` (``ar=sr``), emitting raw
    ``f32le`` samples on stdout.

    Args:
        file: Path to the audio file; it is normalised with ``clean_path``.
        sr: Target sample rate passed to ffmpeg.

    Returns:
        A one-dimensional ``np.float32`` array of samples.

    Raises:
        RuntimeError: If the path does not exist or decoding fails. The
            original exception is attached as ``__cause__``.
    """
    try:
        # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        # Guard against pasted paths that carry stray spaces, quotes or line breaks.
        file = clean_path(file)
        if not os.path.exists(file):
            raise RuntimeError(
                "You input a wrong audio path that does not exists, please fix it!"
            )
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except Exception as e:
        traceback.print_exc()
        # Chain the cause so callers can still inspect the underlying failure.
        raise RuntimeError(f"Failed to load audio: {e}") from e
    return np.frombuffer(out, np.float32).flatten()
def clean_path(path_str):
    """Normalise a user-supplied path string.

    On Windows, forward slashes are converted to backslashes. Unicode
    bidirectional control characters (U+202A..U+202E) are removed, then any
    mix of surrounding spaces, double quotes, carriage returns and newlines
    is stripped from both ends.

    Args:
        path_str: Raw path text, e.g. pasted from a file manager.

    Returns:
        The cleaned path string.
    """
    if platform.system() == "Windows":
        path_str = path_str.replace("/", "\\")
    # Remove Unicode directional control characters.
    path_str = re.sub(r'[\u202a\u202b\u202c\u202d\u202e]', '', path_str)
    # A single strip over the whole character set handles any interleaving of
    # quotes and whitespace; a fixed-order chain of strips misses some.
    return path_str.strip(' "\r\n')

libs/rmvpe.py ADDED Viewed

	@@ -0,0 +1,670 @@

+from io import BytesIO
+import os
+from typing import List, Optional, Tuple
+import numpy as np
+import torch
+from infer.lib import jit
+try:
+    # Fix "Torch not compiled with CUDA enabled"
+    import intel_extension_for_pytorch as ipex  # pylint: disable=import-error, unused-import
+    if torch.xpu.is_available():
+        from infer.modules.ipex import ipex_init
+        ipex_init()
+except Exception:  # pylint: disable=broad-exception-caught
+    pass
+import torch.nn as nn
+import torch.nn.functional as F
+from librosa.util import normalize, pad_center, tiny
+from scipy.signal import get_window
+import logging
+logger = logging.getLogger(__name__)
class STFT(torch.nn.Module):
    """STFT / inverse STFT implemented with explicit basis matrices.

    The forward transform multiplies framed audio by a windowed Fourier basis;
    the inverse multiplies by the windowed pseudo-inverse basis and overlap-adds
    the frames with ``torch.nn.Fold``.
    """

    def __init__(
        self, filter_length=1024, hop_length=512, win_length=None, window="hann"
    ):
        """
        This is a bit tricky so there are some cases that probably won't work as
        working out the same sizes before and after in all overlap add setups is
        tough. Right now, this code should work with hop lengths that are half
        the filter length (50% overlap between frames).

        Keyword Arguments:
            filter_length {int} -- Length of filters used (default: {1024})
            hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512})
            win_length {[type]} -- Length of the window function applied to each frame (if not specified, it
                equals the filter length). (default: {None})
            window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris)
                (default: {'hann'})
        """
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length if win_length else filter_length
        self.window = window
        self.forward_transform = None
        self.pad_amount = int(self.filter_length / 2)

        # Keep only the non-redundant bins and stack real over imaginary rows.
        n_bins = int((self.filter_length / 2 + 1))
        dft_rows = np.fft.fft(np.eye(self.filter_length))[:n_bins, :]
        stacked_basis = np.vstack([np.real(dft_rows), np.imag(dft_rows)])
        forward_basis = torch.FloatTensor(stacked_basis)
        inverse_basis = torch.FloatTensor(np.linalg.pinv(stacked_basis))

        assert filter_length >= self.win_length
        # Build the analysis window and zero-pad it symmetrically to filter_length.
        padded_window = pad_center(
            get_window(window, self.win_length, fftbins=True), size=filter_length
        )
        fft_window = torch.from_numpy(padded_window).float()

        # Apply the window to both bases.
        forward_basis *= fft_window
        inverse_basis = (inverse_basis.T * fft_window).T

        for buffer_name, tensor in (
            ("forward_basis", forward_basis),
            ("inverse_basis", inverse_basis),
            ("fft_window", fft_window),
        ):
            self.register_buffer(buffer_name, tensor.float())

    def transform(self, input_data, return_phase=False):
        """Take input data (audio) to STFT domain.

        Arguments:
            input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)
            return_phase {bool} -- Also return the phase when True (default: {False})

        Returns:
            magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
                num_frequencies, num_frames)
            phase {tensor} -- Phase of STFT with shape (num_batch,
                num_frequencies, num_frames); only returned when return_phase is True
        """
        padded = F.pad(
            input_data,
            (self.pad_amount, self.pad_amount),
            mode="reflect",
        )
        # (batch, frames, filter_length) -> (batch, filter_length, frames)
        frames = padded.unfold(1, self.filter_length, self.hop_length).permute(0, 2, 1)
        spectrum = torch.matmul(self.forward_basis, frames)

        n_bins = int((self.filter_length / 2) + 1)
        real_part = spectrum[:, :n_bins, :]
        imag_part = spectrum[:, n_bins:, :]
        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        if not return_phase:
            return magnitude
        phase = torch.atan2(imag_part.data, real_part.data)
        return magnitude, phase

    def inverse(self, magnitude, phase):
        """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced
        by the ```transform``` function.

        Arguments:
            magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
                num_frequencies, num_frames)
            phase {tensor} -- Phase of STFT with shape (num_batch,
                num_frequencies, num_frames)

        Returns:
            inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. Of
                shape (num_batch, num_samples)
        """
        real_imag = torch.cat(
            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
        )
        n_frames = real_imag.size(-1)
        overlap_add = torch.nn.Fold(
            output_size=(1, (n_frames - 1) * self.hop_length + self.filter_length),
            kernel_size=(1, self.filter_length),
            stride=(1, self.hop_length),
        )
        # Drop the reflect padding that transform() added on both ends.
        trim = slice(self.pad_amount, -self.pad_amount)

        inverse_transform = torch.matmul(self.inverse_basis, real_imag)
        inverse_transform = overlap_add(inverse_transform)[:, 0, 0, trim]

        # Normalise by the overlap-added squared window.
        window_square_sum = (
            self.fft_window.pow(2).repeat(n_frames, 1).T.unsqueeze(0)
        )
        window_square_sum = overlap_add(window_square_sum)[:, 0, 0, trim]
        inverse_transform /= window_square_sum
        return inverse_transform

    def forward(self, input_data):
        """Take input data (audio) to STFT domain and then back to audio.

        The intermediate magnitude and phase are stored on ``self.magnitude``
        and ``self.phase``.

        Arguments:
            input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)

        Returns:
            reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of
                shape (num_batch, num_samples)
        """
        self.magnitude, self.phase = self.transform(input_data, return_phase=True)
        return self.inverse(self.magnitude, self.phase)
+from time import time as ttime
class BiGRU(nn.Module):
    """Thin wrapper around a batch-first bidirectional ``nn.GRU``."""

    def __init__(self, input_features, hidden_features, num_layers):
        """Create the GRU.

        Args:
            input_features: Size of each input feature vector.
            hidden_features: Hidden size per direction.
            num_layers: Number of stacked GRU layers.
        """
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_features,
            hidden_size=hidden_features,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x):
        """Return only the GRU output sequence, discarding the final hidden state."""
        output, _ = self.gru(x)
        return output
class ConvBlockRes(nn.Module):
    """Two 3x3 conv/BatchNorm/ReLU stages with a residual connection.

    When the channel count changes, the skip path goes through a 1x1
    convolution stored as ``self.shortcut``; otherwise the attribute is not
    created and the input is added directly.
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super().__init__()
        stages = []
        # First stage maps in_channels -> out_channels, second keeps out_channels.
        for stage_in in (in_channels, out_channels):
            stages.append(
                nn.Conv2d(
                    in_channels=stage_in,
                    out_channels=out_channels,
                    kernel_size=(3, 3),
                    stride=(1, 1),
                    padding=(1, 1),
                    bias=False,
                )
            )
            stages.append(nn.BatchNorm2d(out_channels, momentum=momentum))
            stages.append(nn.ReLU())
        self.conv = nn.Sequential(*stages)
        # self.shortcut is only defined when a projection is needed; forward()
        # checks for its presence with hasattr.
        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))

    def forward(self, x: torch.Tensor):
        """Return ``conv(x)`` plus the (optionally projected) input."""
        if hasattr(self, "shortcut"):
            return self.conv(x) + self.shortcut(x)
        else:
            return self.conv(x) + x
class Encoder(nn.Module):
    """Stack of ``ResEncoderBlock`` stages preceded by a BatchNorm.

    Each stage doubles the channel count and halves the tracked ``in_size``.
    ``latent_channels`` records ``[out_channels, in_size]`` per stage.
    """

    def __init__(
        self,
        in_channels,
        in_size,
        n_encoders,
        kernel_size,
        n_blocks,
        out_channels=16,
        momentum=0.01,
    ):
        super().__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []

        stage_in, stage_out, stage_size = in_channels, out_channels, in_size
        for _ in range(self.n_encoders):
            self.layers.append(
                ResEncoderBlock(
                    stage_in, stage_out, kernel_size, n_blocks, momentum=momentum
                )
            )
            self.latent_channels.append([stage_out, stage_size])
            stage_in, stage_out, stage_size = stage_out, stage_out * 2, stage_size // 2

        # Values after the last stage: size already halved, channels already doubled.
        self.out_size = stage_size
        self.out_channel = stage_out

    def forward(self, x: torch.Tensor):
        """Return the final pooled tensor and the list of pre-pool stage outputs."""
        skips: List[torch.Tensor] = []
        x = self.bn(x)
        for layer in self.layers:
            skip, x = layer(x)
            skips.append(skip)
        return x, skips
class ResEncoderBlock(nn.Module):
    """Sequence of ``ConvBlockRes`` blocks with optional average pooling.

    When ``kernel_size`` is ``None`` no pooling layer is created and
    ``forward`` returns a single tensor; otherwise it returns the pair
    ``(features, pooled_features)``.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
    ):
        super().__init__()
        self.n_blocks = n_blocks
        # The first block changes the channel count; the rest keep it.
        block_inputs = [in_channels] + [out_channels] * (n_blocks - 1)
        self.conv = nn.ModuleList(
            [ConvBlockRes(block_in, out_channels, momentum) for block_in in block_inputs]
        )
        self.kernel_size = kernel_size
        if self.kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x):
        for block in self.conv:
            x = block(x)
        if self.kernel_size is not None:
            return x, self.pool(x)
        else:
            return x
class Intermediate(nn.Module):
    """Chain of non-pooling ``ResEncoderBlock`` layers."""

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        super().__init__()
        self.n_inters = n_inters
        # Only the first layer maps in_channels -> out_channels.
        layer_inputs = [in_channels] + [out_channels] * (self.n_inters - 1)
        self.layers = nn.ModuleList(
            [
                ResEncoderBlock(layer_in, out_channels, None, n_blocks, momentum)
                for layer_in in layer_inputs
            ]
        )

    def forward(self, x):
        """Apply every layer in order and return the result."""
        for layer in self.layers:
            x = layer(x)
        return x
class ResDecoderBlock(nn.Module):
    """Transposed-conv upsampling, skip concatenation, then ``ConvBlockRes`` blocks."""

    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
        super().__init__()
        # output_padding is (0, 1) for stride (1, 2) and (1, 1) for any other stride.
        out_padding = (0, 1) if stride == (1, 2) else (1, 1)
        self.n_blocks = n_blocks
        upsample = nn.ConvTranspose2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(3, 3),
            stride=stride,
            padding=(1, 1),
            output_padding=out_padding,
            bias=False,
        )
        self.conv1 = nn.Sequential(
            upsample,
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )
        # After concatenation with the skip tensor the first block sees
        # out_channels * 2 input channels.
        block_inputs = [out_channels * 2] + [out_channels] * (n_blocks - 1)
        self.conv2 = nn.ModuleList(
            [ConvBlockRes(block_in, out_channels, momentum) for block_in in block_inputs]
        )

    def forward(self, x, concat_tensor):
        """Upsample ``x``, concatenate ``concat_tensor`` on dim 1, and refine."""
        x = self.conv1(x)
        x = torch.cat((x, concat_tensor), dim=1)
        for block in self.conv2:
            x = block(x)
        return x
class Decoder(nn.Module):
    """Stack of ``ResDecoderBlock`` stages, each halving the channel count."""

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super().__init__()
        self.layers = nn.ModuleList()
        self.n_decoders = n_decoders
        stage_in = in_channels
        for _ in range(self.n_decoders):
            stage_out = stage_in // 2
            self.layers.append(
                ResDecoderBlock(stage_in, stage_out, stride, n_blocks, momentum)
            )
            stage_in = stage_out

    def forward(self, x: torch.Tensor, concat_tensors: List[torch.Tensor]):
        """Run the stages, consuming ``concat_tensors`` from last to first."""
        for idx, layer in enumerate(self.layers):
            skip = concat_tensors[-1 - idx]
            x = layer(x, skip)
        return x
class DeepUnet(nn.Module):
    """Encoder / intermediate / decoder network with skip connections.

    The encoder is built with a fixed ``in_size`` of 128.
    """

    def __init__(
        self,
        kernel_size,
        n_blocks,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super().__init__()
        self.encoder = Encoder(
            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
        )
        # Encoder.out_channel is already doubled past the last stage, so the
        # tensor leaving the encoder has half that many channels.
        bottleneck_channels = self.encoder.out_channel
        self.intermediate = Intermediate(
            bottleneck_channels // 2,
            bottleneck_channels,
            inter_layers,
            n_blocks,
        )
        self.decoder = Decoder(
            bottleneck_channels, en_de_layers, kernel_size, n_blocks
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode, transform, and decode ``x`` using the encoder's skip tensors."""
        encoded, skips = self.encoder(x)
        bottleneck = self.intermediate(encoded)
        decoded = self.decoder(bottleneck, skips)
        return decoded
class E2E(nn.Module):
    """``DeepUnet`` followed by a 3-channel conv and a per-frame classifier head.

    The head maps ``3 * 128`` features per frame to 360 sigmoid outputs,
    either through a ``BiGRU`` plus linear layer (``n_gru`` truthy) or a
    single linear layer (``n_gru`` falsy).
    """

    def __init__(
        self,
        n_blocks,
        n_gru,
        kernel_size,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super(E2E, self).__init__()
        # Sizes used by the GRU branch; the linear-only branch previously
        # referenced nn.N_MELS / nn.N_CLASS, which do not exist in torch.nn
        # and raised AttributeError whenever n_gru was 0.
        n_mels = 128
        n_class = 360
        self.unet = DeepUnet(
            kernel_size,
            n_blocks,
            en_de_layers,
            inter_layers,
            in_channels,
            en_out_channels,
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
        if n_gru:
            self.fc = nn.Sequential(
                BiGRU(3 * n_mels, 256, n_gru),
                nn.Linear(512, n_class),
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )
        else:
            self.fc = nn.Sequential(
                nn.Linear(3 * n_mels, n_class), nn.Dropout(0.25), nn.Sigmoid()
            )

    def forward(self, mel):
        """Map a mel input to per-frame outputs.

        ``mel`` has its last two axes swapped and a channel axis inserted at
        dim 1 before entering the U-Net; the conv output is then rearranged so
        the channel and last axes are flattened into one feature axis.
        """
        mel = mel.transpose(-1, -2).unsqueeze(1)
        x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
        x = self.fc(x)
        return x
+from librosa.filters import mel
class MelSpectrogram(torch.nn.Module):
    """Log-mel spectrogram extractor with optional key shift and speed change."""

    def __init__(
        self,
        is_half,
        n_mel_channels,
        sampling_rate,
        win_length,
        hop_length,
        n_fft=None,
        mel_fmin=0,
        mel_fmax=None,
        clamp=1e-5,
    ):
        """Build the mel filter bank and store the STFT parameters.

        Args:
            is_half: When truthy, the mel output is cast to half precision.
            n_mel_channels: Number of mel bands.
            sampling_rate: Sample rate used to build the mel filter bank.
            win_length: Window length in samples.
            hop_length: Hop length in samples.
            n_fft: FFT size; defaults to ``win_length`` when ``None``.
            mel_fmin: Lowest filter-bank frequency.
            mel_fmax: Highest filter-bank frequency.
            clamp: Lower bound applied before taking the logarithm.
        """
        super().__init__()
        n_fft = win_length if n_fft is None else n_fft
        # Hann windows cached per "<keyshift>_<device>" key.
        self.hann_window = {}
        mel_basis = mel(
            sr=sampling_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
            htk=True,
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp
        self.is_half = is_half

    def forward(self, audio, keyshift=0, speed=1, center=True):
        """Compute the log-mel spectrogram of ``audio``.

        Args:
            audio: Waveform tensor passed to the STFT.
            keyshift: Shift in semitones; scales the FFT and window sizes by
                ``2 ** (keyshift / 12)``.
            speed: Multiplier applied to the hop length.
            center: Forwarded to ``torch.stft``. It is not used on the
                ``privateuseone`` path, which goes through ``STFT.transform``.

        Returns:
            ``log(clamp(mel_basis @ magnitude, min=self.clamp))``.
        """
        factor = 2 ** (keyshift / 12)
        n_fft_new = int(np.round(self.n_fft * factor))
        win_length_new = int(np.round(self.win_length * factor))
        hop_length_new = int(np.round(self.hop_length * speed))
        keyshift_key = str(keyshift) + "_" + str(audio.device)
        if keyshift_key not in self.hann_window:
            self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
                audio.device
            )
        if "privateuseone" in str(audio.device):
            # torch.stft is bypassed on this device type in favour of the
            # matmul-based STFT module. Rebuild the cached module whenever the
            # requested geometry differs; otherwise a changed keyshift/speed
            # would silently reuse the first call's sizes.
            stft = getattr(self, "stft", None)
            wanted = (n_fft_new, hop_length_new, win_length_new)
            if stft is None or (
                stft.filter_length,
                stft.hop_length,
                stft.win_length,
            ) != wanted:
                stft = STFT(
                    filter_length=n_fft_new,
                    hop_length=hop_length_new,
                    win_length=win_length_new,
                    window="hann",
                ).to(audio.device)
                self.stft = stft
            magnitude = stft.transform(audio)
        else:
            fft = torch.stft(
                audio,
                n_fft=n_fft_new,
                hop_length=hop_length_new,
                win_length=win_length_new,
                window=self.hann_window[keyshift_key],
                center=center,
                return_complex=True,
            )
            magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
        if keyshift != 0:
            # Bring the frequency axis back to the size the mel basis expects:
            # zero-pad when the shifted FFT is smaller, truncate when larger,
            # and rescale by the window-length ratio.
            size = self.n_fft // 2 + 1
            resize = magnitude.size(1)
            if resize < size:
                magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
        mel_output = torch.matmul(self.mel_basis, magnitude)
        if self.is_half:
            mel_output = mel_output.half()
        log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
        return log_mel_spec
class RMVPE:
    """Pitch (f0) estimator wrapping the ``E2E`` model or an ONNX session.

    Audio is converted to a log-mel spectrogram, passed through the model to
    obtain a per-frame salience over 360 bins, and decoded to a frequency via
    a salience-weighted average of cents around the strongest bin.
    """

    def __init__(self, model_path: str, is_half, device=None, use_jit=False):
        """Load the model.

        Args:
            model_path: Path to the checkpoint loaded with ``torch.load``.
            is_half: Run the torch model in half precision when truthy.
            device: Target device; defaults to ``"cuda:0"`` when CUDA is
                available, else ``"cpu"``. A device whose string contains
                ``"privateuseone"`` loads ``$rmvpe_root/rmvpe.onnx`` through
                onnxruntime instead.
            use_jit: Use the exported TorchScript model (not used for half
                precision on CPU).
        """
        self.resample_kernel = {}
        self.is_half = is_half
        if device is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.mel_extractor = MelSpectrogram(
            is_half, 128, 16000, 1024, 160, None, 30, 8000
        ).to(device)
        if "privateuseone" in str(device):
            import onnxruntime as ort

            ort_session = ort.InferenceSession(
                "%s/rmvpe.onnx" % os.environ["rmvpe_root"],
                providers=["DmlExecutionProvider"],
            )
            self.model = ort_session
        else:
            if str(self.device) == "cuda":
                self.device = torch.device("cuda:0")

            def get_jit_model():
                # Drop a real ".pth"/".pt" extension. str.rstrip(".pth") would
                # strip any trailing run of the characters '.', 'p', 't', 'h'
                # and mangle names such as "depth.pth".
                root, ext = os.path.splitext(model_path)
                jit_model_path = root if ext in (".pth", ".pt") else model_path
                jit_model_path += ".half.jit" if is_half else ".jit"
                reload = False
                if os.path.exists(jit_model_path):
                    ckpt = jit.load(jit_model_path)
                    model_device = ckpt["device"]
                    # Re-export when the cached model was built for another device.
                    if model_device != str(self.device):
                        reload = True
                else:
                    reload = True
                if reload:
                    ckpt = jit.rmvpe_jit_export(
                        model_path=model_path,
                        mode="script",
                        inputs_path=None,
                        save_path=jit_model_path,
                        device=device,
                        is_half=is_half,
                    )
                model = torch.jit.load(BytesIO(ckpt["model"]), map_location=device)
                return model

            def get_default_model():
                model = E2E(4, 1, (2, 2))
                # NOTE(review): torch.load unpickles the file; only load
                # checkpoints from a trusted source.
                ckpt = torch.load(model_path, map_location="cpu")
                model.load_state_dict(ckpt)
                model.eval()
                if is_half:
                    model = model.half()
                else:
                    model = model.float()
                return model

            if use_jit:
                if is_half and "cpu" in str(self.device):
                    logger.warning(
                        "Use default rmvpe model. \
                                 Jit is not supported on the CPU for half floating point"
                    )
                    self.model = get_default_model()
                else:
                    self.model = get_jit_model()
            else:
                self.model = get_default_model()
            self.model = self.model.to(device)
        # Cents value of each of the 360 bins, zero-padded by 4 on both sides
        # (368 entries) to match the padded salience used when decoding.
        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
        self.cents_mapping = np.pad(cents_mapping, (4, 4))  # 368

    def mel2hidden(self, mel):
        """Run the model on ``mel`` and return the output cropped to the input frames.

        The last axis is right-padded to a multiple of 32 frames before
        inference; the padding is removed from the result.
        """
        with torch.no_grad():
            n_frames = mel.shape[-1]
            n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
            if n_pad > 0:
                mel = F.pad(mel, (0, n_pad), mode="constant")
            if "privateuseone" in str(self.device):
                onnx_input_name = self.model.get_inputs()[0].name
                onnx_outputs_names = self.model.get_outputs()[0].name
                hidden = self.model.run(
                    [onnx_outputs_names],
                    input_feed={onnx_input_name: mel.cpu().numpy()},
                )[0]
            else:
                mel = mel.half() if self.is_half else mel.float()
                hidden = self.model(mel)
            return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03):
        """Convert a salience array to f0 values; unvoiced frames become 0."""
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        # A cents value of 0 marks an unvoiced frame and maps to exactly 10.
        f0[f0 == 10] = 0
        return f0

    def infer_from_audio(self, audio, thred=0.03):
        """Estimate f0 for a waveform.

        Args:
            audio: NumPy array or tensor of samples; a batch axis is added
                internally. # assumes 16 kHz mono, matching the mel extractor
                settings - TODO confirm with callers.
            thred: Salience threshold below which a frame is reported as 0.

        Returns:
            NumPy array with one f0 value per frame.
        """
        if not torch.is_tensor(audio):
            audio = torch.from_numpy(audio)
        mel = self.mel_extractor(
            audio.float().to(self.device).unsqueeze(0), center=True
        )
        hidden = self.mel2hidden(mel)
        if "privateuseone" not in str(self.device):
            hidden = hidden.squeeze(0).cpu().numpy()
        else:
            # The ONNX session already returns a NumPy array.
            hidden = hidden[0]
        if self.is_half:
            hidden = hidden.astype("float32")
        f0 = self.decode(hidden, thred=thred)
        return f0

    def to_local_average_cents(self, salience, thred=0.05):
        """Return the salience-weighted mean cents around each frame's peak.

        Args:
            salience: Array of shape (frames, 360).
            thred: Frames whose maximum salience is ``<= thred`` yield 0.

        Returns:
            Array of shape (frames,) with cents values.
        """
        peak = np.argmax(salience, axis=1)  # (frames,) index of strongest bin
        salience = np.pad(salience, ((0, 0), (4, 4)))  # (frames, 368)
        # In padded coordinates the 9-bin window around the peak spans
        # peak .. peak + 8. Gather every window in one indexing operation
        # rather than slicing the whole matrix once per frame.
        cols = peak[:, None] + np.arange(9)[None, :]  # (frames, 9)
        rows = np.arange(salience.shape[0])[:, None]
        todo_salience = salience[rows, cols]  # (frames, 9)
        todo_cents_mapping = self.cents_mapping[cols]  # (frames, 9)
        product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
        weight_sum = np.sum(todo_salience, 1)  # (frames,)
        # Frames with zero total weight stay 0 rather than producing 0/0.
        devided = np.divide(
            product_sum,
            weight_sum,
            out=np.zeros_like(product_sum),
            where=weight_sum != 0,
        )
        maxx = np.max(salience, axis=1)  # (frames,)
        devided[maxx <= thred] = 0
        return devided
if __name__ == "__main__":
    # Manual smoke test: load a clip, resample it to 16 kHz if needed, run
    # RMVPE once and log the f0 shape together with the elapsed seconds.
    import librosa
    import soundfile as sf

    audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav")
    is_multichannel = audio.ndim > 1
    if is_multichannel:
        audio = librosa.to_mono(audio.transpose(1, 0))
    audio_bak = audio.copy()
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    model_path = r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt"
    thred = 0.03  # 0.01
    device = "cuda" if torch.cuda.is_available() else "cpu"
    rmvpe = RMVPE(model_path, is_half=False, device=device)

    t0 = ttime()
    f0 = rmvpe.infer_from_audio(audio, thred=thred)
    t1 = ttime()
    elapsed = t1 - t0
    logger.info("%s %.2f", f0.shape, elapsed)

libs/rtrvc.py ADDED Viewed

	@@ -0,0 +1,461 @@

+from io import BytesIO
+import os
+import sys
+import traceback
+from infer.lib import jit
+from infer.lib.jit.get_synthesizer import get_synthesizer
+from time import time as ttime
+import fairseq
+import faiss
+import numpy as np
+import parselmouth
+import pyworld
+import scipy.signal as signal
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchcrepe
+from torchaudio.transforms import Resample
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+from multiprocessing import Manager as M
+from configs.config import Config
+# config = Config()
+mm = M()
def printt(strr, *args):
    """Print ``strr``, %-formatted with ``args`` when any are given."""
    message = strr % args if args else strr
    print(message)
+# config.device=torch.device("cpu")########强制cpu测试
+# config.is_half=False########强制cpu测试
+class RVC:
+    def __init__(
+        self,
+        key,
+        formant,
+        pth_path,
+        index_path,
+        index_rate,
+        n_cpu,
+        inp_q,
+        opt_q,
+        config: Config,
+        last_rvc=None,
+    ) -> None:
+        """
+        初始化
+        """
+        try:
+            if config.dml == True:
+                def forward_dml(ctx, x, scale):
+                    ctx.scale = scale
+                    res = x.clone().detach()
+                    return res
+                fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml
+            # global config
+            self.config = config
+            self.inp_q = inp_q
+            self.opt_q = opt_q
+            # device="cpu"########强制cpu测试
+            self.device = config.device
+            self.f0_up_key = key
+            self.formant_shift = formant
+            self.f0_min = 50
+            self.f0_max = 1100
+            self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
+            self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+            self.n_cpu = n_cpu
+            self.use_jit = self.config.use_jit
+            self.is_half = config.is_half
+            if index_rate != 0:
+                self.index = faiss.read_index(index_path)
+                self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
+                printt("Index search enabled")
+            self.pth_path: str = pth_path
+            self.index_path = index_path
+            self.index_rate = index_rate
+            self.cache_pitch: torch.Tensor = torch.zeros(
+                1024, device=self.device, dtype=torch.long
+            )
+            self.cache_pitchf = torch.zeros(
+                1024, device=self.device, dtype=torch.float32
+            )
+            self.resample_kernel = {}
+            if last_rvc is None:
+                models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
+                    ["assets/hubert/hubert_base.pt"],
+                    suffix="",
+                )
+                hubert_model = models[0]
+                hubert_model = hubert_model.to(self.device)
+                if self.is_half:
+                    hubert_model = hubert_model.half()
+                else:
+                    hubert_model = hubert_model.float()
+                hubert_model.eval()
+                self.model = hubert_model
+            else:
+                self.model = last_rvc.model
+            self.net_g: nn.Module = None
+            def set_default_model():
+                self.net_g, cpt = get_synthesizer(self.pth_path, self.device)
+                self.tgt_sr = cpt["config"][-1]
+                cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+                self.if_f0 = cpt.get("f0", 1)
+                self.version = cpt.get("version", "v1")
+                if self.is_half:
+                    self.net_g = self.net_g.half()
+                else:
+                    self.net_g = self.net_g.float()
+            def set_jit_model():
+                jit_pth_path = self.pth_path.rstrip(".pth")
+                jit_pth_path += ".half.jit" if self.is_half else ".jit"
+                reload = False
+                if str(self.device) == "cuda":
+                    self.device = torch.device("cuda:0")
+                if os.path.exists(jit_pth_path):
+                    cpt = jit.load(jit_pth_path)
+                    model_device = cpt["device"]
+                    if model_device != str(self.device):
+                        reload = True
+                else:
+                    reload = True
+                if reload:
+                    cpt = jit.synthesizer_jit_export(
+                        self.pth_path,
+                        "script",
+                        None,
+                        device=self.device,
+                        is_half=self.is_half,
+                    )
+                self.tgt_sr = cpt["config"][-1]
+                self.if_f0 = cpt.get("f0", 1)
+                self.version = cpt.get("version", "v1")
+                self.net_g = torch.jit.load(
+                    BytesIO(cpt["model"]), map_location=self.device
+                )
+                self.net_g.infer = self.net_g.forward
+                self.net_g.eval().to(self.device)
+            def set_synthesizer():
+                """Pick the JIT or the default synthesizer loader based on ``use_jit``, ``config.dml`` and precision."""
+                if self.use_jit and not config.dml:
+                    # Half precision on a CPU device falls back to the default model.
+                    if self.is_half and "cpu" in str(self.device):
+                        printt(
+                            "Use default Synthesizer model. \
+                                    Jit is not supported on the CPU for half floating point"
+                        )
+                        set_default_model()
+                    else:
+                        set_jit_model()
+                else:
+                    set_default_model()
+            if last_rvc is None or last_rvc.pth_path != self.pth_path:
+                set_synthesizer()
+            else:
+                self.tgt_sr = last_rvc.tgt_sr
+                self.if_f0 = last_rvc.if_f0
+                self.version = last_rvc.version
+                self.is_half = last_rvc.is_half
+                if last_rvc.use_jit != self.use_jit:
+                    set_synthesizer()
+                else:
+                    self.net_g = last_rvc.net_g
+            if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"):
+                self.model_rmvpe = last_rvc.model_rmvpe
+            if last_rvc is not None and hasattr(last_rvc, "model_fcpe"):
+                self.device_fcpe = last_rvc.device_fcpe
+                self.model_fcpe = last_rvc.model_fcpe
+        except:
+            printt(traceback.format_exc())
+    def change_key(self, new_key):
+        """Store ``new_key`` as ``self.f0_up_key`` (read by ``infer`` when it requests pitch)."""
+        self.f0_up_key = new_key
+    def change_formant(self, new_formant):
+        """Store ``new_formant`` as ``self.formant_shift`` (read by ``infer`` as a semitone offset)."""
+        self.formant_shift = new_formant
+    def change_index_rate(self, new_index_rate):
+        if new_index_rate != 0 and self.index_rate == 0:
+            self.index = faiss.read_index(self.index_path)
+            self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
+            printt("Index search enabled")
+        self.index_rate = new_index_rate
+    def get_f0_post(self, f0):
+        if not torch.is_tensor(f0):
+            f0 = torch.from_numpy(f0)
+        f0 = f0.float().to(self.device).squeeze()
+        f0_mel = 1127 * torch.log(1 + f0 / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
+            self.f0_mel_max - self.f0_mel_min
+        ) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        f0_coarse = torch.round(f0_mel).long()
+        return f0_coarse, f0
+    def get_f0(self, x, f0_up_key, n_cpu, method="harvest"):
+        """Estimate pitch for ``x`` with the chosen method and post-process it.
+
+        Args:
+            x: audio samples; the pm/harvest paths call ``x.cpu().numpy()``,
+                so a torch tensor is expected there.
+            f0_up_key: pitch shift in semitones, applied as ``2 ** (f0_up_key / 12)``.
+            n_cpu: number of harvest segments to hand to the worker queue;
+                1 runs harvest inline.
+            method: "crepe", "rmvpe", "fcpe" or "pm"; any other value uses harvest.
+
+        Returns:
+            The ``(f0_coarse, f0)`` tuple produced by ``get_f0_post``.
+        """
+        n_cpu = int(n_cpu)
+        # The tensor-based estimators are delegated to their own methods.
+        if method == "crepe":
+            return self.get_f0_crepe(x, f0_up_key)
+        if method == "rmvpe":
+            return self.get_f0_rmvpe(x, f0_up_key)
+        if method == "fcpe":
+            return self.get_f0_fcpe(x, f0_up_key)
+        x = x.cpu().numpy()
+        if method == "pm":
+            # One pitch frame per 160 samples (10 ms at 16 kHz).
+            p_len = x.shape[0] // 160 + 1
+            f0_min = 65
+            # Pad both ends by 1.5 periods of the pitch floor (right side one
+            # sample more); the assert below checks the first frame time.
+            l_pad = int(np.ceil(1.5 / f0_min * 16000))
+            r_pad = l_pad + 1
+            s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), 16000).to_pitch_ac(
+                time_step=0.01,
+                voicing_threshold=0.6,
+                pitch_floor=f0_min,
+                pitch_ceiling=1100,
+            )
+            assert np.abs(s.t1 - 1.5 / f0_min) < 0.001
+            f0 = s.selected_array["frequency"]
+            # Zero-pad or truncate to exactly p_len frames.
+            if len(f0) < p_len:
+                f0 = np.pad(f0, (0, p_len - len(f0)))
+            f0 = f0[:p_len]
+            f0 *= pow(2, f0_up_key / 12)
+            return self.get_f0_post(f0)
+        if n_cpu == 1:
+            # Single-process harvest over the whole signal.
+            f0, t = pyworld.harvest(
+                x.astype(np.double),
+                fs=16000,
+                f0_ceil=1100,
+                f0_floor=50,
+                frame_period=10,
+            )
+            f0 = signal.medfilt(f0, 3)
+            f0 *= pow(2, f0_up_key / 12)
+            return self.get_f0_post(f0)
+        # Multi-segment harvest: split x into parts whose length is a multiple
+        # of 160 samples and queue them on self.inp_q (presumably consumed by
+        # worker processes that fill res_f0 -- TODO confirm against the workers).
+        f0bak = np.zeros(x.shape[0] // 160 + 1, dtype=np.float64)
+        length = len(x)
+        part_length = 160 * ((length // 160 - 1) // n_cpu + 1)
+        # Recompute the segment count actually needed for this part length.
+        n_cpu = (length // 160 - 1) // (part_length // 160) + 1
+        ts = ttime()
+        res_f0 = mm.dict()
+        for idx in range(n_cpu):
+            # Each segment is extended by 320 samples (2 frames) on each side,
+            # except that the first segment has no left extension.
+            tail = part_length * (idx + 1) + 320
+            if idx == 0:
+                self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts))
+            else:
+                self.inp_q.put(
+                    (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts)
+                )
+        # Block until the output queue yields this request's timestamp.
+        while 1:
+            res_ts = self.opt_q.get()
+            if res_ts == ts:
+                break
+        # Order the per-segment results by segment index.
+        f0s = [i[1] for i in sorted(res_f0.items(), key=lambda x: x[0])]
+        for idx, f0 in enumerate(f0s):
+            # Drop the frames that belong to the overlap regions before stitching.
+            if idx == 0:
+                f0 = f0[:-3]
+            elif idx != n_cpu - 1:
+                f0 = f0[2:-3]
+            else:
+                f0 = f0[2:]
+            f0bak[part_length * idx // 160 : part_length * idx // 160 + f0.shape[0]] = (
+                f0
+            )
+        f0bak = signal.medfilt(f0bak, 3)
+        f0bak *= pow(2, f0_up_key / 12)
+        return self.get_f0_post(f0bak)
+    def get_f0_crepe(self, x, f0_up_key):
+        """Estimate pitch with torchcrepe and return ``get_f0_post``'s result.
+
+        On a "privateuseone" device the request is redirected to the fcpe method.
+        """
+        if "privateuseone" in str(
+            self.device
+        ):  # DML is not supported and CPU is too slow to be usable, so fcpe is used instead
+            return self.get_f0(x, f0_up_key, 1, "fcpe")
+        # printt("using crepe,device:%s"%self.device)
+        f0, pd = torchcrepe.predict(
+            x.unsqueeze(0).float(),
+            16000,
+            160,
+            self.f0_min,
+            self.f0_max,
+            "full",
+            batch_size=512,
+            # device=self.device if self.device.type!="privateuseone" else "cpu",  # crepe always runs in full precision so half precision is not a concern; CPU latency is too high to be usable
+            device=self.device,
+            return_periodicity=True,
+        )
+        # Smooth the periodicity and the pitch with 3-frame filters, then zero
+        # the pitch wherever periodicity is below 0.1.
+        pd = torchcrepe.filter.median(pd, 3)
+        f0 = torchcrepe.filter.mean(f0, 3)
+        f0[pd < 0.1] = 0
+        f0 *= pow(2, f0_up_key / 12)
+        return self.get_f0_post(f0)
+    def get_f0_rmvpe(self, x, f0_up_key):
+        if hasattr(self, "model_rmvpe") == False:
+            from infer.lib.rmvpe import RMVPE
+            printt("Loading rmvpe model")
+            self.model_rmvpe = RMVPE(
+                "assets/rmvpe/rmvpe.pt",
+                is_half=self.is_half,
+                device=self.device,
+                use_jit=self.config.use_jit,
+            )
+        f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+        f0 *= pow(2, f0_up_key / 12)
+        return self.get_f0_post(f0)
+    def get_f0_fcpe(self, x, f0_up_key):
+        if hasattr(self, "model_fcpe") == False:
+            from torchfcpe import spawn_bundled_infer_model
+            printt("Loading fcpe model")
+            if "privateuseone" in str(self.device):
+                self.device_fcpe = "cpu"
+            else:
+                self.device_fcpe = self.device
+            self.model_fcpe = spawn_bundled_infer_model(self.device_fcpe)
+        f0 = self.model_fcpe.infer(
+            x.to(self.device_fcpe).unsqueeze(0).float(),
+            sr=16000,
+            decoder_mode="local_argmax",
+            threshold=0.006,
+        )
+        f0 *= pow(2, f0_up_key / 12)
+        return self.get_f0_post(f0)
+    def infer(
+        self,
+        input_wav: torch.Tensor,
+        block_frame_16k,
+        skip_head,
+        return_length,
+        f0method,
+    ) -> torch.Tensor:
+        """Convert one block of audio and return the synthesized waveform.
+
+        Args:
+            input_wav: 1-D tensor of input samples (frames are counted as
+                ``shape[0] // 160``).
+            block_frame_16k: number of new samples in this block; used to size
+                the pitch-extraction window and to shift the pitch caches.
+            skip_head: forwarded to the synthesizer; ``skip_head // 2`` is also
+                the first feature row that index retrieval touches.
+            return_length: forwarded to the synthesizer and used to trim the
+                output before resampling.
+            f0method: pitch method name passed to ``get_f0``.
+
+        Returns:
+            The synthesized audio as a squeezed float tensor.
+        """
+        t1 = ttime()
+        with torch.no_grad():
+            # Shape the input as a batch of one, in the configured precision.
+            if self.config.is_half:
+                feats = input_wav.half().view(1, -1)
+            else:
+                feats = input_wav.float().view(1, -1)
+            # All-False mask: no position is marked as padding.
+            padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+            inputs = {
+                "source": feats,
+                "padding_mask": padding_mask,
+                "output_layer": 9 if self.version == "v1" else 12,
+            }
+            logits = self.model.extract_features(**inputs)
+            # Only "v1" passes the features through final_proj.
+            feats = (
+                self.model.final_proj(logits[0]) if self.version == "v1" else logits[0]
+            )
+            # Append a copy of the last frame.
+            feats = torch.cat((feats, feats[:, -1:, :]), 1)
+        t2 = ttime()
+        # NOTE(review): the bare except below also catches KeyboardInterrupt
+        # and SystemExit; index failures are logged and inference continues.
+        try:
+            if hasattr(self, "index") and self.index_rate != 0:
+                npy = feats[0][skip_head // 2 :].cpu().numpy().astype("float32")
+                # 8 nearest neighbours per frame.
+                score, ix = self.index.search(npy, k=8)
+                if (ix >= 0).all():
+                    # Weight neighbours by the squared reciprocal of their score,
+                    # normalised per frame.
+                    weight = np.square(1 / score)
+                    weight /= weight.sum(axis=1, keepdims=True)
+                    npy = np.sum(
+                        self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1
+                    )
+                    if self.config.is_half:
+                        npy = npy.astype("float16")
+                    # Blend retrieved and extracted features by index_rate.
+                    feats[0][skip_head // 2 :] = (
+                        torch.from_numpy(npy).unsqueeze(0).to(self.device)
+                        * self.index_rate
+                        + (1 - self.index_rate) * feats[0][skip_head // 2 :]
+                    )
+                else:
+                    printt(
+                        "Invalid index. You MUST use added_xxxx.index but not trained_xxxx.index!"
+                    )
+            else:
+                printt("Index search FAILED or disabled")
+        except:
+            traceback.print_exc()
+            printt("Index search FAILED")
+        t3 = ttime()
+        p_len = input_wav.shape[0] // 160
+        # Formant shift in semitones expressed as a frequency ratio.
+        factor = pow(2, self.formant_shift / 12)
+        return_length2 = int(np.ceil(return_length * factor))
+        if self.if_f0 == 1:
+            # Pitch is extracted from the new block plus 800 extra samples.
+            f0_extractor_frame = block_frame_16k + 800
+            if f0method == "rmvpe":
+                # Round up to a multiple of 5120 samples, minus 160.
+                f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
+            pitch, pitchf = self.get_f0(
+                input_wav[-f0_extractor_frame:], self.f0_up_key - self.formant_shift, self.n_cpu, f0method
+            )
+            # Scroll both pitch caches left by one block, then overwrite the
+            # tail with the fresh estimate (dropping its first 3 and last 1 frames).
+            shift = block_frame_16k // 160
+            self.cache_pitch[:-shift] = self.cache_pitch[shift:].clone()
+            self.cache_pitchf[:-shift] = self.cache_pitchf[shift:].clone()
+            self.cache_pitch[4 - pitch.shape[0] :] = pitch[3:-1]
+            self.cache_pitchf[4 - pitch.shape[0] :] = pitchf[3:-1]
+            cache_pitch = self.cache_pitch[None, -p_len:]
+            # Scale the float pitch by the ratio of the two return lengths.
+            cache_pitchf = self.cache_pitchf[None, -p_len:] * return_length2 / return_length
+        t4 = ttime()
+        # Double the number of feature frames, then trim to p_len.
+        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+        feats = feats[:, :p_len, :]
+        p_len = torch.LongTensor([p_len]).to(self.device)
+        sid = torch.LongTensor([0]).to(self.device)
+        skip_head = torch.LongTensor([skip_head])
+        return_length2 = torch.LongTensor([return_length2])
+        return_length = torch.LongTensor([return_length])
+        with torch.no_grad():
+            if self.if_f0 == 1:
+                infered_audio, _, _ = self.net_g.infer(
+                    feats,
+                    p_len,
+                    cache_pitch,
+                    cache_pitchf,
+                    sid,
+                    skip_head,
+                    return_length,
+                    return_length2,
+                )
+            else:
+                infered_audio, _, _ = self.net_g.infer(
+                    feats, p_len, sid, skip_head, return_length, return_length2
+                )
+        infered_audio = infered_audio.squeeze(1).float()
+        # When the formant factor changes the per-frame sample count, resample
+        # back to tgt_sr // 100 with a kernel cached per upp_res value.
+        upp_res = int(np.floor(factor * self.tgt_sr // 100))
+        if upp_res != self.tgt_sr // 100:
+            if upp_res not in self.resample_kernel:
+                self.resample_kernel[upp_res] = Resample(
+                    orig_freq=upp_res,
+                    new_freq=self.tgt_sr // 100,
+                    dtype=torch.float32,
+                ).to(self.device)
+            infered_audio = self.resample_kernel[upp_res](
+                infered_audio[:, : return_length * upp_res]
+            )
+        t5 = ttime()
+        printt(
+            "Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs",
+            t2 - t1,
+            t3 - t2,
+            t4 - t3,
+            t5 - t4,
+        )
+        return infered_audio.squeeze()

libs/slicer2.py ADDED Viewed

	@@ -0,0 +1,260 @@

+import numpy as np
+# This function is obtained from librosa.
+def get_rms(
+    y,
+    frame_length=2048,
+    hop_length=512,
+    pad_mode="constant",
+):
+    padding = (int(frame_length // 2), int(frame_length // 2))
+    y = np.pad(y, padding, mode=pad_mode)
+    axis = -1
+    # put our new within-frame axis at the end for now
+    out_strides = y.strides + tuple([y.strides[axis]])
+    # Reduce the shape on the framing axis
+    x_shape_trimmed = list(y.shape)
+    x_shape_trimmed[axis] -= frame_length - 1
+    out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
+    xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
+    if axis < 0:
+        target_axis = axis - 1
+    else:
+        target_axis = axis + 1
+    xw = np.moveaxis(xw, -1, target_axis)
+    # Downsample along the target axis
+    slices = [slice(None)] * xw.ndim
+    slices[axis] = slice(0, None, hop_length)
+    x = xw[tuple(slices)]
+    # Calculate power
+    power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
+    return np.sqrt(power)
+class Slicer:
+    def __init__(
+        self,
+        sr: int,
+        threshold: float = -40.0,
+        min_length: int = 5000,
+        min_interval: int = 300,
+        hop_size: int = 20,
+        max_sil_kept: int = 5000,
+    ):
+        if not min_length >= min_interval >= hop_size:
+            raise ValueError(
+                "The following condition must be satisfied: min_length >= min_interval >= hop_size"
+            )
+        if not max_sil_kept >= hop_size:
+            raise ValueError(
+                "The following condition must be satisfied: max_sil_kept >= hop_size"
+            )
+        min_interval = sr * min_interval / 1000
+        self.threshold = 10 ** (threshold / 20.0)
+        self.hop_size = round(sr * hop_size / 1000)
+        self.win_size = min(round(min_interval), 4 * self.hop_size)
+        self.min_length = round(sr * min_length / 1000 / self.hop_size)
+        self.min_interval = round(min_interval / self.hop_size)
+        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
+    def _apply_slice(self, waveform, begin, end):
+        if len(waveform.shape) > 1:
+            return waveform[
+                :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
+            ]
+        else:
+            return waveform[
+                begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
+            ]
+    # @timeit
+    def slice(self, waveform):
+        if len(waveform.shape) > 1:
+            samples = waveform.mean(axis=0)
+        else:
+            samples = waveform
+        if samples.shape[0] <= self.min_length:
+            return [waveform]
+        rms_list = get_rms(
+            y=samples, frame_length=self.win_size, hop_length=self.hop_size
+        ).squeeze(0)
+        sil_tags = []
+        silence_start = None
+        clip_start = 0
+        for i, rms in enumerate(rms_list):
+            # Keep looping while frame is silent.
+            if rms < self.threshold:
+                # Record start of silent frames.
+                if silence_start is None:
+                    silence_start = i
+                continue
+            # Keep looping while frame is not silent and silence start has not been recorded.
+            if silence_start is None:
+                continue
+            # Clear recorded silence start if interval is not enough or clip is too short
+            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
+            need_slice_middle = (
+                i - silence_start >= self.min_interval
+                and i - clip_start >= self.min_length
+            )
+            if not is_leading_silence and not need_slice_middle:
+                silence_start = None
+                continue
+            # Need slicing. Record the range of silent frames to be removed.
+            if i - silence_start <= self.max_sil_kept:
+                pos = rms_list[silence_start : i + 1].argmin() + silence_start
+                if silence_start == 0:
+                    sil_tags.append((0, pos))
+                else:
+                    sil_tags.append((pos, pos))
+                clip_start = pos
+            elif i - silence_start <= self.max_sil_kept * 2:
+                pos = rms_list[
+                    i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
+                ].argmin()
+                pos += i - self.max_sil_kept
+                pos_l = (
+                    rms_list[
+                        silence_start : silence_start + self.max_sil_kept + 1
+                    ].argmin()
+                    + silence_start
+                )
+                pos_r = (
+                    rms_list[i - self.max_sil_kept : i + 1].argmin()
+                    + i
+                    - self.max_sil_kept
+                )
+                if silence_start == 0:
+                    sil_tags.append((0, pos_r))
+                    clip_start = pos_r
+                else:
+                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
+                    clip_start = max(pos_r, pos)
+            else:
+                pos_l = (
+                    rms_list[
+                        silence_start : silence_start + self.max_sil_kept + 1
+                    ].argmin()
+                    + silence_start
+                )
+                pos_r = (
+                    rms_list[i - self.max_sil_kept : i + 1].argmin()
+                    + i
+                    - self.max_sil_kept
+                )
+                if silence_start == 0:
+                    sil_tags.append((0, pos_r))
+                else:
+                    sil_tags.append((pos_l, pos_r))
+                clip_start = pos_r
+            silence_start = None
+        # Deal with trailing silence.
+        total_frames = rms_list.shape[0]
+        if (
+            silence_start is not None
+            and total_frames - silence_start >= self.min_interval
+        ):
+            silence_end = min(total_frames, silence_start + self.max_sil_kept)
+            pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
+            sil_tags.append((pos, total_frames + 1))
+        # Apply and return slices.
+        if len(sil_tags) == 0:
+            return [waveform]
+        else:
+            chunks = []
+            if sil_tags[0][0] > 0:
+                chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
+            for i in range(len(sil_tags) - 1):
+                chunks.append(
+                    self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
+                )
+            if sil_tags[-1][1] < total_frames:
+                chunks.append(
+                    self._apply_slice(waveform, sil_tags[-1][1], total_frames)
+                )
+            return chunks
+def main():
+    import os.path
+    from argparse import ArgumentParser
+    import librosa
+    import soundfile
+    parser = ArgumentParser()
+    parser.add_argument("audio", type=str, help="The audio to be sliced")
+    parser.add_argument(
+        "--out", type=str, help="Output directory of the sliced audio clips"
+    )
+    parser.add_argument(
+        "--db_thresh",
+        type=float,
+        required=False,
+        default=-40,
+        help="The dB threshold for silence detection",
+    )
+    parser.add_argument(
+        "--min_length",
+        type=int,
+        required=False,
+        default=5000,
+        help="The minimum milliseconds required for each sliced audio clip",
+    )
+    parser.add_argument(
+        "--min_interval",
+        type=int,
+        required=False,
+        default=300,
+        help="The minimum milliseconds for a silence part to be sliced",
+    )
+    parser.add_argument(
+        "--hop_size",
+        type=int,
+        required=False,
+        default=10,
+        help="Frame length in milliseconds",
+    )
+    parser.add_argument(
+        "--max_sil_kept",
+        type=int,
+        required=False,
+        default=500,
+        help="The maximum silence length kept around the sliced clip, presented in milliseconds",
+    )
+    args = parser.parse_args()
+    out = args.out
+    if out is None:
+        out = os.path.dirname(os.path.abspath(args.audio))
+    audio, sr = librosa.load(args.audio, sr=None, mono=False)
+    slicer = Slicer(
+        sr=sr,
+        threshold=args.db_thresh,
+        min_length=args.min_length,
+        min_interval=args.min_interval,
+        hop_size=args.hop_size,
+        max_sil_kept=args.max_sil_kept,
+    )
+    chunks = slicer.slice(audio)
+    if not os.path.exists(out):
+        os.makedirs(out)
+    for i, chunk in enumerate(chunks):
+        if len(chunk.shape) > 1:
+            chunk = chunk.T
+        soundfile.write(
+            os.path.join(
+                out,
+                f"%s_%d.wav"
+                % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
+            ),
+            chunk,
+            sr,
+        )
+# Run the command-line slicer only when this file is executed as a script.
+if __name__ == "__main__":
+    main()