initial commit
- .gitignore +9 -0
- app.py +278 -0
- ckpt/encodec_voicecraft.pt +3 -0
- encodec/LICENSE +21 -0
- encodec/__init__.py +6 -0
- encodec/distrib.py +125 -0
- encodec/encodec.py +171 -0
- encodec/modules/__init__.py +17 -0
- encodec/modules/conv.py +342 -0
- encodec/modules/lstm.py +27 -0
- encodec/modules/norm.py +32 -0
- encodec/modules/seanet.py +352 -0
- encodec/quantization/__init__.py +7 -0
- encodec/quantization/core_vq.py +370 -0
- encodec/quantization/vq.py +95 -0
- requirements.txt +5 -0
.gitignore
ADDED
@@ -0,0 +1,9 @@
**/..git/*
__pycache__/
.DS_Store
._.DS_Store
.ipynb_checkpoints/
.vscode/
*.egg-info/
.pytest_cache
*.ipynb_checkpoints/
app.py
ADDED
@@ -0,0 +1,278 @@
import os
import sys
import uuid
from pathlib import Path
from contextlib import contextmanager

import numpy as np
import torch
import matplotlib.pyplot as plt
import gradio as gr
from scipy.io.wavfile import write as wavwrite

from audiotools import AudioSignal
from audioseal import AudioSeal

# allow local imports of your encodec folder
@contextmanager
def chdir(path: str):
    origin = Path().absolute()
    try:
        os.chdir(path)
        yield
    finally:
        os.chdir(origin)


_path = Path(__file__).parent
sys.path.insert(0, str(_path))
with chdir(_path):
    from encodec import Encodec


OUT_DIR = _path / "gradio-outputs"
OUT_DIR.mkdir(exist_ok=True)

LOUDNESS_DB = -16.
SAMPLE_RATE = 48_000
ENCODEC_SAMPLE_RATE = 16_000
AUDIOSEAL_SAMPLE_RATE = 16_000

# load codec
config = {
    "sample_rate": 16_000,
    "target_bandwidths": [2.2],
    "channels": 1,
    "causal": False,
    "codebook_size": 2048,
    "n_filters": 64,
    "model_norm": "weight_norm",
    "audio_normalize": False,
    "true_skip": True,
    "ratios": [8, 5, 4, 2],
    "encoder_kwargs": {"pad_mode": "constant"},
    "decoder_kwargs": {"pad_mode": "constant"},
}
codec = Encodec(**config)
codec.load_state_dict(torch.load("ckpt/encodec_voicecraft.pt", map_location="cpu"))
codec.eval()
for p in codec.parameters(): p.requires_grad_(False)
codec.set_target_bandwidth(2.2)

# watermark models
embedder = AudioSeal.load_generator("audioseal_wm_16bits")
detector = AudioSeal.load_detector("audioseal_detector_16bits")


@torch.no_grad()
def encode(signal: AudioSignal, codec: torch.nn.Module):
    n_b, n_ch, n_s = signal.shape
    sr = signal.sample_rate
    loud_db = signal.loudness()
    x = signal.clone().resample(ENCODEC_SAMPLE_RATE).audio_data
    x = x.reshape(n_b * n_ch, 1, -1)
    codes, *_ = codec.encode(x)
    return codes, n_b, n_ch, n_s, sr, loud_db

@torch.no_grad()
def decode(codes, n_b, n_ch, n_s, sr, loud_db, codec):
    x = codec.decode(codes).reshape(n_b, n_ch, -1)
    sig = AudioSignal(x, sample_rate=ENCODEC_SAMPLE_RATE)
    sig = sig.resample(sr)
    sig.audio_data = sig.audio_data[..., :n_s]
    sig.audio_data = torch.nn.functional.pad(
        sig.audio_data, (0, max(0, n_s - sig.signal_length))
    )
    return sig.normalize(loud_db)

@torch.no_grad()
def split_bands(signal: AudioSignal, sample_rate: float = ENCODEC_SAMPLE_RATE):
    nyq = sample_rate // 2
    high = signal.clone().high_pass(cutoffs=int(nyq * 0.95), zeros=51)
    low = signal.clone().low_pass(cutoffs=int(nyq * 1.05), zeros=51)
    loud_db = low.loudness()
    low = low.resample(sample_rate)
    return low, high, loud_db

@torch.no_grad()
def merge_bands(low, high, loud_db):
    low = low.clone().to(high.device).resample(high.sample_rate)
    low.audio_data = low.audio_data[..., :high.signal_length]
    low.audio_data = torch.nn.functional.pad(
        low.audio_data, (0, max(0, high.signal_length - low.signal_length))
    )
    return low.normalize(loud_db) + high

@torch.no_grad()
def attack(signal: AudioSignal, codec, split_rate_hz=AUDIOSEAL_SAMPLE_RATE):
    if split_rate_hz:
        low, high, loud_db = split_bands(signal, split_rate_hz)
        low = decode(*encode(low, codec), codec)
        return merge_bands(low, high, loud_db)
    else:
        return decode(*encode(signal, codec), codec)

@torch.no_grad()
def embed(signal: AudioSignal, embedder: torch.nn.Module):
    orig_ch, orig_sr = signal.num_channels, signal.sample_rate
    sig = signal.clone().resample(SAMPLE_RATE)
    if orig_ch > 1:
        b, c, n = sig.audio_data.shape
        sig.audio_data = sig.audio_data.reshape(b * c, 1, n)
    low, high, loud = split_bands(sig.clone(), AUDIOSEAL_SAMPLE_RATE)
    wm = embedder.get_watermark(low.audio_data, AUDIOSEAL_SAMPLE_RATE)
    low.audio_data = low.audio_data + wm
    merged = merge_bands(low, high, loud)
    if orig_ch > 1:
        b2, c2, n2 = merged.audio_data.shape
        merged.audio_data = merged.audio_data.reshape(-1, orig_ch * c2, n2)
    return merged.resample(orig_sr)

@torch.no_grad()
def detect(signal: AudioSignal, detector: torch.nn.Module):
    sig = signal.clone().to_mono().resample(AUDIOSEAL_SAMPLE_RATE)
    result, _ = detector.forward(sig.audio_data, sample_rate=AUDIOSEAL_SAMPLE_RATE)
    return result[0, 1, :].detach().cpu().numpy()

def pipeline(audio_tuple):

    sr, audio_np = audio_tuple

    print("GOT SR", sr)
    print("GOT AUDIO", audio_np.shape)

    if audio_np.ndim == 1:
        audio_np = audio_np[None, None, :]
    else:
        audio_np = np.transpose(audio_np, (1, 0))[None, ...]

    print("FORMATTED AUDIO", audio_np.shape)

    sig = AudioSignal(torch.from_numpy(audio_np).float(), sample_rate=sr)
    orig_loud = sig.loudness()
    sig = sig.to_mono().resample(SAMPLE_RATE).normalize(LOUDNESS_DB).ensure_max_of_audio()

    print("REFORMATTED AUDIO")
    print(sig)

    # Detect
    scores = detect(sig, detector)

    # Embed + detect without attack
    wm_sig = embed(sig.clone(), embedder).normalize(LOUDNESS_DB).ensure_max_of_audio()
    scores_clean = detect(wm_sig, detector)

    print(np.mean(scores_clean))

    # Attack + detect
    att_sig = attack(wm_sig.clone(), codec).normalize(LOUDNESS_DB).ensure_max_of_audio()
    scores_att = detect(att_sig, detector)

    print(np.mean(scores_att))

    # Match loudness prior to writing
    wm_sig.normalize(orig_loud).ensure_max_of_audio()
    att_sig.normalize(orig_loud).ensure_max_of_audio()

    # Write audio files to disk
    uid = uuid.uuid4().hex
    wm_path = OUT_DIR / f"watermarked_{uid}.wav"
    att_path = OUT_DIR / f"attacked_{uid}.wav"

    wm_arr = wm_sig.audio_data.squeeze().numpy()
    att_arr = att_sig.audio_data.squeeze().numpy()
    wavwrite(str(wm_path), SAMPLE_RATE, wm_arr)
    wavwrite(str(att_path), SAMPLE_RATE, att_arr)

    # Plot scores with waveform background
    # Plot: waveform on top, detection scores on bottom
    sig_bg = sig.clone().to_mono().resample(AUDIOSEAL_SAMPLE_RATE)
    wav = sig_bg.audio_data.squeeze().numpy()
    N = len(scores)
    if wav.shape[0] < N:
        wav = np.pad(wav, (0, N - wav.shape[0]), mode="constant")
    else:
        wav = wav[:N]

    fig, (ax_wav, ax_score) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
    # Top: waveform (no labels)
    ax_wav.plot(wav, alpha=0.3)
    ax_wav.axis("off")

    # Bottom: detection scores
    ax_score.plot(scores, label="No watermark", color="blue")
    ax_score.plot(scores_clean, label="Watermark (no attack)", color="green")
    ax_score.plot(scores_att, label="Watermark (codec attack)", color="red")
    ax_score.set_xlabel("Frame Index")
    ax_score.set_ylabel("Detection Score")
    ax_score.set_ylim(-0.05, 1.05)
    ax_score.set_yticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    ax_score.legend()

    plt.tight_layout()
    plot_path = OUT_DIR / f"detection_plot_{uid}.png"
    fig.savefig(str(plot_path), format="png")
    plt.close(fig)

    return str(wm_path), str(att_path), str(plot_path)

demo = gr.Interface(
    fn=pipeline,
    inputs=gr.Audio(sources=["upload"], type="numpy", label="Upload Input Audio"),
    outputs=[
        gr.Audio(type="filepath", label="Watermarked Audio"),
        gr.Audio(type="filepath", label="Attacked Audio"),
        gr.Image(type="filepath", label="Detection Scores Plot"),
    ],
    title="Watermark Stress Test",
    description="""
This is an educational demonstration of state-of-the-art audio watermark performance under codec processing. Upload any (speech) audio file to test watermark performance before and after processing with a low-bitrate neural codec [1].

For this demo, we use the AudioSeal [2] watermark, which is well documented, open source, and provides state-of-the-art localized detection performance. Both the watermark and codec operate at 16kHz, meaning all frequencies above 8kHz are left unaltered. To ensure consistent watermark performance, we normalize audio to -16 dB LUFS and downmix to mono prior to embedding.

[1] https://github.com/jasonppy/VoiceCraft
[2] https://github.com/facebookresearch/audioseal
""",
    article="""
The citation info for our corresponding paper is:

```
@inproceedings{deepwatermarksareshallow,
  author = {Patrick O'Reilly and Zeyu Jin and Jiaqi Su and Bryan Pardo},
  title = {Deep Audio Watermarks are Shallow: Limitations of Post-Hoc Watermarking Techniques for Speech},
  booktitle = {ICLR Workshop on GenAI Watermarking},
  year = {2025}
}
```

For the VoiceCraft codec:

```
@article{voicecraft,
  author={Puyuan Peng and Po-Yao Huang and Daniel Li and Abdelrahman Mohamed and David Harwath},
  year={2024},
  title={VoiceCraft: Zero-Shot Speech Editing and Text-to-Speech in the Wild},
  journal={arXiv preprint arXiv:2403.16973v1},
}
```

And for the AudioSeal watermark:

```
@article{audioseal,
  title={Proactive Detection of Voice Cloning with Localized Watermarking},
  author={San Roman, Robin and Fernandez, Pierre and Elsahar, Hady and Défossez, Alexandre and Furon, Teddy and Tran, Tuan},
  journal={International Conference on Machine Learning (ICML)},
  year={2024}
}
```
""",
    allow_flagging=False,
)

if __name__ == "__main__":
    demo.launch(share=True)
ckpt/encodec_voicecraft.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42b224ba5b193a8fb66eb692fe377831bb14b1dcf556638db7afc5d108099bfb
size 235735922
encodec/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) Meta Platforms, Inc. and affiliates.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
encodec/__init__.py
ADDED
@@ -0,0 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from .encodec import Encodec
encodec/distrib.py
ADDED
@@ -0,0 +1,125 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Torch distributed utilities."""
import typing as tp

import torch


def rank():
    if torch.distributed.is_initialized():
        return torch.distributed.get_rank()
    else:
        return 0


def world_size():
    if torch.distributed.is_initialized():
        return torch.distributed.get_world_size()
    else:
        return 1


def is_distributed():
    return world_size() > 1


def all_reduce(tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM):
    if is_distributed():
        return torch.distributed.all_reduce(tensor, op)


def _is_complex_or_float(tensor):
    return torch.is_floating_point(tensor) or torch.is_complex(tensor)


def _check_number_of_params(params: tp.List[torch.Tensor]):
    # utility function to check that the number of params in all workers is the same,
    # and thus avoid a deadlock with distributed all reduce.
    if not is_distributed() or not params:
        return
    tensor = torch.tensor([len(params)], device=params[0].device, dtype=torch.long)
    all_reduce(tensor)
    if tensor.item() != len(params) * world_size():
        # If not all the workers have the same number, for at least one of them,
        # this inequality will be verified.
        raise RuntimeError(
            f"Mismatch in number of params: ours is {len(params)}, "
            "at least one worker has a different one."
        )


def broadcast_tensors(tensors: tp.Iterable[torch.Tensor], src: int = 0):
    """Broadcast the tensors from the given parameters to all workers.
    This can be used to ensure that all workers have the same model to start with.
    """
    if not is_distributed():
        return
    tensors = [tensor for tensor in tensors if _is_complex_or_float(tensor)]
    _check_number_of_params(tensors)
    handles = []
    for tensor in tensors:
        handle = torch.distributed.broadcast(tensor.data, src=src, async_op=True)
        handles.append(handle)
    for handle in handles:
        handle.wait()


def sync_buffer(buffers, average=True):
    """
    Sync grad for buffers. If average is False, broadcast instead of averaging.
    """
    if not is_distributed():
        return
    handles = []
    for buffer in buffers:
        if torch.is_floating_point(buffer.data):
            if average:
                handle = torch.distributed.all_reduce(
                    buffer.data, op=torch.distributed.ReduceOp.SUM, async_op=True
                )
            else:
                handle = torch.distributed.broadcast(buffer.data, src=0, async_op=True)
            handles.append((buffer, handle))
    for buffer, handle in handles:
        handle.wait()
        if average:
            buffer.data /= world_size()  # world_size is a function; calling it fixes the division


def sync_grad(params):
    """
    Simpler alternative to DistributedDataParallel, that doesn't rely
    on any black magic. For simple models it can also be as fast.
    Just call this on your model parameters after the call to backward!
    """
    if not is_distributed():
        return
    handles = []
    for p in params:
        if p.grad is not None:
            handle = torch.distributed.all_reduce(
                p.grad.data, op=torch.distributed.ReduceOp.SUM, async_op=True
            )
            handles.append((p, handle))
    for p, handle in handles:
        handle.wait()
        p.grad.data /= world_size()


def average_metrics(metrics: tp.Dict[str, float], count=1.0):
    """Average a dictionary of metrics across all workers, using the optional
    `count` as unnormalized weight.
    """
    if not is_distributed():
        return metrics
    keys, values = zip(*metrics.items())
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tensor = torch.tensor(list(values) + [1], device=device, dtype=torch.float32)
    tensor *= count
    all_reduce(tensor)
    averaged = (tensor[:-1] / tensor[-1]).cpu().tolist()
    return dict(zip(keys, averaged))
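For reference, a minimal sketch (illustrative, not part of this commit) of how `sync_grad` slots into a training step, as its docstring suggests; the model and optimizer below are placeholders, and the call is a no-op unless `torch.distributed` has been initialized:

```python
# Sketch: gradient averaging with encodec.distrib.sync_grad (placeholder model/optimizer).
import torch
from encodec import distrib

model = torch.nn.Linear(16, 16)              # stand-in for the network being trained
optimizer = torch.optim.Adam(model.parameters())

x = torch.randn(8, 16)
loss = model(x).pow(2).mean()
loss.backward()

# Average gradients across workers; returns immediately in a single-process run.
distrib.sync_grad(model.parameters())
optimizer.step()
optimizer.zero_grad()
```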
encodec/encodec.py
ADDED
@@ -0,0 +1,171 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import math
import typing as tp

import torch

from .modules import SEANetDecoder
from .modules import SEANetEncoder
from .quantization import ResidualVectorQuantizer

################################################################################
# Encodec neural audio codec
################################################################################


class Encodec(torch.nn.Module):
    """
    Encodec neural audio codec proposed in "High Fidelity Neural Audio
    Compression" (https://arxiv.org/abs/2210.13438) by Défossez et al.
    """

    def __init__(
        self,
        sample_rate: int,
        channels: int,
        causal: bool,
        model_norm: str,
        target_bandwidths: tp.Sequence[float],
        audio_normalize: bool,
        ratios: tp.List[int] = (8, 5, 4, 2),
        codebook_size: int = 1024,
        n_filters: int = 32,
        true_skip: bool = False,
        encoder_kwargs: tp.Dict = None,
        decoder_kwargs: tp.Dict = None,
    ):
        """
        Parameters
        ----------
        sample_rate : int
            Audio sample rate in Hz.
        channels : int
            Number of audio channels expected at input.
        causal : bool
            Whether to use causal convolution layers in encoder/decoder.
        model_norm : str
            Type of normalization to use in encoder/decoder.
        target_bandwidths : tp.Sequence[float]
            List of target bandwidths in kb/s.
        audio_normalize : bool
            Whether to normalize encoded and decoded audio segments using
            simple scaling factors
        ratios : tp.List[int], optional
            List of downsampling ratios used in encoder/decoder, by default (8, 5, 4, 2)
        codebook_size : int, optional
            Size of residual vector quantizer codebooks, by default 1024
        n_filters : int, optional
            Number of filters used in encoder/decoder, by default 32
        true_skip : bool, optional
            Whether to use true skip connections in encoder/decoder rather than
            convolutional skip connections, by default False
        """
        super().__init__()

        encoder_kwargs = encoder_kwargs or {}
        decoder_kwargs = decoder_kwargs or {}

        self.encoder = SEANetEncoder(
            channels=channels,
            causal=causal,
            norm=model_norm,
            ratios=ratios,
            n_filters=n_filters,
            true_skip=true_skip,
            **encoder_kwargs,
        )
        self.decoder = SEANetDecoder(
            channels=channels,
            causal=causal,
            norm=model_norm,
            ratios=ratios,
            n_filters=n_filters,
            true_skip=true_skip,
            **decoder_kwargs,
        )

        n_q = int(
            1000
            * target_bandwidths[-1]
            // (math.ceil(sample_rate / self.encoder.hop_length) * 10)
        )
        self.n_q = n_q  # Maximum number of quantizers
        self.quantizer = ResidualVectorQuantizer(
            dimension=self.encoder.dimension,
            n_q=n_q,
            bins=codebook_size,
        )

        self.sample_rate = sample_rate
        self.normalize = audio_normalize
        self.channels = channels

        self.frame_rate = math.ceil(self.sample_rate / math.prod(self.encoder.ratios))

        self.target_bandwidths = target_bandwidths
        self.bits_per_codebook = int(math.log2(self.quantizer.bins))
        assert (
            2**self.bits_per_codebook == self.quantizer.bins
        ), "quantizer bins must be a power of 2."

        self.bandwidth = self.target_bandwidths[-1]

    def set_target_bandwidth(self, bandwidth: float):
        """
        Set the target bandwidth for the codec by adjusting the
        number of residual vector quantizers used
        """
        if bandwidth not in self.target_bandwidths:
            raise ValueError(
                f"This model doesn't support the bandwidth {bandwidth}. "
                f"Select one of {self.target_bandwidths}."
            )
        self.bandwidth = bandwidth

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """
        Map a given audio waveform `x` to discrete residual latent codes.

        Parameters
        ----------
        x : torch.Tensor
            Audio waveform of shape `(n_batch, n_channels, n_samples)`.

        Returns
        -------
        codes : torch.Tensor
            Tensor of shape `(n_batch, n_codebooks, n_frames)`.
        """
        assert x.dim() == 3
        _, channels, length = x.shape
        assert 0 < channels <= 2

        z = self.encoder(x)
        codes, z_O, z_o = self.quantizer.encode(z, self.frame_rate, self.bandwidth)
        codes = codes.transpose(0, 1)

        return codes, z_O, z_o, z

    def decode(self, codes: torch.Tensor):
        """
        Decode quantized latents to obtain waveform audio.

        Parameters
        ----------
        codes : torch.Tensor
            Tensor of shape `(n_batch, n_codebooks, n_frames)`.

        Returns
        -------
        out : torch.Tensor
            Tensor of shape `(n_batch, n_channels, n_samples)`.
        """
        codes = codes.transpose(0, 1)
        emb = self.quantizer.decode(codes)
        out = self.decoder(emb)

        return out
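For reference, a minimal sketch (not part of the commit itself) of the encode/decode round-trip that app.py builds its "codec attack" from, using the same configuration and checkpoint path as above; the 3-second random input is an assumption for illustration:

```python
# Sketch: encode/decode round-trip with the VoiceCraft Encodec checkpoint used in app.py.
import torch
from encodec import Encodec

config = {
    "sample_rate": 16_000, "target_bandwidths": [2.2], "channels": 1,
    "causal": False, "codebook_size": 2048, "n_filters": 64,
    "model_norm": "weight_norm", "audio_normalize": False, "true_skip": True,
    "ratios": [8, 5, 4, 2],
    "encoder_kwargs": {"pad_mode": "constant"}, "decoder_kwargs": {"pad_mode": "constant"},
}
codec = Encodec(**config)
codec.load_state_dict(torch.load("ckpt/encodec_voicecraft.pt", map_location="cpu"))
codec.eval()
codec.set_target_bandwidth(2.2)

with torch.no_grad():
    wav = torch.randn(1, 1, 3 * 16_000)   # (n_batch, n_channels, n_samples), assumed 3 s at 16 kHz
    codes, *_ = codec.encode(wav)          # codes: (n_batch, n_codebooks, n_frames)
    recon = codec.decode(codes)            # reconstructed waveform (n_batch, n_channels, n_samples)
```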
encodec/modules/__init__.py
ADDED
@@ -0,0 +1,17 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Torch modules."""
from .conv import NormConv1d
from .conv import NormConv2d
from .conv import NormConvTranspose1d
from .conv import NormConvTranspose2d
from .conv import pad1d
from .conv import SConv1d
from .conv import SConvTranspose1d
from .conv import unpad1d
from .lstm import SLSTM
from .seanet import SEANetDecoder
from .seanet import SEANetEncoder
encodec/modules/conv.py
ADDED
@@ -0,0 +1,342 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Convolutional layers wrappers and utilities."""
import math
import typing as tp
import warnings

import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.utils import spectral_norm
from torch.nn.utils import weight_norm

from .norm import ConvLayerNorm


CONV_NORMALIZATIONS = frozenset(
    [
        "none",
        "weight_norm",
        "spectral_norm",
        "time_layer_norm",
        "layer_norm",
        "time_group_norm",
    ]
)


def apply_parametrization_norm(module: nn.Module, norm: str = "none") -> nn.Module:
    assert norm in CONV_NORMALIZATIONS
    if norm == "weight_norm":
        return weight_norm(module)
    elif norm == "spectral_norm":
        return spectral_norm(module)
    else:
        # We already checked that `norm` is in CONV_NORMALIZATIONS, so any other
        # choice doesn't need reparametrization.
        return module


def get_norm_module(
    module: nn.Module, causal: bool = False, norm: str = "none", **norm_kwargs
) -> nn.Module:
    """Return the proper normalization module. If causal is True, this will ensure the returned
    module is causal, or return an error if the normalization doesn't support causal evaluation.
    """
    assert norm in CONV_NORMALIZATIONS
    if norm == "layer_norm":
        assert isinstance(module, nn.modules.conv._ConvNd)
        return ConvLayerNorm(module.out_channels, **norm_kwargs)
    elif norm == "time_group_norm":
        if causal:
            raise ValueError("GroupNorm doesn't support causal evaluation.")
        assert isinstance(module, nn.modules.conv._ConvNd)
        return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
    else:
        return nn.Identity()


def get_extra_padding_for_conv1d(
    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
) -> int:
    """See `pad_for_conv1d`."""
    length = x.shape[-1]
    n_frames = (length - kernel_size + padding_total) / stride + 1
    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
    return ideal_length - length


def pad_for_conv1d(
    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
):
    """Pad for a convolution to make sure that the last window is full.
    Extra padding is added at the end. This is required to ensure that we can rebuild
    an output of the same length, as otherwise, even with padding, some time steps
    might get removed.
    For instance, with total padding = 4, kernel size = 4, stride = 2:
        0 0 1 2 3 4 5 0 0   # (0s are padding)
        1   2   3           # (output frames of a convolution, last 0 is never used)
        0 0 1 2 3 4 5 0     # (output of tr. conv., but pos. 5 is going to get removed as padding)
            1 2 3 4         # once you removed padding, we are missing one time step!
    """
    extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
    return F.pad(x, (0, extra_padding))


def pad1d(
    x: torch.Tensor,
    paddings: tp.Tuple[int, int],
    mode: str = "zero",
    value: float = 0.0,
):
    """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
    If this is the case, we insert extra 0 padding to the right before the reflection happens.
    """
    length = x.shape[-1]
    padding_left, padding_right = paddings
    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
    if mode == "reflect":
        max_pad = max(padding_left, padding_right)
        extra_pad = 0
        if length <= max_pad:
            extra_pad = max_pad - length + 1
            x = F.pad(x, (0, extra_pad))
        padded = F.pad(x, paddings, mode, value)
        end = padded.shape[-1] - extra_pad
        return padded[..., :end]
    else:
        return F.pad(x, paddings, mode, value)


def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
    """Remove padding from x, handling properly zero padding. Only for 1d!"""
    padding_left, padding_right = paddings
    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
    assert (padding_left + padding_right) <= x.shape[-1]
    end = x.shape[-1] - padding_right
    return x[..., padding_left:end]


class NormConv1d(nn.Module):
    """Wrapper around Conv1d and normalization applied to this conv
    to provide a uniform interface across normalization approaches.
    """

    def __init__(
        self,
        *args,
        causal: bool = False,
        norm: str = "none",
        norm_kwargs: tp.Dict[str, tp.Any] = {},
        **kwargs,
    ):
        super().__init__()
        self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
        self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
        self.norm_type = norm

    def forward(self, x):
        x = self.conv(x)
        x = self.norm(x)
        return x


class NormConv2d(nn.Module):
    """Wrapper around Conv2d and normalization applied to this conv
    to provide a uniform interface across normalization approaches.
    """

    def __init__(
        self,
        *args,
        norm: str = "none",
        norm_kwargs: tp.Dict[str, tp.Any] = {},
        **kwargs,
    ):
        super().__init__()
        self.conv = apply_parametrization_norm(nn.Conv2d(*args, **kwargs), norm)
        self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs)
        self.norm_type = norm

    def forward(self, x):
        x = self.conv(x)
        x = self.norm(x)
        return x


class NormConvTranspose1d(nn.Module):
    """Wrapper around ConvTranspose1d and normalization applied to this conv
    to provide a uniform interface across normalization approaches.
    """

    def __init__(
        self,
        *args,
        causal: bool = False,
        norm: str = "none",
        norm_kwargs: tp.Dict[str, tp.Any] = {},
        **kwargs,
    ):
        super().__init__()
        self.convtr = apply_parametrization_norm(
            nn.ConvTranspose1d(*args, **kwargs), norm
        )
        self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
        self.norm_type = norm

    def forward(self, x):
        x = self.convtr(x)
        x = self.norm(x)
        return x


class NormConvTranspose2d(nn.Module):
    """Wrapper around ConvTranspose2d and normalization applied to this conv
    to provide a uniform interface across normalization approaches.
    """

    def __init__(
        self,
        *args,
        norm: str = "none",
        norm_kwargs: tp.Dict[str, tp.Any] = {},
        **kwargs,
    ):
        super().__init__()
        self.convtr = apply_parametrization_norm(
            nn.ConvTranspose2d(*args, **kwargs), norm
        )
        self.norm = get_norm_module(self.convtr, causal=False, norm=norm, **norm_kwargs)

    def forward(self, x):
        x = self.convtr(x)
        x = self.norm(x)
        return x


class SConv1d(nn.Module):
    """Conv1d with some builtin handling of asymmetric or causal padding
    and normalization.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
        causal: bool = False,
        norm: str = "none",
        norm_kwargs: tp.Dict[str, tp.Any] = {},
        pad_mode: str = "reflect",
    ):
        super().__init__()
        # warn user on unusual setup between dilation and stride
        if stride > 1 and dilation > 1:
            warnings.warn(
                "SConv1d has been initialized with stride > 1 and dilation > 1"
                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
            )
        self.conv = NormConv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            dilation=dilation,
            groups=groups,
            bias=bias,
            causal=causal,
            norm=norm,
            norm_kwargs=norm_kwargs,
        )
        self.causal = causal
        self.pad_mode = pad_mode

    def forward(self, x):
        B, C, T = x.shape
        kernel_size = self.conv.conv.kernel_size[0]
        stride = self.conv.conv.stride[0]
        dilation = self.conv.conv.dilation[0]
        kernel_size = (
            kernel_size - 1
        ) * dilation + 1  # effective kernel size with dilations
        padding_total = kernel_size - stride
        extra_padding = get_extra_padding_for_conv1d(
            x, kernel_size, stride, padding_total
        )
        if self.causal:
            # Left padding for causal
            x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)
        else:
            # Asymmetric padding required for odd strides
            padding_right = padding_total // 2
            padding_left = padding_total - padding_right
            x = pad1d(
                x, (padding_left, padding_right + extra_padding), mode=self.pad_mode
            )
        return self.conv(x)


class SConvTranspose1d(nn.Module):
    """ConvTranspose1d with some builtin handling of asymmetric or causal padding
    and normalization.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        causal: bool = False,
        norm: str = "none",
        trim_right_ratio: float = 1.0,
        norm_kwargs: tp.Dict[str, tp.Any] = {},
    ):
        super().__init__()
        self.convtr = NormConvTranspose1d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            causal=causal,
            norm=norm,
            norm_kwargs=norm_kwargs,
        )
        self.causal = causal
        self.trim_right_ratio = trim_right_ratio
        assert (
            self.causal or self.trim_right_ratio == 1.0
        ), "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
        assert self.trim_right_ratio >= 0.0 and self.trim_right_ratio <= 1.0

    def forward(self, x):
        kernel_size = self.convtr.convtr.kernel_size[0]
        stride = self.convtr.convtr.stride[0]
        padding_total = kernel_size - stride

        y = self.convtr(x)

        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
        # removed at the very end, when keeping only the right length for the output,
        # as removing it here would require also passing the length at the matching layer
        # in the encoder.
        if self.causal:
            # Trim the padding on the right according to the specified ratio
            # if trim_right_ratio = 1.0, trim everything from right
            padding_right = math.ceil(padding_total * self.trim_right_ratio)
            padding_left = padding_total - padding_right
            y = unpad1d(y, (padding_left, padding_right))
        else:
            # Asymmetric padding required for odd strides
            padding_right = padding_total // 2
            padding_left = padding_total - padding_right
            y = unpad1d(y, (padding_left, padding_right))
        return y
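As a quick illustration of the padding logic above (a sketch, not part of the commit): `SConv1d` adds just enough padding that the strided convolution always sees full windows, so with dilation 1 it produces `ceil(T / stride)` output frames for any input length `T`; the concrete channel counts and lengths below are arbitrary.

```python
# Sketch: SConv1d's automatic padding keeps the frame count at ceil(T / stride).
import math
import torch
from encodec.modules import SConv1d

conv = SConv1d(1, 8, kernel_size=7, stride=4, causal=True)
for T in (100, 101, 103):
    y = conv(torch.randn(1, 1, T))
    assert y.shape[-1] == math.ceil(T / 4)
```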
encodec/modules/lstm.py
ADDED
@@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""LSTM layers module."""
from torch import nn


class SLSTM(nn.Module):
    """
    LSTM without worrying about the hidden state, nor the layout of the data.
    Expects input as convolutional layout.
    """

    def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True):
        super().__init__()
        self.skip = skip
        self.lstm = nn.LSTM(dimension, dimension, num_layers)

    def forward(self, x):
        x = x.permute(2, 0, 1)
        y, _ = self.lstm(x)
        if self.skip:
            y = y + x
        y = y.permute(1, 2, 0)
        return y
encodec/modules/norm.py
ADDED
@@ -0,0 +1,32 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Normalization modules."""
import typing as tp

import torch
from torch import nn


class ConvLayerNorm(nn.LayerNorm):
    """
    Convolution-friendly LayerNorm that moves channels to last dimensions
    before running the normalization and moves them back to original position right after.
    """

    def __init__(
        self, normalized_shape: tp.Union[int, tp.List[int], torch.Size], **kwargs
    ):
        super().__init__(normalized_shape, **kwargs)

    def forward(self, x):

        assert x.ndim == 3  # (n_batch, n_channels, n_samples)

        x = x.transpose(1, 2)
        x = super().forward(x)
        x = x.transpose(1, 2)

        return x
encodec/modules/seanet.py
ADDED
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
"""Encodec SEANet-based encoder and decoder implementation."""
|
7 |
+
import typing as tp
|
8 |
+
|
9 |
+
import numpy as np
|
10 |
+
import torch.nn as nn
|
11 |
+
|
12 |
+
from . import SConv1d
|
13 |
+
from . import SConvTranspose1d
|
14 |
+
from . import SLSTM
|
15 |
+
|
16 |
+
|
17 |
+
class SEANetResnetBlock(nn.Module):
|
18 |
+
"""Residual block from SEANet model.
|
19 |
+
Args:
|
20 |
+
dim (int): Dimension of the input/output
|
21 |
+
kernel_sizes (list): List of kernel sizes for the convolutions.
|
22 |
+
dilations (list): List of dilations for the convolutions.
|
23 |
+
activation (str): Activation function.
|
24 |
+
activation_params (dict): Parameters to provide to the activation function
|
25 |
+
norm (str): Normalization method.
|
26 |
+
norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
|
27 |
+
causal (bool): Whether to use fully causal convolution.
|
28 |
+
pad_mode (str): Padding mode for the convolutions.
|
29 |
+
compress (int): Reduced dimensionality in residual branches (from Demucs v3)
|
30 |
+
true_skip (bool): Whether to use true skip connection or a simple convolution as the skip connection.
|
31 |
+
"""
|
32 |
+
|
33 |
+
def __init__(
|
34 |
+
self,
|
35 |
+
dim: int,
|
36 |
+
kernel_sizes: tp.List[int] = [3, 1],
|
37 |
+
dilations: tp.List[int] = [1, 1],
|
38 |
+
activation: str = "ELU",
|
39 |
+
activation_params: dict = {"alpha": 1.0},
|
40 |
+
norm: str = "weight_norm",
|
41 |
+
norm_params: tp.Dict[str, tp.Any] = {},
|
42 |
+
causal: bool = False,
|
43 |
+
pad_mode: str = "reflect",
|
44 |
+
compress: int = 2,
|
45 |
+
true_skip: bool = True,
|
46 |
+
):
|
47 |
+
super().__init__()
|
48 |
+
assert len(kernel_sizes) == len(
|
49 |
+
dilations
|
50 |
+
), "Number of kernel sizes should match number of dilations"
|
51 |
+
act = getattr(nn, activation)
|
52 |
+
hidden = dim // compress
|
53 |
+
block = []
|
54 |
+
for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
|
55 |
+
in_chs = dim if i == 0 else hidden
|
56 |
+
out_chs = dim if i == len(kernel_sizes) - 1 else hidden
|
57 |
+
block += [
|
58 |
+
act(**activation_params),
|
59 |
+
SConv1d(
|
60 |
+
in_chs,
|
61 |
+
out_chs,
|
62 |
+
kernel_size=kernel_size,
|
63 |
+
dilation=dilation,
|
64 |
+
norm=norm,
|
65 |
+
norm_kwargs=norm_params,
|
66 |
+
causal=causal,
|
67 |
+
pad_mode=pad_mode,
|
68 |
+
),
|
69 |
+
]
|
70 |
+
self.block = nn.Sequential(*block)
|
71 |
+
self.shortcut: nn.Module
|
72 |
+
if true_skip:
|
73 |
+
self.shortcut = nn.Identity()
|
74 |
+
else:
|
75 |
+
self.shortcut = SConv1d(
|
76 |
+
dim,
|
77 |
+
dim,
|
78 |
+
kernel_size=1,
|
79 |
+
norm=norm,
|
80 |
+
norm_kwargs=norm_params,
|
81 |
+
causal=causal,
|
82 |
+
pad_mode=pad_mode,
|
83 |
+
)
|
84 |
+
|
85 |
+
def forward(self, x):
|
86 |
+
return self.shortcut(x) + self.block(x)
|
87 |
+
|
88 |
+
|
89 |
+
class SEANetEncoder(nn.Module):
|
90 |
+
"""SEANet encoder.
|
91 |
+
Args:
|
92 |
+
channels (int): Audio channels.
|
93 |
+
dimension (int): Intermediate representation dimension.
|
94 |
+
n_filters (int): Base width for the model.
|
95 |
+
n_residual_layers (int): nb of residual layers.
|
96 |
+
ratios (Sequence[int]): kernel size and stride ratios. The encoder uses downsampling ratios instead of
|
97 |
+
upsampling ratios, hence it will use the ratios in the reverse order to the ones specified here
|
98 |
+
that must match the decoder order
|
99 |
+
activation (str): Activation function.
|
100 |
+
activation_params (dict): Parameters to provide to the activation function
|
101 |
+
norm (str): Normalization method.
|
102 |
+
norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
|
103 |
+
kernel_size (int): Kernel size for the initial convolution.
|
104 |
+
last_kernel_size (int): Kernel size for the initial convolution.
|
105 |
+
residual_kernel_size (int): Kernel size for the residual layers.
|
106 |
+
dilation_base (int): How much to increase the dilation with each layer.
|
107 |
+
causal (bool): Whether to use fully causal convolution.
|
108 |
+
pad_mode (str): Padding mode for the convolutions.
|
109 |
+
true_skip (bool): Whether to use true skip connection or a simple
|
110 |
+
(streamable) convolution as the skip connection in the residual network blocks.
|
111 |
+
compress (int): Reduced dimensionality in residual branches (from Demucs v3).
|
112 |
+
lstm (int): Number of LSTM layers at the end of the encoder.
|
113 |
+
"""
|
114 |
+
|
115 |
+
def __init__(
|
116 |
+
self,
|
117 |
+
channels: int = 1,
|
118 |
+
dimension: int = 128,
|
119 |
+
n_filters: int = 32,
|
120 |
+
n_residual_layers: int = 1,
|
121 |
+
ratios: tp.List[int] = [8, 5, 4, 2],
|
122 |
+
activation: str = "ELU",
|
123 |
+
activation_params: dict = {"alpha": 1.0},
|
124 |
+
norm: str = "weight_norm",
|
125 |
+
norm_params: tp.Dict[str, tp.Any] = {},
|
126 |
+
kernel_size: int = 7,
|
127 |
+
last_kernel_size: int = 7,
|
128 |
+
residual_kernel_size: int = 3,
|
129 |
+
dilation_base: int = 2,
|
130 |
+
causal: bool = False,
|
131 |
+
pad_mode: str = "reflect",
|
132 |
+
true_skip: bool = False,
|
133 |
+
compress: int = 2,
|
134 |
+
lstm: int = 2,
|
135 |
+
):
|
136 |
+
super().__init__()
|
137 |
+
self.channels = channels
|
138 |
+
self.dimension = dimension
|
139 |
+
self.n_filters = n_filters
|
140 |
+
self.ratios = list(reversed(ratios))
|
141 |
+
del ratios
|
142 |
+
self.n_residual_layers = n_residual_layers
|
143 |
+
self.hop_length = np.prod(self.ratios)
|
144 |
+
|
145 |
+
act = getattr(nn, activation)
|
146 |
+
mult = 1
|
147 |
+
model: tp.List[nn.Module] = [
|
148 |
+
SConv1d(
|
149 |
+
channels,
|
150 |
+
mult * n_filters,
|
151 |
+
kernel_size,
|
152 |
+
norm=norm,
|
153 |
+
norm_kwargs=norm_params,
|
154 |
+
causal=causal,
|
155 |
+
pad_mode=pad_mode,
|
156 |
+
)
|
157 |
+
]
|
158 |
+
# Downsample to raw audio scale
|
159 |
+
for i, ratio in enumerate(self.ratios):
|
160 |
+
# Add residual layers
|
161 |
+
for j in range(n_residual_layers):
|
162 |
+
model += [
|
163 |
+
SEANetResnetBlock(
|
164 |
+
mult * n_filters,
|
165 |
+
kernel_sizes=[residual_kernel_size, 1],
|
166 |
+
dilations=[dilation_base**j, 1],
|
167 |
+
norm=norm,
|
168 |
+
norm_params=norm_params,
|
169 |
+
activation=activation,
|
170 |
+
activation_params=activation_params,
|
171 |
+
causal=causal,
|
172 |
+
pad_mode=pad_mode,
|
173 |
+
compress=compress,
|
174 |
+
true_skip=true_skip,
|
175 |
+
)
|
176 |
+
]
|
177 |
+
|
178 |
+
# Add downsampling layers
|
179 |
+
model += [
|
180 |
+
act(**activation_params),
|
181 |
+
SConv1d(
|
182 |
+
mult * n_filters,
|
183 |
+
mult * n_filters * 2,
|
184 |
+
kernel_size=ratio * 2,
|
185 |
+
stride=ratio,
|
186 |
+
norm=norm,
|
187 |
+
norm_kwargs=norm_params,
|
188 |
+
causal=causal,
|
189 |
+
pad_mode=pad_mode,
|
190 |
+
),
|
191 |
+
]
|
192 |
+
mult *= 2
|
193 |
+
|
194 |
+
if lstm:
|
195 |
+
model += [SLSTM(mult * n_filters, num_layers=lstm)]
|
196 |
+
|
197 |
+
model += [
|
198 |
+
act(**activation_params),
|
199 |
+
SConv1d(
|
200 |
+
mult * n_filters,
|
201 |
+
dimension,
|
202 |
+
last_kernel_size,
|
203 |
+
norm=norm,
|
204 |
+
norm_kwargs=norm_params,
|
205 |
+
causal=causal,
|
206 |
+
pad_mode=pad_mode,
|
207 |
+
),
|
208 |
+
]
|
209 |
+
|
210 |
+
self.model = nn.Sequential(*model)
|
211 |
+
|
212 |
+
def forward(self, x):
|
213 |
+
return self.model(x)
|
214 |
+
|
215 |
+
|
216 |
+
class SEANetDecoder(nn.Module):
|
217 |
+
"""SEANet decoder.
|
218 |
+
Args:
|
219 |
+
channels (int): Audio channels.
|
220 |
+
dimension (int): Intermediate representation dimension.
|
221 |
+
n_filters (int): Base width for the model.
|
222 |
+
        n_residual_layers (int): number of residual layers.
        ratios (Sequence[int]): kernel size and stride ratios
        activation (str): Activation function.
        activation_params (dict): Parameters to provide to the activation function
        final_activation (str): Final activation function after all convolutions.
        final_activation_params (dict): Parameters to provide to the activation function
        norm (str): Normalization method.
        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
        kernel_size (int): Kernel size for the initial convolution.
        last_kernel_size (int): Kernel size for the last convolution.
        residual_kernel_size (int): Kernel size for the residual layers.
        dilation_base (int): How much to increase the dilation with each layer.
        causal (bool): Whether to use fully causal convolution.
        pad_mode (str): Padding mode for the convolutions.
        true_skip (bool): Whether to use true skip connection or a simple
            (streamable) convolution as the skip connection in the residual network blocks.
        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
        lstm (int): Number of LSTM layers at the end of the encoder.
        trim_right_ratio (float): Ratio for trimming at the right of the transposed convolution under the causal setup.
            If equal to 1.0, it means that all the trimming is done at the right.
    """

    def __init__(
        self,
        channels: int = 1,
        dimension: int = 128,
        n_filters: int = 32,
        n_residual_layers: int = 1,
        ratios: tp.List[int] = [8, 5, 4, 2],
        activation: str = "ELU",
        activation_params: dict = {"alpha": 1.0},
        final_activation: tp.Optional[str] = None,
        final_activation_params: tp.Optional[dict] = None,
        norm: str = "weight_norm",
        norm_params: tp.Dict[str, tp.Any] = {},
        kernel_size: int = 7,
        last_kernel_size: int = 7,
        residual_kernel_size: int = 3,
        dilation_base: int = 2,
        causal: bool = False,
        pad_mode: str = "reflect",
        true_skip: bool = False,
        compress: int = 2,
        lstm: int = 2,
        trim_right_ratio: float = 1.0,
    ):
        super().__init__()
        self.dimension = dimension
        self.channels = channels
        self.n_filters = n_filters
        self.ratios = ratios
        del ratios
        self.n_residual_layers = n_residual_layers
        self.hop_length = np.prod(self.ratios)

        act = getattr(nn, activation)
        mult = int(2 ** len(self.ratios))
        model: tp.List[nn.Module] = [
            SConv1d(
                dimension,
                mult * n_filters,
                kernel_size,
                norm=norm,
                norm_kwargs=norm_params,
                causal=causal,
                pad_mode=pad_mode,
            )
        ]

        if lstm:
            model += [SLSTM(mult * n_filters, num_layers=lstm)]

        # Upsample to raw audio scale
        for i, ratio in enumerate(self.ratios):
            # Add upsampling layers
            model += [
                act(**activation_params),
                SConvTranspose1d(
                    mult * n_filters,
                    mult * n_filters // 2,
                    kernel_size=ratio * 2,
                    stride=ratio,
                    norm=norm,
                    norm_kwargs=norm_params,
                    causal=causal,
                    trim_right_ratio=trim_right_ratio,
                ),
            ]
            # Add residual layers
            for j in range(n_residual_layers):
                model += [
                    SEANetResnetBlock(
                        mult * n_filters // 2,
                        kernel_sizes=[residual_kernel_size, 1],
                        dilations=[dilation_base**j, 1],
                        activation=activation,
                        activation_params=activation_params,
                        norm=norm,
                        norm_params=norm_params,
                        causal=causal,
                        pad_mode=pad_mode,
                        compress=compress,
                        true_skip=true_skip,
                    )
                ]

            mult //= 2

        # Add final layers
        model += [
            act(**activation_params),
            SConv1d(
                n_filters,
                channels,
                last_kernel_size,
                norm=norm,
                norm_kwargs=norm_params,
                causal=causal,
                pad_mode=pad_mode,
            ),
        ]
        # Add optional final activation to decoder (eg. tanh)
        if final_activation is not None:
            final_act = getattr(nn, final_activation)
            final_activation_params = final_activation_params or {}
            model += [final_act(**final_activation_params)]
        self.model = nn.Sequential(*model)

    def forward(self, z):
        y = self.model(z)
        return y
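A quick orientation sketch (not part of the repo): the decoder above upsamples by the product of its ratios, so a latent sequence of T frames comes back as roughly T * hop_length samples. The import path, dimension=128 and the exact output length are assumptions for illustration, following the upstream EnCodec layout.

# Illustrative only: build a decoder similar to this Space's configuration and
# check the upsampling factor (hop_length = 8 * 5 * 4 * 2 = 320).
import torch
from encodec.modules import SEANetDecoder  # assumed export, as in upstream EnCodec

decoder = SEANetDecoder(
    channels=1,
    dimension=128,            # latent dimension; assumption for this sketch
    n_filters=64,
    ratios=[8, 5, 4, 2],
    norm="weight_norm",
    pad_mode="constant",
)
z = torch.randn(1, 128, 50)   # (batch, dimension, frames)
wav = decoder(z)              # (batch, channels, ~frames * 320)
print(wav.shape)              # expected around torch.Size([1, 1, 16000])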
encodec/quantization/__init__.py
ADDED
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# flake8: noqa
from .vq import ResidualVectorQuantizer
encodec/quantization/core_vq.py
ADDED
@@ -0,0 +1,370 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import typing as tp
import warnings

import torch
import torch.nn.functional as F
from torch import nn

from .. import distrib

################################################################################
# Core vector quantization implementation
################################################################################


def default(val: tp.Any, d: tp.Any) -> tp.Any:
    return val if val is not None else d


def ema_inplace(moving_avg, new, decay: float):
    moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))


def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
    return (x + epsilon) / (x.sum() + n_categories * epsilon)


def uniform_init(*shape: int):
    t = torch.empty(shape)
    nn.init.kaiming_uniform_(t)
    return t


def sample_vectors(samples, num: int):
    num_samples, device = samples.shape[0], samples.device

    if num_samples >= num:
        indices = torch.randperm(num_samples, device=device)[:num]
    else:
        indices = torch.randint(0, num_samples, (num,), device=device)

    return samples[indices]


def kmeans(samples, num_clusters: int, num_iters: int = 10):
    dim, dtype = samples.shape[-1], samples.dtype

    means = sample_vectors(samples, num_clusters)

    for _ in range(num_iters):
        diffs = samples.unsqueeze(1) - means.unsqueeze(0)
        dists = -(diffs**2).sum(dim=-1)

        buckets = dists.max(dim=-1).indices
        bins = torch.bincount(buckets, minlength=num_clusters)
        zero_mask = bins == 0
        bins_min_clamped = bins.masked_fill(zero_mask, 1)

        new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)

        new_means.scatter_add_(0, buckets.unsqueeze(-1).expand(-1, dim), samples)
        new_means = new_means / bins_min_clamped[..., None]

        means = torch.where(zero_mask[..., None], means, new_means)

    return means, bins


class EuclideanCodebook(nn.Module):
    """Codebook with Euclidean distance.
    Args:
        dim (int): Dimension.
        codebook_size (int): Codebook size.
        kmeans_init (bool): Whether to use k-means to initialize the codebooks.
            If set to true, run the k-means algorithm on the first training batch and use
            the learned centroids as initialization.
        kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
        decay (float): Decay for exponential moving average over the codebooks.
        epsilon (float): Epsilon value for numerical stability.
        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
            that have an exponential moving average cluster size less than the specified threshold with
            randomly selected vector from the current batch.
    """

    def __init__(
        self,
        dim: int,
        codebook_size: int,
        kmeans_init: int = False,
        kmeans_iters: int = 10,
        decay: float = 0.99,
        epsilon: float = 1e-5,
        threshold_ema_dead_code: int = 2,
    ):
        super().__init__()
        self.decay = decay
        init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = (
            uniform_init if not kmeans_init else torch.zeros
        )
        embed = init_fn(codebook_size, dim)

        self.codebook_size = codebook_size

        self.kmeans_iters = kmeans_iters
        self.epsilon = epsilon
        self.threshold_ema_dead_code = threshold_ema_dead_code

        self.register_buffer("inited", torch.Tensor([not kmeans_init]))
        self.register_buffer("cluster_size", torch.zeros(codebook_size))
        self.register_buffer("embed", embed)
        self.register_buffer("embed_avg", embed.clone())

    @torch.jit.ignore
    def init_embed_(self, data):
        if self.inited:
            return

        embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
        self.embed.data.copy_(embed)
        self.embed_avg.data.copy_(embed.clone())
        self.cluster_size.data.copy_(cluster_size)
        self.inited.data.copy_(torch.Tensor([True]))
        # Make sure all buffers across workers are in sync after initialization
        distrib.broadcast_tensors(self.buffers())

    def replace_(self, samples, mask):
        modified_codebook = torch.where(
            mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
        )
        self.embed.data.copy_(modified_codebook)

    def expire_codes_(self, batch_samples):
        if self.threshold_ema_dead_code == 0:
            return

        expired_codes = self.cluster_size < self.threshold_ema_dead_code
        if not torch.any(expired_codes):
            return

        batch_samples = batch_samples.view(-1, batch_samples.shape[-1])
        self.replace_(batch_samples, mask=expired_codes)
        distrib.broadcast_tensors(self.buffers())

    def preprocess(self, x):
        x = x.view(-1, x.shape[-1])
        return x

    def quantize(self, x):
        embed = self.embed.t()
        dist = -(
            x.pow(2).sum(1, keepdim=True)
            - 2 * x @ embed
            + embed.pow(2).sum(0, keepdim=True)
        )
        embed_ind = dist.max(dim=-1).indices
        return embed_ind

    def postprocess_emb(self, embed_ind, shape):
        return embed_ind.view(*shape[:-1])

    def dequantize(self, embed_ind):
        quantize = F.embedding(embed_ind, self.embed)
        return quantize

    def encode(self, x):
        shape = x.shape
        # pre-process
        x = self.preprocess(x)
        # quantize
        embed_ind = self.quantize(x)
        # post-process
        embed_ind = self.postprocess_emb(embed_ind, shape)
        return embed_ind

    def decode(self, embed_ind):
        quantize = self.dequantize(embed_ind)
        return quantize

    def forward(self, x):
        shape, dtype = x.shape, x.dtype
        x = self.preprocess(x)

        self.init_embed_(x)

        embed_ind = self.quantize(x)
        embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
        embed_ind = self.postprocess_emb(embed_ind, shape)
        quantize = self.dequantize(embed_ind)

        if self.training:
            # We do the expiry of code at that point as buffers are in sync
            # and all the workers will take the same decision.
            self.expire_codes_(x)
            ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
            embed_sum = x.t() @ embed_onehot
            ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
            cluster_size = (
                laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
                * self.cluster_size.sum()
            )
            embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
            self.embed.data.copy_(embed_normalized)

        return quantize, embed_ind


class VectorQuantization(nn.Module):
    """Vector quantization implementation.
    Currently supports only euclidean distance.
    Args:
        dim (int): Dimension
        codebook_size (int): Codebook size
        codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
        decay (float): Decay for exponential moving average over the codebooks.
        epsilon (float): Epsilon value for numerical stability.
        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
        kmeans_iters (int): Number of iterations used for kmeans initialization.
        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
            that have an exponential moving average cluster size less than the specified threshold with
            randomly selected vector from the current batch.
        commitment_weight (float): Weight for commitment loss.
    """

    def __init__(
        self,
        dim: int,
        codebook_size: int,
        codebook_dim: tp.Optional[int] = None,
        decay: float = 0.99,
        epsilon: float = 1e-5,
        kmeans_init: bool = True,
        kmeans_iters: int = 50,
        threshold_ema_dead_code: int = 2,
        commitment_weight: float = 1.0,
    ):
        super().__init__()
        _codebook_dim: int = default(codebook_dim, dim)

        requires_projection = _codebook_dim != dim
        self.project_in = (
            nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity()
        )
        self.project_out = (
            nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity()
        )

        self.epsilon = epsilon
        self.commitment_weight = commitment_weight

        self._codebook = EuclideanCodebook(
            dim=_codebook_dim,
            codebook_size=codebook_size,
            kmeans_init=kmeans_init,
            kmeans_iters=kmeans_iters,
            decay=decay,
            epsilon=epsilon,
            threshold_ema_dead_code=threshold_ema_dead_code,
        )
        self.codebook_size = codebook_size

    @property
    def codebook(self):
        return self._codebook.embed

    def encode(self, x):
        x = x.transpose(1, 2).contiguous()
        x = self.project_in(x)
        embed_in = self._codebook.encode(x)
        return embed_in

    def decode(self, embed_ind):
        quantize = self._codebook.decode(embed_ind)
        quantize = self.project_out(quantize)
        quantize = quantize.transpose(1, 2).contiguous()

        return quantize

    def forward(self, x):
        device = x.device
        x = x.transpose(1, 2).contiguous()
        x = self.project_in(x)

        quantize, embed_ind = self._codebook(x)

        if self.training:
            quantize = x + (quantize - x).detach()

        loss = torch.tensor([0.0], device=device, requires_grad=self.training)

        if self.training:
            warnings.warn(
                "When using RVQ in training model, first check "
                "https://github.com/facebookresearch/encodec/issues/25 . "
                "The bug wasn't fixed here for reproducibility."
            )
            if self.commitment_weight > 0:
                commit_loss = F.mse_loss(quantize.detach(), x)
                loss = loss + commit_loss * self.commitment_weight

        quantize = self.project_out(quantize)
        quantize = quantize.transpose(1, 2).contiguous()
        return quantize, embed_ind, loss


class ResidualVectorQuantization(nn.Module):
    """Residual vector quantization implementation.
    Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
    """

    def __init__(self, *, num_quantizers, **kwargs):
        super().__init__()
        self.layers = nn.ModuleList(
            [VectorQuantization(**kwargs) for _ in range(num_quantizers)]
        )

    def forward(self, x, n_q: tp.Optional[int] = None):
        quantized_out = 0.0
        residual = x

        all_losses = []
        all_indices = []

        n_q = n_q or len(self.layers)

        for layer in self.layers[:n_q]:
            quantized, indices, loss = layer(residual)
            residual = residual - quantized
            quantized_out = quantized_out + quantized

            all_indices.append(indices)
            all_losses.append(loss)

        out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
        return quantized_out, out_indices, out_losses

    def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
        residual = x

        # Return quantized latents, both summed and at each quantizer level
        z_O = 0.0  # Summed quantized latents
        z_o = []  # Quantized latents at each quantizer level

        all_indices = []
        n_q = n_q or len(self.layers)
        for layer in self.layers[:n_q]:
            indices = layer.encode(residual)
            quantized = layer.decode(indices)

            z_o += [quantized]
            z_O = z_O + quantized

            residual = residual - quantized
            all_indices.append(indices)

        out_indices = torch.stack(all_indices)
        z_o = torch.stack(z_o, dim=1)

        return out_indices, z_O, z_o

    def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
        quantized_out = torch.tensor(0.0, device=q_indices.device)
        for i, indices in enumerate(q_indices):
            layer = self.layers[i]
            quantized = layer.decode(indices)
            quantized_out = quantized_out + quantized
        return quantized_out
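A small round-trip sketch of the residual scheme above (illustrative, not part of the repo): each VectorQuantization layer encodes whatever residual the previous layers left behind, and decode re-sums the per-layer codebook lookups, so the summed latents returned by this modified encode match decode's output exactly.

# Illustrative round trip through ResidualVectorQuantization with random codebooks.
import torch
from encodec.quantization.core_vq import ResidualVectorQuantization

rvq = ResidualVectorQuantization(
    num_quantizers=4, dim=128, codebook_size=1024, kmeans_init=False
).eval()

x = torch.randn(1, 128, 50)                 # (batch, dim, frames)
codes, z_sum, z_levels = rvq.encode(x)      # codes: (n_q, batch, frames)
x_hat = rvq.decode(codes)                   # (batch, dim, frames)
assert torch.allclose(x_hat, z_sum)         # decode just re-sums the per-level lookups
# Each additional quantizer shrinks the remaining residual, improving the fit to x.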
encodec/quantization/vq.py
ADDED
@@ -0,0 +1,95 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import math
import typing as tp

import torch
from torch import nn

from .core_vq import ResidualVectorQuantization

################################################################################
# Residual quantization module
################################################################################


class ResidualVectorQuantizer(nn.Module):
    """Residual Vector Quantizer.
    Args:
        dimension (int): Dimension of the codebooks.
        n_q (int): Number of residual vector quantizers used.
        bins (int): Codebook size.
        decay (float): Decay for exponential moving average over the codebooks.
        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
        kmeans_iters (int): Number of iterations used for kmeans initialization.
        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
            that have an exponential moving average cluster size less than the specified threshold with
            randomly selected vector from the current batch.
    """

    def __init__(
        self,
        dimension: int = 256,
        n_q: int = 8,
        bins: int = 1024,
        decay: float = 0.99,
        kmeans_init: bool = True,
        kmeans_iters: int = 50,
        threshold_ema_dead_code: int = 2,
    ):
        super().__init__()
        self.n_q = n_q
        self.dimension = dimension
        self.bins = bins
        self.decay = decay
        self.kmeans_init = kmeans_init
        self.kmeans_iters = kmeans_iters
        self.threshold_ema_dead_code = threshold_ema_dead_code
        self.vq = ResidualVectorQuantization(
            dim=self.dimension,
            codebook_size=self.bins,
            num_quantizers=self.n_q,
            decay=self.decay,
            kmeans_init=self.kmeans_init,
            kmeans_iters=self.kmeans_iters,
            threshold_ema_dead_code=self.threshold_ema_dead_code,
        )

    def get_num_quantizers_for_bandwidth(
        self, frame_rate: int, bandwidth: tp.Optional[float] = None
    ) -> int:
        """Return n_q based on specified target bandwidth."""
        bw_per_q = self.get_bandwidth_per_quantizer(frame_rate)
        n_q = self.n_q
        if bandwidth and bandwidth > 0.0:
            # bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented as
            # bandwidth == 6.0
            n_q = int(max(1, math.floor(bandwidth * 1000 / bw_per_q)))
        return n_q

    def get_bandwidth_per_quantizer(self, frame_rate: int):
        """Return bandwidth per quantizer for a given input frame rate.
        Each quantizer encodes a frame with lg(bins) bits.
        """
        return math.log2(self.bins) * frame_rate

    def encode(
        self, x: torch.Tensor, frame_rate: int, bandwidth: tp.Optional[float] = None
    ) -> torch.Tensor:
        """Encode a given input tensor with the specified frame rate at the given bandwidth.
        The RVQ encode method sets the appropriate number of quantizers to use
        and returns indices for each quantizer.
        """
        n_q = self.get_num_quantizers_for_bandwidth(frame_rate, bandwidth)
        codes, z_O, z_o = self.vq.encode(x, n_q=n_q)
        return codes, z_O, z_o

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """
        Decode the given codes to the quantized representation.
        """
        quantized = self.vq.decode(codes)
        return quantized
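The bandwidth bookkeeping in ResidualVectorQuantizer is plain arithmetic; as a sketch with illustrative numbers (16 kHz audio, hop length 320, a 2048-entry codebook), a 2.2 kbps target resolves to four quantizers:

# Worked example of get_num_quantizers_for_bandwidth's arithmetic (illustrative numbers).
import math

frame_rate = 16_000 // 320                     # 50 latent frames per second
bits_per_code = math.log2(2048)                # 11 bits per codebook index
bw_per_quantizer = bits_per_code * frame_rate  # 550 bits/s per quantizer
n_q = int(max(1, math.floor(2.2 * 1000 / bw_per_quantizer)))
print(n_q)                                     # 4 quantizers -> 2200 bits/s total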
requirements.txt
ADDED
@@ -0,0 +1,5 @@
gradio
torch
audioseal
git+https://github.com/descriptinc/audiotools
pydantic==2.10.6