Commit 534a89c · committed by Hugo Flores · 1 parent: fc839a6

refactor bugfixes
Files changed:
- conf/vampnet.yml +1 -1
- lyrebird-audiotools +1 -1
- scripts/{generative → exp}/eval.py +0 -0
- scripts/{generative → exp}/train.py +46 -38
- vampnet/modules/activations.py +1 -1
- vampnet/modules/base.py +10 -12
- vampnet/modules/layers.py +2 -2
- vampnet/modules/wavenet.py +90 -0
conf/vampnet.yml CHANGED
@@ -1,5 +1,5 @@
 
-
+codec_ckpt: /u/home/src/runs/codec-ckpt/codec.pth
 save_path: ckpt
 max_epochs: 1000000
 epoch_length: 1000
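For context on how the new key is consumed: train.py reads it as args["codec_ckpt"] and passes it to LAC.load. A minimal sketch of reading this config directly (train.py actually gets args via argbind; this only shows where the key lands, and the path is assumed relative to the repo root):

import yaml  # PyYAML

with open("conf/vampnet.yml") as f:
    args = yaml.safe_load(f)

assert "codec_ckpt" in args
# In train.py: codec = LAC.load(args["codec_ckpt"], map_location="cpu")
print(args["codec_ckpt"], args["save_path"], args["max_epochs"])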
lyrebird-audiotools CHANGED
@@ -1 +1 @@
-Subproject commit
+Subproject commit 3b1abbe27a846f3e2330cacc3ddf70a280b08e98
scripts/{generative → exp}/eval.py RENAMED
File without changes
scripts/{generative → exp}/train.py RENAMED
@@ -114,8 +114,8 @@ def load(
         "map_location": "cpu",
         "package": not load_weights,
     }
-    if (Path(kwargs["folder"]) / "
-        model, v_extra =
+    if (Path(kwargs["folder"]) / "vampnet").exists():
+        model, v_extra = VampNet.load_from_folder(**kwargs)
 
     codec = LAC.load(args["codec_ckpt"], map_location="cpu")
     codec.eval()
@@ -215,6 +215,29 @@ def accuracy(
 
     return accuracy
 
+def sample_prefix_suffix_amt(
+    n_batch,
+    prefix_amt,
+    suffix_amt,
+    prefix_dropout,
+    suffix_dropout,
+    rng
+):
+    """
+    Sample the number of prefix and suffix tokens to drop.
+    """
+    if prefix_amt > 0.0:
+        prefix_mask = flip_coin(n_batch, 1 - prefix_dropout, rng)
+        n_prefix = int(prefix_amt * z.shape[-1]) * prefix_mask
+    else:
+        n_prefix = None
+    if suffix_amt > 0.0:
+        suffix_mask = flip_coin(n_batch, 1 - suffix_dropout, rng)
+        n_suffix = int(suffix_amt * z.shape[-1]) * suffix_mask
+    else:
+        n_suffix = None
+    return n_prefix, n_suffix
+
 
 @argbind.bind(without_prefix=True)
 def train(
@@ -288,7 +311,7 @@ def train(
     class Trainer(at.ml.BaseTrainer):
         _last_grad_norm = 0.0
 
-        def
+        def _metrics(self, vn, z_hat, r, target, flat_mask, output):
             for r_range in [(0, 0.5), (0.5, 1.0)]:
                 unmasked_target = target.masked_fill(flat_mask.bool(), IGNORE_INDEX)
                 masked_target = target.masked_fill(~flat_mask.bool(), IGNORE_INDEX)
@@ -324,7 +347,6 @@ def train(
             )
 
         def train_loop(self, engine, batch):
-
             model.train()
             batch = at.util.prepare_batch(batch, accel.device)
             signal = apply_transform(train_data.transform, batch)
@@ -333,22 +355,18 @@ def train(
             vn = accel.unwrap(model)
             with accel.autocast():
                 with torch.inference_mode():
+                    codec.to(accel.device)
                     z = codec.encode(signal.samples, signal.sample_rate)["codes"]
                     z = z[:, : vn.n_codebooks, :]
 
             n_batch = z.shape[0]
             r = rng.draw(n_batch)[:, 0].to(accel.device)
 
-            if prefix_amt > 0.0:
-                prefix_mask = flip_coin(n_batch, 1 - prefix_dropout, rng)
-                n_prefix = int(prefix_amt * z.shape[-1]) * prefix_mask
-            else:
-                n_prefix = None
-            if suffix_amt > 0.0:
-                suffix_mask = flip_coin(n_batch, 1 - suffix_dropout, rng)
-                n_suffix = int(suffix_amt * z.shape[-1]) * suffix_mask
-            else:
-                n_suffix = None
+            n_prefix, n_suffix = sample_prefix_suffix_amt(
+                n_batch=n_batch, prefix_amt=prefix_amt, suffix_amt=suffix_amt,
+                prefix_dropout=prefix_dropout, suffix_dropout=suffix_dropout,
+                rng=rng
+            )
 
             z_mask, mask = vn.add_noise(
                 z, r, n_prefix=n_prefix, n_suffix=n_suffix
@@ -378,7 +396,7 @@ def train(
             else:
                 output["loss"] = criterion(z_hat, target)
 
-            self.
+            self._metrics(
                 vn=vn,
                 r=r,
                 z_hat=z_hat,
@@ -430,16 +448,11 @@ def train(
             n_batch = z.shape[0]
             r = rng.draw(n_batch)[:, 0].to(accel.device)
 
-            if prefix_amt > 0.0:
-                prefix_mask = flip_coin(n_batch, 1 - prefix_dropout, rng)
-                n_prefix = int(prefix_amt * z.shape[-1]) * prefix_mask
-            else:
-                n_prefix = None
-            if suffix_amt > 0.0:
-                suffix_mask = flip_coin(n_batch, 1 - suffix_dropout, rng)
-                n_suffix = int(suffix_amt * z.shape[-1]) * suffix_mask
-            else:
-                n_suffix = None
+            n_prefix, n_suffix = sample_prefix_suffix_amt(
+                n_batch=n_batch, prefix_amt=prefix_amt, suffix_amt=suffix_amt,
+                prefix_dropout=prefix_dropout, suffix_dropout=suffix_dropout,
+                rng=rng
+            )
 
             z_mask, mask = vn.add_noise(z, r, n_prefix=n_prefix, n_suffix=n_suffix)
             z_mask_latent = vn.embedding.from_codes(z_mask, codec)
@@ -466,7 +479,7 @@ def train(
             else:
                 output["loss"] = criterion(z_hat, target)
 
-            self.
+            self._metrics(
                 vn=vn,
                 r=r,
                 z_hat=z_hat,
@@ -516,7 +529,7 @@ def train(
 
             for i in range(num_samples):
                 sampled = accel.unwrap(model).sample(
-                    codec,
+                    codec=codec,
                     time_steps=z.shape[-1],
                     start_tokens=z[i : i + 1],
                 )
@@ -547,7 +560,7 @@ def train(
             for i in range(len(z)):
                 imputed.append(
                     accel.unwrap(model).sample(
-                        codec,
+                        codec=codec,
                         time_steps=z.shape[-1],
                         start_tokens=z[i][None, ...],
                         mask=imp_mask[i][None, ...],
@@ -593,16 +606,11 @@ def train(
 
             n_batch = z.shape[0]
 
-            if prefix_amt > 0.0:
-                prefix_mask = flip_coin(n_batch, 1 - prefix_dropout, rng)
-                n_prefix = int(prefix_amt * z.shape[-1]) * prefix_mask
-            else:
-                n_prefix = None
-            if suffix_amt > 0.0:
-                suffix_mask = flip_coin(n_batch, 1 - suffix_dropout, rng)
-                n_suffix = int(suffix_amt * z.shape[-1]) * suffix_mask
-            else:
-                n_suffix = None
+            n_prefix, n_suffix = sample_prefix_suffix_amt(
+                n_batch=n_batch, prefix_amt=prefix_amt, suffix_amt=suffix_amt,
+                prefix_dropout=prefix_dropout, suffix_dropout=suffix_dropout,
+                rng=rng
+            )
 
             z_mask, mask = vn.add_noise(z, r, n_prefix=n_prefix, n_suffix=n_suffix)
             z_mask_latent = vn.embedding.from_codes(z_mask, codec)
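One thing to flag in the new sample_prefix_suffix_amt helper: it reads z.shape[-1], but z is not among its parameters. Inline, z was in scope inside the train/val loops; factored out, the reference will raise a NameError whenever prefix_amt or suffix_amt is positive. A minimal corrected sketch, passing the sequence length explicitly (the flip_coin stub below is an assumption standing in for the script's own helper):

import torch

def flip_coin(n_batch, p, rng=None):
    # Stand-in for the script's flip_coin: per item, 1 with probability p, else 0.
    return (torch.rand(n_batch) < p).long()

def sample_prefix_suffix_amt(
    n_batch, seq_len, prefix_amt, suffix_amt, prefix_dropout, suffix_dropout, rng=None
):
    """Sample per-item prefix/suffix token counts, with dropout on each side."""
    n_prefix = n_suffix = None
    if prefix_amt > 0.0:
        prefix_mask = flip_coin(n_batch, 1 - prefix_dropout, rng)
        n_prefix = int(prefix_amt * seq_len) * prefix_mask  # seq_len replaces z.shape[-1]
    if suffix_amt > 0.0:
        suffix_mask = flip_coin(n_batch, 1 - suffix_dropout, rng)
        n_suffix = int(suffix_amt * seq_len) * suffix_mask
    return n_prefix, n_suffix

# Usage mirroring the train loop, where z is [B, n_codebooks, T]:
z = torch.zeros(4, 4, 100, dtype=torch.long)
n_prefix, n_suffix = sample_prefix_suffix_amt(
    n_batch=z.shape[0], seq_len=z.shape[-1],
    prefix_amt=0.1, suffix_amt=0.1, prefix_dropout=0.5, suffix_dropout=0.5,
)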
vampnet/modules/activations.py CHANGED
@@ -1,3 +1,4 @@
+import math
 import numpy as np
 import torch
 import torch.nn as nn
@@ -5,7 +6,6 @@ import torch.nn.functional as F
 from einops import rearrange
 
 
-
 class NewGELU(nn.Module):
     """
     Implementation of the GELU activation function currently in Google BERT repo
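The only functional change here is the new import math, which suggests NewGELU.forward uses math.sqrt. The class docstring points at the BERT-repo GELU, whose standard tanh approximation looks like the sketch below (an assumption for illustration; the method body is not shown in this diff):

import math
import torch
import torch.nn as nn

class NewGELU(nn.Module):
    """Tanh approximation of GELU, as in the Google BERT repo / GPT."""
    def forward(self, x):
        return 0.5 * x * (1.0 + torch.tanh(
            math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
        ))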
vampnet/modules/base.py CHANGED
@@ -85,8 +85,6 @@ class VampBase(at.ml.BaseModel):
         mask = mask[:, self.n_conditioning_codebooks :, :]
 
         truth = F.one_hot(z_true, self.vocab_size)
-        print(truth.shape)
-        # truth = rearrange(truth, "b c t p -> b p (t c)")
         mask = mask[:, :, :, None].expand(-1, -1, -1, self.vocab_size)
         z_hat = rearrange(
             z_hat,
@@ -127,16 +125,16 @@ class VampBase(at.ml.BaseModel):
         return r
 
     @torch.no_grad()
-    def to_signal(self, z,
+    def to_signal(self, z, codec):
         if z.ndim == 2:
             z = self.embedding.unflatten(z)
         assert z.ndim == 3
 
         signal = at.AudioSignal(
-
-
+            codec.decode(
+                codec.quantizer.from_latents(self.embedding.from_codes(z, codec))[0]
             )["audio"],
-
+            codec.sample_rate,
         )
 
         return signal
@@ -150,7 +148,7 @@ class VampBase(at.ml.BaseModel):
 
     def paella_sample(
         self,
-
+        codec,
         time_steps: int = 400,
         sampling_steps: int = 12,
         start_tokens: Optional[torch.Tensor] = None,
@@ -219,7 +217,7 @@ class VampBase(at.ml.BaseModel):
             if renoise_mode == "prev":
                 z_prev = z.clone()
 
-            latents = self.embedding.from_codes(z,
+            latents = self.embedding.from_codes(z, codec)
             logits = self.forward(latents, r[i])
 
             # for mask mode
@@ -258,13 +256,13 @@ class VampBase(at.ml.BaseModel):
         z = start_tokens * (1 - mask) + z * mask
 
         if return_signal:
-            return self.to_signal(z,
+            return self.to_signal(z, codec)
         else:
             return z
 
     def maskgit_sample(
         self,
-
+        codec,
         time_steps: int = 300,
         sampling_steps: int = 24,
         start_tokens: Optional[torch.Tensor] = None,
@@ -338,7 +336,7 @@ class VampBase(at.ml.BaseModel):
         z_masked = z.masked_fill(~keep_mask_unflat.bool(), self.mask_token)
 
         # get latents
-        latents = self.embedding.from_codes(z_masked,
+        latents = self.embedding.from_codes(z_masked, codec)
 
         # infer from latents
         logits = self.forward(latents, r)
@@ -400,7 +398,7 @@ class VampBase(at.ml.BaseModel):
         # z = torch.cat([z[:, :self.n_conditioning_codebooks, :], z_inferred], dim=1)
 
         if return_signal:
-            return self.to_signal(z,
+            return self.to_signal(z, codec)
         else:
             return z
 
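The thread running through these hunks: VampBase no longer reaches for a stored codec; to_signal, paella_sample, maskgit_sample, and the embedding's from_codes all take codec as an explicit argument. A hypothetical call site under that convention (loading mirrors load() in train.py; the exact import paths for LAC and VampNet are project-local and assumed here):

import torch
# Assumed project-local imports (not shown in this diff):
# from lac.model.lac import LAC
# from vampnet.modules import VampNet

codec = LAC.load("runs/codec-ckpt/codec.pth", map_location="cpu")
codec.eval()
model, _ = VampNet.load_from_folder(folder="ckpt", map_location="cpu")

z = torch.randint(0, model.vocab_size, (1, model.n_codebooks, 300))
signal = model.to_signal(z, codec)  # codec passed explicitly
out = model.maskgit_sample(codec=codec, time_steps=z.shape[-1], start_tokens=z)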
vampnet/modules/layers.py CHANGED
@@ -113,13 +113,13 @@ class CodebookEmbedding(nn.Module):
 
         self.out_proj = nn.Conv1d(n_codebooks * self.latent_dim, self.emb_dim, 1)
 
-    def from_codes(self, codes: torch.Tensor,
+    def from_codes(self, codes: torch.Tensor, codec):
         n_codebooks = codes.shape[1]
         latent = []
         for i in range(n_codebooks):
             c = codes[:, i, :]
 
-            lookup_table =
+            lookup_table = codec.quantizer.quantizers[i].codebook.weight
             if hasattr(self, "special"):
                 special_lookup = torch.cat(
                     [self.special[tkn][i : i + 1] for tkn in self.special], dim=0
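In outline, from_codes looks up each codebook's token ids in that quantizer's codebook table (now fetched from the codec argument rather than stored locally), appends any special-token rows, and concatenates the per-codebook latents for out_proj. A simplified self-contained sketch of the lookup pattern (shapes and the embedding call are illustrative assumptions; the real method also handles special tokens and projection):

import torch
import torch.nn.functional as F

def codes_to_latents(codes, lookup_tables):
    """codes: [B, n_codebooks, T]; lookup_tables[i]: [vocab_size, latent_dim]."""
    latent = []
    for i in range(codes.shape[1]):
        c = codes[:, i, :]                    # [B, T] token ids
        l = F.embedding(c, lookup_tables[i])  # [B, T, latent_dim]
        latent.append(l.transpose(1, 2))      # [B, latent_dim, T]
    return torch.cat(latent, dim=1)           # [B, n_codebooks*latent_dim, T]

tables = [torch.randn(1024, 8) for _ in range(4)]
codes = torch.randint(0, 1024, (2, 4, 100))
print(codes_to_latents(codes, tables).shape)  # torch.Size([2, 32, 100])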
vampnet/modules/wavenet.py ADDED
@@ -0,0 +1,90 @@
+import torch.nn as nn
+from einops import rearrange
+
+from voicegpt.nn import WaveNet
+
+class AutoregMLP(nn.Module):
+    """Implements an autoregressive ConvNet decoder
+    Refer to SampleRNN (https://arxiv.org/abs/1612.07837) for motivation
+    """
+
+    def __init__(
+        self,
+        vocab_size: int,
+        d_model: int,
+        n_layers: int,
+        n_fine_tokens: int = 6,
+        n_tokens: int = 9,
+        dropout: float = 0.1,
+        activation: str = "gelu",
+        causal: bool = True,
+    ):
+        super().__init__()
+        self.n_fine = n_fine_tokens
+        self.n_layers = n_layers
+        self.upsampler = nn.Linear(d_model, d_model * n_fine_tokens)
+
+        self.wavenet = WaveNet(
+            d_model,
+            d_model,
+            d_model,
+            n_layers,
+            n_fine_tokens,
+            dropout=dropout,
+            activation=activation,
+            causal=causal,
+        )
+        self.ff_output = nn.Linear(d_model, vocab_size * n_tokens, bias=False)
+
+    def time_upsample(self, h_t_coarse):
+        """Upsamples the conditioning hidden states to match the time resolution
+        of output tokens
+        Parameters
+        ----------
+        h_t_coarse : Tensor[B x T_coarse x D]
+            Conditioning hidden states in coarse time-scale
+        Returns
+        -------
+        Tensor[B x T_fine x D]
+            Conditioning hidden states in fine time-scale
+        """
+        # Upsample the transformer hidden states to fine scale
+        h_t_fine = rearrange(
+            self.upsampler(h_t_coarse), "b t (n d) -> b (t n) d", n=self.n_fine
+        )
+        return h_t_fine
+
+    def decode_logits(self, x_tm1, h_t_fine):
+        """Decodes output logits conditioned on previous output
+        tokens (upto timestep t-1) and conditioning hidden states
+        using an autoregressive WaveNet
+        Parameters
+        ----------
+        x_tm1 : Tensor[B x T x D]
+        h_t_fine : Tensor[B x T x D]
+        Returns
+        -------
+        Tensor[B x T x vocab_size]
+            Predicted logits
+        """
+
+        # Compute wavenet layers and predict logits
+        o_t = self.wavenet(x_tm1, h_t_fine)
+        return self.ff_output(o_t)
+
+    def forward(self, x_tm1, h_t_coarse):
+        """Computes autoregressive conditional probability distribution
+        using a WaveNet decoder
+        Parameters
+        ----------
+        x_tm1 : Tensor[B x T_fine x D]
+            Embeddings of tokens at fine time-scale
+        h_t_coarse : Tensor[B x T_coarse x D]
+            Hidden states at coarse time scale
+        Returns
+        -------
+        Tensor[B x T_fine x vocab_size]
+            Predicted logits at fine time-scale
+        """
+        h_t_fine = self.time_upsample(h_t_coarse)
+        return self.decode_logits(x_tm1, h_t_fine)
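To make the time-scale bookkeeping concrete: time_upsample turns [B, T_coarse, D] into [B, T_coarse * n_fine_tokens, D], and ff_output emits vocab_size * n_tokens logits per fine step (note the forward docstring says vocab_size, which undersells the last dimension). A hypothetical shape check, assuming voicegpt.nn.WaveNet is installed and preserves sequence length:

import torch
# AutoregMLP as defined above; sizes here are arbitrary for illustration.

vocab_size, d_model, n_fine, n_tokens = 1024, 512, 6, 9
model = AutoregMLP(vocab_size, d_model, n_layers=4)

B, T_coarse = 2, 50
h_t_coarse = torch.randn(B, T_coarse, d_model)      # coarse transformer states
x_tm1 = torch.randn(B, T_coarse * n_fine, d_model)  # shifted fine-scale embeddings

logits = model(x_tm1, h_t_coarse)
print(logits.shape)  # expected: [2, 300, 9216] == [B, T_coarse*n_fine, vocab_size*n_tokens]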