Spaces:

descript
/

vampnet

Runtime error

pseeth committed on Jul 18, 2023

Commit

4d9dee0

1 Parent(s): 1fedcf3

fix audiotools version + sampling trick (#7)

- sampling tricks, fix audiotools pin (1062aecaf5ba2e866553c342f9a7ad78b6ec695a)
- remove share from demo (4fd6833b5f5c90fa51a330c389e2be09d8b057c5)

Files changed (5) hide show

.gitignore +2 -0
app.py +53 -14
requirements.txt +1 -1
scripts/exp/train.py +7 -5
vampnet/modules/transformer.py +109 -37

.gitignore CHANGED Viewed

@@ -175,6 +175,7 @@ lyrebird-audio-codec
 samples-*/**
 gradio-outputs/
 samples*/
 models-all/
 models.zip
@@ -183,3 +184,4 @@ descript-audio-codec/
 # *.pth
 .git-old
 conf/generated/*

 samples-*/**
 gradio-outputs/
+models/
 samples*/
 models-all/
 models.zip
 # *.pth
 .git-old
 conf/generated/*
+runs*/

app.py CHANGED Viewed

@@ -107,24 +107,36 @@ def _vamp(data, return_mask=False):
     mask = pmask.codebook_unmask(mask, ncc)
-    print(f"created mask with: linear random {data[rand_mask_intensity]}, inpaint {data[prefix_s]}:{data[suffix_s]}, periodic {data[periodic_p]}:{data[periodic_w]}, dropout {data[dropout]}, codebook unmask {ncc}, onset mask {data[onset_mask_width]}, num steps {data[num_steps]}, init temp {data[temp]},  use coarse2fine {data[use_coarse2fine]}")
     # save the mask as a txt file
     np.savetxt(out_dir / "mask.txt", mask[:,0,:].long().cpu().numpy())
     zv, mask_z = interface.coarse_vamp(
         z,
         mask=mask,
         sampling_steps=data[num_steps],
-        temperature=float(data[temp]*10),
         return_mask=True,
         typical_filtering=data[typical_filtering],
         typical_mass=data[typical_mass],
         typical_min_tokens=data[typical_min_tokens],
         gen_fn=interface.coarse.generate,
     )
     if use_coarse2fine:
-        zv = interface.coarse_to_fine(zv, temperature=data[temp], mask=mask)
     sig = interface.to_signal(zv).cpu()
     print("done")
@@ -157,7 +169,9 @@ def save_vamp(data):
     sig_out.write(out_dir / "output.wav")
     _data = {
-        "temp": data[temp],
         "prefix_s": data[prefix_s],
         "suffix_s": data[suffix_s],
         "rand_mask_intensity": data[rand_mask_intensity],
@@ -168,6 +182,7 @@ def save_vamp(data):
         "n_conditioning_codebooks": data[n_conditioning_codebooks],
         "use_coarse2fine": data[use_coarse2fine],
         "stretch_factor": data[stretch_factor],
     }
     # save with yaml
@@ -183,13 +198,14 @@ def save_vamp(data):
     return f"saved! your save code is {out_dir.stem}", zip_path
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
-            gr.Markdown("# VampNet")
             gr.Markdown("""## Description:
-            This is a demo of VampNet, a masked generative music model capable of doing music variations.
             You can control the extent and nature of variation with a set of manual controls and presets.
             Use this interface to experiment with different mask settings and explore the audio outputs.
             """)
@@ -197,8 +213,8 @@ with gr.Blocks() as demo:
             gr.Markdown("""
             ## Instructions:
             1. You can start by uploading some audio, or by loading the example audio.
-            2. Choose a preset for the vamp operation, or manually adjust the controls to customize the mask settings. Click the load preset button.
-            3. Click the "generate (vamp)!!!" button to generate audio. Listen to the output audio, and the masked audio to hear the mask hints.
             4. Optionally, you can add some notes and save the result.
             5. You can also use the output as the new input and continue experimenting!
             """)
@@ -377,16 +393,28 @@ with gr.Blocks() as demo:
                     value=0.0
                 )
-            temp = gr.Slider(
-                label="temperature",
                 minimum=0.0,
                 maximum=10.0,
-                value=1.8
             )
             with gr.Accordion("sampling settings", open=False):
                 typical_filtering = gr.Checkbox(
                     label="typical filtering ",
                     value=False
@@ -428,6 +456,14 @@ with gr.Blocks() as demo:
             )
         # mask settings
         with gr.Column():
             vamp_button = gr.Button("generate (vamp)!!!")
@@ -455,7 +491,9 @@ with gr.Blocks() as demo:
     _inputs = {
             input_audio,
             num_steps,
-            temp,
             prefix_s, suffix_s,
             rand_mask_intensity,
             periodic_p, periodic_w,
@@ -468,6 +506,7 @@ with gr.Blocks() as demo:
             typical_mass,
             typical_min_tokens,
             beat_mask_width,
             beat_mask_downbeats
         }
@@ -498,4 +537,4 @@ with gr.Blocks() as demo:
         outputs=[thank_you, download_file]
     )
-demo.queue().launch()

     mask = pmask.codebook_unmask(mask, ncc)
+    print(data)
+    _top_p = data[top_p] if data[top_p] > 0 else None
     # save the mask as a txt file
     np.savetxt(out_dir / "mask.txt", mask[:,0,:].long().cpu().numpy())
+    _seed = data[seed] if data[seed] > 0 else None
     zv, mask_z = interface.coarse_vamp(
         z,
         mask=mask,
         sampling_steps=data[num_steps],
+        mask_temperature=data[masktemp]*10,
+        sampling_temperature=data[sampletemp],
         return_mask=True,
         typical_filtering=data[typical_filtering],
         typical_mass=data[typical_mass],
         typical_min_tokens=data[typical_min_tokens],
+        top_p=_top_p,
         gen_fn=interface.coarse.generate,
+        seed=_seed,
     )
     if use_coarse2fine:
+        zv = interface.coarse_to_fine(
+            zv,
+            mask_temperature=data[masktemp]*10,
+            sampling_temperature=data[sampletemp],
+            mask=mask,
+            sampling_steps=data[num_steps],
+            seed=_seed,
+        )
     sig = interface.to_signal(zv).cpu()
     print("done")
     sig_out.write(out_dir / "output.wav")
     _data = {
+        "masktemp": data[masktemp],
+        "sampletemp": data[sampletemp],
+        "top_p": data[top_p],
         "prefix_s": data[prefix_s],
         "suffix_s": data[suffix_s],
         "rand_mask_intensity": data[rand_mask_intensity],
         "n_conditioning_codebooks": data[n_conditioning_codebooks],
         "use_coarse2fine": data[use_coarse2fine],
         "stretch_factor": data[stretch_factor],
+        "seed": data[seed],
     }
     # save with yaml
     return f"saved! your save code is {out_dir.stem}", zip_path
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
+            gr.Markdown("# VampNet Audio Vamping")
             gr.Markdown("""## Description:
+            This is a demo of the VampNet, a generative audio model that transforms the input audio based on the chosen settings.
             You can control the extent and nature of variation with a set of manual controls and presets.
             Use this interface to experiment with different mask settings and explore the audio outputs.
             """)
             gr.Markdown("""
             ## Instructions:
             1. You can start by uploading some audio, or by loading the example audio.
+            2. Choose a preset for the vamp operation, or manually adjust the controls to customize the mask settings.
+            3. Click the "generate (vamp)!!!" button to apply the vamp operation. Listen to the output audio.
             4. Optionally, you can add some notes and save the result.
             5. You can also use the output as the new input and continue experimenting!
             """)
                     value=0.0
                 )
+            masktemp = gr.Slider(
+                label="mask temperature",
                 minimum=0.0,
                 maximum=10.0,
+                value=1.5
             )
+            sampletemp = gr.Slider(
+                label="sample temperature",
+                minimum=0.1,
+                maximum=2.0,
+                value=1.0
+            )
             with gr.Accordion("sampling settings", open=False):
+                top_p = gr.Slider(
+                    label="top p (0.0 = off)",
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.0
+                )
                 typical_filtering = gr.Checkbox(
                     label="typical filtering ",
                     value=False
             )
+            seed = gr.Number(
+                label="seed (0 for random)",
+                value=0,
+                precision=0,
+            )
         # mask settings
         with gr.Column():
             vamp_button = gr.Button("generate (vamp)!!!")
     _inputs = {
             input_audio,
             num_steps,
+            masktemp,
+            sampletemp,
+            top_p,
             prefix_s, suffix_s,
             rand_mask_intensity,
             periodic_p, periodic_w,
             typical_mass,
             typical_min_tokens,
             beat_mask_width,
+            seed,
             beat_mask_downbeats
         }
         outputs=[thank_you, download_file]
     )
+demo.launch()

requirements.txt CHANGED Viewed

@@ -5,4 +5,4 @@ gradio
 loralib
 wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat
 lac @ git+https://github.com/hugofloresgarcia/lac.git
-audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git

 loralib
 wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat
 lac @ git+https://github.com/hugofloresgarcia/lac.git
+descript-audiotools @ git+https://github.com/descriptinc/audiotools.git@0.7.2

scripts/exp/train.py CHANGED Viewed

@@ -485,7 +485,6 @@ def load(
     save_path: str,
     resume: bool = False,
     tag: str = "latest",
-    load_weights: bool = False,
     fine_tune_checkpoint: Optional[str] = None,
     grad_clip_val: float = 5.0,
 ) -> State:
@@ -498,7 +497,7 @@ def load(
         kwargs = {
             "folder": f"{save_path}/{tag}",
             "map_location": "cpu",
-            "package": not load_weights,
         }
         tracker.print(f"Loading checkpoint from {kwargs['folder']}")
         if (Path(kwargs["folder"]) / "vampnet").exists():
@@ -511,11 +510,14 @@ def load(
     if args["fine_tune"]:
         assert fine_tune_checkpoint is not None, "Must provide a fine-tune checkpoint"
-        model = VampNet.load(location=Path(fine_tune_checkpoint), map_location="cpu")
-    model = VampNet() if model is None else model
     model = accel.prepare_model(model)
     # assert accel.unwrap(model).n_codebooks == codec.quantizer.n_codebooks

     save_path: str,
     resume: bool = False,
     tag: str = "latest",
     fine_tune_checkpoint: Optional[str] = None,
     grad_clip_val: float = 5.0,
 ) -> State:
         kwargs = {
             "folder": f"{save_path}/{tag}",
             "map_location": "cpu",
+            "package": False,
         }
         tracker.print(f"Loading checkpoint from {kwargs['folder']}")
         if (Path(kwargs["folder"]) / "vampnet").exists():
     if args["fine_tune"]:
         assert fine_tune_checkpoint is not None, "Must provide a fine-tune checkpoint"
+        model = torch.compile(
+            VampNet.load(location=Path(fine_tune_checkpoint),
+                         map_location="cpu",
+            )
+        )
+    model = torch.compile(VampNet()) if model is None else model
     model = accel.prepare_model(model)
     # assert accel.unwrap(model).n_codebooks == codec.quantizer.n_codebooks

vampnet/modules/transformer.py CHANGED Viewed

@@ -367,6 +367,15 @@ class TransformerLayer(nn.Module):
         return x, position_bias, encoder_decoder_position_bias
 class TransformerStack(nn.Module):
     def __init__(
@@ -580,20 +589,20 @@ class VampNet(at.ml.BaseModel):
         time_steps: int = 300,
         sampling_steps: int = 24,
         start_tokens: Optional[torch.Tensor] = None,
         mask: Optional[torch.Tensor] = None,
-        temperature: float = 2.5,
         typical_filtering=False,
         typical_mass=0.2,
         typical_min_tokens=1,
         return_signal=True,
     ):
         logging.debug(f"beginning generation with {sampling_steps} steps")
-        #####################
-        # resolve temperature #
-        #####################
-        logging.debug(f"temperature: {temperature}")
         #####################
@@ -641,13 +650,11 @@ class VampNet(at.ml.BaseModel):
         #################
         # begin sampling #
         #################
         for i in range(sampling_steps):
             logging.debug(f"step {i} of {sampling_steps}")
-            # our current temperature
-            logging.debug(f"temperature: {temperature}")
             # our current schedule step
             r = scalar_to_batch_tensor(
                 (i + 1) / sampling_steps,
@@ -664,39 +671,19 @@ class VampNet(at.ml.BaseModel):
             # NOTE: this collapses the codebook dimension into the sequence dimension
             logits = self.forward(latents, r) # b, prob, seq
             logits = logits.permute(0, 2, 1)  # b, seq, prob
-            if typical_filtering:
-                typical_filter(logits,
-                               typical_mass=typical_mass,
-                               typical_min_tokens=typical_min_tokens
-                )
             logging.debug(f"permuted logits with shape: {logits.shape}")
-            # logits2probs
-            probs = torch.softmax(logits, dim=-1)
-            logging.debug(f"computed probs with shape: {probs.shape}")
-            # sample from logits with multinomial sampling
-            b = probs.shape[0]
-            probs = rearrange(probs, "b seq prob -> (b seq) prob")
-            sampled_z =  torch.multinomial(probs, 1).squeeze(-1)
-            sampled_z = rearrange(sampled_z, "(b seq)-> b seq", b=b)
-            probs = rearrange(probs, "(b seq) prob -> b seq prob", b=b)
             logging.debug(f"sampled z with shape: {sampled_z.shape}")
-            # get the confidences: which tokens did we sample?
-            selected_probs = (
-                torch.take_along_dim(
-                    probs, sampled_z.long().unsqueeze(-1),
-                    dim=-1
-                ).squeeze(-1)
-            )
             # flatten z_masked and mask, so we can deal with the sampling logic
             # we'll unflatten them at the end of the loop for the next forward pass
             # remove conditioning codebooks, we'll add them back at the end
@@ -733,7 +720,7 @@ class VampNet(at.ml.BaseModel):
             # get our new mask
             mask = mask_by_random_topk(
-                num_to_mask, selected_probs, temperature * (1-r)
             )
             # update the mask
@@ -766,6 +753,91 @@ class VampNet(at.ml.BaseModel):
         else:
             return sampled_z
 def mask_by_random_topk(num_to_mask: int, probs: torch.Tensor, temperature: float = 1.0):
     """

         return x, position_bias, encoder_decoder_position_bias
+def t_schedule(n_steps, max_temp=1.0, min_temp=0.0, k=1.0):
+    """Return a reversed logistic curve with one value per sampling step.
+    Parameters
+    ----------
+    n_steps : int
+        Number of values to produce (used as the ``np.linspace`` count).
+    max_temp : float, optional
+        Together with `min_temp`, only shifts the curve horizontally through
+        the offset ``x0``; it does not rescale the output. By default 1.0
+    min_temp : float, optional
+        See `max_temp`, by default 0.0
+    k : float, optional
+        Steepness of the logistic curve, by default 1.0
+    Returns
+    -------
+    np.ndarray
+        ``n_steps`` logistic values (between 0 and 1), reversed so that for
+        ``k > 0`` the largest value comes first.
+    Notes
+    -----
+    NOTE(review): when ``(1 / a - 1) + 1e-5`` is negative (e.g. ``min_temp=0``
+    and ``max_temp`` below 0.5) ``np.log`` yields NaN and every returned value
+    is NaN -- confirm whether such inputs are meant to be supported.
+    NOTE(review): with plain Python numbers, ``max_temp == min_temp`` or
+    ``min_temp == 0.5`` divides by zero -- confirm callers never pass these.
+    """
+    # evenly spaced positions in [0, 1]
+    x = np.linspace(0, 1, n_steps)
+    # relative position of 0.5 inside the [min_temp, max_temp] interval
+    a = (0.5 - min_temp) / (max_temp - min_temp)
+    # stretch the positions to the range [-6, 6]
+    x = (x * 12) - 6
+    # horizontal offset of the curve; the 1e-5 presumably keeps the log
+    # argument away from exactly zero when a == 1
+    x0 = np.log((1 / a - 1) + 1e-5) / k
+    # logistic function of the shifted positions, then reversed
+    y = (1 / (1 + np.exp(- k *(x-x0))))[::-1]
+    return y
 class TransformerStack(nn.Module):
     def __init__(
         time_steps: int = 300,
         sampling_steps: int = 24,
         start_tokens: Optional[torch.Tensor] = None,
+        sampling_temperature: float = 1.0,
         mask: Optional[torch.Tensor] = None,
+        mask_temperature: float = 20.5,
         typical_filtering=False,
         typical_mass=0.2,
         typical_min_tokens=1,
+        top_p=None,
         return_signal=True,
+        seed: int = None
     ):
+        if seed is not None:
+            at.util.seed(seed)
         logging.debug(f"beginning generation with {sampling_steps} steps")
         #####################
         #################
         # begin sampling #
         #################
+        t_sched = t_schedule(sampling_steps, max_temp=sampling_temperature)
         for i in range(sampling_steps):
             logging.debug(f"step {i} of {sampling_steps}")
             # our current schedule step
             r = scalar_to_batch_tensor(
                 (i + 1) / sampling_steps,
             # NOTE: this collapses the codebook dimension into the sequence dimension
             logits = self.forward(latents, r) # b, prob, seq
             logits = logits.permute(0, 2, 1)  # b, seq, prob
+            b = logits.shape[0]
             logging.debug(f"permuted logits with shape: {logits.shape}")
+            sampled_z, selected_probs = sample_from_logits(
+                logits, sample=True, temperature=t_sched[i],
+                typical_filtering=typical_filtering, typical_mass=typical_mass,
+                typical_min_tokens=typical_min_tokens,
+                top_k=None, top_p=top_p, return_probs=True
+            )
             logging.debug(f"sampled z with shape: {sampled_z.shape}")
             # flatten z_masked and mask, so we can deal with the sampling logic
             # we'll unflatten them at the end of the loop for the next forward pass
             # remove conditioning codebooks, we'll add them back at the end
             # get our new mask
             mask = mask_by_random_topk(
+                num_to_mask, selected_probs, mask_temperature * (1-r)
             )
             # update the mask
         else:
             return sampled_z
+def sample_from_logits(
+        logits,
+        sample: bool = True,
+        temperature: float = 1.0,
+        top_k: int = None,
+        top_p: float = None,
+        typical_filtering: bool = False,
+        typical_mass: float = 0.2,
+        typical_min_tokens: int = 1,
+        return_probs: bool = False
+    ):
+    """Pick one token per position from a categorical distribution given as
+    unnormalized logits, optionally after typical / top-k / top-p filtering.
+    Parameters
+    ----------
+    logits : Tensor[..., vocab_size]
+        Unnormalized scores. The top-k and top-p steps overwrite entries of
+        this tensor with ``-inf`` in place, so the caller's tensor is modified
+        whenever either filter is active.
+    sample : bool, optional
+        If True, draw tokens with multinomial sampling; if False, take the
+        argmax of the (filtered) logits. By default True
+    temperature : float, optional
+        Divisor applied to the logits before the softmax when it is > 0;
+        otherwise the softmax is taken over the logits unchanged.
+        By default 1.0
+    top_k : int, optional
+        Keep only the `top_k` largest logits per position, by default None
+    top_p : float, optional
+        Nucleus cutoff, used when not None and < 1.0: in descending order,
+        tokens are dropped once the cumulative probability exceeds `top_p`,
+        keeping the first token that crosses the threshold. By default None
+    typical_filtering : bool, optional
+        If True, call `typical_filter` on the logits first, by default False
+    typical_mass : float, optional
+        Forwarded to `typical_filter`, by default 0.2
+    typical_min_tokens : int, optional
+        Forwarded to `typical_filter`, by default 1
+    return_probs : bool, optional
+        If True, also return the probability of each chosen token,
+        by default False
+    Returns
+    -------
+    Tensor[...] or tuple of (Tensor[...], Tensor[...])
+        The chosen tokens, or ``(tokens, token_probs)`` when `return_probs`
+        is True.
+    """
+    # leading (non-vocab) dimensions, used to reshape the sampled tokens
+    shp = logits.shape[:-1]
+    if typical_filtering:
+        # NOTE(review): the return value is discarded. If `typical_filter`
+        # returns a new tensor instead of editing `logits` in place, this
+        # call has no effect -- confirm against its definition.
+        typical_filter(logits,
+                        typical_mass=typical_mass,
+                        typical_min_tokens=typical_min_tokens
+        )
+    # Apply top_k sampling
+    if top_k is not None:
+        v, _ = logits.topk(top_k)
+        # everything below the k-th largest logit is masked out (in place)
+        logits[logits < v[..., [-1]]] = -float("inf")
+    # Apply top_p (nucleus) sampling
+    # (the cutoff is computed on the logits before temperature is applied)
+    if top_p is not None and top_p < 1.0:
+        v, sorted_indices = logits.sort(descending=True)
+        cumulative_probs = v.softmax(dim=-1).cumsum(dim=-1)
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # Right shift indices_to_remove to keep 1st token over threshold
+        sorted_indices_to_remove = F.pad(sorted_indices_to_remove, (1, 0), value=False)[
+            ..., :-1
+        ]
+        # Compute indices_to_remove in unsorted array
+        indices_to_remove = sorted_indices_to_remove.scatter(
+            -1, sorted_indices, sorted_indices_to_remove
+        )
+        logits[indices_to_remove] = -float("inf")
+    # Perform multinomial sampling after normalizing logits
+    # NOTE(review): a NaN temperature also fails `temperature > 0` and so
+    # silently takes the untempered branch -- confirm this is acceptable.
+    probs = (
+        F.softmax(logits / temperature, dim=-1)
+        if temperature > 0
+        else logits.softmax(dim=-1)
+    )
+    # flatten to (N, vocab) for multinomial, then restore the leading dims
+    token = (
+        probs.view(-1, probs.size(-1)).multinomial(1).squeeze(1).view(*shp)
+        if sample
+        else logits.argmax(-1)
+    )
+    if return_probs:
+        # probability that `probs` assigns to each chosen token
+        token_probs = probs.take_along_dim(token.unsqueeze(-1), dim=-1).squeeze(-1)
+        return token, token_probs
+    else:
+        return token
 def mask_by_random_topk(num_to_mask: int, probs: torch.Tensor, temperature: float = 1.0):
     """