Hugo Flores committed · Commit a63cce0 · 1 Parent(s): 4a2dc41

interface improvements

Files changed:
- scripts/utils/process_folder-c2f.py +122 -0
- vampnet/gradio.py +4 -0
- vampnet/interface.py +149 -20
- vampnet/modules/base.py +5 -0
scripts/utils/process_folder-c2f.py
ADDED
@@ -0,0 +1,122 @@
from audiotools import AudioSignal
import torch
from pathlib import Path
import argbind
from tqdm import tqdm
import random

from collections import defaultdict


def coarse2fine_infer(
    signal,
    model,
    vqvae,
    device,
    signal_window=3,
    signal_hop=1.5,
    max_excerpts=25,
):
    output = defaultdict(list)

    # split the signal into 3-second windows
    windows = [s for s in signal.clone().windows(signal_window, signal_hop)]
    random.shuffle(windows)
    for w in windows[1:max_excerpts]:  # skip the first window since it's mostly zero padded?
        with torch.no_grad():
            # get codes
            w = w.to(device)
            z = vqvae.encode(w.audio_data, w.sample_rate)["codes"]

            model.to(device)
            # append each excerpt (a plain assignment here would overwrite the list)
            output["reconstructed"].append(model.to_signal(z, vqvae).cpu())

            # make a full mask, keeping only the conditioning codebooks
            mask = torch.ones_like(z)
            mask[:, :model.n_conditioning_codebooks, :] = 0

            output["sampled"].append(model.sample(
                codec=vqvae,
                time_steps=z.shape[-1],
                sampling_steps=12,
                start_tokens=z,
                mask=mask,
                temperature=0.85,
                top_k=None,
                sample="gumbel",
                typical_filtering=True,
                return_signal=True
            ).cpu())

            output["argmax"].append(model.sample(
                codec=vqvae,
                time_steps=z.shape[-1],
                sampling_steps=1,
                start_tokens=z,
                mask=mask,
                temperature=1.0,
                top_k=None,
                sample="argmax",
                typical_filtering=True,
                return_signal=True
            ).cpu())

    return output


@argbind.bind(without_prefix=True)
def main(
    sources=[
        "/home/hugo/data/spotdl/audio/val", "/home/hugo/data/spotdl/audio/test"
    ],
    audio_ext="mp3",
    exp_name="noise_mode",
    model_paths=[
        "ckpt/mask/best/vampnet/weights.pth",
        "ckpt/random/best/vampnet/weights.pth",
    ],
    model_keys=[
        "noise_mode=mask",
        "noise_mode=random",
    ],
    vqvae_path="ckpt/wav2wav.pth",
    device="cuda",
):
    from vampnet.modules.transformer import VampNet
    from lac.model.lac import LAC
    from audiotools.post import audio_zip

    models = {
        k: VampNet.load(p) for k, p in zip(model_keys, model_paths)
    }
    for model in models.values():
        model.eval()
    print(f"Loaded {len(models)} models.")

    vqvae = LAC.load(vqvae_path)
    vqvae.to(device)
    vqvae.eval()
    print("Loaded VQVAE.")

    audio_dict = defaultdict(list)
    for source in sources:
        print(f"Processing {source}...")
        for path in tqdm(list(Path(source).glob(f"**/*.{audio_ext}"))):
            sig = AudioSignal(path)
            sig.resample(vqvae.sample_rate).normalize(-24).ensure_max_of_audio(1.0)

            for model_key, model in models.items():
                out = coarse2fine_infer(sig, model, vqvae, device)
                for k in out:
                    audio_dict[f"{model_key}-{k}"].extend(out[k])

    audio_zip(audio_dict, f"{exp_name}-results.zip")


if __name__ == "__main__":
    args = argbind.parse_args()

    with argbind.scope(args):
        main()
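Because main() is decorated with @argbind.bind(without_prefix=True), each keyword argument above doubles as a command-line flag. The core routine can also be driven directly from Python; a minimal sketch, where the input file is a placeholder and the checkpoint paths are simply the defaults from main():

from audiotools import AudioSignal
from vampnet.modules.transformer import VampNet
from lac.model.lac import LAC

vqvae = LAC.load("ckpt/wav2wav.pth")
vqvae.to("cuda")
vqvae.eval()
model = VampNet.load("ckpt/mask/best/vampnet/weights.pth")
model.eval()

sig = AudioSignal("example.mp3")  # placeholder input file
sig.resample(vqvae.sample_rate).normalize(-24).ensure_max_of_audio(1.0)

out = coarse2fine_infer(sig, model, vqvae, "cuda", max_excerpts=5)
# out["reconstructed"], out["sampled"], out["argmax"] each hold AudioSignal excerpts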
vampnet/gradio.py
ADDED
@@ -0,0 +1,4 @@

import gradio as gr

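The new vampnet/gradio.py is only a stub (a bare import). A minimal sketch of how it might eventually expose the interface as a demo; everything beyond the import is an assumption, not part of this commit, and the codec-checkpoint argument name is not visible in this diff:

import gradio as gr
from audiotools import AudioSignal
from vampnet.interface import Interface

# hypothetical checkpoint paths
interface = Interface(
    coarse_ckpt="ckpt/coarse/vampnet/weights.pth",
    coarse2fine_ckpt="ckpt/c2f/vampnet/weights.pth",
)

def vamp(audio_path):
    out = interface.variation(AudioSignal(audio_path))
    # gradio expects (sample_rate, samples) tuples for audio outputs
    return out.sample_rate, out.samples[0, 0].cpu().numpy()

demo = gr.Interface(fn=vamp, inputs=gr.Audio(type="filepath"), outputs=gr.Audio())
demo.launch()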
vampnet/interface.py
CHANGED
@@ -4,11 +4,21 @@ import math
 
 import torch
 from audiotools import AudioSignal
+import tqdm
 
 from .modules.transformer import VampNet
 from lac.model.lac import LAC
 
 
+
+def signal_concat(
+    audio_signals: list,
+):
+    audio_data = torch.cat([x.audio_data for x in audio_signals], dim=-1)
+
+    return AudioSignal(audio_data, sample_rate=audio_signals[0].sample_rate)
+
+
 class Interface:
     def __init__(
         self,
@@ -26,20 +36,29 @@
         self.coarse = VampNet.load(location=Path(coarse_ckpt), map_location="cpu")
         self.coarse.to(device)
         self.coarse.eval()
-        self.coarse.chunk_size_s = coarse_chunk_size_s
+        self.coarse.chunk_size_s = self.s2t2s(coarse_chunk_size_s)
 
+        if coarse2fine_ckpt is not None:
+            self.c2f = VampNet.load(
+                location=Path(coarse2fine_ckpt), map_location="cpu"
+            )
+            self.c2f.to(device)
+            self.c2f.eval()
+            self.c2f.chunk_size_s = self.s2t2s(coarse2fine_chunk_size_s)
 
         self.device = device
 
     def s2t(self, seconds: float):
         """seconds to tokens"""
-        return
+        return math.ceil(seconds * self.codec.sample_rate / self.codec.hop_length)
+
+    def s2t2s(self, seconds: float):
+        """seconds to tokens to seconds"""
+        return self.t2s(self.s2t(seconds))
+
+    def t2s(self, tokens: int):
+        """tokens to seconds"""
+        return tokens * self.codec.hop_length / self.codec.sample_rate
 
     def to(self, device):
         self.device = device
@@ -51,15 +70,22 @@
     def to_signal(self, z: torch.Tensor):
         return self.coarse.to_signal(z, self.codec)
 
+    def autoencode(self, signal: AudioSignal):
+        z = self.encode(signal)
+        return self.to_signal(z)
+
+    def preprocess(self, signal: AudioSignal):
         signal = (
-            signal.clone()
+            signal.clone()
            .resample(self.codec.sample_rate)
            .to_mono()
            .normalize(-24)
            .ensure_max_of_audio(1.0)
         )
+        return signal
+
+    @torch.inference_mode()
+    def encode(self, signal: AudioSignal):
+        signal = self.preprocess(signal).to(self.device)
         z = self.codec.encode(signal.samples, signal.sample_rate)["codes"]
         return z
 
@@ -68,6 +94,7 @@
         coarse_z: torch.Tensor,
         **kwargs
     ):
+        assert self.c2f is not None, "No coarse2fine model loaded"
        length = coarse_z.shape[-1]
        chunk_len = self.s2t(self.c2f.chunk_size_s)
        n_chunks = math.ceil(coarse_z.shape[-1] / chunk_len)
@@ -198,24 +225,30 @@
     def coarse_vamp_v2(
         self,
         signal,
-        prefix_dur_s: float =
-        suffix_dur_s: float =
+        prefix_dur_s: float = 0.0,
+        suffix_dur_s: float = 0.0,
+        num_vamps: int = 1,
         downsample_factor: int = None,
+        intensity: float = 1.0,
         debug=False,
+        swap_prefix_suffix=False,
         **kwargs
     ):
         z = self.encode(signal)
 
-        assert signal.duration == self.coarse.chunk_size_s, "signal duration must match coarse chunk size for now"
-
         # coarse z
         cz = z[:, : self.coarse.n_codebooks, :].clone()
         c_seq_len = cz.shape[-1]
         n_prefix = self.s2t(prefix_dur_s)
         n_suffix = self.s2t(suffix_dur_s)
 
+        assert cz.shape[-1] <= self.s2t(self.coarse.chunk_size_s), f"the sequence of tokens provided must match the one specified in the coarse chunk size, but got {cz.shape[-1]} and {self.s2t(self.coarse.chunk_size_s)}"
         assert n_prefix + n_suffix < c_seq_len, "prefix and suffix must be smaller than the chunk size"
+
+        if swap_prefix_suffix:
+            # swap the prefix and suffix regions in cz
+            assert n_prefix == n_suffix, "prefix and suffix must be the same size for now"
+            cz[:, :, :n_prefix], cz[:, :, c_seq_len-n_suffix:] = cz[:, :, c_seq_len-n_suffix:], cz[:, :, :n_prefix].clone()
 
         # we'll keep the final codes sequence here
         c_vamp = {
@@ -225,10 +258,10 @@
 
         _cz = cz.clone()
         cz_mask = None
-        for _ in range(
+        for _ in range(num_vamps):
             # add noise
             cz_masked, cz_mask = self.coarse.add_noise(
-                _cz, r=
+                _cz, r=1.0-intensity,
                 n_prefix=n_prefix,
                 n_suffix=n_suffix,
                 downsample_factor=downsample_factor,
@@ -244,7 +277,7 @@
             print(f"z: {_cz[:,0,:]}")
             cz_sampled = self.coarse.sample(
                 codec=self.codec,
-                time_steps=
+                time_steps=_cz.shape[-1],
                 start_tokens=_cz,
                 mask=cz_mask,
                 return_signal=False,
@@ -329,17 +362,113 @@
         c_vamp = torch.cat([prefix_codes, suffix_codes], dim=-1)
         return c_vamp
 
+    # create a variation of an audio signal
+    def variation(
+        self,
+        signal: AudioSignal,
+        overlap_hop_ratio: float = 1.0,  # TODO: should this be fixed to 1.0? or should we overlap and replace instead of overlap add
+        verbose: bool = False,
+        **kwargs
+    ):
+        signal = signal.clone()
+
+        # autoencode first, so the samples get rounded up to the nearest tokens
+        signal = self.autoencode(signal).cpu()
+
+        # pad the signal to the nearest chunk size
+        req_len = (
+            math.ceil(signal.duration / self.coarse.chunk_size_s)
+            * self.coarse.chunk_size_s
+        )
+        hop_duration = self.coarse.chunk_size_s * overlap_hop_ratio
+        original_length = signal.length
+
+        signal.zero_pad_to(req_len)
+
+        # window the signal
+        signal = signal.collect_windows(
+            window_duration=self.coarse.chunk_size_s,
+            hop_duration=hop_duration,
+        )
+
+        range_fn = range if not verbose else tqdm.trange
+        for i in range_fn(signal.batch_size):
+            sig = AudioSignal(
+                signal.samples[i,...], signal.sample_rate
+            )
+            sig.to(self.device)
+            out_z = self.coarse_vamp_v2(
+                sig,
+                num_vamps=1,
+                swap_prefix_suffix=False,
+                **kwargs
+            )
+            if self.c2f is not None:
+                out_z = self.coarse_to_fine(out_z)
+            out_sig = self.to_signal(out_z).cpu()
+
+            signal.samples[i] = out_sig.samples
+
+        output = signal.overlap_and_add(hop_duration)
+
+        output.truncate_samples(original_length)
+        return output
+
+    # create a loop of a single region with variations
+    # TODO: this would work nicer if we could trim at the beat
+    # otherwise the model has to awkwardly fill up space that won't match
+    # the beat unless the signal is exactly the right length
+    def loop(
+        self,
+        signal: AudioSignal,
+        prefix_dur_s: float = 0.0,
+        suffix_dur_s: float = 0.0,
+        num_loops: int = 4,
+        # overlap_hop_ratio: float = 1.0,  # TODO: should this be fixed to 1.0? or should we overlap and replace instead of overlap add
+        verbose: bool = False,
+        **kwargs,
+    ):
+        assert prefix_dur_s >= 0.0, "prefix duration must be >= 0"
+        assert suffix_dur_s >= 0.0, "suffix duration must be >= 0"
+        signal = self.preprocess(signal)
+
+        suffix_len_samples = int(suffix_dur_s * signal.sample_rate)
+        prefix_len_tokens = self.s2t(prefix_dur_s)
+        suffix_len_tokens = self.s2t(suffix_dur_s)
+
+        loops = [
+            # add everything but the suffix at the beginning
+            self.encode(signal.clone().trim(before=0, after=suffix_len_samples))
+        ]
+        range_fn = range if not verbose else tqdm.trange
+        for i in range_fn(num_loops):
+            is_flipped = i % 2 == 0
+            vamped = self.coarse_vamp_v2(
+                signal,
+                prefix_dur_s=prefix_dur_s,
+                suffix_dur_s=suffix_dur_s,
+                swap_prefix_suffix=is_flipped,
+                **kwargs
+            )
+            # if we're flipped, we trim the prefix off of the end
+            # otherwise we trim the suffix off of the end
+            trim_len = prefix_len_tokens if is_flipped else suffix_len_tokens
+            vamped = vamped[:, :, :vamped.shape[-1]-trim_len]
+
+            loops.append(vamped)
+
+            if is_flipped:
+                loops.append(
+                    # add everything but the prefix at the end
+                    self.encode(signal.clone())
+                )
 
+        if self.c2f is not None:
+            loops = [self.coarse_to_fine(l) for l in loops]
 
+        loops = [self.to_signal(l) for l in loops]
+
+        return signal_concat(loops)
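Taken together, the additions give Interface two high-level entry points: variation() windows a signal, vamps each window, optionally refines it coarse-to-fine, and overlap-adds the result, while loop() alternates prefix/suffix swaps and stitches the takes with signal_concat. Note that s2t rounds up, so durations are quantized to whole tokens of hop_length / sample_rate seconds each. A minimal usage sketch; the checkpoint paths are placeholders, and the codec-checkpoint argument name is not visible in this diff:

from audiotools import AudioSignal
from vampnet.interface import Interface

interface = Interface(
    coarse_ckpt="ckpt/coarse/vampnet/weights.pth",    # placeholder path
    coarse2fine_ckpt="ckpt/c2f/vampnet/weights.pth",  # placeholder path
    device="cuda",
)

sig = AudioSignal("input.wav")  # placeholder input

# intensity is forwarded to coarse_vamp_v2 through **kwargs (r = 1.0 - intensity)
variant = interface.variation(sig, verbose=True, intensity=0.8)
looped = interface.loop(sig, prefix_dur_s=1.0, suffix_dur_s=1.0, num_loops=4)

variant.write("variation.wav")
looped.write("loop.wav")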
vampnet/modules/base.py
CHANGED
@@ -31,6 +31,11 @@ class VampBase(at.ml.BaseModel):
     def forward(self, x: torch.Tensor, r: torch.Tensor):
         raise NotImplementedError
 
+    # TODO: add a beat tracking method
+    # that uses a beat tracking model to find beat positions
+    # and then unmask the codes in those positions (with some width)
+    # and drop them out with some randomness
+    # and have the option to NOT drop out downbeats
     def add_noise(
         self,
         x: torch.Tensor,
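The TODO above sketches a beat-conditioned variant of add_noise. Under the mask convention used elsewhere in this commit (1 = masked and resampled, 0 = kept) and the seconds-to-tokens arithmetic from interface.py, the token bookkeeping could look like the sketch below; the beat tracker and every name here are illustrative, not part of the codebase:

import torch

def beat_mask(beat_times, seq_len, sample_rate, hop_length,
              width=2, keep_prob=0.8, keep_downbeats=True, downbeats=()):
    """beat_times: beat positions in seconds, from some beat-tracking model."""
    mask = torch.ones(seq_len, dtype=torch.long)  # 1 = masked, 0 = kept
    for t in beat_times:
        center = int(t * sample_rate / hop_length)  # seconds -> token index
        lo, hi = max(center - width, 0), min(center + width + 1, seq_len)
        # unmask the codes around each beat, with some randomness,
        # optionally never dropping downbeats
        if (keep_downbeats and t in downbeats) or torch.rand(1).item() < keep_prob:
            mask[lo:hi] = 0
    return mask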
|