MusicGen

Runtime error

App Files Files Community

adefossez committed on Jun 13, 2023

Commit

62cbdc0

2 Parent(s): df20ddc 13ef076

Merge branch 'longgen' into our_hf2

Browse files

Files changed (8) hide show

README.md +7 -6
app.py +215 -87
audiocraft/models/loaders.py +0 -2
audiocraft/models/musicgen.py +82 -11
audiocraft/modules/transformer.py +67 -27
tests/models/test_musicgen.py +9 -1
tests/modules/test_rope.py +9 -1
tests/modules/test_transformer.py +40 -34

README.md CHANGED Viewed

@@ -5,7 +5,7 @@ tags:
   - "music generation"
   - "language models"
   - "LLMs"
-app_file: "app_batched.py"
 emoji: 🎵
 colorFrom: white
 colorTo: blue
@@ -54,11 +54,12 @@ pip install -e .  # or if you cloned the repo locally
 ## Usage
 We offer a number of ways to interact with MusicGen:
-1. You can play with MusicGen by running the jupyter notebook at [`demo.ipynb`](./demo.ipynb) locally, or use the provided [colab notebook](https://colab.research.google.com/drive/1fxGqfg96RBUvGxZ1XXN07s3DthrKUl4-?usp=sharing).
-2. You can use the gradio demo locally by running `python app.py`.
-3. A demo is also available on the [`facebook/MusicGen`  HuggingFace Space](https://huggingface.co/spaces/facebook/MusicGen) (huge thanks to all the HF team for their support).
-4. Finally, you can run the [Gradio demo with a Colab GPU](https://colab.research.google.com/drive/1-Xe9NCdIs2sCUbiSmwHXozK6AAhMm7_i?usp=sharing),
-as adapted from [@camenduru Colab](https://github.com/camenduru/MusicGen-colab).
 ## API

   - "music generation"
   - "language models"
   - "LLMs"
+app_file: "app.py"
 emoji: 🎵
 colorFrom: white
 colorTo: blue
 ## Usage
 We offer a number of ways to interact with MusicGen:
+1. A demo is available on the [`facebook/MusicGen` HuggingFace Space](https://huggingface.co/spaces/facebook/MusicGen) (huge thanks to all the HF team for their support).
+2. You can run the extended demo on a Colab: [colab notebook](https://colab.research.google.com/drive/1fxGqfg96RBUvGxZ1XXN07s3DthrKUl4-?usp=sharing).
+3. You can use the gradio demo locally by running `python app.py`.
+4. You can play with MusicGen by running the jupyter notebook at [`demo.ipynb`](./demo.ipynb) locally (if you have a GPU).
+5. Finally, check out the [@camenduru Colab page](https://github.com/camenduru/MusicGen-colab), which is regularly
+  updated with contributions from @camenduru and the community.
 ## API

app.py CHANGED Viewed

@@ -1,70 +1,139 @@
-"""
-Copyright (c) Meta Platforms, Inc. and affiliates.
-All rights reserved.
-This source code is licensed under the license found in the
-LICENSE file in the root directory of this source tree.
-"""
-from tempfile import NamedTemporaryFile
 import argparse
 import torch
 import gradio as gr
-import os
-from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
-MODEL = None
-IS_SHARED_SPACE = "musicgen/MusicGen" in os.environ.get('SPACE_ID', '')
-def load_model(version):
-    print("Loading model", version)
-    return MusicGen.get_pretrained(version)
-def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
     global MODEL
-    topk = int(topk)
-    if MODEL is None or MODEL.name != model:
-        MODEL = load_model(model)
-    if duration > MODEL.lm.cfg.dataset.segment_duration:
-        raise gr.Error("MusicGen currently supports durations of up to 30 seconds!")
-    MODEL.set_generation_params(
-        use_sampling=True,
-        top_k=topk,
-        top_p=topp,
-        temperature=temperature,
-        cfg_coef=cfg_coef,
-        duration=duration,
-    )
-    if melody:
-        sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
-        print(melody.shape)
-        if melody.dim() == 2:
-            melody = melody[None]
-        melody = melody[..., :int(sr * MODEL.lm.cfg.dataset.segment_duration)]
-        output = MODEL.generate_with_chroma(
-            descriptions=[text],
-            melody_wavs=melody,
-            melody_sample_rate=sr,
-            progress=False
         )
     else:
-        output = MODEL.generate(descriptions=[text], progress=False)
-    output = output.detach().cpu().float()[0]
-    with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
-        audio_write(
-            file.name, output, MODEL.sample_rate, strategy="loudness",
-            loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
-        waveform_video = gr.make_waveform(file.name)
-    return waveform_video
-def ui(**kwargs):
     with gr.Blocks() as interface:
         gr.Markdown(
             """
@@ -73,14 +142,6 @@ def ui(**kwargs):
             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
             """
         )
-        if IS_SHARED_SPACE:
-            gr.Markdown("""
-                ⚠ This Space doesn't work in this shared UI ⚠
-                <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
-                <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-                to use it privately, or use the <a href="https://huggingface.co/spaces/facebook/MusicGen">public demo</a>
-                """)
         with gr.Row():
             with gr.Column():
                 with gr.Row():
@@ -88,10 +149,12 @@ def ui(**kwargs):
                     melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
                 with gr.Row():
                     submit = gr.Button("Submit")
                 with gr.Row():
                     model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
                 with gr.Row():
-                    duration = gr.Slider(minimum=1, maximum=30, value=10, label="Duration", interactive=True)
                 with gr.Row():
                     topk = gr.Number(label="Top-k", value=250, interactive=True)
                     topp = gr.Number(label="Top-p", value=0, interactive=True)
@@ -99,9 +162,9 @@ def ui(**kwargs):
                     cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
             with gr.Column():
                 output = gr.Video(label="Generated Music")
-        submit.click(predict, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
         gr.Examples(
-            fn=predict,
             examples=[
                 [
                     "An 80s driving pop song with heavy drums and synth pads in the background",
@@ -137,7 +200,13 @@ def ui(**kwargs):
             ### More details
             The model will generate a short music extract based on the description you provided.
-            You can generate up to 30 seconds of audio.
             We present 4 model variations:
             1. Melody -- a music generation model capable of generating music condition on text and melody inputs. **Note**, you can also use text only.
@@ -154,27 +223,75 @@ def ui(**kwargs):
             """
         )
-        # Show the interface
-        launch_kwargs = {}
-        username = kwargs.get('username')
-        password = kwargs.get('password')
-        server_port = kwargs.get('server_port', 0)
-        inbrowser = kwargs.get('inbrowser', False)
-        share = kwargs.get('share', False)
-        server_name = kwargs.get('listen')
-        launch_kwargs['server_name'] = server_name
-        if username and password:
-            launch_kwargs['auth'] = (username, password)
-        if server_port > 0:
-            launch_kwargs['server_port'] = server_port
-        if inbrowser:
-            launch_kwargs['inbrowser'] = inbrowser
-        if share:
-            launch_kwargs['share'] = share
-        interface.queue().launch(**launch_kwargs, max_threads=1)
 if __name__ == "__main__":
@@ -182,7 +299,11 @@ if __name__ == "__main__":
     parser.add_argument(
         '--listen',
         type=str,
         default='0.0.0.0',
         help='IP to listen on for connections to Gradio',
     )
     parser.add_argument(
@@ -206,11 +327,18 @@ if __name__ == "__main__":
     args = parser.parse_args()
-    ui(
-        username=args.username,
-        password=args.password,
-        inbrowser=args.inbrowser,
-        server_port=args.server_port,
-        share=args.share,
-        listen=args.listen
-    )

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# Updated to account for UI changes from https://github.com/rkfg/audiocraft/blob/long/app.py
+# also released under the MIT license.
 import argparse
+from concurrent.futures import ProcessPoolExecutor
+import os
+import subprocess as sp
+from tempfile import NamedTemporaryFile
+import time
+import warnings
 import torch
 import gradio as gr
+from audiocraft.data.audio_utils import convert_audio
 from audiocraft.data.audio import audio_write
+from audiocraft.models import MusicGen
+MODEL = None  # Last used model
+IS_BATCHED = "facebook/MusicGen" in os.environ.get('SPACE_ID', '')
+MAX_BATCH_SIZE = 12
+BATCHED_DURATION = 15
+INTERRUPTING = False
+# We have to wrap the subprocess call to clean up the log a bit when using gr.make_waveform.
+_old_call = sp.call
+def _call_nostderr(*args, **kwargs):
+    # Avoid ffmpeg vomitting on the logs.
+    kwargs['stderr'] = sp.DEVNULL
+    kwargs['stdout'] = sp.DEVNULL
+    _old_call(*args, **kwargs)
+sp.call = _call_nostderr
+# Preallocating the pool of processes.
+pool = ProcessPoolExecutor(4)
+pool.__enter__()
+def interrupt():
+    global INTERRUPTING
+    INTERRUPTING = True
+def make_waveform(*args, **kwargs):
+    # Further remove some warnings.
+    be = time.time()
+    with warnings.catch_warnings():
+        warnings.simplefilter('ignore')
+        out = gr.make_waveform(*args, **kwargs)
+        print("Make a video took", time.time() - be)
+        return out
+def load_model(version='melody'):
     global MODEL
+    print("Loading model", version)
+    if MODEL is None or MODEL.name != version:
+        MODEL = MusicGen.get_pretrained(version)
+def _do_predictions(texts, melodies, duration, **gen_kwargs):
+    MODEL.set_generation_params(duration=duration, **gen_kwargs)
+    print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies])
+    be = time.time()
+    processed_melodies = []
+    target_sr = 32000
+    target_ac = 1
+    for melody in melodies:
+        if melody is None:
+            processed_melodies.append(None)
+        else:
+            sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t()
+            if melody.dim() == 1:
+                melody = melody[None]
+            melody = melody[..., :int(sr * duration)]
+            melody = convert_audio(melody, sr, target_sr, target_ac)
+            processed_melodies.append(melody)
+    if any(m is not None for m in processed_melodies):
+        outputs = MODEL.generate_with_chroma(
+            descriptions=texts,
+            melody_wavs=processed_melodies,
+            melody_sample_rate=target_sr,
+            progress=True
         )
     else:
+        outputs = MODEL.generate(texts, progress=True)
+    outputs = outputs.detach().cpu().float()
+    out_files = []
+    for output in outputs:
+        with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
+            audio_write(
+                file.name, output, MODEL.sample_rate, strategy="loudness",
+                loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
+            out_files.append(pool.submit(make_waveform, file.name))
+    res = [out_file.result() for out_file in out_files]
+    print("batch finished", len(texts), time.time() - be)
+    return res
+def predict_batched(texts, melodies):
+    max_text_length = 512
+    texts = [text[:max_text_length] for text in texts]
+    load_model('melody')
+    res = _do_predictions(texts, melodies, BATCHED_DURATION)
+    return [res]
+def predict_full(model, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
+    global INTERRUPTING
+    INTERRUPTING = False
+    topk = int(topk)
+    load_model(model)
+    def _progress(generated, to_generate):
+        progress((generated, to_generate))
+        if INTERRUPTING:
+            raise gr.Error("Interrupted.")
+    MODEL.set_custom_progress_callback(_progress)
+    outs = _do_predictions(
+        [text], [melody], duration,
+        top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef)
+    return outs[0]
+def ui_full(launch_kwargs):
     with gr.Blocks() as interface:
         gr.Markdown(
             """
             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
             """
         )
         with gr.Row():
             with gr.Column():
                 with gr.Row():
                     melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
                 with gr.Row():
                     submit = gr.Button("Submit")
+                    # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
+                    _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
                 with gr.Row():
                     model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
                 with gr.Row():
+                    duration = gr.Slider(minimum=1, maximum=120, value=10, label="Duration", interactive=True)
                 with gr.Row():
                     topk = gr.Number(label="Top-k", value=250, interactive=True)
                     topp = gr.Number(label="Top-p", value=0, interactive=True)
                     cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
             with gr.Column():
                 output = gr.Video(label="Generated Music")
+        submit.click(predict_full, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
         gr.Examples(
+            fn=predict_full,
             examples=[
                 [
                     "An 80s driving pop song with heavy drums and synth pads in the background",
             ### More details
             The model will generate a short music extract based on the description you provided.
+            The model can generate up to 30 seconds of audio in one pass. It is now possible
+            to extend the generation by feeding back the end of the previous chunk of audio.
+            This can take a long time, and the model might lose consistency. The model might also
+            decide at arbitrary positions that the song ends.
+            **WARNING:** Choosing long durations will take a long time to generate (2min might take ~10min). An overlap of 12 seconds
+            is kept with the previously generated chunk, and 18 "new" seconds are generated each time.
             We present 4 model variations:
             1. Melody -- a music generation model capable of generating music condition on text and melody inputs. **Note**, you can also use text only.
             """
         )
+        interface.queue().launch(**launch_kwargs)
+def ui_batched(launch_kwargs):
+    with gr.Blocks() as demo:
+        gr.Markdown(
+            """
+            # MusicGen
+            This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
+            presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
+            <br/>
+            <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+            <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+            for longer sequences, more control and no queue.</p>
+            """
+        )
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    text = gr.Text(label="Describe your music", lines=2, interactive=True)
+                    melody = gr.Audio(source="upload", type="numpy", label="Condition on a melody (optional)", interactive=True)
+                with gr.Row():
+                    submit = gr.Button("Generate")
+            with gr.Column():
+                output = gr.Video(label="Generated Music")
+        submit.click(predict_batched, inputs=[text, melody], outputs=[output], batch=True, max_batch_size=MAX_BATCH_SIZE)
+        gr.Examples(
+            fn=predict_batched,
+            examples=[
+                [
+                    "An 80s driving pop song with heavy drums and synth pads in the background",
+                    "./assets/bach.mp3",
+                ],
+                [
+                    "A cheerful country song with acoustic guitars",
+                    "./assets/bolero_ravel.mp3",
+                ],
+                [
+                    "90s rock song with electric guitar and heavy drums",
+                    None,
+                ],
+                [
+                    "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130",
+                    "./assets/bach.mp3",
+                ],
+                [
+                    "lofi slow bpm electro chill with organic samples",
+                    None,
+                ],
+            ],
+            inputs=[text, melody],
+            outputs=[output]
+        )
+        gr.Markdown("""
+        ### More details
+        The model will generate 12 seconds of audio based on the description you provided.
+        You can optionaly provide a reference audio from which a broad melody will be extracted.
+        The model will then try to follow both the description and melody provided.
+        All samples are generated with the `melody` model.
+        You can also use your own GPU or a Google Colab by following the instructions on our repo.
+        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
+        for more details.
+        """)
+        demo.queue(max_size=8 * 4).launch(**launch_kwargs)
 if __name__ == "__main__":
     parser.add_argument(
         '--listen',
         type=str,
+<<<<<<< HEAD
         default='0.0.0.0',
+=======
+        default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
+>>>>>>> longgen
         help='IP to listen on for connections to Gradio',
     )
     parser.add_argument(
     args = parser.parse_args()
+    launch_kwargs = {}
+    if args.username and args.password:
+        launch_kwargs['auth'] = (args.username, args.password)
+    if args.server_port:
+        launch_kwargs['server_port'] = args.server_port
+    if args.inbrowser:
+        launch_kwargs['inbrowser'] = args.inbrowser
+    if args.share:
+        launch_kwargs['share'] = args.share
+    # Show the interface
+    if IS_BATCHED:
+        ui_batched(launch_kwargs)
+    else:
+        ui_full(launch_kwargs)

audiocraft/models/loaders.py CHANGED Viewed

@@ -80,8 +80,6 @@ def load_lm_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_di
     cfg = OmegaConf.create(pkg['xp.cfg'])
     cfg.device = str(device)
     if cfg.device == 'cpu':
-        cfg.transformer_lm.memory_efficient = False
-        cfg.transformer_lm.custom = True
         cfg.dtype = 'float32'
     else:
         cfg.dtype = 'float16'

     cfg = OmegaConf.create(pkg['xp.cfg'])
     cfg.device = str(device)
     if cfg.device == 'cpu':
         cfg.dtype = 'float32'
     else:
         cfg.dtype = 'float16'

audiocraft/models/musicgen.py CHANGED Viewed

@@ -36,13 +36,16 @@ class MusicGen:
             used to map audio to invertible discrete representations.
         lm (LMModel): Language model over discrete representations.
     """
-    def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel):
         self.name = name
         self.compression_model = compression_model
         self.lm = lm
         self.device = next(iter(lm.parameters())).device
         self.generation_params: dict = {}
         self.set_generation_params(duration=15)  # 15 seconds by default
         if self.device.type == 'cpu':
             self.autocast = TorchAutocast(enabled=False)
         else:
@@ -65,7 +68,7 @@ class MusicGen:
         return self.compression_model.channels
     @staticmethod
-    def get_pretrained(name: str = 'melody', device='cuda'):
         """Return pretrained model, we provide four models:
         - small (300M), text to music, # see: https://huggingface.co/facebook/musicgen-small
         - medium (1.5B), text to music, # see: https://huggingface.co/facebook/musicgen-medium
@@ -73,6 +76,12 @@ class MusicGen:
         - large (3.3B), text to music, # see: https://huggingface.co/facebook/musicgen-large
         """
         if name == 'debug':
             # used only for unit tests
             compression_model = get_debug_compression_model(device)
@@ -96,7 +105,7 @@ class MusicGen:
     def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
                               top_p: float = 0.0, temperature: float = 1.0,
                               duration: float = 30.0, cfg_coef: float = 3.0,
-                              two_step_cfg: bool = False, extend_stride: float = 15):
         """Set the generation parameters for MusicGen.
         Args:
@@ -113,11 +122,10 @@ class MusicGen:
                 should we extend the audio each time. Larger values will mean less context is
                 preserved, and shorter value will require extra computations.
         """
-        # assert duration <= 30, "The MusicGen cannot generate more than 30 seconds"
-        assert extend_stride <= 25, "Keep at least 5 seconds of overlap!"
         self.extend_stride = extend_stride
         self.generation_params = {
-            'max_gen_len': int(duration * self.frame_rate),
             'use_sampling': use_sampling,
             'temp': temperature,
             'top_k': top_k,
@@ -126,6 +134,10 @@ class MusicGen:
             'two_step_cfg': two_step_cfg,
         }
     def generate_unconditional(self, num_samples: int, progress: bool = False) -> torch.Tensor:
         """Generate samples in an unconditional manner.
@@ -268,20 +280,79 @@ class MusicGen:
         Returns:
             torch.Tensor: Generated audio, of shape [B, C, T], T is defined by the generation params.
         """
         def _progress_callback(generated_tokens: int, tokens_to_generate: int):
-            print(f'{generated_tokens: 6d} / {tokens_to_generate: 6d}', end='\r')
         if prompt_tokens is not None:
-            assert self.generation_params['max_gen_len'] > prompt_tokens.shape[-1], \
                 "Prompt is longer than audio to generate"
         callback = None
         if progress:
             callback = _progress_callback
-        # generate by sampling from LM
-        with self.autocast:
-            gen_tokens = self.lm.generate(prompt_tokens, attributes, callback=callback, **self.generation_params)
         # generate audio
         assert gen_tokens.dim() == 3

             used to map audio to invertible discrete representations.
         lm (LMModel): Language model over discrete representations.
     """
+    def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
+                 max_duration: float = 30):
         self.name = name
         self.compression_model = compression_model
         self.lm = lm
+        self.max_duration = max_duration
         self.device = next(iter(lm.parameters())).device
         self.generation_params: dict = {}
         self.set_generation_params(duration=15)  # 15 seconds by default
+        self._progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None
         if self.device.type == 'cpu':
             self.autocast = TorchAutocast(enabled=False)
         else:
         return self.compression_model.channels
     @staticmethod
+    def get_pretrained(name: str = 'melody', device=None):
         """Return pretrained model, we provide four models:
         - small (300M), text to music, # see: https://huggingface.co/facebook/musicgen-small
         - medium (1.5B), text to music, # see: https://huggingface.co/facebook/musicgen-medium
         - large (3.3B), text to music, # see: https://huggingface.co/facebook/musicgen-large
         """
+        if device is None:
+            if torch.cuda.device_count():
+                device = 'cuda'
+            else:
+                device = 'cpu'
         if name == 'debug':
             # used only for unit tests
             compression_model = get_debug_compression_model(device)
     def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
                               top_p: float = 0.0, temperature: float = 1.0,
                               duration: float = 30.0, cfg_coef: float = 3.0,
+                              two_step_cfg: bool = False, extend_stride: float = 18):
         """Set the generation parameters for MusicGen.
         Args:
                 should we extend the audio each time. Larger values will mean less context is
                 preserved, and shorter value will require extra computations.
         """
+        assert extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
         self.extend_stride = extend_stride
+        self.duration = duration
         self.generation_params = {
             'use_sampling': use_sampling,
             'temp': temperature,
             'top_k': top_k,
             'two_step_cfg': two_step_cfg,
         }
+    def set_custom_progress_callback(self, progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None):
+        """Override the default progress callback."""
+        self._progress_callback = progress_callback
     def generate_unconditional(self, num_samples: int, progress: bool = False) -> torch.Tensor:
         """Generate samples in an unconditional manner.
         Returns:
             torch.Tensor: Generated audio, of shape [B, C, T], T is defined by the generation params.
         """
+        total_gen_len = int(self.duration * self.frame_rate)
+        max_prompt_len = int(min(self.duration, self.max_duration) * self.frame_rate)
+        current_gen_offset: int = 0
         def _progress_callback(generated_tokens: int, tokens_to_generate: int):
+            generated_tokens += current_gen_offset
+            if self._progress_callback is not None:
+                # Note that total_gen_len might be quite wrong depending on the
+                # codebook pattern used, but with delay it is almost accurate.
+                self._progress_callback(generated_tokens, total_gen_len)
+            else:
+                print(f'{generated_tokens: 6d} / {total_gen_len: 6d}', end='\r')
         if prompt_tokens is not None:
+            assert max_prompt_len >= prompt_tokens.shape[-1], \
                 "Prompt is longer than audio to generate"
         callback = None
         if progress:
             callback = _progress_callback
+        if self.duration <= self.max_duration:
+            # generate by sampling from LM, simple case.
+            with self.autocast:
+                gen_tokens = self.lm.generate(
+                    prompt_tokens, attributes,
+                    callback=callback, max_gen_len=total_gen_len, **self.generation_params)
+        else:
+            # now this gets a bit messier, we need to handle prompts,
+            # melody conditioning etc.
+            ref_wavs = [attr.wav['self_wav'] for attr in attributes]
+            all_tokens = []
+            if prompt_tokens is None:
+                prompt_length = 0
+            else:
+                all_tokens.append(prompt_tokens)
+                prompt_length = prompt_tokens.shape[-1]
+            stride_tokens = int(self.frame_rate * self.extend_stride)
+            while current_gen_offset + prompt_length < total_gen_len:
+                time_offset = current_gen_offset / self.frame_rate
+                chunk_duration = min(self.duration - time_offset, self.max_duration)
+                max_gen_len = int(chunk_duration * self.frame_rate)
+                for attr, ref_wav in zip(attributes, ref_wavs):
+                    wav_length = ref_wav.length.item()
+                    if wav_length == 0:
+                        continue
+                    # We will extend the wav periodically if it not long enough.
+                    # we have to do it here rather than in conditioners.py as otherwise
+                    # we wouldn't have the full wav.
+                    initial_position = int(time_offset * self.sample_rate)
+                    wav_target_length = int(self.max_duration * self.sample_rate)
+                    print(initial_position / self.sample_rate, wav_target_length / self.sample_rate)
+                    positions = torch.arange(initial_position,
+                                             initial_position + wav_target_length, device=self.device)
+                    attr.wav['self_wav'] = WavCondition(
+                        ref_wav[0][:, positions % wav_length],
+                        torch.full_like(ref_wav[1], wav_target_length))
+                with self.autocast:
+                    gen_tokens = self.lm.generate(
+                        prompt_tokens, attributes,
+                        callback=callback, max_gen_len=max_gen_len, **self.generation_params)
+                if prompt_tokens is None:
+                    all_tokens.append(gen_tokens)
+                else:
+                    all_tokens.append(gen_tokens[:, :, prompt_tokens.shape[-1]:])
+                prompt_tokens = gen_tokens[:, :, stride_tokens:]
+                prompt_length = prompt_tokens.shape[-1]
+                current_gen_offset += stride_tokens
+            gen_tokens = torch.cat(all_tokens, dim=-1)
         # generate audio
         assert gen_tokens.dim() == 3

audiocraft/modules/transformer.py CHANGED Viewed

@@ -25,6 +25,22 @@ from xformers import ops
 from .rope import RotaryEmbedding
 from .streaming import StreamingModule
 def _is_profiled() -> bool:
     # Return true if we are currently running with a xformers profiler activated.
@@ -75,14 +91,22 @@ def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float =
 def expand_repeated_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
     """torch.repeat_interleave(x, dim=2, repeats=n_rep) from xlformers"""
-    bs, slen, n_kv_heads, head_dim = x.shape
     if n_rep == 1:
         return x
-    return (
-        x[:, :, :, None, :]
-        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
-        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
-    )
 class LayerScale(nn.Module):
@@ -210,6 +234,7 @@ class StreamingMultiheadAttention(StreamingModule):
         # Return a causal mask, accounting for potentially stored past keys/values
         # We actually return a bias for the attention score, as this has the same
         # convention both in the builtin MHA in Pytorch, and Xformers functions.
         if self.memory_efficient:
             from xformers.ops import LowerTriangularMask
             if current_steps == 1:
@@ -222,7 +247,7 @@ class StreamingMultiheadAttention(StreamingModule):
                 return LowerTriangularMask()
         if self._streaming_state:
             past_keys = self._streaming_state['past_keys']
-            past_steps = past_keys.shape[1]
         else:
             past_steps = 0
@@ -239,6 +264,7 @@ class StreamingMultiheadAttention(StreamingModule):
             torch.full([], float('-inf'), device=device, dtype=dtype))
     def _complete_kv(self, k, v):
         if self.cross_attention:
             # With cross attention we assume all keys and values
             # are already available, and streaming is with respect
@@ -247,20 +273,20 @@ class StreamingMultiheadAttention(StreamingModule):
         # Complete the key/value pair using the streaming state.
         if self._streaming_state:
             pk = self._streaming_state['past_keys']
-            nk = torch.cat([pk, k], dim=2)
             if v is k:
                 nv = nk
             else:
                 pv = self._streaming_state['past_values']
-                nv = torch.cat([pv, v], dim=2)
         else:
             nk = k
             nv = v
-        assert nk.shape[2] == nv.shape[2]
         offset = 0
         if self.past_context is not None:
-            offset = max(0, nk.shape[2] - self.past_context)
         if self._is_streaming:
             self._streaming_state['past_keys'] = nk[:, offset:]
             if v is not k:
@@ -271,8 +297,9 @@ class StreamingMultiheadAttention(StreamingModule):
                 self._streaming_state['offset'] = torch.tensor(0)
         return nk, nv
     def _apply_rope(self, query: torch.Tensor, key: torch.Tensor):
         # Apply rope embeddings to query and key tensors.
         assert self.rope is not None
         if 'past_keys' in self._streaming_state:
@@ -293,6 +320,11 @@ class StreamingMultiheadAttention(StreamingModule):
         assert not is_causal, ("new param added in torch 2.0.1 not supported, "
                                "use the causal args in the constructor.")
         dtype = query.dtype
         if self._is_streaming:
             assert self.causal or self.cross_attention, \
@@ -325,8 +357,7 @@ class StreamingMultiheadAttention(StreamingModule):
                 if self.qk_layer_norm is True:
                     q = self.q_layer_norm(q)
                     k = self.k_layer_norm(k)
-                # q, k, v = [rearrange(x, "b t (h d) -> (b h) t d", h=self.num_heads) for x in [q, k, v]]
-                q, k, v = [rearrange(x, "b t (h d) -> b h t d", h=self.num_heads) for x in [q, k, v]]
             else:
                 if not _is_profiled():
                     # profiling breaks that property somehow.
@@ -334,7 +365,11 @@ class StreamingMultiheadAttention(StreamingModule):
                     assert value is key, "specialized implementation"
                 projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
                 if self.kv_repeat == 1:
-                    packed = rearrange(projected, "b t (p h d) -> b h p t d", p=3, h=self.num_heads)
                     q, k, v = ops.unbind(packed, dim=2)
                 else:
                     embed_dim = self.embed_dim
@@ -345,18 +380,17 @@ class StreamingMultiheadAttention(StreamingModule):
                     end = start + per_head_dim * kv_heads
                     k = projected[:, :, start: end]
                     v = projected[:, :, end:]
-                    q = rearrange(q, "b t (h d) -> b t h d", h=self.num_heads)
-                    k = rearrange(k, "b t (h d) -> b t h d", h=kv_heads)
-                    v = rearrange(v, "b t (h d) -> b t h d", h=kv_heads)
                 if self.qk_layer_norm is True:
                     assert self.kv_repeat == 1
-                    q, k = [rearrange(x, "b t h d -> b t (h d)") for x in [q, k]]
                     q = self.q_layer_norm(q)
                     k = self.k_layer_norm(k)
-                    q, k = [rearrange(x, "b t (h d) -> b t h d", h=self.num_heads) for x in [q, k]]
                 if self.rope:
-                    assert False, "Not supported for now"
                     q, k = self._apply_rope(q, k)
                 k, v = self._complete_kv(k, v)
                 if self.kv_repeat > 1:
@@ -366,8 +400,11 @@ class StreamingMultiheadAttention(StreamingModule):
                 q, k, v = [x.float() for x in [q, k, v]]
             if self.memory_efficient:
                 p = self.dropout if self.training else 0
-                x = torch.nn.functional.scaled_dot_product_attention(
-                    q, k, v, is_causal=attn_mask is not None, dropout_p=p)
             else:
                 # We include the dot product as float32, for consistency
                 # with the other implementations that include that step
@@ -377,18 +414,21 @@ class StreamingMultiheadAttention(StreamingModule):
                 # extend a bit the range of operations done in float32,
                 # although this should make no difference.
                 q = q / q.shape[-1] ** 0.5
                 if self._is_streaming and self.safe_streaming and q.device.type == 'cuda':
                     with torch.autocast(device_type=q.device.type, dtype=torch.float32):
-                        pre_w = torch.einsum("bqhc,bkhc->bhqk", q, k)
                 else:
-                    pre_w = torch.einsum("bqhc,bkhc->bhqk", q, k)
                 if attn_mask is not None:
                     pre_w = pre_w + attn_mask
                 w = torch.softmax(pre_w, dim=-1)
                 w = F.dropout(w, self.dropout, training=self.training).to(v)
-                x = torch.einsum("bhqk,bkhc->bqhc", w, v)
             x = x.to(dtype)
-            x = rearrange(x, "b h t d -> b t (h d)", h=self.num_heads)
             x = self.out_proj(x)
         else:
             key, value = self._complete_kv(key, value)

 from .rope import RotaryEmbedding
 from .streaming import StreamingModule
# Name of the implementation used by the memory efficient attention code paths,
# either 'torch' or 'xformers'.
_efficient_attention_backend: str = 'torch'


def set_efficient_attention_backend(backend: str = 'torch'):
    """Select the implementation used for memory efficient attention.

    Args:
        backend (str): Either 'torch' or 'xformers'.

    Raises:
        AssertionError: If `backend` is not one of the supported names. In that
            case the previously selected backend is left untouched.
    """
    # Using torch by default, it seems a bit faster on older P100 GPUs (~20% faster).
    global _efficient_attention_backend
    # Validate the requested value, not the one currently stored: otherwise an
    # unsupported name would be accepted here and only rejected by the next call.
    assert backend in ['xformers', 'torch'], f"Unsupported attention backend: {backend!r}"
    _efficient_attention_backend = backend
def _get_attention_time_dimension() -> int:
    """Return the index of the time axis for the current attention backend.

    The 'torch' backend gives 2, any other backend gives 1.
    """
    return 2 if _efficient_attention_backend == 'torch' else 1
 def _is_profiled() -> bool:
     # Return true if we are currently running with a xformers profiler activated.
 def expand_repeated_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
     """torch.repeat_interleave(x, dim=2, repeats=n_rep) from xlformers"""
     if n_rep == 1:
         return x
+    if _efficient_attention_backend == 'torch':
+        bs, n_kv_heads, slen, head_dim = x.shape
+        return (
+            x[:, :, None, :, :]
+            .expand(bs, n_kv_heads, n_rep, slen, head_dim)
+            .reshape(bs, n_kv_heads * n_rep, slen, head_dim)
+        )
+    else:
+        bs, slen, n_kv_heads, head_dim = x.shape
+        return (
+            x[:, :, :, None, :]
+            .expand(bs, slen, n_kv_heads, n_rep, head_dim)
+            .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
+        )
 class LayerScale(nn.Module):
         # Return a causal mask, accounting for potentially stored past keys/values
         # We actually return a bias for the attention score, as this has the same
         # convention both in the builtin MHA in Pytorch, and Xformers functions.
+        time_dim = _get_attention_time_dimension()
         if self.memory_efficient:
             from xformers.ops import LowerTriangularMask
             if current_steps == 1:
                 return LowerTriangularMask()
         if self._streaming_state:
             past_keys = self._streaming_state['past_keys']
+            past_steps = past_keys.shape[time_dim]
         else:
             past_steps = 0
             torch.full([], float('-inf'), device=device, dtype=dtype))
     def _complete_kv(self, k, v):
+        time_dim = _get_attention_time_dimension()
         if self.cross_attention:
             # With cross attention we assume all keys and values
             # are already available, and streaming is with respect
         # Complete the key/value pair using the streaming state.
         if self._streaming_state:
             pk = self._streaming_state['past_keys']
+            nk = torch.cat([pk, k], dim=time_dim)
             if v is k:
                 nv = nk
             else:
                 pv = self._streaming_state['past_values']
+                nv = torch.cat([pv, v], dim=time_dim)
         else:
             nk = k
             nv = v
+        assert nk.shape[time_dim] == nv.shape[time_dim]
         offset = 0
         if self.past_context is not None:
+            offset = max(0, nk.shape[time_dim] - self.past_context)
         if self._is_streaming:
             self._streaming_state['past_keys'] = nk[:, offset:]
             if v is not k:
                 self._streaming_state['offset'] = torch.tensor(0)
         return nk, nv
     def _apply_rope(self, query: torch.Tensor, key: torch.Tensor):
+        # TODO: fix and verify layout.
+        assert _efficient_attention_backend == 'xformers', 'Rope not supported with torch attn.'
         # Apply rope embeddings to query and key tensors.
         assert self.rope is not None
         if 'past_keys' in self._streaming_state:
         assert not is_causal, ("new param added in torch 2.0.1 not supported, "
                                "use the causal args in the constructor.")
+        time_dim = _get_attention_time_dimension()
+        if time_dim == 2:
+            layout = "b h t d"
+        else:
+            layout = "b t h d"
         dtype = query.dtype
         if self._is_streaming:
             assert self.causal or self.cross_attention, \
                 if self.qk_layer_norm is True:
                     q = self.q_layer_norm(q)
                     k = self.k_layer_norm(k)
+                q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
             else:
                 if not _is_profiled():
                     # profiling breaks that property somehow.
                     assert value is key, "specialized implementation"
                 projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
                 if self.kv_repeat == 1:
+                    if time_dim == 2:
+                        bound_layout = "b h p t d"
+                    else:
+                        bound_layout = "b t p h d"
+                    packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
                     q, k, v = ops.unbind(packed, dim=2)
                 else:
                     embed_dim = self.embed_dim
                     end = start + per_head_dim * kv_heads
                     k = projected[:, :, start: end]
                     v = projected[:, :, end:]
+                    q = rearrange(q, f"b t (h d) -> {layout}", h=self.num_heads)
+                    k = rearrange(k, f"b t (h d) -> {layout}", h=kv_heads)
+                    v = rearrange(v, f"b t (h d) -> {layout}", h=kv_heads)
                 if self.qk_layer_norm is True:
                     assert self.kv_repeat == 1
+                    q, k = [rearrange(x, f"{layout} -> b t (h d)") for x in [q, k]]
                     q = self.q_layer_norm(q)
                     k = self.k_layer_norm(k)
+                    q, k = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k]]
                 if self.rope:
                     q, k = self._apply_rope(q, k)
                 k, v = self._complete_kv(k, v)
                 if self.kv_repeat > 1:
                 q, k, v = [x.float() for x in [q, k, v]]
             if self.memory_efficient:
                 p = self.dropout if self.training else 0
+                if _efficient_attention_backend == 'torch':
+                    x = torch.nn.functional.scaled_dot_product_attention(
+                        q, k, v, is_causal=attn_mask is not None, dropout_p=p)
+                else:
+                    x = ops.memory_efficient_attention(q, k, v, attn_mask, p=p)
             else:
                 # We include the dot product as float32, for consistency
                 # with the other implementations that include that step
                 # extend a bit the range of operations done in float32,
                 # although this should make no difference.
                 q = q / q.shape[-1] ** 0.5
+                key_layout = layout.replace('t', 'k')
+                query_layout = layout
                 if self._is_streaming and self.safe_streaming and q.device.type == 'cuda':
                     with torch.autocast(device_type=q.device.type, dtype=torch.float32):
+                        pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
                 else:
+                    pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
                 if attn_mask is not None:
                     pre_w = pre_w + attn_mask
                 w = torch.softmax(pre_w, dim=-1)
                 w = F.dropout(w, self.dropout, training=self.training).to(v)
+                # Key and value have the same format.
+                x = torch.einsum(f"b h t k, {key_layout} -> {layout}", w, v)
             x = x.to(dtype)
+            x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
             x = self.out_proj(x)
         else:
             key, value = self._complete_kv(key, value)

tests/models/test_musicgen.py CHANGED Viewed

@@ -13,7 +13,7 @@ from audiocraft.models import MusicGen
 class TestSEANetModel:
     def get_musicgen(self):
         mg = MusicGen.get_pretrained(name='debug', device='cpu')
-        mg.set_generation_params(duration=2.0)
         return mg
     def test_base(self):
@@ -48,3 +48,11 @@ class TestSEANetModel:
         wav = mg.generate(
             ['youpi', 'lapin dort'])
         assert list(wav.shape) == [2, 1, 64000]

 class TestSEANetModel:
     def get_musicgen(self):
         mg = MusicGen.get_pretrained(name='debug', device='cpu')
+        mg.set_generation_params(duration=2.0, extend_stride=2.)
         return mg
     def test_base(self):
         wav = mg.generate(
             ['youpi', 'lapin dort'])
         assert list(wav.shape) == [2, 1, 64000]
+    def test_generate_long(self):
+        mg = self.get_musicgen()
+        mg.max_duration = 3.
+        mg.set_generation_params(duration=4., extend_stride=2.)
+        wav = mg.generate(
+            ['youpi', 'lapin dort'])
+        assert list(wav.shape) == [2, 1, 32000 * 4]

tests/modules/test_rope.py CHANGED Viewed

@@ -7,10 +7,11 @@
 import torch
 from audiocraft.modules.rope import RotaryEmbedding
-from audiocraft.modules.transformer import StreamingTransformer
 def test_rope():
     B, T, H, C = 8, 75, 16, 128
     rope = RotaryEmbedding(dim=C)
@@ -23,6 +24,7 @@ def test_rope():
 def test_rope_io_dtypes():
     B, T, H, C = 8, 75, 16, 128
     rope_32 = RotaryEmbedding(dim=C, dtype=torch.float32)
@@ -46,6 +48,7 @@ def test_rope_io_dtypes():
 def test_transformer_with_rope():
     torch.manual_seed(1234)
     for pos in ['rope', 'sin_rope']:
         tr = StreamingTransformer(
@@ -61,6 +64,7 @@ def test_transformer_with_rope():
 @torch.no_grad()
 def test_rope_streaming():
     torch.manual_seed(1234)
     tr = StreamingTransformer(
         16, 4, 2, causal=True, dropout=0.,
@@ -88,6 +92,7 @@ def test_rope_streaming():
 @torch.no_grad()
 def test_rope_streaming_past_context():
     torch.manual_seed(1234)
     for context in [None, 10]:
@@ -117,6 +122,7 @@ def test_rope_streaming_past_context():
 def test_rope_memory_efficient():
     torch.manual_seed(1234)
     tr = StreamingTransformer(
         16, 4, 2, custom=True, dropout=0., layer_scale=0.1,
@@ -137,6 +143,7 @@ def test_rope_memory_efficient():
 def test_rope_with_xpos():
     B, T, H, C = 8, 75, 16, 128
     rope = RotaryEmbedding(dim=C, xpos=True)
@@ -149,6 +156,7 @@ def test_rope_with_xpos():
 def test_positional_scale():
     B, T, H, C = 8, 75, 16, 128
     rope = RotaryEmbedding(dim=C, xpos=True, scale=0.0)

 import torch
 from audiocraft.modules.rope import RotaryEmbedding
+from audiocraft.modules.transformer import StreamingTransformer, set_efficient_attention_backend
 def test_rope():
+    set_efficient_attention_backend('xformers')
     B, T, H, C = 8, 75, 16, 128
     rope = RotaryEmbedding(dim=C)
 def test_rope_io_dtypes():
+    set_efficient_attention_backend('xformers')
     B, T, H, C = 8, 75, 16, 128
     rope_32 = RotaryEmbedding(dim=C, dtype=torch.float32)
 def test_transformer_with_rope():
+    set_efficient_attention_backend('xformers')
     torch.manual_seed(1234)
     for pos in ['rope', 'sin_rope']:
         tr = StreamingTransformer(
 @torch.no_grad()
 def test_rope_streaming():
+    set_efficient_attention_backend('xformers')
     torch.manual_seed(1234)
     tr = StreamingTransformer(
         16, 4, 2, causal=True, dropout=0.,
 @torch.no_grad()
 def test_rope_streaming_past_context():
+    set_efficient_attention_backend('xformers')
     torch.manual_seed(1234)
     for context in [None, 10]:
 def test_rope_memory_efficient():
+    set_efficient_attention_backend('xformers')
     torch.manual_seed(1234)
     tr = StreamingTransformer(
         16, 4, 2, custom=True, dropout=0., layer_scale=0.1,
 def test_rope_with_xpos():
+    set_efficient_attention_backend('xformers')
     B, T, H, C = 8, 75, 16, 128
     rope = RotaryEmbedding(dim=C, xpos=True)
 def test_positional_scale():
+    set_efficient_attention_backend('xformers')
     B, T, H, C = 8, 75, 16, 128
     rope = RotaryEmbedding(dim=C, xpos=True, scale=0.0)

tests/modules/test_transformer.py CHANGED Viewed

@@ -9,7 +9,8 @@ from itertools import product
 import pytest
 import torch
-from audiocraft.modules.transformer import StreamingMultiheadAttention, StreamingTransformer
 def test_transformer_causal_streaming():
@@ -86,19 +87,22 @@ def test_streaming_api():
 def test_memory_efficient():
     torch.manual_seed(1234)
-    tr = StreamingTransformer(
-        16, 4, 2, custom=True, dropout=0., layer_scale=0.1)
-    tr_mem_efficient = StreamingTransformer(
-        16, 4, 2, dropout=0., memory_efficient=True, layer_scale=0.1)
-    tr_mem_efficient.load_state_dict(tr.state_dict())
-    tr.eval()
-    steps = 12
-    x = torch.randn(3, steps, 16)
-    with torch.no_grad():
-        y = tr(x)
-        y2 = tr_mem_efficient(x)
-        assert torch.allclose(y, y2), (y - y2).norm()
 def test_attention_as_float32():
@@ -129,30 +133,32 @@ def test_attention_as_float32():
 @torch.no_grad()
 def test_streaming_memory_efficient():
     torch.manual_seed(1234)
-    tr = StreamingTransformer(16, 4, 2, causal=True, dropout=0., custom=True)
-    tr_mem_efficient = StreamingTransformer(
-        16, 4, 2, dropout=0., memory_efficient=True, causal=True)
-    tr.load_state_dict(tr_mem_efficient.state_dict())
-    tr.eval()
-    tr_mem_efficient.eval()
-    steps = 12
-    x = torch.randn(3, steps, 16)
-    ref = tr(x)
-    with tr_mem_efficient.streaming():
-        outs = []
-        # frame_sizes = [2] + [1] * (steps - 2)
-        frame_sizes = [1] * steps
-        for frame_size in frame_sizes:
-            frame = x[:, :frame_size]
-            x = x[:, frame_size:]
-            outs.append(tr_mem_efficient(frame))
-    out = torch.cat(outs, dim=1)
-    delta = torch.norm(out - ref) / torch.norm(out)
-    assert delta < 1e-6, delta
 def test_cross_attention():
@@ -204,7 +210,7 @@ def test_cross_attention_compat():
     y = cross_attn(queries, keys, values)[0]
     y_ref = ref_attn(queries, keys, values)[0]
-    assert torch.allclose(y, y_ref, atol=1e-7)
     # Now let's check that streaming is working properly.
     with cross_attn.streaming():

 import pytest
 import torch
+from audiocraft.modules.transformer import (
+    StreamingMultiheadAttention, StreamingTransformer, set_efficient_attention_backend)
 def test_transformer_causal_streaming():
 def test_memory_efficient():
     torch.manual_seed(1234)
+    for backend in ['torch', 'xformers']:
+        set_efficient_attention_backend(backend)
+        tr = StreamingTransformer(
+            16, 4, 2, custom=True, dropout=0., layer_scale=0.1)
+        tr_mem_efficient = StreamingTransformer(
+            16, 4, 2, dropout=0., memory_efficient=True, layer_scale=0.1)
+        tr_mem_efficient.load_state_dict(tr.state_dict())
+        tr.eval()
+        steps = 12
+        x = torch.randn(3, steps, 16)
+        with torch.no_grad():
+            y = tr(x)
+            y2 = tr_mem_efficient(x)
+            assert torch.allclose(y, y2), ((y - y2).norm(), backend)
 def test_attention_as_float32():
 @torch.no_grad()
 def test_streaming_memory_efficient():
     torch.manual_seed(1234)
+    for backend in ['torch', 'xformers']:
+        set_efficient_attention_backend(backend)
+        tr = StreamingTransformer(16, 4, 2, causal=True, dropout=0., custom=True)
+        tr_mem_efficient = StreamingTransformer(
+            16, 4, 2, dropout=0., memory_efficient=True, causal=True)
+        tr.load_state_dict(tr_mem_efficient.state_dict())
+        tr.eval()
+        tr_mem_efficient.eval()
+        steps = 12
+        x = torch.randn(3, steps, 16)
+        ref = tr(x)
+        with tr_mem_efficient.streaming():
+            outs = []
+            # frame_sizes = [2] + [1] * (steps - 2)
+            frame_sizes = [1] * steps
+            for frame_size in frame_sizes:
+                frame = x[:, :frame_size]
+                x = x[:, frame_size:]
+                outs.append(tr_mem_efficient(frame))
+        out = torch.cat(outs, dim=1)
+        delta = torch.norm(out - ref) / torch.norm(out)
+        assert delta < 1e-6, delta
 def test_cross_attention():
     y = cross_attn(queries, keys, values)[0]
     y_ref = ref_attn(queries, keys, values)[0]
+    assert torch.allclose(y, y_ref, atol=1e-7), (y - y_ref).norm() / y_ref.norm()
     # Now let's check that streaming is working properly.
     with cross_attn.streaming():