add streamlit app and tidy
- .gitignore +2 -0
- README.md +5 -5
- app.py +4 -17
- src/audio_to_images.py → audio_to_images.py +22 -32
- audiodiffusion/__init__.py +40 -0
- {src → audiodiffusion}/mel.py +48 -8
- notebooks/test-mel.ipynb +3 -3
- notebooks/test-model.ipynb +0 -0
- requirements-lock.txt +51 -0
- requirements.txt +2 -2
- setup.cfg +19 -0
- setup.py +6 -0
- streamlit_app.py +22 -0
- src/train_unconditional.py → train_unconditional.py +34 -37
.gitignore
CHANGED
@@ -4,3 +4,5 @@ __pycache__
 data*
 ddpm-ema-audio-*
 flagged
+build
+audiodiffusion.egg-info
README.md
CHANGED
@@ -28,7 +28,7 @@ You can play around with the model I trained on about 500 songs from my Spotify
 #### Training can be run with Mel spectrograms of resolution 64x64 on a single commercial grade GPU (e.g. RTX 2080 Ti). The `hop_length` should be set to 1024 for better results.
 
 ```bash
-python src/audio_to_images.py \
+python audiodiffusion/audio_to_images.py \
 --resolution 64 \
 --hop_length 1024 \
 --input_dir path-to-audio-files \
@@ -38,7 +38,7 @@ python src/audio_to_images.py \
 #### Generate dataset of 256x256 Mel spectrograms and push to hub (you will need to be authenticated with `huggingface-cli login`).
 
 ```bash
-python src/audio_to_images.py \
+python audiodiffusion/audio_to_images.py \
 --resolution 256 \
 --input_dir path-to-audio-files \
 --output_dir data-256 \
@@ -49,7 +49,7 @@ python src/audio_to_images.py \
 
 ```bash
 accelerate launch --config_file accelerate_local.yaml \
-
+audiodiffusion/train_unconditional.py \
 --dataset_name data-64 \
 --resolution 64 \
 --hop_length 1024 \
@@ -66,7 +66,7 @@ accelerate launch --config_file accelerate_local.yaml \
 
 ```bash
 accelerate launch --config_file accelerate_local.yaml \
-
+audiodiffusion/train_unconditional.py \
 --dataset_name teticio/audio-diffusion-256 \
 --resolution 256 \
 --output_dir ddpm-ema-audio-256 \
@@ -86,7 +86,7 @@ accelerate launch --config_file accelerate_local.yaml \
 
 ```bash
 accelerate launch --config_file accelerate_sagemaker.yaml \
-
+audiodiffusion/train_unconditional.py \
 --dataset_name teticio/audio-diffusion-256 \
 --resolution 256 \
 --output_dir ddpm-ema-audio-256 \
app.py
CHANGED
@@ -1,23 +1,10 @@
 import argparse
 
 import gradio as gr
-from PIL import Image
-from diffusers import DDPMPipeline
 
-from mel import Mel
-
-mel = Mel(x_res=256, y_res=256)
-model_id = "teticio/audio-diffusion-256"
-ddpm = DDPMPipeline.from_pretrained(model_id)
-
-
-def generate_spectrogram_and_audio():
-    images = ddpm(output_type="numpy")["sample"]
-    images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
-    image = Image.fromarray(images[0][0])
-    audio = mel.image_to_audio(image)
-    return image, (mel.get_sample_rate(), audio)
+from audiodiffusion import AudioDiffusion
+
+audio_diffusion = AudioDiffusion()
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -26,9 +13,9 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     demo = gr.Interface(
-        fn=generate_spectrogram_and_audio,
+        fn=audio_diffusion.generate_spectrogram_and_audio,
         title="Audio Diffusion",
-        description=
+        description="Generate audio using Huggingface diffusers.\
             This takes about 20 minutes without a GPU, so why not make yourself a cup of tea in the meantime?",
         inputs=[],
         outputs=[
src/audio_to_images.py → audio_to_images.py
RENAMED
@@ -1,8 +1,3 @@
-# TODO
-# run on sagemaker
-# run with deepspeed
-
-
 import os
 import re
 import io
@@ -12,17 +7,17 @@ import pandas as pd
 from tqdm.auto import tqdm
 from datasets import Dataset, DatasetDict, Features, Image, Value
 
-from mel import Mel
+from audiodiffusion.mel import Mel
 
 
 def main(args):
-    mel = Mel(x_res=args.resolution,
+    mel = Mel(x_res=args.resolution,
+              y_res=args.resolution,
+              hop_length=args.hop_length)
     os.makedirs(args.output_dir, exist_ok=True)
     audio_files = [
-        os.path.join(root, file)
-        for
-        for file in files
-        if re.search("\.(mp3|wav|m4a)$", file, re.IGNORECASE)
+        os.path.join(root, file) for root, _, files in os.walk(args.input_dir)
+        for file in files if re.search("\.(mp3|wav|m4a)$", file, re.IGNORECASE)
     ]
     examples = []
     try:
@@ -35,31 +30,26 @@ def main(args):
             continue
         for slice in range(mel.get_number_of_slices()):
             image = mel.audio_slice_to_image(slice)
-            assert (
-
-            )
+            assert (image.width == args.resolution
+                    and image.height == args.resolution)
             with io.BytesIO() as output:
                 image.save(output, format="PNG")
                 bytes = output.getvalue()
-            examples.extend(
-
-
-
-
-
-                ]
-            )
+            examples.extend([{
+                "image": {
+                    "bytes": bytes
+                },
+                "audio_file": audio_file,
+                "slice": slice,
+            }])
     finally:
         ds = Dataset.from_pandas(
             pd.DataFrame(examples),
-            features=Features(
-
-
-
-                }
-            ),
+            features=Features({
+                "image": Image(),
+                "audio_file": Value(dtype="string"),
+                "slice": Value(dtype="int16"),
+            }),
         )
         dsd = DatasetDict({"train": ds})
         dsd.save_to_disk(os.path.join(args.output_dir))
@@ -69,8 +59,8 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description=
-
+        description=
+        "Create dataset of Mel spectrograms from directory of audio files.")
     parser.add_argument("--input_dir", type=str)
     parser.add_argument("--output_dir", type=str, default="data")
     parser.add_argument("--resolution", type=int, default=256)
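The script saves a `DatasetDict` with a single `train` split, so the result can presumably be read back like this (the directory name is whatever `--output_dir` was set to):

```python
from datasets import load_from_disk

ds = load_from_disk("data-256")["train"]  # illustrative path

example = ds[0]
example["image"]       # PIL image of one Mel spectrogram slice
example["audio_file"]  # path of the source audio file
example["slice"]       # index of the slice within that file
```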
audiodiffusion/__init__.py
ADDED
@@ -0,0 +1,40 @@
+from PIL import Image
+from torch import cuda
+from diffusers import DDPMPipeline
+
+from .mel import Mel
+
+VERSION = "1.0.1"
+
+
+class AudioDiffusion:
+
+    def __init__(self,
+                 model_id="teticio/audio-diffusion-256",
+                 resolution=256,
+                 cuda=cuda.is_available()):
+        """Class for generating audio using Denoising Diffusion Probabilistic Models.
+
+        Args:
+            model_id (String): name of model (local directory or Hugging Face Hub)
+            resolution (int): size of square mel spectrogram in pixels
+            cuda (bool): use CUDA?
+        """
+        self.mel = Mel(x_res=resolution, y_res=resolution)
+        self.model_id = model_id
+        self.ddpm = DDPMPipeline.from_pretrained(self.model_id)
+        if cuda:
+            self.ddpm.to("cuda")
+
+    def generate_spectrogram_and_audio(self):
+        """Generate random mel spectrogram and convert to audio.
+
+        Returns:
+            PIL Image: mel spectrogram
+            (float, array): sample rate and raw audio
+        """
+        images = self.ddpm(output_type="numpy")["sample"]
+        images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
+        image = Image.fromarray(images[0][0])
+        audio = self.mel.image_to_audio(image)
+        return image, (self.mel.get_sample_rate(), audio)
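Outside the web apps, the new class is meant to be used roughly like this (a minimal sketch; the output file names are illustrative):

```python
import soundfile as sf

from audiodiffusion import AudioDiffusion

audio_diffusion = AudioDiffusion()  # downloads teticio/audio-diffusion-256 by default
image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio()

image.save("spectrogram.png")               # the generated Mel spectrogram
sf.write("sample.wav", audio, sample_rate)  # the audio reconstructed from it
```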
{src → audiodiffusion}/mel.py
RENAMED
@@ -1,4 +1,5 @@
 import warnings
+
 warnings.filterwarnings('ignore')
 
 import librosa
@@ -7,6 +8,7 @@ from PIL import Image
 
 
 class Mel:
+
     def __init__(
         self,
         x_res=256,
@@ -16,6 +18,16 @@ class Mel:
         hop_length=512,
         top_db=80,
     ):
+        """Class to convert audio to mel spectrograms and vice versa.
+
+        Args:
+            x_res (int): x resolution of spectrogram (time)
+            y_res (int): y resolution of spectrogram (frequency bins)
+            sample_rate (int): sample rate of audio
+            n_fft (int): number of Fast Fourier Transforms
+            hop_length (int): hop length (a higher number is recommended for lower than 256 y_res)
+            top_db (int): loudest in decibels
+        """
         self.x_res = x_res
         self.y_res = y_res
         self.sr = sample_rate
@@ -28,17 +40,40 @@ class Mel:
         self.y = None
 
     def load_audio(self, audio_file):
+        """Load audio.
+
+        Args:
+            file (str): must be a file on disk due to Librosa limitation
+        """
         self.y, _ = librosa.load(audio_file, mono=True)
 
     def get_number_of_slices(self):
+        """Get number of slices in audio.
+
+        Returns:
+            int: number of spectograms audio can be sliced into
+        """
         return len(self.y) // self.slice_size
 
     def get_sample_rate(self):
+        """Get sample rate:
+
+        Returns:
+            int: sample rate of audio
+        """
         return self.sr
 
     def audio_slice_to_image(self, slice):
+        """Convert slice of audio to spectrogram.
+
+        Args:
+            slice (int): slice number of audio to convert (out of get_number_of_slices())
+
+        Returns:
+            PIL Image: grayscale image of x_res x y_res
+        """
         S = librosa.feature.melspectrogram(
-            y=self.y[self.slice_size * slice
+            y=self.y[self.slice_size * slice:self.slice_size * (slice + 1)],
             sr=self.sr,
             n_fft=self.n_fft,
             hop_length=self.hop_length,
@@ -46,19 +81,24 @@ class Mel:
             fmax=self.fmax,
         )
         log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
-        bytedata = (
-
-        ).astype(np.uint8)
+        bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) +
+                    0.5).astype(np.uint8)
         image = Image.frombytes("L", log_S.shape, bytedata.tobytes())
         return image
 
     def image_to_audio(self, image):
+        """Converts spectrogram to audio.
+
+        Args:
+            image (PIL Image): x_res x y_res grayscale image
+
+        Returns:
+            audio (array): raw audio
+        """
         bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape(
-            (image.width, image.height)
-        )
+            (image.width, image.height))
         log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
         S = librosa.db_to_power(log_S)
         audio = librosa.feature.inverse.mel_to_audio(
-            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length
-        )
+            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length)
         return audio
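For reference, a minimal round trip through the class as it stands after this commit (the audio path is illustrative):

```python
from audiodiffusion.mel import Mel

mel = Mel(x_res=256, y_res=256, hop_length=512)
mel.load_audio("some-track.mp3")      # illustrative path; must be a file on disk
print(mel.get_number_of_slices())     # number of 256x256 slices in the track
image = mel.audio_slice_to_image(0)   # grayscale PIL spectrogram of the first slice
audio = mel.image_to_audio(image)     # approximate audio reconstructed from the image
```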
notebooks/test-mel.ipynb
CHANGED
@@ -30,8 +30,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from
-    "from
+    "from IPython.display import Audio\n",
+    "from audiodiffusion.mel import Mel"
    ]
   },
   {
@@ -178,7 +178,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.
+   "version": "3.10.4"
  },
  "toc": {
   "base_numbering": 1,
notebooks/test-model.ipynb
CHANGED
The diff for this file is too large to render. See raw diff.
requirements-lock.txt
CHANGED
@@ -2,6 +2,9 @@ absl-py==1.2.0
 accelerate==0.12.0
 aiohttp==3.8.1
 aiosignal==1.2.0
+altair==4.2.0
+analytics-python==1.4.0
+anyio==3.6.1
 appdirs==1.4.4
 argon2-cffi==21.3.0
 argon2-cffi-bindings==21.2.0
@@ -10,12 +13,19 @@ async-timeout==4.0.2
 attrs==22.1.0
 audioread==3.0.0
 backcall==0.2.0
+backoff==1.10.0
+bcrypt==4.0.0
 beautifulsoup4==4.11.1
 bleach==5.0.1
+blinker==1.5
 cachetools==5.2.0
 certifi==2022.6.15
 cffi==1.15.1
 charset-normalizer==2.1.1
+click==8.1.3
+commonmark==0.9.1
+cryptography==37.0.4
+cycler==0.11.0
 datasets==2.4.0
 debugpy==1.6.3
 decorator==5.1.1
@@ -24,14 +34,23 @@ diffusers==0.2.4
 dill==0.3.5.1
 entrypoints==0.4
 executing==0.10.0
+fastapi==0.81.0
 fastjsonschema==2.16.1
+ffmpy==0.3.0
 filelock==3.8.0
+fonttools==4.37.1
 frozenlist==1.3.1
 fsspec==2022.7.1
 ftfy==6.1.1
+gitdb==4.0.9
+GitPython==3.1.27
 google-auth==2.11.0
 google-auth-oauthlib==0.4.6
+gradio==3.1.7
 grpcio==1.47.0
+h11==0.12.0
+httpcore==0.15.0
+httpx==0.23.0
 huggingface-hub==0.9.0
 idna==3.3
 importlib-metadata==4.12.0
@@ -48,13 +67,20 @@ jupyter-console==6.4.4
 jupyter-core==4.11.1
 jupyterlab-pygments==0.2.2
 jupyterlab-widgets==3.0.2
+kiwisolver==1.4.4
 librosa==0.9.2
+linkify-it-py==1.0.3
 llvmlite==0.39.0
 lxml==4.9.1
 Markdown==3.4.1
+markdown-it-py==2.1.0
 MarkupSafe==2.1.1
+matplotlib==3.5.3
 matplotlib-inline==0.1.6
+mdit-py-plugins==0.3.0
+mdurl==0.1.2
 mistune==2.0.4
+monotonic==1.6
 multidict==6.0.2
 multiprocess==0.70.13
 nbclient==0.6.7
@@ -65,9 +91,11 @@ notebook==6.4.12
 numba==0.56.0
 numpy==1.22.4
 oauthlib==3.2.0
+orjson==3.8.0
 packaging==21.3
 pandas==1.4.3
 pandocfilters==1.5.0
+paramiko==2.11.0
 parso==0.8.3
 pexpect==4.8.0
 pickleshare==0.7.5
@@ -83,11 +111,19 @@ pyarrow==9.0.0
 pyasn1==0.4.8
 pyasn1-modules==0.2.8
 pycparser==2.21
+pycryptodome==3.15.0
+pydantic==1.9.2
+pydeck==0.8.0b1
+pydub==0.25.1
 Pygments==2.13.0
+Pympler==1.0.1
+PyNaCl==1.5.0
 pyparsing==3.0.9
 pyrsistent==0.18.1
 python-dateutil==2.8.2
+python-multipart==0.0.5
 pytz==2022.2.1
+pytz-deprecation-shim==0.1.0.post0
 PyYAML==6.0
 pyzmq==23.2.1
 qtconsole==5.3.1
@@ -97,14 +133,21 @@ requests==2.28.1
 requests-oauthlib==1.3.1
 resampy==0.4.0
 responses==0.18.0
+rfc3986==1.5.0
+rich==12.5.1
 rsa==4.9
 scikit-learn==1.1.2
 scipy==1.9.0
+semver==2.13.0
 Send2Trash==1.8.0
 six==1.16.0
+smmap==5.0.0
+sniffio==1.2.0
 SoundFile==0.10.3.post1
 soupsieve==2.3.2.post1
 stack-data==0.4.0
+starlette==0.19.1
+streamlit==1.12.2
 tensorboard==2.10.0
 tensorboard-data-server==0.6.1
 tensorboard-plugin-wit==1.8.1
@@ -113,6 +156,7 @@ threadpoolctl==3.1.0
 tinycss2==1.1.1
 tokenizers==0.12.1
 toml==0.10.2
+toolz==0.12.0
 torch==1.12.1
 torchvision==0.13.1
 tornado==6.2
@@ -120,9 +164,16 @@ tqdm==4.64.0
 traitlets==5.3.0
 transformers==4.21.1
 typing_extensions==4.3.0
+tzdata==2022.2
+tzlocal==4.2
+uc-micro-py==1.0.1
 urllib3==1.26.12
+uvicorn==0.18.3
+validators==0.20.0
+watchdog==2.1.9
 wcwidth==0.2.5
 webencodings==0.5.1
+websockets==10.3
 Werkzeug==2.2.2
 widgetsnbextension==3.6.1
 xxhash==3.0.0
requirements.txt
CHANGED
@@ -1,8 +1,8 @@
-# for Hugging Face Spaces
 torch
 numpy
 Pillow
-diffusers
+diffusers>=0.2.4
 librosa
 datasets
 gradio
+streamlit
setup.cfg
ADDED
@@ -0,0 +1,19 @@
+[metadata]
+name = audiodiffusion
+version = attr: audiodiffusion.VERSION
+description = Generate Mel spectrogram dataset from directory of audio files.
+long_description = file: README.md
+license = GPL3
+classifiers =
+    Programming Language :: Python :: 3
+
+[options]
+zip_safe = False
+packages = audiodiffusion
+install_requires =
+    torch
+    numpy
+    Pillow
+    diffusers>=0.2.4
+    librosa
+    datasets
setup.py
ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+
+from setuptools import setup
+
+if __name__ == "__main__":
+    setup()
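With setup.cfg and setup.py in place, the package can presumably be installed in editable mode with `pip install -e .`, after which `from audiodiffusion import AudioDiffusion` works from anywhere rather than only from the repository root; `version = attr: audiodiffusion.VERSION` picks the version string up from `audiodiffusion/__init__.py`.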
streamlit_app.py
ADDED
@@ -0,0 +1,22 @@
+from io import BytesIO
+import streamlit as st
+import soundfile as sf
+from librosa.util import normalize
+
+from audiodiffusion import AudioDiffusion
+
+audio_diffusion = AudioDiffusion()
+
+if __name__ == "__main__":
+    st.header("Audio Diffusion")
+    st.markdown("Generate audio using Huggingface diffusers.\
+        This takes about 20 minutes without a GPU, so why not make yourself a cup of tea in the meantime?"
+                )
+    if st.button("Generate"):
+        st.markdown("Generating...")
+        image, (sample_rate,
+                audio) = audio_diffusion.generate_spectrogram_and_audio()
+        st.image(image, caption="Mel spectrogram")
+        buffer = BytesIO()
+        sf.write(buffer, normalize(audio), sample_rate, format="WAV")
+        st.audio(buffer, format="audio/wav")
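The Streamlit app is presumably launched with `streamlit run streamlit_app.py`. The one slightly non-obvious step is that the raw audio array is encoded to an in-memory WAV file so `st.audio` can play it without touching disk; in isolation that looks roughly like this (the silent one-second array is just a stand-in for the generated audio):

```python
from io import BytesIO

import numpy as np
import soundfile as sf

sample_rate = 22050
audio = np.zeros(sample_rate, dtype="float32")  # stand-in for the generated audio

buffer = BytesIO()
sf.write(buffer, audio, sample_rate, format="WAV")  # encode to WAV in memory
buffer.seek(0)  # rewind so the player reads from the start
```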
src/train_unconditional.py → train_unconditional.py
RENAMED
@@ -24,8 +24,9 @@ from torchvision.transforms import (
     ToTensor,
 )
 from tqdm.auto import tqdm
+from librosa.util import normalize
 
-from mel import Mel
+from audiodiffusion.mel import Mel
 
 logger = get_logger(__name__)
 
@@ -65,7 +66,8 @@ def main(args):
             "UpBlock2D",
         ),
     )
-    noise_scheduler = DDPMScheduler(num_train_timesteps=1000,
+    noise_scheduler = DDPMScheduler(num_train_timesteps=1000,
+                                    tensor_format="pt")
     optimizer = torch.optim.AdamW(
         model.parameters(),
         lr=args.learning_rate,
@@ -74,20 +76,17 @@ def main(args):
         eps=args.adam_epsilon,
     )
 
-    augmentations = Compose(
-
-
-
-
-
-        ]
-    )
+    augmentations = Compose([
+        Resize(args.resolution, interpolation=InterpolationMode.BILINEAR),
+        CenterCrop(args.resolution),
+        ToTensor(),
+        Normalize([0.5], [0.5]),
+    ])
 
     if args.dataset_name is not None:
         if os.path.exists(args.dataset_name):
-            dataset = load_from_disk(args.dataset_name,
-
-            ]
+            dataset = load_from_disk(args.dataset_name,
+                                     args.dataset_config_name)["train"]
         else:
             dataset = load_dataset(
                 args.dataset_name,
@@ -110,20 +109,18 @@ def main(args):
 
     dataset.set_transform(transforms)
     train_dataloader = torch.utils.data.DataLoader(
-        dataset, batch_size=args.train_batch_size, shuffle=True
-    )
+        dataset, batch_size=args.train_batch_size, shuffle=True)
 
     lr_scheduler = get_scheduler(
         args.lr_scheduler,
         optimizer=optimizer,
         num_warmup_steps=args.lr_warmup_steps,
-        num_training_steps=(len(train_dataloader) * args.num_epochs)
-
+        num_training_steps=(len(train_dataloader) * args.num_epochs) //
+        args.gradient_accumulation_steps,
     )
 
     model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-        model, optimizer, train_dataloader, lr_scheduler
-    )
+        model, optimizer, train_dataloader, lr_scheduler)
 
     ema_model = EMAModel(
         getattr(model, "module", model),
@@ -139,13 +136,14 @@ def main(args):
     run = os.path.split(__file__)[-1].split(".")[0]
     accelerator.init_trackers(run)
 
-    mel = Mel(x_res=args.resolution,
+    mel = Mel(x_res=args.resolution,
+              y_res=args.resolution,
+              hop_length=args.hop_length)
 
     global_step = 0
    for epoch in range(args.num_epochs):
-        progress_bar = tqdm(
-
-        )
+        progress_bar = tqdm(total=len(train_dataloader),
+                            disable=not accelerator.is_local_main_process)
         progress_bar.set_description(f"Epoch {epoch}")
 
         if epoch < args.start_epoch:
@@ -168,13 +166,14 @@ def main(args):
             timesteps = torch.randint(
                 0,
                 noise_scheduler.num_train_timesteps,
-                (bsz,),
+                (bsz, ),
                 device=clean_images.device,
             ).long()
 
             # Add noise to the clean images according to the noise magnitude at each timestep
             # (this is the forward diffusion process)
-            noisy_images = noise_scheduler.add_noise(clean_images, noise,
+            noisy_images = noise_scheduler.add_noise(clean_images, noise,
+                                                     timesteps)
 
             with accelerator.accumulate(model):
                 # Predict the noise residual
@@ -209,11 +208,10 @@ def main(args):
             if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
                 pipeline = DDPMPipeline(
                     unet=accelerator.unwrap_model(
-                        ema_model.averaged_model if args.use_ema else model
-                    ),
+                        ema_model.averaged_model if args.use_ema else model),
                     scheduler=noise_scheduler,
                 )
-
+
                 # save the model
                 if args.push_to_hub:
                     try:
@@ -238,17 +236,16 @@ def main(args):
                 )["sample"]
 
                 # denormalize the images and save to tensorboard
-                images_processed = (
-
-
+                images_processed = ((images *
+                                     255).round().astype("uint8").transpose(
+                                         0, 3, 1, 2))
                 accelerator.trackers[0].writer.add_images(
-                    "test_samples", images_processed, epoch
-                )
+                    "test_samples", images_processed, epoch)
                 for _, image in enumerate(images_processed):
                     audio = mel.image_to_audio(Image.fromarray(image[0]))
                     accelerator.trackers[0].writer.add_audio(
                         f"test_audio_{_}",
-                        audio,
+                        normalize(audio),
                         epoch,
                         sample_rate=mel.get_sample_rate(),
                     )
@@ -258,7 +255,8 @@ def main(args):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = argparse.ArgumentParser(
+        description="Simple example of a training script.")
     parser.add_argument("--local_rank", type=int, default=-1)
     parser.add_argument("--dataset_name", type=str, default=None)
     parser.add_argument("--dataset_config_name", type=str, default=None)
@@ -303,8 +301,7 @@ if __name__ == "__main__":
         help=(
             "Whether to use mixed precision. Choose"
             "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
-            "and an Nvidia Ampere GPU."
-        ),
+            "and an Nvidia Ampere GPU."),
    )
    parser.add_argument("--hop_length", type=int, default=512)
    parser.add_argument("--from_pretrained", type=str, default=None)
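One detail worth calling out in the training script is how the LR scheduler length is sized: dataloader steps times epochs, divided by the gradient accumulation factor, so it counts optimizer updates rather than raw batches. With hypothetical numbers:

```python
# Hypothetical values, just to illustrate how the scheduler length is computed.
steps_per_epoch = 1000               # len(train_dataloader)
num_epochs = 100
gradient_accumulation_steps = 2

# One optimizer update happens every `gradient_accumulation_steps` batches,
# so the scheduler is sized in updates, not batches.
num_training_steps = (steps_per_epoch * num_epochs) // gradient_accumulation_steps
print(num_training_steps)            # 50000
```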