Spaces:
Runtime error
Runtime error
add n_iter to Mel and update test_mel notebook
Browse files
- audiodiffusion/mel.py +20 -17
- notebooks/test_mel.ipynb +7 -1
audiodiffusion/mel.py
CHANGED
|
@@ -9,15 +9,14 @@ from PIL import Image
|
|
| 9 |
|
| 10 |
class Mel:
|
| 11 |
|
| 12 |
-
def __init__(
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
):
|
| 21 |
"""Class to convert audio to mel spectrograms and vice versa.
|
| 22 |
|
| 23 |
Args:
|
|
@@ -27,6 +26,7 @@ class Mel:
|
|
| 27 |
n_fft (int): length of the Fast Fourier Transform window, in samples
|
| 28 |
hop_length (int): hop length (a higher number is recommended for lower than 256 y_res)
|
| 29 |
top_db (int): loudest in decibels
|
|
|
|
| 30 |
"""
|
| 31 |
self.x_res = x_res
|
| 32 |
self.y_res = y_res
|
|
@@ -36,6 +36,7 @@ class Mel:
|
|
| 36 |
self.n_mels = self.y_res
|
| 37 |
self.slice_size = self.x_res * self.hop_length - 1
|
| 38 |
self.top_db = top_db
|
|
|
|
| 39 |
self.audio = None
|
| 40 |
|
| 41 |
def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
|
|
@@ -94,13 +95,11 @@ class Mel:
|
|
| 94 |
Returns:
|
| 95 |
PIL Image: grayscale image of x_res x y_res
|
| 96 |
"""
|
| 97 |
-
S = librosa.feature.melspectrogram(
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
n_mels=self.n_mels
|
| 103 |
-
)
|
| 104 |
log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
|
| 105 |
bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) +
|
| 106 |
0.5).astype(np.uint8)
|
|
@@ -121,5 +120,9 @@ class Mel:
|
|
| 121 |
log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
|
| 122 |
S = librosa.db_to_power(log_S)
|
| 123 |
audio = librosa.feature.inverse.mel_to_audio(
|
| 124 |
-
S,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
return audio
|
|
|
|
| 9 |
|
| 10 |
class Mel:
|
| 11 |
|
| 12 |
+
def __init__(self,
|
| 13 |
+
x_res: int = 256,
|
| 14 |
+
y_res: int = 256,
|
| 15 |
+
sample_rate: int = 22050,
|
| 16 |
+
n_fft: int = 2048,
|
| 17 |
+
hop_length: int = 512,
|
| 18 |
+
top_db: int = 80,
|
| 19 |
+
n_iter: int = 32):
|
|
|
|
| 20 |
"""Class to convert audio to mel spectrograms and vice versa.
|
| 21 |
|
| 22 |
Args:
|
|
|
|
| 26 |
n_fft (int): length of the Fast Fourier Transform window, in samples
|
| 27 |
hop_length (int): hop length (a higher number is recommended for lower than 256 y_res)
|
| 28 |
top_db (int): loudest in decibels
|
| 29 |
+
n_iter (int): number of iterations for Griffin-Lim mel inversion
|
| 30 |
"""
|
| 31 |
self.x_res = x_res
|
| 32 |
self.y_res = y_res
|
|
|
|
| 36 |
self.n_mels = self.y_res
|
| 37 |
self.slice_size = self.x_res * self.hop_length - 1
|
| 38 |
self.top_db = top_db
|
| 39 |
+
self.n_iter = n_iter
|
| 40 |
self.audio = None
|
| 41 |
|
| 42 |
def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
|
|
|
|
| 95 |
Returns:
|
| 96 |
PIL Image: grayscale image of x_res x y_res
|
| 97 |
"""
|
| 98 |
+
S = librosa.feature.melspectrogram(y=self.get_audio_slice(slice),
|
| 99 |
+
sr=self.sr,
|
| 100 |
+
n_fft=self.n_fft,
|
| 101 |
+
hop_length=self.hop_length,
|
| 102 |
+
n_mels=self.n_mels)
|
|
|
|
|
|
|
| 103 |
log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
|
| 104 |
bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) +
|
| 105 |
0.5).astype(np.uint8)
|
|
|
|
| 120 |
log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
|
| 121 |
S = librosa.db_to_power(log_S)
|
| 122 |
audio = librosa.feature.inverse.mel_to_audio(
|
| 123 |
+
S,
|
| 124 |
+
sr=self.sr,
|
| 125 |
+
n_fft=self.n_fft,
|
| 126 |
+
hop_length=self.hop_length,
|
| 127 |
+
n_iter=self.n_iter)
|
| 128 |
return audio
|
notebooks/test_mel.ipynb
CHANGED
|
@@ -41,7 +41,13 @@
|
|
| 41 |
"metadata": {},
|
| 42 |
"outputs": [],
|
| 43 |
"source": [
|
| 44 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
]
|
| 46 |
},
|
| 47 |
{
|
|
|
|
| 41 |
"metadata": {},
|
| 42 |
"outputs": [],
|
| 43 |
"source": [
|
| 44 |
+
"# These are the default parameters. If you change any of them, you may have to adjust the others.\n",
|
| 45 |
+
"mel = Mel(x_res=256,\n",
|
| 46 |
+
" y_res=256,\n",
|
| 47 |
+
" hop_length=512,\n",
|
| 48 |
+
" sample_rate=22050,\n",
|
| 49 |
+
" n_fft=2048,\n",
|
| 50 |
+
" n_iter=32)"
|
| 51 |
]
|
| 52 |
},
|
| 53 |
{
|