Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -153,7 +153,7 @@ def spectrogram_to_audio(magnitude_spectrogram):
|
|
153 |
# Inverse STFT to convert the spectrogram back to audio
|
154 |
audio = torch.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length)
|
155 |
|
156 |
-
# Handle NaNs and Infs in the audio by replacing them with zeros
|
157 |
audio = torch.nan_to_num(audio, nan=0.0, posinf=0.0, neginf=0.0)
|
158 |
|
159 |
# Normalize audio to the range [-1, 1] (standard audio range)
|
@@ -163,13 +163,17 @@ def spectrogram_to_audio(magnitude_spectrogram):
|
|
163 |
# Clip the audio to the range [-1, 1] to avoid out-of-bounds values
|
164 |
audio = torch.clamp(audio, min=-1, max=1)
|
165 |
|
166 |
-
# Convert to 16-bit PCM by scaling to the int16 range and casting
|
167 |
audio = (audio * 32767).short()
|
168 |
|
169 |
-
# Ensure audio is within the valid int16 range [-32768, 32767]
|
170 |
audio = torch.clamp(audio, min=-32768, max=32767)
|
171 |
|
172 |
-
|
|
|
|
|
|
|
|
|
173 |
|
174 |
|
175 |
def generate_audio_from_image(image):
|
@@ -180,10 +184,7 @@ def generate_audio_from_image(image):
|
|
180 |
generated_spectrogram = generator(test_img)
|
181 |
|
182 |
# Convert the generated spectrogram to audio
|
183 |
-
|
184 |
-
|
185 |
-
# Convert the audio tensor to a NumPy array before passing it to Gradio
|
186 |
-
generated_audio_numpy = generated_audio.numpy().astype(np.int16) # Ensure it's int16
|
187 |
|
188 |
# Return the sample rate and the NumPy array containing the audio
|
189 |
return (sample_rate, generated_audio_numpy)
|
|
|
153 |
# Inverse STFT to convert the spectrogram back to audio
|
154 |
audio = torch.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length)
|
155 |
|
156 |
+
# Handle NaNs or Infs in the audio and replace them with zeros
|
157 |
audio = torch.nan_to_num(audio, nan=0.0, posinf=0.0, neginf=0.0)
|
158 |
|
159 |
# Normalize audio to the range [-1, 1] (standard audio range)
|
|
|
163 |
# Clip the audio to the range [-1, 1] to avoid out-of-bounds values
|
164 |
audio = torch.clamp(audio, min=-1, max=1)
|
165 |
|
166 |
+
# Convert to 16-bit PCM format by scaling and casting to int16
|
167 |
audio = (audio * 32767).short()
|
168 |
|
169 |
+
# Ensure the audio is clipped to the valid range for int16
|
170 |
audio = torch.clamp(audio, min=-32768, max=32767)
|
171 |
|
172 |
+
# Convert the audio to a NumPy array in int16 format
|
173 |
+
audio_numpy = audio.cpu().numpy().astype(np.int16)
|
174 |
+
|
175 |
+
return audio_numpy
|
176 |
+
|
177 |
|
178 |
|
179 |
def generate_audio_from_image(image):
|
|
|
184 |
generated_spectrogram = generator(test_img)
|
185 |
|
186 |
# Convert the generated spectrogram to audio
|
187 |
+
generated_audio_numpy = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
|
|
|
|
|
|
|
188 |
|
189 |
# Return the sample rate and the NumPy array containing the audio
|
190 |
return (sample_rate, generated_audio_numpy)
|