Update app.py
app.py
CHANGED
@@ -144,53 +144,55 @@ def magnitude_to_complex_spectrogram(magnitude_spectrogram):
 
 
 def spectrogram_to_audio(magnitude_spectrogram):
-    # Perform inverse log scaling
+    # Perform inverse log scaling to undo any log scaling applied to the spectrogram
     magnitude_spectrogram = torch.expm1(magnitude_spectrogram)
 
     # Convert magnitude-only spectrogram to complex format
     complex_spectrogram = magnitude_to_complex_spectrogram(magnitude_spectrogram)
 
-    #
+    # Use inverse STFT to convert the spectrogram back to time-domain audio
     audio = torch.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length)
 
     # Handle NaNs or Infs in the audio and replace them with zeros
     audio = torch.nan_to_num(audio, nan=0.0, posinf=0.0, neginf=0.0)
 
-    # Normalize audio to the range [-1, 1]
+    # Normalize the audio to the range [-1, 1]
     if torch.max(torch.abs(audio)) != 0:
         audio = audio / torch.max(torch.abs(audio))
 
     # Clip the audio to the range [-1, 1] to avoid out-of-bounds values
     audio = torch.clamp(audio, min=-1, max=1)
 
-    #
+    # Scale the audio to 16-bit PCM format and convert to int16
     audio = (audio * 32767).short()
 
-    # Ensure the audio is clipped to the valid range for int16
+    # Ensure the audio is clipped to the valid range for int16 [-32768, 32767]
     audio = torch.clamp(audio, min=-32768, max=32767)
 
-    # Convert
+    # Convert to a NumPy array and ensure it's in the correct format
     audio_numpy = audio.cpu().numpy().astype(np.int16)
 
     return audio_numpy
 
 
 
+
 def generate_audio_from_image(image):
-    test_img = image_transform(image).unsqueeze(0).to(device)  # Preprocess image
+    test_img = image_transform(image).unsqueeze(0).to(device)  # Preprocess the input image
 
-    # Generate sound spectrogram from the image using the
+    # Generate a sound spectrogram from the image using the pre-trained GAN model
     with torch.no_grad():
         generated_spectrogram = generator(test_img)
 
-    # Convert the generated spectrogram to audio
+    # Convert the generated spectrogram to time-domain audio
     generated_audio_numpy = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
 
-    # Return the sample rate and the NumPy array containing the audio
+    # Return the sample rate and the NumPy array containing the audio data
     return (sample_rate, generated_audio_numpy)
 
 
 
+
 # Gradio Interface
 def main():
     global generator  # Declare the generator object globally
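
For context on how the updated functions are consumed: the (sample_rate, generated_audio_numpy) tuple returned by generate_audio_from_image is the (rate, data) format that Gradio's gr.Audio output component accepts for int16 NumPy audio. Below is a minimal, self-contained sketch of how main() might wire the function into a Gradio interface; the component choices, the stand-in function body, and the sample_rate value are assumptions for illustration, not taken from app.py.

import gradio as gr
import numpy as np

sample_rate = 22050  # assumed value; app.py defines its own sample_rate

def generate_audio_from_image(image):
    # Stand-in for the real GAN-backed function in app.py: returns one second
    # of silence as int16 samples, in the (rate, data) tuple gr.Audio expects.
    return (sample_rate, np.zeros(sample_rate, dtype=np.int16))

def main():
    demo = gr.Interface(
        fn=generate_audio_from_image,
        inputs=gr.Image(type="pil"),        # a PIL image, matching image_transform(image)
        outputs=gr.Audio(label="Generated audio"),
    )
    demo.launch()

if __name__ == "__main__":
    main()

Returning int16 samples, as the commit does, keeps the values in the valid PCM range, so Gradio can write the waveform out directly without rescaling.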