musdfakoc committed on
Commit
4970be7
·
verified ·
1 Parent(s): 755bafb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -10
app.py CHANGED
@@ -144,53 +144,55 @@ def magnitude_to_complex_spectrogram(magnitude_spectrogram):
144
 
145
 
146
  def spectrogram_to_audio(magnitude_spectrogram):
147
- # Perform inverse log scaling
148
  magnitude_spectrogram = torch.expm1(magnitude_spectrogram)
149
 
150
  # Convert magnitude-only spectrogram to complex format
151
  complex_spectrogram = magnitude_to_complex_spectrogram(magnitude_spectrogram)
152
 
153
- # Inverse STFT to convert the spectrogram back to audio
154
  audio = torch.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length)
155
 
156
  # Handle NaNs or Infs in the audio and replace them with zeros
157
  audio = torch.nan_to_num(audio, nan=0.0, posinf=0.0, neginf=0.0)
158
 
159
- # Normalize audio to the range [-1, 1] (standard audio range)
160
  if torch.max(torch.abs(audio)) != 0:
161
  audio = audio / torch.max(torch.abs(audio))
162
 
163
  # Clip the audio to the range [-1, 1] to avoid out-of-bounds values
164
  audio = torch.clamp(audio, min=-1, max=1)
165
 
166
- # Convert to 16-bit PCM format by scaling and casting to int16
167
  audio = (audio * 32767).short()
168
 
169
- # Ensure the audio is clipped to the valid range for int16
170
  audio = torch.clamp(audio, min=-32768, max=32767)
171
 
172
- # Convert the audio to a NumPy array in int16 format
173
  audio_numpy = audio.cpu().numpy().astype(np.int16)
174
 
175
  return audio_numpy
176
 
177
 
178
 
 
179
  def generate_audio_from_image(image):
180
- test_img = image_transform(image).unsqueeze(0).to(device) # Preprocess image
181
 
182
- # Generate sound spectrogram from the image using the loaded generator
183
  with torch.no_grad():
184
  generated_spectrogram = generator(test_img)
185
 
186
- # Convert the generated spectrogram to audio
187
  generated_audio_numpy = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
188
 
189
- # Return the sample rate and the NumPy array containing the audio
190
  return (sample_rate, generated_audio_numpy)
191
 
192
 
193
 
 
194
  # Gradio Interface
195
  def main():
196
  global generator # Declare the generator object globally
 
144
 
145
 
146
  def spectrogram_to_audio(magnitude_spectrogram):
147
+ # Perform inverse log scaling to undo any log scaling applied to the spectrogram
148
  magnitude_spectrogram = torch.expm1(magnitude_spectrogram)
149
 
150
  # Convert magnitude-only spectrogram to complex format
151
  complex_spectrogram = magnitude_to_complex_spectrogram(magnitude_spectrogram)
152
 
153
+ # Use inverse STFT to convert the spectrogram back to time-domain audio
154
  audio = torch.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length)
155
 
156
  # Handle NaNs or Infs in the audio and replace them with zeros
157
  audio = torch.nan_to_num(audio, nan=0.0, posinf=0.0, neginf=0.0)
158
 
159
+ # Normalize the audio to the range [-1, 1]
160
  if torch.max(torch.abs(audio)) != 0:
161
  audio = audio / torch.max(torch.abs(audio))
162
 
163
  # Clip the audio to the range [-1, 1] to avoid out-of-bounds values
164
  audio = torch.clamp(audio, min=-1, max=1)
165
 
166
+ # Scale the audio to 16-bit PCM format and convert to int16
167
  audio = (audio * 32767).short()
168
 
169
+ # Ensure the audio is clipped to the valid range for int16 [-32768, 32767]
170
  audio = torch.clamp(audio, min=-32768, max=32767)
171
 
172
+ # Convert to a NumPy array and ensure it's in the correct format
173
  audio_numpy = audio.cpu().numpy().astype(np.int16)
174
 
175
  return audio_numpy
176
 
177
 
178
 
179
+
180
  def generate_audio_from_image(image):
181
+ test_img = image_transform(image).unsqueeze(0).to(device) # Preprocess the input image
182
 
183
+ # Generate a sound spectrogram from the image using the pre-trained GAN model
184
  with torch.no_grad():
185
  generated_spectrogram = generator(test_img)
186
 
187
+ # Convert the generated spectrogram to time-domain audio
188
  generated_audio_numpy = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
189
 
190
+ # Return the sample rate and the NumPy array containing the audio data
191
  return (sample_rate, generated_audio_numpy)
192
 
193
 
194
 
195
+
196
  # Gradio Interface
197
  def main():
198
  global generator # Declare the generator object globally