musdfakoc committed on
Commit
35945f8
·
verified ·
1 Parent(s): 5af138b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -6
app.py CHANGED
@@ -153,20 +153,20 @@ def spectrogram_to_audio(magnitude_spectrogram):
153
  # Inverse STFT to convert the spectrogram back to audio
154
  audio = torch.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length)
155
 
156
- # Check for NaNs or Infs in the audio and replace them with zeros
157
  audio = torch.nan_to_num(audio, nan=0.0, posinf=0.0, neginf=0.0)
158
 
159
- # Normalize audio to the range [-1, 1]
160
  if torch.max(torch.abs(audio)) != 0:
161
  audio = audio / torch.max(torch.abs(audio))
162
 
163
- # Clip the audio to ensure it fits in the range [-1, 1]
164
  audio = torch.clamp(audio, min=-1, max=1)
165
 
166
- # Convert to 16-bit PCM format by scaling and casting to int16
167
  audio = (audio * 32767).short()
168
 
169
- # Ensure the values are clipped to the int16 range
170
  audio = torch.clamp(audio, min=-32768, max=32767)
171
 
172
  return audio
@@ -183,7 +183,7 @@ def generate_audio_from_image(image):
183
  generated_audio = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
184
 
185
  # Convert the audio tensor to a NumPy array before passing it to Gradio
186
- generated_audio_numpy = generated_audio.numpy()
187
 
188
  # Return the sample rate and the NumPy array containing the audio
189
  return (sample_rate, generated_audio_numpy)
 
153
  # Inverse STFT to convert the spectrogram back to audio
154
  audio = torch.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length)
155
 
156
+ # Handle any NaNs or Infs
157
  audio = torch.nan_to_num(audio, nan=0.0, posinf=0.0, neginf=0.0)
158
 
159
+ # Normalize audio to the range [-1, 1] (standard audio range)
160
  if torch.max(torch.abs(audio)) != 0:
161
  audio = audio / torch.max(torch.abs(audio))
162
 
163
+ # Clip the audio to the range [-1, 1] to avoid out-of-bounds values
164
  audio = torch.clamp(audio, min=-1, max=1)
165
 
166
+ # Convert the audio to 16-bit PCM format by scaling and casting to int16
167
  audio = (audio * 32767).short()
168
 
169
+ # Ensure audio is in the valid range for int16
170
  audio = torch.clamp(audio, min=-32768, max=32767)
171
 
172
  return audio
 
183
  generated_audio = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
184
 
185
  # Convert the audio tensor to a NumPy array before passing it to Gradio
186
+ generated_audio_numpy = generated_audio.numpy().astype(np.int16) # Ensure it's int16
187
 
188
  # Return the sample rate and the NumPy array containing the audio
189
  return (sample_rate, generated_audio_numpy)