Spaces:

musdfakoc
/

local_intelligence

Sleeping

musdfakoc committed on Sep 30, 2024

Commit

755bafb

verified ·

1 Parent(s): 35945f8

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -153,7 +153,7 @@ def spectrogram_to_audio(magnitude_spectrogram):
     # Inverse STFT to convert the spectrogram back to audio
     audio = torch.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length)
-    # Handle any NaNs or Infs
     audio = torch.nan_to_num(audio, nan=0.0, posinf=0.0, neginf=0.0)
     # Normalize audio to the range [-1, 1] (standard audio range)
@@ -163,13 +163,17 @@ def spectrogram_to_audio(magnitude_spectrogram):
     # Clip the audio to the range [-1, 1] to avoid out-of-bounds values
     audio = torch.clamp(audio, min=-1, max=1)
-    # Convert the audio to 16-bit PCM format by scaling and casting to int16
     audio = (audio * 32767).short()
-    # Ensure audio is in the valid range for int16
     audio = torch.clamp(audio, min=-32768, max=32767)
-    return audio
 def generate_audio_from_image(image):
@@ -180,10 +184,7 @@ def generate_audio_from_image(image):
         generated_spectrogram = generator(test_img)
     # Convert the generated spectrogram to audio
-    generated_audio = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
-    # Convert the audio tensor to a NumPy array before passing it to Gradio
-    generated_audio_numpy = generated_audio.numpy().astype(np.int16)  # Ensure it's int16
     # Return the sample rate and the NumPy array containing the audio
     return (sample_rate, generated_audio_numpy)

     # Inverse STFT to convert the spectrogram back to audio
     audio = torch.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length)
+    # Handle NaNs or Infs in the audio and replace them with zeros
     audio = torch.nan_to_num(audio, nan=0.0, posinf=0.0, neginf=0.0)
     # Normalize audio to the range [-1, 1] (standard audio range)
     # Clip the audio to the range [-1, 1] to avoid out-of-bounds values
     audio = torch.clamp(audio, min=-1, max=1)
+    # Convert to 16-bit PCM format by scaling and casting to int16
     audio = (audio * 32767).short()
+    # Ensure the audio is clipped to the valid range for int16
     audio = torch.clamp(audio, min=-32768, max=32767)
+    # Convert the audio to a NumPy array in int16 format
+    audio_numpy = audio.cpu().numpy().astype(np.int16)
+    return audio_numpy
 def generate_audio_from_image(image):
         generated_spectrogram = generator(test_img)
     # Convert the generated spectrogram to audio
+    generated_audio_numpy = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
     # Return the sample rate and the NumPy array containing the audio
     return (sample_rate, generated_audio_numpy)