Spaces:

musdfakoc
/

local_intelligence

Sleeping

App Files Community

musdfakoc committed on Sep 30, 2024

Commit

f36caa0

verified ·

1 Parent(s): f7e6aa6

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -57

app.py CHANGED Viewed

@@ -128,35 +128,25 @@ def load_gan_model(generator, model_path, device):
     generator.eval()  # Set the model to evaluation mode
     return generator
 def magnitude_to_complex_spectrogram(magnitude_spectrogram):
-    # Create a zero-phase tensor with the same shape as the magnitude spectrogram
     zero_phase = torch.zeros_like(magnitude_spectrogram)
-    # Create a complex-valued spectrogram using the magnitude and zero phase
-    complex_spectrogram = torch.complex(magnitude_spectrogram, zero_phase)
     return complex_spectrogram
 def spectrogram_to_audio(magnitude_spectrogram):
-    # Convert magnitude-only spectrogram to complex format
     complex_spectrogram = magnitude_to_complex_spectrogram(magnitude_spectrogram)
-    # Provide a rectangular window to suppress the warning
-    window = torch.ones(n_fft, device=complex_spectrogram.device)
-    # Inverse STFT to convert the spectrogram back to audio
-    audio = torch.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length, window=window)
     return audio
 def generate_audio_from_image(image):
-    if image is None:
-        raise ValueError("The uploaded image is 'None'. Please check the Gradio input.")
-    # Preprocess the image
-    test_img = image_transform(image).unsqueeze(0).to(device)
     # Generate sound spectrogram from the image using the loaded generator
     with torch.no_grad():
@@ -165,36 +155,8 @@ def generate_audio_from_image(image):
     # Convert the generated spectrogram to audio
     generated_audio = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
-    # Convert the audio to a NumPy array
-    generated_audio = generated_audio.numpy()
-    # Normalize the audio between -1 and 1
-    max_value = np.abs(generated_audio).max()
-    if max_value > 0:
-        generated_audio = generated_audio / max_value
-    # Convert the audio to 16-bit integer format
-    generated_audio = np.int16(generated_audio * 32767)
-    # Ensure audio is in stereo format (samples, channels)
-    if generated_audio.ndim == 1:  # If mono, make it stereo
-        generated_audio = np.expand_dims(generated_audio, axis=-1)
-    # Transpose to ensure the shape is (samples, channels)
-    generated_audio = generated_audio.T
-    # Convert sample_rate to a scalar integer
-    sample_rate_scalar = int(sample_rate)
-    # Debug: Ensure everything is correct before returning
-    print(f"Returning audio data of shape {generated_audio.shape}, dtype {generated_audio.dtype}")
-    print(f"Returning sample rate: {sample_rate_scalar}, dtype {type(sample_rate_scalar)}")
-    # Return the tuple (sample_rate, audio_data)
-    return (sample_rate_scalar, generated_audio)
 # Gradio Interface
 def main():
@@ -203,16 +165,13 @@ def main():
     generator = Generator(output_time_frames).to(device)
     # Load the pre-trained model
-    model_path = './gan_model.pth'  # Ensure the model is in the correct relative path
     generator = load_gan_model(generator, model_path, device)
-    iface = gr.Interface(fn=generate_audio_from_image, inputs=gr.Image(type="pil"), outputs=gr.Audio(type="numpy", label="Generated Audio"), title="Image to Sound Generation")
     iface.launch()
 if __name__ == "__main__":
     main()

     generator.eval()  # Set the model to evaluation mode
     return generator
+# Generator model class definitions remain the same as in your original code.
+# Convert magnitude-only spectrogram to complex format by assuming zero phase
 def magnitude_to_complex_spectrogram(magnitude_spectrogram):
     zero_phase = torch.zeros_like(magnitude_spectrogram)
+    complex_spectrogram = torch.stack([magnitude_spectrogram, zero_phase], dim=-1)
     return complex_spectrogram
+# Convert spectrogram back to audio using inverse STFT
 def spectrogram_to_audio(magnitude_spectrogram):
+    magnitude_spectrogram = torch.expm1(magnitude_spectrogram)
     complex_spectrogram = magnitude_to_complex_spectrogram(magnitude_spectrogram)
+    audio = torch.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length)
     return audio
+# Function to generate audio from an uploaded image
 def generate_audio_from_image(image):
+    test_img = image_transform(image).unsqueeze(0).to(device)  # Preprocess image
     # Generate sound spectrogram from the image using the loaded generator
     with torch.no_grad():
     # Convert the generated spectrogram to audio
     generated_audio = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
+    # Convert audio tensor to numpy and return it for Gradio to handle
+    return (sample_rate, generated_audio.numpy())
 # Gradio Interface
 def main():
     generator = Generator(output_time_frames).to(device)
     # Load the pre-trained model
+    model_path = '/path/to/your/model/gan_model_100e_16b.pth'  # Change this path
     generator = load_gan_model(generator, model_path, device)
+    # Gradio interface: allow users to upload an image and generate audio
+    iface = gr.Interface(fn=generate_audio_from_image, inputs=gr.Image(type="pil"), outputs=gr.Audio(type="numpy", label="Generated Audio"))
     iface.launch()
 if __name__ == "__main__":
     main()