Update app.py
app.py CHANGED
@@ -154,8 +154,8 @@ def generate_audio_from_image(image):
     if image is None:
         raise ValueError("The uploaded image is 'None'. Please check the Gradio input.")

-    #
-    test_img = image_transform(image).unsqueeze(0).to(device)
+    # Preprocess image
+    test_img = image_transform(image).unsqueeze(0).to(device)

     # Generate sound spectrogram from the image using the loaded generator
     with torch.no_grad():
@@ -172,23 +172,20 @@ def generate_audio_from_image(image):
     if max_value > 0:
         generated_audio = generated_audio / max_value

-    # Convert to
-    generated_audio =
+    # Convert audio to 16-bit integer format (-32768 to 32767)
+    generated_audio = np.int16(generated_audio * 32767)

-    #
-    generated_audio = generated_audio.T
-
-    # Ensure the audio is 2D (even if mono)
+    # Ensure audio is stereo with shape (samples, channels)
     if generated_audio.ndim == 1:
-        generated_audio = np.expand_dims(generated_audio, axis=-1)
+        generated_audio = np.expand_dims(generated_audio, axis=-1)
+
+    # Transpose to (samples, channels) format
+    generated_audio = generated_audio.T

     # Debug: Print the shape and type of the generated audio
     print(f"Generated audio shape after processing: {generated_audio.shape}, type: {generated_audio.dtype}")
-    # Debug: Ensure sample_rate is correct
-    print(f"Returning sample rate: {sample_rate}, type: {type(sample_rate)}")
-

-    #
+    # Return audio and sample rate
     return generated_audio, sample_rate


@@ -203,7 +200,8 @@ def main():
     model_path = './gan_model.pth' # Ensure the model is in the correct relative path
     generator = load_gan_model(generator, model_path, device)

-    iface = gr.Interface(fn=generate_audio_from_image, inputs=gr.Image(type="pil"), outputs=gr.Audio(type="numpy", label="Generated Audio"))
+    iface = gr.Interface(fn=generate_audio_from_image, inputs=gr.Image(type="pil"), outputs=gr.Audio(type="numpy", label="Generated Audio"), title="Image to Sound Generation")
+


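
For context on the post-processing this commit introduces, below is a minimal sketch of the same steps factored into a standalone helper. It is an illustration, not code from this Space: postprocess_audio is a hypothetical name, the peak computation is assumed (the diff computes max_value earlier, outside these hunks), and the sketch returns the tuple in (sample_rate, data) order with data shaped (samples, channels), the layout typically expected for a NumPy audio output, so it omits the final transpose that the commit applies.

# Illustrative sketch only -- not part of app.py. Mirrors the peak
# normalisation and int16 conversion added in this commit.
import numpy as np

def postprocess_audio(generated_audio: np.ndarray, sample_rate: int):
    # Peak-normalise to [-1, 1] so the int16 conversion below cannot clip
    max_value = np.max(np.abs(generated_audio))
    if max_value > 0:
        generated_audio = generated_audio / max_value

    # Convert to 16-bit PCM range (-32768 to 32767)
    generated_audio = np.int16(generated_audio * 32767)

    # Give mono audio an explicit channel axis: (samples,) -> (samples, 1)
    if generated_audio.ndim == 1:
        generated_audio = np.expand_dims(generated_audio, axis=-1)

    # (sample_rate, data) tuple for a NumPy-typed audio output
    return sample_rate, generated_audio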
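Similarly, a self-contained sketch of how the updated gr.Interface call from the last hunk can be exercised locally. The stub function and the 16 kHz sample rate are stand-ins so the snippet runs without gan_model.pth; in the Space itself, fn is the real generate_audio_from_image shown in the diff.

# Illustrative sketch only -- a runnable stand-in for wiring up the
# interface shown in the diff, without loading the trained generator.
import numpy as np
import gradio as gr

def generate_audio_from_image(image):
    # Stub: return one second of silence as (sample_rate, data) so the
    # interface can be launched and clicked through without the model.
    sample_rate = 16000  # hypothetical stand-in value
    return sample_rate, np.zeros((sample_rate, 1), dtype=np.int16)

def main():
    iface = gr.Interface(
        fn=generate_audio_from_image,
        inputs=gr.Image(type="pil"),
        outputs=gr.Audio(type="numpy", label="Generated Audio"),
        title="Image to Sound Generation",
    )
    iface.launch()

if __name__ == "__main__":
    main()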