musdfakoc committed on
Commit
355b1c8
·
verified ·
1 Parent(s): 1e7178e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -14
app.py CHANGED
@@ -154,8 +154,8 @@ def generate_audio_from_image(image):
154
  if image is None:
155
  raise ValueError("The uploaded image is 'None'. Please check the Gradio input.")
156
 
157
- # Ensure the image is in the right format
158
- test_img = image_transform(image).unsqueeze(0).to(device) # Preprocess image
159
 
160
  # Generate sound spectrogram from the image using the loaded generator
161
  with torch.no_grad():
@@ -172,23 +172,20 @@ def generate_audio_from_image(image):
172
  if max_value > 0:
173
  generated_audio = generated_audio / max_value
174
 
175
- # Convert to the required format (e.g., float32)
176
- generated_audio = generated_audio.astype(np.float32)
177
 
178
- # Transpose to (samples, channels) for stereo
179
- generated_audio = generated_audio.T
180
-
181
- # Ensure the audio is 2D (even if mono)
182
  if generated_audio.ndim == 1:
183
- generated_audio = np.expand_dims(generated_audio, axis=-1) # Add channel dimension for mono
 
 
 
184
 
185
  # Debug: Print the shape and type of the generated audio
186
  print(f"Generated audio shape after processing: {generated_audio.shape}, type: {generated_audio.dtype}")
187
- # Debug: Ensure sample_rate is correct
188
- print(f"Returning sample rate: {sample_rate}, type: {type(sample_rate)}")
189
-
190
 
191
- # Explicit return of the tuple (audio_data, sample_rate)
192
  return generated_audio, sample_rate
193
 
194
 
@@ -203,7 +200,8 @@ def main():
203
  model_path = './gan_model.pth' # Ensure the model is in the correct relative path
204
  generator = load_gan_model(generator, model_path, device)
205
 
206
- iface = gr.Interface(fn=generate_audio_from_image, inputs=gr.Image(type="pil"), outputs=gr.Audio(type="numpy", label="Generated Audio"))
 
207
 
208
 
209
 
 
154
  if image is None:
155
  raise ValueError("The uploaded image is 'None'. Please check the Gradio input.")
156
 
157
+ # Preprocess image
158
+ test_img = image_transform(image).unsqueeze(0).to(device)
159
 
160
  # Generate sound spectrogram from the image using the loaded generator
161
  with torch.no_grad():
 
172
  if max_value > 0:
173
  generated_audio = generated_audio / max_value
174
 
175
+ # Convert audio to 16-bit integer format (-32768 to 32767)
176
+ generated_audio = np.int16(generated_audio * 32767)
177
 
178
+ # Ensure audio is stereo with shape (samples, channels)
 
 
 
179
  if generated_audio.ndim == 1:
180
+ generated_audio = np.expand_dims(generated_audio, axis=-1)
181
+
182
+ # Transpose to (samples, channels) format
183
+ generated_audio = generated_audio.T
184
 
185
  # Debug: Print the shape and type of the generated audio
186
  print(f"Generated audio shape after processing: {generated_audio.shape}, type: {generated_audio.dtype}")
 
 
 
187
 
188
+ # Return audio and sample rate
189
  return generated_audio, sample_rate
190
 
191
 
 
200
  model_path = './gan_model.pth' # Ensure the model is in the correct relative path
201
  generator = load_gan_model(generator, model_path, device)
202
 
203
+ iface = gr.Interface(fn=generate_audio_from_image, inputs=gr.Image(type="pil"), outputs=gr.Audio(type="numpy", label="Generated Audio"), title="Image to Sound Generation")
204
+
205
 
206
 
207