musdfakoc committed on
Commit
f7e6aa6
·
verified ·
1 Parent(s): 355b1c8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -11
app.py CHANGED
@@ -150,11 +150,12 @@ def spectrogram_to_audio(magnitude_spectrogram):
150
  return audio
151
 
152
 
 
153
  def generate_audio_from_image(image):
154
  if image is None:
155
  raise ValueError("The uploaded image is 'None'. Please check the Gradio input.")
156
 
157
- # Preprocess image
158
  test_img = image_transform(image).unsqueeze(0).to(device)
159
 
160
  # Generate sound spectrogram from the image using the loaded generator
@@ -164,29 +165,34 @@ def generate_audio_from_image(image):
164
  # Convert the generated spectrogram to audio
165
  generated_audio = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
166
 
167
- # Ensure the audio is a NumPy array and properly formatted
168
  generated_audio = generated_audio.numpy()
169
 
170
- # Normalize the audio to fit between -1 and 1 for proper playback
171
  max_value = np.abs(generated_audio).max()
172
  if max_value > 0:
173
  generated_audio = generated_audio / max_value
174
 
175
- # Convert audio to 16-bit integer format (-32768 to 32767)
176
  generated_audio = np.int16(generated_audio * 32767)
177
 
178
- # Ensure audio is stereo with shape (samples, channels)
179
- if generated_audio.ndim == 1:
180
  generated_audio = np.expand_dims(generated_audio, axis=-1)
181
 
182
- # Transpose to (samples, channels) format
183
  generated_audio = generated_audio.T
184
 
185
- # Debug: Print the shape and type of the generated audio
186
- print(f"Generated audio shape after processing: {generated_audio.shape}, type: {generated_audio.dtype}")
 
 
 
 
 
 
 
187
 
188
- # Return audio and sample rate
189
- return generated_audio, sample_rate
190
 
191
 
192
 
 
150
  return audio
151
 
152
 
153
+
154
  def generate_audio_from_image(image):
155
  if image is None:
156
  raise ValueError("The uploaded image is 'None'. Please check the Gradio input.")
157
 
158
+ # Preprocess the image
159
  test_img = image_transform(image).unsqueeze(0).to(device)
160
 
161
  # Generate sound spectrogram from the image using the loaded generator
 
165
  # Convert the generated spectrogram to audio
166
  generated_audio = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
167
 
168
+ # Convert the audio to a NumPy array
169
  generated_audio = generated_audio.numpy()
170
 
171
+ # Normalize the audio between -1 and 1
172
  max_value = np.abs(generated_audio).max()
173
  if max_value > 0:
174
  generated_audio = generated_audio / max_value
175
 
176
+ # Convert the audio to 16-bit integer format
177
  generated_audio = np.int16(generated_audio * 32767)
178
 
179
+ # Ensure audio is in stereo format (samples, channels)
180
+ if generated_audio.ndim == 1: # If mono, make it stereo
181
  generated_audio = np.expand_dims(generated_audio, axis=-1)
182
 
183
+ # Transpose to ensure the shape is (samples, channels)
184
  generated_audio = generated_audio.T
185
 
186
+ # Convert sample_rate to a scalar integer
187
+ sample_rate_scalar = int(sample_rate)
188
+
189
+ # Debug: Ensure everything is correct before returning
190
+ print(f"Returning audio data of shape {generated_audio.shape}, dtype {generated_audio.dtype}")
191
+ print(f"Returning sample rate: {sample_rate_scalar}, dtype {type(sample_rate_scalar)}")
192
+
193
+ # Return the tuple (sample_rate, audio_data)
194
+ return (sample_rate_scalar, generated_audio)
195
 
 
 
196
 
197
 
198