Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -150,11 +150,12 @@ def spectrogram_to_audio(magnitude_spectrogram):
|
|
150 |
return audio
|
151 |
|
152 |
|
|
|
153 |
def generate_audio_from_image(image):
|
154 |
if image is None:
|
155 |
raise ValueError("The uploaded image is 'None'. Please check the Gradio input.")
|
156 |
|
157 |
-
# Preprocess image
|
158 |
test_img = image_transform(image).unsqueeze(0).to(device)
|
159 |
|
160 |
# Generate sound spectrogram from the image using the loaded generator
|
@@ -164,29 +165,34 @@ def generate_audio_from_image(image):
|
|
164 |
# Convert the generated spectrogram to audio
|
165 |
generated_audio = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
|
166 |
|
167 |
-
#
|
168 |
generated_audio = generated_audio.numpy()
|
169 |
|
170 |
-
# Normalize the audio
|
171 |
max_value = np.abs(generated_audio).max()
|
172 |
if max_value > 0:
|
173 |
generated_audio = generated_audio / max_value
|
174 |
|
175 |
-
# Convert audio to 16-bit integer format
|
176 |
generated_audio = np.int16(generated_audio * 32767)
|
177 |
|
178 |
-
# Ensure audio is stereo
|
179 |
-
if generated_audio.ndim == 1:
|
180 |
generated_audio = np.expand_dims(generated_audio, axis=-1)
|
181 |
|
182 |
-
# Transpose to (samples, channels)
|
183 |
generated_audio = generated_audio.T
|
184 |
|
185 |
-
#
|
186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
|
188 |
-
# Return audio and sample rate
|
189 |
-
return generated_audio, sample_rate
|
190 |
|
191 |
|
192 |
|
|
|
150 |
return audio
|
151 |
|
152 |
|
153 |
+
|
154 |
def generate_audio_from_image(image):
|
155 |
if image is None:
|
156 |
raise ValueError("The uploaded image is 'None'. Please check the Gradio input.")
|
157 |
|
158 |
+
# Preprocess the image
|
159 |
test_img = image_transform(image).unsqueeze(0).to(device)
|
160 |
|
161 |
# Generate sound spectrogram from the image using the loaded generator
|
|
|
165 |
# Convert the generated spectrogram to audio
|
166 |
generated_audio = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
|
167 |
|
168 |
+
# Convert the audio to a NumPy array
|
169 |
generated_audio = generated_audio.numpy()
|
170 |
|
171 |
+
# Normalize the audio between -1 and 1
|
172 |
max_value = np.abs(generated_audio).max()
|
173 |
if max_value > 0:
|
174 |
generated_audio = generated_audio / max_value
|
175 |
|
176 |
+
# Convert the audio to 16-bit integer format
|
177 |
generated_audio = np.int16(generated_audio * 32767)
|
178 |
|
179 |
+
# Ensure the audio array has an explicit channel axis (a 1-D signal gets a trailing axis of size 1; no second channel is created)
|
180 |
+
if generated_audio.ndim == 1: # If mono, make it stereo
|
181 |
generated_audio = np.expand_dims(generated_audio, axis=-1)
|
182 |
|
183 |
+
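# Transpose the array. NOTE(review): this gives (samples, channels) only if the array was (channels, samples); a 1-D input expanded above becomes (1, samples) - confirm the layout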
|
184 |
generated_audio = generated_audio.T
|
185 |
|
186 |
+
# Convert sample_rate to a scalar integer
|
187 |
+
sample_rate_scalar = int(sample_rate)
|
188 |
+
|
189 |
+
# Debug: Ensure everything is correct before returning
|
190 |
+
print(f"Returning audio data of shape {generated_audio.shape}, dtype {generated_audio.dtype}")
|
191 |
+
print(f"Returning sample rate: {sample_rate_scalar}, dtype {type(sample_rate_scalar)}")
|
192 |
+
|
193 |
+
# Return the tuple (sample_rate, audio_data)
|
194 |
+
return (sample_rate_scalar, generated_audio)
|
195 |
|
|
|
|
|
196 |
|
197 |
|
198 |
|