Update app.py
app.py CHANGED
@@ -154,8 +154,8 @@ def generate_audio_from_image(image):
     if image is None:
         raise ValueError("The uploaded image is 'None'. Please check the Gradio input.")

-    #
-    test_img = image_transform(image).unsqueeze(0).to(device)
+    # Preprocess image
+    test_img = image_transform(image).unsqueeze(0).to(device)

     # Generate sound spectrogram from the image using the loaded generator
     with torch.no_grad():
@@ -172,23 +172,20 @@ def generate_audio_from_image(image):
     if max_value > 0:
         generated_audio = generated_audio / max_value

-    # Convert to
-    generated_audio =
+    # Convert audio to 16-bit integer format (-32768 to 32767)
+    generated_audio = np.int16(generated_audio * 32767)

-    #
-    generated_audio = generated_audio.T
-
-    # Ensure the audio is 2D (even if mono)
+    # Ensure audio is stereo with shape (samples, channels)
     if generated_audio.ndim == 1:
-        generated_audio = np.expand_dims(generated_audio, axis=-1)
+        generated_audio = np.expand_dims(generated_audio, axis=-1)
+
+    # Transpose to (samples, channels) format
+    generated_audio = generated_audio.T

     # Debug: Print the shape and type of the generated audio
     print(f"Generated audio shape after processing: {generated_audio.shape}, type: {generated_audio.dtype}")
-    # Debug: Ensure sample_rate is correct
-    print(f"Returning sample rate: {sample_rate}, type: {type(sample_rate)}")
-

-    #
+    # Return audio and sample rate
     return generated_audio, sample_rate


@@ -203,7 +200,8 @@ def main():
     model_path = './gan_model.pth' # Ensure the model is in the correct relative path
     generator = load_gan_model(generator, model_path, device)

-    iface = gr.Interface(fn=generate_audio_from_image, inputs=gr.Image(type="pil"), outputs=gr.Audio(type="numpy", label="Generated Audio"))
+    iface = gr.Interface(fn=generate_audio_from_image, inputs=gr.Image(type="pil"), outputs=gr.Audio(type="numpy", label="Generated Audio"), title="Image to Sound Generation")
+


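
For context on the post-processing this commit introduces, below is a minimal sketch of the same steps factored into a standalone helper. It is an illustration, not code from this Space: postprocess_audio is a hypothetical name, the peak computation is assumed (the diff computes max_value earlier, outside these hunks), and the sketch returns the tuple in (sample_rate, data) order with data shaped (samples, channels), the layout typically expected for a NumPy audio output, so it omits the final transpose that the commit applies.

# Illustrative sketch only -- not part of app.py. Mirrors the peak
# normalisation and int16 conversion added in this commit.
import numpy as np

def postprocess_audio(generated_audio: np.ndarray, sample_rate: int):
    # Peak-normalise to [-1, 1] so the int16 conversion below cannot clip
    max_value = np.max(np.abs(generated_audio))
    if max_value > 0:
        generated_audio = generated_audio / max_value

    # Convert to 16-bit PCM range (-32768 to 32767)
    generated_audio = np.int16(generated_audio * 32767)

    # Give mono audio an explicit channel axis: (samples,) -> (samples, 1)
    if generated_audio.ndim == 1:
        generated_audio = np.expand_dims(generated_audio, axis=-1)

    # (sample_rate, data) tuple for a NumPy-typed audio output
    return sample_rate, generated_audio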
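Similarly, a self-contained sketch of how the updated gr.Interface call from the last hunk can be exercised locally. The stub function and the 16 kHz sample rate are stand-ins so the snippet runs without gan_model.pth; in the Space itself, fn is the real generate_audio_from_image shown in the diff.

# Illustrative sketch only -- a runnable stand-in for wiring up the
# interface shown in the diff, without loading the trained generator.
import numpy as np
import gradio as gr

def generate_audio_from_image(image):
    # Stub: return one second of silence as (sample_rate, data) so the
    # interface can be launched and clicked through without the model.
    sample_rate = 16000  # hypothetical stand-in value
    return sample_rate, np.zeros((sample_rate, 1), dtype=np.int16)

def main():
    iface = gr.Interface(
        fn=generate_audio_from_image,
        inputs=gr.Image(type="pil"),
        outputs=gr.Audio(type="numpy", label="Generated Audio"),
        title="Image to Sound Generation",
    )
    iface.launch()

if __name__ == "__main__":
    main()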