import spaces
import os
import tempfile
import gradio as gr
from dotenv import load_dotenv
import torch
from scipy.io.wavfile import write
from diffusers import DiffusionPipeline
from transformers import pipeline
from pathlib import Path
from PIL import Image  # needed to decode the uploaded image bytes
import io  # wraps the raw bytes in a file-like object for Pillow
# --- Setup Models and Device ---
load_dotenv()
hf_token = os.getenv("HF_TKN")
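# Note: the token is read but not passed to the pipelines below; both
# "Salesforce/blip-image-captioning-large" and "cvssp/audioldm2" are public
# checkpoints, so no authentication is required for loading them here.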
# Use GPU if available, otherwise CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Image-captioning pipeline (BLIP large)
captioning_pipeline = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)
print("Image captioning pipeline loaded.")
# Initialize the audio pipeline. Use float16 for less VRAM on GPU.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
print("Audio generation pipeline loaded.")
# --- Core Functions ---
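# On ZeroGPU Spaces, @spaces.GPU(duration=120) requests a GPU slot for up to
# ~120 seconds per call to the decorated function; on regular hardware the
# decorator is effectively a no-op.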
@spaces.GPU(duration=120)
def analyze_image_with_free_model(image_file_bytes):
    """Takes image bytes and returns a (caption, error_flag) tuple."""
    try:
        print("Received image bytes, opening with Pillow...")
        # Open the image data directly from memory using Pillow
        image = Image.open(io.BytesIO(image_file_bytes)).convert("RGB")
        print("Generating caption...")
        results = captioning_pipeline(image)
        if not results or not isinstance(results, list):
            print("ERROR: Caption generation returned invalid results.")
            return "Error: Could not generate caption.", True
        caption = results[0].get("generated_text", "").strip()
        if not caption:
            print("ERROR: Generated caption is empty.")
            return "No caption was generated.", True
        print(f"Successfully generated caption: {caption}")
        return caption, False
    except Exception as e:
        print(f"!!!!!! EXCEPTION in analyze_image_with_free_model: {e}")
        return f"Error analyzing image: {e}", True
@spaces.GPU(duration=120)
def get_audioldm_from_caption(caption):
    """Takes a text caption and returns the filepath of a generated WAV file, or None on failure."""
    try:
        # Move the large audio pipeline to the GPU only while it is being used
        pipe.to(device)
        print(f"Generating audio for prompt: '{caption}'")
        audio_output = pipe(
            prompt=caption,
            num_inference_steps=25,  # fewer steps for faster generation
            guidance_scale=7.0,
        ).audios[0]
        # Move the pipeline back to CPU to free up GPU memory for other requests
        pipe.to("cpu")
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            print(f"Saving audio to temporary file: {temp_wav.name}")
            # write(filename, sample_rate, data); AudioLDM2 generates 16 kHz audio
            write(temp_wav.name, 16000, audio_output)
        return temp_wav.name
    except Exception as e:
        print(f"!!!!!! EXCEPTION in get_audioldm_from_caption: {e}")
        return None
# --- Gradio Interface ---
css = """
#col-container{ margin: 0 auto; max-width: 800px; }
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
            <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
            <p style="text-align: center;">
                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
            </p>
        """)
        gr.Markdown("""
1. **Upload an image**.
2. Click **Generate Description**.
3. Click **Generate Sound Effect**.
""")
        image_upload = gr.File(label="Upload Image", type="binary")
        generate_description_button = gr.Button("Generate Description", variant="primary")
        caption_display = gr.Textbox(label="Image Description", interactive=False)
        generate_sound_button = gr.Button("Generate Sound Effect")
        audio_output = gr.Audio(label="Generated Sound Effect")
        gr.Markdown("""
## 👥 Contribute & Support
For support, questions, or to contribute, please contact us at
[[email protected]](mailto:[email protected]).
Support our work and get involved by donating through
[Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
""")
    # --- Event Handlers ---
    def update_caption(image_bytes):
        """Wrapper for the Generate Description button click."""
        if image_bytes is None:
            return "Please upload an image first."
        description, _ = analyze_image_with_free_model(image_bytes)
        return description

    def generate_sound(description):
        """Wrapper for the Generate Sound Effect button click."""
        if not description or description.startswith("Error"):
            gr.Warning("Cannot generate sound without a valid description!")
            return None
        audio_path = get_audioldm_from_caption(description)
        if audio_path is None:
            # gr.Error is an exception class; it must be raised to show up in the UI
            raise gr.Error("Failed to generate audio. Please check the logs.")
        return audio_path
    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display,
    )
    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output,
    )
    # Visitor badge for the Space
    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
# Launch the app. `share=True` is not needed on Spaces.
demo.launch() |