File size: 5,762 Bytes
63f345f
041bd28
 
698d4cd
ccdc62f
041bd28
 
213e5d3
 
041bd28
1ea1538
 
 
 
213e5d3
 
698d4cd
213e5d3
1ea1538
 
 
9c06b1a
0def226
041bd28
 
0def226
1ea1538
041bd28
1ea1538
9c06b1a
1ea1538
041bd28
 
1ea1538
041bd28
1ea1538
 
041bd28
1ea1538
0def226
041bd28
0def226
1ea1538
2f15cbe
1ea1538
0def226
1ea1538
041bd28
1ea1538
0def226
 
ccdc62f
1ea1538
63f345f
ccdc62f
041bd28
 
1ea1538
63f345f
1ea1538
 
041bd28
 
81b2481
1ea1538
041bd28
81b2481
698d4cd
63f345f
1ea1538
24da5c3
1ea1538
 
 
 
041bd28
18fbeec
1ea1538
 
 
041bd28
1ea1538
 
 
041bd28
1ea1538
 
 
041bd28
 
4d9e689
1ea1538
698d4cd
4d9e689
1ea1538
 
172038e
1ea1538
172038e
 
698d4cd
 
 
63f345f
 
 
 
a4f881b
e18ae6e
041bd28
1ea1538
 
 
041bd28
18fbeec
041bd28
1ea1538
041bd28
 
 
18fbeec
041bd28
1ea1538
041bd28
 
 
 
 
0def226
1ea1538
 
 
 
 
 
 
041bd28
 
 
1ea1538
041bd28
1ea1538
041bd28
 
1ea1538
 
041bd28
2f15cbe
041bd28
 
 
 
698d4cd
8a09658
041bd28
 
 
 
e18ae6e
63f345f
 
a4f881b
1ea1538
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import spaces
import os
import tempfile
import gradio as gr
from dotenv import load_dotenv
import torch
from scipy.io.wavfile import write
from diffusers import DiffusionPipeline
from transformers import pipeline
from pathlib import Path
from PIL import Image  # <-- Required for new model
import io               # <-- Required for new model

# --- Setup Models and Device ---

load_dotenv()
# NOTE(review): hf_token is loaded here but never passed to either pipeline
# below — confirm whether the gated-model token is still needed.
hf_token = os.getenv("HF_TKN")

# Use GPU if available, otherwise CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Correctly initialize the modern, reliable captioning pipeline
# (BLIP-large, used by analyze_image_with_free_model to caption uploads).
captioning_pipeline = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large", 
    device=device
)
print("Image captioning pipeline loaded.")

# Initialize the audio pipeline. Use float16 for less VRAM on GPU.
# Left on CPU at load time; get_audioldm_from_caption moves it to `device`
# only while generating.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
print("Audio generation pipeline loaded.")


# --- Core Functions ---

@spaces.GPU(duration=120)
def analyze_image_with_free_model(image_file_bytes):
    """Takes image bytes and returns a caption.

    Returns a ``(text, is_error)`` tuple: the caption and ``False`` on
    success, or a human-readable error message and ``True`` on failure.
    """
    try:
        print("Received image bytes, opening with Pillow...")
        # Decode the raw upload bytes into an RGB image entirely in memory.
        pil_image = Image.open(io.BytesIO(image_file_bytes)).convert("RGB")

        print("Generating caption...")
        outputs = captioning_pipeline(pil_image)

        # The pipeline is expected to hand back a non-empty list of dicts.
        valid = bool(outputs) and isinstance(outputs, list)
        if not valid:
            print("ERROR: Caption generation returned invalid results.")
            return "Error: Could not generate caption.", True

        text = outputs[0].get("generated_text", "").strip()
        if text:
            print(f"Successfully generated caption: {text}")
            return text, False

        print("ERROR: Generated caption is empty.")
        return "No caption was generated.", True

    except Exception as e:
        print(f"!!!!!! EXCEPTION in analyze_image_with_free_model: {e}")
        return f"Error analyzing image: {e}", True

@spaces.GPU(duration=120)
def get_audioldm_from_caption(caption):
    """Takes a text caption and returns a filepath to a generated WAV file.

    Returns None if audio generation fails for any reason; errors are
    logged rather than raised so the Gradio handler can show a warning.
    """
    try:
        # Move the large audio pipeline to the GPU only when it's being used
        pipe.to(device)
        try:
            print(f"Generating audio for prompt: '{caption}'")
            audio_output = pipe(
                prompt=caption,
                num_inference_steps=25,  # Fewer steps for faster generation
                guidance_scale=7.0
            ).audios[0]
        finally:
            # Always move the pipeline back to CPU — even when generation
            # raises — so one failed request doesn't leave GPU memory pinned.
            pipe.to("cpu")

        # delete=False so the file outlives this handler; Gradio serves it
        # to the browser by path afterwards.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            print(f"Saving audio to temporary file: {temp_wav.name}")
            # scipy write(file, sample_rate, data); AudioLDM2 emits 16 kHz audio.
            write(temp_wav.name, 16000, audio_output)
            return temp_wav.name

    except Exception as e:
        print(f"!!!!!! EXCEPTION in get_audioldm_from_caption: {e}")
        return None

# --- Gradio Interface ---

# Page-level CSS: center the main column and cap its width.
css = """
#col-container{ margin: 0 auto; max-width: 800px; }
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
    <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
    <p style="text-align: center;">
        ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
    </p>
        """)

    # NOTE(review): everything from here down is dedented out of the Column
    # above, so it renders in the Blocks root rather than the centered
    # column — confirm this layout is intended.
    gr.Markdown("""
    1. **Upload an image**.
    2. Click **Generate Description**.
    3. Click **Generate Sound Effect**.
    """)

    # type="binary" delivers the upload as raw bytes, which update_caption
    # forwards straight to analyze_image_with_free_model.
    image_upload = gr.File(label="Upload Image", type="binary")
    generate_description_button = gr.Button("Generate Description", variant="primary")
    caption_display = gr.Textbox(label="Image Description", interactive=False)
    generate_sound_button = gr.Button("Generate Sound Effect")
    audio_output = gr.Audio(label="Generated Sound Effect")

    gr.Markdown("""
    ## 👥 Contribute & Support
    For support, questions, or to contribute, please contact us at 
    [[email protected]](mailto:[email protected]).
    Support our work and get involved by donating through 
    [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
    """)
    
    # --- Event Handlers ---

    def update_caption(image_bytes):
        """Wrapper function for the button click."""
        # gr.File yields None before anything has been uploaded.
        if image_bytes is None:
            return "Please upload an image first."
        # Second tuple element is an error flag; the textbox only shows text.
        description, _ = analyze_image_with_free_model(image_bytes)
        return description

    def generate_sound(description):
        """Wrapper function for the button click."""
        # Refuse empty captions and the "Error..." strings that
        # analyze_image_with_free_model returns on failure.
        if not description or description.startswith("Error"):
            gr.Warning("Cannot generate sound without a valid description!")
            return None
        audio_path = get_audioldm_from_caption(description)
        if audio_path is None:
            gr.Error("Failed to generate audio. Please check the logs.")
        return audio_path

    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display
    )

    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output
    )
    
    # Visitor-count badge for the hosted Space.
    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')

# Launch the app. `share=True` is not needed on Spaces.
demo.launch()