Bils committed on
Commit
1ea1538
·
verified ·
1 Parent(s): b67339a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -53
app.py CHANGED
@@ -8,78 +8,94 @@ from scipy.io.wavfile import write
8
  from diffusers import DiffusionPipeline
9
  from transformers import pipeline
10
  from pathlib import Path
11
- from PIL import Image # <-- ADDED THIS IMPORT
12
- import io # <-- ADDED THIS IMPORT
 
 
13
 
14
  load_dotenv()
15
  hf_token = os.getenv("HF_TKN")
16
 
17
- device_id = 0 if torch.cuda.is_available() else -1
 
 
18
 
19
  # Correctly initialize the modern, reliable captioning pipeline
20
  captioning_pipeline = pipeline(
21
  "image-to-text",
22
  model="Salesforce/blip-image-captioning-large",
23
- device=device_id
24
  )
 
25
 
26
- # Initialize the audio pipeline
27
  pipe = DiffusionPipeline.from_pretrained(
28
  "cvssp/audioldm2",
29
- use_auth_token=hf_token
30
  )
 
 
31
 
 
32
 
33
- # === THIS IS THE CORRECTED FUNCTION ===
34
  @spaces.GPU(duration=120)
35
  def analyze_image_with_free_model(image_file_bytes):
 
36
  try:
37
- # No more temp files!
38
  # Open the image data directly from memory using Pillow
39
- image = Image.open(io.BytesIO(image_file_bytes))
40
 
41
- # Pass the Pillow Image object directly to the pipeline. This is the robust method.
42
  results = captioning_pipeline(image)
43
 
44
  if not results or not isinstance(results, list):
 
45
  return "Error: Could not generate caption.", True
46
 
47
  caption = results[0].get("generated_text", "").strip()
48
  if not caption:
 
49
  return "No caption was generated.", True
 
 
50
  return caption, False
51
 
52
  except Exception as e:
53
- print(f"ERROR in analyze_image_with_free_model: {e}") # Print error to logs
54
  return f"Error analyzing image: {e}", True
55
 
56
-
57
  @spaces.GPU(duration=120)
58
  def get_audioldm_from_caption(caption):
 
59
  try:
60
- pipe.to("cuda")
 
 
 
61
  audio_output = pipe(
62
  prompt=caption,
63
- num_inference_steps=50,
64
- guidance_scale=7.5
65
- )
66
- pipe.to("cpu")
67
- audio = audio_output.audios[0]
68
 
 
 
 
69
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
70
- write(temp_wav.name, 16000, audio)
 
 
71
  return temp_wav.name
72
 
73
  except Exception as e:
74
- print(f"Error generating audio from caption: {e}")
75
  return None
76
 
77
- # --- Gradio UI (No changes needed here) ---
 
78
  css = """
79
- #col-container{
80
- margin: 0 auto;
81
- max-width: 800px;
82
- }
83
  """
84
 
85
  with gr.Blocks(css=css) as demo:
@@ -92,52 +108,42 @@ with gr.Blocks(css=css) as demo:
92
  """)
93
 
94
  gr.Markdown("""
95
- Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
96
- descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
97
-
98
- **💡 How it works:**
99
- 1. **Upload an image**: Choose an image that you'd like to analyze.
100
- 2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
101
- 3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
102
- sound effect that matches the image context.
103
-
104
- Enjoy the journey from visual to auditory sensation with just a few clicks!
105
  """)
106
 
107
  image_upload = gr.File(label="Upload Image", type="binary")
108
- generate_description_button = gr.Button("Generate Description")
109
  caption_display = gr.Textbox(label="Image Description", interactive=False)
110
  generate_sound_button = gr.Button("Generate Sound Effect")
111
  audio_output = gr.Audio(label="Generated Sound Effect")
112
 
113
  gr.Markdown("""
114
- ## 👥 How You Can Contribute
115
- We welcome contributions and suggestions for improvements. Your feedback is invaluable
116
- to the continuous enhancement of this application.
117
-
118
  For support, questions, or to contribute, please contact us at
119
120
-
121
  Support our work and get involved by donating through
122
  [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
123
  """)
124
-
125
- gr.Markdown("""
126
- ## 📢 Stay Connected
127
- This app is a testament to the creative possibilities that emerge when technology meets art.
128
- Enjoy exploring the auditory landscape of your images!
129
- """)
130
 
131
- # --- Gradio event handlers (I've updated the function called here) ---
132
- def update_caption(image_file_bytes):
133
- # We pass the bytes from the uploader directly to our corrected function
134
- description, _ = analyze_image_with_free_model(image_file_bytes)
 
 
 
135
  return description
136
 
137
  def generate_sound(description):
 
138
  if not description or description.startswith("Error"):
 
139
  return None
140
  audio_path = get_audioldm_from_caption(description)
 
 
141
  return audio_path
142
 
143
  generate_description_button.click(
@@ -153,6 +159,6 @@ with gr.Blocks(css=css) as demo:
153
  )
154
 
155
  gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
156
- html = gr.HTML()
157
 
158
- demo.launch(debug=True, share=True)
 
 
8
  from diffusers import DiffusionPipeline
9
  from transformers import pipeline
10
  from pathlib import Path
11
+ from PIL import Image # <-- Required for new model
12
+ import io # <-- Required for new model
13
+
14
+ # --- Setup Models and Device ---
15
 
16
  load_dotenv()
17
  hf_token = os.getenv("HF_TKN")
18
 
19
+ # Use GPU if available, otherwise CPU
20
+ device = "cuda" if torch.cuda.is_available() else "cpu"
21
+ print(f"Using device: {device}")
22
 
23
  # Correctly initialize the modern, reliable captioning pipeline
24
  captioning_pipeline = pipeline(
25
  "image-to-text",
26
  model="Salesforce/blip-image-captioning-large",
27
+ device=device
28
  )
29
+ print("Image captioning pipeline loaded.")
30
 
31
+ # Initialize the audio pipeline. Use float16 for less VRAM on GPU.
32
  pipe = DiffusionPipeline.from_pretrained(
33
  "cvssp/audioldm2",
34
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
35
  )
36
+ print("Audio generation pipeline loaded.")
37
+
38
 
39
+ # --- Core Functions ---
40
 
 
41
  @spaces.GPU(duration=120)
42
  def analyze_image_with_free_model(image_file_bytes):
43
+ """Takes image bytes and returns a caption."""
44
  try:
45
+ print("Received image bytes, opening with Pillow...")
46
  # Open the image data directly from memory using Pillow
47
+ image = Image.open(io.BytesIO(image_file_bytes)).convert("RGB")
48
 
49
+ print("Generating caption...")
50
  results = captioning_pipeline(image)
51
 
52
  if not results or not isinstance(results, list):
53
+ print("ERROR: Caption generation returned invalid results.")
54
  return "Error: Could not generate caption.", True
55
 
56
  caption = results[0].get("generated_text", "").strip()
57
  if not caption:
58
+ print("ERROR: Generated caption is empty.")
59
  return "No caption was generated.", True
60
+
61
+ print(f"Successfully generated caption: {caption}")
62
  return caption, False
63
 
64
  except Exception as e:
65
+ print(f"!!!!!! EXCEPTION in analyze_image_with_free_model: {e}")
66
  return f"Error analyzing image: {e}", True
67
 
 
68
  @spaces.GPU(duration=120)
69
  def get_audioldm_from_caption(caption):
70
+ """Takes a text caption and returns a filepath to a generated WAV file."""
71
  try:
72
+ # Move the large audio pipeline to the GPU only when it's being used
73
+ pipe.to(device)
74
+
75
+ print(f"Generating audio for prompt: '{caption}'")
76
  audio_output = pipe(
77
  prompt=caption,
78
+ num_inference_steps=25, # Fewer steps for faster generation
79
+ guidance_scale=7.0
80
+ ).audios[0]
 
 
81
 
82
+ # Move the pipeline back to CPU to free up GPU memory for others
83
+ pipe.to("cpu")
84
+
85
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
86
+ print(f"Saving audio to temporary file: {temp_wav.name}")
87
+ # write(file, sample_rate, data)
88
+ write(temp_wav.name, 16000, audio_output)
89
  return temp_wav.name
90
 
91
  except Exception as e:
92
+ print(f"!!!!!! EXCEPTION in get_audioldm_from_caption: {e}")
93
  return None
94
 
95
+ # --- Gradio Interface ---
96
+
97
  css = """
98
+ #col-container{ margin: 0 auto; max-width: 800px; }
 
 
 
99
  """
100
 
101
  with gr.Blocks(css=css) as demo:
 
108
  """)
109
 
110
  gr.Markdown("""
111
+ 1. **Upload an image**.
112
+ 2. Click **Generate Description**.
113
+ 3. Click **Generate Sound Effect**.
 
 
 
 
 
 
 
114
  """)
115
 
116
  image_upload = gr.File(label="Upload Image", type="binary")
117
+ generate_description_button = gr.Button("Generate Description", variant="primary")
118
  caption_display = gr.Textbox(label="Image Description", interactive=False)
119
  generate_sound_button = gr.Button("Generate Sound Effect")
120
  audio_output = gr.Audio(label="Generated Sound Effect")
121
 
122
  gr.Markdown("""
123
+ ## 👥 Contribute & Support
 
 
 
124
  For support, questions, or to contribute, please contact us at
125
 
126
  Support our work and get involved by donating through
127
  [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
128
  """)
 
 
 
 
 
 
129
 
130
+ # --- Event Handlers ---
131
+
132
+ def update_caption(image_bytes):
133
+ """Wrapper function for the button click."""
134
+ if image_bytes is None:
135
+ return "Please upload an image first."
136
+ description, _ = analyze_image_with_free_model(image_bytes)
137
  return description
138
 
139
  def generate_sound(description):
140
+ """Wrapper function for the button click."""
141
  if not description or description.startswith("Error"):
142
+ gr.Warning("Cannot generate sound without a valid description!")
143
  return None
144
  audio_path = get_audioldm_from_caption(description)
145
+ if audio_path is None:
146
+ gr.Error("Failed to generate audio. Please check the logs.")
147
  return audio_path
148
 
149
  generate_description_button.click(
 
159
  )
160
 
161
  gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
 
162
 
163
+ # Launch the app. `share=True` is not needed on Spaces.
164
+ demo.launch()