Bils's picture
Update app.py
1ea1538 verified
raw
history blame
5.76 kB
import spaces
import os
import tempfile
import gradio as gr
from dotenv import load_dotenv
import torch
from scipy.io.wavfile import write
from diffusers import DiffusionPipeline
from transformers import pipeline
from pathlib import Path
from PIL import Image # <-- Required for new model
import io # <-- Required for new model
# --- Setup Models and Device ---
# Load environment variables through python-dotenv before reading any of them.
load_dotenv()
# NOTE(review): hf_token is read here but not referenced anywhere else in the
# visible code — confirm whether it is still needed.
hf_token = os.getenv("HF_TKN")

# Run on CUDA when torch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Image-to-text pipeline that produces the caption for an uploaded image.
captioning_pipeline = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)
print("Image captioning pipeline loaded.")

# Text-to-audio diffusion pipeline. Half precision is requested only when
# running on CUDA; on the CPU the weights are loaded as float32.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    torch_dtype=torch.float32 if device != "cuda" else torch.float16,
)
print("Audio generation pipeline loaded.")
# --- Core Functions ---
@spaces.GPU(duration=120)
def analyze_image_with_free_model(image_file_bytes):
    """Generate a text caption for an image supplied as raw bytes.

    The bytes are decoded in memory with Pillow, converted to RGB and passed
    to the module-level ``captioning_pipeline``.

    Args:
        image_file_bytes: Encoded image data in any format Pillow can open.

    Returns:
        A ``(message, is_error)`` tuple. On success ``message`` is the
        stripped caption and ``is_error`` is ``False``. On any failure
        ``message`` is a human-readable explanation and ``is_error`` is
        ``True``; exceptions are caught and reported, never propagated.
    """
    try:
        print("Received image bytes, opening with Pillow...")
        # Decode straight from memory; no temporary file is involved.
        buffer = io.BytesIO(image_file_bytes)
        rgb_image = Image.open(buffer).convert("RGB")

        print("Generating caption...")
        predictions = captioning_pipeline(rgb_image)

        # The code below indexes the result, so require a non-empty list.
        if not (predictions and isinstance(predictions, list)):
            print("ERROR: Caption generation returned invalid results.")
            return "Error: Could not generate caption.", True

        text = predictions[0].get("generated_text", "").strip()
        if text:
            print(f"Successfully generated caption: {text}")
            return text, False

        print("ERROR: Generated caption is empty.")
        return "No caption was generated.", True
    except Exception as exc:
        print(f"!!!!!! EXCEPTION in analyze_image_with_free_model: {exc}")
        return f"Error analyzing image: {exc}", True
@spaces.GPU(duration=120)
def get_audioldm_from_caption(caption):
    """Generate a sound effect for a text caption and save it as a WAV file.

    The module-level ``pipe`` is moved to ``device`` for the duration of the
    generation and is always moved back to the CPU afterwards, even when the
    generation fails.

    Args:
        caption: Text prompt describing the sound to generate.

    Returns:
        The path of a newly created temporary ``.wav`` file, or ``None`` if
        anything went wrong (the exception is printed, not propagated). The
        file is created with ``delete=False``; this function does not remove
        it on success.
    """
    try:
        try:
            # Move the large audio pipeline to the GPU only while it is used.
            pipe.to(device)
            print(f"Generating audio for prompt: '{caption}'")
            audio_output = pipe(
                prompt=caption,
                num_inference_steps=25,  # Fewer steps for faster generation
                guidance_scale=7.0,
            ).audios[0]
        finally:
            # Release GPU memory on the failure path as well as on success.
            pipe.to("cpu")

        # Reserve a unique path and close the handle before writing: the file
        # is reopened by name below, which fails on platforms that do not
        # allow a second open of a still-open temporary file.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            wav_path = temp_wav.name

        print(f"Saving audio to temporary file: {wav_path}")
        try:
            # write(file, sample_rate, data)
            write(wav_path, 16000, audio_output)
        except Exception:
            # Do not leave an empty or partial file behind when saving fails.
            try:
                os.remove(wav_path)
            except OSError:
                pass
            raise
        return wav_path
    except Exception as e:
        print(f"!!!!!! EXCEPTION in get_audioldm_from_caption: {e}")
        return None
# --- Gradio Interface ---
css = """
#col-container{ margin: 0 auto; max-width: 800px; }
"""

# Layout and event wiring for the app. The triple-quoted HTML/Markdown bodies
# are deliberately kept flush-left so their text content is unchanged.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
<h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
<p style="text-align: center;">
⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
</p>
""")
        gr.Markdown("""
1. **Upload an image**.
2. Click **Generate Description**.
3. Click **Generate Sound Effect**.
""")
        image_upload = gr.File(label="Upload Image", type="binary")
        generate_description_button = gr.Button("Generate Description", variant="primary")
        caption_display = gr.Textbox(label="Image Description", interactive=False)
        generate_sound_button = gr.Button("Generate Sound Effect")
        audio_output = gr.Audio(label="Generated Sound Effect")
        # NOTE(review): the contact address below reads "[email protected]",
        # which looks like an obfuscated placeholder rather than a real
        # address — confirm and restore the intended e-mail.
        gr.Markdown("""
## 👥 Contribute & Support
For support, questions, or to contribute, please contact us at
[[email protected]](mailto:[email protected]).
Support our work and get involved by donating through
[Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
""")

    # --- Event Handlers ---
    def update_caption(image_bytes):
        """Return the text to show in the description box for an upload.

        Args:
            image_bytes: Content of the uploaded file, or ``None`` when
                nothing has been uploaded.

        Returns:
            The generated caption, or a message explaining why no caption
            is available. The error flag from the captioner is discarded;
            only the message is displayed.
        """
        if image_bytes is None:
            return "Please upload an image first."
        description, _ = analyze_image_with_free_model(image_bytes)
        return description

    def generate_sound(description):
        """Generate audio for the current description-box text.

        Args:
            description: Current content of the description textbox.

        Returns:
            Path of the generated WAV file, or ``None`` when the text is not
            a usable caption (a warning is shown in that case).

        Raises:
            gr.Error: If audio generation fails, so the failure is shown to
                the user instead of being silently dropped.
        """
        # The description box doubles as a status display, so it may hold one
        # of the placeholder/error messages instead of a real caption. None of
        # them may be used as an audio prompt.
        non_caption_prefixes = (
            "Error",
            "No caption was generated.",
            "Please upload an image first.",
        )
        if not description or description.startswith(non_caption_prefixes):
            gr.Warning("Cannot generate sound without a valid description!")
            return None
        audio_path = get_audioldm_from_caption(description)
        if audio_path is None:
            # gr.Error is an exception: it only reaches the UI when raised.
            raise gr.Error("Failed to generate audio. Please check the logs.")
        return audio_path

    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display,
    )
    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output,
    )
    # NOTE(review): the original indentation was lost; confirm whether this
    # badge belongs inside the centered column or directly under the Blocks.
    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')

# Launch the app. `share=True` is not needed on Spaces.
demo.launch()