Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Runtime error

App Files Files Community

Generate-Sound-Effects-from-Image / app.py

Bils

Update app.py

1ea1538 verified 14 days ago

raw

history blame

5.76 kB

	import spaces
	import os
	import tempfile
	import gradio as gr
	from dotenv import load_dotenv
	import torch
	from scipy.io.wavfile import write
	from diffusers import DiffusionPipeline
	from transformers import pipeline
	from pathlib import Path
	from PIL import Image # <-- Required for new model
	import io # <-- Required for new model

	# --- Setup Models and Device ---

	# Pull variables from a local .env file (if any) into the process environment.
	load_dotenv()
	# Hugging Face token taken from HF_TKN; it is read here but not referenced
	# anywhere else in the visible code.
	hf_token = os.environ.get("HF_TKN")

	# Prefer the GPU when torch reports one, otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"
	print(f"Using device: {device}")

	# BLIP captioning pipeline (image -> text), created directly on `device`.
	captioning_pipeline = pipeline(
	    "image-to-text",
	    model="Salesforce/blip-image-captioning-large",
	    device=device,
	)
	print("Image captioning pipeline loaded.")

	# AudioLDM2 text-to-audio pipeline. Half precision is requested only when a
	# GPU is present (to reduce VRAM use); on CPU full precision is used.
	pipe = DiffusionPipeline.from_pretrained(
	    "cvssp/audioldm2",
	    torch_dtype=torch.float32 if device == "cpu" else torch.float16,
	)
	print("Audio generation pipeline loaded.")


	# --- Core Functions ---

	@spaces.GPU(duration=120)
	def analyze_image_with_free_model(image_file_bytes):
	    """Caption an image supplied as raw encoded bytes.

	    Args:
	        image_file_bytes: The encoded image file contents (bytes).

	    Returns:
	        A ``(text, failed)`` tuple. On success ``text`` is the stripped
	        caption and ``failed`` is ``False``. On any failure ``text`` is a
	        human-readable message and ``failed`` is ``True``; exceptions are
	        caught and reported this way rather than propagated.
	    """
	    try:
	        print("Received image bytes, opening with Pillow...")
	        # Decode straight from memory and normalise to RGB.
	        buffer = io.BytesIO(image_file_bytes)
	        image = Image.open(buffer).convert("RGB")

	        print("Generating caption...")
	        results = captioning_pipeline(image)

	        # Happy path: a non-empty list whose first entry carries the text.
	        if results and isinstance(results, list):
	            caption = results[0].get("generated_text", "").strip()
	            if caption:
	                print(f"Successfully generated caption: {caption}")
	                return caption, False

	            print("ERROR: Generated caption is empty.")
	            return "No caption was generated.", True

	        print("ERROR: Caption generation returned invalid results.")
	        return "Error: Could not generate caption.", True

	    except Exception as e:
	        # Top-level boundary for the UI: report the failure instead of raising.
	        print(f"!!!!!! EXCEPTION in analyze_image_with_free_model: {e}")
	        return f"Error analyzing image: {e}", True

	@spaces.GPU(duration=120)
	def get_audioldm_from_caption(caption):
	    """Generate a sound effect for a text prompt and save it as a WAV file.

	    Args:
	        caption: Text prompt passed to the audio pipeline.

	    Returns:
	        Path (str) of a temporary ``.wav`` file holding the generated audio,
	        or ``None`` if anything went wrong (the exception is printed, not
	        raised). The file is created with ``delete=False`` and is not removed
	        by this function.
	    """
	    try:
	        try:
	            # Move the large audio pipeline to the target device only while
	            # it is actually being used.
	            pipe.to(device)

	            print(f"Generating audio for prompt: '{caption}'")
	            audio_output = pipe(
	                prompt=caption,
	                num_inference_steps=25,  # Fewer steps for faster generation
	                guidance_scale=7.0,
	            ).audios[0]
	        finally:
	            # Always hand the pipeline back to the CPU, even when generation
	            # fails, so a failed request does not leave it parked on the GPU.
	            pipe.to("cpu")

	        # Reserve a unique path, then close the handle before writing: writing
	        # by name to a file that is still open is not portable.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
	            wav_path = temp_wav.name

	        print(f"Saving audio to temporary file: {wav_path}")
	        # write(file, sample_rate, data); 16000 Hz is the rate this app
	        # assumes for AudioLDM2 output.
	        write(wav_path, 16000, audio_output)
	        return wav_path

	    except Exception as e:
	        # Top-level boundary for the UI: log and signal failure with None.
	        print(f"!!!!!! EXCEPTION in get_audioldm_from_caption: {e}")
	        return None

	# --- Gradio Interface ---

	# Centre the main column and cap its width.
	css = """
	#col-container{ margin: 0 auto; max-width: 800px; }
	"""

	with gr.Blocks(css=css) as demo:
	    with gr.Column(elem_id="col-container"):
	        gr.HTML("""
	        <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
	        <p style="text-align: center;">
	        ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
	        </p>
	        """)

	        gr.Markdown("""
	        1. Upload an image.
	        2. Click Generate Description.
	        3. Click Generate Sound Effect.
	        """)

	        image_upload = gr.File(label="Upload Image", type="binary")
	        generate_description_button = gr.Button("Generate Description", variant="primary")
	        caption_display = gr.Textbox(label="Image Description", interactive=False)
	        generate_sound_button = gr.Button("Generate Sound Effect")
	        audio_output = gr.Audio(label="Generated Sound Effect")

	        gr.Markdown("""
	        ## 👥 Contribute & Support
	        For support, questions, or to contribute, please contact us at
	        [[email protected]](mailto:[email protected]).
	        Support our work and get involved by donating through
	        [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
	        """)

	    # --- Event Handlers ---

	    def update_caption(image_bytes):
	        """Caption the uploaded image for the description textbox.

	        The textbox doubles as the prompt for sound generation, so only a
	        real caption is ever written into it. Problems are surfaced as a
	        Gradio warning and the textbox is cleared instead.
	        """
	        if image_bytes is None:
	            gr.Warning("Please upload an image first.")
	            return ""
	        description, failed = analyze_image_with_free_model(image_bytes)
	        if failed:
	            # `description` holds the failure message in this case.
	            gr.Warning(description)
	            return ""
	        return description

	    def generate_sound(description):
	        """Turn the current description into audio for the audio player.

	        Returns the generated file path, or None (after a warning) when
	        there is no description to work from. Raises gr.Error when audio
	        generation itself fails so the user sees the failure.
	        """
	        if not description or not description.strip():
	            gr.Warning("Cannot generate sound without a valid description!")
	            return None
	        audio_path = get_audioldm_from_caption(description)
	        if audio_path is None:
	            # gr.Error is an exception: it is only shown when raised.
	            raise gr.Error("Failed to generate audio. Please check the logs.")
	        return audio_path

	    generate_description_button.click(
	        fn=update_caption,
	        inputs=image_upload,
	        outputs=caption_display,
	    )

	    generate_sound_button.click(
	        fn=generate_sound,
	        inputs=caption_display,
	        outputs=audio_output,
	    )

	    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')

	# Launch the app. `share=True` is not needed on Spaces.
	demo.launch()