import gradio as gr
from transformers import pipeline
import numpy as np

# Load image captioning model
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# Load a text-to-speech model. Note: Whisper (openai/whisper-large-v3-turbo) is a
# speech-to-text model and cannot synthesize audio, so an actual TTS model such as
# MMS-TTS is used here instead.
speech_model = pipeline("text-to-speech", model="facebook/mms-tts-eng")
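# Alternative sketch: SpeechT5 also works with the text-to-speech pipeline, but it
# additionally needs a speaker embedding passed via forward_params. The snippet
# below follows the Transformers docs example; the xvector index is illustrative.
#
#   import torch
#   from datasets import load_dataset
#   speech_model = pipeline("text-to-speech", model="microsoft/speecht5_tts")
#   xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#   speaker_embedding = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)
#   speech = speech_model(caption, forward_params={"speaker_embeddings": speaker_embedding})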
def process_image(image):
    try:
        # Step 1: Generate a caption from the image
        caption = caption_model(image)[0]["generated_text"]
        # Step 2: Convert the caption to speech
        speech = speech_model(caption)
        # The pipeline returns {"audio": array, "sampling_rate": int}; squeeze in
        # case the audio comes back with a leading channel dimension
        audio = np.squeeze(np.array(speech["audio"]))
        rate = speech["sampling_rate"]
        # gr.Audio expects numpy audio as a (sample_rate, data) tuple
        return (rate, audio), caption
    except Exception as e:
        # Return no audio on failure and surface the error in the caption textbox
        return None, f"Error generating caption or audio: {e}"
# Gradio Interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption")
    ],
    title="SeeSay",
    description="Upload an image to generate a caption and hear it described with speech."
)

iface.launch()
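# Quick local check without the web UI, run in place of iface.launch() when
# debugging (a sketch; "example.jpg" is an illustrative filename, not part of
# the app):
#
#   from PIL import Image
#   (rate, audio), caption = process_image(Image.open("example.jpg"))
#   print(f"Caption: {caption!r} ({audio.shape[0]} samples at {rate} Hz)")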