Spaces:

preston-cell
/

image-text-to-text

Running

image-text-to-text / app.py

Update app.py

629e04f verified 5 months ago

1.12 kB

	import gradio as gr
	from transformers import pipeline

	# Load a text-to-speech model used to voice the generated caption.
	# NOTE: Whisper checkpoints (e.g. "openai/whisper-large-v3-turbo") are
	# speech-recognition models (audio -> text) and cannot serve the
	# "text-to-speech" task, so a dedicated TTS checkpoint is loaded instead.
	speech_model = pipeline("text-to-speech", model="facebook/mms-tts-eng")

	# Load the BLIP model for image captioning (image -> text).
	caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

	def generate_caption_and_speech(image):
	    """Caption an image and synthesize speech that reads the caption aloud.

	    Args:
	        image: The uploaded image, or ``None`` when nothing was uploaded.

	    Returns:
	        A ``(audio, caption)`` pair matching the interface's output order.
	        On success ``audio`` is a ``(sampling_rate, waveform)`` tuple and
	        ``caption`` is the generated text. On failure ``audio`` is ``None``
	        and ``caption`` carries a human-readable error message.
	    """
	    # Submitting the form without an image hands the function ``None``.
	    if image is None:
	        return None, "Please upload an image first."

	    try:
	        # Generate a caption from the image
	        caption = caption_model(image)[0]['generated_text']

	        # Generate speech using the caption
	        speech = speech_model(caption)

	        # The audio output needs the sampling rate alongside the samples.
	        # squeeze() drops a length-1 batch/channel axis if the pipeline
	        # returned one, so a mono waveform ends up one-dimensional.
	        waveform = speech["audio"].squeeze()
	        return (speech["sampling_rate"], waveform), caption
	    except Exception as e:
	        # Top-level UI boundary: surface the failure in the caption box.
	        # The audio slot must stay empty; a message string placed there
	        # would be interpreted as a file path.
	        return None, f"Error: {e}"

	# Wire the captioning/speech function into a simple Gradio UI:
	# one image input, and an audio player plus a caption textbox as outputs.
	iface = gr.Interface(
	    title="SeeSay: Image to Speech",
	    description="Upload an image to generate a caption and hear it described with speech.",
	    fn=generate_caption_and_speech,
	    inputs=gr.Image(type="pil", label="Upload Image"),
	    outputs=[
	        gr.Audio(type="filepath", label="Generated Audio"),
	        gr.Textbox(label="Generated Caption"),
	    ],
	)

	# Start the web server; share=True additionally requests a public link.
	iface.launch(share=True)