# app.py — Hugging Face Space by preston-cell (commit 1b7018a, 1.14 kB).
# Image-captioning + text-to-speech demo.
import gradio as gr
import torch
from transformers import pipeline
# Load the image-captioning pipeline (BLIP base: image -> text).
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Load a text-to-speech model for synthesizing the caption.
# NOTE: the original loaded openai/whisper-large-v3-turbo here, but Whisper is a
# speech-RECOGNITION (audio -> text) model and cannot synthesize speech; MMS-TTS
# is an actual text-to-speech checkpoint compatible with the "text-to-speech" task.
pipe = pipeline("text-to-speech", model="facebook/mms-tts-eng")
def launch(input_image):
    """Caption an image and synthesize the caption as speech.

    Args:
        input_image: PIL image supplied by the Gradio ``Image`` component.

    Returns:
        ``((sample_rate, audio_array), caption)`` on success, or
        ``(None, error_message)`` on failure so the Audio output stays empty
        instead of receiving a string (which the component cannot play).
    """
    try:
        # Step 1: generate a caption for the uploaded image.
        out = captioner(input_image)
        caption = out[0]['generated_text']

        # Step 2: synthesize speech from the caption.
        speech = pipe(caption)
        audio_data = speech['audio']
        sample_rate = speech['sampling_rate']

        # Gradio's numpy Audio format is (sample_rate, data) — the original
        # returned (data, sample_rate), which Gradio rejects.
        return (sample_rate, audio_data), caption
    except Exception as e:
        # Top-level UI boundary: surface the error text in the caption box,
        # and give the Audio component None rather than an error string.
        return None, f"Error in processing: {e}"
# Gradio UI wiring: one image in, (audio, caption) out.
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(type="numpy", label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
    # Original title referenced Whisper, which is a speech-recognition model
    # and does not perform the synthesis step — use an accurate title.
    title="Image to Audio Description",
    description="Upload an image to generate a caption and hear it described with speech.",
)

# Guard the launch so the module can be imported without starting a server.
if __name__ == "__main__":
    iface.launch(share=True)