Spaces:

preston-cell
/

image-text-to-text

Running

image-text-to-text / app.py

Update app.py

f67b2c3 verified 5 months ago

1.35 kB

	import gradio as gr
	from transformers import pipeline
	import numpy as np
	import torch

	# Load the image-captioning model (BLIP); it turns the uploaded image into
	# a short English caption.
	captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

	# Run the speech model on the first CUDA device when one is available,
	# otherwise fall back to the CPU.
	device = "cuda:0" if torch.cuda.is_available() else "cpu"
	# Half precision only makes sense on GPU. NOTE: this value is not passed to
	# any pipeline in this section; it is kept so existing references still work.
	torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

	# Load the text-to-speech model.
	# The previous checkpoint ("openai/whisper-large-v3-turbo") is a speech
	# *recognition* model (audio -> text) and cannot be used with the
	# "text-to-speech" pipeline, so a real TTS checkpoint is used instead.
	model_id = "suno/bark-small"
	model = pipeline("text-to-speech", model=model_id, device=device)


	def process_image(input_image):
	    """Caption an image and synthesize speech for that caption.

	    Args:
	        input_image: The image supplied by the Gradio ``Image`` input
	            (``None`` when the user submits without uploading anything).

	    Returns:
	        A 2-tuple ``(audio, caption)`` matching the two declared outputs:
	        ``audio`` is ``(sample_rate, waveform)`` -- the order ``gr.Audio``
	        expects -- or ``None`` on failure; ``caption`` is the generated
	        caption, or the error message when something went wrong.
	    """
	    if input_image is None:
	        return None, "Please upload an image."

	    try:
	        # Step 1: Generate caption
	        caption = captioner(input_image)[0]['generated_text']

	        # Step 2: Convert caption to speech
	        speech_output = model(caption)
	        # The waveform may come back with a leading singleton axis
	        # (e.g. shape (1, n)); squeeze it to a plain 1-D signal.
	        audio_data = np.squeeze(
	            np.asarray(speech_output["audio"], dtype=np.float32)
	        )
	        sample_rate = speech_output["sampling_rate"]
	    except Exception as e:
	        # Top-level UI boundary: report the failure in the caption box.
	        # Two values are returned because the interface has two outputs;
	        # returning a single string would itself fail inside Gradio.
	        return None, str(e)

	    # gr.Audio takes (sample_rate, data), not (data, sample_rate).
	    return (sample_rate, audio_data), caption


	# Assemble the Gradio UI: one image input, and two outputs (the spoken
	# caption and its text), all wired to process_image.
	_image_input = gr.Image(type='pil', label="Upload Image")
	_speech_player = gr.Audio(label="Generated Speech")
	_caption_box = gr.Textbox(label="Generated Caption")

	iface = gr.Interface(
	    fn=process_image,
	    inputs=_image_input,
	    outputs=[_speech_player, _caption_box],
	    title="Image to Audio with Caption",
	    description="Upload an image to generate a caption and hear it described with speech.",
	)

	# Start the app; share=True additionally requests a public share link.
	iface.launch(share=True)