Spaces:

preston-cell
/

image-text-to-text

Sleeping

image-text-to-text / app.py

Update app.py

602e80d verified 4 months ago

1.51 kB

	import gradio as gr
	from transformers import pipeline
	from datasets import load_dataset
	import torch
	import numpy as np

	# Image captioning: BLIP (base checkpoint) behind the "image-to-text" task.
	caption_model = pipeline(
	    task="image-to-text",
	    model="Salesforce/blip-image-captioning-base",
	)

	# Speech synthesis: SpeechT5 behind the "text-to-speech" task.
	synthesiser = pipeline(
	    task="text-to-speech",
	    model="microsoft/speecht5_tts",
	)

	# Speaker x-vector: read once at start-up so every request reuses the same
	# tensor instead of touching the dataset again.
	embeddings_dataset = load_dataset(
	    "Matthijs/cmu-arctic-xvectors",
	    split="validation",
	)
	_speaker_xvector = embeddings_dataset[7306]["xvector"]
	# unsqueeze(0) prepends a dimension of size 1 to the tensor.
	speaker_embedding = torch.tensor(_speaker_xvector).unsqueeze(0)


	def process_image(image):
	    """Caption an image and synthesise the caption as speech.

	    Args:
	        image: The picture handed over by the Gradio image input, or None
	            when the request was submitted without an image.

	    Returns:
	        A 2-tuple ``(audio, text)``. On success ``audio`` is
	        ``(sampling_rate, samples)`` and ``text`` is the generated caption.
	        On failure ``audio`` is None and ``text`` is a message starting
	        with ``"Error: "``; any ``Exception`` raised while captioning or
	        synthesising is reported this way instead of being raised.
	    """
	    # Nothing to caption: answer with a clear message rather than letting
	    # the captioning model fail on a missing input.
	    if image is None:
	        return None, "Error: no image provided."

	    try:
	        # Generate caption from the image (first candidate only)
	        caption = caption_model(image)[0]['generated_text']

	        # Convert caption to speech using the preloaded speaker embedding
	        speech = synthesiser(
	            caption,
	            forward_params={"speaker_embeddings": speaker_embedding},
	        )

	        # Prepare audio data
	        audio = np.array(speech["audio"])
	        rate = speech["sampling_rate"]

	        # Return both audio and caption
	        return (rate, audio), caption

	    except Exception as e:
	        # UI boundary: show the failure in the caption box instead of
	        # raising out of the handler.
	        return None, f"Error: {str(e)}"


	# Gradio Interface: one image in, synthesised audio plus caption text out.
	image_input = gr.Image(type='pil', label="Upload an Image")
	audio_output = gr.Audio(label="Generated Audio")
	caption_output = gr.Textbox(label="Generated Caption")

	iface = gr.Interface(
	    fn=process_image,
	    inputs=image_input,
	    # Order matches the (audio, caption) tuple returned by process_image.
	    outputs=[audio_output, caption_output],
	    title="SeeSay",
	    description="Upload an image to generate a caption and hear it described with SpeechT5's speech synthesis.",
	)

	# Start serving the app.
	iface.launch()