import gradio as gr
from transformers import pipeline
from datasets import load_dataset
import torch
import numpy as np

# Load BLIP model for image captioning
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
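# Note: the image-to-text pipeline returns a list of dicts, e.g.
# [{"generated_text": "a dog sitting on a bench"}] (illustrative output, not a
# guaranteed caption), which is why process_image indexes [0]["generated_text"].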

# Load SpeechT5 model for text-to-speech
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")

# Load the speaker embedding once at startup so it is reused across requests
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
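# Note: index 7306 is the xvector used in the Transformers SpeechT5 docs
# (a US English female voice); any other row's 512-dim "xvector" from this
# dataset should work here as well.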

def process_image(image):
    try:
        # Generate a caption from the image
        caption = caption_model(image)[0]["generated_text"]
        # Convert the caption to speech
        speech = synthesiser(
            caption,
            forward_params={"speaker_embeddings": speaker_embedding}
        )
        # Prepare the audio data as a (sample_rate, samples) tuple for gr.Audio
        audio = np.array(speech["audio"])
        rate = speech["sampling_rate"]
        # Return both audio and caption
        return (rate, audio), caption
    except Exception as e:
        # On failure, show the error in the caption textbox and return no audio
        return None, f"Error: {e}"
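
# A quick way to sanity-check the pipeline without the UI (hypothetical
# snippet; assumes Pillow is available, which Gradio already depends on):
#   from PIL import Image
#   (rate, audio), caption = process_image(Image.new("RGB", (224, 224), "white"))
#   print(rate, audio.shape, caption)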

# Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption")
    ],
    title="SeeSay",
    description="Upload an image to generate a caption and hear it described with SpeechT5's speech synthesis."
)

iface.launch()
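
# On Spaces, launch() with no arguments is sufficient. When running this
# script locally, one option is to request a temporary public URL instead:
#   iface.launch(share=True)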