Spaces:

preston-cell
/

image-text-to-text

Sleeping

File size: 1,512 Bytes

f30b843
629e04f
602e80d
 
 
608498c
4f56a4b
ed4af8f
83cd235
602e80d
 
 
 
 
 
 
 
 
629e04f
602e80d
ed4af8f
602e80d
 
 
 
 
 
 
 
 
 
 
 
 
 
629e04f
602e80d
26dbd13
602e80d
 
26dbd13
602e80d
ed4af8f
602e80d
 
 
 
 
 
629e04f
5c86456
ed4af8f
602e80d

import logging

import gradio as gr
import numpy as np
import torch
from datasets import load_dataset
from transformers import pipeline

# Image-captioning pipeline backed by the BLIP base checkpoint
caption_model = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)

# Text-to-speech pipeline backed by SpeechT5
synthesiser = pipeline(task="text-to-speech", model="microsoft/speecht5_tts")

# Speaker x-vector: fetched once at start-up and reused for every request.
# unsqueeze adds a leading dimension of size 1 in front of the vector.
embeddings_dataset = load_dataset(
    path="Matthijs/cmu-arctic-xvectors",
    split="validation",
)
speaker_embedding = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]),
    dim=0,
)


def process_image(image):
    """Caption an image and synthesise the caption as speech.

    Args:
        image: The picture handed over by the image input component, or
            ``None`` (e.g. a submit with nothing uploaded).

    Returns:
        A ``(audio, caption)`` pair. On success ``audio`` is a
        ``(sampling_rate, samples)`` tuple and ``caption`` is the generated
        text. On failure ``audio`` is ``None`` and ``caption`` carries a
        message starting with ``"Error: "``; ordinary failures are reported
        this way rather than raised.
    """
    # Nothing to caption: answer right away instead of pushing ``None``
    # through the models and surfacing whatever exception that produces.
    if image is None:
        return None, "Error: no image provided"

    try:
        # Generate caption from the image
        caption = caption_model(image)[0]['generated_text']

        # Convert caption to speech using the preloaded speaker embedding
        speech = synthesiser(
            caption,
            forward_params={"speaker_embeddings": speaker_embedding}
        )

        # asarray avoids a copy when the pipeline already returns an ndarray
        audio = np.asarray(speech["audio"])
        rate = speech["sampling_rate"]

        # Return both audio and caption
        return (rate, audio), caption

    except Exception as e:
        # UI boundary: keep the app responsive and show the message to the
        # user, but record the full traceback so the failure is debuggable.
        logging.getLogger(__name__).exception("process_image failed")
        return None, f"Error: {e}"


# Gradio UI: one image in, synthesised speech plus the caption text out.
# Components are created in the same order as they appear in the interface.
image_input = gr.Image(type="pil", label="Upload an Image")
audio_output = gr.Audio(label="Generated Audio")
caption_output = gr.Textbox(label="Generated Caption")

iface = gr.Interface(
    fn=process_image,
    inputs=image_input,
    outputs=[audio_output, caption_output],
    title="SeeSay",
    description="Upload an image to generate a caption and hear it described with SpeechT5's speech synthesis.",
)

# Start serving the app
iface.launch()