Spaces:

preston-cell
/

image-text-to-text

Running

File size: 1,123 Bytes

f30b843
629e04f
608498c
629e04f
 
83cd235
629e04f
 
83cd235
629e04f
 
 
 
83cd235
629e04f
 
26dbd13
629e04f
 
 
 
26dbd13
629e04f
26dbd13
 
 
629e04f
 
 
 
 
 
 
5c86456
629e04f

import gradio as gr
from transformers import pipeline

# Load a text-to-speech model used to voice the generated caption.
# NOTE: Whisper checkpoints are speech-recognition (audio -> text) models and
# cannot synthesize audio, so a real TTS checkpoint is loaded for this task.
speech_model = pipeline("text-to-speech", model="suno/bark-small")

# Load the BLIP model for image captioning
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

def generate_caption_and_speech(image):
    """Caption an image and synthesize speech that reads the caption aloud.

    Args:
        image: The uploaded image; it is passed unchanged to ``caption_model``.

    Returns:
        A ``(audio, caption)`` pair, in the order of the interface outputs.
        On success ``audio`` is a ``(sampling_rate, waveform)`` tuple and
        ``caption`` is the generated text. On failure ``audio`` is ``None``
        and ``caption`` holds an ``"Error: ..."`` message, so the problem is
        shown in the text output instead of being treated as an audio path.
    """
    # Local import: the file has no module-level numpy import.
    import numpy as np

    if image is None:
        return None, "Error: please upload an image first."

    try:
        # Generate a caption from the image
        caption = caption_model(image)[0]["generated_text"]

        # Generate speech using the caption
        speech = speech_model(caption)

        # Drop singleton axes so mono audio becomes a flat 1-D waveform.
        waveform = np.squeeze(np.asarray(speech["audio"]))
        if waveform.ndim == 2:
            # Presumably (channels, samples) as produced by the pipeline;
            # flip to (samples, channels) for the audio component — confirm.
            waveform = waveform.T

        # The audio output needs the sampling rate alongside the samples.
        return (speech["sampling_rate"], waveform), caption
    except Exception as e:
        # UI boundary: report the failure in the caption box, with no audio.
        return None, f"Error: {e}"

# Declare the UI components up front, then wire them into the Gradio app.
image_input = gr.Image(type='pil', label="Upload Image")
audio_output = gr.Audio(type="filepath", label="Generated Audio")
caption_output = gr.Textbox(label="Generated Caption")

# One image in; audio and caption out, in the order the callback returns them.
iface = gr.Interface(
    fn=generate_caption_and_speech,
    inputs=image_input,
    outputs=[audio_output, caption_output],
    title="SeeSay: Image to Speech",
    description="Upload an image to generate a caption and hear it described with speech.",
)

# Start the web server; share=True also requests a public share link.
iface.launch(share=True)