import gradio as gr
from transformers import pipeline
import numpy as np

# Load image captioning model (BLIP)
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Load a text-to-speech model. Whisper (openai/whisper-large-v3-turbo) is a
# speech-recognition model and cannot synthesize speech, so an actual TTS
# checkpoint (facebook/mms-tts-eng, a VITS model) is substituted here so the
# app runs.
speech_model = pipeline("text-to-speech", model="facebook/mms-tts-eng")

def process_image(image):
    try:
        # Step 1: Generate caption from image
        caption = caption_model(image)[0]['generated_text']

        # Step 2: Convert the caption to speech
        speech = speech_model(caption)
        audio = np.squeeze(np.array(speech["audio"]))  # flatten (1, n) output to (n,)
        rate = speech["sampling_rate"]

        # gr.Audio expects a (sample_rate, data) tuple, in that order
        return (rate, audio), caption
    except Exception as e:
        # gr.Audio cannot render a string, so return no audio and surface the
        # error message in the caption textbox instead
        return None, f"Error generating caption or audio: {e}"
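
# A quick way to exercise process_image without the UI, as a sketch (assumes
# Pillow is installed and "example.jpg" exists on disk; neither is part of
# the original app):
#
#   from PIL import Image
#   (rate, audio), caption = process_image(Image.open("example.jpg"))
#   print(caption, rate, audio.shape)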

# Gradio Interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption")
    ],
    title="SeeSay",
    description="Upload an image to generate a caption and hear it described with speech."
)

iface.launch()
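
# Note: on a hosted runtime such as Hugging Face Spaces, launch() needs no
# arguments; from a local machine or notebook, iface.launch(share=True) can be
# used to get a temporary public link.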