import gradio as gr
from transformers import pipeline
import numpy as np

# Load image captioning model
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# Load a text-to-speech model. Note: Whisper (openai/whisper-large-v3-turbo) is a
# speech-to-text model and cannot synthesize audio, so an actual TTS model such as
# MMS-TTS is used here instead.
speech_model = pipeline("text-to-speech", model="facebook/mms-tts-eng")
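# Alternative sketch: SpeechT5 also works with the text-to-speech pipeline, but it
# additionally needs a speaker embedding passed via forward_params. The snippet
# below follows the Transformers docs example; the xvector index is illustrative.
#
#   import torch
#   from datasets import load_dataset
#   speech_model = pipeline("text-to-speech", model="microsoft/speecht5_tts")
#   xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#   speaker_embedding = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)
#   speech = speech_model(caption, forward_params={"speaker_embeddings": speaker_embedding})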
def process_image(image):
    try:
        # Step 1: Generate a caption from the image
        caption = caption_model(image)[0]["generated_text"]
        # Step 2: Convert the caption to speech
        speech = speech_model(caption)
        # The pipeline returns {"audio": array, "sampling_rate": int}; squeeze in
        # case the audio comes back with a leading channel dimension
        audio = np.squeeze(np.array(speech["audio"]))
        rate = speech["sampling_rate"]
        # gr.Audio expects numpy audio as a (sample_rate, data) tuple
        return (rate, audio), caption
    except Exception as e:
        # Return no audio on failure and surface the error in the caption textbox
        return None, f"Error generating caption or audio: {e}"
# Gradio Interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption")
    ],
    title="SeeSay",
    description="Upload an image to generate a caption and hear it described with speech."
)

iface.launch()
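# Quick local check without the web UI, run in place of iface.launch() when
# debugging (a sketch; "example.jpg" is an illustrative filename, not part of
# the app):
#
#   from PIL import Image
#   (rate, audio), caption = process_image(Image.open("example.jpg"))
#   print(f"Caption: {caption!r} ({audio.shape[0]} samples at {rate} Hz)")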