# Hugging Face Space status banner captured by the page scrape (not code):
# "Spaces: Running"
import gradio as gr
from transformers import pipeline

# Text-to-speech model (text -> waveform).
# Bug fix: the original loaded "openai/whisper-large-v3-turbo" here, but
# Whisper is a speech-RECOGNITION model (audio -> text) and cannot synthesize
# speech, so the "text-to-speech" task would fail. Use an actual TTS checkpoint.
speech_model = pipeline("text-to-speech", model="facebook/mms-tts-eng")

# BLIP model for image captioning (image -> text).
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
def generate_caption_and_speech(image):
    """Caption an image and synthesize the caption as speech.

    Args:
        image: PIL image supplied by the Gradio ``Image`` input.

    Returns:
        tuple: ``((sampling_rate, waveform), caption)`` — the first element is
        what ``gr.Audio`` expects for in-memory audio, the second fills the
        caption textbox. On failure returns ``(None, error_message)`` so each
        value still matches its output component.
    """
    try:
        # Image -> caption text.
        caption = caption_model(image)[0]['generated_text']
        # Caption -> speech. TTS pipelines return a dict with "audio"
        # (numpy waveform) and "sampling_rate".
        speech = speech_model(caption)
        # Bug fix: the original returned the bare array, dropping the sampling
        # rate that Gradio needs to play in-memory audio. squeeze() flattens a
        # possible leading channel dim — presumably (1, n_samples); TODO confirm.
        audio = speech["audio"].squeeze()
        return (speech["sampling_rate"], audio), caption
    except Exception as e:
        # Bug fix: the original returned str(e) as the *audio* value, which
        # gr.Audio cannot render. Surface the error in the textbox instead.
        return None, f"Error: {e}"
# Wire the captioning + TTS function into a simple Gradio UI.
iface = gr.Interface(
    fn=generate_caption_and_speech,
    inputs=gr.Image(type='pil', label="Upload Image"),
    outputs=[
        # Bug fix: the handler returns in-memory audio, not a path on disk,
        # so "numpy" (a (sampling_rate, waveform) tuple) is the matching type
        # rather than the original "filepath".
        gr.Audio(type="numpy", label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
    title="SeeSay: Image to Speech",
    description="Upload an image to generate a caption and hear it described with speech.",
)

# share=True exposes a temporary public URL in addition to the local server.
iface.launch(share=True)