Hugging Face Space — image-captioning + text-to-speech demo (Space status: Running).
import gradio as gr
import torch
from transformers import pipeline

# BLIP model: generates a text caption from an input image.
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Text-to-speech model used to read the caption aloud.
# NOTE(review): the original code loaded "openai/whisper-large-v3-turbo" here,
# but Whisper is a speech-RECOGNITION (audio -> text) model and cannot serve
# the "text-to-speech" pipeline task; a genuine TTS checkpoint is required.
pipe = pipeline("text-to-speech", model="facebook/mms-tts-eng")
def launch(input_image):
    """Caption an image, then synthesize speech for that caption.

    Args:
        input_image: PIL image supplied by the Gradio Image component.

    Returns:
        tuple: ``((sample_rate, audio_array), caption)`` on success, or
        ``(None, error_message)`` if captioning or synthesis fails.
    """
    try:
        # Step 1: generate a caption from the image.
        out = captioner(input_image)
        caption = out[0]['generated_text']

        # Step 2: synthesize speech from the caption.
        speech = pipe(caption)
        # Some TTS pipelines return the waveform with a leading batch axis
        # (shape (1, n)) — squeeze it so Gradio treats it as mono audio.
        # TODO(review): confirm the output shape for the configured model.
        audio_data = speech['audio'].squeeze()
        sample_rate = speech['sampling_rate']

        # gr.Audio(type="numpy") expects (sample_rate, data); the original
        # returned (data, sample_rate), which breaks playback.
        return (sample_rate, audio_data), caption
    except Exception as e:
        # A string is not valid audio data, so return None for the audio
        # slot and surface the error text in the caption textbox instead.
        return None, f"Error in processing: {e}"
# Gradio UI: one image in, (audio, caption) out.
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(type="numpy", label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
    title="Whisper Image to Audio",
    description="Upload an image to generate a caption and hear it described with speech.",
)

# share=True exposes a temporary public URL (useful on hosted notebooks/Spaces).
iface.launch(share=True)