import gradio as gr
import numpy as np
from transformers import pipeline

# Image-captioning model (BLIP)
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Text-to-speech model (Bark)
synthesiser = pipeline("text-to-speech", model="suno/bark")
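# If memory or startup time is a concern, "suno/bark-small" is a lighter
# published checkpoint that can be swapped in as a drop-in replacement.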

def launch(input_image):
    # Step 1: generate a caption for the uploaded image
    caption = captioner(input_image)[0]["generated_text"]
    # Step 2: synthesise expressive speech from the caption
    speech = synthesiser(caption, forward_params={"do_sample": True})
    # Drop any extra batch/channel dimension so Gradio gets a 1-D waveform
    audio = np.squeeze(speech["audio"])
    rate = speech["sampling_rate"]
    # gr.Audio(type="numpy") expects a (sample_rate, data) tuple
    return (rate, audio), caption

# Gradio UI: image in, narrated audio and caption out
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=[
        gr.Audio(type="numpy", label="Narrated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
    title="🎙️ SeeSay",
    description="Upload an image to hear it described with expressive speech.",
)

iface.launch(share=True)
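
# Requires `gradio`, `transformers`, `torch`, and `numpy`. When hosted on
# Hugging Face Spaces the app is launched automatically, so share=True only
# matters when running this script locally.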