import gradio as gr
import numpy as np
from transformers import pipeline

# Image-captioning pipeline: BLIP generates a short text description of the uploaded image.
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Text-to-speech pipeline: VITS (trained on LJ Speech) turns the caption into audio.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

def launch(input_image):
    # Generate a caption for the uploaded image.
    caption = pipe(input_image)[0]["generated_text"]

    # Narrate the caption; the pipeline returns the waveform and its sampling rate.
    audio_output = narrator(caption)
    # Flatten to 1-D in case the pipeline returns a (1, num_samples) batch.
    audio_array = np.squeeze(np.array(audio_output["audio"]))
    sample_rate = audio_output["sampling_rate"]

    # gr.Audio with type="numpy" expects a (sample_rate, data) tuple.
    return (sample_rate, audio_array), caption


iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=[
        gr.Audio(type="numpy", label="Narrated Audio"),
        gr.Textbox(label="Extracted Caption"),
    ],
    title="SeeSay",
    description="Upload an image to hear its context narrated aloud.",
)

# share=True also serves the app through a temporary public Gradio link; drop it for local-only use.
iface.launch(share=True)