import gradio as gr
import numpy as np
from transformers import pipeline
from phonemizer.backend.espeak.wrapper import EspeakWrapper

# Point phonemizer at the eSpeak NG library (Windows path); a raw string
# keeps the backslashes from being read as escape sequences
EspeakWrapper.set_library(r'C:\Program Files\eSpeak NG\libespeak-ng.dll')

# Image-captioning pipeline (BLIP) and text-to-speech pipeline (VITS)
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
def launch(input_image):
    # Step 1: Extract a caption from the image
    caption = pipe(input_image)[0]["generated_text"]

    # Step 2: Convert the caption to speech; the TTS pipeline can return a
    # (1, n) array, so squeeze it down to 1-D for Gradio
    audio_output = narrator(caption)
    audio_array = np.squeeze(np.array(audio_output["audio"]))
    sample_rate = audio_output["sampling_rate"]

    # Step 3: gr.Audio with type="numpy" expects (sample_rate, data)
    return (sample_rate, audio_array), caption
# Keyword arguments keep the input/output wiring explicit
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type='pil', label="Upload Image"),
    outputs=[
        gr.Audio(type="numpy", label="Narrated Audio"),
        gr.Textbox(label="Extracted Caption")
    ],
    title="SeeSay",
    description="Upload an image to hear its context narrated aloud."
)

iface.launch(share=True)
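
# Optional local smoke test before publishing the Space: a minimal sketch
# assuming a local file "example.jpg" (hypothetical, not part of the app).
# Uncomment to verify the caption and audio round-trip without the UI:
#
# from PIL import Image
# (sample_rate, audio), caption = launch(Image.open("example.jpg"))
# print(caption, sample_rate, audio.shape)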