import gradio as gr
import numpy as np

# Use pipelines as high-level helpers
from transformers import pipeline

pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
def launch(input_image):
    # Step 1: Generate a caption for the image
    caption = pipe(input_image)[0]['generated_text']
    # Step 2: Generate speech from the caption
    audio_output = narrator(caption)
    # Step 3: Return the audio in the format Gradio expects
    audio_data = audio_output["audio"]
    sampling_rate = audio_output["sampling_rate"]
    # Gradio's numpy audio format is (sampling_rate, numpy_array);
    # squeeze in case the pipeline returns a (1, samples) array
    return (sampling_rate, np.array(audio_data).squeeze())
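
# Note: the text-to-speech pipeline returns a dict holding a raw waveform and
# its sampling rate; the array shape ((samples,) vs. (1, samples)) can vary by
# model and transformers version, hence the squeeze above. A quick sanity
# check (an assumed snippet, not part of the original app):
#
#   out = narrator("hello world")
#   print(type(out["audio"]), out["audio"].shape, out["sampling_rate"])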
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type='pil'),
    outputs=gr.Audio(type="numpy", label="Narrated Output"),
    title="SeeSay",
    description="Upload an image to hear its context narrated aloud."
)
iface.launch()
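
# A minimal sketch for exercising the function without the web UI
# ("example.jpg" is a hypothetical local file, not shipped with this app):
#
#   from PIL import Image
#   sampling_rate, audio = launch(Image.open("example.jpg"))
#   print(sampling_rate, audio.shape)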