preston-cell's picture
Update app.py
eade8cd verified
raw
history blame
959 Bytes
import gradio as gr
import numpy as np

# Use a pipeline as a high-level helper
from transformers import pipeline
# Image-captioning pipeline: maps a PIL image to a caption string
# (read back below via result[0]['generated_text']).
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# Text-to-speech pipeline: its output is read below via the
# "audio" and "sampling_rate" keys.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
def launch(input):
    """Caption an image, then narrate the caption as speech.

    Args:
        input: PIL image supplied by the Gradio ``Image`` component.

    Returns:
        A ``(numpy_array, sampling_rate)`` tuple, the format the Gradio
        ``Audio(type="numpy")`` output component expects.
    """
    # BUGFIX: the original body referenced `input_image`, a name that does
    # not exist — the parameter is `input` — so every call raised NameError.
    # Step 1: Describe the image with the captioning pipeline.
    caption = pipe(input)[0]['generated_text']
    # Step 2: Synthesize speech from the caption.
    audio_output = narrator(caption)
    # Step 3: Unpack the TTS result into Gradio's expected tuple form.
    audio_data = audio_output["audio"]
    sampling_rate = audio_output["sampling_rate"]
    # np.array is a no-op copy if audio_data is already an ndarray.
    return (np.array(audio_data), sampling_rate)
# Build and serve the UI: image in, narrated audio out.
# BUGFIX: the original passed `launch` both positionally and as `fn=launch`,
# which raises TypeError ("got multiple values for argument 'fn'") before the
# app ever starts — pass it exactly once.
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type='pil'),
    outputs=gr.Audio(type="numpy", label="Narrated Output"),
    title="SeeSay",
    description="Upload an image to hear its context narrated aloud.",
)
iface.launch()