# SeeSay — Gradio Space: caption an uploaded image (BLIP) and narrate the
# caption aloud (VITS text-to-speech).
# (Web-scrape residue — Space status, file size, commit hashes, line-number
# gutter — removed from this header.)
import gradio as gr
import numpy as np  # was missing: launch() calls np.array() — NameError without it
from transformers import pipeline

import espeak  # NOTE(review): never used below — confirm this import is needed at all

# Image-captioning model: produces a text description of an uploaded image.
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

from phonemizer.backend.espeak.wrapper import EspeakWrapper

# Raw string required: '\P' and '\l' are invalid escape sequences in a normal
# literal (DeprecationWarning now, SyntaxError in future Python versions).
EspeakWrapper.set_library(r'C:\Program Files\eSpeak NG\libespeak-ng.dll')

# Text-to-speech model used to narrate the generated caption.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
def launch(input_image):
    """Caption an image and synthesize narration for the caption.

    Args:
        input_image: image from the Gradio upload widget (PIL image,
            per the ``gr.Image(type='pil')`` input below).

    Returns:
        ``((audio_array, sample_rate), caption)`` — the tuple shape expected
        by the ``gr.Audio(type="numpy")`` and ``gr.Textbox`` outputs.
    """
    # Local import: the original file never imported numpy, so np.array()
    # below would raise NameError at call time without this.
    import numpy as np

    # Step 1: extract a caption from the image.
    caption = pipe(input_image)[0]["generated_text"]

    # Step 2: convert the caption to audio via the TTS pipeline.
    audio_output = narrator(caption)
    audio_array = np.array(audio_output["audio"])
    sample_rate = audio_output["sampling_rate"]

    # Step 3: return audio (numpy format) plus the caption text.
    return (audio_array, sample_rate), caption
# Keyword arguments throughout to avoid positional-ordering mistakes.
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type='pil', label="Upload Image"),
    outputs=[
        # Matches launch()'s return: (audio tuple, caption string).
        gr.Audio(type="numpy", label="Narrated Audio"),
        gr.Textbox(label="Extracted Caption"),
    ],
    title="SeeSay",
    description="Upload an image to hear its context narrated aloud.",
)

# Fixed: stray trailing '|' (scrape artifact) made this line a SyntaxError.
iface.launch(share=True)