File size: 1,116 Bytes
f30b843
e363033
8461775
78c45fd
83cd235
 
5152850
d9b8ff3
5152850
d9b8ff3
83cd235
ba2d445
520c499
83cd235
 
 
 
ba2d445
83cd235
 
 
 
 
ba2d445
520c499
 
ba2d445
83cd235
520c499
 
83cd235
520c499
83cd235
 
5c86456
 
cbd5ff0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import espeak  # NOTE(review): appears unused — eSpeak is configured below via phonemizer's EspeakWrapper; verify this import is needed
import gradio as gr
import numpy as np
from transformers import pipeline

# Caption model: produces a short text description of an input image.
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

from phonemizer.backend.espeak.wrapper import EspeakWrapper

# Point phonemizer at the eSpeak NG shared library (Windows install path).
# Raw string fixes the invalid escape sequences (\P, \e, \l) the plain
# literal produced — a SyntaxWarning today and a SyntaxError in future Python.
EspeakWrapper.set_library(r'C:\Program Files\eSpeak NG\libespeak-ng.dll')

# TTS model: synthesizes speech audio from the caption text.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

def launch(input_image):
    """Caption an image, then synthesize the caption as speech.

    Args:
        input_image: PIL image supplied by the Gradio ``Image`` component.

    Returns:
        tuple: ``((sample_rate, audio_array), caption)`` — the first item
        feeds ``gr.Audio(type="numpy")``, which expects ``(sample_rate,
        data)``; the second fills the caption ``Textbox``.
    """
    # Step 1: extract a caption from the image.
    caption = pipe(input_image)[0]["generated_text"]

    # Step 2: convert the caption to audio.
    audio_output = narrator(caption)
    # The TTS pipeline typically returns a (1, n) array — squeeze to the
    # 1-D waveform Gradio expects. (Assumes mono output; confirm for model.)
    audio_array = np.array(audio_output["audio"]).squeeze()
    sample_rate = audio_output["sampling_rate"]

    # Step 3: return audio + caption. Gradio's numpy Audio format is
    # (sample_rate, data); the original returned them reversed.
    return (sample_rate, audio_array), caption

# Build the Gradio UI: one image input, two outputs (narrated audio + caption).
image_input = gr.Image(type='pil', label="Upload Image")
audio_output = gr.Audio(type="numpy", label="Narrated Audio")
caption_output = gr.Textbox(label="Extracted Caption")

iface = gr.Interface(
    fn=launch,
    inputs=image_input,
    outputs=[audio_output, caption_output],
    title="SeeSay",
    description="Upload an image to hear its context narrated aloud.",
)

# share=True exposes a temporary public URL in addition to the local server.
iface.launch(share=True)