Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,36 +1,33 @@
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline
|
3 |
|
4 |
-
|
|
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
EspeakWrapper.set_library('C:\Program Files\eSpeak NG\libespeak-ng.dll')
|
9 |
-
|
10 |
-
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
|
11 |
|
12 |
def launch(input_image):
|
13 |
-
# Step 1:
|
14 |
-
caption =
|
15 |
|
16 |
-
# Step 2:
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
|
21 |
-
|
22 |
-
return (audio_array, sample_rate), caption
|
23 |
|
24 |
-
#
|
25 |
iface = gr.Interface(
|
26 |
fn=launch,
|
27 |
-
inputs=gr.Image(type=
|
28 |
outputs=[
|
29 |
gr.Audio(type="numpy", label="Narrated Audio"),
|
30 |
-
gr.Textbox(label="
|
31 |
],
|
32 |
-
title="SeeSay",
|
33 |
-
description="Upload an image to hear
|
34 |
)
|
35 |
|
36 |
-
iface.launch(share=True)
|
|
|
import gradio as gr
import numpy as np
from transformers import pipeline
# Pipeline that produces a text caption for an input image (BLIP base).
captioner = pipeline(task="image-to-text", model="Salesforce/blip-image-captioning-base")

# Pipeline that turns caption text into expressive speech (Bark).
synthesiser = pipeline(task="text-to-speech", model="suno/bark")
|
|
|
|
|
|
|
def launch(input_image):
    """Narrate an image: caption it, then synthesize speech for the caption.

    Parameters
    ----------
    input_image : PIL.Image.Image
        Image supplied by the Gradio UI (``gr.Image(type="pil")``).

    Returns
    -------
    tuple
        ``((sample_rate, audio), caption)`` — the first element feeds
        ``gr.Audio(type="numpy")``, the second feeds the caption textbox.
    """
    # Step 1: generate a caption for the image.
    caption = captioner(input_image)[0]['generated_text']

    # Step 2: synthesize speech from the caption.
    speech = synthesiser(caption, forward_params={"do_sample": True})
    audio = np.array(speech["audio"])
    rate = speech["sampling_rate"]

    # BUG FIX: gr.Audio(type="numpy") expects (sample_rate, data),
    # not (data, sample_rate) — original returned the pair inverted.
    return (rate, audio), caption
|
|
|
# Gradio front-end wiring: one image input, narrated audio + caption outputs.
_outputs = [
    gr.Audio(type="numpy", label="Narrated Audio"),
    gr.Textbox(label="Generated Caption"),
]

iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=_outputs,
    title="🎙️ SeeSay",
    description="Upload an image to hear it described with expressive speech.",
)

iface.launch(share=True)
|