File size: 959 Bytes
f30b843
e363033
e35301b
78c45fd
1129fe7
 
83cd235
1129fe7
 
ba2d445
520c499
1129fe7
 
83cd235
1129fe7
 
 
 
83cd235
1129fe7
ba2d445
1129fe7
520c499
ba2d445
1129fe7
520c499
 
1129fe7
520c499
1129fe7
 
5c86456
 
1129fe7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import gradio as gr
from transformers import pipeline
import numpy as np

# Image captioning pipeline (BLIP base checkpoint)
captioner = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)

# Speech synthesis pipeline (Suno Bark checkpoint)
synthesiser = pipeline(
    task="text-to-speech",
    model="suno/bark",
)

def launch(input_image):
    """Caption an image and narrate the caption as speech.

    Args:
        input_image: The uploaded image as delivered by the Gradio image
            input, or ``None`` when nothing was uploaded.

    Returns:
        A ``((sampling_rate, waveform), caption)`` pair. The audio tuple is
        in the ``(sample_rate, data)`` order that a numpy-typed Gradio audio
        output expects; ``caption`` is the generated caption text.

    Raises:
        gr.Error: If no image was provided.
    """
    if input_image is None:
        # Show a readable message in the UI instead of a pipeline traceback.
        raise gr.Error("Please upload an image first.")

    # Step 1: Generate caption
    caption = captioner(input_image)[0]["generated_text"]

    # Step 2: Synthesize speech from caption
    speech = synthesiser(caption, forward_params={"do_sample": True})

    # The pipeline may return the waveform with a leading size-1 axis;
    # squeezing yields the flat (samples,) array used for mono playback.
    audio = np.squeeze(np.asarray(speech["audio"]))
    rate = speech["sampling_rate"]

    # Gradio's audio output takes (sample_rate, data), not (data, sample_rate).
    return (rate, audio), caption

# Gradio UI: one image input; narrated audio and caption text as outputs
image_input = gr.Image(type="pil", label="Upload Image")
audio_output = gr.Audio(type="numpy", label="Narrated Audio")
caption_output = gr.Textbox(label="Generated Caption")

iface = gr.Interface(
    fn=launch,
    inputs=image_input,
    outputs=[audio_output, caption_output],
    title="🎙️ SeeSay",
    description="Upload an image to hear it described with expressive speech.",
)

# Start the app; share=True additionally requests a public share link
iface.launch(share=True)