Spaces:

preston-cell
/

image-text-to-text

Running

File size: 959 Bytes

f30b843
e363033
e35301b
78c45fd
1129fe7
 
83cd235
1129fe7
 
ba2d445
520c499
1129fe7
 
83cd235
1129fe7
 
 
 
83cd235
1129fe7
ba2d445
1129fe7
520c499
ba2d445
1129fe7
520c499
 
1129fe7
520c499
1129fe7
 
5c86456
 
1129fe7

import gradio as gr
from transformers import pipeline
import numpy as np

# Image-to-text pipeline used to produce a caption for the uploaded image.
captioner = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)

# Text-to-speech pipeline (Bark checkpoint) used to voice the caption.
synthesiser = pipeline(
    task="text-to-speech",
    model="suno/bark",
)

def launch(input_image):
    """Caption an image and narrate that caption as speech.

    Args:
        input_image: The image handed over by the Gradio image input, or
            ``None`` when the form was submitted without an image.

    Returns:
        A pair ``((sampling_rate, audio), caption)`` where ``audio`` is a
        NumPy array of samples and ``caption`` is the generated text. The
        ``(sampling_rate, audio)`` ordering is the one Gradio's numpy
        ``Audio`` output expects.

    Raises:
        gr.Error: If no image was provided.
    """
    # Submitting an empty form yields None; fail with a readable message
    # instead of an opaque error from inside the captioning pipeline.
    if input_image is None:
        raise gr.Error("Please upload an image first.")

    # Step 1: Generate caption
    caption = captioner(input_image)[0]["generated_text"]

    # Step 2: Synthesize speech from caption
    speech = synthesiser(caption, forward_params={"do_sample": True})

    # Drop singleton axes: the pipeline may return audio shaped (1, N)
    # (presumably a batch axis -- TODO confirm for the installed version),
    # which Gradio would misread as 1 sample with N channels. Squeezing is
    # a no-op when the audio is already 1-D.
    audio = np.squeeze(np.asarray(speech["audio"]))
    rate = speech["sampling_rate"]

    # Gradio's numpy audio format is (sample_rate, data), in that order.
    return (rate, audio), caption

# Gradio UI: one image input; narrated audio and the caption text as outputs.
image_input = gr.Image(type="pil", label="Upload Image")
audio_output = gr.Audio(type="numpy", label="Narrated Audio")
caption_output = gr.Textbox(label="Generated Caption")

iface = gr.Interface(
    fn=launch,
    inputs=image_input,
    outputs=[audio_output, caption_output],
    title="🎙️ SeeSay",
    description="Upload an image to hear it described with expressive speech.",
)

# Start the app; share=True requests a public share link from Gradio.
iface.launch(share=True)