Spaces:
Running
Running
File size: 959 Bytes
f30b843 e363033 e35301b 78c45fd 1129fe7 83cd235 1129fe7 ba2d445 520c499 1129fe7 83cd235 1129fe7 83cd235 1129fe7 ba2d445 1129fe7 520c499 ba2d445 1129fe7 520c499 1129fe7 520c499 1129fe7 5c86456 1129fe7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
import gradio as gr
from transformers import pipeline
import numpy as np
# Image-captioning model (BLIP base); downloaded/loaded once at import time.
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# Bark text-to-speech model; also loaded once at import time — first run may be slow.
synthesiser = pipeline("text-to-speech", model="suno/bark")
def launch(input_image):
    """Caption an uploaded image and narrate the caption with Bark TTS.

    Args:
        input_image: PIL image supplied by the Gradio upload widget.

    Returns:
        A ``((sample_rate, audio), caption)`` pair: the audio tuple in the
        ``(rate, data)`` order that ``gr.Audio(type="numpy")`` expects, and
        the generated caption text for the textbox output.
    """
    # Step 1: Generate a caption for the image.
    caption = captioner(input_image)[0]['generated_text']
    # Step 2: Synthesize speech from the caption (sampling enabled for
    # more expressive Bark output).
    speech = synthesiser(caption, forward_params={"do_sample": True})
    # Bark may return audio with a leading channel axis (1, N); squeeze to
    # 1-D so Gradio's audio player handles it reliably.
    audio = np.squeeze(np.array(speech["audio"]))
    rate = speech["sampling_rate"]
    # BUG FIX: gr.Audio(type="numpy") expects (sample_rate, data), not
    # (data, sample_rate) — the original order would be misinterpreted.
    return (rate, audio), caption
# Gradio UI: image in, narrated audio + caption text out.
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=[
        gr.Audio(type="numpy", label="Narrated Audio"),
        gr.Textbox(label="Generated Caption")
    ],
    title="🎙️ SeeSay",
    description="Upload an image to hear it described with expressive speech."
)
# BUG FIX: removed a stray trailing "|" artifact that made this line a
# SyntaxError; also PEP8 keyword-argument spacing (share=True).
iface.launch(share=True)