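"""SeeSay: a Gradio app that captions an uploaded image with BLIP
and reads the caption aloud with a text-to-speech model."""
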
import gradio as gr
from transformers import pipeline
import numpy as np
# Load image captioning model
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# Load a text-to-speech model.
# Note: Whisper (openai/whisper-large-v3-turbo) is a speech-recognition model,
# not a TTS model, so pipeline("text-to-speech", ...) cannot load it.
# MMS-TTS is one checkpoint that works with the text-to-speech pipeline.
speech_model = pipeline("text-to-speech", model="facebook/mms-tts-eng")
def process_image(image):
    try:
        # Step 1: Generate a caption from the image
        caption = caption_model(image)[0]['generated_text']
        # Step 2: Convert the caption to speech
        speech = speech_model(caption)
        # Squeeze in case the pipeline returns a (1, num_samples) array;
        # gr.Audio expects a 1-D waveform (or samples x channels).
        audio = np.squeeze(np.array(speech["audio"]))
        rate = speech["sampling_rate"]
        # gr.Audio takes a (sample_rate, waveform) tuple, in that order
        return (rate, audio), caption
    except Exception as e:
        # Leave the audio output empty and surface the error in the textbox
        return None, f"Error generating caption or audio: {e}"
# Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption")
    ],
    title="SeeSay",
    description="Upload an image to generate a caption and hear it described with speech."
)
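
# launch() starts a local web server; on Hugging Face Spaces the app is served automatically.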
iface.launch()