# preston-cell's picture
# Update app.py
# ed4af8f verified
# raw
# history blame
# 1.16 kB
import gradio as gr
from transformers import pipeline
import numpy as np
# Load image captioning model
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# Load text-to-speech model.
# Whisper is a speech-recognition (audio -> text) model and cannot serve the
# "text-to-speech" task, so a dedicated TTS checkpoint is used instead.
speech_model = pipeline("text-to-speech", model="facebook/mms-tts-eng")
def process_image(image):
    """Caption an image and synthesize the caption as speech.

    Args:
        image: The uploaded picture as delivered by the Gradio image input,
            or ``None`` when the form is submitted without an image.

    Returns:
        A ``(audio, caption)`` pair, one value per interface output.
        ``audio`` is a ``(sampling_rate, samples)`` tuple in the order
        ``gr.Audio`` expects, or ``None`` when no audio could be produced.
        ``caption`` is the generated text, or a human-readable message when
        captioning or synthesis did not succeed.
    """
    # Nothing to caption: report it instead of pushing None through the models.
    if image is None:
        return None, "Please upload an image."

    try:
        # Step 1: Generate caption from image
        caption = caption_model(image)[0]['generated_text']
        # Step 2: Convert caption to speech
        speech = speech_model(caption)
        # Drop any size-1 leading axis so a mono waveform is a flat array.
        samples = np.squeeze(np.asarray(speech["audio"]))
        rate = speech["sampling_rate"]
    except Exception as e:
        # A string in the audio slot would be treated as a file path by
        # gr.Audio, so leave audio empty and surface the error as text.
        return None, f"Error generating caption or audio: {e}"

    # gr.Audio takes (sample_rate, data), not (data, sample_rate).
    return (rate, samples), caption
# Gradio Interface: one image input, with audio and caption text as outputs.
image_input = gr.Image(label="Upload an Image", type='pil')
audio_output = gr.Audio(label="Generated Audio")
caption_output = gr.Textbox(label="Generated Caption")

iface = gr.Interface(
    fn=process_image,
    inputs=image_input,
    outputs=[audio_output, caption_output],
    title="SeeSay",
    description="Upload an image to generate a caption and hear it described with speech.",
)

# Start the app as soon as the script is executed.
iface.launch()