# app.py — Hugging Face Space by preston-cell (commit 1b7018a, 1.14 kB).
# Image-captioning + text-to-speech demo.
import gradio as gr
import torch
from transformers import pipeline
# Load the image-captioning pipeline (BLIP base: image -> text).
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Load a text-to-speech model for synthesizing the caption.
# NOTE: the original loaded openai/whisper-large-v3-turbo here, but Whisper is a
# speech-RECOGNITION (audio -> text) model and cannot synthesize speech; MMS-TTS
# is an actual text-to-speech checkpoint compatible with the "text-to-speech" task.
pipe = pipeline("text-to-speech", model="facebook/mms-tts-eng")
def launch(input_image):
    """Caption an image and synthesize the caption as speech.

    Args:
        input_image: PIL image supplied by the Gradio ``Image`` component.

    Returns:
        ``((sample_rate, audio_array), caption)`` on success, or
        ``(None, error_message)`` on failure so the Audio output stays empty
        instead of receiving a string (which the component cannot play).
    """
    try:
        # Step 1: generate a caption for the uploaded image.
        out = captioner(input_image)
        caption = out[0]['generated_text']

        # Step 2: synthesize speech from the caption.
        speech = pipe(caption)
        audio_data = speech['audio']
        sample_rate = speech['sampling_rate']

        # Gradio's numpy Audio format is (sample_rate, data) — the original
        # returned (data, sample_rate), which Gradio rejects.
        return (sample_rate, audio_data), caption
    except Exception as e:
        # Top-level UI boundary: surface the error text in the caption box,
        # and give the Audio component None rather than an error string.
        return None, f"Error in processing: {e}"
# Gradio UI wiring: one image in, (audio, caption) out.
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(type="numpy", label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
    # Original title referenced Whisper, which is a speech-recognition model
    # and does not perform the synthesis step — use an accurate title.
    title="Image to Audio Description",
    description="Upload an image to generate a caption and hear it described with speech.",
)

# Guard the launch so the module can be imported without starting a server.
if __name__ == "__main__":
    iface.launch(share=True)