Spaces:
Running
Running
File size: 1,142 Bytes
f30b843 516718c 629e04f 608498c 516718c 83cd235 516718c 83cd235 516718c 629e04f 516718c 1b7018a 83cd235 516718c 26dbd13 629e04f 516718c 26dbd13 516718c 26dbd13 516718c 629e04f 516718c 629e04f 516718c 629e04f 5c86456 629e04f 516718c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
import gradio as gr
import torch
from transformers import pipeline
# Load the image captioning pipeline (BLIP: image -> English caption).
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Load a text-to-speech model for speech synthesis.
# FIX: the original loaded openai/whisper-large-v3-turbo here, but Whisper is a
# speech-RECOGNITION (audio -> text) model and cannot be used in a
# "text-to-speech" pipeline. MMS-TTS is a true TTS checkpoint whose pipeline
# output is a dict with the "audio" and "sampling_rate" keys read by launch().
pipe = pipeline("text-to-speech", model="facebook/mms-tts-eng")
def launch(input_image):
    """Caption an uploaded image and synthesize the caption as speech.

    Args:
        input_image: PIL image supplied by the Gradio Image component.

    Returns:
        ((sample_rate, audio_array), caption) on success, or
        (None, error_message) on failure — None keeps the Audio output empty
        while the error text is shown in the caption Textbox.
    """
    try:
        # Step 1: Generate a caption for the image.
        out = captioner(input_image)
        caption = out[0]['generated_text']

        # Step 2: Synthesize speech from the caption.
        speech = pipe(caption)
        audio_data = speech['audio']
        sample_rate = speech['sampling_rate']

        # Some TTS pipelines return audio shaped (1, n_samples); flatten so
        # gr.Audio receives a 1-D waveform.
        if getattr(audio_data, "ndim", 1) > 1:
            audio_data = audio_data.squeeze()

        # FIX: gr.Audio(type="numpy") expects the tuple as (sample_rate, data);
        # the original returned (data, sample_rate), which Gradio rejects.
        return (sample_rate, audio_data), caption
    except Exception as e:
        # FIX: the original put str(e) into the Audio slot (not valid audio).
        # Return None for audio and surface the error in the Textbox instead.
        return None, f"Error in processing: {e}"
# Gradio UI: wire the launch() handler to one image input and two outputs.
image_input = gr.Image(type='pil', label="Upload an Image")
audio_output = gr.Audio(type="numpy", label="Generated Audio")
caption_output = gr.Textbox(label="Generated Caption")

iface = gr.Interface(
    fn=launch,
    inputs=image_input,
    outputs=[audio_output, caption_output],
    title="Whisper Image to Audio",
    description="Upload an image to generate a caption and hear it described with speech.",
)

# share=True exposes a public link when run outside a hosted Space.
iface.launch(share=True)
|