import gradio as gr
from transformers import pipeline
import numpy as np
import torch
# Load the image-to-text model
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
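# The image-to-text pipeline returns a list of dicts like
# [{"generated_text": "..."}]; the first entry's text is used as the caption below.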

# Load a text-to-speech model for speech synthesis.
# Note: Whisper (e.g. openai/whisper-large-v3-turbo) is a speech *recognition*
# model and cannot serve the "text-to-speech" task, so a TTS model is used instead.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "suno/bark-small"
tts = pipeline("text-to-speech", model=model_id, torch_dtype=torch_dtype, device=device)

def process_image(input_image):
    try:
        # Step 1: Generate a caption for the uploaded image
        caption = captioner(input_image)[0]['generated_text']
        # Step 2: Convert the caption to speech
        speech_output = tts(caption)
        # Flatten to 1-D in case the model returns a (1, n) array
        audio_data = np.squeeze(speech_output["audio"]).astype(np.float32)
        sample_rate = speech_output["sampling_rate"]
        # gr.Audio expects a (sample_rate, data) tuple, in that order
        return (sample_rate, audio_data), caption
    except Exception as e:
        # Return a value for both outputs so the interface can render the error
        return None, f"Error: {e}"

# Create Gradio Interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload Image"),
    outputs=[
        gr.Audio(label="Generated Speech"),
        gr.Textbox(label="Generated Caption")
    ],
    title="Image to Audio with Caption",
    description="Upload an image to generate a caption and hear it described with speech."
)
iface.launch(share=True)
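# Note: share=True creates a temporary public link when running locally;
# on Hugging Face Spaces the app is already hosted and Gradio ignores the flag.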