import gradio as gr
from transformers import pipeline
import numpy as np
import torch

# Load the image-to-text captioning model
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
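# The image-to-text pipeline returns a list of dicts; the caption text below
# is purely illustrative:
#   captioner(image) -> [{"generated_text": "a dog sitting on a bench"}]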

# Load a text-to-speech model. Note: Whisper (the model id in the original)
# is a speech-recognition model and cannot synthesize speech, so an actual
# TTS checkpoint is assumed here instead (facebook/mms-tts-eng).
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "facebook/mms-tts-eng"
tts = pipeline("text-to-speech", model=model_id, torch_dtype=torch_dtype, device=device)
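# The text-to-speech pipeline returns a dict with the raw waveform and its
# sampling rate, roughly {"audio": np.ndarray, "sampling_rate": 16000}
# (16 kHz is what MMS-TTS uses; treat the exact rate as model-dependent).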

def process_image(input_image):
    try:
        # Step 1: generate a caption for the uploaded image
        caption = captioner(input_image)[0]["generated_text"]
        # Step 2: convert the caption to speech
        speech_output = tts(caption)
        audio_data = np.array(speech_output["audio"]).squeeze().astype(np.float32)
        sample_rate = speech_output["sampling_rate"]
        # Gradio's Audio component expects a (sample_rate, data) tuple
        return (sample_rate, audio_data), caption
    except Exception as e:
        # Return a placeholder for the audio slot and surface the error as text
        return None, f"Error: {e}"

# Create the Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=[
        gr.Audio(label="Generated Speech"),
        gr.Textbox(label="Generated Caption"),
    ],
    title="Image to Audio with Caption",
    description="Upload an image to generate a caption and hear it read aloud.",
)

iface.launch(share=True)
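
# To try this locally, a minimal sketch (the file name and the unpinned
# package versions below are assumptions, not given in the original):
#   pip install gradio transformers torch
#   python app.py
# With share=True, Gradio also prints a temporary public gradio.live link.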