preston-cell's picture
Update app.py
f67b2c3 verified
raw
history blame
1.35 kB
import gradio as gr
from transformers import pipeline
import numpy as np
import torch
# Load the image-to-text (captioning) model.
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Load the text-to-speech model.
# NOTE: Whisper checkpoints are speech-recognition (speech -> text) models and
# cannot be used with the "text-to-speech" pipeline, so a real TTS checkpoint
# is used here instead.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Kept for any code that reads it; it is not passed to the pipeline below.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "suno/bark-small"
model = pipeline("text-to-speech", model=model_id, device=device)
def process_image(input_image):
    """Caption an image and synthesize the caption as speech.

    Args:
        input_image: The image handed over by the Gradio image input, or
            ``None`` when nothing was uploaded.

    Returns:
        A 2-tuple ``(audio, caption)`` matching the interface's two outputs.
        On success ``audio`` is ``(sample_rate, samples)`` -- the order
        ``gr.Audio`` expects -- and ``caption`` is the generated text.
        On failure ``audio`` is ``None`` and ``caption`` carries the error
        message, so both outputs are always populated.
    """
    if input_image is None:
        return None, "Error: no image provided."
    try:
        # Step 1: Generate caption
        caption = captioner(input_image)[0]['generated_text']
        # Step 2: Convert caption to speech
        speech_output = model(caption)
        # The pipeline may return the waveform with a leading batch axis;
        # drop singleton axes so the audio component gets a 1-D sample array.
        audio_data = np.squeeze(
            np.asarray(speech_output["audio"], dtype=np.float32)
        )
        sample_rate = speech_output["sampling_rate"]
        # gr.Audio expects (sample_rate, data), not (data, sample_rate).
        return (sample_rate, audio_data), caption
    except Exception as e:
        # UI boundary: report the failure in the caption box rather than
        # returning a single value for a two-output interface.
        return None, f"Error: {e}"
# Assemble the Gradio UI: one image input feeding process_image, whose two
# return values populate the audio player and the caption textbox.
_image_in = gr.Image(label="Upload Image", type="pil")
_audio_out = gr.Audio(label="Generated Speech")
_caption_out = gr.Textbox(label="Generated Caption")

iface = gr.Interface(
    title="Image to Audio with Caption",
    description="Upload an image to generate a caption and hear it described with speech.",
    fn=process_image,
    inputs=_image_in,
    outputs=[_audio_out, _caption_out],
)

# Start the app and request a public share link.
iface.launch(share=True)