import gradio as gr
from transformers import pipeline
import numpy as np
import torch

# Load the image-to-text model
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Load a text-to-speech model. Whisper is a speech-recognition (audio-to-text)
# model and cannot synthesize speech, so a TTS checkpoint (Bark) is used instead.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "suno/bark-small"
model = pipeline("text-to-speech", model=model_id, device=device, torch_dtype=torch_dtype)


def process_image(input_image):
    try:
        # Step 1: Generate caption
        caption = captioner(input_image)[0]['generated_text']

        # Step 2: Convert caption to speech
        speech_output = model(caption)
        audio_data = np.array(speech_output["audio"]).squeeze().astype(np.float32)
        sample_rate = speech_output["sampling_rate"]

        # gr.Audio expects a (sample_rate, data) tuple
        return (sample_rate, audio_data), caption

    except Exception as e:
        # On failure, return no audio and surface the error text in the caption box
        return None, str(e)


# Create Gradio Interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload Image"),
    outputs=[
        gr.Audio(label="Generated Speech"),
        gr.Textbox(label="Generated Caption")
    ],
    title="Image to Audio with Caption",
    description="Upload an image to generate a caption and hear it described with speech."
)

iface.launch(share=True)