import gradio as gr
from transformers import pipeline

# Load the image captioning pipeline
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
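# The captioner returns a list of dicts, e.g. [{"generated_text": "a dog on the beach"}]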

# Load a text-to-speech pipeline to read the caption aloud.
# Note: Whisper is a speech-recognition model and cannot synthesize audio,
# so a dedicated TTS checkpoint (facebook/mms-tts-eng) is used here.
tts = pipeline("text-to-speech", model="facebook/mms-tts-eng")
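# The TTS pipeline returns a dict: {"audio": np.ndarray, "sampling_rate": int}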

def launch(input_image):
    try:
        # Step 1: Generate a caption for the uploaded image
        out = captioner(input_image)
        caption = out[0]["generated_text"]

        # Step 2: Synthesize speech from the caption
        speech = tts(caption)
        audio_data = speech["audio"].squeeze()  # flatten (1, n) to (n,)
        sample_rate = speech["sampling_rate"]

        # Gradio's numpy audio format is (sample_rate, data), in that order
        return (sample_rate, audio_data), caption

    except Exception as e:
        # Surface the error in the caption box; no audio is produced
        return None, f"Error in processing: {e}"

# Gradio UI
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(type="numpy", label="Generated Audio"),
        gr.Textbox(label="Generated Caption")
    ],
    title="Whisper Image to Audio",
    description="Upload an image to generate a caption and hear it described with speech."
)

iface.launch(share=True)
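
# Running this script starts a local Gradio server; share=True also exposes
# a temporary public *.gradio.live URL for the demo.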