import gradio as gr
from transformers import pipeline

# Load the image captioning pipeline
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
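# The captioner returns a list of dicts, e.g. [{"generated_text": "a dog on the beach"}]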

# Load a text-to-speech pipeline to read the caption aloud.
# Note: Whisper is a speech-recognition model and cannot synthesize audio,
# so a dedicated TTS checkpoint (facebook/mms-tts-eng) is used here.
tts = pipeline("text-to-speech", model="facebook/mms-tts-eng")
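# The TTS pipeline returns a dict: {"audio": np.ndarray, "sampling_rate": int}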

def launch(input_image):
    try:
        # Step 1: Generate a caption for the uploaded image
        out = captioner(input_image)
        caption = out[0]["generated_text"]

        # Step 2: Synthesize speech from the caption
        speech = tts(caption)
        audio_data = speech["audio"].squeeze()  # flatten (1, n) to (n,)
        sample_rate = speech["sampling_rate"]

        # Gradio's numpy audio format is (sample_rate, data), in that order
        return (sample_rate, audio_data), caption

    except Exception as e:
        # Surface the error in the caption box; no audio is produced
        return None, f"Error in processing: {e}"

# Gradio UI
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(type="numpy", label="Generated Audio"),
        gr.Textbox(label="Generated Caption")
    ],
    title="Whisper Image to Audio",
    description="Upload an image to generate a caption and hear it described with speech."
)

iface.launch(share=True)
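
# Running this script starts a local Gradio server; share=True also exposes
# a temporary public *.gradio.live URL for the demo.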