"""Gradio demo: caption an uploaded image and narrate the caption aloud.

An image-to-text pipeline produces a caption for the uploaded image and a
text-to-speech pipeline turns that caption into audio. Both results are shown
in the Gradio interface.
"""

# NOTE: `os` and `IPythonAudio` are not used below; the imports are kept so
# that any code relying on them being present in this module keeps working.
import os

import gradio as gr
from IPython.display import Audio as IPythonAudio
from transformers import pipeline

# Both models are loaded once at import time and reused for every request.
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
tts_pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")


def _caption(image):
    """Return the caption text the image-to-text pipeline generates for *image*."""
    results = pipe(image)
    # The pipeline result is indexed as a sequence of dicts; the first entry's
    # 'generated_text' holds the caption.
    return results[0]["generated_text"]


def _narrate(text):
    """Synthesize *text* to speech and return ``(sampling_rate, waveform)``.

    The tuple layout is the form Gradio's ``Audio`` output component accepts.
    """
    speech = tts_pipe(text)
    # The original code indexed ``speech["audio"][0]``, which implies a leading
    # batch/channel axis. ``squeeze()`` drops that axis and is a no-op if the
    # array is already 1-D -- TODO confirm against the installed transformers
    # version.
    waveform = speech["audio"].squeeze()
    return speech["sampling_rate"], waveform


def launch(input):
    """Return the generated caption for the given image.

    Kept with its original name, parameter and string return value for
    backward compatibility. The parameter name shadows the ``input`` builtin,
    but renaming it would break keyword callers.

    Previously this also ran the text-to-speech model and wrapped the result
    in an ``IPython.display.Audio`` object that was immediately discarded, so
    the narration was computed but never played. Narration is now delivered
    through :func:`caption_and_narrate`.

    Args:
        input: The image to caption (the interface supplies a PIL image).

    Returns:
        The caption text.
    """
    return _caption(input)


def caption_and_narrate(image):
    """Caption *image* and synthesize the caption as speech.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        A ``(caption, (sampling_rate, waveform))`` tuple matching the
        interface's text and audio outputs.

    Raises:
        gr.Error: If no image was provided, so the user sees a readable
            message instead of a pipeline traceback.
    """
    if image is None:
        raise gr.Error("Please upload an image first.")
    caption = _caption(image)
    return caption, _narrate(caption)


iface = gr.Interface(
    caption_and_narrate,
    inputs=gr.Image(type="pil"),
    outputs=["text", gr.Audio(label="Narration")],
)

# Only start the web server when executed directly (script or notebook), not
# when this module is imported.
if __name__ == "__main__":
    iface.launch()