import gradio as gr
import numpy as np

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")


def launch(input_image):
    # Step 1: Extract a caption from the image
    caption = pipe(input_image)[0]["generated_text"]

    # Step 2: Convert the caption to audio
    audio_output = narrator(caption)
    audio_array = np.squeeze(np.array(audio_output["audio"]))
    sample_rate = audio_output["sampling_rate"]

    # Step 3: Return audio + caption.
    # gr.Audio expects a (sample_rate, data) tuple, in that order.
    return (sample_rate, audio_array), caption


# Keyword arguments keep the Interface configuration unambiguous
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=[
        gr.Audio(type="numpy", label="Narrated Audio"),
        gr.Textbox(label="Extracted Caption"),
    ],
    title="SeeSay",
    description="Upload an image to hear its context narrated aloud.",
)

iface.launch()
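
# Optional: test launch() directly, without the Gradio UI.
# A minimal sketch, assuming a local image file "example.jpg" exists
# (the filename is hypothetical) and scipy is installed:
#
#   from PIL import Image
#   from scipy.io import wavfile
#
#   (rate, audio), text = launch(Image.open("example.jpg"))
#   wavfile.write("narration.wav", rate, audio)  # save the narrated audio
#   print(text)                                  # print the extracted caption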