import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the processor (tokenizer + image preprocessing) and the captioning model.
# BLIP is an encoder-decoder model, so it must be loaded with
# BlipForConditionalGeneration, not AutoModelForCausalLM; the BlipProcessor
# replaces the separate tokenizer and feature extractor.
model_name = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name)

def generate_caption(image):
    # Preprocess the PIL image into the pixel values the model expects.
    inputs = processor(images=image, return_tensors="pt")
    # Beam search tends to produce more fluent captions than greedy decoding.
    output_ids = model.generate(**inputs, max_length=128, num_beams=4)
    return processor.decode(output_ids[0], skip_special_tokens=True)

# Create the Gradio interface. The deprecated gr.inputs.Image was removed in
# Gradio 4; use gr.Image directly.
interface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Image Captioning with BLIP",
    description="Upload an image to generate a caption.",
)

if __name__ == "__main__":
    interface.launch()