import sys

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the BLIP captioning model and its processor.
# "Salesforce/blip-image-captioning-base" is the standard public checkpoint;
# swap in a local path if you have a fine-tuned copy.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load the image to describe (path passed on the command line)
if len(sys.argv) != 2:
    sys.exit(f"Usage: {sys.argv[0]} <image_path>")
image_path = sys.argv[1]
raw_image = Image.open(image_path).convert("RGB")

# Preprocess the image into model inputs (pixel values as PyTorch tensors)
inputs = processor(images=raw_image, return_tensors="pt")

# Generate a caption; no_grad skips building an autograd graph during inference
with torch.no_grad():
    generated_ids = model.generate(**inputs)

# Decode the generated token IDs back into text
description = processor.decode(generated_ids[0], skip_special_tokens=True)

# Print the description
print("Generated Description:\n", description)
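
# Optional: BLIP also supports conditional captioning, where a short text
# prompt steers the generated caption. This is a minimal sketch reusing the
# processor, model, and raw_image loaded above; the prompt string is an
# illustrative choice, not part of the original script.
prompt = "a photography of"
cond_inputs = processor(images=raw_image, text=prompt, return_tensors="pt")
with torch.no_grad():
    conditional_ids = model.generate(**cond_inputs, max_new_tokens=50)
print("Conditional Description:\n",
      processor.decode(conditional_ids[0], skip_special_tokens=True))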