from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch


class IdentificationModel:
    def __init__(self):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def identify_objects(self, image_path, text_descriptions):
        # Load image (convert to RGB so grayscale/RGBA files are handled)
        image = Image.open(image_path).convert("RGB")

        # Prepare inputs
        inputs = self.processor(text=text_descriptions, images=image,
                                return_tensors="pt", padding=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Run inference
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Get logits and compute probabilities
        logits_per_image = outputs.logits_per_image  # image-text similarity scores
        probs = logits_per_image.softmax(dim=1)  # convert logits to probabilities

        # Find the description with the maximum probability
        max_prob, max_idx = torch.max(probs[0], dim=0)

        # Prepare the result for the highest-probability match
        detection = [{
            'description': text_descriptions[max_idx],
            'probability': float(max_prob),
        }]
        return detection
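
# A minimal usage sketch (not from the original): the image path
# "example.jpg" and the candidate labels below are illustrative placeholders.
if __name__ == "__main__":
    identifier = IdentificationModel()
    labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]
    result = identifier.identify_objects("example.jpg", labels)
    print(result)  # e.g. [{'description': 'a photo of a cat', 'probability': 0.97}]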