File size: 1,418 Bytes
9cfa91c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

class IdentificationModel:
    """Pick the text description that best matches an image using CLIP.

    The model and its processor are loaded once in ``__init__`` and reused
    for every call to ``identify_objects``.
    """

    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        """Load the CLIP model and processor and move the model to a device.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``
                for both the model and the processor.
        """
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        # Prefer the GPU when torch reports one, otherwise stay on the CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def identify_objects(self, image_path, text_descriptions):
        """Return the description with the highest probability for an image.

        Args:
            image_path: Location of the image, passed to ``Image.open``.
            text_descriptions: Candidate descriptions. A single string is
                treated as one candidate; any other iterable is copied into
                a list.

        Returns:
            A one-element list holding a dict with the keys
            ``'description'`` (the winning candidate) and ``'probability'``
            (its softmax score over all candidates, as a Python float).

        Raises:
            ValueError: If no description is supplied.

        Errors raised while opening the image or running the model are not
        caught here.
        """
        # A bare string would be tokenized as a single prompt but indexed
        # character-by-character when building the result, so wrap it.
        if isinstance(text_descriptions, str):
            descriptions = [text_descriptions]
        else:
            descriptions = list(text_descriptions)
        if not descriptions:
            raise ValueError(
                "text_descriptions must contain at least one description"
            )

        # PIL reads pixel data lazily; convert while the file is still open
        # so the handle can be released as soon as the block exits.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        # Prepare inputs and move every tensor to the model's device.
        inputs = self.processor(
            text=descriptions, images=image, return_tensors="pt", padding=True
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Run inference without tracking gradients.
        with torch.no_grad():
            outputs = self.model(**inputs)

        # logits_per_image holds the image-text similarity scores; softmax
        # over dim 1 turns them into probabilities across the descriptions.
        probs = outputs.logits_per_image.softmax(dim=1)

        # Only one image is passed in, so row 0 carries all the scores.
        max_prob, max_idx = torch.max(probs[0], dim=0)

        return [{
            'description': descriptions[max_idx.item()],
            'probability': max_prob.item(),
        }]