import torch
import torch.nn.functional as F
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
import spaces
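
# The imports above imply this Space's runtime dependencies; a plausible (unverified)
# requirements.txt would list: torch, transformers, gradio, and spaces.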

# Available CLIP variants: display name -> (Hugging Face model id, input image size in pixels)
CLIP_MODELS = {
    "ViT-B/32": ("openai/clip-vit-base-patch32", 224),
    "ViT-B/16": ("openai/clip-vit-base-patch16", 224),
    "ViT-L/14": ("openai/clip-vit-large-patch14", 224),
    "ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336),
}

# Initialize models and processors
models = {}
processors = {}

# Load each variant once at startup; the stored image size is unused here because
# the processor resizes inputs to the model's expected resolution
for model_name, (model_path, _) in CLIP_MODELS.items():
    models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
    processors[model_name] = CLIPProcessor.from_pretrained(model_path)

@spaces.GPU
def calculate_score(image, text, model_name):
    labels = text.split(";")
    labels = [l.strip() for l in labels]
    labels = list(filter(None, labels))
    if len(labels) == 0:
        return dict()
    
    model = models[model_name]
    processor = processors[model_name]
    
    # Preprocess the image and text
    inputs = processor(text=labels, images=[image], return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    
    # Calculate embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds
    
    # Normalize embeddings to unit length (CLIPModel generally returns them pre-normalized, so this is defensive)
    image_embeds = F.normalize(image_embeds, p=2, dim=1)
    text_embeds = F.normalize(text_embeds, p=2, dim=1)
    
    # Calculate cosine similarity
    cosine_similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
    
    # Rescale cosine similarity from [-1, 1] to [0, 1]; gr.Label displays these values as percentages
    percentages = ((cosine_similarities + 1) / 2).cpu().numpy()
    
    results_dict = {label: float(score) for label, score in zip(labels, percentages)}
    return results_dict
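
# A minimal sketch of calling calculate_score directly, outside the Gradio UI
# (assumes a local "cat.jpg" and uses illustrative labels; not part of the app flow):
#
#     from PIL import Image
#     scores = calculate_score(Image.open("cat.jpg"), "a cat sitting; a dog", "ViT-B/16")
#     # scores maps each label to a similarity score in [0, 1]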

with gr.Blocks() as demo:
    gr.Markdown("# Multi-Model CLIP Score")
    gr.Markdown("Calculate the CLIP score (cosine similarity) between the given image and text descriptions using different CLIP model variants")
    
    with gr.Row():
        image_input = gr.Image(type="pil")
        output_label = gr.Label()
    
    with gr.Row():
        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
        model_dropdown = gr.Dropdown(choices=list(CLIP_MODELS.keys()), label="CLIP Model", value="ViT-B/16")
    
    def process_inputs(image, text, model_name):
        if image is None or text.strip() == "":
            return None
        return calculate_score(image, text, model_name)
    
    inputs = [image_input, text_input, model_dropdown]
    outputs = output_label
    
    image_input.change(fn=process_inputs, inputs=inputs, outputs=outputs)
    text_input.submit(fn=process_inputs, inputs=inputs, outputs=outputs)
    model_dropdown.change(fn=process_inputs, inputs=inputs, outputs=outputs)
    
    gr.Examples(
        examples=[
            [
                "cat.jpg",
                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
                "ViT-B/16"
            ]
        ],
        fn=process_inputs,
        inputs=inputs,
        outputs=outputs,
    )

demo.launch()