import torch
import torch.nn.functional as F
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
import spaces
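# Note: `spaces` provides the @spaces.GPU decorator used below; on Hugging Face
# ZeroGPU Spaces it requests a GPU for the duration of each decorated call.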

# Dictionary of available CLIP models with their image sizes
CLIP_MODELS = {
    "ViT-B/32": ("openai/clip-vit-base-patch32", 224),
    "ViT-B/16": ("openai/clip-vit-base-patch16", 224),
    "ViT-L/14": ("openai/clip-vit-large-patch14", 224),
    "ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336),
}

# Initialize models and processors
models = {}
processors = {}
for model_name, (model_path, _) in CLIP_MODELS.items():
    models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
    processors[model_name] = CLIPProcessor.from_pretrained(model_path)
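# All of the checkpoints above are downloaded and moved to the GPU once at
# startup, so the very first launch can take a while.
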
@spaces.GPU
def calculate_score(image, text, model_name):
    """Return a {label: score} dict of CLIP similarities between the image and each label."""
    labels = text.split(";")
    labels = [l.strip() for l in labels]
    labels = list(filter(None, labels))
    if len(labels) == 0:
        return dict()

    model = models[model_name]
    processor = processors[model_name]

    # Preprocess the image and text
    inputs = processor(text=labels, images=[image], return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Calculate embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds

    # Normalize embeddings
    image_embeds = F.normalize(image_embeds, p=2, dim=1)
    text_embeds = F.normalize(text_embeds, p=2, dim=1)

    # Cosine similarity between each text embedding and the single image embedding
    cosine_similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)

    # Map cosine similarity from [-1, 1] to a [0, 1] score
    scores = ((cosine_similarities + 1) / 2).cpu().numpy()

    results_dict = {label: float(score) for label, score in zip(labels, scores)}
    return results_dict
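
# Example (hypothetical call, illustrative values only):
#   calculate_score(pil_image, "a cat; a dog", "ViT-B/16")
#   -> {"a cat": 0.62, "a dog": 0.48}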

with gr.Blocks() as demo:
    gr.Markdown("# Multi-Model CLIP Score")
    gr.Markdown("Calculate the CLIP score (cosine similarity) between the given image and text descriptions using different CLIP model variants")

    with gr.Row():
        image_input = gr.Image(type="pil")
        output_label = gr.Label()

    with gr.Row():
        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
        model_dropdown = gr.Dropdown(choices=list(CLIP_MODELS.keys()), label="CLIP Model", value="ViT-B/16")

    def process_inputs(image, text, model_name):
        # Skip computation until both an image and at least one description are provided
        if image is None or text.strip() == "":
            return None
        return calculate_score(image, text, model_name)

    inputs = [image_input, text_input, model_dropdown]
    outputs = output_label

    # Recompute the scores whenever any of the three inputs changes
    image_input.change(fn=process_inputs, inputs=inputs, outputs=outputs)
    text_input.submit(fn=process_inputs, inputs=inputs, outputs=outputs)
    model_dropdown.change(fn=process_inputs, inputs=inputs, outputs=outputs)

    gr.Examples(
        examples=[
            [
                "cat.jpg",
                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
                "ViT-B/16"
            ]
        ],
        fn=process_inputs,
        inputs=inputs,
        outputs=outputs,
    )

demo.launch()