import numpy as np
from PIL import Image
import torch.nn.functional as F
from typing import List
from transformers import AutoTokenizer, AutoModel, AutoProcessor
NUM_ASPECT = 5
ROUND_DIGIT = 3
MAX_LENGTH = 76
MAX_NUM_FRAMES = 8
X_CLIP_POINT_LOW = 0.15
X_CLIP_POINT_MID = 0.225
X_CLIP_POINT_HIGH = 0.30
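# The three X_CLIP_POINT_* thresholds above quantize the raw X-CLIP cosine similarity
# into a 1-4 rating: < 0.15 -> 1, [0.15, 0.225) -> 2, [0.225, 0.30) -> 3, >= 0.30 -> 4.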
def _read_video_frames(frames, max_frames):
    # Uniformly sample `max_frames` frames across the clip and stack them into a
    # single (max_frames, H, W, C) numpy array.
    total_frames = len(frames)
    indices = np.linspace(0, total_frames - 1, num=max_frames).astype(int)
    selected_frames = [np.array(frames[i]) for i in indices]
    return np.stack(selected_frames)
class MetricXCLIPScore:
    def __init__(self, device="cuda") -> None:
        """
        Initialize a MetricXCLIPScore object with the specified device.

        Args:
            device (str, optional): The device on which the model will run. Defaults to "cuda".
        """
        self.model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
        self.processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/xclip-base-patch32")
    def evaluate(self, frame_list: List[Image.Image], text: str):
        """
        Compute the cosine similarity between the X-CLIP features of the text prompt and the given
        video to measure text-to-video alignment, then quantize the raw score using predefined thresholds.

        Args:
            frame_list (List[Image.Image]): Frames of the video used in the calculation.
            text (str): Text prompt used to generate the video.

        Returns:
            cos_sim (float): The computed X-CLIP score between the video and its text prompt.
            quantized_ans (int): The quantized value of the X-CLIP score based on the predefined thresholds.
        """
        input_text = self.tokenizer([text], max_length=MAX_LENGTH, truncation=True, padding=True, return_tensors="pt")
        text_feature = self.model.get_text_features(**input_text).flatten()
        video = _read_video_frames(frame_list, MAX_NUM_FRAMES)
        input_video = self.processor(videos=list(video), return_tensors="pt")
        video_feature = self.model.get_video_features(**input_video).flatten()
        cos_sim = F.cosine_similarity(text_feature, video_feature, dim=0).item()
        if cos_sim < X_CLIP_POINT_LOW:
            quantized_ans = 1
        elif cos_sim < X_CLIP_POINT_MID:
            quantized_ans = 2
        elif cos_sim < X_CLIP_POINT_HIGH:
            quantized_ans = 3
        else:
            quantized_ans = 4
        return cos_sim, quantized_ans
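

# A minimal usage sketch (an assumption, not part of the original metric code). It assumes the
# video has already been decoded into a list of PIL.Image frames; the frame directory and the
# text prompt below are hypothetical placeholders.
if __name__ == "__main__":
    import glob

    frame_paths = sorted(glob.glob("example_video_frames/*.png"))  # hypothetical frame dump
    frames = [Image.open(p).convert("RGB") for p in frame_paths]
    metric = MetricXCLIPScore()
    score, rating = metric.evaluate(frames, "a dog running on the beach")
    print(f"X-CLIP score: {score:.3f}, quantized rating: {rating}")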