import argparse
import json
import logging  # used by OKVQAPostProcess when spacy is unavailable
from math import ceil
import os
import random
import uuid
from collections import defaultdict
from typing import Callable
import time

import cv2
import more_itertools
import numpy as np
import torch
from coco_metric import compute_cider, postprocess_captioning_generation
from eval_datasets import VQADataset, GQADataset
from tqdm import tqdm
from collections import Counter
from vqa_metric import compute_vqa_accuracy, compute_gqa_accuracy
from open_flamingo.eval.classification import (
    compute_per_sample_probs,
    compute_per_sample_loss,
)
from open_flamingo.eval.imagenet_utils import (
    openai_imagenet_classnames,
    IMAGENET_1K_CLASS_ID_TO_LABEL,
)
from open_flamingo.src.factory import create_model_and_transforms
from PIL import Image
from io import BytesIO
import base64
from open_flamingo.train.distributed import init_distributed_device, world_info_from_env
import string
from lavis.datasets.builders import load_dataset

def get_iou(box1, box2):
    # box1 and box2 should be in the format [x1, y1, x2, y2]
    intersection = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0])) * \
                   max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
    area_box1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area_box2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area_box1 + area_box2 - intersection
    iou = intersection / union if union > 0 else 0
    return iou
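
# Quick sanity check (illustrative only, not used by the evaluation code below):
# two unit squares offset by 0.5 in x and y overlap on a 0.5 x 0.5 patch,
# so IoU = 0.25 / (1 + 1 - 0.25) = 1/7 ≈ 0.143.
# >>> get_iou([0, 0, 1, 1], [0.5, 0.5, 1.5, 1.5])
# 0.14285714285714285
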
def expand2square(pil_img, background_color):
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result
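
# Illustrative use (expand2square is defined here but not called elsewhere in
# this file): pad a 640x480 RGB image to a black-bordered 640x640 square,
# keeping the original content vertically centered.
# >>> expand2square(Image.new("RGB", (640, 480)), (0, 0, 0)).size
# (640, 640)
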
parser = argparse.ArgumentParser()
parser.add_argument("--lm_path", type=str, default="facebook/opt-1.3b")
parser.add_argument("--lm_tokenizer_path", type=str, default="facebook/opt-30b")
parser.add_argument("--vision_encoder_path", default="ViT-L-14", type=str)
parser.add_argument("--vision_encoder_pretrained", default="openai", type=str)
parser.add_argument("--checkpoint_path", type=str, required=True)
parser.add_argument(
    "--results_file", type=str, default=None, help="JSON file to save results"
)

# Trial arguments
parser.add_argument("--shots", nargs="+", default=[0, 4, 8, 16, 32], type=int)
parser.add_argument(
    "--num_trials",
    type=int,
    default=1,
    help="Number of trials to run for each shot using different demonstrations",
)
parser.add_argument(
    "--trial_seeds",
    nargs="+",
    default=[0],
    help="Seeds to use for each trial for picking demonstrations and eval sets",
)
parser.add_argument(
    "--num_samples", type=int, default=5000, help="Number of samples to evaluate on"
)
parser.add_argument("--batch_size", type=int, default=8)

# Per-dataset evaluation flags
parser.add_argument(
    "--eval_coco",
    action="store_true",
    default=False,
    help="Whether to evaluate on COCO.",
)
parser.add_argument(
    "--eval_vqav2",
    action="store_true",
    default=False,
    help="Whether to evaluate on VQAV2.",
)
parser.add_argument(
    "--eval_ok_vqa",
    action="store_true",
    default=False,
    help="Whether to evaluate on OK-VQA.",
)
parser.add_argument(
    "--eval_imagenet",
    action="store_true",
    default=False,
    help="Whether to evaluate on ImageNet.",
)
parser.add_argument(
    "--eval_flickr30",
    action="store_true",
    default=False,
    help="Whether to evaluate on Flickr30.",
)
parser.add_argument(
    "--eval_refcoco",
    action="store_true",
    default=False,
    help="Whether to evaluate on RefCOCO.",
)

# Dataset arguments
## Flickr30 Dataset
parser.add_argument(
    "--flickr_image_dir_path",
    type=str,
    help="Path to the flickr30/flickr30k_images directory.",
    default=None,
)
parser.add_argument(
    "--flickr_annotations_json_path",
    type=str,
    help="Path to the dataset_flickr30k_coco_style.json file.",
    default=None,
)
## COCO Dataset
parser.add_argument(
    "--coco_image_dir_path",
    type=str,
    help="Path to the COCO images directory (e.g. train2014/val2014).",
    default=None,
)
parser.add_argument(
    "--coco_annotations_json_path",
    type=str,
    default=None,
)
## VQAV2 Dataset
parser.add_argument(
    "--vqav2_image_dir_path",
    type=str,
    default=None,
)
parser.add_argument(
    "--vqav2_questions_json_path",
    type=str,
    default=None,
)
parser.add_argument(
    "--vqav2_annotations_json_path",
    type=str,
    default=None,
)

## OK-VQA Dataset
parser.add_argument(
    "--ok_vqa_image_dir_path",
    type=str,
    help="Path to the vqav2/train2014 directory.",
    default=None,
)
parser.add_argument(
    "--ok_vqa_questions_json_path",
    type=str,
    help="Path to the v2_OpenEnded_mscoco_train2014_questions.json file.",
    default=None,
)
parser.add_argument(
    "--ok_vqa_annotations_json_path",
    type=str,
    help="Path to the v2_mscoco_train2014_annotations.json file.",
    default=None,
)

## Imagenet dataset
parser.add_argument("--imagenet_root", type=str, default="/tmp")

## RefCOCO dataset
parser.add_argument("--refcoco_tsvfile", type=str, default=None)
parser.add_argument(
    "--location_token_num",
    default=1000,
    type=int,
)

# distributed training
parser.add_argument(
    "--dist-url",
    default="env://",
    type=str,
    help="url used to set up distributed training",
)
parser.add_argument(
    "--dist-backend", default="nccl", type=str, help="distributed backend"
)
parser.add_argument(
    "--horovod",
    default=False,
    action="store_true",
    help="Use horovod for distributed training.",
)
parser.add_argument(
    "--no-set-device-rank",
    default=False,
    action="store_true",
    help="Don't set device index from local rank (when CUDA_VISIBLE_DEVICES restricted to one per proc).",
)
parser.add_argument(
    "--dist",
    default=False,
    action="store_true",
)
parser.add_argument(
    "--lora",
    default=False,
    action="store_true",
)
parser.add_argument(
    "--lora_r",
    default=16,
    type=int,
    required=False,
)
parser.add_argument(
    "--legacy",
    default=False,
    action="store_true",
)
parser.add_argument(
    "--special",
    default=False,
    action="store_true",
)
parser.add_argument(
    "--id",
    default=0,
    type=int,
    required=False,
)
parser.add_argument(
    "--eval_gqa",
    default=False,
    action="store_true",
)
parser.add_argument(
    "--use_sam",
    default=None,
    type=str,
    required=False,
)
parser.add_argument(
    "--add_visual_token",
    default=False,
    action="store_true",
)
parser.add_argument(
    "--use_format_v2",
    default=False,
    action="store_true",
)
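
# Example invocation (illustrative only; the script name and every path below
# are placeholders for your own checkpoint and data locations):
#
#   python evaluate.py \
#       --checkpoint_path checkpoints/checkpoint_20000.pt \
#       --eval_vqav2 \
#       --vqav2_image_dir_path /data/vqav2/val2014 \
#       --vqav2_questions_json_path /data/vqav2/v2_OpenEnded_mscoco_val2014_questions.json \
#       --vqav2_annotations_json_path /data/vqav2/v2_mscoco_val2014_annotations.json \
#       --batch_size 8 --id 0
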
class OKVQAPostProcess():
    def __init__(self):
        self._lemmatizer = None

    def _lemmatize(self, answers):
        def apply(answer):
            doc = self.lemmatizer(answer)
            words = []
            for token in doc:
                if token.pos_ in ["NOUN", "VERB"]:
                    words.append(token.lemma_)
                else:
                    words.append(token.text)
            answer = " ".join(words)
            return answer

        return [apply(answer) for answer in answers]

    @property
    def lemmatizer(self):
        # Lazily load spacy so the dependency is only required for OK-VQA.
        # Declared as a property because _lemmatize calls `self.lemmatizer(answer)`.
        if self._lemmatizer is None:
            try:
                import spacy

                self._lemmatizer = spacy.load("en_core_web_sm")
            except ImportError:
                logging.error(
                    """
                    Please install spacy and en_core_web_sm model to apply lemmatization.
                    python -m spacy download en_core_web_sm
                    OR
                    import spacy.cli
                    spacy.cli.download("en_core_web_sm")
                    """
                )
                exit(1)
        return self._lemmatizer
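
# Illustrative use (requires spacy and en_core_web_sm; the exact output depends
# on the spacy model version):
#   OKVQAPostProcess()._lemmatize(["riding bikes"])  # -> ["ride bike"]
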
def main():
    args = parser.parse_args()
    if args.dist:
        args.local_rank, args.rank, args.world_size = world_info_from_env()
        print(f"local_rank: {args.local_rank} rank: {args.rank} world_size: {args.world_size}")
        device_id = init_distributed_device(args)
    else:
        args.rank = 0
        args.world_size = 1
        # init_distributed_device sets args.device in the distributed branch;
        # provide a fallback here because the evaluate_* helpers read args.device.
        args.device = 0
        print(f"rank: {args.rank} world_size: {args.world_size}")

    # Infer model options from substrings of the checkpoint path.
    if "sam" in args.checkpoint_path:
        args.use_sam = "vit_l"
    args.add_visual_token = True
    if "lora" in args.checkpoint_path:
        args.lora = True

    args.add_pe = False
    args.add_box = False
    args.relation = False
    if "debug" in args.checkpoint_path:
        # args.add_pe = True
        args.add_box = True
    if "box" in args.checkpoint_path:
        args.add_box = True
    if "pe" in args.checkpoint_path:
        args.add_pe = True
    if "rel" in args.checkpoint_path:
        args.relation = True
        args.add_pe = False
    if "previsual" in args.checkpoint_path:
        args.use_format_v2 = True
        args.relation = False

    # load model
    flamingo, image_processor, tokenizer, vis_embed_size = create_model_and_transforms(
        args.vision_encoder_path,
        args.vision_encoder_pretrained,
        args.lm_path,
        args.lm_tokenizer_path,
        location_token_num=args.location_token_num,
        lora=args.lora,
        lora_r=16,
        use_sam=args.use_sam,
        add_visual_token=args.add_visual_token,
        use_format_v2=args.use_format_v2,
        add_box=args.add_box,
        add_pe=args.add_pe,
        add_relation=args.relation,
    )
    flamingo.use_format_v2 = args.use_format_v2
    if args.special:
        flamingo.special = True
    else:
        flamingo.special = False
    if args.legacy:
        flamingo.legacy = True
        print("use legacy evaluation")
    flamingo.step_num = int(args.checkpoint_path.split("/")[-1].split(".")[0].split("_")[-1])
    flamingo.expr_name = args.checkpoint_path.split("/")[-2]
    if args.rank == 0:
        print("legacy", True if hasattr(flamingo, "legacy") else False)
        print("step:", flamingo.step_num)
        print("expr:", flamingo.expr_name)
        print("use format v2:", flamingo.use_format_v2)
        print(args)
    checkpoint = torch.load(args.checkpoint_path, map_location="cpu")
    model_state_dict = {}
    for key in checkpoint["model_state_dict"].keys():
        model_state_dict[key.replace("module.", "")] = checkpoint["model_state_dict"][key]
    if "vision_encoder.logit_scale" in model_state_dict:
        # previous checkpoint has some unnecessary weights
        del model_state_dict["vision_encoder.logit_scale"]
        del model_state_dict["vision_encoder.visual.proj"]
        del model_state_dict["vision_encoder.visual.ln_post.weight"]
        del model_state_dict["vision_encoder.visual.ln_post.bias"]
    flamingo.load_state_dict(model_state_dict, strict=True)
    results = defaultdict(list)

    if args.eval_coco:
        print("Evaluating on COCO...")
        for shot in args.shots:
            scores = []
            for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
                cider_score = evaluate_coco_flickr(
                    model=flamingo,
                    tokenizer=tokenizer,
                    image_processor=image_processor,
                    batch_size=args.batch_size,
                    image_dir_path=args.coco_image_dir_path,
                    annotations_json_path=args.coco_annotations_json_path,
                    device=args.device,
                    seed=seed,
                    vis_embed_size=vis_embed_size,
                    rank=args.rank,
                    world_size=args.world_size,
                    id=args.id,
                )
                print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}")
                scores.append(cider_score)
            print(f"Shots {shot} Mean CIDEr score: {np.mean(scores)}")
            results["coco"].append(
                {"shots": shot, "trials": scores, "mean": np.mean(scores)}
            )
    if args.eval_ok_vqa:
        print("Evaluating on OK-VQA...")
        for shot in args.shots:
            scores = []
            for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
                ok_vqa_score = evaluate_vqa(
                    model=flamingo,
                    tokenizer=tokenizer,
                    image_processor=image_processor,
                    batch_size=args.batch_size,
                    image_dir_path=args.ok_vqa_image_dir_path,
                    questions_json_path=args.ok_vqa_questions_json_path,
                    annotations_json_path=args.ok_vqa_annotations_json_path,
                    vqa_dataset="ok_vqa",
                    vis_embed_size=vis_embed_size,
                    rank=args.rank,
                    world_size=args.world_size,
                    id=args.id,
                )
            results["ok_vqa"].append(
                {"shots": shot, "score": ok_vqa_score}
            )

    if args.eval_vqav2:
        print("Evaluating on VQAv2...")
        for shot in args.shots:
            scores = []
            for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
                vqa_score = evaluate_vqa(
                    model=flamingo,
                    tokenizer=tokenizer,
                    image_processor=image_processor,
                    batch_size=args.batch_size,
                    image_dir_path=args.vqav2_image_dir_path,
                    questions_json_path=args.vqav2_questions_json_path,
                    annotations_json_path=args.vqav2_annotations_json_path,
                    vqa_dataset="vqa",
                    vis_embed_size=vis_embed_size,
                    rank=args.rank,
                    world_size=args.world_size,
                    id=args.id,
                )
            results["vqav2"].append(
                {"shots": shot, "score": vqa_score}
            )

    if args.eval_gqa:
        print("Evaluating on GQA...")
        for shot in args.shots:
            scores = []
            for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
                vqa_score = evaluate_vqa(
                    model=flamingo,
                    tokenizer=tokenizer,
                    image_processor=image_processor,
                    batch_size=args.batch_size,
                    vqa_dataset="gqa",
                    vis_embed_size=vis_embed_size,
                    rank=args.rank,
                    world_size=args.world_size,
                    id=args.id,
                )
            results["gqa"].append(
                {"shots": shot, "score": vqa_score}
            )
    if args.eval_imagenet:
        # NOTE: evaluate_imagenet is not defined (or imported) in this file;
        # the ImageNet branch assumes it is provided elsewhere.
        print("Evaluating on ImageNet...")
        for shot in args.shots:
            scores = []
            for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
                imagenet_score = evaluate_imagenet(
                    model=flamingo,
                    tokenizer=tokenizer,
                    image_processor=image_processor,
                    batch_size=args.batch_size,
                    num_samples=args.num_samples,
                    num_shots=shot,
                    device=args.device,
                    seed=seed,
                    imagenet_root=args.imagenet_root,
                )
                print(
                    f"Shots {shot} Trial {trial} ImageNet score: {imagenet_score}"
                )
                scores.append(imagenet_score)
            print(f"Shots {shot} Mean ImageNet score: {np.mean(scores)}")
            results["imagenet"].append(
                {"shots": shot, "trials": scores, "mean": np.mean(scores)}
            )
    if args.eval_refcoco:
        print("Evaluating on RefCOCO...")
        refcoco_score = evaluate_refcoco(
            model=flamingo,
            tokenizer=tokenizer,
            image_processor=image_processor,
            batch_size=args.batch_size,
            device=args.device,
            tsvfile=args.refcoco_tsvfile,
            vis_embed_size=vis_embed_size,
            rank=args.rank,
            world_size=args.world_size,
            id=args.id,
        )
        results["refcoco"].append(
            {"score": refcoco_score}
        )

def prepare_batch_images(batch, image_processor):
    # Each processed image has shape [C, H, W]; the unsqueezes turn it into
    # [1, 1, 1, C, H, W], and concatenation along dim 0 yields [B, 1, 1, C, H, W],
    # i.e. (batch, images-per-sample, frames, C, H, W) as the vision input expects.
    batch_images = None
    for b in batch:
        b_image = image_processor(b["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
        if batch_images is None:
            batch_images = b_image
        else:
            batch_images = torch.cat([batch_images, b_image], dim=0)
    return batch_images

def get_outputs(
    model,
    batch_images,
    attention_mask,
    max_generation_length,
    min_generation_length,
    num_beams,
    length_penalty,
    input_ids,
    image_start_index_list=None,
    image_nums=None,
    bad_words_ids=None,
):
    # Enter both context managers; chaining them with `and` would only enter
    # the last one.
    with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
        outputs = model.generate(
            batch_images,
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_generation_length,
            min_length=min_generation_length,
            num_beams=num_beams,
            length_penalty=length_penalty,
            image_start_index_list=image_start_index_list,
            image_nums=image_nums,
            bad_words_ids=bad_words_ids,
        )
    # Return only the newly generated tokens, dropping the prompt.
    outputs = outputs[:, len(input_ids[0]):]
    return outputs
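
# The evaluation loops below tokenize one prompt per sample, generate with
# get_outputs, and decode only the newly generated tokens, roughly:
#   outputs = get_outputs(model=model, batch_images=batch_images,
#                         attention_mask=attention_mask, max_generation_length=10,
#                         min_generation_length=1, num_beams=5, length_penalty=0,
#                         input_ids=input_ids,
#                         image_start_index_list=image_start_index_list,
#                         image_nums=image_nums)
#   answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)
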
def evaluate_coco_flickr(
    model,
    tokenizer,
    image_processor,
    batch_size,
    image_dir_path,
    annotations_json_path,
    seed=42,
    max_generation_length=20,
    num_beams=1,
    length_penalty=-2.0,
    device=-1,
    is_flickr=False,
    vis_embed_size=None,
    rank=0,
    world_size=1,
    id=0,
):
    """Evaluate a model on COCO (or Flickr30k) captioning.

    Args:
        model (nn.Module): model to evaluate
        tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
        image_processor: image processor for the model
        batch_size (int): batch size
        image_dir_path (str): path to the directory containing the images
            (currently unused; the LAVIS "coco_caption" test split is loaded instead).
        annotations_json_path (str): path to the json file containing the annotations
            (currently unused, see above).
        seed (int, optional): seed for random number generator. Defaults to 42.
        max_generation_length (int, optional): maximum length of the generated caption. Defaults to 20.
        num_beams (int, optional): number of beams to use for beam search. Defaults to 1.
        length_penalty (float, optional): length penalty for beam search. Defaults to -2.0.
        device (int, optional): device to use. Defaults to -1.
        is_flickr (bool): whether the data is Flickr30k rather than COCO. Defaults to False (COCO).
        vis_embed_size (int, optional): number of visual embedding tokens per image.
        rank (int, optional): rank of this process. Defaults to 0.
        world_size (int, optional): total number of processes. Defaults to 1.
        id (int, optional): suffix used for temporary per-rank result files. Defaults to 0.

    Note:
        The generation call in the loop below hard-codes its own settings
        (30 new tokens, 5 beams, length penalty 0) rather than using the
        arguments above.

    Returns:
        float: CIDEr score
    """
    # eval_dataset = COCOFlickrDataset(
    #     image_dir_path=image_dir_path,
    #     annotations_path=annotations_json_path,
    #     is_flickr=is_flickr,
    # )
    coco_dataset = load_dataset("coco_caption")
    eval_dataset = coco_dataset["test"]
    model.eval().cuda()
    predictions = defaultdict()
    lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
    # if "peft" in lang_encoder_name:
    #     lang_encoder_name = model.lang_encoder.base_model.model.__class__.__name__.lower()
    try:
        media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
        endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
        pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
        bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
    except:
        pass

    def get_prompt(sample):
        return f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"
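
    # With vis_embed_size=3 and pad token "<PAD>", the rendered captioning
    # prompt is just the image placeholder span (illustrative values):
    #   <|#image#|><PAD><PAD><PAD><|#endofimage#|>
    # so the caption is generated directly after <|#endofimage#|>.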
    tokenizer.padding_side = "left"
    cnt = 0
    if world_size > 1:
        torch.distributed.barrier()
    desc = "Running inference Flickr30" if is_flickr else "Running inference COCO"
    for ii, batch in enumerate(more_itertools.chunked(
        tqdm(eval_dataset, desc=desc, disable=(rank != 0)), batch_size
    )):
        if ii % world_size != rank:
            continue
        cnt += len(batch)
        batch_images = prepare_batch_images(
            batch=batch,
            image_processor=image_processor,
        ).cuda()
        batch_text = [get_prompt(s) for s in batch]
        encodings = tokenizer(
            batch_text,
            padding="longest",
            truncation=True,
            return_tensors="pt",
            max_length=2000,
        )
        input_ids = encodings["input_ids"].cuda()
        attention_mask = encodings["attention_mask"].cuda()
        skip_special_tokens = False
        if hasattr(model, "legacy") and model.legacy and "opt" in lang_encoder_name:
            if rank == 0:
                tqdm.write("use legacy model")
            skip_special_tokens = True
            for i in range(len(input_ids)):
                media_token_index = (input_ids[i] == media_token_id).nonzero()[0, 0]
                endofmedia_token_index = (input_ids[i] == endofmedia_token_id).nonzero()[0, 0]
                input_ids[i, media_token_index - 1] = media_token_id
                input_ids[i, media_token_index] = pad_token_id
                input_ids[i, endofmedia_token_index - 1] = endofmedia_token_id
                input_ids[i, endofmedia_token_index] = bos_token_id
        image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
        image_start_index_list = [[x] for x in image_start_index_list]
        image_nums = [1] * len(input_ids)
        if "llama" in lang_encoder_name:
            attention_mask[input_ids == 0] = 0
        outputs = get_outputs(
            model=model,
            batch_images=batch_images,
            attention_mask=attention_mask,
            max_generation_length=30,
            min_generation_length=8,
            num_beams=5,
            length_penalty=0,
            input_ids=input_ids,
            image_start_index_list=image_start_index_list,
            image_nums=image_nums,
        )
        new_predictions = [
            postprocess_captioning_generation(out).replace('"', "")
            for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ]
        # if rank == 0:
        #     tqdm.write(f"{batch_images.shape} {batch[0]} pred: {new_predictions[0]}")
        for i, sample in enumerate(batch):
            predictions[int(sample["image_id"])] = {
                "caption": new_predictions[i],
            }
    results_path = (
        f"flickrresults_{lang_encoder_name}_{rank}_{id}.json"
        if is_flickr
        else f"cocoresults_{lang_encoder_name}_{rank}_{id}.json"
    )
    with open(results_path, "w") as f:
        f.write(
            json.dumps(
                [
                    {"image_id": k, "caption": predictions[k]["caption"]}
                    for k in predictions
                ],
                indent=2,
            )
        )
    print("save to", results_path)
    del predictions
    time.sleep(10)
    if world_size > 1:
        torch.distributed.barrier()
    if rank == 0:
        print(f"evaluate on rank {rank}. world size is {world_size}")
        predictions = []
        for rank_i in range(world_size):
            part_results_path = (
                f"flickrresults_{lang_encoder_name}_{rank_i}_{id}.json"
                if is_flickr
                else f"cocoresults_{lang_encoder_name}_{rank_i}_{id}.json"
            )
            print("load", part_results_path)
            predictions.extend(json.load(open(part_results_path)))
            os.remove(part_results_path)
        print("num:", len(predictions))
        results_path = (
            f"flickrresults_{lang_encoder_name}.json"
            if is_flickr
            else f"cocoresults_{lang_encoder_name}.json"
        )
        json.dump(predictions, open(results_path, "w"), indent=2)
        metrics = compute_cider(
            result_path=results_path,
            annotations_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/.cache/lavis/coco_gt/coco_karpathy_test_gt.json",
        )
        os.makedirs("eval_results", exist_ok=True)
        acc = metrics["CIDEr"]
        with open(os.path.join("eval_results", f"cococap_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
            f.write(json.dumps(predictions, indent=2))
        # delete the temporary file
        os.remove(results_path)
    else:
        metrics = {}
        metrics["CIDEr"] = 0.0
    return metrics["CIDEr"]

def evaluate_vqa(
    model,
    tokenizer,
    image_processor,
    batch_size,
    image_dir_path=None,
    questions_json_path=None,
    annotations_json_path=None,
    vqa_dataset="vqa",
    vis_embed_size=None,
    rank=0,
    world_size=1,
    id=0,
):
    """
    Evaluate a model on VQA-style datasets (VQAv2, OK-VQA, GQA).

    Args:
        model (nn.Module): model to evaluate
        tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
        image_processor: image processor for the model
        batch_size (int): batch size
        image_dir_path (str): path to the image directory (not used for GQA)
        questions_json_path (str): path to the questions json file (not used for GQA)
        annotations_json_path (str): path to the annotations json file (not used for GQA)
        vqa_dataset (str): which dataset to evaluate: "vqa", "ok_vqa", or "gqa". Defaults to "vqa".
        vis_embed_size (int, optional): number of visual embedding tokens per image.
        rank (int, optional): rank of this process. Defaults to 0.
        world_size (int, optional): total number of processes. Defaults to 1.
        id (int, optional): suffix used for temporary per-rank result files. Defaults to 0.

    Note:
        Generation settings (up to 10 new tokens, 5 beams, length penalty 0)
        are hard-coded in the loop below.

    Returns:
        float: accuracy score
    """
    if world_size > 1:
        torch.distributed.barrier()
    if vqa_dataset == "gqa":
        eval_dataset = GQADataset()
    else:
        eval_dataset = VQADataset(
            image_dir_path=image_dir_path,
            question_path=questions_json_path,
            annotations_path=annotations_json_path,
            vqa_dataset=vqa_dataset,
        )
    postprocessor = OKVQAPostProcess()
    try:
        media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
        endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
        pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
        bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
    except:
        pass

    def get_prompt(sample):
        return f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:"
        # return f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"
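
    # With vis_embed_size=3, pad token "<PAD>", and the question "What color is
    # the bus?", the rendered prompt is (illustrative values):
    #   <|#image#|><PAD><PAD><PAD><|#endofimage#|>Question: What color is the bus? Short answer: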
    model.eval().cuda()
    lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
    if "peft" in lang_encoder_name:
        lang_encoder_name = model.lang_encoder.base_model.model.__class__.__name__.lower()
    predictions = []
    tokenizer.padding_side = "left"
    if world_size > 1:
        torch.distributed.barrier()
    for ii, batch in enumerate(more_itertools.chunked(
        tqdm(eval_dataset, desc="Running inference", disable=(rank != 0)), batch_size
    )):
        if ii % world_size != rank:
            continue
        batch_images = prepare_batch_images(
            batch=batch,
            image_processor=image_processor,
        ).cuda()
        batch_text = [get_prompt(s) for s in batch]
        encodings = tokenizer(
            batch_text,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=2000,
        )
        input_ids = encodings["input_ids"].cuda()
        attention_mask = encodings["attention_mask"].cuda()
        skip_special_tokens = True
        if hasattr(model, "legacy") and model.legacy and "opt" in lang_encoder_name:
            if rank == 0:
                tqdm.write("use legacy model")
            for i in range(len(input_ids)):
                media_token_index = (input_ids[i] == media_token_id).nonzero()[0, 0]
                endofmedia_token_index = (input_ids[i] == endofmedia_token_id).nonzero()[0, 0]
                input_ids[i, media_token_index - 1] = media_token_id
                input_ids[i, media_token_index] = pad_token_id
                input_ids[i, endofmedia_token_index - 1] = endofmedia_token_id
                input_ids[i, endofmedia_token_index] = bos_token_id
        image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
        image_start_index_list = [[x] for x in image_start_index_list]
        image_nums = [1] * len(input_ids)
        if "llama" in lang_encoder_name:
            attention_mask[input_ids == 0] = 0
        outputs = get_outputs(
            model=model,
            batch_images=batch_images,
            attention_mask=attention_mask,
            max_generation_length=10,
            min_generation_length=1,
            num_beams=5,
            length_penalty=0,
            input_ids=input_ids,
            image_start_index_list=image_start_index_list,
            image_nums=image_nums,
        )
        # postprocess begin
        new_predictions = [
            out.strip().lower().strip(string.punctuation + " ")
            for out in tokenizer.batch_decode(outputs, skip_special_tokens=skip_special_tokens)
        ]
        if vqa_dataset == "ok_vqa":
            new_predictions = postprocessor._lemmatize(new_predictions)
        if model.special:
            for i in range(len(new_predictions)):
                for answer, _ in Counter(batch[i]['answers']).most_common():
                    if answer in new_predictions[i]:
                        new_predictions[i] = answer
                        break
                    if "cant" in new_predictions[i] and "no" == answer:
                        new_predictions[i] = answer
                        break
                    if "can" in new_predictions[i] and "not" not in new_predictions[i] and "cant" not in new_predictions[i] and "yes" == answer:
                        new_predictions[i] = answer
                        break
        # if rank == 0:
        #     tqdm.write(f"{image_nums} {image_start_index_list}")
        #     for i in range(1):
        #         tqdm.write(f"ID: {batch[i]['question_id']} | gt QA: {batch[i]['question']} {Counter(batch[i]['answers']).most_common()}")
        #         tqdm.write("prompt: " + tokenizer.decode(input_ids[i]))
        #         tqdm.write("model output: " + new_predictions[i])
        predictions.extend(
            [
                {"answer": p, "question_id": sample["question_id"], "_question": sample["question"], "answers": sample["answers"]}
                for p, sample in zip(new_predictions, batch)
            ]
        )
    with open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json", "w") as f:
        f.write(json.dumps(predictions))
    print("save to", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json")
    time.sleep(10)
    if world_size > 1:
        torch.distributed.barrier()
    if rank == 0:
        print(f"evaluate on rank {rank}. world size is {world_size}")
        predictions = []
        for rank_i in range(world_size):
            print("load", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
            predictions.extend(json.load(open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")))
            os.remove(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
        print("num:", len(predictions))
        # save the predictions to a temporary file
        random_uuid = str(uuid.uuid4())
        with open(f"{vqa_dataset}results_{random_uuid}.json", "w") as f:
            f.write(json.dumps(predictions, indent=4))
        if vqa_dataset == "gqa":
            acc = compute_gqa_accuracy(predictions)
        else:
            acc = compute_vqa_accuracy(
                f"{vqa_dataset}results_{random_uuid}.json",
                questions_json_path,
                annotations_json_path,
                vqa_dataset=vqa_dataset,
            )
        print(vqa_dataset, "score:", acc, "| save to", f"{vqa_dataset}results_{random_uuid}.json")
        os.makedirs("eval_results", exist_ok=True)
        with open(os.path.join("eval_results", f"{vqa_dataset}_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
            f.write(json.dumps(predictions, indent=2))
        # delete the temporary file
        os.remove(f"{vqa_dataset}results_{random_uuid}.json")
    else:
        time.sleep(5)
        acc = 0.0
    if world_size > 1:
        torch.distributed.barrier()
    return acc

def evaluate_refcoco(
    model,
    tokenizer,
    image_processor,
    batch_size,
    tsvfile,
    max_generation_length=20,
    num_beams=3,
    length_penalty=-2.0,
    device=-1,
    vis_embed_size=None,
    rank=0,
    world_size=1,
    id=0,
):
    # NOTE: as written, this function is a hard-wired debug path: it ignores the
    # image encoded in each TSV line, loads a local "example.png" with a fixed
    # prompt and box, visualizes one prediction, and then exits.
    model.eval().cuda()
    loc_token_ids = []
    for i in range(1000):
        loc_token_ids.append(int(tokenizer(f"<loc_{i}>", add_special_tokens=False)["input_ids"][-1]))
    media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
    total = 0
    correct = 0
    ious = []
    if "refcocog" in tsvfile:
        dataset_name = "refcocog"
    elif "refcocoplus" in tsvfile:
        dataset_name = "refcocoplus"
    else:
        dataset_name = "refcoco"
    with open(tsvfile, "r") as f:
        lines = f.readlines()
    pbar = tqdm(lines, disable=(rank != 0))
    for ii, line in enumerate(pbar):
        if ii % world_size != rank:
            continue
        total += 1
        line = line.rstrip()
        uniq_id, image_id, text, region_coord, image = line.split("\t")
        # image = Image.open(BytesIO(base64.urlsafe_b64decode(image))).convert("RGB")
        # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/cat.png").convert("RGB")
        # image2 = Image.open("yolo.png").convert("RGB")
        # image1 = image1.resize((224, 224))
        # image2 = image2.resize((224, 224))
        # images = [image1, image2]
        # gt_box = np.array(list(map(float, region_coord.split(","))))
        # width = image.width
        # height = image.height
        # gt_box /= np.array([width, height, width, height])
        # batch_images = [image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0) for image in images]
        # batch_images = torch.cat(batch_images, dim=0)
        # image = Image.open("yolo_test.png").convert("RGB")
        image = Image.open("example.png").convert("RGB")
        image = image.resize((224, 224))
        batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
        # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text.rstrip('.')}<|#visual#|>"]
        prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|><|#previsual#|><|#prebox#|><|#endofattr#|>man<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|> is sitting on<|#object#|><|#previsual#|>"]
        # prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|><|#previsual#|>man<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|> is sitting on<|#object#|><|#previsual#|>"]
        # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"]
        # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>a man<|#visual#|> is doing a trick on a skateboard<|#visual#|>"]
        encodings = tokenizer(
            prompt,
            padding="longest",
            truncation=True,
            return_tensors="pt",
            max_length=2000,
        )
        input_ids = encodings["input_ids"]
        attention_mask = encodings["attention_mask"]
        image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
        image_start_index_list = [image_start_index_list]
        image_nums = [1]
        vision_x = batch_images.cuda()
        lang_x = input_ids.cuda()
        attention_mask = attention_mask.cuda()
        print(image_start_index_list, image_nums)
        model.debug_id = 0
        # outputs = get_outputs(
        #     model=model,
        #     batch_images=vision_x,
        #     attention_mask=attention_mask,
        #     max_generation_length=20,
        #     min_generation_length=8,
        #     num_beams=5,
        #     length_penalty=0,
        #     input_ids=lang_x,
        #     image_start_index_list=image_start_index_list,
        #     image_nums=image_nums,
        # )
        # print(tokenizer.decode(outputs[0]))
        # exit()
        prebox = [93, 20, 155, 172]  # man
        # prebox = [32, 82, 89, 213]  # dog
        # prebox = [34, 49, 166, 164]  # bike
        # Enter both context managers; chaining them with `and` would only
        # enter the last one.
        with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
            outputs = model(
                vision_x=vision_x,
                lang_x=lang_x,
                attention_mask=attention_mask,
                labels=None,
                image_nums=image_nums,
                image_start_index_list=image_start_index_list,
                added_bbox_list=[torch.tensor(prebox).cuda().unsqueeze(0) / 224],
                add_box=True,
                debug_mode=True,
            )
        boxes = outputs["boxes"]
        scores = outputs["scores"]
        box = boxes[scores.argmax()]
        open_cv_image = np.array(image)
        # Convert RGB to BGR
        open_cv_image = open_cv_image[:, :, ::-1].copy()
        open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (255, 0, 0), 2)
        open_cv_image = cv2.rectangle(open_cv_image, prebox[:2], prebox[2:], (0, 0, 255), 2)
        cv2.imwrite("output2.jpg", open_cv_image)
        print(box)
        print(prebox)
        exit()
        # force_words = ["man", "table"]
        # force_words_ids = tokenizer(force_words, add_special_tokens=False).input_ids
        # sequences, hidden_states_for_each_step = get_outputs(
        #     model=model,
        #     batch_images=vision_x,
        #     attention_mask=attention_mask,
        #     max_generation_length=20,
        #     min_generation_length=8,
        #     num_beams=5,
        #     length_penalty=0,
        #     input_ids=lang_x,
        #     image_start_index_list=image_start_index_list,
        #     image_nums=image_nums,
        #     force_words_ids=force_words_ids,
        # )
        # sequence = sequences[0]
        # print(tokenizer.decode(sequence))
        # for i, token in enumerate(sequence):
        #     if token == model.visual_token_id:
        #         print(tokenizer.decode(sequence[:i+1]))
        #         if hasattr(model, "debug_id"):
        #             model.debug_id += 1
        #         else:
        #             model.debug_id = 0
        #         this_lang_x = torch.hstack([lang_x[0], sequence[:i+1]]).unsqueeze(0)
        #         this_attention_mask = torch.ones_like(this_lang_x).cuda()
        #         with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16), torch.no_grad():
        #             _ = model(
        #                 vision_x=vision_x,
        #                 lang_x=this_lang_x,
        #                 attention_mask=this_attention_mask,
        #                 labels=None,
        #                 image_nums=image_nums,
        #                 image_start_index_list=image_start_index_list,
        #                 added_bbox_list=None,
        #             )
        # exit()
| with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f: | |
| f.write(json.dumps([total, correct])) | |
| if world_size > 1: | |
| torch.distributed.barrier() | |
| if rank == 0: | |
| total = 0 | |
| correct = 0 | |
| print(f"evaluate on rank {rank}. world size is {world_size}") | |
| for rank_i in range(world_size): | |
| [total_part, correct_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json")) | |
| os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json") | |
| total += total_part | |
| correct += correct_part | |
| score = correct / total | |
| print("score:", score) | |
| with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{score}"), "w") as f: | |
| pass | |
| else: | |
| score = 0.0 | |
| if world_size > 1: | |
| torch.distributed.barrier() | |
| return score | |
| if __name__ == "__main__": | |
| main() | |