"""MMBench-style multiple-choice VQA evaluation for a LongVA (llava_qwen) checkpoint.

Reads a TSV of questions with base64-encoded images, optionally rotates the answer
options across rounds (--all-rounds), and writes one JSONL answer record per round.
"""
import argparse
import base64
import json
import math
import os
from io import BytesIO

import pandas as pd
import shortuuid
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from longva.constants import IMAGE_TOKEN_INDEX
from longva.conversation import conv_templates
from longva.mm_utils import tokenizer_image_token, process_images, transform_input_id
from longva.model.builder import load_pretrained_model

all_options = ['A', 'B', 'C', 'D']


def load_image_from_base64(image):
    return Image.open(BytesIO(base64.b64decode(image)))


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks."""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


def is_none(value):
    if value is None:
        return True
    if type(value) is float and math.isnan(value):
        return True
    if type(value) is str and value.lower() == 'nan':
        return True
    if type(value) is str and value.lower() == 'none':
        return True
    return False


def get_options(row, options):
    """Collect option texts (columns 'A'..'D') until the first empty one."""
    parsed_options = []
    for option in options:
        option_value = row[option]
        if is_none(option_value):
            break
        parsed_options.append(option_value)
    return parsed_options


def eval_model(args):
    # Model
    tokenizer, model, image_processor, _ = load_pretrained_model(
        args.model_path, None, "llava_qwen", device_map="cuda:0")

    questions = pd.read_table(os.path.expanduser(args.question_file))
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")

    gen_kwargs = {
        "do_sample": False,
        "temperature": 0,
        "top_p": None,
        "num_beams": 1,
        "use_cache": True,
        "max_new_tokens": 128,
    }

    for index, row in tqdm(questions.iterrows(), total=len(questions)):
        # Clear the model's visual memory between samples.
        model.memory.reset()
        options = get_options(row, all_options)
        cur_option_char = all_options[:len(options)]

        if args.all_rounds:
            num_rounds = len(options)
        else:
            num_rounds = 1

        for round_idx in range(num_rounds):
            idx = row['index']
            question = row['question']
            hint = row['hint']
            image = load_image_from_base64(row['image'])
            if not is_none(hint):
                question = hint + '\n' + question
            for option_char, option in zip(all_options[:len(options)], options):
                question = question + '\n' + option_char + '. ' + option
            qs = cur_prompt = question

            if args.single_pred_prompt:
                if args.lang == 'cn':
                    qs = qs + '\n' + "请直接回答选项字母。"
                else:
                    qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n\n{qs}<|im_end|>\n<|im_start|>assistant\n" input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() image_tensor = process_images([image], image_processor, model.config) image_tensor=image_tensor.to('cuda', dtype=torch.float16) num_tokens=(image_tensor.shape[1]-1) *144 with torch.inference_mode(): output_ids = model.generate(input_ids, images=image_tensor, image_sizes=[image.size], modalities=["image"], **gen_kwargs) if -200 in input_ids: transform_input_ids=transform_input_id(input_ids,num_tokens,model.config.vocab_size-1) output_ids=output_ids[:,transform_input_ids.shape[1]:] outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() ans_id = shortuuid.uuid() ans_file.write(json.dumps({"question_id": idx, "round_id": round_idx, "prompt": cur_prompt, "text": outputs, "options": options, "option_char": cur_option_char, "answer_id": ans_id, "model_id": "longva_qwen", "metadata": {}}) + "\n") ans_file.flush() # rotate options options = options[1:] + options[:1] cur_option_char = cur_option_char[1:] + cur_option_char[:1] ans_file.close() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model-path", type=str, default=None) parser.add_argument("--model-base", type=str, default=None) parser.add_argument("--model-type", type=str, default=None) parser.add_argument("--image-folder", type=str, default=None) parser.add_argument("--question-file", type=str, default=None) parser.add_argument("--answers-file", type=str, default=None) parser.add_argument("--conv-mode", type=str, default=None) parser.add_argument("--num-chunks", type=int, default=1) parser.add_argument("--chunk-idx", type=int, default=0) parser.add_argument("--temperature", type=float, default=0.2) parser.add_argument("--top_p", type=float, default=None) parser.add_argument("--num_beams", type=int, default=1) parser.add_argument("--all-rounds", action="store_true") parser.add_argument("--single-pred-prompt", action="store_true") parser.add_argument("--lang", type=str, default="en") args = parser.parse_args() eval_model(args)