# eval/model_vqa_mmbench.py
# MMBench multiple-choice VQA evaluation (Video-XL, built on LongVA).
import argparse
import torch
import os
import json
import pandas as pd
from tqdm import tqdm
import shortuuid
from PIL import Image
from io import BytesIO
import base64
from longva.constants import IMAGE_TOKEN_INDEX
from longva.conversation import conv_templates
from longva.model.builder import load_pretrained_model
from longva.mm_utils import tokenizer_image_token, process_images, transform_input_id
from torch.utils.data import Dataset, DataLoader
import math
# MMBench stores up to four answer choices per question in these columns.
all_options = ['A', 'B', 'C', 'D']


def load_image_from_base64(image):
    """Decode a base64-encoded image from the MMBench TSV into a PIL Image."""
    return Image.open(BytesIO(base64.b64decode(image)))

def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks"""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division, so no items are dropped
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
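
# For illustration: split_list([1, 2, 3, 4, 5], 2) gives chunk_size = ceil(5 / 2) = 3,
# yielding [[1, 2, 3], [4, 5]]; the final chunk may be shorter than the others.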

def get_chunk(lst, n, k):
    """Return the k-th of n chunks, for sharding the benchmark across processes."""
    chunks = split_list(lst, n)
    return chunks[k]

def is_none(value):
    """Treat None, NaN, and the strings 'nan'/'none' (any case) as missing."""
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    if isinstance(value, str) and value.lower() in ('nan', 'none'):
        return True
    return False

def get_options(row, options):
    """Collect option texts for a row, stopping at the first missing column."""
    parsed_options = []
    for option in options:
        option_value = row[option]
        if is_none(option_value):
            break
        parsed_options.append(option_value)
    return parsed_options
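
# For illustration (hypothetical row): if row = {'A': 'cat', 'B': 'dog', 'C': float('nan'), 'D': float('nan')},
# get_options(row, all_options) returns ['cat', 'dog'], since a question may offer fewer than four choices.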

def eval_model(args):
    # Model
    tokenizer, model, image_processor, _ = load_pretrained_model(args.model_path, None, "llava_qwen", device_map="cuda:0")
    questions = pd.read_table(os.path.expanduser(args.question_file))  # MMBench questions ship as a TSV
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")
    # Greedy decoding; temperature/top_p are inert because do_sample=False.
    gen_kwargs = {"do_sample": False, "temperature": 0, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 128}
    for index, row in tqdm(questions.iterrows(), total=len(questions)):
        model.memory.reset()  # reset the model's memory state between samples
        options = get_options(row, all_options)
        cur_option_char = all_options[:len(options)]
        # With --all-rounds, ask the question once per option rotation (circular evaluation).
        if args.all_rounds:
            num_rounds = len(options)
        else:
            num_rounds = 1
        for round_idx in range(num_rounds):
            idx = row['index']
            question = row['question']
            hint = row['hint']
            image = load_image_from_base64(row['image'])
            if not is_none(hint):
                question = hint + '\n' + question
            # Append the (possibly rotated) choices as "A. ...", "B. ...", and so on.
            for option_char, option in zip(all_options[:len(options)], options):
                question = question + '\n' + option_char + '. ' + option
            qs = cur_prompt = question
            # qs = "<image>" + '\n' + qs
            if args.single_pred_prompt:
                if args.lang == 'cn':
                    qs = qs + '\n' + "请直接回答选项字母。"  # "Please answer with the option's letter directly."
                else:
                    qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
            # Qwen chat template; <image> marks where the visual tokens are inserted.
            prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\n{qs}<|im_end|>\n<|im_start|>assistant\n"
            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
            image_tensor = process_images([image], image_processor, model.config)
            image_tensor = image_tensor.to('cuda', dtype=torch.float16)
            num_tokens = (image_tensor.shape[1] - 1) * 144  # visual-token count expected by transform_input_id below
            with torch.inference_mode():
                output_ids = model.generate(input_ids, images=image_tensor, image_sizes=[image.size], modalities=["image"], **gen_kwargs)
            if IMAGE_TOKEN_INDEX in input_ids:
                # Expand the image placeholder to its full visual-token length so the
                # prompt can be sliced off the front of the generated sequence.
                transform_input_ids = transform_input_id(input_ids, num_tokens, model.config.vocab_size - 1)
                output_ids = output_ids[:, transform_input_ids.shape[1]:]
            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
            ans_id = shortuuid.uuid()
            ans_file.write(json.dumps({"question_id": idx,
                                       "round_id": round_idx,
                                       "prompt": cur_prompt,
                                       "text": outputs,
                                       "options": options,
                                       "option_char": cur_option_char,
                                       "answer_id": ans_id,
                                       "model_id": "longva_qwen",
                                       "metadata": {}}) + "\n")
            ans_file.flush()
            # Rotate the options so each choice visits every letter position across rounds.
            options = options[1:] + options[:1]
            cur_option_char = cur_option_char[1:] + cur_option_char[:1]
    ans_file.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default=None)
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--model-type", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default=None)
    parser.add_argument("--question-file", type=str, default=None)
    parser.add_argument("--answers-file", type=str, default=None)
    parser.add_argument("--conv-mode", type=str, default=None)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--all-rounds", action="store_true")
    parser.add_argument("--single-pred-prompt", action="store_true")
    parser.add_argument("--lang", type=str, default="en")
    args = parser.parse_args()
    eval_model(args)
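
# Example invocation (paths are hypothetical; adjust to your checkpoint and data layout):
#   python eval/model_vqa_mmbench.py \
#       --model-path ./checkpoints/videoxl \
#       --question-file ./data/mmbench_dev.tsv \
#       --answers-file ./results/mmbench_answers.jsonl \
#       --single-pred-prompt --all-rounds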