Spaces:
Runtime error
Runtime error
File size: 6,109 Bytes
193b86e 8d66a77 193b86e 06d8f45 193b86e 06d8f45 193b86e 8d66a77 06d8f45 193b86e 6e02b3f 06d8f45 193b86e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
import json
import re
import string
import warnings
import numpy as np
def normalize_number_str(number_str: str) -> float:
    """
    Convert a numeric string that may carry unit symbols into a float.

    The characters "$", "%" and "," are removed before conversion so that
    inputs such as "$1,234.50" or "12%" can be parsed.

    Parameters:
    - number_str: str, the text to convert

    Returns:
    - float, the parsed value, or float("inf") when the cleaned text is
      still not a valid number (a message is printed in that case)
    """
    # Strip common unit symbols and thousands separators so float() accepts
    # the remaining text.
    cleaned = number_str
    for symbol in ("$", "%", ","):
        cleaned = cleaned.replace(symbol, "")

    try:
        return float(cleaned)
    except ValueError:
        # Infinity is used as a sentinel that never equals a real answer.
        print(f"String {cleaned} cannot be normalized to number str.")
        return float("inf")
def normalize_answer(a):
    """
    Normalize an answer for case-insensitive comparison.

    The value is converted to text, leading/trailing whitespace is removed
    and the result is lower-cased. Inner whitespace and punctuation are left
    untouched.

    Parameters:
    - a: the answer to normalize; a str, a list/tuple of parts (concatenated
      without a separator), any other scalar (converted with str()), or None

    Returns:
    - str, the normalized text; None yields an empty string
    """
    if a is None:
        # A missing answer is treated as empty text instead of raising.
        return ""
    if isinstance(a, (list, tuple)):
        # str() each part so numeric items do not make join() fail.
        a = "".join(str(part) for part in a)
    return str(a).strip().lower()
def exact_match(answer, ground_truth):
    """
    Tell whether an answer equals the ground truth after normalization.

    Both values are passed through normalize_answer before being compared.

    Parameters:
    - answer: the submitted answer
    - ground_truth: the reference answer

    Returns:
    - bool, True when the normalized values are equal
    """
    return normalize_answer(answer) == normalize_answer(ground_truth)
def keyword_match(answer, ground_truth):
    """
    Tell whether the normalized ground truth occurs inside the answer.

    Both values are passed through normalize_answer; the check is a
    substring test of the ground truth within the answer.

    Parameters:
    - answer: the submitted answer
    - ground_truth: the reference keyword or phrase

    Returns:
    - bool, True when the normalized ground truth is non-empty and is
      contained in the normalized answer
    """
    keyword = normalize_answer(ground_truth)
    if not keyword:
        # An empty string is a substring of every string, so an empty
        # reference would otherwise count every answer as correct.
        return False
    return keyword in normalize_answer(answer)
def split_string(
    s: str,
    char_list: list[str] = [",", ";"],
) -> list[str]:
    """
    Split a string on any of the given separator characters.

    Parameters:
    - s: str, the text to split
    - char_list: list[str], separator characters (default: "," and ";");
      every character of every entry acts as a separator

    Returns:
    - list[str], the pieces of s between separators; [s] when no separator
      characters are supplied
    """
    # The default list is only read, never mutated, so sharing it across
    # calls is safe.
    # Escape each entry so characters such as "]", "^", "-" or "\" are taken
    # literally instead of altering the character class.
    separators = "".join(re.escape(chars) for chars in char_list)
    if not separators:
        # "[]" is not a valid regular expression; nothing to split on.
        return [s]
    return re.split(f"[{separators}]", s)
def question_scorer(
    user_task: dict,
    val: dict,
) -> list:
    """
    Score one submission against its reference record.

    Each sub-question listed in val["score"] is checked against the
    submitted answer at the same index in user_task["score_answer"]:
    - "multiple choice": exact_match
    - "fill in the blanks": keyword_match
    - "short answer questions": keyword_match against any accepted answer
    Sub-questions of any other type are never counted as correct.

    For every correct sub-question its expertise, reasoning and
    comprehension values are added to the running totals, and the level
    becomes the highest "score" among correct sub-questions. If the
    submitted final answer exactly matches a non-empty expected final
    answer, the level is set to 10; the three ability totals are left as
    they are.

    Parameters:
    - user_task: dict, read keys: "final_answer" and "score_answer"
      (indexable by sub-question position)
    - val: dict, read keys: "Final answer" and "score"; val["score"] holds
      parallel sequences under "type", "answer", "expertise", "reasoning",
      "comprehension" and "score"

    Returns:
    - list, [level, expertise, reasoning, comprehension]

    NOTE(review): the nesting of the debug print (once, after the loop) and
    of the level update (only for correct answers) was inferred from the
    logic - confirm against the intended behavior.
    """
    final_answer = user_task["final_answer"]
    expected_answer = val["Final answer"]
    rubric = val["score"]

    # Scoring totals.
    level = 0
    expertise = 0
    reasoning = 0
    comprehension = 0

    for i, question_type in enumerate(rubric["type"]):
        kind = question_type.lower()
        truth = rubric["answer"][i]

        if kind == "multiple choice":
            correct = exact_match(user_task["score_answer"][i], truth)
        elif kind == "fill in the blanks":
            correct = keyword_match(user_task["score_answer"][i], truth)
        elif kind == "short answer questions":
            # A bare string is treated as a single accepted answer;
            # iterating it directly would test one character at a time.
            accepted = [truth] if isinstance(truth, str) else truth
            correct = any(
                keyword_match(user_task["score_answer"][i], option)
                for option in accepted
            )
        else:
            correct = False

        if not correct:
            continue

        # Add points for a correct sub-question.
        expertise += rubric["expertise"][i]
        reasoning += rubric["reasoning"][i]
        comprehension += rubric["comprehension"][i]
        level = max(level, rubric["score"][i])

    print([level, expertise, reasoning, comprehension])

    # A correct final answer earns the full level, but the ability scores
    # are not increased any further.
    if expected_answer and exact_match(final_answer, expected_answer):
        level = 10
    return [level, expertise, reasoning, comprehension]
# # if gt is a number
# if is_float(ground_truth):
# print(f"Evaluating {model_answer} as a number.")
# normalized_answer = normalize_number_str(model_answer)
# return normalized_answer == float(ground_truth)
#
# # if gt is a list
# elif any(char in ground_truth for char in [",", ";"]):
# print(f"Evaluating {model_answer} as a comma separated list.")
# # question with the fish: normalization removes punct
#
# gt_elems = split_string(ground_truth)
# ma_elems = split_string(model_answer)
#
# # check length is the same
# if len(gt_elems) != len(ma_elems):
# warnings.warn(
# "Answer lists have different lengths, returning False.", UserWarning
# )
# return False
#
# # compare each element as float or str
# comparisons = []
# for ma_elem, gt_elem in zip(ma_elems, gt_elems):
# if is_float(gt_elem):
# normalized_ma_elem = normalize_number_str(ma_elem)
# comparisons.append(normalized_ma_elem == float(gt_elem))
# else:
# # we do not remove punct since comparisons can include punct
# comparisons.append(
# normalize_str(ma_elem, remove_punct=False)
# == normalize_str(gt_elem, remove_punct=False)
# )
# return all(comparisons)
#
# # if gt is a str
# else:
# print(f"Evaluating {model_answer} as a string.")
# return normalize_str(model_answer) == normalize_str(ground_truth)
def normalize_str(input_str: str, remove_punct: bool = True) -> str:
    """
    Normalize a string by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase

    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)

    Returns:
    - str, the normalized string
    """
    # Remove all white spaces. Required e.g. for seagull vs. sea gull.
    collapsed = re.sub(r"\s", "", input_str).lower()
    if not remove_punct:
        return collapsed

    # Drop every ASCII punctuation character listed in string.punctuation.
    drop_punctuation = str.maketrans("", "", string.punctuation)
    return collapsed.translate(drop_punctuation)
|