Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
f0189a6
1
Parent(s):
42f179a
fix: evaluation and scorer test
Browse files
dabstep_benchmark/evaluation/scorer.py
CHANGED
|
@@ -4,8 +4,25 @@ import math
|
|
| 4 |
from difflib import SequenceMatcher
|
| 5 |
|
| 6 |
def is_numeric_with_commas(value: str) -> bool:
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
def question_scorer(input1: str, input2: str) -> bool:
|
| 11 |
# Remove leading/trailing whitespace and convert to lowercase
|
|
@@ -67,7 +84,7 @@ def compare_numeric(num1: float, num2: float) -> bool:
|
|
| 67 |
|
| 68 |
# For percentages and small numbers, use a more lenient comparison
|
| 69 |
if num1 < 1 and num2 < 1:
|
| 70 |
-
return math.isclose(num1, num2, rel_tol=1e-
|
| 71 |
|
| 72 |
# For larger numbers, use the original comparison method
|
| 73 |
dec_places1 = len(str(num1).split('.')[-1]) if '.' in str(num1) else 0
|
|
@@ -79,7 +96,7 @@ def compare_numeric(num1: float, num2: float) -> bool:
|
|
| 79 |
if rounded1 == rounded2:
|
| 80 |
return True
|
| 81 |
|
| 82 |
-
return math.isclose(num1, num2, rel_tol=1e-
|
| 83 |
|
| 84 |
def compare_strings(str1: str, str2: str) -> bool:
|
| 85 |
# Remove all whitespace and punctuation
|
|
|
|
| 4 |
from difflib import SequenceMatcher
|
| 5 |
|
| 6 |
# Compiled once at import time so repeated scorer calls reuse the same pattern
# object instead of handing the verbose source string to ``re`` on every call.
_NUMERIC_WITH_SEPARATOR_RE = re.compile(
    r'''
    \$?                                 # optional leading dollar sign
    (?:                                 # two alternatives:
        \d{1,3}(?:,\d{3})+(?:\.\d+)?    # 1) at least one comma group + optional .decimal
      | \d+[.,]\d+                      # 2) plain decimal written with "." or ","
    )
    ''',
    re.VERBOSE,
)


def is_numeric_with_commas(value: str) -> bool:
    """Return True if *value* is a number written with a separator character.

    Leading and trailing whitespace is ignored and a single leading ``$`` is
    allowed. After that, the whole string must be one of:

    - a number using comma thousands separators (at least one comma group),
      with an optional dot decimal part, e.g. ``"1,000"`` or ``"12,345.67"``;
    - a plain decimal whose separator is a dot or a comma, e.g. ``"0.99"``
      or ``"0,99"``.

    Plain integers without any separator (e.g. ``"64"``) are rejected, as are
    signed values and strings containing internal whitespace.
    """
    # fullmatch anchors both ends, so the pattern itself needs no ^ / $.
    return _NUMERIC_WITH_SEPARATOR_RE.fullmatch(value.strip()) is not None
|
| 26 |
|
| 27 |
def question_scorer(input1: str, input2: str) -> bool:
|
| 28 |
# Remove leading/trailing whitespace and convert to lowercase
|
|
|
|
| 84 |
|
| 85 |
# For percentages and small numbers, use a more lenient comparison
|
| 86 |
if num1 < 1 and num2 < 1:
|
| 87 |
+
return math.isclose(num1, num2, rel_tol=1e-4, abs_tol=1e-4)
|
| 88 |
|
| 89 |
# For larger numbers, use the original comparison method
|
| 90 |
dec_places1 = len(str(num1).split('.')[-1]) if '.' in str(num1) else 0
|
|
|
|
| 96 |
if rounded1 == rounded2:
|
| 97 |
return True
|
| 98 |
|
| 99 |
+
return math.isclose(num1, num2, rel_tol=1e-4, abs_tol=1e-4)
|
| 100 |
|
| 101 |
def compare_strings(str1: str, str2: str) -> bool:
|
| 102 |
# Remove all whitespace and punctuation
|
dabstep_benchmark/leaderboard.py
CHANGED
|
@@ -311,7 +311,11 @@ def generate_leaderboard_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
|
|
| 311 |
leaderboard_df["Agent"] = leaderboard_df["Agent"].apply(lambda x: f"**{x}**")
|
| 312 |
|
| 313 |
# sort-by best score
|
| 314 |
-
leaderboard_df.sort_values(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
|
| 316 |
validated_lb = leaderboard_df[leaderboard_df["validated"] == True].drop(columns=["validated"])
|
| 317 |
unvalidated_lb = leaderboard_df[leaderboard_df["validated"] == False].drop(columns=["validated"])
|
|
|
|
| 311 |
leaderboard_df["Agent"] = leaderboard_df["Agent"].apply(lambda x: f"**{x}**")
|
| 312 |
|
| 313 |
# sort-by best score
|
| 314 |
+
leaderboard_df.sort_values(
|
| 315 |
+
by=["Hard Level Accuracy (%)", "Easy Level Accuracy (%)"],
|
| 316 |
+
ascending=[False, False],
|
| 317 |
+
inplace=True
|
| 318 |
+
)
|
| 319 |
|
| 320 |
validated_lb = leaderboard_df[leaderboard_df["validated"] == True].drop(columns=["validated"])
|
| 321 |
unvalidated_lb = leaderboard_df[leaderboard_df["validated"] == False].drop(columns=["validated"])
|
dabstep_benchmark/tests/test_scorer.py
CHANGED
|
@@ -51,12 +51,13 @@ def test_list_match(input1, input2, expected):
|
|
| 51 |
@pytest.mark.parametrize("input1, input2, expected", [
|
| 52 |
("42, hello", "42, hello", True),
|
| 53 |
("42, world", "42, hello", False),
|
|
|
|
| 54 |
])
|
| 55 |
def test_mixed_list_match(input1, input2, expected):
|
| 56 |
assert question_scorer(input1, input2) == expected
|
| 57 |
|
| 58 |
@pytest.mark.parametrize("input1, input2, expected", [
|
| 59 |
-
("3.14", "3.1483",
|
| 60 |
("3.14", "3.20", False),
|
| 61 |
("1", "1.0", True),
|
| 62 |
("1.0", "1", True),
|
|
@@ -66,7 +67,9 @@ def test_mixed_list_match(input1, input2, expected):
|
|
| 66 |
("$0.10", "$0.10 per retry", True),
|
| 67 |
("D", "D) Apples", True),
|
| 68 |
("D", "A) Oranges", False),
|
| 69 |
-
("25.0", "0.250", False) #input is not a percentage
|
|
|
|
|
|
|
| 70 |
])
|
| 71 |
def test_approximate_numeric_match(input1, input2, expected):
|
| 72 |
assert question_scorer(input1, input2) == expected
|
|
@@ -74,7 +77,7 @@ def test_approximate_numeric_match(input1, input2, expected):
|
|
| 74 |
@pytest.mark.parametrize("input1, input2, expected", [
|
| 75 |
("73.15%", "73.1495", True),
|
| 76 |
("42%", "42", True),
|
| 77 |
-
("30%", "30.1",
|
| 78 |
("25", "25%", True),
|
| 79 |
("100%", "100", True),
|
| 80 |
("0.1%", "0.1", True),
|
|
|
|
| 51 |
@pytest.mark.parametrize(
    "input1, input2, expected",
    [
        ("42, hello", "42, hello", True),
        ("42, world", "42, hello", False),
        ("64", "64, 53, 454, 231, 473, 381", False),
    ],
)
def test_mixed_list_match(input1, input2, expected):
    """Check question_scorer's verdict on comma-separated answers.

    The last case pairs a single value with a longer comma-separated list and
    expects no match.
    """
    verdict = question_scorer(input1, input2)
    assert verdict == expected
|
| 58 |
|
| 59 |
@pytest.mark.parametrize("input1, input2, expected", [
|
| 60 |
+
("3.14", "3.1483", False),
|
| 61 |
("3.14", "3.20", False),
|
| 62 |
("1", "1.0", True),
|
| 63 |
("1.0", "1", True),
|
|
|
|
| 67 |
("$0.10", "$0.10 per retry", True),
|
| 68 |
("D", "D) Apples", True),
|
| 69 |
("D", "A) Oranges", False),
|
| 70 |
+
("25.0", "0.250", False), #input is not a percentage,
|
| 71 |
+
("5.760000", "5.715872", False),
|
| 72 |
+
("8.68000000000000", "8.66999999999916", False)
|
| 73 |
])
|
| 74 |
def test_approximate_numeric_match(input1, input2, expected):
|
| 75 |
assert question_scorer(input1, input2) == expected
|
|
|
|
| 77 |
@pytest.mark.parametrize("input1, input2, expected", [
|
| 78 |
("73.15%", "73.1495", True),
|
| 79 |
("42%", "42", True),
|
| 80 |
+
("30%", "30.1", False),
|
| 81 |
("25", "25%", True),
|
| 82 |
("100%", "100", True),
|
| 83 |
("0.1%", "0.1", True),
|