Commit
·
bed23b0
1
Parent(s):
5a34fee
Add RobloxQA_OpenEnded
Browse files
- src/about.py +2 -0
- src/leaderboard/populate.py +1 -1
- src/leaderboard/utils.py +5 -2
src/about.py
CHANGED
@@ -8,4 +8,6 @@ Tracking LLM capabilities regarding Roblox game development.
|
|
8 |
Benchmarks:
|
9 |
|
10 |
- [RobloxQA](https://huggingface.co/datasets/boatbomber/RobloxQA-v1.0): Multiple choice question answering about Roblox APIs and concepts.
|
|
|
|
|
11 |
"""
|
|
|
8 |
Benchmarks:
|
9 |
|
10 |
- [RobloxQA](https://huggingface.co/datasets/boatbomber/RobloxQA-v1.0): Multiple choice question answering about Roblox APIs and concepts.
|
11 |
+
- [RobloxQA_OpenEnded](https://huggingface.co/datasets/boatbomber/RobloxQA-v1.0): Question answering about Roblox APIs and concepts without giving the multiple choices. \
|
12 |
+
Correctness judged by an LLM by comparing the generated answer to the correct answer choice.
|
13 |
"""
|
src/leaderboard/populate.py
CHANGED
@@ -42,7 +42,7 @@ def load_results() -> pd.DataFrame:
|
|
42 |
|
43 |
for c in COLUMNS:
|
44 |
if c.name not in evaluation:
|
45 |
-
evaluation[c.name] =
|
46 |
|
47 |
data.append(evaluation)
|
48 |
|
|
|
42 |
|
43 |
for c in COLUMNS:
|
44 |
if c.name not in evaluation:
|
45 |
+
evaluation[c.name] = c.default
|
46 |
|
47 |
data.append(evaluation)
|
48 |
|
src/leaderboard/utils.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from dataclasses import dataclass
|
|
|
2 |
|
3 |
|
4 |
@dataclass
|
@@ -9,6 +10,7 @@ class ColumnContent:
|
|
9 |
hidden: bool = False
|
10 |
never_hidden: bool = False
|
11 |
searchable: bool = False
|
|
|
12 |
|
13 |
|
14 |
## Leaderboard columns
|
@@ -16,6 +18,7 @@ COLUMNS = [
|
|
16 |
ColumnContent("Model", type="str", displayed_by_default=True, never_hidden=True, searchable=True),
|
17 |
ColumnContent("Precision", type="str", displayed_by_default=False),
|
18 |
ColumnContent("Params (B)", type="number", displayed_by_default=True),
|
19 |
-
ColumnContent("Average", type="number", displayed_by_default=True),
|
20 |
-
ColumnContent("RobloxQA", type="number", displayed_by_default=True),
|
|
|
21 |
]
|
|
|
1 |
from dataclasses import dataclass
|
2 |
+
from typing import Any
|
3 |
|
4 |
|
5 |
@dataclass
|
|
|
10 |
hidden: bool = False
|
11 |
never_hidden: bool = False
|
12 |
searchable: bool = False
|
13 |
+
default: Any = None
|
14 |
|
15 |
|
16 |
## Leaderboard columns
|
|
|
18 |
ColumnContent("Model", type="str", displayed_by_default=True, never_hidden=True, searchable=True),
|
19 |
ColumnContent("Precision", type="str", displayed_by_default=False),
|
20 |
ColumnContent("Params (B)", type="number", displayed_by_default=True),
|
21 |
+
ColumnContent("Average", type="number", displayed_by_default=True, default=0),
|
22 |
+
ColumnContent("RobloxQA", type="number", displayed_by_default=True, default=0),
|
23 |
+
ColumnContent("RobloxQA_OpenEnded", type="number", displayed_by_default=True, default=0),
|
24 |
]
|