Fix handling of HTML files when loading results
Browse files
- requests/HuggingFaceTB/SmolLM2-135M-Instruct_eval_request.json +0 -9
- requests/Qwen/Qwen2.5-0.5B-Instruct_eval_request.json +0 -9
- requests/open-ai/gpt-3.5-turbo_eval_request.json +0 -9
- requests/openai-community/gpt2_eval_request.json +0 -9
- results/HuggingFaceTB/SmolLM2-135M-Instruct_results_2025-04-21 17:27:52.203995.json +0 -38
- results/Qwen/Qwen2.5-0.5B-Instruct_results_2025-04-21 16:50:28.595317.json +0 -38
- results/open-ai/gpt-3.5-turbo_abb_benchmark_answers_2025-04-26 17:17:28.074158+00:00.html +0 -0
- results/open-ai/gpt-3.5-turbo_results_2025-04-26 17:17:26.272549+00:00.json +0 -0
- results/openai-community/gpt2_results_2025-04-21 16:59:47.547731.json +0 -38
- src/leaderboard/read_evals.py +8 -5
requests/HuggingFaceTB/SmolLM2-135M-Instruct_eval_request.json
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
3 |
-
"model_sha": "e2c3f7557efbdec707ae3a336371d169783f1da1",
|
4 |
-
"status": "FINISHED",
|
5 |
-
"submitted_time": "2025-04-21T17:18:59Z",
|
6 |
-
"likes": 178,
|
7 |
-
"params": 0.135,
|
8 |
-
"license": "apache-2.0"
|
9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requests/Qwen/Qwen2.5-0.5B-Instruct_eval_request.json
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"model": "Qwen/Qwen2.5-0.5B-Instruct",
|
3 |
-
"model_sha": "7ae557604adf67be50417f59c2c2f167def9a775",
|
4 |
-
"status": "FINISHED",
|
5 |
-
"submitted_time": "2025-04-21T14:43:01Z",
|
6 |
-
"likes": 310,
|
7 |
-
"params": 0.494,
|
8 |
-
"license": "apache-2.0"
|
9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requests/open-ai/gpt-3.5-turbo_eval_request.json
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"model": "open-ai/gpt-3.5-turbo",
|
3 |
-
"model_sha": "NA",
|
4 |
-
"status": "FINISHED",
|
5 |
-
"submitted_time": "2025-04-26 16:56:14",
|
6 |
-
"likes": -1,
|
7 |
-
"params": 999,
|
8 |
-
"license": "closed"
|
9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requests/openai-community/gpt2_eval_request.json
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"model": "openai-community/gpt2",
|
3 |
-
"model_sha": "607a30d783dfa663caf39e06633721c8d4cfcd7e",
|
4 |
-
"status": "FINISHED",
|
5 |
-
"submitted_time": "2025-04-21T14:50:23Z",
|
6 |
-
"likes": 2679,
|
7 |
-
"params": 0.137,
|
8 |
-
"license": "mit"
|
9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/HuggingFaceTB/SmolLM2-135M-Instruct_results_2025-04-21 17:27:52.203995.json
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"average_score": 3.0,
|
4 |
-
"speed": 0.8272417944325482,
|
5 |
-
"contamination_score": 0,
|
6 |
-
"execution_time": 386.321918,
|
7 |
-
"errors": [],
|
8 |
-
"scores_by_category": [
|
9 |
-
{
|
10 |
-
"category": "Function Calling",
|
11 |
-
"average_score": 6.0,
|
12 |
-
"count": 2
|
13 |
-
},
|
14 |
-
{
|
15 |
-
"category": "Reasoning & Math",
|
16 |
-
"average_score": 1.0,
|
17 |
-
"count": 3
|
18 |
-
}
|
19 |
-
],
|
20 |
-
"scores_by_format": [
|
21 |
-
{
|
22 |
-
"format": "Generation",
|
23 |
-
"average_score": 3.0,
|
24 |
-
"count": 5
|
25 |
-
}
|
26 |
-
]
|
27 |
-
},
|
28 |
-
"config": {
|
29 |
-
"model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
30 |
-
"model_sha": "e2c3f7557efbdec707ae3a336371d169783f1da1",
|
31 |
-
"submitted_time": "2025-04-21T17:18:59Z",
|
32 |
-
"likes": 178,
|
33 |
-
"params": 0.135,
|
34 |
-
"license": "apache-2.0",
|
35 |
-
"model_source": "Hugging Face",
|
36 |
-
"model_category": "Nano"
|
37 |
-
}
|
38 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Qwen/Qwen2.5-0.5B-Instruct_results_2025-04-21 16:50:28.595317.json
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"average_score": 6.0,
|
4 |
-
"speed": 5,
|
5 |
-
"contamination_score": 0,
|
6 |
-
"execution_time": 88.587424,
|
7 |
-
"errors": [],
|
8 |
-
"scores_by_category": [
|
9 |
-
{
|
10 |
-
"category": "Function Calling",
|
11 |
-
"average_score": 5.5,
|
12 |
-
"count": 2
|
13 |
-
},
|
14 |
-
{
|
15 |
-
"category": "Reasoning & Math",
|
16 |
-
"average_score": 1.6666666666666667,
|
17 |
-
"count": 3
|
18 |
-
}
|
19 |
-
],
|
20 |
-
"scores_by_format": [
|
21 |
-
{
|
22 |
-
"format": "Generation",
|
23 |
-
"average_score": 3.2,
|
24 |
-
"count": 5
|
25 |
-
}
|
26 |
-
]
|
27 |
-
},
|
28 |
-
"config": {
|
29 |
-
"model": "Qwen/Qwen2.5-0.5B-Instruct",
|
30 |
-
"model_sha": "7ae557604adf67be50417f59c2c2f167def9a775",
|
31 |
-
"model_source": "Hugging Face",
|
32 |
-
"model_category": "Nano",
|
33 |
-
"submitted_time": "2025-04-21T14:43:01Z",
|
34 |
-
"likes": 310,
|
35 |
-
"params": 0.494,
|
36 |
-
"license": "apache-2.0"
|
37 |
-
}
|
38 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/open-ai/gpt-3.5-turbo_abb_benchmark_answers_2025-04-26 17:17:28.074158+00:00.html
DELETED
The diff for this file is too large to render.
See raw diff
|
|
results/open-ai/gpt-3.5-turbo_results_2025-04-26 17:17:26.272549+00:00.json
DELETED
The diff for this file is too large to render.
See raw diff
|
|
results/openai-community/gpt2_results_2025-04-21 16:59:47.547731.json
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"average_score": 1.0,
|
4 |
-
"speed": 1.1064065631691649,
|
5 |
-
"contamination_score": 0,
|
6 |
-
"execution_time": 516.691865,
|
7 |
-
"errors": [],
|
8 |
-
"scores_by_category": [
|
9 |
-
{
|
10 |
-
"category": "Function Calling",
|
11 |
-
"average_score": 1.0,
|
12 |
-
"count": 2
|
13 |
-
},
|
14 |
-
{
|
15 |
-
"category": "Reasoning & Math",
|
16 |
-
"average_score": 1.0,
|
17 |
-
"count": 3
|
18 |
-
}
|
19 |
-
],
|
20 |
-
"scores_by_format": [
|
21 |
-
{
|
22 |
-
"format": "Generation",
|
23 |
-
"average_score": 1.0,
|
24 |
-
"count": 5
|
25 |
-
}
|
26 |
-
]
|
27 |
-
},
|
28 |
-
"config": {
|
29 |
-
"model": "openai-community/gpt2",
|
30 |
-
"model_sha": "607a30d783dfa663caf39e06633721c8d4cfcd7e",
|
31 |
-
"model_source": "Hugging Face",
|
32 |
-
"model_category": "Nano",
|
33 |
-
"submitted_time": "2025-04-21T14:50:23Z",
|
34 |
-
"likes": 2679,
|
35 |
-
"params": 0.137,
|
36 |
-
"license": "mit"
|
37 |
-
}
|
38 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/leaderboard/read_evals.py
CHANGED
@@ -168,16 +168,19 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
168 |
model_result_filepaths = []
|
169 |
|
170 |
for root, _, files in os.walk(results_path):
|
171 |
-
|
172 |
-
|
173 |
-
|
|
|
|
|
174 |
|
175 |
# Sort the files by date
|
176 |
try:
|
177 |
files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
|
178 |
-
except dateutil.parser._parser.ParserError:
|
|
|
179 |
files = [files[-1]]
|
180 |
-
|
181 |
for file in files:
|
182 |
model_result_filepaths.append(os.path.join(root, file))
|
183 |
|
|
|
168 |
model_result_filepaths = []
|
169 |
|
170 |
for root, _, files in os.walk(results_path):
|
171 |
+
print("HERE",files)
|
172 |
+
# We should only have json files in model results ##we allow HTML files
|
173 |
+
#if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
174 |
+
# continue
|
175 |
+
files = [f for f in files if f.endswith(".json")]
|
176 |
|
177 |
# Sort the files by date
|
178 |
try:
|
179 |
files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
|
180 |
+
except dateutil.parser._parser.ParserError as e:
|
181 |
+
print("Error",e)
|
182 |
files = [files[-1]]
|
183 |
+
print(files)
|
184 |
for file in files:
|
185 |
model_result_filepaths.append(os.path.join(root, file))
|
186 |
|