feat: fix the to_dict function
Files changed:
- src/leaderboard/read_evals.py  +12 -14
- src/populate.py  +6 -3
- tests/src/display/test_utils.py  +0 -1
- tests/src/leaderboard/test_read_evals.py  +7 -2
- tests/src/test_populate.py  +14 -6
src/leaderboard/read_evals.py  CHANGED

@@ -1,4 +1,5 @@
 import glob
+from collections import defaultdict
 import json
 import os.path
 from dataclasses import dataclass

@@ -6,7 +7,7 @@ from typing import List

 import dateutil.parser._parser

-from src.display.utils import …
+from src.display.utils import AutoEvalColumnQA
 from src.benchmarks import get_safe_name


@@ -61,20 +62,19 @@ class FullEvalResult:
             results=result_list
         )

-    def to_dict(self, task='qa', metric='ndcg_at_1'):
+    def to_dict(self, task='qa', metric='ndcg_at_1') -> List:
         """Convert FullEvalResult to a list of dict compatible with our dataframe UI
         """
-        results = …
+        results = defaultdict(dict)
         for eval_result in self.results:
             if eval_result.metric != metric:
                 continue
             if eval_result.task != task:
                 continue
-            …
-            …
-            …
-            …
-            …}
+            results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
+            results[eval_result.eval_name][AutoEvalColumnQA.retrieval_model.name] = self.retrieval_model
+            results[eval_result.eval_name][AutoEvalColumnQA.reranking_model.name] = self.reranking_model
+
             for result in eval_result.results:
                 # add result for each domain, language, and dataset
                 domain = result["domain"]

@@ -82,12 +82,11 @@ class FullEvalResult:
                 dataset = result["dataset"]
                 value = result["value"]
                 if task == 'qa':
-                    benchmark_name = f"{…
+                    benchmark_name = f"{domain}_{lang}"
                 elif task == 'long_doc':
-                    benchmark_name = f"{…
-                …
-                …
-        return results
+                    benchmark_name = f"{domain}_{lang}_{dataset}_{metric}"
+                results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
+        return [v for v in results.values()]

     def update_with_request_file(self, request_path):
         """

@@ -148,7 +147,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEval
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # create evaluation results
-        # TODO: fix the bug here, the running results should not be loaded
         eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
         # get the latest result that is finished
         eval_result.update_with_request_file(requests_path)
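Note: the rewritten to_dict groups every matching metric into one row per eval_name via defaultdict(dict) and returns the rows as a list. A minimal, self-contained sketch of that grouping pattern (toy records and invented values, not the project's real dataclasses; the real code additionally passes each benchmark name through get_safe_name):

from collections import defaultdict

# Toy stand-ins for the real eval objects -- illustrative data only.
eval_records = [
    {"eval_name": "bge-m3_NoReranker", "task": "qa", "metric": "ndcg_at_1",
     "results": [{"domain": "wiki", "lang": "en", "value": 0.71}]},
    {"eval_name": "bge-m3_NoReranker", "task": "qa", "metric": "ndcg_at_1",
     "results": [{"domain": "wiki", "lang": "zh", "value": 0.69}]},
]

rows = defaultdict(dict)
for rec in eval_records:
    if rec["task"] != "qa" or rec["metric"] != "ndcg_at_1":
        continue
    row = rows[rec["eval_name"]]  # all records with the same eval_name share one row
    row["eval_name"] = rec["eval_name"]
    for r in rec["results"]:
        row[f"{r['domain']}_{r['lang']}"] = r["value"]  # e.g. "wiki_en"

print(list(rows.values()))
# [{'eval_name': 'bge-m3_NoReranker', 'wiki_en': 0.71, 'wiki_zh': 0.69}]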
src/populate.py  CHANGED

@@ -4,7 +4,7 @@ import os
 import pandas as pd

 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import …
+from src.display.utils import AutoEvalColumnQA, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
 from typing import Tuple

@@ -12,10 +12,13 @@ from typing import Tuple
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[list[EvalResult], pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = […
+    all_data_json = []
+    for v in raw_data:
+        all_data_json += v.to_dict()

     df = pd.DataFrame.from_records(all_data_json)
-    df = df.…
+    df["Average ⬆️"] = df[benchmark_cols].mean(axis=1)
+    # df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
     df = df[cols].round(decimals=2)

     # filter out if any of the benchmarks have not been produced
tests/src/display/test_utils.py  CHANGED

@@ -2,7 +2,6 @@ import pytest
 from src.display.utils import fields, AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS, COLS_LITE, TYPES, EVAL_COLS, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS


-@pytest.mark.parametrize('auto_eval_column')
 def test_fields():
     for c in fields(AutoEvalColumnQA):
         print(c)
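Note: dropping the marker is the right call: pytest.mark.parametrize expects both argnames and an argvalues list, and test_fields accepts no matching argument, so the one-argument form would fail once pytest tries to apply it. For comparison, a valid parametrized test looks like this (illustrative example, unrelated to the leaderboard code):

import pytest

# parametrize needs argnames AND argvalues, and the test must take the argument.
@pytest.mark.parametrize("value", [1, 2, 3])
def test_square_is_non_negative(value):
    assert value * value >= 0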
tests/src/leaderboard/test_read_evals.py  CHANGED

@@ -14,8 +14,13 @@ def test_init_from_json_file():
 def test_to_dict():
     json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
     full_eval_result = FullEvalResult.init_from_json_file(json_fp)
-    …
-    assert len(…
+    result_list = full_eval_result.to_dict(task='qa', metric='ndcg_at_1')
+    assert len(result_list) == 1
+    result_dict = result_list[0]
+    assert result_dict["Retrieval Model"] == "bge-m3"
+    assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
+    assert result_dict["qa_wiki_en"] is not None
+    assert result_dict["qa_wiki_zh"] is not None


 def test_get_request_file_for_model():
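Note: these assertions pin down the row shape produced by to_dict after get_safe_name is applied to each benchmark name: flat keys for the display columns plus one key per benchmark. Purely for illustration, one such row might look like the following (the eval_name and numeric values are invented):

# Illustrative shape of one row from to_dict(task='qa', metric='ndcg_at_1').
result_dict = {
    "eval_name": "bge-m3_bge-reranker-v2-m3",  # invented for the example
    "Retrieval Model": "bge-m3",
    "Reranking Model": "bge-reranker-v2-m3",
    "qa_wiki_en": 0.71,
    "qa_wiki_zh": 0.69,
}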
tests/src/test_populate.py  CHANGED

@@ -3,10 +3,18 @@ from pathlib import Path

 cur_fp = Path(__file__)

+
 def test_get_leaderboard_df():
-    requests_path = cur_fp.parents[…
-    results_path = cur_fp.parents[…
-    cols = []
-    benchmark_cols = []
-    …
-    …
+    requests_path = cur_fp.parents[1] / "toydata" / "test_requests"
+    results_path = cur_fp.parents[1] / "toydata" / "test_results"
+    cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'wiki_en', 'wiki_zh',]
+    benchmark_cols = ['wiki_en', 'wiki_zh',]
+    raw_data, df = get_leaderboard_df(results_path, requests_path, cols, benchmark_cols)
+    assert df.shape[0] == 2
+    assert df["Retrieval Model"][0] == "bge-m3"
+    assert df["Retrieval Model"][1] == "bge-m3"
+    assert df["Reranking Model"][0] == "NoReranker"
+    assert df["Reranking Model"][1] == "bge-reranker-v2-m3"
+    assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh',]].isnull().values.any()
+
+
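Note: with the toy fixtures under tests/toydata in place, the updated tests can be run directly, for example:

python -m pytest tests/src/leaderboard/test_read_evals.py tests/src/test_populate.py -q

(assuming pytest is installed and the repository root is importable, since the tests import from src).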