fix: fix the data loader
- app.py +2 -1
- src/envs.py +2 -2
- src/leaderboard/read_evals.py +8 -3
- src/populate.py +3 -0
- tests/toydata/test_results/bge-m3/NoReranker/{results_demo_2023-11-21T18-10-08.json → results_2023-11-21T18-10-08.json} +0 -0
- tests/toydata/test_results/bge-m3/NoReranker/{results_demo_2023-12-21T18-10-08.json → results_2023-12-21T18-10-08.json} +0 -0
- tests/toydata/test_results/bge-m3/bge-reranker-v2-m3/{results_demo_2023-11-21T18-10-08.json → results_2023-11-21T18-10-08.json} +0 -0
app.py
CHANGED

@@ -42,7 +42,8 @@ def restart_space():
 # restart_space()
 
 raw_data_qa, original_df_qa = get_leaderboard_df(
-    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS, task='qa', metric='…
+    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_3')
+print(f'data loaded: {len(raw_data_qa)}, {original_df_qa.shape}')
 leaderboard_df = original_df_qa.copy()
 
 # (
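The change pins the QA leaderboard to the ndcg_at_3 metric and logs how many raw results and what dataframe shape came back. A minimal sanity check in the same spirit (the empty-frame guard below is an illustrative addition, not part of this commit) could look like:

    raw_data_qa, original_df_qa = get_leaderboard_df(
        EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS,
        task='qa', metric='ndcg_at_3')
    if original_df_qa.empty:
        # An empty frame usually means no result matched the task/metric filter
        # or the results path is wrong.
        raise RuntimeError("no QA results loaded; check EVAL_RESULTS_PATH and the metric name")
    print(f'data loaded: {len(raw_data_qa)}, {original_df_qa.shape}')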
src/envs.py
CHANGED

@@ -17,8 +17,8 @@ RESULTS_REPO = f"{OWNER}/results"
 CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
-EVAL_REQUESTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/…
-EVAL_RESULTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/…
+EVAL_REQUESTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/toys/toydata/requests" # os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/toys/toydata/results" #os.path.join(CACHE_PATH, "eval-results")
 # EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 # EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
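The new values hard-code a local toy-data checkout in place of the cached eval-queue/eval-results directories, which helps local debugging but points at a path that will not exist on the deployed Space. A sketch of a less invasive variant, assuming a hypothetical LOCAL_TOYDATA_DIR environment variable, would keep the cache paths as the default:

    import os

    LOCAL_TOYDATA_DIR = os.getenv("LOCAL_TOYDATA_DIR")  # hypothetical override for local debugging
    if LOCAL_TOYDATA_DIR:
        EVAL_REQUESTS_PATH = os.path.join(LOCAL_TOYDATA_DIR, "requests")
        EVAL_RESULTS_PATH = os.path.join(LOCAL_TOYDATA_DIR, "results")
    else:
        EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
        EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")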
src/leaderboard/read_evals.py
CHANGED

@@ -62,19 +62,22 @@ class FullEvalResult:
             results=result_list
         )
 
-    def to_dict(self, task='qa', metric='…
+    def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
         """Convert FullEvalResult to a list of dict compatible with our dataframe UI
         """
         results = defaultdict(dict)
         for eval_result in self.results:
             if eval_result.metric != metric:
+                # print(f'result skipped: {metric} != {eval_result.metric}')
                 continue
             if eval_result.task != task:
+                # print(f'result skipped: {task} != {eval_result.task}')
                 continue
             results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
             results[eval_result.eval_name][AutoEvalColumnQA.retrieval_model.name] = self.retrieval_model
             results[eval_result.eval_name][AutoEvalColumnQA.reranking_model.name] = self.reranking_model
 
+            print(f'result loaded: {eval_result.eval_name}')
             for result in eval_result.results:
                 # add result for each domain, language, and dataset
                 domain = result["domain"]

@@ -136,7 +139,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEval…
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
         try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("…
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7], reverse=True)
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
 

@@ -152,9 +155,11 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEval…
         eval_result.update_with_request_file(requests_path)
         latest_date_str = eval_result.date.replace(":", "-")
         model_result_date_str = model_result_filepath.split('/')[-1
-            ].removeprefix("…
+            ].removeprefix("results_").removesuffix(".json")
         if latest_date_str != model_result_date_str:
+            print(f'file skipped: {model_result_filepath}')
             continue
+        print(f'file loaded: {model_result_filepath}')
         eval_name = eval_result.eval_name
         eval_results[eval_name] = eval_result
 
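Both the sort key and the date check now assume result files named results_<timestamp>.json, which is presumably why the toy fixtures below were renamed from results_demo_* to results_*. A standalone worked example of the date matching (the date and path are illustrative values taken from the toy data):

    # eval_result.date as stored in the request file (illustrative value)
    eval_date = "2023-12-21T18:10:08"
    latest_date_str = eval_date.replace(":", "-")           # "2023-12-21T18-10-08"

    model_result_filepath = "tests/toydata/test_results/bge-m3/NoReranker/results_2023-12-21T18-10-08.json"
    model_result_date_str = (model_result_filepath.split('/')[-1]
                             .removeprefix("results_")
                             .removesuffix(".json"))         # "2023-12-21T18-10-08"

    # Only the file whose timestamp matches the latest evaluation date is kept.
    assert latest_date_str == model_result_date_str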
src/populate.py
CHANGED

@@ -12,11 +12,14 @@ from typing import Tuple
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, task: str, metric: str) -> Tuple[list[EvalResult], pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
+    print(f"raw_data loaded: {len(raw_data)}")
     all_data_json = []
     for v in raw_data:
         all_data_json += v.to_dict(task=task, metric=metric)
 
+    print(f'records loaded: {len(all_data_json)}')
     df = pd.DataFrame.from_records(all_data_json)
+    print(f'dataframe created: {df.shape}')
     _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
     df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1)
     df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
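The added prints make it easier to see where records disappear between the raw results and the final dataframe. For reference, a minimal standalone sketch of the averaging step that follows (column names and the "average" label are made up; the real code uses AutoEvalColumnQA.average.name):

    import pandas as pd

    records = [
        {"eval_name": "bge-m3_NoReranker", "wiki_en": 0.72, "news_zh": 0.65},
        {"eval_name": "bge-m3_bge-reranker-v2-m3", "wiki_en": 0.78, "news_zh": 0.70},
    ]
    benchmark_cols = ["wiki_en", "news_zh", "law_fr"]   # "law_fr" has no results yet

    df = pd.DataFrame.from_records(records)
    # Average only over benchmark columns that actually appear in the dataframe,
    # mirroring the frozenset(...).intersection(...) guard above.
    present = frozenset(benchmark_cols).intersection(df.columns)
    df["average"] = df[list(present)].mean(axis=1)
    df = df.sort_values(by="average", ascending=False)
    print(df)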
tests/toydata/test_results/bge-m3/NoReranker/{results_demo_2023-11-21T18-10-08.json → results_2023-11-21T18-10-08.json}
RENAMED
File without changes

tests/toydata/test_results/bge-m3/NoReranker/{results_demo_2023-12-21T18-10-08.json → results_2023-12-21T18-10-08.json}
RENAMED
File without changes

tests/toydata/test_results/bge-m3/bge-reranker-v2-m3/{results_demo_2023-11-21T18-10-08.json → results_2023-11-21T18-10-08.json}
RENAMED
File without changes