jasonshaoshun committed
Commit 06e8556 · 1 parent: 9ebccf5
Commit message: debug
Files changed:
- app.py +32 -16
- src/about.py +1 -1
- src/display/utils.py +4 -4
- src/leaderboard/read_evals.py +61 -83
- src/populate.py +6 -34
app.py
CHANGED
@@ -30,7 +30,7 @@ from src.display.utils import (
     fields,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
-from src.populate import get_evaluation_queue_df, get_leaderboard_df,
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_leaderboard_df_mib_subgraph, get_leaderboard_df_mib_causalgraph
 from src.submission.submit import add_new_eval
 
 
@@ -49,15 +49,6 @@ try:
 except Exception:
     restart_space()
 
-# print("EVAL_RESULTS_PATH")
-# try:
-#     print(EVAL_RESULTS_PATH)
-#     snapshot_download(
-#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-#     )
-# except Exception:
-#     restart_space()
-
 
 try:
     print(RESULTS_REPO_MIB_SUBGRAPH)
@@ -78,8 +69,8 @@ except Exception:
 
 
 
-LEADERBOARD_DF_MIB_SUBGRAPH =
-
+LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
+LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
 
 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
@@ -91,7 +82,32 @@ LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib(EVAL_RESULTS_MIB_SUBGRAPH_P
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-def init_leaderboard_mib(dataframe, track):
+def init_leaderboard_mib_subgraph(dataframe, track):
+    print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
+
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+
+    # filter for correct track
+    # dataframe = dataframe.loc[dataframe["Track"] == track]
+
+    print(f"init_leaderboard_mib: dataframe head after loc is {dataframe.head()}\n")
+
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn_mib)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn_mib) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn_mib) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=["Method"],  # Changed from AutoEvalColumn_mib.model.name to "Method"
+        hide_columns=[c.name for c in fields(AutoEvalColumn_mib) if c.hidden],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+def init_leaderboard_mib_causalgraph(dataframe, track):
     print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
 
     if dataframe is None or dataframe.empty:
@@ -116,6 +132,7 @@ def init_leaderboard_mib(dataframe, track):
         interactive=False,
     )
 
+
 def init_leaderboard(dataframe, track):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -180,11 +197,10 @@ with demo:
             # gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
-            leaderboard =
-            # leaderboard = init_leaderboard_mib(LEADERBOARD_DF, "mib")
+            leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
 
         # with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
-        #     leaderboard =
+        #     leaderboard = init_leaderboard_mib_causalgraph(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
 
         # with gr.Row():
         # with gr.Accordion("📙 Citation", open=False):
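
For reference, here is a minimal standalone sketch of the Leaderboard call that the new init_leaderboard_mib_subgraph builds, using the same keyword arguments that appear in the diff. The gradio_leaderboard import path and the toy dataframe, columns, and datatypes are assumptions for illustration, not taken from this commit.

# Minimal sketch, not the app's actual wiring. Assumes Leaderboard and SelectColumns come
# from the gradio_leaderboard package; the toy dataframe and datatypes are illustrative.
import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SelectColumns  # assumed import path

toy_df = pd.DataFrame({
    "Method": ["method_a", "method_b"],  # hypothetical methods
    "ioi_gpt2": [0.71, 0.68],            # hypothetical scores
    "mcqa_qwen": [0.55, 0.57],
})

with gr.Blocks() as demo:
    with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
        Leaderboard(
            value=toy_df,
            datatype=["markdown", "number", "number"],
            select_columns=SelectColumns(
                default_selection=list(toy_df.columns),
                cant_deselect=["Method"],
                label="Select Columns to Display:",
            ),
            search_columns=["Method"],
            hide_columns=[],
            interactive=False,
        )

demo.launch()
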
src/about.py
CHANGED
@@ -27,7 +27,7 @@ class Tasks(Enum):
     task3 = Task("ewok", "acc", "EWoK")
 
 
-class
+class TasksMib_Subgraph(Enum):
     task0 = TaskMIB("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
     task1 = TaskMIB("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])
 
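
The TaskMIB constructor used above is defined elsewhere in src/about.py and is not part of this diff. Below is a hypothetical sketch of a compatible container, with field names inferred from how task.value.benchmark, task.value.models, and task.value.col_name are used in src/display/utils.py further down.

# Hypothetical sketch only: the field names and their order are an assumption, not the repo's definition.
from collections import namedtuple
from enum import Enum

TaskMIB = namedtuple("TaskMIB", ["benchmark", "models", "col_name", "metrics"])

class TasksMib_Subgraph(Enum):
    task0 = TaskMIB("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
    task1 = TaskMIB("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])

print([t.value.benchmark for t in TasksMib_Subgraph])  # ['ioi', 'mcqa']
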
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks, TasksMultimodal,
+from src.about import Tasks, TasksMultimodal, TasksMib_Subgraph
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -35,7 +35,7 @@ auto_eval_column_dict_mib = []
 auto_eval_column_dict_mib.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
 
 # For each task and model combination
-for task in
+for task in TasksMib_Subgraph:
     for model in task.value.models:
         col_name = f"{task.value.benchmark}_{model}"  # ioi_meta_llama, mcqa_qwen, etc.
         auto_eval_column_dict_mib.append([
@@ -54,9 +54,9 @@ AutoEvalColumn_mib = make_dataclass("AutoEvalColumn_mib", auto_eval_column_dict_
 # Column selection for display
 COLS_MIB = [c.name for c in fields(AutoEvalColumn_mib) if not c.hidden]
 
-# BENCHMARK_COLS_MIB = [t.value.col_name for t in
+# BENCHMARK_COLS_MIB = [t.value.col_name for t in TasksMib_Subgraph]
 BENCHMARK_COLS_MIB = []
-for task in
+for task in TasksMib_Subgraph:
     for model in task.value.models:
         col_name = f"{task.value.col_name}_{model.replace('-', '_')}"
         BENCHMARK_COLS_MIB.append(col_name)
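
Continuing the hypothetical TasksMib_Subgraph sketch from the src/about.py section, the loops above generate one column per task/model pair; only the name construction is shown here (the real code also attaches ColumnContent metadata).

# Sketch of the column names produced by the loops above, reusing the hypothetical
# TasksMib_Subgraph/TaskMIB definitions from the previous sketch.
benchmark_cols = []
for task in TasksMib_Subgraph:
    for model in task.value.models:
        benchmark_cols.append(f"{task.value.benchmark}_{model}")

print(benchmark_cols)
# ['ioi_meta_llama', 'ioi_qwen', 'ioi_gpt2', 'mcqa_meta_llama', 'mcqa_qwen', 'mcqa_gpt2']
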
src/leaderboard/read_evals.py
CHANGED
@@ -13,29 +13,9 @@ from src.submission.check_validity import is_model_on_hub
 
 
 from typing import List, Dict
-
-
-
-# def compute_area(edge_counts, faithfulnesses, log_scale=True):
-#     percentages = [e / max(edge_counts) for e in edge_counts]
-#     area_under = 0.
-#     area_from_100 = 0.
-#     for i in range(len(faithfulnesses) - 1):
-#         i_1, i_2 = i, i+1
-#         x_1 = percentages[i_1]
-#         x_2 = percentages[i_2]
-#         # area from point to 100
-#         if log_scale:
-#             x_1 = math.log(x_1)
-#             x_2 = math.log(x_2)
-#         trapezoidal = (percentages[i_2] - percentages[i_1]) * \
-#             (((abs(1. - faithfulnesses[i_1])) + (abs(1. - faithfulnesses[i_2]))) / 2)
-#         area_from_100 += trapezoidal
-
-#         trapezoidal = (percentages[i_2] - percentages[i_1]) * ((faithfulnesses[i_1] + faithfulnesses[i_2]) / 2)
-#         area_under += trapezoidal
-#     average = sum(faithfulnesses) / len(faithfulnesses)
-#     return (area_under, area_from_100, average)
+
+
+
 def compute_area(edge_counts, faithfulnesses, log_scale=True):
     # Return None if either list is empty
     if not edge_counts or not faithfulnesses:
@@ -62,7 +42,7 @@ def compute_area(edge_counts, faithfulnesses, log_scale=True):
     return (area_under, area_from_100, average)
 
 @dataclass
-class EvalResult_MIB:
+class EvalResult_MIB_SUBGRAPH:
     """Represents one full evaluation for a method across all models in MIB."""
     eval_name: str  # method name as identifier
     method_name: str  # name of the interpretation method
@@ -104,63 +84,13 @@ class EvalResult_MIB:
                 "faithfulness": scores[task]["faithfulness"]
             }
 
-        return
+        return EvalResult_MIB_SUBGRAPH(
             eval_name=method_name,
             method_name=method_name,
             results=results
         )
 
 
-
-    # def to_dict(self):
-    #     """Converts the Eval Result to a dict for dataframe display"""
-    #     data_dict = {
-    #         "eval_name": self.eval_name,
-    #         "Method": self.method_name,
-    #     }
-
-    #     all_scores = []
-    #     required_entries = {
-    #         'ioi_meta_llama': False,
-    #         'ioi_qwen': False,
-    #         'ioi_gpt2': False,
-    #         'mcqa_meta_llama': False,
-    #         'mcqa_qwen': False,
-    #         'mcqa_gpt2': False
-    #     }
-
-    #     # For each task (ioi, mcqa)
-    #     for task, task_results in self.results.items():
-    #         # Get the models that have results for this task
-    #         models = task_results.keys()
-
-    #         for model in models:
-    #             col_name = f"{task}_{model}"
-    #             metrics = task_results[model]
-    #             if metrics:
-    #                 edge_counts = metrics["edge_counts"]
-    #                 faithfulness = metrics["faithfulness"]
-    #                 if isinstance(faithfulness[0], list):
-    #                     faithfulness = faithfulness[0]
-
-    #                 # Use compute_area
-    #                 area_under, area_from_100, avg = compute_area(edge_counts, faithfulness)
-    #                 score = area_under * 100
-    #                 data_dict[col_name] = round(score, 2)
-    #                 all_scores.append(score)
-    #                 required_entries[col_name] = True
-    #             else:
-    #                 data_dict[col_name] = '-'
-
-    #         # Only show average if all six required entries are present
-    #         if all(required_entries.values()):
-    #             data_dict["Average"] = round(np.mean(all_scores), 2)
-    #         else:
-    #             data_dict["Average"] = '-'
-
-    #         return data_dict
-
-
 
     def to_dict(self):
         """Converts the Eval Result to a dict for dataframe display"""
@@ -211,13 +141,7 @@ class EvalResult_MIB:
         return data_dict
 
 
-
-
-
-
-
-
-def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[EvalResult_MIB]:
+def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_SUBGRAPH]:
     """From the path of the results folder root, extract all needed info for MIB results"""
     model_result_filepaths = []
 
@@ -243,7 +167,7 @@ def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[Eval
     eval_results = []
     for model_result_filepath in model_result_filepaths:
         try:
-            eval_result =
+            eval_result = EvalResult_MIB_SUBGRAPH("", "", {})  # Create empty instance
             result = eval_result.init_from_json_file(model_result_filepath)
             print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
             # Verify the result can be converted to dict format
@@ -264,6 +188,60 @@ def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[Eval
 
 
 
+
+
+
+
+
+
+
+
+
+
+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    """Represents one full evaluation for a method across all models in MIB."""
+    eval_name: str  # method name as identifier
+    method_name: str  # name of the interpretation method
+    results: Dict  # nested dict of results {task: {model: {metric: scores}}}
+
+
+    def init_from_json_file(self, json_filepath):
+
+
+
+
+    def to_dict(self):
+
+        return data_dict
+
+
+def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+    """From the path of the results folder root, extract all needed info for MIB results"""
+    model_result_filepaths = []
+
+    print(f"results_path is {results_path}")
+
+
+    return eval_results
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
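
The kept compute_area implementation is only partially visible in this diff. Below is a standalone sketch of the same trapezoidal computation, following the commented-out version that this commit deletes (the log_scale branch, whose result is unused in that version, is omitted); the example numbers are made up.

# Sketch based on the deleted commented-out compute_area; inputs below are illustrative.
def compute_area_sketch(edge_counts, faithfulnesses):
    if not edge_counts or not faithfulnesses:
        return None
    # normalize edge counts to fractions of the largest circuit size
    percentages = [e / max(edge_counts) for e in edge_counts]
    area_under = 0.0     # area under the faithfulness curve
    area_from_100 = 0.0  # area between the curve and perfect faithfulness (1.0)
    for i in range(len(faithfulnesses) - 1):
        width = percentages[i + 1] - percentages[i]
        area_from_100 += width * ((abs(1.0 - faithfulnesses[i]) + abs(1.0 - faithfulnesses[i + 1])) / 2)
        area_under += width * ((faithfulnesses[i] + faithfulnesses[i + 1]) / 2)
    average = sum(faithfulnesses) / len(faithfulnesses)
    return area_under, area_from_100, average

print(compute_area_sketch([100, 500, 1000], [0.4, 0.7, 0.9]))
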
src/populate.py
CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results,
+from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
@@ -42,39 +42,10 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
 
 
-
-# """Creates a dataframe from all the individual experiment results"""
-# print(f"results_path is {results_path}, requests_path is {requests_path}")
-# raw_data = get_raw_eval_results(results_path, requests_path)
-# print(f"raw_data is {raw_data}")
-# all_data_json = [v.to_dict() for v in raw_data]
-# print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
-# all_data_json_filtered = []
-# for item in all_data_json:
-#     item["Track"] = item["eval_name"].split("_")[-1]
-#     if "VQA" in benchmark_cols and "VQA" in item:
-#         all_data_json_filtered.append(item)
-#     if "VQA" not in benchmark_cols and "VQA" not in item:
-#         all_data_json_filtered.append(item)
-#     all_data_json_filtered.append(item)
-
-# all_data_json = all_data_json_filtered
-
-# df = pd.DataFrame.from_records(all_data_json)
-# df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)
-
-# print(f"df is {df}")
-
-# df = df[cols].round(decimals=1)
-
-# # filter out if any of the benchmarks have not been produced
-# df = df[has_no_nan_values(df, benchmark_cols)]
-# return df
-
-def get_leaderboard_df_mib(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
     print(f"results_path is {results_path}, requests_path is {requests_path}")
-    raw_data =
+    raw_data = get_raw_eval_results_mib_subgraph(results_path, requests_path)
     print(f"raw_data is {raw_data}")
 
     # Convert each result to dict format
@@ -94,10 +65,11 @@ def get_leaderboard_df_mib(results_path: str, requests_path: str, cols: list, be
 
     return df
 
-def
+def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
     print(f"results_path is {results_path}, requests_path is {requests_path}")
-    raw_data =
+    raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
+    # Implement the rest of the code
     return raw_data
 
 
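
After fetching the raw results, get_leaderboard_df_mib_subgraph converts each EvalResult_MIB_SUBGRAPH to a dict and builds the display dataframe; the middle of that function is outside this diff, so the following is only a rough sketch with hypothetical records and an assumed sort key.

# Rough sketch of the dict -> dataframe step; the records, column names, and sort key are hypothetical.
import pandas as pd

records = [
    {"eval_name": "method_a", "Method": "method_a", "ioi_gpt2": 71.2, "mcqa_qwen": 55.0, "Average": 63.1},
    {"eval_name": "method_b", "Method": "method_b", "ioi_gpt2": 68.4, "mcqa_qwen": 57.3, "Average": 62.9},
]

df = pd.DataFrame.from_records(records)
df = df.sort_values(by=["Average"], ascending=False).round(decimals=2)
print(df)
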