Spaces:
Running
Running
jasonshaoshun
committed on
Commit
·
cefacdb
1
Parent(s):
f585ea0
debug
Browse files
- src/leaderboard/read_evals.py +30 -0
- src/populate.py +21 -1
src/leaderboard/read_evals.py
CHANGED
@@ -259,6 +259,36 @@ class EvalResult_MIB_CAUSALGRAPH:
|
|
259 |
|
260 |
return data_dict
|
261 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
|
263 |
model_result_filepaths = []
|
264 |
|
|
|
259 |
|
260 |
return data_dict
|
261 |
|
262 |
+
|
263 |
+
# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
|
264 |
+
# """Extract evaluation results for MIB causalgraph"""
|
265 |
+
# model_result_filepaths = []
|
266 |
+
|
267 |
+
# for root, dirnames, files in os.walk(results_path):
|
268 |
+
# if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
269 |
+
# continue
|
270 |
+
|
271 |
+
# try:
|
272 |
+
# files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
|
273 |
+
# except dateutil.parser._parser.ParserError:
|
274 |
+
# files = [files[-1]]
|
275 |
+
|
276 |
+
# for file in files:
|
277 |
+
# model_result_filepaths.append(os.path.join(root, file))
|
278 |
+
|
279 |
+
# eval_results = []
|
280 |
+
# for filepath in model_result_filepaths:
|
281 |
+
# try:
|
282 |
+
# eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})
|
283 |
+
# result = eval_result.init_from_json_file(filepath)
|
284 |
+
# result.to_dict() # Verify conversion works
|
285 |
+
# eval_results.append(result)
|
286 |
+
# except Exception as e:
|
287 |
+
# print(f"Error processing {filepath}: {e}")
|
288 |
+
# continue
|
289 |
+
|
290 |
+
# return eval_results
|
291 |
+
|
292 |
def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
|
293 |
model_result_filepaths = []
|
294 |
|
src/populate.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import json
|
2 |
import os
|
3 |
-
|
4 |
import pandas as pd
|
|
|
5 |
|
6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
7 |
from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
|
@@ -109,6 +109,26 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
|
|
109 |
|
110 |
return averaged_df
|
111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
|
113 |
def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
114 |
"""Creates three dataframes from all the MIB causal graph experiment results"""
|
|
|
1 |
import json
|
2 |
import os
|
|
|
3 |
import pandas as pd
|
4 |
+
from typing import List, Dict, Tuple
|
5 |
|
6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
7 |
from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
|
|
|
109 |
|
110 |
return averaged_df
|
111 |
|
112 |
+
# def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
113 |
+
# """Creates a dataframe from all the MIB causal graph experiment results"""
|
114 |
+
# print(f"results_path is {results_path}, requests_path is {requests_path}")
|
115 |
+
# raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
|
116 |
+
# print(f"raw_data is {raw_data}")
|
117 |
+
|
118 |
+
# # Convert each result to dict format for detailed df
|
119 |
+
# all_data_json = [v.to_dict() for v in raw_data]
|
120 |
+
# detailed_df = pd.DataFrame.from_records(all_data_json)
|
121 |
+
# print(f"detailed_df is: {detailed_df}")
|
122 |
+
|
123 |
+
# # Create and print other views for debugging/reference
|
124 |
+
# aggregated_df = aggregate_methods(detailed_df)
|
125 |
+
# print(f"aggregated_df is: {aggregated_df}")
|
126 |
+
|
127 |
+
# intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
|
128 |
+
# print(f"intervention_averaged_df is: {intervention_averaged_df}")
|
129 |
+
|
130 |
+
# # Only return detailed_df for display
|
131 |
+
# return detailed_df
|
132 |
|
133 |
def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
134 |
"""Creates three dataframes from all the MIB causal graph experiment results"""
|