import json
import os
import re

import pandas as pd
from typing import List, Dict, Tuple

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
from src.about import TasksMib_Causalgraph
from src.submission.check_validity import parse_huggingface_url

def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_cols: list,
                                    metric_type: str = "CPR") -> pd.DataFrame:
    """Creates a dataframe from all the MIB subgraph experiment results"""
    # print(f"results_path is {results_path}, requests_path is {requests_path}")
    raw_data = get_raw_eval_results_mib_subgraph(results_path)
    all_data_json = [v.to_dict(metric_type=metric_type) for v in raw_data]
    # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")

    # Convert to dataframe
    df = pd.DataFrame.from_records(all_data_json)

    # Sort descending when the metric is CPR, ascending otherwise
    ascending = metric_type != "CPR"

    # Sort by Average score
    if 'Average' in df.columns:
        # Convert '-' to NaN for sorting purposes
        df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
        df = df.sort_values(by=['Average'], ascending=ascending, na_position='last')
        # Convert NaN back to '-'
        df['Average'] = df['Average'].fillna('-')

    return df

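
# Illustrative usage of get_leaderboard_df_mib_subgraph. The path and column lists
# below are hypothetical placeholders, not values the app is known to use:
#
#   df = get_leaderboard_df_mib_subgraph("./eval-results", cols=[], benchmark_cols=[],
#                                        metric_type="CPR")
#
# With metric_type="CPR" the highest Average is ranked first; any other metric_type
# is sorted ascending, so the lowest Average comes first.
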
def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregates rows with the same base method name by taking the max value for each column"""
    df_copy = df.copy()

    # Set Method as index if it isn't already
    if 'Method' in df_copy.columns:
        df_copy.set_index('Method', inplace=True)

    # Extract base method names (strip trailing _2, _3, etc. suffixes)
    base_methods = [
        str(name).rsplit('_', 1)[0]
        if '_' in str(name) and str(name).rsplit('_', 1)[-1].isdigit()
        else name
        for name in df_copy.index
    ]
    df_copy.index = base_methods

    # Keep only the numeric score columns
    numeric_df = df_copy.select_dtypes(include=['float64', 'int64'])

    # Group by base method name and take the max
    aggregated_df = numeric_df.groupby(level=0).max().round(2)

    # Reset index to get Method back as a column
    aggregated_df.reset_index(inplace=True)
    aggregated_df.rename(columns={'index': 'Method'}, inplace=True)

    return aggregated_df

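
# Illustrative behaviour of aggregate_methods (hypothetical method names and scores):
#
#   Method    Average            Method    Average
#   ATP_2     0.61        --->   ATP       0.64
#   ATP_3     0.64               IG        0.58
#   IG        0.58
#
# Rows whose Method differs only by a trailing numeric suffix are collapsed into a
# single row holding the column-wise maximum of their numeric scores.
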
def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
    """Creates a DataFrame whose columns are model_intervention_counterfactual, averaged over layers"""
    df_copy = df.copy()

    # Store the Method column so it can be re-attached after averaging
    method_col = None
    if 'Method' in df_copy.columns:
        method_col = df_copy['Method']
        df_copy = df_copy.drop('Method', axis=1)
    if 'eval_name' in df_copy.columns:
        df_copy = df_copy.drop('eval_name', axis=1)

    # Group the per-layer columns by model, intervention, and counterfactual
    result_cols = {}
    for task in TasksMib_Causalgraph:
        for model in task.value.models:  # Will iterate over all three models
            for intervention in task.value.interventions:
                for counterfactual in task.value.counterfactuals:
                    col_pattern = f"{model}_layer.*_{intervention}_{counterfactual}"
                    matching_cols = [c for c in df_copy.columns if re.match(col_pattern, str(c))]
                    if matching_cols:
                        col_name = f"{model}_{intervention}_{counterfactual}"
                        result_cols[col_name] = matching_cols

    # Average each group of per-layer columns into a single column
    averaged_df = pd.DataFrame()
    if method_col is not None:
        averaged_df['Method'] = method_col
    for col_name, cols in result_cols.items():
        averaged_df[col_name] = df_copy[cols].mean(axis=1).round(2)

    return averaged_df

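
# Illustrative column grouping in create_intervention_averaged_df (hypothetical
# column names):
#
#   modelA_layer0_interv_cf, modelA_layer1_interv_cf, ...  --->  modelA_interv_cf
#
# Every per-layer column matching "{model}_layer.*_{intervention}_{counterfactual}"
# is averaged into a single "{model}_{intervention}_{counterfactual}" column.
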
def get_leaderboard_df_mib_causalgraph(results_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Creates the aggregated and intervention-averaged causal-graph leaderboard dataframes"""
    aggregated_df, intervention_averaged_df = get_raw_eval_results_mib_causalgraph(results_path)

    print(f"Columns in aggregated_df: {aggregated_df.columns.tolist()}")
    print(f"Columns in intervention_averaged_df: {intervention_averaged_df.columns.tolist()}")

    return aggregated_df, intervention_averaged_df

def get_evaluation_queue_df(save_path: str, cols: list, track: str) -> list[pd.DataFrame]:
    """Creates the different dataframes for the evaluation queue requests"""
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    print(track)

    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            # if "still_on_hub" in data and data["still_on_hub"]:
            #     data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
            #     data[EvalQueueColumn.revision.name] = data.get("revision", "main")
            # else:
            #     data[EvalQueueColumn.model.name] = data["model"]
            #     data[EvalQueueColumn.revision.name] = "N/A"

            all_evals.append(data)
        # elif ".md" not in entry:
        #     # this is a folder
        #     sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
        #     for sub_entry in sub_entries:
        #         file_path = os.path.join(save_path, entry, sub_entry)
        #         with open(file_path) as fp:
        #             data = json.load(fp)
        #         data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
        #         data[EvalQueueColumn.revision.name] = data.get("revision", "main")
        #         all_evals.append(data)

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "PREVALIDATION"]]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] in ("PENDING_NEW_EVAL", "FAILED")]

    # Attach the track and keep only the repo id parsed from the submitted URL
    for queue in (pending_list, finished_list):
        for item in queue:
            item["track"] = track
            item["hf_repo"] = parse_huggingface_url(item["hf_repo"])[0]

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_pending[cols]
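
# Minimal smoke-test sketch. The directories, column list, and track name below are
# hypothetical placeholders rather than the paths and columns the Space actually
# uses; point them at real local copies of the results/requests data before running.
if __name__ == "__main__":
    subgraph_df = get_leaderboard_df_mib_subgraph(
        "./eval-results", cols=[], benchmark_cols=[], metric_type="CPR"
    )
    print(subgraph_df.head())

    finished_df, pending_df = get_evaluation_queue_df(
        "./eval-requests", cols=["model", "track", "status"], track="Circuit Localization"
    )
    print(f"{len(finished_df)} finished, {len(pending_df)} pending")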