# src/populate.py
import json
import os
import re

import pandas as pd
from typing import List, Dict, Tuple

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
from src.leaderboard.read_evals import (
    get_raw_eval_results,
    get_raw_eval_results_mib_subgraph,
    get_raw_eval_results_mib_causalgraph,
)
from src.about import TasksMib_Causalgraph
from src.submission.check_validity import parse_huggingface_url

def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_cols: list,
                                    metric_type: str = "CPR") -> pd.DataFrame:
    """Creates a dataframe from all the MIB subgraph experiment results"""
    raw_data = get_raw_eval_results_mib_subgraph(results_path)
    all_data_json = [v.to_dict(metric_type=metric_type) for v in raw_data]

    # Convert to dataframe
    df = pd.DataFrame.from_records(all_data_json)

    # CPR is higher-is-better, so sort descending; other metrics sort ascending
    ascending = metric_type != "CPR"

    # Sort by Average score
    if 'Average' in df.columns:
        # Convert '-' to NaN for sorting purposes
        df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
        df = df.sort_values(by=['Average'], ascending=ascending, na_position='last')
        # Convert NaN back to '-'
        df['Average'] = df['Average'].fillna('-')

    return df
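
# A minimal usage sketch. The results directory name is hypothetical, and "CMD"
# is assumed to be the other supported metric; `cols` and `benchmark_cols` are
# accepted for interface parity but unused by the function body:
#
#   cpr_df = get_leaderboard_df_mib_subgraph("eval-results", cols=[],
#                                            benchmark_cols=[], metric_type="CPR")
#   cmd_df = get_leaderboard_df_mib_subgraph("eval-results", cols=[],
#                                            benchmark_cols=[], metric_type="CMD")
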
def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregates rows with the same base method name by taking the max value for each column"""
    df_copy = df.copy()

    # Set Method as index if it isn't already
    if 'Method' in df_copy.columns:
        df_copy.set_index('Method', inplace=True)

    # Extract base method names (strip _2, _3, etc. suffixes); rsplit on the last
    # underscore so method names that themselves contain underscores survive intact
    base_methods = [str(name).rsplit('_', 1)[0]
                    if '_' in str(name) and str(name).rsplit('_', 1)[-1].isdigit()
                    else name
                    for name in df_copy.index]
    df_copy.index = base_methods

    # Keep only the numeric score columns
    numeric_df = df_copy.select_dtypes(include=['float64', 'int64'])

    # Group by base method name and take the max
    aggregated_df = numeric_df.groupby(level=0).max().round(2)

    # Reset index to get Method back as a column
    aggregated_df.reset_index(inplace=True)
    aggregated_df.rename(columns={'index': 'Method'}, inplace=True)

    return aggregated_df
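
# Worked example on hypothetical rows: "EAP_2" and "EAP_3" collapse into a
# single "EAP" row holding the column-wise max:
#
#   >>> aggregate_methods(pd.DataFrame({'Method': ['EAP_2', 'EAP_3'],
#   ...                                 'score': [0.41, 0.57]}))
#     Method  score
#   0    EAP   0.57
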
def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
    """Creates a DataFrame with one model_intervention_counterfactual column each,
    where cells are averaged over the per-layer scores"""
    df_copy = df.copy()

    # Stash the Method column and drop non-score columns
    method_col = None
    if 'Method' in df_copy.columns:
        method_col = df_copy['Method']
        df_copy = df_copy.drop('Method', axis=1)
    if 'eval_name' in df_copy.columns:
        df_copy = df_copy.drop('eval_name', axis=1)

    # Group the per-layer columns by (model, intervention, counterfactual)
    result_cols = {}
    for task in TasksMib_Causalgraph:
        for model in task.value.models:  # Iterates over all models
            for intervention in task.value.interventions:
                for counterfactual in task.value.counterfactuals:
                    col_pattern = f"{model}_layer.*_{intervention}_{counterfactual}"
                    matching_cols = [c for c in df_copy.columns if re.match(col_pattern, str(c))]
                    if matching_cols:
                        col_name = f"{model}_{intervention}_{counterfactual}"
                        result_cols[col_name] = matching_cols

    averaged_df = pd.DataFrame()
    if method_col is not None:
        averaged_df['Method'] = method_col
    for col_name, cols in result_cols.items():
        averaged_df[col_name] = df_copy[cols].mean(axis=1).round(2)

    return averaged_df
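
# Shape sketch with hypothetical column names: per-layer columns such as
# "qwen_layer0_output_token_cf" and "qwen_layer1_output_token_cf" collapse
# into a single row-wise mean column "qwen_output_token_cf", so the result
# carries one column per (model, intervention, counterfactual) plus "Method".
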
def get_leaderboard_df_mib_causalgraph(results_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Creates the detailed, method-aggregated, and intervention-averaged dataframes
    from all the MIB causal variable experiment results"""
    detailed_df, aggregated_df, intervention_averaged_df = get_raw_eval_results_mib_causalgraph(results_path)

    # Earlier version of this pipeline, kept for reference:
    # all_data_json = [v.to_dict() for v in raw_detailed_df]
    # detailed_df = pd.DataFrame.from_records(all_data_json)
    # all_data_json = [v.to_dict() for v in raw_aggregated_df]
    # aggregated_df = pd.DataFrame.from_records(all_data_json)
    # all_data_json = [v.to_dict() for v in raw_intervention_averaged_df]
    # intervention_averaged_df = pd.DataFrame.from_records(all_data_json)
    #
    # # Rename columns to match schema
    # column_mapping = {}
    # for col in detailed_df.columns:
    #     if col in ['eval_name', 'Method']:
    #         continue
    #     # Ensure consistent casing for the column names
    #     new_col = col.replace('Qwen2ForCausalLM', 'qwen2forcausallm') \
    #         .replace('Gemma2ForCausalLM', 'gemma2forcausallm') \
    #         .replace('LlamaForCausalLM', 'llamaforcausallm')
    #     column_mapping[col] = new_col
    # detailed_df = detailed_df.rename(columns=column_mapping)
    #
    # # Create aggregated df
    # aggregated_df = aggregate_methods(detailed_df)
    # # Create intervention-averaged df
    # intervention_averaged_df = create_intervention_averaged_df(aggregated_df)

    print(f"Columns in detailed_df: {detailed_df.columns.tolist()}")
    print(f"Columns in aggregated_df: {aggregated_df.columns.tolist()}")
    print(f"Columns in intervention_averaged_df: {intervention_averaged_df.columns.tolist()}")

    return detailed_df, aggregated_df, intervention_averaged_df
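
# Usage sketch (the directory name is hypothetical):
#
#   detailed, aggregated, averaged = get_leaderboard_df_mib_causalgraph("eval-results")
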
def get_evaluation_queue_df(save_path: str, cols: list, track: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Creates the different dataframes for the evaluation queue requests"""
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        if entry.endswith(".json"):
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            # if "still_on_hub" in data and data["still_on_hub"]:
            #     data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
            #     data[EvalQueueColumn.revision.name] = data.get("revision", "main")
            # else:
            #     data[EvalQueueColumn.model.name] = data["model"]
            #     data[EvalQueueColumn.revision.name] = "N/A"

            all_evals.append(data)
        # elif ".md" not in entry:
        #     # this is a folder
        #     sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
        #     for sub_entry in sub_entries:
        #         file_path = os.path.join(save_path, entry, sub_entry)
        #         with open(file_path) as fp:
        #             data = json.load(fp)
        #         data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
        #         data[EvalQueueColumn.revision.name] = data.get("revision", "main")
        #         all_evals.append(data)

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "PREVALIDATION"]]
    # Failed runs and re-evaluation requests surface in the finished table
    finished_list = [e for e in all_evals
                     if e["status"].startswith("FINISHED")
                     or e["status"] in ["PENDING_NEW_EVAL", "FAILED"]]

    # Tag each request with its track and normalize the repo URL to a repo id
    for queue in (pending_list, finished_list):
        for item in queue:
            item["track"] = track
            item["hf_repo"] = parse_huggingface_url(item["hf_repo"])[0]

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished, df_pending
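
# Usage sketch (the directory, column list, and track name are hypothetical;
# the real column names come from EvalQueueColumn):
#
#   finished_df, pending_df = get_evaluation_queue_df(
#       "eval-queue", cols=["model", "status", "track"], track="Causal variable localization"
#   )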