"""Builds the leaderboard and evaluation-queue dataframes from raw eval results."""

import json
import os
import re
from typing import Tuple

import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
from src.about import TasksMib_Causalgraph

def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results"""
    raw_data = get_raw_eval_results(results_path, requests_path)
    all_data_json = [v.to_dict() for v in raw_data]

    # Keep only the entries matching the requested track: results that report a
    # "VQA" score belong to the multimodal track, the rest to the text track.
    all_data_json_filtered = []
    for item in all_data_json:
        item["Track"] = item["eval_name"].split("_")[-1]
        # Initialize the per-task columns so that every row has them
        item["ioi"] = 0
        item["mcqa"] = 0
        if "VQA" in benchmark_cols and "VQA" in item:
            all_data_json_filtered.append(item)
        if "VQA" not in benchmark_cols and "VQA" not in item:
            all_data_json_filtered.append(item)

    df = pd.DataFrame.from_records(all_data_json_filtered)
    df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)

    # Filter out rows for which any of the benchmarks has not been produced
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df
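
# For illustration (hypothetical values): an item like
#   {"eval_name": "meta-llama_ioi", "ioi": 0.8, "mcqa": 0.7}
# carries no "VQA" key, so it is kept only when a text-only benchmark_cols
# list (one without "VQA") is requested.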



def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the MIB subgraph experiment results"""
    raw_data = get_raw_eval_results_mib_subgraph(results_path, requests_path)

    # Convert each result to dict format
    all_data_json = [v.to_dict() for v in raw_data]

    # Convert to dataframe
    df = pd.DataFrame.from_records(all_data_json)

    # Sort by Average score, descending. Missing scores are stored as '-',
    # so coerce them to NaN for sorting and restore the placeholder afterwards.
    if 'Average' in df.columns:
        df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
        df = df.sort_values(by=['Average'], ascending=False, na_position='last')
        df['Average'] = df['Average'].fillna('-')

    return df
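
# For illustration (hypothetical scores): an 'Average' column of
# ['0.82', '-', '0.91'] sorts to [0.91, 0.82, '-'], with the missing value
# pushed to the bottom and its '-' placeholder restored after sorting.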




def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregates rows with the same base method name by taking the max value for each column"""
    df_copy = df.copy()
    
    # Set Method as index if it isn't already
    if 'Method' in df_copy.columns:
        df_copy.set_index('Method', inplace=True)
    
    # Extract base method names (remove _2, _3, etc. suffixes)
    base_methods = [name.split('_')[0] if '_' in str(name) and str(name).split('_')[-1].isdigit() 
                   else name for name in df_copy.index]
    df_copy.index = base_methods
    
    # Convert scores to numeric values
    numeric_df = df_copy.select_dtypes(include=['float64', 'int64'])
    
    # Group by base method name and take the max
    aggregated_df = numeric_df.groupby(level=0).max().round(3)
    
    # Reset index to get Method as a column
    aggregated_df.reset_index(inplace=True)
    aggregated_df.rename(columns={'index': 'Method'}, inplace=True)
    
    return aggregated_df
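
# For illustration (hypothetical rows): repeated submissions "ATP", "ATP_2",
# and "ATP_3" collapse into a single "ATP" row keeping the best score per column:
#
#   df = pd.DataFrame({'Method': ['ATP', 'ATP_2', 'ATP_3'], 'score': [0.71, 0.74, 0.69]})
#   aggregate_methods(df)  # -> one row: Method='ATP', score=0.74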


def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
    """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
    df_copy = df.copy()

    # Set aside the Method column so that only score columns get averaged
    method_col = None
    if 'Method' in df_copy.columns:
        method_col = df_copy['Method']
        df_copy = df_copy.drop('Method', axis=1)

    if 'eval_name' in df_copy.columns:
        df_copy = df_copy.drop('eval_name', axis=1)

    # Group the per-layer columns by (model, intervention, counterfactual)
    result_cols = {}
    for task in TasksMib_Causalgraph:
        for model in task.value.models:
            for intervention in task.value.interventions:
                for counterfactual in task.value.counterfactuals:
                    col_pattern = f"{model}_layer.*_{intervention}_{counterfactual}"
                    matching_cols = [c for c in df_copy.columns if re.match(col_pattern, c)]
                    if matching_cols:
                        col_name = f"{model}_{intervention}_{counterfactual}"
                        result_cols[col_name] = matching_cols

    averaged_df = pd.DataFrame()
    if method_col is not None:
        averaged_df['Method'] = method_col

    # Average each group of layer columns into a single score column
    for col_name, cols in result_cols.items():
        averaged_df[col_name] = df_copy[cols].mean(axis=1).round(3)

    return averaged_df
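
# For illustration (hypothetical column names): per-layer columns such as
#   qwen2forcausallm_layer0_output_token_symbol  and
#   qwen2forcausallm_layer1_output_token_symbol
# would be averaged into a single qwen2forcausallm_output_token_symbol column,
# provided TasksMib_Causalgraph lists that model/intervention/counterfactual.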


def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Creates the detailed, method-aggregated, and intervention-averaged
    dataframes from all the MIB causal graph experiment results"""
    raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)

    # Convert each result to dict format for the detailed df
    all_data_json = [v.to_dict() for v in raw_data]
    detailed_df = pd.DataFrame.from_records(all_data_json)

    # Lowercase the model-class names in the column headers so they match the
    # display schema
    column_mapping = {}
    for col in detailed_df.columns:
        if col in ['eval_name', 'Method']:
            continue
        new_col = col.replace('Qwen2ForCausalLM', 'qwen2forcausallm') \
                     .replace('Gemma2ForCausalLM', 'gemma2forcausallm') \
                     .replace('LlamaForCausalLM', 'llamaforcausallm')
        column_mapping[col] = new_col

    detailed_df = detailed_df.rename(columns=column_mapping)

    # Derive the two coarser views from the detailed results
    aggregated_df = aggregate_methods(detailed_df)
    intervention_averaged_df = create_intervention_averaged_df(aggregated_df)

    return detailed_df, aggregated_df, intervention_averaged_df
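
# Minimal usage sketch (the real paths come from the app's config, e.g. an
# envs module in the standard leaderboard template; the names below are
# placeholders):
#
#   detailed, aggregated, averaged = get_leaderboard_df_mib_causalgraph(
#       EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, cols=[], benchmark_cols=[]
#   )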


def get_evaluation_queue_df(save_path: str, cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Creates the dataframes for the finished, running, and pending evaluation queues"""
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            if "still_on_hub" in data and data["still_on_hub"]:
                data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
            else:
                data[EvalQueueColumn.model.name] = data["model"]
                data[EvalQueueColumn.revision.name] = "N/A"

            all_evals.append(data)
        elif ".md" not in entry:
            # this is a folder
            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
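
# Minimal usage sketch (EVAL_REQUESTS_PATH and EVAL_COLS are assumed to come
# from the app's envs/utils modules, as in the standard leaderboard template):
#
#   finished_df, running_df, pending_df = get_evaluation_queue_df(
#       EVAL_REQUESTS_PATH, EVAL_COLS
#   )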