import json
import os

import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph


def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results"""
    print(f"results_path is {results_path}, requests_path is {requests_path}")
    raw_data = get_raw_eval_results(results_path, requests_path)
    print(f"raw_data is {raw_data}")
    all_data_json = [v.to_dict() for v in raw_data]
    print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
    all_data_json_filtered = []
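    # Split by track: multimodal entries report a "VQA" score and are kept only when
    # "VQA" is among the benchmark columns; text-only entries are kept otherwise.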
    for item in all_data_json:
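        # The track is taken from the trailing segment of eval_name; the ioi and mcqa
        # task scores default to 0 (presumably placeholders until real scores exist)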
        item["Track"] = item["eval_name"].split("_")[-1]
        item["ioi"] = 0
        item["mcqa"] = 0
        if "VQA" in benchmark_cols and "VQA" in item:
            all_data_json_filtered.append(item)
        if "VQA" not in benchmark_cols and "VQA" not in item:
            all_data_json_filtered.append(item)
            
    all_data_json = all_data_json_filtered

    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)
    # df = df.sort_values(by=[Tasks.task0.value.col_name], ascending=False)
    # df = df.sort_values(by=[AutoEvalColumn.track.name], ascending=False)
    
    print(f"df is {df}")
    
    # df = df[cols].round(decimals=1)

    # filter out if any of the benchmarks have not been produced
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df



def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the MIB experiment results"""
    print(f"results_path is {results_path}, requests_path is {requests_path}")
    raw_data = get_raw_eval_results_mib_subgraph(results_path, requests_path)
    print(f"raw_data is {raw_data}")
    
    # Convert each result to dict format
    all_data_json = [v.to_dict() for v in raw_data]
    print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")

    # Convert to dataframe
    df = pd.DataFrame.from_records(all_data_json)
    
    # Sort by Average score descending
    if 'Average' in df.columns:
        # Convert '-' to NaN for sorting purposes
        df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
        df = df.sort_values(by=['Average'], ascending=False, na_position='last')
        # Convert NaN back to '-'
        df['Average'] = df['Average'].fillna('-')
    
    return df




def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregates rows with the same base method name by taking the max value for each column"""
    df_copy = df.copy()
    
    # Extract base method names by stripping trailing run suffixes (_2, _3, ...)
    base_methods = [name.rsplit('_', 1)[0] if '_' in name and name.split('_')[-1].isdigit()
                    else name for name in df_copy.index]
    df_copy.index = base_methods
    
    # Convert scores to numeric values
    numeric_df = df_copy.select_dtypes(include=['float64', 'int64'])
    
    # Group by base method name and take the max
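    # e.g. rows indexed "ATP", "ATP_2", "ATP_3" (illustrative names) collapse into a
    # single "ATP" row holding the per-column maximum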
    aggregated_df = numeric_df.groupby(level=0).max().round(3)
    
    return aggregated_df

def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
    """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
    df_copy = df.copy()
    
    # Remove the Method column and eval_name if present
    columns_to_drop = ['Method', 'eval_name']
    df_copy = df_copy.drop(columns=[col for col in columns_to_drop if col in df_copy.columns])
    
    # Group columns by model_task; column names are assumed to follow <model>_<task>_<intervention>
    model_task_groups = {}
    for col in df_copy.columns:
        model_task = '_'.join(col.split('_')[:2])  # Get model_task part
        if model_task not in model_task_groups:
            model_task_groups[model_task] = []
        model_task_groups[model_task].append(col)
    
    # Create new DataFrame with averaged intervention scores
    averaged_df = pd.DataFrame({
        model_task: df_copy[cols].mean(axis=1).round(3)
        for model_task, cols in model_task_groups.items()
    })
    
    return averaged_df
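
# A possible post-processing chain for the subgraph leaderboard (a sketch only; it
# assumes the results expose a "Method" column, as the drop list above suggests):
#
#     subgraph_df = get_leaderboard_df_mib_subgraph(results_path, requests_path, cols, benchmark_cols)
#     best_runs = aggregate_methods(subgraph_df.set_index("Method"))
#     per_model_task = create_intervention_averaged_df(best_runs)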


def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the MIB causal graph experiment results"""
    print(f"Starting get_leaderboard_df_mib_causalgraph with path: {results_path}")
    raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
    print(f"Length of raw_data: {len(raw_data) if raw_data else 0}")
    
    if not raw_data:
        print("Warning: raw_data is empty")
        return pd.DataFrame()
    
    # Convert each result to dict format for detailed df
    all_data_json = [v.to_dict() for v in raw_data]
    print(f"Length of all_data_json: {len(all_data_json)}")
    print(f"First entry of all_data_json: {all_data_json[0] if all_data_json else None}")
    
    detailed_df = pd.DataFrame.from_records(all_data_json)
    print(f"Shape of detailed_df: {detailed_df.shape}")
    print(f"Columns in detailed_df: {detailed_df.columns.tolist()}")
    
    return detailed_df


def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
    """Creates the different dataframes for the evaluation queues requests"""
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []
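    # Requests are stored either as JSON files directly under save_path or one level
    # deeper inside per-model subfolders; both layouts are handled below.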

    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            if "still_on_hub" in data and data["still_on_hub"]:
                data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
            else:
                data[EvalQueueColumn.model.name] = data["model"]
                data[EvalQueueColumn.revision.name] = "N/A"

            all_evals.append(data)
        elif ".md" not in entry:
            # this is a folder
            sub_entries = [
                e for e in os.listdir(os.path.join(save_path, entry))
                if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")
            ]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
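

# Example wiring (a minimal sketch; EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, and the
# column lists are assumed to come from the app's configuration and are not defined
# in this file):
#
#     leaderboard_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
#     finished_df, running_df, pending_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)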