jasonshaoshun committed
Commit a100ebc · 1 Parent(s): 0e725d0

debug

Browse files
- caulsal_metric.py +243 -0
- src/about.py +4 -4
- src/display/utils.py +16 -18
- src/leaderboard/read_evals.py +54 -47
- src/populate.py +55 -8
    	
        caulsal_metric.py
    ADDED
    
@@ -0,0 +1,243 @@
import json
import os
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Tuple
from collections import defaultdict

def average_counterfactuals(json_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Averages scores across counterfactuals for each layer.
    """
    processed_jsons = []

    for json_file in json_files:
        new_json = {
            'method_name': json_file['method_name'],
            'results': []
        }

        for result in json_file['results']:
            new_result = {
                'model_id': result['model_id'],
                'task_scores': {}
            }

            for task, scores in result['task_scores'].items():
                new_scores = []

                for layer_data in scores:
                    new_layer_data = {
                        'layer': layer_data['layer'],
                        'layer_scores': []
                    }

                    for intervention_data in layer_data['layer_scores']:
                        avg_score = np.mean([cf['score'] for cf in intervention_data['counterfactual_scores']])
                        if np.isnan(avg_score):
                            avg_score = 0.0
                        new_layer_data['layer_scores'].append({
                            'intervention': intervention_data['intervention'],
                            'average_score': avg_score
                        })

                    new_scores.append(new_layer_data)

                new_result['task_scores'][task] = new_scores

            new_json['results'].append(new_result)

        processed_jsons.append(new_json)

    return processed_jsons

def find_layer_averages(json_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Averages scores across layers for each intervention.
    """
    processed_jsons = []

    for json_file in json_files:
        new_json = {
            'method_name': json_file['method_name'],
            'results': []
        }

        for result in json_file['results']:
            new_result = {
                'model_id': result['model_id'],
                'task_scores': {}
            }

            for task, scores in result['task_scores'].items():
                # Group by intervention first
                intervention_scores = defaultdict(list)
                for layer_data in scores:
                    for intervention_data in layer_data['layer_scores']:
                        intervention_key = '_'.join(intervention_data['intervention'])
                        intervention_scores[intervention_key].append(intervention_data['average_score'])

                # Average across layers for each intervention
                new_result['task_scores'][task] = [
                    {
                        'intervention': intervention.split('_'),
                        'average_score': np.mean(layer_scores) if layer_scores else 0.0
                    }
                    for intervention, layer_scores in intervention_scores.items()
                ]

            new_json['results'].append(new_result)

        processed_jsons.append(new_json)

    return processed_jsons

def create_summary_dataframe(json_files: List[Dict[str, Any]]) -> pd.DataFrame:
    """
    Creates a summary DataFrame with methods as rows and MODEL_TASK_INTERVENTION as columns.
    Handles duplicate method names by adding a counter suffix.
    """
    data = {}
    method_counters = defaultdict(int)

    for json_file in json_files:
        method_name = json_file['method_name']
        # Increment counter for this method name
        method_counters[method_name] += 1

        # Append the counter so duplicate method names stay unique
        unique_method_name = f"{method_name}_{method_counters[method_name]}"

        method_scores = []

        for result in json_file['results']:
            model = result['model_id']
            for task, scores in result['task_scores'].items():
                for score_data in scores:
                    intervention = '_'.join(score_data['intervention'])
                    column = f"{model}_{task}_{intervention}"
                    score = f"{score_data['average_score']:.3f}"
                    method_scores.append((column, score))

        # Sort by column names for consistency
        method_scores.sort(key=lambda x: x[0])
        scores_only = [float(score) for _, score in method_scores]
        avg_score = np.mean(scores_only)  # overall average (computed but not stored as a column)

        data[unique_method_name] = {col: score for col, score in method_scores}

    df = pd.DataFrame.from_dict(data, orient='index')

    return df

def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregates rows with the same base method name by taking the max value for each column.
    """
    # Create a copy of the DataFrame
    df_copy = df.copy()

    # Extract base method names (remove _2, _3, etc. suffixes)
    base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
                    else name for name in df_copy.index]
    df_copy.index = base_methods

    # Convert scores to numeric values
    def extract_score(score_str):
        if isinstance(score_str, str):
            return float(score_str)
        return 0.0

    numeric_df = df_copy.applymap(extract_score)

    # Group by base method name and take the max
    aggregated_df = numeric_df.groupby(level=0).max().round(3)

    # Convert back to string format
    aggregated_df = aggregated_df.applymap(lambda x: f"{x:.3f}")

    return aggregated_df

def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Creates a DataFrame where columns are model_task and cells are averaged over interventions.
    """
    # Create a copy of the DataFrame
    df_copy = df.copy()

    # Remove the Average column if it exists
    if 'Average' in df_copy.columns:
        df_copy = df_copy.drop('Average', axis=1)

    # Function to extract score value from string
    def extract_score(score_str):
        if isinstance(score_str, str):
            return float(score_str.split()[0])
        return 0.0

    # Convert all scores to numeric values
    numeric_df = df_copy.applymap(extract_score)

    # Group columns by model_task
    model_task_groups = {}
    for col in numeric_df.columns:
        model_task = '_'.join(col.split('_')[:2])  # Get model_task part
        if model_task not in model_task_groups:
            model_task_groups[model_task] = []
        model_task_groups[model_task].append(col)

    # Create new DataFrame with averaged intervention scores
    averaged_df = pd.DataFrame({
        model_task: numeric_df[cols].mean(axis=1).round(3)
        for model_task, cols in model_task_groups.items()
    })

    # Add overall average column
    averaged_df['Average'] = averaged_df.mean(axis=1).round(3)

    # Sort by Average column
    averaged_df = averaged_df.sort_values('Average', ascending=False)

    return averaged_df

def process_json_folder(folder_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Processes all JSON files in a folder and returns three DataFrames:
    1. Detailed DataFrame showing all results including duplicates (with layer-averaged scores)
    2. Aggregated DataFrame showing max scores for each base method
    3. Intervention-averaged DataFrame showing means across interventions
    """
    json_files = []

    # Read all JSON files
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            with open(os.path.join(folder_path, filename), 'r') as f:
                json_files.append(json.load(f))

    # Process the files through each step
    averaged_cf = average_counterfactuals(json_files)
    layer_averaged = find_layer_averages(averaged_cf)
    detailed_df = create_summary_dataframe(layer_averaged)
    aggregated_df = aggregate_methods(detailed_df)
    intervention_averaged_df = create_intervention_averaged_df(aggregated_df)

    return detailed_df, aggregated_df, intervention_averaged_df

# Example usage:
if __name__ == "__main__":
    # Replace with your folder path
    folder_path = "./json_files"
    detailed_df, aggregated_df, intervention_averaged_df = process_json_folder(folder_path)

    print("Detailed Results (including duplicates):")
    print(detailed_df)
    print("\nAggregated Results (max scores per method):")
    print(aggregated_df)
    print("\nIntervention-Averaged Results:")
    print(intervention_averaged_df)
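For reference, the pipeline above assumes each results JSON has the nested shape that average_counterfactuals() reads. The sketch below is only an illustration of that shape: the key names mirror the code, while the method name, model id, task name, and scores are invented placeholders.

    # Minimal sketch of the input JSON expected by average_counterfactuals();
    # only the keys come from the code above, all values are placeholders.
    example_json = {
        "method_name": "example_method",
        "results": [
            {
                "model_id": "ExampleModel",
                "task_scores": {
                    "mcqa": [
                        {
                            "layer": 0,
                            "layer_scores": [
                                {
                                    "intervention": ["output_token"],
                                    "counterfactual_scores": [
                                        {"score": 0.81},  # one entry per counterfactual
                                        {"score": 0.67},
                                    ],
                                }
                            ],
                        }
                    ]
                },
            }
        ],
    }

average_counterfactuals() collapses each counterfactual_scores list into a single average_score per intervention, and find_layer_averages() then averages those values across layers.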
    	
src/about.py
CHANGED
@@ -51,19 +51,19 @@ class TaskMIB_Causalgraph:
     layers: list[str]   # 0-31
     col_name: str       # display name in leaderboard
     interventions: list[str]  # output_token, output_location
-    counterfactuals: list[str]  # symbol_counterfactual,
-    metrics: list[str]  # score
+    counterfactuals: list[str]  # symbol_counterfactual, etc.
+    metrics: list[str]  # score

 class TasksMib_Causalgraph(Enum):
     task0 = TaskMIB_Causalgraph(
         "MCQA",
         ["LlamaForCausalLM"],
-        [str(i) for i in range(32)],
+        [str(i) for i in range(32)],
         "mcqa",
         ["output_token", "output_location"],
         ["symbol_counterfactual", "randomLetter_counterfactual",
          "answerPosition_counterfactual", "answerPosition_symbol_counterfactual"],
-        ["score"]
+        ["score"]
     )
    	
src/display/utils.py
CHANGED
@@ -98,38 +98,36 @@ COLS_MIB_CAUSALGRAPH = []
 BENCHMARK_COLS_MIB_CAUSALGRAPH = []


 # Initialize the MIB causal graph columns
 auto_eval_column_dict_mib_causalgraph = []

 # Method name column
 auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])

+# For each model-task-intervention combination
 for task in TasksMib_Causalgraph:
     for model in task.value.models:
+        for intervention in task.value.interventions:
+            col_name = f"{model}_{task.value.benchmark}_{intervention}".lower()
+            auto_eval_column_dict_mib_causalgraph.append([
+                col_name,
+                ColumnContent,
+                ColumnContent(col_name, "number", True)
+            ])

-# Create the dataclass for MIB causal graph columns
+# Create the dataclass
 AutoEvalColumn_mib_causalgraph = make_dataclass("AutoEvalColumn_mib_causalgraph", auto_eval_column_dict_mib_causalgraph, frozen=True)

 # Column selection for display
 COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
+BENCHMARK_COLS_MIB_CAUSALGRAPH = [f"{model}_{task.value.benchmark}_{intervention}".lower()
                                  for task in TasksMib_Causalgraph
                                  for model in task.value.models
-                                 for intervention in task.value.interventions
-                                 for counterfactual in task.value.counterfactuals]
+                                 for intervention in task.value.interventions]
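With the MCQA task defined in src/about.py, the new loop produces one lowercased column per model-task-intervention combination. The sketch below only illustrates the naming scheme; the literal values are copied from about.py, and the benchmark field is assumed to be the first positional argument (the .lower() call makes its case irrelevant).

    # Sketch of the column names generated by f"{model}_{task.value.benchmark}_{intervention}".lower()
    model = "LlamaForCausalLM"
    benchmark = "MCQA"  # assumed to be task.value.benchmark; lowercased below anyway
    for intervention in ["output_token", "output_location"]:
        print(f"{model}_{benchmark}_{intervention}".lower())
    # -> llamaforcausallm_mcqa_output_token
    # -> llamaforcausallm_mcqa_output_location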
    	
src/leaderboard/read_evals.py
CHANGED
@@ -185,43 +185,48 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
 @dataclass
 class EvalResult_MIB_CAUSALGRAPH:
     """Represents one full evaluation for a method in MIB causalgraph."""
+    eval_name: str
+    method_name: str
+    results: Dict

     def init_from_json_file(self, json_filepath):
         """Inits results from the method result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)

         method_name = data.get("method_name")
         results = {}

+        # First average across counterfactuals
+        for result in data.get("results", []):
+            model_id = result.get("model_id")
+            task_scores = result.get("task_scores", {})

+            model_results = {}
+            for task, scores in task_scores.items():
+                layer_scores = []
+                for layer_data in scores:
+                    layer = layer_data.get("layer")
+                    layer_scores_data = []
-                    # Store scores for each intervention and counterfactual
-                    for intervention_data in layer_scores:
-                        intervention = intervention_data["intervention"][0]  # e.g., "output_token"
-                        counterfactual_scores = intervention_data["counterfactual_scores"]

+                    for intervention_data in layer_data.get("layer_scores", []):
+                        # Average across counterfactuals
+                        avg_score = np.mean([cf['score'] for cf in intervention_data['counterfactual_scores']])
+                        if np.isnan(avg_score):
+                            avg_score = 0.0
+                        layer_scores_data.append({
+                            'intervention': intervention_data['intervention'][0],
+                            'score': avg_score
+                        })
+
+                    layer_scores.append({
+                        'layer': layer,
+                        'scores': layer_scores_data
+                    })
+
+                model_results[task] = layer_scores
+
+            results[model_id] = model_results

         return EvalResult_MIB_CAUSALGRAPH(
             eval_name=method_name,
@@ -235,27 +240,33 @@ class EvalResult_MIB_CAUSALGRAPH:
             "eval_name": self.eval_name,
             "Method": self.method_name,
         }

+        # Process each model's results
+        for model_id, model_results in self.results.items():
+            for task, task_scores in model_results.items():
+                # Calculate layer-averaged scores for each intervention
+                intervention_scores = defaultdict(list)
+
+                for layer_data in task_scores:
+                    for score_data in layer_data['scores']:
+                        intervention = score_data['intervention']
+                        intervention_scores[intervention].append(score_data['score'])

+                # Average across layers for each intervention
+                for intervention, scores in intervention_scores.items():
+                    col_name = f"{model_id}_{task}_{intervention}".lower()
+                    data_dict[col_name] = round(np.mean(scores), 3)

         return data_dict

 def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+    """Extract evaluation results for MIB causalgraph"""
     model_result_filepaths = []

-    print(f"results_path is {results_path}")
     for root, dirnames, files in os.walk(results_path):
-        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
-        # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue

-        # Sort the files by date - keeping original sorting logic
         try:
             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
         except dateutil.parser._parser.ParserError:
@@ -264,19 +275,15 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))

-    print(f"model_result_filepaths is {model_result_filepaths}")
     eval_results = []
+    for filepath in model_result_filepaths:
         try:
+            eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})
+            result = eval_result.init_from_json_file(filepath)
+            result.to_dict()  # Verify conversion works
             eval_results.append(result)
         except Exception as e:
+            print(f"Error processing {filepath}: {e}")
             continue

     return eval_results
        src/populate.py
    CHANGED
    
    | @@ -65,24 +65,71 @@ def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: | |
| 65 |  | 
| 66 | 
             
                return df
         | 
| 67 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 68 | 
             
            def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
         | 
| 69 | 
             
                """Creates a dataframe from all the MIB causal graph experiment results"""
         | 
| 70 | 
             
                print(f"results_path is {results_path}, requests_path is {requests_path}")
         | 
| 71 | 
             
                raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
         | 
| 72 | 
             
                print(f"raw_data is {raw_data}")
         | 
| 73 |  | 
| 74 | 
            -
                # Convert each result to dict format
         | 
| 75 | 
             
                all_data_json = [v.to_dict() for v in raw_data]
         | 
| 76 | 
            -
                 | 
|  | |
| 77 |  | 
| 78 | 
            -
                #  | 
| 79 | 
            -
                 | 
|  | |
| 80 |  | 
| 81 | 
            -
                 | 
| 82 | 
            -
                 | 
| 83 | 
            -
                df[numeric_cols] = df[numeric_cols].round(2)
         | 
| 84 |  | 
| 85 | 
            -
                return  | 
|  | |
| 86 |  | 
| 87 |  | 
| 88 | 
             
            def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
         | 
|  | |
| 65 |  | 
| 66 | 
             
                return df
         | 
| 67 |  | 
| 68 | 
            +
             | 
| 69 | 
            +
             | 
| 70 | 
            +
             | 
| 71 | 
            +
            def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
         | 
| 72 | 
            +
                """Aggregates rows with the same base method name by taking the max value for each column"""
         | 
| 73 | 
            +
                df_copy = df.copy()
         | 
| 74 | 
            +
                
         | 
| 75 | 
            +
                # Extract base method names (remove _2, _3, etc. suffixes)
         | 
| 76 | 
            +
                base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit() 
         | 
| 77 | 
            +
                               else name for name in df_copy.index]
         | 
| 78 | 
            +
                df_copy.index = base_methods
         | 
| 79 | 
            +
                
         | 
| 80 | 
            +
                # Convert scores to numeric values
         | 
| 81 | 
            +
                numeric_df = df_copy.select_dtypes(include=['float64', 'int64'])
         | 
| 82 | 
            +
                
         | 
| 83 | 
            +
                # Group by base method name and take the max
         | 
| 84 | 
            +
                aggregated_df = numeric_df.groupby(level=0).max().round(3)
         | 
| 85 | 
            +
                
         | 
| 86 | 
            +
                return aggregated_df
         | 
| 87 | 
            +
             | 
| 88 | 
            +
            def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
         | 
| 89 | 
            +
                """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
         | 
| 90 | 
            +
                df_copy = df.copy()
         | 
| 91 | 
            +
                
         | 
| 92 | 
            +
                # Remove the Method column and eval_name if present
         | 
| 93 | 
            +
                columns_to_drop = ['Method', 'eval_name']
         | 
| 94 | 
            +
                df_copy = df_copy.drop(columns=[col for col in columns_to_drop if col in df_copy.columns])
         | 
| 95 | 
            +
                
         | 
| 96 | 
            +
                # Group columns by model_task
         | 
| 97 | 
            +
                model_task_groups = {}
         | 
| 98 | 
            +
                for col in df_copy.columns:
         | 
| 99 | 
            +
                    model_task = '_'.join(col.split('_')[:2])  # Get model_task part
         | 
| 100 | 
            +
                    if model_task not in model_task_groups:
         | 
| 101 | 
            +
                        model_task_groups[model_task] = []
         | 
| 102 | 
            +
                    model_task_groups[model_task].append(col)
         | 
| 103 | 
            +
                
         | 
| 104 | 
            +
                # Create new DataFrame with averaged intervention scores
         | 
| 105 | 
            +
                averaged_df = pd.DataFrame({
         | 
| 106 | 
            +
                    model_task: df_copy[cols].mean(axis=1).round(3)
         | 
| 107 | 
            +
                    for model_task, cols in model_task_groups.items()
         | 
| 108 | 
            +
                })
         | 
| 109 | 
            +
                
         | 
| 110 | 
            +
                return averaged_df
         | 
| 111 | 
            +
             | 
| 112 | 
            +
             | 
| 113 | 
             
            def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
         | 
| 114 | 
             
                """Creates a dataframe from all the MIB causal graph experiment results"""
         | 
| 115 | 
             
                print(f"results_path is {results_path}, requests_path is {requests_path}")
         | 
| 116 | 
             
                raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
         | 
| 117 | 
             
                print(f"raw_data is {raw_data}")
         | 
| 118 |  | 
| 119 | 
            +
                # Convert each result to dict format for detailed df
         | 
| 120 | 
             
                all_data_json = [v.to_dict() for v in raw_data]
         | 
| 121 | 
            +
                detailed_df = pd.DataFrame.from_records(all_data_json)
         | 
| 122 | 
            +
                print(f"detailed_df is: {detailed_df}")
         | 
| 123 |  | 
| 124 | 
            +
                # Create and print other views for debugging/reference
         | 
| 125 | 
            +
                aggregated_df = aggregate_methods(detailed_df)
         | 
| 126 | 
            +
                print(f"aggregated_df is: {aggregated_df}")
         | 
| 127 |  | 
| 128 | 
            +
                intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
         | 
| 129 | 
            +
                print(f"intervention_averaged_df is: {intervention_averaged_df}")
         | 
|  | |
| 130 |  | 
| 131 | 
            +
                # Only return detailed_df for display
         | 
| 132 | 
            +
                return detailed_df
         | 
| 133 |  | 
| 134 |  | 
| 135 | 
             
            def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
         | 
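As a usage sketch of the refactored populate function: it now hands back only the detailed per-method view, while the aggregated and intervention-averaged views are computed just for printing. The import paths and the two directory arguments below are assumptions for illustration, not names taken from this commit.

    # Hedged usage sketch; "./eval-results" and "./eval-queue" are placeholder paths,
    # and the module paths assume src/ is importable as a package.
    from src.populate import get_leaderboard_df_mib_causalgraph
    from src.display.utils import COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH

    detailed_df = get_leaderboard_df_mib_causalgraph(
        "./eval-results", "./eval-queue",
        COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH,
    )
    print(detailed_df.head())  # one row per method, columns named model_task_intervention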
