atticusg committed
Commit bcb3085 · verified · 1 Parent(s): d788e14

Update app.py

Files changed (1)
  1. app.py +60 -34
app.py CHANGED
@@ -117,20 +117,50 @@ def init_leaderboard_mib(dataframe, track):
     )
 
 
-def init_leaderboard_mib_causal(json_data, track):
-    """Creates a leaderboard summary for causal intervention results"""
-
-    # Process results into summary format
-    summary_data = []
+def calculate_best_layer_scores(task_data: Dict[str, Any]) -> Dict[str, float]:
+    """
+    Calculate the best scores across layers for output token and location
+
+    Args:
+        task_data: Dictionary containing task scores for different layers
+
+    Returns:
+        Dictionary with best scores and corresponding layer
+    """
+    output_token_scores = [layer_data['output_token'] for layer_data in task_data.values()]
+    output_location_scores = [layer_data['output_location'] for layer_data in task_data.values()]
+
+    best_output_token = max(output_token_scores)
+    best_output_location = max(output_location_scores)
+
+    # Find the layer with the best combined performance
+    layer_scores = [(layer, layer_data['output_token'] + layer_data['output_location'])
+                    for layer, layer_data in task_data.items()]
+    best_layer = max(layer_scores, key=lambda x: x[1])[0]
+
+    return {
+        'output_token': best_output_token,
+        'output_location': best_output_location,
+        'best_layer': int(best_layer)
+    }
+
+def process_single_method(json_data: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    Process results for a single method into summary rows
+
+    Args:
+        json_data: Dictionary containing results for one method
+
+    Returns:
+        List of summary rows for the method
+    """
+    summary_rows = []
     method_name = json_data['method_name']
 
-    # Extract scores for MCQA task
     for model_result in json_data['results']:
         model_id = model_result['model_id']
         task_data = model_result['task_scores']['MCQA']
 
-        # Calculate best layer performance
         best_scores = calculate_best_layer_scores(task_data)
 
         summary_row = {
@@ -140,10 +170,32 @@ def init_leaderboard_mib_causal(json_data, track):
             'Best Output Location Score': best_scores['output_location'],
             'Best Layer': best_scores['best_layer']
         }
-        summary_data.append(summary_row)
+        summary_rows.append(summary_row)
+
+    return summary_rows
+
+def init_leaderboard_mib_causal(json_data_list: List[Dict[str, Any]], track: str) -> 'Leaderboard':
+    """
+    Creates a leaderboard summary for causal intervention results from multiple methods
+
+    Args:
+        json_data_list: List of dictionaries containing results for different methods
+        track: Track identifier (currently unused but maintained for compatibility)
+
+    Returns:
+        Leaderboard object containing processed and formatted results
+    """
+    # Process all methods
+    all_summary_data = []
+    for method_data in json_data_list:
+        method_summary = process_single_method(method_data)
+        all_summary_data.extend(method_summary)
 
     # Convert to DataFrame
-    results_df = pd.DataFrame(summary_data)
+    results_df = pd.DataFrame(all_summary_data)
+
+    # Sort by best score (using output token score as primary metric)
+    results_df = results_df.sort_values('Best Output Token Score', ascending=False)
 
     # Round numeric columns to 3 decimal places
     numeric_cols = ['Best Output Token Score', 'Best Output Location Score']
@@ -161,32 +213,6 @@ def init_leaderboard_mib_causal(json_data, track):
         interactive=False,
     )
 
-def calculate_best_layer_scores(task_data):
-    """Calculate the best scores across all layers for each intervention type"""
-    best_output_token = 0
-    best_output_location = 0
-    best_layer = 0
-
-    for layer_data in task_data:
-        layer_num = int(layer_data['layer'])
-        layer_scores = layer_data['layer_scores']
-
-        # Calculate average scores for each intervention type
-        output_token_avg = sum(cf['score'] for cf in layer_scores[0]['counterfactual_scores']) / len(layer_scores[0]['counterfactual_scores'])
-        output_location_avg = sum(cf['score'] for cf in layer_scores[1]['counterfactual_scores']) / len(layer_scores[1]['counterfactual_scores'])
-
-        # Update best scores
-        if output_token_avg > best_output_token or output_location_avg > best_output_location:
-            best_output_token = max(best_output_token, output_token_avg)
-            best_output_location = max(best_output_location, output_location_avg)
-            best_layer = layer_num
-
-    return {
-        'output_token': best_output_token,
-        'output_location': best_output_location,
-        'best_layer': best_layer
-    }
-
 def init_leaderboard(dataframe, track):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
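
For reference, a minimal sketch of the data shapes the refactored helpers appear to expect. Everything below is illustrative: the layer keys, score values, method and model names are hypothetical, and the import assumes this app.py is importable from the working directory.

# Hypothetical sample data; real results files may be shaped differently.
# Assumes app.py (with the functions from this commit) is on the Python path.
from app import calculate_best_layer_scores, process_single_method

# The refactored calculate_best_layer_scores reads task_data as a mapping
# from layer to per-intervention scores, indexed by 'output_token' and
# 'output_location'.
task_data = {
    '0': {'output_token': 0.41, 'output_location': 0.38},
    '1': {'output_token': 0.73, 'output_location': 0.69},
    '2': {'output_token': 0.66, 'output_location': 0.71},
}

best = calculate_best_layer_scores(task_data)
# -> {'output_token': 0.73, 'output_location': 0.71, 'best_layer': 1}
# 'best_layer' maximizes the combined score (layer '1': 0.73 + 0.69 = 1.42),
# so the per-metric bests can come from layers other than best_layer itself.

method_result = {
    'method_name': 'example_method',    # hypothetical
    'results': [
        {'model_id': 'example_model',   # hypothetical
         'task_scores': {'MCQA': task_data}},
    ],
}
rows = process_single_method(method_result)
# One summary row per model; init_leaderboard_mib_causal extends these rows
# across all methods before building and sorting the DataFrame.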