atticusg committed
Commit b348eb5 · verified · 1 Parent(s): a782a90

Update app.py

Files changed (1): app.py (+77 -1)

app.py CHANGED
 
@@ -116,6 +116,82 @@ def init_leaderboard_mib(dataframe, track):
         interactive=False,
     )
 
+
+def init_leaderboard_mib_causal(json_data, task_type):
+    """Creates a summary leaderboard showing best layer performance for each method"""
+    if not json_data or 'results' not in json_data:
+        raise ValueError("Invalid JSON data structure")
+
+    # Process results into summary format
+    summary_data = []
+    method_name = json_data['method_name']
+
+    # Extract model and task data
+    for model_result in json_data['results']:
+        model_id = model_result['model_id']
+
+        # Get scores for the specified task
+        task_data = model_result['task_scores'].get(task_type, [])
+        if not task_data:
+            continue
+
+        # Calculate best layer performance
+        best_scores = calculate_best_layer_scores(task_data)
+
+        summary_row = {
+            'Method': method_name,
+            'Model': model_id,
+            'Best Output Token Score': best_scores['output_token'],
+            'Best Output Location Score': best_scores['output_location'],
+            'Best Layer': best_scores['best_layer']
+        }
+        summary_data.append(summary_row)
+
+    # Convert to DataFrame
+    df = pd.DataFrame(summary_data)
+
+    # Round numeric columns to 3 decimal places
+    numeric_cols = ['Best Output Token Score', 'Best Output Location Score']
+    df[numeric_cols] = df[numeric_cols].round(3)
+
+    return Leaderboard(
+        value=df,
+        datatype=['text', 'text', 'number', 'number', 'number'],
+        select_columns=SelectColumns(
+            default_selection=['Method', 'Model', 'Best Output Token Score', 'Best Output Location Score', 'Best Layer'],
+            cant_deselect=['Method', 'Model'],
+            label="Select Metrics to Display:",
+        ),
+        search_columns=['Method', 'Model'],
+        interactive=False,
+    )
+
+def calculate_best_layer_scores(task_data):
+    """Calculate the best scores across all layers for each intervention type"""
+    best_output_token = 0
+    best_output_location = 0
+    best_layer = 0
+
+    for layer_data in task_data:
+        layer_num = int(layer_data['layer'])
+        layer_scores = layer_data['layer_scores']
+
+        # Calculate average scores for each intervention type
+        output_token_avg = sum(cf['score'] for cf in layer_scores[0]['counterfactual_scores']) / len(layer_scores[0]['counterfactual_scores'])
+        output_location_avg = sum(cf['score'] for cf in layer_scores[1]['counterfactual_scores']) / len(layer_scores[1]['counterfactual_scores'])
+
+        # Update best scores
+        if output_token_avg > best_output_token or output_location_avg > best_output_location:
+            best_output_token = max(best_output_token, output_token_avg)
+            best_output_location = max(best_output_location, output_location_avg)
+            best_layer = layer_num
+
+    return {
+        'output_token': best_output_token,
+        'output_location': best_output_location,
+        'best_layer': best_layer
+    }
+
 def init_leaderboard(dataframe, track):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
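
For reference, here is a minimal sketch of the nested results structure the two new functions appear to expect, inferred from the field accesses above; the method name, model id, layer count, and score values are made-up placeholders, and layer_scores is assumed to list the output-token intervention at index 0 and the output-location intervention at index 1:

sample_json_data = {
    'method_name': 'ExampleMethod',       # placeholder, not a real submission
    'results': [
        {
            'model_id': 'example/model',  # placeholder model id
            'task_scores': {
                'Causal Graph': [
                    {
                        'layer': '0',
                        'layer_scores': [
                            # index 0: output-token counterfactuals (assumed ordering)
                            {'counterfactual_scores': [{'score': 0.25}, {'score': 0.5}]},
                            # index 1: output-location counterfactuals (assumed ordering)
                            {'counterfactual_scores': [{'score': 0.5}, {'score': 0.5}]},
                        ],
                    },
                    {
                        'layer': '1',
                        'layer_scores': [
                            {'counterfactual_scores': [{'score': 0.5}, {'score': 0.75}]},
                            {'counterfactual_scores': [{'score': 0.75}, {'score': 0.75}]},
                        ],
                    },
                ],
            },
        },
    ],
}

task_data = sample_json_data['results'][0]['task_scores']['Causal Graph']
print(calculate_best_layer_scores(task_data))
# {'output_token': 0.625, 'output_location': 0.75, 'best_layer': 1}

Note that the update rule records best_layer whenever either averaged metric improves, so the two reported best scores are not guaranteed to come from the same layer; best_layer is simply the last layer at which either score reached a new maximum.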
 
@@ -184,7 +260,7 @@ with demo:
            # leaderboard = init_leaderboard_mib(LEADERBOARD_DF, "mib")
 
        with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
-           leaderboard = init_leaderboard_mib(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
+           leaderboard = init_leaderboard_mib_causal(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
 
        # with gr.Row():
        #     with gr.Accordion("📙 Citation", open=False):
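
The added functions assume pd (pandas), Leaderboard, and SelectColumns (from the gradio_leaderboard package) are already imported at the top of app.py, as the existing init_leaderboard helpers use the same names. Below is a minimal sketch of how the new entry point sits inside the demo, assuming LEADERBOARD_DF_MIB_CAUSALGRAPH holds the parsed results dict shown earlier (despite the _DF_ in its name, init_leaderboard_mib_causal indexes it like a dict, not a DataFrame); the try/except fallback is an illustrative addition, not part of this commit:

import gradio as gr

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
            try:
                leaderboard = init_leaderboard_mib_causal(
                    LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph"
                )
            except ValueError as err:
                # Surface a readable message in the tab instead of crashing the app
                gr.Markdown(f"Could not build the Causal Graph leaderboard: {err}")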