Update app.py
app.py CHANGED
@@ -117,20 +117,50 @@ def init_leaderboard_mib(dataframe, track):
     )
 
 
-def init_leaderboard_mib_causal(json_data, track):
-    """
-    Creates a leaderboard summary for causal intervention results
-    """
-
+def calculate_best_layer_scores(task_data: Dict[str, Any]) -> Dict[str, float]:
+    """
+    Calculate the best scores across layers for output token and location
+
+    Args:
+        task_data: Dictionary containing task scores for different layers
+
+    Returns:
+        Dictionary with best scores and corresponding layer
+    """
+    output_token_scores = [layer_data['output_token'] for layer_data in task_data.values()]
+    output_location_scores = [layer_data['output_location'] for layer_data in task_data.values()]
+
+    best_output_token = max(output_token_scores)
+    best_output_location = max(output_location_scores)
+
+    # Find the layer with the best combined performance
+    layer_scores = [(layer, layer_data['output_token'] + layer_data['output_location'])
+                    for layer, layer_data in task_data.items()]
+    best_layer = max(layer_scores, key=lambda x: x[1])[0]
+
+    return {
+        'output_token': best_output_token,
+        'output_location': best_output_location,
+        'best_layer': int(best_layer)
+    }
+
+def process_single_method(json_data: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    Process results for a single method into summary rows
+
+    Args:
+        json_data: Dictionary containing results for one method
+
+    Returns:
+        List of summary rows for the method
+    """
+    summary_rows = []
     method_name = json_data['method_name']
 
-    # Extract scores for MCQA task
     for model_result in json_data['results']:
         model_id = model_result['model_id']
         task_data = model_result['task_scores']['MCQA']
 
-        # Calculate best layer performance
         best_scores = calculate_best_layer_scores(task_data)
 
         summary_row = {
@@ -140,10 +170,32 @@ def init_leaderboard_mib_causal(json_data, track):
             'Best Output Location Score': best_scores['output_location'],
             'Best Layer': best_scores['best_layer']
         }
-
+        summary_rows.append(summary_row)
+
+    return summary_rows
+
+def init_leaderboard_mib_causal(json_data_list: List[Dict[str, Any]], track: str) -> 'Leaderboard':
+    """
+    Creates a leaderboard summary for causal intervention results from multiple methods
+
+    Args:
+        json_data_list: List of dictionaries containing results for different methods
+        track: Track identifier (currently unused but maintained for compatibility)
+
+    Returns:
+        Leaderboard object containing processed and formatted results
+    """
+    # Process all methods
+    all_summary_data = []
+    for method_data in json_data_list:
+        method_summary = process_single_method(method_data)
+        all_summary_data.extend(method_summary)
 
     # Convert to DataFrame
-    results_df = pd.DataFrame(
+    results_df = pd.DataFrame(all_summary_data)
+
+    # Sort by best score (using output token score as primary metric)
+    results_df = results_df.sort_values('Best Output Token Score', ascending=False)
 
     # Round numeric columns to 3 decimal places
     numeric_cols = ['Best Output Token Score', 'Best Output Location Score']
@@ -161,32 +213,6 @@ def init_leaderboard_mib_causal(json_data, track):
         interactive=False,
     )
 
-def calculate_best_layer_scores(task_data):
-    """Calculate the best scores across all layers for each intervention type"""
-    best_output_token = 0
-    best_output_location = 0
-    best_layer = 0
-
-    for layer_data in task_data:
-        layer_num = int(layer_data['layer'])
-        layer_scores = layer_data['layer_scores']
-
-        # Calculate average scores for each intervention type
-        output_token_avg = sum(cf['score'] for cf in layer_scores[0]['counterfactual_scores']) / len(layer_scores[0]['counterfactual_scores'])
-        output_location_avg = sum(cf['score'] for cf in layer_scores[1]['counterfactual_scores']) / len(layer_scores[1]['counterfactual_scores'])
-
-        # Update best scores
-        if output_token_avg > best_output_token or output_location_avg > best_output_location:
-            best_output_token = max(best_output_token, output_token_avg)
-            best_output_location = max(best_output_location, output_location_avg)
-            best_layer = layer_num
-
-    return {
-        'output_token': best_output_token,
-        'output_location': best_output_location,
-        'best_layer': best_layer
-    }
-
 def init_leaderboard(dataframe, track):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
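For reference, the rewritten calculate_best_layer_scores indexes task_data as a mapping from layer to per-layer metrics, instead of iterating the old list of layer_scores entries. A minimal sketch of the input shape the new code appears to assume, with the functions above in scope; the layer keys and score values here are invented for illustration:

# Hypothetical task_data in the shape the new function reads:
# {layer: {'output_token': float, 'output_location': float}}
task_data = {
    '0': {'output_token': 0.41, 'output_location': 0.38},
    '5': {'output_token': 0.73, 'output_location': 0.55},
    '9': {'output_token': 0.70, 'output_location': 0.69},
}

best = calculate_best_layer_scores(task_data)
# Per-metric maxima are 0.73 (layer 5) and 0.69 (layer 9), but 'best_layer'
# is picked by the combined sum: layer 9 wins with 0.70 + 0.69 = 1.39
# over layer 5's 0.73 + 0.55 = 1.28.
print(best)  # {'output_token': 0.73, 'output_location': 0.69, 'best_layer': 9}

Unlike the old implementation, which advanced best_layer whenever either metric improved, the reported per-metric maxima can now come from layers other than best_layer, which is selected on combined score alone.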
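Similarly, a sketch of how the refactored pipeline composes, since init_leaderboard_mib_causal now takes a list of per-method result dicts. The method and model names below are placeholders, and any payload keys beyond those the diff reads are assumptions:

# Hypothetical payload: one dict per method, matching the keys the code reads
# ('method_name', 'results', 'model_id', 'task_scores' -> 'MCQA').
json_data_list = [
    {
        'method_name': 'example-method',  # placeholder
        'results': [
            {
                'model_id': 'example-model',  # placeholder
                'task_scores': {'MCQA': task_data},  # shape as sketched above
            }
        ],
    }
]

# Per-method parsing is delegated to process_single_method; the flattened
# rows feed the DataFrame that the leaderboard then sorts on
# 'Best Output Token Score' and rounds to 3 decimal places.
all_summary_data = []
for method_data in json_data_list:
    all_summary_data.extend(process_single_method(method_data))

This keeps method parsing separate from leaderboard construction, so supporting another method only means appending one more entry to json_data_list.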