Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -116,6 +116,82 @@ def init_leaderboard_mib(dataframe, track):
|
|
116 |
interactive=False,
|
117 |
)
|
118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
def init_leaderboard(dataframe, track):
|
120 |
if dataframe is None or dataframe.empty:
|
121 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
@@ -184,7 +260,7 @@ with demo:
|
|
184 |
# leaderboard = init_leaderboard_mib(LEADERBOARD_DF, "mib")
|
185 |
|
186 |
with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
|
187 |
-
leaderboard =
|
188 |
|
189 |
# with gr.Row():
|
190 |
# with gr.Accordion("π Citation", open=False):
|
|
|
116 |
interactive=False,
|
117 |
)
|
118 |
|
119 |
+
|
120 |
+
def init_leaderboard_mib_causal(json_data, task_type):
    """Create a summary leaderboard showing best-layer performance per method.

    Args:
        json_data: Parsed results structure. Must contain ``'results'`` (a list
            of per-model entries, each with ``'model_id'`` and ``'task_scores'``)
            and a top-level ``'method_name'``.
        task_type: Key looked up in each model's ``'task_scores'`` dict
            (e.g. ``"Causal Graph"``).

    Returns:
        A ``Leaderboard`` component built from the per-model best-layer summary.

    Raises:
        ValueError: If ``json_data`` is missing/invalid, lacks ``'method_name'``,
            or no model has any scores for ``task_type``.
    """
    if not json_data or 'results' not in json_data:
        raise ValueError("Invalid JSON data structure")
    # Validate 'method_name' up front too, so a malformed payload fails with a
    # clear message rather than a bare KeyError below.
    if 'method_name' not in json_data:
        raise ValueError("Invalid JSON data structure")

    # Process results into summary format
    summary_data = []
    method_name = json_data['method_name']

    # Extract model and task data
    for model_result in json_data['results']:
        model_id = model_result['model_id']

        # Get scores for the specified task; skip models without this task.
        task_data = model_result['task_scores'].get(task_type, [])
        if not task_data:
            continue

        # Calculate best layer performance across all reported layers.
        best_scores = calculate_best_layer_scores(task_data)

        summary_row = {
            'Method': method_name,
            'Model': model_id,
            'Best Output Token Score': best_scores['output_token'],
            'Best Output Location Score': best_scores['output_location'],
            'Best Layer': best_scores['best_layer'],
        }
        summary_data.append(summary_row)

    # Guard the empty case: pd.DataFrame([]) has no columns, so the rounding
    # below would raise an opaque KeyError. Surface the real problem instead.
    if not summary_data:
        raise ValueError(f"No results found for task type: {task_type}")

    # Convert to DataFrame
    df = pd.DataFrame(summary_data)

    # Round numeric columns to 3 decimal places for display.
    numeric_cols = ['Best Output Token Score', 'Best Output Location Score']
    df[numeric_cols] = df[numeric_cols].round(3)

    return Leaderboard(
        value=df,
        datatype=['text', 'text', 'number', 'number', 'number'],
        select_columns=SelectColumns(
            default_selection=['Method', 'Model', 'Best Output Token Score', 'Best Output Location Score', 'Best Layer'],
            cant_deselect=['Method', 'Model'],
            label="Select Metrics to Display:",
        ),
        search_columns=['Method', 'Model'],
        interactive=False,
    )
168 |
+
|
169 |
+
def calculate_best_layer_scores(task_data):
    """Calculate the best scores across all layers for each intervention type.

    Args:
        task_data: List of per-layer records, each with a ``'layer'`` (int-like)
            and ``'layer_scores'``, where index 0 holds the output-token entry
            and index 1 the output-location entry, each containing a
            ``'counterfactual_scores'`` list of ``{'score': ...}`` dicts.

    Returns:
        Dict with keys ``'output_token'``, ``'output_location'`` (best average
        score seen for each intervention type) and ``'best_layer'``.
    """

    def _avg(entry):
        # Mean of the 'score' values over this entry's counterfactual list.
        values = [cf['score'] for cf in entry['counterfactual_scores']]
        return sum(values) / len(values)

    top_token = 0
    top_location = 0
    top_layer = 0

    for record in task_data:
        layer_id = int(record['layer'])
        per_type = record['layer_scores']

        # Average scores for each intervention type (fixed positions 0 and 1).
        token_avg = _avg(per_type[0])
        location_avg = _avg(per_type[1])

        # Update running maxima whenever either metric improves.
        # NOTE(review): the two maxima may come from different layers, while
        # 'best_layer' records only the last layer that improved either metric
        # — presumably intended, but worth confirming.
        if token_avg > top_token or location_avg > top_location:
            top_token = max(top_token, token_avg)
            top_location = max(top_location, location_avg)
            top_layer = layer_id

    return {
        'output_token': top_token,
        'output_location': top_location,
        'best_layer': top_layer,
    }
|
194 |
+
|
195 |
def init_leaderboard(dataframe, track):
|
196 |
if dataframe is None or dataframe.empty:
|
197 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
|
|
260 |
# leaderboard = init_leaderboard_mib(LEADERBOARD_DF, "mib")
|
261 |
|
262 |
with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
|
263 |
+
leaderboard = init_leaderboard_mib_causal(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
|
264 |
|
265 |
# with gr.Row():
|
266 |
# with gr.Accordion("π Citation", open=False):
|