Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -116,6 +116,82 @@ def init_leaderboard_mib(dataframe, track):
|
|
116 |
interactive=False,
|
117 |
)
|
118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
def init_leaderboard(dataframe, track):
|
120 |
if dataframe is None or dataframe.empty:
|
121 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
@@ -184,7 +260,7 @@ with demo:
|
|
184 |
# leaderboard = init_leaderboard_mib(LEADERBOARD_DF, "mib")
|
185 |
|
186 |
with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
|
187 |
-
leaderboard =
|
188 |
|
189 |
# with gr.Row():
|
190 |
# with gr.Accordion("π Citation", open=False):
|
|
|
116 |
interactive=False,
|
117 |
)
|
118 |
|
119 |
+
|
120 |
+
def init_leaderboard_mib_causal(json_data, task_type):
    """Create a summary leaderboard showing best-layer performance per method.

    Args:
        json_data: Parsed results structure. Must contain ``'results'`` (a list
            of per-model entries, each with ``'model_id'`` and ``'task_scores'``)
            and a top-level ``'method_name'``.
        task_type: Key looked up in each model's ``'task_scores'`` dict
            (e.g. ``"Causal Graph"``).

    Returns:
        A ``Leaderboard`` component built from the per-model best-layer summary.

    Raises:
        ValueError: If ``json_data`` is missing/invalid, lacks ``'method_name'``,
            or no model has any scores for ``task_type``.
    """
    if not json_data or 'results' not in json_data:
        raise ValueError("Invalid JSON data structure")
    # Validate 'method_name' up front too, so a malformed payload fails with a
    # clear message rather than a bare KeyError below.
    if 'method_name' not in json_data:
        raise ValueError("Invalid JSON data structure")

    # Process results into summary format
    summary_data = []
    method_name = json_data['method_name']

    # Extract model and task data
    for model_result in json_data['results']:
        model_id = model_result['model_id']

        # Get scores for the specified task; skip models without this task.
        task_data = model_result['task_scores'].get(task_type, [])
        if not task_data:
            continue

        # Calculate best layer performance across all reported layers.
        best_scores = calculate_best_layer_scores(task_data)

        summary_row = {
            'Method': method_name,
            'Model': model_id,
            'Best Output Token Score': best_scores['output_token'],
            'Best Output Location Score': best_scores['output_location'],
            'Best Layer': best_scores['best_layer'],
        }
        summary_data.append(summary_row)

    # Guard the empty case: pd.DataFrame([]) has no columns, so the rounding
    # below would raise an opaque KeyError. Surface the real problem instead.
    if not summary_data:
        raise ValueError(f"No results found for task type: {task_type}")

    # Convert to DataFrame
    df = pd.DataFrame(summary_data)

    # Round numeric columns to 3 decimal places for display.
    numeric_cols = ['Best Output Token Score', 'Best Output Location Score']
    df[numeric_cols] = df[numeric_cols].round(3)

    return Leaderboard(
        value=df,
        datatype=['text', 'text', 'number', 'number', 'number'],
        select_columns=SelectColumns(
            default_selection=['Method', 'Model', 'Best Output Token Score', 'Best Output Location Score', 'Best Layer'],
            cant_deselect=['Method', 'Model'],
            label="Select Metrics to Display:",
        ),
        search_columns=['Method', 'Model'],
        interactive=False,
    )
168 |
+
|
169 |
+
def calculate_best_layer_scores(task_data):
    """Calculate the best scores across all layers for each intervention type.

    Args:
        task_data: List of per-layer records, each with a ``'layer'`` (int-like)
            and ``'layer_scores'``, where index 0 holds the output-token entry
            and index 1 the output-location entry, each containing a
            ``'counterfactual_scores'`` list of ``{'score': ...}`` dicts.

    Returns:
        Dict with keys ``'output_token'``, ``'output_location'`` (best average
        score seen for each intervention type) and ``'best_layer'``.
    """

    def _avg(entry):
        # Mean of the 'score' values over this entry's counterfactual list.
        values = [cf['score'] for cf in entry['counterfactual_scores']]
        return sum(values) / len(values)

    top_token = 0
    top_location = 0
    top_layer = 0

    for record in task_data:
        layer_id = int(record['layer'])
        per_type = record['layer_scores']

        # Average scores for each intervention type (fixed positions 0 and 1).
        token_avg = _avg(per_type[0])
        location_avg = _avg(per_type[1])

        # Update running maxima whenever either metric improves.
        # NOTE(review): the two maxima may come from different layers, while
        # 'best_layer' records only the last layer that improved either metric
        # — presumably intended, but worth confirming.
        if token_avg > top_token or location_avg > top_location:
            top_token = max(top_token, token_avg)
            top_location = max(top_location, location_avg)
            top_layer = layer_id

    return {
        'output_token': top_token,
        'output_location': top_location,
        'best_layer': top_layer,
    }
|
194 |
+
|
195 |
def init_leaderboard(dataframe, track):
|
196 |
if dataframe is None or dataframe.empty:
|
197 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
|
|
260 |
# leaderboard = init_leaderboard_mib(LEADERBOARD_DF, "mib")
|
261 |
|
262 |
with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
|
263 |
+
leaderboard = init_leaderboard_mib_causal(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
|
264 |
|
265 |
# with gr.Row():
|
266 |
# with gr.Accordion("π Citation", open=False):
|