jasonshaoshun committed
Commit da0827e · 2 Parent(s): 4780a48 fe05167

Merge branch 'main' of https://huggingface.co/spaces/mech-interp-bench/leaderboard

app.py CHANGED
@@ -451,7 +451,8 @@ def process_json(temp_file):
 # Define the preset substrings for filtering
 PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 
-def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: List[str]) -> pd.DataFrame:
+def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: List[str],
+                                 sort_ascending: bool) -> pd.DataFrame:
     """
     Filter columns based on the selected substrings.
     """
@@ -465,13 +466,30 @@ def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: L
         if any(sub.lower() in col.lower() for sub in selected_substrings)
         or col == "Method"
     ]
+
+    def _compute_row_average(row):
+        # If any value is "-", return "-"
+        if any(v == "-" for v in row.values):
+            return 100 if sort_ascending else -100
+        # Convert to numeric, dropping any non-numeric values
+        numeric_values = pd.to_numeric(row, errors='coerce')
+        # Compute mean of non-NA values
+        return numeric_values.mean().round(3)
+
+    dataframe["Average"] = original_dataframe[filtered_columns].apply(_compute_row_average, axis=1)
+    # dataframe["Average"] = dataframe['Average'].mask(dataframe.isna().any(axis=1), '-')
+    filtered_columns.append("Average")
+    dataframe = dataframe.sort_values('Average', ascending=sort_ascending)
+    dataframe["Average"] = dataframe["Average"].replace(-100, "-").replace(100, "-")
+
     return dataframe[filtered_columns]
 
-def update_leaderboard(dataframe: pd.DataFrame, selected_substrings: List[str]):
+def update_leaderboard(dataframe: pd.DataFrame, selected_substrings: List[str], sort_ascending: bool = False):
     """
     Update the leaderboard based on the selected substrings.
     """
-    filtered_dataframe = filter_columns_by_substrings(dataframe, selected_substrings)
+    filtered_dataframe = filter_columns_by_substrings(dataframe, selected_substrings,
+                                                      sort_ascending=sort_ascending)
     return filtered_dataframe
 
 demo = gr.Blocks(css=custom_css)
@@ -497,7 +515,7 @@ with demo:
 
     # with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
    #     leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
-    with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
+    with gr.TabItem("Circuit Localization", elem_id="subgraph", id=0):
         with gr.Tabs() as subgraph_tabs:
             with gr.TabItem("F+", id=0):
                 # Add description for filters
@@ -536,16 +554,17 @@ with demo:
                 )
                 leaderboard, data = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH_FEQ, "Subgraph")
                 original_leaderboard = gr.State(value=data)
+                boolean_checkbox = gr.Checkbox(value=True, visible=False)  # Default to True
                 # Update the leaderboard when the user selects/deselects substrings
                 substring_checkbox.change(
                     fn=update_leaderboard,
-                    inputs=[original_leaderboard, substring_checkbox],
+                    inputs=[original_leaderboard, substring_checkbox, boolean_checkbox],
                     outputs=leaderboard
                 )
                 print(f"Leaderboard is {leaderboard}")
 
     # Then modify the Causal Graph tab section
-    with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
+    with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
         with gr.Tabs() as causalgraph_tabs:
             with gr.TabItem("Detailed View", id=0):
                 leaderboard_detailed = init_leaderboard_mib_causalgraph(
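
Note for readers skimming the diff: the sketch below is a minimal, self-contained illustration of the new filter-and-sort path, not the exact code in app.py (the real function also works against an `original_dataframe` kept elsewhere in the file, and the toy data here is invented). Rows with a missing score ("-") are pushed to the bottom via a ±100 sentinel that is rendered back as "-", and `sort_ascending` flips the ordering for metrics where lower is better.

# Illustrative sketch only; names mirror the diff but the data is made up.
from typing import List

import pandas as pd

PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]


def filter_and_sort(dataframe: pd.DataFrame, selected_substrings: List[str],
                    sort_ascending: bool = False) -> pd.DataFrame:
    # Keep "Method" plus any column whose name contains a selected substring.
    filtered_columns = [
        col for col in dataframe.columns
        if any(sub.lower() in col.lower() for sub in selected_substrings) or col == "Method"
    ]

    def _compute_row_average(row):
        # Rows with a missing score ("-") get a sentinel that sorts to the
        # bottom in either direction and is shown as "-" afterwards.
        if any(v == "-" for v in row.values):
            return 100 if sort_ascending else -100
        return pd.to_numeric(row, errors="coerce").mean().round(3)

    score_columns = [c for c in filtered_columns if c != "Method"]
    out = dataframe[filtered_columns].copy()
    out["Average"] = out[score_columns].apply(_compute_row_average, axis=1)
    out = out.sort_values("Average", ascending=sort_ascending)
    out["Average"] = out["Average"].replace({-100: "-", 100: "-"})
    return out


# Toy leaderboard: values are invented, purely to show the sorting behaviour.
toy = pd.DataFrame({
    "Method": ["A", "B", "C"],
    "IOI (GPT-2)": [0.91, 0.75, "-"],
    "MCQA (Qwen-2.5)": [0.80, 0.95, 0.60],
})
print(filter_and_sort(toy, PRESET_SUBSTRINGS, sort_ascending=False))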
eval-results-mib-subgraph/baselines/UGS.json ADDED
@@ -0,0 +1,93 @@
+{
+    "method_name": "UGS",
+    "results": [
+        {
+            "model_id": "qwen2.5",
+            "scores": {
+                "ioi": {
+                    "edge_counts": [
+                        164.0,
+                        349.0,
+                        888.0,
+                        1766.0,
+                        3575.0,
+                        8977.0,
+                        17961.0,
+                        35949.0,
+                        89874.0,
+                        179749.0
+                    ],
+                    "faithfulness": [
+                        0.8161993769470405,
+                        1.0623052959501558,
+                        1.1557632398753894,
+                        1.1806853582554517,
+                        1.071651090342679,
+                        1.0093457943925233,
+                        0.9875389408099688,
+                        0.9470404984423676,
+                        0.9719626168224299,
+                        1.0
+                    ]
+                },
+                "mcqa": {
+                    "edge_counts": [
+                        86.0,
+                        212.0,
+                        704.0,
+                        1632.0,
+                        3449.0,
+                        8871.0,
+                        17814.0,
+                        35720.0,
+                        89874.0,
+                        179749.0
+                    ],
+                    "faithfulness": [
+                        0.37104430379746833,
+                        0.4506526898734177,
+                        0.6471518987341772,
+                        0.7231012658227848,
+                        0.9113924050632911,
+                        1.0,
+                        1.5917721518987342,
+                        1.7183544303797469,
+                        1.009493670886076,
+                        1.0
+                    ]
+                }
+            }
+        },
+        {
+            "model_id": "gpt2",
+            "scores": {
+                "ioi": {
+                    "edge_counts": [
+                        27.0,
+                        57.0,
+                        159.0,
+                        322.0,
+                        640.0,
+                        1608.0,
+                        3244.0,
+                        6498.0,
+                        16245.0,
+                        32491.0
+                    ],
+                    "faithfulness": [
+                        0.10013020765541497,
+                        0.33153985647745055,
+                        1.0775680479866294,
+                        0.960686341813994,
+                        1.0155814417206641,
+                        1.0182404988203417,
+                        0.9613478605327729,
+                        0.9464708735339975,
+                        0.9555035267362492,
+                        1.0
+                    ]
+                }
+            }
+        }
+    ]
+}
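
The added baseline follows the schema of the other files under eval-results-mib-subgraph: for each model and task, a list of circuit sizes (edge_counts) paired with the corresponding faithfulness scores. Below is a quick way to eyeball such a file; the mean over the faithfulness curve is only a convenience for inspection and is not necessarily how the leaderboard aggregates (that logic lives in src/populate.py).

# Illustrative reader for a baseline results file such as UGS.json.
import json
from statistics import mean

with open("eval-results-mib-subgraph/baselines/UGS.json") as f:
    data = json.load(f)

print(f"method: {data['method_name']}")
for entry in data["results"]:
    for task, scores in entry["scores"].items():
        counts = scores["edge_counts"]
        faith = scores["faithfulness"]
        if not faith:  # some submissions leave a task empty
            continue
        print(f"{entry['model_id']:>10} {task:>6}: "
              f"{len(counts)} circuit sizes, mean faithfulness {mean(faith):.3f}")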
eval-results-mib-subgraph/submissions/results_2024-10-2T13-36-121.json DELETED
@@ -1,19 +0,0 @@
-{"method_name": "EAP-IG (mean)", "results": [
-    {"model_id": "meta-llama/Llama-3.1-8B", "scores": {
-        "ioi": {
-            "edge_counts": [10.0, 29.0, 117.0, 269.0, 561.0, 1570.0, 3194.0, 6386.0, 16245.0, 32491.0],
-            "faithfulness": [0.11454112510535433,0.14123527363014815,0.3197643850972241,0.47765884872924175,0.7701570853704176,1.3201798748760563,2.037825774185549,2.651813181821849,3.27612042118584,1.0]},
-        "mcqa": {
-            "edge_counts": [10.0, 21.0, 94.0, 241.0, 527.0, 1469.0, 3046.0, 6036.0, 14832.0, 32491.0],
-            "faithfulness": [[0.02677059664121319,0.1965060952906922,0.449060470868564,0.7604756153676078,0.786575587658478,1.106011020720112,1.3436645156597262,1.5466349080478032,1.4914126224418107,1.0]]}
-    }},
-    {"model_id": "Qwen/Qwen2-1.5B", "scores": {
-        "ioi": {
-            "edge_counts": [],
-            "faithfulness": []},
-        "mcqa": {
-            "edge_counts": [],
-            "faithfulness": []}
-    }}
-]
-}
src/__pycache__/populate.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/populate.cpython-310.pyc and b/src/__pycache__/populate.cpython-310.pyc differ
 
src/populate.py CHANGED
@@ -47,12 +47,13 @@ def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols:
 
     # Convert to dataframe
     df = pd.DataFrame.from_records(all_data_json)
+    ascending = False if metric_type == "F+" else True
 
     # Sort by Average score descending
    if 'Average' in df.columns:
         # Convert '-' to NaN for sorting purposes
         df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
-        df = df.sort_values(by=['Average'], ascending=False, na_position='last')
+        df = df.sort_values(by=['Average'], ascending=ascending, na_position='last')
         # Convert NaN back to '-'
         df['Average'] = df['Average'].fillna('-')
 
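
In short, the sort direction now follows the metric: the F+ leaderboard keeps ranking higher averages first, while any other metric_type (assumed here to be the F= tab, where lower is better) sorts ascending. A simplified sketch of the new behaviour on a toy frame:

# Sketch of the metric-aware sort; `metric_type` is assumed to be "F+" or "F=".
import pandas as pd

def sort_by_average(df: pd.DataFrame, metric_type: str) -> pd.DataFrame:
    df = df.copy()  # avoid mutating the caller's frame in this sketch
    ascending = False if metric_type == "F+" else True
    if 'Average' in df.columns:
        # '-' becomes NaN so it sorts last in either direction, then back to '-'.
        df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
        df = df.sort_values(by=['Average'], ascending=ascending, na_position='last')
        df['Average'] = df['Average'].fillna('-')
    return df

toy = pd.DataFrame({"Method": ["A", "B", "C"], "Average": [0.9, "-", 0.7]})
print(sort_by_average(toy, "F+"))  # 0.9, 0.7, '-'
print(sort_by_average(toy, "F="))  # 0.7, 0.9, '-'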