Spaces: mech-interp-bench/leaderboard
Commit: Merge branch 'main' of https://huggingface.co/spaces/mech-interp-bench/leaderboard
app.py
CHANGED
@@ -451,7 +451,8 @@ def process_json(temp_file):
 # Define the preset substrings for filtering
 PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 
-def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: List[str]) -> pd.DataFrame:
+def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: List[str],
+                                 sort_ascending: bool) -> pd.DataFrame:
     """
     Filter columns based on the selected substrings.
     """
@@ -465,13 +466,30 @@ def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: L
         if any(sub.lower() in col.lower() for sub in selected_substrings)
         or col == "Method"
     ]
+
+    def _compute_row_average(row):
+        # If any value is "-", return "-"
+        if any(v == "-" for v in row.values):
+            return 100 if sort_ascending else -100
+        # Convert to numeric, dropping any non-numeric values
+        numeric_values = pd.to_numeric(row, errors='coerce')
+        # Compute mean of non-NA values
+        return numeric_values.mean().round(3)
+
+    dataframe["Average"] = original_dataframe[filtered_columns].apply(_compute_row_average, axis=1)
+    # dataframe["Average"] = dataframe['Average'].mask(dataframe.isna().any(axis=1), '-')
+    filtered_columns.append("Average")
+    dataframe = dataframe.sort_values('Average', ascending=sort_ascending)
+    dataframe["Average"] = dataframe["Average"].replace(-100, "-").replace(100, "-")
+
     return dataframe[filtered_columns]
 
-def update_leaderboard(dataframe: pd.DataFrame, selected_substrings: List[str]):
+def update_leaderboard(dataframe: pd.DataFrame, selected_substrings: List[str], sort_ascending: bool = False):
     """
     Update the leaderboard based on the selected substrings.
     """
-    filtered_dataframe = filter_columns_by_substrings(dataframe, selected_substrings)
+    filtered_dataframe = filter_columns_by_substrings(dataframe, selected_substrings,
+                                                      sort_ascending=sort_ascending)
     return filtered_dataframe
 
 demo = gr.Blocks(css=custom_css)
@@ -497,7 +515,7 @@ with demo:
 
             # with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
             #     leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
-            with gr.TabItem("
+            with gr.TabItem("Circuit Localization", elem_id="subgraph", id=0):
                 with gr.Tabs() as subgraph_tabs:
                     with gr.TabItem("F+", id=0):
                         # Add description for filters
@@ -536,16 +554,17 @@ with demo:
                         )
                         leaderboard, data = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH_FEQ, "Subgraph")
                         original_leaderboard = gr.State(value=data)
+                        boolean_checkbox = gr.Checkbox(value=True, visible=False)  # Default to True
                         # Update the leaderboard when the user selects/deselects substrings
                         substring_checkbox.change(
                             fn=update_leaderboard,
-                            inputs=[original_leaderboard, substring_checkbox],
+                            inputs=[original_leaderboard, substring_checkbox, boolean_checkbox],
                             outputs=leaderboard
                         )
                         print(f"Leaderboard is {leaderboard}")
 
             # Then modify the Causal Graph tab section
-            with gr.TabItem("Causal
+            with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
                 with gr.Tabs() as causalgraph_tabs:
                     with gr.TabItem("Detailed View", id=0):
                         leaderboard_detailed = init_leaderboard_mib_causalgraph(
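The averaging logic added to filter_columns_by_substrings relies on a sentinel trick: rows with any "-" entry get an Average of -100 (or +100 when sorting ascending) so they sink to the bottom under either sort direction, and the sentinel is masked back to "-" after sorting. A standalone sketch of the pattern (toy data and column names, not the leaderboard's real schema):

import pandas as pd

df = pd.DataFrame({
    "Method": ["EAP", "UGS", "Baseline"],
    "IOI": [0.91, "-", 0.42],
    "MCQA": [0.73, 0.88, 0.55],
})
sort_ascending = False                      # descending, as on the F+ tab
sentinel = 100 if sort_ascending else -100

def row_average(row):
    # Incomplete rows get the sentinel so they sort last either way
    if any(v == "-" for v in row.values):
        return sentinel
    return pd.to_numeric(row, errors="coerce").mean().round(3)

df["Average"] = df[["IOI", "MCQA"]].apply(row_average, axis=1)
df = df.sort_values("Average", ascending=sort_ascending)
df["Average"] = df["Average"].replace(sentinel, "-")
print(df)  # the UGS row sorts last, with Average shown as "-"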
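The hidden boolean_checkbox is a small Gradio idiom: a component that is never rendered but whose fixed value rides along as an extra input to the .change callback, next to the gr.State holding the unfiltered dataframe. A minimal self-contained sketch of that wiring (toy data and labels, assuming only gradio and pandas are installed):

import gradio as gr
import pandas as pd

data = pd.DataFrame({
    "Method": ["EAP", "UGS"],
    "IOI GPT-2": [0.91, 0.42],
    "MCQA Qwen-2.5": [0.73, 0.88],
})

def update_table(df, selected, sort_ascending):
    # Keep "Method" plus any column matching a selected substring
    cols = [c for c in df.columns
            if c == "Method" or any(s.lower() in c.lower() for s in selected)]
    out = df[cols]
    metric_cols = [c for c in cols if c != "Method"]
    if metric_cols:
        out = out.sort_values(metric_cols[0], ascending=sort_ascending)
    return out

with gr.Blocks() as demo:
    original = gr.State(value=data)                      # unfiltered copy
    filters = gr.CheckboxGroup(choices=["IOI", "MCQA"],
                               value=["IOI", "MCQA"], label="Tasks")
    direction = gr.Checkbox(value=False, visible=False)  # constant callback input
    table = gr.Dataframe(value=data)
    filters.change(fn=update_table,
                   inputs=[original, filters, direction],
                   outputs=table)

demo.launch()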
eval-results-mib-subgraph/baselines/UGS.json
ADDED
@@ -0,0 +1,93 @@
+{
+    "method_name": "UGS",
+    "results": [
+        {
+            "model_id": "qwen2.5",
+            "scores": {
+                "ioi": {
+                    "edge_counts": [
+                        164.0,
+                        349.0,
+                        888.0,
+                        1766.0,
+                        3575.0,
+                        8977.0,
+                        17961.0,
+                        35949.0,
+                        89874.0,
+                        179749.0
+                    ],
+                    "faithfulness": [
+                        0.8161993769470405,
+                        1.0623052959501558,
+                        1.1557632398753894,
+                        1.1806853582554517,
+                        1.071651090342679,
+                        1.0093457943925233,
+                        0.9875389408099688,
+                        0.9470404984423676,
+                        0.9719626168224299,
+                        1.0
+                    ]
+                },
+                "mcqa": {
+                    "edge_counts": [
+                        86.0,
+                        212.0,
+                        704.0,
+                        1632.0,
+                        3449.0,
+                        8871.0,
+                        17814.0,
+                        35720.0,
+                        89874.0,
+                        179749.0
+                    ],
+                    "faithfulness": [
+                        0.37104430379746833,
+                        0.4506526898734177,
+                        0.6471518987341772,
+                        0.7231012658227848,
+                        0.9113924050632911,
+                        1.0,
+                        1.5917721518987342,
+                        1.7183544303797469,
+                        1.009493670886076,
+                        1.0
+                    ]
+                }
+            }
+        },
+        {
+            "model_id": "gpt2",
+            "scores": {
+                "ioi": {
+                    "edge_counts": [
+                        27.0,
+                        57.0,
+                        159.0,
+                        322.0,
+                        640.0,
+                        1608.0,
+                        3244.0,
+                        6498.0,
+                        16245.0,
+                        32491.0
+                    ],
+                    "faithfulness": [
+                        0.10013020765541497,
+                        0.33153985647745055,
+                        1.0775680479866294,
+                        0.960686341813994,
+                        1.0155814417206641,
+                        1.0182404988203417,
+                        0.9613478605327729,
+                        0.9464708735339975,
+                        0.9555035267362492,
+                        1.0
+                    ]
+                }
+            }
+        }
+    ]
+}
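The new baseline file follows the subgraph results schema: one entry per model, each with per-task parallel lists of edge_counts and faithfulness scores. The leaderboard's real aggregation lives in src/populate.py; purely as an illustration, the schema can be read back like this:

import json

with open("eval-results-mib-subgraph/baselines/UGS.json") as f:
    submission = json.load(f)

for entry in submission["results"]:
    for task, scores in entry["scores"].items():
        faith = scores["faithfulness"]
        if faith:  # submissions may carry empty lists
            mean_f = sum(faith) / len(faith)
            print(f'{submission["method_name"]} / {entry["model_id"]} / {task}: '
                  f'mean faithfulness {mean_f:.3f}')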
eval-results-mib-subgraph/submissions/results_2024-10-2T13-36-121.json
DELETED
@@ -1,19 +0,0 @@
-{"method_name": "EAP-IG (mean)", "results": [
-    {"model_id": "meta-llama/Llama-3.1-8B", "scores": {
-        "ioi": {
-            "edge_counts": [10.0, 29.0, 117.0, 269.0, 561.0, 1570.0, 3194.0, 6386.0, 16245.0, 32491.0],
-            "faithfulness": [0.11454112510535433,0.14123527363014815,0.3197643850972241,0.47765884872924175,0.7701570853704176,1.3201798748760563,2.037825774185549,2.651813181821849,3.27612042118584,1.0]},
-        "mcqa": {
-            "edge_counts": [10.0, 21.0, 94.0, 241.0, 527.0, 1469.0, 3046.0, 6036.0, 14832.0, 32491.0],
-            "faithfulness": [[0.02677059664121319,0.1965060952906922,0.449060470868564,0.7604756153676078,0.786575587658478,1.106011020720112,1.3436645156597262,1.5466349080478032,1.4914126224418107,1.0]]}
-        }},
-    {"model_id": "Qwen/Qwen2-1.5B", "scores": {
-        "ioi": {
-            "edge_counts": [],
-            "faithfulness": []},
-        "mcqa": {
-            "edge_counts": [],
-            "faithfulness": []}
-        }}
-    ]
-}
src/__pycache__/populate.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/populate.cpython-310.pyc and b/src/__pycache__/populate.cpython-310.pyc differ
src/populate.py
CHANGED
@@ -47,12 +47,13 @@ def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols:
 
     # Convert to dataframe
     df = pd.DataFrame.from_records(all_data_json)
+    ascending = False if metric_type == "F+" else True
 
     # Sort by Average score descending
     if 'Average' in df.columns:
         # Convert '-' to NaN for sorting purposes
         df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
-        df = df.sort_values(by=['Average'], ascending=
+        df = df.sort_values(by=['Average'], ascending=ascending, na_position='last')
         # Convert NaN back to '-'
         df['Average'] = df['Average'].fillna('-')
 
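Per this change, only the "F+" metric sorts descending; any other metric_type sorts ascending, with rows lacking an Average pushed to the end by na_position='last'. A quick check of that behavior on a toy frame (sort_by_average is a hypothetical wrapper around the changed lines, not a function in populate.py):

import pandas as pd

def sort_by_average(df: pd.DataFrame, metric_type: str) -> pd.DataFrame:
    ascending = False if metric_type == "F+" else True
    df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
    df = df.sort_values(by=['Average'], ascending=ascending, na_position='last')
    df['Average'] = df['Average'].fillna('-')
    return df

board = pd.DataFrame({"Method": ["EAP", "UGS", "Baseline"],
                      "Average": [0.91, "-", 0.42]})
print(sort_by_average(board, "F+"))  # 0.91 first; the "-" row lands last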