Spaces:
Running
Running
Aaron Mueller
committed on
Commit
·
2d87944
1
Parent(s):
1d8e193
dynamic averaging
Browse files
app.py
CHANGED
@@ -727,7 +727,8 @@ def process_json(temp_file):
|
|
727 |
# Define the preset substrings for filtering
|
728 |
PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
|
729 |
|
730 |
-
def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: List[str]
|
|
|
731 |
"""
|
732 |
Filter columns based on the selected substrings.
|
733 |
"""
|
@@ -741,13 +742,30 @@ def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: L
|
|
741 |
if any(sub.lower() in col.lower() for sub in selected_substrings)
|
742 |
or col == "Method"
|
743 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
744 |
return dataframe[filtered_columns]
|
745 |
|
746 |
-
def update_leaderboard(dataframe: pd.DataFrame, selected_substrings: List[str]):
|
747 |
"""
|
748 |
Update the leaderboard based on the selected substrings.
|
749 |
"""
|
750 |
-
filtered_dataframe = filter_columns_by_substrings(dataframe, selected_substrings
|
|
|
751 |
return filtered_dataframe
|
752 |
|
753 |
demo = gr.Blocks(css=custom_css)
|
@@ -812,10 +830,11 @@ with demo:
|
|
812 |
)
|
813 |
leaderboard, data = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH_FEQ, "Subgraph")
|
814 |
original_leaderboard = gr.State(value=data)
|
|
|
815 |
# Update the leaderboard when the user selects/deselects substrings
|
816 |
substring_checkbox.change(
|
817 |
fn=update_leaderboard,
|
818 |
-
inputs=[original_leaderboard, substring_checkbox],
|
819 |
outputs=leaderboard
|
820 |
)
|
821 |
print(f"Leaderboard is {leaderboard}")
|
|
|
727 |
# Define the preset substrings for filtering
|
728 |
PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
|
729 |
|
730 |
+
def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: List[str],
|
731 |
+
sort_ascending: bool) -> pd.DataFrame:
|
732 |
"""
|
733 |
Filter columns based on the selected substrings.
|
734 |
"""
|
|
|
742 |
if any(sub.lower() in col.lower() for sub in selected_substrings)
|
743 |
or col == "Method"
|
744 |
]
|
745 |
+
|
746 |
+
def _compute_row_average(row):
|
747 |
+
# If any value is "-", return a numeric sentinel (100 when sorting ascending, -100 otherwise); it is replaced with "-" after sorting
|
748 |
+
if any(v == "-" for v in row.values):
|
749 |
+
return 100 if sort_ascending else -100
|
750 |
+
# Convert to numeric, dropping any non-numeric values
|
751 |
+
numeric_values = pd.to_numeric(row, errors='coerce')
|
752 |
+
# Compute mean of non-NA values
|
753 |
+
return numeric_values.mean().round(3)
|
754 |
+
|
755 |
+
dataframe["Average"] = original_dataframe[filtered_columns].apply(_compute_row_average, axis=1)
|
756 |
+
# dataframe["Average"] = dataframe['Average'].mask(dataframe.isna().any(axis=1), '-')
|
757 |
+
filtered_columns.append("Average")
|
758 |
+
dataframe = dataframe.sort_values('Average', ascending=sort_ascending)
|
759 |
+
dataframe["Average"] = dataframe["Average"].replace(-100, "-").replace(100, "-")
|
760 |
+
|
761 |
return dataframe[filtered_columns]
|
762 |
|
763 |
+
def update_leaderboard(dataframe: pd.DataFrame, selected_substrings: List[str], sort_ascending: bool = False):
|
764 |
"""
|
765 |
Update the leaderboard based on the selected substrings.
|
766 |
"""
|
767 |
+
filtered_dataframe = filter_columns_by_substrings(dataframe, selected_substrings,
|
768 |
+
sort_ascending=sort_ascending)
|
769 |
return filtered_dataframe
|
770 |
|
771 |
demo = gr.Blocks(css=custom_css)
|
|
|
830 |
)
|
831 |
leaderboard, data = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH_FEQ, "Subgraph")
|
832 |
original_leaderboard = gr.State(value=data)
|
833 |
+
boolean_checkbox = gr.Checkbox(value=True, visible=False) # Default to True
|
834 |
# Update the leaderboard when the user selects/deselects substrings
|
835 |
substring_checkbox.change(
|
836 |
fn=update_leaderboard,
|
837 |
+
inputs=[original_leaderboard, substring_checkbox, boolean_checkbox],
|
838 |
outputs=leaderboard
|
839 |
)
|
840 |
print(f"Leaderboard is {leaderboard}")
|
eval-results-mib-subgraph/submissions/results_2024-10-2T13-36-121.json
DELETED
@@ -1,19 +0,0 @@
|
|
1 |
-
{"method_name": "EAP-IG (mean)", "results": [
|
2 |
-
{"model_id": "meta-llama/Llama-3.1-8B", "scores": {
|
3 |
-
"ioi": {
|
4 |
-
"edge_counts": [10.0, 29.0, 117.0, 269.0, 561.0, 1570.0, 3194.0, 6386.0, 16245.0, 32491.0],
|
5 |
-
"faithfulness": [0.11454112510535433,0.14123527363014815,0.3197643850972241,0.47765884872924175,0.7701570853704176,1.3201798748760563,2.037825774185549,2.651813181821849,3.27612042118584,1.0]},
|
6 |
-
"mcqa": {
|
7 |
-
"edge_counts": [10.0, 21.0, 94.0, 241.0, 527.0, 1469.0, 3046.0, 6036.0, 14832.0, 32491.0],
|
8 |
-
"faithfulness": [[0.02677059664121319,0.1965060952906922,0.449060470868564,0.7604756153676078,0.786575587658478,1.106011020720112,1.3436645156597262,1.5466349080478032,1.4914126224418107,1.0]]}
|
9 |
-
}},
|
10 |
-
{"model_id": "Qwen/Qwen2-1.5B", "scores": {
|
11 |
-
"ioi": {
|
12 |
-
"edge_counts": [],
|
13 |
-
"faithfulness": []},
|
14 |
-
"mcqa": {
|
15 |
-
"edge_counts": [],
|
16 |
-
"faithfulness": []}
|
17 |
-
}}
|
18 |
-
]
|
19 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/__pycache__/populate.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/populate.cpython-310.pyc and b/src/__pycache__/populate.cpython-310.pyc differ
|
|
src/populate.py
CHANGED
@@ -55,12 +55,13 @@ def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols:
|
|
55 |
|
56 |
# Convert to dataframe
|
57 |
df = pd.DataFrame.from_records(all_data_json)
|
|
|
58 |
|
59 |
# Sort by Average score descending
|
60 |
if 'Average' in df.columns:
|
61 |
# Convert '-' to NaN for sorting purposes
|
62 |
df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
|
63 |
-
df = df.sort_values(by=['Average'], ascending=
|
64 |
# Convert NaN back to '-'
|
65 |
df['Average'] = df['Average'].fillna('-')
|
66 |
|
|
|
55 |
|
56 |
# Convert to dataframe
|
57 |
df = pd.DataFrame.from_records(all_data_json)
|
58 |
+
ascending = False if metric_type == "F+" else True
|
59 |
|
60 |
# Sort by Average score (descending for the "F+" metric, ascending otherwise)
|
61 |
if 'Average' in df.columns:
|
62 |
# Convert '-' to NaN for sorting purposes
|
63 |
df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
|
64 |
+
df = df.sort_values(by=['Average'], ascending=ascending, na_position='last')
|
65 |
# Convert NaN back to '-'
|
66 |
df['Average'] = df['Average'].fillna('-')
|
67 |
|