Aaron Mueller committed on
Commit
2d87944
·
1 Parent(s): 1d8e193

dynamic averaging

Browse files
app.py CHANGED
@@ -727,7 +727,8 @@ def process_json(temp_file):
727
  # Define the preset substrings for filtering
728
  PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
729
 
730
- def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: List[str]) -> pd.DataFrame:
 
731
  """
732
  Filter columns based on the selected substrings.
733
  """
@@ -741,13 +742,30 @@ def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: L
741
  if any(sub.lower() in col.lower() for sub in selected_substrings)
742
  or col == "Method"
743
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
744
  return dataframe[filtered_columns]
745
 
746
- def update_leaderboard(dataframe: pd.DataFrame, selected_substrings: List[str]):
747
  """
748
  Update the leaderboard based on the selected substrings.
749
  """
750
- filtered_dataframe = filter_columns_by_substrings(dataframe, selected_substrings)
 
751
  return filtered_dataframe
752
 
753
  demo = gr.Blocks(css=custom_css)
@@ -812,10 +830,11 @@ with demo:
812
  )
813
  leaderboard, data = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH_FEQ, "Subgraph")
814
  original_leaderboard = gr.State(value=data)
 
815
  # Update the leaderboard when the user selects/deselects substrings
816
  substring_checkbox.change(
817
  fn=update_leaderboard,
818
- inputs=[original_leaderboard, substring_checkbox],
819
  outputs=leaderboard
820
  )
821
  print(f"Leaderboard is {leaderboard}")
 
727
  # Define the preset substrings for filtering
728
  PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
729
 
730
+ def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_substrings: List[str],
731
+ sort_ascending: bool) -> pd.DataFrame:
732
  """
733
  Filter columns based on the selected substrings.
734
  """
 
742
  if any(sub.lower() in col.lower() for sub in selected_substrings)
743
  or col == "Method"
744
  ]
745
+
746
+ def _compute_row_average(row):
747
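+ # If any value is "-", return a sentinel (100 when ascending, -100 when descending) so the row sorts last; the sentinel is replaced with "-" after sorting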
748
+ if any(v == "-" for v in row.values):
749
+ return 100 if sort_ascending else -100
750
+ # Convert to numeric, dropping any non-numeric values
751
+ numeric_values = pd.to_numeric(row, errors='coerce')
752
+ # Compute mean of non-NA values
753
+ return numeric_values.mean().round(3)
754
+
755
+ dataframe["Average"] = original_dataframe[filtered_columns].apply(_compute_row_average, axis=1)
756
+ # dataframe["Average"] = dataframe['Average'].mask(dataframe.isna().any(axis=1), '-')
757
+ filtered_columns.append("Average")
758
+ dataframe = dataframe.sort_values('Average', ascending=sort_ascending)
759
+ dataframe["Average"] = dataframe["Average"].replace(-100, "-").replace(100, "-")
760
+
761
  return dataframe[filtered_columns]
762
 
763
+ def update_leaderboard(dataframe: pd.DataFrame, selected_substrings: List[str], sort_ascending: bool = False):
764
  """
765
  Update the leaderboard based on the selected substrings.
766
  """
767
+ filtered_dataframe = filter_columns_by_substrings(dataframe, selected_substrings,
768
+ sort_ascending=sort_ascending)
769
  return filtered_dataframe
770
 
771
  demo = gr.Blocks(css=custom_css)
 
830
  )
831
  leaderboard, data = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH_FEQ, "Subgraph")
832
  original_leaderboard = gr.State(value=data)
833
+ boolean_checkbox = gr.Checkbox(value=True, visible=False) # Default to True
834
  # Update the leaderboard when the user selects/deselects substrings
835
  substring_checkbox.change(
836
  fn=update_leaderboard,
837
+ inputs=[original_leaderboard, substring_checkbox, boolean_checkbox],
838
  outputs=leaderboard
839
  )
840
  print(f"Leaderboard is {leaderboard}")
eval-results-mib-subgraph/submissions/results_2024-10-2T13-36-121.json DELETED
@@ -1,19 +0,0 @@
1
- {"method_name": "EAP-IG (mean)", "results": [
2
- {"model_id": "meta-llama/Llama-3.1-8B", "scores": {
3
- "ioi": {
4
- "edge_counts": [10.0, 29.0, 117.0, 269.0, 561.0, 1570.0, 3194.0, 6386.0, 16245.0, 32491.0],
5
- "faithfulness": [0.11454112510535433,0.14123527363014815,0.3197643850972241,0.47765884872924175,0.7701570853704176,1.3201798748760563,2.037825774185549,2.651813181821849,3.27612042118584,1.0]},
6
- "mcqa": {
7
- "edge_counts": [10.0, 21.0, 94.0, 241.0, 527.0, 1469.0, 3046.0, 6036.0, 14832.0, 32491.0],
8
- "faithfulness": [[0.02677059664121319,0.1965060952906922,0.449060470868564,0.7604756153676078,0.786575587658478,1.106011020720112,1.3436645156597262,1.5466349080478032,1.4914126224418107,1.0]]}
9
- }},
10
- {"model_id": "Qwen/Qwen2-1.5B", "scores": {
11
- "ioi": {
12
- "edge_counts": [],
13
- "faithfulness": []},
14
- "mcqa": {
15
- "edge_counts": [],
16
- "faithfulness": []}
17
- }}
18
- ]
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/__pycache__/populate.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/populate.cpython-310.pyc and b/src/__pycache__/populate.cpython-310.pyc differ
 
src/populate.py CHANGED
@@ -55,12 +55,13 @@ def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols:
55
 
56
  # Convert to dataframe
57
  df = pd.DataFrame.from_records(all_data_json)
 
58
 
59
  # Sort by Average score descending
60
  if 'Average' in df.columns:
61
  # Convert '-' to NaN for sorting purposes
62
  df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
63
- df = df.sort_values(by=['Average'], ascending=False, na_position='last')
64
  # Convert NaN back to '-'
65
  df['Average'] = df['Average'].fillna('-')
66
 
 
55
 
56
  # Convert to dataframe
57
  df = pd.DataFrame.from_records(all_data_json)
58
+ ascending = False if metric_type == "F+" else True
59
 
60
  # Sort by Average score (descending when metric_type is "F+", ascending otherwise)
61
  if 'Average' in df.columns:
62
  # Convert '-' to NaN for sorting purposes
63
  df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
64
+ df = df.sort_values(by=['Average'], ascending=ascending, na_position='last')
65
  # Convert NaN back to '-'
66
  df['Average'] = df['Average'].fillna('-')
67