Aaron Mueller committed
Commit: b624a39 · Parent: 44212b3

prettier leaderboard; draft of submission tab
Files changed (5):
  1. app.py +136 -7
  2. caulsal_metric.py +3 -3
  3. src/about.py +11 -11
  4. src/leaderboard/read_evals.py +21 -6
  5. src/populate.py +2 -2
app.py CHANGED
@@ -122,6 +122,7 @@ from gradio_leaderboard import SelectColumns, Leaderboard
 import pandas as pd
 from typing import List, Dict, Optional
 from dataclasses import fields
+import math
 
 class SmartSelectColumns(SelectColumns):
     """
@@ -270,7 +271,11 @@ try:
 except Exception:
     restart_space()
 
-
+def _sigmoid(x):
+    try:
+        return 1 / (1 + math.exp(-2 * (x-1)))
+    except:
+        return "-"
 
 LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
 LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
@@ -486,6 +491,12 @@ def process_json(temp_file):
     gr.Markdown("Upload successful!")
     return data
 
+def get_hf_username(hf_repo):
+    hf_repo = hf_repo.rstrip("/")
+    parts = hf_repo.split("/")
+    username = parts[-2]
+    return username
+
 
 # Define the preset substrings for filtering
 PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
@@ -546,13 +557,23 @@ def update_leaderboard(dataframe: pd.DataFrame, selected_task_substrings: List[s
     else:
         show_average = False
 
+    def _transform_floats(df):
+        df_transformed = df.copy()
+        # Apply transformation row by row
+        for i, row in df_transformed.iterrows():
+            # Apply sigmoid only to numeric values in the row
+            df_transformed.loc[i] = row.apply(lambda x: _sigmoid(x) if isinstance(x, (float, int)) else x)
+        return df_transformed
+
     if show_average:
         means = filtered_dataframe.replace("-", float("nan")).mean(axis=1, skipna=False)
+        s_filtered_dataframe = _transform_floats(filtered_dataframe)
+        s_means = s_filtered_dataframe.replace("-", float("nan")).mean(axis=1, skipna=False)
        filtered_dataframe["Average"] = means.round(2)
+        filtered_dataframe["Score"] = s_means.round(2)
         filtered_dataframe = filtered_dataframe.sort_values(by=["Average"], ascending=False, na_position='last')
         filtered_dataframe = filtered_dataframe.replace(float("nan"), "-")
 
-
     # if show_average:
     #     print([row for index, row in filtered_dataframe.iterrows()])
     #     filtered_dataframe["Average"] = [round(np.mean(row.values()), 2) if "-" not in row.values() else "-" for index, row in filtered_dataframe.iterrows()]
@@ -566,6 +587,10 @@ def update_leaderboard(dataframe: pd.DataFrame, selected_task_substrings: List[s
 
     return filtered_dataframe
 
+def process_url(url):
+    # Add your URL processing logic here
+    return f"You entered the URL: {url}"
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -581,11 +606,6 @@ with demo:
 
     # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
     #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-    # with gr.TabItem("👶 Submit", elem_id="llm-benchmark-tab-table", id=5):
-    #     with gr.Column():
-    #         with gr.Row():
-    #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
     # with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
     #     leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
@@ -719,6 +739,115 @@ with demo:
                 "Causal Graph"
             )
 
+        with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown("## 🏆 Submission Portal")
+
+            # Track selection
+            track = gr.Radio(
+                choices=[
+                    "Circuit Localization Track",
+                    "Causal Variable Localization Track"
+                ],
+                label="Select Competition Track",
+                elem_id="track_selector"
+            )
+
+            with gr.Group(visible=False) as circuit_ui:
+                gr.Markdown("### Circuit Localization Requirements")
+                hf_repo = gr.Textbox(
+                    label="HuggingFace Repository URL",
+                    placeholder="https://huggingface.co/username/repo/tree/main/path",
+                    info="Must be a valid HuggingFace URL pointing to a folder with 10 circuit files (.json or .pt)"
+                )
+
+            with gr.Group(visible=False) as causal_ui:
+                gr.Markdown("### Causal Variable Localization Requirements")
+                with gr.Row():
+                    layer = gr.Number(
+                        label="Layer Number",
+                        precision=0,
+                        minimum=0,
+                        info="Integer specifying the model layer"
+                    )
+                    token_position = gr.Number(
+                        label="Token Position",
+                        precision=0,
+                        minimum=0,
+                        info="Integer specifying token position"
+                    )
+                code_upload = gr.File(
+                    label="Upload Python file implementing your featurization function",
+                    file_types=[".py"],
+                )
+
+            # Common fields
+            with gr.Group():
+                gr.Markdown("### Team Information")
+                team_name = gr.Textbox(label="Team Name")
+                contact_email = gr.Textbox(label="Contact Email")
+
+            # Dynamic UI logic
+            def toggle_ui(track):
+                circuit = track == "Circuit Localization Track"
+                causal = not circuit
+                return {
+                    circuit_ui: gr.Group(visible=circuit),
+                    causal_ui: gr.Group(visible=causal)
+                }
+
+            track.change(toggle_ui, track, [circuit_ui, causal_ui])
+
+            # Submission handling
+            status = gr.Textbox(label="Submission Status", visible=False)
+
+            def handle_submission(track, hf_repo, layer, token_position, code_upload, team_name, contact_email):
+                errors = []
+
+                # Validate common fields
+                if not team_name.strip():
+                    errors.append("Team name is required")
+                if "@" not in contact_email or "." not in contact_email:
+                    errors.append("Valid email address is required")
+
+                # Track-specific validation
+                if "Circuit" in track:
+                    if not hf_repo.startswith("https://huggingface.co/"):
+                        errors.append("Invalid HuggingFace URL - must start with https://huggingface.co/")
+                    else:
+                        # Check rate limit only for valid HF submissions
+                        username = get_hf_username(hf_repo)
+                        rate = 0    # TODO: check submissions queue for rates
+                        rate_limit = 2
+                        if rate > rate_limit:
+                            errors.append("Rate limit exceeded (max 2 submissions per week per HF account)")
+
+                else:
+                    if not (isinstance(layer, int) and isinstance(token_position, int)):
+                        errors.append("Layer and token position must be integers")
+                    if not code_upload:
+                        errors.append("Code file upload is required")
+
+                if errors:
+                    return gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True)
+
+                # Process valid submission
+                return gr.Textbox("✅ Submission received! Thank you for your entry.", visible=True)
+
+            submit_btn = gr.Button("Submit Entry", variant="primary")
+            submit_btn.click(
+                handle_submission,
+                inputs=[track, hf_repo, layer, token_position, code_upload, team_name, contact_email],
+                outputs=status
+            )
+
+            # Add info about rate limits
+            gr.Markdown("""
+            ### Submission Policy
+            - Maximum 2 valid submissions per HuggingFace account per week
+            - Invalid submissions don't count toward your limit
+            - Rate limit tracked on a rolling basis: a submission no longer counts toward the limit as soon as 7 days have passed since the submission time
+            """)
+
     # with gr.Row():
     #     with gr.Accordion("📙 Citation", open=False):
     #         citation_button = gr.Textbox(
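Note on the new `Score` column in app.py: each numeric cell is passed through the sigmoid `1 / (1 + exp(-2 * (x - 1)))` before row means are taken, while the existing `Average` column keeps the raw means. Below is a minimal standalone sketch of that behaviour on made-up data; the toy frame and the `applymap`-based transform are illustrative simplifications, not the leaderboard code itself.

```python
import math
import pandas as pd

def _sigmoid(x):
    # Same transform as in the diff: x = 1.0 maps to exactly 0.5, squashing scores into (0, 1)
    return 1 / (1 + math.exp(-2 * (x - 1)))

def transform_floats(df):
    # Element-wise version of the row-by-row loop in app.py: sigmoid numeric cells, keep "-" as-is
    return df.applymap(lambda x: _sigmoid(x) if isinstance(x, (int, float)) else x)

# Toy leaderboard slice (illustrative values only)
toy = pd.DataFrame(
    {"ioi_gpt2": [0.80, 1.30], "mcqa_qwen2.5": [1.10, "-"]},
    index=["method_a", "method_b"],
)

means = toy.replace("-", float("nan")).astype(float).mean(axis=1, skipna=False)
s_means = transform_floats(toy).replace("-", float("nan")).astype(float).mean(axis=1, skipna=False)
toy["Average"] = means.round(2)  # raw mean; rows with "-" become NaN because skipna=False
toy["Score"] = s_means.round(2)  # sigmoid-transformed mean
print(toy)
```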
caulsal_metric.py CHANGED
@@ -161,7 +161,7 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     numeric_df = df_copy.applymap(extract_score)
 
     # Group by base method name and take the mean
-    aggregated_df = numeric_df.groupby(level=0).max().round(3)
+    aggregated_df = numeric_df.groupby(level=0).max().round(2)
 
     # Convert back to string format
     aggregated_df = aggregated_df.applymap(lambda x: f"{x:.3f}")
@@ -198,12 +198,12 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
 
     # Create new DataFrame with averaged intervention scores
     averaged_df = pd.DataFrame({
-        model_task: numeric_df[cols].mean(axis=1).round(3)
+        model_task: numeric_df[cols].mean(axis=1).round(2)
         for model_task, cols in model_task_groups.items()
     })
 
     # Add overall average column
-    averaged_df['Average'] = averaged_df.mean(axis=1).round(3)
+    averaged_df['Average'] = averaged_df.mean(axis=1).round(2)
 
     # Sort by Average column
     averaged_df = averaged_df.sort_values('Average', ascending=False)
src/about.py CHANGED
@@ -139,20 +139,20 @@ This leaderboard displays scores from the 2024 BabyLM Challenge. Each track has
 """
 
 EVALUATION_QUEUE_TEXT = """
-## Some good practices before requesting a predictions upload:
+## Circuit localization track:
 
-Make sure you can get scores from your predictions file using the `score_predictions.py` script.
-```bash
-git clone https://github.com/babylm/evaluation-pipeline-2024/
-cd evaluation-pipeline-2024
-python score_predictions.py path/to/your/predictions.json.gz
-```
-If this step fails, follow the error messages to debug your predictions before getting in touch. It's likely that either (i) some results are missing, or (ii) the results are incorrectly formatted.
+You'll need 10 circuits per task/model combination. For each critical threshold k and previous threshold k_-1,
+the circuit should contain no fewer than k_-1% of components, and no more than k% of components. Create a HuggingFace
+dataset or model repository; this will house your circuits. Make a folder where the circuits (and *only* the circuits)
+are contained. Do not worry about the ordering of the files; our evaluation script will read the circuits and sort them
+by size. Provide a link to this folder below.
 
-Make sure your model has an open license! This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model.
+For specifications about the file format for a circuit, see the README on our project GitHub: TODO
 
-Once these steps have been followed, get in touch with the organizers with your predictions file(s), and the scores you've obtained.
-We'll verify that we can match your scores, and then upload to the leaderboard. Optionally, you can give us your preferred model display name for the leaderboard, and a link to your model on HuggingFace.
+Once your model makes it to the front of the evaluation queue, we'll submit your model for evaluation on the private test set.
+The evaluations are handled by the National Deep Inference Framework (NDIF).
+
+## Causal variable localization track:
 """
 
 CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
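For the circuit localization track described in the new EVALUATION_QUEUE_TEXT above, a quick local sanity check before submitting can confirm that the folder holds exactly 10 circuit files with the expected extensions. The sketch below is a hypothetical helper, not the official evaluation script; the file count of 10, the .json/.pt extensions, and the sort-by-size behaviour come from the instructions above, while the function name and example path are made up.

```python
from pathlib import Path

def check_circuit_folder(folder: str, expected: int = 10, exts=(".json", ".pt")):
    """Hypothetical pre-submission check: count circuit files and sort them by size."""
    files = [p for p in Path(folder).iterdir() if p.suffix in exts]
    if len(files) != expected:
        raise ValueError(f"Expected {expected} circuit files, found {len(files)} in {folder}")
    # Filename ordering does not matter; the organizers' script sorts circuits by size.
    return sorted(files, key=lambda p: p.stat().st_size)

# Example (hypothetical local path):
# check_circuit_folder("circuits/ioi_gpt2")
```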
src/leaderboard/read_evals.py CHANGED
@@ -86,7 +86,19 @@ class EvalResult_MIB_SUBGRAPH:
             results=results
         )
 
-
+    def _sigmoid(self, x):
+        try:
+            return 1 / (1 + math.exp(-2 * (x-1)))
+        except:
+            return "-"
+
+    def _transform_floats(self, df):
+        df_transformed = df.copy()
+        # Apply transformation row by row
+        for i, row in df_transformed.iterrows():
+            # Apply sigmoid only to numeric values in the row
+            df_transformed.loc[i] = row.apply(lambda x: self._sigmoid(x) if isinstance(x, (float, int)) else x)
+        return df_transformed
 
     def to_dict(self, metric_type="F+"):
         """Converts the Eval Result to a dict for dataframe display"""
@@ -105,6 +117,7 @@ class EvalResult_MIB_SUBGRAPH:
             data_dict[f"{task.value.benchmark}_{model}"] = '-'
 
         all_scores = []
+        transformed_scores = []
         for task, task_results in self.results.items():
             for model, metrics in task_results.items():
                 col_name = f"{task}_{model}"
@@ -124,8 +137,10 @@ class EvalResult_MIB_SUBGRAPH:
                 score = area_under if metric_type == "F+" else area_from_100
                 data_dict[col_name] = round(score, 2)
                 all_scores.append(score)
+                transformed_scores.append(self._sigmoid(score))
 
         data_dict["Average"] = round(np.mean(all_scores), 2) if '-' not in data_dict.values() else '-'
+        data_dict["Score"] = round(np.mean(transformed_scores), 2) if '-' not in data_dict.values() else '-'
         return data_dict
 
 
@@ -294,7 +309,7 @@ class EvalResult_MIB_CAUSALGRAPH:
 
         # Initialize results dictionary
         results = {}
-        for task in ["MCQA"]:
+        for task in ["IOI", "MCQA", "arithmetic", "ARC-easy"]:
             results[task] = {}
 
         # Process each model's results
@@ -309,7 +324,7 @@ class EvalResult_MIB_CAUSALGRAPH:
             for intervention_data in layer_data['layer_scores']:
                 # Calculate average score for counterfactuals
                 avg_cf_score = np.mean([
-                    cf['score']
+                    cf['score'] if 'score' in cf else 0
                     for cf in intervention_data['counterfactual_scores']
                 ])
 
@@ -416,7 +431,7 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
         df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) and not pd.isna(x) else x)
 
     # Group by base method name and take the max, handling NaN values
-    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(3)
+    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(2)
 
     # Convert back to string format and reset index
     aggregated_df = aggregated_df.reset_index()
@@ -460,8 +475,8 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     for _, row in df_copy.iterrows():
         averaged_row = {'Method': row['Method']}
         for model_task, cols in model_task_groups.items():
-            averaged_row[model_task] = np.mean([row[col] for col in cols]).round(3)
-        averaged_row['Average'] = np.mean([averaged_row[mt] for mt in model_task_groups.keys()]).round(3)
+            averaged_row[model_task] = np.mean([row[col] for col in cols]).round(2)
+            averaged_row['Average'] = np.mean([averaged_row[mt] for mt in model_task_groups.keys()]).round(2)
         averaged_data.append(averaged_row)
 
     averaged_df = pd.DataFrame(averaged_data)
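One behavioural note on the read_evals.py hunk above: `cf['score'] if 'score' in cf else 0` treats a counterfactual entry with no reported score as 0 instead of raising a KeyError. A tiny sketch on made-up data, with `dict.get` shown as an equivalent spelling:

```python
import numpy as np

# Made-up counterfactual entries; the last one has no reported score
counterfactual_scores = [{"score": 0.9}, {"score": 0.7}, {}]

avg_a = np.mean([cf["score"] if "score" in cf else 0 for cf in counterfactual_scores])
avg_b = np.mean([cf.get("score", 0) for cf in counterfactual_scores])  # equivalent, terser
assert avg_a == avg_b  # both treat the missing score as 0
print(avg_a)  # ~0.533
```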
src/populate.py CHANGED
@@ -77,7 +77,7 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     numeric_df = df_copy.select_dtypes(include=['float64', 'int64'])
 
     # Group by base method name and take the max
-    aggregated_df = numeric_df.groupby(level=0).max().round(3)
+    aggregated_df = numeric_df.groupby(level=0).max().round(2)
 
     # Reset index to get Method as a column
     aggregated_df.reset_index(inplace=True)
@@ -116,7 +116,7 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     averaged_df['Method'] = method_col
 
     for col_name, cols in result_cols.items():
-        averaged_df[col_name] = df_copy[cols].mean(axis=1).round(3)
+        averaged_df[col_name] = df_copy[cols].mean(axis=1).round(2)
 
     return averaged_df