Aaron Mueller committed
Commit: b624a39 · Parent: 44212b3

prettier leaderboard; draft of submission tab
Files changed (5):
  1. app.py +136 -7
  2. caulsal_metric.py +3 -3
  3. src/about.py +11 -11
  4. src/leaderboard/read_evals.py +21 -6
  5. src/populate.py +2 -2
app.py CHANGED
@@ -122,6 +122,7 @@ from gradio_leaderboard import SelectColumns, Leaderboard
 import pandas as pd
 from typing import List, Dict, Optional
 from dataclasses import fields
+import math
 
 class SmartSelectColumns(SelectColumns):
     """
@@ -270,7 +271,11 @@ try:
 except Exception:
     restart_space()
 
-
+def _sigmoid(x):
+    try:
+        return 1 / (1 + math.exp(-2 * (x-1)))
+    except:
+        return "-"
 
 LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
 LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
@@ -486,6 +491,12 @@ def process_json(temp_file):
     gr.Markdown("Upload successful!")
     return data
 
+def get_hf_username(hf_repo):
+    hf_repo = hf_repo.rstrip("/")
+    parts = hf_repo.split("/")
+    username = parts[-2]
+    return username
+
 
 # Define the preset substrings for filtering
 PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
@@ -546,13 +557,23 @@ def update_leaderboard(dataframe: pd.DataFrame, selected_task_substrings: List[s
     else:
         show_average = False
 
+    def _transform_floats(df):
+        df_transformed = df.copy()
+        # Apply transformation row by row
+        for i, row in df_transformed.iterrows():
+            # Apply sigmoid only to numeric values in the row
+            df_transformed.loc[i] = row.apply(lambda x: _sigmoid(x) if isinstance(x, (float, int)) else x)
+        return df_transformed
+
     if show_average:
         means = filtered_dataframe.replace("-", float("nan")).mean(axis=1, skipna=False)
+        s_filtered_dataframe = _transform_floats(filtered_dataframe)
+        s_means = s_filtered_dataframe.replace("-", float("nan")).mean(axis=1, skipna=False)
        filtered_dataframe["Average"] = means.round(2)
+        filtered_dataframe["Score"] = s_means.round(2)
         filtered_dataframe = filtered_dataframe.sort_values(by=["Average"], ascending=False, na_position='last')
         filtered_dataframe = filtered_dataframe.replace(float("nan"), "-")
 
-
     # if show_average:
     #     print([row for index, row in filtered_dataframe.iterrows()])
     #     filtered_dataframe["Average"] = [round(np.mean(row.values()), 2) if "-" not in row.values() else "-" for index, row in filtered_dataframe.iterrows()]
@@ -566,6 +587,10 @@ def update_leaderboard(dataframe: pd.DataFrame, selected_task_substrings: List[s
 
     return filtered_dataframe
 
+def process_url(url):
+    # Add your URL processing logic here
+    return f"You entered the URL: {url}"
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -581,11 +606,6 @@ with demo:
 
     # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
     #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-    # with gr.TabItem("👶 Submit", elem_id="llm-benchmark-tab-table", id=5):
-    #     with gr.Column():
-    #         with gr.Row():
-    #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
     # with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
     #     leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
@@ -719,6 +739,115 @@ with demo:
                 "Causal Graph"
             )
 
+        with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown("## 🏆 Submission Portal")
+
+            # Track selection
+            track = gr.Radio(
+                choices=[
+                    "Circuit Localization Track",
+                    "Causal Variable Localization Track"
+                ],
+                label="Select Competition Track",
+                elem_id="track_selector"
+            )
+
+            with gr.Group(visible=False) as circuit_ui:
+                gr.Markdown("### Circuit Localization Requirements")
+                hf_repo = gr.Textbox(
+                    label="HuggingFace Repository URL",
+                    placeholder="https://huggingface.co/username/repo/tree/main/path",
+                    info="Must be a valid HuggingFace URL pointing to a folder with 10 circuit files (.json or .pt)"
+                )
+
+            with gr.Group(visible=False) as causal_ui:
+                gr.Markdown("### Causal Variable Localization Requirements")
+                with gr.Row():
+                    layer = gr.Number(
+                        label="Layer Number",
+                        precision=0,
+                        minimum=0,
+                        info="Integer specifying the model layer"
+                    )
+                    token_position = gr.Number(
+                        label="Token Position",
+                        precision=0,
+                        minimum=0,
+                        info="Integer specifying token position"
+                    )
+                code_upload = gr.File(
+                    label="Upload Python file implementing your featurization function",
+                    file_types=[".py"],
+                )
+
+            # Common fields
+            with gr.Group():
+                gr.Markdown("### Team Information")
+                team_name = gr.Textbox(label="Team Name")
+                contact_email = gr.Textbox(label="Contact Email")
+
+            # Dynamic UI logic
+            def toggle_ui(track):
+                circuit = track == "Circuit Localization Track"
+                causal = not circuit
+                return {
+                    circuit_ui: gr.Group(visible=circuit),
+                    causal_ui: gr.Group(visible=causal)
+                }
+
+            track.change(toggle_ui, track, [circuit_ui, causal_ui])
+
+            # Submission handling
+            status = gr.Textbox(label="Submission Status", visible=False)
+
+            def handle_submission(track, hf_repo, layer, token_position, code_upload, team_name, contact_email):
+                errors = []
+
+                # Validate common fields
+                if not team_name.strip():
+                    errors.append("Team name is required")
+                if "@" not in contact_email or "." not in contact_email:
+                    errors.append("Valid email address is required")
+
+                # Track-specific validation
+                if "Circuit" in track:
+                    if not hf_repo.startswith("https://huggingface.co/"):
+                        errors.append("Invalid HuggingFace URL - must start with https://huggingface.co/")
+                    else:
+                        # Check rate limit only for valid HF submissions
+                        username = get_hf_username(hf_repo)
+                        rate = 0    # TODO: check submissions queue for rates
+                        rate_limit = 2
+                        if rate > rate_limit:
+                            errors.append("Rate limit exceeded (max 2 submissions per week per HF account)")
+
+                else:
+                    if not (isinstance(layer, int) and isinstance(token_position, int)):
+                        errors.append("Layer and token position must be integers")
+                    if not code_upload:
+                        errors.append("Code file upload is required")
+
+                if errors:
+                    return gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True)
+
+                # Process valid submission
+                return gr.Textbox("✅ Submission received! Thank you for your entry.", visible=True)
+
+            submit_btn = gr.Button("Submit Entry", variant="primary")
+            submit_btn.click(
+                handle_submission,
+                inputs=[track, hf_repo, layer, token_position, code_upload, team_name, contact_email],
+                outputs=status
+            )
+
+            # Add info about rate limits
+            gr.Markdown("""
+            ### Submission Policy
+            - Maximum 2 valid submissions per HuggingFace account per week
+            - Invalid submissions don't count toward your limit
+            - Rate limit tracked on a rolling basis: a submission no longer counts toward the limit as soon as 7 days have passed since the submission time
+            """)
+
     # with gr.Row():
     #     with gr.Accordion("📙 Citation", open=False):
     #         citation_button = gr.Textbox(
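Note on the new `Score` column in app.py: each numeric cell is passed through the sigmoid `1 / (1 + exp(-2 * (x - 1)))` before row means are taken, while the existing `Average` column keeps the raw means. Below is a minimal standalone sketch of that behaviour on made-up data; the toy frame and the `applymap`-based transform are illustrative simplifications, not the leaderboard code itself.

```python
import math
import pandas as pd

def _sigmoid(x):
    # Same transform as in the diff: x = 1.0 maps to exactly 0.5, squashing scores into (0, 1)
    return 1 / (1 + math.exp(-2 * (x - 1)))

def transform_floats(df):
    # Element-wise version of the row-by-row loop in app.py: sigmoid numeric cells, keep "-" as-is
    return df.applymap(lambda x: _sigmoid(x) if isinstance(x, (int, float)) else x)

# Toy leaderboard slice (illustrative values only)
toy = pd.DataFrame(
    {"ioi_gpt2": [0.80, 1.30], "mcqa_qwen2.5": [1.10, "-"]},
    index=["method_a", "method_b"],
)

means = toy.replace("-", float("nan")).astype(float).mean(axis=1, skipna=False)
s_means = transform_floats(toy).replace("-", float("nan")).astype(float).mean(axis=1, skipna=False)
toy["Average"] = means.round(2)  # raw mean; rows with "-" become NaN because skipna=False
toy["Score"] = s_means.round(2)  # sigmoid-transformed mean
print(toy)
```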
caulsal_metric.py CHANGED
@@ -161,7 +161,7 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     numeric_df = df_copy.applymap(extract_score)
 
     # Group by base method name and take the mean
-    aggregated_df = numeric_df.groupby(level=0).max().round(3)
+    aggregated_df = numeric_df.groupby(level=0).max().round(2)
 
     # Convert back to string format
     aggregated_df = aggregated_df.applymap(lambda x: f"{x:.3f}")
@@ -198,12 +198,12 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
 
     # Create new DataFrame with averaged intervention scores
     averaged_df = pd.DataFrame({
-        model_task: numeric_df[cols].mean(axis=1).round(3)
+        model_task: numeric_df[cols].mean(axis=1).round(2)
         for model_task, cols in model_task_groups.items()
     })
 
     # Add overall average column
-    averaged_df['Average'] = averaged_df.mean(axis=1).round(3)
+    averaged_df['Average'] = averaged_df.mean(axis=1).round(2)
 
     # Sort by Average column
     averaged_df = averaged_df.sort_values('Average', ascending=False)
src/about.py CHANGED
@@ -139,20 +139,20 @@ This leaderboard displays scores from the 2024 BabyLM Challenge. Each track has
 """
 
 EVALUATION_QUEUE_TEXT = """
-## Some good practices before requesting a predictions upload:
+## Circuit localization track:
 
-Make sure you can get scores from your predictions file using the `score_predictions.py` script.
-```bash
-git clone https://github.com/babylm/evaluation-pipeline-2024/
-cd evaluation-pipeline-2024
-python score_predictions.py path/to/your/predictions.json.gz
-```
-If this step fails, follow the error messages to debug your predictions before getting in touch. It's likely that either (i) some results are missing, or (ii) the results are incorrectly formatted.
+You'll need 10 circuits per task/model combination. For each critical threshold k and previous threshold k_-1,
+the circuit should contain no fewer than k_-1% of components, and no more than k% of components. Create a HuggingFace
+dataset or model repository; this will house your circuits. Make a folder where the circuits (and *only* the circuits)
+are contained. Do not worry about the ordering of the files; our evaluation script will read the circuits and sort them
+by size. Provide a link to this folder below.
 
-Make sure your model has an open license! This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model.
+For specifications about the file format for a circuit, see the README on our project GitHub: TODO
 
-Once these steps have been followed, get in touch with the organizers with your predictions file(s), and the scores you've obtained.
-We'll verify that we can match your scores, and then upload to the leaderboard. Optionally, you can give us your preferred model display name for the leaderboard, and a link to your model on HuggingFace.
+Once your model makes it to the front of the evaluation queue, we'll submit your model for evaluation on the private test set.
+The evaluations are handled by the National Deep Inference Framework (NDIF).
+
+## Causal variable localization track:
 """
 
 CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
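For the circuit localization track described in the new EVALUATION_QUEUE_TEXT above, a quick local sanity check before submitting can confirm that the folder holds exactly 10 circuit files with the expected extensions. The sketch below is a hypothetical helper, not the official evaluation script; the file count of 10, the .json/.pt extensions, and the sort-by-size behaviour come from the instructions above, while the function name and example path are made up.

```python
from pathlib import Path

def check_circuit_folder(folder: str, expected: int = 10, exts=(".json", ".pt")):
    """Hypothetical pre-submission check: count circuit files and sort them by size."""
    files = [p for p in Path(folder).iterdir() if p.suffix in exts]
    if len(files) != expected:
        raise ValueError(f"Expected {expected} circuit files, found {len(files)} in {folder}")
    # Filename ordering does not matter; the organizers' script sorts circuits by size.
    return sorted(files, key=lambda p: p.stat().st_size)

# Example (hypothetical local path):
# check_circuit_folder("circuits/ioi_gpt2")
```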
src/leaderboard/read_evals.py CHANGED
@@ -86,7 +86,19 @@ class EvalResult_MIB_SUBGRAPH:
             results=results
         )
 
-
+    def _sigmoid(self, x):
+        try:
+            return 1 / (1 + math.exp(-2 * (x-1)))
+        except:
+            return "-"
+
+    def _transform_floats(self, df):
+        df_transformed = df.copy()
+        # Apply transformation row by row
+        for i, row in df_transformed.iterrows():
+            # Apply sigmoid only to numeric values in the row
+            df_transformed.loc[i] = row.apply(lambda x: self._sigmoid(x) if isinstance(x, (float, int)) else x)
+        return df_transformed
 
     def to_dict(self, metric_type="F+"):
         """Converts the Eval Result to a dict for dataframe display"""
@@ -105,6 +117,7 @@ class EvalResult_MIB_SUBGRAPH:
             data_dict[f"{task.value.benchmark}_{model}"] = '-'
 
         all_scores = []
+        transformed_scores = []
         for task, task_results in self.results.items():
             for model, metrics in task_results.items():
                 col_name = f"{task}_{model}"
@@ -124,8 +137,10 @@ class EvalResult_MIB_SUBGRAPH:
                 score = area_under if metric_type == "F+" else area_from_100
                 data_dict[col_name] = round(score, 2)
                 all_scores.append(score)
+                transformed_scores.append(self._sigmoid(score))
 
         data_dict["Average"] = round(np.mean(all_scores), 2) if '-' not in data_dict.values() else '-'
+        data_dict["Score"] = round(np.mean(transformed_scores), 2) if '-' not in data_dict.values() else '-'
         return data_dict
 
 
@@ -294,7 +309,7 @@ class EvalResult_MIB_CAUSALGRAPH:
 
         # Initialize results dictionary
         results = {}
-        for task in ["MCQA"]:
+        for task in ["IOI", "MCQA", "arithmetic", "ARC-easy"]:
             results[task] = {}
 
         # Process each model's results
@@ -309,7 +324,7 @@ class EvalResult_MIB_CAUSALGRAPH:
             for intervention_data in layer_data['layer_scores']:
                 # Calculate average score for counterfactuals
                 avg_cf_score = np.mean([
-                    cf['score']
+                    cf['score'] if 'score' in cf else 0
                     for cf in intervention_data['counterfactual_scores']
                 ])
 
@@ -416,7 +431,7 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
         df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) and not pd.isna(x) else x)
 
     # Group by base method name and take the max, handling NaN values
-    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(3)
+    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(2)
 
     # Convert back to string format and reset index
     aggregated_df = aggregated_df.reset_index()
@@ -460,8 +475,8 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     for _, row in df_copy.iterrows():
         averaged_row = {'Method': row['Method']}
         for model_task, cols in model_task_groups.items():
-            averaged_row[model_task] = np.mean([row[col] for col in cols]).round(3)
-        averaged_row['Average'] = np.mean([averaged_row[mt] for mt in model_task_groups.keys()]).round(3)
+            averaged_row[model_task] = np.mean([row[col] for col in cols]).round(2)
+            averaged_row['Average'] = np.mean([averaged_row[mt] for mt in model_task_groups.keys()]).round(2)
         averaged_data.append(averaged_row)
 
     averaged_df = pd.DataFrame(averaged_data)
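One behavioural note on the read_evals.py hunk above: `cf['score'] if 'score' in cf else 0` treats a counterfactual entry with no reported score as 0 instead of raising a KeyError. A tiny sketch on made-up data, with `dict.get` shown as an equivalent spelling:

```python
import numpy as np

# Made-up counterfactual entries; the last one has no reported score
counterfactual_scores = [{"score": 0.9}, {"score": 0.7}, {}]

avg_a = np.mean([cf["score"] if "score" in cf else 0 for cf in counterfactual_scores])
avg_b = np.mean([cf.get("score", 0) for cf in counterfactual_scores])  # equivalent, terser
assert avg_a == avg_b  # both treat the missing score as 0
print(avg_a)  # ~0.533
```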
src/populate.py CHANGED
@@ -77,7 +77,7 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     numeric_df = df_copy.select_dtypes(include=['float64', 'int64'])
 
     # Group by base method name and take the max
-    aggregated_df = numeric_df.groupby(level=0).max().round(3)
+    aggregated_df = numeric_df.groupby(level=0).max().round(2)
 
     # Reset index to get Method as a column
     aggregated_df.reset_index(inplace=True)
@@ -116,7 +116,7 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     averaged_df['Method'] = method_col
 
     for col_name, cols in result_cols.items():
-        averaged_df[col_name] = df_copy[cols].mean(axis=1).round(3)
+        averaged_df[col_name] = df_copy[cols].mean(axis=1).round(2)
 
     return averaged_df