Commit: e00a798
Parent: 5f7ca36
Commit message: populate leaderboard df

Files changed:
- app.py: +2, -11
- src/display/utils.py: +3, -1
- src/populate.py: +47, -9
app.py CHANGED

@@ -47,6 +47,8 @@ def restart_space():
 
 
 lbdb = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO, split=SPLIT)
+leaderboard_df = get_leaderboard_df(RESULTS_REPO)
+
 
 logger.info("Initialized LBDB")
 
@@ -94,17 +96,6 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboar-tab-table", id=0):
-            # TODO: activate
-            # leaderboard_df = get_leaderboard_df
-            # dummy df
-            leaderboard_df = pd.DataFrame(
-                {
-                    AutoEvalColumn.system.name: ["Model A", "Model B", "Model C"], # AutoEvalColumn.model.name
-                    AutoEvalColumn.system_type.name: ["LLM", "LLM+Agent", "N/A"], # AutoEvalColumn.model_type.name
-                    AutoEvalColumn.organization.name: ["Org A", "Org B", "Org C"], # AutoEvalColumn.organization.name
-                    AutoEvalColumn.success_rate.name: [0.01, 0.0, 0.005],
-                }
-            )
             leaderboard = init_leaderboard(leaderboard_df)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
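Note on the app.py change: with the hard-coded dummy dataframe removed, the leaderboard tab renders whatever get_leaderboard_df(RESULTS_REPO) returns at startup. If no submission yields a valid row, the dataframe built in src/populate.py has no columns and its final sort_values call raises a KeyError, so the Space would fail to boot. A minimal sketch of a guard app.py could add (hypothetical, not part of this commit; it assumes app.py keeps its existing imports of RESULTS_REPO and get_leaderboard_df):

# Hypothetical guard, not part of this commit. RESULTS_REPO and
# get_leaderboard_df are the names app.py already uses; the fallback
# frame simply mirrors the AutoEvalColumn display names.
import pandas as pd

from src.display.utils import AutoEvalColumn

EXPECTED_COLS = [
    AutoEvalColumn.system.name,
    AutoEvalColumn.system_type.name,
    AutoEvalColumn.organization.name,
    AutoEvalColumn.success_rate.name,
    AutoEvalColumn.problems_solved.name,
    AutoEvalColumn.submitted_on.name,
]

try:
    leaderboard_df = get_leaderboard_df(RESULTS_REPO)
except KeyError:
    # Raised by the final sort_values when no submission produced a row.
    leaderboard_df = pd.DataFrame(columns=EXPECTED_COLS)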
src/display/utils.py CHANGED

@@ -57,13 +57,15 @@ class ColumnContent:
 # # We use make dataclass to dynamically fill the scores from Tasks
 # AutoEvalColumn = make_classvar_dataclass("AutoEvalColumn", auto_eval_column_fields)
 
+
 @dataclass(frozen=True)
 class AutoEvalColumn:
     system = ColumnContent("System Name", "markdown", True, never_hidden=True)
     system_type = ColumnContent("System Type", "str", True)
     organization = ColumnContent("Organization", "str", True, never_hidden=True)
     success_rate = ColumnContent("Success Rate (%)", "number", True)
-
+    problems_solved = ColumnContent("Problems Solved", "number", True)
+    submitted_on = ColumnContent("Submitted On", "datetime", True)
 
 
 ## For the queue columns in the submission tab
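Note on the two new columns: the row dictionaries built in src/populate.py use the display names as keys, so "Problems Solved" and "Submitted On" here must match those strings exactly. A quick consistency check, a sketch only: check_leaderboard_columns is a hypothetical helper, and it assumes ColumnContent keeps the display name in a .name attribute, as the existing AutoEvalColumn.success_rate.name usage suggests.

# Sketch: collect every ColumnContent declared on AutoEvalColumn and check
# that a populated leaderboard dataframe carries a column for each one.
import pandas as pd

from src.display.utils import AutoEvalColumn, ColumnContent


def check_leaderboard_columns(df: pd.DataFrame) -> None:
    expected = [
        attr.name
        for attr in vars(AutoEvalColumn).values()
        if isinstance(attr, ColumnContent)
    ]
    # expected -> ["System Name", "System Type", "Organization",
    #              "Success Rate (%)", "Problems Solved", "Submitted On"]
    missing = [name for name in expected if name not in df.columns]
    assert not missing, f"leaderboard dataframe is missing columns: {missing}"

Running this against the dataframe returned by get_leaderboard_df would catch a renamed key in either file before it reaches the UI.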
src/populate.py CHANGED

@@ -2,24 +2,62 @@ import json
 import os
 
 import pandas as pd
+from datasets import load_dataset, get_dataset_config_names
+from tqdm.auto import tqdm
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.envs import TOKEN
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.logger import get_logger
 
+logger = get_logger(__name__)
 
-
+
+def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
 
-
-
-
+    configs = get_dataset_config_names(results_dataset_name, token=TOKEN)
+
+    rows = []
+    for submission_id in tqdm(configs, total=len(configs), desc="Processing Submission Results"):
+        submission_ds = load_dataset(results_dataset_name, submission_id, split="train", token=TOKEN)
+        submission_df = pd.DataFrame(submission_ds)
+
+        if submission_df.empty or "did_pass" not in submission_df.columns or submission_df.did_pass.isna().any():
+            logger.warning(f"Skipping {submission_id} due to invalid did_pass values")
+            continue
+
+        success_rate = 100 * submission_df["did_pass"].mean()
+        num_solved = submission_df["did_pass"].sum()
+        first_row = submission_df.iloc[0]
+
+        rows.append(
+            {
+                "System Name": first_row["system_name"],
+                "System Type": first_row["system_type"],
+                "Organization": first_row["organization"],
+                "Success Rate (%)": success_rate,
+                "Problems Solved": num_solved,
+                "Submitted On": pd.to_datetime(first_row.get("submission_ts", "1970-01-01T00:00:00")),
+            }
+        )
+
+    full_df = pd.DataFrame(rows)
+
+    # TODO: forbid multiple submissions under the same name?
+    # Keep only the latest entry per unique (System Name, System Type, Organization) triplet
+    final_df = (
+        full_df.sort_values("Submitted On", ascending=False)
+        .drop_duplicates(subset=["System Name", "System Type", "Organization"], keep="first")
+        .sort_values(by=[AutoEvalColumn.success_rate.name], ascending=False)
+        .reset_index(drop=True)
+    )
+
+    cols_to_round = ["Success Rate (%)"]
+    final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)
 
-
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
+    return final_df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
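Note on the dedup step in get_leaderboard_df: sorting by "Submitted On" in descending order before drop_duplicates(keep="first") means that when the same (System Name, System Type, Organization) triplet shows up in several submission configs, only the most recent one stays on the board, and the final sort then orders the survivors by success rate. A toy illustration with made-up values:

# Toy illustration of the dedup step: two submissions from the same
# (System Name, System Type, Organization) triplet; only the newer row
# survives drop_duplicates(keep="first") after sorting by "Submitted On".
import pandas as pd

full_df = pd.DataFrame(
    {
        "System Name": ["Model A", "Model A", "Model B"],
        "System Type": ["LLM", "LLM", "LLM+Agent"],
        "Organization": ["Org A", "Org A", "Org B"],
        "Success Rate (%)": [1.0, 2.5, 0.5],
        "Problems Solved": [2, 5, 1],
        "Submitted On": pd.to_datetime(["2024-01-01", "2024-02-01", "2024-01-15"]),
    }
)

final_df = (
    full_df.sort_values("Submitted On", ascending=False)
    .drop_duplicates(subset=["System Name", "System Type", "Organization"], keep="first")
    .sort_values(by="Success Rate (%)", ascending=False)
    .reset_index(drop=True)
)
# final_df keeps Model A's 2024-02-01 row (2.5%, 5 solved) and Model B's
# only row (0.5%, 1 solved), ordered by success rate.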