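"""Builds the leaderboard table: aggregates each submission's results dataset into one pandas DataFrame."""
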
import pandas as pd
from datasets import get_dataset_config_names, load_dataset
from datasets.exceptions import DatasetNotFoundError
from tqdm.auto import tqdm

from src.display.utils import AutoEvalColumn
from src.envs import TOKEN
from src.logger import get_logger

logger = get_logger(__name__)

def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
    """
    @brief Creates a DataFrame from all the individual experiment results.
    """
    empty_df = pd.DataFrame(
        columns=[
            AutoEvalColumn.system.name,
            AutoEvalColumn.organization.name,
            AutoEvalColumn.success_rate_overall.name,
            AutoEvalColumn.success_rate_tier1.name,
            AutoEvalColumn.success_rate_tier2.name,
            AutoEvalColumn.submitted_on.name,
        ]
    )
    try:
        configs = get_dataset_config_names(
            results_dataset_name,
            token=TOKEN,
        )
    except (DatasetNotFoundError, FileNotFoundError):
        # Results dataset is missing or unreadable: return an empty DataFrame with the expected columns.
        logger.warning("Failed to load configuration", exc_info=True)
        return empty_df
    if configs == ["default"]:
        # A lone "default" config means no submission has been recorded yet.
        logger.info("Dataset has only the default config; treating it as empty")
        return empty_df

    rows = []
    for submission_id in tqdm(
        configs,
        total=len(configs),
        desc="Processing Submission Results",
    ):
        # Each dataset config holds one submission's per-task results.
        submission_ds = load_dataset(
            results_dataset_name,
            submission_id,
            split="train",
            token=TOKEN,
        )
        submission_df = pd.DataFrame(submission_ds)
        if submission_df.empty or "did_pass" not in submission_df.columns or submission_df["did_pass"].isna().any():
            logger.warning(f"Skipping {submission_id}: empty results or missing/NaN did_pass values")
            continue
        assert submission_df["tier"].isin([1, 2]).all(), "Invalid tier values found in submission_df"
        success_rate = 100 * submission_df["did_pass"].mean()
        tier1_success_rate = 100 * submission_df[submission_df["tier"] == 1]["did_pass"].mean()
        tier2_success_rate = 100 * submission_df[submission_df["tier"] == 2]["did_pass"].mean()
        # Submission-level metadata is taken from the first row (assumed constant across the submission).
        first_row = submission_df.iloc[0]
        rows.append(
            {
                AutoEvalColumn.system.name: first_row["system_name"],
                AutoEvalColumn.organization.name: first_row["organization"],
                AutoEvalColumn.success_rate_overall.name: success_rate,
                AutoEvalColumn.success_rate_tier1.name: tier1_success_rate,
                AutoEvalColumn.success_rate_tier2.name: tier2_success_rate,
                AutoEvalColumn.submitted_on.name: pd.to_datetime(first_row["submission_ts"]).strftime("%Y-%m-%d %H:%M"),
            }
        )

    full_df = pd.DataFrame(rows)
    logger.info(f"Loaded results df with {len(full_df)} entries")
    if full_df.empty:
        # Every submission was skipped (or there were none): avoid sorting a column-less frame.
        return empty_df

    # TODO: Forbid multiple submissions under the same name?
    # Keep only the latest entry per unique (System, Organization) pair
    final_df = (
        full_df.sort_values(AutoEvalColumn.submitted_on.name, ascending=False)
        .drop_duplicates(subset=[AutoEvalColumn.system.name, AutoEvalColumn.organization.name], keep="first")
        .sort_values(by=[AutoEvalColumn.success_rate_overall.name], ascending=False)
        .reset_index(drop=True)
    )
    cols_to_round = [
        AutoEvalColumn.success_rate_overall.name,
        AutoEvalColumn.success_rate_tier1.name,
        AutoEvalColumn.success_rate_tier2.name,
    ]
    final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)
    return final_df
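

if __name__ == "__main__":
    # Minimal usage sketch: "my-org/leaderboard-results" is a placeholder repo id,
    # not the real results dataset; substitute the one configured for this Space.
    leaderboard_df = get_leaderboard_df("my-org/leaderboard-results")
    print(leaderboard_df.head())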