import os

import pandas as pd

from src.display.utils import AutoEvalColumn


def get_leaderboard_df_crm(
    crm_results_path: str, accuracy_cols: list, cost_cols: list
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Builds the accuracy, cost, and trust & safety dataframes from the individual experiment results."""
    use_case_flavor_mapping_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_flavor_mapping.csv"))

    # Salesforce fine-tuned models are excluded from every leaderboard table.
    sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]

    # Accuracy results, with each use case's cost/speed "flavor" attached.
    leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
    leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
        use_case_flavor_mapping_df[["Use Case Name", "Cost and Speed: Flavor"]].set_index("Use Case Name"),
        on="Use Case Name",
    )

    # (model, provider) lookup reused by the cost and trust & safety tables.
    ref_df = leaderboard_accuracy_df[["Model Name", "LLM Provider"]].drop_duplicates()

    # Latency/cost results, merged into the accuracy table on (model, flavor).
    leaderboard_cost_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))
    leaderboard_cost_df = leaderboard_cost_df[~leaderboard_cost_df["Model Name"].isin(sf_finetuned_models)]
    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
        leaderboard_cost_df.set_index(["Model Name", "Cost and Speed: Flavor"]),
        on=["Model Name", "Cost and Speed: Flavor"],
    )
    leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
    leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)

    # Trust & safety results, joined with the CRM bias results on model name.
    leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
    leaderboard_crm_bias_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_crm_bias.csv"))
    leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
    leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
    leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_crm_bias_df.set_index("Model Name"), on="Model Name")

    # Average the four privacy metrics (percent strings) into a single "Privacy" score.
    privacy_cols = leaderboard_ts_df[
        [
            "Privacy Zero-Shot Match Avoidance",
            "Privacy Zero-Shot Reveal Avoidance",
            "Privacy Five-Shot Match Avoidance",
            "Privacy Five-Shot Reveal Avoidance",
        ]
    ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
    leaderboard_ts_df["Privacy"] = privacy_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))

    # Keep only the point estimate of the bias score (everything before the first
    # space), dropping the confidence interval that follows it.
    leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM Bias"].transform(lambda x: x.split(" ")[0])

    # Average the four trust & safety metrics into a single "Trust & Safety" score.
    ts_cols = leaderboard_ts_df[
        [
            "Safety",
            "Privacy",
            "Truthfulness",
            "Bias No CI",
        ]
    ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
    leaderboard_ts_df["Trust & Safety"] = ts_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))

    # Rank the accuracy table by average accuracy, then trim to the display columns.
    leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
        by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
    )
    leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
    return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df
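
A minimal usage sketch follows. The results directory and the column lists passed in are assumptions for illustration, not the repository's actual configuration; `accuracy_cols` and `cost_cols` must name columns that exist in the joined CSVs.

# Hypothetical call: path and column lists are illustrative assumptions.
accuracy_df, cost_df, ts_df = get_leaderboard_df_crm(
    crm_results_path="crm_results",  # directory holding the hf_leaderboard_*.csv files
    accuracy_cols=["Model Name", "LLM Provider", "Accuracy"],  # assumed display columns
    cost_cols=["Model Name", "LLM Provider", "Cost and Speed: Flavor"],  # assumed display columns
)
print(accuracy_df.head())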