add trust&safety table
- app.py +110 -2
- crm-results/hf_leaderboard_ts.csv +19 -0
- src/display/utils.py +10 -0
- src/populate.py +6 -1
app.py
CHANGED
@@ -13,9 +13,12 @@ from src.display.utils import ( # EVAL_TYPES,; WeightType,; BENCHMARK_COLS,; EV
     COLS,
     COST_COLS,
     COST_TYPES,
+    TS_COLS,
+    TS_TYPES,
     TYPES,
     AutoEvalColumn,
     CostEvalColumn,
+    TSEvalColumn,
     fields,
 )
 
@@ -23,11 +26,12 @@ from src.display.utils import ( # EVAL_TYPES,; WeightType,; BENCHMARK_COLS,; EV
 from src.envs import CRM_RESULTS_PATH
 from src.populate import get_leaderboard_df_crm
 
-original_df, cost_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, COST_COLS)
+original_df, cost_df, ts_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, COST_COLS)
 
 # raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 leaderboard_cost_df = cost_df.copy()
+leaderboard_ts_df = ts_df.copy()
 # leaderboard_df = leaderboard_df.style.format({"accuracy_metric_average": "{0:.2f}"})
 
 
@@ -70,6 +74,18 @@ def update_cost_table(
     return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
 
+def update_ts_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    llm_query: list,
+    llm_provider_query: list,
+):
+    filtered_df = filter_llm_func(hidden_df, llm_query)
+    filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
+    df = select_columns_ts_table(filtered_df, columns)
+    return df
+
+
 # def highlight_cols(x):
 #     df = x.copy()
 #     df.loc[:, :] = "color: black"
@@ -126,6 +142,21 @@ def init_leaderboard_cost_df(
     )
 
 
+def init_leaderboard_ts_df(
+    leaderboard_df: pd.DataFrame,
+    columns: list,
+    llm_query: list,
+    llm_provider_query: list,
+):
+
+    return update_ts_table(
+        leaderboard_df,
+        columns,
+        llm_query,
+        llm_provider_query,
+    )
+
+
 def filter_accuracy_method_func(df: pd.DataFrame, accuracy_method_query: str) -> pd.DataFrame:
     return df[df["Accuracy Method"] == accuracy_method_query]
 
@@ -177,6 +208,14 @@ def select_columns_cost_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     return filtered_df
 
 
+def select_columns_ts_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+    always_here_cols = [
+        TSEvalColumn.model.name,
+    ]
+    filtered_df = df[always_here_cols + [c for c in TS_COLS if c in df.columns and c in columns]]
+    return filtered_df
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -461,8 +500,77 @@ with demo:
                     leaderboard_table,
                     queue=True,
                 )
+        with gr.TabItem("🏅 Trust & Safety", elem_id="llm-benchmark-tab-table", id=2):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[c.name for c in fields(TSEvalColumn) if not c.hidden and not c.never_hidden],
+                            value=[
+                                c.name
+                                for c in fields(TSEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        with gr.Column():
+                            filter_llm = gr.CheckboxGroup(
+                                choices=list(ts_df["Model Name"].unique()),
+                                value=list(ts_df["Model Name"].unique()),
+                                label="Model Name",
+                                info="",
+                                interactive=True,
+                            )
+                        with gr.Column():
+                            filter_llm_provider = gr.CheckboxGroup(
+                                choices=list(ts_df["LLM Provider"].unique()),
+                                value=list(ts_df["LLM Provider"].unique()),
+                                label="LLM Provider",
+                                info="",
+                                interactive=True,
+                            )
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            leaderboard_table = gr.components.Dataframe(
+                value=init_leaderboard_ts_df(
+                    leaderboard_ts_df,
+                    shown_columns.value,
+                    filter_llm.value,
+                    filter_llm_provider.value,
+                ),
+                headers=[c.name for c in fields(TSEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TS_TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=ts_df[TS_COLS],
+                headers=TS_COLS,
+                datatype=TS_TYPES,
+                visible=False,
+            )
+
+            for selector in [
+                shown_columns,
+                filter_llm,
+                filter_llm_provider,
+            ]:
+                selector.change(
+                    update_ts_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        filter_llm,
+                        filter_llm_provider,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.Row():
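Note on the wiring above: the Trust & Safety tab reuses the app's existing update pattern. The visible Dataframe is never filtered in place; a hidden, unfiltered copy is passed as an input to every .change event, so each interaction re-filters from the full data rather than from whatever is currently displayed. A minimal standalone sketch of that pattern (component names and data here are illustrative, not the app's):

import gradio as gr
import pandas as pd

full_df = pd.DataFrame({"Model Name": ["A", "B"], "Safety": ["69%", "74%"]})

def update_table(hidden_df: pd.DataFrame, models: list) -> pd.DataFrame:
    # Filter from the full hidden frame, never from the visible table.
    return hidden_df[hidden_df["Model Name"].isin(models)]

with gr.Blocks() as demo:
    picker = gr.CheckboxGroup(choices=["A", "B"], value=["A", "B"], label="Model Name")
    visible_table = gr.Dataframe(value=full_df, interactive=False)
    hidden_table = gr.Dataframe(value=full_df, visible=False)
    picker.change(update_table, [hidden_table, picker], visible_table)

demo.launch()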
crm-results/hf_leaderboard_ts.csv
ADDED
@@ -0,0 +1,19 @@
+Model Name,Truthfulness,Safety,Privacy Zero-Shot Match Avoidance,Privacy Zero-Shot Reveal Avoidance,Privacy Five-Shot Match Avoidance,Privacy Five-Shot Reveal Avoidance,CRM Gender Bias,CRM Company Bias,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Truthfulness,,
+GPT4-o,91%,69%,100%,94%,90%,51%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91%,,
+GPT 4 Turbo,94%,74%,100%,97%,86%,74%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,79%,0.813,
+GPT 3.5 Turbo,45%,59%,100%,13%,36%,2%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,45%,,0.708 (ChatGPT)
+AI21 Jamba-Instruct,68%,65%,100%,100%,90%,81%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,68%,,
+Cohere Command Text,59%,54%,100%,84%,78%,40%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,59%,,
+Claude 3 Haiku,86%,80%,100%,98%,95%,40%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,86%,,
+Gemini Pro 1,87%,74%,100%,92%,81%,48%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,87%,,
+SF-TextBase 70B,98%,63%,100%,90%,54%,8%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,98%,,
+SF-TextSum,82%,51%,100%,89%,87%,27%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82%,,
+XGen 22B,52%,52%,100%,56%,81%,51%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,52%,,
+SF-TextBase 7B,82%,60%,100%,83%,69%,27%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82%,,
+Mistral 7B,32%,42%,100%,97%,92%,82%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,32%,0.426,
+Mixtral 8x7B,89%,59%,100%,97%,71%,55%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,89%,0.88,
+LLaMA 3 8B,96%,76%,100%,99%,92%,85%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,96%,0.598,
+LLaMA 3 70B,98%,74%,100%,98%,83%,75%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,98%,0.962,
+Gemini Pro 1.5,98%,81%,100%,97%,87%,69%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,98%,,
+Claude 3 Opus,94%,81%,100%,96%,80%,56%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,94%,,
+Cohere Command R+,84%,56%,100%,97%,76%,45%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,84%,,
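Note: the committed CSV carries spreadsheet-export residue, a long run of empty trailing columns, a duplicate "Truthfulness" header near the end, and stray cells such as "0.813" and "0.708 (ChatGPT)". The app is unaffected because both the hidden search table (ts_df[TS_COLS]) and select_columns_ts_table select only TS_COLS, but anyone loading the file directly will see pandas name the headerless columns "Unnamed: 9", "Unnamed: 10", ... and deduplicate the repeated header to "Truthfulness.1". A small cleanup sketch for standalone use:

import pandas as pd

df = pd.read_csv("crm-results/hf_leaderboard_ts.csv")
# Drop the headerless trailing columns pandas names "Unnamed: N".
df = df.loc[:, ~df.columns.str.startswith("Unnamed")]
# The duplicated "Truthfulness" header comes back as "Truthfulness.1".
df = df.drop(columns=["Truthfulness.1"], errors="ignore")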
src/display/utils.py
CHANGED
@@ -73,6 +73,13 @@ cost_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Ba
 CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=True)
 
 # Trust & Safety metrics
+ts_eval_column_dict = []
+# Init
+ts_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)])
+ts_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
+ts_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", True)])
+ts_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", True)])
+TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)
 
 
 # Scores
@@ -173,6 +180,9 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 COST_COLS = [c.name for c in fields(CostEvalColumn) if not c.hidden]
 COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden]
 
+TS_COLS = [c.name for c in fields(TSEvalColumn) if not c.hidden]
+TS_TYPES = [c.type for c in fields(TSEvalColumn) if not c.hidden]
+
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
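Note: make_dataclass turns each [attribute, type, default] triple into a frozen-dataclass field whose default is a ColumnContent instance, which is why TSEvalColumn.model.name in app.py resolves to "Model Name". ColumnContent is defined earlier in this file, outside the diff; assuming the usual leaderboard-template shape for it, the call expands to roughly this sketch:

from dataclasses import dataclass

# Assumed shape of ColumnContent (its real definition is elsewhere in utils.py).
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# What make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)
# builds, written out by hand:
@dataclass(frozen=True)
class TSEvalColumn:
    model: ColumnContent = ColumnContent("Model Name", "markdown", True, never_hidden=True)
    model_provider: ColumnContent = ColumnContent("LLM Provider", "markdown", True)
    truthfulness: ColumnContent = ColumnContent("Truthfulness", "markdown", True)
    safety: ColumnContent = ColumnContent("Safety", "markdown", True)

Since none of the four columns is hidden, TS_COLS comes out as ["Model Name", "LLM Provider", "Truthfulness", "Safety"] and TS_TYPES as four "markdown" entries, via the fields() helper this module exports (the one app.py imports).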
src/populate.py
CHANGED
@@ -30,7 +30,12 @@ def get_leaderboard_df_crm(
     leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
     leaderboard_cost_df["LLM Provider"] = leaderboard_cost_df["LLM Provider"].fillna("Google")
     leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)
-    return leaderboard_accuracy_df, leaderboard_cost_df
+
+    leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
+    leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
+    leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
+
+    return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df
 
 
 # def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame: