vbhat4 commited on
Commit
6594739
·
verified ·
1 Parent(s): 39ed450

Add draft application file and resources

Browse files
Files changed (7) hide show
  1. app.py +195 -0
  2. category_scores.png +0 -0
  3. eval_about.jpg +0 -0
  4. feature_comp.png +0 -0
  5. lb_stats.png +0 -0
  6. leaderboard.csv +14 -0
  7. theme.json +1 -0
app.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import gradio as gr
import numpy as np
import pandas as pd
import scipy.stats as st
5
+
6
# CSV file that persists the leaderboard between app restarts.
LEADERBOARD_FILE = "leaderboard.csv"

def get_leaderboard_df():
    """Load the leaderboard CSV and return it sorted by descending Score.

    Returns:
        A pandas DataFrame ordered best-to-worst with a fresh 0..n-1 index.
    """
    board = pd.read_csv(LEADERBOARD_FILE)
    # Highest score first; drop the pre-sort index so row numbers read as ranks.
    return board.sort_values(by=["Score"], ascending=False).reset_index(drop=True)
13
+
14
def get_model_stats(uploaded_df):
    """Compute a model's overall score and a bootstrapped 95% confidence interval.

    Args:
        uploaded_df: DataFrame with an "avg_score" column of per-instruction
            scores (one row per evaluated instruction).

    Returns:
        Tuple of (score rounded to 2 decimals, CI string formatted as
        "+upper_offset/-lower_offset" relative to the mean).
    """
    # .to_numpy() replaces the roundabout np.array(list(series)) conversion.
    scores = uploaded_df["avg_score"].to_numpy()
    overall_score = scores.mean()
    bootstrap_res = st.bootstrap((scores,),
                                 np.mean,
                                 confidence_level=0.95,
                                 n_resamples=10000,
                                 method="percentile")
    ci_high = bootstrap_res.confidence_interval.high
    ci_low = bootstrap_res.confidence_interval.low

    # Report the CI as offsets from the mean, e.g. "+0.06/-0.07".
    formatted_upper_diff = str(round(ci_high - overall_score, 2))
    formatted_lower_diff = str(round(overall_score - ci_low, 2))

    formatted_score = round(overall_score, 2)
    formatted_ci = f"+{formatted_upper_diff}/-{formatted_lower_diff}"

    return (formatted_score, formatted_ci)
32
+
33
def process_upload(file):
    """Validate an uploaded evaluation CSV and append the model to the leaderboard.

    Args:
        file: Uploaded file object (as provided by gr.File); only its ``.name``
            path attribute is read.

    Returns:
        A status message string describing success or the validation failure.
    """
    uploaded_df = pd.read_csv(file.name).dropna()
    if "avg_score" not in uploaded_df.columns:
        return "Upload failed: file must have column 'avg_score'."
    overall_score, confidence_interval = get_model_stats(uploaded_df)
    leaderboard_df = get_leaderboard_df()
    # The uploaded file's base name (without extension) doubles as the model name.
    # Robust replacement for split("gradio/")[1].split("/")[1], which raised
    # IndexError on any path lacking a "gradio/" temp-directory segment.
    model_name = os.path.splitext(os.path.basename(file.name))[0]
    new_entry = {"Model": model_name, "Score": overall_score, "95% CI": confidence_interval}
    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
    leaderboard_df = pd.concat([leaderboard_df, pd.DataFrame([new_entry])], ignore_index=True)
    leaderboard_df.to_csv(LEADERBOARD_FILE, index=False)
    return "Upload complete! The leaderboard has been updated."
44
+
45
#theme = gr.themes.Default(radius_size = "none")
def create_ui():
    """Build the Gradio UI for the RubricEval leaderboard and start the server.

    Lays out a logo header plus three tabs (Leaderboard / About / Submit Model),
    then calls demo.launch(), so this function blocks until the server exits.
    """
    text_size = gr.themes.sizes.text_lg
    # load theme from theme.json
    theme = gr.themes.Default.load("theme.json")
    # set text size to large
    theme.text_size = text_size
    with gr.Blocks(theme = theme) as demo:
        # Header: SAIL and CRFM logos rendered side by side (static, no download).
        with gr.Row():
            gr.Image("https://ai.stanford.edu/wp-content/themes/sail/img/logo.png",
                     show_label=False,
                     show_download_button=False,
                     container=False,
                     min_width=200,
                     scale=0)
            gr.Image("https://crfm.stanford.edu/static/img/header/crfm-rgb.png",
                     show_label = False,
                     show_download_button = False,
                     container = False,
                     min_width = 200,
                     scale = 0)
        gr.Markdown(
            """
            # **RubricEval: A Scalable Human-LLM Evaluation Framework for Open-Ended Tasks**
            ######
            """)
        with gr.TabItem("Leaderboard"):
            # Leaderboard table re-reads leaderboard.csv so new submissions appear
            # without a manual reload.
            # NOTE(review): gr.Timer(5) is passed as the second positional argument
            # of gr.Dataframe — presumably the periodic-refresh trigger; confirm
            # against the installed Gradio version's Dataframe signature.
            overall_leaderboard_table = gr.Dataframe(get_leaderboard_df,
                                                     gr.Timer(5),
                                                     column_widths = ["33.3%", "33.3%", "33.3%"],
                                                     height = 600)
            gr.Markdown(
                """
                ######

                ## RubricEval leaderboard statistics (Overall)
                """
            )
            gr.Image("lb_stats.png",
                     show_label = False,
                     show_download_button = False,
                     width = 800)
            gr.Markdown(
                """
                ######

                ## RubricEval scores by category
                """
            )
            gr.Image("category_scores.png",
                     show_label = False,
                     show_download_button = False)
        with gr.TabItem("About"):
            # Static explainer content: one overview image plus collapsible sections.
            gr.Image("eval_about.jpg",
                     show_label = False,
                     show_download_button = False)
            with gr.Accordion("What is RubricEval?"):
                gr.Markdown(
                    """
                    ######
                    #### Overview
                    RubricEval is a framework for evaluating instruction-following models.
                    The core idea is to create example-specific rubrics designed by human experts, which are then applied by an GPT-4o to evaluate model outputs at scale. This process results in more scalable, trustworthy, and interpretable evaluations of language models.

                    #### Features
                    **Open-Ended:** The responses of chat models are open-ended in nature, and a small set of reference
                    answers often can’t capture all acceptable responses. This is a key limitation of reference-based
                    evaluators like BLEU and BERTScore.

                    **Multidimensional:** Responses can be good and bad in different ways, which isn’t captured by "head
                    to head" evaluators like Chatbot Arena and AlpacaEval that simply decide if one response is better
                    than another generally.

                    **Absolute:** Evaluators like Chatbot Arena and AlpacaEval use win rates based on pairwise comparisons.
                    This means that we don’t know how good a model is in absolute terms. For example, a model may
                    have a low win rate against GPT-4o but still be formidable, and the highest win rate model may not
                    be perfect despite topping the leaderboard.

                    **Varying Criteria:** The criteria for what makes a good response is different for each instruction. While
                    HELM Instruct is open-ended, multidimensional, and absolute, it uses the same set of scoring criteria
                    for each instruction, missing nuances at the instruction level. Most pairwise comparison evaluators
                    may implicitly consider varying criteria for each instruction, but these criteria are not explicitly laid
                    out (WildBench is a notable exception).

                    **Feedback:** To the best of our knowledge, no current language model evaluation system provides
                    textual feedback on a model’s overall strengths and weaknesses with respect to some set of
                    instructions. However, we believe that such feedback would be highly valuable for model developers.
                    Evaluation is a key piece of iterative model development, and textual feedback could provide insight
                    on what exactly needs to be improved rather than solely a score which is hard to interpret.
                    ######
                    """)
                gr.Image("feature_comp.png",
                         show_label = False,
                         show_download_button = False)
            with gr.Accordion("Where do evaluation instructions come from?"):
                gr.Markdown(
                    """
                    ######
                    We utilize a set of approximately 1,000 instructions from WildBench ([https://huggingface.co/spaces/allenai/WildBench](https://huggingface.co/spaces/allenai/WildBench)) which was made publicly available. From this, 392 of the hardest instructions were chosen via a GPT-4 based pairwise comparison method.

                    Using the WildBench dataset has three primary benefits:

                    1) It contains a manually curated selection of instructions from real users.

                    2) The instructions are well spread out across 11 categories, which is useful for benchmarking.

                    3) Each instruction comes with user-defined criteria of what they’re looking for, which we can make use of directly in our framework
                    ######
                    """)
            with gr.Accordion("How does RubricEval correlate with human preferences?"):
                gr.Markdown(
                    """
                    ######
                    We used RubricEval to score 13 leading large language models across 11 categories and 392 instructions from WildBench.

                    Notably, the ranking of these models based on RubricEval scores correlates highly with the ranking of the same models using Chatbot Arena ELO ratings (spearman ρ = 0.98).
                    The main discordance is in the ranking of Claude 3 Opus (which is ranked relatively lower by RubricEval compared to Chatbot Arena).
                    RubricEval’s correlation of ρ = 0.98 with human preferences ties length-corrected AlpacaEval’s record 0.98 correlation, while being higher than regular AlpacaEval (ρ = 0.94), MT-Bench (ρ = 0.94), and MMLU (ρ = 0.87).
                    ######
                    """)
            with gr.Accordion("Additional details"):
                gr.Markdown(
                    """
                    ######
                    See our detailed report at [insert blog link].
                    ######
                    """)
            with gr.Accordion("Citation"):
                gr.Markdown(
                    """
                    ######
                    [insert citation]
                    ######
                    """)
        with gr.TabItem("Submit Model"):
            gr.Markdown(
                """
                ######
                #### Want to add a model to this leaderboard?
                #### 1. Run RubricEval locally for <$x (see [insert github link])
                #### 2. Upload the evaluation file generated by RubricEval below. Note: the file name will be used as the model name.
                #### 3. Wait ~5 seconds and refresh the leaderboard page to see that your model has been added!
                ######
                """)
            # Single-CSV upload widget; process_upload appends the model to the
            # leaderboard as a side effect (no output component is wired up).
            model_submission = gr.File(file_types = [".csv"], file_count = "single")
            model_submission.upload(fn = process_upload, inputs = [model_submission], outputs = [])

    # Launching here couples UI construction with serving; returns when the
    # server stops.
    demo.launch()
193
+
194
if __name__ == "__main__":
    # Build the UI and serve it; create_ui() calls demo.launch() and blocks.
    create_ui()
category_scores.png ADDED
eval_about.jpg ADDED
feature_comp.png ADDED
lb_stats.png ADDED
leaderboard.csv ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Score,95% CI
2
+ GPT-4 Omni,3.18,+0.06/-0.06
3
+ GPT-4 Turbo,3.1,+0.06/-0.06
4
+ Gemini 1.5 Pro,3.06,+0.07/-0.07
5
+ Gemini 1.5 Flash,2.98,+0.07/-0.07
6
+ Llama 3 70B,2.9,+0.07/-0.07
7
+ Claude 3 Opus,2.86,+0.08/-0.08
8
+ Claude 3 Sonnet,2.79,+0.08/-0.08
9
+ Claude 3 Haiku,2.73,+0.08/-0.08
10
+ Gemini 1.0 Pro,2.56,+0.07/-0.07
11
+ Llama 3 8B,2.56,+0.07/-0.07
12
+ GPT-3.5 Turbo,2.52,+0.08/-0.08
13
+ Gemma 7B,2.14,+0.07/-0.07
14
+ Gemma 2B,1.83,+0.16/-0.16
theme.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"theme": {"text_size": "20px", "background_fill_primary": "white", "background_fill_primary_dark": "*neutral_950", "background_fill_secondary": "*neutral_50", "background_fill_secondary_dark": "*neutral_900", "block_background_fill": "*background_fill_primary", "block_background_fill_dark": "*neutral_800", "block_border_color": "*border_color_primary", "block_border_color_dark": "*border_color_primary", "block_border_width": "1px", "block_border_width_dark": "1px", "block_info_text_color": "*body_text_color_subdued", "block_info_text_color_dark": "*body_text_color_subdued", "block_info_text_size": "*text_sm", "block_info_text_weight": "400", "block_label_background_fill": "*background_fill_primary", "block_label_background_fill_dark": "*background_fill_secondary", "block_label_border_color": "*border_color_primary", "block_label_border_color_dark": "*border_color_primary", "block_label_border_width": "1px", "block_label_border_width_dark": "1px", "block_label_margin": "0", "block_label_padding": "*spacing_sm *spacing_lg", "block_label_radius": "calc(*radius_lg - 1px) 0 calc(*radius_lg - 1px) 0", "block_label_right_radius": "0 calc(*radius_lg - 1px) 0 calc(*radius_lg - 1px)", "block_label_shadow": "*block_shadow", "block_label_text_color": "*neutral_500", "block_label_text_color_dark": "*neutral_200", "block_label_text_size": "*text_sm", "block_label_text_weight": "400", "block_padding": "*spacing_xl calc(*spacing_xl + 2px)", "block_radius": "*radius_lg", "block_shadow": "none", "block_shadow_dark": "none", "block_title_background_fill": "none", "block_title_background_fill_dark": "none", "block_title_border_color": "none", "block_title_border_color_dark": "none", "block_title_border_width": "0px", "block_title_border_width_dark": "0px", "block_title_padding": "0", "block_title_radius": "none", "block_title_text_color": "*neutral_500", "block_title_text_color_dark": "*neutral_200", "block_title_text_size": "*text_md", "block_title_text_weight": "400", 
"body_background_fill": "*background_fill_primary", "body_background_fill_dark": "*background_fill_primary", "body_text_color": "*neutral_700", "body_text_color_dark": "*neutral_200", "body_text_color_subdued": "*neutral_400", "body_text_color_subdued_dark": "*neutral_500", "body_text_size": "*text_md", "body_text_weight": "400", "border_color_accent": "*primary_300", "border_color_accent_dark": "*neutral_600", "border_color_primary": "*neutral_200", "border_color_primary_dark": "*neutral_700", "button_border_width": "*input_border_width", "button_border_width_dark": "*input_border_width", "button_cancel_background_fill": "*button_secondary_background_fill", "button_cancel_background_fill_dark": "*button_secondary_background_fill", "button_cancel_background_fill_hover": "*button_cancel_background_fill", "button_cancel_background_fill_hover_dark": "*button_cancel_background_fill", "button_cancel_border_color": "*button_secondary_border_color", "button_cancel_border_color_dark": "*button_secondary_border_color", "button_cancel_border_color_hover": "*button_cancel_border_color", "button_cancel_border_color_hover_dark": "*button_cancel_border_color", "button_cancel_text_color": "*button_secondary_text_color", "button_cancel_text_color_dark": "*button_secondary_text_color", "button_cancel_text_color_hover": "*button_cancel_text_color", "button_cancel_text_color_hover_dark": "*button_cancel_text_color", "button_large_padding": "*spacing_lg calc(2 * *spacing_lg)", "button_large_radius": "*radius_lg", "button_large_text_size": "*text_lg", "button_large_text_weight": "500", "button_primary_background_fill": "*primary_200", "button_primary_background_fill_dark": "*primary_700", "button_primary_background_fill_hover": "*button_primary_background_fill", "button_primary_background_fill_hover_dark": "*button_primary_background_fill", "button_primary_border_color": "*primary_200", "button_primary_border_color_dark": "*primary_600", "button_primary_border_color_hover": 
"*button_primary_border_color", "button_primary_border_color_hover_dark": "*button_primary_border_color", "button_primary_text_color": "*primary_600", "button_primary_text_color_dark": "white", "button_primary_text_color_hover": "*button_primary_text_color", "button_primary_text_color_hover_dark": "*button_primary_text_color", "button_secondary_background_fill": "*neutral_200", "button_secondary_background_fill_dark": "*neutral_600", "button_secondary_background_fill_hover": "*neutral_300", "button_secondary_background_fill_hover_dark": "*neutral_500", "button_secondary_border_color": "*neutral_200", "button_secondary_border_color_dark": "*neutral_600", "button_secondary_border_color_hover": "*button_secondary_border_color", "button_secondary_border_color_hover_dark": "*button_secondary_border_color", "button_secondary_text_color": "*neutral_700", "button_secondary_text_color_dark": "white", "button_secondary_text_color_hover": "*button_secondary_text_color", "button_secondary_text_color_hover_dark": "*button_secondary_text_color", "button_shadow": "none", "button_shadow_active": "none", "button_shadow_hover": "none", "button_small_padding": "*spacing_sm calc(2 * *spacing_sm)", "button_small_radius": "*radius_lg", "button_small_text_size": "*text_md", "button_small_text_weight": "400", "button_transition": "background-color 0.2s ease", "checkbox_background_color": "*background_fill_primary", "checkbox_background_color_dark": "*neutral_800", "checkbox_background_color_focus": "*checkbox_background_color", "checkbox_background_color_focus_dark": "*checkbox_background_color", "checkbox_background_color_hover": "*checkbox_background_color", "checkbox_background_color_hover_dark": "*checkbox_background_color", "checkbox_background_color_selected": "*secondary_600", "checkbox_background_color_selected_dark": "*secondary_600", "checkbox_border_color": "*neutral_300", "checkbox_border_color_dark": "*neutral_700", "checkbox_border_color_focus": "*secondary_500", 
"checkbox_border_color_focus_dark": "*secondary_500", "checkbox_border_color_hover": "*neutral_300", "checkbox_border_color_hover_dark": "*neutral_600", "checkbox_border_color_selected": "*secondary_600", "checkbox_border_color_selected_dark": "*secondary_600", "checkbox_border_radius": "*radius_sm", "checkbox_border_width": "*input_border_width", "checkbox_border_width_dark": "*input_border_width", "checkbox_check": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M12.207 4.793a1 1 0 010 1.414l-5 5a1 1 0 01-1.414 0l-2-2a1 1 0 011.414-1.414L6.5 9.086l4.293-4.293a1 1 0 011.414 0z'/%3e%3c/svg%3e\")", "checkbox_label_background_fill": "*button_secondary_background_fill", "checkbox_label_background_fill_dark": "*button_secondary_background_fill", "checkbox_label_background_fill_hover": "*button_secondary_background_fill_hover", "checkbox_label_background_fill_hover_dark": "*button_secondary_background_fill_hover", "checkbox_label_background_fill_selected": "*checkbox_label_background_fill", "checkbox_label_background_fill_selected_dark": "*checkbox_label_background_fill", "checkbox_label_border_color": "*border_color_primary", "checkbox_label_border_color_dark": "*border_color_primary", "checkbox_label_border_color_hover": "*checkbox_label_border_color", "checkbox_label_border_color_hover_dark": "*checkbox_label_border_color", "checkbox_label_border_width": "*input_border_width", "checkbox_label_border_width_dark": "*input_border_width", "checkbox_label_gap": "*spacing_lg", "checkbox_label_padding": "*spacing_md calc(2 * *spacing_md)", "checkbox_label_shadow": "none", "checkbox_label_text_color": "*body_text_color", "checkbox_label_text_color_dark": "*body_text_color", "checkbox_label_text_color_selected": "*checkbox_label_text_color", "checkbox_label_text_color_selected_dark": "*checkbox_label_text_color", "checkbox_label_text_size": "*text_md", "checkbox_label_text_weight": "400", "checkbox_shadow": 
"*input_shadow", "color_accent": "*primary_500", "color_accent_soft": "*primary_50", "color_accent_soft_dark": "*neutral_700", "container_radius": "*radius_lg", "embed_radius": "*radius_md", "error_background_fill": "#fee2e2", "error_background_fill_dark": "*background_fill_primary", "error_border_color": "#fecaca", "error_border_color_dark": "*border_color_primary", "error_border_width": "1px", "error_border_width_dark": "1px", "error_text_color": "#ef4444", "error_text_color_dark": "#ef4444", "form_gap_width": "0px", "input_background_fill": "*neutral_100", "input_background_fill_dark": "*neutral_700", "input_background_fill_focus": "*secondary_500", "input_background_fill_focus_dark": "*secondary_600", "input_background_fill_hover": "*input_background_fill", "input_background_fill_hover_dark": "*input_background_fill", "input_border_color": "*border_color_primary", "input_border_color_dark": "*border_color_primary", "input_border_color_focus": "*secondary_300", "input_border_color_focus_dark": "*neutral_700", "input_border_color_hover": "*input_border_color", "input_border_color_hover_dark": "*input_border_color", "input_border_width": "0px", "input_border_width_dark": "0px", "input_padding": "*spacing_xl", "input_placeholder_color": "*neutral_400", "input_placeholder_color_dark": "*neutral_500", "input_radius": "*radius_lg", "input_shadow": "none", "input_shadow_dark": "none", "input_shadow_focus": "*input_shadow", "input_shadow_focus_dark": "*input_shadow", "input_text_size": "*text_md", "input_text_weight": "400", "layout_gap": "*spacing_xxl", "link_text_color": "*secondary_600", "link_text_color_active": "*secondary_600", "link_text_color_active_dark": "*secondary_500", "link_text_color_dark": "*secondary_500", "link_text_color_hover": "*secondary_700", "link_text_color_hover_dark": "*secondary_400", "link_text_color_visited": "*secondary_500", "link_text_color_visited_dark": "*secondary_600", "loader_color": "*color_accent", "loader_color_dark": 
"*color_accent", "name": "base", "neutral_100": "#f5f5f4", "neutral_200": "#e7e5e4", "neutral_300": "#d6d3d1", "neutral_400": "#a8a29e", "neutral_50": "#fafaf9", "neutral_500": "#78716c", "neutral_600": "#57534e", "neutral_700": "#44403c", "neutral_800": "#292524", "neutral_900": "#1c1917", "neutral_950": "#0f0e0d", "panel_background_fill": "*background_fill_secondary", "panel_background_fill_dark": "*background_fill_secondary", "panel_border_color": "*border_color_primary", "panel_border_color_dark": "*border_color_primary", "panel_border_width": "0", "panel_border_width_dark": "0", "primary_100": "#e0f2fe", "primary_200": "#bae6fd", "primary_300": "#7dd3fc", "primary_400": "#38bdf8", "primary_50": "#f0f9ff", "primary_500": "#0ea5e9", "primary_600": "#0284c7", "primary_700": "#0369a1", "primary_800": "#075985", "primary_900": "#0c4a6e", "primary_950": "#0b4165", "prose_header_text_weight": "500", "prose_text_size": "*text_md", "prose_text_weight": "400", "radio_circle": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle cx='8' cy='8' r='3'/%3e%3c/svg%3e\")", "radius_lg": "3px", "radius_md": "3px", "radius_sm": "3px", "radius_xl": "3px", "radius_xs": "3px", "radius_xxl": "3px", "radius_xxs": "3px", "secondary_100": "#e0f2fe", "secondary_200": "#bae6fd", "secondary_300": "#7dd3fc", "secondary_400": "#38bdf8", "secondary_50": "#f0f9ff", "secondary_500": "#0ea5e9", "secondary_600": "#0284c7", "secondary_700": "#0369a1", "secondary_800": "#075985", "secondary_900": "#0c4a6e", "secondary_950": "#0b4165", "section_header_text_size": "*text_md", "section_header_text_weight": "400", "shadow_drop": "rgba(0,0,0,0.05) 0px 1px 2px 0px", "shadow_drop_lg": "0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1)", "shadow_inset": "rgba(0,0,0,0.05) 0px 2px 4px 0px inset", "shadow_spread": "3px", "shadow_spread_dark": "1px", "slider_color": "*primary_600", "slider_color_dark": "*primary_600", "spacing_lg": "8px", 
"spacing_md": "6px", "spacing_sm": "4px", "spacing_xl": "10px", "spacing_xs": "2px", "spacing_xxl": "16px", "spacing_xxs": "1px", "stat_background_fill": "*primary_300", "stat_background_fill_dark": "*primary_500", "table_border_color": "*neutral_300", "table_border_color_dark": "*neutral_700", "table_even_background_fill": "white", "table_even_background_fill_dark": "*neutral_950", "table_odd_background_fill": "*neutral_50", "table_odd_background_fill_dark": "*neutral_900", "table_radius": "*radius_lg", "table_row_focus": "*color_accent_soft", "table_row_focus_dark": "*color_accent_soft", "text_lg": "20px", "text_md": "16px", "text_sm": "14px", "text_xl": "24px", "text_xs": "12px", "text_xxl": "28px", "text_xxs": "10px"}, "version": "0.0.1"}