import os

import gradio as gr
import numpy as np
import pandas as pd
import scipy.stats as st

LEADERBOARD_FILE = "leaderboard.csv"


def get_leaderboard_df():
    """Load the leaderboard CSV and return it sorted by score, best first."""
    df = pd.read_csv(LEADERBOARD_FILE)
    df = df.sort_values(by=["Score"], ascending=False)
    df = df.reset_index(drop=True)
    return df


def get_model_stats(uploaded_df):
    """Compute the mean score and a bootstrapped 95% confidence interval."""
    overall_score = uploaded_df["avg_score"].mean()
    data = uploaded_df["avg_score"].to_numpy()
    bootstrap_res = st.bootstrap(
        (data,), np.mean, confidence_level=0.95, n_resamples=10000, method="percentile"
    )
    ci_high = bootstrap_res.confidence_interval.high
    ci_low = bootstrap_res.confidence_interval.low
    formatted_upper_diff = str(round(ci_high - overall_score, 2))
    formatted_lower_diff = str(round(overall_score - ci_low, 2))
    formatted_score = round(overall_score, 2)
    formatted_ci = f"+{formatted_upper_diff}/-{formatted_lower_diff}"
    return formatted_score, formatted_ci


def process_upload(file):
    """Validate an uploaded results CSV and append the model to the leaderboard."""
    uploaded_df = pd.read_csv(file.name).dropna()
    if "avg_score" not in uploaded_df.columns:
        return "Upload failed: file must have column 'avg_score'."
    overall_score, confidence_interval = get_model_stats(uploaded_df)
    leaderboard_df = get_leaderboard_df()
    # The uploaded file's name (without extension) is used as the model name.
    model_name = os.path.splitext(os.path.basename(file.name))[0]
    new_entry = {"Model": model_name, "Score": overall_score, "95% CI": confidence_interval}
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
    leaderboard_df = pd.concat([leaderboard_df, pd.DataFrame([new_entry])], ignore_index=True)
    leaderboard_df.to_csv(LEADERBOARD_FILE, index=False)
    return "Upload complete! The leaderboard has been updated."
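# A minimal local sanity check for the statistics above (an illustrative sketch
# only; this helper is not part of the app and is never called by it). It assumes
# a toy results frame shaped like a submission CSV, i.e. one "avg_score" value per
# evaluated instruction. The name `_example_stats_check` and the toy scores are
# placeholders.
def _example_stats_check():
    toy = pd.DataFrame({"avg_score": [3.5, 2.0, 4.0]})
    score, ci = get_model_stats(toy)
    # The mean rounds to 3.17; the CI string (e.g. "+x/-y") varies from run to
    # run because the bootstrap resampling is random.
    print(f"score={score}, 95% CI={ci}")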
# theme = gr.themes.Default(radius_size="none")


def create_ui():
    text_size = gr.themes.sizes.text_lg
    # Load the theme from theme.json and set its text size to large.
    theme = gr.themes.Default.load("theme.json")
    theme.text_size = text_size

    with gr.Blocks(theme=theme) as demo:
        with gr.Row():
            gr.Image("https://ai.stanford.edu/wp-content/themes/sail/img/logo.png", show_label=False, show_download_button=False, show_share_button=False, container=False, min_width=200, scale=0)
            gr.Image("https://crfm.stanford.edu/static/img/header/crfm-rgb.png", show_label=False, show_download_button=False, show_share_button=False, container=False, min_width=200, scale=0)
        gr.Markdown(
            """
            # **RubricEval: A Scalable Human-LLM Evaluation Framework for Open-Ended Tasks**
            ######
            """)

        with gr.TabItem("Leaderboard"):
            overall_leaderboard_table = gr.Dataframe(
                get_leaderboard_df,
                every=gr.Timer(5),
                column_widths=["33.3%", "33.3%", "33.3%"],
                height=600,
            )
            gr.Markdown(
                """
                ######
                ## RubricEval leaderboard statistics (Overall)
                """
            )
            gr.Image("lb_stats.png", show_label=False, show_download_button=False, show_share_button=False, width=800)
            gr.Markdown(
                """
                ######
                ## RubricEval scores by category
                """
            )
            gr.Image("category_scores.png", show_label=False, show_download_button=False, show_share_button=False)

        with gr.TabItem("About"):
            gr.Image("eval_about.jpg", show_label=False, show_download_button=False, show_share_button=False)
            with gr.Accordion("What is RubricEval?"):
                gr.Markdown(
                    """
                    ######
                    #### Overview
                    RubricEval is a framework for evaluating instruction-following models. The core idea is to create example-specific rubrics designed by human experts, which are then applied by GPT-4o to evaluate model outputs at scale. This process results in more scalable, trustworthy, and interpretable evaluations of language models.

                    #### Features
                    **Open-Ended:** The responses of chat models are open-ended in nature, and a small set of reference answers often can’t capture all acceptable responses. This is a key limitation of reference-based evaluators like BLEU and BERTScore.

                    **Multidimensional:** Responses can be good and bad in different ways, which isn’t captured by "head-to-head" evaluators like Chatbot Arena and AlpacaEval that simply decide whether one response is better than another overall.

                    **Absolute:** Evaluators like Chatbot Arena and AlpacaEval use win rates based on pairwise comparisons. This means we don’t know how good a model is in absolute terms. For example, a model may have a low win rate against GPT-4o but still be formidable, and the model with the highest win rate may not be perfect despite topping the leaderboard.

                    **Varying Criteria:** The criteria for what makes a good response are different for each instruction. While HELM Instruct is open-ended, multidimensional, and absolute, it uses the same set of scoring criteria for every instruction, missing nuances at the instruction level. Most pairwise comparison evaluators may implicitly consider varying criteria for each instruction, but these criteria are not explicitly laid out (WildBench is a notable exception).

                    **Feedback:** To the best of our knowledge, no current language model evaluation system provides textual feedback on a model’s overall strengths and weaknesses with respect to a set of instructions. However, we believe such feedback would be highly valuable for model developers. Evaluation is a key piece of iterative model development, and textual feedback could provide insight into what exactly needs to be improved, rather than solely a score that is hard to interpret.
                    ######
                    """)
                gr.Image("feature_comp.png", show_label=False, show_download_button=False, show_share_button=False)
            with gr.Accordion("Where do evaluation instructions come from?"):
                gr.Markdown(
                    """
                    ######
                    We use a set of approximately 1,000 publicly available instructions from WildBench ([https://huggingface.co/spaces/allenai/WildBench](https://huggingface.co/spaces/allenai/WildBench)). From this set, the 392 hardest instructions were chosen via a GPT-4-based pairwise comparison method. Using the WildBench dataset has three primary benefits:

                    1) It contains a manually curated selection of instructions from real users.
                    2) The instructions are well distributed across 11 categories, which is useful for benchmarking.
                    3) Each instruction comes with user-defined criteria describing what the user is looking for, which we use directly in our framework.
                    ######
                    """)
            with gr.Accordion("How does RubricEval correlate with human preferences?"):
                gr.Markdown(
                    """
                    ######
                    We used RubricEval to score 13 leading large language models across 11 categories and 392 instructions from WildBench. Notably, the ranking of these models based on RubricEval scores correlates highly with the ranking of the same models by Chatbot Arena Elo ratings (Spearman ρ = 0.98). The main discordance is in the ranking of Claude 3 Opus, which is ranked relatively lower by RubricEval than by Chatbot Arena. RubricEval’s correlation of ρ = 0.98 with human preferences matches length-corrected AlpacaEval’s record correlation of 0.98, and is higher than regular AlpacaEval (ρ = 0.94), MT-Bench (ρ = 0.94), and MMLU (ρ = 0.87).
                    ######
                    """)
            with gr.Accordion("Additional details"):
                gr.Markdown(
                    """
                    ######
                    See our detailed report at [insert blog link].
                    ######
                    """)
            with gr.Accordion("Citation"):
                gr.Markdown(
                    """
                    ######
                    [insert citation]
                    ######
                    """)
        with gr.TabItem("Submit Model"):
            gr.Markdown(
                """
                ######
                #### Want to add a model to this leaderboard?
                #### 1. Run RubricEval locally for <$x (see [insert github link]).
                #### 2. Upload the evaluation file generated by RubricEval below. Note: the file name will be used as the model name.
                #### 3. Wait ~5 seconds, then refresh the leaderboard page to see that your model has been added!
                ######
                """)
            model_submission = gr.File(file_types=[".csv"], file_count="single")
            model_submission.upload(fn=process_upload, inputs=[model_submission], outputs=[])

    demo.launch()


if __name__ == "__main__":
    create_ui()
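# Illustrative sketch of preparing a submission file locally (not executed by
# this app). The leaderboard takes the model name from the uploaded file's name,
# so a model called "my-model" would be submitted as "my-model.csv" containing
# one "avg_score" row per evaluated instruction. "my-model" and the scores below
# are placeholders:
#
#     results = pd.DataFrame({"avg_score": [3.5, 2.0, 4.0]})
#     results.to_csv("my-model.csv", index=False)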