import os

import gradio as gr
import numpy as np
import pandas as pd
import scipy.stats as st

LEADERBOARD_FILE = "leaderboard.csv"
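# leaderboard.csv is expected to hold one row per model with the columns
# "Model", "Score", and "95% CI" (the same columns written by process_upload below).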

def get_leaderboard_df():
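    """Load the leaderboard CSV and return it sorted by Score in descending order."""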
    df = pd.read_csv(LEADERBOARD_FILE)
    df = df.sort_values(by = ["Score"], ascending = False)
    df = df.reset_index(drop = True)
    return df

def get_model_stats(uploaded_df):
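    """Return the mean of the per-instruction "avg_score" values (rounded to 2 decimals)
    together with a formatted 95% percentile-bootstrap confidence interval, reported as
    offsets from the mean, e.g. "+0.12/-0.11".
    """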
    overall_score = uploaded_df["avg_score"].mean()
    data = uploaded_df["avg_score"].to_numpy()
    bootstrap_res = st.bootstrap((data,),
                                 np.mean,
                                 confidence_level = 0.95,
                                 n_resamples = 10000,
                                 method = "percentile")
    ci_high = bootstrap_res.confidence_interval.high
    ci_low = bootstrap_res.confidence_interval.low

    formatted_upper_diff = str(round(ci_high - overall_score, 2))
    formatted_lower_diff = str(round(overall_score - ci_low, 2))

    formatted_score = round(overall_score, 2)
    formatted_ci = f"+{formatted_upper_diff}/-{formatted_lower_diff}"

    return (formatted_score, formatted_ci)

def process_upload(file):
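    """Validate an uploaded per-instruction results CSV and add the model to the leaderboard.

    The file must contain an "avg_score" column; the file name (without the .csv
    extension) is used as the model name. Returns a status message string.
    """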
    # Depending on the Gradio version/configuration, gr.File passes either a filepath
    # string or a tempfile-like object with a .name attribute.
    path = file if isinstance(file, str) else file.name
    uploaded_df = pd.read_csv(path).dropna()
    if "avg_score" not in list(uploaded_df.columns):
        return "Upload failed: file must have column 'avg_score'."
    overall_score, confidence_interval = get_model_stats(uploaded_df)
    leaderboard_df = get_leaderboard_df()
    model_name = os.path.splitext(os.path.basename(path))[0]
    new_entry = {"Model": model_name, "Score": overall_score, "95% CI": confidence_interval}
    leaderboard_df = pd.concat([leaderboard_df, pd.DataFrame([new_entry])], ignore_index = True)
    leaderboard_df.to_csv(LEADERBOARD_FILE, index = False)
    return "Upload complete! The leaderboard has been updated."

#theme = gr.themes.Default(radius_size = "none")
def create_ui():
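    """Build and launch the Gradio Blocks app with leaderboard, about, and model submission tabs."""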
    text_size = gr.themes.sizes.text_lg
    # load theme from theme.json
    theme = gr.themes.Default.load("theme.json")
    # set text size to large
    theme.text_size = text_size
    with gr.Blocks(theme = theme) as demo:
        with gr.Row():
            gr.Image("https://ai.stanford.edu/wp-content/themes/sail/img/logo.png",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False,
                     container = False,
                     min_width = 200,
                     scale = 0)
            gr.Image("https://crfm.stanford.edu/static/img/header/crfm-rgb.png",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False,
                     container = False,
                     min_width = 200,
                     scale = 0)
        gr.Markdown(
            """
            # **RubricEval: A Scalable Human-LLM Evaluation Framework for Open-Ended Tasks**
            ######
            """)
        with gr.TabItem("Leaderboard"):
            overall_leaderboard_table = gr.Dataframe(get_leaderboard_df,
                                                     every = gr.Timer(5),
                                                     column_widths = ["33.3%", "33.3%", "33.3%"],
                                                     height = 600)
            gr.Markdown(
                """
                ######
                
                ## RubricEval leaderboard statistics (Overall)
                """
            )
            gr.Image("lb_stats.png",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False,
                     width = 800)
            gr.Markdown(
                """
                ######

                ## RubricEval scores by category
                """
            )
            gr.Image("category_scores.png",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False)
        with gr.TabItem("About"):
            gr.Image("eval_about.jpg",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False)
            with gr.Accordion("What is RubricEval?"):
                gr.Markdown(
                    """
                    ######
                    #### Overview
                    RubricEval is a framework for evaluating instruction-following models.
                    The core idea is to create example-specific rubrics designed by human experts, which are then applied by GPT-4o to evaluate model outputs at scale. This process results in more scalable, trustworthy, and interpretable evaluations of language models.
                    
                    #### Features
                    **Open-Ended:** The responses of chat models are open-ended in nature, and a small set of reference
                    answers often can’t capture all acceptable responses. This is a key limitation of reference-based
                    evaluators like BLEU and BERTScore.
                    
                    **Multidimensional:** Responses can be good and bad in different ways, which isn’t captured by
                    "head-to-head" evaluators like Chatbot Arena and AlpacaEval that simply decide whether one
                    response is better than another overall.
                    
                    **Absolute:** Evaluators like Chatbot Arena and AlpacaEval use win rates based on pairwise comparisons.
                    This means that we don’t know how good a model is in absolute terms. For example, a model may
                    have a low win rate against GPT-4o but still be formidable, and the highest win rate model may not
                    be perfect despite topping the leaderboard.
                    
                    **Varying Criteria:** The criteria for what makes a good response are different for each instruction. While
                    HELM Instruct is open-ended, multidimensional, and absolute, it uses the same set of scoring criteria
                    for each instruction, missing nuances at the instruction level. Most pairwise comparison evaluators
                    may implicitly consider varying criteria for each instruction, but these criteria are not explicitly laid
                    out (WildBench is a notable exception).
                    
                    **Feedback:** To the best of our knowledge, no current language model evaluation system provides
                    textual feedback on a model’s overall strengths and weaknesses with respect to some set of
                    instructions. However, we believe that such feedback would be highly valuable for model developers.
                    Evaluation is a key piece of iterative model development, and textual feedback could provide insight
                    on what exactly needs to be improved, rather than just a score that is hard to interpret.
                    ######
                    """)
                gr.Image("feature_comp.png",
                         show_label = False,
                         show_download_button = False,
                         show_share_button = False)
            with gr.Accordion("Where do evaluation instructions come from?"):
                gr.Markdown(
                    """
                    ######
                    We use a set of approximately 1,000 instructions from WildBench ([https://huggingface.co/spaces/allenai/WildBench](https://huggingface.co/spaces/allenai/WildBench)), which has been made publicly available. From this set, 392 of the hardest instructions were chosen via a GPT-4-based pairwise comparison method.
                    
                    Using the WildBench dataset has three primary benefits:
                    
                    1) It contains a manually curated selection of instructions from real users.
                    
                    2) The instructions are well spread out across 11 categories, which is useful for benchmarking.
                    
                    3) Each instruction comes with user-defined criteria describing what the user is looking for, which we can use directly in our framework.
                    ######
                    """)
            with gr.Accordion("How does RubricEval correlate with human preferences?"):
                gr.Markdown(
                    """
                    ######
                    We used RubricEval to score 13 leading large language models across 11 categories and 392 instructions from WildBench.
                    
                    Notably, the ranking of these models based on RubricEval scores correlates highly with the ranking of the same models using Chatbot Arena ELO ratings (Spearman ρ = 0.98).
                    The main discordance is in the ranking of Claude 3 Opus (which is ranked relatively lower by RubricEval compared to Chatbot Arena).
                    RubricEval’s correlation of ρ = 0.98 with human preferences ties length-corrected AlpacaEval’s record 0.98 correlation, while being higher than regular AlpacaEval (ρ = 0.94), MT-Bench (ρ = 0.94), and MMLU (ρ = 0.87).
                    ######
                    """)
            with gr.Accordion("Additional details"):
                gr.Markdown(
                    """
                    ######
                    See our detailed report at [insert blog link].
                    ######
                    """)
            with gr.Accordion("Citation"):
                gr.Markdown(
                    """
                    ######
                    [insert citation]
                    ######
                    """)
        with gr.TabItem("Submit Model"):
            gr.Markdown(
                """
                ######
                #### Want to add a model to this leaderboard?
                #### 1. Run RubricEval locally for <$x (see [insert github link]).
                #### 2. Upload the evaluation file generated by RubricEval below. Note: the file name will be used as the model name.
                #### 3. Wait ~5 seconds and refresh the leaderboard page to see that your model has been added!
                ######
                """)
            model_submission = gr.File(file_types = [".csv"], file_count = "single")
            upload_status = gr.Markdown()
            model_submission.upload(fn = process_upload, inputs = [model_submission], outputs = [upload_status])

    demo.launch()

if __name__ == "__main__":
    create_ui()