deep dive
- app.py +95 -4
- src/display/css_html_js.py +12 -2
- src/leaderboard/read_evals.py +23 -0
app.py CHANGED

@@ -3,6 +3,9 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns,SearchCo
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 #from huggingface_hub import snapshot_download
+import re
+
+
 
 from src.about import (
     CITATION_BUTTON_LABEL,

@@ -28,6 +31,9 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.leaderboard.read_evals import get_model_answers_html_file
+
+skills = ['MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)', 'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension', 'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis', 'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing', 'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling', 'Structuring']
 
 
 def restart_space():

@@ -86,7 +92,8 @@ def init_leaderboard(dataframe):
         interactive=False,
         column_widths=[30,50,50,150,60,60,60],
         max_height=420,
-        elem_classes="leaderboard_col_style"
+        elem_classes="leaderboard_col_style",
+        show_search="search"
     )
 
 

@@ -95,7 +102,6 @@ def init_skill_leaderboard(dataframe):
 
 
     ## create selector for model skills, based on the selector filter the dataframe
-    skills = ['MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)', 'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension', 'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis', 'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing', 'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling', 'Structuring']
 
     skills_dropdown = gr.Dropdown(choices=skills, label="Select Skill", value=skills[0])
 

@@ -153,6 +159,74 @@ def init_size_leaderboard(dataframe):
     sizes_dropdown.change(filter_dataframe, inputs=sizes_dropdown, outputs=leaderboard_by_skill)
     return leaderboard_by_skill
 
+def strip_html_tags(model_name):
+    return re.sub('<[^<]+?>', '', model_name)
+
+
+
+def get_model_info_blocks(chosen_model_name):
+
+    model_names = LEADERBOARD_DF["Model Name"].unique().tolist()
+    model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
+
+    model_name_full = model_names[model_names_clean.index(chosen_model_name)]
+    filtered_df = LEADERBOARD_DF[LEADERBOARD_DF["Model Name"]==model_name_full].reset_index(drop=True)
+    skills_bar_df = pd.DataFrame({
+        'Skills': skills,
+        'Scores': filtered_df[skills].values[0]
+    })
+
+    skills_bar_df = skills_bar_df.sort_values(by=['Scores'], ascending=False).reset_index(drop=True)
+
+
+    with gr.Accordion("Model Details"):
+
+        with gr.Row():
+            model_name = gr.Markdown("""<span class='deep-dive-metric'><b>Model Name:</b> {}</span> """.format(chosen_model_name))
+        with gr.Row():
+            benchmark_score = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Score:</b>{}/10</span>""".format(filtered_df["Benchmark Score"][0]))
+            rank = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Rank:</b>{}</span>""".format(filtered_df["Rank"][0]))
+            speed = gr.Markdown("""<span class='deep-dive-metric'><b>Speed:</b>{} words per second</span>""".format(filtered_df["Speed (words/sec)"][0]))
+            contamination = gr.Markdown("""<span class='deep-dive-metric'><b>Contamination Score:</b>{}</span>""".format(filtered_df["Contamination Score"][0]))
+            size = gr.Markdown("""<span class='deep-dive-metric'><b>Size Category:</b>{}</span>""".format(filtered_df["Category"][0]))
+
+        with gr.Row():
+            skills_bar = gr.BarPlot(
+                value=skills_bar_df,
+                x="Skills",
+                y="Scores",
+                width=500,
+                height=500,
+                x_label_angle=45,
+                color="Skills",
+                color_title=None,
+                label="Model Skills"
+            )
+
+
+        html_file_content = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
+
+        if html_file_content == "EMPTY":
+            answers_html = gr.Markdown("")
+        else:
+            with gr.Row():
+
+                ##strip style and script tags from html
+                html_file_content = re.sub('<style.*?>.*?</style>', '', html_file_content, flags=re.DOTALL)
+                html_file_content = re.sub('<script.*?>.*?</script>', '', html_file_content, flags=re.DOTALL)
+
+                answers_html = gr.HTML(html_file_content, max_height=500, show_label=True,
+                    label="Model Responses", container=True, elem_classes="model_responses_container")
+
+
+    return model_name, benchmark_score, rank, speed, contamination, size, skills_bar, answers_html
+
+
+
+def init_compare_tab(dataframe):
+    pass
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE, elem_classes="abl_header")

@@ -168,11 +242,28 @@ with demo:
         with gr.TabItem("🏅 Top by Skill", elem_id="llm-benchmark-tab-skills", id=2):
             leaderboard = init_skill_leaderboard(LEADERBOARD_DF)
 
+        with gr.TabItem("⚖️ Compare", elem_id="llm-benchmark-tab-compare", id=3):
+            init_compare_tab(LEADERBOARD_DF)
+
+        with gr.TabItem("🔬 Deep Dive", elem_id="llm-benchmark-tab-compare", id=4):
+
+
+            model_names = LEADERBOARD_DF["Model Name"].unique().tolist()
+            model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
+            with gr.Row():
+                models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Model", value=model_names_clean[0])
+
+
+            model_name, benchmark_score, rank, speed, contamination, size, skills_bar, answers_html = get_model_info_blocks(models_dropdown.value)
+
+            models_dropdown.change(get_model_info_blocks, inputs=models_dropdown, outputs=[model_name, benchmark_score, rank, speed, contamination, size, skills_bar, answers_html])
+
+
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-about", id=3):
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-about", id=5):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit", id=4):
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit", id=6):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
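Two regex passes do the string cleanup in this commit: strip_html_tags removes markup from the leaderboard's display names before they populate the Deep Dive dropdown, and the answers file has its <style>/<script> blocks stripped (with re.DOTALL so the patterns can span newlines) before being embedded via gr.HTML. A minimal standalone sketch of both passes; the sanitize_answers_html name is illustrative, not part of the commit:

import re

def strip_html_tags(text):
    # Remove anything that looks like an HTML tag, keeping the inner text.
    # [^<] stops the match at the next '<', so two tags are never swallowed at once.
    return re.sub('<[^<]+?>', '', text)

def sanitize_answers_html(html):
    # Hypothetical helper mirroring the inline cleanup in get_model_info_blocks:
    # drop <style>/<script> blocks; re.DOTALL lets .*? match across newlines.
    html = re.sub('<style.*?>.*?</style>', '', html, flags=re.DOTALL)
    html = re.sub('<script.*?>.*?</script>', '', html, flags=re.DOTALL)
    return html

print(strip_html_tags("<a href='x'>org/model-7b</a>"))
# -> org/model-7b
print(sanitize_answers_html("<style>td{color:red}</style><table><td>ok</td></table>"))
# -> <table><td>ok</td></table>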
src/display/css_html_js.py CHANGED

@@ -118,10 +118,20 @@ border-radius: 10px;
 }
 
 .tabs{
-gap:0px !important;
+gap:0px !important;
 }
 
-
+.deep-dive-metric{
+font-size:20px;
+padding: 10px;
+display: flex;
+flex-direction: column;
+align-items: normal;
+max-height: 120px;
+}
+.model_responses_container td{
+max-width:180px;
+}
 
 """
 
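These selectors reach the app through two Gradio hooks: the whole stylesheet string is injected with gr.Blocks(css=custom_css) (see app.py above), and components opt in via elem_classes, which is how the .deep-dive-metric spans and the .model_responses_container answers table pick up these rules. A minimal sketch of that pattern, with placeholder CSS values:

import gradio as gr

css = """
.deep-dive-metric{ font-size: 20px; padding: 10px; }
"""

with gr.Blocks(css=css) as demo:
    # elem_classes tags the component's wrapper element; raw spans inside
    # the Markdown can also reference the same class directly.
    gr.Markdown("<span class='deep-dive-metric'><b>Score:</b> 8/10</span>",
                elem_classes="deep-dive-metric")

# demo.launch()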
src/leaderboard/read_evals.py CHANGED

@@ -232,3 +232,26 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
 
     print(results)
     return results
+
+
+def get_model_answers_html_file(results_path, model_name):
+
+    model_org, model_name_only = model_name.split("/")
+    model_answers_prefix = f"{results_path}/{model_org}/"
+
+    html_file_content = "EMPTY"
+
+    for root, _, files in os.walk(model_answers_prefix):
+
+        for file_name in files:
+
+            if file_name.startswith(f"{model_name_only}_abb_benchmark_answers_"):
+
+                file_path = os.path.join(root, file_name)
+
+                with open(file_path, "r") as f:
+
+                    html_file_content = f.read()
+                break
+
+    return html_file_content