karimouda committed
Commit 01cd9ce · 1 Parent(s): 9ced1ec
app.py CHANGED
@@ -3,6 +3,9 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns,SearchCo
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 #from huggingface_hub import snapshot_download
+import re
+
+

 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -28,6 +31,9 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.leaderboard.read_evals import get_model_answers_html_file
+
+skills = ['MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)', 'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension', 'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis', 'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing', 'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling', 'Structuring']


 def restart_space():
@@ -86,7 +92,8 @@ def init_leaderboard(dataframe):
         interactive=False,
         column_widths=[30,50,50,150,60,60,60],
         max_height=420,
-        elem_classes="leaderboard_col_style"
+        elem_classes="leaderboard_col_style",
+        show_search="search"
     )

@@ -95,7 +102,6 @@ def init_skill_leaderboard(dataframe):


     ## create selector for model skills, based on the selector filter the dataframe
-    skills = ['MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)', 'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension', 'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis', 'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing', 'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling', 'Structuring']

     skills_dropdown = gr.Dropdown(choices=skills, label="Select Skill", value=skills[0])

@@ -153,6 +159,74 @@ def init_size_leaderboard(dataframe):
     sizes_dropdown.change(filter_dataframe, inputs=sizes_dropdown, outputs=leaderboard_by_skill)
     return leaderboard_by_skill

+def strip_html_tags(model_name):
+    return re.sub('<[^<]+?>', '', model_name)
+
+
+def get_model_info_blocks(chosen_model_name):
+
+    model_names = LEADERBOARD_DF["Model Name"].unique().tolist()
+    model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
+
+    model_name_full = model_names[model_names_clean.index(chosen_model_name)]
+    filtered_df = LEADERBOARD_DF[LEADERBOARD_DF["Model Name"]==model_name_full].reset_index(drop=True)
+    skills_bar_df = pd.DataFrame({
+        'Skills': skills,
+        'Scores': filtered_df[skills].values[0]
+    })
+
+    skills_bar_df = skills_bar_df.sort_values(by=['Scores'], ascending=False).reset_index(drop=True)
+
+    with gr.Accordion("Model Details"):
+
+        with gr.Row():
+            model_name = gr.Markdown("""<span class='deep-dive-metric'><b>Model Name:</b> {}</span> """.format(chosen_model_name))
+        with gr.Row():
+            benchmark_score = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Score:</b>{}/10</span>""".format(filtered_df["Benchmark Score"][0]))
+            rank = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Rank:</b>{}</span>""".format(filtered_df["Rank"][0]))
+            speed = gr.Markdown("""<span class='deep-dive-metric'><b>Speed:</b>{} words per second</span>""".format(filtered_df["Speed (words/sec)"][0]))
+            contamination = gr.Markdown("""<span class='deep-dive-metric'><b>Contamination Score:</b>{}</span>""".format(filtered_df["Contamination Score"][0]))
+            size = gr.Markdown("""<span class='deep-dive-metric'><b>Size Category:</b>{}</span>""".format(filtered_df["Category"][0]))
+
+        with gr.Row():
+            skills_bar = gr.BarPlot(
+                value=skills_bar_df,
+                x="Skills",
+                y="Scores",
+                width=500,
+                height=500,
+                x_label_angle=45,
+                color="Skills",
+                color_title=None,
+                label="Model Skills"
+            )
+
+        html_file_content = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
+
+        if html_file_content == "EMPTY":
+            answers_html = gr.Markdown("")
+        else:
+            with gr.Row():
+
+                ##strip style and script tags from html
+                html_file_content = re.sub('<style.*?>.*?</style>', '', html_file_content, flags=re.DOTALL)
+                html_file_content = re.sub('<script.*?>.*?</script>', '', html_file_content, flags=re.DOTALL)
+
+                answers_html = gr.HTML(html_file_content, max_height=500, show_label=True,
+                                       label="Model Responses", container=True, elem_classes="model_responses_container")
+
+    return model_name,benchmark_score,rank,speed,contamination,size,skills_bar,answers_html
+
+
+def init_compare_tab(dataframe):
+    pass
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE, elem_classes="abl_header")
@@ -168,11 +242,28 @@
         with gr.TabItem("🏅 Top by Skill", elem_id="llm-benchmark-tab-skills", id=2):
             leaderboard = init_skill_leaderboard(LEADERBOARD_DF)

+        with gr.TabItem("⚖️ Compare", elem_id="llm-benchmark-tab-compare", id=3):
+            init_compare_tab(LEADERBOARD_DF)
+
+        with gr.TabItem("🔬 Deep Dive", elem_id="llm-benchmark-tab-compare", id=4):
+
+            model_names = LEADERBOARD_DF["Model Name"].unique().tolist()
+            model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
+            with gr.Row():
+                models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Model", value=model_names_clean[0])
+
+            model_name,benchmark_score,rank,speed,contamination,size,skills_bar,answers_html = get_model_info_blocks(models_dropdown.value)
+
+            models_dropdown.change(get_model_info_blocks, inputs=models_dropdown, outputs=[model_name,benchmark_score,rank,speed,contamination,size,skills_bar,answers_html])
+

-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-about", id=4):
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-about", id=5):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit", id=5):
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit", id=6):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
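
For context, a minimal, hypothetical sketch of the display-name round-trip that the new Deep Dive tab relies on. Only strip_html_tags comes from this commit; the link value below is made up for illustration.

import re

def strip_html_tags(model_name):
    # Same helper as in the commit: drop any HTML markup around the model name.
    return re.sub('<[^<]+?>', '', model_name)

# Hypothetical leaderboard cell: "Model Name" values are stored as HTML links.
model_names = ['<a href="https://huggingface.co/org/model-a">org/model-a</a>']
model_names_clean = [strip_html_tags(m) for m in model_names]   # ['org/model-a']

# The dropdown shows the clean name; the HTML-wrapped value is recovered by index
# so the leaderboard dataframe can still be filtered on its "Model Name" column.
chosen = model_names_clean[0]
model_name_full = model_names[model_names_clean.index(chosen)]
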
src/display/css_html_js.py CHANGED
@@ -118,10 +118,20 @@ border-radius: 10px;
 }

 .tabs{
-gap:0px !important;
+gap:0px !important;
 }

-
+.deep-dive-metric{
+font-size:20px;
+padding: 10px;
+display: flex;
+flex-direction: column;
+align-items: normal;
+max-height: 120px;
+}
+.model_responses_container td{
+max-width:180px;
+}

 """
 
src/leaderboard/read_evals.py CHANGED
@@ -232,3 +232,26 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu

     print(results)
     return results
+
+
+def get_model_answers_html_file(results_path, model_name):
+
+    model_org, model_name_only = model_name.split("/")
+    model_answers_prefix = f"{results_path}/{model_org}/"
+
+    html_file_content = "EMPTY"
+
+    for root, _, files in os.walk(model_answers_prefix):
+
+        for file_name in files:
+
+            if file_name.startswith(f"{model_name_only}_abb_benchmark_answers_"):
+
+                file_path = os.path.join(root, file_name)
+
+                with open(file_path, "r") as f:
+                    html_file_content = f.read()
+                break
+
+    return html_file_content
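
For reference, a minimal sketch of how the new helper could be exercised together with the tag stripping done in app.py. It assumes the results layout the function expects (results_path/<org>/<model>_abb_benchmark_answers_*.html); the model id below is a placeholder.

import re

from src.envs import EVAL_RESULTS_PATH
from src.leaderboard.read_evals import get_model_answers_html_file

model_id = "some-org/some-model"  # placeholder; must be in org/name form

# Load the stored answers page for the model, if one exists under the results path.
html = get_model_answers_html_file(EVAL_RESULTS_PATH, model_id)

if html != "EMPTY":
    # Mirror app.py's sanitisation before embedding the page in a gr.HTML component:
    # drop inline <style> and <script> blocks.
    html = re.sub('<style.*?>.*?</style>', '', html, flags=re.DOTALL)
    html = re.sub('<script.*?>.*?</script>', '', html, flags=re.DOTALL)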