Sealical committed
Commit 5bf1eab · 1 Parent(s): 81b7fcb

Update space

Files changed (2):
  1. app.py +329 -172
  2. old_app.py +204 -0
app.py CHANGED
@@ -1,204 +1,361 @@
  import gradio as gr
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
- from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download

- from src.about import (
-     CITATION_BUTTON_LABEL,
-     CITATION_BUTTON_TEXT,
-     EVALUATION_QUEUE_TEXT,
-     INTRODUCTION_TEXT,
-     LLM_BENCHMARKS_TEXT,
-     TITLE,
- )
- from src.display.css_html_js import custom_css
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     AutoEvalColumn,
-     ModelType,
-     fields,
-     WeightType,
-     Precision
- )
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval
-
-
- def restart_space():
-     API.restart_space(repo_id=REPO_ID)
-
- ### Space initialisation
- try:
-     print(EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
- try:
-     print(EVAL_RESULTS_PATH)
-     snapshot_download(
-         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()


- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

  def init_leaderboard(dataframe):
      if dataframe is None or dataframe.empty:
          raise ValueError("Leaderboard DataFrame is empty or None.")
      return Leaderboard(
          value=dataframe,
-         datatype=[c.type for c in fields(AutoEvalColumn)],
          select_columns=SelectColumns(
-             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
              label="Select Columns to Display:",
          ),
-         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
          filter_columns=[
-             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
              ColumnFilter(
-                 AutoEvalColumn.params.name,
                  type="slider",
                  min=0.01,
-                 max=150,
                  label="Select the number of parameters (B)",
              ),
-             ColumnFilter(
-                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-             ),
          ],
-         bool_checkboxgroup_label="Hide models",
          interactive=False,
      )


- demo = gr.Blocks(css=custom_css)
- with demo:
-     gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-             leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
                  with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(
-                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-                     with gr.Accordion(
-                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-
-                     with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-             with gr.Row():
-                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-             with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
                          multiselect=False,
-                         value=None,
-                         interactive=True,
                      )

-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float16",
-                         interactive=True,
-                     )
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
-                     )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             submit_button.click(
-                 add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     weight_type,
-                     model_type,
-                 ],
-                 submission_result,
-             )
-
-     with gr.Row():
-         with gr.Accordion("📙 Citation", open=False):
-             citation_button = gr.Textbox(
-                 value=CITATION_BUTTON_TEXT,
-                 label=CITATION_BUTTON_LABEL,
-                 lines=20,
-                 elem_id="citation-button",
-                 show_copy_button=True,
-             )
-
- scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
- scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
 
  import gradio as gr
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
+ import os
+ import json
  from huggingface_hub import snapshot_download

+ # Constants for PhysicalCodeBench
+ TITLE = """
+ <div style="text-align: center; max-width: 900px; margin: 0 auto;">
+     <div>
+         <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
+             PhysicalCodeBench Leaderboard
+         </h1>
+         <h3 style="margin-top: 0; margin-bottom: 10px; font-weight: 500;">
+             Evaluating LLMs on Physics-based Simulation Code Generation
+         </h3>
+     </div>
+ </div>
+ """
+
+ INTRODUCTION_TEXT = """
+ PhysicalCodeBench evaluates the abilities of Large Language Models (LLMs) to generate code for physics-based simulations.
+ The benchmark consists of user instructions that describe physical scenarios to be simulated, reference code implementations,
+ and resulting simulation videos generated using the [Genesis](https://github.com/Genesis-Embodied-AI/Genesis) physics engine.
+
+ This leaderboard showcases model performance on the PhysicalCodeBench-50 dataset, measuring both text-based execution success
+ and visual quality of the generated simulations.
+ """
+
+ ABOUT_TEXT = """
+ ## About PhysicalCodeBench
+
+ PhysicalCodeBench evaluates an LLM's ability to:
+ - Understand natural language descriptions of physical scenarios
+ - Generate executable code that correctly implements the physics simulation
+ - Produce visually accurate and physically plausible results
+
+ The benchmark covers a variety of physical phenomena including:
+ - Rigid body dynamics (collisions, rolling, bouncing, etc.)
+ - Fluid and particle simulations
+ - Soft body physics
+ - Controlled environments (robotic arms, drones, etc.)
+ - Chain reactions and complex interactions
+
+ ## Evaluation Metrics
+
+ PhysicalCodeBench uses two main evaluation dimensions:
+
+ 1. **Text Score (50 points)**: Evaluates code execution success
+    - Code runs without errors (25 points)
+    - Generates proper output files (10 points)
+    - Output files meet required specifications (15 points)
+
+ 2. **Visual Score (50 points)**: Evaluates simulation quality
+    - CLIP Score: Measures text-video alignment (25 points)
+    - Motion Smoothness: Evaluates physics simulation quality (25 points)
+
+ Total score is the sum of Text and Visual scores (maximum 100 points).
+ """
+
+ SUBMISSION_TEXT = """
+ ## How to Submit Your Model Results
+
+ 1. Fork the [PhysicalCodeBench repository](https://github.com/Sealical/PhysicalCodeBench)
+ 2. Generate code for all 50 tasks in the benchmark using your model
+ 3. Run the evaluation pipeline with your generated code
+ 4. Create a submission folder with the following structure:
+ ```
+ submission/
+ ├── model_info.json            # Contains model details (name, size, etc.)
+ ├── evaluation_results/        # Directory containing all result files
+ └── PhysCodeEval_results.json  # Main evaluation results file
+ ```
+ 5. Submit a pull request with your results
+
+ Your submission will be verified and added to the leaderboard once approved.
+ """

+ CITATION_TEXT = """
+ @article{PhysicalCodeBench2025,
+   title={PhysicalCodeBench: Evaluating LLMs on Physics-based Simulation Code Generation},
+   author={Your Name and Co-authors},
+   journal={arXiv preprint arXiv:XXXX.XXXXX},
+   year={2025}
+ }
+ """

+ # Custom CSS for the interface
+ custom_css = """
+ .markdown-text {
+     font-size: 16px !important;
+     text-align: left !important;
+ }
+ .tab-button {
+     font-size: 16px !important;
+ }
+ """

+ # Define column structure for the leaderboard
+ class PhysCodeColumn:
+     def __init__(self, name, type, displayed_by_default=True, never_hidden=False, hidden=False):
+         self.name = name
+         self.type = type
+         self.displayed_by_default = displayed_by_default
+         self.never_hidden = never_hidden
+         self.hidden = hidden

+ # Define the columns for our leaderboard
+ COLUMNS = [
+     PhysCodeColumn("rank", "number", True, True, False),
+     PhysCodeColumn("model", "str", True, True, False),
+     PhysCodeColumn("model_type", "str", True, False, False),
+     PhysCodeColumn("params", "number", True, False, False),
+     PhysCodeColumn("text_score", "number", True, False, False),
+     PhysCodeColumn("visual_score", "number", True, False, False),
+     PhysCodeColumn("total_score", "number", True, False, False),
+     PhysCodeColumn("clip_score", "number", False, False, False),
+     PhysCodeColumn("motion_smooth_score", "number", False, False, False),
+     PhysCodeColumn("execution_success", "number", False, False, False),
+     PhysCodeColumn("file_generation", "number", False, False, False),
+     PhysCodeColumn("file_quality", "number", False, False, False),
+     PhysCodeColumn("submission_date", "date", False, False, False),
+     PhysCodeColumn("license", "str", False, False, False)
+ ]
+
+ # Enums for model metadata
+ class ModelType:
+     Proprietary = "Proprietary"
+     OpenSource = "Open Source"
+     Unknown = "Unknown"
+
+     @staticmethod
+     def to_str(model_type):
+         return model_type
+
+ # Load sample data (replace with your actual data loading logic)
+ def get_leaderboard_df():
+     # Sample data based on your README
+     data = [
+         {
+             "rank": 1,
+             "model": "GPT4o",
+             "model_type": ModelType.Proprietary,
+             "params": 1000,
+             "text_score": 16.0,
+             "visual_score": 18.262,
+             "total_score": 34.262,
+             "clip_score": 10.2,
+             "motion_smooth_score": 8.062,
+             "execution_success": 10.0,
+             "file_generation": 3.0,
+             "file_quality": 3.0,
+             "submission_date": "2025-01-15",
+             "license": "Proprietary"
+         },
+         {
+             "rank": 2,
+             "model": "Gemini-2.0-flash",
+             "model_type": ModelType.Proprietary,
+             "params": 450,
+             "text_score": 15.0,
+             "visual_score": 16.963,
+             "total_score": 31.963,
+             "clip_score": 9.5,
+             "motion_smooth_score": 7.463,
+             "execution_success": 9.0,
+             "file_generation": 3.0,
+             "file_quality": 3.0,
+             "submission_date": "2025-01-20",
+             "license": "Proprietary"
+         },
+         {
+             "rank": 3,
+             "model": "DS-R1",
+             "model_type": ModelType.OpenSource,
+             "params": 32,
+             "text_score": 14.0,
+             "visual_score": 15.815,
+             "total_score": 29.815,
+             "clip_score": 8.9,
+             "motion_smooth_score": 6.915,
+             "execution_success": 8.5,
+             "file_generation": 3.0,
+             "file_quality": 2.5,
+             "submission_date": "2025-01-25",
+             "license": "Apache 2.0"
+         },
+         {
+             "rank": 4,
+             "model": "DeepSeek-R1-Distill-Qwen-32B",
+             "model_type": ModelType.OpenSource,
+             "params": 32,
+             "text_score": 12.2,
+             "visual_score": 15.82,
+             "total_score": 28.02,
+             "clip_score": 8.8,
+             "motion_smooth_score": 7.02,
+             "execution_success": 7.2,
+             "file_generation": 2.5,
+             "file_quality": 2.5,
+             "submission_date": "2025-01-28",
+             "license": "Apache 2.0"
+         },
+         {
+             "rank": 5,
+             "model": "QwQ-32B",
+             "model_type": ModelType.OpenSource,
+             "params": 32,
+             "text_score": 7.1,
+             "visual_score": 8.964,
+             "total_score": 16.064,
+             "clip_score": 4.964,
+             "motion_smooth_score": 4.0,
+             "execution_success": 4.1,
+             "file_generation": 1.5,
+             "file_quality": 1.5,
+             "submission_date": "2025-02-05",
+             "license": "Apache 2.0"
+         },
+         {
+             "rank": 6,
+             "model": "Qwen-2.5-32B",
+             "model_type": ModelType.OpenSource,
+             "params": 32,
+             "text_score": 0.7,
+             "visual_score": 1.126,
+             "total_score": 1.826,
+             "clip_score": 0.626,
+             "motion_smooth_score": 0.5,
+             "execution_success": 0.5,
+             "file_generation": 0.1,
+             "file_quality": 0.1,
+             "submission_date": "2025-02-10",
+             "license": "Apache 2.0"
+         }
+     ]
+
+     return pd.DataFrame(data)
+
+ # Function to load submission from JSON file
+ def load_submissions_from_json(json_path):
+     if os.path.exists(json_path):
+         with open(json_path, 'r') as f:
+             data = json.load(f)
+         return pd.DataFrame(data)
+     return None
+
+ # Initialize the leaderboard
  def init_leaderboard(dataframe):
      if dataframe is None or dataframe.empty:
          raise ValueError("Leaderboard DataFrame is empty or None.")
+
      return Leaderboard(
          value=dataframe,
+         datatype=[c.type for c in COLUMNS],
          select_columns=SelectColumns(
+             default_selection=[c.name for c in COLUMNS if c.displayed_by_default],
+             cant_deselect=[c.name for c in COLUMNS if c.never_hidden],
              label="Select Columns to Display:",
          ),
+         search_columns=["model", "license"],
+         hide_columns=[c.name for c in COLUMNS if c.hidden],
          filter_columns=[
+             ColumnFilter("model_type", type="checkboxgroup", label="Model types"),
              ColumnFilter(
+                 "params",
                  type="slider",
                  min=0.01,
+                 max=1500,
                  label="Select the number of parameters (B)",
              ),
          ],
          interactive=False,
      )

+ # Submission form handling
+ def process_submission(model_name, model_type, params, license_type, submission_link):
+     # This would be implemented to handle actual submission processing
+     return f"Thank you for submitting {model_name}! Your submission will be reviewed and added to the leaderboard once verified."

+ # Main application
+ def create_demo():
+     # Load the leaderboard data
+     leaderboard_df = get_leaderboard_df()
+
+     # Create the Gradio interface
+     demo = gr.Blocks(css=custom_css)
+
+     with demo:
+         gr.HTML(TITLE)
+         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+         with gr.Tabs() as tabs:
+             with gr.TabItem("🏅 Leaderboard", id=0):
+                 leaderboard = init_leaderboard(leaderboard_df)
+
+             with gr.TabItem("📊 Visualizations", id=1):
+                 gr.Markdown("## Performance Comparisons")
+
+                 with gr.Row():
+                     with gr.Column():
+                         gr.Markdown("### Text vs. Visual Scores")
+                         # Add a visualization component here (e.g., scatter plot)
+
+                     with gr.Column():
+                         gr.Markdown("### Score Breakdown by Task Type")
+                         # Add a visualization component here (e.g., bar chart)
+
                  with gr.Row():
+                     model_selector = gr.Dropdown(
+                         choices=leaderboard_df["model"].tolist(),
+                         label="Select Model for Detailed Analysis",
                          multiselect=False,
                      )
+
+             with gr.TabItem("📝 About", id=2):
+                 gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
+
+             with gr.TabItem("🚀 Submit", id=3):
+                 gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
+
+                 with gr.Row():
+                     with gr.Column():
+                         model_name_input = gr.Textbox(label="Model Name")
+                         model_type_input = gr.Dropdown(
+                             choices=["Proprietary", "Open Source"],
+                             label="Model Type",
+                             multiselect=False,
+                         )
+                         params_input = gr.Number(label="Parameters (billions)")
+
+                     with gr.Column():
+                         license_input = gr.Textbox(label="License")
+                         submission_link_input = gr.Textbox(label="GitHub Pull Request URL")
+
+                 submit_button = gr.Button("Submit")
+                 submission_result = gr.Markdown()
+
+                 submit_button.click(
+                     process_submission,
+                     [model_name_input, model_type_input, params_input, license_input, submission_link_input],
+                     submission_result,
+                 )
+
+         with gr.Row():
+             with gr.Accordion("📙 Citation", open=False):
+                 citation_button = gr.Textbox(
+                     value=CITATION_TEXT,
+                     label="Citation",
+                     lines=8,
+                     elem_id="citation-button",
+                     show_copy_button=True,
+                 )
+
+     return demo

+ # Launch the application
+ if __name__ == "__main__":
+     demo = create_demo()
+     demo.launch()
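
Note on the submission path: the new app.py defines `load_submissions_from_json()` but never calls it, and SUBMISSION_TEXT only names `model_info.json` and `PhysCodeEval_results.json` without fixing their schema. As a rough sketch of how a results file could feed the leaderboard, the snippet below writes a hypothetical record whose fields simply mirror the `COLUMNS` list above and reads it back the same way `load_submissions_from_json()` does. Every field name and value in the record is a placeholder, not something defined in this commit.

```python
import json

import pandas as pd

# Hypothetical submission record: the keys mirror the COLUMNS list in app.py,
# but the real schema of PhysCodeEval_results.json is not specified in this commit.
records = [{
    "rank": None,                  # recomputed after merging into the leaderboard
    "model": "example-model-7b",   # placeholder model name
    "model_type": "Open Source",
    "params": 7,
    "text_score": 10.0,            # execution_success + file_generation + file_quality
    "visual_score": 12.0,          # clip_score + motion_smooth_score
    "total_score": 22.0,           # text_score + visual_score
    "clip_score": 7.0,
    "motion_smooth_score": 5.0,
    "execution_success": 6.0,
    "file_generation": 2.0,
    "file_quality": 2.0,
    "submission_date": "2025-03-01",
    "license": "Apache 2.0",
}]

# Write the placeholder results file.
with open("PhysCodeEval_results.json", "w") as f:
    json.dump(records, f, indent=2)

# Same logic as load_submissions_from_json(): a JSON list of records becomes a DataFrame.
with open("PhysCodeEval_results.json") as f:
    submission_df = pd.DataFrame(json.load(f))

print(submission_df[["model", "text_score", "visual_score", "total_score"]])
```

A real integration would presumably concatenate this DataFrame with the output of `get_leaderboard_df()` and recompute `rank` from `total_score`, but that wiring is not present in this commit.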
old_app.py ADDED
@@ -0,0 +1,204 @@
+ import gradio as gr
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+ import pandas as pd
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import snapshot_download
+
+ from src.about import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     LLM_BENCHMARKS_TEXT,
+     TITLE,
+ )
+ from src.display.css_html_js import custom_css
+ from src.display.utils import (
+     BENCHMARK_COLS,
+     COLS,
+     EVAL_COLS,
+     EVAL_TYPES,
+     AutoEvalColumn,
+     ModelType,
+     fields,
+     WeightType,
+     Precision
+ )
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.submission.submit import add_new_eval
+
+
+ def restart_space():
+     API.restart_space(repo_id=REPO_ID)
+
+ ### Space initialisation
+ try:
+     print(EVAL_REQUESTS_PATH)
+     snapshot_download(
+         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception:
+     restart_space()
+ try:
+     print(EVAL_RESULTS_PATH)
+     snapshot_download(
+         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception:
+     restart_space()
+
+
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+
+ (
+     finished_eval_queue_df,
+     running_eval_queue_df,
+     pending_eval_queue_df,
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+ def init_leaderboard(dataframe):
+     if dataframe is None or dataframe.empty:
+         raise ValueError("Leaderboard DataFrame is empty or None.")
+     return Leaderboard(
+         value=dataframe,
+         datatype=[c.type for c in fields(AutoEvalColumn)],
+         select_columns=SelectColumns(
+             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+             label="Select Columns to Display:",
+         ),
+         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+         filter_columns=[
+             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+             ColumnFilter(
+                 AutoEvalColumn.params.name,
+                 type="slider",
+                 min=0.01,
+                 max=150,
+                 label="Select the number of parameters (B)",
+             ),
+             ColumnFilter(
+                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+             ),
+         ],
+         bool_checkboxgroup_label="Hide models",
+         interactive=False,
+     )
+
+
+ demo = gr.Blocks(css=custom_css)
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+             leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+             with gr.Column():
+                 with gr.Row():
+                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                 with gr.Column():
+                     with gr.Accordion(
+                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             finished_eval_table = gr.components.Dataframe(
+                                 value=finished_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+                     with gr.Accordion(
+                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             running_eval_table = gr.components.Dataframe(
+                                 value=running_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+
+                     with gr.Accordion(
+                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             pending_eval_table = gr.components.Dataframe(
+                                 value=pending_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(label="Model name")
+                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                     model_type = gr.Dropdown(
+                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                         label="Model type",
+                         multiselect=False,
+                         value=None,
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+                     precision = gr.Dropdown(
+                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                         label="Precision",
+                         multiselect=False,
+                         value="float16",
+                         interactive=True,
+                     )
+                     weight_type = gr.Dropdown(
+                         choices=[i.value.name for i in WeightType],
+                         label="Weights type",
+                         multiselect=False,
+                         value="Original",
+                         interactive=True,
+                     )
+                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+             submit_button = gr.Button("Submit Eval")
+             submission_result = gr.Markdown()
+             submit_button.click(
+                 add_new_eval,
+                 [
+                     model_name_textbox,
+                     base_model_name_textbox,
+                     revision_name_textbox,
+                     precision,
+                     weight_type,
+                     model_type,
+                 ],
+                 submission_result,
+             )
+
+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 lines=20,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=1800)
+ scheduler.start()
+ demo.queue(default_concurrency_limit=40).launch()
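
Note: old_app.py preserves the stock leaderboard template, including the periodic Space restart via APScheduler that the rewritten app.py drops. If that behaviour is still wanted with the new `create_demo()` entry point, one possible way to combine the two is sketched below; the repo id, the environment variable names, and the idea of importing `create_demo` from app.py are assumptions for illustration, not part of this commit.

```python
import os

from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

from app import create_demo  # the rewritten app.py above

# Placeholder repo id; on a Space the SPACE_ID env var would normally identify it.
REPO_ID = os.environ.get("SPACE_ID", "user/PhysicalCodeBench-leaderboard")
RESTART_INTERVAL_SECONDS = 1800  # same interval the old template used

api = HfApi(token=os.environ.get("HF_TOKEN"))

def restart_space():
    # Restart the Space so newly merged submissions are picked up on the next boot.
    api.restart_space(repo_id=REPO_ID)

if __name__ == "__main__":
    scheduler = BackgroundScheduler()
    scheduler.add_job(restart_space, "interval", seconds=RESTART_INTERVAL_SECONDS)
    scheduler.start()
    create_demo().queue(default_concurrency_limit=40).launch()
```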