Spaces:
Sleeping
Sleeping
update
Browse files- .gitattributes +32 -0
- app.py +10 -7
- results_qwen/Llama-2-70b-chat-hf.csv +3 -0
- results_qwen/Llama-2-70b-chat-hf.jpg +3 -0
- results_qwen/Llama-2-70b-chat-hf.pkl +3 -0
- results_qwen/Llama-2-70b-chat-hf.png +3 -0
- results_qwen/Mistral-7B-Instruct-v0.2.csv +3 -0
- results_qwen/Mistral-7B-Instruct-v0.2.jpg +3 -0
- results_qwen/Mistral-7B-Instruct-v0.2.pkl +3 -0
- results_qwen/Mistral-7B-Instruct-v0.2.png +3 -0
- results_qwen/Qwen1.5-72B-Chat.csv +3 -0
- results_qwen/Qwen1.5-72B-Chat.jpg +3 -0
- results_qwen/Qwen1.5-72B-Chat.pkl +3 -0
- results_qwen/Qwen1.5-72B-Chat.png +3 -0
- results_qwen/StripedHyena-Nous-7B.csv +3 -0
- results_qwen/StripedHyena-Nous-7B.jpg +3 -0
- results_qwen/StripedHyena-Nous-7B.pkl +3 -0
- results_qwen/StripedHyena-Nous-7B.png +3 -0
- results_qwen/Yi-34B-Chat.csv +3 -0
- results_qwen/Yi-34B-Chat.jpg +3 -0
- results_qwen/Yi-34B-Chat.pkl +3 -0
- results_qwen/Yi-34B-Chat.png +3 -0
- results_qwen/claude-3-sonnet-20240229.csv +3 -0
- results_qwen/claude-3-sonnet-20240229.jpg +3 -0
- results_qwen/claude-3-sonnet-20240229.pkl +3 -0
- results_qwen/claude-3-sonnet-20240229.png +3 -0
- results_qwen/dbrx-instruct.csv +3 -0
- results_qwen/dbrx-instruct.jpg +3 -0
- results_qwen/dbrx-instruct.pkl +3 -0
- results_qwen/dbrx-instruct.png +3 -0
- results_qwen/gpt-35-turbo.csv +3 -0
- results_qwen/gpt-35-turbo.jpg +3 -0
- results_qwen/gpt-35-turbo.pkl +3 -0
- results_qwen/gpt-35-turbo.png +3 -0
.gitattributes
CHANGED
@@ -231,3 +231,35 @@ results_qwen/gpt-4-turbo-2024-04-09.pkl filter=lfs diff=lfs merge=lfs -text
|
|
231 |
results_qwen/CodeLlama-70b-Instruct-hf.png filter=lfs diff=lfs merge=lfs -text
|
232 |
results_qwen/claude-3-haiku-20240307.jpg filter=lfs diff=lfs merge=lfs -text
|
233 |
results_qwen/deepseek-llm-67b-chat.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
results_qwen/CodeLlama-70b-Instruct-hf.png filter=lfs diff=lfs merge=lfs -text
|
232 |
results_qwen/claude-3-haiku-20240307.jpg filter=lfs diff=lfs merge=lfs -text
|
233 |
results_qwen/deepseek-llm-67b-chat.png filter=lfs diff=lfs merge=lfs -text
|
234 |
+
results_qwen/dbrx-instruct.png filter=lfs diff=lfs merge=lfs -text
|
235 |
+
results_qwen/gpt-35-turbo.jpg filter=lfs diff=lfs merge=lfs -text
|
236 |
+
results_qwen/gpt-35-turbo.pkl filter=lfs diff=lfs merge=lfs -text
|
237 |
+
results_qwen/Qwen1.5-72B-Chat.png filter=lfs diff=lfs merge=lfs -text
|
238 |
+
results_qwen/dbrx-instruct.csv filter=lfs diff=lfs merge=lfs -text
|
239 |
+
results_qwen/dbrx-instruct.pkl filter=lfs diff=lfs merge=lfs -text
|
240 |
+
results_qwen/Yi-34B-Chat.pkl filter=lfs diff=lfs merge=lfs -text
|
241 |
+
results_qwen/gpt-35-turbo.csv filter=lfs diff=lfs merge=lfs -text
|
242 |
+
results_qwen/Llama-2-70b-chat-hf.jpg filter=lfs diff=lfs merge=lfs -text
|
243 |
+
results_qwen/Llama-2-70b-chat-hf.png filter=lfs diff=lfs merge=lfs -text
|
244 |
+
results_qwen/Mistral-7B-Instruct-v0.2.csv filter=lfs diff=lfs merge=lfs -text
|
245 |
+
results_qwen/Mistral-7B-Instruct-v0.2.png filter=lfs diff=lfs merge=lfs -text
|
246 |
+
results_qwen/Qwen1.5-72B-Chat.pkl filter=lfs diff=lfs merge=lfs -text
|
247 |
+
results_qwen/Mistral-7B-Instruct-v0.2.jpg filter=lfs diff=lfs merge=lfs -text
|
248 |
+
results_qwen/Mistral-7B-Instruct-v0.2.pkl filter=lfs diff=lfs merge=lfs -text
|
249 |
+
results_qwen/Qwen1.5-72B-Chat.csv filter=lfs diff=lfs merge=lfs -text
|
250 |
+
results_qwen/Yi-34B-Chat.png filter=lfs diff=lfs merge=lfs -text
|
251 |
+
results_qwen/Llama-2-70b-chat-hf.csv filter=lfs diff=lfs merge=lfs -text
|
252 |
+
results_qwen/StripedHyena-Nous-7B.csv filter=lfs diff=lfs merge=lfs -text
|
253 |
+
results_qwen/StripedHyena-Nous-7B.pkl filter=lfs diff=lfs merge=lfs -text
|
254 |
+
results_qwen/StripedHyena-Nous-7B.jpg filter=lfs diff=lfs merge=lfs -text
|
255 |
+
results_qwen/Yi-34B-Chat.csv filter=lfs diff=lfs merge=lfs -text
|
256 |
+
results_qwen/gpt-35-turbo.png filter=lfs diff=lfs merge=lfs -text
|
257 |
+
results_qwen/Llama-2-70b-chat-hf.pkl filter=lfs diff=lfs merge=lfs -text
|
258 |
+
results_qwen/claude-3-sonnet-20240229.csv filter=lfs diff=lfs merge=lfs -text
|
259 |
+
results_qwen/claude-3-sonnet-20240229.png filter=lfs diff=lfs merge=lfs -text
|
260 |
+
results_qwen/dbrx-instruct.jpg filter=lfs diff=lfs merge=lfs -text
|
261 |
+
results_qwen/Qwen1.5-72B-Chat.jpg filter=lfs diff=lfs merge=lfs -text
|
262 |
+
results_qwen/StripedHyena-Nous-7B.png filter=lfs diff=lfs merge=lfs -text
|
263 |
+
results_qwen/claude-3-sonnet-20240229.jpg filter=lfs diff=lfs merge=lfs -text
|
264 |
+
results_qwen/Yi-34B-Chat.jpg filter=lfs diff=lfs merge=lfs -text
|
265 |
+
results_qwen/claude-3-sonnet-20240229.pkl filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -435,14 +435,8 @@ def show_intersection_heatmap(evt: gr.SelectData):
|
|
435 |
|
436 |
with gr.Blocks() as demo:
|
437 |
gr.Markdown("# FSM Benchmark Leaderboard")
|
438 |
-
with gr.Tab("Text-only Benchmark"):
|
439 |
-
gr.Markdown("# Text-only Leaderboard")
|
440 |
-
leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
|
441 |
-
gr.Markdown("## Heatmap")
|
442 |
-
heatmap_image = gr.Image(label="", show_label=False)
|
443 |
-
leader_board.select(fn=load_heatmap, outputs=[heatmap_image])
|
444 |
|
445 |
-
with gr.Tab("Text-only Benchmark
|
446 |
gr.Markdown("# Text-only Leaderboard (Judged by Qwen)")
|
447 |
leader_board = gr.Dataframe(accuracy_df_qwen, headers=headers_with_icons)
|
448 |
gr.Markdown("## Heatmap")
|
@@ -527,6 +521,15 @@ with gr.Blocks() as demo:
|
|
527 |
)
|
528 |
heatmap_image = gr.Plot(label="Model Heatmap")
|
529 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
530 |
included_models_cot.select(
|
531 |
fn=calculate_order_by_first_substring_cot,
|
532 |
inputs=[included_models_cot],
|
|
|
435 |
|
436 |
with gr.Blocks() as demo:
|
437 |
gr.Markdown("# FSM Benchmark Leaderboard")
|
|
|
|
|
|
|
|
|
|
|
|
|
438 |
|
439 |
+
with gr.Tab("Text-only Benchmark"):
|
440 |
gr.Markdown("# Text-only Leaderboard (Judged by Qwen)")
|
441 |
leader_board = gr.Dataframe(accuracy_df_qwen, headers=headers_with_icons)
|
442 |
gr.Markdown("## Heatmap")
|
|
|
521 |
)
|
522 |
heatmap_image = gr.Plot(label="Model Heatmap")
|
523 |
|
524 |
+
with gr.Tab("Text-only Benchmark (deprecated)"):
|
525 |
+
gr.Markdown("# Text-only Leaderboard")
|
526 |
+
leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
|
527 |
+
gr.Markdown("## Heatmap")
|
528 |
+
heatmap_image = gr.Image(label="", show_label=False)
|
529 |
+
leader_board.select(fn=load_heatmap, outputs=[heatmap_image])
|
530 |
+
|
531 |
+
# ============ Callbacks ============
|
532 |
+
|
533 |
included_models_cot.select(
|
534 |
fn=calculate_order_by_first_substring_cot,
|
535 |
inputs=[included_models_cot],
|
results_qwen/Llama-2-70b-chat-hf.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f5e5a2bcd63b330efb3c92c9d2bfc3a708cb14348dec1bf4e7eb34e604348efa
|
3 |
+
size 18452553
|
results_qwen/Llama-2-70b-chat-hf.jpg
ADDED
![]() |
Git LFS Details
|
results_qwen/Llama-2-70b-chat-hf.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d74e63ba62f3f074e16b0731f5d3f53ecd6f6d431ca6579a46fb95e8e0fc0494
|
3 |
+
size 18434995
|
results_qwen/Llama-2-70b-chat-hf.png
ADDED
![]() |
Git LFS Details
|
results_qwen/Mistral-7B-Instruct-v0.2.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7fc1fd3720541d6da41e0b3a8ba222576cf9deddc09483adeff44233c43e52b0
|
3 |
+
size 25120060
|
results_qwen/Mistral-7B-Instruct-v0.2.jpg
ADDED
![]() |
Git LFS Details
|
results_qwen/Mistral-7B-Instruct-v0.2.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e5c9386da642fcba5d2d83da27bcb9c43324ade40332fe1f0d449391c49e95bd
|
3 |
+
size 25132544
|
results_qwen/Mistral-7B-Instruct-v0.2.png
ADDED
![]() |
Git LFS Details
|
results_qwen/Qwen1.5-72B-Chat.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cd7d69ce42103b008ad375df143d73d9022725be435bb1585a392df01d588d4d
|
3 |
+
size 12095649
|
results_qwen/Qwen1.5-72B-Chat.jpg
ADDED
![]() |
Git LFS Details
|
results_qwen/Qwen1.5-72B-Chat.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6716ad73c760159b278364b9c67e1171cf44075148e306575cf57a4e14faf9d7
|
3 |
+
size 12128493
|
results_qwen/Qwen1.5-72B-Chat.png
ADDED
![]() |
Git LFS Details
|
results_qwen/StripedHyena-Nous-7B.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8699ac2a760851df8b3ce3b8531f5185e28cbad084494b71f758d6ede787f365
|
3 |
+
size 33824580
|
results_qwen/StripedHyena-Nous-7B.jpg
ADDED
![]() |
Git LFS Details
|
results_qwen/StripedHyena-Nous-7B.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5e17907d0209ed8478772675566defd213ecc6cb96106225225b842466fad986
|
3 |
+
size 33818513
|
results_qwen/StripedHyena-Nous-7B.png
ADDED
![]() |
Git LFS Details
|
results_qwen/Yi-34B-Chat.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1e65a08895fd3a369c29db7ed8e4a58399bc579689f33a6845594632d4d16346
|
3 |
+
size 18312597
|
results_qwen/Yi-34B-Chat.jpg
ADDED
![]() |
Git LFS Details
|
results_qwen/Yi-34B-Chat.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5468af21ae979604dfadd5de9f3c85c550b520deb17bb03130564b47b21334a9
|
3 |
+
size 18366214
|
results_qwen/Yi-34B-Chat.png
ADDED
![]() |
Git LFS Details
|
results_qwen/claude-3-sonnet-20240229.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:04d09742be7bb2f08133917006a0b1df70233566b4e2eb87393965beaedf37c5
|
3 |
+
size 20960824
|
results_qwen/claude-3-sonnet-20240229.jpg
ADDED
![]() |
Git LFS Details
|
results_qwen/claude-3-sonnet-20240229.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a75b7af6e5dede497165284750fffb7acf9bf287d53b447f75e734c0e69c306
|
3 |
+
size 20960376
|
results_qwen/claude-3-sonnet-20240229.png
ADDED
![]() |
Git LFS Details
|
results_qwen/dbrx-instruct.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:56ee4a0903ab2b5c5d1c478ccaee9063c93a6e82602aead01aa0c83ea75ab17a
|
3 |
+
size 15793228
|
results_qwen/dbrx-instruct.jpg
ADDED
![]() |
Git LFS Details
|
results_qwen/dbrx-instruct.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2ce392c07fc7c1c7c64d941f7b77a06614f1b51f76ce5d5947dafa0191ddf8ee
|
3 |
+
size 15820291
|
results_qwen/dbrx-instruct.png
ADDED
![]() |
Git LFS Details
|
results_qwen/gpt-35-turbo.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3789603ef21192c8267df2bfc434e536c06bb36387ee753cfc079ca5ca062367
|
3 |
+
size 8664643
|
results_qwen/gpt-35-turbo.jpg
ADDED
![]() |
Git LFS Details
|
results_qwen/gpt-35-turbo.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c505f8733712a8076c79a5d0e7c78773eb558364ba3baa08c540673bb4de3bdc
|
3 |
+
size 8672346
|
results_qwen/gpt-35-turbo.png
ADDED
![]() |
Git LFS Details
|