tohid.abedini committed · Commit c604eb0
1 Parent(s): 2472705

test

Browse files:
- README.md +2 -2
- app.py +6 -28
- leaderboard_base.jsonl → leaderboard_data.jsonl +0 -0
- leaderboard_persian.jsonl +0 -16
- utils.py +1 -1
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: LLM Leaderboard
+title: Persian LLM Leaderboard
 emoji: 🏆
 colorFrom: red
 colorTo: red
@@ -8,7 +8,7 @@ sdk_version: 4.42.0
 app_file: app.py
 pinned: false
 license: mit
-short_description:
+short_description: Persian LLM Leaderboard
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -9,36 +9,30 @@ from utils import LLM_BENCHMARKS_ABOUT_TEXT, LLM_BENCHMARKS_SUBMIT_TEXT, custom_
 abs_path = Path(__file__).parent
 
 # Any pandas-compatible data
-persian_df = jsonl_to_dataframe(str(abs_path / "leaderboard_persian.jsonl"))
-base_df = jsonl_to_dataframe(str(abs_path / "leaderboard_base.jsonl"))
+leaderboard_df = jsonl_to_dataframe(str(abs_path / "leaderboard_data.jsonl"))
 
 
 all_columns = ["Model", "Average ⬆️", "Precision", "#Params (B)", "Part Multiple Choice", "ARC Easy", "ARC Challenge", "MMLU Pro", "GSM8k Persian", "Multiple Choice Persian"]
 columns_to_average = ["Part Multiple Choice", "ARC Easy", "ARC Challenge", "MMLU Pro", "GSM8k Persian", "Multiple Choice Persian"]
 
 
-base_df = add_average_column_to_df(base_df, columns_to_average, index=3)
-persian_df = add_average_column_to_df(persian_df, columns_to_average, index=3)
+leaderboard_df = add_average_column_to_df(leaderboard_df, columns_to_average, index=3)
 
-base_df = apply_markdown_format_for_columns(df=base_df, model_column_name="Model")
-persian_df = apply_markdown_format_for_columns(df=persian_df, model_column_name="Model")
+leaderboard_df = apply_markdown_format_for_columns(df=leaderboard_df, model_column_name="Model")
 
-columns_data_type = ["markdown" for i in range(len(persian_df.columns))]
+columns_data_type = ["markdown" for i in range(len(leaderboard_df.columns))]
 # "str", "number", "bool", "date", "markdown"
 # columns_data_type[0] = "markdown"
 
 
 with gr.Blocks(css=custom_css) as demo:
     gr.Markdown("""
-    #
+    # Persian LLM Leaderboard
     """)
 
     with gr.Tab("🏛️ Persian Leaderboard"):
-        gr.Markdown("""## Persian LLM Leaderboard
-                    Evaluating Persian Fine-Tuned models
-                    """)
         Leaderboard(
-            value=persian_df,
+            value=leaderboard_df,
             datatype=columns_data_type,
             select_columns=SelectColumns(
                 default_selection=all_columns,
@@ -49,22 +43,6 @@ with gr.Blocks(css=custom_css) as demo:
             hide_columns=["model_name_for_query",],
             filter_columns=["Precision", "#Params (B)"],
         )
-    with gr.Tab("🥇 Base Leaderboard"):
-        gr.Markdown("""## Base LLM Leaderboard
-                    Evaluating Base Models
-                    """)
-        Leaderboard(
-            value=base_df,
-            datatype= columns_data_type,
-            select_columns=SelectColumns(
-                default_selection=all_columns,
-                cant_deselect=["Model"],
-                label="Select Columns to Show",
-            ),
-            search_columns=["model_name_for_query"],
-            hide_columns=["model_name_for_query",],
-            filter_columns=["Precision", "#Params (B)"],
-        )
     with gr.TabItem("📝 About"):
         gr.Markdown(LLM_BENCHMARKS_ABOUT_TEXT)
 
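Note: app.py depends on three helpers imported from utils.py that this commit leaves untouched. As a reading aid only, here is a minimal sketch of what they plausibly do — the names, signatures, and the "Average ⬆️" column label come from the calls and column lists in the diff above, while the bodies are assumptions rather than the project's actual code:

```python
import json
import pandas as pd

def jsonl_to_dataframe(path: str) -> pd.DataFrame:
    # One JSON object per line becomes one dataframe row.
    with open(path, encoding="utf-8") as f:
        return pd.DataFrame([json.loads(line) for line in f if line.strip()])

def add_average_column_to_df(df: pd.DataFrame, columns: list[str], index: int) -> pd.DataFrame:
    # Mean over the benchmark columns, inserted at a fixed position
    # (index=3 in app.py places it among the leading metadata columns).
    df.insert(index, "Average ⬆️", df[columns].mean(axis=1).round(2))
    return df

def apply_markdown_format_for_columns(df: pd.DataFrame, model_column_name: str) -> pd.DataFrame:
    # Render each model name as a markdown link to its Hugging Face Hub page;
    # app.py accordingly sets every column's datatype to "markdown".
    df[model_column_name] = df[model_column_name].apply(
        lambda m: f"[{m}](https://huggingface.co/{m})"
    )
    return df
```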
|
leaderboard_base.jsonl → leaderboard_data.jsonl RENAMED
File without changes
leaderboard_persian.jsonl DELETED
@@ -1,16 +0,0 @@
-{"Model": "PartAI/Dorna-Llama3-8B-Instruct", "Precision": "bfloat16", "#Params (B)": 8.03, "Part Multiple Choice": 33.88, "ARC Easy": 70.40, "ARC Challenge": 61.07, "MMLU Pro": 23.39, "GSM8k Persian": 11.09, "Multiple Choice Persian": 52.86, "Hub License": "llama3", "Model sha": "main", "model_name_for_query": "PartAI/Dorna-Llama3-8B-Instruct"}
-{"Model":"meta-llama/Llama-3.2-1B-Instruct", "Precision": "bfloat16", "#Params (B)": 1.24, "Part Multiple Choice": 28.28, "ARC Easy": 47.10, "ARC Challenge": 39.00, "MMLU Pro": 12.17, "GSM8k Persian": 2.77, "Multiple Choice Persian": 36.88, "Hub License": "llama3.2", "Model sha": "main", "model_name_for_query": "meta-llama/Llama-3.2-1B-Instruct"}
-{"Model":"meta-llama/Llama-3.1-8B-Instruct", "Precision": "bfloat16", "#Params (B)": 8.03, "Part Multiple Choice": 36.68, "ARC Easy": 78.40, "ARC Challenge": 60.40, "MMLU Pro": 21.00, "GSM8k Persian": 12.48, "Multiple Choice Persian": 54.24, "Hub License": "llama3.1", "Model sha": "main", "model_name_for_query": "meta-llama/Llama-3.1-8B-Instruct"}
-{"Model":"meta-llama/Meta-Llama-3-8B-Instruct", "Precision": "bfloat16", "#Params (B)": 8.03, "Part Multiple Choice": 34.99, "ARC Easy": 72.90, "ARC Challenge": 57.70, "MMLU Pro": 25.54, "GSM8k Persian": 12.48, "Multiple Choice Persian": 53.85, "Hub License": "llama3", "Model sha": "main", "model_name_for_query": "meta-llama/Meta-Llama-3-8B-Instruct"}
-{"Model":"CohereForAI/aya-23-8B", "Precision": "float16", "#Params (B)": 8.03, "Part Multiple Choice": 32.82, "ARC Easy": 80.46, "ARC Challenge": 64.43, "MMLU Pro": 18.62, "GSM8k Persian": 7.13, "Multiple Choice Persian": 52.86, "Hub License": "cc-by-nc-4.0", "Model sha": "main", "model_name_for_query": "CohereForAI/aya-23-8B"}
-{"Model":"CohereForAI/aya-23-35B", "Precision": "float16", "#Params (B)": 35, "Part Multiple Choice": 36.79, "ARC Easy": 87.93, "ARC Challenge": 72.48, "MMLU Pro": 25.54, "GSM8k Persian": 16.24, "Multiple Choice Persian": 61.14, "Hub License": "cc-by-nc-4.0", "Model sha": "main", "model_name_for_query": "CohereForAI/aya-23-35B"}
-{"Model":"CohereForAI/aya-expanse-8b", "Precision": "float16", "#Params (B)": 8.03, "Part Multiple Choice": 34.91, "ARC Easy": 79.60, "ARC Challenge": 70.47, "MMLU Pro": 25.06, "GSM8k Persian": 11.68, "Multiple Choice Persian": 58.38, "Hub License": "cc-by-nc-4.0", "Model sha": "main", "model_name_for_query": "CohereForAI/aya-expanse-8b"}
-{"Model":"CohereForAI/aya-expanse-32b", "Precision": "float16", "#Params (B)": 32.3, "Part Multiple Choice": 43.36, "ARC Easy": 93.10, "ARC Challenge": 79.87, "MMLU Pro": 31.03, "GSM8k Persian": 20.40, "Multiple Choice Persian": 62.33, "Hub License": "cc-by-nc-4.0", "Model sha": "main", "model_name_for_query": "CohereForAI/aya-expanse-32b"}
-{"Model":"Qwen/Qwen2-7B-Instruct", "Precision": "bfloat16", "#Params (B)": 7.62, "Part Multiple Choice": 35.90, "ARC Easy": 77.30, "ARC Challenge": 68.46, "MMLU Pro": 23.87, "GSM8k Persian": 14.06, "Multiple Choice Persian": 51.68, "Hub License": "apache-2.0", "Model sha": "main", "model_name_for_query": "Qwen/Qwen2-7B-Instruct"}
-{"Model":"Qwen/Qwen2.5-7B-Instruct", "Precision": "bfloat16", "#Params (B)": 7.62, "Part Multiple Choice": 36.72, "ARC Easy": 79.02, "ARC Challenge": 69.13, "MMLU Pro": 21.96, "GSM8k Persian": 19.80, "Multiple Choice Persian": 52.66, "Hub License": "apache-2.0", "Model sha": "main", "model_name_for_query": "Qwen/Qwen2.5-7B-Instruct"}
-{"Model":"Qwen/Qwen2.5-32B-Instruct", "Precision": "bfloat16", "#Params (B)": 32.8, "Part Multiple Choice": 46.06, "ARC Easy": 90.80, "ARC Challenge": 85.91, "MMLU Pro": 38.19, "GSM8k Persian": 42.97, "Multiple Choice Persian": 61.34, "Hub License": "apache-2.0", "Model sha": "main", "model_name_for_query": "Qwen/Qwen2.5-32B-Instruct"}
-{"Model":"google/gemma-2-2b-it", "Precision": "bfloat16", "#Params (B)": 2.61, "Part Multiple Choice": 31.12, "ARC Easy": 71.26, "ARC Challenge": 57.72, "MMLU Pro": 16.23, "GSM8k Persian": 4.95, "Multiple Choice Persian": 49.90, "Hub License": "gemma", "Model sha": "main", "model_name_for_query": "google/gemma-2-2b-it"}
-{"Model":"google/gemma-2-9b-it", "Precision": "bfloat16", "#Params (B)": 9.24, "Part Multiple Choice": 42.70, "ARC Easy": 93.10, "ARC Challenge": 84.56, "MMLU Pro": 31.74, "GSM8k Persian": 20.00, "Multiple Choice Persian": 62.33, "Hub License": "gemma", "Model sha": "main", "model_name_for_query": "google/gemma-2-9b-it"}
-{"Model":"google/gemma-2-27b-it", "Precision": "bfloat16", "#Params (B)": 27.2, "Part Multiple Choice": 46.03, "ARC Easy": 95.98, "ARC Challenge": 85.91, "MMLU Pro": 36.28, "GSM8k Persian": 25.54, "Multiple Choice Persian": 63.12, "Hub License": "gemma", "Model sha": "main", "model_name_for_query": "google/gemma-2-27b-it"}
-{"Model":"universitytehran/PersianMind-v1.0", "Precision": "bfloat16", "#Params (B)": 6.82, "Part Multiple Choice": 29.27, "ARC Easy": 58.91, "ARC Challenge": 48.32, "MMLU Pro": 15.51, "GSM8k Persian": 3.56, "Multiple Choice Persian": 45.36, "Hub License": "cc-by-nc-sa-4.0", "Model sha": "main", "model_name_for_query": "universitytehran/PersianMind-v1.0"}
-{"Model":"MaralGPT/Maral-7B-alpha-1", "Precision": "bfloat16", "#Params (B)": 7.24, "Part Multiple Choice": 26.67, "ARC Easy": 44.54, "ARC Challenge": 30.87, "MMLU Pro": 15.99, "GSM8k Persian": 7.33, "Multiple Choice Persian": 36.09, "Hub License": "mit", "Model sha": "main", "model_name_for_query": "MaralGPT/Maral-7B-alpha-1"}
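Each deleted row holds one model's scores under the six benchmark keys listed in columns_to_average, so the "Average ⬆️" value the app derives is just their arithmetic mean. For example, checking the first deleted row by hand (a sketch; the two-decimal rounding convention is an assumption):

```python
# Scores copied from the PartAI/Dorna-Llama3-8B-Instruct row above.
scores = [33.88, 70.40, 61.07, 23.39, 11.09, 52.86]
average = sum(scores) / len(scores)  # 252.69 / 6 = 42.115
print(round(average, 2))             # 42.12, assuming two-decimal rounding
```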
utils.py CHANGED
@@ -110,7 +110,7 @@ body, .gradio-container, .gr-button, .gr-input, .gr-slider, .gr-dropdown, .gr-ma
 """
 
 LLM_BENCHMARKS_ABOUT_TEXT = f"""
-# Persian LLM
+# Persian LLM Leaderboard (v1.0.0)
 
 > The Persian LLM Evaluation Leaderboard, developed by **Part DP AI** in collaboration with **AUT (Amirkabir University of Technology) NLP Lab**, provides a comprehensive benchmarking system specifically designed for Persian LLMs. This leaderboard, based on the open-source [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), offers a unique platform for evaluating the performance of large language models (LLMs) on tasks that demand linguistic proficiency and technical skill in Persian.
 