Commit c604eb0 · 1 Parent(s): 2472705 · committed by tohid.abedini
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: LLM Leaderboard
+title: Persian LLM Leaderboard
 emoji: 🏅
 colorFrom: red
 colorTo: red
@@ -8,7 +8,7 @@ sdk_version: 4.42.0
 app_file: app.py
 pinned: false
 license: mit
-short_description: Part LLM Leaderboard
+short_description: Persian LLM Leaderboard
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -9,36 +9,30 @@ from utils import LLM_BENCHMARKS_ABOUT_TEXT, LLM_BENCHMARKS_SUBMIT_TEXT, custom_
 abs_path = Path(__file__).parent
 
 # Any pandas-compatible data
-persian_df = jsonl_to_dataframe(str(abs_path / "leaderboard_persian.jsonl"))
-base_df = jsonl_to_dataframe(str(abs_path / "leaderboard_base.jsonl"))
+leaderboard_df = jsonl_to_dataframe(str(abs_path / "leaderboard_data.jsonl"))
 
 
 all_columns = ["Model", "Average ⬆️", "Precision", "#Params (B)", "Part Multiple Choice", "ARC Easy", "ARC Challenge", "MMLU Pro", "GSM8k Persian", "Multiple Choice Persian"]
 columns_to_average = ["Part Multiple Choice", "ARC Easy", "ARC Challenge", "MMLU Pro", "GSM8k Persian", "Multiple Choice Persian"]
 
 
-base_df = add_average_column_to_df(base_df, columns_to_average, index=3)
-persian_df = add_average_column_to_df(persian_df, columns_to_average, index=3)
+leaderboard_df = add_average_column_to_df(leaderboard_df, columns_to_average, index=3)
 
-base_df = apply_markdown_format_for_columns(df=base_df, model_column_name="Model")
-persian_df = apply_markdown_format_for_columns(df=persian_df, model_column_name="Model")
+leaderboard_df = apply_markdown_format_for_columns(df=leaderboard_df, model_column_name="Model")
 
-columns_data_type = ["markdown" for i in range(len(persian_df.columns))]
+columns_data_type = ["markdown" for i in range(len(leaderboard_df.columns))]
 # "str", "number", "bool", "date", "markdown"
 # columns_data_type[0] = "markdown"
 
 
 with gr.Blocks(css=custom_css) as demo:
     gr.Markdown("""
-    # Part LLM Leaderboard
+    # Persian LLM Leaderboard
     """)
 
     with gr.Tab("🎖️ Persian Leaderboard"):
-        gr.Markdown("""## Persian LLM Leaderboard
-        Evaluating Persian Fine-Tuned models
-        """)
         Leaderboard(
-            value=persian_df,
+            value=leaderboard_df,
             datatype=columns_data_type,
             select_columns=SelectColumns(
                 default_selection=all_columns,
@@ -49,22 +43,6 @@ with gr.Blocks(css=custom_css) as demo:
             hide_columns=["model_name_for_query",],
             filter_columns=["Precision", "#Params (B)"],
         )
-    with gr.Tab("🥇 Base Leaderboard"):
-        gr.Markdown("""## Base LLM Leaderboard
-        Evaluating Base Models
-        """)
-        Leaderboard(
-            value=base_df,
-            datatype= columns_data_type,
-            select_columns=SelectColumns(
-                default_selection=all_columns,
-                cant_deselect=["Model"],
-                label="Select Columns to Show",
-            ),
-            search_columns=["model_name_for_query"],
-            hide_columns=["model_name_for_query",],
-            filter_columns=["Precision", "#Params (B)"],
-        )
     with gr.TabItem("📝 About"):
         gr.Markdown(LLM_BENCHMARKS_ABOUT_TEXT)
 
 
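For context on the calls above: app.py relies on three helpers imported from utils.py, which this commit leaves untouched. A minimal sketch of what they plausibly look like, assuming pandas and a markdown-link format for the model column — the function names and call signatures come from the diff, the bodies are assumptions:

```python
import pandas as pd


def jsonl_to_dataframe(path: str) -> pd.DataFrame:
    # One JSON object per line, matching leaderboard_data.jsonl.
    return pd.read_json(path, lines=True)


def add_average_column_to_df(df: pd.DataFrame, columns: list[str], index: int = 3) -> pd.DataFrame:
    # Insert an "Average ⬆️" column at a fixed position; assumed here to be
    # the plain arithmetic mean of the benchmark columns.
    df.insert(index, "Average ⬆️", df[columns].mean(axis=1).round(2))
    return df


def apply_markdown_format_for_columns(df: pd.DataFrame, model_column_name: str) -> pd.DataFrame:
    # Turn the bare model id into a markdown link so the Leaderboard's
    # "markdown" datatype renders it as a clickable Hugging Face link.
    df[model_column_name] = df[model_column_name].apply(
        lambda m: f"[{m}](https://huggingface.co/{m})"
    )
    return df
```

With a markdown-link model column, every column needs the "markdown" datatype for the links to render, which is presumably why columns_data_type is a uniform list of "markdown" strings.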
leaderboard_base.jsonl → leaderboard_data.jsonl RENAMED
File without changes
leaderboard_persian.jsonl DELETED
@@ -1,16 +0,0 @@
-{"Model": "PartAI/Dorna-Llama3-8B-Instruct", "Precision": "bfloat16", "#Params (B)": 8.03, "Part Multiple Choice": 33.88, "ARC Easy": 70.40, "ARC Challenge": 61.07, "MMLU Pro": 23.39, "GSM8k Persian": 11.09, "Multiple Choice Persian": 52.86, "Hub License": "llama3", "Model sha": "main", "model_name_for_query": "PartAI/Dorna-Llama3-8B-Instruct"}
-{"Model":"meta-llama/Llama-3.2-1B-Instruct", "Precision": "bfloat16", "#Params (B)": 1.24, "Part Multiple Choice": 28.28, "ARC Easy": 47.10, "ARC Challenge": 39.00, "MMLU Pro": 12.17, "GSM8k Persian": 2.77, "Multiple Choice Persian": 36.88, "Hub License": "llama3.2", "Model sha": "main", "model_name_for_query": "meta-llama/Llama-3.2-1B-Instruct"}
-{"Model":"meta-llama/Llama-3.1-8B-Instruct", "Precision": "bfloat16", "#Params (B)": 8.03, "Part Multiple Choice": 36.68, "ARC Easy": 78.40, "ARC Challenge": 60.40, "MMLU Pro": 21.00, "GSM8k Persian": 12.48, "Multiple Choice Persian": 54.24, "Hub License": "llama3.1", "Model sha": "main", "model_name_for_query": "meta-llama/Llama-3.1-8B-Instruct"}
-{"Model":"meta-llama/Meta-Llama-3-8B-Instruct", "Precision": "bfloat16", "#Params (B)": 8.03, "Part Multiple Choice": 34.99, "ARC Easy": 72.90, "ARC Challenge": 57.70, "MMLU Pro": 25.54, "GSM8k Persian": 12.48, "Multiple Choice Persian": 53.85, "Hub License": "llama3", "Model sha": "main", "model_name_for_query": "meta-llama/Meta-Llama-3-8B-Instruct"}
-{"Model":"CohereForAI/aya-23-8B", "Precision": "float16", "#Params (B)": 8.03, "Part Multiple Choice": 32.82, "ARC Easy": 80.46, "ARC Challenge": 64.43, "MMLU Pro": 18.62, "GSM8k Persian": 7.13, "Multiple Choice Persian": 52.86, "Hub License": "cc-by-nc-4.0", "Model sha": "main", "model_name_for_query": "CohereForAI/aya-23-8B"}
-{"Model":"CohereForAI/aya-23-35B", "Precision": "float16", "#Params (B)": 35, "Part Multiple Choice": 36.79, "ARC Easy": 87.93, "ARC Challenge": 72.48, "MMLU Pro": 25.54, "GSM8k Persian": 16.24, "Multiple Choice Persian": 61.14, "Hub License": "cc-by-nc-4.0", "Model sha": "main", "model_name_for_query": "CohereForAI/aya-23-35B"}
-{"Model":"CohereForAI/aya-expanse-8b", "Precision": "float16", "#Params (B)": 8.03, "Part Multiple Choice": 34.91, "ARC Easy": 79.60, "ARC Challenge": 70.47, "MMLU Pro": 25.06, "GSM8k Persian": 11.68, "Multiple Choice Persian": 58.38, "Hub License": "cc-by-nc-4.0", "Model sha": "main", "model_name_for_query": "CohereForAI/aya-expanse-8b"}
-{"Model":"CohereForAI/aya-expanse-32b", "Precision": "float16", "#Params (B)": 32.3, "Part Multiple Choice": 43.36, "ARC Easy": 93.10, "ARC Challenge": 79.87, "MMLU Pro": 31.03, "GSM8k Persian": 20.40, "Multiple Choice Persian": 62.33, "Hub License": "cc-by-nc-4.0", "Model sha": "main", "model_name_for_query": "CohereForAI/aya-expanse-32b"}
-{"Model":"Qwen/Qwen2-7B-Instruct", "Precision": "bfloat16", "#Params (B)": 7.62, "Part Multiple Choice": 35.90, "ARC Easy": 77.30, "ARC Challenge": 68.46, "MMLU Pro": 23.87, "GSM8k Persian": 14.06, "Multiple Choice Persian": 51.68, "Hub License": "apache-2.0", "Model sha": "main", "model_name_for_query": "Qwen/Qwen2-7B-Instruct"}
-{"Model":"Qwen/Qwen2.5-7B-Instruct", "Precision": "bfloat16", "#Params (B)": 7.62, "Part Multiple Choice": 36.72, "ARC Easy": 79.02, "ARC Challenge": 69.13, "MMLU Pro": 21.96, "GSM8k Persian": 19.80, "Multiple Choice Persian": 52.66, "Hub License": "apache-2.0", "Model sha": "main", "model_name_for_query": "Qwen/Qwen2.5-7B-Instruct"}
-{"Model":"Qwen/Qwen2.5-32B-Instruct", "Precision": "bfloat16", "#Params (B)": 32.8, "Part Multiple Choice": 46.06, "ARC Easy": 90.80, "ARC Challenge": 85.91, "MMLU Pro": 38.19, "GSM8k Persian": 42.97, "Multiple Choice Persian": 61.34, "Hub License": "apache-2.0", "Model sha": "main", "model_name_for_query": "Qwen/Qwen2.5-32B-Instruct"}
-{"Model":"google/gemma-2-2b-it", "Precision": "bfloat16", "#Params (B)": 2.61, "Part Multiple Choice": 31.12, "ARC Easy": 71.26, "ARC Challenge": 57.72, "MMLU Pro": 16.23, "GSM8k Persian": 4.95, "Multiple Choice Persian": 49.90, "Hub License": "gemma", "Model sha": "main", "model_name_for_query": "google/gemma-2-2b-it"}
-{"Model":"google/gemma-2-9b-it", "Precision": "bfloat16", "#Params (B)": 9.24, "Part Multiple Choice": 42.70, "ARC Easy": 93.10, "ARC Challenge": 84.56, "MMLU Pro": 31.74, "GSM8k Persian": 20.00, "Multiple Choice Persian": 62.33, "Hub License": "gemma", "Model sha": "main", "model_name_for_query": "google/gemma-2-9b-it"}
-{"Model":"google/gemma-2-27b-it", "Precision": "bfloat16", "#Params (B)": 27.2, "Part Multiple Choice": 46.03, "ARC Easy": 95.98, "ARC Challenge": 85.91, "MMLU Pro": 36.28, "GSM8k Persian": 25.54, "Multiple Choice Persian": 63.12, "Hub License": "gemma", "Model sha": "main", "model_name_for_query": "google/gemma-2-27b-it"}
-{"Model":"universitytehran/PersianMind-v1.0", "Precision": "bfloat16", "#Params (B)": 6.82, "Part Multiple Choice": 29.27, "ARC Easy": 58.91, "ARC Challenge": 48.32, "MMLU Pro": 15.51, "GSM8k Persian": 3.56, "Multiple Choice Persian": 45.36, "Hub License": "cc-by-nc-sa-4.0", "Model sha": "main", "model_name_for_query": "universitytehran/PersianMind-v1.0"}
-{"Model":"MaralGPT/Maral-7B-alpha-1", "Precision": "bfloat16", "#Params (B)": 7.24, "Part Multiple Choice": 26.67, "ARC Easy": 44.54, "ARC Challenge": 30.87, "MMLU Pro": 15.99, "GSM8k Persian": 7.33, "Multiple Choice Persian": 36.09, "Hub License": "mit", "Model sha": "main", "model_name_for_query": "MaralGPT/Maral-7B-alpha-1"}
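The deleted rows carry raw benchmark scores only; the "Average ⬆️" column shown in the app is derived at load time. A quick check for the first deleted row, assuming the plain-mean behavior sketched earlier:

```python
# Scores for PartAI/Dorna-Llama3-8B-Instruct, copied from row 1 of the deleted file.
scores = [33.88, 70.40, 61.07, 23.39, 11.09, 52.86]
print(sum(scores) / len(scores))  # ≈ 42.1, assuming the average is a plain mean
```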
utils.py CHANGED
@@ -110,7 +110,7 @@ body, .gradio-container, .gr-button, .gr-input, .gr-slider, .gr-dropdown, .gr-ma
 """
 
 LLM_BENCHMARKS_ABOUT_TEXT = f"""
-# Persian LLM Evaluation Leaderboard (v1)
+# Persian LLM Leaderboard (v1.0.0)
 
 > The Persian LLM Evaluation Leaderboard, developed by **Part DP AI** in collaboration with **AUT (Amirkabir University of Technology) NLP Lab**, provides a comprehensive benchmarking system specifically designed for Persian LLMs. This leaderboard, based on the open-source [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), offers a unique platform for evaluating the performance of large language models (LLMs) on tasks that demand linguistic proficiency and technical skill in Persian.
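The About text credits EleutherAI's LM Evaluation Harness for the scores. For orientation, a sketch of how a row like the ones above could be produced with the harness's Python API (lm-eval 0.4+): the `simple_evaluate` entry point is real, but the task names below are the harness's stock English ARC tasks — the leaderboard's Persian task configs are not part of this commit and are assumed to be custom:

```python
import lm_eval

# Sketch only: evaluate one leaderboard model on two stock harness tasks.
# Persian variants (e.g. "GSM8k Persian") would require custom task configs.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=PartAI/Dorna-Llama3-8B-Instruct,dtype=bfloat16",
    tasks=["arc_easy", "arc_challenge"],
    batch_size=8,
)
print(results["results"])
```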