Commit c604eb0 · 1 Parent(s): 2472705 · committed by tohid.abedini
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: LLM Leaderboard
+title: Persian LLM Leaderboard
 emoji: 🏅
 colorFrom: red
 colorTo: red
@@ -8,7 +8,7 @@ sdk_version: 4.42.0
 app_file: app.py
 pinned: false
 license: mit
-short_description: Part LLM Leaderboard
+short_description: Persian LLM Leaderboard
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -9,36 +9,30 @@ from utils import LLM_BENCHMARKS_ABOUT_TEXT, LLM_BENCHMARKS_SUBMIT_TEXT, custom_
 abs_path = Path(__file__).parent
 
 # Any pandas-compatible data
-persian_df = jsonl_to_dataframe(str(abs_path / "leaderboard_persian.jsonl"))
-base_df = jsonl_to_dataframe(str(abs_path / "leaderboard_base.jsonl"))
+leaderboard_df = jsonl_to_dataframe(str(abs_path / "leaderboard_data.jsonl"))
 
 
 all_columns = ["Model", "Average ⬆️", "Precision", "#Params (B)", "Part Multiple Choice", "ARC Easy", "ARC Challenge", "MMLU Pro", "GSM8k Persian", "Multiple Choice Persian"]
 columns_to_average = ["Part Multiple Choice", "ARC Easy", "ARC Challenge", "MMLU Pro", "GSM8k Persian", "Multiple Choice Persian"]
 
 
-base_df = add_average_column_to_df(base_df, columns_to_average, index=3)
-persian_df = add_average_column_to_df(persian_df, columns_to_average, index=3)
+leaderboard_df = add_average_column_to_df(leaderboard_df, columns_to_average, index=3)
 
-base_df = apply_markdown_format_for_columns(df=base_df, model_column_name="Model")
-persian_df = apply_markdown_format_for_columns(df=persian_df, model_column_name="Model")
+leaderboard_df = apply_markdown_format_for_columns(df=leaderboard_df, model_column_name="Model")
 
-columns_data_type = ["markdown" for i in range(len(persian_df.columns))]
+columns_data_type = ["markdown" for i in range(len(leaderboard_df.columns))]
 # "str", "number", "bool", "date", "markdown"
 # columns_data_type[0] = "markdown"
 
 
 with gr.Blocks(css=custom_css) as demo:
     gr.Markdown("""
-    # Part LLM Leaderboard
+    # Persian LLM Leaderboard
     """)
 
     with gr.Tab("🎖️ Persian Leaderboard"):
-        gr.Markdown("""## Persian LLM Leaderboard
-        Evaluating Persian Fine-Tuned models
-        """)
         Leaderboard(
-            value=persian_df,
+            value=leaderboard_df,
             datatype=columns_data_type,
             select_columns=SelectColumns(
                 default_selection=all_columns,
@@ -49,22 +43,6 @@ with gr.Blocks(css=custom_css) as demo:
             hide_columns=["model_name_for_query",],
             filter_columns=["Precision", "#Params (B)"],
         )
-    with gr.Tab("🥇 Base Leaderboard"):
-        gr.Markdown("""## Base LLM Leaderboard
-        Evaluating Base Models
-        """)
-        Leaderboard(
-            value=base_df,
-            datatype= columns_data_type,
-            select_columns=SelectColumns(
-                default_selection=all_columns,
-                cant_deselect=["Model"],
-                label="Select Columns to Show",
-            ),
-            search_columns=["model_name_for_query"],
-            hide_columns=["model_name_for_query",],
-            filter_columns=["Precision", "#Params (B)"],
-        )
     with gr.TabItem("📝 About"):
         gr.Markdown(LLM_BENCHMARKS_ABOUT_TEXT)
 
 
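For context on the calls above: app.py relies on three helpers imported from utils.py, which this commit leaves untouched. A minimal sketch of what they plausibly look like, assuming pandas and a markdown-link format for the model column — the function names and call signatures come from the diff, the bodies are assumptions:

```python
import pandas as pd


def jsonl_to_dataframe(path: str) -> pd.DataFrame:
    # One JSON object per line, matching leaderboard_data.jsonl.
    return pd.read_json(path, lines=True)


def add_average_column_to_df(df: pd.DataFrame, columns: list[str], index: int = 3) -> pd.DataFrame:
    # Insert an "Average ⬆️" column at a fixed position; assumed here to be
    # the plain arithmetic mean of the benchmark columns.
    df.insert(index, "Average ⬆️", df[columns].mean(axis=1).round(2))
    return df


def apply_markdown_format_for_columns(df: pd.DataFrame, model_column_name: str) -> pd.DataFrame:
    # Turn the bare model id into a markdown link so the Leaderboard's
    # "markdown" datatype renders it as a clickable Hugging Face link.
    df[model_column_name] = df[model_column_name].apply(
        lambda m: f"[{m}](https://huggingface.co/{m})"
    )
    return df
```

With a markdown-link model column, every column needs the "markdown" datatype for the links to render, which is presumably why columns_data_type is a uniform list of "markdown" strings.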
leaderboard_base.jsonl → leaderboard_data.jsonl RENAMED
File without changes
leaderboard_persian.jsonl DELETED
@@ -1,16 +0,0 @@
-{"Model": "PartAI/Dorna-Llama3-8B-Instruct", "Precision": "bfloat16", "#Params (B)": 8.03, "Part Multiple Choice": 33.88, "ARC Easy": 70.40, "ARC Challenge": 61.07, "MMLU Pro": 23.39, "GSM8k Persian": 11.09, "Multiple Choice Persian": 52.86, "Hub License": "llama3", "Model sha": "main", "model_name_for_query": "PartAI/Dorna-Llama3-8B-Instruct"}
-{"Model":"meta-llama/Llama-3.2-1B-Instruct", "Precision": "bfloat16", "#Params (B)": 1.24, "Part Multiple Choice": 28.28, "ARC Easy": 47.10, "ARC Challenge": 39.00, "MMLU Pro": 12.17, "GSM8k Persian": 2.77, "Multiple Choice Persian": 36.88, "Hub License": "llama3.2", "Model sha": "main", "model_name_for_query": "meta-llama/Llama-3.2-1B-Instruct"}
-{"Model":"meta-llama/Llama-3.1-8B-Instruct", "Precision": "bfloat16", "#Params (B)": 8.03, "Part Multiple Choice": 36.68, "ARC Easy": 78.40, "ARC Challenge": 60.40, "MMLU Pro": 21.00, "GSM8k Persian": 12.48, "Multiple Choice Persian": 54.24, "Hub License": "llama3.1", "Model sha": "main", "model_name_for_query": "meta-llama/Llama-3.1-8B-Instruct"}
-{"Model":"meta-llama/Meta-Llama-3-8B-Instruct", "Precision": "bfloat16", "#Params (B)": 8.03, "Part Multiple Choice": 34.99, "ARC Easy": 72.90, "ARC Challenge": 57.70, "MMLU Pro": 25.54, "GSM8k Persian": 12.48, "Multiple Choice Persian": 53.85, "Hub License": "llama3", "Model sha": "main", "model_name_for_query": "meta-llama/Meta-Llama-3-8B-Instruct"}
-{"Model":"CohereForAI/aya-23-8B", "Precision": "float16", "#Params (B)": 8.03, "Part Multiple Choice": 32.82, "ARC Easy": 80.46, "ARC Challenge": 64.43, "MMLU Pro": 18.62, "GSM8k Persian": 7.13, "Multiple Choice Persian": 52.86, "Hub License": "cc-by-nc-4.0", "Model sha": "main", "model_name_for_query": "CohereForAI/aya-23-8B"}
-{"Model":"CohereForAI/aya-23-35B", "Precision": "float16", "#Params (B)": 35, "Part Multiple Choice": 36.79, "ARC Easy": 87.93, "ARC Challenge": 72.48, "MMLU Pro": 25.54, "GSM8k Persian": 16.24, "Multiple Choice Persian": 61.14, "Hub License": "cc-by-nc-4.0", "Model sha": "main", "model_name_for_query": "CohereForAI/aya-23-35B"}
-{"Model":"CohereForAI/aya-expanse-8b", "Precision": "float16", "#Params (B)": 8.03, "Part Multiple Choice": 34.91, "ARC Easy": 79.60, "ARC Challenge": 70.47, "MMLU Pro": 25.06, "GSM8k Persian": 11.68, "Multiple Choice Persian": 58.38, "Hub License": "cc-by-nc-4.0", "Model sha": "main", "model_name_for_query": "CohereForAI/aya-expanse-8b"}
-{"Model":"CohereForAI/aya-expanse-32b", "Precision": "float16", "#Params (B)": 32.3, "Part Multiple Choice": 43.36, "ARC Easy": 93.10, "ARC Challenge": 79.87, "MMLU Pro": 31.03, "GSM8k Persian": 20.40, "Multiple Choice Persian": 62.33, "Hub License": "cc-by-nc-4.0", "Model sha": "main", "model_name_for_query": "CohereForAI/aya-expanse-32b"}
-{"Model":"Qwen/Qwen2-7B-Instruct", "Precision": "bfloat16", "#Params (B)": 7.62, "Part Multiple Choice": 35.90, "ARC Easy": 77.30, "ARC Challenge": 68.46, "MMLU Pro": 23.87, "GSM8k Persian": 14.06, "Multiple Choice Persian": 51.68, "Hub License": "apache-2.0", "Model sha": "main", "model_name_for_query": "Qwen/Qwen2-7B-Instruct"}
-{"Model":"Qwen/Qwen2.5-7B-Instruct", "Precision": "bfloat16", "#Params (B)": 7.62, "Part Multiple Choice": 36.72, "ARC Easy": 79.02, "ARC Challenge": 69.13, "MMLU Pro": 21.96, "GSM8k Persian": 19.80, "Multiple Choice Persian": 52.66, "Hub License": "apache-2.0", "Model sha": "main", "model_name_for_query": "Qwen/Qwen2.5-7B-Instruct"}
-{"Model":"Qwen/Qwen2.5-32B-Instruct", "Precision": "bfloat16", "#Params (B)": 32.8, "Part Multiple Choice": 46.06, "ARC Easy": 90.80, "ARC Challenge": 85.91, "MMLU Pro": 38.19, "GSM8k Persian": 42.97, "Multiple Choice Persian": 61.34, "Hub License": "apache-2.0", "Model sha": "main", "model_name_for_query": "Qwen/Qwen2.5-32B-Instruct"}
-{"Model":"google/gemma-2-2b-it", "Precision": "bfloat16", "#Params (B)": 2.61, "Part Multiple Choice": 31.12, "ARC Easy": 71.26, "ARC Challenge": 57.72, "MMLU Pro": 16.23, "GSM8k Persian": 4.95, "Multiple Choice Persian": 49.90, "Hub License": "gemma", "Model sha": "main", "model_name_for_query": "google/gemma-2-2b-it"}
-{"Model":"google/gemma-2-9b-it", "Precision": "bfloat16", "#Params (B)": 9.24, "Part Multiple Choice": 42.70, "ARC Easy": 93.10, "ARC Challenge": 84.56, "MMLU Pro": 31.74, "GSM8k Persian": 20.00, "Multiple Choice Persian": 62.33, "Hub License": "gemma", "Model sha": "main", "model_name_for_query": "google/gemma-2-9b-it"}
-{"Model":"google/gemma-2-27b-it", "Precision": "bfloat16", "#Params (B)": 27.2, "Part Multiple Choice": 46.03, "ARC Easy": 95.98, "ARC Challenge": 85.91, "MMLU Pro": 36.28, "GSM8k Persian": 25.54, "Multiple Choice Persian": 63.12, "Hub License": "gemma", "Model sha": "main", "model_name_for_query": "google/gemma-2-27b-it"}
-{"Model":"universitytehran/PersianMind-v1.0", "Precision": "bfloat16", "#Params (B)": 6.82, "Part Multiple Choice": 29.27, "ARC Easy": 58.91, "ARC Challenge": 48.32, "MMLU Pro": 15.51, "GSM8k Persian": 3.56, "Multiple Choice Persian": 45.36, "Hub License": "cc-by-nc-sa-4.0", "Model sha": "main", "model_name_for_query": "universitytehran/PersianMind-v1.0"}
-{"Model":"MaralGPT/Maral-7B-alpha-1", "Precision": "bfloat16", "#Params (B)": 7.24, "Part Multiple Choice": 26.67, "ARC Easy": 44.54, "ARC Challenge": 30.87, "MMLU Pro": 15.99, "GSM8k Persian": 7.33, "Multiple Choice Persian": 36.09, "Hub License": "mit", "Model sha": "main", "model_name_for_query": "MaralGPT/Maral-7B-alpha-1"}
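The deleted rows carry raw benchmark scores only; the "Average ⬆️" column shown in the app is derived at load time. A quick check for the first deleted row, assuming the plain-mean behavior sketched earlier:

```python
# Scores for PartAI/Dorna-Llama3-8B-Instruct, copied from row 1 of the deleted file.
scores = [33.88, 70.40, 61.07, 23.39, 11.09, 52.86]
print(sum(scores) / len(scores))  # ≈ 42.1, assuming the average is a plain mean
```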
utils.py CHANGED
@@ -110,7 +110,7 @@ body, .gradio-container, .gr-button, .gr-input, .gr-slider, .gr-dropdown, .gr-ma
 """
 
 LLM_BENCHMARKS_ABOUT_TEXT = f"""
-# Persian LLM Evaluation Leaderboard (v1)
+# Persian LLM Leaderboard (v1.0.0)
 
 > The Persian LLM Evaluation Leaderboard, developed by **Part DP AI** in collaboration with **AUT (Amirkabir University of Technology) NLP Lab**, provides a comprehensive benchmarking system specifically designed for Persian LLMs. This leaderboard, based on the open-source [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), offers a unique platform for evaluating the performance of large language models (LLMs) on tasks that demand linguistic proficiency and technical skill in Persian.
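The About text credits EleutherAI's LM Evaluation Harness for the scores. For orientation, a sketch of how a row like the ones above could be produced with the harness's Python API (lm-eval 0.4+): the `simple_evaluate` entry point is real, but the task names below are the harness's stock English ARC tasks — the leaderboard's Persian task configs are not part of this commit and are assumed to be custom:

```python
import lm_eval

# Sketch only: evaluate one leaderboard model on two stock harness tasks.
# Persian variants (e.g. "GSM8k Persian") would require custom task configs.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=PartAI/Dorna-Llama3-8B-Instruct,dtype=bfloat16",
    tasks=["arc_easy", "arc_challenge"],
    batch_size=8,
)
print(results["results"])
```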