import json
import os
import gradio as gr
import pandas as pd
from envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Vazirmatn&display=swap');
body, .gradio-container, .gr-button, .gr-input, .gr-slider, .gr-dropdown, .gr-markdown {
font-family: 'Vazirmatn', sans-serif !important;
}
.markdown-text {
font-size: 16px !important;
}
#models-to-add-text {
font-size: 18px !important;
}
#citation-button span {
font-size: 16px !important;
}
#citation-button textarea {
font-size: 16px !important;
}
#citation-button > label > button {
margin: 6px;
transform: scale(1.3);
}
#leaderboard-table {
margin-top: 15px
}
#leaderboard-table-lite {
margin-top: 15px
}
#search-bar-table-box > div:first-child {
background: none;
border: none;
}
#search-bar {
padding: 0px;
}
/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
#leaderboard-table td:nth-child(2),
#leaderboard-table th:nth-child(2) {
max-width: 400px;
overflow: auto;
white-space: nowrap;
}
.tab-buttons button {
font-size: 20px;
}
#scale-logo {
border-style: none !important;
box-shadow: none;
display: block;
margin-left: auto;
margin-right: auto;
max-width: 600px;
}
#scale-logo .download {
display: none;
}
#filter_type{
border: 0;
padding-left: 0;
padding-top: 0;
}
#filter_type label {
display: flex;
}
#filter_type label > span{
margin-top: var(--spacing-lg);
margin-right: 0.5em;
}
#filter_type label > .wrap{
width: 103px;
}
#filter_type label > .wrap .wrap-inner{
padding: 2px;
}
#filter_type label > .wrap .wrap-inner input{
width: 1px
}
#filter-columns-type{
border: 0;
padding: 0.5em; /* the original value "0.5" had no unit; em is an assumption */
}
#filter-columns-size{
border: 0;
padding: 0.5em; /* the original value "0.5" had no unit; em is an assumption */
}
#box-filter > .form{
border: 0
}
"""
LLM_BENCHMARKS_ABOUT_TEXT = """
## Persian LLM Evaluation Leaderboard (v1)
The Persian LLM Evaluation Leaderboard, developed by **Part DP AI** in collaboration with **AUT (Amirkabir University of Technology) NLP Lab**, provides a comprehensive benchmarking system specifically designed for Persian language models. This leaderboard, based on the open-source [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), offers a unique platform for evaluating the performance of large language models (LLMs) on tasks that demand linguistic proficiency and technical skill in Persian.
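Because the evaluation runs through the harness, a model can be smoke-tested locally in much the same way before submission. The snippet below is only a sketch: the task name is a placeholder, not one of the leaderboard's registered tasks, and the exact entry points may differ between harness versions (this uses the v0.4-style Python API).

```python
import lm_eval

# Rough local check with the LM Evaluation Harness.
# "persian_multiple_choice" is a placeholder task name for illustration only.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=your-org/your-model",
    tasks=["persian_multiple_choice"],
    num_fewshot=0,
)
print(results["results"])
```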
## Key Features
1. **Open Evaluation Access**
The leaderboard allows open participation, meaning that developers and researchers working with open-source models can submit evaluation requests for their models. This accessibility encourages the development and testing of Persian LLMs within the broader AI ecosystem.
2. **Task Diversity**
Six specialized tasks have been curated for this leaderboard, each tailored to challenge different aspects of a model’s capabilities. These tasks include:
- **Part Multiple Choice**
- **ARC Easy**
- **ARC Challenging**
- **MMLU Pro**
- **GSM8k Persian**
- **Multiple Choice Persian**
Each dataset is available in Persian, providing a robust testing ground for models in a non-English setting.
3. **Open-Source Dataset Sample**
A sample of the evaluation dataset is hosted on [Hugging Face Datasets](https://huggingface.co/datasets/PartAI/llm-leaderboard-datasets-sample), offering the AI community a glimpse of the benchmark content and format. This sample allows developers to pre-assess their models against representative data before a full leaderboard evaluation (a short loading example follows this feature list).
4. **Collaborative Development**
This leaderboard represents a significant collaboration between Part AI and Professor Saeedeh Momtazi of Amirkabir University of Technology, leveraging academic research and industrial expertise to create a high-quality, open benchmarking tool. The partnership underscores a shared commitment to advancing Persian-language AI technologies.
5. **Comprehensive Evaluation Pipeline**
By integrating a standardized evaluation pipeline, models are assessed across a variety of data types, including text, mathematical formulas, and numerical data. This multi-faceted approach enhances the evaluation’s reliability and allows for precise, nuanced assessment of model performance across multiple dimensions.
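As referenced above, the public sample can be pulled locally before requesting a full evaluation. This is only a sketch: the repository defines its own configuration and split names, which are not assumed here.

```python
from datasets import load_dataset

# Load the public sample of the leaderboard benchmark data.
sample = load_dataset("PartAI/llm-leaderboard-datasets-sample")
print(sample)  # inspect the available splits and columns
```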
## Background and Goals
Recent months have seen a notable increase in the development of Persian language models by research centers and AI companies in Iran. However, the lack of reliable, standardized benchmarks for Persian models has made it challenging to evaluate model quality comprehensively. Global benchmarks typically do not support Persian, resulting in skewed or unreliable results for Persian-based AI.
This leaderboard addresses this gap by providing a locally-focused, transparent system that enables consistent, fair comparisons of Persian models. It is expected to be a valuable tool for Persian-speaking businesses and developers, allowing them to select models best suited to their needs. Researchers and model developers also benefit from the competitive environment, with opportunities to showcase and improve their models based on benchmark rankings.
## Data Privacy and Integrity
To maintain evaluation integrity and prevent overfitting or data leakage, only part of the benchmark dataset is openly available. This limited access approach upholds model evaluation reliability, ensuring that results are genuinely representative of each model’s capabilities across unseen data.
The leaderboard represents a significant milestone in Persian language AI and is positioned to become the leading standard for LLM evaluation in the Persian-speaking world.
"""
LLM_BENCHMARKS_SUBMIT_TEXT = """## Submit your model
Submit your model to the leaderboard using the form below AFTER completing the following steps:
- Create a Hugging Face account and request to join the [MLSB organization](https://huggingface.co/MLSB)
- Create a new space in the MLSB organization and add your model using the inference templates: https://huggingface.co/new-space?owner=MLSB
- Fill in the submission form.
## Prerequisites:
To qualify for submission, each team must:
- Provide an MLSB submission ID (found on CMT) or a link to a preprint/paper describing your methodology. This publication does not have to specifically report training or evaluation on the P(L)INDER dataset. Previously published methods, such as DiffDock, only need to link their existing paper. Note that entry into this competition does not equate to an MLSB workshop paper submission.
- Create a copy of the provided [inference templates](https://huggingface.co/MLSB/).
- Go to the top right corner of the respective inference template's page, click the drop-down menu (vertical ellipsis) next to the "Community" button, and select "Duplicate this space".
- Change the files in the newly created space to reflect the specifics of your model:
- Edit `requirements.txt` to capture all Python dependencies.
- Modify the Dockerfile as appropriate (including selecting the right base image).
- Include an `inference_app.py` file. It contains a `predict` function that should be modified to reflect the specifics of inference with your model (a minimal sketch follows this list).
- Include a `train.py` file to ensure that training and model selection use only the Persian/Base datasets and to clearly show any additional hyperparameters used.
- Provide a LICENSE file that allows for reuse, derivative works, and distribution of the provided software and weights (e.g., MIT or Apache2 license).
- Submit to the leaderboard via the [form below](https://huggingface.co/spaces/MLSB/leaderboard2024).
- On the submission page, add a reference to the newly created space in the format username/space (e.g., mlsb/alphafold3). You can create the space under your personal Hugging Face account and transfer it to MLSB at submission time so that a GPU can be assigned.
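A minimal sketch of the expected `inference_app.py` shape is shown below. It only illustrates the general structure; the actual `predict` signature, inputs, and outputs are defined by the MLSB inference template you duplicate, so the argument and file names here are placeholders.

```python
import gradio as gr

def predict(input_file):
    # Placeholder: run your model on the uploaded input and return the
    # path to the generated prediction file.
    output_path = "prediction.out"
    return output_path

# Expose the predict function as a simple file-in / file-out interface.
demo = gr.Interface(fn=predict, inputs=gr.File(), outputs=gr.File())

if __name__ == "__main__":
    demo.launch()
```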
After a brief technical review by our organizers, we will grant you a free GPU until the MLSB workshop so that anyone can try the model, and we will run the evaluation.
If you have any questions, please email: [email protected]
"""
def load_jsonl(input_file):
data = []
with open(input_file, 'r') as f:
for line in f:
data.append(json.loads(line))
return data
def jsonl_to_dataframe(input_file):
data = load_jsonl(input_file)
return pd.DataFrame(data)
def add_average_column_to_df(df, columns_to_average, index=3):
average_column = df[columns_to_average].mean(axis=1)
df.insert(index, "Average ⬆️", average_column)
return df
def model_hyperlink(link, model_name):
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
def make_clickable_model(model_name):
link = f"https://huggingface.co/{model_name}"
return model_hyperlink(link, model_name)
def apply_clickable_model(df, column_name):
df[column_name] = df[column_name].apply(make_clickable_model)
return df
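# Sketch of how these helpers compose when building the leaderboard table.
# The file name, score columns, and "Model" column below are assumptions for
# illustration only:
#
#   df = jsonl_to_dataframe("results.jsonl")
#   df = add_average_column_to_df(df, columns_to_average=["Task A", "Task B"])
#   df = apply_clickable_model(df, column_name="Model")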
def submit(model_name, model_id, contact_email, challenge, submission_id, paper_link, architecture, license):
if model_name == "" or model_id == "" or challenge == "" or architecture == "" or license == "":
gr.Error("Please fill all the fields")
return
if submission_id == "" and paper_link == "":
gr.Error("Provide either a link to a paper describing the method or a submission ID for the MLSB workshop.")
return
    try:
        # Split the repo id into namespace and model name; fall back gracefully
        # when no namespace is given so that model_path is always defined.
        if "/" in model_id:
            user_name, model_path = model_id.split("/", 1)
        else:
            user_name, model_path = "", model_id
eval_entry = {
"model_name": model_name,
"model_id": model_id,
"challenge": challenge,
"submission_id": submission_id,
"architecture": architecture,
"license": license
}
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
os.makedirs(OUT_DIR, exist_ok=True)
out_path = f"{OUT_DIR}/{user_name}_{model_path}.json"
with open(out_path, "w") as f:
f.write(json.dumps(eval_entry))
print("Uploading eval file")
API.upload_file(
path_or_fileobj=out_path,
path_in_repo=out_path.split("eval-queue/")[1],
repo_id=QUEUE_REPO,
repo_type="dataset",
commit_message=f"Add {model_name} to eval queue",
)
gr.Info("Successfully submitted", duration=10)
# Remove the local file
os.remove(out_path)
    except Exception as e:
        # Report the failure instead of silently swallowing it with a bare except
        raise gr.Error(f"Error submitting the model: {e}")