Commit 766b3f7 · Parent: 7c77550
sync wi FIDLE
Files changed:
- src/about.py +26 -4
- src/display/utils.py +15 -14
- src/envs.py +4 -4
- src/leaderboard/read_evals.py +12 -8
- src/populate.py +7 -1
- src/submission/check_validity.py +1 -1
- src/submission/submit.py +1 -1
src/about.py
CHANGED
@@ -12,8 +12,12 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("community|ifeval-fr|0", "norm_acc", "IFEval-Fr")
+    task1 = Task("community|pr-fouras|0", "pr-fouras-qem", "Pr-Fouras")
+    task2 = Task("community|kangourou-to|0", "norm_acc", "Kangourou-TO")
+    task3 = Task("community|gpqa-fr|0", "norm_acc", "GPQA-Fr")
+    task4 = Task("community|bac-fr|0", "bac-fr-qem", "Bac-Fr")
+    task5 = Task("community|sornette|0", "norm_acc", "Sornette")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,11 +25,29 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">
+<img src="https://www.deepmama.com/images/fideval.png" alt="FIDLE Evaluator" width="100%">
+</h1>
+"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+-------------------------
+# FIDLE LLM-FR Leaderboard 🏆
+
+This is a leaderboard exclusively **in French**. We do not intend to become a reference for LLM evaluations. This is for informational and educational purposes only. Please cross-reference with other, more official leaderboards.
+
+**Note: the assessments have been adapted to reasoning language models**: all *tasks* are in generative mode, with no limit on token generation.
+* **IFEval-Fr**: French translation of [IFEval](https://huggingface.co/datasets/google/IFEval)
+* **Pr-Fouras**: "Père Fouras" riddles (e.g. [fan site](https://www.fan-fortboyard.fr/pages/fanzone/enigmes-du-pere-fouras/))
+* **Sornette**: classification of texts (GORAFI, Wikipedia, le saviez-vous, ...) into 4 categories - `burlesque et fantaisiste`, `ludique et didactique`, `insidieux et mensonger`, `moral et accablant`
+* **Kangourou-TO**: math quizzes from [Kangourou](www.mathkang.org). *Text Only*: only questions without figures.
+
+**Model Types**:
+* 🪨 - Base, Pretrained, Foundation Model
+* 💬 - Chat Model (Instruct, RLHF, DPO, ...)
+* 💅🏻 - Fine-tuned Model
+* 🤔 - Reasoning Model
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
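For context, each new entry above follows the template's Task dataclass, which ties a lighteval task key and a metric key in the results json to one leaderboard column. A minimal sketch, assuming the template's usual field names (benchmark, metric, col_name), which this diff does not show:

# Hedged sketch of the Task/Tasks pattern used above; field names are
# assumed from the standard demo-leaderboard template, not from this diff.
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # task key in the results json, e.g. "community|ifeval-fr|0"
    metric: str      # metric key in the results json, e.g. "norm_acc"
    col_name: str    # column name shown in the leaderboard, e.g. "IFEval-Fr"

class Tasks(Enum):
    task0 = Task("community|ifeval-fr|0", "norm_acc", "IFEval-Fr")
    task1 = Task("community|pr-fouras|0", "pr-fouras-qem", "Pr-Fouras")

# Downstream code iterates the enum to build one score column per task:
for task in Tasks:
    print(task.name, "->", task.value.col_name)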
src/display/utils.py
CHANGED
@@ -23,6 +23,7 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
+auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent("R", "number", True, never_hidden=True)])
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
@@ -30,15 +31,15 @@ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+#auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+#auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+#auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+#auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+#auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -62,10 +63,10 @@ class ModelDetails:
 
 
 class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="
-    FT = ModelDetails(name="fine-tuned", symbol="
-    IFT = ModelDetails(name="instruction-tuned", symbol="
-    RL = ModelDetails(name="RL-tuned", symbol="
+    PT = ModelDetails(name="pretrained", symbol="🪨")
+    FT = ModelDetails(name="fine-tuned", symbol="💅🏻")
+    IFT = ModelDetails(name="instruction-tuned", symbol="💬")
+    RL = ModelDetails(name="RL-tuned", symbol="🤔")
     Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
@@ -73,13 +74,13 @@ class ModelType(Enum):
 
     @staticmethod
     def from_str(type):
-        if "fine-tuned" in type or "
+        if "fine-tuned" in type or "💅🏻" in type:
             return ModelType.FT
-        if "pretrained" in type or "
+        if "pretrained" in type or "🪨" in type:
             return ModelType.PT
-        if "RL-tuned" in type or "
+        if "RL-tuned" in type or "🤔" in type:
             return ModelType.RL
-        if "instruction-tuned" in type or "
+        if "instruction-tuned" in type or "💬" in type:
             return ModelType.IFT
         return ModelType.Unknown
 
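Two things worth noting about the remapped symbols: the INTRODUCTION_TEXT legend labels 🤔 as "Reasoning Model" while the code keeps the template's RL member for it, and from_str matches either the type name or the symbol. A small hedged usage sketch, assuming only the ModelType/ModelDetails code shown above:

# Hypothetical inputs; each assertion follows directly from from_str above.
assert ModelType.from_str("pretrained").value.symbol == "🪨"
assert ModelType.from_str("💬") == ModelType.IFT            # chat / instruction-tuned
assert ModelType.from_str("RL-tuned 🤔") == ModelType.RL    # reasoning models reuse the RL-tuned slot
assert ModelType.from_str("anything else") == ModelType.Unknown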
src/envs.py
CHANGED
@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "
+OWNER = "FIDLE-CNRS" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/
-QUEUE_REPO = f"{OWNER}/
-RESULTS_REPO = f"{OWNER}/
+REPO_ID = f"{OWNER}/LLM-FR_leaderboard"
+QUEUE_REPO = f"{OWNER}/LLM-FR_requests"
+RESULTS_REPO = f"{OWNER}/LLM-FR_results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
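These three repositories are typically mirrored locally when the Space starts. A minimal sketch using huggingface_hub.snapshot_download; the local paths are assumptions (the template's EVAL_REQUESTS_PATH / EVAL_RESULTS_PATH locals are not part of this diff):

# Hedged sketch: pull the request queue and results datasets defined above.
from huggingface_hub import snapshot_download

EVAL_REQUESTS_PATH = "./eval-queue"    # assumed local mirror of QUEUE_REPO
EVAL_RESULTS_PATH = "./eval-results"   # assumed local mirror of RESULTS_REPO

snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=TOKEN)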
src/leaderboard/read_evals.py
CHANGED
@@ -31,6 +31,7 @@ class EvalResult:
     num_params: int = 0
     date: str = "" # submission date of request file
     still_on_hub: bool = False
+    rank: int = 0
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -38,7 +39,7 @@ class EvalResult:
         with open(json_filepath) as fp:
             data = json.load(fp)
 
-        config = data.get("
+        config = data.get("config_general")
 
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
@@ -109,21 +110,24 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-
+        # weighted average calculation
+        task_weights = [0.25, 0.25, 0.25, 0.09, 0.09, 0.07]
+        average = sum(np.array([v for v in self.results.values() if v is not None]) * np.array(task_weights)) / sum(task_weights)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
+            #AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
+            #AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            #AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
+            #AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            #AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumn.rank.name: self.rank,
         }
 
         for task in Tasks:
@@ -133,7 +137,7 @@ class EvalResult:
 
 
 def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model.
+    """Selects the correct request file for a given model."""
     request_files = os.path.join(
         requests_path,
         f"{model_name}_eval_request_*.json",
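The task weights above sum to 1.00 (3 × 0.25 + 2 × 0.09 + 0.07), so dividing by sum(task_weights) leaves the weighted sum unchanged; note also that the comprehension drops None scores, which would shift the alignment between scores and weights if a task result were missing. A worked example with made-up scores listed in the same order as the six tasks:

# Hypothetical scores only, to illustrate the weighted average above.
import numpy as np

scores = [80.0, 70.0, 60.0, 50.0, 40.0, 30.0]   # IFEval-Fr, Pr-Fouras, Kangourou-TO, GPQA-Fr, Bac-Fr, Sornette
task_weights = [0.25, 0.25, 0.25, 0.09, 0.09, 0.07]

average = sum(np.array(scores) * np.array(task_weights)) / sum(task_weights)
print(round(average, 2))  # 62.7 (vs. 55.0 for the unweighted mean)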
src/populate.py
CHANGED
@@ -14,11 +14,17 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False).reset_index(drop=True)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
+
+    # add rank column
+    df[AutoEvalColumn.rank.name] = df.index + 1
+    df.loc[0, AutoEvalColumn.rank.name] = '1 🥇'
+    df.loc[1, AutoEvalColumn.rank.name] = '2 🥈'
+    df.loc[2, AutoEvalColumn.rank.name] = '3 🥉'
     return df
 
 
src/submission/check_validity.py
CHANGED
@@ -31,7 +31,7 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
     return True, ""
 
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=
+def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=True, test_tokenizer=False) -> tuple[bool, str]:
    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
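A hedged usage sketch for the updated signature (model name and token are placeholders; the two-element return follows the tuple[bool, str] annotation above). Defaulting trust_remote_code to True means submissions with custom modeling code are accepted without an explicit opt-in:

ok, error = is_model_on_hub("some-org/some-model", revision="main", token=TOKEN, test_tokenizer=True)
if not ok:
    print(f"Submission rejected: {error}")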
src/submission/submit.py
CHANGED
@@ -81,7 +81,7 @@ def add_new_eval(
         "revision": revision,
         "precision": precision,
         "weight_type": weight_type,
-        "status": "
+        "status": "FINISHED",
         "submitted_time": current_time,
         "model_type": model_type,
         "likes": model_info.likes,