BertrandCabotIDRIS committed
Commit 766b3f7 · 1 parent: 7c77550

sync with FIDLE

src/about.py CHANGED
@@ -12,8 +12,12 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("community|ifeval-fr|0", "norm_acc", "IFEval-Fr")
+    task1 = Task("community|pr-fouras|0", "pr-fouras-qem", "Pr-Fouras")
+    task2 = Task("community|kangourou-to|0", "norm_acc", "Kangourou-TO")
+    task3 = Task("community|gpqa-fr|0", "norm_acc", "GPQA-Fr")
+    task4 = Task("community|bac-fr|0", "bac-fr-qem", "Bac-Fr")
+    task5 = Task("community|sornette|0", "norm_acc", "Sornette")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,11 +25,29 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">
+<img src="https://www.deepmama.com/images/fideval.png" alt="FIDLE Evaluator" width="100%">
+</h1>
+"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+-------------------------
+# FIDLE LLM-FR Leaderboard 🏆
+
+This leaderboard is exclusively **in French**. It is not intended to become a reference for LLM evaluation; it is for informational and educational purposes only. Please cross-check with other, more official leaderboards.
+
+**Note: the evaluations have been adapted to reasoning language models**: all *tasks* run in generative mode, with no limit on token generation.
+* **IFEval-Fr**: French translation of [IFEval](https://huggingface.co/datasets/google/IFEval)
+* **Pr-Fouras**: "Père Fouras" riddles (e.g. [fan site](https://www.fan-fortboyard.fr/pages/fanzone/enigmes-du-pere-fouras/))
+* **Sornette**: classification of texts (GORAFI, Wikipedia, "le saviez-vous", ...) into 4 categories: `burlesque et fantaisiste`, `ludique et didactique`, `insidieux et mensonger`, `moral et accablant`
+* **Kangourou-TO**: math quizzes from [Kangourou](https://www.mathkang.org). *Text Only*: only questions without figures.
+
+**Model Types**:
+* 🪨 - Base, pretrained, foundation model
+* 💬 - Chat model (Instruct, RLHF, DPO, ...)
+* 💅🏻 - Fine-tuned model
+* 🤔 - Reasoning model
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
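
Each `Tasks` member binds a task key and a metric key from the results JSON to a leaderboard column name. The sketch below shows how such entries are typically consumed; it assumes the template's `Task` dataclass exposes `benchmark`, `metric` and `col_name` fields (only the `Task(...)` calls are visible in this diff), and the results payload is hypothetical.

```python
# Minimal sketch, assuming the Task dataclass has roughly this shape.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # task_key in the results json, e.g. "community|ifeval-fr|0"
    metric: str     # metric_key in the results json, e.g. "norm_acc"
    col_name: str   # column name shown in the leaderboard, e.g. "IFEval-Fr"


class Tasks(Enum):
    task0 = Task("community|ifeval-fr|0", "norm_acc", "IFEval-Fr")
    task1 = Task("community|pr-fouras|0", "pr-fouras-qem", "Pr-Fouras")


# Hypothetical results payload: {task_key: {metric_key: score}}
results = {
    "community|ifeval-fr|0": {"norm_acc": 0.62},
    "community|pr-fouras|0": {"pr-fouras-qem": 0.41},
}

for task in Tasks:
    score = results.get(task.value.benchmark, {}).get(task.value.metric)
    print(f"{task.value.col_name}: {score}")
```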
src/display/utils.py CHANGED
@@ -23,6 +23,7 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
+auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent("R", "number", True, never_hidden=True)])
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
@@ -30,15 +31,15 @@ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+#auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+#auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+#auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+#auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+#auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -62,10 +63,10 @@ class ModelDetails:
 
 
 class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    PT = ModelDetails(name="pretrained", symbol="🪨")
+    FT = ModelDetails(name="fine-tuned", symbol="💅🏻")
+    IFT = ModelDetails(name="instruction-tuned", symbol="💬")
+    RL = ModelDetails(name="RL-tuned", symbol="🤔")
     Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
@@ -73,13 +74,13 @@ class ModelType(Enum):
 
     @staticmethod
     def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
+        if "fine-tuned" in type or "💅🏻" in type:
            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
+        if "pretrained" in type or "🪨" in type:
            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
+        if "RL-tuned" in type or "🤔" in type:
            return ModelType.RL
-        if "instruction-tuned" in type or "" in type:
+        if "instruction-tuned" in type or "💬" in type:
            return ModelType.IFT
        return ModelType.Unknown
 
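
The column registry above is turned into a single frozen dataclass via `make_dataclass`, so the rest of the code can refer to columns as attributes, e.g. `AutoEvalColumn.rank.name`. Below is a minimal, self-contained sketch of that pattern; the simplified, frozen `ColumnContent` is an assumption (the real class likely has extra fields such as a `hidden` flag).

```python
# Minimal sketch of the make_dataclass pattern used above.
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:          # simplified stand-in for the template's class
    name: str
    type: str
    displayed_by_default: bool
    never_hidden: bool = False


auto_eval_column_dict = [
    ["rank", ColumnContent, ColumnContent("R", "number", True, never_hidden=True)],
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["params", ColumnContent, ColumnContent("#Params (B)", "number", True)],
]

# Each [attr_name, type, default] triple becomes a field of the generated dataclass.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

# Defaults are reachable as class attributes, which is how the rest of the code
# resolves display names: AutoEvalColumn.rank.name -> "R".
print(AutoEvalColumn.rank.name, AutoEvalColumn.params.name)
```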
 
src/envs.py CHANGED
@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "FIDLE-CNRS" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+REPO_ID = f"{OWNER}/LLM-FR_leaderboard"
+QUEUE_REPO = f"{OWNER}/LLM-FR_requests"
+RESULTS_REPO = f"{OWNER}/LLM-FR_results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
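
These constants point the Space at its companion request and result datasets. The sketch below is an assumption about how the template typically consumes them, not part of this diff: both repos are snapshotted locally before the leaderboard dataframe is built. The `EVAL_REQUESTS_PATH` and `EVAL_RESULTS_PATH` names are illustrative.

```python
# Hypothetical usage sketch: pull the queue and results datasets into the local cache.
import os
from huggingface_hub import snapshot_download

TOKEN = os.environ.get("HF_TOKEN")
OWNER = "FIDLE-CNRS"
QUEUE_REPO = f"{OWNER}/LLM-FR_requests"
RESULTS_REPO = f"{OWNER}/LLM-FR_results"
CACHE_PATH = os.getenv("HF_HOME", ".")

EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")

for repo_id, local_dir in [(QUEUE_REPO, EVAL_REQUESTS_PATH), (RESULTS_REPO, EVAL_RESULTS_PATH)]:
    snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,
        repo_type="dataset",  # both companion repos are Hub datasets, per the OWNER comment
        token=TOKEN,
    )
```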
src/leaderboard/read_evals.py CHANGED
@@ -31,6 +31,7 @@ class EvalResult:
     num_params: int = 0
     date: str = "" # submission date of request file
     still_on_hub: bool = False
+    rank: int = 0
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -38,7 +39,7 @@ class EvalResult:
         with open(json_filepath) as fp:
             data = json.load(fp)
 
-        config = data.get("config")
+        config = data.get("config_general")
 
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
@@ -109,21 +110,24 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        # weighted average calculation
+        task_weights = [0.25, 0.25, 0.25, 0.09, 0.09, 0.07]
+        average = sum(np.array([v for v in self.results.values() if v is not None]) * np.array(task_weights)) / sum(task_weights)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
+            #AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
+            #AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            #AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
+            #AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            #AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumn.rank.name: self.rank,
         }
 
         for task in Tasks:
@@ -133,7 +137,7 @@ class EvalResult:
 
 
 def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+    """Selects the correct request file for a given model."""
     request_files = os.path.join(
         requests_path,
         f"{model_name}_eval_request_*.json",
src/populate.py CHANGED
@@ -14,11 +14,17 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False).reset_index(drop=True)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
+
+    # add rank column
+    df[AutoEvalColumn.rank.name] = df.index + 1
+    df.loc[0, AutoEvalColumn.rank.name] = '1 🥇'
+    df.loc[1, AutoEvalColumn.rank.name] = '2 🥈'
+    df.loc[2, AutoEvalColumn.rank.name] = '3 🥉'
     return df
 
 
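
The rank column simply mirrors the dataframe index after sorting by the average and resetting the index, with the top three rows replaced by medal strings. A self-contained sketch with illustrative column names (`average`, `R`):

```python
# Minimal sketch of the ranking logic with hypothetical column names.
import pandas as pd

df = pd.DataFrame(
    {"Model": ["model-a", "model-b", "model-c", "model-d"],
     "average": [0.51, 0.63, 0.47, 0.58]}
)

df = df.sort_values(by=["average"], ascending=False).reset_index(drop=True)

# Rank follows the sorted order; cast to object so the medal strings below can
# replace the integers without a dtype warning in recent pandas versions.
df["R"] = (df.index + 1).astype(object)
df.loc[0, "R"] = "1 🥇"
df.loc[1, "R"] = "2 🥈"
df.loc[2, "R"] = "3 🥉"
print(df[["R", "Model", "average"]])
```

Note that `df.loc[2, ...]` assumes at least three finished models; with fewer rows, pandas `.loc` enlargement would append a mostly empty row.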
 
src/submission/check_validity.py CHANGED
@@ -31,7 +31,7 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
     return True, ""
 
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
+def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=True, test_tokenizer=False) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
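
The only change here flips the default of `trust_remote_code` to `True`, so models that ship custom modelling code can still be validated; the trade-off is that code from the model repository gets executed during the check. A sketch of the same `AutoConfig`/`AutoTokenizer` loadability pattern, with an illustrative model id (the real helper returns a `(bool, str)` tuple instead of printing):

```python
# Sketch of the Hub availability check; the model id is hypothetical.
from transformers import AutoConfig, AutoTokenizer

model_name = "some-org/some-model"
revision = "main"

try:
    config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=True)
    print("model and tokenizer are loadable from the Hub")
except Exception as e:
    print(f"not loadable: {e}")
```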
src/submission/submit.py CHANGED
@@ -81,7 +81,7 @@ def add_new_eval(
         "revision": revision,
         "precision": precision,
         "weight_type": weight_type,
-        "status": "PENDING",
+        "status": "FINISHED",
         "submitted_time": current_time,
         "model_type": model_type,
         "likes": model_info.likes,