bugfix
- app.py +2 -2
- src/about.py +8 -3
- src/leaderboard/read_evals.py +16 -7
- src/populate.py +34 -34
app.py
CHANGED
@@ -27,7 +27,7 @@ from src.display.utils import (
     Precision,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import
+from src.populate import get_leaderboard_df
 from src.submission.submit import add_new_eval
 
 
@@ -49,7 +49,7 @@ def restart_space():
 # restart_space()
 
 try:
-    print(EVAL_RESULTS_PATH)
+    print("Saving results locally at:", EVAL_RESULTS_PATH)
     snapshot_download(
         repo_id=RESULTS_REPO,
         local_dir=EVAL_RESULTS_PATH,
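For orientation, the touched block in app.py presumably ends up as sketched below. Only the print call and the first two snapshot_download arguments are visible in this diff; the repo_type and token keywords and the except-clause fallback to restart_space() are assumptions borrowed from the stock leaderboard template, not confirmed by this commit.

from huggingface_hub import snapshot_download

from src.envs import EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN

try:
    print("Saving results locally at:", EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",  # assumed: result stores are dataset repos in this template
        token=TOKEN,          # assumed: needed if RESULTS_REPO is private
    )
except Exception:
    restart_space()  # assumed fallback, defined earlier in app.py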
src/about.py
CHANGED
@@ -15,18 +15,23 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
     task1 = Task("ami_2020_aggressiveness", "f1,none", "AMI 2020 Agg")
     task2 = Task("ami_2020_misogyny", "f1,none", "AMI 2020 Miso")
-
+    task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
     task4 = Task("belebele_ita", "acc_norm,none", "Belebele")
+    task3 = Task("gente_rephrasing", "acc,none", "GeNTE Neutralizing")
+    task12 = Task("haspeede2_hs", "f1,none", "HaSpeeDe2 HS")
+    task13 = Task("haspeede2_stereo", "f1,none", "HaSpeeDe2 Stereo")
     task5 = Task("hatecheck_ita", "f1,none", "HateCheck")
     task6 = Task("honest_ita", "acc,none", "HONEST", higher_is_better=False)
+    task14 = Task("ironita_irony", "f1,none", "IronITA Irony")
+    task15 = Task("ironita_sarcasm", "f1,none", "IronITA Sarcasm")
     task7 = Task("itacola", "mcc,none", "ItaCoLA", scale_by_100=False)
     task8 = Task("news_sum", "bertscore,none", "News Sum")
+    task16 = Task("sentipolc", "f1,none", "SENTIPOLC")
     task9 = Task("squad_it", "squad_f1,get-answer", "SQuAD it")
     task10 = Task("truthfulqa_mc2_ita", "acc,none", "TruthfulQA")
-    task11 = Task("xcopa_it", "acc,none", "
+    task11 = Task("xcopa_it", "acc,none", "XCOPA")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
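Each enum member above wraps a Task whose positional fields are, per the in-file comment, the task key in the results JSON, the metric key in the results JSON, and the leaderboard display name; higher_is_better and scale_by_100 appear as optional flags. A minimal sketch of a compatible dataclass follows. The field names metric and col_name and the default flag values are assumptions inferred from these calls (only benchmark is confirmed, via results[task.benchmark] in read_evals.py), so treat it as illustration rather than the repository's actual definition.

from dataclasses import dataclass

@dataclass
class Task:
    benchmark: str                 # task key in the results JSON, e.g. "haspeede2_hs"
    metric: str                    # metric key in the results JSON, e.g. "f1,none"
    col_name: str                  # display name in the leaderboard, e.g. "HaSpeeDe2 HS"
    higher_is_better: bool = True  # HONEST passes False: lower scores are better there
    scale_by_100: bool = True      # ItaCoLA (MCC) passes False to keep its native [-1, 1] scale

read_evals.py iterates "for task in Tasks:" and indexes results[task.benchmark], so each benchmark string must match the key emitted by the evaluation harness for that task.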
src/leaderboard/read_evals.py
CHANGED
@@ -11,6 +11,8 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, DisclosedType
 from src.submission.check_validity import is_model_on_hub
 
+import pdb
+
 
 @dataclass
 class EvalResult:
@@ -80,6 +82,8 @@ class EvalResult:
         architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
+
+        # pdb.set_trace()
         results = {}
         for task in Tasks:
             task = task.value
@@ -102,6 +106,8 @@ class EvalResult:
 
             results[task.benchmark] = mean_acc
 
+        # pdb.set_trace()
+
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -204,7 +210,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+
+        # eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
@@ -213,12 +220,14 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         else:
             eval_results[eval_name] = eval_result
 
-
-    for v in eval_results.
+    results_for_table = list()
+    for k, v in eval_results.items():
         try:
             v.to_dict() # we test if the dict version is complete
-
-        except
-
+            results_for_table.append(v)
+        except RuntimeError as e: # not all eval values present
+            print(f"Issue with results of: ", k)
+            raise e
+            # continue
 
-    return
+    return results_for_table
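Net effect of the last two hunks: the request-file merge is commented out, and results whose to_dict() call fails now abort with an error rather than being skipped, as the retained "# continue" remnant suggests they were before. Assembled (indentation assumed from the surrounding template), the tail of get_raw_eval_results after this commit reads roughly:

    results_for_table = list()
    for k, v in eval_results.items():
        try:
            v.to_dict()  # we test if the dict version is complete
            results_for_table.append(v)
        except RuntimeError as e:  # not all eval values present
            print(f"Issue with results of: ", k)
            raise e  # fail fast; the old skip-and-continue path is kept as a comment
            # continue

    return results_for_table

Because raise e re-raises, a single incomplete result file now stops leaderboard population, which makes missing benchmark values visible during the debugging this commit is doing (hence the pdb import and the commented pdb.set_trace() calls).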
src/populate.py
CHANGED
@@ -22,37 +22,37 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     return raw_data, df
 
 
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+# def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+#     """Creates the different dataframes for the evaluation queues requestes"""
+#     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+#     all_evals = []
+
+#     for entry in entries:
+#         if ".json" in entry:
+#             file_path = os.path.join(save_path, entry)
+#             with open(file_path) as fp:
+#                 data = json.load(fp)
+
+#             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+#             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+
+#             all_evals.append(data)
+#         elif ".md" not in entry:
+#             # this is a folder
+#             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+#             for sub_entry in sub_entries:
+#                 file_path = os.path.join(save_path, entry, sub_entry)
+#                 with open(file_path) as fp:
+#                     data = json.load(fp)
+
+#                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+#                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+#                 all_evals.append(data)
+
+#     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+#     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+#     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+#     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+#     df_running = pd.DataFrame.from_records(running_list, columns=cols)
+#     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+#     return df_finished[cols], df_running[cols], df_pending[cols]