File size: 9,159 Bytes
815b0dc 1fffe05 815b0dc 1873be0 815b0dc 42c2745 1fffe05 0ec6e70 815b0dc 1873be0 815b0dc bca2446 815b0dc 42c2745 0ec6e70 815b0dc 0ec6e70 815b0dc bca2446 1fffe05 815b0dc 2f00a93 42c2745 815b0dc 1fffe05 815b0dc 2f00a93 a2fa160 815b0dc 50cb770 a2fa160 0ec6e70 a2fa160 74236d8 a2fa160 74236d8 d812604 53034cd 0ec6e70 1fffe05 815b0dc bca2446 4be0753 bca2446 2f00a93 42c2745 bca2446 4be0753 bca2446 2f00a93 a2fa160 bca2446 50cb770 0ec6e70 50cb770 d812604 53034cd 0ec6e70 bca2446 0ec6e70 bca2446 0ec6e70 bca2446 0ec6e70 bca2446 0ec6e70 bca2446 0ec6e70 bca2446 0ec6e70 bca2446 0ec6e70 bca2446 0ec6e70 bca2446 0ec6e70 bca2446 0ec6e70 4be0753 bca2446 815b0dc bca2446 815b0dc a2fa160 74236d8 e29cf37 815b0dc e29cf37 0ec6e70 e29cf37 0ec6e70 e29cf37 815b0dc e29cf37 0ec6e70 e29cf37 0ec6e70 e29cf37 0ec6e70 815b0dc 4be0753 444518c 53034cd 0ec6e70 53034cd 0ec6e70 42c2745 0ec6e70 |
|
import glob
import json
import os
import time
from dataclasses import dataclass
from datetime import datetime
import pandas as pd
from huggingface_hub import hf_hub_download, snapshot_download
from loguru import logger
from competitions.enums import SubmissionStatus
@dataclass
class Leaderboard:
    """Build the public or private leaderboard of a competition.

    Submission records are read from the ``submission_info/*.json`` files of
    the competition dataset repo, and team display names are read from
    ``teams.json`` in the same repo.

    Attributes:
        end_date: Cut-off; only submissions strictly before it are ranked.
        eval_higher_is_better: Whether a larger metric value ranks first.
        max_selected_submissions: Largest number of submissions a user may
            have flagged as selected and still appear on the private board.
        competition_id: Repo id handed to the Hub download helpers.
        token: Access token handed to the Hub download helpers.
        scoring_metric: Name of the score entry used for ranking.
    """

    end_date: datetime
    eval_higher_is_better: bool
    max_selected_submissions: int
    competition_id: str
    token: str
    scoring_metric: str

    def __post_init__(self):
        # Columns that never hold a score and are therefore never rounded.
        self.non_score_columns = ["id", "submission_datetime"]

    def _download_submission_infos(self):
        """Download the ``submission_info`` JSON files and return the local snapshot folder."""
        start_time = time.time()
        submissions_folder = snapshot_download(
            repo_id=self.competition_id,
            allow_patterns="submission_info/*.json",
            # Same keyword as the hf_hub_download call in fetch(); the older
            # ``use_auth_token`` spelling is the deprecated form.
            token=self.token,
            repo_type="dataset",
        )
        logger.info(f"Downloaded submissions in {time.time() - start_time} seconds")
        return submissions_folder

    @staticmethod
    def _iter_successful_submissions(submissions_folder):
        """Yield each parsed submission-info dict, keeping only its successful submissions."""
        success = SubmissionStatus.SUCCESS.value
        for path in glob.glob(os.path.join(submissions_folder, "submission_info", "*.json")):
            with open(path, "r", encoding="utf-8") as f:
                submission_info = json.load(f)
            # only select submissions that are done
            submission_info["submissions"] = [
                sub for sub in submission_info["submissions"] if sub["status"] == success
            ]
            yield submission_info

    def _collect_best_submissions(self, pick_best):
        """Apply ``pick_best`` to every user's submission info and keep the non-``None`` results."""
        submissions_folder = self._download_submission_infos()
        start_time = time.time()
        submissions = []
        for submission_info in self._iter_successful_submissions(submissions_folder):
            best_user_submission = pick_best(submission_info)
            if best_user_submission is not None:
                submissions.append(best_user_submission)
        logger.info(f"Processed submissions in {time.time() - start_time} seconds")
        return submissions

    def _best_public_submission(self, submission_info):
        """Return the user's best pre-deadline submission by public score.

        Args:
            submission_info: Dict with the user ``"id"`` and a ``"submissions"``
                list already restricted to successful submissions.

        Returns:
            A dict holding ``"id"``, every public score entry and
            ``"submission_datetime"``, or ``None`` when the user has no
            submission strictly before ``end_date``.
        """
        eligible = [
            sub
            for sub in submission_info["submissions"]
            if datetime.strptime(sub["datetime"], "%Y-%m-%d %H:%M:%S") < self.end_date
        ]
        if not eligible:
            return None

        user_id = submission_info["id"]
        user_submissions = []
        for sub in eligible:
            _sub = {"id": user_id}
            _sub.update(sub["public_score"])
            _sub["submission_datetime"] = sub["datetime"]
            user_submissions.append(_sub)

        user_submissions.sort(key=lambda x: x[self.scoring_metric], reverse=self.eval_higher_is_better)
        return user_submissions[0]

    def _best_private_submission(self, submission_info):
        """Return the user's private-leaderboard entry.

        With no selected submission, the submission with the best *public*
        score is used. With between one and ``max_selected_submissions``
        selected submissions, the selected one with the best *private* score
        is used. With more than that, the user is skipped with a warning.

        Args:
            submission_info: Dict with the user ``"id"`` and a ``"submissions"``
                list already restricted to successful submissions.

        Returns:
            A dict holding ``"id"``, the private score entries (without any
            prefix) and ``"submission_datetime"``, or ``None`` when the user
            has no submissions or is skipped.
        """
        # NOTE(review): unlike the public board, no end_date filter is applied
        # before picking the best entry; fetch() drops late entries afterwards,
        # which removes the user entirely - confirm this is intended.
        if not submission_info["submissions"]:
            return None

        user_id = submission_info["id"]
        user_submissions = []
        for sub in submission_info["submissions"]:
            _sub = {"id": user_id, "selected": sub["selected"]}
            for k, v in sub["public_score"].items():
                _sub[f"public_{k}"] = v
            for k, v in sub["private_score"].items():
                _sub[f"private_{k}"] = v
            _sub["submission_datetime"] = sub["datetime"]
            user_submissions.append(_sub)

        # count the number of submissions which are selected
        selected_submissions = sum(1 for sub in user_submissions if sub["selected"])
        if selected_submissions == 0:
            # nothing selected: fall back to the best public score
            sort_key = f"public_{self.scoring_metric}"
        elif selected_submissions <= self.max_selected_submissions:
            # keep only the selected submissions and rank them by private score
            user_submissions = [sub for sub in user_submissions if sub["selected"]]
            sort_key = f"private_{self.scoring_metric}"
        else:
            logger.warning(
                f"User {user_id} has more than {self.max_selected_submissions} selected submissions. Skipping user..."
            )
            return None

        user_submissions.sort(key=lambda x: x[sort_key], reverse=self.eval_higher_is_better)
        best_user_submission = user_submissions[0]

        # Drop the public scores and the selection flag, and strip only the
        # *leading* "private_" so metric names containing that substring
        # elsewhere are left intact.
        prefix = "private_"
        cleaned = {}
        for k, v in best_user_submission.items():
            if k == "selected" or k.startswith("public_"):
                continue
            if k.startswith(prefix):
                k = k[len(prefix):]
            cleaned[k] = v
        return cleaned

    def _process_public_lb(self):
        """Return one record per user: their best pre-deadline submission by public score."""
        return self._collect_best_submissions(self._best_public_submission)

    def _process_private_lb(self):
        """Return one record per user for the private leaderboard."""
        return self._collect_best_submissions(self._best_private_submission)

    def _rank_submissions(self, submissions):
        """Turn per-user best-submission records into a ranked DataFrame.

        Args:
            submissions: Non-empty list of dicts, each with ``"id"``,
                ``"submission_datetime"`` (``%Y-%m-%d %H:%M:%S`` string) and
                the score entries, including ``scoring_metric``.

        Returns:
            DataFrame ordered by score (ties broken by earlier submission
            time) with columns ``rank``, ``id``, the score columns and
            ``submission_datetime`` (formatted back to a string).
        """
        df = pd.DataFrame(submissions)
        # convert submission datetime to pandas datetime
        df["submission_datetime"] = pd.to_datetime(df["submission_datetime"], format="%Y-%m-%d %H:%M:%S")
        # only keep submissions before the end date
        df = df[df["submission_datetime"] < self.end_date].reset_index(drop=True)
        # best score first; among equal scores the earlier submission wins
        df = df.sort_values(
            by=[self.scoring_metric, "submission_datetime"],
            ascending=[not self.eval_higher_is_better, True],
        )
        # round the scores to 4 decimal places; non-numeric columns are left
        # alone because they cannot be rounded
        for col in df.columns:
            if col in self.non_score_columns or not pd.api.types.is_numeric_dtype(df[col]):
                continue
            df[col] = df[col].round(4)
        # reset index so the rank follows the sorted order
        df = df.reset_index(drop=True)
        df["rank"] = df.index + 1
        # convert datetime column to string
        df["submission_datetime"] = df["submission_datetime"].dt.strftime("%Y-%m-%d %H:%M:%S")
        # rank goes first, submission_datetime goes last
        middle_columns = [c for c in df.columns if c not in ("rank", "submission_datetime")]
        return df[["rank"] + middle_columns + ["submission_datetime"]]

    def fetch(self, private=False):
        """Return the leaderboard as a DataFrame.

        Args:
            private: Build the private leaderboard when true, otherwise the
                public one.

        Returns:
            An empty ``pd.DataFrame()`` when no user has a usable submission;
            otherwise the ranked frame with the ``id`` column replaced by the
            team name looked up in ``teams.json``.

        Raises:
            KeyError: If an id in the leaderboard has no entry in ``teams.json``.
        """
        if private:
            submissions = self._process_private_lb()
        else:
            submissions = self._process_public_lb()

        if len(submissions) == 0:
            return pd.DataFrame()

        df = self._rank_submissions(submissions)

        team_metadata = hf_hub_download(
            repo_id=self.competition_id,
            filename="teams.json",
            token=self.token,
            repo_type="dataset",
        )
        with open(team_metadata, "r", encoding="utf-8") as f:
            team_metadata = json.load(f)

        df["id"] = df["id"].apply(lambda x: team_metadata[x]["name"])
        return df
|