galb-dai's picture
Remove some unused code/imports.
416ebf1
raw
history blame
3.21 kB
import time
from datetime import datetime, timezone
import pandas as pd
from datasets import Dataset
from pandas.api.types import is_integer_dtype
from src.datamodel.data import F1Data
from src.display.formatting import styled_error, styled_message
from src.display.utils import ModelType
from src.envs import SUBMISSIONS_REPO
from src.logger import get_logger
# Module-level logger, obtained from the project's get_logger helper using this module's name.
logger = get_logger(__name__)
def validate_submission(lbdb: F1Data, pd_ds: pd.DataFrame) -> str | None:
logger.info("Validating DS size %d columns %s set %s", len(pd_ds), pd_ds.columns, set(pd_ds.columns))
expected_cols = ["problem_id", "solution"]
if set(pd_ds.columns) != set(expected_cols):
return f"Expected attributes: {expected_cols}, Got: {pd_ds.columns.tolist()}"
if not is_integer_dtype(pd_ds["problem_id"]):
return "problem_id must be str convertible to int"
if any(type(v) is not str for v in pd_ds["solution"]):
return "solution must be of type str"
submitted_ids = set(pd_ds.problem_id.astype(str))
if submitted_ids != lbdb.code_problem_ids:
missing = lbdb.code_problem_ids - submitted_ids
unknown = submitted_ids - lbdb.code_problem_ids
return f"Mismatched problem IDs: {len(missing)} missing, {len(unknown)} unknown"
if len(pd_ds) > len(lbdb.code_problem_ids):
return "Duplicate problem IDs exist in uploaded file"
return None
def add_new_solutions(
    lbdb: F1Data,
    system_name: str,
    org: str,
    sys_type: str,
    submission_path: str,
    skip_validation: bool = False,
):
    """Validate an uploaded JSONL solutions file and push it as a new submission.

    Args:
        lbdb: Leaderboard data, forwarded to ``validate_submission``.
        system_name: Name of the submitted system; must not be empty or blank.
        org: Submitting organization; must not be empty or blank.
        sys_type: System type label, normalized through ``ModelType.from_str``.
        submission_path: Path to the uploaded JSON Lines file.
        skip_validation: When true, ``validate_submission`` is not called.

    Returns:
        The result of ``styled_error(...)`` when a field is missing, the file
        cannot be read, or validation fails; otherwise the result of
        ``styled_message(...)`` after the dataset has been pushed.
    """
    logger.info("ADD SUBMISSION! %s path %s", str((system_name, org, sys_type)), submission_path)

    # Whitespace-only names are treated as missing: they would otherwise end up
    # in the submission ID as a blank-looking component.
    if not (system_name and system_name.strip()):
        return styled_error("Please fill system name")

    if not (org and org.strip()):
        return styled_error("Please fill organization name")

    if not sys_type:
        return styled_error("Please select system type")
    sys_type = ModelType.from_str(sys_type).name

    if not submission_path:
        return styled_error("Please upload JSONL solutions file")

    try:
        submission_df = pd.read_json(submission_path, lines=True)
    except Exception as e:
        # Boundary handler: any read/parse failure is reported back to the
        # caller as a styled error instead of propagating.
        logger.warning("Cannot read uploaded JSONL file %s: %s", submission_path, e)
        return styled_error(f"Cannot read uploaded JSONL file: {e}")

    if not skip_validation:
        validation_error = validate_submission(lbdb, submission_df)
        if validation_error:
            return styled_error(validation_error)

    # The ID combines the submitter-provided fields with a UTC timestamp.
    submission_id = f"{system_name}_{org}_{sys_type}_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}"

    # Seems good, creating the eval
    logger.info("Adding new submission: %s", submission_id)
    submission_ts = time.time_ns()  # nanoseconds since the epoch

    # Metadata attached to every row of the submission.
    submission_info = {
        "system_name": system_name,
        "organization": org,
        "system_type": sys_type,
        "submission_id": submission_id,
        "submission_ts": submission_ts,
    }

    def add_info(row):
        # Metadata keys override same-named keys already present in the row.
        return {**row, **submission_info}

    ds = Dataset.from_pandas(submission_df).map(add_info)
    # submission_id is passed as the second positional argument of push_to_hub
    # (the config name in the datasets API); the repo is created as private.
    ds.push_to_hub(SUBMISSIONS_REPO, submission_id, private=True)

    return styled_message(
        "Your request has been submitted to the evaluation queue!\nResults may take up to 24 hours to be processed and shown in the leaderboard."
    )