tomerz-aai's picture
wip
ed67886
raw
history blame
1.06 kB
import functools
import time
from datasets import load_dataset
from src.envs import TOKEN
from src.logger import get_logger
logger = get_logger(__name__)
class F1Data:
    """In-memory view of the code-problems dataset.

    The constructor stores the three dataset names it is given and
    immediately calls :meth:`initialize`, which downloads the code-problems
    dataset. The submissions and results dataset names are only stored by
    the code visible here.
    """

    def __init__(self, cp_ds_name: str, sub_ds_name: str, res_ds_name: str):
        """Store the dataset names and load the code-problems dataset.

        Args:
            cp_ds_name: Name of the code-problems dataset passed to
                ``load_dataset``.
            sub_ds_name: Name of the submissions dataset (stored only).
            res_ds_name: Name of the results dataset (stored only).
        """
        self.cp_dataset_name = cp_ds_name
        self.submissions_dataset_name = sub_ds_name
        self.results_dataset_name = res_ds_name
        self.initialize()

    @functools.cached_property
    def code_problem_formulas(self) -> set[str]:
        """Return the set of formula names present in ``code_problems``.

        The value is computed once per load and cached on the instance;
        :meth:`initialize` discards the cached value when it reloads.
        """
        return set(self.code_problems)

    def initialize(self) -> None:
        """Load the "hard" split of the code-problems dataset.

        Populates ``self.code_problems`` as a mapping from each row's
        ``formula_name`` to its ``code_problem["problem_description"]``.
        """
        # Never write the token itself to the log: it is a credential.
        logger.info("Initialize F1Data (token configured: %s)", bool(TOKEN))

        start_time = time.monotonic()
        cp_ds = load_dataset(self.cp_dataset_name, split="hard", token=TOKEN)
        logger.info(
            "Loaded code-problems dataset from %s in %f sec",
            self.cp_dataset_name,
            time.monotonic() - start_time,
        )

        self.code_problems: dict[str, str] = {
            r["formula_name"]: r["code_problem"]["problem_description"]
            for r in cp_ds
        }
        # cached_property stores its value in the instance __dict__; drop it
        # so a repeated initialize() does not leave a stale set of names.
        self.__dict__.pop("code_problem_formulas", None)

        # Keep INFO output short; the full descriptions are available at DEBUG.
        logger.info("Loaded %d code problems", len(self.code_problems))
        logger.debug("Code problems info: %s", self.code_problems)