galb-dai's picture
Some changes to validation logic.
5048713
raw
history blame
2.85 kB
import json
import os
import string
# Number of problems in the dataset.
DATASET_SIZE = 120

# Inclusive bounds on the number of characters accepted by is_valid().
MIN_INPUT_LENGTH = 2
MAX_INPUT_LENGTH = 20

# Inclusive bounds on the size of a whole submission file, in bytes.
MIN_SUBMISSION_SIZE = 1
MAX_SUBMISSION_SIZE = 120 * 1024 * 1024  # 120 MB.

# Maximum size of a single line of a submission file.
MAX_SINGLE_SUBMISSION_SIZE = 1024**2  # 1 MB.

# One line per problem, plus one spare so that an empty line is allowed.
MAX_SUBMISSION_LINES = 1 + DATASET_SIZE
def is_valid(
    s: str,
    min_length: int = MIN_INPUT_LENGTH,
    max_length: int = MAX_INPUT_LENGTH,
) -> bool:
    """
    @brief Checks whether the given string is an acceptable input.
    @param s The string to validate.
    @param min_length The smallest number of characters accepted (inclusive).
    @param max_length The largest number of characters accepted (inclusive).
    @return True iff every character is in [a-zA-Z0-9], a space, '.' or '-', and the number of
    characters is between min_length and max_length.
    """
    n_chars = sum(1 for _ in s)
    if not (min_length <= n_chars <= max_length):
        return False

    # Very important: underscores are used as delimiters, so they must _NEVER_ be permitted in
    # sanitised strings.
    permitted = tuple(string.ascii_lowercase + string.ascii_uppercase + string.digits + " .-")
    return all(ch in permitted for ch in s)
def _parse_submission_record(text: str):
    """
    @brief Validates one JSONL record of a submission and extracts its problem id.
    @param text A single line of the submission file with its line terminator already removed.
    @return The problem id as an int in [0, DATASET_SIZE), or None if the record is malformed.
    """
    if not (text.startswith("{") and text.endswith("}")):
        return None
    try:
        record = json.loads(text)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; pathologically nested input raises RecursionError.
        return None
    if not isinstance(record, dict) or set(record) != {"problem_id", "solution"}:
        return None
    if not isinstance(record["solution"], str):
        return None

    raw_id = record["problem_id"]
    if type(raw_id) is int:
        # Exact type check on purpose: bool is a subclass of int and must not count as an id.
        problem_id = raw_id
    elif isinstance(raw_id, str) and raw_id.isdigit():
        try:
            # str.isdigit() accepts characters (e.g. superscript digits) that int() rejects.
            problem_id = int(raw_id)
        except ValueError:
            return None
    else:
        return None

    if not 0 <= problem_id < DATASET_SIZE:
        return None
    return problem_id


def is_submission_file_valid(submission_path: str) -> bool:
    """
    @brief Checks whether the given submission file is valid.
    @param submission_path The path to the submission file.
    @return True iff the path is a regular file within the size constraints, is UTF-8 encoded JSONL
    with at most MAX_SUBMISSION_LINES lines, every line is no longer than
    MAX_SINGLE_SUBMISSION_SIZE characters, and every non-empty line is an object with exactly the
    keys "problem_id" (an int, or a string of digits, in [0, DATASET_SIZE)) and "solution" (a
    string), with no problem id appearing twice. Empty lines are tolerated but count towards the
    line limit; at least one record is required.
    """
    if not os.path.isfile(submission_path):
        return False

    submission_size = os.stat(submission_path).st_size
    if submission_size < MIN_SUBMISSION_SIZE or submission_size > MAX_SUBMISSION_SIZE:
        return False

    seen_ids = set()
    n_lines = 0
    try:
        with open(submission_path, "r", encoding="utf-8") as f:
            # Not using readlines() to avoid consuming a large buffer at once. Reading one
            # character past the limit lets an over-long line be told apart from one that fits.
            while line := f.readline(MAX_SINGLE_SUBMISSION_SIZE + 1):
                n_lines += 1
                if n_lines > MAX_SUBMISSION_LINES:
                    return False
                if len(line) > MAX_SINGLE_SUBMISSION_SIZE:
                    return False

                # readline() keeps the line terminator; drop it before inspecting the record.
                text = line.rstrip("\r\n")
                if not text:
                    continue  # Empty line.

                problem_id = _parse_submission_record(text)
                if problem_id is None:
                    return False
                if problem_id in seen_ids:
                    return False  # Duplicate submission.
                seen_ids.add(problem_id)
    except UnicodeDecodeError:
        # The file is not valid UTF-8 text, hence not a valid JSONL submission.
        return False

    return len(seen_ids) > 0