from datasets import load_dataset
from transformers import AutoTokenizer


def get_tokenizer(model_name="bert-base-uncased", max_len=128):
    # Load the pretrained tokenizer and cap its maximum sequence length.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.model_max_length = max_len
    return tokenizer


def load(tokenizer, split="validation"):
    # Load the GLUE STS-B split and tokenize each sentence pair.
    ds = load_dataset("glue", "stsb", split=split)
    ds = ds.map(
        lambda batch: tokenizer(batch["sentence1"], batch["sentence2"], truncation=True),
        batched=True,
    )
    return ds
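

# Example usage: a minimal sketch, assuming this file is run directly; the
# `__main__` guard and the printed fields are illustrative, not part of the
# original module.
if __name__ == "__main__":
    tok = get_tokenizer()
    stsb = load(tok, split="validation")
    # After load(), each example carries tokenizer outputs (input_ids,
    # attention_mask) alongside the original sentence pair and similarity score.
    print(stsb[0]["sentence1"], stsb[0]["sentence2"], stsb[0]["label"])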