from datasets import load_dataset
from transformers import AutoTokenizer


def get_tokenizer(model_name="bert-base-uncased", max_len=128):
    # Load a pretrained tokenizer and cap its maximum sequence length.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.model_max_length = max_len
    return tokenizer


def load(tokenizer, split="validation"):
    # Load a GLUE STS-B split and tokenize its sentence pairs.
    # The original file is truncated after the load_dataset call; the
    # map/return below is a minimal completion, not the author's verbatim code.
    ds = load_dataset("glue", "stsb", split=split)
    ds = ds.map(
        lambda ex: tokenizer(ex["sentence1"], ex["sentence2"], truncation=True)
    )
    return ds
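

# Minimal usage sketch (an assumption, not part of the original file):
# build the tokenizer, load the tokenized validation split, and inspect
# one example's first input ids and its similarity label.
if __name__ == "__main__":
    tok = get_tokenizer()
    val = load(tok)
    print(val[0]["input_ids"][:10], val[0]["label"])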