from pathlib import Path

from transformers import GPT2Tokenizer

from .constants import CHECKPOINT_PATH, HF_TOKEN


def modified_tokenizer(
    model_name="ai-forever/rugpt3small_based_on_gpt2",
    cache_dir="model_cache",
    data_path=Path(CHECKPOINT_PATH),
):
    """Load the base GPT-2 tokenizer and extend it with dialogue markup tokens."""
    if cache_dir:
        # Load from (or download into) a local cache under the checkpoint directory.
        tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=str(data_path / cache_dir))
    else:
        # No local cache: pull directly from the Hugging Face Hub using the API token.
        tokenizer = GPT2Tokenizer.from_pretrained(model_name, token=HF_TOKEN)

    # Dialogue markers are registered as special tokens so the BPE never splits
    # them and they can be skipped during decoding if desired.
    special_tokens_dict = {
        "additional_special_tokens": [
            "<user>",
            "<says>",
            "<response>",
        ]
    }
    tokenizer.add_special_tokens(special_tokens_dict)

    # "<laugh>" is added as a regular token: it extends the vocabulary but is
    # kept in decoded output even when special tokens are skipped.
    tokenizer.add_tokens(["<laugh>"])
    return tokenizer
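

# A minimal usage sketch (an assumption, not part of the original file): since
# add_special_tokens()/add_tokens() grow the vocabulary beyond the pretrained
# size, any model consuming these token IDs must have its embedding matrix
# resized with resize_token_embeddings(). GPT2LMHeadModel is illustrative here;
# run via `python -m <package>.<module>` so the relative import resolves.
if __name__ == "__main__":
    from transformers import GPT2LMHeadModel

    tokenizer = modified_tokenizer()
    model = GPT2LMHeadModel.from_pretrained("ai-forever/rugpt3small_based_on_gpt2")
    # Grow the input (and tied output) embeddings to cover the added tokens.
    model.resize_token_embeddings(len(tokenizer))

    sample = "<user> <says> hello <response>"
    print(tokenizer(sample).input_ids)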