from pathlib import Path

from transformers import GPT2Tokenizer

from .constants import CHECKPOINT_PATH, HF_TOKEN


def modified_tokenizer(
    model_name="ai-forever/rugpt3small_based_on_gpt2",
    cache_dir="model_cache",
    data_path=Path(CHECKPOINT_PATH),
):
    """Load the base GPT-2 tokenizer and extend it with dialogue markers."""
    if cache_dir:
        # Load from (or download into) a local cache directory under data_path.
        tokenizer = GPT2Tokenizer.from_pretrained(
            model_name, cache_dir=str(data_path / cache_dir)
        )
    else:
        # No cache directory: fetch from the Hub, authenticating with HF_TOKEN.
        tokenizer = GPT2Tokenizer.from_pretrained(model_name, token=HF_TOKEN)

    # Dialogue-structure markers are registered as special tokens, so the BPE
    # never splits them into subword pieces.
    tokenizer.add_special_tokens(
        {"additional_special_tokens": ["<user>", "<says>", "<response>"]}
    )
    # "<laugh>" is a regular added token: a new vocabulary entry, but decoded
    # like ordinary text rather than as a special token.
    tokenizer.add_tokens(["<laugh>"])
    return tokenizer
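

# A minimal usage sketch, assuming the standard transformers API; it is not
# part of the original file. Because this module uses a relative import, run
# it as a package module (python -m ...), not as a standalone script. After
# extending the vocabulary, the matching model's embedding matrix must be
# resized, or the new token ids will fall outside the embedding table.
if __name__ == "__main__":
    from transformers import GPT2LMHeadModel

    tokenizer = modified_tokenizer()
    model = GPT2LMHeadModel.from_pretrained("ai-forever/rugpt3small_based_on_gpt2")
    model.resize_token_embeddings(len(tokenizer))  # cover the four new tokens

    # The added markers survive tokenization as single pieces.
    print(tokenizer.tokenize("<user> privet <says> <laugh> <response>"))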