File size: 500 Bytes
4ed02d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from transformers import LlamaTokenizerFast


def main() -> None:
    """Smoke-test a local LLaMA tokenizer.

    Loads the fast tokenizer from ``local_tokenizer``, encodes a sample
    string without special tokens, prints the resulting token ids, then
    prints the decoded text of each individual token.
    """
    # Ultra-FineWeb classifier is using "deepseek-ai/DeepSeek-V2"
    # path = "deepseek-ai/DeepSeek-V2"
    path = "local_tokenizer"
    tokenizer = LlamaTokenizerFast.from_pretrained(path, trust_remote_code=True)

    # test tokenizer
    content = "MiniCPM4: Ultra-Efficient LLMs on End Devices"
    token_ids = tokenizer.encode(content, add_special_tokens=False)
    print(token_ids)

    # decode each token and print
    for token_id in token_ids:
        print(tokenizer.decode([token_id]))


# Guard the entry point so importing this module has no side effects
# (the original ran everything at import time).
if __name__ == "__main__":
    main()