Commit message: update

Files changed:
- character_util.py (+26 -5)
- playground_app.py (+1 -1)
- playground_examples.py (+0 -138)
- playground_util.py (+9 -7)
- test/gpt4/gpt4_demo.py (+38 -0)
character_util.py  (CHANGED)

@@ -23,6 +23,8 @@ default_columns = ["digit", "zh"]
 def _to_unicode(text):
     return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
 
+def _to_unicode_decimal(text):
+    return [ord(chr) for chr in text]
 
 def _get_coding_length(tokenizer, vocab, filter=None):
     """
@@ -62,6 +64,18 @@ def _dist(token_lens):
     return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"
 
 
+def _decode_bytes_to_str(token: bytes) -> str:
+    try:
+        token = token.decode("utf-8", errors="strict")
+    except:
+        try:
+            # for single byte, such as b'\xa1'
+            token = token.decode('latin-1')
+        except:
+            logger.warning(f"token {token} decode failed")
+            token = token.decode("utf-8", errors="ignore")
+    return token
+
 def iter_vocab(
     tokenizer_name: str,
     from_cache: bool = True,
@@ -103,8 +117,8 @@ def iter_vocab(
         tags = []
        if token is None:  # some vocabularies have empty ids (non-contiguous)
            continue
-        if isinstance(token, bytes):
-            token = token
+        if isinstance(token, bytes):  # convert bytes to string
+            token = _decode_bytes_to_str(token)
 
        if hasattr(tokenizer, "sp_model"):  # based on the sentencepiece package
            if tokenizer.sp_model.is_byte(token_id):
@@ -128,8 +142,11 @@ def iter_vocab(
                 "id": token_id,
                 "token": token,
                 "token_decode": decode_str,
-                "token_dumps": json.dumps(token),
+                "token_dumps": json.dumps(token),  # unicode:
+                # https://en.wikipedia.org/wiki/List_of_Unicode_characters
                 "token_unicode": _to_unicode(token),
+                "token_unicode_decimal": _to_unicode_decimal(token),  # decimal
+                # "token_utf8_bytes": "",
                 "token_len": len(decode_str),
             },
             ensure_ascii=False) + "\n")
@@ -212,5 +229,9 @@ def get_character_table(
 
 if __name__ == "__main__":
     # aa = get_character_table(tokenizer_filter="baichuan")
-
-
+
+    # iter_vocab("openai/gpt-4o", from_cache=False)
+    iter_vocab("openai/gpt-oss-20b", from_cache=False)
+
+    # df = get_character_table()
+    # logger.info(f"\n{df.to_markdown(index=False)}")
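The core fix in character_util.py is that bytes tokens are now decoded to text before being dumped, instead of being passed through unchanged. Below is a minimal, self-contained sketch (not part of the commit) of the three helpers, lightly adapted to catch UnicodeDecodeError instead of a bare except and to use the standard logging module, showing what they return for a str token, a multi-byte UTF-8 token, and a lone non-UTF-8 byte:

```python
# Standalone sketch: what the character_util.py helpers produce for a few inputs.
import json
import logging

logger = logging.getLogger(__name__)


def _to_unicode(text: str) -> str:
    # one \uXXXX escape per character, e.g. "A中" -> "\u0041\u4E2D"
    return ''.join(r'\u{:04X}'.format(ord(ch)) for ch in text)


def _to_unicode_decimal(text: str) -> list:
    # decimal code points, e.g. "A中" -> [65, 20013]
    return [ord(ch) for ch in text]


def _decode_bytes_to_str(token: bytes) -> str:
    # strict utf-8 first, then latin-1 for lone bytes such as b'\xa1',
    # finally lossy utf-8 as a last resort
    try:
        return token.decode("utf-8", errors="strict")
    except UnicodeDecodeError:
        try:
            return token.decode("latin-1")
        except UnicodeDecodeError:
            logger.warning("token %r decode failed", token)
            return token.decode("utf-8", errors="ignore")


if __name__ == "__main__":
    for token in ["中", b"\xe4\xb8\xad", b"\xa1"]:
        if isinstance(token, bytes):  # the same branch the commit adds to iter_vocab
            token = _decode_bytes_to_str(token)
        print(json.dumps(token), _to_unicode(token), _to_unicode_decimal(token))
```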
playground_app.py  (CHANGED)

@@ -4,7 +4,7 @@
 
 import gradio as gr
 from vocab import tokenizer_factory
-from
+from config import example_types, example_fn
 from playground_util import (tokenize,
                              tokenize_pair, basic_count,
                              get_overlap_token_size, on_load)
playground_examples.py  (DELETED)

@@ -1,138 +0,0 @@
-"""
-
-## characters
-
-- alphanumeric characters
-- numeric characters
-- special characters: A special character is a character that is not an alphabetic or numeric character.
-- ASCII control characters
-- punctuation marks
-- accent marks
-- math symbols
-- whitespace:
-  - https://en.wikipedia.org/wiki/Whitespace_character
-  - https://emptycharacter.com/
-
-
-https://www.computerhope.com/jargon/s/specchar.htm
-"""
-
-import random
-from datasets import load_dataset
-
-default_user_input = """\
-Replace this text in the input field to see how tokenization works.
-Buenos días!
-Tokenizer 是自然语言处理(NLP)中的一个关键组件,它的主要作用是将人类语言文本转换为计算机可以理解的数字表示形式。
-ラグビーワールドカップ2023フランス"""
-# default_tokenizer_name_1 = "Meta/llama3"
-# default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
-default_tokenizer_name_1 = "deepseek-ai/DeepSeek-R1"
-default_tokenizer_name_2 = "openai/gpt-4o"
-
-
-def get_sample_input():
-    default_inputs = {
-        "en": "Replace this text in the input field to see how tokenization works.",
-        "zh-Hans": "",
-        "es": "",
-        "de": "",
-    }
-    random.seed(10)  # For reproducibility
-    lines = []
-    for lang in default_inputs.keys():
-        dataset = load_dataset("eson/cc100-samples", lang, split="train")
-        print(dataset)
-        print(1)
-    return default_inputs
-
-
-examples = {
-    "en": [
-        ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],  #
-        [
-            "whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline",
-            "huggyllama/llama-7b",
-            "google-bert/bert-base-cased",
-        ],  # chatglm has blank_n; bert drops the spaces
-        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
-        [
-            'punctuation: ,.:/?+=",。!?;【】〔〕〖〗',
-            "google/gemma-7b",
-            "huggyllama/llama-7b",
-        ],  # the llama vocab is a bit small
-        [
-            "symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
-            "baichuan-inc/Baichuan-7B",
-            "huggyllama/llama-7b",
-        ],
-        # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
-    ],
-    "zh": [
-        [
-            "空格测试: 2个空格 8个空格",
-            "llama",
-            "chatglm2_6b",
-        ],  # chatglm has blank_n
-        ["标点测试:,。!?;", "baichuan_7b", "llama"],
-        [
-            "符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
-            "baichuan_7b",
-            "llama",
-        ],
-        ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
-        ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
-    ],
-}
-
-
-more_examples = [
-    # bert family
-    (
-        "google-bert/bert-base-cased",
-        "google-bert/bert-base-uncased",
-        "",
-        "",
-    ),  # # clue VS kplug, bert VS clue
-    ("bert-base-cased", "clue", "", "增加了[]()"),
-    ("roberta-chinese-clue", "kplug", "", ""),
-    # llama family (based on sentencepiece)
-    (
-        "baichuan",
-        "baichuan2",
-        "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1",
-    ),
-    ("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
-    ("llama", "chinese-llama-2-7b", ""),
-    ("llama", "llama3", "扩充词典"),
-    ("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
-    # glm family (based on sentencepiece)
-    ("glm", "chatglm1", ""),
-    ("chatglm1", "chatglm2", ""),
-    # gpt2 family
-    ("gpt2", "moss", ""),
-    ("", "", ""),
-    # openai family (tiktoken)
-    ("qwen", "gpt_35_turbo", ""),
-]
-
-lang = "en"
-
-example_types = [t[0].split(":")[0] for t in examples[lang]]
-
-
-def example_fn(example_idx):
-    return examples[lang][example_idx]
-
-
-def get_more_example():
-    import urllib.parse
-
-    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
-    for tokenizer1, tokenizer2, text, comment in more_examples:
-        full_url = f"{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}"
-        print(full_url)
-
-
-if __name__ == "__main__":
-    get_more_example()
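With playground_examples.py deleted, both playground_app.py and playground_util.py now import their presets from a `config` module instead. That module is not shown in this commit; the following is only a hypothetical sketch of the subset of names the two new import statements require, with values carried over from the deleted file:

```python
# config.py -- hypothetical sketch, not the repository's actual file.
# Only the names imported by playground_app.py and playground_util.py are shown.
default_user_input = """\
Replace this text in the input field to see how tokenization works.
Buenos días!
Tokenizer 是自然语言处理(NLP)中的一个关键组件,它的主要作用是将人类语言文本转换为计算机可以理解的数字表示形式。
ラグビーワールドカップ2023フランス"""
default_tokenizer_name_1 = "deepseek-ai/DeepSeek-R1"
default_tokenizer_name_2 = "openai/gpt-4o"

# [input text, tokenizer 1, tokenizer 2] presets for the Gradio UI
examples = [
    ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],
    ["whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "huggyllama/llama-7b", "google-bert/bert-base-cased"],
    ['punctuation: ,.:/?+=",。!?;【】〔〕〖〗', "google/gemma-7b", "huggyllama/llama-7b"],
]

# dropdown labels ("number", "whitespace", ...) derived from the example text
example_types = [example[0].split(":")[0] for example in examples]


def example_fn(example_idx):
    # return the preset selected in the playground UI
    return examples[example_idx]
```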
playground_util.py  (CHANGED)

@@ -6,7 +6,7 @@ from vocab import tokenizer_factory
 from character_util import iter_vocab
 from utils.log_util import logger
 from utils.i18n_util import get_lang
-from
+from config import (
     default_tokenizer_name_1,
     default_tokenizer_name_2,
     default_user_input,
@@ -137,10 +137,11 @@ def get_overlap_token_size(tokenizer_name_1, tokenizer_name_2):
 
     vocab_set_1 = tokenizer1.get_vocab().keys()
     vocab_set_2 = tokenizer2.get_vocab().keys()
+    # v1 = {v:k for k, v in vocab1.items()}
 
     token1 = next(iter(vocab_set_1))
     token2 = next(iter(vocab_set_2))
-    if type(token1) != type(token2):  #
+    if type(token1) != type(token2):  # convert string to byte
         if isinstance(token1, str):
             vocab_set_1 = set([token.encode("utf-8") for token in vocab_set_1])
         if isinstance(token2, str):
@@ -209,9 +210,10 @@ def test_coding():
 if __name__ == "__main__":
 
     # print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
-    print(
-
-
-
-    )
+    print(get_overlap_token_size("openai/gpt-oss-20b", "openai/gpt-4o"))
+    # print(
+    #     get_overlap_token_size(
+    #         "gradientai/Llama-3-8B-Instruct-Gradient-1048k", "deepseek-ai/DeepSeek-R1"
+    #     )
+    # )
     # print(basic_count("internlm_chat_7b"))
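get_overlap_token_size compares two vocabularies, and because some tokenizers expose str keys while others expose bytes keys, the commit keeps the normalization step that encodes str keys to UTF-8 bytes before intersecting. A self-contained sketch of that technique with toy vocabularies (function and variable names here are illustrative, not the project's API):

```python
# Sketch of the vocab-overlap technique from get_overlap_token_size():
# normalize both key sets to bytes, then intersect. Toy vocabularies only.
def overlap_token_size(vocab_1: dict, vocab_2: dict) -> int:
    keys_1 = set(vocab_1.keys())
    keys_2 = set(vocab_2.keys())

    token1 = next(iter(keys_1))
    token2 = next(iter(keys_2))
    if type(token1) != type(token2):  # one tokenizer yields str, the other bytes
        if isinstance(token1, str):
            keys_1 = {token.encode("utf-8") for token in keys_1}
        if isinstance(token2, str):
            keys_2 = {token.encode("utf-8") for token in keys_2}
    return len(keys_1 & keys_2)


# str-keyed vocab (e.g. sentencepiece-style) vs bytes-keyed vocab (e.g. tiktoken-style)
vocab_a = {"hello": 0, "world": 1, "中": 2}
vocab_b = {b"hello": 0, b"\xe4\xb8\xad": 1, b"bye": 2}
print(overlap_token_size(vocab_a, vocab_b))  # 2 -> "hello" and "中"
```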
test/gpt4/gpt4_demo.py  (ADDED)

@@ -0,0 +1,38 @@
+"""
+
+## Why is the unicode wrong in the tiktoken-based version?
+- gpt-4o: {"id": 189, "token": "\u0001", "token_decode": "\u0001", "token_dumps": "\"\\u0001\"", "token_unicode": "\\u0001", "token_unicode_decimal": [1], "token_len": 1}  (wrong)
+- gpt-oss-20b: {"id": 189, "token": "ā", "token_decode": "\u0001", "token_dumps": "\"\\u0101\"", "token_unicode": "\\u0101", "token_unicode_decimal": [257], "token_len": 1}  (correct)
+"""
+
+import sys
+import pdb
+
+sys.path.append("../../")
+
+from vocab import tokenizer_factory
+from character_util import _decode_bytes_to_str
+
+
+
+tokenizer1 = tokenizer_factory.get_tokenizer("openai/gpt-oss-20b")
+tokenizer2 = tokenizer_factory.get_tokenizer("openai/gpt-4o")
+
+
+vocab_1 = {v: k for k, v in tokenizer1.get_vocab().items()}
+vocab_2 = {v: k for k, v in tokenizer2.get_vocab().items()}
+
+min_vocab_size = min(len(vocab_1), len(vocab_2))
+
+for i in range(min_vocab_size):
+    if i == 188:
+        import pdb; pdb.set_trace()
+    print(i)
+
+    token_str1 = tokenizer1.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
+    token_str2 = tokenizer2.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
+    token_str2 = _decode_bytes_to_str(token_str2)
+    if token_str1 != token_str2:
+        pdb.set_trace()
+        print(i, token_str1, token_str2)
+
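The discrepancy documented in the new demo's docstring (token id 189 rendered as the raw control character "\u0001" by the gpt-4o tokenizer but as "ā", decimal 257, by gpt-oss-20b) is consistent with GPT-2-style byte-level BPE, where every raw byte is mapped to a printable Unicode character and non-printable bytes are shifted into the 256+ range, so byte 0x01 becomes U+0101 ("ā"). The sketch below reimplements that standard mapping; it is not code from this repository:

```python
# GPT-2 style byte-to-unicode table (reimplemented sketch). Printable bytes keep
# their own code point; every other byte is mapped to chr(256 + n) so token text
# stays printable. This is why byte 0x01 is displayed as "ā" (U+0101).
def bytes_to_unicode() -> dict:
    bs = (list(range(ord("!"), ord("~") + 1))
          + list(range(ord("¡"), ord("¬") + 1))
          + list(range(ord("®"), ord("ÿ") + 1)))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return dict(zip(bs, [chr(c) for c in cs]))


table = bytes_to_unicode()
print(table[0x01])            # 'ā' -- matches the gpt-oss-20b rendering of token 189
print(ord(table[0x01]))       # 257 -- matches token_unicode_decimal in the docstring
```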