xu-song committed
Commit abd6a65 · 1 Parent(s): e6c318a
character_util.py CHANGED
@@ -23,6 +23,8 @@ default_columns = ["digit", "zh"]
 def _to_unicode(text):
     return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
 
+def _to_unicode_decimal(text):
+    return [ord(chr) for chr in text]
 
 def _get_coding_length(tokenizer, vocab, filter=None):
     """
@@ -62,6 +64,18 @@ def _dist(token_lens):
     return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"
 
 
+def _decode_bytes_to_str(token: bytes) -> str:
+    try:
+        token = token.decode("utf-8", errors="strict")
+    except UnicodeDecodeError:
+        try:
+            # for a single byte, such as b'\xa1'
+            token = token.decode("latin-1")
+        except UnicodeDecodeError:
+            logger.warning(f"token {token} decode failed")
+            token = token.decode("utf-8", errors="ignore")
+    return token
+
 def iter_vocab(
     tokenizer_name: str,
     from_cache: bool = True,
@@ -103,8 +117,8 @@ def iter_vocab(
         tags = []
         if token is None:  # some vocabularies contain empty (non-contiguous) ids
             continue
-        if isinstance(token, bytes):
-            token = token.decode("utf-8", errors="ignore")
+        if isinstance(token, bytes):  # convert bytes to str
+            token = _decode_bytes_to_str(token)
 
         if hasattr(tokenizer, "sp_model"):  # based on the sentencepiece package
             if tokenizer.sp_model.is_byte(token_id):
@@ -128,8 +142,11 @@ def iter_vocab(
                 "id": token_id,
                 "token": token,
                 "token_decode": decode_str,
-                "token_dumps": json.dumps(token),
+                "token_dumps": json.dumps(token),  # unicode:
+                # https://en.wikipedia.org/wiki/List_of_Unicode_characters
                 "token_unicode": _to_unicode(token),
+                "token_unicode_decimal": _to_unicode_decimal(token),  # decimal code points
+                # "token_utf8_bytes": "",
                 "token_len": len(decode_str),
             },
             ensure_ascii=False) + "\n")
@@ -212,5 +229,9 @@ def get_character_table(
 
 if __name__ == "__main__":
     # aa = get_character_table(tokenizer_filter="baichuan")
-    df = get_character_table()
-    logger.info(f"\n{df.to_markdown(index=False)}")
+
+    # iter_vocab("openai/gpt-4o", from_cache=False)
+    iter_vocab("openai/gpt-oss-20b", from_cache=False)
+
+    # df = get_character_table()
+    # logger.info(f"\n{df.to_markdown(index=False)}")
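Note: the new `_decode_bytes_to_str` helper tries strict UTF-8 first and falls back to latin-1 for stray single bytes. Below is a minimal standalone sketch of that fallback (illustrative names, not the repo module itself):

```python
# Minimal sketch of the decode fallback introduced above: strict UTF-8 first,
# then latin-1 for lone bytes such as b'\xa1' that are not valid UTF-8.
import logging

logger = logging.getLogger(__name__)


def decode_bytes_to_str(token: bytes) -> str:
    try:
        return token.decode("utf-8", errors="strict")
    except UnicodeDecodeError:
        try:
            # latin-1 maps every byte 0x00-0xFF to a code point, so this cannot fail
            return token.decode("latin-1")
        except UnicodeDecodeError:
            logger.warning("token %r decode failed", token)
            return token.decode("utf-8", errors="ignore")


if __name__ == "__main__":
    print(decode_bytes_to_str("空格".encode("utf-8")))  # valid UTF-8 -> '空格'
    print(decode_bytes_to_str(b"\xa1"))                  # lone byte   -> '¡' via latin-1
```

Because latin-1 accepts any byte, the final `errors="ignore"` branch is only a last-resort guard.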
playground_app.py CHANGED
@@ -4,7 +4,7 @@
 
 import gradio as gr
 from vocab import tokenizer_factory
-from playground_examples import example_types, example_fn
+from config import example_types, example_fn
 from playground_util import (tokenize,
                              tokenize_pair, basic_count,
                              get_overlap_token_size, on_load)
playground_examples.py DELETED
@@ -1,138 +0,0 @@
-"""
-
-## characters
-
-- alphanumeric characters
-- numeric characters
-- special characters: A special character is a character that is not an alphabetic or numeric character.
-- ASCII control characters
-- punctuation marks
-- accent marks
-- mathematical symbols
-- whitespace:
-  - https://en.wikipedia.org/wiki/Whitespace_character
-  - https://emptycharacter.com/
-
-
-https://www.computerhope.com/jargon/s/specchar.htm
-"""
-
-import random
-from datasets import load_dataset
-
-default_user_input = """\
-Replace this text in the input field to see how tokenization works.
-Buenos días!
-Tokenizer 是自然语言处理(NLP)中的一个关键组件,它的主要作用是将人类语言文本转换为计算机可以理解的数字表示形式。
-ラグビーワールドカップ2023フランス"""
-# default_tokenizer_name_1 = "Meta/llama3"
-# default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
-default_tokenizer_name_1 = "deepseek-ai/DeepSeek-R1"
-default_tokenizer_name_2 = "openai/gpt-4o"
-
-
-def get_sample_input():
-    default_inputs = {
-        "en": "Replace this text in the input field to see how tokenization works.",
-        "zh-Hans": "",
-        "es": "",
-        "de": "",
-    }
-    random.seed(10)  # For reproducibility
-    lines = []
-    for lang in default_inputs.keys():
-        dataset = load_dataset("eson/cc100-samples", lang, split="train")
-        print(dataset)
-        print(1)
-    return default_inputs
-
-
-examples = {
-    "en": [
-        ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],  #
-        [
-            "whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline",
-            "huggyllama/llama-7b",
-            "google-bert/bert-base-cased",
-        ],  # chatglm has blank_n tokens; bert drops the whitespace
-        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
-        [
-            'punctuation: ,.:/?+=",。!?;【】〔〕〖〗',
-            "google/gemma-7b",
-            "huggyllama/llama-7b",
-        ],  # the llama vocab is rather small
-        [
-            "symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
-            "baichuan-inc/Baichuan-7B",
-            "huggyllama/llama-7b",
-        ],
-        # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
-    ],
-    "zh": [
-        [
-            "空格测试: 2个空格 8个空格",
-            "llama",
-            "chatglm2_6b",
-        ],  # chatglm has blank_n tokens
-        ["标点测试:,。!?;", "baichuan_7b", "llama"],
-        [
-            "符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
-            "baichuan_7b",
-            "llama",
-        ],
-        ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
-        ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
-    ],
-}
-
-
-more_examples = [
-    # bert family
-    (
-        "google-bert/bert-base-cased",
-        "google-bert/bert-base-uncased",
-        "",
-        "",
-    ),  # # clue VS kplug, bert VS clue
-    ("bert-base-cased", "clue", "", "增加了[]()"),
-    ("roberta-chinese-clue", "kplug", "", ""),
-    # llama family (sentencepiece-based)
-    (
-        "baichuan",
-        "baichuan2",
-        "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1",
-    ),
-    ("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
-    ("llama", "chinese-llama-2-7b", ""),
-    ("llama", "llama3", "扩充词典"),
-    ("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
-    # glm family (sentencepiece-based)
-    ("glm", "chatglm1", ""),
-    ("chatglm1", "chatglm2", ""),
-    # gpt2 family
-    ("gpt2", "moss", ""),
-    ("", "", ""),
-    # openai family (tiktoken)
-    ("qwen", "gpt_35_turbo", ""),
-]
-
-lang = "en"
-
-example_types = [t[0].split(":")[0] for t in examples[lang]]
-
-
-def example_fn(example_idx):
-    return examples[lang][example_idx]
-
-
-def get_more_example():
-    import urllib.parse
-
-    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
-    for tokenizer1, tokenizer2, text, comment in more_examples:
-        full_url = f"{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}"
-        print(full_url)
-
-
-if __name__ == "__main__":
-    get_more_example()
playground_util.py CHANGED
@@ -6,7 +6,7 @@ from vocab import tokenizer_factory
 from character_util import iter_vocab
 from utils.log_util import logger
 from utils.i18n_util import get_lang
-from playground_examples import (
+from config import (
     default_tokenizer_name_1,
     default_tokenizer_name_2,
     default_user_input,
@@ -137,10 +137,11 @@ def get_overlap_token_size(tokenizer_name_1, tokenizer_name_2):
 
     vocab_set_1 = tokenizer1.get_vocab().keys()
     vocab_set_2 = tokenizer2.get_vocab().keys()
+    # v1 = {v:k for k, v in vocab1.items()}
 
     token1 = next(iter(vocab_set_1))
    token2 = next(iter(vocab_set_2))
-    if type(token1) != type(token2):  # bytes vs str
+    if type(token1) != type(token2):  # convert str to bytes so both vocabularies are comparable
        if isinstance(token1, str):
            vocab_set_1 = set([token.encode("utf-8") for token in vocab_set_1])
        if isinstance(token2, str):
@@ -209,9 +210,10 @@ def test_coding():
 if __name__ == "__main__":
 
     # print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
-    print(
-        get_overlap_token_size(
-            "gradientai/Llama-3-8B-Instruct-Gradient-1048k", "deepseek-ai/DeepSeek-R1"
-        )
-    )
+    print(get_overlap_token_size("openai/gpt-oss-20b", "openai/gpt-4o"))
+    # print(
+    #     get_overlap_token_size(
+    #         "gradientai/Llama-3-8B-Instruct-Gradient-1048k", "deepseek-ai/DeepSeek-R1"
+    #     )
+    # )
     # print(basic_count("internlm_chat_7b"))
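For reference, a standalone sketch of the overlap logic this hunk touches, assuming toy vocabularies instead of `tokenizer_factory`: when one tokenizer exposes its vocab as `str` and the other as `bytes`, both sides are normalized to bytes before intersecting.

```python
# Sketch of the vocab-overlap computation (toy data; the real code pulls vocabs
# from tokenizer_factory). Mixed str/bytes vocabs are normalized to bytes first.
def overlap_token_size(vocab_1, vocab_2) -> int:
    set_1, set_2 = set(vocab_1), set(vocab_2)
    token1 = next(iter(set_1))
    token2 = next(iter(set_2))
    if type(token1) != type(token2):  # convert str to bytes so both sides compare equal
        if isinstance(token1, str):
            set_1 = {token.encode("utf-8") for token in set_1}
        if isinstance(token2, str):
            set_2 = {token.encode("utf-8") for token in set_2}
    return len(set_1 & set_2)


if __name__ == "__main__":
    print(overlap_token_size(["hello", "world"], [b"hello", b"bytes"]))  # -> 1
```

Normalizing toward bytes rather than str avoids having to guess an encoding for byte-level vocab entries that are not valid UTF-8.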
test/gpt4/gpt4_demo.py ADDED
@@ -0,0 +1,38 @@
+"""
+
+## What causes the unicode error in the tiktoken-based version?
+- gpt-4o:      {"id": 189, "token": "\u0001", "token_decode": "\u0001", "token_dumps": "\"\\u0001\"", "token_unicode": "\\u0001", "token_unicode_decimal": [1], "token_len": 1}    (wrong)
+- gpt-oss-20b: {"id": 189, "token": "ā", "token_decode": "\u0001", "token_dumps": "\"\\u0101\"", "token_unicode": "\\u0101", "token_unicode_decimal": [257], "token_len": 1}    (correct)
+"""
+
+import sys
+import pdb
+
+sys.path.append("../../")
+
+from vocab import tokenizer_factory
+from character_util import _decode_bytes_to_str
+
+
+
+tokenizer1 = tokenizer_factory.get_tokenizer("openai/gpt-oss-20b")
+tokenizer2 = tokenizer_factory.get_tokenizer("openai/gpt-4o")
+
+
+vocab_1 = {v: k for k, v in tokenizer1.get_vocab().items()}
+vocab_2 = {v: k for k, v in tokenizer2.get_vocab().items()}
+
+min_vocab_size = min(len(vocab_1), len(vocab_2))
+
+for i in range(min_vocab_size):
+    if i == 188:
+        import pdb; pdb.set_trace()
+        print(i)
+
+    token_str1 = tokenizer1.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
+    token_str2 = tokenizer2.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
+    token_str2 = _decode_bytes_to_str(token_str2)
+    if token_str1 != token_str2:
+        pdb.set_trace()
+        print(i, token_str1, token_str2)
+
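For context, a sketch of the GPT-2-style `bytes_to_unicode` table that byte-level BPE tokenizers use for their vocab strings (illustrative; the repo may obtain it differently). Non-printable bytes are shifted into the U+0100+ range, so raw byte 0x01 is stored as 'ā' (U+0101, decimal 257), matching the gpt-oss-20b row in the docstring above; a path that skips this mapping reports the raw control character "\u0001" instead, consistent with the gpt-4o row.

```python
# Sketch of the GPT-2-style byte-to-unicode mapping used by byte-level BPE vocabs.
def bytes_to_unicode() -> dict[int, str]:
    # printable byte ranges map to themselves
    bs = list(range(ord("!"), ord("~") + 1)) + \
         list(range(ord("¡"), ord("¬") + 1)) + \
         list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:      # control/whitespace bytes get shifted above U+0100
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return dict(zip(bs, map(chr, cs)))


if __name__ == "__main__":
    table = bytes_to_unicode()
    print(table[0x01], ord(table[0x01]))  # -> ā 257
```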