Gül Sena Altıntaş committed on
Commit
f58b113
·
1 Parent(s): 37a99cb

Fixed tokenmonster issue

Browse files
Files changed (4) hide show
  1. app.py +1 -1
  2. mappings.py +1 -0
  3. requirements.txt +2 -1
  4. utils.py +22 -4
app.py CHANGED
@@ -716,7 +716,7 @@ with gr.Blocks(
716
 
717
  - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
718
  - **LLaMA-2/3**: Meta's models using SentencePiece (Llama-3 uses BPE)
719
- - **Gemma-2**: Google's model with SentencePiece
720
  - **Qwen3/2.5**: Alibaba's models with BPE
721
  - **BERT/DistilBERT**: Google's models with WordPiece
722
  - **BLOOM**: BigScience's multilingual model with BPE
 
716
 
717
  - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
718
  - **LLaMA-2/3**: Meta's models using SentencePiece (Llama-3 uses BPE)
719
+ - **Gemma-2**: Google's model with SentencePiece (though HuggingFace uses BPE)
720
  - **Qwen3/2.5**: Alibaba's models with BPE
721
  - **BERT/DistilBERT**: Google's models with WordPiece
722
  - **BLOOM**: BigScience's multilingual model with BPE
mappings.py CHANGED
@@ -14,6 +14,7 @@ MODEL_MAP = {
14
  "byt5": "google/byt5-small",
15
  }
16
 
 
17
  TOKENIZER_INFO = {
18
  "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
19
  "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
 
14
  "byt5": "google/byt5-small",
15
  }
16
 
17
+
18
  TOKENIZER_INFO = {
19
  "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
20
  "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
requirements.txt CHANGED
@@ -3,4 +3,5 @@ tiktoken
3
  transformers
4
  torch
5
  pandas
6
- plotly
 
 
3
  transformers
4
  torch
5
  pandas
6
+ plotly
7
+ tokenmonster
utils.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import re
3
  import unicodedata
 
4
 
5
  import tiktoken
6
  from transformers import AutoTokenizer
@@ -8,6 +9,20 @@ from transformers import AutoTokenizer
8
  from mappings import MODEL_MAP, TOKENIZER_INFO
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def get_token_type(token_text):
12
  if re.match(r"^\s+$", token_text):
13
  return "whitespace"
@@ -93,7 +108,6 @@ def tokenize_with_tiktoken(text, model):
93
  def tokenize_with_hf(text, model):
94
  try:
95
  model_name = MODEL_MAP.get(model, "gpt2")
96
-
97
  # Get token from environment
98
  hf_token = os.getenv("HF_TOKEN")
99
  if not hf_token:
@@ -103,9 +117,11 @@ def tokenize_with_hf(text, model):
103
  "tokens": [],
104
  "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
105
  }
106
-
107
- print(f"DEBUG: Loading model {model_name} with token")
108
- tokenizer = AutoTokenizer.from_pretrained(
 
 
109
  model_name, token=hf_token, trust_remote_code=True
110
  )
111
  token_data = []
@@ -117,6 +133,7 @@ def tokenize_with_hf(text, model):
117
  )
118
  token_ids = encoding["input_ids"]
119
  tokens = tokenizer.convert_ids_to_tokens(token_ids)
 
120
  # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
121
 
122
  for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
@@ -145,6 +162,7 @@ def tokenize_with_hf(text, model):
145
  except Exception as e:
146
  error_msg = str(e)
147
  print(f"DEBUG: Error: {error_msg}")
 
148
 
149
  # Provide helpful error messages
150
  if "gated repo" in error_msg.lower():
 
1
  import os
2
  import re
3
  import unicodedata
4
+ import traceback
5
 
6
  import tiktoken
7
  from transformers import AutoTokenizer
 
9
  from mappings import MODEL_MAP, TOKENIZER_INFO
10
 
11
 
12
+ class TokenMonsterTokenizer:
13
+ def __init__(self, name):
14
+ import tokenmonster
15
+ self.name = name
16
+ self.vocab = tokenmonster.load(name.split("/")[-1])
17
+
18
+ def __call__(self, text, **kwargs):
19
+ ids = list(self.vocab.tokenize(text))
20
+ return {"input_ids": ids}
21
+
22
+ def convert_ids_to_tokens(self, ids):
23
+ return [self.vocab.decode(id_) for id_ in ids]
24
+
25
+
26
  def get_token_type(token_text):
27
  if re.match(r"^\s+$", token_text):
28
  return "whitespace"
 
108
  def tokenize_with_hf(text, model):
109
  try:
110
  model_name = MODEL_MAP.get(model, "gpt2")
 
111
  # Get token from environment
112
  hf_token = os.getenv("HF_TOKEN")
113
  if not hf_token:
 
117
  "tokens": [],
118
  "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
119
  }
120
+
121
+ if "tokenmonster" in model_name:
122
+ tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
123
+ else:
124
+ tokenizer = AutoTokenizer.from_pretrained(
125
  model_name, token=hf_token, trust_remote_code=True
126
  )
127
  token_data = []
 
133
  )
134
  token_ids = encoding["input_ids"]
135
  tokens = tokenizer.convert_ids_to_tokens(token_ids)
136
+ print(model_name, tokens, token_ids)
137
  # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
138
 
139
  for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
 
162
  except Exception as e:
163
  error_msg = str(e)
164
  print(f"DEBUG: Error: {error_msg}")
165
+ print(traceback.format_exc())
166
 
167
  # Provide helpful error messages
168
  if "gated repo" in error_msg.lower():