minpeter committed on
Commit
02d0ecc
·
verified ·
1 Parent(s): 6442523

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +3 -3
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +22 -74
special_tokens_map.json CHANGED
@@ -1,20 +1,20 @@
1
  {
2
  "eos_token": {
3
- "content": "<|im_end|>",
4
  "lstrip": false,
5
  "normalized": false,
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
  "pad_token": {
10
- "content": "<|pad_token|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "unk_token": {
17
- "content": "<|unk_token|>",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
 
1
  {
2
  "eos_token": {
3
+ "content": "<|endoftext|>",
4
  "lstrip": false,
5
  "normalized": false,
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
  "pad_token": {
10
+ "content": "<|endoftext|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "unk_token": {
17
+ "content": "<|endoftext|>",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,38 +1,32 @@
1
  {
 
 
2
  "added_tokens_decoder": {
3
- "32000": {
4
- "content": "<|im_start|>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "32001": {
12
- "content": "<|im_end|>",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
16
  "single_word": false,
17
  "special": true
18
  },
19
- "32002": {
20
- "content": "<|unk_token|>",
21
  "lstrip": false,
22
  "normalized": false,
23
  "rstrip": false,
24
  "single_word": false,
25
  "special": true
26
  },
27
- "32003": {
28
- "content": "<|pad_token|>",
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
32
  "single_word": false,
33
  "special": true
34
  },
35
- "32004": {
36
  "content": "<tool_call>",
37
  "lstrip": false,
38
  "normalized": false,
@@ -40,7 +34,7 @@
40
  "single_word": false,
41
  "special": false
42
  },
43
- "32005": {
44
  "content": "</tool_call>",
45
  "lstrip": false,
46
  "normalized": false,
@@ -48,7 +42,7 @@
48
  "single_word": false,
49
  "special": false
50
  },
51
- "32006": {
52
  "content": "<think>",
53
  "lstrip": false,
54
  "normalized": false,
@@ -56,7 +50,7 @@
56
  "single_word": false,
57
  "special": false
58
  },
59
- "32007": {
60
  "content": "</think>",
61
  "lstrip": false,
62
  "normalized": false,
@@ -64,7 +58,7 @@
64
  "single_word": false,
65
  "special": false
66
  },
67
- "32008": {
68
  "content": "<|unused_special_token_0|>",
69
  "lstrip": false,
70
  "normalized": false,
@@ -72,7 +66,7 @@
72
  "single_word": false,
73
  "special": true
74
  },
75
- "32009": {
76
  "content": "<|unused_special_token_1|>",
77
  "lstrip": false,
78
  "normalized": false,
@@ -80,7 +74,7 @@
80
  "single_word": false,
81
  "special": true
82
  },
83
- "32010": {
84
  "content": "<|unused_special_token_2|>",
85
  "lstrip": false,
86
  "normalized": false,
@@ -88,68 +82,22 @@
88
  "single_word": false,
89
  "special": true
90
  },
91
- "32011": {
92
  "content": "<|unused_special_token_3|>",
93
  "lstrip": false,
94
  "normalized": false,
95
  "rstrip": false,
96
  "single_word": false,
97
  "special": true
98
- },
99
- "32012": {
100
- "content": "<|unused_special_token_4|>",
101
- "lstrip": false,
102
- "normalized": false,
103
- "rstrip": false,
104
- "single_word": false,
105
- "special": true
106
- },
107
- "32013": {
108
- "content": "<|unused_special_token_5|>",
109
- "lstrip": false,
110
- "normalized": false,
111
- "rstrip": false,
112
- "single_word": false,
113
- "special": true
114
- },
115
- "32014": {
116
- "content": "<|unused_special_token_6|>",
117
- "lstrip": false,
118
- "normalized": false,
119
- "rstrip": false,
120
- "single_word": false,
121
- "special": true
122
- },
123
- "32015": {
124
- "content": "<|unused_special_token_7|>",
125
- "lstrip": false,
126
- "normalized": false,
127
- "rstrip": false,
128
- "single_word": false,
129
- "special": true
130
- },
131
- "32016": {
132
- "content": "<|unused_special_token_8|>",
133
- "lstrip": false,
134
- "normalized": false,
135
- "rstrip": false,
136
- "single_word": false,
137
- "special": true
138
- },
139
- "32017": {
140
- "content": "<|unused_special_token_9|>",
141
- "lstrip": false,
142
- "normalized": false,
143
- "rstrip": false,
144
- "single_word": false,
145
- "special": true
146
  }
147
  },
 
148
  "clean_up_tokenization_spaces": false,
149
- "eos_token": "<|im_end|>",
150
  "extra_special_tokens": {},
151
  "model_max_length": 2048,
152
- "pad_token": "<|pad_token|>",
153
- "tokenizer_class": "PreTrainedTokenizer",
154
- "unk_token": "<|unk_token|>"
 
155
  }
 
1
  {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
  "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|endoftext|>",
 
 
 
 
 
 
 
 
7
  "lstrip": false,
8
  "normalized": false,
9
  "rstrip": false,
10
  "single_word": false,
11
  "special": true
12
  },
13
+ "1": {
14
+ "content": "<|im_start|>",
15
  "lstrip": false,
16
  "normalized": false,
17
  "rstrip": false,
18
  "single_word": false,
19
  "special": true
20
  },
21
+ "2": {
22
+ "content": "<|im_end|>",
23
  "lstrip": false,
24
  "normalized": false,
25
  "rstrip": false,
26
  "single_word": false,
27
  "special": true
28
  },
29
+ "31992": {
30
  "content": "<tool_call>",
31
  "lstrip": false,
32
  "normalized": false,
 
34
  "single_word": false,
35
  "special": false
36
  },
37
+ "31993": {
38
  "content": "</tool_call>",
39
  "lstrip": false,
40
  "normalized": false,
 
42
  "single_word": false,
43
  "special": false
44
  },
45
+ "31994": {
46
  "content": "<think>",
47
  "lstrip": false,
48
  "normalized": false,
 
50
  "single_word": false,
51
  "special": false
52
  },
53
+ "31995": {
54
  "content": "</think>",
55
  "lstrip": false,
56
  "normalized": false,
 
58
  "single_word": false,
59
  "special": false
60
  },
61
+ "31996": {
62
  "content": "<|unused_special_token_0|>",
63
  "lstrip": false,
64
  "normalized": false,
 
66
  "single_word": false,
67
  "special": true
68
  },
69
+ "31997": {
70
  "content": "<|unused_special_token_1|>",
71
  "lstrip": false,
72
  "normalized": false,
 
74
  "single_word": false,
75
  "special": true
76
  },
77
+ "31998": {
78
  "content": "<|unused_special_token_2|>",
79
  "lstrip": false,
80
  "normalized": false,
 
82
  "single_word": false,
83
  "special": true
84
  },
85
+ "31999": {
86
  "content": "<|unused_special_token_3|>",
87
  "lstrip": false,
88
  "normalized": false,
89
  "rstrip": false,
90
  "single_word": false,
91
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  }
93
  },
94
+ "bos_token": null,
95
  "clean_up_tokenization_spaces": false,
96
+ "eos_token": "<|endoftext|>",
97
  "extra_special_tokens": {},
98
  "model_max_length": 2048,
99
+ "pad_token": "<|endoftext|>",
100
+ "split_special_tokens": false,
101
+ "tokenizer_class": "PreTrainedTokenizerFast",
102
+ "unk_token": "<|endoftext|>"
103
  }