Orion-zhen committed on
Commit db503de · verified · 1 Parent(s): a32f33e

Upload 5 files

models/Kimi-VL-A3B/config.json CHANGED
@@ -0,0 +1,75 @@
{
  "architectures": [
    "KimiVLForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "configuration_kimi_vl.KimiVLConfig",
    "AutoModel": "modeling_kimi_vl.KimiVLForConditionalGeneration",
    "AutoModelForCausalLM": "modeling_kimi_vl.KimiVLForConditionalGeneration"
  },
  "vision_config": {
    "model_type": "moonvit",
    "patch_size": 14,
    "num_attention_heads": 16,
    "num_hidden_layers": 27,
    "hidden_size": 1152,
    "intermediate_size": 4304,
    "init_pos_emb_height": 64,
    "init_pos_emb_width": 64,
    "merge_kernel_size": [
      2,
      2
    ],
    "torch_dtype": "bfloat16"
  },
  "text_config": {
    "vocab_size": 163840,
    "max_position_embeddings": 131072,
    "hidden_size": 2048,
    "intermediate_size": 11264,
    "moe_intermediate_size": 1408,
    "num_hidden_layers": 27,
    "num_attention_heads": 16,
    "n_shared_experts": 2,
    "n_routed_experts": 64,
    "ep_size": 1,
    "routed_scaling_factor": 2.446,
    "kv_lora_rank": 512,
    "q_lora_rank": null,
    "qk_rope_head_dim": 64,
    "v_head_dim": 128,
    "qk_nope_head_dim": 128,
    "topk_method": "noaux_tc",
    "n_group": 1,
    "topk_group": 1,
    "num_experts_per_tok": 6,
    "moe_layer_freq": 1,
    "first_k_dense_replace": 1,
    "norm_topk_prob": true,
    "scoring_func": "sigmoid",
    "aux_loss_alpha": 0.001,
    "seq_aux": true,
    "num_key_value_heads": 16,
    "hidden_act": "silu",
    "initializer_range": 0.02,
    "rms_norm_eps": 1e-05,
    "pretraining_tp": 1,
    "use_cache": true,
    "rope_theta": 800000.0,
    "rope_scaling": null,
    "attention_bias": false,
    "attention_dropout": 0.0,
    "bos_token_id": 163584,
    "pad_token_id": 163839,
    "eos_token_id": 163585,
    "torch_dtype": "bfloat16",
    "tie_word_embeddings": false
  },
  "ignore_index": -100,
  "media_placeholder_token_id": 163605,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.50.3",
  "tie_word_embeddings": false,
  "vocab_size": 163840,
  "model_type": "kimi_vl"
}
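
The "auto_map" entries above route AutoConfig/AutoModel lookups to the custom modules shipped in this folder, so loading requires trust_remote_code. A minimal sketch, assuming the files are used from this local directory (the path is illustrative):

from transformers import AutoConfig

# Resolves KimiVLConfig via auto_map -> configuration_kimi_vl.KimiVLConfig
config = AutoConfig.from_pretrained("models/Kimi-VL-A3B", trust_remote_code=True)
print(config.model_type)                       # "kimi_vl"
print(config.vision_config.model_type)         # "moonvit" vision tower
print(config.text_config.num_experts_per_tok)  # 6 routed experts per token (MoE text decoder)
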
models/Kimi-VL-A3B/configuration_kimi_vl.py ADDED
@@ -0,0 +1,284 @@
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from typing import Optional, Union

logger = logging.get_logger(__name__)

DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class DeepseekV3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a DeepSeek
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a configuration similar to that of DeepSeek-V3.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Copied from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/configuration_deepseek.py

    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`DeepseekV3Model`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1407):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of nextn predict layers in the DeepseekV3 model.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        n_shared_experts (`int`, *optional*, defaults to None):
            Number of shared experts; None means a dense model.
        n_routed_experts (`int`, *optional*, defaults to None):
            Number of routed experts; None means a dense model.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for routed experts.
        topk_method (`str`, *optional*, defaults to `greedy`):
            Top-k method used in the routed gate.
        n_group (`int`, *optional*, defaults to None):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to None):
            Number of selected groups for each token (for each token, the selected experts are restricted to `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to None):
            Number of selected experts; None means a dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 0):
            Number of dense layers in the shallow layers (embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                                 \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to False):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to 'softmax'):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux (`bool`, *optional*, defaults to True):
            Whether to compute the auxiliary loss for each individual sample.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import DeepseekV3Model, DeepseekV3Config

    >>> # Initializing a Deepseek-V3 style configuration
    >>> configuration = DeepseekV3Config()

    >>> # Initializing a model from that configuration
    >>> model = DeepseekV3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method="noaux_tc",
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func="sigmoid",
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class MoonViTConfig(PretrainedConfig):
    model_type = "moonvit"

    def __init__(
        self,
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        num_attention_heads: int = 16,
        num_hidden_layers: int = 27,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        merge_kernel_size: tuple[int, int] = (2, 2),
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        # Positional embedding config
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width
        # Transformer config
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        # Patch merger config
        self.merge_kernel_size = merge_kernel_size


class KimiVLConfig(PretrainedConfig):
    model_type = "kimi_vl"

    def __init__(
        self,
        vision_config: Optional[Union[dict, MoonViTConfig]] = None,
        text_config: Optional[Union[dict, DeepseekV3Config]] = None,
        ignore_index: int = -100,
        media_placeholder_token_id: int = 163605,
        pad_token_id: int = 0,
        **kwargs,
    ):
        if vision_config is None:
            vision_config = MoonViTConfig()
        elif isinstance(vision_config, dict):
            vision_config = MoonViTConfig(**vision_config)
        self.vision_config = vision_config

        if text_config is None:
            text_config = DeepseekV3Config()
        elif isinstance(text_config, dict):
            text_config = DeepseekV3Config(**text_config)
        self.text_config = text_config

        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id

        attn_implementation = kwargs.get("attn_implementation")
        if attn_implementation is not None:
            if attn_implementation in ["eager", "flash_attention_2"]:
                self._attn_implementation = attn_implementation
                self.vision_config._attn_implementation = attn_implementation
                self.text_config._attn_implementation = attn_implementation
            else:
                raise ValueError(
                    f"Invalid attention implementation: {attn_implementation}"
                )

        super().__init__(pad_token_id=pad_token_id, **kwargs)
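
KimiVLConfig accepts its sub-configs either as config objects or as plain dicts (as stored in config.json) and coerces dicts into MoonViTConfig / DeepseekV3Config; an attn_implementation kwarg, if given, must be "eager" or "flash_attention_2" and is propagated to both sub-configs. A minimal sketch, assuming configuration_kimi_vl.py is importable from the working directory:

from configuration_kimi_vl import KimiVLConfig

config = KimiVLConfig(
    vision_config={"patch_size": 14, "merge_kernel_size": [2, 2]},
    text_config={"num_hidden_layers": 27, "hidden_size": 2048},
)
print(type(config.vision_config).__name__)  # MoonViTConfig (coerced from dict)
print(type(config.text_config).__name__)    # DeepseekV3Config (coerced from dict)
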
models/Kimi-VL-A3B/tiktoken.model ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
size 2795286
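
This is a Git LFS pointer, not the tokenizer data itself; the real tiktoken.model (about 2.8 MB of BPE merge ranks) is fetched with `git lfs pull`. A small sketch of inspecting it once fetched, using the same loader the tokenizer below relies on (the path is illustrative):

from tiktoken.load import load_tiktoken_bpe

ranks = load_tiktoken_bpe("models/Kimi-VL-A3B/tiktoken.model")
print(len(ranks))  # number of base (non-special) BPE tokens
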
models/Kimi-VL-A3B/tokenization_moonshot.py ADDED
@@ -0,0 +1,312 @@
import os
import tiktoken

from logging import getLogger
from pathlib import Path
from typing import (
    cast,
    Tuple,
    Dict,
    Iterator,
    List,
    Union,
    Optional,
)
from shutil import copyfile
from tiktoken.load import load_tiktoken_bpe
from tokenizers import AddedToken
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import to_py_obj
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode


logger = getLogger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
SPIECE_UNDERLINE = "▁"


class TikTokenTokenizer(PreTrainedTokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            The path to the Tiktoken model file.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|begin_of_text|>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|end_of_text|>"`):
            The end of sequence token.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_249|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead. The second to last item in special_tokens.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_250|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (list of `str`, *optional*):
            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
            skipped when decoding if `skip_special_tokens` is set to `True`.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    model_input_names = ["input_ids", "attention_mask"]

    special_tokens: Dict[str, int]

    num_reserved_special_tokens = 256

    pat_str = "|".join(
        [
            r"""[\p{Han}]+""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )

    def __init__(
        self,
        vocab_file,
        bos_token: Union[str, AddedToken] = "[BOS]",
        eos_token: Union[str, AddedToken] = "[EOS]",
        unk_token: Union[str, AddedToken] = "[UNK]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        additional_special_tokens: Optional[List[str]] = None,
        added_tokens_decoder: Optional[dict] = None,
        **kwargs,
    ):
        assert os.path.isfile(vocab_file), vocab_file
        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_end|>",
                "<|im_middle|>",
                "<|im_user|>",
                "<|im_assistant|>",
                "<|im_system|>",
            ]
        # added_tokens_decoder defaults to None; treat that as an empty mapping
        if added_tokens_decoder is None:
            added_tokens_decoder = {}
        special_tokens_mapping = {
            i: added_tokens_decoder[i].content for i in added_tokens_decoder
        }

        special_tokens = (
            [str(bos_token), str(eos_token)]
            + additional_special_tokens
            + [str(unk_token), str(pad_token)]
        )

        self.vocab_file = vocab_file
        mergeable_ranks = load_tiktoken_bpe(vocab_file)
        num_base_tokens = len(mergeable_ranks)
        self.special_tokens = {
            special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
            for i in range(
                num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
            )
        }

        self.model = tiktoken.Encoding(
            name=Path(vocab_file).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info(f"Reloaded tiktoken model from {vocab_file}")

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens[str(bos_token)]
        self.eos_id: int = self.special_tokens[str(eos_token)]
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )

        self.pad_id: int = self.special_tokens[str(pad_token)]
        self.unk_id: int = self.special_tokens[str(unk_token)]

        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        self.decoder = {}
        for i in range(self.n_words):
            # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
            decoding = "".join(
                [
                    self.byte_encoder[ord(char)]
                    for char in self.model.decode_single_token_bytes(i).decode(
                        "latin-1"
                    )
                ]
            )
            self.decoder[i] = decoding

        self.encoder = {}
        for i in range(self.n_words):
            if i in self.decoder:
                self.encoder[self.decoder[i]] = i

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.all_special_ids_set = set(self.all_special_ids)

    def encode(
        self, text: str, allow_special_tokens: bool = True, **kwargs
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            text (str): The input string to be encoded.

        Returns:
            list[int]: A list of token IDs.
        """
        # If there are other kwargs, call super().encode, which contains a lot of logic
        # to handle them. super().encode eventually calls _tokenize and _convert_token_to_id.
        if len(kwargs) > 0:
            return super().encode(text, **kwargs)

        assert type(text) is str

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        substrs = (
            substr
            for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
            for substr in self._split_whitespaces_or_nonwhitespaces(
                text[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
            )
        )
        t: List[int] = []
        for substr in substrs:
            if allow_special_tokens:
                t.extend(
                    # treat special tokens like ordinary tokens
                    self.model.encode(
                        substr,
                        allowed_special="all",
                    )
                )
            else:
                t.extend(
                    # treat special tokens like ordinary tokens
                    self.model.encode(
                        substr,
                        disallowed_special=(),
                    )
                )
        return t

    def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            token_ids (`int` or `List[int]`): The token ID or list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        # If there are other kwargs, call super().decode, which contains a lot of logic
        # to handle them. super().decode eventually calls convert_tokens_to_string and _convert_id_to_token.
        if len(kwargs) > 0:
            return super().decode(token_ids, **kwargs)

        token_ids = to_py_obj(token_ids)

        if type(token_ids) is int:
            token_ids = [token_ids]

        return self.model.decode(cast(List[int], token_ids))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
        consecutive whitespaces or consecutive non-whitespaces.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]

    """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """

    @property
    def vocab_size(self) -> int:
        return self.n_words

    def get_vocab(self) -> Dict[str, int]:
        return self.encoder

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        return [self.decoder[t] for t in self.encode(text)]

    def _convert_token_to_id(self, token: str) -> int:
        return self.encoder.get(token, self.unk_id)

    def _convert_id_to_token(self, index: int) -> str:
        return self.decoder.get(index)

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        return out_string

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        text = "".join(tokens).replace(SPIECE_UNDERLINE, "")
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            "utf-8", "replace"
        )
        return text

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + VOCAB_FILES_NAMES["vocab_file"],
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(
            out_vocab_file
        ) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
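
This tokenizer is wired up through AutoTokenizer via the auto_map in tokenizer_config.json below. A minimal round-trip sketch, assuming the repo directory and a fetched tiktoken.model:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("models/Kimi-VL-A3B", trust_remote_code=True)

ids = tokenizer.encode("Hello, Kimi-VL!")  # no extra kwargs -> fast tiktoken path
print(ids)
print(tokenizer.decode(ids))               # "Hello, Kimi-VL!"
print(tokenizer.vocab_size)                # n_words reported by the tiktoken encoding
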
models/Kimi-VL-A3B/tokenizer_config.json ADDED
@@ -0,0 +1,134 @@
{
  "added_tokens_decoder": {
    "163584": {
      "content": "[BOS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "163585": {
      "content": "[EOS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "163586": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "163601": {
      "content": "<|im_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "163587": {
      "content": "<|im_user|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "163588": {
      "content": "<|im_assistant|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "163594": {
      "content": "<|im_system|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "163602": {
      "content": "<|media_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "163603": {
      "content": "<|media_content|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "163604": {
      "content": "<|media_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "163605": {
      "content": "<|media_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "163838": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "163839": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|im_end|>",
    "<|im_user|>",
    "<|im_assistant|>",
    "<|im_system|>",
    "<|im_middle|>",
    "<|media_start|>",
    "<|media_content|>",
    "<|media_end|>",
    "<|media_pad|>"
  ],
  "bos_token": "[BOS]",
  "clean_up_tokenization_spaces": false,
  "eos_token": "[EOS]",
  "extra_special_tokens": {},
  "model_max_length": 1048576,
  "pad_token": "[PAD]",
  "unk_token": "[UNK]",
  "tokenizer_class": "TikTokenTokenizer",
  "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_moonshot.TikTokenTokenizer",
      null
    ]
  }
}
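
The chat_template wraps every turn in <|im_user|>/<|im_assistant|>/<|im_system|> role markers with <|im_middle|> and <|im_end|> delimiters, injects a default system prompt when the first message is not a system message, and replaces image content items with <|media_start|>image<|media_content|><|media_pad|><|media_end|>. A sketch of rendering it, assuming the tokenizer is loaded from this directory:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("models/Kimi-VL-A3B", trust_remote_code=True)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "demo.png"},  # placeholder image reference
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt)
# Output is one continuous string (wrapped here for readability):
# <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|><|im_user|>user<|im_middle|>
# <|media_start|>image<|media_content|><|media_pad|><|media_end|>Describe this image.<|im_end|>
# <|im_assistant|>assistant<|im_middle|>
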