Orion-zhen committed
Commit 41f8d0a · verified · 1 Parent(s): 6bd3564

Upload 3 files
models/ChatGLM/config.json CHANGED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "THUDM/chatglm-6b",
+   "architectures": [
+     "ChatGLMModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_chatglm.ChatGLMConfig",
+     "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
+     "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
+   },
+   "bos_token_id": 130004,
+   "eos_token_id": 130005,
+   "mask_token_id": 130000,
+   "gmask_token_id": 130001,
+   "pad_token_id": 3,
+   "hidden_size": 4096,
+   "inner_hidden_size": 16384,
+   "layernorm_epsilon": 1e-05,
+   "max_sequence_length": 2048,
+   "model_type": "chatglm",
+   "num_attention_heads": 32,
+   "num_layers": 28,
+   "position_encoding_2d": true,
+   "torch_dtype": "float16",
+   "transformers_version": "4.23.1",
+   "use_cache": true,
+   "vocab_size": 130528
+ }
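
The `auto_map` entries above point the Auto classes at the custom Python modules in this folder rather than at anything built into transformers, so the checkpoint has to be loaded with `trust_remote_code=True`. A minimal loading sketch, assuming the repo has been placed at the hypothetical local path `models/ChatGLM` and that the remaining ChatGLM files (`modeling_chatglm.py`, the weight shards, tokenizer assets) are present alongside the three files in this commit:

```python
from transformers import AutoConfig, AutoModel

# Hypothetical local path to this repo; adjust as needed.
model_dir = "models/ChatGLM"

# trust_remote_code=True lets transformers import the classes named in
# "auto_map" (ChatGLMConfig, ChatGLMForConditionalGeneration) from the
# uploaded .py files instead of its own built-in architectures.
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
print(config.model_type, config.vocab_size)  # chatglm 130528

model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).half()
```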
models/ChatGLM/configuration_chatglm.py ADDED
@@ -0,0 +1,103 @@
+ """ ChatGLM model configuration """
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ class ChatGLMConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`~ChatGLMModel`].
+     It is used to instantiate a ChatGLM model according to the specified arguments, defining the model
+     architecture. Instantiating a configuration with the defaults will yield a configuration similar to that of
+     the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used
+     to control the model outputs. Read the documentation from [`PretrainedConfig`]
+     for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 150528):
+             Vocabulary size of the ChatGLM-6B model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`~ChatGLMModel`] or
+             [`~TFChatGLMModel`].
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the encoder layers and the pooler layer.
+         num_layers (`int`, *optional*, defaults to 28):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         inner_hidden_size (`int`, *optional*, defaults to 16384):
+             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+         max_sequence_length (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with.
+             Typically set this to something large just in case (e.g., 512, 1024, or 2048).
+         layernorm_epsilon (`float`, *optional*, defaults to 1e-5):
+             The epsilon used by the layer normalization layers.
+         use_cache (`bool`, *optional*, defaults to `False`):
+             Whether the model should return the last key/value attentions (not used by all models).
+     Example:
+
+     ```python
+     >>> from configuration_chatglm import ChatGLMConfig
+     >>> from modeling_chatglm import ChatGLMModel
+
+     >>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration
+     >>> configuration = ChatGLMConfig()
+
+     >>> # Initializing a model from the THUDM/ChatGLM-6B style configuration
+     >>> model = ChatGLMModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```
+     """
+     model_type = "chatglm"
+
+     def __init__(
+         self,
+         vocab_size=150528,
+         hidden_size=4096,
+         num_layers=28,
+         num_attention_heads=32,
+         layernorm_epsilon=1e-5,
+         use_cache=False,
+         bos_token_id=150004,
+         eos_token_id=150005,
+         mask_token_id=150000,
+         gmask_token_id=150001,
+         pad_token_id=0,
+         max_sequence_length=2048,
+         inner_hidden_size=16384,
+         position_encoding_2d=True,
+         quantization_bit=0,
+         pre_seq_len=None,
+         prefix_projection=False,
+         **kwargs
+     ):
+         self.num_layers = num_layers
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_attention_heads = num_attention_heads
+         self.max_sequence_length = max_sequence_length
+         self.layernorm_epsilon = layernorm_epsilon
+         self.inner_hidden_size = inner_hidden_size
+         self.use_cache = use_cache
+         self.bos_token_id = bos_token_id
+         self.eos_token_id = eos_token_id
+         self.pad_token_id = pad_token_id
+         self.mask_token_id = mask_token_id
+         self.gmask_token_id = gmask_token_id
+         self.position_encoding_2d = position_encoding_2d
+         self.quantization_bit = quantization_bit
+         self.pre_seq_len = pre_seq_len
+         self.prefix_projection = prefix_projection
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             **kwargs
+         )
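
Note that the defaults baked into `ChatGLMConfig` above (a 150528-token vocabulary, BOS/EOS ids in the 150000 range) describe an older tokenizer layout; the config.json in this commit overrides them with the 130000-range ids when the checkpoint is loaded. A small sketch, for illustration only, of building the same configuration by hand from the values in config.json:

```python
from configuration_chatglm import ChatGLMConfig

# Values taken from the config.json in this commit; anything not listed keeps
# the class defaults (hidden_size=4096, num_layers=28, inner_hidden_size=16384, ...).
config = ChatGLMConfig(
    vocab_size=130528,
    bos_token_id=130004,
    eos_token_id=130005,
    mask_token_id=130000,
    gmask_token_id=130001,
    pad_token_id=3,
    max_sequence_length=2048,
    use_cache=True,
)

print(config.position_encoding_2d)  # True (class default, also set in config.json)
```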
models/ChatGLM/tokenization_chatglm.py ADDED
@@ -0,0 +1,443 @@
+ """Tokenization classes for ChatGLM."""
+ from typing import List, Optional, Union
+ import os
+
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.utils import logging, PaddingStrategy
+ from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+ from typing import Dict
+ import sentencepiece as spm
+ import numpy as np
+
+ logger = logging.get_logger(__name__)
+
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+     "THUDM/chatglm-6b": 2048,
+ }
+
+
+ class TextTokenizer:
+     def __init__(self, model_path):
+         self.sp = spm.SentencePieceProcessor()
+         self.sp.Load(model_path)
+         self.num_tokens = self.sp.vocab_size()
+
+     def encode(self, text):
+         return self.sp.EncodeAsIds(text)
+
+     def decode(self, ids: List[int]):
+         return self.sp.DecodeIds(ids)
+
+     def tokenize(self, text):
+         return self.sp.EncodeAsPieces(text)
+
+     def convert_tokens_to_string(self, tokens):
+         return self.sp.DecodePieces(tokens)
+
+     def convert_tokens_to_ids(self, tokens):
+         return [self.sp.PieceToId(token) for token in tokens]
+
+     def convert_token_to_id(self, token):
+         return self.sp.PieceToId(token)
+
+     def convert_id_to_token(self, idx):
+         return self.sp.IdToPiece(idx)
+
+     def __len__(self):
+         return self.num_tokens
+
+
+ class SPTokenizer:
+     def __init__(
+         self,
+         vocab_file,
+         num_image_tokens=20000,
+         max_blank_length=80,
+         byte_fallback=True,
+     ):
+         assert vocab_file is not None
+         self.vocab_file = vocab_file
+         self.num_image_tokens = num_image_tokens
+         self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
+         self.max_blank_length = max_blank_length
+         self.byte_fallback = byte_fallback
+         self.text_tokenizer = TextTokenizer(vocab_file)
+
+     def _get_text_tokenizer(self):
+         return self.text_tokenizer
+
+     @staticmethod
+     def get_blank_token(length: int):
+         assert length >= 2
+         return f"<|blank_{length}|>"
+
+     @staticmethod
+     def get_tab_token():
+         return "<|tab|>"
+
+     @property
+     def num_text_tokens(self):
+         return self.text_tokenizer.num_tokens
+
+     @property
+     def num_tokens(self):
+         return self.num_image_tokens + self.num_text_tokens
+
+     @staticmethod
+     def _encode_whitespaces(text: str, max_len: int = 80):
+         text = text.replace("\t", SPTokenizer.get_tab_token())
+         for i in range(max_len, 1, -1):
+             text = text.replace(" " * i, SPTokenizer.get_blank_token(i))
+         return text
+
+     def _preprocess(self, text: str, linebreak=True, whitespaces=True):
+         if linebreak:
+             text = text.replace("\n", "<n>")
+         if whitespaces:
+             text = self._encode_whitespaces(text, max_len=self.max_blank_length)
+         return text
+
+     def encode(
+         self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
+     ) -> List[int]:
+         """
+         @param text: Text to encode.
+         @param linebreak: Whether to encode newline (\n) in text.
+         @param whitespaces: Whether to encode multiple whitespaces or tabs in text, useful for source code encoding.
+         @param add_dummy_prefix: Whether to add a dummy blank space at the beginning.
+         """
+         text = self._preprocess(text, linebreak, whitespaces)
+         if not add_dummy_prefix:
+             text = "<n>" + text
+         tmp = self._get_text_tokenizer().encode(text)
+         tokens = [x + self.num_image_tokens for x in tmp]
+         return tokens if add_dummy_prefix else tokens[2:]
+
+     def postprocess(self, text):
+         text = text.replace("<n>", "\n")
+         text = text.replace(SPTokenizer.get_tab_token(), "\t")
+         for i in range(2, self.max_blank_length + 1):
+             text = text.replace(self.get_blank_token(i), " " * i)
+         return text
+
+     def decode(self, text_ids: List[int]) -> str:
+         ids = [int(_id) - self.num_image_tokens for _id in text_ids]
+         ids = [_id for _id in ids if _id >= 0]
+         text = self._get_text_tokenizer().decode(ids)
+         text = self.postprocess(text)
+         return text
+
+     def decode_tokens(self, tokens: List[str]) -> str:
+         text = self._get_text_tokenizer().convert_tokens_to_string(tokens)
+         text = self.postprocess(text)
+         return text
+
+     def tokenize(
+         self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
+     ) -> List[str]:
+         """
+         @param text: Text to encode.
+         @param linebreak: Whether to encode newline (\n) in text.
+         @param whitespaces: Whether to encode multiple whitespaces or tabs in text, useful for source code encoding.
+         @param add_dummy_prefix: Whether to add a dummy blank space at the beginning.
+         """
+         text = self._preprocess(text, linebreak, whitespaces)
+         if not add_dummy_prefix:
+             text = "<n>" + text
+         tokens = self._get_text_tokenizer().tokenize(text)
+         return tokens if add_dummy_prefix else tokens[2:]
+
+     def __getitem__(self, x: Union[int, str]):
+         if isinstance(x, int):
+             if x < self.num_image_tokens:
+                 return "<image_{}>".format(x)
+             else:
+                 return self.text_tokenizer.convert_id_to_token(x - self.num_image_tokens)
+         elif isinstance(x, str):
+             if x.startswith("<image_") and x.endswith(">") and x[7:-1].isdigit():
+                 return int(x[7:-1])
+             else:
+                 return self.text_tokenizer.convert_token_to_id(x) + self.num_image_tokens
+         else:
+             raise ValueError("The key should be str or int.")
+
+
+ class ChatGLMTokenizer(PreTrainedTokenizer):
+     """
+     Construct a ChatGLM tokenizer. Based on a SentencePiece model.
+
+     Args:
+         vocab_file (`str`):
+             Path to the vocabulary file.
+     """
+
+     vocab_files_names = {"vocab_file": "ice_text.model"}
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+     def __init__(
+         self,
+         vocab_file,
+         do_lower_case=False,
+         remove_space=False,
+         bos_token='<sop>',
+         eos_token='<eop>',
+         end_token='</s>',
+         mask_token='[MASK]',
+         gmask_token='[gMASK]',
+         padding_side="left",
+         pad_token="<pad>",
+         unk_token="<unk>",
+         num_image_tokens=20000,
+         **kwargs
+     ) -> None:
+         super().__init__(
+             do_lower_case=do_lower_case,
+             remove_space=remove_space,
+             padding_side=padding_side,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             end_token=end_token,
+             mask_token=mask_token,
+             gmask_token=gmask_token,
+             pad_token=pad_token,
+             unk_token=unk_token,
+             num_image_tokens=num_image_tokens,
+             **kwargs
+         )
+
+         self.do_lower_case = do_lower_case
+         self.remove_space = remove_space
+         self.vocab_file = vocab_file
+
+         self.bos_token = bos_token
+         self.eos_token = eos_token
+         self.end_token = end_token
+         self.mask_token = mask_token
+         self.gmask_token = gmask_token
+
+         self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)
+
+         """ Initialisation """
+
+     @property
+     def gmask_token_id(self) -> Optional[int]:
+         if self.gmask_token is None:
+             return None
+         return self.convert_tokens_to_ids(self.gmask_token)
+
+     @property
+     def end_token_id(self) -> Optional[int]:
+         """
+         `Optional[int]`: Id of the end-of-context token in the vocabulary. Returns `None` if the token has not been
+         set.
+         """
+         if self.end_token is None:
+             return None
+         return self.convert_tokens_to_ids(self.end_token)
+
+     @property
+     def vocab_size(self):
+         """ Returns vocab size """
+         return self.sp_tokenizer.num_tokens
+
+     def get_vocab(self):
+         """ Returns vocab as a dict """
+         vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+     def preprocess_text(self, inputs):
+         if self.remove_space:
+             outputs = " ".join(inputs.strip().split())
+         else:
+             outputs = inputs
+
+         if self.do_lower_case:
+             outputs = outputs.lower()
+
+         return outputs
+
+     def _tokenize(self, text, **kwargs):
+         """ Returns a tokenized string. """
+         text = self.preprocess_text(text)
+
+         seq = self.sp_tokenizer.tokenize(text)
+
+         return seq
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         return self.sp_tokenizer.decode_tokens(tokens)
+
+     def _decode(
+         self,
+         token_ids: Union[int, List[int]],
+         **kwargs
+     ) -> str:
+         if isinstance(token_ids, int):
+             token_ids = [token_ids]
+         if len(token_ids) == 0:
+             return ""
+         if self.pad_token_id in token_ids:  # remove pad
+             token_ids = list(filter((self.pad_token_id).__ne__, token_ids))
+         return super()._decode(token_ids, **kwargs)
+
+     def _convert_token_to_id(self, token):
+         """ Converts a token (str) into an id using the vocab. """
+         return self.sp_tokenizer[token]
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) into a token (str) using the vocab."""
+         return self.sp_tokenizer[index]
+
+     def save_vocabulary(self, save_directory, filename_prefix=None):
+         """
+         Save the vocabulary and special tokens file to a directory.
+
+         Args:
+             save_directory (`str`):
+                 The directory in which to save the vocabulary.
+             filename_prefix (`str`, *optional*):
+                 An optional prefix to add to the names of the saved files.
+
+         Returns:
+             `Tuple(str)`: Paths to the files saved.
+         """
+         if os.path.isdir(save_directory):
+             vocab_file = os.path.join(
+                 save_directory, self.vocab_files_names["vocab_file"]
+             )
+         else:
+             vocab_file = save_directory
+
+         with open(self.vocab_file, 'rb') as fin:
+             proto_str = fin.read()
+
+         with open(vocab_file, "wb") as writer:
+             writer.write(proto_str)
+
+         return (vocab_file,)
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+         and adding special tokens. A ChatGLM sequence has the following format:
+
+         - single sequence: `X [gMASK] <sop>`
+         - pair of sequences: `A [gMASK] <sop> B <eop>`
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+         """
+         gmask_id = self.sp_tokenizer[self.gmask_token]
+         eos_id = self.sp_tokenizer[self.eos_token]
+         token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
+         if token_ids_1 is not None:
+             token_ids_0 = token_ids_0 + token_ids_1 + [eos_id]
+         return token_ids_0
+
+     def _pad(
+         self,
+         encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+         max_length: Optional[int] = None,
+         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+         pad_to_multiple_of: Optional[int] = None,
+         return_attention_mask: Optional[bool] = None,
+     ) -> dict:
+         """
+         Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+         Args:
+             encoded_inputs:
+                 Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+             max_length: maximum length of the returned list and optionally padding length (see below).
+                 Will truncate by taking into account the special tokens.
+             padding_strategy: PaddingStrategy to use for padding.
+
+                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                 - PaddingStrategy.DO_NOT_PAD: Do not pad
+                 The tokenizer padding sides are defined in self.padding_side:
+
+                     - 'left': pads on the left of the sequences
+                     - 'right': pads on the right of the sequences
+             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                 `>= 7.5` (Volta).
+             return_attention_mask:
+                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+         """
+         # Load from model defaults
+         bos_token_id = self.sp_tokenizer[self.bos_token]
+         mask_token_id = self.sp_tokenizer[self.mask_token]
+         gmask_token_id = self.sp_tokenizer[self.gmask_token]
+         assert self.padding_side == "left"
+
+         required_input = encoded_inputs[self.model_input_names[0]]
+         seq_length = len(required_input)
+
+         if padding_strategy == PaddingStrategy.LONGEST:
+             max_length = len(required_input)
+
+         if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+             max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+         needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+         # Initialize attention mask if not present.
+         if max_length is not None:
+             if "attention_mask" not in encoded_inputs:
+                 if bos_token_id in required_input:
+                     context_length = required_input.index(bos_token_id)
+                 else:
+                     context_length = seq_length
+                 attention_mask = np.ones((1, seq_length, seq_length))
+                 attention_mask = np.tril(attention_mask)
+                 attention_mask[:, :, :context_length] = 1
+                 attention_mask = np.bool_(attention_mask < 0.5)
+                 encoded_inputs["attention_mask"] = attention_mask
+
+             if "position_ids" not in encoded_inputs:
+                 if bos_token_id in required_input:
+                     context_length = required_input.index(bos_token_id)
+                 else:
+                     context_length = seq_length
+                 position_ids = np.arange(seq_length, dtype=np.int64)
+                 mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
+                 if mask_token in required_input:
+                     mask_position = required_input.index(mask_token)
+                     position_ids[context_length:] = mask_position
+                 block_position_ids = np.concatenate(
+                     [np.zeros(context_length, dtype=np.int64),
+                      np.arange(1, seq_length - context_length + 1, dtype=np.int64)])
+                 encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
+
+         if needs_to_be_padded:
+             difference = max_length - len(required_input)
+
+             if "attention_mask" in encoded_inputs:
+                 encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
+                                                           pad_width=[(0, 0), (difference, 0), (difference, 0)],
+                                                           mode='constant', constant_values=True)
+             if "token_type_ids" in encoded_inputs:
+                 encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                     "token_type_ids"
+                 ]
+             if "special_tokens_mask" in encoded_inputs:
+                 encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+             if "position_ids" in encoded_inputs:
+                 encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
+                                                         pad_width=[(0, 0), (difference, 0)])
+             encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+
+         return encoded_inputs
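
For orientation, a rough usage sketch of the tokenizer above (an illustration only, assuming the SentencePiece file `ice_text.model` declared in `vocab_files_names` has been downloaded next to these files; it is not part of this commit): `build_inputs_with_special_tokens` appends `[gMASK]` and `<sop>` after the text ids, and `_pad` left-pads while building the boolean attention mask and the 2D position ids the model expects.

```python
from tokenization_chatglm import ChatGLMTokenizer

# ice_text.model is assumed to be present; it is not included in this commit.
tokenizer = ChatGLMTokenizer(vocab_file="models/ChatGLM/ice_text.model")

enc = tokenizer("你好")
# The last two ids should be [gMASK] and <sop>, per build_inputs_with_special_tokens.
print(enc["input_ids"][-2:] == [tokenizer.sp_tokenizer["[gMASK]"],
                                tokenizer.sp_tokenizer["<sop>"]])

# Left padding to a fixed length triggers _pad, which adds the 2D position ids
# (shape (2, max_length)) and a boolean attention mask of shape (1, max_length, max_length).
padded = tokenizer.pad({"input_ids": enc["input_ids"]}, padding="max_length", max_length=16)
print(padded["position_ids"].shape)    # (2, 16)
print(padded["attention_mask"].shape)  # (1, 16, 16)
```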