# source: https://huggingface.co/learn/nlp-course/en/chapter6/8?fw=pt
from tokenizers import normalizers, models, decoders, pre_tokenizers, trainers, Tokenizer, processors
from datasets import load_dataset

dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")

def get_training_corpus(batch_size=1000):
    # yield the raw text column in batches so training never materializes the whole corpus at once
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]
# WordPiece model; out-of-vocabulary tokens map to [UNK]
tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))

# BERT-style normalization: Unicode NFD decomposition, lowercasing, accent stripping
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)
print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
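# The line above should print "hello how are u?": NFD splits off the accents,
# Lowercase lowercases, and StripAccents removes the combining marks.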
# Whitespace() splits on whitespace and punctuation; pre_tokenizers.BertPreTokenizer() is an equivalent choice here
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
print(tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer."))
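# Expected output: a list of (word, offsets) pairs, roughly
# [('Let', (0, 3)), ("'", (3, 4)), ('s', (4, 5)), ('test', (6, 10)), ('my', (11, 13)),
#  ('pre', (14, 17)), ('-', (17, 18)), ('tokenizer', (18, 27)), ('.', (27, 28))]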
pre_tokenizer = pre_tokenizers.WhitespaceSplit()
print(pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer."))
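# WhitespaceSplit() only splits on whitespace, so punctuation stays attached:
# [("Let's", (0, 5)), ('test', (6, 10)), ('my', (11, 13)), ('pre-tokenizer.', (14, 28))]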
# manually combining individual splitters into one pre-tokenizer
pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)
print(pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer."))
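# Chaining WhitespaceSplit and Punctuation reproduces the Whitespace() behaviour above:
# the text is split both on whitespace and on punctuation marks.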
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)
# train from an iterator
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)
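# Special tokens are added to the vocabulary first, in the order listed above,
# so this should print "2 3" ([CLS] at index 2, [SEP] at index 3).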
| """ | |
| To write the template for the TemplateProcessor, we have to specify how to treat a single sentence and a pair of sentences. | |
| For both, we write the special tokens we want to use; the first (or single) sentence is represented by $A, | |
| while the second sentence (if encoding a pair) is represented by $B. For each of these (special tokens and sentences), | |
| we also specify the corresponding token type ID after a colon. | |
| The classic BERT template is thus defined as follows: | |
| """ | |
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)
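# The exact subword splits depend on the trained vocabulary, but with the course's
# setup the tokens look like:
# ['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']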
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print(encoding.tokens)
print(encoding.type_ids)
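# For a sentence pair the template places a [SEP] between the two sentences and the
# type IDs switch from 0 to 1 for the second sentence, e.g.
# ['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '...', '[SEP]',
#  'on', 'a', 'pair', 'of', 'sentences', '.', '[SEP]']
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]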
tokenizer.decoder = decoders.WordPiece(prefix="##")
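# Quick sanity check: the WordPiece decoder merges "##"-prefixed continuation tokens back
# onto the preceding word, so decoding the pair encoding above should give back roughly
# "let's test this tokenizer... on a pair of sentences."
print(tokenizer.decode(encoding.ids))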
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    # tokenizer_file="tokenizer.json",  # you can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
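# Minimal usage sketch (the directory name below is just an illustrative choice): the wrapped
# tokenizer behaves like any fast tokenizer in transformers, and both objects can be saved
# for later reuse.
print(wrapped_tokenizer("Let's test this tokenizer.", "on a pair of sentences."))
tokenizer.save("tokenizer.json")                              # raw tokenizers format
wrapped_tokenizer.save_pretrained("bert-wikitext-tokenizer")  # transformers format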