tymbos committed on
Commit 29cc980 · verified · 1 Parent(s): 59d3e79

Update train_tokenizer.py

Files changed (1):
  1. train_tokenizer.py (+10 −15)
train_tokenizer.py CHANGED
@@ -1,32 +1,27 @@
 from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
-from datasets import load_dataset
-import re
 
-# Custom pre-tokenizer for code
-code_regex = r"""'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|//.*|\/\*[\s\S]*?\*\/|\b(?:if|else|for|while|return|function)\b|[<>]=?|\+{1,2}|-{1,2}|&&|\|\||[!*/%^&|=-]|\d+\.\d+|\d+|\.\d+|[:;,.{}[\]()]|\p{L}+|\p{N}+|\s+|\S"""
-
-def train_tokenizer(iterator, vocab_size=32000, min_frequency=2):
+def train_tokenizer(iterator, vocab_size=50000, min_frequency=3):
     tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
 
-    # Advanced normalization
+    # Normalization for Greek and Unicode
     tokenizer.normalizer = normalizers.Sequence([
         normalizers.NFC(),
-        # normalizers.StripAccents()  # optional, for accent marks
+        normalizers.StripAccents()
     ])
 
-    # Custom pre-tokenizer with Split
+    # Advanced pre-tokenizer for mixed texts
     tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
-        pre_tokenizers.Split(pattern=re.compile(code_regex), behavior='isolated'),
-        pre_tokenizers.ByteLevel(add_prefix_space=False)
+        pre_tokenizers.WhitespaceSplit(),
+        pre_tokenizers.Punctuation(),
+        pre_tokenizers.Digits(individual_digits=True)
     ])
 
-    # Advanced trainer
+    # Special tokens for historical texts
     trainer = trainers.BpeTrainer(
         vocab_size=vocab_size,
         min_frequency=min_frequency,
-        special_tokens=["<|endoftext|>", "<pad>", "<unk>", "<mask>"],
-        continuing_subword_prefix="",
-        show_progress=True
+        special_tokens=["<|endoftext|>", "<pad>", "<unk>", "<mask>", "[CITATION]"],
+        continuing_subword_prefix=""
     )
 
     tokenizer.train_from_iterator(iterator, trainer=trainer)
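
For context, a minimal usage sketch (not part of the commit) showing how the updated function might be driven with an in-memory corpus. The corpus strings, the vocab_size=500 / min_frequency=1 overrides, and the output path are illustrative assumptions, and the sketch presumes a `return tokenizer` is added at the end of train_tokenizer, since the committed version trains in place and returns None:

# Minimal usage sketch, not part of the commit. Assumes train_tokenizer.py
# is importable as a module and that the function is extended with a final
# `return tokenizer`.
from train_tokenizer import train_tokenizer

# Tiny illustrative corpus; a real run would stream a full dataset.
corpus = [
    "Η Επανάσταση του 1821 ξεκίνησε στην Πελοπόννησο.",
    "Mixed Greek and English text with punctuation and dates: 25/03/1821.",
]

tokenizer = train_tokenizer(iter(corpus), vocab_size=500, min_frequency=1)

# The new pre-tokenizer chain splits on whitespace, isolates punctuation,
# and breaks numbers into individual digits; this yields (piece, offsets)
# pairs such as ('Ελλάδα', ...), ('1', ...), ('8', ...), ('2', ...),
# ('1', ...), ('!', ...). Note the normalizer (NFC + StripAccents) is not
# applied at this stage, only during full encoding.
print(tokenizer.pre_tokenizer.pre_tokenize_str("Ελλάδα 1821!"))

tokenizer.save("tokenizer.json")  # reload later with Tokenizer.from_file()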