Update train_tokenizer.py
train_tokenizer.py  +10 -15  CHANGED
@@ -1,32 +1,27 @@
 from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
-from datasets import load_dataset
-import re
 
-
-code_regex = r"""'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|//.*|\/\*[\s\S]*?\*\/|\b(?:if|else|for|while|return|function)\b|[<>]=?|\+{1,2}|-{1,2}|&&|\|\||[!*/%^&|=-]|\d+\.\d+|\d+|\.\d+|[:;,.{}[\]()]|\p{L}+|\p{N}+|\s+|\S"""
-
-def train_tokenizer(iterator, vocab_size=32000, min_frequency=2):
+def train_tokenizer(iterator, vocab_size=50000, min_frequency=3):
     tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
 
-    #
+    # Normalization for Greek and Unicode text
     tokenizer.normalizer = normalizers.Sequence([
         normalizers.NFC(),
-
+        normalizers.StripAccents()
     ])
 
-    #
+    # Advanced pre-tokenizer for mixed texts
     tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
-        pre_tokenizers.
-        pre_tokenizers.
+        pre_tokenizers.WhitespaceSplit(),
+        pre_tokenizers.Punctuation(),
+        pre_tokenizers.Digits(individual_digits=True)
     ])
 
-    #
+    # Special tokens for historical texts
     trainer = trainers.BpeTrainer(
         vocab_size=vocab_size,
         min_frequency=min_frequency,
-        special_tokens=["<|endoftext|>", "<pad>", "<unk>", "<mask>"],
-        continuing_subword_prefix=""
-        show_progress=True
+        special_tokens=["<|endoftext|>", "<pad>", "<unk>", "<mask>", "[CITATION]"],
+        continuing_subword_prefix=""
     )
 
     tokenizer.train_from_iterator(iterator, trainer=trainer)
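For context, a minimal sketch of how the updated train_tokenizer function might be driven end to end. The corpus file name, the batch_iterator helper, and the save path are illustrative assumptions and not part of this commit; the hunk above also ends before the function's closing lines, so the assumption that it returns the trained Tokenizer is hedged in the comments.

    from datasets import load_dataset

    from train_tokenizer import train_tokenizer  # the function modified in this commit

    def batch_iterator(dataset, batch_size=1000, text_column="text"):
        # Stream the corpus in batches so the whole dataset never sits in memory.
        for i in range(0, len(dataset), batch_size):
            yield dataset[i : i + batch_size][text_column]

    # "corpus.txt" is a placeholder for the actual training corpus.
    dataset = load_dataset("text", data_files="corpus.txt", split="train")

    # Assumes train_tokenizer returns the trained Tokenizer; the diff shown here
    # stops before the end of the function, so this return value is an assumption.
    tokenizer = train_tokenizer(batch_iterator(dataset), vocab_size=50000, min_frequency=3)
    tokenizer.save("tokenizer.json")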