import os

from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers


def train_tokenizer(iterator, vocab_size=50000, min_frequency=3, output_dir="tokenizer_model"):
    """
    Trains a BPE (Byte-Pair Encoding) tokenizer and saves the result.
    """
    print("🔄 Starting tokenizer training...")

    # BPE model with an explicit unknown token.
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

    # Unicode NFC normalization: canonically-equivalent sequences
    # (e.g. precomposed vs. combining accents) map to the same form.
    tokenizer.normalizer = normalizers.NFC()

    # Split on whitespace, then isolate punctuation marks and individual digits.
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.WhitespaceSplit(),
        pre_tokenizers.Punctuation(),
        pre_tokenizers.Digits(individual_digits=True),
    ])

    # Trainer configuration: special tokens are reserved up front;
    # continuing_subword_prefix="" keeps word-internal pieces unmarked
    # (no "##"-style prefix).
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["<|endoftext|>", "<pad>", "<unk>", "<mask>", "[CITATION]"],
        continuing_subword_prefix="",
    )

    tokenizer.train_from_iterator(iterator, trainer=trainer)

    # NOTE: decoders.ByteLevel normally pairs with a ByteLevel pre-tokenizer;
    # with the WhitespaceSplit pre-tokenizer above, decoding will join tokens
    # without restoring the spaces between words.
    tokenizer.decoder = decoders.ByteLevel()

    os.makedirs(output_dir, exist_ok=True)

    # tokenizer.json holds the full pipeline (normalizer, pre-tokenizer,
    # model, decoder); model.save also writes vocab.json and merges.txt.
    tokenizer.save(os.path.join(output_dir, "tokenizer.json"))
    tokenizer.model.save(output_dir)

    print(f"✅ Tokenizer saved to '{output_dir}'!")
    return tokenizer
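

# ----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). Assumes a
# plain-text UTF-8 corpus at "corpus.txt" (hypothetical path);
# train_from_iterator accepts any iterator over strings or lists of strings.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    def corpus_iterator(path="corpus.txt", batch_size=1000):
        # Yield batches of lines so the whole corpus never sits in memory.
        with open(path, encoding="utf-8") as f:
            batch = []
            for line in f:
                batch.append(line)
                if len(batch) == batch_size:
                    yield batch
                    batch = []
            if batch:
                yield batch

    tok = train_tokenizer(corpus_iterator(), vocab_size=50000, min_frequency=3)

    # Reload from disk and run a quick sanity check.
    reloaded = Tokenizer.from_file(os.path.join("tokenizer_model", "tokenizer.json"))
    print(reloaded.encode("Hello world, 123!").tokens)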