from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers


def train_tokenizer(iterator, vocab_size=50000, min_frequency=3):
    # <unk> is a placeholder unknown-token marker; match it to the
    # project's actual convention if one exists.
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

    # Normalization for Greek and Unicode: decompose first (NFD) so that
    # StripAccents can remove the combining marks, then recompose (NFC).
    # StripAccents only removes decomposed combining characters, so it
    # would be a no-op on precomposed (NFC) Greek letters.
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),
        normalizers.StripAccents(),
        normalizers.NFC(),
    ])

    # Advanced pre-tokenizer for mixed texts: split on whitespace,
    # isolate punctuation, and emit each digit as its own token.
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.WhitespaceSplit(),
        pre_tokenizers.Punctuation(),
        pre_tokenizers.Digits(individual_digits=True),
    ])

    # Special tokens for historical texts; <unk>, <s>, and </s> are
    # generic placeholder markers alongside the end-of-text and
    # citation markers.
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["<|endoftext|>", "<unk>", "<s>", "</s>", "[CITATION]"],
        continuing_subword_prefix="",
    )

    tokenizer.train_from_iterator(iterator, trainer=trainer)

    # Note: decoders.ByteLevel() normally pairs with a ByteLevel
    # pre-tokenizer; with the whitespace/punctuation splitting above,
    # decoded text will not have spaces restored between words.
    tokenizer.decoder = decoders.ByteLevel()
    return tokenizer
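
# A minimal usage sketch (illustrative, not from the source): trains on a
# tiny in-memory corpus, but any iterator of strings works, e.g. lines
# streamed from a corpus file. The sample sentences, the hyperparameters,
# and the output file name below are all hypothetical.
if __name__ == "__main__":
    corpus = [
        "Περὶ τῆς τῶν Ἀθηναίων πολιτείας.",         # polytonic Greek sample
        "Printed in 1821; see [CITATION], p. 42.",  # mixed Latin text with digits
    ]

    tok = train_tokenizer(corpus, vocab_size=1000, min_frequency=1)

    enc = tok.encode("Ἀθηναίων 1821")
    print(enc.tokens)  # accents stripped by the normalizer; digits split one by one

    tok.save("tokenizer.json")  # reload later with Tokenizer.from_file("tokenizer.json")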