import regex

from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
from tokenizers.pre_tokenizers import PreTokenizer

from datasets import load_dataset
|

# Regex used to pre-tokenize source code. Flags: x (verbose), m ($ matches end of
# line for // comments) and s (. matches newlines inside /* */ block comments).
code_regex = r"""(?xms:
    //.*?$|                                          # // line comments
    /\*.*?\*/|                                       # /* */ block comments
    "(?:\\.|[^\\"])*"|                               # double-quoted strings
    '(?:\\.|[^\\'])*'|                               # single-quoted strings
    \b(?:if|else|for|while|return|function)\b|       # code keywords
    [<>]=?|\+\+|--|&&|\|\||[-+*/%=&|^~!]=?|          # operators
    \d+\.?\d*|\.\d+|                                 # numbers
    [{}[\](),.;:]|                                   # punctuation symbols
    \p{L}+|\p{N}+|                                   # Unicode letters/digits
    \s+|                                             # whitespace
    \S                                               # any other character
)"""
|
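# Illustrative check (not part of the training pipeline): on a small snippet the
# regex yields one match per identifier, operator, number, punctuation mark,
# whitespace run or comment, e.g.
#   regex.findall(code_regex, "x += 1; // incr")
#   -> ['x', ' ', '+=', ' ', '1', ';', ' ', '// incr']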


class CustomPreTokenizer:
    """Custom pre-tokenizer: splits code with `code_regex`, keeping token offsets."""

    def code_split(self, i, normalized_string):
        # Return one NormalizedString slice per regex match so offsets are preserved.
        return [
            normalized_string[m.start():m.end()]
            for m in regex.finditer(code_regex, str(normalized_string))
        ]

    def pre_tokenize(self, pretokenized_string):
        # `PreTokenizer.custom` expects an object exposing this method.
        pretokenized_string.split(self.code_split)


def train_tokenizer(iterator, vocab_size=32000, min_frequency=2):
    """Train a byte-level BPE tokenizer on an iterator of text/code batches."""
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

    # NFD decomposition so StripAccents can remove the combining accent marks.
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),
        normalizers.StripAccents()
    ])

    # Custom code splitter first, then ByteLevel so BPE works on GPT-2 style byte pieces.
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        PreTokenizer.custom(CustomPreTokenizer()),
        pre_tokenizers.ByteLevel(add_prefix_space=False)
    ])

    # The full ByteLevel alphabet is included so every byte maps to a known token.
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["<|endoftext|>", "<pad>", "<unk>", "<mask>"],
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        continuing_subword_prefix="",
        show_progress=True
    )

    # Learn the BPE merges from the iterator.
    tokenizer.train_from_iterator(iterator, trainer=trainer)

    # ByteLevel decoder maps the byte-level tokens back to readable text.
    tokenizer.decoder = decoders.ByteLevel()

    return tokenizer
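

# Example usage (sketch): the dataset name and the "content" column are placeholders;
# point load_dataset at whatever corpus this tokenizer is actually trained on.
# Note: a tokenizer holding a custom Python pre-tokenizer generally cannot be
# serialized with tokenizer.save(), so keep the custom component importable.
if __name__ == "__main__":
    dataset = load_dataset("bigcode/the-stack-smol", split="train")

    def text_iterator(batch_size=1000):
        # Yield batches of raw code strings for train_from_iterator().
        for i in range(0, len(dataset), batch_size):
            yield dataset[i : i + batch_size]["content"]

    tokenizer = train_tokenizer(text_iterator())
    print(tokenizer.encode('if (x >= 10) { return "ok"; }').tokens)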