saicharan2804 committed
Commit 0cee7ca · 1 Parent(s): 719a68e
Updated training
Files changed: trainBpeTokenizer.py (+11 -2)

trainBpeTokenizer.py
CHANGED
@@ -3,6 +3,15 @@ from tokenizers.models import BPE
 from tokenizers.trainers import BpeTrainer
 from tokenizers.pre_tokenizers import ByteLevel
 from tokenizers.processors import TemplateProcessing
+import argparse
+
+
+parser = argparse.ArgumentParser(description='Train BPE Tokenizer.')
+parser.add_argument('dataset_file_path', type=str, help='Path to the dataset file')
+parser.add_argument('output_file_path', type=str, help='Path to file containing trained tokenizer weights')
+
+# Parse the arguments
+args = parser.parse_args()
 
 # Initialize a tokenizer
 tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
@@ -14,7 +23,7 @@ tokenizer.pre_tokenizer = ByteLevel()
 trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
 
 # Path to the file(s) for training the tokenizer
-files = [
+files = [args.dataset_file_path]
 
 # Train the tokenizer
 tokenizer.train(files, trainer)
@@ -30,4 +39,4 @@ tokenizer.post_processor = TemplateProcessing(
 )
 
 # Save the tokenizer
-tokenizer.save(
+tokenizer.save(args.output_file_path)
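
For reference, the full script after this commit plausibly reads as below. Everything outside the three hunks is reconstructed: the ByteLevel pre-tokenizer assignment comes from the second hunk header, and the body of the TemplateProcessing post-processor (lines 30-38, not shown in the diff) is an assumption modeled on the standard [CLS]/[SEP] template from the tokenizers documentation; treat it as a sketch, not the committed code.

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import TemplateProcessing
import argparse


parser = argparse.ArgumentParser(description='Train BPE Tokenizer.')
parser.add_argument('dataset_file_path', type=str, help='Path to the dataset file')
parser.add_argument('output_file_path', type=str, help='Path to file containing trained tokenizer weights')

# Parse the arguments
args = parser.parse_args()

# Initialize a tokenizer with an unknown-token fallback
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Byte-level pre-tokenization (visible in the second hunk header)
tokenizer.pre_tokenizer = ByteLevel()

trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Path to the file(s) for training the tokenizer
files = [args.dataset_file_path]

# Train the tokenizer
tokenizer.train(files, trainer)

# ASSUMPTION: the template body is elided in the diff; this is the common
# single/pair [CLS]/[SEP] layout, set after training so the special-token
# ids already exist for the token_to_id lookups.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# Save the tokenizer as a single JSON file
tokenizer.save(args.output_file_path)

With the hardcoded paths gone, the script is invoked with two positional arguments (the file names here are placeholders):

python trainBpeTokenizer.py corpus.txt tokenizer.json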