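# Train a small GPT-2-style language model from scratch: train a byte-level
# BPE tokenizer, chunk a plain-text corpus into fixed-size blocks, and fit
# the model with the Hugging Face Trainer.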
from tokenizers import ByteLevelBPETokenizer
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling
)
import os
# Step 1: Train a tokenizer from scratch
tokenizer_dir = "./tokenizer"
os.makedirs(tokenizer_dir, exist_ok=True)
# Initialize and train the tokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files="train_data.txt",
    vocab_size=1000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)
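# save_model() writes vocab.json and merges.txt, the files GPT2TokenizerFast loads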
tokenizer.save_model(tokenizer_dir)
# Load it into a Hugging Face tokenizer
hf_tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_dir)
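# The special tokens already exist in the trained vocab; this call just
# registers which role (pad/bos/eos) each one plays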
hf_tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "bos_token": "<s>",
    "eos_token": "</s>"
})
# Step 2: Prepare the dataset
def load_dataset(file_path, tokenizer, block_size=128):
    # TextDataset tokenizes the file and splits it into contiguous
    # blocks of block_size tokens for causal LM training
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )
train_dataset = load_dataset("train_data.txt", hf_tokenizer)
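# Note: TextDataset is deprecated in recent transformers releases; the
# datasets library is the recommended replacement for new code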
# Step 3: Define the GPT-2 config for a small model trained from scratch
config = GPT2Config(
    vocab_size=hf_tokenizer.vocab_size,  # 1,000 tokens from the BPE training above
    n_positions=2048,  # maximum sequence length
    n_ctx=2048,  # legacy duplicate of n_positions (kept for older transformers versions)
    n_embd=1024,  # hidden size
    n_layer=12,  # transformer blocks
    n_head=2,  # 2 attention heads of dimension 512 each
    bos_token_id=hf_tokenizer.bos_token_id,
    eos_token_id=hf_tokenizer.eos_token_id,
    pad_token_id=hf_tokenizer.pad_token_id
)
# Step 4: Initialize model from scratch
model = GPT2LMHeadModel(config)  # randomly initialized, no pretrained weights
model.resize_token_embeddings(len(hf_tokenizer))  # len() counts added special tokens too
# Step 5: Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=hf_tokenizer,
    mlm=False,  # causal LM: labels are the input IDs (shifted inside the model)
)
# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir=".",
    overwrite_output_dir=True,
    num_train_epochs=30,
    per_device_train_batch_size=4,
    save_total_limit=0,  # 0 (like None) disables checkpoint rotation
    logging_steps=50,
    prediction_loss_only=True,
    report_to="none"  # no W&B/TensorBoard reporting
)
# Step 7: Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
# Step 8: Train!
trainer.train()
# Step 9: Save everything
trainer.save_model(".")
hf_tokenizer.save_pretrained(".")
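# Optional smoke test (a minimal sketch, not part of the original pipeline;
# the prompt "Hello" is an arbitrary placeholder)
from transformers import pipeline
generator = pipeline("text-generation", model=".", tokenizer=".")
print(generator("Hello", max_new_tokens=20)[0]["generated_text"])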