from tokenizers import ByteLevelBPETokenizer
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling
)
import os

# Step 1: Train a tokenizer from scratch
tokenizer_dir = "./tokenizer"
os.makedirs(tokenizer_dir, exist_ok=True)

# Initialize and train the tokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files="train_data.txt", vocab_size=1000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>"
])
tokenizer.save_model(tokenizer_dir)

# Load it into a Hugging Face tokenizer
hf_tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_dir)
hf_tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "bos_token": "<s>",
    "eos_token": "</s>"
})
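
# Quick sanity check (optional, sketch): round-trip an arbitrary sample sentence
# through the new tokenizer. With a 1,000-token vocabulary, expect ordinary
# English to split into fairly long token sequences.
sample = "Hello, world!"
sample_ids = hf_tokenizer.encode(sample)
print(f"Vocab size: {len(hf_tokenizer)}, sample ids: {sample_ids}")
print("Decoded:", hf_tokenizer.decode(sample_ids))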

# Step 2: Prepare the dataset
# Note: TextDataset is deprecated and may be removed in newer transformers
# releases; it still works for this simple case (see the `datasets`-based
# sketch after train_dataset below).
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

train_dataset = load_dataset("train_data.txt", hf_tokenizer)
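
# Alternative (sketch, assuming the `datasets` package is installed): build the
# same block-wise training set with the datasets library instead of the
# deprecated TextDataset. Uncomment to use; the import is aliased so it does not
# shadow the load_dataset helper above.
#
# from datasets import load_dataset as hf_load_dataset
#
# raw = hf_load_dataset("text", data_files={"train": "train_data.txt"})["train"]
# tokenized = raw.map(
#     lambda batch: hf_tokenizer(batch["text"]),
#     batched=True,
#     remove_columns=["text"],
# )
#
# def group_texts(examples, block_size=128):
#     # Concatenate all token ids, then split them into fixed-size blocks.
#     ids = sum(examples["input_ids"], [])
#     total = (len(ids) // block_size) * block_size
#     return {"input_ids": [ids[i:i + block_size] for i in range(0, total, block_size)]}
#
# train_dataset = tokenized.map(
#     group_texts,
#     batched=True,
#     remove_columns=tokenized.column_names,
# )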

# Step 3: Define the GPT-2 config. With n_embd=1024 and n_layer=12 this is
# roughly GPT-2-small scale (~150M parameters) rather than tiny.
config = GPT2Config(
    vocab_size=hf_tokenizer.vocab_size,
    n_positions=2048,  # maximum sequence length (the deprecated n_ctx argument is redundant here)
    n_embd=1024,
    n_layer=12,
    n_head=2,  # n_embd must be divisible by n_head
    bos_token_id=hf_tokenizer.bos_token_id,
    eos_token_id=hf_tokenizer.eos_token_id,
    pad_token_id=hf_tokenizer.pad_token_id
)

# Step 4: Initialize model from scratch
model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(hf_tokenizer))
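# Optional: report the parameter count; with n_embd=1024 and n_layer=12 this
# lands around 150M parameters, most of them in the transformer blocks.
print(f"Parameters: {model.num_parameters():,}")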

# Step 5: Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=hf_tokenizer,
    mlm=False,
)
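
# Optional sanity check (sketch): collate one small batch by hand. With mlm=False
# the collator does causal-LM collation: labels are a copy of input_ids (padding
# positions set to -100), and the model shifts them internally for next-token
# prediction. Assumes train_data.txt yields at least two 128-token blocks.
example_batch = data_collator([train_dataset[0], train_dataset[1]])
print(example_batch["input_ids"].shape, example_batch["labels"].shape)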

# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir=".",
    overwrite_output_dir=True,
    num_train_epochs=30,
    per_device_train_batch_size=4,
    save_total_limit=0,  # note: 0 does not limit checkpoints here; set a positive value to keep only the N most recent
    logging_steps=50,
    prediction_loss_only=True,
    report_to="none"
)

# Step 7: Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Step 8: Train!
trainer.train()

# Step 9: Save everything
trainer.save_model(".")
hf_tokenizer.save_pretrained(".")
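
# Step 10 (optional): a minimal sketch of sampling from the freshly trained model.
# The prompt string is arbitrary; output quality from a model trained only on
# train_data.txt will be rough. To reload later, use
# GPT2LMHeadModel.from_pretrained(".") and GPT2TokenizerFast.from_pretrained(".").
model.eval()
prompt = "The"
inputs = hf_tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    pad_token_id=hf_tokenizer.pad_token_id,
)
print(hf_tokenizer.decode(output_ids[0], skip_special_tokens=True))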