"""Train a small GPT-2-style (decoder-only Transformer) language model on WikiText-103."""

import argparse
import os
import sys
import shutil
import random
import numpy as np
import time
import copy
import math
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
import torch.nn as nn
from transformers import GPT2TokenizerFast

def subsequent_mask(size):
    """Causal mask: True where a position may attend (itself and earlier positions)."""
    attn_shape = (1, size, size)
    # torch.triu(..., diagonal=1) marks the future positions; invert it so that
    # attention() can block exactly the entries where the mask is False (mask == 0).
    mask = torch.triu(torch.ones(attn_shape), diagonal=1).bool()
    return ~mask
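# For example, subsequent_mask(3) returns a (1, 3, 3) boolean tensor in which
# row i is True only for columns <= i:
#   [[[ True, False, False],
#     [ True,  True, False],
#     [ True,  True,  True]]]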


def read_corpus(filename, tokenizer):
    """Tokenize a text file line by line into one flat list of token ids."""
    print(f"Reading corpus from {filename}...")
    seq = []
    with open(filename, 'rt') as f:
        for line in f:
            line = line.rstrip('\n')
            tokens = tokenizer(line)
            seq.extend(tokens['input_ids'])
    print(f"Read {len(seq)} tokens from {filename}")
    return seq


class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x.long())


class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len=4096, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)

        # Sinusoidal positional encodings from "Attention Is All You Need":
        # each even/odd pair (2k, 2k + 1) shares the frequency 1 / 10000**(2k / d_model).
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** (i / d_model)))
                if i + 1 < d_model:
                    pe[pos, i + 1] = math.cos(pos / (10000 ** (i / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Scale the embeddings, add the fixed positional encodings, then apply dropout.
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)
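# Note: because each sin/cos pair shares a frequency, the encoding of position
# pos + k is a fixed linear function of the encoding of pos, which should make it
# easier for the attention layers to use relative offsets.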


class Norm(nn.Module):
    """Layer normalization with a learnable gain (alpha) and bias."""

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / \
            (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm


def attention(q, k, v, d_k, mask=None, dropout=None):
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # mask is (1, seq, seq); unsqueeze to (1, 1, seq, seq) so it broadcasts over heads.
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)
    scores = F.softmax(scores, dim=-1)
    if dropout is not None:
        scores = dropout(scores)
    output = torch.matmul(scores, v)
    return output
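# attention() is scaled dot-product attention, softmax(Q K^T / sqrt(d_k)) V, with
# disallowed positions set to -1e9 before the softmax so they receive ~zero weight.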


class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # Project and split into heads: (bs, seq, d_model) -> (bs, seq, h, d_k).
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # Move the head dimension forward: (bs, h, seq, d_k).
        k = k.transpose(1, 2)
        q = q.transpose(1, 2)
        v = v.transpose(1, 2)

        scores = attention(q, k, v, self.d_k, mask, self.dropout)

        # Merge the heads back into d_model and apply the output projection.
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        output = self.out(concat)
        return output


class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x


def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x
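# Each DecoderLayer is a pre-norm residual block: x = x + Dropout(SelfAttn(Norm(x))),
# then x = x + Dropout(FeedForward(Norm(x))).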


class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, x, trg_mask):
        x = self.embed(x)
        x = self.pe(x)
        for layer in self.layers:
            x = layer(x, trg_mask)
        return self.norm(x)


class GPT2LM(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout, tie_weights=False):
        super().__init__()
        self.decoder = Decoder(vocab_size, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, vocab_size)
        if tie_weights:
            self.out.weight = self.decoder.embed.embed.weight
            print("✅ Tied embeddings enabled.")

    def forward(self, x, mask):
        d_output = self.decoder(x, mask)
        return self.out(d_output)
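# With tie_weights=True the output projection reuses the token-embedding matrix
# (both have shape (vocab_size, d_model)), saving roughly vocab_size * d_model parameters.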


def batchify(data, batch_size, seq_len):
    """Arrange the token stream into batch_size rows and yield (src, tgt) chunks,
    where each target token is the token that follows the corresponding input token."""
    nbatch = len(data) // batch_size
    data = torch.tensor(data[:nbatch * batch_size], dtype=torch.long)
    data = data.view(batch_size, -1)
    for i in range(0, data.size(1) - 1, seq_len):
        seq_len_i = min(seq_len, data.size(1) - 1 - i)
        src = data[:, i:i + seq_len_i]
        tgt = data[:, i + 1:i + 1 + seq_len_i]
        yield src, tgt
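# For example, with batch_size=2 and seq_len=3, the stream [0, 1, ..., 11] becomes the
# rows [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]] and is yielded as
#   src=[[0, 1, 2], [6, 7, 8]], tgt=[[1, 2, 3], [7, 8, 9]]
#   src=[[3, 4], [9, 10]],      tgt=[[4, 5], [10, 11]]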


def train_model(model, opt):
    print("Starting training...")
    model.train()
    train_ppls = []
    valid_ppls = []
    total_tokens = 0
    total_time = 0

    for epoch in range(opt.epochs):
        total_loss = 0
        batches = 0
        epoch_tokens = 0
        epoch_start_time = time.time()

        for src, tgt in batchify(opt.train, opt.batchsize, opt.seqlen):
            batch_start_time = time.time()
            src, tgt = src.to(opt.device), tgt.to(opt.device)
            # Causal mask so each position only attends to itself and earlier tokens.
            mask = subsequent_mask(src.size(1)).to(opt.device)
            output = model(src, mask)
            loss = F.cross_entropy(output.view(-1, opt.vocab_size), tgt.reshape(-1), ignore_index=opt.src_pad)
            opt.optimizer.zero_grad()
            loss.backward()
            opt.optimizer.step()

            batch_time = time.time() - batch_start_time
            total_loss += loss.item()
            batches += 1

            tokens_in_batch = src.numel()
            epoch_tokens += tokens_in_batch
            total_tokens += tokens_in_batch
            total_time += batch_time

            tokens_per_sec = tokens_in_batch / batch_time
            if batches % opt.printevery == 0:
                print(f"Epoch {epoch+1}, Batch {batches}, Loss: {loss.item():.4f}, Speed: {tokens_per_sec:.2f} tokens/sec")

        epoch_time = time.time() - epoch_start_time
        epoch_tokens_per_sec = epoch_tokens / epoch_time

        avg_loss = total_loss / batches
        train_ppl = math.exp(avg_loss)
        train_ppls.append(train_ppl)

        print(f"Epoch {epoch+1}/{opt.epochs}, Loss: {avg_loss:.4f}, Perplexity: {train_ppl:.2f}")
        print(f"Epoch training speed: {epoch_tokens_per_sec:.2f} tokens/sec")

        # Evaluate on the validation set at the end of every epoch.
        valid_ppl = test_model(model, opt.valid, opt, tag=f"valid-epoch{epoch+1}")
        valid_ppls.append(valid_ppl)

    avg_tokens_per_sec = total_tokens / total_time
    print("\nTraining completed.")
    print(f"Average training speed: {avg_tokens_per_sec:.2f} tokens/sec")

    # Make sure the output directory exists before writing any results into it.
    dir_name = os.path.join("saved", opt.dir_name)
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
        print(f"Created directory: {dir_name}")

    with open(os.path.join(dir_name, "training_speed.txt"), "w") as f:
        f.write(f"Total tokens processed: {total_tokens}\n")
        f.write(f"Total training time: {total_time:.2f} seconds\n")
        f.write(f"Average training speed: {avg_tokens_per_sec:.2f} tokens/sec\n")

    save_path = os.path.join(dir_name, "gpt2lm_wiki103.pth")
    print(f"Saving model to: {save_path}")
    print(f"Directory exists: {os.path.exists(dir_name)}")
    print(f"Write permissions: {os.access(dir_name, os.W_OK)}")
    torch.save(model.state_dict(), save_path)
    print(f"Model saved successfully to {save_path}")

    plt.plot(range(1, opt.epochs+1), train_ppls, label='Train PPL')
    plt.plot(range(1, opt.epochs+1), valid_ppls, label='Valid PPL')
    plt.xlabel("Epoch")
    plt.ylabel("Perplexity")
    plt.legend()
    plt.title("Training & Validation Perplexity")
    plt.savefig(os.path.join(dir_name, "learning_curve.png"))
    print(f"Saved learning curve to {dir_name}/learning_curve.png")

    with open(os.path.join(dir_name, "perplexity_log.txt"), "w") as f:
        for i in range(opt.epochs):
            f.write(f"Epoch {i+1}: Train PPL = {train_ppls[i]:.2f}, Valid PPL = {valid_ppls[i]:.2f}\n")

    return avg_tokens_per_sec


def test_model(model, data, opt, tag="valid"):
    print(f"Running {tag} set...")
    model.eval()
    total_loss, batches = 0, 0
    with torch.no_grad():
        for src, tgt in batchify(data, opt.batchsize, opt.seqlen):
            src, tgt = src.to(opt.device), tgt.to(opt.device)
            mask = subsequent_mask(src.size(1)).to(opt.device)
            output = model(src, mask)
            loss = F.cross_entropy(output.view(-1, opt.vocab_size), tgt.reshape(-1), ignore_index=opt.src_pad)
            total_loss += loss.item()
            batches += 1
    avg_loss = total_loss / batches
    ppl = math.exp(avg_loss)
    print(f"{tag.capitalize()} PPL: {ppl:.2f}")
    model.train()
    return ppl
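# Perplexity is exp of the average per-batch cross-entropy, so lower is better; it is
# the standard evaluation metric reported for WikiText-103 language models.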


def main():
    random.seed(10)
    parser = argparse.ArgumentParser()
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-SGDR', action='store_true')
    parser.add_argument('-epochs', type=int, default=1)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-heads', type=int, default=8)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-batchsize', type=int, default=8)
    parser.add_argument('-printevery', type=int, default=100)
    parser.add_argument('-lr', type=float, default=0.0001)
    parser.add_argument('-seqlen', type=int, default=512)
    parser.add_argument('-threshold', type=int, default=3)
    parser.add_argument('-savename', type=str)
    parser.add_argument('-loadname', type=str)
    parser.add_argument('-tied', type=int, default=1)
    parser.add_argument('-dir_name', type=str, default='wiki103_model')
    parser.add_argument('-norm', type=float, default=2.0)
    opt = parser.parse_args()
    opt.verbose = False

    if not opt.no_cuda and torch.cuda.is_available():
        opt.device = torch.device("cuda:0")
        print(f"Using CUDA device: {torch.cuda.get_device_name(0)}")
    else:
        opt.device = torch.device("cpu")
        print("Using CPU for training")

    time_name = time.strftime("%y%m%d_%H%M%S")
    opt.time_name = time_name
    dir_name = os.path.join("saved", opt.dir_name)
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    # Keep a copy of this script alongside the results for reproducibility.
    source_name = sys.argv[0]
    shutil.copy(source_name, os.path.join(dir_name, os.path.basename(source_name)))
    opt.log_file = os.path.join(dir_name, "log_file.txt")
    print(str(opt))

    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

    opt.train = read_corpus('wiki103.train.txt', tokenizer)
    opt.valid = read_corpus('wiki103.valid.txt', tokenizer)
    opt.test = read_corpus('wiki103.test.txt', tokenizer)

    obs = len(opt.train)
    print(f"Training set: {obs} tokens")

    opt.vocab_size = 50257  # GPT-2 tokenizer vocabulary size
    opt.indices = torch.arange(opt.vocab_size).to(opt.device)

    model = GPT2LM(opt.vocab_size, opt.d_model, opt.n_layers, opt.heads, opt.dropout,
                   tie_weights=(opt.tied == 1)).to(opt.device)
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print('total params: %d' % params)

    opt.optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, betas=(0.9, 0.98), eps=1e-9)

    if opt.savename is not None:
        try:
            os.mkdir(opt.savename)
        except FileExistsError:
            pass
    # Token id 0 is used as the padding id and is ignored by the cross-entropy loss.
    opt.src_pad = 0
    opt.trg_pad = 0

    avg_tokens_per_sec = train_model(model, opt)
    test_model(model, opt.valid, opt, tag="valid")
    test_model(model, opt.test, opt, tag="test")

    print(f"\nFinal training speed: {avg_tokens_per_sec:.2f} tokens/sec")

    with open(os.path.join(dir_name, "speed_improvement_suggestions.txt"), "w") as f:
        f.write("Suggestions for improving training speed:\n\n")
        f.write("1. Use mixed precision training (FP16/BF16)\n")
        f.write("2. Increase batch size and use gradient accumulation\n")
        f.write("3. Enable tensor core operations on compatible GPUs\n")
        f.write("4. Optimize data loading with prefetching and parallel workers\n")
        f.write("5. Use model parallelism or distributed training\n")
        f.write("6. Consider using optimized implementations like FlashAttention\n")
        f.write("7. Experiment with smaller model sizes or pruning\n")
        f.write("8. Profile and optimize bottlenecks\n")
        f.write("9. Use memory-efficient optimizers\n")
        f.write("10. Consider efficient implementations like xformers or rotary embeddings\n")


if __name__ == "__main__":
    main()
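# Example invocation (the script name below is illustrative; the WikiText-103 text files
# wiki103.train.txt / wiki103.valid.txt / wiki103.test.txt must be in the working directory):
#   python train_gpt2_wiki103.py -epochs 1 -batchsize 8 -seqlen 512 -dir_name wiki103_model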