import os
import time
import math
import pickle
from contextlib import nullcontext

import numpy as np
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

import tiktoken
from rich.traceback import install
install()

from model import GPTConfig, GPT
# chat-format markers allowed to appear verbatim in the raw text; of these, only
# '<|endoftext|>' and '<|endofprompt|>' are registered special tokens in
# cl100k_base, the other markers are encoded as ordinary BPE text
SPECIAL_TOKENS = {'<|im_start|>', '<|im_end|>', '<|system|>', '<|user|>', '<|assistant|>',
                  '<|endoftext|>', '<|endofprompt|>'}

print(f"ℹ️ Using special tokens: {SPECIAL_TOKENS}")
# I/O
out_dir = 'out'
eval_interval = 95
log_interval = 1
eval_iters = 95
eval_only = False               # if True, exit right after the first eval
always_save_checkpoint = True

init_from = 'resume'            # 'scratch', 'resume', or a 'gpt2*' model name

# wandb logging
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'run' + str(time.time())
# data
dataset = 'mydata'
data_file = 'lmsys_chat_1m.txt'
tokenizer_name = 'cl100k_base'
token_dtype = 'uint32'          # must hold every token id; cl100k_base has ~100k tokens, so uint16 is too small
n_layer = 1
n_head = 16
n_embd = 1024
dropout = 0.05
bias = True
learning_rate = 3e-4
max_iters = 20000
weight_decay = 0.05
beta1 = 0.9
beta2 = 0.98
grad_clip = 1.0

decay_lr = True
warmup_iters = 100
lr_decay_iters = 10000
min_lr = 1e-5
batch_size = 4
gradient_accumulation_steps = 5 * 4
block_size = 1024

backend = 'nccl'

device = 'cuda'
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
compile = False
save_interval = 200
checkpoint_limit = None         # keep only the newest N numbered checkpoints; None keeps them all

config_keys = [k for k, v in globals().items()
               if not k.startswith('_') and isinstance(v, (int, float, bool, str, list))]
exec(open('configurator.py').read())  # overrides from the command line or a config file
config = {k: globals()[k] for k in config_keys}
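# data preprocessing: tokenize the raw text once with tiktoken and cache the
# train/val splits as flat binary files (skipped when the .bin files already exist)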
data_dir = os.path.join('data', dataset)
train_bin_path = os.path.join(data_dir, 'train.bin')
val_bin_path = os.path.join(data_dir, 'val.bin')
meta_path = os.path.join(data_dir, 'meta.pkl')
dtype_token = np.dtype(token_dtype)
if not (os.path.exists(train_bin_path) and os.path.exists(val_bin_path) and os.path.exists(meta_path)):
    print(f"ℹ️ Preprocessing raw text from {data_file} ...")
    with open(data_file, 'r', encoding='utf-8') as f:
        raw_text = f.read()
    enc = tiktoken.get_encoding(tokenizer_name)
    encode = enc.encode
    vocab_size = enc.n_vocab

    # make sure the chosen on-disk dtype can represent every token id
    if np.issubdtype(dtype_token, np.integer):
        info = np.iinfo(dtype_token)
        if info.max < vocab_size:
            raise ValueError(f"token_dtype={token_dtype} max={info.max} < vocab_size={vocab_size}")

    tokens = np.array(encode(raw_text, allowed_special=SPECIAL_TOKENS), dtype=dtype_token)
    n = tokens.shape[0]
    split = int(0.9 * n)  # 90/10 train/val split
    train_tokens = tokens[:split]
    val_tokens = tokens[split:]

    os.makedirs(data_dir, exist_ok=True)
    train_tokens.tofile(train_bin_path)
    val_tokens.tofile(val_bin_path)
    with open(meta_path, 'wb') as f:
        pickle.dump({
            'vocab_size': vocab_size,
            'tokenizer': tokenizer_name,
            'token_dtype': token_dtype,
            'special_tokens': SPECIAL_TOKENS,
        }, f)
    print(f"✅ Wrote {train_bin_path} ({train_tokens.nbytes} bytes), "
          f"{val_bin_path} ({val_tokens.nbytes} bytes), and {meta_path}")
ddp = int(os.environ.get('RANK', -1)) != -1  # is this a ddp run?
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = (ddp_rank == 0)  # rank 0 does logging and checkpointing
    seed_offset = ddp_rank            # each process gets a different seed
    # each process accumulates over fewer micro-steps so the global batch stays the same
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size
else:
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"ℹ️ tokens per iteration = {tokens_per_iter:,}")

if master_process:
    os.makedirs(out_dir, exist_ok=True)
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device_type = 'cuda' if 'cuda' in device else 'cpu'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
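# simple data loader: sample batch_size random windows of block_size tokens from the
# memmapped .bin file; the np.memmap is reopened on every call rather than kept
# around, which (as in nanoGPT) avoids a slow memory build-up across iterations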
def get_batch(split):
    data = np.memmap(os.path.join(data_dir, f'{split}.bin'),
                     dtype=dtype_token, mode='r')
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy(data[i:i+block_size].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(data[i+1:i+1+block_size].astype(np.int64)) for i in ix])
    if device_type == 'cuda':
        # pin x,y so the host-to-device copy can be asynchronous (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y
iter_num = 0
best_val_loss = 1e9

with open(meta_path, 'rb') as f:
    meta = pickle.load(f)
vocab_size = meta['vocab_size']
model_args = dict(
    n_layer=n_layer,
    n_head=n_head,
    n_embd=n_embd,
    block_size=block_size,
    bias=bias,
    vocab_size=vocab_size,
    dropout=dropout,
)
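# model init: from scratch, by resuming from out_dir/ckpt.pt, or from pretrained
# OpenAI GPT-2 weights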
if init_from == 'scratch':
    print("ℹ️ Initializing new model from scratch")
    model = GPT(GPTConfig(**model_args))

elif init_from == 'resume':
    print(f"ℹ️ Resuming from {out_dir}")
    ckpt = torch.load(os.path.join(out_dir, 'ckpt.pt'), map_location=device)
    # force these model args to match the checkpoint, otherwise the state dict won't load
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = ckpt['model_args'][k]
    model = GPT(GPTConfig(**model_args))
    state = ckpt['model']
    # strip the '_orig_mod.' prefix that torch.compile adds to parameter names
    for key in list(state.keys()):
        if key.startswith('_orig_mod.'):
            state[key[len('_orig_mod.'):]] = state.pop(key)
    model.load_state_dict(state)
    iter_num = ckpt['iter_num']
    best_val_loss = ckpt['best_val_loss']

elif init_from.startswith('gpt2'):
    print(f"ℹ️ Initializing from OpenAI GPT-2 weights: {init_from}")
    override = dict(dropout=dropout)
    model = GPT.from_pretrained(init_from, override)
    # read the resulting config back into model_args so checkpoints stay consistent
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = getattr(model.config, k)

# crop the model's context length if we want a smaller block size than it was trained with
if block_size < model.config.block_size:
    model.crop_block_size(block_size)
    model_args['block_size'] = block_size
model.to(device)
# the GradScaler only does anything when training in float16
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
if init_from == 'resume':
    optimizer.load_state_dict(ckpt['optimizer'])
if compile:
    print("ℹ️ Compiling the model...")
    model = torch.compile(model)

if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])

# unwrap the DDP container whenever the underlying model is needed (checkpoints, mfu)
raw_model = model.module if ddp else model
# save an initial checkpoint before training starts
if master_process:
    ckpt = {
        'model': raw_model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'model_args': model_args,
        'iter_num': iter_num,
        'best_val_loss': best_val_loss,
        'config': config,
    }
    ckpt_path = os.path.join(out_dir, f'ckpt_{iter_num:06d}.pt')
    print(f"💾 Saving initial checkpoint to {ckpt_path}")
    torch.save(ckpt, ckpt_path)
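# estimate the mean loss over eval_iters random batches for each split, with the
# model in eval mode and gradients disabled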
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ('train', 'val'):
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            with ctx:
                _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean().item()
    model.train()
    return out
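# learning rate schedule: linear warmup for warmup_iters steps, cosine decay to
# min_lr by lr_decay_iters, then flat at min_lr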
def get_lr(it):
    if it < warmup_iters:
        return learning_rate * (it + 1) / (warmup_iters + 1)
    if it > lr_decay_iters:
        return min_lr
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)
if wandb_log and master_process:
    import wandb
    wandb.init(project=wandb_project, name=wandb_run_name, config=config)

X, Y = get_batch('train')  # prefetch the first batch
t0 = time.time()
local_iter = 0  # iterations in the lifetime of this process (unlike iter_num, never resumed)
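# training loop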
while True:

    # set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for pg in optimizer.param_groups:
        pg['lr'] = lr

    # every eval_interval iterations: evaluate, log, and possibly checkpoint
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if wandb_log:
            wandb.log({"iter": iter_num, "train/loss": losses['train'], "val/loss": losses['val'], "lr": lr})

        should_save = (
            losses['val'] < best_val_loss
            or always_save_checkpoint
            or (iter_num % save_interval == 0)
        )
        if should_save and iter_num > 0:
            best_val_loss = min(best_val_loss, losses['val'])
            ckpt = {
                'model': raw_model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'model_args': model_args,
                'iter_num': iter_num,
                'best_val_loss': best_val_loss,
                'config': config,
            }
            ckpt_path = os.path.join(out_dir, f'ckpt_{iter_num:06d}.pt')
            print(f"💾 Saving checkpoint to {ckpt_path}")
            torch.save(ckpt, ckpt_path)
            # prune old checkpoints, keeping only the newest checkpoint_limit of them
            if checkpoint_limit is not None:
                all_ckpts = sorted(f for f in os.listdir(out_dir)
                                   if f.startswith('ckpt_') and f.endswith('.pt'))
                for old in all_ckpts[:-checkpoint_limit]:
                    os.remove(os.path.join(out_dir, old))

    if iter_num == 0 and eval_only:
        break
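    # forward/backward with gradient accumulation; DDP gradients are only synced on
    # the last micro-step, and the GradScaler is active only when training in float16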
    for micro in range(gradient_accumulation_steps):
        if ddp:
            model.require_backward_grad_sync = (micro == gradient_accumulation_steps - 1)
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / gradient_accumulation_steps  # scale the loss to account for accumulation
        # immediately prefetch the next batch while the GPU is still busy with this micro-step
        X, Y = get_batch('train')
        scaler.scale(loss).backward()

    # clip gradients (unscaling first so the clip threshold applies to the true values)
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)
    # timing and logging
    dt = time.time() - t0
    t0 = time.time()
    if iter_num % log_interval == 0 and master_process:
        # multiply back by gradient_accumulation_steps to report the (approximate) unscaled loss;
        # note that .item() is a CPU-GPU sync point
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter >= 5:  # let the loop settle for a few iterations before reporting MFU
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {mfu*100:.2f}%")
        else:
            print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")

    iter_num += 1
    local_iter += 1

    # termination condition
    if iter_num > max_iters:
        break
if ddp:
    destroy_process_group()