import time

out_dir = 'out-shakespeare'
eval_interval = 5
eval_iters = 40
wandb_log = False # feel free to turn on
wandb_project = 'shakespeare'
wandb_run_name = 'ft-' + str(time.time())

dataset = 'shakespeare'
init_from = 'gpt2-xl' # this is the largest GPT-2 model
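# (smaller checkpoints such as 'gpt2', 'gpt2-medium', or 'gpt2-large' can be
# substituted above if GPU memory is a constraint)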
# only save checkpoints if the validation loss improves
always_save_checkpoint = False

# the number of tokens processed per iter:
# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
batch_size = 1
gradient_accumulation_steps = 32
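# (the 1024 in the arithmetic above is the GPT-2 context length, i.e. block_size,
# which this config leaves at its default rather than overriding)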
max_iters = 20

# finetune at constant LR
learning_rate = 3e-5
decay_lr = False
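# a typical way to use this config, assuming the surrounding repo follows the
# nanoGPT convention of a training script that exec()s a config file passed on
# the command line (the script and file names below are assumptions, not set here):
#   $ python train.py config/finetune_shakespeare.py
# and to sample from the finetuned checkpoint afterwards:
#   $ python sample.py --out_dir=out-shakespeare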