diff --git a/config/finetune_shakespeare.py b/config/finetune_shakespeare.py
index eb6545ed..148a4c4f 100644
--- a/config/finetune_shakespeare.py
+++ b/config/finetune_shakespeare.py
@@ -1,22 +1,25 @@
 import time
 
 out_dir = 'out-shakespeare'
-eval_interval = 200
+eval_interval = 5
+eval_iters = 40
 wandb_log = False # feel free to turn on
 wandb_project = 'shakespeare'
 wandb_run_name = 'ft-' + str(time.time())
-compile = False # takes too little time to finetune, not worth it
-
-# save a nice and overfit checkpoint that
-# will only speak Shakespeare and forgets
-# everything else about the world #dark
-always_save_checkpoint = True
 
 dataset = 'shakespeare'
-init_from = 'gpt2-xl'
-batch_size = 1
-block_size = 512
+init_from = 'gpt2-xl' # this is the largest GPT-2 model
 
-learning_rate = 1e-5
-max_iters = 1000
+# only save checkpoints if the validation loss improves
+always_save_checkpoint = False
+
+# the number of examples per iter:
+# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
+# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
+batch_size = 1
+gradient_accumulation_steps = 32
+max_iters = 20
+
+# finetune at constant LR
+learning_rate = 3e-5
 decay_lr = False
diff --git a/config/train_shakespeare_char.py b/config/train_shakespeare_char.py
index c50e4dd1..cb0d3332 100644
--- a/config/train_shakespeare_char.py
+++ b/config/train_shakespeare_char.py
@@ -15,7 +15,7 @@ wandb_run_name = 'mini-gpt'
 
 dataset = 'shakespeare_char'
 batch_size = 64
-block_size = 256 # context of up to 128 previous characters
+block_size = 256 # context of up to 256 previous characters
 
 # baby GPT model :)
 n_layer = 6
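Note (not part of the patch): in nanoGPT, train.py defines its hyperparameters as module-level globals and then exec's the config file passed on the command line (e.g. `python train.py config/finetune_shakespeare.py`), so each assignment in the files above simply overrides a default. Below is a minimal sketch of that override idea, not the repo's actual configurator.py; the default values shown are illustrative placeholders.

# sketch.py -- illustrative only, assuming defaults live as globals as in train.py
import sys

# placeholder defaults, normally defined near the top of train.py
eval_interval = 2000
learning_rate = 6e-4
max_iters = 600000
decay_lr = True

# usage: python sketch.py config/finetune_shakespeare.py
for arg in sys.argv[1:]:
    if not arg.startswith('--'):
        # the config file is plain Python: exec'ing it lets its assignments
        # (eval_interval = 5, learning_rate = 3e-5, ...) replace the globals
        exec(open(arg).read())

print(eval_interval, learning_rate, max_iters, decay_lr)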