diff --git a/config/finetune_shakespeare.py b/config/finetune_shakespeare.py
index eb6545ed..148a4c4f 100644
--- a/config/finetune_shakespeare.py
+++ b/config/finetune_shakespeare.py
@@ -1,22 +1,25 @@
 import time
 
 out_dir = 'out-shakespeare'
-eval_interval = 200
+eval_interval = 5
+eval_iters = 40
 wandb_log = False # feel free to turn on
 wandb_project = 'shakespeare'
 wandb_run_name = 'ft-' + str(time.time())
-compile = False # takes too little time to finetune, not worth it
-
-# save a nice and overfit checkpoint that
-# will only speak Shakespeare and forgets
-# everything else about the world #dark
-always_save_checkpoint = True
 
 dataset = 'shakespeare'
-init_from = 'gpt2-xl'
-batch_size = 1
-block_size = 512
+init_from = 'gpt2-xl' # this is the largest GPT-2 model
 
-learning_rate = 1e-5
-max_iters = 1000
+# only save checkpoints if the validation loss improves
+always_save_checkpoint = False
+
+# the number of examples per iter:
+# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
+# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
+batch_size = 1
+gradient_accumulation_steps = 32
+max_iters = 20
+
+# finetune at constant LR
+learning_rate = 3e-5
 decay_lr = False
diff --git a/config/train_shakespeare_char.py b/config/train_shakespeare_char.py
index c50e4dd1..cb0d3332 100644
--- a/config/train_shakespeare_char.py
+++ b/config/train_shakespeare_char.py
@@ -15,7 +15,7 @@ wandb_run_name = 'mini-gpt'
 
 dataset = 'shakespeare_char'
 batch_size = 64
-block_size = 256 # context of up to 128 previous characters
+block_size = 256 # context of up to 256 previous characters
 
 # baby GPT model :)
 n_layer = 6
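Note (not part of the patch): in nanoGPT, train.py defines its hyperparameters as module-level globals and then exec's the config file passed on the command line (e.g. `python train.py config/finetune_shakespeare.py`), so each assignment in the files above simply overrides a default. Below is a minimal sketch of that override idea, not the repo's actual configurator.py; the default values shown are illustrative placeholders.

# sketch.py -- illustrative only, assuming defaults live as globals as in train.py
import sys

# placeholder defaults, normally defined near the top of train.py
eval_interval = 2000
learning_rate = 6e-4
max_iters = 600000
decay_lr = True

# usage: python sketch.py config/finetune_shakespeare.py
for arg in sys.argv[1:]:
    if not arg.startswith('--'):
        # the config file is plain Python: exec'ing it lets its assignments
        # (eval_interval = 5, learning_rate = 3e-5, ...) replace the globals
        exec(open(arg).read())

print(eval_interval, learning_rate, max_iters, decay_lr)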