diff --git a/README.md b/README.md
index 16069cf5..4fa35997 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,16 @@ $ python sample.py

 Training on 1 A100 40GB GPU overnight currently gets loss ~3.74, training on 4 gets ~3.60. Random chance at init is -ln(1/50257) = 10.82. Which brings us to baselines:

+## finetuning
+
+For an example of how to finetune a GPT on new text go to `data/shakespeare` and look at `prepare.py` to download the tiny shakespeare dataset and render it into a `train.bin` and `val.bin`. Unlike OpenWebText this will run in seconds. Finetuning takes very little time, e.g. on a single GPU just a few minutes. Run an example finetuning like:
+
+```
+$ python train.py finetune_shakespeare
+```
+
+This will load the config parameter overrides in `config/finetune_shakespeare.py` (I didn't tune them much though). Basically, we initialize from a GPT-2 checkpoint with `init_from` and train as normal, except shorter and with a small learning rate. The best checkpoint (lowest validation loss) will be in the `out_dir` directory, e.g. in `out-shakespeare` by default, per the config file. You can then run the code in `sample.py` to generate infinite Shakespeare. Note that you'll have to edit it to point to the correct `out_dir`.
+
 ## baselines

 OpenAI GPT-2 checkpoints allow us to get some baselines in place for openwebtext. We can get the numbers as follows:
diff --git a/config/finetune_shakespeare.py b/config/finetune_shakespeare.py
new file mode 100644
index 00000000..69d49a6e
--- /dev/null
+++ b/config/finetune_shakespeare.py
@@ -0,0 +1,22 @@
+import time
+
+out_dir = 'out-shakespeare'
+eval_interval = 200
+wandb_log = False # feel free to turn on
+wandb_project = 'shakespeare'
+wandb_run_name = 'ft-' + str(time.time())
+compile_model = False # takes too little time to finetune, not worth it
+
+# save a nice and overfit checkpoint that
+# will only speak Shakespeare and forgets
+# everything else about the world #dark
+always_save_checkpoint = True
+
+dataset = 'shakespeare'
+init_from = 'gpt2-xl'
+batch_size = 1
+block_size = 512
+
+learning_rate = 1e-5
+max_iters = 1000
+decay_lr = False
diff --git a/data/openwebtext/prepare.py b/data/openwebtext/prepare.py
index 81ca13ab..0aadbb8d 100644
--- a/data/openwebtext/prepare.py
+++ b/data/openwebtext/prepare.py
@@ -35,6 +35,7 @@ enc = tiktoken.get_encoding("gpt2")
 def process(example):
     ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
     ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
+    # note: I think eot should be prepended not appended... hmm. it's called "eot" though...
     out = {'ids': ids, 'len': len(ids)}
     return out

diff --git a/data/shakespeare/prepare.py b/data/shakespeare/prepare.py
new file mode 100644
index 00000000..9c52626d
--- /dev/null
+++ b/data/shakespeare/prepare.py
@@ -0,0 +1,32 @@
+import os
+import requests
+import tiktoken
+import numpy as np
+
+# download the tiny shakespeare dataset
+if not os.path.exists('input.txt'):
+    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+    with open('input.txt', 'w') as f:
+        f.write(requests.get(data_url).text)
+
+with open('input.txt', 'r') as f:
+    data = f.read()
+n = len(data)
+train_data = data[:int(n*0.9)]
+val_data = data[int(n*0.9):]
+
+# encode with tiktoken gpt2 bpe
+enc = tiktoken.get_encoding("gpt2")
+train_ids = enc.encode_ordinary(train_data)
+val_ids = enc.encode_ordinary(val_data)
+print(f"train has {len(train_ids)} tokens")
+print(f"val has {len(val_ids)} tokens")
+
+# export to bin files
+train_ids = np.array(train_ids, dtype=np.uint16)
+val_ids = np.array(val_ids, dtype=np.uint16)
+train_ids.tofile('train.bin')
+val_ids.tofile('val.bin')
+
+# train.bin has 301,966 tokens
+# val.bin has 36,059 tokens
diff --git a/data/shakespeare/readme.md b/data/shakespeare/readme.md
new file mode 100644
index 00000000..1e6c457d
--- /dev/null
+++ b/data/shakespeare/readme.md
@@ -0,0 +1,9 @@
+
+# tiny shakespeare
+
+Tiny shakespeare, of the good old char-rnn fame :)
+
+After running `prepare.py`:
+
+- train.bin has 301,966 tokens
+- val.bin has 36,059 tokens
diff --git a/model.py b/model.py
index 327711a2..799eb71e 100644
--- a/model.py
+++ b/model.py
@@ -90,7 +90,7 @@ class Block(nn.Module):
         x = x + self.mlp(self.ln_2(x))
         return x

-@dataclass(frozen=True)
+@dataclass
 class GPTConfig:
     block_size: int = 1024
     vocab_size: int = 50257
@@ -105,7 +105,7 @@ class GPT(nn.Module):
         super().__init__()
         assert config.vocab_size is not None
         assert config.block_size is not None
-        self.block_size = config.block_size
+        self.config = config

         self.transformer = nn.ModuleDict(dict(
             wte = nn.Embedding(config.vocab_size, config.n_embd),
@@ -123,7 +123,7 @@ class GPT(nn.Module):
     def forward(self, idx, targets=None):
         device = idx.device
         b, t = idx.size()
-        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
+        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
         pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)

         # forward the GPT model itself
@@ -146,27 +146,36 @@ class GPT(nn.Module):
         # model surgery to decrease the block size if necessary
         # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
         # but want to use a smaller block size for some smaller, simpler model
-        assert block_size <= self.block_size
-        self.block_size = block_size
+        assert block_size <= self.config.block_size
+        self.config.block_size = block_size
         self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
         for block in self.transformer.h:
             block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]

     @classmethod
-    def from_pretrained(cls, model_type):
+    def from_pretrained(cls, model_type, override_args):
         assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+        # only dropout can be overridden see more notes below
+        assert all(k == 'dropout' for k in override_args)
         from transformers import GPT2LMHeadModel
         print("loading weights from pretrained gpt: %s" % model_type)

-        layer_config = {
+        # n_layer, n_head and n_embd are determined from model_type
+        config_args = {
             'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
             'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
             'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
             'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
         }[model_type]
+        # we can override the dropout rate
+        if 'dropout' in override_args:
+            config_args['dropout'] = override_args['dropout']
+        # block_size is always 1024 for GPT model checkpoints
+        # if one wants a lower block_size it has to be done through model surgery
+        # later, by calling crop_block_size()

         # create a from-scratch initialized minGPT model
-        config = GPTConfig(block_size=1024, **layer_config)
+        config = GPTConfig(block_size=1024, **config_args)
         model = GPT(config)
         sd = model.state_dict()

@@ -248,7 +257,7 @@ class GPT(nn.Module):
         """
         for _ in range(max_new_tokens):
             # if the sequence context is growing too long we must crop it at block_size
-            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
+            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
             # forward the model to get the logits for the index in the sequence
             logits, _ = self(idx_cond)
             # pluck the logits at the final step and scale by desired temperature
diff --git a/sample.py b/sample.py
index 6a93dfb1..b68296af 100644
--- a/sample.py
+++ b/sample.py
@@ -21,18 +21,18 @@
 model = GPT(gptconf)
 model.load_state_dict(checkpoint['model'])
 model.eval()
 model.to(device)
-model = torch.compile(model) # requires PyTorch 2.0
+#model = torch.compile(model) # requires PyTorch 2.0 (optional)

 enc = tiktoken.get_encoding("gpt2")
-#start = enc.encode("\n")
-start = [enc.eot_token]
+start = enc.encode("\n") # user choice on what token to start with
+#start = [enc.eot_token]
 x = (torch.tensor(start, dtype=torch.long, device=device)[None, ...])

-for k in range(1):
+for k in range(10):

     with torch.no_grad():
         with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
-            y = model.generate(x, 300, temperature=0.8, top_k=200)
+            y = model.generate(x, 500, temperature=0.8, top_k=200)
             print(enc.decode(y[0].tolist()))
             print('---------------')
diff --git a/train.py b/train.py
index 296d3d27..ff0ad588 100644
--- a/train.py
+++ b/train.py
@@ -31,6 +31,7 @@ eval_interval = 500
 log_interval = 1
 eval_iters = 50
 eval_only = False # if True, script exits right after the first eval
+always_save_checkpoint = False # if True, always save a checkpoint after each eval
 # wandb logging
 wandb_log = False # disabled by default
 wandb_entity = 'karpathy'
@@ -138,6 +139,7 @@ elif init_from == 'resume':
     checkpoint_model_args = checkpoint['model_args']
     for k, v in model_args.items():
         assert checkpoint_model_args[k] == v, "for now"
+        # TODO: think through how passed in params should interact with checkpoint params
     gptconf = GPTConfig(**model_args)
     model = GPT(gptconf)
     model.load_state_dict(checkpoint['model'])
@@ -146,9 +148,14 @@ elif init_from == 'resume':
 elif init_from.startswith('gpt2'):
     print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
     # initialize from OpenAI GPT-2 weights
-    model = GPT.from_pretrained(init_from)
+    override_args = dict(dropout=dropout)
+    model = GPT.from_pretrained(init_from, override_args)
+    # read off and override the GPT sizing model args from the model config
+    model_args['n_layer'] = model.config.n_layer
+    model_args['n_head'] = model.config.n_head
+    model_args['n_embd'] = model.config.n_embd

 # crop down the model block size if desired
-if block_size < model.block_size:
+if block_size < model.config.block_size:
     model.crop_block_size(block_size)
 model.to(device)
@@ -227,7 +234,7 @@ while True:
                 "val/loss": losses['val'],
                 "lr": lr,
             })
-        if losses['val'] < best_val_loss:
+        if losses['val'] < best_val_loss or always_save_checkpoint:
            best_val_loss = losses['val']
            raw_model = model.module if ddp else model
            if iter_num > 0:
@@ -238,6 +245,7 @@
                     'iter_num': iter_num,
                     'best_val_loss': best_val_loss,
                 }
+                print(f"saving checkpoint to {out_dir}")
                 torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
     if iter_num == 0 and eval_only:
         break
@@ -260,7 +268,8 @@
     iter_num += 1

     # termination conditions
-    if iter_num >= max_iters:
+    if iter_num > max_iters:
         break

-destroy_process_group()
+if ddp:
+    destroy_process_group()
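
The diff does not include the part of `train.py` that consumes the `finetune_shakespeare` argument, so the following is only a rough sketch of one way such a config-override step could look, assuming `train.py` simply execs the named file from `config/` over its module globals after the defaults are set; the actual loading code may differ.

```
# hypothetical sketch of a config-override step near the top of train.py;
# the real loading mechanism is not part of this diff
import os
import sys

if len(sys.argv) > 1:
    config_name = sys.argv[1]  # e.g. 'finetune_shakespeare'
    config_file = os.path.join('config', config_name + '.py')
    print(f"overriding config with {config_file}")
    # executes assignments such as out_dir, init_from, learning_rate over the defaults above
    exec(open(config_file).read())
```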
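As a quick sanity check of the new `data/shakespeare` pipeline, here is a small sketch (not part of the diff) that reads `train.bin` back and decodes a few tokens with the same gpt2 BPE; it assumes only what `prepare.py` writes, a flat array of uint16 token ids.

```
# sketch: round-trip data/shakespeare/train.bin through the gpt2 BPE
import numpy as np
import tiktoken

ids = np.fromfile('data/shakespeare/train.bin', dtype=np.uint16)  # uint16, as written by tofile()
print(f"loaded {len(ids):,} tokens")  # ~301,966 per the readme

enc = tiktoken.get_encoding("gpt2")
print(enc.decode(ids[:50].tolist()))  # should print the opening lines of tiny shakespeare
```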
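Once finetuning has produced `out-shakespeare/ckpt.pt`, sampling follows the `sample.py` flow above. A minimal sketch, assuming the checkpoint stores the weights under `'model'` and the constructor kwargs under `'model_args'` (both keys appear in `train.py`); in practice you would edit `sample.py` itself to point at the new `out_dir`.

```
# sketch: sample from the finetuned Shakespeare checkpoint (assumed checkpoint layout)
import torch
import tiktoken
from model import GPTConfig, GPT

device = 'cuda'
checkpoint = torch.load('out-shakespeare/ckpt.pt', map_location=device)
gptconf = GPTConfig(**checkpoint['model_args'])  # assumes 'model_args' holds GPTConfig kwargs
model = GPT(gptconf)
model.load_state_dict(checkpoint['model'])
model.eval()
model.to(device)

enc = tiktoken.get_encoding("gpt2")
start = enc.encode("\n")
x = torch.tensor(start, dtype=torch.long, device=device)[None, ...]

with torch.no_grad():
    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
        y = model.generate(x, 500, temperature=0.8, top_k=200)
print(enc.decode(y[0].tolist()))
```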