package convert import ( "io" "io/fs" "os" "path/filepath" "strings" "testing" "github.com/google/go-cmp/cmp" ) func createTokenizerFS(t *testing.T, dir string, files map[string]io.Reader) fs.FS { t.Helper() for k, v := range files { if err := func() error { f, err := os.Create(filepath.Join(dir, k)) if err != nil { return err } defer f.Close() if _, err := io.Copy(f, v); err != nil { return err } return nil }(); err != nil { t.Fatalf("unexpected error: %v", err) } } return os.DirFS(dir) } func TestParseTokenizer(t *testing.T) { cases := []struct { name string fsys fs.FS specialTokenTypes []string want *Tokenizer }{ { name: "string chat template", fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{ "tokenizer.json": strings.NewReader(`{}`), "tokenizer_config.json": strings.NewReader(`{ "chat_template": "" }`), }), want: &Tokenizer{ Vocabulary: &Vocabulary{Model: "gpt2"}, Pre: "default", Template: "", }, }, { name: "list chat template", fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{ "tokenizer.json": strings.NewReader(`{}`), "tokenizer_config.json": strings.NewReader(`{ "chat_template": [ { "name": "default", "template": "" }, { "name": "tools", "template": "" } ] }`), }), want: &Tokenizer{ Vocabulary: &Vocabulary{Model: "gpt2"}, Pre: "default", Template: "", }, }, { name: "added tokens", fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{ "tokenizer.json": strings.NewReader(`{ "added_tokens": [ { "id": 999, "content": "", "special": false } ] }`), }), want: &Tokenizer{ Vocabulary: &Vocabulary{ Model: "gpt2", Tokens: []string{""}, Scores: []float32{999}, Types: []int32{4}, }, Pre: "default", }, }, { name: "added tokens overlap vocab", fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{ "tokenizer.json": strings.NewReader(`{ "added_tokens": [ { "id": 0, "content": "", "special": true } ], "model": { "vocab": { "": 0 } } }`), }), want: &Tokenizer{ Vocabulary: &Vocabulary{ Model: "gpt2", Tokens: []string{""}, Scores: []float32{0}, Types: []int32{3}, }, Pre: "default", }, }, { name: "special token types", fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{ "tokenizer.json": strings.NewReader(`{ "added_tokens": [ { "id": 0, "content": "", "special": true }, { "id": 1, "content": "", "special": true }, { "id": 2, "content": "", "special": true }, { "id": 3, "content": "", "special": true } ], "model": { "vocab": { "": 0, "": 1, "": 2, "": 3 } } }`), "tokenizer_config.json": strings.NewReader(`{ "add_bos_token": true, "add_eos_token": false, "bos_token": "", "eos_token": "", "pad_token": "", "unk_token": "" }`), }), specialTokenTypes: []string{"pad", "eos", "bos", "unk"}, want: &Tokenizer{ Vocabulary: &Vocabulary{ Model: "gpt2", Tokens: []string{"", "", "", ""}, Scores: []float32{0, 1, 2, 3}, Types: []int32{3, 3, 3, 3}, }, SpecialVocabulary: []*SpecialVocabulary{ {Type: "pad", Content: "", ID: 0, AddToken: false}, {Type: "eos", Content: "", ID: 1, AddToken: false}, {Type: "bos", Content: "", ID: 2, AddToken: true}, {Type: "unk", Content: "", ID: 3, AddToken: false}, }, Pre: "default", }, }, { name: "llama-bpe pretokenizer and control tokens", fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{ "tokenizer.json": strings.NewReader(`{ "added_tokens": [ {"id": 1, "content": "<|startoftext|>", "special": true}, {"id": 6, "content": "<|im_start|>", "special": true}, {"id": 7, "content": "<|im_end|>", "special": true}, {"id": 8, "content": "<|tool_list_start|>", "special": true}, {"id": 9, "content": "<|tool_list_end|>", "special": true}, {"id": 10, "content": "<|tool_call_start|>", "special": true}, {"id": 11, "content": "<|tool_call_end|>", "special": true}, {"id": 12, "content": "<|tool_response_start|>", "special": true}, {"id": 13, "content": "<|tool_response_end|>", "special": true}, {"id": 396, "content": "", "special": true}, {"id": 64400, "content": "", "special": true}, {"id": 64401, "content": "", "special": true} ], "model": { "vocab": { "<|startoftext|>": 1, "<|im_start|>": 6, "<|im_end|>": 7, "<|tool_list_start|>": 8, "<|tool_list_end|>": 9, "<|tool_call_start|>": 10, "<|tool_call_end|>": 11, "<|tool_response_start|>": 12, "<|tool_response_end|>": 13, "": 396, "": 64400, "": 64401 } }, "pre_tokenizer": { "type": "Sequence", "pretokenizers": [ { "type": "Split", "pattern": { "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" }, "behavior": "Isolated", "invert": false }, { "type": "ByteLevel", "add_prefix_space": false, "trim_offsets": true, "use_regex": false } ] } }`), }), want: &Tokenizer{ Vocabulary: &Vocabulary{ Model: "gpt2", Tokens: []string{ "<|startoftext|>", "<|im_start|>", "<|im_end|>", "<|tool_list_start|>", "<|tool_list_end|>", "<|tool_call_start|>", "<|tool_call_end|>", "<|tool_response_start|>", "<|tool_response_end|>", "", "", "", }, Scores: []float32{1, 6, 7, 8, 9, 10, 11, 12, 13, 396, 64400, 64401}, Types: []int32{3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, }, Pre: "llama-bpe", }, }, { name: "list string merges", fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{ "tokenizer.json": strings.NewReader(`{ "model": { "merges": [ "a b", "c d", "e f" ] } }`), }), want: &Tokenizer{ Vocabulary: &Vocabulary{ Model: "gpt2", }, Merges: []string{ "a b", "c d", "e f", }, Pre: "default", }, }, { name: "list list string merges", fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{ "tokenizer.json": strings.NewReader(`{ "model": { "merges": [ [ "a", "b" ], [ "c", "d" ], [ "e", "f" ] ] } }`), }), want: &Tokenizer{ Vocabulary: &Vocabulary{ Model: "gpt2", }, Merges: []string{ "a b", "c d", "e f", }, Pre: "default", }, }, { name: "generation config eos token ids", fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{ "tokenizer.json": strings.NewReader(`{ "added_tokens": [ { "id": 0, "content": "", "special": true }, { "id": 1, "content": "", "special": true }, { "id": 2, "content": "", "special": true }, { "id": 3, "content": "", "special": true } ], "model": { "vocab": { "": 0, "": 1, "": 2, "": 3 } } }`), "tokenizer_config.json": strings.NewReader(`{ "add_bos_token": true, "add_eos_token": false, "bos_token": "", "eos_token": "" }`), "generation_config.json": strings.NewReader(`{ "bos_token_id": 0, "eos_token_id": [1, 2, 3] }`), }), specialTokenTypes: []string{"pad", "eos", "bos", "unk"}, want: &Tokenizer{ Vocabulary: &Vocabulary{ Model: "gpt2", Tokens: []string{"", "", "", ""}, Scores: []float32{0, 1, 2, 3}, Types: []int32{3, 3, 3, 3}, }, SpecialVocabulary: []*SpecialVocabulary{ {Type: "eos", Content: "", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false}, {Type: "bos", Content: "", ID: 0, AddToken: true}, }, Pre: "default", }, }, { name: "qwen35 pretokenizer", fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{ "tokenizer.json": strings.NewReader(`{ "pre_tokenizer": { "type": "Sequence", "pretokenizers": [ { "type": "Split", "pattern": { "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" } } ] } }`), }), want: &Tokenizer{ Vocabulary: &Vocabulary{Model: "gpt2"}, Pre: "qwen35", }, }, } for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { tokenizer, err := parseTokenizer(tt.fsys, tt.specialTokenTypes) if err != nil { t.Fatalf("unexpected error: %v", err) } if diff := cmp.Diff(tt.want, tokenizer); diff != "" { t.Errorf("unexpected tokenizer (-want +got):\n%s", diff) } }) } }