package gemma4 // TestGemma4TokenizerMatchesReference verifies our BPE tokenizer matches // the Rust tokenizers library (the reference implementation) for Gemma 4. // // The test loads vocabulary from any local ollama gemma4 GGUF model. // Skips if no gemma4 model is installed. // // Set VERIFY_HF_TOKENIZER=1 to verify against the Rust tokenizers library // via Python. Requires python3 with tokenizers>=0.21 on PATH: // // VERIFY_HF_TOKENIZER=1 go test ./model/models/gemma4/ -run TestGemma4Tokenizer -v // // Workflow for adding a new test case: // 1. Add {name: "...", input: "..."} to the test list (no want field) // 2. Run with VERIFY_HF_TOKENIZER=1 — it prints the reference IDs // 3. Paste those IDs into the want field // 4. Run without VERIFY_HF_TOKENIZER — our tokenizer must match import ( "encoding/json" "fmt" "os" "os/exec" "path/filepath" "strings" "testing" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/fs/gguf" "github.com/ollama/ollama/tokenizer" ) type tokenizerRefCase struct { name string input string want []int32 } // Reference token IDs generated by the Rust tokenizers library using // vocab/merges from a gemma4 GGUF with add_special_tokens=False. 
// NOTE(review): several "space handling" inputs below read as a single
// space even though their names and expected IDs imply multi-space runs
// (e.g. "double leading space", "repeated spaces", "multiple interior
// spaces"). Verify the literals were not collapsed by an editor or
// copy-paste — TODO confirm against the VERIFY_HF_TOKENIZER output.
var gemma4TokenizerRefCases = []tokenizerRefCase{
	// Basic ASCII
	{name: "basic word", input: "hello", want: []int32{23391}},
	{name: "two words", input: "hello world", want: []int32{23391, 1902}},
	{name: "punctuation", input: "Hello, World!", want: []int32{9259, 236764, 4109, 236888}},

	// Space handling (pretokenizer bug: GPT-2 splitter mangled leading/multiple spaces)
	{name: "leading space", input: " hello", want: []int32{29104}},
	{name: "double leading space", input: " hello", want: []int32{138, 23391}},
	{name: "double space between words", input: "hello world", want: []int32{23391, 138, 12392}},
	{name: "only spaces", input: " ", want: []int32{139}},
	{name: "repeated spaces", input: " ", want: []int32{142}},
	{name: "leading spaces phrase", input: " leading spaces", want: []int32{5830, 9952}},
	{name: "multiple interior spaces", input: "multiple spaces", want: []int32{43819, 140, 35220}},

	// Polish diacritics (issue #15231 — Decode mangled U+0105-U+0142)
	{name: "polish diacritics", input: "ąęśćżźółń", want: []int32{237198, 237202, 14732, 237277, 238992, 24875, 238041}},
	{name: "polish sentence", input: "Zażółć gęślą jaźń", want: []int32{236953, 40512, 24875, 237289, 549, 237202, 62081, 237198, 4828, 238992, 238041}},

	// French accents (issue #15229 — Decode mangled U+00E0-U+00FF)
	{name: "french accents", input: "café résumé naïve", want: []int32{123125, 236859, 118515, 120362}},
	{name: "french with apostrophe", input: "L'élève a mangé", want: []int32{236798, 236789, 161654, 496, 14695, 236859}},

	// German umlauts
	{name: "german umlauts", input: "über Straße Größe", want: []int32{28223, 80176, 112880}},

	// Codepoints in GPT-2 byte reversal range (U+0100-U+0142)
	{name: "codepoints in gpt2 byte range", input: "ąęćł", want: []int32{237198, 226110, 237114}},
	{name: "latin extended A", input: "ĀāĂ㥹", want: []int32{241920, 237448, 241645, 237106, 243514, 237198}},

	// CJK & Japanese
	{name: "chinese", input: "你好世界", want: []int32{144626, 12811}},
	{name: "japanese hiragana", input: "こんにちは", want: []int32{85141}},

	// Mixed scripts
	{name: "mixed scripts", input: "hello ąęść world café 你好", want: []int32{23391, 236743, 237198, 237202, 14732, 1902, 33443, 43758, 237389}},

	// Whitespace
	{name: "empty string", input: "", want: []int32{}},
	{name: "newlines", input: "\n\n", want: []int32{108}},
	{name: "tabs", input: "\t\t", want: []int32{255969}},

	// Code-like content
	{name: "python code", input: "def foo(x): return x + 1", want: []int32{2063, 46293, 236769, 236781, 1473, 994, 1123, 900, 236743, 236770}},
	{name: "json", input: `{"key": "value"}`, want: []int32{14937, 2478, 1083, 623, 2394, 25938}},

	// Misc
	{name: "repeated char", input: "aaaaaa", want: []int32{50354, 9236}},
	{name: "emoji", input: "hello 👋 world", want: []int32{23391, 155818, 1902}},
	{name: "digits", input: "12345", want: []int32{236770, 236778, 236800, 236812, 236810}},
	{name: "float", input: "3.14159", want: []int32{236800, 236761, 236770, 236812, 236770, 236810, 236819}},
}

// findGemma4GGUF looks for any gemma4 model GGUF in the local ollama store.
func findGemma4GGUF() (string, error) { modelsDir := envconfig.Models() manifestDir := filepath.Join(modelsDir, "manifests", "registry.ollama.ai", "library", "gemma4") entries, err := os.ReadDir(manifestDir) if err != nil { return "", fmt.Errorf("no gemma4 manifests in %s: %w", manifestDir, err) } blobDir := filepath.Join(modelsDir, "blobs") for _, entry := range entries { if entry.IsDir() { continue } data, err := os.ReadFile(filepath.Join(manifestDir, entry.Name())) if err != nil { continue } var manifest struct { Layers []struct { MediaType string `json:"mediaType"` Digest string `json:"digest"` } `json:"layers"` } if err := json.Unmarshal(data, &manifest); err != nil { continue } for _, layer := range manifest.Layers { if layer.MediaType == "application/vnd.ollama.image.model" { blobPath := filepath.Join(blobDir, strings.Replace(layer.Digest, ":", "-", 1)) if _, err := os.Stat(blobPath); err == nil { return blobPath, nil } } } } return "", fmt.Errorf("no gemma4 model blob found in %s", modelsDir) } // loadGemma4Tokenizer opens a GGUF and builds a BPE tokenizer from its // tokenizer metadata — the same configuration used at inference time. 
func loadGemma4Tokenizer(t *testing.T, ggufPath string) tokenizer.BytePairEncoding { t.Helper() f, err := gguf.Open(ggufPath) if err != nil { t.Fatalf("gguf.Open: %v", err) } defer f.Close() tokens := f.KeyValue("tokenizer.ggml.tokens").Strings() if len(tokens) == 0 { t.Fatal("no tokenizer.ggml.tokens in GGUF") } scores64 := f.KeyValue("tokenizer.ggml.scores").Floats() scores := make([]float32, len(scores64)) for i, s := range scores64 { scores[i] = float32(s) } types64 := f.KeyValue("tokenizer.ggml.token_type").Ints() types := make([]int32, len(types64)) for i, tt := range types64 { types[i] = int32(tt) } merges := f.KeyValue("tokenizer.ggml.merges").Strings() vocab := &tokenizer.Vocabulary{ Values: tokens, Types: types, Scores: scores, Merges: merges, BOS: []int32{2}, EOS: []int32{1}, AddBOS: false, } return tokenizer.NewBytePairEncodingWithOptions(vocab, []string{}, tokenizer.WithSentencePieceNormalizer()) } // writeTokenizerJSON reconstructs a tokenizer.json from GGUF metadata // for the Rust tokenizers library to load as an independent reference. 
func writeTokenizerJSON(t *testing.T, ggufPath string) string { t.Helper() f, err := gguf.Open(ggufPath) if err != nil { t.Fatalf("gguf.Open: %v", err) } defer f.Close() tokens := f.KeyValue("tokenizer.ggml.tokens").Strings() mergeStrs := f.KeyValue("tokenizer.ggml.merges").Strings() vocab := make(map[string]int, len(tokens)) for i, tok := range tokens { vocab[tok] = i } merges := make([][2]string, len(mergeStrs)) for i, m := range mergeStrs { parts := strings.SplitN(m, " ", 2) if len(parts) == 2 { merges[i] = [2]string{parts[0], parts[1]} } } tj := map[string]any{ "version": "1.0", "model": map[string]any{ "type": "BPE", "vocab": vocab, "merges": merges, }, "normalizer": map[string]any{ "type": "Replace", "pattern": map[string]string{"String": " "}, "content": "\u2581", }, } tmpFile, err := os.CreateTemp(t.TempDir(), "gemma4_tokenizer_*.json") if err != nil { t.Fatalf("create temp file: %v", err) } if err := json.NewEncoder(tmpFile).Encode(tj); err != nil { tmpFile.Close() t.Fatalf("encode tokenizer.json: %v", err) } tmpFile.Close() return tmpFile.Name() } func TestGemma4TokenizerMatchesReference(t *testing.T) { ggufPath, err := findGemma4GGUF() if err != nil { t.Skipf("skipping: %v", err) } t.Logf("using GGUF: %s", ggufPath) tok := loadGemma4Tokenizer(t, ggufPath) verify := os.Getenv("VERIFY_HF_TOKENIZER") != "" var tokenizerJSONPath string if verify { if err := exec.Command("python3", "-c", "from tokenizers import Tokenizer").Run(); err != nil { t.Fatal("VERIFY_HF_TOKENIZER=1 requires python3 with tokenizers>=0.21 on PATH") } tokenizerJSONPath = writeTokenizerJSON(t, ggufPath) defer os.Remove(tokenizerJSONPath) t.Log("VERIFY_HF_TOKENIZER=1: verifying against Rust tokenizers library") } for _, tc := range gemma4TokenizerRefCases { t.Run(tc.name, func(t *testing.T) { ids, err := tok.Encode(tc.input, false) if err != nil { t.Fatalf("Encode(%q): %v", tc.input, err) } if tc.want != nil { if fmt.Sprint(ids) != fmt.Sprint(tc.want) { t.Errorf("Encode(%q):\n got: %v\n 
want: %v", tc.input, ids, tc.want) } } else { t.Errorf("no expected IDs for %q; our tokenizer produced: %v", tc.input, ids) } if len(ids) > 0 { decoded, err := tok.Decode(ids) if err != nil { t.Fatalf("Decode: %v", err) } if decoded != tc.input { t.Errorf("roundtrip %q: Decode(Encode) = %q", tc.input, decoded) } } if verify { refIDs := encodeWithRustTokenizer(t, tokenizerJSONPath, tc.input) if fmt.Sprint(refIDs) != fmt.Sprint(ids) { fmt.Fprintf(os.Stderr, "\nREFERENCE OUTPUT for %s (copy-paste as want):\nwant: []int32{%s},\n\n", tc.name, int32SliceStr(refIDs)) } if tc.want != nil && fmt.Sprint(refIDs) != fmt.Sprint(tc.want) { t.Errorf("hardcoded expected IDs don't match reference for %q:\n ref: %v\n hardcoded: %v", tc.input, refIDs, tc.want) } } }) } } func encodeWithRustTokenizer(t *testing.T, tokenizerPath, text string) []int32 { t.Helper() if text == "" { return nil } script := fmt.Sprintf(` from tokenizers import Tokenizer t = Tokenizer.from_file(%q) ids = t.encode(%q, add_special_tokens=False).ids print(",".join(str(i) for i in ids)) `, tokenizerPath, text) cmd := exec.Command("python3", "-c", script) var stdout, stderr strings.Builder cmd.Stdout = &stdout cmd.Stderr = &stderr if err := cmd.Run(); err != nil { t.Fatalf("python3 failed: %v\nstderr: %s", err, stderr.String()) } parts := strings.Split(strings.TrimSpace(stdout.String()), ",") var ids []int32 for _, p := range parts { if p == "" { continue } var id int32 fmt.Sscanf(p, "%d", &id) ids = append(ids, id) } return ids } func int32SliceStr(ids []int32) string { parts := make([]string, len(ids)) for i, id := range ids { parts[i] = fmt.Sprintf("%d", id) } return strings.Join(parts, ", ") }