ollama/model/models/gemma4/tokenizer_reference_test.go

package gemma4

// TestGemma4TokenizerMatchesReference verifies our BPE tokenizer matches
// the Rust tokenizers library (the reference implementation) for Gemma 4.
//
// The test loads vocabulary from any local ollama gemma4 GGUF model.
// Skips if no gemma4 model is installed.
//
// Set VERIFY_HF_TOKENIZER=1 to verify against the Rust tokenizers library
// via Python. Requires python3 with tokenizers>=0.21 on PATH:
//
//   VERIFY_HF_TOKENIZER=1 go test ./model/models/gemma4/ -run TestGemma4Tokenizer -v
//
// Workflow for adding a new test case:
//   1. Add {name: "...", input: "..."} to the test list (no want field)
//   2. Run with VERIFY_HF_TOKENIZER=1 — it prints the reference IDs
//   3. Paste those IDs into the want field
//   4. Run without VERIFY_HF_TOKENIZER — our tokenizer must match

import (
	"encoding/json"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"testing"

	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/fs/gguf"
	"github.com/ollama/ollama/tokenizer"
)

type tokenizerRefCase struct {
	name  string
	input string
	want  []int32
}

// Reference token IDs generated by the Rust tokenizers library using
// vocab/merges from a gemma4 GGUF with add_special_tokens=False.
var gemma4TokenizerRefCases = []tokenizerRefCase{
	// Basic ASCII
	{name: "basic word", input: "hello", want: []int32{23391}},
	{name: "two words", input: "hello world", want: []int32{23391, 1902}},
	{name: "punctuation", input: "Hello, World!", want: []int32{9259, 236764, 4109, 236888}},

	// Space handling (pretokenizer bug: GPT-2 splitter mangled leading/multiple spaces)
	{name: "leading space", input: " hello", want: []int32{29104}},
	{name: "double leading space", input: "  hello", want: []int32{138, 23391}},
	{name: "double space between words", input: "hello  world", want: []int32{23391, 138, 12392}},
	{name: "only spaces", input: "   ", want: []int32{139}},
	{name: "repeated spaces", input: "      ", want: []int32{142}},
	{name: "leading spaces phrase", input: " leading spaces", want: []int32{5830, 9952}},
	{name: "multiple interior spaces", input: "multiple    spaces", want: []int32{43819, 140, 35220}},

	// Polish diacritics (issue #15231 — Decode mangled U+0105-U+0142)
	{name: "polish diacritics", input: "ąęśćżźółń", want: []int32{237198, 237202, 14732, 237277, 238992, 24875, 238041}},
	{name: "polish sentence", input: "Zażółć gęślą jaźń", want: []int32{236953, 40512, 24875, 237289, 549, 237202, 62081, 237198, 4828, 238992, 238041}},

	// French accents (issue #15229 — Decode mangled U+00E0-U+00FF)
	{name: "french accents", input: "café résumé naïve", want: []int32{123125, 236859, 118515, 120362}},
	{name: "french with apostrophe", input: "L'élève a mangé", want: []int32{236798, 236789, 161654, 496, 14695, 236859}},

	// German umlauts
	{name: "german umlauts", input: "über Straße Größe", want: []int32{28223, 80176, 112880}},

	// Codepoints in GPT-2 byte reversal range (U+0100-U+0142)
	{name: "codepoints in gpt2 byte range", input: "ąęćł", want: []int32{237198, 226110, 237114}},
	{name: "latin extended A", input: "ĀāĂăĄą", want: []int32{241920, 237448, 241645, 237106, 243514, 237198}},

	// CJK & Japanese
	{name: "chinese", input: "你好世界", want: []int32{144626, 12811}},
	{name: "japanese hiragana", input: "こんにちは", want: []int32{85141}},

	// Mixed scripts
	{name: "mixed scripts", input: "hello ąęść world café 你好", want: []int32{23391, 236743, 237198, 237202, 14732, 1902, 33443, 43758, 237389}},

	// Whitespace
	{name: "empty string", input: "", want: []int32{}},
	{name: "newlines", input: "\n\n", want: []int32{108}},
	{name: "tabs", input: "\t\t", want: []int32{255969}},

	// Code-like content
	{name: "python code", input: "def foo(x): return x + 1", want: []int32{2063, 46293, 236769, 236781, 1473, 994, 1123, 900, 236743, 236770}},
	{name: "json", input: `{"key": "value"}`, want: []int32{14937, 2478, 1083, 623, 2394, 25938}},

	// Misc
	{name: "repeated char", input: "aaaaaa", want: []int32{50354, 9236}},
	{name: "emoji", input: "hello 👋 world", want: []int32{23391, 155818, 1902}},
	{name: "digits", input: "12345", want: []int32{236770, 236778, 236800, 236812, 236810}},
	{name: "float", input: "3.14159", want: []int32{236800, 236761, 236770, 236812, 236770, 236810, 236819}},
}

// findGemma4GGUF looks for any gemma4 model GGUF in the local ollama store.
func findGemma4GGUF() (string, error) {
	modelsDir := envconfig.Models()
	manifestDir := filepath.Join(modelsDir, "manifests", "registry.ollama.ai", "library", "gemma4")
	entries, err := os.ReadDir(manifestDir)
	if err != nil {
		return "", fmt.Errorf("no gemma4 manifests in %s: %w", manifestDir, err)
	}

	blobDir := filepath.Join(modelsDir, "blobs")

	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}

		data, err := os.ReadFile(filepath.Join(manifestDir, entry.Name()))
		if err != nil {
			continue
		}

		var manifest struct {
			Layers []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
			} `json:"layers"`
		}
		if err := json.Unmarshal(data, &manifest); err != nil {
			continue
		}

		for _, layer := range manifest.Layers {
			if layer.MediaType == "application/vnd.ollama.image.model" {
				blobPath := filepath.Join(blobDir, strings.Replace(layer.Digest, ":", "-", 1))
				if _, err := os.Stat(blobPath); err == nil {
					return blobPath, nil
				}
			}
		}
	}

	return "", fmt.Errorf("no gemma4 model blob found in %s", modelsDir)
}

// loadGemma4Tokenizer opens a GGUF and builds a BPE tokenizer from its
// tokenizer metadata — the same configuration used at inference time.
func loadGemma4Tokenizer(t *testing.T, ggufPath string) tokenizer.BytePairEncoding {
	t.Helper()

	f, err := gguf.Open(ggufPath)
	if err != nil {
		t.Fatalf("gguf.Open: %v", err)
	}
	defer f.Close()

	tokens := f.KeyValue("tokenizer.ggml.tokens").Strings()
	if len(tokens) == 0 {
		t.Fatal("no tokenizer.ggml.tokens in GGUF")
	}

	scores64 := f.KeyValue("tokenizer.ggml.scores").Floats()
	scores := make([]float32, len(scores64))
	for i, s := range scores64 {
		scores[i] = float32(s)
	}

	types64 := f.KeyValue("tokenizer.ggml.token_type").Ints()
	types := make([]int32, len(types64))
	for i, tt := range types64 {
		types[i] = int32(tt)
	}

	merges := f.KeyValue("tokenizer.ggml.merges").Strings()

	vocab := &tokenizer.Vocabulary{
		Values: tokens,
		Types:  types,
		Scores: scores,
		Merges: merges,
		BOS:    []int32{2},
		EOS:    []int32{1},
		AddBOS: false,
	}

	return tokenizer.NewBytePairEncodingWithOptions(vocab, []string{},
		tokenizer.WithSentencePieceNormalizer())
}

// writeTokenizerJSON reconstructs a tokenizer.json from GGUF metadata
// for the Rust tokenizers library to load as an independent reference.
func writeTokenizerJSON(t *testing.T, ggufPath string) string {
	t.Helper()

	f, err := gguf.Open(ggufPath)
	if err != nil {
		t.Fatalf("gguf.Open: %v", err)
	}
	defer f.Close()

	tokens := f.KeyValue("tokenizer.ggml.tokens").Strings()
	mergeStrs := f.KeyValue("tokenizer.ggml.merges").Strings()

	vocab := make(map[string]int, len(tokens))
	for i, tok := range tokens {
		vocab[tok] = i
	}

	merges := make([][2]string, len(mergeStrs))
	for i, m := range mergeStrs {
		parts := strings.SplitN(m, " ", 2)
		if len(parts) == 2 {
			merges[i] = [2]string{parts[0], parts[1]}
		}
	}

	tj := map[string]any{
		"version": "1.0",
		"model": map[string]any{
			"type":   "BPE",
			"vocab":  vocab,
			"merges": merges,
		},
		"normalizer": map[string]any{
			"type":    "Replace",
			"pattern": map[string]string{"String": " "},
			"content": "\u2581",
		},
	}

	tmpFile, err := os.CreateTemp(t.TempDir(), "gemma4_tokenizer_*.json")
	if err != nil {
		t.Fatalf("create temp file: %v", err)
	}

	if err := json.NewEncoder(tmpFile).Encode(tj); err != nil {
		tmpFile.Close()
		t.Fatalf("encode tokenizer.json: %v", err)
	}
	tmpFile.Close()

	return tmpFile.Name()
}

func TestGemma4TokenizerMatchesReference(t *testing.T) {
	ggufPath, err := findGemma4GGUF()
	if err != nil {
		t.Skipf("skipping: %v", err)
	}
	t.Logf("using GGUF: %s", ggufPath)

	tok := loadGemma4Tokenizer(t, ggufPath)

	verify := os.Getenv("VERIFY_HF_TOKENIZER") != ""
	var tokenizerJSONPath string
	if verify {
		if err := exec.Command("python3", "-c", "from tokenizers import Tokenizer").Run(); err != nil {
			t.Fatal("VERIFY_HF_TOKENIZER=1 requires python3 with tokenizers>=0.21 on PATH")
		}
		tokenizerJSONPath = writeTokenizerJSON(t, ggufPath)
		defer os.Remove(tokenizerJSONPath)
		t.Log("VERIFY_HF_TOKENIZER=1: verifying against Rust tokenizers library")
	}

	for _, tc := range gemma4TokenizerRefCases {
		t.Run(tc.name, func(t *testing.T) {
			ids, err := tok.Encode(tc.input, false)
			if err != nil {
				t.Fatalf("Encode(%q): %v", tc.input, err)
			}

			if tc.want != nil {
				if fmt.Sprint(ids) != fmt.Sprint(tc.want) {
					t.Errorf("Encode(%q):\n  got:  %v\n  want: %v", tc.input, ids, tc.want)
				}
			} else {
				t.Errorf("no expected IDs for %q; our tokenizer produced: %v", tc.input, ids)
			}

			if len(ids) > 0 {
				decoded, err := tok.Decode(ids)
				if err != nil {
					t.Fatalf("Decode: %v", err)
				}
				if decoded != tc.input {
					t.Errorf("roundtrip %q: Decode(Encode) = %q", tc.input, decoded)
				}
			}

			if verify {
				refIDs := encodeWithRustTokenizer(t, tokenizerJSONPath, tc.input)

				if fmt.Sprint(refIDs) != fmt.Sprint(ids) {
					fmt.Fprintf(os.Stderr, "\nREFERENCE OUTPUT for %s (copy-paste as want):\nwant: []int32{%s},\n\n",
						tc.name, int32SliceStr(refIDs))
				}

				if tc.want != nil && fmt.Sprint(refIDs) != fmt.Sprint(tc.want) {
					t.Errorf("hardcoded expected IDs don't match reference for %q:\n  ref:      %v\n  hardcoded: %v",
						tc.input, refIDs, tc.want)
				}
			}
		})
	}
}

func encodeWithRustTokenizer(t *testing.T, tokenizerPath, text string) []int32 {
	t.Helper()

	if text == "" {
		return nil
	}

	script := fmt.Sprintf(`
from tokenizers import Tokenizer
t = Tokenizer.from_file(%q)
ids = t.encode(%q, add_special_tokens=False).ids
print(",".join(str(i) for i in ids))
`, tokenizerPath, text)

	cmd := exec.Command("python3", "-c", script)
	var stdout, stderr strings.Builder
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		t.Fatalf("python3 failed: %v\nstderr: %s", err, stderr.String())
	}

	parts := strings.Split(strings.TrimSpace(stdout.String()), ",")
	var ids []int32
	for _, p := range parts {
		if p == "" {
			continue
		}
		var id int32
		fmt.Sscanf(p, "%d", &id)
		ids = append(ids, id)
	}
	return ids
}

func int32SliceStr(ids []int32) string {
	parts := make([]string, len(ids))
	for i, id := range ids {
		parts[i] = fmt.Sprintf("%d", id)
	}
	return strings.Join(parts, ", ")
}