mirror of
https://github.com/ollama/ollama.git
synced 2026-04-21 16:25:42 +02:00
tokenizer: add byte fallback for SentencePiece BPE encoding (#15232)
* tokenizer: add byte fallback for SentencePiece BPE encoding. When BPE merging produces tokens not in the vocabulary, fall back to encoding each UTF-8 byte as <0xHH> byte tokens instead of silently dropping the character. Also teach Decode to convert <0xHH> tokens back to raw bytes. Fixes #15229; fixes #15231. * tokenizer fixes
This commit is contained in:
341
model/models/gemma4/tokenizer_reference_test.go
Normal file
341
model/models/gemma4/tokenizer_reference_test.go
Normal file
@@ -0,0 +1,341 @@
|
||||
package gemma4
|
||||
|
||||
// TestGemma4TokenizerMatchesReference verifies our BPE tokenizer matches
|
||||
// the Rust tokenizers library (the reference implementation) for Gemma 4.
|
||||
//
|
||||
// The test loads vocabulary from any local ollama gemma4 GGUF model.
|
||||
// Skips if no gemma4 model is installed.
|
||||
//
|
||||
// Set VERIFY_HF_TOKENIZER=1 to verify against the Rust tokenizers library
|
||||
// via Python. Requires python3 with tokenizers>=0.21 on PATH:
|
||||
//
|
||||
// VERIFY_HF_TOKENIZER=1 go test ./model/models/gemma4/ -run TestGemma4Tokenizer -v
|
||||
//
|
||||
// Workflow for adding a new test case:
|
||||
// 1. Add {name: "...", input: "..."} to the test list (no want field)
|
||||
// 2. Run with VERIFY_HF_TOKENIZER=1 — it prints the reference IDs
|
||||
// 3. Paste those IDs into the want field
|
||||
// 4. Run without VERIFY_HF_TOKENIZER — our tokenizer must match
|
||||
|
||||
import (
	"encoding/json"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"testing"

	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/fs/gguf"
	"github.com/ollama/ollama/tokenizer"
)
|
||||
|
||||
// tokenizerRefCase is a single tokenizer comparison case: an input string
// and the token IDs the reference (Rust tokenizers) implementation produces.
type tokenizerRefCase struct {
	name string // subtest name
	input string // text passed to Encode
	want []int32 // expected token IDs; nil until generated with VERIFY_HF_TOKENIZER=1
}
|
||||
|
||||
// Reference token IDs generated by the Rust tokenizers library using
// vocab/merges from a gemma4 GGUF with add_special_tokens=False.
//
// NOTE(review): several inputs below are meant to contain runs of multiple
// spaces (the case names say "double", "repeated", "multiple"), but the
// string literals here show single spaces — whitespace may have been
// collapsed by formatting. Verify the exact space counts against the
// reference output before relying on these cases.
var gemma4TokenizerRefCases = []tokenizerRefCase{
	// Basic ASCII
	{name: "basic word", input: "hello", want: []int32{23391}},
	{name: "two words", input: "hello world", want: []int32{23391, 1902}},
	{name: "punctuation", input: "Hello, World!", want: []int32{9259, 236764, 4109, 236888}},

	// Space handling (pretokenizer bug: GPT-2 splitter mangled leading/multiple spaces)
	{name: "leading space", input: " hello", want: []int32{29104}},
	{name: "double leading space", input: " hello", want: []int32{138, 23391}},
	{name: "double space between words", input: "hello world", want: []int32{23391, 138, 12392}},
	{name: "only spaces", input: " ", want: []int32{139}},
	{name: "repeated spaces", input: " ", want: []int32{142}},
	{name: "leading spaces phrase", input: " leading spaces", want: []int32{5830, 9952}},
	{name: "multiple interior spaces", input: "multiple spaces", want: []int32{43819, 140, 35220}},

	// Polish diacritics (issue #15231 — Decode mangled U+0105-U+0142)
	{name: "polish diacritics", input: "ąęśćżźółń", want: []int32{237198, 237202, 14732, 237277, 238992, 24875, 238041}},
	{name: "polish sentence", input: "Zażółć gęślą jaźń", want: []int32{236953, 40512, 24875, 237289, 549, 237202, 62081, 237198, 4828, 238992, 238041}},

	// French accents (issue #15229 — Decode mangled U+00E0-U+00FF)
	{name: "french accents", input: "café résumé naïve", want: []int32{123125, 236859, 118515, 120362}},
	{name: "french with apostrophe", input: "L'élève a mangé", want: []int32{236798, 236789, 161654, 496, 14695, 236859}},

	// German umlauts
	{name: "german umlauts", input: "über Straße Größe", want: []int32{28223, 80176, 112880}},

	// Codepoints in GPT-2 byte reversal range (U+0100-U+0142)
	{name: "codepoints in gpt2 byte range", input: "ąęćł", want: []int32{237198, 226110, 237114}},
	{name: "latin extended A", input: "ĀāĂ㥹", want: []int32{241920, 237448, 241645, 237106, 243514, 237198}},

	// CJK & Japanese
	{name: "chinese", input: "你好世界", want: []int32{144626, 12811}},
	{name: "japanese hiragana", input: "こんにちは", want: []int32{85141}},

	// Mixed scripts
	{name: "mixed scripts", input: "hello ąęść world café 你好", want: []int32{23391, 236743, 237198, 237202, 14732, 1902, 33443, 43758, 237389}},

	// Whitespace
	{name: "empty string", input: "", want: []int32{}},
	{name: "newlines", input: "\n\n", want: []int32{108}},
	{name: "tabs", input: "\t\t", want: []int32{255969}},

	// Code-like content
	{name: "python code", input: "def foo(x): return x + 1", want: []int32{2063, 46293, 236769, 236781, 1473, 994, 1123, 900, 236743, 236770}},
	{name: "json", input: `{"key": "value"}`, want: []int32{14937, 2478, 1083, 623, 2394, 25938}},

	// Misc
	{name: "repeated char", input: "aaaaaa", want: []int32{50354, 9236}},
	{name: "emoji", input: "hello 👋 world", want: []int32{23391, 155818, 1902}},
	{name: "digits", input: "12345", want: []int32{236770, 236778, 236800, 236812, 236810}},
	{name: "float", input: "3.14159", want: []int32{236800, 236761, 236770, 236812, 236770, 236810, 236819}},
}
|
||||
|
||||
// findGemma4GGUF looks for any gemma4 model GGUF in the local ollama store.
|
||||
func findGemma4GGUF() (string, error) {
|
||||
modelsDir := envconfig.Models()
|
||||
manifestDir := filepath.Join(modelsDir, "manifests", "registry.ollama.ai", "library", "gemma4")
|
||||
entries, err := os.ReadDir(manifestDir)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("no gemma4 manifests in %s: %w", manifestDir, err)
|
||||
}
|
||||
|
||||
blobDir := filepath.Join(modelsDir, "blobs")
|
||||
|
||||
for _, entry := range entries {
|
||||
if entry.IsDir() {
|
||||
continue
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(filepath.Join(manifestDir, entry.Name()))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
var manifest struct {
|
||||
Layers []struct {
|
||||
MediaType string `json:"mediaType"`
|
||||
Digest string `json:"digest"`
|
||||
} `json:"layers"`
|
||||
}
|
||||
if err := json.Unmarshal(data, &manifest); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
for _, layer := range manifest.Layers {
|
||||
if layer.MediaType == "application/vnd.ollama.image.model" {
|
||||
blobPath := filepath.Join(blobDir, strings.Replace(layer.Digest, ":", "-", 1))
|
||||
if _, err := os.Stat(blobPath); err == nil {
|
||||
return blobPath, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("no gemma4 model blob found in %s", modelsDir)
|
||||
}
|
||||
|
||||
// loadGemma4Tokenizer opens a GGUF and builds a BPE tokenizer from its
|
||||
// tokenizer metadata — the same configuration used at inference time.
|
||||
func loadGemma4Tokenizer(t *testing.T, ggufPath string) tokenizer.BytePairEncoding {
|
||||
t.Helper()
|
||||
|
||||
f, err := gguf.Open(ggufPath)
|
||||
if err != nil {
|
||||
t.Fatalf("gguf.Open: %v", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
tokens := f.KeyValue("tokenizer.ggml.tokens").Strings()
|
||||
if len(tokens) == 0 {
|
||||
t.Fatal("no tokenizer.ggml.tokens in GGUF")
|
||||
}
|
||||
|
||||
scores64 := f.KeyValue("tokenizer.ggml.scores").Floats()
|
||||
scores := make([]float32, len(scores64))
|
||||
for i, s := range scores64 {
|
||||
scores[i] = float32(s)
|
||||
}
|
||||
|
||||
types64 := f.KeyValue("tokenizer.ggml.token_type").Ints()
|
||||
types := make([]int32, len(types64))
|
||||
for i, tt := range types64 {
|
||||
types[i] = int32(tt)
|
||||
}
|
||||
|
||||
merges := f.KeyValue("tokenizer.ggml.merges").Strings()
|
||||
|
||||
vocab := &tokenizer.Vocabulary{
|
||||
Values: tokens,
|
||||
Types: types,
|
||||
Scores: scores,
|
||||
Merges: merges,
|
||||
BOS: []int32{2},
|
||||
EOS: []int32{1},
|
||||
AddBOS: false,
|
||||
}
|
||||
|
||||
return tokenizer.NewBytePairEncodingWithOptions(vocab, []string{},
|
||||
tokenizer.WithSentencePieceNormalizer())
|
||||
}
|
||||
|
||||
// writeTokenizerJSON reconstructs a tokenizer.json from GGUF metadata
|
||||
// for the Rust tokenizers library to load as an independent reference.
|
||||
func writeTokenizerJSON(t *testing.T, ggufPath string) string {
|
||||
t.Helper()
|
||||
|
||||
f, err := gguf.Open(ggufPath)
|
||||
if err != nil {
|
||||
t.Fatalf("gguf.Open: %v", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
tokens := f.KeyValue("tokenizer.ggml.tokens").Strings()
|
||||
mergeStrs := f.KeyValue("tokenizer.ggml.merges").Strings()
|
||||
|
||||
vocab := make(map[string]int, len(tokens))
|
||||
for i, tok := range tokens {
|
||||
vocab[tok] = i
|
||||
}
|
||||
|
||||
merges := make([][2]string, len(mergeStrs))
|
||||
for i, m := range mergeStrs {
|
||||
parts := strings.SplitN(m, " ", 2)
|
||||
if len(parts) == 2 {
|
||||
merges[i] = [2]string{parts[0], parts[1]}
|
||||
}
|
||||
}
|
||||
|
||||
tj := map[string]any{
|
||||
"version": "1.0",
|
||||
"model": map[string]any{
|
||||
"type": "BPE",
|
||||
"vocab": vocab,
|
||||
"merges": merges,
|
||||
},
|
||||
"normalizer": map[string]any{
|
||||
"type": "Replace",
|
||||
"pattern": map[string]string{"String": " "},
|
||||
"content": "\u2581",
|
||||
},
|
||||
}
|
||||
|
||||
tmpFile, err := os.CreateTemp(t.TempDir(), "gemma4_tokenizer_*.json")
|
||||
if err != nil {
|
||||
t.Fatalf("create temp file: %v", err)
|
||||
}
|
||||
|
||||
if err := json.NewEncoder(tmpFile).Encode(tj); err != nil {
|
||||
tmpFile.Close()
|
||||
t.Fatalf("encode tokenizer.json: %v", err)
|
||||
}
|
||||
tmpFile.Close()
|
||||
|
||||
return tmpFile.Name()
|
||||
}
|
||||
|
||||
func TestGemma4TokenizerMatchesReference(t *testing.T) {
|
||||
ggufPath, err := findGemma4GGUF()
|
||||
if err != nil {
|
||||
t.Skipf("skipping: %v", err)
|
||||
}
|
||||
t.Logf("using GGUF: %s", ggufPath)
|
||||
|
||||
tok := loadGemma4Tokenizer(t, ggufPath)
|
||||
|
||||
verify := os.Getenv("VERIFY_HF_TOKENIZER") != ""
|
||||
var tokenizerJSONPath string
|
||||
if verify {
|
||||
if err := exec.Command("python3", "-c", "from tokenizers import Tokenizer").Run(); err != nil {
|
||||
t.Fatal("VERIFY_HF_TOKENIZER=1 requires python3 with tokenizers>=0.21 on PATH")
|
||||
}
|
||||
tokenizerJSONPath = writeTokenizerJSON(t, ggufPath)
|
||||
defer os.Remove(tokenizerJSONPath)
|
||||
t.Log("VERIFY_HF_TOKENIZER=1: verifying against Rust tokenizers library")
|
||||
}
|
||||
|
||||
for _, tc := range gemma4TokenizerRefCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ids, err := tok.Encode(tc.input, false)
|
||||
if err != nil {
|
||||
t.Fatalf("Encode(%q): %v", tc.input, err)
|
||||
}
|
||||
|
||||
if tc.want != nil {
|
||||
if fmt.Sprint(ids) != fmt.Sprint(tc.want) {
|
||||
t.Errorf("Encode(%q):\n got: %v\n want: %v", tc.input, ids, tc.want)
|
||||
}
|
||||
} else {
|
||||
t.Errorf("no expected IDs for %q; our tokenizer produced: %v", tc.input, ids)
|
||||
}
|
||||
|
||||
if len(ids) > 0 {
|
||||
decoded, err := tok.Decode(ids)
|
||||
if err != nil {
|
||||
t.Fatalf("Decode: %v", err)
|
||||
}
|
||||
if decoded != tc.input {
|
||||
t.Errorf("roundtrip %q: Decode(Encode) = %q", tc.input, decoded)
|
||||
}
|
||||
}
|
||||
|
||||
if verify {
|
||||
refIDs := encodeWithRustTokenizer(t, tokenizerJSONPath, tc.input)
|
||||
|
||||
if fmt.Sprint(refIDs) != fmt.Sprint(ids) {
|
||||
fmt.Fprintf(os.Stderr, "\nREFERENCE OUTPUT for %s (copy-paste as want):\nwant: []int32{%s},\n\n",
|
||||
tc.name, int32SliceStr(refIDs))
|
||||
}
|
||||
|
||||
if tc.want != nil && fmt.Sprint(refIDs) != fmt.Sprint(tc.want) {
|
||||
t.Errorf("hardcoded expected IDs don't match reference for %q:\n ref: %v\n hardcoded: %v",
|
||||
tc.input, refIDs, tc.want)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// encodeWithRustTokenizer encodes text with the Rust tokenizers library
// (invoked through a small python3 script) using the tokenizer.json at
// tokenizerPath, and returns the reference token IDs. Fails the test if
// python3 errors or emits anything that is not a comma-separated ID list.
func encodeWithRustTokenizer(t *testing.T, tokenizerPath, text string) []int32 {
	t.Helper()

	if text == "" {
		return nil
	}

	script := fmt.Sprintf(`
from tokenizers import Tokenizer
t = Tokenizer.from_file(%q)
ids = t.encode(%q, add_special_tokens=False).ids
print(",".join(str(i) for i in ids))
`, tokenizerPath, text)

	cmd := exec.Command("python3", "-c", script)
	var stdout, stderr strings.Builder
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		t.Fatalf("python3 failed: %v\nstderr: %s", err, stderr.String())
	}

	parts := strings.Split(strings.TrimSpace(stdout.String()), ",")
	var ids []int32
	for _, p := range parts {
		if p == "" {
			continue
		}
		// Parse strictly: the previous fmt.Sscanf call ignored its error,
		// so malformed python output silently became token ID 0.
		id, err := strconv.ParseInt(p, 10, 32)
		if err != nil {
			t.Fatalf("unexpected token ID %q from python3: %v", p, err)
		}
		ids = append(ids, int32(id))
	}
	return ids
}
|
||||
|
||||
// int32SliceStr formats ids as a comma-separated list, ready to paste into
// a []int32 literal in a test case's want field. Returns "" for an empty
// or nil slice.
func int32SliceStr(ids []int32) string {
	parts := make([]string, len(ids))
	for i, id := range ids {
		// strconv.FormatInt avoids boxing each id into any, which
		// fmt.Sprintf("%d", id) does on every iteration.
		parts[i] = strconv.FormatInt(int64(id), 10)
	}
	return strings.Join(parts, ", ")
}
|
||||
Reference in New Issue
Block a user