consolidate the tokenizer (#14327)

This change adds a new x/tokenizer package which includes: * New BPE and SentencePiece tokenizers * Removing the dependency on the imagegen tokenizers * Fixes to multibyte decoding in the pipeline * Various correctness and benchmark tests Not included in this PR is the WordPiece tokenizer for BERT models which will be added when we add embedding models. The imagegen tokenizers will also be removed in a follow-up PR.
2026-04-18 15:54:11 +02:00 · 2026-02-19 15:55:45 -08:00
parent 458dd1b9d9
commit 97323d1c68
18 changed files with 1807 additions and 16 deletions
--- a/x/tokenizer/tokenizer_load_test.go
+++ b/x/tokenizer/tokenizer_load_test.go
@@ -0,0 +1,26 @@
+//go:build mlx
+
+package tokenizer
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestLoadFromBytesRejectsWordPiece(t *testing.T) {
+	data := []byte(`{
+		"model": {
+			"type": "WordPiece",
+			"vocab": {"[UNK]": 0, "hello": 1}
+		},
+		"added_tokens": []
+	}`)
+
+	_, err := LoadFromBytes(data)
+	if err == nil {
+		t.Fatal("expected WordPiece load to fail")
+	}
+	if !strings.Contains(err.Error(), "unsupported tokenizer type: WordPiece") {
+		t.Fatalf("unexpected error: %v", err)
+	}
+}