mlx: quantized embeddings, fast SwiGLU, and runtime fixes (#14884)

Add QuantizedEmbedding and EmbeddingLayer interface so models can use quantized embedding weights and expose tied output projections. This change updates gemma3, glm4_moe_lite, llama, qwen3, and qwen3_5 to use the new interface.
2026-04-24 01:35:49 +02:00 · 2026-03-17 11:21:38 -07:00
parent fa69b833cd
commit d727aacd04
12 changed files with 405 additions and 37 deletions
--- a/x/tokenizer/tokenizer.go
+++ b/x/tokenizer/tokenizer.go
@@ -71,6 +71,11 @@ func (t *Tokenizer) BOS() int32 {
 	return t.vocab.BOS
 }

+// AddBOS reports whether a BOS token should be prepended during encoding, per the vocabulary's AddBOS setting.
+func (t *Tokenizer) AddBOS() bool {
+	return t.vocab.AddBOS
+}
+
 // EOS returns the first end of sequence token ID (for backwards compatibility)
 func (t *Tokenizer) EOS() int32 {
 	if len(t.vocab.EOS) > 0 {