mlx: quantized embeddings, fast SwiGLU, and runtime fixes (#14884)

Add QuantizedEmbedding and EmbeddingLayer interface so models can
use quantized embedding weights and expose tied output projections.
This change updates gemma3, glm4_moe_lite, llama, qwen3, and qwen3_5
to use the new interface.
This commit is contained in:
Patrick Devine
2026-03-17 11:21:38 -07:00
committed by GitHub
parent fa69b833cd
commit d727aacd04
12 changed files with 405 additions and 37 deletions

View File

@@ -71,6 +71,11 @@ func (t *Tokenizer) BOS() int32 {
return t.vocab.BOS
}
// AddBOS reports whether encoding should prepend a BOS token, as configured
// by the AddBOS setting on the tokenizer's vocabulary.
func (t *Tokenizer) AddBOS() bool {
	prepend := t.vocab.AddBOS
	return prepend
}
// EOS returns the first end of sequence token ID (for backwards compatibility)
func (t *Tokenizer) EOS() int32 {
if len(t.vocab.EOS) > 0 {