mirror of
https://github.com/ollama/ollama.git
synced 2026-04-24 01:35:49 +02:00
mlx: quantized embeddings, fast SwiGLU, and runtime fixes (#14884)
Add QuantizedEmbedding and EmbeddingLayer interface so models can use quantized embedding weights and expose tied output projections. This change updates gemma3, glm4_moe_lite, llama, qwen3, and qwen3_5 to use the new interface.
This commit is contained in:
@@ -71,6 +71,11 @@ func (t *Tokenizer) BOS() int32 {
	return t.vocab.BOS
}

// AddBOS returns whether a BOS token should be prepended during encoding.
func (t *Tokenizer) AddBOS() bool {
	return t.vocab.AddBOS
}

// EOS returns the first end of sequence token ID (for backwards compatibility)
func (t *Tokenizer) EOS() int32 {
	if len(t.vocab.EOS) > 0 {
Reference in New Issue
Block a user