mirror of
https://github.com/ollama/ollama.git
synced 2026-04-18 14:54:11 +02:00
Add QuantizedEmbedding and EmbeddingLayer interface so models can use quantized embedding weights and expose tied output projections. This change updates gemma3, glm4_moe_lite, llama, qwen3, and qwen3_5 to use the new interface.
43 lines
1.0 KiB
Go
43 lines
1.0 KiB
Go
package model
|
|
|
|
import (
|
|
"github.com/ollama/ollama/x/mlxrunner/mlx"
|
|
"github.com/ollama/ollama/x/models/nn"
|
|
)
|
|
|
|
// MakeEmbeddingLayer constructs an embedding layer from a tensor map.
|
|
//
|
|
// For quantized tensors (path.weight + path.weight_scale), it returns a
|
|
// QuantizedEmbedding using the same quant metadata path that linear layers use.
|
|
// For non-quantized tensors, it returns a standard dense embedding.
|
|
func MakeEmbeddingLayer(
|
|
tensors map[string]*mlx.Array,
|
|
path string,
|
|
defaultGroupSize, defaultBits int,
|
|
defaultMode string,
|
|
tensorQuant map[string]*TensorQuantInfo,
|
|
) nn.EmbeddingLayer {
|
|
w := tensors[path+".weight"]
|
|
if w == nil {
|
|
return nil
|
|
}
|
|
|
|
scales := tensors[path+".weight_scale"]
|
|
if scales != nil {
|
|
qbiases := tensors[path+".weight_qbias"]
|
|
groupSize, bits, mode := ResolveLinearQuantParams(
|
|
defaultGroupSize,
|
|
defaultBits,
|
|
defaultMode,
|
|
tensorQuant,
|
|
path+".weight",
|
|
w,
|
|
scales,
|
|
)
|
|
|
|
return nn.NewQuantizedEmbedding(w, scales, qbiases, groupSize, bits, mode)
|
|
}
|
|
|
|
return nn.NewEmbedding(w)
|
|
}
|