mirror of
https://github.com/ollama/ollama.git
synced 2026-04-17 23:54:05 +02:00
model: add qwen3-next architecture (#14051)
This commit is contained in:
@@ -58,6 +58,48 @@ func useMoreBits(iLayer, nLayers int) bool {
|
||||
return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
|
||||
}
|
||||
|
||||
func qwen3nextQuantType(name string) (fsggml.TensorType, bool) {
|
||||
switch {
|
||||
// Full attention
|
||||
case strings.HasSuffix(name, ".attn_q.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
case strings.HasSuffix(name, ".attn_k.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
case strings.HasSuffix(name, ".attn_v.weight"):
|
||||
return fsggml.TensorTypeQ6_K, true
|
||||
case strings.HasSuffix(name, ".attn_output.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
|
||||
// Linear attention (Gated Delta Net) after split
|
||||
case strings.HasSuffix(name, ".attn_qkv.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
case strings.HasSuffix(name, ".attn_gate.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
|
||||
// SSM
|
||||
case strings.HasSuffix(name, ".ssm_ba.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
case strings.HasSuffix(name, ".ssm_out.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
|
||||
// MoE experts + shared experts
|
||||
case strings.HasSuffix(name, ".ffn_down_exps.weight"):
|
||||
return fsggml.TensorTypeQ6_K, true
|
||||
case strings.HasSuffix(name, ".ffn_down_shexp.weight"):
|
||||
return fsggml.TensorTypeQ6_K, true
|
||||
case strings.HasSuffix(name, ".ffn_gate_exps.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
case strings.HasSuffix(name, ".ffn_gate_shexp.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
case strings.HasSuffix(name, ".ffn_up_exps.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
case strings.HasSuffix(name, ".ffn_up_shexp.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
}
|
||||
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType, name string, shape []uint64, ftype fsggml.FileType) fsggml.TensorType {
|
||||
// Ported from llama_tensor_get_type, removed unsupported quantization types
|
||||
nExperts := max(1, kv.Uint("expert_count", 0))
|
||||
@@ -217,6 +259,7 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
|
||||
|
||||
// do not quantize expert gating tensors
|
||||
quantize = quantize && !strings.Contains(name, "ffn_gate_inp.weight")
|
||||
quantize = quantize && !strings.Contains(name, "ffn_gate_inp_shexp.weight")
|
||||
|
||||
// do not quantize positional embeddings and token types (BERT)
|
||||
quantize = quantize && (name != "position_embd.weight")
|
||||
@@ -244,6 +287,12 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
|
||||
|
||||
newType := fsggml.TensorType(t.Kind)
|
||||
if quantize {
|
||||
if kv.Architecture() == "qwen3next" && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
|
||||
if qt, ok := qwen3nextQuantType(name); ok {
|
||||
return qt
|
||||
}
|
||||
}
|
||||
|
||||
// get more optimal quantization type based on the tensor shape, layer, etc.
|
||||
newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype)
|
||||
if newType != defaultType {
|
||||
|
||||
Reference in New Issue
Block a user