model: add qwen3-next architecture (#14051)

This commit is contained in:
Jeffrey Morgan
2026-02-03 23:27:21 -08:00
committed by GitHub
parent ee25219edd
commit 77eb2ca619
29 changed files with 3614 additions and 12 deletions

View File

@@ -58,6 +58,48 @@ func useMoreBits(iLayer, nLayers int) bool {
return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
}
// qwen3nextQuantType maps a qwen3next tensor name to a fixed quantization
// type. The second result reports whether the name was recognized; when it
// is false the returned type is the zero value and carries no meaning.
//
// A name is recognized when it ends in ".<kind>.weight" for one of the kinds
// listed below. The dot in front of <kind> is mandatory, so a bare
// "attn_q.weight" with no preceding component is not matched.
func qwen3nextQuantType(name string) (fsggml.TensorType, bool) {
	stem, isWeight := strings.CutSuffix(name, ".weight")
	if !isWeight {
		return 0, false
	}

	// The tensor kind is the last dot-separated component of what remains.
	dot := strings.LastIndexByte(stem, '.')
	if dot < 0 {
		return 0, false
	}

	switch stem[dot+1:] {
	case
		// Full attention: value projection
		"attn_v",
		// MoE experts + shared experts: down projections
		"ffn_down_exps", "ffn_down_shexp":
		return fsggml.TensorTypeQ6_K, true
	case
		// Full attention: remaining projections
		"attn_q", "attn_k", "attn_output",
		// Linear attention (Gated Delta Net) after split
		"attn_qkv", "attn_gate",
		// SSM
		"ssm_ba", "ssm_out",
		// MoE experts + shared experts: gate and up projections
		"ffn_gate_exps", "ffn_gate_shexp", "ffn_up_exps", "ffn_up_shexp":
		return fsggml.TensorTypeQ4_K, true
	}

	return 0, false
}
func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType, name string, shape []uint64, ftype fsggml.FileType) fsggml.TensorType {
// Ported from llama_tensor_get_type, removed unsupported quantization types
nExperts := max(1, kv.Uint("expert_count", 0))
@@ -217,6 +259,7 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
// do not quantize expert gating tensors
quantize = quantize && !strings.Contains(name, "ffn_gate_inp.weight")
quantize = quantize && !strings.Contains(name, "ffn_gate_inp_shexp.weight")
// do not quantize positional embeddings and token types (BERT)
quantize = quantize && (name != "position_embd.weight")
@@ -244,6 +287,12 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
newType := fsggml.TensorType(t.Kind)
if quantize {
if kv.Architecture() == "qwen3next" && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
if qt, ok := qwen3nextQuantType(name); ok {
return qt
}
}
// get more optimal quantization type based on the tensor shape, layer, etc.
newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype)
if newType != defaultType {