model: add qwen3-next architecture (#14051)

2026-04-17 23:54:05 +02:00 · 2026-02-03 23:27:21 -08:00
parent ee25219edd
commit 77eb2ca619
29 changed files with 3614 additions and 12 deletions
--- a/server/quantization.go
+++ b/server/quantization.go
@@ -58,6 +58,48 @@ func useMoreBits(iLayer, nLayers int) bool {
 	return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
 }

+// qwen3nextQuantType picks the quantization type for qwen3next tensors by
+// name suffix. The second result is false when name matches none of the
+// suffixes listed here, in which case the returned type is the zero value.
+func qwen3nextQuantType(name string) (fsggml.TensorType, bool) {
+	// Suffixes quantized as Q6_K.
+	q6k := []string{
+		".attn_v.weight",
+		".ffn_down_exps.weight",
+		".ffn_down_shexp.weight",
+	}
+	// Suffixes quantized as Q4_K.
+	q4k := []string{
+		// Full attention
+		".attn_q.weight",
+		".attn_k.weight",
+		".attn_output.weight",
+		// Linear attention (Gated Delta Net) after split
+		".attn_qkv.weight",
+		".attn_gate.weight",
+		// SSM
+		".ssm_ba.weight",
+		".ssm_out.weight",
+		// MoE experts + shared experts
+		".ffn_gate_exps.weight",
+		".ffn_gate_shexp.weight",
+		".ffn_up_exps.weight",
+		".ffn_up_shexp.weight",
+	}
+
+	for _, suffix := range q6k {
+		if strings.HasSuffix(name, suffix) {
+			return fsggml.TensorTypeQ6_K, true
+		}
+	}
+	for _, suffix := range q4k {
+		if strings.HasSuffix(name, suffix) {
+			return fsggml.TensorTypeQ4_K, true
+		}
+	}
+	return 0, false
+}
+
 func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType, name string, shape []uint64, ftype fsggml.FileType) fsggml.TensorType {
 	// Ported from llama_tensor_get_type, removed unsupported quantization types
 	nExperts := max(1, kv.Uint("expert_count", 0))
@@ -217,6 +259,7 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil

 	// do not quantize expert gating tensors
 	quantize = quantize && !strings.Contains(name, "ffn_gate_inp.weight")
+	quantize = quantize && !strings.Contains(name, "ffn_gate_inp_shexp.weight")

 	// do not quantize positional embeddings and token types (BERT)
 	quantize = quantize && (name != "position_embd.weight")
@@ -244,6 +287,12 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil

 	newType := fsggml.TensorType(t.Kind)
 	if quantize {
+		if kv.Architecture() == "qwen3next" && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
+			if qt, ok := qwen3nextQuantType(name); ok {
+				return qt
+			}
+		}
+
 		// get more optimal quantization type based on the tensor shape, layer, etc.
 		newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype)
 		if newType != defaultType {