gemma4: add Gemma 4 GGML model support

Add full Gemma 4 model family support (E2B, E4B, 26B MoE, 31B Dense)
for the GGML backend including text, vision, converter, parser, and
renderer.

Text model features:
- Sliding window + full attention with per-layer patterns
- KV sharing across layers with donor map
- Per-layer embeddings (PLE) with learned projections
- MoE routing with RMSNorm + learned scale
- Proportional RoPE with freq_factors for global attention
- Final logit softcapping

Vision model features:
- SigLIP vision encoder with 2D RoPE
- ClippableLinear with input/output clamping via packed v.clamp_data
- Adaptive average pooling with nMerge kernel
- Multi-modal projection with unweighted RMSNorm
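The ClippableLinear idea above is a linear layer whose inputs and outputs are clamped to fixed bounds. A rough sketch on scalars, assuming (hypothetically, for illustration) that the packed clamp data carries [inMin, inMax, outMin, outMax]:

```go
package main

import "fmt"

// clamp limits v to the closed interval [lo, hi].
func clamp(v, lo, hi float32) float32 {
	if v < lo {
		return lo
	}
	if v > hi {
		return hi
	}
	return v
}

// clippableLinear applies y = w*x + b with input and output clamping.
// bounds is an assumed packed layout: [inMin, inMax, outMin, outMax];
// the real v.clamp_data layout may differ.
func clippableLinear(x, w, b float32, bounds [4]float32) float32 {
	x = clamp(x, bounds[0], bounds[1])
	y := w*x + b
	return clamp(y, bounds[2], bounds[3])
}

func main() {
	bounds := [4]float32{-10, 10, -6, 6}
	// Input 100 is clamped to 10, then the output 10 is clamped to 6.
	fmt.Println(clippableLinear(100, 1, 0, bounds))
}
```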

Converter:
- Safetensors to GGUF with vision tensor renaming
- Fused MoE gate_up_proj splitting
- Vision patch embedding reshape (HF to Conv2D layout)
- Packed clamp data tensor for ClippableLinear bounds
- Proportional RoPE freq_factors generation
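The fused gate_up_proj split can be pictured as a row-wise split of a [2*ffnDim, hidden] weight into gate and up halves. A sketch on a flat row-major slice, assuming (as in common HF fused layouts) the gate rows come first:

```go
package main

import "fmt"

// splitGateUp splits a fused [2*ffnDim, hidden] weight, stored row-major
// as a flat slice, into its gate and up halves. Assumes gate rows precede
// up rows; the actual Gemma 4 layout should be checked against the converter.
func splitGateUp(fused []float32, ffnDim, hidden int) (gate, up []float32) {
	if len(fused) != 2*ffnDim*hidden {
		panic("unexpected fused tensor size")
	}
	gate = fused[:ffnDim*hidden]
	up = fused[ffnDim*hidden:]
	return gate, up
}

func main() {
	// 2 gate rows and 2 up rows, hidden size 3.
	fused := []float32{1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4}
	gate, up := splitGateUp(fused, 2, 3)
	fmt.Println(gate) // first half: gate rows
	fmt.Println(up)   // second half: up rows
}
```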

Also includes:
- BackendGet() on ml.Tensor for reading weight tensor data
- Q6_K CUDA get_rows kernel support
- MoE-aware ffn_down quantization layer counting
- Gemma4 parser with tool calling and thinking support
- Gemma4 renderer with structured tool format
- Architecture-based auto-detection of renderer/parser/stop tokens
- Gemma 4 entries added to the integration test model lists
Author: Daniel Hiltgen
Date: 2026-03-30 17:38:56 -07:00
Parent: f6b69f3f28
Commit: ea3c6a3cbe
20 changed files with 2273 additions and 8 deletions


@@ -153,7 +153,16 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 		// MLA tensors need higher precision to avoid quality degradation
 		newType = fsggml.TensorTypeQ8_0
 	} else if strings.Contains(name, "ffn_down") {
-		iLayer := qs.iFfnDown
+		// For MoE models, ffn_down.weight (dense) and ffn_down_exps.weight (expert) both
+		// exist per layer and should get the same useMoreBits treatment. Dense sorts before
+		// expert alphabetically, so dense increments the counter and expert uses counter-1.
+		var iLayer int
+		if strings.Contains(name, "_exps") {
+			iLayer = max(0, qs.iFfnDown-1)
+		} else {
+			iLayer = qs.iFfnDown
+			qs.iFfnDown++
+		}
 		n_layer := qs.nFfnDown
 		if ftype == fsggml.FileTypeQ4_K_M {
 			if useMoreBits(iLayer, n_layer) {
@@ -162,7 +171,6 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 		} else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 {
 			newType = fsggml.TensorTypeQ5_K
 		}
-		qs.iFfnDown++
 	} else if strings.Contains(name, "attn_output.weight") {
 		if nExperts == 8 {
 			if ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {