diff --git a/x/create/gemma4.go b/x/create/gemma4.go
index 35e920077..2bb493db3 100644
--- a/x/create/gemma4.go
+++ b/x/create/gemma4.go
@@ -93,6 +93,13 @@ func (t gemma4ImportTransform) quantizationType(name string, shape []int32, quan
 		return ""
 	}
 
+	// MoE router logits choose the top-k expert set. Quantization noise here
+	// can flip expert selection, after which downstream activations diverge
+	// sharply. The tensor is small, so leave it in source precision.
+	if isGemma4RouterProjection(name) {
+		return ""
+	}
+
 	// Mixed-precision quantization: sensitive tensors get higher precision.
 	//
 	// Value projections (v_proj) directly determine attention output quality.
@@ -170,6 +177,12 @@ func isEmbedTokensWeight(name string) bool {
 		!strings.Contains(name, "per_layer")
 }
 
+func isGemma4RouterProjection(name string) bool {
+	return strings.HasSuffix(name, ".router.proj.weight") &&
+		!strings.Contains(name, "audio_tower") &&
+		!strings.Contains(name, "vision_tower")
+}
+
 func (t gemma4ImportTransform) transformTensor(td *safetensors.TensorData) ([]*safetensors.TensorData, error) {
 	if td == nil {
 		return nil, nil
diff --git a/x/create/gemma4_test.go b/x/create/gemma4_test.go
index 40b183162..858183db0 100644
--- a/x/create/gemma4_test.go
+++ b/x/create/gemma4_test.go
@@ -68,6 +68,11 @@ func TestGemma4QuantizationType(t *testing.T) {
 		{"expert gate_up nvfp4", transform26B, "model.layers.0.moe.experts.42.gate_up_proj.weight", aligned, "nvfp4", "nvfp4"},
 		{"expert gate_up mxfp4", transform26B, "model.layers.0.moe.experts.42.gate_up_proj.weight", aligned, "mxfp4", "mxfp4"},
 
+		// === Router projection: expert selection is sensitive; keep source precision ===
+		{"router proj int4", transform26B, "model.layers.0.router.proj.weight", aligned, "int4", ""},
+		{"router proj nvfp4", transform26B, "model.layers.0.router.proj.weight", aligned, "nvfp4", ""},
+		{"router proj mxfp4", transform26B, "model.layers.0.router.proj.weight", aligned, "mxfp4", ""},
+
 		// === k_proj: promoted only for 8-expert models ===
 		{"k_proj 128 experts int4", transform26B, "model.layers.0.self_attn.k_proj.weight", aligned, "int4", "int4"},
 		{"k_proj 8 experts int4", transform8E, "model.layers.0.self_attn.k_proj.weight", aligned, "int4", "int8"},