Keep Gemma4 router projection in source precision (#15613)

Daniel Hiltgen
2026-04-15 15:04:23 -07:00
committed by GitHub
parent cdddea0592
commit 5d920cc6bc
2 changed files with 18 additions and 0 deletions


@@ -93,6 +93,13 @@ func (t gemma4ImportTransform) quantizationType(name string, shape []int32, quan
return ""
}
// MoE router logits choose the top-k expert set. Quantization noise here
// can flip expert selection, after which downstream activations diverge
// sharply. The tensor is small, so leave it in source precision.
if isGemma4RouterProjection(name) {
return ""
}
// Mixed-precision quantization: sensitive tensors get higher precision.
//
// Value projections (v_proj) directly determine attention output quality.
@@ -170,6 +177,12 @@ func isEmbedTokensWeight(name string) bool {
!strings.Contains(name, "per_layer")
}
func isGemma4RouterProjection(name string) bool {
return strings.HasSuffix(name, ".router.proj.weight") &&
!strings.Contains(name, "audio_tower") &&
!strings.Contains(name, "vision_tower")
}
func (t gemma4ImportTransform) transformTensor(td *safetensors.TensorData) ([]*safetensors.TensorData, error) {
if td == nil {
return nil, nil

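For context on the comment in this hunk: the sketch below is a minimal, hypothetical illustration (not code from this commit) of why router logits are precision-sensitive. When two experts' logits are nearly tied, a perturbation on the order of quantization error changes which experts land in the top-k set. All names and values here are invented for demonstration.

package main

import (
	"fmt"
	"sort"
)

// topK returns the indices of the k largest logits, highest first.
func topK(logits []float32, k int) []int {
	idx := make([]int, len(logits))
	for i := range idx {
		idx[i] = i
	}
	sort.Slice(idx, func(a, b int) bool { return logits[idx[a]] > logits[idx[b]] })
	return idx[:k]
}

func main() {
	// Experts 1 and 2 are nearly tied; a sub-1% perturbation
	// (simulated quantization error) swaps which one is selected.
	exact := []float32{2.0, 0.501, 0.499, -3.0}
	noisy := []float32{2.0, 0.497, 0.503, -3.0}

	fmt.Println(topK(exact, 2)) // [0 1]
	fmt.Println(topK(noisy, 2)) // [0 2] — the top-k expert set flips
}

Once the selected expert set differs, the two forward passes route tokens through different weights, which is why the divergence compounds rather than averaging out like ordinary quantization noise.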

@@ -68,6 +68,11 @@ func TestGemma4QuantizationType(t *testing.T) {
{"expert gate_up nvfp4", transform26B, "model.layers.0.moe.experts.42.gate_up_proj.weight", aligned, "nvfp4", "nvfp4"},
{"expert gate_up mxfp4", transform26B, "model.layers.0.moe.experts.42.gate_up_proj.weight", aligned, "mxfp4", "mxfp4"},
// === Router projection: expert selection is sensitive; keep source precision ===
{"router proj int4", transform26B, "model.layers.0.router.proj.weight", aligned, "int4", ""},
{"router proj nvfp4", transform26B, "model.layers.0.router.proj.weight", aligned, "nvfp4", ""},
{"router proj mxfp4", transform26B, "model.layers.0.router.proj.weight", aligned, "mxfp4", ""},
// === k_proj: promoted only for 8-expert models ===
{"k_proj 128 experts int4", transform26B, "model.layers.0.self_attn.k_proj.weight", aligned, "int4", "int4"},
{"k_proj 8 experts int4", transform8E, "model.layers.0.self_attn.k_proj.weight", aligned, "int4", "int8"},