package convert import ( "strings" "testing" ) func TestGemma4AudioReplacements(t *testing.T) { p := gemma4Model{} r := strings.NewReplacer(p.Replacements()...) tests := []struct { name string in string want string }{ // SSCP convolution blocks { "sscp conv0 weight", "model.audio_tower.subsample_conv_projection.conv_0.conv.weight", "a.conv1d.0.weight", }, { "sscp conv0 norm", "model.audio_tower.subsample_conv_projection.conv_0.norm.weight", "a.conv1d.0.norm.weight", }, { "sscp conv1 weight", "model.audio_tower.subsample_conv_projection.conv_1.conv.weight", "a.conv1d.1.weight", }, { "sscp input proj weight", "model.audio_tower.subsample_conv_projection.input_proj_linear.weight", "a.pre_encode.out.weight", }, { "sscp input proj bias", "model.audio_tower.subsample_conv_projection.input_proj_linear.bias", "a.pre_encode.out.bias", }, { "sscp layer0 conv weight (new naming)", "model.audio_tower.subsample_conv_projection.layer0.conv.weight", "a.conv1d.0.weight", }, { "sscp layer1 norm weight (new naming)", "model.audio_tower.subsample_conv_projection.layer1.norm.weight", "a.conv1d.1.norm.weight", }, // Conformer attention { "attn q weight", "model.audio_tower.conformer.0.attention.attn.q_proj.linear.weight", "a.blk.0.attn_q.weight", }, { "attn k weight", "model.audio_tower.conformer.5.attention.attn.k_proj.linear.weight", "a.blk.5.attn_k.weight", }, { "attn v clamp input_min", "model.audio_tower.conformer.0.attention.attn.v_proj.input_min", "a.blk.0.attn_v.input_min", }, { "attn out weight (ClippableLinear)", "model.audio_tower.conformer.0.attention.post.linear.weight", "a.blk.0.attn_out.weight", }, { "attn out clamp output_max", "model.audio_tower.conformer.0.attention.post.output_max", "a.blk.0.attn_out.output_max", }, { "attn pre norm", "model.audio_tower.conformer.0.attention.pre_attn_norm.weight", "a.blk.0.ln1.weight", }, { "attn post norm", "model.audio_tower.conformer.0.attention.post_norm.weight", "a.blk.0.ln2.weight", }, { "linear pos", "model.audio_tower.conformer.0.attention.attn.relative_position_embedding.pos_proj.weight", "a.blk.0.linear_pos.weight", }, { "per dim scale", "model.audio_tower.conformer.0.attention.attn.per_dim_scale", "a.blk.0.per_dim_scale", }, { "per dim key scale", "model.audio_tower.conformer.0.attention.attn.per_dim_key_scale", "a.blk.0.per_dim_k_scale", }, { "attn relative k proj (new naming)", "model.audio_tower.layers.0.self_attn.relative_k_proj.weight", "a.blk.0.linear_pos.weight", }, { "attn pre norm (new naming)", "model.audio_tower.layers.0.norm_pre_attn.weight", "a.blk.0.ln1.weight", }, { "attn post norm (new naming)", "model.audio_tower.layers.0.norm_post_attn.weight", "a.blk.0.ln2.weight", }, { "attn out clamp output_max (new naming)", "model.audio_tower.layers.0.self_attn.post.output_max", "a.blk.0.attn_out.output_max", }, { "per dim scale (new naming)", "model.audio_tower.layers.0.self_attn.per_dim_scale", "a.blk.0.per_dim_scale", }, // Conformer feedforward start { "ffn up weight", "model.audio_tower.conformer.0.ffw_layer_start.ffw_layer_1.linear.weight", "a.blk.0.ffn_up.weight", }, { "ffn down weight", "model.audio_tower.conformer.0.ffw_layer_start.ffw_layer_2.linear.weight", "a.blk.0.ffn_down.weight", }, { "ffn norm", "model.audio_tower.conformer.0.ffw_layer_start.pre_layer_norm.weight", "a.blk.0.ffn_norm.weight", }, { "ffn post norm", "model.audio_tower.conformer.0.ffw_layer_start.post_layer_norm.weight", "a.blk.0.ffn_post_norm.weight", }, // Conformer feedforward end { "ffn up 1 weight", "model.audio_tower.conformer.0.ffw_layer_end.ffw_layer_1.linear.weight", "a.blk.0.ffn_up_1.weight", }, { "ffn down 1 weight", "model.audio_tower.conformer.0.ffw_layer_end.ffw_layer_2.linear.weight", "a.blk.0.ffn_down_1.weight", }, { "ffn norm 1", "model.audio_tower.conformer.0.ffw_layer_end.pre_layer_norm.weight", "a.blk.0.ffn_norm_1.weight", }, { "ffn post norm 1", "model.audio_tower.conformer.0.ffw_layer_end.post_layer_norm.weight", "a.blk.0.ffn_post_norm_1.weight", }, { "ffn up output_max (new naming)", "model.audio_tower.layers.10.feed_forward1.ffw_layer_1.output_max", "a.blk.10.ffn_up.output_max", }, { "ffn down output_min (new naming)", "model.audio_tower.layers.0.feed_forward1.ffw_layer_2.output_min", "a.blk.0.ffn_down.output_min", }, { "ffn up 1 input_max (new naming)", "model.audio_tower.layers.0.feed_forward2.ffw_layer_1.input_max", "a.blk.0.ffn_up_1.input_max", }, { "ffn norm 1 (new naming)", "model.audio_tower.layers.0.feed_forward2.pre_layer_norm.weight", "a.blk.0.ffn_norm_1.weight", }, // Conformer lightweight conv1d { "conv dw weight", "model.audio_tower.conformer.0.lconv1d.depthwise_conv1d.weight", "a.blk.0.conv_dw.weight", }, { "conv norm (pre_layer_norm)", "model.audio_tower.conformer.0.lconv1d.pre_layer_norm.weight", "a.blk.0.conv_norm.weight", }, { "norm conv (conv_norm)", "model.audio_tower.conformer.0.lconv1d.conv_norm.weight", "a.blk.0.norm_conv.weight", }, { "conv pw1 weight", "model.audio_tower.conformer.0.lconv1d.linear_start.linear.weight", "a.blk.0.conv_pw1.weight", }, { "conv pw2 weight", "model.audio_tower.conformer.0.lconv1d.linear_end.linear.weight", "a.blk.0.conv_pw2.weight", }, // Audio embedder { "audio embedder projection weight", "model.embed_audio.embedding_projection.linear.weight", "mm.a.input_projection.weight", }, { "audio embedder projection bias", "model.embed_audio.embedding_projection.linear.bias", "mm.a.input_projection.bias", }, // Audio output projection { "audio output proj weight", "model.audio_tower.output_proj.weight", "mm.a.fc.weight", }, { "audio output proj bias", "model.audio_tower.output_proj.bias", "mm.a.fc.bias", }, // Verify vision tensors still work { "vision q weight", "model.vision_tower.encoder.layers.0.self_attn.q_proj.linear.weight", "v.blk.0.attn_q.weight", }, { "vision std bias", "model.vision_tower.std_bias", "v.std_bias", }, { "vision std scale", "model.vision_tower.std_scale", "v.std_scale", }, { "vision patch embd", "model.vision_tower.patch_embedder.input_proj.weight", "v.patch_embd.weight", }, { "vision projector", "model.embed_vision.embedding_projection.linear.weight", "mm.input_projection.weight", }, // Verify text tensors still work { "text attn q", "model.language_model.layers.0.self_attn.q_proj.weight", "blk.0.attn_q.weight", }, { "text token embd", "model.language_model.embed_tokens.weight", "token_embd.weight", }, { "text moe gate up fused", "model.language_model.layers.0.experts.gate_up_proj", "blk.0.ffn_gate_up_exps.weight", }, { "text moe down", "model.language_model.layers.0.experts.down_proj", "blk.0.ffn_down_exps.weight", }, { "text moe down with weight suffix", "model.language_model.layers.0.experts.down_proj.weight", "blk.0.ffn_down_exps.weight", }, { "text moe per expert scale", "model.language_model.layers.0.router.per_expert_scale", "blk.0.ffn_down_exps.scale", }, { "text moe per expert scale with weight suffix", "model.language_model.layers.0.router.per_expert_scale.weight", "blk.0.ffn_down_exps.scale", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { if got := r.Replace(tt.in); got != tt.want { t.Errorf("Replace(%q) = %q, want %q", tt.in, got, tt.want) } }) } }