ggml update to b6840 (#12791)

2026-04-22 16:55:44 +02:00 · 2025-11-06 10:19:22 -08:00
parent c4ba257c64
commit 544b6739dd
103 changed files with 3644 additions and 1215 deletions
--- a/llama/llama.cpp/src/llama-quant.cpp
+++ b/llama/llama.cpp/src/llama-quant.cpp
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        });
    }

+    bool is_clip_model = false;
    for (const auto * it : tensors) {
        const struct ggml_tensor * tensor = it->tensor;

@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
            qs.has_output = true;
        }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
    }

    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
    {
        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
        // attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;

+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
        ggml_type new_type;
        void * new_data;
        size_t new_size;