Update vendor ggml code to a5bb8ba4 (#13832)

Co-authored-by: Daniel Hiltgen <daniel@ollama.com> Co-authored-by: Gabe Goodhart <ghart@us.ibm.com> Co-authored-by: Shalini Salomi Bodapati <Shalini.Salomi.Bodapati@ibm.com>
2026-04-22 13:55:25 +02:00 · 2026-02-02 17:31:59 -08:00
parent 8f4a008139
commit ef00199fb4
241 changed files with 21271 additions and 5074 deletions
--- a/llama/llama.cpp/src/llama-hparams.h
+++ b/llama/llama.cpp/src/llama-hparams.h
@@ -3,6 +3,7 @@
 #include "llama.h"

 #include <array>
+#include <cassert>

 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
@@ -52,8 +53,8 @@ struct llama_hparams {
    uint32_t n_rel_attn_bkts = 0;

    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
-    uint32_t n_embd_head_k_mla = 0;
-    uint32_t n_embd_head_v_mla = 0;
+    uint32_t n_embd_head_k_mla_impl = 0;
+    uint32_t n_embd_head_v_mla_impl = 0;

    // for WavTokenizer
    struct llama_hparams_posnet   posnet;
@@ -107,9 +108,9 @@ struct llama_hparams {

    float    rope_attn_factor = 1.0f;
    float    rope_freq_base_train;
-    float    rope_freq_base_train_swa;
+    float    rope_freq_base_train_swa  = 10000.0f;
    float    rope_freq_scale_train;
-    float    rope_freq_scale_train_swa;
+    float    rope_freq_scale_train_swa = 1.0f;

    uint32_t n_ctx_orig_yarn;
    float    rope_yarn_log_mul = 0.0f;
@@ -125,10 +126,11 @@ struct llama_hparams {
    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
    // the size of the sliding window (0 - no SWA)
    uint32_t n_swa = 0;
-    // if swa_layers[il] == true, then layer il is SWA
-    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
+    // if swa_layers[il] == 1, then layer il is SWA
+    // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
    // by default, all layers are dense
-    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
+    // note: using uint32_t type for compatibility reason
+    std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;

    // for State Space Models
    uint32_t ssm_d_conv  = 0;
@@ -163,6 +165,9 @@ struct llama_hparams {
    // for Classifiers
    uint32_t n_cls_out = 1;

+    // output embedding dimension (0 = use n_embd)
+    uint32_t n_embd_out_impl = 0;
+
    // llama4 smallthinker
    uint32_t n_moe_layer_step        = 0;
    uint32_t n_no_rope_layer_step    = 4;
@@ -235,6 +240,9 @@ struct llama_hparams {
    // dimension of main + auxiliary input embeddings
    uint32_t n_embd_inp() const;

+    // dimension of output embeddings
+    uint32_t n_embd_out() const;
+
    // dimension of key embeddings across all k-v heads
    uint32_t n_embd_k_gqa(uint32_t il = 0) const;

@@ -266,15 +274,57 @@ struct llama_hparams {

    bool is_swa(uint32_t il) const;

+    // note: currently only support if either all or none of the layers are MLA
+    bool is_mla() const;
+
+    uint32_t n_embd_head_k_mla() const;
+    uint32_t n_embd_head_v_mla() const;
+
    bool has_kv(uint32_t il) const;

    // number of layers for which has_kv() returns true
    uint32_t n_layer_kv() const;

    // note that this function uses different SWA parameters from those in the hparams
+    // note: inlined on purpose for performance reasons
    // TODO: think of a better place for this function
    // TODO: pack the SWA params in a struct?
-    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
+    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
+        assert(p0 >= 0 && p1 >= 0);
+
+        switch (swa_type) {
+            case LLAMA_SWA_TYPE_NONE:
+                {
+                } break;
+            case LLAMA_SWA_TYPE_STANDARD:
+                {
+                    if (p1 - p0 >= (int32_t) n_swa) {
+                        return true;
+                    }
+                } break;
+            case LLAMA_SWA_TYPE_CHUNKED:
+                {
+                    const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
+
+                    if (p0 < pos_chunk_start) {
+                        return true;
+                    }
+                } break;
+            case LLAMA_SWA_TYPE_SYMMETRIC:
+                {
+                    const int32_t half_n_swa = (int32_t) n_swa / 2;
+                    const int32_t pos_diff = p1 - p0;
+
+                    // Mask if outside the symmetric window
+                    if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
+                        return true;
+                    }
+                } break;
+        }
+
+        return false;
+    }
+

    bool use_mrope() const;
 };