Revert "Update vendored llama.cpp to b7847" (#14061)

Jeffrey Morgan
2026-02-03 18:39:36 -08:00
committed by GitHub
parent a6355329bf
commit b1fccabb34
240 changed files with 5050 additions and 21247 deletions


@@ -7,13 +7,11 @@
 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
 #include "llama-memory-hybrid.h"
-#include "llama-memory-hybrid-iswa.h"
 #include "llama-memory-recurrent.h"
 
 #include <cassert>
 #include <cmath>
 #include <cstring>
-#include <unordered_set>
 
 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
     if (ubatch->token) {
@@ -23,8 +21,7 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
     }
 
     if (ubatch->embd) {
-        GGML_ASSERT(n_embd == embd->ne[0]);
-
+        const int64_t n_embd   = embd->ne[0];
         const int64_t n_tokens = ubatch->n_tokens;
 
         ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
@@ -34,8 +31,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
 bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
     bool res = true;
 
-    res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
-    res &= (!params.ubatch.embd)  || (embd   && embd->ne[1]   == params.ubatch.n_tokens);
+    res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
+    res &= (!embd   && !params.ubatch.embd)  || (embd   && embd->ne[0]   == params.ubatch.n_tokens);
 
     return res;
 }
@@ -65,7 +62,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
 bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
     bool res = true;
 
-    res &= pos->ne[0] == params.ubatch.n_tokens*n_pos_per_embd;
+    res &= pos->ne[0] == params.ubatch.n_tokens;
 
     return res;
 }
@@ -98,9 +95,11 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
     int32_t * data = (int32_t *) pos_bucket->data;
 
-    for (int j = 0; j < n_tokens; ++j) {
-        for (int i = 0; i < n_tokens; ++i) {
-            data[j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
+    for (int h = 0; h < 1; ++h) {
+        for (int j = 0; j < n_tokens; ++j) {
+            for (int i = 0; i < n_tokens; ++i) {
+                data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
+            }
         }
     }
 }
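As an aside on the restored indexing scheme: the `h` loop runs exactly once, so the bucket and mask buffers are laid out as flat [1, n_tokens, n_tokens] volumes in row-major order. A standalone sketch of the indexing convention (illustrative only, not part of the diff):

    // flat row-major index into a [n_h, n_j, n_i] buffer;
    // in the code above n_h == 1, j is the query row, i is the key column
    static inline int64_t flat_index(int64_t h, int64_t j, int64_t i, int64_t n_j, int64_t n_i) {
        return h*(n_j*n_i) + j*n_i + i;
    }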
@@ -323,32 +322,34 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     const int64_t n_tokens = ubatch->n_tokens;
 
     const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
-        for (int i1 = 0; i1 < n_tokens; ++i1) {
-            const llama_seq_id s1 = ubatch->seq_id[i1][0];
-            const llama_pos    p1 = ubatch->pos[i1];
+        for (int h = 0; h < 1; ++h) {
+            for (int i1 = 0; i1 < n_tokens; ++i1) {
+                const llama_seq_id s1 = ubatch->seq_id[i1][0];
+                const llama_pos    p1 = ubatch->pos[i1];
 
-            const uint64_t idst = i1*n_kv;
+                const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
 
-            for (int i0 = 0; i0 < n_tokens; ++i0) {
-                const llama_seq_id s0 = ubatch->seq_id[i0][0];
-                const llama_pos    p0 = ubatch->pos[i0];
+                for (int i0 = 0; i0 < n_tokens; ++i0) {
+                    const llama_seq_id s0 = ubatch->seq_id[i0][0];
+                    const llama_pos    p0 = ubatch->pos[i0];
 
-                // mask different sequences
-                if (s0 != s1) {
-                    continue;
-                }
+                    // mask different sequences
+                    if (s0 != s1) {
+                        continue;
+                    }
 
-                // mask future tokens
-                if (cparams.causal_attn && p0 > p1) {
-                    continue;
-                }
+                    // mask future tokens
+                    if (cparams.causal_attn && p0 > p1) {
+                        continue;
+                    }
 
-                // apply SWA if any
-                if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
-                    continue;
-                }
+                    // apply SWA if any
+                    if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+                        continue;
+                    }
 
-                data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
+                    data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
+                }
             }
         }
     };
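For orientation, the three `continue` branches in fill_mask above implement one combined policy: a key/query pair stays unmasked only if it belongs to the same sequence, is not in the future under causal attention, and falls inside the sliding window. A minimal standalone sketch of that predicate (hypothetical helper, assuming the common SWA convention that positions more than n_swa tokens back are masked):

    // returns true when the (p0, p1) pair must be masked out
    static bool masked(llama_seq_id s0, llama_seq_id s1,
                       llama_pos p0, llama_pos p1,
                       bool causal, int n_swa) {
        if (s0 != s1)                      return true; // cross-sequence
        if (causal && p0 > p1)             return true; // future token
        if (n_swa > 0 && p1 - p0 >= n_swa) return true; // outside the window
        return false;
    }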
@@ -407,27 +408,6 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
     return res;
 }
 
-void llm_graph_input_attn_k::set_input(const llama_ubatch * ubatch) {
-    mctx->set_input_k_idxs(self_k_idxs, ubatch);
-
-    mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
-}
-
-bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
-    const auto * mctx = static_cast<const llama_kv_cache_context *>(params.mctx);
-
-    this->mctx = mctx;
-
-    bool res = true;
-
-    res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
-
-    res &= self_kq_mask->ne[0] == mctx->get_n_kv();
-    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
-
-    return res;
-}
-
 void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
     mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
     mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
@@ -473,19 +453,27 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
         float * data = (float *) cross_kq_mask->data;
 
-        for (int i = 0; i < n_tokens; ++i) {
-            for (int j = 0; j < n_enc; ++j) {
-                float f = -INFINITY;
+        for (int h = 0; h < 1; ++h) {
+            for (int i = 0; i < n_tokens; ++i) {
+                for (int j = 0; j < n_enc; ++j) {
+                    float f = -INFINITY;
 
-                for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
-                    const llama_seq_id seq_id = ubatch->seq_id[i][s];
+                    for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+                        const llama_seq_id seq_id = ubatch->seq_id[i][s];
 
-                    if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
-                        f = 0.0f;
-                    }
-                }
+                        if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
+                            f = 0.0f;
+                        }
+                    }
 
-                data[i*n_enc + j] = f;
+                    data[h*(n_enc*n_tokens) + i*n_enc + j] = f;
+                }
             }
-        }
+
+            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                for (int j = 0; j < n_enc; ++j) {
+                    data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
+                }
+            }
+        }
     }
 }
@@ -533,113 +521,6 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
     return res;
 }
 
-void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
-    const auto * attn_ctx = mctx->get_attn();
-
-    // base tensors may not be allocated if there are no non-SWA attention layers
-    if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
-        attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
-        attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
-
-        attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
-    }
-
-    // swa tensors may not be allocated if there are no SWA attention layers
-    if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
-        attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch);
-        attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
-
-        attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
-    }
-
-    const int64_t n_rs = mctx->get_recr()->get_n_rs();
-
-    if (inp_rs->s_copy) {
-        GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
-        int32_t * data = (int32_t *) inp_rs->s_copy->data;
-
-        // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
-        for (uint32_t i = 0; i < n_rs; ++i) {
-            data[i] = mctx->get_recr()->s_copy(i);
-        }
-    }
-}
-
-bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params) {
-    const auto * mctx = static_cast<const llama_memory_hybrid_iswa_context *>(params.mctx);
-
-    this->mctx = mctx;
-
-    bool res = true;
-
-    const auto * attn_ctx = mctx->get_attn();
-
-    // base tensors may not be allocated if there are no non-SWA attention layers
-    if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
-        res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
-        //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
-
-        res &= inp_attn->self_kq_mask->ne[0] == attn_ctx->get_base()->get_n_kv();
-        res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
-    }
-
-    // swa tensors may not be allocated if there are no SWA attention layers
-    if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
-        res &= inp_attn->self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
-        //res &= inp_attn->self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
-
-        res &= inp_attn->self_kq_mask_swa->ne[0] == attn_ctx->get_swa()->get_n_kv();
-        res &= inp_attn->self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
-    }
-
-    res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
-
-    res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
-    res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
-
-    res &= inp_rs->head == mctx->get_recr()->get_head();
-    res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
-
-    return res;
-}
-
-void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
-    // set the inputs only for the active samplers in the current ubatch
-    std::unordered_set<llama_seq_id> active_samplers;
-    for (uint32_t i = 0; i < ubatch->n_tokens; i++) {
-        if (ubatch->output[i]) {
-            llama_seq_id seq_id = ubatch->seq_id[i][0];
-            active_samplers.insert(seq_id);
-        }
-    }
-
-    for (auto seq_id : active_samplers) {
-        if (samplers.find(seq_id) == samplers.end()) {
-            continue;
-        }
-        auto & sampler = samplers[seq_id];
-        if (sampler->iface->backend_set_input) {
-            sampler->iface->backend_set_input(sampler);
-        }
-    }
-}
-
-bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) {
-    if (samplers.size() != params.samplers.size()) {
-        return false;
-    }
-
-    for (const auto & [seq_id, sampler] : params.samplers) {
-        if (samplers[seq_id] != sampler) {
-            return false;
-        }
-    }
-    return true;
-}
-
 //
 // llm_graph_result
 //
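The removed llm_graph_input_sampling::can_reuse above compares the recorded sampler map against the incoming one entry by entry. A generic standalone sketch of that check (illustrative; note it avoids operator[], which would insert a missing key into a non-const map):

    #include <map>

    template <typename K, typename V>
    static bool same_entries(const std::map<K, V> & a, const std::map<K, V> & b) {
        if (a.size() != b.size()) return false;
        for (const auto & [k, v] : a) {
            const auto it = b.find(k);
            if (it == b.end() || it->second != v) return false; // missing or different
        }
        return true;
    }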
@@ -656,15 +537,10 @@ int64_t llm_graph_result::get_max_nodes() const {
 }
 
 void llm_graph_result::reset() {
-    t_inp_tokens = nullptr;
-    t_inp_embd   = nullptr;
+    t_tokens      = nullptr;
     t_logits      = nullptr;
     t_embd        = nullptr;
     t_embd_pooled = nullptr;
 
-    t_sampled.clear();
-    t_sampled_probs.clear();
-    t_sampled_logits.clear();
-    t_candidates.clear();
-
     params = {};
@@ -689,38 +565,6 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
     }
 }
 
-void llm_graph_result::set_outputs() {
-    if (t_logits != nullptr) {
-        ggml_set_output(t_logits);
-    }
-
-    if (t_embd != nullptr) {
-        ggml_set_output(t_embd);
-    }
-
-    if (t_embd_pooled != nullptr) {
-        ggml_set_output(t_embd_pooled);
-    }
-
-    for (auto & [seq_id, t] : t_sampled) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-
-    for (auto & [seq_id, t] : t_sampled_probs) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-
-    for (auto & [seq_id, t] : t_sampled_logits) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-
-    for (auto & [seq_id, t] : t_candidates) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-}
-
 bool llm_graph_result::can_reuse(const llm_graph_params & params) {
     if (!this->params.allow_reuse(params)) {
         if (debug > 1) {
@@ -802,7 +646,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     loras            (params.loras),
    mctx             (params.mctx),
     cross            (params.cross),
-    samplers         (params.samplers),
     cb_func          (params.cb),
     res              (params.res),
     ctx0             (res->get_ctx()),
@@ -1361,29 +1204,17 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
-    const int64_t n_embd_inp = hparams.n_embd_inp();
-    const int64_t n_embd     = hparams.n_embd;
-
-    assert(n_embd_inp >= n_embd);
-
-    auto inp = std::make_unique<llm_graph_input_embd>(n_embd_inp);
-
-    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
-    cb(inp->tokens, "inp_tokens", -1);
-    ggml_set_input(inp->tokens);
-    res->t_inp_tokens = inp->tokens;
-
-    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, ubatch.n_tokens);
-    cb(inp->embd, "inp_embd", -1);
-    ggml_set_input(inp->embd);
-
-    // select one of the 2 inputs, based on the batch contents
-    // ref: https://github.com/ggml-org/llama.cpp/pull/18550
-    std::array<ggml_tensor *, 2> inps;
-
-    // token embeddings path (ubatch.token != nullptr)
-    {
-        auto & cur = inps[0];
+    const int64_t n_embd = hparams.n_embd_inp();
+
+    auto inp = std::make_unique<llm_graph_input_embd>();
+
+    ggml_tensor * cur = nullptr;
 
+    if (ubatch.token) {
+        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+        //cb(inp->tokens, "inp_tokens", -1);
+        ggml_set_input(inp->tokens);
+        res->t_tokens = inp->tokens;
 
         cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
@@ -1404,43 +1235,22 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
             cur = ggml_add(ctx0, cur, inpL_delta);
         }
 
-        if (n_embd_inp != n_embd) {
-            cur = ggml_pad(ctx0, cur, hparams.n_embd_inp() - n_embd, 0, 0, 0);
-        }
-    }
-
-    // vector embeddings path (ubatch.embd != nullptr)
-    {
-        auto & cur = inps[1];
-
+    } else {
+        inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
+        ggml_set_input(inp->embd);
+
         cur = inp->embd;
     }
 
-    assert(ggml_are_same_shape (inps[0], inps[1]));
-    assert(ggml_are_same_stride(inps[0], inps[1]));
-
-    ggml_tensor * cur = ggml_build_forward_select(gf, inps.data(), inps.size(), ubatch.token ? 0 : 1);
-
-    if (n_embd_inp != n_embd) {
-        cur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0);
-    }
-
-    res->t_inp_embd = cur;
-
     // For Granite architecture
     if (hparams.f_embedding_scale != 0.0f) {
         cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
     }
 
-    cb(cur, "embd", -1);
+    cb(cur, "inp_embd", -1);
 
     res->add_input(std::move(inp));
 
-    // make sure the produced embeddings are immediately materialized in the ggml graph
-    // ref: https://github.com/ggml-org/llama.cpp/pull/18599
-    ggml_build_forward_expand(gf, cur);
-
     return cur;
 }
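To make the restored token path above concrete: ggml_get_rows() gathers one embedding row per token id from the embedding matrix. A plain-C++ stand-in for what it computes (assumed row-major [n_vocab, n_embd] layout; illustrative only, not the ggml implementation):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    static std::vector<float> get_rows(const std::vector<float>   & tok_embd, // n_vocab*n_embd
                                       const std::vector<int32_t> & tokens,
                                       int64_t n_embd) {
        std::vector<float> out(tokens.size()*n_embd);
        for (size_t t = 0; t < tokens.size(); ++t) {
            // copy the embedding row of token t into output row t
            std::copy_n(tok_embd.begin() + tokens[t]*n_embd, n_embd, out.begin() + t*n_embd);
        }
        return out;
    }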
@@ -1532,7 +1342,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
     //}
 
     const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
-    const auto n_enc  = !cross->v_embd.empty() ? cross->n_enc  : hparams.n_ctx_train;
+    const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
 
     cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
     ggml_set_input(cur);
@@ -1630,11 +1440,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
                 hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
         cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
 
-        if (!cparams.offload_kqv) {
-            // all nodes between the KV store and the attention output are run on the CPU
-            ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu);
-        }
-
         ggml_flash_attn_ext_add_sinks(cur, sinks);
         ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
@@ -1844,11 +1649,9 @@ ggml_tensor * llm_graph_context::build_attn(
         ggml_tensor * v_cur,
         ggml_tensor * kq_b,
         ggml_tensor * sinks,
-        ggml_tensor * v_mla, // TODO: remove
+        ggml_tensor * v_mla,
         float kq_scale,
         int il) const {
-    GGML_ASSERT(v_mla == nullptr);
-
     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
     // expand k later to enable rope fusion which directly writes into k-v cache
@@ -1891,93 +1694,6 @@ ggml_tensor * llm_graph_context::build_attn(
     return cur;
 }
 
-static std::unique_ptr<llm_graph_input_attn_k> build_attn_inp_k_impl(
-           ggml_context * ctx0,
-     const llama_ubatch & ubatch,
-    const llama_hparams & hparams,
-    const llama_cparams & cparams,
-    const llama_kv_cache_context * mctx_cur) {
-
-    auto inp = std::make_unique<llm_graph_input_attn_k>(hparams, cparams, mctx_cur);
-
-    {
-        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
-
-        const auto n_kv     = mctx_cur->get_n_kv();
-        const auto n_tokens = ubatch.n_tokens;
-        const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
-
-        inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
-
-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
-        ggml_set_input(inp->self_kq_mask);
-
-        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
-    }
-
-    return inp;
-}
-
-llm_graph_input_attn_k * llm_graph_context::build_attn_inp_k() const {
-    const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
-
-    auto inp = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
-
-    return (llm_graph_input_attn_k *) res->add_input(std::move(inp));
-}
-
-ggml_tensor * llm_graph_context::build_attn(
-        llm_graph_input_attn_k * inp,
-        ggml_tensor * wo,
-        ggml_tensor * wo_b,
-        ggml_tensor * q_cur,
-        ggml_tensor * k_cur,
-        ggml_tensor * v_cur,
-        ggml_tensor * kq_b,
-        ggml_tensor * sinks,
-        ggml_tensor * v_mla,
-        float kq_scale,
-        int il) const {
-    // these nodes are added to the graph together so that they are not reordered
-    // by doing so, the number of splits in the graph is reduced
-    // expand k later to enable rope fusion which directly writes into k-v cache
-    ggml_build_forward_expand(gf, q_cur);
-    ggml_build_forward_expand(gf, v_cur);
-    ggml_build_forward_expand(gf, k_cur);
-
-    const auto * mctx_cur = inp->mctx;
-
-    // store to KV cache
-    {
-        const auto & k_idxs = inp->get_k_idxs();
-
-        ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
-    }
-
-    const auto & kq_mask = inp->get_kq_mask();
-
-    ggml_tensor * q = q_cur;
-    ggml_tensor * k = mctx_cur->get_k(ctx0, il);
-    ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
-
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
-    cb(cur, "kqv_out", il);
-
-    if (wo) {
-        cur = build_lora_mm(wo, cur);
-        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
-            // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
-            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
-        }
-    }
-
-    if (wo_b) {
-        cur = ggml_add(ctx0, cur, wo_b);
-    }
-
-    return cur;
-}
-
 ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_kv_iswa * inp,
         ggml_tensor * wo,
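One detail of the removed K-only path worth spelling out: no separate V tensor is stored; the V consumed by attention is a ggml view into the K cache buffer, and v_cur is passed essentially to learn how many elements of each K row to expose. A plain-C++ sketch of that aliasing idea (illustrative, not the ggml API):

    #include <cstddef>

    struct row_view {
        const float * data;    // shared storage with K
        size_t        ne0;     // elements of each row exposed as V
        size_t        stride;  // full K row length in elements
    };

    // expose the leading n_embd_v entries of every cached K row as V
    static row_view v_from_k(const float * k_data, size_t n_embd_v, size_t k_row_len) {
        return { k_data, n_embd_v, k_row_len };
    }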
@@ -2118,10 +1834,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const {
         inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
         ggml_set_input(inp->self_kq_mask);
-        ggml_set_name(inp->self_kq_mask, "self_kq_mask");
 
         inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
-        ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
     }
 
     {
@@ -2134,10 +1848,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const {
         inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
         ggml_set_input(inp->self_kq_mask_swa);
-        ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
 
         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
-        ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
     }
 
     return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
@@ -2273,62 +1985,17 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }
 
-llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() const {
-    const auto * mctx_cur = static_cast<const llama_memory_hybrid_iswa_context *>(mctx);
-
-    auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
-
-    // build iswa attention input
-    const auto * attn_ctx = mctx_cur->get_attn();
-
-    auto inp_attn = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, attn_ctx);
-
-    const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
-
-    {
-        const auto n_kv = attn_ctx->get_base()->get_n_kv();
-
-        inp_attn->self_k_idxs = attn_ctx->get_base()->build_input_k_idxs(ctx0, ubatch);
-        inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
-
-        inp_attn->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
-        ggml_set_input(inp_attn->self_kq_mask);
-
-        inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
-    }
-
-    {
-        const auto n_kv = attn_ctx->get_swa()->get_n_kv();
-
-        inp_attn->self_k_idxs_swa = attn_ctx->get_swa()->build_input_k_idxs(ctx0, ubatch);
-        inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
-
-        inp_attn->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
-        ggml_set_input(inp_attn->self_kq_mask_swa);
-
-        inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
-    }
-
-    auto inp = std::make_unique<llm_graph_input_mem_hybrid_iswa>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
-
-    return (llm_graph_input_mem_hybrid_iswa *) res->add_input(std::move(inp));
-}
-
 void llm_graph_context::build_dense_out(
         ggml_tensor * dense_2,
         ggml_tensor * dense_3) const {
-    if (!cparams.embeddings || !(dense_2 || dense_3)) {
+    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
         return;
     }
+
     ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
     GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
 
-    if (dense_2) {
-        cur = ggml_mul_mat(ctx0, dense_2, cur);
-    }
-    if (dense_3) {
-        cur = ggml_mul_mat(ctx0, dense_3, cur);
-    }
+    cur = ggml_mul_mat(ctx0, dense_2, cur);
+    cur = ggml_mul_mat(ctx0, dense_3, cur);
+
     cb(cur, "result_embd_pooled", -1);
     res->t_embd_pooled = cur;
     ggml_build_forward_expand(gf, cur);
@@ -2419,87 +2086,6 @@ void llm_graph_context::build_pooling(
     ggml_build_forward_expand(gf, cur);
 }
 
-void llm_graph_context::build_sampling() const {
-    if (samplers.empty() || !res->t_logits) {
-        return;
-    }
-
-    auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
-    res->add_input(std::move(inp_sampling));
-
-    std::map<llama_seq_id, int32_t> seq_to_logit_row;
-    int32_t logit_row_idx = 0;
-    for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
-        if (ubatch.output[i]) {
-            llama_seq_id seq_id = ubatch.seq_id[i][0];
-            seq_to_logit_row[seq_id] = logit_row_idx;
-            logit_row_idx++;
-        }
-    }
-
-    // res->t_logits will contain logits for all tokens that want the logits calculated (logits=1 or output=1)
-    GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
-
-    // add a dummy row of logits
-    // this trick makes the graph static, regardless of which samplers are activated
-    // this is important in order to minimize graph reallocations
-    // TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
-    ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
-
-    for (const auto & [seq_id, sampler] : samplers) {
-        const auto it = seq_to_logit_row.find(seq_id);
-
-        // inactive samplers always work on the first row
-        const auto row_idx = it != seq_to_logit_row.end() ? it->second : 0;
-
-        ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
-        ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
-
-        struct llama_sampler_data data = {
-            /*.logits     =*/ logits_seq,
-            /*.probs      =*/ nullptr,
-            /*.sampled    =*/ nullptr,
-            /*.candidates =*/ nullptr,
-        };
-
-        assert(sampler->iface->backend_apply);
-        sampler->iface->backend_apply(sampler, ctx0, gf, &data);
-
-        if (data.sampled != nullptr) {
-            res->t_sampled[seq_id] = data.sampled;
-            ggml_build_forward_expand(gf, data.sampled);
-        }
-
-        if (data.probs != nullptr) {
-            res->t_sampled_probs[seq_id] = data.probs;
-            ggml_build_forward_expand(gf, data.probs);
-        }
-
-        if (data.logits != nullptr) {
-            res->t_sampled_logits[seq_id] = data.logits;
-            ggml_build_forward_expand(gf, data.logits);
-        }
-
-        if (data.candidates != nullptr) {
-            res->t_candidates[seq_id] = data.candidates;
-            ggml_build_forward_expand(gf, data.candidates);
-        }
-    }
-
-    // TODO: Call llama_sampler_accept_ggml after all samplers have been applied.
-    /*
-    for (const auto & [seq_id, sampler] : samplers) {
-        if (auto it = res->t_sampled.find(seq_id); it != res->t_sampled.end()) {
-            ggml_tensor * selected_token = it->second;
-            if (selected_token != nullptr) {
-                llama_sampler_accept_ggml(sampler, ctx0, gf, selected_token);
-            }
-        }
-    }
-    */
-}
-
 int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
     // TODO move to hparams if a T5 variant appears that uses a different value
     const int64_t max_distance = 128;
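The function that begins here implements T5-style relative position bucketing: small offsets each get their own bucket, while larger offsets share logarithmically spaced buckets up to max_distance. A standalone sketch of the scheme (simplified from the T5 reference; constants and rounding may differ from the exact implementation):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    static int32_t relative_bucket(int32_t rel, int32_t n_buckets, bool bidirectional) {
        int32_t bucket = 0;
        if (bidirectional) {
            n_buckets /= 2;
            bucket += (rel > 0) * n_buckets; // the sign gets its own half of the buckets
            rel = std::abs(rel);
        } else {
            rel = -std::min(rel, 0);         // causal: only look backwards
        }
        const int32_t max_exact    = n_buckets/2;
        const int32_t max_distance = 128;
        if (rel < max_exact) {
            return bucket + rel;             // small offsets: one bucket each
        }
        // large offsets: log-spaced buckets, clamped to the last bucket
        const float r = std::log((float) rel/max_exact) / std::log((float) max_distance/max_exact);
        return bucket + std::min(max_exact + (int32_t)(r*(n_buckets - max_exact)), n_buckets - 1);
    }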