Revert "Update vendored llama.cpp to b7847" (#14061)

Jeffrey Morgan
2026-02-03 18:39:36 -08:00
committed by GitHub
parent a6355329bf
commit b1fccabb34
240 changed files with 5050 additions and 21247 deletions


@@ -40,14 +40,6 @@ struct llama_context {

     ~llama_context();

-    // reserve a new backend scheduler (if needed)
-    // for example, when:
-    //   - changing loras
-    //   - changing samplers
-    //   - changing attention type
-    //   - etc.
-    void sched_reserve();
-
     void synchronize();

     const llama_model & get_model() const;
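For context on what this hunk removes: sched_reserve() pairs with the sched_need_reserve flag deleted in the last hunk below. Configuration changes (loras, samplers, attention type) mark the scheduler as needing re-reservation, and the next compute pass rebuilds it lazily. A minimal self-contained sketch of that pattern, with all names (Scheduler, mark_dirty, ensure_reserved) hypothetical rather than llama.cpp API:

#include <cstdio>

// Hypothetical stand-in for the backend scheduler; not llama.cpp API.
struct Scheduler {
    bool need_reserve = true; // mirrors the removed sched_need_reserve flag

    // Called when a setting that affects graph layout changes
    // (loras, samplers, attention type in the removed comment).
    void mark_dirty() { need_reserve = true; }

    // Rebuild internal state only if something changed since the last pass.
    void ensure_reserved() {
        if (!need_reserve) {
            return;
        }
        std::printf("reserving scheduler\n"); // the expensive rebuild
        need_reserve = false;
    }

    void compute() {
        ensure_reserved(); // cheap no-op on the hot path when clean
        std::printf("compute\n");
    }
};

int main() {
    Scheduler s;
    s.compute();    // reserves once, then computes
    s.compute();    // clean: skips reservation
    s.mark_dirty(); // e.g. a sampler changed
    s.compute();    // reserves again
}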
@@ -78,18 +70,6 @@ struct llama_context {
     float * get_embeddings_ith(int32_t i);
     float * get_embeddings_seq(llama_seq_id seq_id);

-    llama_token * get_sampled_tokens() const;
-    llama_token   get_sampled_token_ith(int32_t idx);
-
-    float * get_sampled_logits_ith(int32_t idx);
-    size_t  get_sampled_logits_count(int32_t idx);
-
-    float * get_sampled_probs_ith(int32_t idx);
-    size_t  get_sampled_probs_count(int32_t idx);
-
-    const llama_token * get_sampled_candidates_ith(int32_t idx);
-    size_t              get_sampled_candidates_count(int32_t idx);
-
     void attach_threadpool(
             ggml_threadpool_t threadpool,
             ggml_threadpool_t threadpool_batch);
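The removed accessors come in pointer/count pairs (get_sampled_logits_ith with get_sampled_logits_count, likewise for probs and candidates): results for all outputs live in flat buffers, and the count bounds how much of the returned pointer is valid for output idx. A self-contained sketch of that pairing, assuming fixed-stride storage; none of these names are llama.cpp API:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical flat per-output storage; field names loosely mirror the
// removed sampling_info struct, but the layout here is an assumption.
struct SampledOutputs {
    std::vector<float>    logits;       // all outputs, concatenated
    std::vector<uint32_t> logits_count; // entries valid for output i

    // Pointer half of the pair: start of output i's slice,
    // assuming a fixed stride of max_per_output floats per output.
    const float * logits_ith(int32_t i, size_t max_per_output) const {
        return logits.data() + static_cast<size_t>(i) * max_per_output;
    }

    // Count half of the pair: how many entries of that slice are valid.
    size_t logits_count_ith(int32_t i) const {
        return logits_count[static_cast<size_t>(i)];
    }
};

int main() {
    const size_t stride = 4;
    SampledOutputs out;
    out.logits       = {0.1f, 0.9f, 0.0f, 0.0f,   0.3f, 0.3f, 0.4f, 0.0f};
    out.logits_count = {2, 3};

    for (int32_t i = 0; i < 2; ++i) {
        const float * p = out.logits_ith(i, stride);
        const size_t  n = out.logits_count_ith(i);
        std::printf("output %d:", i);
        for (size_t j = 0; j < n; ++j) std::printf(" %.1f", p[j]);
        std::printf("\n");
    }
}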
@@ -212,13 +192,10 @@ private:
     // Make sure enough space is available for outputs.
     // Returns max number of outputs for which space was reserved.
-    uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch);
+    uint32_t output_reserve(int32_t n_outputs);
+
+    void output_reorder();

-    // map the output row index `i` to batch index
-    int64_t output_resolve_row(int32_t i) const;
-
     //
     // graph
     //
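output_resolve_row, deleted here along with the batch-aware output_reserve overload, maps a densely stored output row back to its batch index: when only some tokens in a batch request logits, outputs are compacted, and a small index table recovers the original position. A sketch of that mapping under assumed names:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // A batch of 5 tokens where only tokens 1 and 4 request outputs.
    const std::vector<bool> wants_output = {false, true, false, false, true};

    // Build the dense row -> batch index table while reserving outputs.
    std::vector<int64_t> row_to_batch;
    for (int64_t i = 0; i < (int64_t) wants_output.size(); ++i) {
        if (wants_output[i]) row_to_batch.push_back(i);
    }

    // Resolving output row r recovers the batch position it came from,
    // which is what the removed output_resolve_row provided.
    for (int32_t r = 0; r < (int32_t) row_to_batch.size(); ++r) {
        std::printf("output row %d -> batch index %lld\n",
                    r, (long long) row_to_batch[r]);
    }
}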
@@ -236,8 +213,6 @@ public:
     ggml_cgraph * graph_reserve(
             uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);

-    bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
-
 private:
     llm_graph_params graph_params(
             llm_graph_result * res,
@@ -277,31 +252,6 @@ private:
     size_t  embd_size = 0; // capacity (of floats) for embeddings
     float * embd      = nullptr;

-    // TODO: simplify
-    struct sampling_info {
-        std::map<llama_seq_id, llama_sampler *> samplers;
-
-        float * logits      = nullptr;
-        size_t  logits_size = 0;
-
-        llama_token * sampled      = nullptr;
-        size_t        sampled_size = 0;
-
-        float * probs      = nullptr;
-        size_t  probs_size = 0;
-
-        llama_token * candidates      = nullptr;
-        size_t        candidates_size = 0;
-
-        std::vector<uint32_t> logits_count;
-        std::vector<uint32_t> probs_count;
-        std::vector<uint32_t> candidates_count;
-
-        std::vector<llama_token> token_ids_full_vocab;
-    };
-
-    sampling_info sampling;
-
     // sequence embeddings output (map of [n_embd] vectors)
     // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
     std::map<llama_seq_id, std::vector<float>> embd_seq;
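The deleted sampling_info held one sampler per sequence (matching the set_sampler(llama_seq_id, llama_sampler *) removed above) plus the flat result buffers the get_sampled_* accessors read from. A sketch of the per-sequence registry with a greedy fallback; sampler_fn and the lookup logic are illustrative, not llama.cpp API:

#include <cstdint>
#include <cstdio>
#include <functional>
#include <map>

using seq_id     = int32_t;
using sampler_fn = std::function<int(const float *, size_t)>; // hypothetical

// Greedy fallback used when a sequence has no sampler attached.
static int greedy(const float * logits, size_t n) {
    int best = 0;
    for (size_t i = 1; i < n; ++i) {
        if (logits[i] > logits[best]) best = (int) i;
    }
    return best;
}

int main() {
    std::map<seq_id, sampler_fn> samplers; // mirrors sampling_info::samplers

    // Attach a sampler to sequence 1 only (always picks token 0 here).
    samplers[1] = [](const float *, size_t) { return 0; };

    const float logits[] = {0.1f, 2.0f, 0.5f};
    for (seq_id s : {0, 1}) {
        auto it  = samplers.find(s);
        int  tok = (it != samplers.end()) ? it->second(logits, 3)
                                          : greedy(logits, 3);
        std::printf("seq %d sampled token %d\n", s, tok);
    }
}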
@@ -322,8 +272,6 @@ private:
     ggml_backend_sched_ptr sched;

-    bool sched_need_reserve = true;
-
     ggml_backend_t backend_cpu = nullptr;

     std::vector<ggml_backend_ptr> backends;