mirror of
https://github.com/ollama/ollama.git
synced 2026-04-24 17:55:43 +02:00
GGML update to ec98e2002 (#13451)
* Revert "add support for NVIDIA Nemotron 3 Nano"
This reverts commit e7d2ae9d69.
* GGML update to 380b4c984
Remove MaskBatchPadding as GGML_KQ_MASK_PAD is no longer present (no
padding required)
* update to c45f89d55
* ec98e2002
solar pro needed more adjusting - needs verification
* review comments
This commit is contained in:
21
llama/llama.cpp/src/llama-graph.h
vendored
21
llama/llama.cpp/src/llama-graph.h
vendored
@@ -132,8 +132,8 @@ public:
|
||||
// temperature tuning, used by llama4
|
||||
class llm_graph_input_attn_temp : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
|
||||
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
|
||||
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
|
||||
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
|
||||
virtual ~llm_graph_input_attn_temp() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
@@ -142,6 +142,7 @@ public:
|
||||
|
||||
const uint32_t n_attn_temp_floor_scale;
|
||||
const float f_attn_temp_scale;
|
||||
const float f_attn_temp_offset;
|
||||
};
|
||||
|
||||
class llm_graph_input_pos_bucket : public llm_graph_input_i {
|
||||
@@ -224,6 +225,8 @@ public:
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool can_reuse(const llm_graph_params & params) override;
|
||||
|
||||
ggml_tensor * s_copy; // I32 [n_rs]
|
||||
|
||||
// views of s_copy, computed once per graph
|
||||
@@ -232,6 +235,10 @@ public:
|
||||
ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
|
||||
|
||||
const llama_memory_recurrent_context * mctx;
|
||||
|
||||
// used in view offsets, need to match for valid graph reuse
|
||||
uint32_t head;
|
||||
int32_t rs_z;
|
||||
};
|
||||
|
||||
class llm_graph_input_cross_embd : public llm_graph_input_i {
|
||||
@@ -364,22 +371,28 @@ public:
|
||||
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_mem_hybrid(
|
||||
const llama_cparams & cparams,
|
||||
std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
|
||||
std::unique_ptr<llm_graph_input_rs> inp_rs,
|
||||
const llama_memory_hybrid_context * mctx) :
|
||||
std::unique_ptr<llm_graph_input_rs> inp_rs,
|
||||
const llama_memory_hybrid_context * mctx) :
|
||||
inp_attn(std::move(inp_attn)),
|
||||
inp_rs(std::move(inp_rs)),
|
||||
cparams(cparams),
|
||||
mctx(mctx) { }
|
||||
virtual ~llm_graph_input_mem_hybrid() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool can_reuse(const llm_graph_params & params) override;
|
||||
|
||||
std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
|
||||
std::unique_ptr<llm_graph_input_rs> inp_rs;
|
||||
|
||||
llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
|
||||
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
|
||||
|
||||
const llama_cparams cparams;
|
||||
|
||||
const llama_memory_hybrid_context * mctx;
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user