Update GGML to b6646 (#12245)
Notable EOLs with this change:
- macOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported
llama/llama.cpp/src/llama-adapter.cpp (vendored, 105 changes)

@@ -6,6 +6,7 @@

#include <map>
#include <cassert>
#include <sstream>
#include <stdexcept>

// vec

@@ -163,13 +164,38 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_

// check metadata
{
const gguf_context * gguf_ctx = ctx_gguf.get();

LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);

// get metadata as string
for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
gguf_type type = gguf_get_kv_type(gguf_ctx, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
: gguf_type_name(type);
const char * name = gguf_get_key(gguf_ctx, i);
const std::string value = gguf_kv_to_str(gguf_ctx, i);

if (type != GGUF_TYPE_ARRAY) {
adapter.gguf_kv.emplace(name, value);
}

const size_t MAX_VALUE_LEN = 40;
std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
replace_all(print_value, "\n", "\\n");

LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
}

auto get_kv_str = [&](const std::string & key) -> std::string {
int id = gguf_find_key(ctx_gguf.get(), key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
int id = gguf_find_key(gguf_ctx, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
};
auto get_kv_f32 = [&](const std::string & key) -> float {
int id = gguf_find_key(ctx_gguf.get(), key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
int id = gguf_find_key(gguf_ctx, key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
};
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

@@ -190,6 +216,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
}

adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));

// parse alora invocation sequence vector
const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
if (kid >= 0) {
if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
throw std::runtime_error("invalid gguf type for " + key);
}
const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
if (arr_type != GGUF_TYPE_UINT32) {
throw std::runtime_error("invalid gguf element type for " + key);
}
const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
adapter.alora_invocation_tokens.resize(seq_len);
std::copy(
(const llama_token *)data,
(const llama_token *)data + seq_len,
adapter.alora_invocation_tokens.begin());
}
}

int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

@@ -383,6 +429,57 @@ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * p
return nullptr;
}

int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
const auto & it = adapter->gguf_kv.find(key);
if (it == adapter->gguf_kv.end()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
return (int)adapter->gguf_kv.size();
}

int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
auto it = adapter->gguf_kv.begin();
std::advance(it, i);
return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
auto it = adapter->gguf_kv.begin();
std::advance(it, i);
return snprintf(buf, buf_size, "%s", it->second.c_str());
}

void llama_adapter_lora_free(llama_adapter_lora * adapter) {
delete adapter;
}

uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
if (!adapter) {
return 0;
}
return adapter->alora_invocation_tokens.size();
}

const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
GGML_ASSERT(adapter);
return adapter->alora_invocation_tokens.data();
}
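The new adapter metadata accessors mirror the count/by-index pattern used elsewhere in the llama.h API: get the entry count, then copy each key and value into caller-provided buffers. A minimal sketch of enumerating the metadata of a loaded adapter (the 256-byte buffer size is an arbitrary choice for illustration):

    #include <cstdio>
    #include "llama.h"

    void dump_adapter_meta(const llama_adapter_lora * adapter) {
        char key[256];
        char val[256];
        const int32_t n = llama_adapter_meta_count(adapter);
        for (int32_t i = 0; i < n; i++) {
            // both accessors return -1 and write an empty string for an out-of-range index
            if (llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key)) < 0) continue;
            if (llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) < 0) continue;
            printf("%s = %s\n", key, val);
        }
    }

Note that only non-array GGUF values are stored in gguf_kv (the loader skips GGUF_TYPE_ARRAY entries), so array-typed metadata will not appear in this enumeration.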
llama/llama.cpp/src/llama-adapter.h (vendored, 6 changes)

@@ -67,6 +67,12 @@ struct llama_adapter_lora {

float alpha;

// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;

// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;

llama_adapter_lora() = default;
~llama_adapter_lora() = default;
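The aLoRA invocation sequence stored under adapter.alora.invocation_tokens can be read back through the two accessors added in llama-adapter.cpp above. A sketch of how a runtime might check whether a prompt ends with the invocation sequence (the comparison loop is illustrative, not taken from this commit):

    #include <vector>
    #include "llama.h"

    // returns true if `prompt` ends with the adapter's aLoRA invocation sequence
    bool ends_with_alora_invocation(const llama_adapter_lora * adapter,
                                    const std::vector<llama_token> & prompt) {
        const uint64_t n = llama_adapter_get_alora_n_invocation_tokens(adapter); // 0 for null adapter
        if (n == 0 || prompt.size() < n) {
            return false;
        }
        const llama_token * inv = llama_adapter_get_alora_invocation_tokens(adapter);
        for (uint64_t i = 0; i < n; i++) {
            if (prompt[prompt.size() - n + i] != inv[i]) {
                return false;
            }
        }
        return true;
    }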
llama/llama.cpp/src/llama-arch.cpp (vendored, 175 changes)

@@ -22,6 +22,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_NEO_BERT, "neo-bert" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
{ LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" },
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" },

@@ -44,6 +45,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_GEMMA3, "gemma3" },
{ LLM_ARCH_GEMMA3N, "gemma3n" },
{ LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_MAMBA2, "mamba2" },

@@ -68,6 +70,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_T5ENCODER, "t5encoder" },
{ LLM_ARCH_JAIS, "jais" },
{ LLM_ARCH_NEMOTRON, "nemotron" },
{ LLM_ARCH_NEMOTRON_H, "nemotron_h" },
{ LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_EXAONE4, "exaone4" },
{ LLM_ARCH_RWKV6, "rwkv6" },

@@ -94,6 +97,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_DREAM, "dream" },
{ LLM_ARCH_SMALLTHINKER, "smallthinker" },
{ LLM_ARCH_LLADA, "llada" },
{ LLM_ARCH_LLADA_MOE, "llada-moe" },
{ LLM_ARCH_SEED_OSS, "seed_oss" },
{ LLM_ARCH_GROVEMOE, "grovemoe" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -121,6 +127,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
{ LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
{ LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
{ LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, "%s.expert_chunk_feed_forward_length" },
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },

@@ -129,12 +136,16 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
{ LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
{ LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
{ LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
{ LLM_KV_ROUTER_LOGIT_SOFTCAPPING, "%s.router_logit_softcapping" },
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
{ LLM_KV_SWIN_NORM, "%s.swin_norm" },
{ LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },

@@ -165,20 +176,26 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
{ LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
{ LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
{ LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
{ LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },

{ LLM_KV_SPLIT_NO, "split.no" },
{ LLM_KV_SPLIT_COUNT, "split.count" },

@@ -235,8 +252,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },

{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
{ LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" },
{ LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
{ LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, "adapter.alora.invocation_tokens" },

// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },

@@ -392,12 +412,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
},

@@ -576,6 +600,20 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_CLS, "cls" },
},
},
{
LLM_ARCH_JINA_BERT_V3,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
},
},
{
LLM_ARCH_BLOOM,
{

@@ -689,6 +727,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_CLS_OUT, "cls.output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },

@@ -1021,6 +1060,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" },
},
},
{
LLM_ARCH_GEMMA_EMBEDDING,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{
LLM_ARCH_STARCODER2,
{

@@ -1534,6 +1594,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_NEMOTRON_H,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
// mamba(2) ssm layers
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
{ LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
{ LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
// attention layers
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
// dense FFN
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_EXAONE,
{

@@ -2030,6 +2115,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
}
},
{

@@ -2087,6 +2173,66 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_LLADA_MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_SEED_OSS,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_GROVEMOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_CHEXPS, "blk.%d.ffn_gate_chexps" },
{ LLM_TENSOR_FFN_DOWN_CHEXPS, "blk.%d.ffn_down_chexps" },
{ LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
},
},
{
LLM_ARCH_UNKNOWN,
{

@@ -2219,6 +2365,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// altup / laurel (gemma 3n)
{LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},

@@ -2340,6 +2489,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
case LLM_ARCH_PLAMO2:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_LFM2:
case LLM_ARCH_NEMOTRON_H:
return true;
default:
return false;

@@ -2350,6 +2500,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
case LLM_ARCH_LLADA_MOE:
return true;
default:
return false;
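Each LLM_KV_NAMES entry is a printf-style pattern in which %s is replaced by the architecture's name from LLM_ARCH_NAMES; this is how llm_kv(LLM_KV_ADAPTER_LORA_ALPHA) in llama-adapter.cpp resolves to the concrete GGUF key "adapter.lora.alpha". A rough sketch of that substitution under this assumption (format_key is a hypothetical stand-in for llama.cpp's internal format() helper):

    #include <cstdio>
    #include <string>

    // hypothetical stand-in for llama.cpp's format() helper
    static std::string format_key(const char * pattern, const char * arch_name) {
        char buf[256];
        snprintf(buf, sizeof(buf), pattern, arch_name);
        return buf;
    }

    // "%s.rope.freq_base" + "llama" -> "llama.rope.freq_base"
    // patterns without a placeholder, e.g. "adapter.lora.alpha", pass through unchanged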
llama/llama.cpp/src/llama-arch.h (vendored, 23 changes)

@@ -26,6 +26,7 @@ enum llm_arch {
LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_NEO_BERT,
LLM_ARCH_JINA_BERT_V2,
LLM_ARCH_JINA_BERT_V3,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_QWEN,

@@ -48,6 +49,7 @@ enum llm_arch {
LLM_ARCH_GEMMA2,
LLM_ARCH_GEMMA3,
LLM_ARCH_GEMMA3N,
LLM_ARCH_GEMMA_EMBEDDING,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_MAMBA2,

@@ -72,6 +74,7 @@ enum llm_arch {
LLM_ARCH_T5ENCODER,
LLM_ARCH_JAIS,
LLM_ARCH_NEMOTRON,
LLM_ARCH_NEMOTRON_H,
LLM_ARCH_EXAONE,
LLM_ARCH_EXAONE4,
LLM_ARCH_RWKV6,

@@ -98,6 +101,9 @@ enum llm_arch {
LLM_ARCH_DREAM,
LLM_ARCH_SMALLTHINKER,
LLM_ARCH_LLADA,
LLM_ARCH_LLADA_MOE,
LLM_ARCH_SEED_OSS,
LLM_ARCH_GROVEMOE,
LLM_ARCH_UNKNOWN,
};

@@ -125,6 +131,7 @@ enum llm_kv {
LLM_KV_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_EXPERT_COUNT,

@@ -133,12 +140,16 @@ enum llm_kv {
LLM_KV_EXPERT_WEIGHTS_SCALE,
LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_EXPERT_GROUP_SCALE,
LLM_KV_EXPERTS_PER_GROUP,
LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_NEXTN_PREDICT_LAYERS,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
LLM_KV_DECODER_BLOCK_COUNT,
LLM_KV_ATTN_LOGIT_SOFTCAPPING,
LLM_KV_ROUTER_LOGIT_SOFTCAPPING,
LLM_KV_FINAL_LOGIT_SOFTCAPPING,
LLM_KV_SWIN_NORM,
LLM_KV_RESCALE_EVERY_N_LAYERS,

@@ -169,6 +180,8 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

@@ -183,6 +196,10 @@ enum llm_kv {
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,
LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,
LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,

LLM_KV_SPLIT_NO,
LLM_KV_SPLIT_COUNT,

@@ -231,6 +248,9 @@ enum llm_kv {

LLM_KV_ADAPTER_TYPE,
LLM_KV_ADAPTER_LORA_ALPHA,
LLM_KV_ADAPTER_LORA_TASK_NAME,
LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,
LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS,

LLM_KV_POSNET_EMBEDDING_LENGTH,
LLM_KV_POSNET_BLOCK_COUNT,

@@ -287,6 +307,9 @@ enum llm_tensor {
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
LLM_TENSOR_FFN_DOWN_CHEXPS,
LLM_TENSOR_FFN_GATE_CHEXPS,
LLM_TENSOR_FFN_UP_CHEXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
llama/llama.cpp/src/llama-batch.cpp (vendored, 2 changes)

@@ -477,7 +477,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {

llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
if (sequential && has_cpl) {
LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);

return {};
}
llama/llama.cpp/src/llama-chat.cpp (vendored, 32 changes)

@@ -16,10 +16,10 @@
static std::string trim(const std::string & str) {
size_t start = 0;
size_t end = str.size();
while (start < end && isspace(str[start])) {
while (start < end && isspace(static_cast<unsigned char>(str[start]))) {
start += 1;
}
while (end > start && isspace(str[end - 1])) {
while (end > start && isspace(static_cast<unsigned char>(str[end - 1]))) {
end -= 1;
}
return str.substr(start, end - start);

@@ -69,6 +69,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
};

llm_chat_template llm_chat_template_from_str(const std::string & name) {

@@ -201,6 +203,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
return LLM_CHAT_TEMPLATE_KIMI_K2;
} else if (tmpl_contains("<seed:bos>")) {
return LLM_CHAT_TEMPLATE_SEED_OSS;
} else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
return LLM_CHAT_TEMPLATE_GROK_2;
}
return LLM_CHAT_TEMPLATE_UNKNOWN;
}

@@ -752,6 +758,28 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|im_assistant|>assistant<|im_middle|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_SEED_OSS) {
for (auto message: chat) {
std::string role(message->role);
ss << "<seed:bos>" << role << "\n" << (role == "assistant" ? trim(message->content) : message->content) << "<seed:eos>";
}
if (add_ass) {
ss << "<seed:bos>assistant\n";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) {
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
ss << "System: " << trim(message->content) << "<|separator|>\n\n";
} else if (role == "user") {
ss << "Human: " << trim(message->content) << "<|separator|>\n\n";
} else if (role == "assistant") {
ss << "Assistant: " << message->content << "<|separator|>\n\n";
}
}
if (add_ass) {
ss << "Assistant:";
}
} else {
// template not supported
return -1;
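The static_cast<unsigned char> added to trim() matters because std::isspace has undefined behavior when its argument is negative and not equal to EOF, and a plain char can be negative for bytes >= 0x80 on platforms where char is signed (for example the lead byte of a multi-byte UTF-8 character). A small illustration of the safe pattern:

    #include <cctype>

    bool is_space_safe(char c) {
        // a char holding e.g. 0xC3 (first byte of UTF-8 "é") is negative when
        // char is signed; passing it to isspace() unconverted is undefined behavior
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    }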
llama/llama.cpp/src/llama-chat.h (vendored, 2 changes)

@@ -49,6 +49,8 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_OPENAI_MOE,
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,
LLM_CHAT_TEMPLATE_UNKNOWN,
};
llama/llama.cpp/src/llama-context.cpp (vendored, 558 changes)

@@ -35,14 +35,12 @@ llama_context::llama_context(

cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch;
cparams.yarn_ext_factor = params.yarn_ext_factor;
cparams.yarn_attn_factor = params.yarn_attn_factor;
cparams.yarn_beta_fast = params.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold;
cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor;
cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.pooling_type = params.pooling_type;
cparams.warmup = false;
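With this change the four YARN parameters use a negative value as an "unset" sentinel: a caller value >= 0 wins, otherwise the model's own hyperparameters apply. The default context params later in this diff switch yarn_attn_factor, yarn_beta_fast, and yarn_beta_slow to -1.0f to match. A reduced sketch of the pattern:

    // sentinel-based defaulting: negative means "use the model's trained value"
    static float resolve_yarn_param(float requested, float model_default) {
        return requested >= 0.0f ? requested : model_default;
    }

    // resolve_yarn_param(-1.0f, 32.0f) == 32.0f   (fall back to hparams)
    // resolve_yarn_param( 8.0f, 32.0f) ==  8.0f   (explicit override wins)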
@@ -87,13 +85,15 @@ llama_context::llama_context(
cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
}

cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;

// with causal attention, the batch size is limited by the context size
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;

// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
// ref: https://github.com/ggerganov/llama.cpp/pull/5021
// TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self
// TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
if (cparams.n_batch < GGML_KQ_MASK_PAD) {
LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
cparams.n_batch = GGML_KQ_MASK_PAD;

@@ -103,16 +103,6 @@ llama_context::llama_context(
cparams.op_offload = params.op_offload;
cparams.kv_unified = params.kv_unified;

{
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;

if (!supports_set_rows && !cparams.kv_unified) {
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
cparams.kv_unified = true;
}
}

{
const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;

@@ -130,7 +120,7 @@ llama_context::llama_context(
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
LLAMA_LOG_INFO("%s: flash_attn = %s\n", __func__, llama_flash_attn_type_name(params.flash_attn_type));
LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false");
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);

@@ -145,11 +135,6 @@ llama_context::llama_context(
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}

if (!params.swa_full && cparams.n_seq_max > 1 && hparams.is_swa_any()) {
LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
__func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
}

if (!hparams.vocab_only) {
// GPU backends
for (auto * dev : model.devices) {

@@ -196,7 +181,7 @@ llama_context::llama_context(
// graph outputs buffer
{
// resized during inference when a batch uses more outputs
if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
if (output_reserve(params.n_seq_max) < params.n_seq_max) {
throw std::runtime_error("failed to reserve initial output buffer");
}
@@ -285,28 +270,75 @@ llama_context::llama_context(
}
}

// reserve worst-case graph
if (!hparams.vocab_only && memory) {
if (!hparams.vocab_only) {
llama_memory_context_ptr mctx;
if (memory) {
LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
mctx = memory->init_full();
if (!mctx) {
throw std::runtime_error("failed to initialize memory module");
}
}

cross.v_embd.clear();

const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

// avoid reserving graphs with zero outputs - assume one output per sequence
n_outputs = n_seqs;

LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);

// resolve automatic Flash Attention use
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
if (!gf) {
throw std::runtime_error("failed to split graph for Flash Attention check");
}

const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
bool fa_device_mismatch = false;
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
ggml_tensor * n = ggml_graph_node(gf, i);
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
continue;
}
ggml_backend_dev_t device_fa = ggml_backend_get_device(
ggml_backend_sched_get_tensor_backend(sched.get(), n));

// TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
const int il = std::stoi(n->name + prefix_len);
ggml_backend_dev_t device_kv = model.dev_layer(il);
if (device_fa != device_kv) {
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
"is assigned to device %s (usually due to missing support)\n",
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
// FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
fa_device_mismatch = true;
break;
}
}
if (fa_device_mismatch) {
cparams.flash_attn = false;
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
if (ggml_is_quantized(params.type_v)) {
throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
}
} else {
cparams.flash_attn = true;
LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
}
}

// reserve worst-case graph
int n_splits_pp = -1;
int n_nodes_pp = -1;

int n_splits_tg = -1;
int n_nodes_tg = -1;

// simulate full KV cache

const auto mctx = memory->init_full();
if (!mctx) {
throw std::runtime_error("failed to initialize KV cache");
}

cross.v_embd.clear();

// reserve pp (prompt processing) graph first so that buffers are only allocated once
{
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
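The auto-resolution pass above walks the split graph, finds every GGML_OP_FLASH_ATTN_EXT node, and recovers the layer index from the tensor name so it can compare the node's assigned device with the device that owns that layer's KV data. A stripped-down sketch of the name-to-layer parse (the "fattn" literal below is only a stand-in for whatever LLAMA_TENSOR_NAME_FATTN actually expands to, and the helper is hypothetical):

    #include <cassert>
    #include <cstring>
    #include <string>

    // parse the layer index from a name of the form "<prefix>-<layer>",
    // mirroring: const int il = std::stoi(n->name + prefix_len);
    static int layer_from_fattn_name(const char * tensor_name, const char * prefix) {
        const size_t prefix_len = strlen(prefix) + 1; // prefix plus the '-' separator
        assert(strncmp(tensor_name, prefix, strlen(prefix)) == 0);
        return std::stoi(tensor_name + prefix_len);
    }

    // layer_from_fattn_name("fattn-17", "fattn") == 17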
@@ -444,26 +476,12 @@ llama_memory_t llama_context::get_memory() const {
return memory.get();
}

// deprecated
void llama_context::kv_self_defrag_sched() {
if (!memory) {
return;
}

memory_force_optimize = true;
}

// deprecated
bool llama_context::kv_self_update(bool optimize) {
bool llama_context::memory_update(bool optimize) {
if (!memory) {
return false;
}

{
// TODO: remove in the future
optimize |= memory_force_optimize;
memory_force_optimize = false;

const auto mctx = memory->init_update(this, optimize);
switch (mctx->get_status()) {
case LLAMA_MEMORY_STATUS_SUCCESS:

@@ -908,12 +926,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
}
}

if (!supports_set_rows) {
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
}

// TODO: hacky solution
if (model.arch == LLM_ARCH_T5 && t_embd) {
//cross.t_embd = t_embd;

@@ -996,8 +1008,8 @@ int llama_context::decode(const llama_batch & batch_inp) {

bool did_optimize = false;

// handle any pending defrags/shifts
kv_self_update(false);
// handle any pending shifts/copies
memory_update(false);

llama_memory_context_ptr mctx;

@@ -1022,7 +1034,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
if (!did_optimize) {
did_optimize = true;

if (kv_self_update(true)) {
if (memory_update(true)) {
LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, balloc->get_n_tokens());

continue;

@@ -1075,7 +1087,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);

if (!res) {
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module
llama_pos pos_min[LLAMA_MAX_SEQ];
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
pos_min[s] = std::numeric_limits<llama_pos>::max();

@@ -1092,7 +1104,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
continue;
}

LLAMA_LOG_WARN("%s: removing KV cache entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]);
LLAMA_LOG_WARN("%s: removing memory module entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]);

memory->seq_rm(s, pos_min[s], -1);
}

@@ -1243,12 +1255,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
// wait for the computation to finish (automatically done when obtaining the model output)
//synchronize();

if (!supports_set_rows) {
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
}

return 0;
}

@@ -1362,8 +1368,9 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
return static_cast<llm_graph_result *>(gf_res_reserve.get());
}

ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
GGML_ASSERT(n_outputs >= 1);

if (n_tokens % n_seqs != 0) {
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
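The rounding expression is the standard integer round-up-to-multiple idiom: adding n_seqs - 1 before the truncating division bumps the quotient whenever there is a remainder. For example, with n_tokens = 10 and n_seqs = 4: (10 + 3) / 4 = 3 in integer division, and 3 * 4 = 12, the next multiple of 4 at or above 10; an exact multiple such as n_tokens = 12 is left unchanged.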
@@ -1397,7 +1404,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
this->n_outputs = save_n_outputs;

// initialize scheduler with the specified graph
if (!ggml_backend_sched_reserve(sched.get(), gf)) {
if (split_only) {
ggml_backend_sched_split_graph(sched.get(), gf);
} else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
return nullptr;
}

@@ -1437,7 +1446,9 @@ ggml_status llama_context::graph_compute(
if (backend_cpu != nullptr) {
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
set_threadpool_fn(backend_cpu, tp);
if (set_threadpool_fn) {
set_threadpool_fn(backend_cpu, tp);
}
}

// set the number of threads for all the backends

@@ -1656,30 +1667,30 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
}
}

size_t llama_context::state_seq_get_size(llama_seq_id seq_id) {
size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags) {
llama_io_write_dummy io;
try {
return state_seq_write_data(io, seq_id);
return state_seq_write_data(io, seq_id, flags);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
return 0;
}
}

size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) {
size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags) {
llama_io_write_buffer io(dst, size);
try {
return state_seq_write_data(io, seq_id);
return state_seq_write_data(io, seq_id, flags);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
return 0;
}
}

size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) {
size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags) {
llama_io_read_buffer io(src, size);
try {
return state_seq_read_data(io, seq_id);
return state_seq_read_data(io, seq_id, flags);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
return 0;
@@ -1777,7 +1788,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file
{
const size_t state_size = file.size() - file.tell();
llama_io_read_file io(&file);
const size_t nread = state_seq_read_data(io, seq_id);
const size_t nread = state_seq_read_data(io, seq_id, 0);
if (!nread) {
LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
return 0;

@@ -1801,7 +1812,7 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file

// save the context state using stream saving
llama_io_write_file io(&file);
state_seq_write_data(io, seq_id);
state_seq_write_data(io, seq_id, 0);

const size_t res = file.tell();
GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes());

@@ -1876,7 +1887,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
}

if (memory != nullptr) {
LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
memory->state_write(io);
}

@@ -1962,7 +1973,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
}

if (memory) {
LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__);
LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);

memory->state_read(io);
}

@@ -1970,21 +1981,21 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
return io.n_bytes();
}

size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
GGML_UNUSED(seq_id);

if (memory) {
memory->state_write(io, seq_id);
memory->state_write(io, seq_id, flags);
}

return io.n_bytes();
}

size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
GGML_UNUSED(seq_id);

if (memory) {
memory->state_read(io, seq_id);
memory->state_read(io, seq_id, flags);
}

return io.n_bytes();

@@ -2015,6 +2026,21 @@ void llama_context::perf_reset() {
n_reused = 0;
}

std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
for (const auto & buft_size : model.memory_breakdown()) {
ret[buft_size.first].model += buft_size.second;
}
for (const auto & buft_size : memory->memory_breakdown()) {
ret[buft_size.first].context += buft_size.second;
}
for (const auto & backend_ptr : backends) {
ggml_backend_t backend = backend_ptr.get();
ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
}
return ret;
}

//
// training
//
@@ -2047,7 +2073,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
opt_params.opt_period = n_batch / n_ubatch;
opt_params.get_opt_pars = lopt_params.get_opt_pars;
opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;

opt_params.optimizer = lopt_params.optimizer_type;
opt_ctx = ggml_opt_init(opt_params);

llama_opt_param_filter param_filter = lopt_params.param_filter;

@@ -2247,12 +2273,13 @@ llama_context_params llama_context_default_params() {
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
/*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
/*.flash_attn_type =*/ LLAMA_FLASH_ATTN_TYPE_AUTO,
/*.rope_freq_base =*/ 0.0f,
/*.rope_freq_scale =*/ 0.0f,
/*.yarn_ext_factor =*/ -1.0f,
/*.yarn_attn_factor =*/ 1.0f,
/*.yarn_beta_fast =*/ 32.0f,
/*.yarn_beta_slow =*/ 1.0f,
/*.yarn_attn_factor =*/ -1.0f,
/*.yarn_beta_fast =*/ -1.0f,
/*.yarn_beta_slow =*/ -1.0f,
/*.yarn_orig_ctx =*/ 0,
/*.defrag_thold =*/ -1.0f,
/*.cb_eval =*/ nullptr,

@@ -2263,7 +2290,6 @@ llama_context_params llama_context_default_params() {
/*.abort_callback_data =*/ nullptr,
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.op_offload =*/ true,
/*.swa_full =*/ true,
@@ -2291,12 +2317,30 @@ llama_context * llama_init_from_model(
return nullptr;
}

if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && model->arch == LLM_ARCH_GROK) {
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
params.flash_attn = false;
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
}

if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
const uint32_t blck_size = ggml_blck_size(params.type_k);
if (model->hparams.n_embd_head_k % blck_size != 0) {
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
return nullptr;
}
}

if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
const uint32_t blck_size = ggml_blck_size(params.type_v);
if (model->hparams.n_embd_head_v % blck_size != 0) {
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
return nullptr;
}
}

if (ggml_is_quantized(params.type_v) && params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED) {
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
return nullptr;
}
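The divisibility check exists because a quantized KV cache stores values in fixed-size blocks, so each attention head's row must span a whole number of blocks. As a worked example (a block size of 32 matches ggml's Q4_0 and Q8_0 quant types; the head sizes are illustrative): n_embd_head_k = 128 with block size 32 gives 128 % 32 == 0 and is accepted, while a hypothetical head size of 80 with block size 32 gives 80 % 32 == 16 and is rejected with the error above.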
@@ -2342,16 +2386,6 @@ const llama_model * llama_get_model(const llama_context * ctx) {
return &ctx->get_model();
}

// deprecated
llama_kv_cache * llama_get_kv_self(llama_context * ctx) {
return dynamic_cast<llama_kv_cache *>(ctx->get_memory());
}

// deprecated
void llama_kv_self_update(llama_context * ctx) {
ctx->kv_self_update(false);
}

enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
return ctx->pooling_type();
}

@@ -2569,168 +2603,6 @@ bool llama_memory_can_shift(llama_memory_t mem) {
return mem->get_can_shift();
}

//
// kv cache
//

// deprecated
int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
const auto * kv = llama_get_memory(ctx);
if (!kv) {
return 0;
}

int32_t res = 0;

for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
const llama_pos p0 = kv->seq_pos_min(s);
const llama_pos p1 = kv->seq_pos_max(s);

if (p0 >= 0) {
res += (p1 - p0) + 1;
}
}

return res;
}

// deprecated
// note: this is the same as above - will be removed anyway, so it's ok
int32_t llama_kv_self_used_cells(const llama_context * ctx) {
const auto * kv = llama_get_memory(ctx);
if (!kv) {
return 0;
}

int32_t res = 0;

for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
const llama_pos p0 = kv->seq_pos_min(s);
const llama_pos p1 = kv->seq_pos_max(s);

if (p0 >= 0) {
res += (p1 - p0) + 1;
}
}

return res;
}

// deprecated
void llama_kv_self_clear(llama_context * ctx) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}

llama_memory_clear(kv, true);
}

// deprecated
bool llama_kv_self_seq_rm(
llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return true;
}

return llama_memory_seq_rm(kv, seq_id, p0, p1);
}

// deprecated
void llama_kv_self_seq_cp(
llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}

llama_memory_seq_cp(kv, seq_id_src, seq_id_dst, p0, p1);
}

// deprecated
void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}

llama_memory_seq_keep(kv, seq_id);
}

// deprecated
void llama_kv_self_seq_add(
llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}

llama_memory_seq_add(kv, seq_id, p0, p1, delta);
}

// deprecated
void llama_kv_self_seq_div(
llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}

llama_memory_seq_div(kv, seq_id, p0, p1, d);
}

// deprecated
llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return -1;
}

return llama_memory_seq_pos_min(kv, seq_id);
}

// deprecated
llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return -1;
}

return llama_memory_seq_pos_max(kv, seq_id);
}

// deprecated
void llama_kv_self_defrag(llama_context * ctx) {
// force defrag
ctx->kv_self_defrag_sched();
}

// deprecated
bool llama_kv_self_can_shift(const llama_context * ctx) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return false;
}

return llama_memory_can_shift(kv);
}

// llama state API

// deprecated
@@ -2800,19 +2672,31 @@ bool llama_state_save_file(llama_context * ctx, const char * path_session, const
}

size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) {
    return ctx->state_seq_get_size(seq_id);
    return llama_state_seq_get_size_ext(ctx, seq_id, 0);
}

size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
    ctx->synchronize();

    return ctx->state_seq_get_data(seq_id, dst, size);
    return llama_state_seq_get_data_ext(ctx, dst, size, seq_id, 0);
}

size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
    return llama_state_seq_set_data_ext(ctx, src, size, seq_id, 0);
}

size_t llama_state_seq_get_size_ext(llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
    return ctx->state_seq_get_size(seq_id, flags);
}

size_t llama_state_seq_get_data_ext(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
    ctx->synchronize();

    return ctx->state_seq_set_data(seq_id, src, size);
    return ctx->state_seq_get_data(seq_id, dst, size, flags);
}

size_t llama_state_seq_set_data_ext(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
    ctx->synchronize();

    return ctx->state_seq_set_data(seq_id, src, size, flags);
}

size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
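
The non-_ext entry points above are now thin shims that forward with flags = 0, so new code can call the _ext variants directly. A minimal sketch of a sequence-state round trip using only the signatures shown in this hunk (flags = 0 keeps the pre-existing behavior):

// editor's sketch, not part of the diff: save and restore one sequence
std::vector<uint8_t> buf(llama_state_seq_get_size_ext(ctx, seq_id, 0));
const size_t n_written = llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, 0);
const size_t n_read    = llama_state_seq_set_data_ext(ctx, buf.data(), n_written, seq_id, 0);
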
@@ -2895,6 +2779,142 @@ void llama_perf_context_reset(llama_context * ctx) {
    ctx->perf_reset();
}

void llama_memory_breakdown_print(const struct llama_context * ctx) {
    const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;

    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();

    std::vector<std::array<std::string, 9>> table_data;
    table_data.reserve(devices.size());
    const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
    const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
    const std::string template_other  = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";

    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});

    constexpr size_t MiB = 1024 * 1024;
    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};

    // track seen buffer types to avoid double counting:
    std::set<ggml_backend_buffer_type_t> seen_buffer_types;

    // accumulative memory breakdown for each device and for host:
    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
    llama_memory_breakdown_data mb_host;

    for (const auto & buft_mb : memory_breakdown) {
        ggml_backend_buffer_type_t buft = buft_mb.first;
        const llama_memory_breakdown_data & mb = buft_mb.second;
        if (ggml_backend_buft_is_host(buft)) {
            mb_host.model   += mb.model;
            mb_host.context += mb.context;
            mb_host.compute += mb.compute;
            seen_buffer_types.insert(buft);
            continue;
        }
        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
        if (dev) {
            int i_dev = -1;
            for (size_t i = 0; i < devices.size(); i++) {
                if (devices[i] == dev) {
                    i_dev = i;
                    break;
                }
            }
            if (i_dev != -1) {
                mb_dev[i_dev].model   += mb.model;
                mb_dev[i_dev].context += mb.context;
                mb_dev[i_dev].compute += mb.compute;
                seen_buffer_types.insert(buft);
                continue;
            }
        }
    }

    // print memory breakdown for each device:
    for (size_t i = 0; i < devices.size(); i++) {
        ggml_backend_dev_t dev = devices[i];
        llama_memory_breakdown_data mb = mb_dev[i];

        const std::string name = ggml_backend_dev_name(dev);
        std::string desc = ggml_backend_dev_description(dev);
        for (const std::string & prefix : desc_prefixes_strip) {
            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
                desc = desc.substr(prefix.length());
            }
        }

        size_t free, total;
        ggml_backend_dev_memory(dev, &free, &total);

        const size_t self = mb.model + mb.context + mb.compute;
        const size_t unaccounted = total - self - free;

        table_data.push_back({
            template_gpu,
            " - " + name + " (" + desc + ")",
            std::to_string(total / MiB),
            std::to_string(free / MiB),
            std::to_string(self / MiB),
            std::to_string(mb.model / MiB),
            std::to_string(mb.context / MiB),
            std::to_string(mb.compute / MiB),
            std::to_string(unaccounted / MiB)});
    }

    // print memory breakdown for host:
    {
        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
        table_data.push_back({
            template_other,
            " - Host",
            "", // total
            "", // free
            std::to_string(self / MiB),
            std::to_string(mb_host.model / MiB),
            std::to_string(mb_host.context / MiB),
            std::to_string(mb_host.compute / MiB),
            ""}); // unaccounted
    }

    // print memory breakdown for all remaining buffer types:
    for (const auto & buft_mb : memory_breakdown) {
        ggml_backend_buffer_type_t buft = buft_mb.first;
        const llama_memory_breakdown_data & mb = buft_mb.second;
        if (seen_buffer_types.count(buft) == 1) {
            continue;
        }
        const std::string name = ggml_backend_buft_name(buft);
        const size_t self = mb.model + mb.context + mb.compute;
        table_data.push_back({
            template_other,
            " - " + name,
            "", // total
            "", // free
            std::to_string(self / MiB),
            std::to_string(mb.model / MiB),
            std::to_string(mb.context / MiB),
            std::to_string(mb.compute / MiB),
            ""}); // unaccounted
        seen_buffer_types.insert(buft);
    }

    for (size_t j = 1; j < table_data[0].size(); j++) {
        size_t max_len = 0;
        for (const auto & td : table_data) {
            max_len = std::max(max_len, td[j].length());
        }
        for (auto & td : table_data) {
            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
        }
    }
    for (const auto & td : table_data) {
        LLAMA_LOG_INFO(td[0].c_str(),
            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
            td[6].c_str(), td[7].c_str(), td[8].c_str());
    }
}
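
The alignment loop at the end of llama_memory_breakdown_print right-pads the name column (j == 1) and left-pads the numeric columns so the logged table lines up. The same technique in isolation, on hypothetical two-column row data rather than the structures above:

#include <algorithm>
#include <array>
#include <string>
#include <vector>

// editor's sketch, not part of the diff: names grow to the right,
// numbers are pushed to the right edge of their column
static void pad_columns(std::vector<std::array<std::string, 2>> & rows) {
    for (size_t j = 0; j < 2; j++) {
        size_t max_len = 0;
        for (const auto & r : rows) {
            max_len = std::max(max_len, r[j].length());
        }
        for (auto & r : rows) {
            r[j].insert(j == 0 ? r[j].length() : 0, max_len - r[j].length(), ' ');
        }
    }
}
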

//
// training
//

36
llama/llama.cpp/src/llama-context.h
vendored
@@ -17,9 +17,17 @@ class llama_batch_allocr;
class llama_io_read_i;
class llama_io_write_i;

// "memory" as in abstract memory for the context
struct llama_memory_i;
struct llama_memory_context_i;

// "memory" as in physical memory for a buffer type, in bytes
struct llama_memory_breakdown_data {
    size_t model   = 0; // memory allocated for the model
    size_t context = 0; // memory allocated for the context
    size_t compute = 0; // memory allocated for temporary compute buffers
};

struct llama_context {
    // init scheduler and compute buffers, reserve worst-case graphs
    llama_context(
@@ -46,10 +54,8 @@ struct llama_context {

    llama_memory_t get_memory() const;

    // return true if the KV cache was updated
    // TODO: remove
    bool kv_self_update(bool optimize);
    void kv_self_defrag_sched();
    // return true if the memory was updated
    bool memory_update(bool optimize);

    enum llama_pooling_type pooling_type() const;

@@ -111,9 +117,9 @@ struct llama_context {
    size_t state_get_data(      uint8_t * dst, size_t size);
    size_t state_set_data(const uint8_t * src, size_t size);

    size_t state_seq_get_size(llama_seq_id seq_id);
    size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size);
    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
    size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
    size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags);
    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);

    bool state_load_file(
            const char * filepath,
@@ -146,12 +152,15 @@ struct llama_context {
    llama_perf_context_data perf_get_data() const;
    void perf_reset();

    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;

    //
    // training
    //

    void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);

    // TODO: more flexible combinations of logical/physical batch size and context size
    void opt_epoch(
            ggml_opt_dataset_t dataset,
            ggml_opt_result_t  result_train,
@@ -197,7 +206,7 @@ public:
    ggml_status graph_compute(ggml_cgraph * gf, bool batched);

    // reserve a graph with a dummy ubatch of the specified size
    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);

private:
    llm_graph_params graph_params(
@@ -212,8 +221,8 @@ private:
    size_t state_write_data(llama_io_write_i & io);
    size_t state_read_data (llama_io_read_i & io);

    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
    size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id);
    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
    size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);

    //
    // members
@@ -229,9 +238,6 @@ private:

    std::unique_ptr<llama_memory_i> memory;

    // TODO: temporary, until the llama_kv_self_defrag() API is removed
    bool memory_force_optimize = false;

    // decode output (2-dimensional array: [n_outputs][n_vocab])
    size_t  logits_size = 0; // capacity (of floats) for logits
    float * logits      = nullptr;
@@ -287,10 +293,6 @@ private:

    bool has_evaluated_once = false;

    // env: LLAMA_SET_ROWS (temporary)
    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
    bool supports_set_rows = true;

    // env: LLAMA_GRAPH_REUSE_DISABLE
    bool graph_reuse_disable = false;

3
llama/llama.cpp/src/llama-cparams.h
vendored
@@ -4,7 +4,7 @@

#include <cstdint>

#define LLAMA_MAX_SEQ 64
#define LLAMA_MAX_SEQ 256

struct llama_cparams {
    uint32_t n_ctx; // context size used during inference
@@ -24,7 +24,6 @@ struct llama_cparams {
    float yarn_attn_factor;
    float yarn_beta_fast;
    float yarn_beta_slow;
    float defrag_thold;

    bool embeddings;
    bool causal_attn;

244
llama/llama.cpp/src/llama-graph.cpp
vendored
@@ -4,8 +4,8 @@
#include "llama-batch.h"
#include "llama-cparams.h"

#include "llama-kv-cache-unified.h"
#include "llama-kv-cache-unified-iswa.h"
#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-recurrent.h"

@@ -204,7 +204,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
    std::vector<int> target_pos(n_seqs_unq, -1);
    std::vector<int> target_row(n_seqs_unq, -1);

    bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
    const bool last = (
        cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
        (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
    );

    for (int i = 0; i < n_tokens; ++i) {
        const llama_pos pos = ubatch->pos[i];
@@ -258,6 +261,36 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
    }
}

static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
    LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
    const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE)      ? "LLAMA_SWA_TYPE_NONE" :
                                (swa_type == LLAMA_SWA_TYPE_STANDARD)  ? "LLAMA_SWA_TYPE_STANDARD" :
                                (swa_type == LLAMA_SWA_TYPE_CHUNKED)   ? "LLAMA_SWA_TYPE_CHUNKED" :
                                (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
    LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swa_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
    LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
    LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);

    LLAMA_LOG_DEBUG(" ");
    for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
        LLAMA_LOG_DEBUG("%2d", j);
    }
    LLAMA_LOG_DEBUG("\n");

    for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
        LLAMA_LOG_DEBUG(" %2d ", i);
        for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
            float val = data[i * n_kv + j];
            if (val == -INFINITY) {
                LLAMA_LOG_DEBUG(" ∞");
            } else {
                LLAMA_LOG_DEBUG(" 0");
            }
        }
        LLAMA_LOG_DEBUG("\n");
    }
}

void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
    const int64_t n_kv     = ubatch->n_tokens;
    const int64_t n_tokens = ubatch->n_tokens;
@@ -267,6 +300,9 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {

    float * data = (float *) kq_mask->data;

    // [TAG_NO_CACHE_ISWA]
    GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");

    for (int h = 0; h < 1; ++h) {
        for (int i1 = 0; i1 < n_tokens; ++i1) {
            const llama_seq_id s1 = ubatch->seq_id[i1][0];
@@ -277,32 +313,44 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
                    const llama_seq_id s0 = ubatch->seq_id[i0][0];

                    if (s0 != s1) {
                        continue; // skip different sequences
                    }

                    if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
                        continue; // skip future tokens for causal attention
                    }

                    // TODO: this does not take into account that some layers are SWA and others are not (i.e. iSWA) [TAG_NO_CACHE_ISWA]
                    //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
                    //    continue; // skip masked tokens for SWA
                    //}

                    // TODO: reimplement this like in llama_kv_cache_unified
                    if (s0 == s1 && (!cparams.causal_attn || ubatch->pos[i0] <= ubatch->pos[i1])) {
                        if (hparams.use_alibi) {
                            f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
                        } else {
                            f = 0.0f;
                        }
                        break;
                    if (hparams.use_alibi) {
                        f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
                    } else {
                        f = 0.0f;
                    }
                }

                data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
            }
        }
    }
    if (debug) {
        print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
    }
}

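In the no-cache path above the mask is a dense [n_kv x n_tokens] buffer: entry i1*n_kv + i0 is 0.0f when query token i1 may attend to key token i0 (same sequence, not in the future for causal attention), and -INFINITY otherwise. A self-contained sketch of that fill rule for a single-sequence causal batch, without any ggml types:

#include <cmath>
#include <vector>

// editor's sketch, not part of the diff: causal mask for one sequence,
// laid out as data[i1*n_kv + i0] like the h == 0 slice above
std::vector<float> make_causal_mask(const std::vector<int> & pos) {
    const int n = (int) pos.size(); // n_tokens == n_kv in the no-cache path
    std::vector<float> mask(n * n, -INFINITY);
    for (int i1 = 0; i1 < n; ++i1) {     // query token
        for (int i0 = 0; i0 < n; ++i0) { // key/value token
            if (pos[i0] <= pos[i1]) {
                mask[i1 * n + i0] = 0.0f; // can attend
            }
        }
    }
    return mask;
}
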
void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
    mctx->set_input_k_idxs(self_k_idxs, ubatch);
    mctx->set_input_v_idxs(self_v_idxs, ubatch);

    mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}

bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params) {
    const auto * mctx = static_cast<const llama_kv_cache_unified_context *>(params.mctx);
bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
    const auto * mctx = static_cast<const llama_kv_cache_context *>(params.mctx);

    this->mctx = mctx;

@@ -314,12 +362,10 @@ bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params)
    res &= self_kq_mask->ne[0] == mctx->get_n_kv();
    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

    res &= mctx->get_supports_set_rows(); // TODO: tmp

    return res;
}

void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
    mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
    mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);

@@ -331,8 +377,8 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch
    mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}

bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & params) {
    const auto * mctx = static_cast<const llama_kv_cache_unified_iswa_context *>(params.mctx);
bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
    const auto * mctx = static_cast<const llama_kv_cache_iswa_context *>(params.mctx);

    this->mctx = mctx;

@@ -350,8 +396,6 @@ bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & pa
    res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
    res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

    res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp

    return res;
}

@@ -879,15 +923,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
        selection_probs = logits;
    }

    if (arch == LLM_ARCH_GROVEMOE) {
        selection_probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
        cb(selection_probs, "ffn_moe_probs_biased", il);
    }

    // select experts
    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
    cb(selected_experts->src[0], "ffn_moe_argsort", il);
    cb(selected_experts, "ffn_moe_topk", il);

    ggml_tensor * weights = ggml_get_rows(ctx0,
            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
    if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
        // TODO: Use scalar div instead when/if implemented
        ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
        selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
        probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);
    } else {
        probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
    }

    ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
    cb(weights, "ffn_moe_weights", il);

    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
        weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
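
In the GROVEMOE branch above, each selected expert id is remapped to id / n_group_experts before the ggml_get_rows on probs. Since ggml has no scalar integer-division op, the diff emulates it with a cast to F32, a scale by 1/n_group_experts, and a cast back to I32. The intended mapping, written as plain C++ for clarity:

// editor's sketch, not part of the diff: the remap that the F32
// scale-and-cast trick emulates — expert ids are grouped in blocks
// of n_group_experts, and we want the block (group) index
int32_t group_expert_id(int32_t selected_expert, int32_t n_group_experts) {
    return selected_expert / n_group_experts;
}
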
@@ -911,6 +969,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
        cb(weights, "ffn_moe_weights_scaled", il);
    }

    // call early so that topk-moe can be used
    ggml_build_forward_expand(gf, weights);

    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);

    if (weight_before_ffn) {
@@ -1136,7 +1197,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {
}

ggml_tensor * llm_graph_context::build_inp_cls() const {
    auto inp = std::make_unique<llm_graph_input_cls>(cparams);
    auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);

    auto & cur = inp->cls;

@@ -1186,7 +1247,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
}

ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
    const auto * mctx_cur = static_cast<const llama_kv_cache_unified_context *>(mctx);
    const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);

    auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, mctx_cur);

@@ -1223,15 +1284,16 @@ ggml_tensor * llm_graph_context::build_attn_mha(
        ggml_tensor * v,
        ggml_tensor * kq_b,
        ggml_tensor * kq_mask,
        ggml_tensor * v_mla,
        ggml_tensor * sinks,
        float kq_scale) const {
        ggml_tensor * v_mla,
        float kq_scale,
        int il) const {
    const bool v_trans = v->nb[1] > v->nb[2];

    // split the batch into streams if needed
    const auto n_stream = k->ne[3];

    q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream);
    q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);

    q = ggml_permute(ctx0, q, 0, 2, 1, 3);
    k = ggml_permute(ctx0, k, 0, 2, 1, 3);
@@ -1260,6 +1322,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(

        cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                                  hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
        cb(cur, LLAMA_TENSOR_NAME_FATTN, il);

        ggml_flash_attn_ext_add_sinks(cur, sinks);
        ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
@@ -1275,6 +1338,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
            // The permutations are noops and only change how the tensor data is interpreted.
            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
            cur = ggml_mul_mat(ctx0, v_mla, cur);
            cb(cur, "fattn_mla", il);
            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
            cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
#endif
@@ -1283,6 +1347,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
    } else {
        ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
        cb(kq, "kq", il);

        // note: this op tends to require high floating point range
        // while for some models F16 is enough, for others it is not, so we default to F32 here
@@ -1290,38 +1355,48 @@ ggml_tensor * llm_graph_context::build_attn_mha(

        if (arch == LLM_ARCH_GROK) {
            // need to do the following:
            // multiply by attn_output_multiplyer of 0.08838834764831845
            // multiply by attn_output_multiplier
            // and then :
            // kq = 30 * tanh(kq / 30)
            // before the softmax below

            kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
            kq = ggml_scale(ctx0, kq, 30);
            kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, hparams.f_attn_out_scale / hparams.f_attn_logit_softcapping));
            cb(kq, "kq_tanh", il);
            kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
            cb(kq, "kq_scaled", il);
        }

        if (hparams.attn_soft_cap) {
            kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping);
            cb(kq, "kq_scaled_1", il);
            kq = ggml_tanh (ctx0, kq);
            cb(kq, "kq_tanh", il);
            kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
            cb(kq, "kq_scaled_2", il);
        }

        if (kq_b) {
            kq = ggml_add(ctx0, kq, kq_b);
            cb(kq, "kq_plus_kq_b", il);
        }

        kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
        ggml_soft_max_add_sinks(kq, sinks);
        cb(kq, "kq_soft_max", il);

        if (!v_trans) {
            // note: avoid this branch
            v = ggml_cont(ctx0, ggml_transpose(ctx0, v));
            cb(v, "v_cont", il);
        }

        ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
        cb(kqv, "kqv", il);

        // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
        if (v_mla) {
            kqv = ggml_mul_mat(ctx0, v_mla, kqv);
            cb(kqv, "kqv_mla", il);
        }

        cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
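
Both the GROK branch and the generic attn_soft_cap branch above compose the same logit soft-capping out of scale/tanh/scale: kq = c * tanh(kq / c), which smoothly bounds every attention logit to (-c, c) before the softmax, with c = hparams.f_attn_logit_softcapping. A minimal scalar sketch of the operation:

#include <cmath>

// editor's sketch, not part of the diff: scalar form of attention logit
// soft-capping; near 0 it is ~identity, for large |x| it saturates at ±c
float soft_cap(float x, float c) {
    return c * std::tanh(x / c);
}
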
@@ -1360,6 +1435,7 @@ ggml_tensor * llm_graph_context::build_attn(
        ggml_tensor * k_cur,
        ggml_tensor * v_cur,
        ggml_tensor * kq_b,
        ggml_tensor * sinks,
        ggml_tensor * v_mla,
        float kq_scale,
        int il) const {
@@ -1375,13 +1451,14 @@ ggml_tensor * llm_graph_context::build_attn(

    // [TAG_NO_CACHE_PAD]
    // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
    assert(!ubatch.equal_seqs());
    // but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
    //assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));

    ggml_tensor * q = q_cur;
    ggml_tensor * k = k_cur;
    ggml_tensor * v = v_cur;

    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
    cb(cur, "kqv_out", il);

    if (wo) {
@@ -1399,17 +1476,17 @@ ggml_tensor * llm_graph_context::build_attn(
    return cur;
}

static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unified_impl(
static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
        ggml_context * ctx0,
        const llama_ubatch & ubatch,
        const llama_hparams & hparams,
        const llama_cparams & cparams,
        const llama_kv_cache_unified_context * mctx_cur) {
        const llama_kv_cache_context * mctx_cur) {

    auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, mctx_cur);
    auto inp = std::make_unique<llm_graph_input_attn_kv>(hparams, cparams, mctx_cur);

    {
        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA");
        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");

        const auto n_kv     = mctx_cur->get_n_kv();
        const auto n_tokens = ubatch.n_tokens;
@@ -1427,22 +1504,23 @@ static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unifie
    return inp;
}

llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
    const auto * mctx_cur = static_cast<const llama_kv_cache_unified_context *>(mctx);
llm_graph_input_attn_kv * llm_graph_context::build_attn_inp_kv() const {
    const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);

    auto inp = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
    auto inp = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur);

    return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
    return (llm_graph_input_attn_kv *) res->add_input(std::move(inp));
}

ggml_tensor * llm_graph_context::build_attn(
        llm_graph_input_attn_kv_unified * inp,
        llm_graph_input_attn_kv * inp,
        ggml_tensor * wo,
        ggml_tensor * wo_b,
        ggml_tensor * q_cur,
        ggml_tensor * k_cur,
        ggml_tensor * v_cur,
        ggml_tensor * kq_b,
        ggml_tensor * sinks,
        ggml_tensor * v_mla,
        float kq_scale,
        int il) const {
@@ -1469,7 +1547,7 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_tensor * k = mctx_cur->get_k(ctx0, il);
    ggml_tensor * v = mctx_cur->get_v(ctx0, il);

    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
    cb(cur, "kqv_out", il);

    if (wo) {
@@ -1488,40 +1566,15 @@ ggml_tensor * llm_graph_context::build_attn(
}

ggml_tensor * llm_graph_context::build_attn(
        llm_graph_input_attn_kv_unified_iswa * inp,
        llm_graph_input_attn_kv_iswa * inp,
        ggml_tensor * wo,
        ggml_tensor * wo_b,
        ggml_tensor * q_cur,
        ggml_tensor * k_cur,
        ggml_tensor * v_cur,
        ggml_tensor * kq_b,
        ggml_tensor * v_mla,
        float kq_scale,
        int il) const {
    return build_attn_with_sinks(
        inp,
        wo,
        wo_b,
        q_cur,
        k_cur,
        v_cur,
        kq_b,
        v_mla,
        nullptr,
        kq_scale,
        il);
}

ggml_tensor * llm_graph_context::build_attn_with_sinks(
        llm_graph_input_attn_kv_unified_iswa * inp,
        ggml_tensor * wo,
        ggml_tensor * wo_b,
        ggml_tensor * q_cur,
        ggml_tensor * k_cur,
        ggml_tensor * v_cur,
        ggml_tensor * kq_b,
        ggml_tensor * v_mla,
        ggml_tensor * sinks,
        ggml_tensor * v_mla,
        float kq_scale,
        int il) const {
    // these nodes are added to the graph together so that they are not reordered
@@ -1561,7 +1614,7 @@ ggml_tensor * llm_graph_context::build_attn_with_sinks(
    ggml_tensor * k = mctx_cur->get_k(ctx0, il);
    ggml_tensor * v = mctx_cur->get_v(ctx0, il);

    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, sinks, kq_scale);
    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
    cb(cur, "kqv_out", il);

    if (wo) {
@@ -1600,6 +1653,7 @@ ggml_tensor * llm_graph_context::build_attn(
        ggml_tensor * k_cur,
        ggml_tensor * v_cur,
        ggml_tensor * kq_b,
        ggml_tensor * sinks,
        ggml_tensor * v_mla,
        float kq_scale,
        int il) const {
@@ -1615,7 +1669,7 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_tensor * k = k_cur;
    ggml_tensor * v = v_cur;

    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
    cb(cur, "kqv_out", il);

    if (wo) {
@@ -1636,10 +1690,10 @@ ggml_tensor * llm_graph_context::build_attn(
// TODO: maybe separate the inner implementation into a separate function
// like with the non-sliding window equivalent
// once sliding-window hybrid caches are a thing.
llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
    const auto * mctx_cur = static_cast<const llama_kv_cache_unified_iswa_context *>(mctx);
llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const {
    const auto * mctx_cur = static_cast<const llama_kv_cache_iswa_context *>(mctx);

    auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, mctx_cur);
    auto inp = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, mctx_cur);

    const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;

@@ -1656,7 +1710,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
    }

    {
        GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA");
        GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache for non-SWA");

        const auto n_kv = mctx_cur->get_swa()->get_n_kv();

@@ -1669,7 +1723,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
    }

    return (llm_graph_input_attn_kv_unified_iswa *) res->add_input(std::move(inp));
    return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
}

ggml_tensor * llm_graph_context::build_rs(
@@ -1792,7 +1846,7 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
    const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);

    auto inp_rs   = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
    auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
    auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());

    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);

@@ -1843,34 +1897,32 @@ void llm_graph_context::build_pooling(
        case LLAMA_POOLING_TYPE_RANK:
            {
                ggml_tensor * inp_cls = build_inp_cls();
                inp = ggml_get_rows(ctx0, inp, inp_cls);
                cur = ggml_get_rows(ctx0, inp, inp_cls);

                // classification head
                // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
                if (cls) {
                    // classification head
                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
                    cur = ggml_mul_mat(ctx0, cls, inp);
                    cur = ggml_mul_mat(ctx0, cls, cur);
                    if (cls_b) {
                        cur = ggml_add(ctx0, cur, cls_b);
                    }
                    cur = ggml_tanh(ctx0, cur);
                }

                // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
                // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
                if (cls_out) {
                    cur = ggml_mul_mat(ctx0, cls_out, cur);
                    if (cls_out_b) {
                        cur = ggml_add(ctx0, cur, cls_out_b);
                    }
                }
                } else if (cls_out) {
                    // Single layer classification head (direct projection)
                    // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
                    cur = ggml_mul_mat(ctx0, cls_out, inp);
                // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
                // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
                // Single layer classification head (direct projection)
                // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
                if (cls_out) {
                    cur = ggml_mul_mat(ctx0, cls_out, cur);
                    if (cls_out_b) {
                        cur = ggml_add(ctx0, cur, cls_out_b);
                    }
                } else {
                    GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
                }

                // softmax for qwen3 reranker
                if (arch == LLM_ARCH_QWEN3) {
                    cur = ggml_soft_max(ctx0, cur);
                }
            } break;
        default:

94
llama/llama.cpp/src/llama-graph.h
vendored
@@ -19,8 +19,8 @@ struct llama_cparams;

struct llama_memory_context_i;

class llama_kv_cache_unified_context;
class llama_kv_cache_unified_iswa_context;
class llama_kv_cache_context;
class llama_kv_cache_iswa_context;
class llama_memory_recurrent_context;
class llama_memory_hybrid_context;

@@ -78,6 +78,11 @@ struct llm_graph_params;

class llm_graph_input_i {
public:
    llm_graph_input_i() {
        const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
        debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
    }

    virtual ~llm_graph_input_i() = default;

    virtual void set_input(const llama_ubatch * ubatch) = 0;
@@ -90,6 +95,9 @@ public:
        GGML_UNUSED(params);
        return false;
    }

protected:
    // env: LLAMA_GRAPH_INPUT_DEBUG
    int debug = 0;
};

using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
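
The new llm_graph_input_i constructor above reads LLAMA_GRAPH_INPUT_DEBUG once per graph input; any non-zero integer value enables debug dumps such as print_mask in llama-graph.cpp. The same env-var gating pattern in isolation:

#include <cstdlib>

// editor's sketch, not part of the diff: integer debug level from the
// environment, defaulting to 0 (disabled) when the variable is unset
static int read_debug_level(const char * name) {
    const char * val = getenv(name);
    return val ? atoi(val) : 0;
}
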
@@ -152,7 +160,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
public:
    llm_graph_input_pos_bucket_kv(
            const llama_hparams & hparams,
            const llama_kv_cache_unified_context * mctx) : hparams(hparams), mctx(mctx) {}
            const llama_kv_cache_context * mctx) : hparams(hparams), mctx(mctx) {}
    virtual ~llm_graph_input_pos_bucket_kv() = default;

    void set_input(const llama_ubatch * ubatch) override;
@@ -161,7 +169,7 @@ public:

    const llama_hparams hparams;

    const llama_kv_cache_unified_context * mctx;
    const llama_kv_cache_context * mctx;
};

class llm_graph_input_out_ids : public llm_graph_input_i {
@@ -198,7 +206,7 @@ public:

class llm_graph_input_cls : public llm_graph_input_i {
public:
    llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
    llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
    virtual ~llm_graph_input_cls() = default;

    void set_input(const llama_ubatch * ubatch) override;
@@ -206,6 +214,7 @@ public:
    ggml_tensor * cls; // I32 [n_batch]

    const llama_cparams cparams;
    const llm_arch arch;
};

class llm_graph_input_rs : public llm_graph_input_i {
@@ -257,17 +266,17 @@ public:
    const llama_cparams cparams;
};

class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
class llm_graph_input_attn_kv : public llm_graph_input_i {
public:
    llm_graph_input_attn_kv_unified(
    llm_graph_input_attn_kv(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            const llama_kv_cache_unified_context * mctx) :
            const llama_kv_cache_context * mctx) :
        hparams(hparams),
        cparams(cparams),
        mctx(mctx) {
    }
    ~llm_graph_input_attn_kv_unified() = default;
    ~llm_graph_input_attn_kv() = default;

    void set_input(const llama_ubatch * ubatch) override;

@@ -290,20 +299,20 @@ public:
    const llama_hparams hparams;
    const llama_cparams cparams;

    const llama_kv_cache_unified_context * mctx;
    const llama_kv_cache_context * mctx;
};

class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
public:
    llm_graph_input_attn_kv_unified_iswa(
    llm_graph_input_attn_kv_iswa(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            const llama_kv_cache_unified_iswa_context * mctx) :
            const llama_kv_cache_iswa_context * mctx) :
        hparams(hparams),
        cparams(cparams),
        mctx(mctx) {
    }
    ~llm_graph_input_attn_kv_unified_iswa() = default;
    ~llm_graph_input_attn_kv_iswa() = default;

    void set_input(const llama_ubatch * ubatch) override;

@@ -330,7 +339,7 @@ public:
    const llama_hparams hparams;
    const llama_cparams cparams;

    const llama_kv_cache_unified_iswa_context * mctx;
    const llama_kv_cache_iswa_context * mctx;
};

class llm_graph_input_attn_cross : public llm_graph_input_i {
@@ -351,7 +360,7 @@ public:
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
public:
    llm_graph_input_mem_hybrid(
            std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn,
            std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
            std::unique_ptr<llm_graph_input_rs> inp_rs,
            const llama_memory_hybrid_context * mctx) :
        inp_attn(std::move(inp_attn)),
@@ -361,11 +370,11 @@ public:

    void set_input(const llama_ubatch * ubatch) override;

    std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn;
    std::unique_ptr<llm_graph_input_rs> inp_rs;
    std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
    std::unique_ptr<llm_graph_input_rs> inp_rs;

    llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); }
    llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
    llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
    llm_graph_input_rs * get_recr() const { return inp_rs.get(); }

    const llama_memory_hybrid_context * mctx;
};
@@ -680,14 +689,15 @@ struct llm_graph_context {
    //

    ggml_tensor * build_attn_mha(
            ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
            ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
            ggml_tensor * kq_b,
            ggml_tensor * kq_mask,
            ggml_tensor * sinks,
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            float kq_scale) const;
            ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
            ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
            ggml_tensor * kq_b,
            ggml_tensor * kq_mask,
            ggml_tensor * sinks, // [n_head_q]
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            float kq_scale,
            int il) const;

    llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;

@@ -699,50 +709,39 @@ struct llm_graph_context {
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
            ggml_tensor * sinks, // [n_head_q]
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            float kq_scale,
            int il) const;

    llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const;
    llm_graph_input_attn_kv * build_attn_inp_kv() const;

    ggml_tensor * build_attn(
            llm_graph_input_attn_kv_unified * inp,
            llm_graph_input_attn_kv * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
            ggml_tensor * sinks, // [n_head_q]
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            float kq_scale,
            int il) const;

    llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
    llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;

    // note: if k_cur or v_cur are not provided, they will not be stored in the memory
    ggml_tensor * build_attn(
            llm_graph_input_attn_kv_unified_iswa * inp,
            llm_graph_input_attn_kv_iswa * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
            ggml_tensor * kq_b,
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            float kq_scale,
            int il) const;

    // TODO: temporary to keep the diff small. after the code is public will refactor to simplify this
    ggml_tensor * build_attn_with_sinks(
            llm_graph_input_attn_kv_unified_iswa * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
            ggml_tensor * kq_b,
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            ggml_tensor * sinks, // [n_head_q]
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            float kq_scale,
            int il) const;

@@ -756,6 +755,7 @@ struct llm_graph_context {
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
            ggml_tensor * sinks, // [n_head_q]
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            float kq_scale,
            int il) const;
@@ -765,7 +765,7 @@ struct llm_graph_context {
    //

    // TODO: move this implementation to llama_memory_recurrent.
    // this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
    // this is analogous to llama_kv_cache::cpy_k / cpy_v
    // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
    // implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
    // `llama_memory_recurrent`

62
llama/llama.cpp/src/llama-hparams.cpp
vendored
@@ -1,6 +1,7 @@
#include "llama-hparams.h"

#include "ggml.h"
#include <cassert>

void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
    if (dense_first) {
@@ -161,3 +162,64 @@ bool llama_hparams::is_swa(uint32_t il) const {

    GGML_ABORT("fatal error");
}

bool llama_hparams::has_kv(uint32_t il) const {
    if (n_layer_kv_from_start >= 0) {
        if (il < (uint32_t) n_layer_kv_from_start) {
            return true;
        }

        return false;
    }

    // by default, all layers have kv
    return true;
}

uint32_t llama_hparams::n_layer_kv() const {
    uint32_t res = 0;

    for (uint32_t il = 0; il < n_layer; ++il) {
        if (has_kv(il)) {
            res++;
        }
    }

    return res;
}

bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
    assert(p0 >= 0 && p1 >= 0);

    switch (swa_type) {
        case LLAMA_SWA_TYPE_NONE:
            {
            } break;
        case LLAMA_SWA_TYPE_STANDARD:
            {
                if (p1 - p0 >= (int32_t) n_swa) {
                    return true;
                }
            } break;
        case LLAMA_SWA_TYPE_CHUNKED:
            {
                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;

                if (p0 < pos_chunk_start) {
                    return true;
                }
            } break;
        case LLAMA_SWA_TYPE_SYMMETRIC:
            {
                const int32_t half_n_swa = (int32_t) n_swa / 2;
                const int32_t pos_diff   = p1 - p0;

                // Mask if outside the symmetric window
                if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
                    return true;
                }
            } break;
    }

    return false;
}

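is_masked_swa answers "is key position p0 hidden from query position p1?". With n_swa = 4: STANDARD masks once p1 - p0 >= 4; CHUNKED masks any p0 before the start of p1's chunk; SYMMETRIC uses a half-window of n_swa / 2 = 2 on both sides. A hedged spot check (assuming llama-hparams.h from this diff is visible, since the predicate is a static member):

#include <cassert>

// editor's sketch, not part of the diff: spot-checking the predicate with n_swa = 4
void check_swa_masking() {
    assert( llama_hparams::is_masked_swa(4, LLAMA_SWA_TYPE_STANDARD,  0, 4)); // 4 - 0 >= 4 -> masked
    assert(!llama_hparams::is_masked_swa(4, LLAMA_SWA_TYPE_STANDARD,  1, 4)); // 4 - 1 <  4 -> visible
    assert( llama_hparams::is_masked_swa(4, LLAMA_SWA_TYPE_SYMMETRIC, 0, 3)); // |3 - 0| >  2 -> masked
    assert(!llama_hparams::is_masked_swa(4, LLAMA_SWA_TYPE_SYMMETRIC, 2, 4)); // |4 - 2| <= 2 -> visible
}
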
40
llama/llama.cpp/src/llama-hparams.h
vendored
@@ -16,9 +16,10 @@ enum llama_expert_gating_func_type {
};

enum llama_swa_type {
    LLAMA_SWA_TYPE_NONE     = 0,
    LLAMA_SWA_TYPE_STANDARD = 1,
    LLAMA_SWA_TYPE_CHUNKED  = 2,
    LLAMA_SWA_TYPE_NONE      = 0,
    LLAMA_SWA_TYPE_STANDARD  = 1,
    LLAMA_SWA_TYPE_CHUNKED   = 2,
    LLAMA_SWA_TYPE_SYMMETRIC = 3,
};

struct llama_hparams_posnet {
@@ -41,6 +42,7 @@ struct llama_hparams {
    uint32_t n_embd;
    uint32_t n_embd_features = 0;
    uint32_t n_layer;
    int32_t  n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
    uint32_t n_rot;
    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
@@ -69,10 +71,13 @@ struct llama_hparams {
    uint32_t n_lora_kv = 0;
    uint32_t n_ff_exp = 0;
    uint32_t n_ff_shexp = 0;
    uint32_t n_ff_chexp = 0;
    uint32_t n_expert_shared = 0;
    uint32_t n_norm_groups = 0;
    uint32_t n_group_experts = 0;

    float expert_weights_scale = 0.0;
    float expert_group_scale   = 0.05f;
    float expert_weights_scale = 0.0f;
    bool  expert_weights_norm  = false;
    uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
    uint32_t moe_every_n_layers = 0;
@@ -82,8 +87,9 @@ struct llama_hparams {
    float f_norm_rms_eps;
    float f_norm_group_eps;

    float f_attn_logit_softcapping  = 50.0f;
    float f_final_logit_softcapping = 30.0f;
    float f_attn_logit_softcapping   = 50.0f;
    float f_router_logit_softcapping = 30.0f;
    float f_final_logit_softcapping  = 30.0f;

    // for RWKV
    uint32_t rescale_every_n_layers = 0;
@@ -104,6 +110,11 @@ struct llama_hparams {
    uint32_t n_ctx_orig_yarn;
    float    rope_yarn_log_mul = 0.0f;

    float yarn_ext_factor  = -1.0f;
    float yarn_attn_factor =  1.0f;
    float yarn_beta_fast   = 32.0f;
    float yarn_beta_slow   =  1.0f;

    std::array<int, 4> rope_sections;

    // Sliding Window Attention (SWA)
@@ -136,10 +147,14 @@ struct llama_hparams {
    float f_embedding_scale = 0.0f;
    float f_attention_scale = 0.0f;

    // grok-2
    float f_attn_out_scale = 0.0f;
    uint32_t attn_temp_length = 0;

    bool causal_attn   = true;
    bool use_alibi     = false;
    bool attn_soft_cap = false;
    bool use_kq_norm   = true;
    bool use_kq_norm   = false;

    // for Classifiers
    uint32_t n_cls_out = 1;
@@ -159,6 +174,7 @@ struct llama_hparams {
    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
    uint32_t dec_n_layer = 0;

    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
    enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
@@ -226,6 +242,16 @@ struct llama_hparams {
    bool n_bskcn(uint32_t n, uint32_t il) const;

    bool is_swa(uint32_t il) const;

    bool has_kv(uint32_t il) const;

    // number of layers for which has_kv() returns true
    uint32_t n_layer_kv() const;

    // note that this function uses different SWA parameters from those in the hparams
    // TODO: think of a better place for this function
    // TODO: pack the SWA params in a struct?
    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

2
llama/llama.cpp/src/llama-impl.h
vendored
@@ -59,3 +59,5 @@ std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
std::string llama_format_tensor_shape(const struct ggml_tensor * t);

std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);

#define LLAMA_TENSOR_NAME_FATTN "__fattn__"

@@ -1,4 +1,4 @@
#include "llama-kv-cache-unified-iswa.h"
#include "llama-kv-cache-iswa.h"

#include "llama-impl.h"
#include "llama-batch.h"
@@ -8,10 +8,10 @@
#include <cassert>

//
// llama_kv_cache_unified_iswa
// llama_kv_cache_iswa
//

llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
llama_kv_cache_iswa::llama_kv_cache_iswa(
        const llama_model & model,
        ggml_type type_k,
        ggml_type type_v,
@@ -22,9 +22,26 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
        uint32_t kv_size,
        uint32_t n_seq_max,
        uint32_t n_ubatch,
        uint32_t n_pad) : hparams(model.hparams), unified(unified) {
    llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
    llama_kv_cache_unified::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
        uint32_t n_pad,
        const layer_filter_cb & filter,
        const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {

    // chain filters
    const layer_filter_cb filter_base = [&](int32_t il) {
        if (filter && !filter(il)) {
            return false;
        }

        return !model.hparams.is_swa(il);
    };

    const layer_filter_cb filter_swa = [&](int32_t il) {
        if (filter && !filter(il)) {
            return false;
        }

        return model.hparams.is_swa(il);
    };

    const uint32_t size_base = kv_size;

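
Instead of owning both layer filters, the iSWA cache now composes an optional caller-supplied filter with its own SWA/non-SWA predicate: the outer filter gets veto power, then the local predicate decides. The chaining pattern in isolation:

#include <cstdint>
#include <functional>

// editor's sketch, not part of the diff: compose an optional outer filter
// with a local predicate, as filter_base/filter_swa do above
using layer_filter = std::function<bool(int32_t)>;

layer_filter chain(layer_filter outer, layer_filter local) {
    return [outer, local](int32_t il) {
        if (outer && !outer(il)) {
            return false; // outer filter rejects the layer
        }
        return local(il);
    };
}
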
@@ -40,25 +57,25 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(

    LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);

    kv_base = std::make_unique<llama_kv_cache_unified>(
            model, std::move(filter_base), type_k, type_v,
    kv_base = std::make_unique<llama_kv_cache>(
            model, type_k, type_v,
            v_trans, offload, unified, size_base, n_seq_max, n_pad,
            0, LLAMA_SWA_TYPE_NONE);
            0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);

    LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);

    kv_swa = std::make_unique<llama_kv_cache_unified>(
            model, std::move(filter_swa), type_k, type_v,
    kv_swa = std::make_unique<llama_kv_cache>(
            model, type_k, type_v,
            v_trans, offload, unified, size_swa, n_seq_max, n_pad,
            hparams.n_swa, hparams.swa_type);
            hparams.n_swa, hparams.swa_type, filter_swa, reuse);
}

void llama_kv_cache_unified_iswa::clear(bool data) {
void llama_kv_cache_iswa::clear(bool data) {
    kv_base->clear(data);
    kv_swa ->clear(data);
}

bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
bool llama_kv_cache_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
    bool res = true;

    res = res & kv_base->seq_rm(seq_id, p0, p1);
@@ -67,36 +84,44 @@ bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llam
    return res;
}

void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
void llama_kv_cache_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
    kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
    kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}

void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
void llama_kv_cache_iswa::seq_keep(llama_seq_id seq_id) {
    kv_base->seq_keep(seq_id);
    kv_swa ->seq_keep(seq_id);
}

void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
void llama_kv_cache_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
    kv_base->seq_add(seq_id, p0, p1, shift);
    kv_swa ->seq_add(seq_id, p0, p1, shift);
}

void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
void llama_kv_cache_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
    kv_base->seq_div(seq_id, p0, p1, d);
    kv_swa ->seq_div(seq_id, p0, p1, d);
}

llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const {
llama_pos llama_kv_cache_iswa::seq_pos_min(llama_seq_id seq_id) const {
    // the base cache is a superset of the SWA cache, so we can just check the SWA cache
    return kv_swa->seq_pos_min(seq_id);
}

llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
    return kv_swa->seq_pos_max(seq_id);
}

llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
    for (const auto & buft_size : kv_swa->memory_breakdown()) {
        mb[buft_size.first] += buft_size.second;
    }
    return mb;
}

llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
    GGML_UNUSED(embd_all);

    // first try simple split
@@ -136,7 +161,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all

        assert(sinfos_base.size() == sinfos_swa.size());

        return std::make_unique<llama_kv_cache_unified_iswa_context>(
        return std::make_unique<llama_kv_cache_iswa_context>(
                this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
    } while (false);

@@ -172,61 +197,67 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all

        assert(sinfos_base.size() == sinfos_swa.size());

        return std::make_unique<llama_kv_cache_unified_iswa_context>(
        return std::make_unique<llama_kv_cache_iswa_context>(
                this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
    } while (false);

    // TODO: if we fail again, we should attempt different splitting strategies
    // but to do that properly, we first have to refactor the batches to be more flexible

    return std::make_unique<llama_kv_cache_unified_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
    return std::make_unique<llama_kv_cache_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}

llama_memory_context_ptr llama_kv_cache_unified_iswa::init_full() {
    return std::make_unique<llama_kv_cache_unified_iswa_context>(this);
llama_memory_context_ptr llama_kv_cache_iswa::init_full() {
    return std::make_unique<llama_kv_cache_iswa_context>(this);
}

llama_memory_context_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) {
    return std::make_unique<llama_kv_cache_unified_iswa_context>(this, lctx, optimize);
llama_memory_context_ptr llama_kv_cache_iswa::init_update(llama_context * lctx, bool optimize) {
    return std::make_unique<llama_kv_cache_iswa_context>(this, lctx, optimize);
}

bool llama_kv_cache_unified_iswa::get_can_shift() const {
bool llama_kv_cache_iswa::get_can_shift() const {
    return kv_base->get_size() == kv_swa->get_size();
}

void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
    kv_base->state_write(io, seq_id);
    kv_swa ->state_write(io, seq_id);
void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
        kv_base->state_write(io, seq_id, flags);
    }

    kv_swa->state_write(io, seq_id, flags);
}

void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
    kv_base->state_read(io, seq_id);
    kv_swa ->state_read(io, seq_id);
void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
        kv_base->state_read(io, seq_id, flags);
    }

    kv_swa->state_read(io, seq_id, flags);
||||
}
|
||||
|
||||
llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
|
||||
llama_kv_cache * llama_kv_cache_iswa::get_base() const {
|
||||
return kv_base.get();
|
||||
}
|
||||
|
||||
llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_swa() const {
|
||||
llama_kv_cache * llama_kv_cache_iswa::get_swa() const {
|
||||
return kv_swa.get();
|
||||
}
|
||||
|
||||
//
|
||||
// llama_kv_cache_unified_iswa_context
|
||||
// llama_kv_cache_iswa_context
|
||||
//
|
||||
|
||||
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(llama_memory_status status) : status(status) {}
|
||||
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(llama_memory_status status) : status(status) {}
|
||||
|
||||
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
|
||||
llama_kv_cache_unified_iswa * kv) :
|
||||
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
|
||||
llama_kv_cache_iswa * kv) :
|
||||
ctx_base(kv->get_base()->init_full()),
|
||||
ctx_swa (kv->get_swa ()->init_full()),
|
||||
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
|
||||
}
|
||||
|
||||
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
|
||||
llama_kv_cache_unified_iswa * kv,
|
||||
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
|
||||
llama_kv_cache_iswa * kv,
|
||||
llama_context * lctx,
|
||||
bool optimize) :
|
||||
ctx_base(kv->get_base()->init_update(lctx, optimize)),
|
||||
@@ -234,21 +265,21 @@ llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
|
||||
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
|
||||
}
|
||||
|
||||
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
|
||||
llama_kv_cache_unified_iswa * kv,
|
||||
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
|
||||
llama_kv_cache_iswa * kv,
|
||||
slot_info_vec_t sinfos_base,
|
||||
slot_info_vec_t sinfos_swa,
|
||||
std::vector<llama_ubatch> ubatches) :
|
||||
ubatches(std::move(ubatches)),
|
||||
// note: here we copy the ubatches. not sure if this is ideal
|
||||
ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
|
||||
ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
|
||||
ctx_base(new llama_kv_cache_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
|
||||
ctx_swa (new llama_kv_cache_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
|
||||
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
|
||||
}
|
||||
|
||||
llama_kv_cache_unified_iswa_context:: ~llama_kv_cache_unified_iswa_context() = default;
|
||||
llama_kv_cache_iswa_context:: ~llama_kv_cache_iswa_context() = default;
|
||||
|
||||
bool llama_kv_cache_unified_iswa_context::next() {
|
||||
bool llama_kv_cache_iswa_context::next() {
|
||||
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
||||
|
||||
ctx_base->next();
|
||||
@@ -261,7 +292,7 @@ bool llama_kv_cache_unified_iswa_context::next() {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified_iswa_context::apply() {
|
||||
bool llama_kv_cache_iswa_context::apply() {
|
||||
assert(!llama_memory_status_is_fail(status));
|
||||
|
||||
bool res = true;
|
||||
@@ -272,24 +303,24 @@ bool llama_kv_cache_unified_iswa_context::apply() {
|
||||
return res;
|
||||
}
|
||||
|
||||
llama_memory_status llama_kv_cache_unified_iswa_context::get_status() const {
|
||||
llama_memory_status llama_kv_cache_iswa_context::get_status() const {
|
||||
return status;
|
||||
}
|
||||
|
||||
const llama_ubatch & llama_kv_cache_unified_iswa_context::get_ubatch() const {
|
||||
const llama_ubatch & llama_kv_cache_iswa_context::get_ubatch() const {
|
||||
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
||||
|
||||
return ubatches[i_next];
|
||||
}
|
||||
|
||||
const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_base() const {
|
||||
const llama_kv_cache_context * llama_kv_cache_iswa_context::get_base() const {
|
||||
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
||||
|
||||
return static_cast<const llama_kv_cache_unified_context *>(ctx_base.get());
|
||||
return static_cast<const llama_kv_cache_context *>(ctx_base.get());
|
||||
}
|
||||
|
||||
const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_swa() const {
|
||||
const llama_kv_cache_context * llama_kv_cache_iswa_context::get_swa() const {
|
||||
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
||||
|
||||
return static_cast<const llama_kv_cache_unified_context *>(ctx_swa.get());
|
||||
return static_cast<const llama_kv_cache_context *>(ctx_swa.get());
|
||||
}
|
||||
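The new state_write/state_read overloads above gate the base cache on the sequence-state flags. A minimal sketch of that gating, using toy stand-ins rather than the real llama.cpp types (the names toy_cache, FLAG_SWA_ONLY and iswa_state_write are illustrative only): when the SWA-only flag is set, the base cache is skipped and just the SWA cache is serialized.

#include <cstdint>
#include <cstdio>

// toy stand-ins for illustration only; the real types live in llama.cpp
using state_seq_flags = uint32_t;
static constexpr state_seq_flags FLAG_SWA_ONLY = 1; // models LLAMA_STATE_SEQ_FLAGS_SWA_ONLY

struct toy_cache {
    const char * name;
    void write(state_seq_flags flags) const { printf("writing %s (flags=%u)\n", name, flags); }
};

// mirrors the shape of llama_kv_cache_iswa::state_write: the base cache is
// skipped when the caller asks for the SWA portion only
static void iswa_state_write(const toy_cache & base, const toy_cache & swa, state_seq_flags flags) {
    if ((flags & FLAG_SWA_ONLY) == 0) {
        base.write(flags);
    }
    swa.write(flags);
}

int main() {
    toy_cache base{"base"}, swa{"swa"};
    iswa_state_write(base, swa, 0);             // writes both caches
    iswa_state_write(base, swa, FLAG_SWA_ONLY); // writes only the SWA cache
}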
@@ -1,19 +1,19 @@
#pragma once

#include "llama-kv-cache-unified.h"
#include "llama-kv-cache.h"

#include <vector>

//
// llama_kv_cache_unified_iswa
// llama_kv_cache_iswa
//

// utilizes two instances of llama_kv_cache_unified
// utilizes two instances of llama_kv_cache
// the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers

class llama_kv_cache_unified_iswa : public llama_memory_i {
class llama_kv_cache_iswa : public llama_memory_i {
public:
llama_kv_cache_unified_iswa(
llama_kv_cache_iswa(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
@@ -24,9 +24,11 @@ public:
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_ubatch,
uint32_t n_pad);
uint32_t n_pad,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse);

~llama_kv_cache_unified_iswa() = default;
~llama_kv_cache_iswa() = default;

//
// llama_memory_i
@@ -54,52 +56,54 @@ public:
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

// state write/load

void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

//
// llama_kv_cache_unified_iswa specific API
// llama_kv_cache_iswa specific API
//

llama_kv_cache_unified * get_base() const;
llama_kv_cache_unified * get_swa () const;
llama_kv_cache * get_base() const;
llama_kv_cache * get_swa () const;

private:
const llama_hparams & hparams;

const bool unified;

std::unique_ptr<llama_kv_cache_unified> kv_base;
std::unique_ptr<llama_kv_cache_unified> kv_swa;
std::unique_ptr<llama_kv_cache> kv_base;
std::unique_ptr<llama_kv_cache> kv_swa;
};

class llama_kv_cache_unified_iswa_context : public llama_memory_context_i {
class llama_kv_cache_iswa_context : public llama_memory_context_i {
public:
using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;

// used for errors
llama_kv_cache_unified_iswa_context(llama_memory_status status);
llama_kv_cache_iswa_context(llama_memory_status status);

// used to create a full-cache context
llama_kv_cache_unified_iswa_context(
llama_kv_cache_unified_iswa * kv);
llama_kv_cache_iswa_context(
llama_kv_cache_iswa * kv);

// used to create an update context
llama_kv_cache_unified_iswa_context(
llama_kv_cache_unified_iswa * kv,
llama_kv_cache_iswa_context(
llama_kv_cache_iswa * kv,
llama_context * lctx,
bool optimize);

// used to create a batch processing context from a batch
llama_kv_cache_unified_iswa_context(
llama_kv_cache_unified_iswa * kv,
llama_kv_cache_iswa_context(
llama_kv_cache_iswa * kv,
slot_info_vec_t sinfos_base,
slot_info_vec_t sinfos_swa,
std::vector<llama_ubatch> ubatches);

virtual ~llama_kv_cache_unified_iswa_context();
virtual ~llama_kv_cache_iswa_context();

//
// llama_memory_context_i
@@ -112,14 +116,14 @@ public:
const llama_ubatch & get_ubatch() const override;

//
// llama_kv_cache_unified_iswa_context specific API
// llama_kv_cache_iswa_context specific API
//

const llama_kv_cache_unified_context * get_base() const;
const llama_kv_cache_unified_context * get_swa() const;
const llama_kv_cache_context * get_base() const;
const llama_kv_cache_context * get_swa() const;

private:
//llama_kv_cache_unified_iswa * kv;
//llama_kv_cache_iswa * kv;

// the index of the next ubatch to process
size_t i_next = 0;
File diff suppressed because it is too large
@@ -14,27 +14,13 @@ struct llama_model;
struct llama_context;

//
// llama_kv_cache_unified
// llama_kv_cache
//

class llama_kv_cache_unified : public llama_memory_i {
class llama_kv_cache : public llama_memory_i {
public:
static uint32_t get_padding(const llama_cparams & cparams);

// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;

struct defrag_info {
bool empty() const {
return ids.empty();
}

// contains information about which cell moves where:
//  - cell i moves to ids[i]
//  - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved
std::vector<uint32_t> ids;
};

struct stream_copy_info {
bool empty() const {
assert(ssrc.size() == sdst.size());
@@ -52,8 +38,8 @@ public:
using idx_vec_t = std::vector<uint32_t>;

// number of streams: ns = s1 - s0 + 1
llama_seq_id s0;
llama_seq_id s1;
uint32_t s0;
uint32_t s1;

std::vector<llama_seq_id> strm; // [ns]
std::vector<idx_vec_t> idxs; // [ns]
@@ -92,21 +78,22 @@ public:

using slot_info_vec_t = std::vector<slot_info>;

llama_kv_cache_unified(
const llama_model & model,
layer_filter_cb && filter,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool offload,
bool unified,
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type);
llama_kv_cache(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool offload,
bool unified,
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse);

~llama_kv_cache_unified() = default;
~llama_kv_cache() = default;

//
// llama_memory_i
@@ -134,13 +121,15 @@ public:
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

// state write/load

void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

//
// llama_kv_cache_unified specific API
// llama_kv_cache specific API
//

uint32_t get_size() const;
@@ -152,10 +141,7 @@ public:
// graph_build API
//

uint32_t get_n_kv() const;

// TODO: temporary
bool get_supports_set_rows() const;
uint32_t get_n_kv(const slot_info & sinfo) const;

// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
@@ -173,7 +159,7 @@ public:
// return empty vector on failure
slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);

bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info);
bool update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info);

// find a slot of kv cells that can hold the ubatch
// if cont == true, then the slot must be continuous
@@ -228,10 +214,7 @@ private:
// env: LLAMA_KV_CACHE_DEBUG
int debug = 0;

// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = true;

// this is the SWA type of the cache - not to be confused with the model SWA type
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

std::vector<ggml_context_ptr> ctxs;
@@ -241,7 +224,7 @@ private:
// note: this is not part of the KV state and it's only used to speed-up the find_slot() method
std::vector<uint32_t> v_heads;

std::vector<llama_kv_cells_unified> v_cells;
std::vector<llama_kv_cells> v_cells;

// maps from a sequence id to a stream id
std::vector<uint32_t> seq_to_stream;
@@ -254,9 +237,6 @@ private:
// model layer id -> KV cache layer id
std::unordered_map<int32_t, int32_t> map_layer_ids;

// return non-empty vector if cells have been moved
defrag_info defrag_prepare(int32_t n_max_nodes) const;

size_t total_size() const;

size_t size_k_bytes() const;
@@ -277,11 +257,6 @@ private:
llm_graph_result * res,
llama_context * lctx) const;

ggml_cgraph * build_graph_defrag(
llm_graph_result * res,
llama_context * lctx,
const defrag_info & dinfo) const;

struct cell_ranges_t {
uint32_t strm;

@@ -295,35 +270,33 @@ private:
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
};

class llama_kv_cache_unified_context : public llama_memory_context_i {
class llama_kv_cache_context : public llama_memory_context_i {
public:
// some shorthands
using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
using defrag_info = llama_kv_cache_unified::defrag_info;
using stream_copy_info = llama_kv_cache_unified::stream_copy_info;
using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
using stream_copy_info = llama_kv_cache::stream_copy_info;

// used for errors
llama_kv_cache_unified_context(llama_memory_status status);
llama_kv_cache_context(llama_memory_status status);

// used to create a full-cache context
llama_kv_cache_unified_context(
llama_kv_cache_unified * kv);
llama_kv_cache_context(
llama_kv_cache * kv);

// used to create an update context
llama_kv_cache_unified_context(
llama_kv_cache_unified * kv,
llama_kv_cache_context(
llama_kv_cache * kv,
llama_context * lctx,
bool do_shift,
defrag_info dinfo,
stream_copy_info sc_info);

// used to create a batch processing context from a batch
llama_kv_cache_unified_context(
llama_kv_cache_unified * kv,
llama_kv_cache_context(
llama_kv_cache * kv,
slot_info_vec_t sinfos,
std::vector<llama_ubatch> ubatches);

virtual ~llama_kv_cache_unified_context();
virtual ~llama_kv_cache_context();

//
// llama_memory_context_i
@@ -336,22 +309,27 @@ public:
const llama_ubatch & get_ubatch() const override;

//
// llama_kv_cache_unified_context specific API
// llama_kv_cache_context specific API
//

uint32_t get_n_kv() const;

// TODO: temporary
bool get_supports_set_rows() const;

// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;

// store k_cur and v_cur in the cache based on the provided head location
// note: the heads in k_cur and v_cur should be laid out contiguously in memory
//  - k_cur  [n_embd_head_k, n_head_k, n_tokens]
//  - k_idxs [n_tokens]
//  - v_cur  [n_embd_head_v, n_head_v, n_tokens]
//  - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending if V cache is transposed
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;

// create destination indices for each head of the current batch for where it would be written in the KV cache
// the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
// helps understand the implementation logic of cpy_k and cpy_v
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;

@@ -365,7 +343,7 @@ public:
private:
llama_memory_status status;

llama_kv_cache_unified * kv;
llama_kv_cache * kv;
llama_context * lctx;

//
@@ -374,8 +352,6 @@ private:

bool do_shift = false;

defrag_info dinfo;

stream_copy_info sc_info;

//
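The defrag_info comment above defines a compact move map over cells. A small self-contained sketch of how such a map would be interpreted (the helper apply_moves is hypothetical, not part of llama.cpp):

#include <cstdint>
#include <cstdio>
#include <vector>

// toy illustration of the defrag_info convention described above:
// cell i moves to ids[i]; ids[i] == i or ids[i] == ids.size() means "not moved"
static void apply_moves(const std::vector<uint32_t> & ids) {
    const uint32_t n = (uint32_t) ids.size();
    for (uint32_t i = 0; i < n; ++i) {
        if (ids[i] == i || ids[i] == n) {
            continue; // cell stays in place
        }
        printf("cell %u -> cell %u\n", i, ids[i]);
    }
}

int main() {
    // cells 0 and 1 stay, cell 2 is untouched (== size), cell 3 compacts into slot 2
    apply_moves({0, 1, 4, 2});
}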
42
llama/llama.cpp/src/llama-kv-cells.h
vendored
@@ -11,7 +11,7 @@

// meta information about KV cells that can be part of multiple sequences at the same time
// TODO: add unit tests
class llama_kv_cells_unified {
class llama_kv_cells {
public:
void reset() {
for (uint32_t i = 0; i < pos.size(); ++i) {
@@ -77,30 +77,30 @@ public:
}

// move cell isrc to idst (used during defrag)
void mv(uint32_t isrc, uint32_t idst) {
assert(isrc < pos.size());
assert(idst < pos.size());
//void mv(uint32_t isrc, uint32_t idst) {
//    assert(isrc < pos.size());
//    assert(idst < pos.size());

assert(pos[idst] == -1);
assert(pos[isrc] != -1);
//    assert(pos[idst] == -1);
//    assert(pos[isrc] != -1);

pos [idst] = pos [isrc];
shift[idst] = shift[isrc];
seq [idst] = seq [isrc];
//    pos  [idst] = pos  [isrc];
//    shift[idst] = shift[isrc];
//    seq  [idst] = seq  [isrc];

pos [isrc] = -1;
shift[isrc] = 0;
seq [isrc].reset();
//    pos  [isrc] = -1;
//    shift[isrc] = 0;
//    seq  [isrc].reset();

used.erase (isrc);
used.insert(idst);
}
//    used.erase (isrc);
//    used.insert(idst);
//}

// copy the state of cells [i, i + n) (used for save/restore the state of the cells)
llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
llama_kv_cells cp(uint32_t i, uint32_t n) const {
assert(i + n <= pos.size());

llama_kv_cells_unified res;
llama_kv_cells res;

res.resize(n);

@@ -117,8 +117,8 @@ public:
}

// copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
llama_kv_cells_unified cp(const std::vector<uint32_t> & idxs) const {
llama_kv_cells_unified res;
llama_kv_cells cp(const std::vector<uint32_t> & idxs) const {
llama_kv_cells res;

res.resize(idxs.size());

@@ -135,7 +135,7 @@ public:
}

// set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
void set(uint32_t i, const llama_kv_cells_unified & other) {
void set(uint32_t i, const llama_kv_cells & other) {
assert(i + other.pos.size() <= pos.size());

for (uint32_t j = 0; j < other.pos.size(); ++j) {
@@ -165,7 +165,7 @@ public:
}

// set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
void set(const std::vector<uint32_t> & idxs, const llama_kv_cells_unified & other) {
void set(const std::vector<uint32_t> & idxs, const llama_kv_cells & other) {
assert(idxs.size() == other.pos.size());

for (uint32_t j = 0; j < other.pos.size(); ++j) {
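The cp()/set() pair above implements snapshot and restore over cell ranges. A simplified sketch of that round-trip, with a toy struct that keeps only the pos field (everything here is illustrative, not the real llama_kv_cells):

#include <cassert>
#include <cstdint>
#include <vector>

// toy stand-in for llama_kv_cells, keeping only the pos field
struct toy_cells {
    std::vector<int32_t> pos;

    // copy the state of cells [i, i + n), mirrors llama_kv_cells::cp
    toy_cells cp(uint32_t i, uint32_t n) const {
        assert(i + n <= pos.size());
        toy_cells res;
        res.pos.assign(pos.begin() + i, pos.begin() + i + n);
        return res;
    }

    // set the state of cells [i, i + other.pos.size()), mirrors llama_kv_cells::set
    void set(uint32_t i, const toy_cells & other) {
        assert(i + other.pos.size() <= pos.size());
        for (uint32_t j = 0; j < other.pos.size(); ++j) {
            pos[i + j] = other.pos[j];
        }
    }
};

int main() {
    toy_cells cells;
    cells.pos = {0, 1, 2, 3, -1, -1};

    toy_cells saved = cells.cp(0, 4); // snapshot cells [0, 4)
    cells.pos[1] = 42;                // mutate
    cells.set(0, saved);              // restore the snapshot
    assert(cells.pos[1] == 1);
}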
83
llama/llama.cpp/src/llama-memory-hybrid.cpp
vendored
@@ -9,32 +9,29 @@
//

llama_memory_hybrid::llama_memory_hybrid(
const llama_model & model,
/* attn */
ggml_type type_k,
ggml_type type_v,
bool v_trans,
uint32_t kv_size,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type,
/* recurrent */
ggml_type type_r,
ggml_type type_s,
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
bool offload,
bool unified,
/* layer filters */
layer_filter_cb && filter_attn,
layer_filter_cb && filter_recr) :
const llama_model & model,
/* attn */
ggml_type type_k,
ggml_type type_v,
bool v_trans,
uint32_t kv_size,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type,
/* recurrent */
ggml_type type_r,
ggml_type type_s,
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
bool offload,
bool unified,
/* layer filters */
const layer_filter_cb & filter_attn,
const layer_filter_cb & filter_recr) :
hparams(model.hparams),
mem_attn(new llama_kv_cache_unified(
mem_attn(new llama_kv_cache(
model,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
type_k,
type_v,
v_trans,
@@ -44,18 +41,22 @@ llama_memory_hybrid::llama_memory_hybrid(
n_seq_max,
n_pad,
n_swa,
swa_type
swa_type,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
nullptr
)),
mem_recr(new llama_memory_recurrent(
model,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr,
type_r,
type_s,
offload,
rs_size,
n_seq_max
n_seq_max,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr
)) {}

llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
@@ -165,17 +166,29 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
}

void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
for (const auto & buft_size : mem_recr->memory_breakdown()) {
mb[buft_size.first] += buft_size.second;
}
return mb;
}

void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
GGML_UNUSED(flags);

mem_attn->state_write(io, seq_id);
mem_recr->state_write(io, seq_id);
}

void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
GGML_UNUSED(flags);

mem_attn->state_read(io, seq_id);
mem_recr->state_read(io, seq_id);
}

llama_kv_cache_unified * llama_memory_hybrid::get_mem_attn() const {
llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
return mem_attn.get();
}

@@ -206,7 +219,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
std::vector<llama_ubatch> ubatches) :
ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal
ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}
@@ -244,8 +257,8 @@ const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const {
return ubatches[i_next];
}

const llama_kv_cache_unified_context * llama_memory_hybrid_context::get_attn() const {
return static_cast<const llama_kv_cache_unified_context *>(ctx_attn.get());
const llama_kv_cache_context * llama_memory_hybrid_context::get_attn() const {
return static_cast<const llama_kv_cache_context *>(ctx_attn.get());
}

const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const {
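The constructor above shows the default layer filters: the attention cache takes the non-recurrent layers and the recurrent memory takes the rest. A minimal sketch of how such filters partition layers, assuming a toy is_recurrent schedule (the schedule is illustrative only):

#include <cstdint>
#include <cstdio>
#include <functional>

// mirrors llama_memory_i::layer_filter_cb
using layer_filter_cb = std::function<bool(int32_t il)>;

int main() {
    // toy schedule: every third layer is recurrent (illustrative only)
    auto is_recurrent = [](int32_t il) { return il % 3 == 2; };

    // the default filters used by llama_memory_hybrid partition the layers
    layer_filter_cb filter_attn = [&](int32_t il) { return !is_recurrent(il); };
    layer_filter_cb filter_recr = [&](int32_t il) { return  is_recurrent(il); };

    for (int32_t il = 0; il < 6; ++il) {
        // every layer lands in exactly one of the two memories
        printf("layer %d -> %s cache\n", il, filter_attn(il) ? "attention" : "recurrent");
    }
}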
58
llama/llama.cpp/src/llama-memory-hybrid.h
vendored
@@ -2,7 +2,7 @@

#include "llama-batch.h"
#include "llama-graph.h"
#include "llama-kv-cache-unified.h"
#include "llama-kv-cache.h"
#include "llama-memory.h"
#include "llama-memory-recurrent.h"

@@ -13,36 +13,32 @@
// llama_memory_hybrid
//

// utilizes instances of llama_memory_recurrent and llama_kv_cache_unified to
// utilizes instances of llama_memory_recurrent and llama_kv_cache to
// support models where each layer may be either attention-based or recurrent

class llama_memory_hybrid : public llama_memory_i {
public:

// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;

llama_memory_hybrid(
const llama_model & model,
/* attn */
ggml_type type_k,
ggml_type type_v,
bool v_trans,
uint32_t kv_size,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type,
/* recurrent */
ggml_type type_r,
ggml_type type_s,
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
bool offload,
bool unified,
/* layer filters */
layer_filter_cb && filter_attn = nullptr,
layer_filter_cb && filter_recr = nullptr);
ggml_type type_k,
ggml_type type_v,
bool v_trans,
uint32_t kv_size,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type,
/* recurrent */
ggml_type type_r,
ggml_type type_s,
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
bool offload,
bool unified,
/* layer filters */
const layer_filter_cb & filter_attn = nullptr,
const layer_filter_cb & filter_recr = nullptr);

~llama_memory_hybrid() = default;

@@ -72,28 +68,30 @@ public:
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

// state write/load

void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

//
// llama_memory_hybrid specific API
//

llama_kv_cache_unified * get_mem_attn() const;
llama_kv_cache * get_mem_attn() const;
llama_memory_recurrent * get_mem_recr() const;

private:
const llama_hparams & hparams;

const std::unique_ptr<llama_kv_cache_unified> mem_attn;
const std::unique_ptr<llama_kv_cache> mem_attn;
const std::unique_ptr<llama_memory_recurrent> mem_recr;
};

class llama_memory_hybrid_context : public llama_memory_context_i {
public:
using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;

// init failure
explicit llama_memory_hybrid_context(llama_memory_status status);
@@ -125,7 +123,7 @@ public:
// llama_memory_hybrid_context
//

const llama_kv_cache_unified_context * get_attn() const;
const llama_kv_cache_context * get_attn() const;
const llama_memory_recurrent_context * get_recr() const;

private:
30
llama/llama.cpp/src/llama-memory-recurrent.cpp
vendored
@@ -16,13 +16,13 @@
//

llama_memory_recurrent::llama_memory_recurrent(
const llama_model & model,
layer_filter_cb && filter,
ggml_type type_r,
ggml_type type_s,
bool offload,
uint32_t mem_size,
uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
const llama_model & model,
ggml_type type_r,
ggml_type type_s,
bool offload,
uint32_t mem_size,
uint32_t n_seq_max,
const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
const int32_t n_layer = hparams.n_layer;

head = 0;
@@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
return result;
}

std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
}
return ret;
}

llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
do {
balloc.split_reset();
@@ -680,7 +688,9 @@ size_t llama_memory_recurrent::size_s_bytes() const {
return size_s_bytes;
}

void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
GGML_UNUSED(flags);

std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
uint32_t cell_count = 0;

@@ -718,7 +728,9 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
state_write_data(io, cell_ranges);
}

void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
GGML_UNUSED(flags);

uint32_t cell_count;
io.read_to(&cell_count, sizeof(cell_count));
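memory_breakdown() above sums buffer sizes per backend buffer type. A sketch of the same accumulation pattern, with string keys standing in for ggml_backend_buffer_type_t handles so it runs standalone (the names and sizes are made up):

#include <cstdio>
#include <map>
#include <string>
#include <utility>

int main() {
    // stand-in for the std::map<ggml_backend_buffer_type_t, size_t> returned
    // by memory_breakdown(), keyed by a name instead of a buffer type handle
    std::map<std::string, size_t> mb;

    // accumulate sizes per buffer type, as the loop above does over `bufs`
    const std::pair<std::string, size_t> bufs[] = {
        {"CPU", 64u << 20}, {"CUDA0", 512u << 20}, {"CUDA0", 256u << 20},
    };
    for (const auto & b : bufs) {
        mb[b.first] += b.second;
    }

    for (const auto & kv : mb) {
        printf("%-8s %zu MiB\n", kv.first.c_str(), kv.second >> 20);
    }
}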
27
llama/llama.cpp/src/llama-memory-recurrent.h
vendored
@@ -4,6 +4,7 @@
#include "llama-graph.h"
#include "llama-memory.h"

#include <map>
#include <set>
#include <vector>

@@ -12,21 +13,17 @@
//

// TODO: extract the cache state used for graph computation into llama_memory_recurrent_context_i
// see the implementation of llama_kv_cache_unified_context_i for an example how to do it
// see the implementation of llama_kv_cache_context_i for an example how to do it
class llama_memory_recurrent : public llama_memory_i {
public:

// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;

llama_memory_recurrent(
const llama_model & model,
layer_filter_cb && filter,
ggml_type type_r,
ggml_type type_s,
bool offload,
uint32_t mem_size,
uint32_t n_seq_max);
const llama_model & model,
ggml_type type_r,
ggml_type type_s,
bool offload,
uint32_t mem_size,
uint32_t n_seq_max,
const layer_filter_cb & filter);

~llama_memory_recurrent() = default;

@@ -54,6 +51,8 @@ public:
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

bool prepare(const std::vector<llama_ubatch> & ubatches);

// find a contiguous slot of memory cells and emplace the ubatch there
@@ -63,8 +62,8 @@ public:

// state write/load

void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
uint32_t size = 0; // total number of cells, shared across all sequences
26
llama/llama.cpp/src/llama-memory.h
vendored
@@ -2,7 +2,9 @@

#include "llama.h"

#include <map>
#include <memory>
#include <functional>

struct llama_ubatch;

@@ -36,8 +38,8 @@ bool llama_memory_status_is_fail(llama_memory_status status);

// the interface for managing the memory context during batch processing
// this interface is implemented per memory type. see:
//  - llama_kv_cache_unified_context
//  - llama_kv_cache_unified_iswa_context
//  - llama_kv_cache_context
//  - llama_kv_cache_iswa_context
//  ...
//
// the only method that should mutate the memory and the memory context is llama_memory_i::apply()
@@ -64,6 +66,13 @@ using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;
// general concept of LLM memory
// the KV cache is a type of LLM memory, but there can be other types
struct llama_memory_i {
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;

// this callback is used to specify which layers should reuse memory from other layers
// return negative value to indicate that the layer il should not reuse memory
using layer_reuse_cb = std::function<int32_t(int32_t il)>;

virtual ~llama_memory_i() = default;

// split the input batch into a set of ubatches and verify that they can fit into the cache
@@ -77,7 +86,7 @@ struct llama_memory_i {
// simulate full cache, used for allocating worst-case compute buffers
virtual llama_memory_context_ptr init_full() = 0;

// prepare for any pending memory updates, such as shifts, defrags, etc.
// prepare for any pending memory updates, such as shifts, copies, etc.
// status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0;

@@ -100,17 +109,14 @@ struct llama_memory_i {
virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;

virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;

//
// state write/read
//

virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
};

using llama_memory_ptr = std::unique_ptr<llama_memory_i>;

// TODO: temporary until the llama_kv_cache is removed from the public API
struct llama_kv_cache : public llama_memory_i {
virtual ~llama_kv_cache() = default;
};
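The new layer_reuse_cb above returns, for layer il, the layer whose memory it should reuse, or a negative value for none. A sketch with a made-up reuse policy (the policy itself is illustrative, not something any model in the tree necessarily uses):

#include <cstdint>
#include <cstdio>
#include <functional>

// mirrors llama_memory_i::layer_reuse_cb: return the layer whose memory
// layer il should reuse, or a negative value for "allocate its own"
using layer_reuse_cb = std::function<int32_t(int32_t il)>;

int main() {
    // toy policy: odd layers share the preceding even layer's cells
    layer_reuse_cb reuse = [](int32_t il) {
        return (il % 2 == 1) ? il - 1 : -1;
    };

    for (int32_t il = 0; il < 4; ++il) {
        const int32_t src = reuse(il);
        if (src < 0) {
            printf("layer %d: own KV cells\n", il);
        } else {
            printf("layer %d: reuses cells of layer %d\n", il, src);
        }
    }
}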
1
llama/llama.cpp/src/llama-model-loader.cpp
vendored
@@ -789,6 +789,7 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
}

struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

if (cur == NULL) {
1855
llama/llama.cpp/src/llama-model.cpp
vendored
File diff suppressed because it is too large
17
llama/llama.cpp/src/llama-model.h
vendored
@@ -7,6 +7,7 @@
#include "llama-memory.h"
#include "llama-vocab.h"

#include <map>
#include <memory>
#include <string>
#include <unordered_map>
@@ -28,6 +29,7 @@ enum llm_type {
LLM_TYPE_80M,
LLM_TYPE_109M,
LLM_TYPE_137M,
LLM_TYPE_140M,
LLM_TYPE_160M,
LLM_TYPE_190M,
LLM_TYPE_220M,
@@ -36,12 +38,15 @@ enum llm_type {
LLM_TYPE_270M,
LLM_TYPE_335M,
LLM_TYPE_350M,
LLM_TYPE_360M,
LLM_TYPE_410M,
LLM_TYPE_450M,
LLM_TYPE_475M,
LLM_TYPE_558M,
LLM_TYPE_700M,
LLM_TYPE_770M,
LLM_TYPE_780M,
LLM_TYPE_950M,
LLM_TYPE_0_3B,
LLM_TYPE_0_5B,
LLM_TYPE_0_6B,
@@ -54,6 +59,7 @@ enum llm_type {
LLM_TYPE_1_7B,
LLM_TYPE_1_8B,
LLM_TYPE_2B,
LLM_TYPE_2_6B,
LLM_TYPE_2_8B,
LLM_TYPE_2_9B,
LLM_TYPE_3B,
@@ -76,9 +82,11 @@ enum llm_type {
LLM_TYPE_32B,
LLM_TYPE_34B,
LLM_TYPE_35B,
LLM_TYPE_36B,
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
LLM_TYPE_120B,
LLM_TYPE_142B,
LLM_TYPE_236B,
LLM_TYPE_290B,
@@ -268,6 +276,11 @@ struct llama_layer {
struct ggml_tensor * ffn_down_shexp = nullptr;
struct ggml_tensor * ffn_up_shexp = nullptr;

// ff adjugate experts (chexps)
struct ggml_tensor * ffn_gate_chexps = nullptr;
struct ggml_tensor * ffn_down_chexps = nullptr;
struct ggml_tensor * ffn_up_chexps = nullptr;

// ff bias
struct ggml_tensor * ffn_gate_b = nullptr;
struct ggml_tensor * ffn_down_b = nullptr; // b2
@@ -449,10 +462,12 @@ struct llama_model {

std::string desc() const;

size_t size() const;
size_t size() const; // file size
size_t n_tensors() const;
size_t n_devices() const;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;

// total number of parameters in the model
uint64_t n_elements() const;
10
llama/llama.cpp/src/llama-quant.cpp
vendored
@@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// attention layers have a non-zero number of kv heads
int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
// now n_attn_layer is the number of attention layers in the encoder
// for each decoder block, there are 2 attention layers
n_attn_layer += 2 * model.hparams.dec_n_layer;
}
GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
}
@@ -920,7 +922,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
new_type = tensor->type;
new_data = tensor->data;
new_size = ggml_nbytes(tensor);
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
} else {
const int64_t nelements = ggml_nelements(tensor);

@@ -1037,8 +1039,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
close_ofstream();

LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);

if (qs.n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
352
llama/llama.cpp/src/llama-sampling.cpp
vendored
@@ -128,6 +128,89 @@ struct ring_buffer {
std::vector<T> data;
};

// writes result in res, does not mutate cur
static void llama_token_data_array_partial_sort(const llama_token_data_array & cur, int npartial, std::vector<llama_token_data> & res) {
static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};

constexpr int nbuckets = 128;
constexpr float bucket_low = -10.0f;
constexpr float bucket_high = 10.0f;
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
constexpr float bucket_inter = -bucket_low * bucket_scale;

std::vector<int> bucket_idx;
std::vector<int> histo(nbuckets, 0);

std::vector<llama_token_data*> bucket_ptrs;

bucket_idx.reserve(cur.size);

for (int i = 0; i < (int)cur.size; ++i) {
const float val = cur.data[i].logit;
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
ib = std::max(0, std::min(nbuckets - 1, ib));
bucket_idx.push_back(ib);
++histo[ib];
}
int nhave = 0;
int ib = nbuckets - 1;
for ( ; ib >= 0; --ib) {
nhave += histo[ib];
if (nhave >= npartial) {
break;
}
}
res.resize(nhave);
auto * ptr = res.data();
bucket_ptrs.reserve(nbuckets - ib);
for (int j = nbuckets - 1; j >= ib; --j) {
bucket_ptrs.push_back(ptr);
ptr += histo[j];
}
for (int i = 0; i < (int)cur.size; ++i) {
int j = bucket_idx[i];
if (j >= ib) {
*bucket_ptrs[nbuckets - 1 - j]++ = cur.data[i];
}
}

ptr = res.data();
int ndone = 0;
for (int j = nbuckets - 1; j > ib; --j) {
std::sort(ptr, ptr + histo[j], comp);
ptr += histo[j];
ndone += histo[j];
}
std::partial_sort(ptr, ptr + npartial - ndone, ptr + histo[ib], comp);
}
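The bucket sort above maps each logit to one of 128 buckets over the clamped range [-10, 10], so bucket_scale is 128/20 = 6.4 and bucket_inter is -(-10)*6.4 = 64; only the buckets holding the top npartial entries ever get sorted. A quick standalone check of that index arithmetic:

#include <algorithm>
#include <cassert>

int main() {
    constexpr int   nbuckets     = 128;
    constexpr float bucket_low   = -10.0f;
    constexpr float bucket_high  =  10.0f;
    constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low); // 6.4
    constexpr float bucket_inter = -bucket_low * bucket_scale;          // 64.0

    // a logit of 2.5 lands in bucket int(6.4*2.5 + 64) = 80
    int ib = int(bucket_scale * 2.5f + bucket_inter);
    ib = std::max(0, std::min(nbuckets - 1, ib));
    assert(ib == 80);

    // values outside [-10, 10] are clamped into the first/last bucket
    int ib_hi = int(bucket_scale * 12.0f + bucket_inter); // 140 before clamping
    ib_hi = std::max(0, std::min(nbuckets - 1, ib_hi));
    assert(ib_hi == nbuckets - 1);
}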
// reduces the size of cur_p to npartial, keeping only the top npartial elements
static void llama_token_data_array_partial_sort_inplace(llama_token_data_array * cur_p, int npartial) {
static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};

if (npartial <= 128) {
std::partial_sort(cur_p->data, cur_p->data + npartial, cur_p->data + cur_p->size, comp);

cur_p->size = npartial;
cur_p->sorted = true;

return;
}

std::vector<llama_token_data> tmp;

llama_token_data_array_partial_sort(*cur_p, npartial, tmp);

std::copy(tmp.data(), tmp.data() + npartial, cur_p->data);

cur_p->size = npartial;
cur_p->sorted = true;
}

static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
// iterator for the probabilities
#ifdef __GNUC__
@@ -200,18 +283,21 @@ static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp)
}
}

static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_sort) {
GGML_ASSERT(cur_p->size > 0);

// Sort the logits in descending order
if (!cur_p->sorted) {
std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
});
cur_p->sorted = true;
// Sort the logits in descending order if requested
if (do_sort && !cur_p->sorted) {
llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
}

float max_l = cur_p->data[0].logit;
if (!cur_p->sorted) {
for (size_t i = 1; i < cur_p->size; ++i) {
max_l = std::max(max_l, cur_p->data[i].logit);
}
}

float cum_sum = 0.0f;

for (size_t i = 0; i < cur_p->size; ++i) {
@@ -226,7 +312,6 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
}

static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
// TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
// if (k >= (int32_t)cur_p->size) {
//     return;
// }
@@ -239,64 +324,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)

// Sort scores in descending order
if (!cur_p->sorted) {
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};
if (k <= 128) {
std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
} else {
constexpr int nbuckets = 128;
constexpr float bucket_low = -10.0f;
constexpr float bucket_high = 10.0f;
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
constexpr float bucket_inter = -bucket_low * bucket_scale;

std::vector<int> bucket_idx(cur_p->size);
std::vector<int> histo(nbuckets, 0);

for (int i = 0; i < (int)cur_p->size; ++i) {
const float val = cur_p->data[i].logit;
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
ib = std::max(0, std::min(nbuckets - 1, ib));
bucket_idx[i] = ib;
++histo[ib];
}
int nhave = 0;
int ib = nbuckets - 1;
for ( ; ib >= 0; --ib) {
nhave += histo[ib];
if (nhave >= k) {
break;
}
}
std::vector<llama_token_data> tmp_tokens(nhave);
auto * ptr = tmp_tokens.data();
std::vector<llama_token_data*> bucket_ptrs;
bucket_ptrs.reserve(nbuckets - ib);
for (int j = nbuckets - 1; j >= ib; --j) {
bucket_ptrs.push_back(ptr);
ptr += histo[j];
}
for (int i = 0; i < (int)cur_p->size; ++i) {
int j = bucket_idx[i];
if (j >= ib) {
*bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
}
}

ptr = tmp_tokens.data();
int ndone = 0;
for (int j = nbuckets - 1; j > ib; --j) {
std::sort(ptr, ptr + histo[j], comp);
ptr += histo[j];
ndone += histo[j];
}
std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);

std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));

}
cur_p->sorted = true;
llama_token_data_array_partial_sort_inplace(cur_p, k);
}

cur_p->size = k;
@@ -576,9 +604,73 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*
static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_dist *) smpl->ctx;

llama_sampler_softmax_impl(cur_p);
// edge cases
if (cur_p->size == 0) {
cur_p->selected = -1;
return;
}

cur_p->selected = 0;

if (cur_p->size == 1) {
cur_p->data[0].p = 1.0f;
return;
}

// max logit for numerical stability
float max_l = cur_p->data[0].logit;
if (!cur_p->sorted) {
for (size_t i = 1; i < cur_p->size; ++i) {
max_l = std::max(max_l, cur_p->data[i].logit);
}
}

// apply softmax to obtain the probabilities
double sum_cum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
float p = expf(cur_p->data[i].logit - max_l);
cur_p->data[i].p = p;
sum_cum += p;
}

#if 1
// sample from the obtained probabilities and normalize the probs in a single pass
// this is ~3x faster on Mac with full gpt-oss vocab than the version below
//
std::uniform_real_distribution<double> dist(0.0f, 1.0f);
const double rnd = dist(ctx->rng);

double sum_run = 0.0f;
const double sum_tgt = sum_cum*rnd;

bool found = false;
for (size_t i = 0; i < cur_p->size; ++i) {
if (!found) {
// accumulate probs until we reach the target sum
sum_run += cur_p->data[i].p;
if (sum_run >= sum_tgt) {
cur_p->selected = i;
found = true;
}
}

// normalize probs
cur_p->data[i].p /= sum_cum;
}

// fallback to the last token (don't think this can happen)
assert(found);
if (!found) {
cur_p->selected = cur_p->size - 1;
}
#else
// for clarity, this is the same as above but does one pass for normalization and one extra pass for sampling
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= sum_cum;
}

cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
#endif
}

static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {
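The #if 1 branch above draws rnd, scales it by the unnormalized probability mass, then selects the token and normalizes in the same pass. The same technique on plain weights, as a standalone sketch (the helper name sample_and_normalize is made up for illustration):

#include <cassert>
#include <cstddef>
#include <random>
#include <vector>

// pick an index proportionally to w[i] while normalizing w in the same loop,
// mirroring the single-pass trick used in llama_sampler_dist_apply above
static size_t sample_and_normalize(std::vector<double> & w, std::mt19937 & rng) {
    double total = 0.0;
    for (double v : w) total += v;

    std::uniform_real_distribution<double> dist(0.0, 1.0);
    const double target = total * dist(rng);

    size_t selected = w.size() - 1; // fallback, mirrors the code above
    double run = 0.0;
    bool found = false;
    for (size_t i = 0; i < w.size(); ++i) {
        if (!found) {
            run += w[i];
            if (run >= target) {
                selected = i;
                found = true;
            }
        }
        w[i] /= total; // normalization continues past the selected index
    }
    return selected;
}

int main() {
    std::mt19937 rng(42);
    std::vector<double> w = {1.0, 2.0, 7.0};
    const size_t idx = sample_and_normalize(w, rng);
    assert(idx < w.size());
    assert(w[2] == 0.7); // weights are now normalized probabilities
}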
@@ -626,32 +718,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
    );
}

// softmax

static const char * llama_sampler_softmax_name(const struct llama_sampler * /*smpl*/) {
    return "softmax";
}

static void llama_sampler_softmax_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
    llama_sampler_softmax_impl(cur_p);
}

static struct llama_sampler_i llama_sampler_softmax_i = {
    /* .name = */ llama_sampler_softmax_name,
    /* .accept = */ nullptr,
    /* .apply = */ llama_sampler_softmax_apply,
    /* .reset = */ nullptr,
    /* .clone = */ nullptr,
    /* .free = */ nullptr,
};

struct llama_sampler * llama_sampler_init_softmax() {
    return llama_sampler_init(
        /* .iface = */ &llama_sampler_softmax_i,
        /* .ctx = */ nullptr
    );
}

// top-k

struct llama_sampler_top_k {
@@ -663,7 +729,7 @@ static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl
}

static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    const auto * ctx = (llama_sampler_top_k *) smpl->ctx;
    auto * ctx = (llama_sampler_top_k *) smpl->ctx;
    llama_sampler_top_k_impl(cur_p, ctx->k);
}

@@ -699,6 +765,8 @@ struct llama_sampler * llama_sampler_init_top_k(int32_t k) {

struct llama_sampler_top_p {
    const float p;
    const size_t min_keep;

    std::vector<llama_token_data> buf_sort;
};

static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) {
@@ -706,20 +774,35 @@ static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl
}

static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    const auto * ctx = (llama_sampler_top_p *) smpl->ctx;
    auto * ctx = (llama_sampler_top_p *) smpl->ctx;

    if (ctx->p >= 1.0f) {
        return;
    }

    llama_sampler_softmax_impl(cur_p);
    llama_sampler_softmax_impl(cur_p, false);

    size_t k = cur_p->size;
    auto * pdata = cur_p->data;

    auto & buf_sort = ctx->buf_sort;

    // if not sorted, try adaptive top-k sorting
    if (!cur_p->sorted && cur_p->size > 1024) {
        k = std::min<size_t>(256, cur_p->size);
        llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
        pdata = buf_sort.data();
    } else if (!cur_p->sorted) {
        // small candidates -> sort inplace
        llama_token_data_array_partial_sort_inplace(cur_p, k);
    }

    // Compute the cumulative probabilities
    float cum_sum = 0.0f;
    size_t last_idx = cur_p->size;

    for (size_t i = 0; i < cur_p->size; ++i) {
        cum_sum += cur_p->data[i].p;
        cum_sum += pdata[i].p;

        // Check if the running sum is at least p or if we have kept at least min_keep tokens
        // we set the last index to i+1 to indicate that the current iterate should be included in the set
@@ -727,9 +810,21 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
            last_idx = i + 1;
            break;
        }

        // we exceeded the current top-k heuristic -> increase k and continue
        if (!cur_p->sorted && i == k - 1) {
            k = cur_p->size;
            llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
            pdata = buf_sort.data();
        }
    }

    // Resize the output vector to keep only the top-p tokens
    if (!cur_p->sorted) {
        std::copy(buf_sort.data(), buf_sort.data() + last_idx, cur_p->data);
        cur_p->sorted = true;
    }

    cur_p->size = last_idx;
}

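The reworked top-p avoids fully sorting large candidate sets: it partially sorts a 256-token prefix into the scratch buffer and escalates to a full sort only if that prefix fails to reach the mass `p`. A simplified standalone sketch of the cutoff logic over plain probabilities; `partial_sort_desc` below is an illustrative stand-in for the library's partial-sort helpers:

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// Descending partial sort of the k best probabilities into buf.
static void partial_sort_desc(const std::vector<float> & p, size_t k, std::vector<float> & buf) {
    buf = p;
    std::partial_sort(buf.begin(), buf.begin() + k, buf.end(), std::greater<float>());
}

// Keep the smallest prefix whose cumulative probability reaches top_p,
// sorting only k candidates at first and escalating if that is not enough.
static size_t top_p_cut(const std::vector<float> & probs, float top_p) {
    std::vector<float> buf;
    size_t k = std::min<size_t>(256, probs.size());
    partial_sort_desc(probs, k, buf);
    float cum = 0.0f;
    for (size_t i = 0; i < probs.size(); ++i) {
        cum += buf[i];
        if (cum >= top_p) {
            return i + 1;
        }
        if (i == k - 1) { // prefix exhausted before reaching p -> sort everything
            k = probs.size();
            partial_sort_desc(probs, k, buf);
        }
    }
    return probs.size();
}

int main() {
    std::vector<float> probs = {0.05f, 0.40f, 0.25f, 0.20f, 0.10f};
    printf("keep %zu tokens\n", top_p_cut(probs, 0.8f)); // 0.40+0.25+0.20 -> 3
    return 0;
}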
@@ -757,6 +852,7 @@ struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
        /* .ctx = */ new llama_sampler_top_p {
            /* .p = */ p,
            /* .min_keep = */ min_keep,
            /* .buf_sort = */ {},
        }
    );
}
@@ -773,7 +869,7 @@ static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl
}

static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    const auto * ctx = (llama_sampler_min_p *) smpl->ctx;
    auto * ctx = (llama_sampler_min_p *) smpl->ctx;

    if (ctx->p <= 0.0f || !cur_p->size) {
        return;
@@ -799,7 +895,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d

    // if we have enough values the operation was a success
    if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
        memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
        std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data);
        cur_p->size = filtered_tokens.size();
        min_p_applied = true;
    }
@@ -809,10 +905,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
    if (!min_p_applied) {
        // Sort the logits in descending order
        if (!cur_p->sorted) {
            std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
                return a.logit > b.logit;
            });
            cur_p->sorted = true;
            llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
        }

        const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max
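The `min_logit` expression works because softmax is order-preserving: keeping tokens with p_i >= p * p_max is equivalent, in logit space, to keeping l_i >= l_max + log(p), so this path never needs to compute the softmax at all. A tiny self-check of that equivalence:

#include <cassert>
#include <cmath>

// p_i >= p * p_max  <=>  exp(l_i) >= p * exp(l_max)  <=>  l_i >= l_max + log(p)
int main() {
    const float l_max = 3.0f;
    const float p     = 0.1f;
    const float min_logit = l_max + std::log(p);
    // a token exactly at the threshold has probability ratio exactly p
    assert(std::fabs(std::exp(min_logit - l_max) - p) < 1e-6f);
    return 0;
}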
@@ -869,7 +962,7 @@ static const char * llama_sampler_typical_name(const struct llama_sampler * /*sm
}

static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    const auto * ctx = (llama_sampler_typical *) smpl->ctx;
    auto * ctx = (llama_sampler_typical *) smpl->ctx;

    // Reference implementation:
    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
@@ -878,7 +971,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
    }

    // Compute the softmax of logits and calculate entropy
    llama_sampler_softmax_impl(cur_p);
    llama_sampler_softmax_impl(cur_p, true);

    float entropy = 0.0f;
    for (size_t i = 0; i < cur_p->size; ++i) {
@@ -1012,7 +1105,7 @@ static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*s
}

static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
    auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
    if (ctx->delta > 0) {
        const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
        const float max_temp = ctx->temp + ctx->delta;
@@ -1027,7 +1120,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
        // Calculate maximum possible entropy
        float max_entropy = -logf(1.0f / cur_p->size);

        llama_sampler_softmax_impl(cur_p);
        llama_sampler_softmax_impl(cur_p, true);

        // Calculate entropy of the softmax probabilities
        float entropy = 0.0f;
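For context, the surrounding function implements entropy-scaled dynamic temperature: the Shannon entropy of the softmax distribution, normalized by its maximum log(n), selects a temperature between `min_temp` and `max_temp`. A rough sketch assuming a linear mapping; the actual sampler also shapes the mapping with an exponent parameter not modeled here:

#include <cmath>
#include <cstdio>
#include <vector>

// Map the entropy of a probability vector to a temperature in [min_temp, max_temp]:
// low entropy (confident) -> low temperature, high entropy -> high temperature.
static float dyn_temp(const std::vector<float> & p, float min_temp, float max_temp) {
    float entropy = 0.0f;
    for (float x : p) {
        if (x > 0.0f) entropy -= x * std::log(x);
    }
    const float max_entropy = std::log((float)p.size()); // == -log(1/n)
    const float t = max_entropy > 0.0f ? entropy / max_entropy : 0.0f; // in [0, 1]
    return min_temp + t * (max_temp - min_temp);
}

int main() {
    printf("%.3f\n", dyn_temp({0.97f, 0.01f, 0.01f, 0.01f}, 0.5f, 1.5f)); // ~0.62: low entropy pulls toward min_temp
    printf("%.3f\n", dyn_temp({0.25f, 0.25f, 0.25f, 0.25f}, 0.5f, 1.5f)); // 1.50: uniform distribution maxes the temperature
    return 0;
}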
@@ -1121,7 +1214,7 @@ struct llama_sampler_xtc {
    const uint32_t seed;
    uint32_t seed_cur;

    std::mt19937 rng;
};

static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
@@ -1139,17 +1232,20 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data

    std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
    float chance = distribution(ctx->rng);
    if (chance > ctx->probability) return;
    if (chance > ctx->probability) {
        return;
    }

    // in case it's not sorted/recalculated yet
    llama_sampler_softmax_impl(cur_p);
    llama_sampler_softmax_impl(cur_p, true);

    int pos_last = 0;

    for (size_t i = 0; i < cur_p->size; ++i) {
        if (cur_p->data[i].p >= ctx->threshold) {
            pos_last = i;
        } else break;
        } else {
            break;
        }
    }

    if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
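XTC ("exclude top choices") removes every token above the probability threshold except the last such token, steering generation away from the most predictable continuations while keeping at least one above-threshold candidate. A standalone sketch of the cut computed above, assuming probabilities already sorted in descending order:

#include <cstdio>
#include <vector>

// Return the index of the last token whose probability clears the threshold;
// tokens [pos_last, size) survive the XTC cut.
static size_t xtc_begin(const std::vector<float> & probs, float threshold) {
    size_t pos_last = 0;
    for (size_t i = 0; i < probs.size(); ++i) {
        if (probs[i] >= threshold) {
            pos_last = i;
        } else {
            break;
        }
    }
    return pos_last;
}

int main() {
    std::vector<float> probs = {0.5f, 0.3f, 0.15f, 0.05f};
    printf("keep from index %zu\n", xtc_begin(probs, 0.1f)); // 2 -> drop 0.5 and 0.3, keep 0.15 and 0.05
    return 0;
}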
@@ -1221,7 +1317,7 @@ struct llama_sampler_mirostat {

    float mu;

    std::mt19937 rng;
};

static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) {
@@ -1231,7 +1327,7 @@ static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*s
static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    auto * ctx = (llama_sampler_mirostat *) smpl->ctx;

    llama_sampler_softmax_impl(cur_p);
    llama_sampler_softmax_impl(cur_p, true);

    // Estimate s_hat using the most probable m tokens
    float s_hat = 0.0;
@@ -1250,7 +1346,8 @@ static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_toke
    float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat);

    llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
    llama_sampler_softmax_impl(cur_p);

    llama_sampler_softmax_impl(cur_p, true);

    const int idx = llama_sample_dist(cur_p, ctx->rng);

@@ -1336,7 +1433,7 @@ static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler *
static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;

    llama_sampler_softmax_impl(cur_p);
    llama_sampler_softmax_impl(cur_p, true);

    // Truncate the words with surprise values greater than mu
    cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
@@ -1348,7 +1445,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
    }

    // Normalize the probabilities of the remaining words
    llama_sampler_softmax_impl(cur_p);
    llama_sampler_softmax_impl(cur_p, true);

    const int idx = llama_sample_dist(cur_p, ctx->rng);

@@ -1540,7 +1637,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
            trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
        }
        trigger_pattern += ")[\\s\\S]*";
        auto trigger_pattern_c = trigger_pattern.c_str();
        const auto * trigger_pattern_c = trigger_pattern.c_str();
        trigger_patterns = &trigger_pattern_c;
        num_trigger_patterns = 1;
    }
@@ -1748,7 +1845,7 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
}

static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
    auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;

    if (ctx->n <= 0.0f || cur_p->size <= 1) {
        return;
@@ -1780,13 +1877,14 @@ static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_t
    }
    float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;

    //apply mask
    // apply mask
    for (size_t i = 0; i < cur_p->size; ++i) {
        if (cur_p->data[i].logit < max - (ctx->n * std)) {
            cur_p->data[i].logit = -INFINITY;
        }
    }
    llama_sampler_softmax_impl(cur_p);

    llama_sampler_softmax_impl(cur_p, true);
}

static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
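Top-n-sigma keeps only tokens whose logit lies within n standard deviations of the maximum logit. A standalone sketch of the masking step; simplified in that, unlike the code above, it does not exclude already-masked -INFINITY entries from the statistics:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Mask out tokens whose logit falls more than n standard deviations
// below the maximum logit.
static void top_n_sigma_mask(std::vector<float> & logits, float n) {
    const float max = *std::max_element(logits.begin(), logits.end());
    float mean = 0.0f;
    for (float x : logits) {
        mean += x;
    }
    mean /= logits.size();
    float acc = 0.0f;
    for (float x : logits) {
        acc += (x - mean) * (x - mean);
    }
    const float stddev = std::sqrt(acc / logits.size());
    for (float & x : logits) {
        if (x < max - n * stddev) {
            x = -INFINITY;
        }
    }
}

int main() {
    std::vector<float> logits = {8.0f, 7.5f, 2.0f, 1.0f};
    top_n_sigma_mask(logits, 1.0f);
    for (float x : logits) printf("%g\n", x); // 8 7.5 -inf -inf
    return 0;
}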
@@ -1991,7 +2089,9 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat

    {
        const int last = last_n_repeat - 1;
        int rt = 0, lt = 0;

        int rt = 0;
        int lt = 0;

        for (int k = 1; k < last_n_repeat; ++k) {
            if (k > rt) {
@@ -2135,8 +2235,8 @@ static struct llama_sampler_i llama_sampler_dry_i = {
    /* .free = */ llama_sampler_dry_free,
};

struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
    int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t n_ctx_train, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
    int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? n_ctx_train : std::max(dry_penalty_last_n, 0);
    std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
    const int MAX_CHAR_LEN = 40;
    const int MAX_SEQ_LEN = 20;
@@ -2169,7 +2269,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
    return llama_sampler_init(
        /* .iface = */ &llama_sampler_dry_i,
        /* .ctx = */ new llama_sampler_dry {
            /* .total_context_size = */ context_size,
            /* .total_context_size = */ n_ctx_train,
            /* .dry_multiplier = */ dry_multiplier,
            /* .dry_base = */ dry_base,
            /* .dry_allowed_length = */ dry_allowed_length,
@@ -2308,7 +2408,7 @@ static const char * llama_sampler_infill_name(const struct llama_sampler * /*smp
static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    auto * ctx = (llama_sampler_infill *) smpl->ctx;

    llama_sampler_softmax_impl(cur_p);
    llama_sampler_softmax_impl(cur_p, true);

#if defined(GGML_DEBUG_SAMPLER_INFILL)
#define LOG_DBG_CUR LLAMA_LOG_DEBUG
21
llama/llama.cpp/src/llama-vocab.cpp
vendored
@@ -434,6 +434,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GROK_2:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
@@ -1763,7 +1770,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
            const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
            precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
            // correct endiannes of data in precompiled_charsmap binary blob
            uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
            *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
@@ -1944,7 +1951,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
                clean_spaces = false;
            } else if (
                tokenizer_pre == "bailingmoe") {
                tokenizer_pre == "bailingmoe" ||
                tokenizer_pre == "llada-moe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                clean_spaces = false;
            } else if (
@@ -1963,6 +1971,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "kimi-k2") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
                clean_spaces = false;
            } else if (
                tokenizer_pre == "grok-2") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
                clean_spaces = false;
            } else {
                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2331,7 +2343,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

        // @ngxson : quick hack for gpt-oss, always render these tokens
        for (const auto & t : token_to_id) {
            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
            }
        }
@@ -2378,6 +2390,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

        if (has_return && has_call && has_end) {
            special_eog_ids.erase(end_id);
            id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
            LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
        }
    }
@@ -2459,7 +2472,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        // set attributes by model/tokenizer/architecture name
        if (false
            || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
            || _contains_any(general_arch, {"nomic-bert-moe"})
            || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
            ) {
            if (token_to_id.count("<mask>") == 0) {
                LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
1
llama/llama.cpp/src/llama-vocab.h
vendored
@@ -47,6 +47,7 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
    LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
    LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
};

struct LLM_KV;

75
llama/llama.cpp/src/llama.cpp
vendored
@@ -25,6 +25,18 @@
// interface implementation
//

const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
    switch (flash_attn_type) {
        case LLAMA_FLASH_ATTN_TYPE_AUTO:
            return "auto";
        case LLAMA_FLASH_ATTN_TYPE_DISABLED:
            return "disabled";
        case LLAMA_FLASH_ATTN_TYPE_ENABLED:
            return "enabled";
    }
    GGML_ABORT("fatal error");
}

struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,
@@ -47,6 +59,7 @@ bool llama_supports_mlock(void) {

bool llama_supports_gpu_offload(void) {
    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
           llama_supports_rpc();
}

@@ -71,7 +84,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
        GGML_ASSERT(dev && "CPU backend is not loaded");
        auto * reg = ggml_backend_dev_backend_reg(dev);
        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
        numa_init_fn(numa);
        if (numa_init_fn) {
            numa_init_fn(numa);
        }
    }
}

@@ -170,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
            model->devices.push_back(*dev);
        }
    } else {
        // default device selection

        // build list of available devices
        std::vector<ggml_backend_dev_t> gpus;
        std::vector<ggml_backend_dev_t> igpus;
        std::vector<ggml_backend_dev_t> rpc_servers;
        // use all available devices

        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            switch (ggml_backend_dev_type(dev)) {
@@ -180,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
                    // skip CPU backends since they are handled separately
                    break;

                case GGML_BACKEND_DEVICE_TYPE_GPU:
                case GGML_BACKEND_DEVICE_TYPE_GPU: {
                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                        rpc_servers.push_back(dev);
                    } else {
                        model->devices.push_back(dev);
                        // check if there is already a GPU with the same device id
                        ggml_backend_dev_props props;
                        ggml_backend_dev_get_props(dev, &props);
                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
                            ggml_backend_dev_props d_props;
                            ggml_backend_dev_get_props(d, &d_props);
                            if (props.device_id && d_props.device_id) {
                                return strcmp(props.device_id, d_props.device_id) == 0;
                            }
                            return false;
                        });

                        if (it != gpus.end()) {
                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
                                __func__,
                                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                                props.device_id ? props.device_id : "unknown id",
                                ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
                        } else {
                            gpus.push_back(dev);
                        }
                    }
                    break;
                }

                case GGML_BACKEND_DEVICE_TYPE_IGPU:
                    igpus.push_back(dev);
                    break;
            }
        }
        // add RPC servers at the front of the list
        if (!rpc_servers.empty()) {
            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());

        // add RPC servers at the front of the list to minimize network transfers
        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());

        // add GPUs
        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());

        // add integrated GPUs only if no other devices were found
        if (model->devices.empty()) {
            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
        }
    }

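The new selection logic deduplicates GPUs by `device_id`, since the same physical device can be reported by more than one backend. A simplified sketch of that rule with an illustrative stand-in properties struct; only the id field is modeled, and devices without an id are never treated as duplicates, mirroring the lambda above:

#include <cstdio>
#include <cstring>
#include <vector>

// Illustrative stand-in for the backend's device properties.
struct dev_props {
    const char * device_id;
};

// Keep the first device seen for each non-null id.
static std::vector<dev_props> dedup_by_id(const std::vector<dev_props> & devs) {
    std::vector<dev_props> out;
    for (const auto & d : devs) {
        bool dup = false;
        for (const auto & kept : out) {
            if (d.device_id && kept.device_id && std::strcmp(d.device_id, kept.device_id) == 0) {
                dup = true;
                break;
            }
        }
        if (!dup) {
            out.push_back(d);
        }
    }
    return out;
}

int main() {
    std::vector<dev_props> devs = {{"GPU-0"}, {"GPU-0"}, {"GPU-1"}, {nullptr}, {nullptr}};
    printf("%zu devices kept\n", dedup_by_id(devs).size()); // 4: GPU-0, GPU-1 and both id-less devices
    return 0;
}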
@@ -213,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
    }

    for (auto * dev : model->devices) {
        size_t free, total; // NOLINT
        ggml_backend_dev_memory(dev, &free, &total);
        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
            ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
            props.device_id ? props.device_id : "unknown id",
            props.memory_free/1024/1024);
    }

    const int status = llama_model_load(path_model, splits, *model, params);

43
llama/llama.cpp/src/unicode.h
vendored
@@ -4,6 +4,7 @@
#include <string>
#include <vector>

// TODO: reimplement this structure in endian-independent way
struct unicode_cpt_flags {
    enum {
        UNDEFINED = 0x0001,
@@ -15,6 +16,10 @@ struct unicode_cpt_flags {
        SYMBOL = 0x0040, // regex: \p{S}
        CONTROL = 0x0080, // regex: \p{C}
        MASK_CATEGORIES = 0x00FF,
        WHITESPACE = 0x0100,
        LOWERCASE = 0x0200,
        UPPERCASE = 0x0400,
        NFD = 0x0800,
    };

    // codepoint type
@@ -34,11 +39,49 @@ struct unicode_cpt_flags {

    // decode from uint16
    inline unicode_cpt_flags(const uint16_t flags = 0) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        *reinterpret_cast<uint16_t*>(this) = flags;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        is_undefined = (flags & UNDEFINED) ? 1 : 0;
        is_number = (flags & NUMBER) ? 1 : 0;
        is_letter = (flags & LETTER) ? 1 : 0;
        is_separator = (flags & SEPARATOR) ? 1 : 0;
        is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
        is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
        is_symbol = (flags & SYMBOL) ? 1 : 0;
        is_control = (flags & CONTROL) ? 1 : 0;
        is_whitespace = (flags & WHITESPACE) ? 1 : 0;
        is_lowercase = (flags & LOWERCASE) ? 1 : 0;
        is_uppercase = (flags & UPPERCASE) ? 1 : 0;
        is_nfd = (flags & NFD) ? 1 : 0;
#else
#error Unexpected or undefined __BYTE_ORDER__
#endif
    }

    inline uint16_t as_uint() const {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        return *reinterpret_cast<const uint16_t*>(this);
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        uint16_t result =
            is_undefined * UNDEFINED
            + is_number * NUMBER
            + is_letter * LETTER
            + is_separator * SEPARATOR
            + is_accent_mark * ACCENT_MARK
            + is_punctuation * PUNCTUATION
            + is_symbol * SYMBOL
            + is_control * CONTROL
            + is_whitespace * WHITESPACE
            + is_lowercase * LOWERCASE
            + is_uppercase * UPPERCASE
            + is_nfd * NFD
            ;

        return result;
#else
#error Unexpected or undefined __BYTE_ORDER__
#endif
    }

    inline uint16_t category_flag() const {
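The little-endian path stores the flags by reinterpreting the bit-field struct directly, while the big-endian path decodes and re-encodes field by field; the two directions must be inverses of each other regardless of host byte order. A layout-independent self-check of that round-trip using plain masks, with an illustrative subset of the flag values above:

#include <cassert>
#include <cstdint>

enum : uint16_t {
    UNDEFINED  = 0x0001,
    NUMBER     = 0x0002,
    LETTER     = 0x0004,
    WHITESPACE = 0x0100,
    LOWERCASE  = 0x0200,
};

// Decode flags into booleans and re-encode them; the rebuilt value must
// match the original, which is what both #if branches rely on.
int main() {
    const uint16_t flags = LETTER | LOWERCASE;
    const bool is_letter    = flags & LETTER;
    const bool is_lowercase = flags & LOWERCASE;
    const bool is_number    = flags & NUMBER;
    const uint16_t rebuilt = is_letter * LETTER + is_lowercase * LOWERCASE + is_number * NUMBER;
    assert(rebuilt == flags);
    return 0;
}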