mirror of
https://github.com/ollama/ollama.git
synced 2026-04-24 17:55:43 +02:00
Two reductions:
1. Drop the gguf_rename_tensor forwarder from gguf.h/gguf.cpp.
The rename-in-place trick it does (calling ggml_set_name on an embedded
ggml_tensor) can be done from outside gguf.cpp via:
char * p = const_cast<char *>(gguf_get_tensor_name(meta, id));
strncpy(p, new_name, GGML_MAX_NAME - 1);
p[GGML_MAX_NAME - 1] = '\0';  // strncpy does not null-terminate when new_name fills the buffer
That pointer points into a mutable char[GGML_MAX_NAME] embedded in a std::vector
element; the const on the return type is API courtesy only. Writing through the
const_cast pointer is therefore defined behavior and has no struct-layout dependency.
2. Drop the src/CMakeLists.txt hunk that added llama-ollama-compat.cpp to
the llama target. Replace with a target_sources() call in Ollama's
llama/server/CMakeLists.txt after FetchContent_MakeAvailable. Our
compat files now stay in llama/compat/ and are never copied into the
fetched _deps/ tree.
Net patch now touches 3 files, 20 lines, all pure call-site insertions:
src/llama-model-loader.cpp +8 (include + translate + 2x should_skip)
src/llama-model.cpp +4 (include + apply_tensor_transforms)
tools/mtmd/clip.cpp +8 (include + translate_clip + maybe_load)
Verified: fresh build from scratch (rm -rf build && cmake configure)
runs PATCH_COMMAND cleanly, compiles, and ollama run gemma3 still works
end-to-end for text + vision.
96 lines
4.1 KiB
Diff
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
|
|
index 4e65a45a5..75836c683 100644
|
|
--- a/src/llama-model-loader.cpp
|
|
+++ b/src/llama-model-loader.cpp
|
|
@@ -4,6 +4,7 @@
|
|
#include "ggml.h"
|
|
#include "gguf.h"
|
|
#include "llama-hparams.h"
|
|
+#include "llama-ollama-compat.h"
|
|
|
|
#include <algorithm>
|
|
#include <array>
|
|
@@ -549,6 +550,7 @@ llama_model_loader::llama_model_loader(
|
|
}
|
|
|
|
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
|
+ llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name);
|
|
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
|
|
|
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
|
@@ -573,6 +575,9 @@ llama_model_loader::llama_model_loader(
|
|
// so we build a unified tensors index for weights.
|
|
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
|
std::string tensor_name = std::string(cur->name);
|
|
+ if (llama_ollama_compat::should_skip_tensor(this, tensor_name.c_str())) {
|
|
+ continue;
|
|
+ }
|
|
// make sure there is no duplicated tensor names
|
|
if (weights_map.find(tensor_name) != weights_map.end()) {
|
|
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
|
@@ -683,6 +688,9 @@ llama_model_loader::llama_model_loader(
|
|
// Save tensors data offset info of the main file.
|
|
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
|
std::string tensor_name = std::string(cur->name);
|
|
+ if (llama_ollama_compat::should_skip_tensor(this, tensor_name.c_str())) {
|
|
+ continue;
|
|
+ }
|
|
// make sure there is no duplicated tensor names
|
|
if (weights_map.find(tensor_name) != weights_map.end()) {
|
|
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
|
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
|
index 4ded484dd..7d3509c23 100644
|
|
--- a/src/llama-model.cpp
|
|
+++ b/src/llama-model.cpp
|
|
@@ -6,6 +6,7 @@
|
|
#include "llama-mmap.h"
|
|
#include "llama-cparams.h"
|
|
#include "llama-model-loader.h"
|
|
+#include "llama-ollama-compat.h"
|
|
|
|
#include "llama-kv-cache.h"
|
|
#include "llama-kv-cache-iswa.h"
|
|
@@ -8023,6 +8024,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
|
|
return false;
|
|
}
|
|
+ // Apply any Ollama-format numerical fixups (e.g. gemma3 RMSNorm +1)
|
|
+ // while the data is in its final backend buffers.
|
|
+ llama_ollama_compat::apply_tensor_transforms(&ml, ctx);
|
|
}
|
|
|
|
if (use_mmap_buffer) {
|
|
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
|
|
index f0e8786b6..35defa89d 100644
|
|
--- a/tools/mtmd/clip.cpp
|
|
+++ b/tools/mtmd/clip.cpp
|
|
@@ -10,6 +10,8 @@
|
|
#include "ggml-backend.h"
|
|
#include "gguf.h"
|
|
|
|
+#include "llama-ollama-compat.h"
|
|
+
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cmath>
|
|
@@ -985,6 +987,11 @@ struct clip_model_loader {
|
|
|
|
ctx_meta.reset(meta);
|
|
|
|
+ // If this is an Ollama-format monolithic GGUF (text + embedded
|
|
+ // vision), translate its metadata and tensor names into the
|
|
+ // upstream mmproj shape so the rest of this loader runs unchanged.
|
|
+ llama_ollama_compat::translate_clip_metadata(ctx_gguf.get(), meta);
|
|
+
|
|
const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
|
|
|
|
// print gguf info
|
|
@@ -2358,6 +2365,7 @@ struct clip_model_loader {
|
|
auto it_off = tensor_offset.find(t->name);
|
|
GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor");
|
|
const size_t offset = it_off->second;
|
|
+ if (llama_ollama_compat::maybe_load_tensor(cur, fname.c_str(), offset, buft)) continue;
|
|
fin.seekg(offset, std::ios::beg);
|
|
if (!fin) {
|
|
throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
|