Files
ollama/llama/compat/upstream-edits.patch
jmorganca 61b367ec29 llama/compat: shrink patch to pure call-site hooks (34 -> 20 lines)
Two reductions:

1. Drop the gguf_rename_tensor forwarder from gguf.h/gguf.cpp.
   The rename-in-place trick it does (calling ggml_set_name on an embedded
   ggml_tensor) can be done from outside gguf.cpp via:

     char * p = const_cast<char *>(gguf_get_tensor_name(meta, id));
     strncpy(p, new_name, GGML_MAX_NAME - 1);
     p[GGML_MAX_NAME - 1] = '\0';  // strncpy does not NUL-terminate when
                                   // strlen(new_name) >= GGML_MAX_NAME - 1

   That pointer points into a mutable char[GGML_MAX_NAME] embedded in a
   std::vector element; the const on the return type is API courtesy only.
   Because the underlying object itself is not const, casting the const
   away and writing through the pointer is defined behavior, and the
   write has no struct-layout dependency.

2. Drop the src/CMakeLists.txt hunk that added llama-ollama-compat.cpp to
   the llama target. Replace with a target_sources() call in Ollama's
   llama/server/CMakeLists.txt after FetchContent_MakeAvailable. Our
   compat files now stay in llama/compat/ and are never copied into the
   fetched _deps/ tree.

Net patch now touches 3 files, 20 lines, all pure call-site insertions:
  src/llama-model-loader.cpp  +8  (include + translate + 2x should_skip)
  src/llama-model.cpp         +4  (include + apply_tensor_transforms)
  tools/mtmd/clip.cpp         +8  (include + translate_clip + maybe_load)

Verified: fresh build from scratch (rm -rf build && cmake configure)
runs PATCH_COMMAND cleanly, compiles, and ollama run gemma3 still works
end-to-end for text + vision.
2026-04-20 09:29:34 -07:00

96 lines
4.1 KiB
Diff

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 4e65a45a5..75836c683 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -4,6 +4,7 @@
#include "ggml.h"
#include "gguf.h"
#include "llama-hparams.h"
+#include "llama-ollama-compat.h"
#include <algorithm>
#include <array>
@@ -549,6 +550,7 @@ llama_model_loader::llama_model_loader(
}
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+ llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
@@ -573,6 +575,9 @@ llama_model_loader::llama_model_loader(
// so we build a unified tensors index for weights.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string tensor_name = std::string(cur->name);
+ if (llama_ollama_compat::should_skip_tensor(this, tensor_name.c_str())) {
+ continue;
+ }
// make sure there is no duplicated tensor names
if (weights_map.find(tensor_name) != weights_map.end()) {
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
@@ -683,6 +688,9 @@ llama_model_loader::llama_model_loader(
// Save tensors data offset info of the main file.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string tensor_name = std::string(cur->name);
+ if (llama_ollama_compat::should_skip_tensor(this, tensor_name.c_str())) {
+ continue;
+ }
// make sure there is no duplicated tensor names
if (weights_map.find(tensor_name) != weights_map.end()) {
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 4ded484dd..7d3509c23 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -6,6 +6,7 @@
#include "llama-mmap.h"
#include "llama-cparams.h"
#include "llama-model-loader.h"
+#include "llama-ollama-compat.h"
#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
@@ -8023,6 +8024,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
return false;
}
+ // Apply any Ollama-format numerical fixups (e.g. gemma3 RMSNorm +1)
+ // while the data is in its final backend buffers.
+ llama_ollama_compat::apply_tensor_transforms(&ml, ctx);
}
if (use_mmap_buffer) {
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f0e8786b6..35defa89d 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -10,6 +10,8 @@
#include "ggml-backend.h"
#include "gguf.h"
+#include "llama-ollama-compat.h"
+
#include <algorithm>
#include <cassert>
#include <cmath>
@@ -985,6 +987,11 @@ struct clip_model_loader {
ctx_meta.reset(meta);
+ // If this is an Ollama-format monolithic GGUF (text + embedded
+ // vision), translate its metadata and tensor names into the
+ // upstream mmproj shape so the rest of this loader runs unchanged.
+ llama_ollama_compat::translate_clip_metadata(ctx_gguf.get(), meta);
+
const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
// print gguf info
@@ -2358,6 +2365,7 @@ struct clip_model_loader {
auto it_off = tensor_offset.find(t->name);
GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor");
const size_t offset = it_off->second;
+ if (llama_ollama_compat::maybe_load_tensor(cur, fname.c_str(), offset, buft)) continue;
fin.seekg(offset, std::ios::beg);
if (!fin) {
throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));