avoid context overflow (#11175 )

For smaller context models, make sure we do not exceed the training size.
Re-remove cuda v11 (#10694 )
2026-04-27 03:05:43 +02:00 · 2025-06-23 15:52:50 -07:00 · 2025-06-23 14:07:00 -07:00 · 2025-06-23 09:21:12 -07:00 · 2025-06-20 12:32:51 -07:00 · 2025-06-20 11:12:01 -07:00
33 changed files with 1613 additions and 285 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -103,11 +103,6 @@ jobs:
        arch: [amd64]
        preset: ['CPU']
        include:
-          - os: windows
-            arch: amd64
-            preset: 'CUDA 11'
-            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
-            cuda-version: '11.3'
          - os: windows
            arch: amd64
            preset: 'CUDA 12'
@@ -324,8 +319,6 @@ jobs:
            case "$COMPONENT" in
              bin/ollama)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/*.so)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/cuda_v11)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/cuda_v12)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
              lib/ollama/rocm)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -46,7 +46,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            container: nvidia/cuda:11.8.0-devel-ubuntu22.04
+            container: nvidia/cuda:12.8.1-devel-ubuntu22.04
            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
          - preset: ROCm
            container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,7 +78,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+            install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
            flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
          - preset: ROCm
            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
@@ -102,7 +102,7 @@ jobs:
          $ErrorActionPreference = "Stop"
          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
+            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
          }

          $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -78,14 +78,13 @@ if(CMAKE_CUDA_COMPILER)

    find_package(CUDAToolkit)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
-    set(OLLAMA_CUDA_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/cuda_v${CUDAToolkit_VERSION_MAJOR})
    install(TARGETS ggml-cuda
        RUNTIME_DEPENDENCIES
            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
            PRE_INCLUDE_REGEXES cublas cublasLt cudart
            PRE_EXCLUDE_REGEXES ".*"
-        RUNTIME DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
-        LIBRARY DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
+        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
+        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
    )
 endif()

@@ -116,7 +115,11 @@ if(CMAKE_HIP_COMPILER)

        set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
        install(TARGETS ggml-hip
-            RUNTIME_DEPENDENCIES
+            RUNTIME_DEPENDENCY_SET rocm
+            RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
+            LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
+        )
+        install(RUNTIME_DEPENDENCY_SET rocm
                DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
                PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
                PRE_EXCLUDE_REGEXES ".*"
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -17,20 +17,12 @@
      "name": "CUDA",
      "inherits": [ "Default" ]
    },
-    {
-      "name": "CUDA 11",
-      "inherits": [ "CUDA" ],
-      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
-        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
-      }
-    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
-        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
+        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
      }
    },
    {
@@ -58,6 +50,7 @@
      "name": "ROCm 6",
      "inherits": [ "ROCm" ],
      "cacheVariables": {
+        "CMAKE_HIP_FLAGS": "-parallel-jobs=4",
        "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
      }
    }
@@ -78,11 +71,6 @@
      "configurePreset": "CUDA",
      "targets": [ "ggml-cuda" ]
    },
-    {
-      "name": "CUDA 11",
-      "inherits": [ "CUDA" ],
-      "configurePreset": "CUDA 11"
-    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
--- a/24
+++ b/24
@@ -7,12 +7,13 @@ ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2

-# CUDA v11 requires gcc v10.  v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
+# We require gcc v10 minimum.  v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
 RUN yum install -y yum-utils \
    && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
    && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
    && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
+    && dnf install -y ccache \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
 ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH

@@ -38,15 +39,6 @@ RUN --mount=type=cache,target=/root/.ccache \
        && cmake --build --parallel --preset 'CPU' \
        && cmake --install build --component CPU --strip --parallel 8

-FROM base AS cuda-11
-ARG CUDA11VERSION=11.3
-RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
-ENV PATH=/usr/local/cuda-11/bin:$PATH
-RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'CUDA 11' \
-        && cmake --build --parallel --preset 'CUDA 11' \
-        && cmake --install build --component CUDA --strip --parallel 8
-
 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
@@ -98,17 +90,15 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -trimpath -buildmode=pie -o /bin/ollama .

 FROM --platform=linux/amd64 scratch AS amd64
-COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
-COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
+COPY --from=cuda-12 dist/lib/ollama /lib/ollama

 FROM --platform=linux/arm64 scratch AS arm64
-COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
-COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
-COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
-COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
+COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa
+COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
+COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6

 FROM scratch AS rocm
-COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm
+COPY --from=rocm-6 dist/lib/ollama /lib/ollama

 FROM ${FLAVOR} AS archive
 COPY --from=cpu dist/lib/ollama /lib/ollama
--- a/README.md
+++ b/README.md
@@ -409,6 +409,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) 
 - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
 - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
+- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)

 ### Cloud

--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -2,9 +2,6 @@ package convert

 import (
 	"fmt"
-	"io"
-	"slices"
-	"strings"

 	"github.com/ollama/ollama/fs/ggml"
 )
@@ -30,65 +27,38 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 }

 func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	oldnew := []string{
-		"model.layers", "blk",
-		"w1", "ffn_gate_exps",
-		"w2", "ffn_down_exps",
-		"w3", "ffn_up_exps",
-	}
-
-	for i := range p.NumLocalExperts {
-		oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".")
-	}
-
-	// group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor
-	namer := strings.NewReplacer(oldnew...)
-	experts := make(map[string]experts)
-
-	// merge experts into a single tensor while removing them from ts
-	ts = slices.DeleteFunc(ts, func(t Tensor) bool {
-		if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") {
-			return false
-		}
-
-		name := namer.Replace(t.Name())
-		experts[name] = append(experts[name], t)
-		return true
-	})
-
-	var out []*ggml.Tensor
-	for n, e := range experts {
-		// TODO(mxyng): sanity check experts
-		out = append(out, &ggml.Tensor{
-			Name:     n,
-			Kind:     e[0].Kind(),
-			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),
-			WriterTo: e,
+	merges := make([]merge, 0, p.NumHiddenLayers*6)
+	for i := range p.NumHiddenLayers {
+		merges = append(merges, merge{
+			fmt.Sprintf("blk.%d.*.w1.weight", i),
+			fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w1.bias", i),
+			fmt.Sprintf("blk.%d.ffn_gate_exps.bias", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w2.weight", i),
+			fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w2.bias", i),
+			fmt.Sprintf("blk.%d.ffn_up_exps.bias", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w3.weight", i),
+			fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w3.bias", i),
+			fmt.Sprintf("blk.%d.ffn_down_exps.bias", i),
 		})
 	}

+	out, ts := mergeTensors(ts, merges...)
 	return append(out, p.llamaModel.Tensors(ts)...)
 }

 func (p *mixtralModel) Replacements() []string {
 	return append(
 		p.llamaModel.Replacements(),
+		"model.layers", "blk",
 		"block_sparse_moe.gate", "ffn_gate_inp",
+		"block_sparse_moe.experts.", ".",
 	)
 }
-
-type experts []Tensor
-
-func (e experts) WriteTo(w io.Writer) (int64, error) {
-	// TODO(mxyng): experts _should_ be numerically sorted by expert but this should check
-	for _, t := range e {
-		// the canonical merged experts tensor stacks all experts along a new, 0 axis,
-		// e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers
-		// this accomplishes the same thing by writing each expert tensor in sequence
-		if _, err := t.WriteTo(w); err != nil {
-			return 0, err
-		}
-	}
-
-	return 0, nil
-}
--- a/convert/tensor.go
+++ b/convert/tensor.go
@@ -2,7 +2,9 @@ package convert

 import (
 	"cmp"
+	"io"
 	"iter"
+	"path"
 	"slices"
 	"strings"

@@ -74,3 +76,54 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
 		}
 	}
 }
+
+type merge struct {
+	pattern, name string
+}
+
+// mergeTensors merges tensors that match a given pattern into a single tensor.
+func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []Tensor) {
+	var matched []Tensor
+	for i := range merges {
+		matched, unmatched = slicesSplitFunc(unmatched, func(t Tensor) bool {
+			matched, _ := path.Match(merges[i].pattern, t.Name())
+			return matched
+		})
+
+		if len(matched) > 0 {
+			out = append(out, &ggml.Tensor{
+				Name:     merges[i].name,
+				Kind:     matched[0].Kind(),
+				Shape:    append([]uint64{uint64(len(matched))}, matched[0].Shape()...),
+				WriterTo: mergeGroup(matched),
+			})
+		}
+	}
+
+	return out, unmatched
+}
+
+// slicesSplitFunc splits a slice into two slices based on a predicate function.
+func slicesSplitFunc[S ~[]E, E comparable](s S, fn func(e E) bool) (matched, unmatched S) {
+	for _, e := range s {
+		if fn(e) {
+			matched = append(matched, e)
+		} else {
+			unmatched = append(unmatched, e)
+		}
+	}
+
+	return matched, unmatched
+}
+
+type mergeGroup []Tensor
+
+func (g mergeGroup) WriteTo(w io.Writer) (int64, error) {
+	for _, t := range g {
+		if _, err := t.WriteTo(w); err != nil {
+			return 0, err
+		}
+	}
+
+	return 0, nil
+}
--- a/convert/tensor_test.go
+++ b/convert/tensor_test.go
@@ -9,6 +9,8 @@ import (
 	"strings"
 	"testing"

+	"github.com/google/go-cmp/cmp"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/pdevine/tensor"
 )

@@ -302,3 +304,99 @@ func TestSplitDim(t *testing.T) {
 		}
 	})
 }
+
+func TestMerge(t *testing.T) {
+	unmatched := []Tensor{
+		&fakeTensor{
+			name:  "a.0.b",
+			shape: []uint64{5, 2},
+			data:  []float32{10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
+		},
+		&fakeTensor{
+			name:  "a.1.b",
+			shape: []uint64{5, 2},
+			data:  []float32{20, 21, 22, 23, 24, 25, 26, 27, 28, 29},
+		},
+		&fakeTensor{
+			name:  "c.0.d",
+			shape: []uint64{5, 2},
+			data:  []float32{30, 31, 32, 33, 34, 35, 36, 37, 38, 39},
+		},
+		&fakeTensor{
+			name:  "c.1.d",
+			shape: []uint64{5, 2},
+			data:  []float32{40, 41, 42, 43, 44, 45, 46, 47, 48, 49},
+		},
+		&fakeTensor{
+			name:  "e.0.f",
+			shape: []uint64{5, 2},
+			data:  []float32{50, 51, 52, 53, 54, 55, 56, 57, 58, 59},
+		},
+	}
+
+	checkMatched := func(t *testing.T, n int, matched []*ggml.Tensor) {
+		for i := range n {
+			got := matched[i]
+			if diff := cmp.Diff([]uint64{2, 5, 2}, got.Shape); diff != "" {
+				t.Errorf("unexpected (-want +got):\n%s", diff)
+			}
+
+			var b bytes.Buffer
+			if _, err := got.WriteTo(&b); err != nil {
+				t.Fatal(err)
+			}
+
+			f32s := make([]float32, 20)
+			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+				t.Fatal(err)
+			}
+
+			offset := 10 + (i * 20)
+			want := make([]float32, 20)
+			for j := range 20 {
+				want[j] = float32(offset + j)
+			}
+
+			if diff := cmp.Diff(want, f32s); diff != "" {
+				t.Errorf("unexpected data (-want +got):\n%s", diff)
+			}
+		}
+	}
+
+	t.Run("single merge", func(t *testing.T) {
+		matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"})
+		if len(unmatched) != 3 {
+			t.Error("expected 3 remaining tensors, got", len(unmatched))
+		}
+
+		if len(matched) != 1 {
+			t.Error("expected 1 merged tensor, got", len(matched))
+		}
+
+		checkMatched(t, 1, matched)
+	})
+
+	t.Run("multiple merges", func(t *testing.T) {
+		matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"}, merge{"c.*.d", "c.d"})
+		if len(unmatched) != 1 {
+			t.Error("expected 1 remaining tensors, got", len(unmatched))
+		}
+
+		if len(matched) != 2 {
+			t.Error("expected 2 merged tensor, got", len(matched))
+		}
+
+		checkMatched(t, 2, matched)
+	})
+
+	t.Run("no match", func(t *testing.T) {
+		matched, unmatched := mergeTensors(unmatched, merge{"x.*.y", "x.y"})
+		if len(unmatched) != 5 {
+			t.Error("expected 5 remaining tensors, got", len(unmatched))
+		}
+
+		if len(matched) != 0 {
+			t.Error("expected no merged tensors, got", len(matched))
+		}
+	})
+}
--- a/discover/cuda_common.go
+++ b/discover/cuda_common.go
@@ -3,6 +3,7 @@
 package discover

 import (
+	"fmt"
 	"log/slog"
 	"os"
 	"regexp"
@@ -55,10 +56,13 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
 				}
 			}
 		}
+		return "sbsa"
 	}

 	// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
 	if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
+		// The detected driver is older than Feb 2023
+		slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
 		return "v11"
 	}
 	return "v12"
--- a/discover/path.go
+++ b/discover/path.go
@@ -12,7 +12,7 @@ import (
 // '../lib/ollama' on Linux and the executable's directory on macOS
 // note: distribution builds, additional GPU-specific libraries are
 // found in subdirectories of the returned path, such as
-// 'cuda_v11', 'cuda_v12', 'rocm', etc.
+// 'cuda_v12', 'rocm', etc.
 var LibOllamaPath string = func() string {
 	exe, err := os.Executable()
 	if err != nil {
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -1,6 +1,6 @@
 # GPU
 ## Nvidia
-Ollama supports Nvidia GPUs with compute capability 5.0+.
+Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.

 Check your compute compatibility to see if your card is supported:
 [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
 In the server log, you will see a message that looks something like this (varies from release to release):

 ```
-Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
 ```

 **Experimental LLM Library Override**
--- a/fs/gguf/gguf.go
+++ b/fs/gguf/gguf.go
@@ -0,0 +1,347 @@
+package gguf
+
+import (
+	"bytes"
+	"cmp"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"iter"
+	"os"
+	"slices"
+	"strings"
+)
+
+const (
+	typeUint8 uint32 = iota
+	typeInt8
+	typeUint16
+	typeInt16
+	typeUint32
+	typeInt32
+	typeFloat32
+	typeBool
+	typeString
+	typeArray
+	typeUint64
+	typeInt64
+	typeFloat64
+)
+
+var ErrUnsupported = errors.New("unsupported")
+
+type File struct {
+	Magic   [4]byte
+	Version uint32
+
+	keyValues *lazy[KeyValue]
+	tensors   *lazy[TensorInfo]
+	offset    int64
+
+	file   *os.File
+	reader *bufferedReader
+	bts    []byte
+}
+
+func Open(path string) (f *File, err error) {
+	f = &File{bts: make([]byte, 4096)}
+	f.file, err = os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+
+	f.reader = newBufferedReader(f.file, 32<<10)
+
+	if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil {
+		return nil, err
+	}
+
+	if bytes.Equal(f.Magic[:], []byte("gguf")) {
+		return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic)
+	}
+
+	if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil {
+		return nil, err
+	}
+
+	if f.Version < 2 {
+		return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version)
+	}
+
+	f.tensors, err = newLazy(f, f.readTensor)
+	if err != nil {
+		return nil, err
+	}
+
+	f.tensors.successFunc = func() error {
+		offset := f.reader.offset
+
+		alignment := cmp.Or(f.KeyValue("general.alignment").Int(), 32)
+		f.offset = offset + (alignment-offset%alignment)%alignment
+		return nil
+	}
+
+	f.keyValues, err = newLazy(f, f.readKeyValue)
+	if err != nil {
+		return nil, err
+	}
+
+	return f, nil
+}
+
+func (f *File) readTensor() (TensorInfo, error) {
+	name, err := readString(f)
+	if err != nil {
+		return TensorInfo{}, err
+	}
+
+	dims, err := read[uint32](f)
+	if err != nil {
+		return TensorInfo{}, err
+	}
+
+	shape := make([]uint64, dims)
+	for i := range dims {
+		shape[i], err = read[uint64](f)
+		if err != nil {
+			return TensorInfo{}, err
+		}
+	}
+
+	type_, err := read[uint32](f)
+	if err != nil {
+		return TensorInfo{}, err
+	}
+
+	offset, err := read[uint64](f)
+	if err != nil {
+		return TensorInfo{}, err
+	}
+
+	return TensorInfo{
+		Name:   name,
+		Offset: offset,
+		Shape:  shape,
+		Type:   TensorType(type_),
+	}, nil
+}
+
+func (f *File) readKeyValue() (KeyValue, error) {
+	key, err := readString(f)
+	if err != nil {
+		return KeyValue{}, err
+	}
+
+	t, err := read[uint32](f)
+	if err != nil {
+		return KeyValue{}, err
+	}
+
+	value, err := func() (any, error) {
+		switch t {
+		case typeUint8:
+			return read[uint8](f)
+		case typeInt8:
+			return read[int8](f)
+		case typeUint16:
+			return read[uint16](f)
+		case typeInt16:
+			return read[int16](f)
+		case typeUint32:
+			return read[uint32](f)
+		case typeInt32:
+			return read[int32](f)
+		case typeUint64:
+			return read[uint64](f)
+		case typeInt64:
+			return read[int64](f)
+		case typeFloat32:
+			return read[float32](f)
+		case typeFloat64:
+			return read[float64](f)
+		case typeBool:
+			return read[bool](f)
+		case typeString:
+			return readString(f)
+		case typeArray:
+			return readArray(f)
+		default:
+			return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
+		}
+	}()
+	if err != nil {
+		return KeyValue{}, err
+	}
+
+	return KeyValue{
+		Key:   key,
+		Value: Value{value},
+	}, nil
+}
+
+func read[T any](f *File) (t T, err error) {
+	err = binary.Read(f.reader, binary.LittleEndian, &t)
+	return t, err
+}
+
+func readString(f *File) (string, error) {
+	n, err := read[uint64](f)
+	if err != nil {
+		return "", err
+	}
+
+	if int(n) > len(f.bts) {
+		f.bts = make([]byte, n)
+	}
+
+	bts := f.bts[:n]
+	if _, err := io.ReadFull(f.reader, bts); err != nil {
+		return "", err
+	}
+	defer clear(bts)
+
+	return string(bts), nil
+}
+
+func readArray(f *File) (any, error) {
+	t, err := read[uint32](f)
+	if err != nil {
+		return nil, err
+	}
+
+	n, err := read[uint64](f)
+	if err != nil {
+		return nil, err
+	}
+
+	switch t {
+	case typeUint8:
+		return readArrayData[uint8](f, n)
+	case typeInt8:
+		return readArrayData[int8](f, n)
+	case typeUint16:
+		return readArrayData[uint16](f, n)
+	case typeInt16:
+		return readArrayData[int16](f, n)
+	case typeUint32:
+		return readArrayData[uint32](f, n)
+	case typeInt32:
+		return readArrayData[int32](f, n)
+	case typeUint64:
+		return readArrayData[uint64](f, n)
+	case typeInt64:
+		return readArrayData[int64](f, n)
+	case typeFloat32:
+		return readArrayData[float32](f, n)
+	case typeFloat64:
+		return readArrayData[float64](f, n)
+	case typeBool:
+		return readArrayData[bool](f, n)
+	case typeString:
+		return readArrayString(f, n)
+	default:
+		return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
+	}
+}
+
+func readArrayData[T any](f *File, n uint64) (s []T, err error) {
+	s = make([]T, n)
+	for i := range n {
+		e, err := read[T](f)
+		if err != nil {
+			return nil, err
+		}
+
+		s[i] = e
+	}
+
+	return s, nil
+}
+
+func readArrayString(f *File, n uint64) (s []string, err error) {
+	s = make([]string, n)
+	for i := range n {
+		e, err := readString(f)
+		if err != nil {
+			return nil, err
+		}
+
+		s[i] = e
+	}
+
+	return s, nil
+}
+
+func (f *File) Close() error {
+	f.keyValues.stop()
+	f.tensors.stop()
+	return f.file.Close()
+}
+
+func (f *File) KeyValue(key string) KeyValue {
+	if !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "tokenizer.") {
+		key = f.KeyValue("general.architecture").String() + "." + key
+	}
+
+	if index := slices.IndexFunc(f.keyValues.values, func(kv KeyValue) bool {
+		return kv.Key == key
+	}); index >= 0 {
+		return f.keyValues.values[index]
+	}
+
+	for keyValue, ok := f.keyValues.next(); ok; keyValue, ok = f.keyValues.next() {
+		if keyValue.Key == key {
+			return keyValue
+		}
+	}
+
+	return KeyValue{}
+}
+
+func (f *File) NumKeyValues() int {
+	return int(f.keyValues.count)
+}
+
+func (f *File) KeyValues() iter.Seq2[int, KeyValue] {
+	return f.keyValues.All()
+}
+
+func (f *File) TensorInfo(name string) TensorInfo {
+	if index := slices.IndexFunc(f.tensors.values, func(t TensorInfo) bool {
+		return t.Name == name
+	}); index >= 0 {
+		return f.tensors.values[index]
+	}
+
+	// fast-forward through key values if we haven't already
+	_ = f.keyValues.rest()
+	for tensor, ok := f.tensors.next(); ok; tensor, ok = f.tensors.next() {
+		if tensor.Name == name {
+			return tensor
+		}
+	}
+
+	return TensorInfo{}
+}
+
+func (f *File) NumTensors() int {
+	return int(f.tensors.count)
+}
+
+func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] {
+	// fast forward through key values if we haven't already
+	f.keyValues.rest()
+	return f.tensors.All()
+}
+
+func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) {
+	t := f.TensorInfo(name)
+	if t.NumBytes() == 0 {
+		return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name)
+	}
+
+	// fast forward through tensor info if we haven't already
+	_ = f.tensors.rest()
+	return t, io.NewSectionReader(f.file, f.offset+int64(t.Offset), t.NumBytes()), nil
+}
--- a/fs/gguf/gguf_test.go
+++ b/fs/gguf/gguf_test.go
@@ -0,0 +1,249 @@
+package gguf_test
+
+import (
+	"bytes"
+	"os"
+	"strconv"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/fs/gguf"
+)
+
+func createBinFile(tb testing.TB) string {
+	tb.Helper()
+	f, err := os.CreateTemp(tb.TempDir(), "")
+	if err != nil {
+		tb.Fatal(err)
+	}
+	defer f.Close()
+
+	kv := ggml.KV{
+		"general.architecture":                   "llama",
+		"llama.block_count":                      uint32(8),
+		"llama.embedding_length":                 uint32(3),
+		"llama.attention.head_count":             uint32(2),
+		"llama.attention.head_count_kv":          uint32(2),
+		"llama.attention.key_length":             uint32(3),
+		"llama.rope.dimension_count":             uint32(4),
+		"llama.rope.freq_base":                   float32(10000.0),
+		"llama.rope.freq_scale":                  float32(1.0),
+		"llama.attention.layer_norm_rms_epsilon": float32(1e-6),
+		"tokenizer.ggml.eos_token_id":            uint32(0),
+		"tokenizer.ggml.eos_token_ids":           []int32{1, 2, 3},
+		"tokenizer.ggml.tokens":                  []string{"hello", "world"},
+		"tokenizer.ggml.scores":                  []float32{0, 1},
+	}
+
+	tensors := []*ggml.Tensor{
+		{
+			Name:     "token_embd.weight",
+			Kind:     0,
+			Shape:    []uint64{2, 3},
+			WriterTo: bytes.NewBuffer(make([]byte, 4*2*3)),
+		},
+		{
+			Name:     "output.weight",
+			Kind:     0,
+			Shape:    []uint64{3, 2},
+			WriterTo: bytes.NewBuffer(make([]byte, 4*3*2)),
+		},
+	}
+
+	for i := range 8 {
+		tensors = append(tensors, &ggml.Tensor{
+			Name:     "blk." + strconv.Itoa(i) + ".attn_q.weight",
+			Kind:     0,
+			Shape:    []uint64{3, 3},
+			WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
+		}, &ggml.Tensor{
+			Name:     "blk." + strconv.Itoa(i) + ".attn_k.weight",
+			Kind:     0,
+			Shape:    []uint64{3, 3},
+			WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
+		}, &ggml.Tensor{
+			Name:     "blk." + strconv.Itoa(i) + ".attn_v.weight",
+			Kind:     0,
+			Shape:    []uint64{3, 3},
+			WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
+		}, &ggml.Tensor{
+			Name:     "blk." + strconv.Itoa(i) + ".attn_output.weight",
+			Kind:     0,
+			Shape:    []uint64{3, 3},
+			WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
+		})
+	}
+
+	if err := ggml.WriteGGUF(f, kv, tensors); err != nil {
+		tb.Fatal(err)
+	}
+
+	return f.Name()
+}
+
+func TestRead(t *testing.T) {
+	f, err := gguf.Open(createBinFile(t))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	if got := f.KeyValue("does.not.exist").Valid(); got {
+		t.Errorf(`KeyValue("does.not.exist").Exists() = %v, want false`, got)
+	}
+
+	if got := f.KeyValue("general.architecture").String(); got != "llama" {
+		t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama")
+	}
+
+	if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" {
+		t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight")
+	} else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" {
+		t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff)
+	} else if got.Type != gguf.TensorTypeF32 {
+		t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32)
+	}
+
+	if got := f.KeyValue("block_count").Uint(); got != 8 {
+		t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8)
+	}
+
+	if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" {
+		t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff)
+	}
+
+	if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" {
+		t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Ints() mismatch (-got +want):\n%s", diff)
+	}
+
+	var kvs []string
+	for _, kv := range f.KeyValues() {
+		if !kv.Valid() {
+			t.Error("found invalid key-value pair:", kv)
+		}
+
+		kvs = append(kvs, kv.Key)
+	}
+
+	if len(kvs) != f.NumKeyValues() {
+		t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues())
+	}
+
+	if diff := cmp.Diff(kvs, []string{
+		"general.architecture",
+		"llama.block_count",
+		"llama.embedding_length",
+		"llama.attention.head_count",
+		"llama.attention.head_count_kv",
+		"llama.attention.key_length",
+		"llama.rope.dimension_count",
+		"llama.rope.freq_base",
+		"llama.rope.freq_scale",
+		"llama.attention.layer_norm_rms_epsilon",
+		"tokenizer.ggml.eos_token_id",
+		"tokenizer.ggml.eos_token_ids",
+		"tokenizer.ggml.tokens",
+		"tokenizer.ggml.scores",
+	}, cmpopts.SortSlices(strings.Compare)); diff != "" {
+		t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff)
+	}
+
+	var tis []string
+	for _, ti := range f.TensorInfos() {
+		if !ti.Valid() {
+			t.Error("found invalid tensor info:", ti)
+		}
+
+		tis = append(tis, ti.Name)
+	}
+
+	if len(tis) != f.NumTensors() {
+		t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors())
+	}
+
+	if diff := cmp.Diff(tis, []string{
+		"token_embd.weight",
+		"output.weight",
+		"blk.0.attn_q.weight",
+		"blk.0.attn_k.weight",
+		"blk.0.attn_v.weight",
+		"blk.0.attn_output.weight",
+		"blk.1.attn_q.weight",
+		"blk.1.attn_k.weight",
+		"blk.1.attn_v.weight",
+		"blk.1.attn_output.weight",
+		"blk.2.attn_q.weight",
+		"blk.2.attn_k.weight",
+		"blk.2.attn_v.weight",
+		"blk.2.attn_output.weight",
+		"blk.3.attn_q.weight",
+		"blk.3.attn_k.weight",
+		"blk.3.attn_v.weight",
+		"blk.3.attn_output.weight",
+		"blk.4.attn_q.weight",
+		"blk.4.attn_k.weight",
+		"blk.4.attn_v.weight",
+		"blk.4.attn_output.weight",
+		"blk.5.attn_q.weight",
+		"blk.5.attn_k.weight",
+		"blk.5.attn_v.weight",
+		"blk.5.attn_output.weight",
+		"blk.6.attn_q.weight",
+		"blk.6.attn_k.weight",
+		"blk.6.attn_v.weight",
+		"blk.6.attn_output.weight",
+		"blk.7.attn_q.weight",
+		"blk.7.attn_k.weight",
+		"blk.7.attn_v.weight",
+		"blk.7.attn_output.weight",
+	}, cmpopts.SortSlices(strings.Compare)); diff != "" {
+		t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff)
+	}
+
+	ti, r, err := f.TensorReader("output.weight")
+	if err != nil {
+		t.Fatalf(`TensorReader("output.weight") error: %v`, err)
+	}
+
+	if ti.Name != "output.weight" {
+		t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight")
+	} else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" {
+		t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff)
+	} else if ti.Type != gguf.TensorTypeF32 {
+		t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32)
+	}
+
+	var b bytes.Buffer
+	if _, err := b.ReadFrom(r); err != nil {
+		t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err)
+	}
+
+	if b.Len() != int(ti.NumBytes()) {
+		t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes())
+	}
+}
+
+func BenchmarkRead(b *testing.B) {
+	b.ReportAllocs()
+
+	p := createBinFile(b)
+	for b.Loop() {
+		f, err := gguf.Open(p)
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		if got := f.KeyValue("general.architecture").String(); got != "llama" {
+			b.Errorf("got = %q, want %q", got, "llama")
+		}
+
+		// Iterate through some tensors
+		for range f.TensorInfos() {
+		}
+
+		f.Close()
+	}
+}
--- a/fs/gguf/keyvalue.go
+++ b/fs/gguf/keyvalue.go
@@ -0,0 +1,90 @@
+package gguf
+
+import (
+	"reflect"
+	"slices"
+)
+
+type KeyValue struct {
+	Key string
+	Value
+}
+
+func (kv KeyValue) Valid() bool {
+	return kv.Key != "" && kv.Value.value != nil
+}
+
+type Value struct {
+	value any
+}
+
+func value[T any](v Value, kinds ...reflect.Kind) (t T) {
+	vv := reflect.ValueOf(v.value)
+	if slices.Contains(kinds, vv.Kind()) {
+		t = vv.Convert(reflect.TypeOf(t)).Interface().(T)
+	}
+	return
+}
+
+func values[T any](v Value, kinds ...reflect.Kind) (ts []T) {
+	switch vv := reflect.ValueOf(v.value); vv.Kind() {
+	case reflect.Slice:
+		if slices.Contains(kinds, vv.Type().Elem().Kind()) {
+			ts = make([]T, vv.Len())
+			for i := range vv.Len() {
+				ts[i] = vv.Index(i).Convert(reflect.TypeOf(ts[i])).Interface().(T)
+			}
+		}
+	}
+	return
+}
+
+// Int returns Value as a signed integer. If it is not a signed integer, it returns 0.
+func (v Value) Int() int64 {
+	return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
+}
+
+// Ints returns Value as a signed integer slice. If it is not a signed integer slice, it returns nil.
+func (v Value) Ints() (i64s []int64) {
+	return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
+}
+
+// Uint converts an unsigned integer value to uint64. If the value is not a unsigned integer, it returns 0.
+func (v Value) Uint() uint64 {
+	return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
+}
+
+// Uints returns Value as a unsigned integer slice. If it is not a unsigned integer slice, it returns nil.
+func (v Value) Uints() (u64s []uint64) {
+	return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
+}
+
+// Float returns Value as a float. If it is not a float, it returns 0.
+func (v Value) Float() float64 {
+	return value[float64](v, reflect.Float32, reflect.Float64)
+}
+
+// Floats returns Value as a float slice. If it is not a float slice, it returns nil.
+func (v Value) Floats() (f64s []float64) {
+	return values[float64](v, reflect.Float32, reflect.Float64)
+}
+
+// Bool returns Value as a boolean. If it is not a boolean, it returns false.
+func (v Value) Bool() bool {
+	return value[bool](v, reflect.Bool)
+}
+
+// Bools returns Value as a boolean slice. If it is not a boolean slice, it returns nil.
+func (v Value) Bools() (bools []bool) {
+	return values[bool](v, reflect.Bool)
+}
+
+// String returns Value as a string. If it is not a string, it returns an empty string.
+func (v Value) String() string {
+	return value[string](v, reflect.String)
+}
+
+// Strings returns Value as a string slice. If it is not a string slice, it returns nil.
+func (v Value) Strings() (strings []string) {
+	return values[string](v, reflect.String)
+}
--- a/fs/gguf/keyvalue_test.go
+++ b/fs/gguf/keyvalue_test.go
@@ -0,0 +1,208 @@
+package gguf
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func split(name string, values map[string][]any) (matched []any, unmatched []any) {
+	for key, value := range values {
+		if key == name {
+			matched = value
+		} else {
+			unmatched = append(unmatched, value...)
+		}
+	}
+	return
+}
+
+func TestValue(t *testing.T) {
+	values := map[string][]any{
+		"int64":   {int(42), int8(42), int16(42), int32(42), int64(42)},
+		"uint64":  {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)},
+		"float64": {float32(42), float64(42)},
+		"string":  {"42", "hello"},
+		"bool":    {true, false},
+	}
+
+	t.Run("int64", func(t *testing.T) {
+		matched, unmatched := split("int64", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if i64 := kv.Int(); i64 != 42 {
+				t.Errorf("expected 42, got %d", i64)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if i64 := kv.Int(); i64 != 0 {
+				t.Errorf("expected 42, got %d", i64)
+			}
+		}
+	})
+
+	t.Run("uint64", func(t *testing.T) {
+		matched, unmatched := split("uint64", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if u64 := kv.Uint(); u64 != 42 {
+				t.Errorf("expected 42, got %d", u64)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if u64 := kv.Uint(); u64 != 0 {
+				t.Errorf("expected 42, got %d", u64)
+			}
+		}
+	})
+
+	t.Run("float64", func(t *testing.T) {
+		matched, unmatched := split("float64", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if f64 := kv.Float(); f64 != 42 {
+				t.Errorf("expected 42, got %f", f64)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if f64 := kv.Float(); f64 != 0 {
+				t.Errorf("expected 42, got %f", f64)
+			}
+		}
+	})
+
+	t.Run("string", func(t *testing.T) {
+		matched, unmatched := split("string", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if s := kv.String(); s != v {
+				t.Errorf("expected 42, got %s", s)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if s := kv.String(); s != "" {
+				t.Errorf("expected 42, got %s", s)
+			}
+		}
+	})
+
+	t.Run("bool", func(t *testing.T) {
+		matched, unmatched := split("bool", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if b := kv.Bool(); b != v {
+				t.Errorf("expected true, got %v", b)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if b := kv.Bool(); b != false {
+				t.Errorf("expected false, got %v", b)
+			}
+		}
+	})
+}
+
+func TestValues(t *testing.T) {
+	values := map[string][]any{
+		"int64s":   {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}},
+		"uint64s":  {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}},
+		"float64s": {[]float32{42}, []float64{42}},
+		"strings":  {[]string{"42"}, []string{"hello"}},
+		"bools":    {[]bool{true}, []bool{false}},
+	}
+
+	t.Run("int64s", func(t *testing.T) {
+		matched, unmatched := split("int64s", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if diff := cmp.Diff(kv.Ints(), []int64{42}); diff != "" {
+				t.Errorf("diff: %s", diff)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if i64s := kv.Ints(); i64s != nil {
+				t.Errorf("expected nil, got %v", i64s)
+			}
+		}
+	})
+
+	t.Run("uint64s", func(t *testing.T) {
+		matched, unmatched := split("uint64s", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if diff := cmp.Diff(kv.Uints(), []uint64{42}); diff != "" {
+				t.Errorf("diff: %s", diff)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if u64s := kv.Uints(); u64s != nil {
+				t.Errorf("expected nil, got %v", u64s)
+			}
+		}
+	})
+
+	t.Run("float64s", func(t *testing.T) {
+		matched, unmatched := split("float64s", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if diff := cmp.Diff(kv.Floats(), []float64{42}); diff != "" {
+				t.Errorf("diff: %s", diff)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if f64s := kv.Floats(); f64s != nil {
+				t.Errorf("expected nil, got %v", f64s)
+			}
+		}
+	})
+
+	t.Run("strings", func(t *testing.T) {
+		matched, unmatched := split("strings", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if diff := cmp.Diff(kv.Strings(), v); diff != "" {
+				t.Errorf("diff: %s", diff)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if s := kv.Strings(); s != nil {
+				t.Errorf("expected nil, got %v", s)
+			}
+		}
+	})
+
+	t.Run("bools", func(t *testing.T) {
+		matched, unmatched := split("bools", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if diff := cmp.Diff(kv.Bools(), v); diff != "" {
+				t.Errorf("diff: %s", diff)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if b := kv.Bools(); b != nil {
+				t.Errorf("expected nil, got %v", b)
+			}
+		}
+	})
+}
--- a/fs/gguf/lazy.go
+++ b/fs/gguf/lazy.go
@@ -0,0 +1,89 @@
+package gguf
+
+import (
+	"encoding/binary"
+	"iter"
+	"log/slog"
+)
+
+type lazy[T any] struct {
+	count  uint64
+	next   func() (T, bool)
+	stop   func()
+	values []T
+
+	// successFunc is called when all values have been successfully read.
+	successFunc func() error
+}
+
+func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) {
+	it := lazy[T]{}
+	if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil {
+		return nil, err
+	}
+
+	it.values = make([]T, 0)
+	it.next, it.stop = iter.Pull(func(yield func(T) bool) {
+		for i := range it.count {
+			t, err := fn()
+			if err != nil {
+				slog.Error("error reading tensor", "index", i, "error", err)
+				return
+			}
+
+			it.values = append(it.values, t)
+			if !yield(t) {
+				break
+			}
+		}
+
+		if it.successFunc != nil {
+			it.successFunc()
+		}
+	})
+
+	return &it, nil
+}
+
+func (g *lazy[T]) Values() iter.Seq[T] {
+	return func(yield func(T) bool) {
+		for _, v := range g.All() {
+			if !yield(v) {
+				break
+			}
+		}
+	}
+}
+
+func (g *lazy[T]) All() iter.Seq2[int, T] {
+	return func(yield func(int, T) bool) {
+		for i := range int(g.count) {
+			if i < len(g.values) {
+				if !yield(i, g.values[i]) {
+					break
+				}
+			} else {
+				t, ok := g.next()
+				if !ok {
+					break
+				}
+
+				if !yield(i, t) {
+					break
+				}
+			}
+		}
+	}
+}
+
+func (g *lazy[T]) rest() (collected bool) {
+	for {
+		_, ok := g.next()
+		collected = collected || ok
+		if !ok {
+			break
+		}
+	}
+
+	return collected
+}
--- a/fs/gguf/reader.go
+++ b/fs/gguf/reader.go
@@ -0,0 +1,23 @@
+package gguf
+
+import (
+	"bufio"
+	"io"
+)
+
+type bufferedReader struct {
+	offset int64
+	*bufio.Reader
+}
+
+func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader {
+	return &bufferedReader{
+		Reader: bufio.NewReaderSize(rs, size),
+	}
+}
+
+func (rs *bufferedReader) Read(p []byte) (n int, err error) {
+	n, err = rs.Reader.Read(p)
+	rs.offset += int64(n)
+	return n, err
+}
--- a/fs/gguf/tensor.go
+++ b/fs/gguf/tensor.go
@@ -0,0 +1,288 @@
+package gguf
+
+import (
+	"log/slog"
+	"strings"
+)
+
+type TensorInfo struct {
+	Name   string
+	Offset uint64
+	Shape  []uint64
+	Type   TensorType
+}
+
+func (ti TensorInfo) Valid() bool {
+	return ti.Name != "" && ti.NumBytes() > 0
+}
+
+func (ti TensorInfo) NumValues() int64 {
+	var numItems int64 = 1
+	for _, dim := range ti.Shape {
+		numItems *= int64(dim)
+	}
+	return numItems
+}
+
+// NumBytes returns the number of bytes in the tensor.
+func (ti TensorInfo) NumBytes() int64 {
+	return int64(float64(ti.NumValues()) * ti.Type.NumBytes())
+}
+
+func (ti TensorInfo) LogValue() slog.Value {
+	return slog.GroupValue(
+		slog.String("name", ti.Name),
+		slog.Int64("offset", int64(ti.Offset)),
+		slog.Any("shape", ti.Shape),
+		slog.Int64("num_values", ti.NumValues()),
+		slog.Int64("num_bytes", ti.NumBytes()),
+		slog.Any("type", ti.Type),
+	)
+}
+
+type TensorType uint32
+
+const (
+	TensorTypeF32 TensorType = iota
+	TensorTypeF16
+	TensorTypeQ4_0
+	TensorTypeQ4_1
+
+	// unexported // unused in gguf
+	tensorTypeQ4_2
+	tensorTypeQ4_3
+
+	TensorTypeQ5_0
+	TensorTypeQ5_1
+	TensorTypeQ8_0
+	TensorTypeQ8_1
+	TensorTypeQ2_K
+	TensorTypeQ3_K
+	TensorTypeQ4_K
+	TensorTypeQ5_K
+	TensorTypeQ6_K
+	TensorTypeQ8_K
+
+	// unexported // unquantizable by ollama
+	tensorTypeIQ2_XXS
+	tensorTypeIQ2_XS
+	tensorTypeIQ3_XXS
+	tensorTypeIQ1_S
+	tensorTypeIQ4_NL
+	tensorTypeIQ3_S
+	tensorTypeIQ2_S
+	tensorTypeIQ4_XS
+
+	TensorTypeI8
+	TensorTypeI16
+	TensorTypeI32
+	TensorTypeI64
+	TensorTypeF64
+
+	// unexported // unquantizable by ollama
+	tensorTypeIQ1_M
+
+	TensorTypeBF16
+
+	// unexported // unused in gguf
+	tensorTypeQ4_0_4_4
+	tensorTypeQ4_0_4_8
+	tensorTypeQ4_0_8_8
+
+	// unexported // unquantizable by ollama
+	tensorTypeTQ1_0
+	tensorTypeTQ2_0
+
+	// unexported // unused in gguf
+	tensorTypeIQ4_NL_4_4
+	tensorTypeIQ4_NL_4_8
+	tensorTypeIQ4_NL_8_8
+)
+
+func (tt TensorType) NumBytes() float64 {
+	return float64(tt.typeSize()) / float64(tt.blockSize())
+}
+
+func (tt TensorType) typeSize() int64 {
+	switch tt {
+	case TensorTypeF32:
+		return 4
+	case TensorTypeF16:
+		return 2
+	case TensorTypeQ4_0:
+		return 2 + tt.blockSize()/2
+	case TensorTypeQ4_1:
+		return 2 + 2 + tt.blockSize()/2
+	case TensorTypeQ5_0:
+		return 2 + 4 + tt.blockSize()/2
+	case TensorTypeQ5_1:
+		return 2 + 2 + 4 + tt.blockSize()/2
+	case TensorTypeQ8_0:
+		return 2 + tt.blockSize()
+	case TensorTypeQ8_1:
+		return 2 + 2 + tt.blockSize()
+	case TensorTypeQ2_K:
+		return tt.blockSize()/16 + tt.blockSize()/4 + 2 + 2
+	case TensorTypeQ3_K:
+		return tt.blockSize()/8 + tt.blockSize()/4 + 12 + 2
+	case TensorTypeQ4_K:
+		return 2 + 2 + 12 + tt.blockSize()/2
+	case TensorTypeQ5_K:
+		return 2 + 2 + 12 + tt.blockSize()/8 + tt.blockSize()/2
+	case TensorTypeQ6_K:
+		return tt.blockSize()/2 + tt.blockSize()/4 + tt.blockSize()/16 + 2
+	case TensorTypeQ8_K:
+		return 4 + tt.blockSize() + 2*tt.blockSize()/16
+	case tensorTypeIQ2_XXS:
+		return 2 + 2*tt.blockSize()/8
+	case tensorTypeIQ2_XS:
+		return 2 + 2*tt.blockSize()/8 + tt.blockSize()/32
+	case tensorTypeIQ3_XXS:
+		return 2 + tt.blockSize()/4 + tt.blockSize()/8
+	case tensorTypeIQ1_S:
+		return 2 + tt.blockSize()/8 + tt.blockSize()/16
+	case tensorTypeIQ4_NL:
+		return 2 + tt.blockSize()/2
+	case tensorTypeIQ3_S:
+		return 2 + tt.blockSize()/4 + tt.blockSize()/8 + tt.blockSize()/32 + 4
+	case tensorTypeIQ2_S:
+		return 2 + tt.blockSize()/4 + tt.blockSize()/16
+	case tensorTypeIQ4_XS:
+		return 2 + 2 + tt.blockSize()/2 + tt.blockSize()/64
+	case TensorTypeI8:
+		return 1
+	case TensorTypeI16:
+		return 2
+	case TensorTypeI32:
+		return 4
+	case TensorTypeI64:
+		return 8
+	case TensorTypeF64:
+		return 8
+	case tensorTypeIQ1_M:
+		return tt.blockSize()/8 + tt.blockSize()/16 + tt.blockSize()/32
+	case TensorTypeBF16:
+		return 2
+	default:
+		return 0
+	}
+}
+
+func (tt TensorType) blockSize() int64 {
+	switch tt {
+	case TensorTypeF32,
+		TensorTypeF16,
+		TensorTypeI8,
+		TensorTypeI16,
+		TensorTypeI32,
+		TensorTypeI64,
+		TensorTypeF64,
+		TensorTypeBF16:
+		return 1
+	case TensorTypeQ4_0,
+		TensorTypeQ4_1,
+		TensorTypeQ5_0,
+		TensorTypeQ5_1,
+		TensorTypeQ8_0,
+		TensorTypeQ8_1,
+		tensorTypeIQ4_NL:
+		return 32
+	default:
+		return 256
+	}
+}
+
+func (tt TensorType) String() string {
+	switch tt {
+	case TensorTypeF32:
+		return "f32"
+	case TensorTypeF16:
+		return "f16"
+	case TensorTypeQ4_0:
+		return "q4_0"
+	case TensorTypeQ4_1:
+		return "q4_1"
+	case tensorTypeQ4_2:
+		return "q4_2"
+	case tensorTypeQ4_3:
+		return "q4_3"
+	case TensorTypeQ5_0:
+		return "q5_0"
+	case TensorTypeQ5_1:
+		return "q5_1"
+	case TensorTypeQ8_0:
+		return "q8_0"
+	case TensorTypeQ8_1:
+		return "q8_1"
+	case TensorTypeQ2_K:
+		return "q2_k"
+	case TensorTypeQ3_K:
+		return "q3_k"
+	case TensorTypeQ4_K:
+		return "q4_k"
+	case TensorTypeQ5_K:
+		return "q5_k"
+	case TensorTypeQ6_K:
+		return "q6_k"
+	case TensorTypeQ8_K:
+		return "q8_k"
+	case tensorTypeIQ2_XXS:
+		return "iq2_xxs"
+	case tensorTypeIQ2_XS:
+		return "iq2_xs"
+	case tensorTypeIQ3_XXS:
+		return "iq3_xxs"
+	case tensorTypeIQ1_S:
+		return "iq1_s"
+	case tensorTypeIQ4_NL:
+		return "iq4_nl"
+	case tensorTypeIQ3_S:
+		return "iq3_s"
+	case tensorTypeIQ2_S:
+		return "iq2_s"
+	case tensorTypeIQ4_XS:
+		return "iq4_xs"
+	case TensorTypeI8:
+		return "i8"
+	case TensorTypeI16:
+		return "i16"
+	case TensorTypeI32:
+		return "i32"
+	case TensorTypeI64:
+		return "i64"
+	case TensorTypeF64:
+		return "f64"
+	case tensorTypeIQ1_M:
+		return "iq1_m"
+	case TensorTypeBF16:
+		return "bf16"
+	case tensorTypeQ4_0_4_4:
+		return "q4_0_4_4"
+	case tensorTypeQ4_0_4_8:
+		return "q4_0_4_8"
+	case tensorTypeQ4_0_8_8:
+		return "q4_0_8_8"
+	case tensorTypeTQ1_0:
+		return "tq1_0"
+	case tensorTypeTQ2_0:
+		return "tq2_0"
+	case tensorTypeIQ4_NL_4_4:
+		return "iq4_nl_4_4"
+	case tensorTypeIQ4_NL_4_8:
+		return "iq4_nl_4_8"
+	case tensorTypeIQ4_NL_8_8:
+		return "iq4_nl_8_8"
+	default:
+		return "unknown"
+	}
+}
+
+func (tt TensorType) LogValue() slog.Value {
+	return slog.GroupValue(
+		slog.Uint64("value", uint64(tt)),
+		slog.String("name", strings.ToUpper(tt.String())),
+		slog.Int64("size", tt.typeSize()),
+		slog.Int64("block_size", tt.blockSize()),
+		slog.Float64("num_bytes", tt.NumBytes()),
+	)
+}
--- a/go.mod
+++ b/go.mod
@@ -19,7 +19,7 @@ require (
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
-	github.com/google/go-cmp v0.6.0
+	github.com/google/go-cmp v0.7.0
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
--- a/go.sum
+++ b/go.sum
@@ -112,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
 github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
-github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
--- a/integration/model_arch_test.go
+++ b/integration/model_arch_test.go
@@ -45,6 +45,8 @@ var (
 		"qwen2.5-coder:latest",
 		"qwen:latest",
 		"solar-pro:latest",
+		"codellama:latest",
+		"nous-hermes:latest",
 	}
 )

--- a/llama/patches/0018-temporary-prevent-rocm-cuda-mixed-loading.patch
+++ b/llama/patches/0018-temporary-prevent-rocm-cuda-mixed-loading.patch
@@ -0,0 +1,32 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen <daniel@ollama.com>
+Date: Sun, 22 Jun 2025 09:22:05 -0700
+Subject: [PATCH] temporary prevent rocm+cuda mixed loading
+
+---
+ ggml/src/ggml-backend-reg.cpp | 12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
+index 4e67d243..8f49f084 100644
+--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
+@@ -573,8 +573,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
+ 
+     ggml_backend_load_best("blas", silent, dir_path);
+     ggml_backend_load_best("cann", silent, dir_path);
+-    ggml_backend_load_best("cuda", silent, dir_path);
+-    ggml_backend_load_best("hip", silent, dir_path);
+
+    // Avoid mixed hip+cuda configurations
+    const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
+    const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES"); 
+    if (!hip_devices && !rocr_devices) {
+        ggml_backend_load_best("cuda", silent, dir_path);
+    } else {
+        ggml_backend_load_best("hip", silent, dir_path);
+    }
+    
+     ggml_backend_load_best("kompute", silent, dir_path);
+     ggml_backend_load_best("metal", silent, dir_path);
+     ggml_backend_load_best("rpc", silent, dir_path);
--- a/llm/server.go
+++ b/llm/server.go
@@ -139,6 +139,13 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		gpus = discover.GetCPUInfo()
 	}

+	// Verify the requested context size is <= the model training size
+	trainCtx := f.KV().ContextLength()
+	if opts.NumCtx/numParallel > int(trainCtx) && trainCtx > 0 {
+		slog.Warn("requested context size too large for model", "num_ctx", opts.NumCtx, "num_parallel", numParallel, "n_ctx_train", trainCtx)
+		opts.NumCtx = int(trainCtx) * numParallel
+	}
+
 	estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
 	if len(gpus) > 1 || gpus[0].Library != "cpu" {
 		switch {
@@ -311,7 +318,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		params = append(params, "--mmproj", projectors[0])
 	}

-	// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
+	// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
 	// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
 	// without any LD_LIBRARY_PATH flags
 	for {
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -602,7 +602,9 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
 }

 func (c *Context) Compute(tensors ...ml.Tensor) {
-	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
+	if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
+		panic(fmt.Errorf("error computing ggml graph: %v", status))
+	}
 	C.ggml_backend_sched_reset(c.b.sched)

 	needSync := true
--- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
@@ -573,8 +573,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {

    ggml_backend_load_best("blas", silent, dir_path);
    ggml_backend_load_best("cann", silent, dir_path);
-    ggml_backend_load_best("cuda", silent, dir_path);
-    ggml_backend_load_best("hip", silent, dir_path);
+
+    // Avoid mixed hip+cuda configurations
+    const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
+    const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES"); 
+    if (!hip_devices && !rocr_devices) {
+        ggml_backend_load_best("cuda", silent, dir_path);
+    } else {
+        ggml_backend_load_best("hip", silent, dir_path);
+    }
+    
    ggml_backend_load_best("kompute", silent, dir_path);
    ggml_backend_load_best("metal", silent, dir_path);
    ggml_backend_load_best("rpc", silent, dir_path);
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -27,7 +27,6 @@ function checkEnv() {
        $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
    }
    # Locate CUDA versions
-    # Note: this assumes every version found will be built
    $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
    if ($cudaList.length -eq 0) {
        $d=(get-command -ea 'silentlycontinue' nvcc).path
@@ -94,19 +93,6 @@ function buildOllama() {

        $hashEnv = @{}
        Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
-        if ("$script:CUDA_DIRS".Contains("v11")) {
-            $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }}
-            $env:CUDAToolkit_ROOT=$hashEnv[$v11]
-            write-host "Building CUDA v11 backend libraries"
-            # Note: cuda v11 requires msvc 2019 so force the older generator
-            # to avoid 2022 (or newer) from being used as the default
-            & cmake --fresh --preset "CUDA 11" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --build --preset "CUDA 11"  --config Release --parallel $script:JOBS
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --install build --component "CUDA" --strip
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        }
        if ("$script:CUDA_DIRS".Contains("v12")) {
            $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }}
            $env:CUDAToolkit_ROOT=$hashEnv[$v12]
--- a/scripts/env.sh
+++ b/scripts/env.sh
@@ -10,9 +10,7 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
    --build-arg=GOFLAGS \
    --build-arg=OLLAMA_CUSTOM_CPU_DEFS \
    --build-arg=OLLAMA_SKIP_CUDA_GENERATE \
-    --build-arg=OLLAMA_SKIP_CUDA_11_GENERATE \
    --build-arg=OLLAMA_SKIP_CUDA_12_GENERATE \
-    --build-arg=CUDA_V11_ARCHITECTURES \
    --build-arg=CUDA_V12_ARCHITECTURES \
    --build-arg=OLLAMA_SKIP_ROCM_GENERATE \
    --build-arg=OLLAMA_FAST_BUILD \
--- a/server/images.go
+++ b/server/images.go
@@ -23,7 +23,7 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/fs/gguf"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/thinking"
@@ -73,22 +73,18 @@ func (m *Model) Capabilities() []model.Capability {
 	capabilities := []model.Capability{}

 	// Check for completion capability
-	r, err := os.Open(m.ModelPath)
+	f, err := gguf.Open(m.ModelPath)
 	if err == nil {
-		defer r.Close()
+		defer f.Close()

-		f, err := ggml.Decode(r, 1024)
-		if err == nil {
-			if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
-				capabilities = append(capabilities, model.CapabilityEmbedding)
-			} else {
-				capabilities = append(capabilities, model.CapabilityCompletion)
-			}
-			if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok {
-				capabilities = append(capabilities, model.CapabilityVision)
-			}
+		if f.KeyValue("pooling_type").Valid() {
+			capabilities = append(capabilities, model.CapabilityEmbedding)
 		} else {
-			slog.Error("couldn't decode ggml", "error", err)
+			// If no embedding is specified, we assume the model supports completion
+			capabilities = append(capabilities, model.CapabilityCompletion)
+		}
+		if f.KeyValue("vision.block_count").Valid() {
+			capabilities = append(capabilities, model.CapabilityVision)
 		}
 	} else {
 		slog.Error("couldn't open model file", "error", err)
--- a/server/images_test.go
+++ b/server/images_test.go
@@ -1,123 +1,42 @@
 package server

 import (
-	"bytes"
-	"encoding/binary"
-	"errors"
-	"os"
-	"path/filepath"
 	"strings"
 	"testing"

+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/model"
 )

-// Constants for GGUF magic bytes and version
-var (
-	ggufMagic = []byte{0x47, 0x47, 0x55, 0x46} // "GGUF"
-	ggufVer   = uint32(3)                      // Version 3
-)
-
-// Helper function to create mock GGUF data
-func createMockGGUFData(architecture string, vision bool) []byte {
-	var buf bytes.Buffer
-
-	// Write GGUF header
-	buf.Write(ggufMagic)
-	binary.Write(&buf, binary.LittleEndian, ggufVer)
-
-	// Write tensor count (0 for our test)
-	var numTensors uint64 = 0
-	binary.Write(&buf, binary.LittleEndian, numTensors)
-
-	// Calculate number of metadata entries
-	numMetaEntries := uint64(1) // architecture entry
-	if vision {
-		numMetaEntries++
-	}
-	// Add embedding entry if architecture is "bert"
-	if architecture == "bert" {
-		numMetaEntries++
-	}
-	binary.Write(&buf, binary.LittleEndian, numMetaEntries)
-
-	// Write architecture metadata
-	archKey := "general.architecture"
-	keyLen := uint64(len(archKey))
-	binary.Write(&buf, binary.LittleEndian, keyLen)
-	buf.WriteString(archKey)
-
-	// String type (8)
-	var strType uint32 = 8
-	binary.Write(&buf, binary.LittleEndian, strType)
-
-	// String length
-	strLen := uint64(len(architecture))
-	binary.Write(&buf, binary.LittleEndian, strLen)
-	buf.WriteString(architecture)
-
-	if vision {
-		visionKey := architecture + ".vision.block_count"
-		keyLen = uint64(len(visionKey))
-		binary.Write(&buf, binary.LittleEndian, keyLen)
-		buf.WriteString(visionKey)
-
-		// uint32 type (4)
-		var uint32Type uint32 = 4
-		binary.Write(&buf, binary.LittleEndian, uint32Type)
-
-		// uint32 value (1)
-		var countVal uint32 = 1
-		binary.Write(&buf, binary.LittleEndian, countVal)
-	}
-	// Write embedding metadata if architecture is "bert"
-	if architecture == "bert" {
-		poolKey := architecture + ".pooling_type"
-		keyLen = uint64(len(poolKey))
-		binary.Write(&buf, binary.LittleEndian, keyLen)
-		buf.WriteString(poolKey)
-
-		// uint32 type (4)
-		var uint32Type uint32 = 4
-		binary.Write(&buf, binary.LittleEndian, uint32Type)
-
-		// uint32 value (1)
-		var poolingVal uint32 = 1
-		binary.Write(&buf, binary.LittleEndian, poolingVal)
-	}
-
-	return buf.Bytes()
-}
-
 func TestModelCapabilities(t *testing.T) {
-	// Create a temporary directory for test files
-	tempDir := t.TempDir()
+	// Create completion model (llama architecture without vision)
+	completionModelPath, _ := createBinFile(t, ggml.KV{
+		"general.architecture": "llama",
+	}, []*ggml.Tensor{})

-	// Create different types of mock model files
-	completionModelPath := filepath.Join(tempDir, "model.bin")
-	visionModelPath := filepath.Join(tempDir, "vision_model.bin")
-	embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin")
-	// Create a simple model file for tests that don't depend on GGUF content
-	simpleModelPath := filepath.Join(tempDir, "simple_model.bin")
+	// Create vision model (llama architecture with vision block count)
+	visionModelPath, _ := createBinFile(t, ggml.KV{
+		"general.architecture":     "llama",
+		"llama.vision.block_count": uint32(1),
+	}, []*ggml.Tensor{})

-	if err := errors.Join(
-		os.WriteFile(completionModelPath, createMockGGUFData("llama", false), 0o644),
-		os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644),
-		os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644),
-		os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644),
-	); err != nil {
-		t.Fatalf("Failed to create model files: %v", err)
-	}
+	// Create embedding model (bert architecture with pooling type)
+	embeddingModelPath, _ := createBinFile(t, ggml.KV{
+		"general.architecture": "bert",
+		"bert.pooling_type":    uint32(1),
+	}, []*ggml.Tensor{})

 	toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
 	}
+
 	chatTemplate, err := template.Parse("{{ .prompt }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
 	}
+
 	toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
@@ -145,21 +64,13 @@ func TestModelCapabilities(t *testing.T) {
 			},
 			expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools, model.CapabilityInsert},
 		},
-		{
-			name: "model with tools and insert capability",
-			model: Model{
-				ModelPath: simpleModelPath,
-				Template:  toolsInsertTemplate,
-			},
-			expectedCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert},
-		},
 		{
 			name: "model with tools capability",
 			model: Model{
-				ModelPath: simpleModelPath,
+				ModelPath: completionModelPath,
 				Template:  toolsTemplate,
 			},
-			expectedCaps: []model.Capability{model.CapabilityTools},
+			expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools},
 		},
 		{
 			name: "model with vision capability",
@@ -224,29 +135,33 @@ func TestModelCapabilities(t *testing.T) {
 }

 func TestModelCheckCapabilities(t *testing.T) {
-	// Create a temporary directory for test files
-	tempDir := t.TempDir()
+	// Create simple model file for tests that don't depend on GGUF content
+	completionModelPath, _ := createBinFile(t, ggml.KV{
+		"general.architecture": "llama",
+	}, []*ggml.Tensor{})

-	visionModelPath := filepath.Join(tempDir, "vision_model.bin")
-	simpleModelPath := filepath.Join(tempDir, "model.bin")
-	embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin")
+	// Create vision model (llama architecture with vision block count)
+	visionModelPath, _ := createBinFile(t, ggml.KV{
+		"general.architecture":     "llama",
+		"llama.vision.block_count": uint32(1),
+	}, []*ggml.Tensor{})

-	if err := errors.Join(
-		os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644),
-		os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644),
-		os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644),
-	); err != nil {
-		t.Fatalf("Failed to create model files: %v", err)
-	}
+	// Create embedding model (bert architecture with pooling type)
+	embeddingModelPath, _ := createBinFile(t, ggml.KV{
+		"general.architecture": "bert",
+		"bert.pooling_type":    uint32(1),
+	}, []*ggml.Tensor{})

 	toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
 	}
+
 	chatTemplate, err := template.Parse("{{ .prompt }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
 	}
+
 	toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
@@ -261,7 +176,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "completion model without tools capability",
 			model: Model{
-				ModelPath: simpleModelPath,
+				ModelPath: completionModelPath,
 				Template:  chatTemplate,
 			},
 			checkCaps:      []model.Capability{model.CapabilityTools},
@@ -270,7 +185,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "model with all needed capabilities",
 			model: Model{
-				ModelPath: simpleModelPath,
+				ModelPath: completionModelPath,
 				Template:  toolsInsertTemplate,
 			},
 			checkCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert},
@@ -278,7 +193,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "model missing insert capability",
 			model: Model{
-				ModelPath: simpleModelPath,
+				ModelPath: completionModelPath,
 				Template:  toolsTemplate,
 			},
 			checkCaps:      []model.Capability{model.CapabilityInsert},
@@ -287,7 +202,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "model missing vision capability",
 			model: Model{
-				ModelPath: simpleModelPath,
+				ModelPath: completionModelPath,
 				Template:  toolsTemplate,
 			},
 			checkCaps:      []model.Capability{model.CapabilityVision},
@@ -312,7 +227,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "unknown capability",
 			model: Model{
-				ModelPath: simpleModelPath,
+				ModelPath: completionModelPath,
 				Template:  chatTemplate,
 			},
 			checkCaps:      []model.Capability{"unknown"},
--- a/server/quantization_test.go
+++ b/server/quantization_test.go
@@ -257,16 +257,8 @@ func TestQuantizeModel(t *testing.T) {

 	for _, tt := range cases {
 		t.Run(tt.name, func(t *testing.T) {
-			f, err := os.CreateTemp(t.TempDir(), tt.name)
-			if err != nil {
-				t.Fatal(err.Error())
-			}
-			defer f.Close()
-			err = fsggml.WriteGGUF(f, tt.kv, tt.tensors)
-			if err != nil {
-				t.Fatalf("failed to create initial model: %s", err)
-			}
-			fp, err := os.Open(f.Name())
+			p, _ := createBinFile(t, tt.kv, tt.tensors)
+			fp, err := os.Open(p)
 			if err != nil {
 				t.Fatal(err.Error())
 			}
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -112,11 +112,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	b.ctx, b.ctxDone = context.WithCancel(ctx)
 	t.Helper()

-	f, err := os.CreateTemp(t.TempDir(), modelName)
-	require.NoError(t, err)
-	defer f.Close()
-
-	require.NoError(t, ggml.WriteGGUF(f, ggml.KV{
+	p, _ := createBinFile(t, ggml.KV{
 		"general.architecture":          "llama",
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
@@ -129,14 +125,14 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	}, []*ggml.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
-	}))
-	require.NoError(t, err)
-
-	fname := f.Name()
-	model := &Model{Name: modelName, ModelPath: fname}
-	b.f, err = llm.LoadModel(model.ModelPath, 0)
-	require.NoError(t, err)
+	})

+	model := &Model{Name: modelName, ModelPath: p}
+	f, err := llm.LoadModel(model.ModelPath, 0)
+	if err != nil {
+		t.Fatal(err)
+	}
+	b.f = f
 	if duration == nil {
 		duration = &api.Duration{Duration: 5 * time.Millisecond}
 	}
Author	SHA1	Message	Date
Daniel Hiltgen	10a8e04a8d	avoid context overflow (#11175 ) For smaller context models, make sure we do not exceed the training size.	2025-06-23 15:52:50 -07:00
Daniel Hiltgen	1c6669e64c	Re-remove cuda v11 (#10694 ) * Re-remove cuda v11 Revert the revert - drop v11 support requiring drivers newer than Feb 23 This reverts commit `c6bcdc4223`. * Simplify layout With only one version of the GPU libraries, we can simplify things down somewhat. (Jetsons still require special handling) * distinct sbsa variant for linux arm64 This avoids accidentally trying to load the sbsa cuda libraries on a jetson system which results in crashes. * temporary prevent rocm+cuda mixed loading	2025-06-23 14:07:00 -07:00
AJ	2bb69b40c7	readme: add ai-hub to community integrations (#11169 )	2025-06-23 09:21:12 -07:00
Daniel Hiltgen	65bff664cb	build speedups (#11142 ) Enable parallel building of the GPU architectures.	2025-06-20 12:32:51 -07:00
Michael Yang	c088ac0e79	convert: utility for merging tensors (#11069 )	2025-06-20 11:12:01 -07:00
Michael Yang	0a066cfd91	Reapply "feat: incremental gguf parser (#10822 )" (#11114 ) (#11119 ) * Reapply "feat: incremental gguf parser (#10822)" (#11114) This reverts commit `a6e64fbdf2`. * fix older ggufs	2025-06-20 11:11:40 -07:00
Jesse Gross	87b7af6cee	ggml: Check return status for computation. We don't check the return status after computing the graph, which can silently lead to bad outputs if we try to keep going and future computation succeeds. This appears to happens in certain cases on Apple M2 devices. Fixes #11070	2025-06-19 17:12:49 -07:00
Daniel Hiltgen	f2527b08fb	int: add coverage for older models (#11137 ) Verified these fail on 0.9.1 and pass on HEAD.	2025-06-19 12:10:19 -07:00