avoid context overflow (#11175)

For smaller context models, make sure we do not exceed the training size.
Re-remove cuda v11 (#10694)
2026-04-27 11:15:40 +02:00 · 2025-06-23 15:52:50 -07:00 · 2025-06-23 14:07:00 -07:00 · 2025-06-23 09:21:12 -07:00 · 2025-06-20 12:32:51 -07:00 · 2025-06-20 11:12:01 -07:00
33 changed files with 1613 additions and 285 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -103,11 +103,6 @@ jobs:
        arch: [amd64]
        preset: ['CPU']
        include:
          - os: windows
            arch: amd64
            preset: 'CUDA 11'
            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
            cuda-version: '11.3'
          - os: windows
            arch: amd64
            preset: 'CUDA 12'
@@ -324,8 +319,6 @@ jobs:
            case "$COMPONENT" in
              bin/ollama)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/*.so)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_v11)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_v12)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
              lib/ollama/rocm)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -46,7 +46,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            container: nvidia/cuda:11.8.0-devel-ubuntu22.04
+            container: nvidia/cuda:12.8.1-devel-ubuntu22.04
            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
          - preset: ROCm
            container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,7 +78,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+            install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
            flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
          - preset: ROCm
            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
@@ -102,7 +102,7 @@ jobs:
          $ErrorActionPreference = "Stop"
          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
+            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
          }
          $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -78,14 +78,13 @@ if(CMAKE_CUDA_COMPILER)
    find_package(CUDAToolkit)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
    set(OLLAMA_CUDA_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/cuda_v${CUDAToolkit_VERSION_MAJOR})
    install(TARGETS ggml-cuda
        RUNTIME_DEPENDENCIES
            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
            PRE_INCLUDE_REGEXES cublas cublasLt cudart
            PRE_EXCLUDE_REGEXES ".*"
-        RUNTIME DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
+        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
-        LIBRARY DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
+        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
    )
 endif()
@@ -116,7 +115,11 @@ if(CMAKE_HIP_COMPILER)
        set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
        install(TARGETS ggml-hip
-            RUNTIME_DEPENDENCIES
+            RUNTIME_DEPENDENCY_SET rocm
            RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
            LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
        )
        install(RUNTIME_DEPENDENCY_SET rocm
                DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
                PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
                PRE_EXCLUDE_REGEXES ".*"
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -17,20 +17,12 @@
      "name": "CUDA",
      "inherits": [ "Default" ]
    },
    {
      "name": "CUDA 11",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
      }
    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
-        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
+        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
      }
    },
    {
@@ -58,6 +50,7 @@
      "name": "ROCm 6",
      "inherits": [ "ROCm" ],
      "cacheVariables": {
        "CMAKE_HIP_FLAGS": "-parallel-jobs=4",
        "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
      }
    }
@@ -78,11 +71,6 @@
      "configurePreset": "CUDA",
      "targets": [ "ggml-cuda" ]
    },
    {
      "name": "CUDA 11",
      "inherits": [ "CUDA" ],
      "configurePreset": "CUDA 11"
    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
--- a/24
+++ b/24
@@ -7,12 +7,13 @@ ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2
-# CUDA v11 requires gcc v10.  v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
+# We require gcc v10 minimum.  v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
 RUN yum install -y yum-utils \
    && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
    && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
    && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
    && dnf install -y ccache \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
 ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
@@ -38,15 +39,6 @@ RUN --mount=type=cache,target=/root/.ccache \
        && cmake --build --parallel --preset 'CPU' \
        && cmake --install build --component CPU --strip --parallel 8
 FROM base AS cuda-11
 ARG CUDA11VERSION=11.3
 RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
 ENV PATH=/usr/local/cuda-11/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CUDA 11' \
        && cmake --build --parallel --preset 'CUDA 11' \
        && cmake --install build --component CUDA --strip --parallel 8
 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
@@ -98,17 +90,15 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -trimpath -buildmode=pie -o /bin/ollama .
 FROM --platform=linux/amd64 scratch AS amd64
-COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
+COPY --from=cuda-12 dist/lib/ollama /lib/ollama
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
 FROM --platform=linux/arm64 scratch AS arm64
-COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
+COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa
-COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
+COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
-COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
+COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6
 COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
 FROM scratch AS rocm
-COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm
+COPY --from=rocm-6 dist/lib/ollama /lib/ollama
 FROM ${FLAVOR} AS archive
 COPY --from=cpu dist/lib/ollama /lib/ollama
--- a/README.md
+++ b/README.md
@@ -409,6 +409,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) 
 - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
 - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
 - [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
 ### Cloud
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -2,9 +2,6 @@ package convert
 import (
 	"fmt"
 	"io"
 	"slices"
 	"strings"
 	"github.com/ollama/ollama/fs/ggml"
 )
@@ -30,65 +27,38 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 }
 func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	oldnew := []string{
+	merges := make([]merge, 0, p.NumHiddenLayers*6)
-		"model.layers", "blk",
+	for i := range p.NumHiddenLayers {
-		"w1", "ffn_gate_exps",
+		merges = append(merges, merge{
-		"w2", "ffn_down_exps",
+			fmt.Sprintf("blk.%d.*.w1.weight", i),
-		"w3", "ffn_up_exps",
+			fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
-	}
+		}, merge{
-
+			fmt.Sprintf("blk.%d.*.w1.bias", i),
-	for i := range p.NumLocalExperts {
+			fmt.Sprintf("blk.%d.ffn_gate_exps.bias", i),
-		oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".")
+		}, merge{
-	}
+			fmt.Sprintf("blk.%d.*.w2.weight", i),
-
+			fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
-	// group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor
+		}, merge{
-	namer := strings.NewReplacer(oldnew...)
+			fmt.Sprintf("blk.%d.*.w2.bias", i),
-	experts := make(map[string]experts)
+			fmt.Sprintf("blk.%d.ffn_up_exps.bias", i),
-
+		}, merge{
-	// merge experts into a single tensor while removing them from ts
+			fmt.Sprintf("blk.%d.*.w3.weight", i),
-	ts = slices.DeleteFunc(ts, func(t Tensor) bool {
+			fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
-		if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") {
+		}, merge{
-			return false
+			fmt.Sprintf("blk.%d.*.w3.bias", i),
-		}
+			fmt.Sprintf("blk.%d.ffn_down_exps.bias", i),
 		name := namer.Replace(t.Name())
 		experts[name] = append(experts[name], t)
 		return true
 	})
 	var out []*ggml.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
 		out = append(out, &ggml.Tensor{
 			Name:     n,
 			Kind:     e[0].Kind(),
 			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),
 			WriterTo: e,
 		})
 	}
 	out, ts := mergeTensors(ts, merges...)
 	return append(out, p.llamaModel.Tensors(ts)...)
 }
 func (p *mixtralModel) Replacements() []string {
 	return append(
 		p.llamaModel.Replacements(),
 		"model.layers", "blk",
 		"block_sparse_moe.gate", "ffn_gate_inp",
 		"block_sparse_moe.experts.", ".",
 	)
 }
 type experts []Tensor
 func (e experts) WriteTo(w io.Writer) (int64, error) {
 	// TODO(mxyng): experts _should_ be numerically sorted by expert but this should check
 	for _, t := range e {
 		// the canonical merged experts tensor stacks all experts along a new, 0 axis,
 		// e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers
 		// this accomplishes the same thing by writing each expert tensor in sequence
 		if _, err := t.WriteTo(w); err != nil {
 			return 0, err
 		}
 	}
 	return 0, nil
 }
--- a/convert/tensor.go
+++ b/convert/tensor.go
@@ -2,7 +2,9 @@ package convert
 import (
 	"cmp"
 	"io"
 	"iter"
 	"path"
 	"slices"
 	"strings"
@@ -74,3 +76,54 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
 		}
 	}
 }
 // merge is a single merge rule: every tensor whose name matches pattern is
 // folded into one output tensor called name.
 type merge struct {
 	pattern string
 	name    string
 }
 // mergeTensors applies each merge rule in order. For a rule, every remaining
 // tensor whose name matches the rule's pattern (path.Match glob syntax) is
 // removed from the unmatched set and the group is emitted as one ggml.Tensor
 // named after the rule. The merged tensor takes its Kind from the first match
 // and its Shape is the first match's shape with the group size prepended.
 // A rule that matches nothing emits nothing; a malformed pattern is treated
 // as matching nothing because the path.Match error is discarded.
 // It returns the merged tensors and the tensors no rule claimed.
 func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []Tensor) {
 	for _, rule := range merges {
 		var group []Tensor
 		group, unmatched = slicesSplitFunc(unmatched, func(t Tensor) bool {
 			ok, _ := path.Match(rule.pattern, t.Name())
 			return ok
 		})
 		if len(group) == 0 {
 			continue
 		}
 		first := group[0]
 		kind := first.Kind()
 		shape := append([]uint64{uint64(len(group))}, first.Shape()...)
 		out = append(out, &ggml.Tensor{
 			Name:     rule.name,
 			Kind:     kind,
 			Shape:    shape,
 			WriterTo: mergeGroup(group),
 		})
 	}
 	return out, unmatched
 }
 // slicesSplitFunc partitions s by the predicate fn: elements for which fn
 // returns true go to matched, the rest go to unmatched. Relative order is
 // preserved in both results, and a result with no elements is nil.
 //
 // Elements are never compared with ==, so the element type only needs to
 // satisfy any; this also allows slices of non-comparable types.
 func slicesSplitFunc[S ~[]E, E any](s S, fn func(e E) bool) (matched, unmatched S) {
 	for _, e := range s {
 		if fn(e) {
 			matched = append(matched, e)
 		} else {
 			unmatched = append(unmatched, e)
 		}
 	}
 	return matched, unmatched
 }
 // mergeGroup is a list of tensors that are serialized as one merged tensor.
 type mergeGroup []Tensor
 // WriteTo writes every tensor in the group to w, in slice order, so the
 // output is the concatenation of the individual tensors. It returns the sum
 // of the byte counts reported by the member tensors, as io.WriterTo requires,
 // and stops at the first error, reporting the bytes counted up to that point.
 func (g mergeGroup) WriteTo(w io.Writer) (int64, error) {
 	var total int64
 	for _, t := range g {
 		n, err := t.WriteTo(w)
 		total += n
 		if err != nil {
 			return total, err
 		}
 	}
 	return total, nil
 }
--- a/convert/tensor_test.go
+++ b/convert/tensor_test.go
@@ -9,6 +9,8 @@ import (
 	"strings"
 	"testing"
 	"github.com/google/go-cmp/cmp"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/pdevine/tensor"
 )
@@ -302,3 +304,99 @@ func TestSplitDim(t *testing.T) {
 		}
 	})
 }
 // TestMerge checks mergeTensors against five 5x2 fixtures: one rule, two
 // rules, and a rule that matches nothing.
 func TestMerge(t *testing.T) {
 	// Fixture k (0-based) holds the ten values 10*(k+1) .. 10*(k+1)+9.
 	names := []string{"a.0.b", "a.1.b", "c.0.d", "c.1.d", "e.0.f"}
 	unmatched := make([]Tensor, 0, len(names))
 	for k, name := range names {
 		data := make([]float32, 10)
 		for j := range data {
 			data[j] = float32(10*(k+1) + j)
 		}
 		unmatched = append(unmatched, &fakeTensor{
 			name:  name,
 			shape: []uint64{5, 2},
 			data:  data,
 		})
 	}
 	// checkMatched verifies the first n merged tensors: each must have shape
 	// {2, 5, 2} and contain the twenty consecutive values of its two sources.
 	checkMatched := func(t *testing.T, n int, matched []*ggml.Tensor) {
 		for i := range n {
 			merged := matched[i]
 			if diff := cmp.Diff([]uint64{2, 5, 2}, merged.Shape); diff != "" {
 				t.Errorf("unexpected (-want +got):\n%s", diff)
 			}
 			var buf bytes.Buffer
 			if _, err := merged.WriteTo(&buf); err != nil {
 				t.Fatal(err)
 			}
 			got := make([]float32, 20)
 			if err := binary.Read(&buf, binary.LittleEndian, &got); err != nil {
 				t.Fatal(err)
 			}
 			start := 10 + (i * 20)
 			want := make([]float32, 20)
 			for j := range want {
 				want[j] = float32(start + j)
 			}
 			if diff := cmp.Diff(want, got); diff != "" {
 				t.Errorf("unexpected data (-want +got):\n%s", diff)
 			}
 		}
 	}
 	t.Run("single merge", func(t *testing.T) {
 		merged, rest := mergeTensors(unmatched, merge{"a.*.b", "a.b"})
 		if len(rest) != 3 {
 			t.Error("expected 3 remaining tensors, got", len(rest))
 		}
 		if len(merged) != 1 {
 			t.Error("expected 1 merged tensor, got", len(merged))
 		}
 		checkMatched(t, 1, merged)
 	})
 	t.Run("multiple merges", func(t *testing.T) {
 		merged, rest := mergeTensors(unmatched, merge{"a.*.b", "a.b"}, merge{"c.*.d", "c.d"})
 		if len(rest) != 1 {
 			t.Error("expected 1 remaining tensors, got", len(rest))
 		}
 		if len(merged) != 2 {
 			t.Error("expected 2 merged tensor, got", len(merged))
 		}
 		checkMatched(t, 2, merged)
 	})
 	t.Run("no match", func(t *testing.T) {
 		merged, rest := mergeTensors(unmatched, merge{"x.*.y", "x.y"})
 		if len(rest) != 5 {
 			t.Error("expected 5 remaining tensors, got", len(rest))
 		}
 		if len(merged) != 0 {
 			t.Error("expected no merged tensors, got", len(merged))
 		}
 	})
 }
--- a/discover/cuda_common.go
+++ b/discover/cuda_common.go
@@ -3,6 +3,7 @@
 package discover
 import (
 	"fmt"
 	"log/slog"
 	"os"
 	"regexp"
@@ -55,10 +56,13 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
 				}
 			}
 		}
 		return "sbsa"
 	}
 	// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
 	if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
 		// The detected driver is older than Feb 2023
 		slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
 		return "v11"
 	}
 	return "v12"
--- a/discover/path.go
+++ b/discover/path.go
@@ -12,7 +12,7 @@ import (
 // '../lib/ollama' on Linux and the executable's directory on macOS
 // note: distribution builds, additional GPU-specific libraries are
 // found in subdirectories of the returned path, such as
-// 'cuda_v11', 'cuda_v12', 'rocm', etc.
+// 'cuda_v12', 'rocm', etc.
 var LibOllamaPath string = func() string {
 	exe, err := os.Executable()
 	if err != nil {
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -1,6 +1,6 @@
 # GPU
 ## Nvidia
-Ollama supports Nvidia GPUs with compute capability 5.0+.
+Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.
 Check your compute compatibility to see if your card is supported:
 [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
 In the server log, you will see a message that looks something like this (varies from release to release):
 ```
-Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
 ```
 **Experimental LLM Library Override**
--- a/fs/gguf/gguf.go
+++ b/fs/gguf/gguf.go
@@ -0,0 +1,347 @@
 package gguf
 import (
 	"bytes"
 	"cmp"
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
 	"iter"
 	"os"
 	"slices"
 	"strings"
 )
 // Value type tags for key-value metadata. readKeyValue and readArray compare
 // the uint32 tag read from the file against these constants, so their order
 // (and therefore their iota values, 0 through 12) must not change.
 const (
 	typeUint8 uint32 = iota
 	typeInt8
 	typeUint16
 	typeInt16
 	typeUint32
 	typeInt32
 	typeFloat32
 	typeBool
 	typeString
 	typeArray
 	typeUint64
 	typeInt64
 	typeFloat64
 )
 // ErrUnsupported is wrapped (via %w) by the errors reported for an unsupported
 // file type, file version, or value type tag.
 var ErrUnsupported = errors.New("unsupported")
 // File is an open GGUF file whose key-value and tensor sections are decoded
 // through the lazy readers below.
 type File struct {
 	// Magic holds the first four bytes of the file, as read by Open.
 	Magic   [4]byte
 	// Version is the uint32 that follows the magic; Open rejects values below 2.
 	Version uint32
 	// keyValues and tensors decode their sections via readKeyValue and readTensor.
 	keyValues *lazy[KeyValue]
 	tensors   *lazy[TensorInfo]
 	// offset is the aligned position set by the tensors success callback;
 	// TensorReader adds a tensor's Offset to it to locate the tensor's bytes.
 	offset    int64
 	// file is the underlying handle; reader is the buffered reader wrapped around it.
 	file   *os.File
 	reader *bufferedReader
 	// bts is a scratch buffer reused by readString.
 	bts    []byte
 }
 // Open opens the GGUF file at path, validates its magic and version, and sets
 // up the lazy readers for the tensor and key-value sections.
 //
 // It returns an error wrapping ErrUnsupported when the file does not start
 // with the "GGUF" magic or its version is below 2. On any error the file
 // handle is closed before returning. On success the caller must call Close.
 func Open(path string) (f *File, err error) {
 	file, err := os.Open(path)
 	if err != nil {
 		return nil, err
 	}
 	// Every failing return below sets f to nil, so the handle is tracked in a
 	// local and closed here; otherwise the descriptor would leak.
 	defer func() {
 		if err != nil {
 			file.Close()
 		}
 	}()
 	f = &File{bts: make([]byte, 4096), file: file}
 	f.reader = newBufferedReader(f.file, 32<<10)
 	if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil {
 		return nil, err
 	}
 	// Anything that does not start with the four bytes "GGUF" is not a GGUF file.
 	if !bytes.Equal(f.Magic[:], []byte("GGUF")) {
 		return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic)
 	}
 	if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil {
 		return nil, err
 	}
 	if f.Version < 2 {
 		return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version)
 	}
 	// The tensor reader is created before the key-value reader; keep this order.
 	f.tensors, err = newLazy(f, f.readTensor)
 	if err != nil {
 		return nil, err
 	}
 	// Once the tensor section has been consumed, round the reader position up
 	// to the next alignment boundary and remember it as the tensor data base.
 	f.tensors.successFunc = func() error {
 		offset := f.reader.offset
 		// general.alignment is normally stored unsigned, for which Int() yields
 		// 0; check the unsigned value first, then signed, then default to 32.
 		kv := f.KeyValue("general.alignment")
 		alignment := cmp.Or(int64(kv.Uint()), kv.Int(), 32)
 		f.offset = offset + (alignment-offset%alignment)%alignment
 		return nil
 	}
 	f.keyValues, err = newLazy(f, f.readKeyValue)
 	if err != nil {
 		return nil, err
 	}
 	return f, nil
 }
 // readTensor decodes one tensor descriptor from the reader. The fields are
 // read in this order: name, dimension count, one uint64 per dimension, type
 // tag, offset. Any read error yields a zero TensorInfo.
 func (f *File) readTensor() (TensorInfo, error) {
 	var (
 		info TensorInfo
 		err  error
 	)
 	if info.Name, err = readString(f); err != nil {
 		return TensorInfo{}, err
 	}
 	rank, err := read[uint32](f)
 	if err != nil {
 		return TensorInfo{}, err
 	}
 	info.Shape = make([]uint64, rank)
 	for i := range info.Shape {
 		if info.Shape[i], err = read[uint64](f); err != nil {
 			return TensorInfo{}, err
 		}
 	}
 	tag, err := read[uint32](f)
 	if err != nil {
 		return TensorInfo{}, err
 	}
 	info.Type = TensorType(tag)
 	if info.Offset, err = read[uint64](f); err != nil {
 		return TensorInfo{}, err
 	}
 	return info, nil
 }
 // readKeyValue decodes one metadata entry: a string key, a uint32 type tag,
 // then a value whose encoding is chosen by the tag. An unknown tag produces
 // an error wrapping ErrUnsupported.
 func (f *File) readKeyValue() (KeyValue, error) {
 	key, err := readString(f)
 	if err != nil {
 		return KeyValue{}, err
 	}
 	tag, err := read[uint32](f)
 	if err != nil {
 		return KeyValue{}, err
 	}
 	var decoded any
 	switch tag {
 	case typeUint8:
 		decoded, err = read[uint8](f)
 	case typeInt8:
 		decoded, err = read[int8](f)
 	case typeUint16:
 		decoded, err = read[uint16](f)
 	case typeInt16:
 		decoded, err = read[int16](f)
 	case typeUint32:
 		decoded, err = read[uint32](f)
 	case typeInt32:
 		decoded, err = read[int32](f)
 	case typeUint64:
 		decoded, err = read[uint64](f)
 	case typeInt64:
 		decoded, err = read[int64](f)
 	case typeFloat32:
 		decoded, err = read[float32](f)
 	case typeFloat64:
 		decoded, err = read[float64](f)
 	case typeBool:
 		decoded, err = read[bool](f)
 	case typeString:
 		decoded, err = readString(f)
 	case typeArray:
 		decoded, err = readArray(f)
 	default:
 		err = fmt.Errorf("%w type %d", ErrUnsupported, tag)
 	}
 	if err != nil {
 		return KeyValue{}, err
 	}
 	return KeyValue{Key: key, Value: Value{decoded}}, nil
 }
 // read decodes one little-endian value of type T from the file's reader and
 // returns it together with any error from binary.Read.
 func read[T any](f *File) (t T, err error) {
 	if err = binary.Read(f.reader, binary.LittleEndian, &t); err != nil {
 		return t, err
 	}
 	return t, nil
 }
 // readString decodes a length-prefixed string: a uint64 byte count followed
 // by that many bytes. The bytes are staged in the shared scratch buffer f.bts,
 // which is grown when too small and zeroed again after a successful read.
 func readString(f *File) (string, error) {
 	size, err := read[uint64](f)
 	if err != nil {
 		return "", err
 	}
 	if len(f.bts) < int(size) {
 		f.bts = make([]byte, size)
 	}
 	buf := f.bts[:size]
 	if _, err := io.ReadFull(f.reader, buf); err != nil {
 		return "", err
 	}
 	// Copy out before wiping the scratch space for the next caller.
 	s := string(buf)
 	clear(buf)
 	return s, nil
 }
 // readArray decodes an array value: a uint32 element type tag, a uint64
 // element count, then the elements. The result is a typed slice (for example
 // []int32 or []string). Nested arrays and unknown tags yield an error
 // wrapping ErrUnsupported.
 func readArray(f *File) (any, error) {
 	elemType, err := read[uint32](f)
 	if err != nil {
 		return nil, err
 	}
 	count, err := read[uint64](f)
 	if err != nil {
 		return nil, err
 	}
 	switch elemType {
 	case typeUint8:
 		return readArrayData[uint8](f, count)
 	case typeInt8:
 		return readArrayData[int8](f, count)
 	case typeUint16:
 		return readArrayData[uint16](f, count)
 	case typeInt16:
 		return readArrayData[int16](f, count)
 	case typeUint32:
 		return readArrayData[uint32](f, count)
 	case typeInt32:
 		return readArrayData[int32](f, count)
 	case typeUint64:
 		return readArrayData[uint64](f, count)
 	case typeInt64:
 		return readArrayData[int64](f, count)
 	case typeFloat32:
 		return readArrayData[float32](f, count)
 	case typeFloat64:
 		return readArrayData[float64](f, count)
 	case typeBool:
 		return readArrayData[bool](f, count)
 	case typeString:
 		return readArrayString(f, count)
 	}
 	return nil, fmt.Errorf("%w type %d", ErrUnsupported, elemType)
 }
 // readArrayData decodes n consecutive fixed-size values of type T. It
 // returns a nil slice together with the error if any element fails to read.
 func readArrayData[T any](f *File, n uint64) (s []T, err error) {
 	out := make([]T, n)
 	for idx := range out {
 		if out[idx], err = read[T](f); err != nil {
 			return nil, err
 		}
 	}
 	return out, nil
 }
 // readArrayString decodes n consecutive length-prefixed strings. It returns
 // a nil slice together with the error if any element fails to read.
 func readArrayString(f *File, n uint64) (s []string, err error) {
 	out := make([]string, n)
 	for idx := range out {
 		if out[idx], err = readString(f); err != nil {
 			return nil, err
 		}
 	}
 	return out, nil
 }
 // Close stops both lazy readers (key-values first, then tensors) and then
 // closes the underlying file, returning the file's close error.
 func (f *File) Close() error {
 	f.keyValues.stop()
 	f.tensors.stop()
 	closeErr := f.file.Close()
 	return closeErr
 }
 // KeyValue looks up a metadata entry by key. Keys that do not start with
 // "general." or "tokenizer." are first prefixed with the file's
 // general.architecture value and a dot. Entries already decoded are searched
 // first; after that, further entries are pulled from the lazy reader until
 // the key is found or the reader is exhausted. A zero KeyValue means the key
 // was not found.
 func (f *File) KeyValue(key string) KeyValue {
 	qualified := strings.HasPrefix(key, "general.") || strings.HasPrefix(key, "tokenizer.")
 	if !qualified {
 		key = f.KeyValue("general.architecture").String() + "." + key
 	}
 	for _, seen := range f.keyValues.values {
 		if seen.Key == key {
 			return seen
 		}
 	}
 	for {
 		kv, ok := f.keyValues.next()
 		if !ok {
 			return KeyValue{}
 		}
 		if kv.Key == key {
 			return kv
 		}
 	}
 }
 // NumKeyValues returns the key-value reader's count field converted to int.
 // NOTE(review): count is presumably the entry total from the file header — confirm in lazy.
 func (f *File) NumKeyValues() int {
 	return int(f.keyValues.count)
 }
 // KeyValues returns an iterator of (index, KeyValue) pairs produced by the
 // key-value reader's All method.
 func (f *File) KeyValues() iter.Seq2[int, KeyValue] {
 	return f.keyValues.All()
 }
 // TensorInfo looks up a tensor descriptor by name. Descriptors already
 // decoded are searched first. On a miss, the key-value section is drained
 // (it precedes the tensor section in the read order) and further descriptors
 // are pulled from the lazy reader until the name is found or the reader is
 // exhausted. A zero TensorInfo means the tensor was not found.
 func (f *File) TensorInfo(name string) TensorInfo {
 	for _, seen := range f.tensors.values {
 		if seen.Name == name {
 			return seen
 		}
 	}
 	// the tensor section cannot be read until all key values have been consumed
 	_ = f.keyValues.rest()
 	for {
 		info, ok := f.tensors.next()
 		if !ok {
 			return TensorInfo{}
 		}
 		if info.Name == name {
 			return info
 		}
 	}
 }
 // NumTensors returns the tensor reader's count field converted to int.
 // NOTE(review): count is presumably the tensor total from the file header — confirm in lazy.
 func (f *File) NumTensors() int {
 	return int(f.tensors.count)
 }
 // TensorInfos returns an iterator of (index, TensorInfo) pairs. Any
 // key-value entries not yet decoded are drained first so the reader is
 // positioned at the tensor section.
 func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] {
 	_ = f.keyValues.rest()
 	all := f.tensors.All()
 	return all
 }
 // TensorReader returns the descriptor for the named tensor and a reader
 // limited to exactly that tensor's bytes. A tensor whose NumBytes is zero is
 // reported as not found. The remaining tensor descriptors are drained before
 // the data position is computed, because f.offset is only set once the
 // tensor section has been fully read.
 func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) {
 	info := f.TensorInfo(name)
 	size := info.NumBytes()
 	if size == 0 {
 		return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name)
 	}
 	_ = f.tensors.rest()
 	start := f.offset + int64(info.Offset)
 	return info, io.NewSectionReader(f.file, start, size), nil
 }
--- a/fs/gguf/gguf_test.go
+++ b/fs/gguf/gguf_test.go
@@ -0,0 +1,249 @@
 package gguf_test
 import (
 	"bytes"
 	"os"
 	"strconv"
 	"strings"
 	"testing"
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/fs/gguf"
 )
 // createBinFile writes a small GGUF fixture into the test's temp directory
 // and returns its path. The fixture has 14 metadata keys and 34 tensors:
 // token_embd.weight, output.weight and four attention weights for each of
 // eight blocks. All tensor data is zero bytes.
 func createBinFile(tb testing.TB) string {
 	tb.Helper()
 	f, err := os.CreateTemp(tb.TempDir(), "")
 	if err != nil {
 		tb.Fatal(err)
 	}
 	defer f.Close()
 	// zeros returns a buffer holding n zeroed 4-byte elements.
 	zeros := func(n int) *bytes.Buffer {
 		return bytes.NewBuffer(make([]byte, 4*n))
 	}
 	kv := ggml.KV{
 		"general.architecture":                   "llama",
 		"llama.block_count":                      uint32(8),
 		"llama.embedding_length":                 uint32(3),
 		"llama.attention.head_count":             uint32(2),
 		"llama.attention.head_count_kv":          uint32(2),
 		"llama.attention.key_length":             uint32(3),
 		"llama.rope.dimension_count":             uint32(4),
 		"llama.rope.freq_base":                   float32(10000.0),
 		"llama.rope.freq_scale":                  float32(1.0),
 		"llama.attention.layer_norm_rms_epsilon": float32(1e-6),
 		"tokenizer.ggml.eos_token_id":            uint32(0),
 		"tokenizer.ggml.eos_token_ids":           []int32{1, 2, 3},
 		"tokenizer.ggml.tokens":                  []string{"hello", "world"},
 		"tokenizer.ggml.scores":                  []float32{0, 1},
 	}
 	tensors := []*ggml.Tensor{
 		{Name: "token_embd.weight", Kind: 0, Shape: []uint64{2, 3}, WriterTo: zeros(2 * 3)},
 		{Name: "output.weight", Kind: 0, Shape: []uint64{3, 2}, WriterTo: zeros(3 * 2)},
 	}
 	for i := range 8 {
 		prefix := "blk." + strconv.Itoa(i) + "."
 		for _, part := range []string{"attn_q", "attn_k", "attn_v", "attn_output"} {
 			tensors = append(tensors, &ggml.Tensor{
 				Name:     prefix + part + ".weight",
 				Kind:     0,
 				Shape:    []uint64{3, 3},
 				WriterTo: zeros(3 * 3),
 			})
 		}
 	}
 	if err := ggml.WriteGGUF(f, kv, tensors); err != nil {
 		tb.Fatal(err)
 	}
 	return f.Name()
 }
 // TestRead opens the fixture written by createBinFile and checks key lookup
 // (including the architecture-prefixed short form), tensor lookup, full
 // iteration of both sections, and reading a tensor's bytes.
 func TestRead(t *testing.T) {
 	f, err := gguf.Open(createBinFile(t))
 	if err != nil {
 		t.Fatal(err)
 	}
 	defer f.Close()
 	if got := f.KeyValue("does.not.exist").Valid(); got {
 		t.Errorf(`KeyValue("does.not.exist").Valid() = %v, want false`, got)
 	}
 	if got := f.KeyValue("general.architecture").String(); got != "llama" {
 		t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama")
 	}
 	if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" {
 		t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight")
 	} else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" {
 		t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff)
 	} else if got.Type != gguf.TensorTypeF32 {
 		t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32)
 	}
 	// "block_count" has no general./tokenizer. prefix, so it resolves to "llama.block_count"
 	if got := f.KeyValue("block_count").Uint(); got != 8 {
 		t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8)
 	}
 	if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" {
 		t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff)
 	}
 	if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" {
 		t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Floats() mismatch (-got +want):\n%s", diff)
 	}
 	var kvs []string
 	for _, kv := range f.KeyValues() {
 		if !kv.Valid() {
 			t.Error("found invalid key-value pair:", kv)
 		}
 		kvs = append(kvs, kv.Key)
 	}
 	if len(kvs) != f.NumKeyValues() {
 		t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues())
 	}
 	// both sides are sorted before comparison, so on-disk order does not matter
 	if diff := cmp.Diff(kvs, []string{
 		"general.architecture",
 		"llama.block_count",
 		"llama.embedding_length",
 		"llama.attention.head_count",
 		"llama.attention.head_count_kv",
 		"llama.attention.key_length",
 		"llama.rope.dimension_count",
 		"llama.rope.freq_base",
 		"llama.rope.freq_scale",
 		"llama.attention.layer_norm_rms_epsilon",
 		"tokenizer.ggml.eos_token_id",
 		"tokenizer.ggml.eos_token_ids",
 		"tokenizer.ggml.tokens",
 		"tokenizer.ggml.scores",
 	}, cmpopts.SortSlices(strings.Compare)); diff != "" {
 		t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff)
 	}
 	var tis []string
 	for _, ti := range f.TensorInfos() {
 		if !ti.Valid() {
 			t.Error("found invalid tensor info:", ti)
 		}
 		tis = append(tis, ti.Name)
 	}
 	if len(tis) != f.NumTensors() {
 		t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors())
 	}
 	if diff := cmp.Diff(tis, []string{
 		"token_embd.weight",
 		"output.weight",
 		"blk.0.attn_q.weight",
 		"blk.0.attn_k.weight",
 		"blk.0.attn_v.weight",
 		"blk.0.attn_output.weight",
 		"blk.1.attn_q.weight",
 		"blk.1.attn_k.weight",
 		"blk.1.attn_v.weight",
 		"blk.1.attn_output.weight",
 		"blk.2.attn_q.weight",
 		"blk.2.attn_k.weight",
 		"blk.2.attn_v.weight",
 		"blk.2.attn_output.weight",
 		"blk.3.attn_q.weight",
 		"blk.3.attn_k.weight",
 		"blk.3.attn_v.weight",
 		"blk.3.attn_output.weight",
 		"blk.4.attn_q.weight",
 		"blk.4.attn_k.weight",
 		"blk.4.attn_v.weight",
 		"blk.4.attn_output.weight",
 		"blk.5.attn_q.weight",
 		"blk.5.attn_k.weight",
 		"blk.5.attn_v.weight",
 		"blk.5.attn_output.weight",
 		"blk.6.attn_q.weight",
 		"blk.6.attn_k.weight",
 		"blk.6.attn_v.weight",
 		"blk.6.attn_output.weight",
 		"blk.7.attn_q.weight",
 		"blk.7.attn_k.weight",
 		"blk.7.attn_v.weight",
 		"blk.7.attn_output.weight",
 	}, cmpopts.SortSlices(strings.Compare)); diff != "" {
 		t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff)
 	}
 	ti, r, err := f.TensorReader("output.weight")
 	if err != nil {
 		t.Fatalf(`TensorReader("output.weight") error: %v`, err)
 	}
 	if ti.Name != "output.weight" {
 		t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight")
 	} else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" {
 		t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff)
 	} else if ti.Type != gguf.TensorTypeF32 {
 		t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32)
 	}
 	// the section reader must yield exactly NumBytes bytes
 	var b bytes.Buffer
 	if _, err := b.ReadFrom(r); err != nil {
 		t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err)
 	}
 	if b.Len() != int(ti.NumBytes()) {
 		t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes())
 	}
 }
 // BenchmarkRead measures one open / key lookup / full tensor iteration /
 // close cycle against the fixture written by createBinFile.
 func BenchmarkRead(b *testing.B) {
 	b.ReportAllocs()
 	fixture := createBinFile(b)
 	for b.Loop() {
 		f, err := gguf.Open(fixture)
 		if err != nil {
 			b.Fatal(err)
 		}
 		arch := f.KeyValue("general.architecture").String()
 		if arch != "llama" {
 			b.Errorf("got = %q, want %q", arch, "llama")
 		}
 		// drain the tensor iterator; only the decoding cost matters here
 		for range f.TensorInfos() {
 		}
 		f.Close()
 	}
 }
--- a/fs/gguf/keyvalue.go
+++ b/fs/gguf/keyvalue.go
@@ -0,0 +1,90 @@
 package gguf
 import (
 	"reflect"
 	"slices"
 )
 // KeyValue pairs a Key with an embedded Value.
 type KeyValue struct {
 	// Key is the entry's name; Valid reports false when it is empty.
 	Key string
 	// Value is embedded, so its accessor methods are promoted onto KeyValue.
 	Value
 }
 // Valid reports whether kv has a non-empty key and a non-nil underlying value.
 func (kv KeyValue) Valid() bool {
 	if kv.Key == "" {
 		return false
 	}
 	return kv.Value.value != nil
 }
 // Value wraps a dynamically typed value. The typed accessor methods inspect
 // it with reflection and return a zero value when the kind does not match.
 type Value struct {
 	// value is the wrapped datum; nil means no value is present.
 	value any
 }
 // value converts the datum held by v to T when its reflect.Kind is one of
 // kinds. For any other kind (including a nil datum) it returns the zero
 // value of T.
 func value[T any](v Value, kinds ...reflect.Kind) (t T) {
 	held := reflect.ValueOf(v.value)
 	if !slices.Contains(kinds, held.Kind()) {
 		return t
 	}
 	return held.Convert(reflect.TypeOf(t)).Interface().(T)
 }
 // values converts the slice held by v to []T when the slice's element kind
 // is one of kinds. It returns nil when v does not hold a slice or the element
 // kind is not accepted.
 func values[T any](v Value, kinds ...reflect.Kind) (ts []T) {
 	held := reflect.ValueOf(v.value)
 	if held.Kind() != reflect.Slice {
 		return nil
 	}
 	if !slices.Contains(kinds, held.Type().Elem().Kind()) {
 		return nil
 	}
 	ts = make([]T, held.Len())
 	for i := range ts {
 		ts[i] = held.Index(i).Convert(reflect.TypeOf(ts[i])).Interface().(T)
 	}
 	return ts
 }
 // Int returns the value widened to int64 when it holds any signed integer
 // kind. For every other kind it returns 0.
 func (v Value) Int() int64 {
 	signed := []reflect.Kind{reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64}
 	return value[int64](v, signed...)
 }
 // Ints returns the value as []int64 when it holds a slice of any signed
 // integer kind. For everything else it returns nil.
 func (v Value) Ints() (i64s []int64) {
 	signed := []reflect.Kind{reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64}
 	i64s = values[int64](v, signed...)
 	return i64s
 }
 // Uint returns the value widened to uint64 when it holds any unsigned
 // integer kind. For every other kind it returns 0.
 func (v Value) Uint() uint64 {
 	unsigned := []reflect.Kind{reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64}
 	return value[uint64](v, unsigned...)
 }
 // Uints returns the value as []uint64 when it holds a slice of any unsigned
 // integer kind. For everything else it returns nil.
 func (v Value) Uints() (u64s []uint64) {
 	unsigned := []reflect.Kind{reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64}
 	u64s = values[uint64](v, unsigned...)
 	return u64s
 }
 // Float returns the value widened to float64 when it holds a float32 or
 // float64. For every other kind it returns 0.
 func (v Value) Float() float64 {
 	floats := []reflect.Kind{reflect.Float32, reflect.Float64}
 	return value[float64](v, floats...)
 }
 // Floats returns the value as []float64 when it holds a slice of float32 or
 // float64. For everything else it returns nil.
 func (v Value) Floats() (f64s []float64) {
 	floats := []reflect.Kind{reflect.Float32, reflect.Float64}
 	f64s = values[float64](v, floats...)
 	return f64s
 }
 // Bool returns the held boolean. When the value is not a boolean it returns
 // false.
 func (v Value) Bool() bool {
 	const kind = reflect.Bool
 	return value[bool](v, kind)
 }
 // Bools returns the held boolean slice. When the value is not a boolean
 // slice it returns nil.
 func (v Value) Bools() (bools []bool) {
 	const kind = reflect.Bool
 	bools = values[bool](v, kind)
 	return bools
 }
 // String returns the held string. When the value is not a string it returns
 // the empty string.
 func (v Value) String() string {
 	const kind = reflect.String
 	return value[string](v, kind)
 }
 // Strings returns the held string slice. When the value is not a string
 // slice it returns nil.
 func (v Value) Strings() (strings []string) {
 	const kind = reflect.String
 	strings = values[string](v, kind)
 	return strings
 }
--- a/fs/gguf/keyvalue_test.go
+++ b/fs/gguf/keyvalue_test.go
@@ -0,0 +1,208 @@
 package gguf
 import (
 	"testing"
 	"github.com/google/go-cmp/cmp"
 )
 // split partitions values by key. The slice stored under name is returned as
 // matched, and the elements of every other slice are concatenated into
 // unmatched. The order of unmatched follows map iteration order and is
 // therefore unspecified.
 func split(name string, values map[string][]any) (matched []any, unmatched []any) {
 	for k, group := range values {
 		if k != name {
 			unmatched = append(unmatched, group...)
 			continue
 		}
 		matched = group
 	}
 	return matched, unmatched
 }
 // TestValue checks every scalar accessor on Value. Each subtest feeds the
 // accessor the fixtures of its own kind (which must round-trip) and the
 // fixtures of every other kind (which must yield the zero value).
 func TestValue(t *testing.T) {
 	values := map[string][]any{
 		"int64":   {int(42), int8(42), int16(42), int32(42), int64(42)},
 		"uint64":  {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)},
 		"float64": {float32(42), float64(42)},
 		"string":  {"42", "hello"},
 		"bool":    {true, false},
 	}
 	t.Run("int64", func(t *testing.T) {
 		matched, unmatched := split("int64", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			if i64 := kv.Int(); i64 != 42 {
 				t.Errorf("expected 42, got %d", i64)
 			}
 		}
 		// Values of any other kind must fall back to the zero value.
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if i64 := kv.Int(); i64 != 0 {
 				t.Errorf("expected 0, got %d", i64)
 			}
 		}
 	})
 	t.Run("uint64", func(t *testing.T) {
 		matched, unmatched := split("uint64", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			if u64 := kv.Uint(); u64 != 42 {
 				t.Errorf("expected 42, got %d", u64)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if u64 := kv.Uint(); u64 != 0 {
 				t.Errorf("expected 0, got %d", u64)
 			}
 		}
 	})
 	t.Run("float64", func(t *testing.T) {
 		matched, unmatched := split("float64", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			if f64 := kv.Float(); f64 != 42 {
 				t.Errorf("expected 42, got %f", f64)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if f64 := kv.Float(); f64 != 0 {
 				t.Errorf("expected 0, got %f", f64)
 			}
 		}
 	})
 	t.Run("string", func(t *testing.T) {
 		matched, unmatched := split("string", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			// The fixtures differ ("42", "hello"), so report the actual expectation.
 			if s := kv.String(); s != v {
 				t.Errorf("expected %v, got %s", v, s)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if s := kv.String(); s != "" {
 				t.Errorf("expected empty string, got %q", s)
 			}
 		}
 	})
 	t.Run("bool", func(t *testing.T) {
 		matched, unmatched := split("bool", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			// The fixtures include both true and false.
 			if b := kv.Bool(); b != v {
 				t.Errorf("expected %v, got %v", v, b)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if b := kv.Bool(); b != false {
 				t.Errorf("expected false, got %v", b)
 			}
 		}
 	})
 }
 func TestValues(t *testing.T) {
 	values := map[string][]any{
 		"int64s":   {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}},
 		"uint64s":  {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}},
 		"float64s": {[]float32{42}, []float64{42}},
 		"strings":  {[]string{"42"}, []string{"hello"}},
 		"bools":    {[]bool{true}, []bool{false}},
 	}
 	t.Run("int64s", func(t *testing.T) {
 		matched, unmatched := split("int64s", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			if diff := cmp.Diff(kv.Ints(), []int64{42}); diff != "" {
 				t.Errorf("diff: %s", diff)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if i64s := kv.Ints(); i64s != nil {
 				t.Errorf("expected nil, got %v", i64s)
 			}
 		}
 	})
 	t.Run("uint64s", func(t *testing.T) {
 		matched, unmatched := split("uint64s", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			if diff := cmp.Diff(kv.Uints(), []uint64{42}); diff != "" {
 				t.Errorf("diff: %s", diff)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if u64s := kv.Uints(); u64s != nil {
 				t.Errorf("expected nil, got %v", u64s)
 			}
 		}
 	})
 	t.Run("float64s", func(t *testing.T) {
 		matched, unmatched := split("float64s", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			if diff := cmp.Diff(kv.Floats(), []float64{42}); diff != "" {
 				t.Errorf("diff: %s", diff)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if f64s := kv.Floats(); f64s != nil {
 				t.Errorf("expected nil, got %v", f64s)
 			}
 		}
 	})
 	t.Run("strings", func(t *testing.T) {
 		matched, unmatched := split("strings", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			if diff := cmp.Diff(kv.Strings(), v); diff != "" {
 				t.Errorf("diff: %s", diff)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if s := kv.Strings(); s != nil {
 				t.Errorf("expected nil, got %v", s)
 			}
 		}
 	})
 	t.Run("bools", func(t *testing.T) {
 		matched, unmatched := split("bools", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			if diff := cmp.Diff(kv.Bools(), v); diff != "" {
 				t.Errorf("diff: %s", diff)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if b := kv.Bools(); b != nil {
 				t.Errorf("expected nil, got %v", b)
 			}
 		}
 	})
 }
--- a/fs/gguf/lazy.go
+++ b/fs/gguf/lazy.go
@@ -0,0 +1,89 @@
 package gguf
 import (
 	"encoding/binary"
 	"iter"
 	"log/slog"
 )
 // lazy is an on-demand sequence of T that remembers every element it has
 // already produced, so the sequence can be replayed from the start.
 type lazy[T any] struct {
 	// count is the total number of elements; newLazy reads it from the file.
 	count  uint64
 	// next pulls the next element; it reports false once no more are available.
 	next   func() (T, bool)
 	// stop ends the underlying pull iterator.
 	stop   func()
 	// values caches the elements produced so far, in order.
 	values []T
 	// successFunc is called when all values have been successfully read.
 	successFunc func() error
 }
 // newLazy reads a little-endian uint64 element count from f.reader and
 // returns a lazy sequence that calls fn once per element, on demand. Every
 // element produced is appended to the sequence's cache before it is handed
 // to the consumer.
 //
 // If fn fails, the error is logged and the sequence ends early. successFunc,
 // when set, runs only after all count elements have been read; a non-nil
 // error from it is logged. newLazy itself returns an error only when the
 // count cannot be read.
 func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) {
 	it := lazy[T]{}
 	if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil {
 		return nil, err
 	}
 	it.values = make([]T, 0)
 	it.next, it.stop = iter.Pull(func(yield func(T) bool) {
 		for i := range it.count {
 			t, err := fn()
 			if err != nil {
 				slog.Error("error reading tensor", "index", i, "error", err)
 				return
 			}
 			it.values = append(it.values, t)
 			if !yield(t) {
 				// stop was called; leave the loop without reading further.
 				break
 			}
 		}
 		// An early stop leaves the cache short of count; in that case not all
 		// values were read and the success callback must not fire.
 		if uint64(len(it.values)) < it.count {
 			return
 		}
 		if it.successFunc != nil {
 			if err := it.successFunc(); err != nil {
 				slog.Error("error running success callback", "error", err)
 			}
 		}
 	})
 	return &it, nil
 }
 // Values returns a sequence over the elements only, in the same order as
 // All but without the index.
 func (g *lazy[T]) Values() iter.Seq[T] {
 	return func(yield func(T) bool) {
 		g.All()(func(_ int, item T) bool {
 			return yield(item)
 		})
 	}
 }
 // All returns a sequence of (index, element) pairs. Elements already cached
 // are replayed from the cache; the rest are pulled with next. Iteration ends
 // when the consumer stops, when count elements have been yielded, or when
 // next reports that no more elements are available.
 func (g *lazy[T]) All() iter.Seq2[int, T] {
 	return func(yield func(int, T) bool) {
 		total := int(g.count)
 		for idx := 0; idx < total; idx++ {
 			var item T
 			if idx < len(g.values) {
 				item = g.values[idx]
 			} else {
 				pulled, ok := g.next()
 				if !ok {
 					return
 				}
 				item = pulled
 			}
 			if !yield(idx, item) {
 				return
 			}
 		}
 	}
 }
 // rest pulls from next until it reports no more elements. It returns true
 // when at least one element was pulled by this call.
 func (g *lazy[T]) rest() (collected bool) {
 	for {
 		if _, ok := g.next(); !ok {
 			return collected
 		}
 		collected = true
 	}
 }
--- a/fs/gguf/reader.go
+++ b/fs/gguf/reader.go
@@ -0,0 +1,23 @@
 package gguf
 import (
 	"bufio"
 	"io"
 )
 // bufferedReader is a bufio.Reader that also counts the bytes handed out
 // through its Read method.
 type bufferedReader struct {
 	// offset is the running total of bytes returned by Read.
 	offset int64
 	// Reader is embedded; NOTE(review): its other promoted methods do not
 	// appear to update offset — confirm callers only go through Read.
 	*bufio.Reader
 }
 // newBufferedReader wraps rs in a buffered reader with a buffer of at least
 // size bytes; the byte offset starts at zero.
 func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader {
 	br := new(bufferedReader)
 	br.Reader = bufio.NewReaderSize(rs, size)
 	return br
 }
 // Read reads into p through the embedded bufio.Reader and advances offset by
 // the number of bytes read, including on a short read that also returns an
 // error.
 func (rs *bufferedReader) Read(p []byte) (n int, err error) {
 	read, readErr := rs.Reader.Read(p)
 	rs.offset += int64(read)
 	return read, readErr
 }
--- a/fs/gguf/tensor.go
+++ b/fs/gguf/tensor.go
@@ -0,0 +1,288 @@
 package gguf
 import (
 	"log/slog"
 	"strings"
 )
 // TensorInfo describes a single tensor: its name, location, shape and
 // element type.
 type TensorInfo struct {
 	// Name identifies the tensor; Valid reports false when it is empty.
 	Name   string
 	// Offset locates the tensor's data; presumably relative to the start of
 	// the tensor data section — TODO confirm against the reader.
 	Offset uint64
 	// Shape lists the dimensions; NumValues is their product.
 	Shape  []uint64
 	// Type is the element encoding, which determines the bytes per value.
 	Type   TensorType
 }
 // Valid reports whether the tensor has a name and a positive byte size.
 func (ti TensorInfo) Valid() bool {
 	if ti.Name == "" {
 		return false
 	}
 	return ti.NumBytes() > 0
 }
 // NumValues returns the number of elements in the tensor: the product of
 // all dimensions in Shape, or 1 when Shape is empty.
 func (ti TensorInfo) NumValues() int64 {
 	total := int64(1)
 	for i := range ti.Shape {
 		total *= int64(ti.Shape[i])
 	}
 	return total
 }
 // NumBytes returns the tensor's size in bytes: the element count multiplied
 // by the (possibly fractional) bytes per element of its type, truncated to
 // an integer.
 func (ti TensorInfo) NumBytes() int64 {
 	count := float64(ti.NumValues())
 	perValue := ti.Type.NumBytes()
 	return int64(count * perValue)
 }
 // LogValue renders the tensor info as a structured slog group containing its
 // name, offset, shape, element count, byte size and type.
 func (ti TensorInfo) LogValue() slog.Value {
 	attrs := []slog.Attr{
 		slog.String("name", ti.Name),
 		slog.Int64("offset", int64(ti.Offset)),
 		slog.Any("shape", ti.Shape),
 		slog.Int64("num_values", ti.NumValues()),
 		slog.Int64("num_bytes", ti.NumBytes()),
 		slog.Any("type", ti.Type),
 	}
 	return slog.GroupValue(attrs...)
 }
 // TensorType identifies the element encoding of a tensor.
 type TensorType uint32
 // The numeric values are assigned by iota starting at 0, so the order of
 // this list is significant: inserting, removing or reordering an entry
 // renumbers every entry after it. Unexported entries keep the numbering
 // contiguous.
 const (
 	TensorTypeF32 TensorType = iota
 	TensorTypeF16
 	TensorTypeQ4_0
 	TensorTypeQ4_1
 	// unexported // unused in gguf
 	tensorTypeQ4_2
 	tensorTypeQ4_3
 	TensorTypeQ5_0
 	TensorTypeQ5_1
 	TensorTypeQ8_0
 	TensorTypeQ8_1
 	TensorTypeQ2_K
 	TensorTypeQ3_K
 	TensorTypeQ4_K
 	TensorTypeQ5_K
 	TensorTypeQ6_K
 	TensorTypeQ8_K
 	// unexported // unquantizable by ollama
 	tensorTypeIQ2_XXS
 	tensorTypeIQ2_XS
 	tensorTypeIQ3_XXS
 	tensorTypeIQ1_S
 	tensorTypeIQ4_NL
 	tensorTypeIQ3_S
 	tensorTypeIQ2_S
 	tensorTypeIQ4_XS
 	TensorTypeI8
 	TensorTypeI16
 	TensorTypeI32
 	TensorTypeI64
 	TensorTypeF64
 	// unexported // unquantizable by ollama
 	tensorTypeIQ1_M
 	TensorTypeBF16
 	// unexported // unused in gguf
 	tensorTypeQ4_0_4_4
 	tensorTypeQ4_0_4_8
 	tensorTypeQ4_0_8_8
 	// unexported // unquantizable by ollama
 	tensorTypeTQ1_0
 	tensorTypeTQ2_0
 	// unexported // unused in gguf
 	tensorTypeIQ4_NL_4_4
 	tensorTypeIQ4_NL_4_8
 	tensorTypeIQ4_NL_8_8
 )
 // NumBytes returns the average number of bytes per element: the size of one
 // block divided by the number of elements in a block. The result may be
 // fractional.
 func (tt TensorType) NumBytes() float64 {
 	bytesPerBlock := float64(tt.typeSize())
 	valuesPerBlock := float64(tt.blockSize())
 	return bytesPerBlock / valuesPerBlock
 }
 // typeSize returns the size in bytes of one block of the type. Types without
 // an entry here report 0.
 func (tt TensorType) typeSize() int64 {
 	// bs is the number of elements per block for this type.
 	bs := tt.blockSize()
 	switch tt {
 	case TensorTypeI8:
 		return 1
 	case TensorTypeF16, TensorTypeI16, TensorTypeBF16:
 		return 2
 	case TensorTypeF32, TensorTypeI32:
 		return 4
 	case TensorTypeI64, TensorTypeF64:
 		return 8
 	case TensorTypeQ4_0, tensorTypeIQ4_NL:
 		return 2 + bs/2
 	case TensorTypeQ4_1:
 		return 2 + 2 + bs/2
 	case TensorTypeQ5_0:
 		return 2 + 4 + bs/2
 	case TensorTypeQ5_1:
 		return 2 + 2 + 4 + bs/2
 	case TensorTypeQ8_0:
 		return 2 + bs
 	case TensorTypeQ8_1:
 		return 2 + 2 + bs
 	case TensorTypeQ2_K:
 		return bs/16 + bs/4 + 2 + 2
 	case TensorTypeQ3_K:
 		return bs/8 + bs/4 + 12 + 2
 	case TensorTypeQ4_K:
 		return 2 + 2 + 12 + bs/2
 	case TensorTypeQ5_K:
 		return 2 + 2 + 12 + bs/8 + bs/2
 	case TensorTypeQ6_K:
 		return bs/2 + bs/4 + bs/16 + 2
 	case TensorTypeQ8_K:
 		return 4 + bs + 2*bs/16
 	case tensorTypeIQ2_XXS:
 		return 2 + 2*bs/8
 	case tensorTypeIQ2_XS:
 		return 2 + 2*bs/8 + bs/32
 	case tensorTypeIQ3_XXS:
 		return 2 + bs/4 + bs/8
 	case tensorTypeIQ1_S:
 		return 2 + bs/8 + bs/16
 	case tensorTypeIQ3_S:
 		return 2 + bs/4 + bs/8 + bs/32 + 4
 	case tensorTypeIQ2_S:
 		return 2 + bs/4 + bs/16
 	case tensorTypeIQ4_XS:
 		return 2 + 2 + bs/2 + bs/64
 	case tensorTypeIQ1_M:
 		return bs/8 + bs/16 + bs/32
 	}
 	return 0
 }
 // blockSize returns the number of elements stored in one block of the type:
 // 1 for the plain float and integer types, 32 for the listed 32-element
 // quantizations, and 256 for everything else.
 func (tt TensorType) blockSize() int64 {
 	switch tt {
 	case TensorTypeQ4_0, TensorTypeQ4_1,
 		TensorTypeQ5_0, TensorTypeQ5_1,
 		TensorTypeQ8_0, TensorTypeQ8_1,
 		tensorTypeIQ4_NL:
 		return 32
 	case TensorTypeF32, TensorTypeF16, TensorTypeBF16, TensorTypeF64,
 		TensorTypeI8, TensorTypeI16, TensorTypeI32, TensorTypeI64:
 		return 1
 	}
 	return 256
 }
 // String returns the lower-case name of the tensor type, or "unknown" for a
 // value outside the declared constants.
 func (tt TensorType) String() string {
 	// names is indexed by the numeric value of each declared constant.
 	names := [...]string{
 		TensorTypeF32:        "f32",
 		TensorTypeF16:        "f16",
 		TensorTypeQ4_0:       "q4_0",
 		TensorTypeQ4_1:       "q4_1",
 		tensorTypeQ4_2:       "q4_2",
 		tensorTypeQ4_3:       "q4_3",
 		TensorTypeQ5_0:       "q5_0",
 		TensorTypeQ5_1:       "q5_1",
 		TensorTypeQ8_0:       "q8_0",
 		TensorTypeQ8_1:       "q8_1",
 		TensorTypeQ2_K:       "q2_k",
 		TensorTypeQ3_K:       "q3_k",
 		TensorTypeQ4_K:       "q4_k",
 		TensorTypeQ5_K:       "q5_k",
 		TensorTypeQ6_K:       "q6_k",
 		TensorTypeQ8_K:       "q8_k",
 		tensorTypeIQ2_XXS:    "iq2_xxs",
 		tensorTypeIQ2_XS:     "iq2_xs",
 		tensorTypeIQ3_XXS:    "iq3_xxs",
 		tensorTypeIQ1_S:      "iq1_s",
 		tensorTypeIQ4_NL:     "iq4_nl",
 		tensorTypeIQ3_S:      "iq3_s",
 		tensorTypeIQ2_S:      "iq2_s",
 		tensorTypeIQ4_XS:     "iq4_xs",
 		TensorTypeI8:         "i8",
 		TensorTypeI16:        "i16",
 		TensorTypeI32:        "i32",
 		TensorTypeI64:        "i64",
 		TensorTypeF64:        "f64",
 		tensorTypeIQ1_M:      "iq1_m",
 		TensorTypeBF16:       "bf16",
 		tensorTypeQ4_0_4_4:   "q4_0_4_4",
 		tensorTypeQ4_0_4_8:   "q4_0_4_8",
 		tensorTypeQ4_0_8_8:   "q4_0_8_8",
 		tensorTypeTQ1_0:      "tq1_0",
 		tensorTypeTQ2_0:      "tq2_0",
 		tensorTypeIQ4_NL_4_4: "iq4_nl_4_4",
 		tensorTypeIQ4_NL_4_8: "iq4_nl_4_8",
 		tensorTypeIQ4_NL_8_8: "iq4_nl_8_8",
 	}
 	if tt >= TensorType(len(names)) {
 		return "unknown"
 	}
 	return names[tt]
 }
 // LogValue renders the tensor type as a structured slog group containing its
 // numeric value, upper-cased name, block byte size, block element count and
 // bytes per element.
 func (tt TensorType) LogValue() slog.Value {
 	attrs := []slog.Attr{
 		slog.Uint64("value", uint64(tt)),
 		slog.String("name", strings.ToUpper(tt.String())),
 		slog.Int64("size", tt.typeSize()),
 		slog.Int64("block_size", tt.blockSize()),
 		slog.Float64("num_bytes", tt.NumBytes()),
 	}
 	return slog.GroupValue(attrs...)
 }
--- a/go.mod
+++ b/go.mod
@@ -19,7 +19,7 @@ require (
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
-	github.com/google/go-cmp v0.6.0
+	github.com/google/go-cmp v0.7.0
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
--- a/go.sum
+++ b/go.sum
@@ -112,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
 github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
-github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
--- a/integration/model_arch_test.go
+++ b/integration/model_arch_test.go
@@ -45,6 +45,8 @@ var (
 		"qwen2.5-coder:latest",
 		"qwen:latest",
 		"solar-pro:latest",
 		"codellama:latest",
 		"nous-hermes:latest",
 	}
 )
--- a/llama/patches/0018-temporary-prevent-rocm-cuda-mixed-loading.patch
+++ b/llama/patches/0018-temporary-prevent-rocm-cuda-mixed-loading.patch
@@ -0,0 +1,32 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Daniel Hiltgen <daniel@ollama.com>
 Date: Sun, 22 Jun 2025 09:22:05 -0700
 Subject: [PATCH] temporary prevent rocm+cuda mixed loading
 ---
 ggml/src/ggml-backend-reg.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)
 diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
 index 4e67d243..8f49f084 100644
 --- a/ggml/src/ggml-backend-reg.cpp
 +++ b/ggml/src/ggml-backend-reg.cpp
@@ -573,8 +573,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("blas", silent, dir_path);
     ggml_backend_load_best("cann", silent, dir_path);
 -    ggml_backend_load_best("cuda", silent, dir_path);
 -    ggml_backend_load_best("hip", silent, dir_path);
 +
 +    // Avoid mixed hip+cuda configurations
 +    const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
 +    const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES"); 
 +    if (!hip_devices && !rocr_devices) {
 +        ggml_backend_load_best("cuda", silent, dir_path);
 +    } else {
 +        ggml_backend_load_best("hip", silent, dir_path);
 +    }
 +    
     ggml_backend_load_best("kompute", silent, dir_path);
     ggml_backend_load_best("metal", silent, dir_path);
     ggml_backend_load_best("rpc", silent, dir_path);
--- a/llm/server.go
+++ b/llm/server.go
@@ -139,6 +139,13 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		gpus = discover.GetCPUInfo()
 	}
 	// Verify the requested context size is <= the model training size
 	trainCtx := f.KV().ContextLength()
 	if opts.NumCtx/numParallel > int(trainCtx) && trainCtx > 0 {
 		slog.Warn("requested context size too large for model", "num_ctx", opts.NumCtx, "num_parallel", numParallel, "n_ctx_train", trainCtx)
 		opts.NumCtx = int(trainCtx) * numParallel
 	}
 	estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
 	if len(gpus) > 1 || gpus[0].Library != "cpu" {
 		switch {
@@ -311,7 +318,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		params = append(params, "--mmproj", projectors[0])
 	}
-	// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
+	// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
 	// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
 	// without any LD_LIBRARY_PATH flags
 	for {
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -602,7 +602,9 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
 }
 func (c *Context) Compute(tensors ...ml.Tensor) {
-	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
+	if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
 		panic(fmt.Errorf("error computing ggml graph: %v", status))
 	}
 	C.ggml_backend_sched_reset(c.b.sched)
 	needSync := true
--- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
@@ -573,8 +573,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
    ggml_backend_load_best("blas", silent, dir_path);
    ggml_backend_load_best("cann", silent, dir_path);
    // Avoid mixed hip+cuda configurations
    const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
    const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES"); 
    if (!hip_devices && !rocr_devices) {
        ggml_backend_load_best("cuda", silent, dir_path);
    } else {
        ggml_backend_load_best("hip", silent, dir_path);
    }
    ggml_backend_load_best("kompute", silent, dir_path);
    ggml_backend_load_best("metal", silent, dir_path);
    ggml_backend_load_best("rpc", silent, dir_path);
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -27,7 +27,6 @@ function checkEnv() {
        $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
    }
    # Locate CUDA versions
    # Note: this assumes every version found will be built
    $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
    if ($cudaList.length -eq 0) {
        $d=(get-command -ea 'silentlycontinue' nvcc).path
@@ -94,19 +93,6 @@ function buildOllama() {
        $hashEnv = @{}
        Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
        if ("$script:CUDA_DIRS".Contains("v11")) {
            $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }}
            $env:CUDAToolkit_ROOT=$hashEnv[$v11]
            write-host "Building CUDA v11 backend libraries"
            # Note: cuda v11 requires msvc 2019 so force the older generator
            # to avoid 2022 (or newer) from being used as the default
            & cmake --fresh --preset "CUDA 11" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
            & cmake --build --preset "CUDA 11"  --config Release --parallel $script:JOBS
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
            & cmake --install build --component "CUDA" --strip
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
        }
        if ("$script:CUDA_DIRS".Contains("v12")) {
            $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }}
            $env:CUDAToolkit_ROOT=$hashEnv[$v12]
--- a/scripts/env.sh
+++ b/scripts/env.sh
@@ -10,9 +10,7 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
    --build-arg=GOFLAGS \
    --build-arg=OLLAMA_CUSTOM_CPU_DEFS \
    --build-arg=OLLAMA_SKIP_CUDA_GENERATE \
    --build-arg=OLLAMA_SKIP_CUDA_11_GENERATE \
    --build-arg=OLLAMA_SKIP_CUDA_12_GENERATE \
    --build-arg=CUDA_V11_ARCHITECTURES \
    --build-arg=CUDA_V12_ARCHITECTURES \
    --build-arg=OLLAMA_SKIP_ROCM_GENERATE \
    --build-arg=OLLAMA_FAST_BUILD \
--- a/server/images.go
+++ b/server/images.go
@@ -23,7 +23,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/fs/gguf"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/thinking"
@@ -73,23 +73,19 @@ func (m *Model) Capabilities() []model.Capability {
 	capabilities := []model.Capability{}
 	// Check for completion capability
-	r, err := os.Open(m.ModelPath)
+	f, err := gguf.Open(m.ModelPath)
 	if err == nil {
-		defer r.Close()
+		defer f.Close()
-		f, err := ggml.Decode(r, 1024)
+		if f.KeyValue("pooling_type").Valid() {
 		if err == nil {
 			if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
 			capabilities = append(capabilities, model.CapabilityEmbedding)
 		} else {
 			// If no embedding is specified, we assume the model supports completion
 			capabilities = append(capabilities, model.CapabilityCompletion)
 		}
-			if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok {
+		if f.KeyValue("vision.block_count").Valid() {
 			capabilities = append(capabilities, model.CapabilityVision)
 		}
 		} else {
 			slog.Error("couldn't decode ggml", "error", err)
 		}
 	} else {
 		slog.Error("couldn't open model file", "error", err)
 	}
--- a/server/images_test.go
+++ b/server/images_test.go
@@ -1,123 +1,42 @@
 package server
 import (
 	"bytes"
 	"encoding/binary"
 	"errors"
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/model"
 )
 // Constants for GGUF magic bytes and version
 var (
 	ggufMagic = []byte{0x47, 0x47, 0x55, 0x46} // "GGUF"
 	ggufVer   = uint32(3)                      // Version 3
 )
 // createMockGGUFData builds an in-memory GGUF image with no tensors and a
 // small metadata section: the "general.architecture" string entry, an
 // optional "<architecture>.vision.block_count" uint32 entry when vision is
 // set, and a "<architecture>.pooling_type" uint32 entry when architecture is
 // "bert". All integers are written little-endian.
 func createMockGGUFData(architecture string, vision bool) []byte {
 	var out bytes.Buffer
 	// le writes a fixed-size value in little-endian order.
 	le := func(v any) {
 		binary.Write(&out, binary.LittleEndian, v)
 	}
 	// str writes a uint64 length prefix followed by the raw bytes of s.
 	str := func(s string) {
 		le(uint64(len(s)))
 		out.WriteString(s)
 	}
 	// u32kv writes a metadata entry whose value is a uint32 (type tag 4).
 	u32kv := func(key string, val uint32) {
 		str(key)
 		le(uint32(4))
 		le(val)
 	}
 	isEmbedding := architecture == "bert"
 	// Count the metadata entries up front: architecture is always present.
 	entries := uint64(1)
 	if vision {
 		entries++
 	}
 	if isEmbedding {
 		entries++
 	}
 	// Header: magic, version, tensor count (0), metadata entry count.
 	out.Write(ggufMagic)
 	le(ggufVer)
 	le(uint64(0))
 	le(entries)
 	// Architecture entry: key, string type tag (8), then the string value.
 	str("general.architecture")
 	le(uint32(8))
 	str(architecture)
 	if vision {
 		u32kv(architecture+".vision.block_count", 1)
 	}
 	if isEmbedding {
 		u32kv(architecture+".pooling_type", 1)
 	}
 	return out.Bytes()
 }
 func TestModelCapabilities(t *testing.T) {
-	// Create a temporary directory for test files
+	// Create completion model (llama architecture without vision)
-	tempDir := t.TempDir()
+	completionModelPath, _ := createBinFile(t, ggml.KV{
 		"general.architecture": "llama",
 	}, []*ggml.Tensor{})
-	// Create different types of mock model files
+	// Create vision model (llama architecture with vision block count)
-	completionModelPath := filepath.Join(tempDir, "model.bin")
+	visionModelPath, _ := createBinFile(t, ggml.KV{
-	visionModelPath := filepath.Join(tempDir, "vision_model.bin")
+		"general.architecture":     "llama",
-	embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin")
+		"llama.vision.block_count": uint32(1),
-	// Create a simple model file for tests that don't depend on GGUF content
+	}, []*ggml.Tensor{})
 	simpleModelPath := filepath.Join(tempDir, "simple_model.bin")
-	if err := errors.Join(
+	// Create embedding model (bert architecture with pooling type)
-		os.WriteFile(completionModelPath, createMockGGUFData("llama", false), 0o644),
+	embeddingModelPath, _ := createBinFile(t, ggml.KV{
-		os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644),
+		"general.architecture": "bert",
-		os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644),
+		"bert.pooling_type":    uint32(1),
-		os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644),
+	}, []*ggml.Tensor{})
 	); err != nil {
 		t.Fatalf("Failed to create model files: %v", err)
 	}
 	toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
 	}
 	chatTemplate, err := template.Parse("{{ .prompt }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
 	}
 	toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
@@ -145,21 +64,13 @@ func TestModelCapabilities(t *testing.T) {
 			},
 			expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools, model.CapabilityInsert},
 		},
 		{
 			name: "model with tools and insert capability",
 			model: Model{
 				ModelPath: simpleModelPath,
 				Template:  toolsInsertTemplate,
 			},
 			expectedCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert},
 		},
 		{
 			name: "model with tools capability",
 			model: Model{
-				ModelPath: simpleModelPath,
+				ModelPath: completionModelPath,
 				Template:  toolsTemplate,
 			},
-			expectedCaps: []model.Capability{model.CapabilityTools},
+			expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools},
 		},
 		{
 			name: "model with vision capability",
@@ -224,29 +135,33 @@ func TestModelCapabilities(t *testing.T) {
 }
 func TestModelCheckCapabilities(t *testing.T) {
-	// Create a temporary directory for test files
+	// Create simple model file for tests that don't depend on GGUF content
-	tempDir := t.TempDir()
+	completionModelPath, _ := createBinFile(t, ggml.KV{
 		"general.architecture": "llama",
 	}, []*ggml.Tensor{})
-	visionModelPath := filepath.Join(tempDir, "vision_model.bin")
+	// Create vision model (llama architecture with vision block count)
-	simpleModelPath := filepath.Join(tempDir, "model.bin")
+	visionModelPath, _ := createBinFile(t, ggml.KV{
-	embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin")
+		"general.architecture":     "llama",
 		"llama.vision.block_count": uint32(1),
 	}, []*ggml.Tensor{})
-	if err := errors.Join(
+	// Create embedding model (bert architecture with pooling type)
-		os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644),
+	embeddingModelPath, _ := createBinFile(t, ggml.KV{
-		os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644),
+		"general.architecture": "bert",
-		os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644),
+		"bert.pooling_type":    uint32(1),
-	); err != nil {
+	}, []*ggml.Tensor{})
 		t.Fatalf("Failed to create model files: %v", err)
 	}
 	toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
 	}
 	chatTemplate, err := template.Parse("{{ .prompt }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
 	}
 	toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
@@ -261,7 +176,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "completion model without tools capability",
 			model: Model{
-				ModelPath: simpleModelPath,
+				ModelPath: completionModelPath,
 				Template:  chatTemplate,
 			},
 			checkCaps:      []model.Capability{model.CapabilityTools},
@@ -270,7 +185,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "model with all needed capabilities",
 			model: Model{
-				ModelPath: simpleModelPath,
+				ModelPath: completionModelPath,
 				Template:  toolsInsertTemplate,
 			},
 			checkCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert},
@@ -278,7 +193,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "model missing insert capability",
 			model: Model{
-				ModelPath: simpleModelPath,
+				ModelPath: completionModelPath,
 				Template:  toolsTemplate,
 			},
 			checkCaps:      []model.Capability{model.CapabilityInsert},
@@ -287,7 +202,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "model missing vision capability",
 			model: Model{
-				ModelPath: simpleModelPath,
+				ModelPath: completionModelPath,
 				Template:  toolsTemplate,
 			},
 			checkCaps:      []model.Capability{model.CapabilityVision},
@@ -312,7 +227,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "unknown capability",
 			model: Model{
-				ModelPath: simpleModelPath,
+				ModelPath: completionModelPath,
 				Template:  chatTemplate,
 			},
 			checkCaps:      []model.Capability{"unknown"},
--- a/server/quantization_test.go
+++ b/server/quantization_test.go
@@ -257,16 +257,8 @@ func TestQuantizeModel(t *testing.T) {
 	for _, tt := range cases {
 		t.Run(tt.name, func(t *testing.T) {
-			f, err := os.CreateTemp(t.TempDir(), tt.name)
+			p, _ := createBinFile(t, tt.kv, tt.tensors)
-			if err != nil {
+			fp, err := os.Open(p)
 				t.Fatal(err.Error())
 			}
 			defer f.Close()
 			err = fsggml.WriteGGUF(f, tt.kv, tt.tensors)
 			if err != nil {
 				t.Fatalf("failed to create initial model: %s", err)
 			}
 			fp, err := os.Open(f.Name())
 			if err != nil {
 				t.Fatal(err.Error())
 			}
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -112,11 +112,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	b.ctx, b.ctxDone = context.WithCancel(ctx)
 	t.Helper()
-	f, err := os.CreateTemp(t.TempDir(), modelName)
+	p, _ := createBinFile(t, ggml.KV{
 	require.NoError(t, err)
 	defer f.Close()
 	require.NoError(t, ggml.WriteGGUF(f, ggml.KV{
 		"general.architecture":          "llama",
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
@@ -129,14 +125,14 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	}, []*ggml.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
-	}))
+	})
 	require.NoError(t, err)
 	fname := f.Name()
 	model := &Model{Name: modelName, ModelPath: fname}
 	b.f, err = llm.LoadModel(model.ModelPath, 0)
 	require.NoError(t, err)
 	model := &Model{Name: modelName, ModelPath: p}
 	f, err := llm.LoadModel(model.ModelPath, 0)
 	if err != nil {
 		t.Fatal(err)
 	}
 	b.f = f
 	if duration == nil {
 		duration = &api.Duration{Duration: 5 * time.Millisecond}
 	}
Author	SHA1	Message	Date
Daniel Hiltgen	10a8e04a8d	avoid context overflow (#11175) For smaller context models, make sure we do not exceed the training size.	2025-06-23 15:52:50 -07:00
Daniel Hiltgen	1c6669e64c	Re-remove cuda v11 (#10694) * Re-remove cuda v11 Revert the revert - drop v11 support requiring drivers newer than Feb 23 This reverts commit `c6bcdc4223`. * Simplify layout With only one version of the GPU libraries, we can simplify things down somewhat. (Jetsons still require special handling) * distinct sbsa variant for linux arm64 This avoids accidentally trying to load the sbsa cuda libraries on a jetson system which results in crashes. * temporary prevent rocm+cuda mixed loading	2025-06-23 14:07:00 -07:00
AJ	2bb69b40c7	readme: add ai-hub to community integrations (#11169)	2025-06-23 09:21:12 -07:00
Daniel Hiltgen	65bff664cb	build speedups (#11142) Enable parallel building of the GPU architectures.	2025-06-20 12:32:51 -07:00
Michael Yang	c088ac0e79	convert: utility for merging tensors (#11069)	2025-06-20 11:12:01 -07:00
Michael Yang	0a066cfd91	Reapply "feat: incremental gguf parser (#10822)" (#11114) (#11119) * Reapply "feat: incremental gguf parser (#10822)" (#11114) This reverts commit `a6e64fbdf2`. * fix older ggufs	2025-06-20 11:11:40 -07:00
Jesse Gross	87b7af6cee	ggml: Check return status for computation. We don't check the return status after computing the graph, which can silently lead to bad outputs if we try to keep going and future computation succeeds. This appears to happen in certain cases on Apple M2 devices. Fixes #11070	2025-06-19 17:12:49 -07:00
Daniel Hiltgen	f2527b08fb	int: add coverage for older models (#11137) Verified these fail on 0.9.1 and pass on HEAD.	2025-06-19 12:10:19 -07:00