Add experimental MLX backend and engine with imagegen support (#13648 )

* WIP - MLX backend with gemma3
* MLX: add cmake and go tag build toggles
  To build the new MLX backend code:
    cmake --preset MLX
    cmake --build --preset MLX --parallel
    cmake --install build --component MLX
    go build -tags mlx .
  Note: the main.go entrypoint for the MLX engine will change in a follow up commit.
* add experimental image generation runtime
* add experimental image generation runtime
* MLX: wire up cuda build for linux
* MLX: get dependencies correct and dedup
  This is still too large for a unified github artifact, but is now "correct" for the mlx_cuda_v13 directory.
* fix relative link bug in dedup
* Add darwin build and readme
* add go build tag for mlx dependent code and wire up build_darwin.sh
* lint cleanup
* macos: build mlx for x86
  This will be CPU only.
* cuda build instructions and fix drift from mlx bump
* stale comment
* Delete agent helper doc
* Clean up readme.md
* Revise README for tokenizer clarity and details
  Updated README to clarify tokenizer functionality and removed correctness section.
---------
Co-authored-by: jmorganca <jmorganca@gmail.com>
Linux: switch to zstd compression (#13651 )
2026-04-25 18:25:42 +02:00 · 2026-01-08 16:18:59 -08:00 · 2026-01-08 15:47:32 -08:00 · 2026-01-08 15:40:07 -08:00 · 2026-01-07 15:34:08 -08:00 · 2026-01-07 01:27:15 -08:00
682 changed files with 95745 additions and 33009 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -15,8 +15,12 @@ ml/backend/**/*.cu linguist-vendored
 ml/backend/**/*.cuh linguist-vendored
 ml/backend/**/*.m linguist-vendored
 ml/backend/**/*.metal linguist-vendored
+ml/backend/**/*.comp linguist-vendored
+ml/backend/**/*.glsl linguist-vendored
 ml/backend/**/CMakeLists.txt linguist-vendored

+app/webview linguist-vendored
+
 llama/build-info.cpp linguist-generated
 ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s linguist-generated

--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -16,13 +16,15 @@ jobs:
    outputs:
      GOFLAGS: ${{ steps.goflags.outputs.GOFLAGS }}
      VERSION: ${{ steps.goflags.outputs.VERSION }}
+      vendorsha: ${{ steps.goflags.outputs.vendorsha }} # written by the "goflags" step below; this workflow has no "changes" step
    steps:
      - uses: actions/checkout@v4
      - name: Set environment
        id: goflags
        run: |
-          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT
-          echo VERSION="${GITHUB_REF_NAME#v}" >>$GITHUB_OUTPUT
+          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" | tee -a $GITHUB_OUTPUT
+          echo VERSION="${GITHUB_REF_NAME#v}" | tee -a $GITHUB_OUTPUT
+          echo vendorsha=$(make -f Makefile.sync print-base) | tee -a $GITHUB_OUTPUT

  darwin-build:
    runs-on: macos-14-xlarge
@@ -53,6 +55,9 @@ jobs:
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
+          cache-dependency-path: |
+            go.sum
+            Makefile.sync
      - run: |
          ./scripts/build_darwin.sh
      - name: Log build results
@@ -63,6 +68,7 @@ jobs:
          name: bundles-darwin
          path: |
            dist/*.tgz
+            dist/*.tar.zst
            dist/*.zip
            dist/*.dmg

@@ -185,7 +191,7 @@ jobs:
      - uses: actions/cache@v4
        with:
          path: ${{ github.workspace }}\.ccache
-          key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}
+          key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}-${{ needs.setup-environment.outputs.vendorsha }}
      - name: Build target "${{ matrix.preset }}"
        run: |
          Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
@@ -249,6 +255,9 @@ jobs:
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
+          cache-dependency-path: |
+            go.sum
+            Makefile.sync
      - name: Verify gcc is actually clang
        run: |
          $ErrorActionPreference='Continue'
@@ -302,6 +311,9 @@ jobs:
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
+          cache-dependency-path: |
+            go.sum
+            Makefile.sync
      - uses: actions/download-artifact@v4
        with:
          pattern: depends-windows*
@@ -366,6 +378,7 @@ jobs:
              bin/ollama)                echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/*.so*)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_v*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/vulkan*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_jetpack5)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
              lib/ollama/rocm)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
@@ -380,13 +393,13 @@ jobs:
          done
      - run: |
          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
-            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
+            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | zstd --ultra -22 -T0 >$(basename ${ARCHIVE//.*/}.tar.zst);
          done
      - uses: actions/upload-artifact@v4
        with:
          name: bundles-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
          path: |
-            *.tgz
+            *.tar.zst

  # Build each Docker variant (OS, arch, and flavor) separately. Using QEMU is unreliable and slower.
  docker-build-push:
@@ -519,7 +532,7 @@ jobs:
      - name: Upload release artifacts
        run: |
          pids=()
-          for payload in dist/*.txt dist/*.zip dist/*.tgz dist/*.exe dist/*.dmg ; do
+          for payload in dist/*.txt dist/*.zip dist/*.tgz dist/*.tar.zst dist/*.exe dist/*.dmg ; do
            echo "Uploading $payload"
            gh release upload ${GITHUB_REF_NAME} $payload --clobber &
            pids[$!]=$!
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -22,6 +22,7 @@ jobs:
    runs-on: ubuntu-latest
    outputs:
      changed: ${{ steps.changes.outputs.changed }}
+      vendorsha: ${{ steps.changes.outputs.vendorsha }}
    steps:
      - uses: actions/checkout@v4
        with:
@@ -37,6 +38,7 @@ jobs:
          }

          echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*') | tee -a $GITHUB_OUTPUT
+          echo vendorsha=$(make -f Makefile.sync print-base) | tee -a $GITHUB_OUTPUT

  linux:
    needs: [changes]
@@ -83,7 +85,7 @@ jobs:
      - uses: actions/cache@v4
        with:
          path: /github/home/.cache/ccache
-          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
+          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }}
      - run: |
          cmake --preset ${{ matrix.preset }} ${{ matrix.flags }}
          cmake --build --preset ${{ matrix.preset }} --parallel
@@ -178,7 +180,7 @@ jobs:
      - uses: actions/cache@v4
        with:
          path: ${{ github.workspace }}\.ccache
-          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
+          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }}
      - run: |
          Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
@@ -206,6 +208,9 @@ jobs:
      - uses: actions/setup-go@v5
        with:
          go-version-file: 'go.mod'
+          cache-dependency-path: |
+            go.sum
+            Makefile.sync
      - uses: actions/setup-node@v4
        with:
          node-version: '20'
@@ -226,12 +231,9 @@ jobs:
        if: always()
        run: go test -count=1 -benchtime=1x ./...

-      # TODO(bmizerany): replace this heavy tool with just the
-      # tools/checks/binaries we want and then make them all run in parallel
-      # across jobs, not on a single tiny vm on Github Actions.
-      - uses: golangci/golangci-lint-action@v6
+      - uses: golangci/golangci-lint-action@v9
        with:
-          args: --timeout 10m0s -v
+          only-new-issues: true

  patches:
    runs-on: ubuntu-latest
@@ -240,4 +242,4 @@ jobs:
      - name: Verify patches apply cleanly and do not change files
        run: |
          make -f Makefile.sync clean checkout apply-patches sync
-          git diff --compact-summary --exit-code
+          git diff --compact-summary --exit-code
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -1,5 +1,4 @@
-run:
-  timeout: 5m
+version: "2"
 linters:
  enable:
    - asasalint
@@ -7,35 +6,46 @@ linters:
    - bodyclose
    - containedctx
    - gocheckcompilerdirectives
-    - gofmt
-    - gofumpt
-    - gosimple
-    - govet
-    - ineffassign
    - intrange
    - makezero
    - misspell
    - nilerr
    - nolintlint
    - nosprintfhostport
-    - staticcheck
    - unconvert
    - usetesting
    - wastedassign
    - whitespace
  disable:
-    - usestdlibvars
    - errcheck
-linters-settings:
-  staticcheck:
-    checks:
-      - all
-      - -SA1019 # omit Deprecated check
+    - usestdlibvars
+  settings:
+    govet:
+      disable:
+        - unusedresult
+    staticcheck:
+      checks:
+        - all
+        - -QF* # disable quick fix suggestions
+        - -SA1019
+        - -ST1000 # package comment format
+        - -ST1003 # underscores in package names
+        - -ST1005 # error strings should not be capitalized
+        - -ST1012 # error var naming (ErrFoo)
+        - -ST1016 # receiver name consistency
+        - -ST1020 # comment on exported function format
+        - -ST1021 # comment on exported type format
+        - -ST1022 # comment on exported var format
+        - -ST1023 # omit type from declaration
 severity:
-  default-severity: error
+  default: error
  rules:
    - linters:
        - gofmt
        - goimports
        - intrange
      severity: info
+formatters:
+  enable:
+    - gofmt
+    - gofumpt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,22 @@ cmake_minimum_required(VERSION 3.21)

 project(Ollama C CXX)

+# Handle cross-compilation on macOS: when CMAKE_OSX_ARCHITECTURES is set to a
+# single architecture different from the host, override CMAKE_SYSTEM_PROCESSOR
+# to match. This is necessary because CMAKE_SYSTEM_PROCESSOR defaults to the
+# host architecture, but downstream projects (like MLX) use it to detect the
+# target architecture.
+if(CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES ";")
+    # Single architecture specified
+    if(CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+        message(STATUS "Cross-compiling for x86_64: overriding CMAKE_SYSTEM_PROCESSOR from ${CMAKE_SYSTEM_PROCESSOR} to x86_64")
+        set(CMAKE_SYSTEM_PROCESSOR "x86_64")
+    elseif(CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
+        message(STATUS "Cross-compiling for arm64: overriding CMAKE_SYSTEM_PROCESSOR from ${CMAKE_SYSTEM_PROCESSOR} to arm64")
+        set(CMAKE_SYSTEM_PROCESSOR "arm64")
+    endif()
+endif()
+
 include(CheckLanguage)
 include(GNUInstallDirs)

@@ -12,7 +28,7 @@ set(BUILD_SHARED_LIBS ON)

 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_CXX_EXTENSIONS ON) # Recent versions of MLX require gnu++17 extensions to compile properly

 set(GGML_BUILD ON)
 set(GGML_SHARED ON)
@@ -54,6 +70,13 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cp

 add_compile_definitions(NDEBUG GGML_VERSION=0x0 GGML_COMMIT=0x0)

+# Define GGML version variables for shared library SOVERSION
+# These are required by ggml/src/CMakeLists.txt for proper library versioning
+set(GGML_VERSION_MAJOR 0)
+set(GGML_VERSION_MINOR 0)
+set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
+
 set(GGML_CPU ON)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
 set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
@@ -140,14 +163,48 @@ if(CMAKE_HIP_COMPILER)
    endif()
 endif()

-find_package(Vulkan)
-if(Vulkan_FOUND)
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-vulkan)
-    install(TARGETS ggml-vulkan
-        RUNTIME_DEPENDENCIES
-            PRE_INCLUDE_REGEXES vulkan
-            PRE_EXCLUDE_REGEXES ".*"
-        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
-        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
-    )
+if(NOT APPLE)
+    find_package(Vulkan)
+    if(Vulkan_FOUND)
+        add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-vulkan)
+        install(TARGETS ggml-vulkan
+            RUNTIME_DEPENDENCIES
+                PRE_INCLUDE_REGEXES vulkan
+                PRE_EXCLUDE_REGEXES ".*"
+            RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
+            LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
+        )
+    endif()
 endif()
+
+option(MLX_ENGINE "Enable MLX backend" OFF)
+
+if(MLX_ENGINE)
+    message(STATUS "Setting up MLX (this takes a while...)")
+    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/x/ml/backend/mlx)
+
+    # Find CUDA toolkit if MLX is built with CUDA support
+    find_package(CUDAToolkit)
+
+    install(TARGETS mlx mlxc
+        RUNTIME_DEPENDENCIES
+            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
+            PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc cudnn nccl
+            PRE_EXCLUDE_REGEXES ".*"
+        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
+        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
+        FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
+    )
+
+    # Manually install cudart and cublas since they might not be picked up as direct dependencies
+    if(CUDAToolkit_FOUND)
+        file(GLOB CUDART_LIBS
+            "${CUDAToolkit_LIBRARY_DIR}/libcudart.so*"
+            "${CUDAToolkit_LIBRARY_DIR}/libcublas.so*")
+        if(CUDART_LIBS)
+            install(FILES ${CUDART_LIBS}
+                DESTINATION ${OLLAMA_INSTALL_DIR}
+                COMPONENT MLX)
+        endif()
+    endif()
+endif()
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -41,7 +41,7 @@
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual",
-        "CMAKE_CUDA_FLAGS": "-t 2",
+        "CMAKE_CUDA_FLAGS": "-t 4",
        "OLLAMA_RUNNER_DIR": "cuda_v13"
      }
    },
@@ -83,6 +83,28 @@
      "cacheVariables": {
        "OLLAMA_RUNNER_DIR": "vulkan"
      }
+    },
+    {
+      "name": "MLX",
+      "inherits": [ "Default" ],
+      "cacheVariables": {
+        "MLX_ENGINE": "ON",
+        "OLLAMA_RUNNER_DIR": "mlx"
+      }
+    },
+    {
+      "name": "MLX CUDA 12",
+      "inherits": [ "MLX", "CUDA 12" ],
+      "cacheVariables": {
+        "OLLAMA_RUNNER_DIR": "mlx_cuda_v12"
+      }
+    },
+    {
+      "name": "MLX CUDA 13",
+      "inherits": [ "MLX", "CUDA 13" ],
+      "cacheVariables": {
+        "OLLAMA_RUNNER_DIR": "mlx_cuda_v13"
+      }
    }
  ],
  "buildPresets": [
@@ -140,6 +162,21 @@
      "name": "Vulkan",
      "targets": [ "ggml-vulkan" ],
      "configurePreset": "Vulkan"
+    },
+    {
+      "name": "MLX",
+      "targets": [ "mlx", "mlxc" ],
+      "configurePreset": "MLX"
+    },
+    {
+      "name": "MLX CUDA 12",
+      "targets": [ "mlx", "mlxc" ],
+      "configurePreset": "MLX CUDA 12"
+    },
+    {
+      "name": "MLX CUDA 13",
+      "targets": [ "mlx", "mlxc" ],
+      "configurePreset": "MLX CUDA 13"
    }
  ]
 }
--- a/Dockerfile
+++ b/Dockerfile
@@ -39,14 +39,14 @@ ENV CC=clang CXX=clang++
 FROM base-${TARGETARCH} AS base
 ARG CMAKEVERSION
 RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
-COPY CMakeLists.txt CMakePresets.json .
-COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 ENV LDFLAGS=-s

 FROM base AS cpu
 RUN dnf install -y gcc-toolset-11-gcc gcc-toolset-11-gcc-c++
 ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
 ARG PARALLEL
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CPU' \
        && cmake --build --parallel ${PARALLEL} --preset 'CPU' \
@@ -57,6 +57,8 @@ ARG CUDA11VERSION=11.8
 RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
 ENV PATH=/usr/local/cuda-11/bin:$PATH
 ARG PARALLEL
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CUDA 11' \
        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 11' \
@@ -67,6 +69,8 @@ ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
 ENV PATH=/usr/local/cuda-12/bin:$PATH
 ARG PARALLEL
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CUDA 12' \
        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 12' \
@@ -78,6 +82,8 @@ ARG CUDA13VERSION=13.0
 RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-}
 ENV PATH=/usr/local/cuda-13/bin:$PATH
 ARG PARALLEL
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CUDA 13' \
        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 13' \
@@ -87,6 +93,8 @@ RUN --mount=type=cache,target=/root/.ccache \
 FROM base AS rocm-6
 ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
 ARG PARALLEL
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'ROCm 6' \
        && cmake --build --parallel ${PARALLEL} --preset 'ROCm 6' \
@@ -118,10 +126,44 @@ RUN --mount=type=cache,target=/root/.ccache \
        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}

 FROM base AS vulkan
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'Vulkan' \
        && cmake --build --parallel --preset 'Vulkan' \
-        && cmake --install build --component Vulkan --strip --parallel 8 
+        && cmake --install build --component Vulkan --strip --parallel 8
+
+FROM base AS mlx
+ARG CUDA13VERSION=13.0
+RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-} \
+    && dnf install -y openblas-devel lapack-devel \
+    && dnf install -y libcudnn9-cuda-13 libcudnn9-devel-cuda-13 \
+    && dnf install -y libnccl libnccl-devel
+ENV PATH=/usr/local/cuda-13/bin:$PATH
+ENV BLAS_INCLUDE_DIRS=/usr/include/openblas
+ENV LAPACK_INCLUDE_DIRS=/usr/include/openblas
+ENV CGO_LDFLAGS="-L/usr/local/cuda-13/lib64 -L/usr/local/cuda-13/targets/x86_64-linux/lib/stubs"
+ARG PARALLEL
+WORKDIR /go/src/github.com/ollama/ollama
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+COPY x/ml/backend/mlx x/ml/backend/mlx
+COPY go.mod go.sum .
+RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
+ENV PATH=/usr/local/go/bin:$PATH
+RUN go mod download
+RUN --mount=type=cache,target=/root/.ccache \
+    cmake --preset 'MLX CUDA 13' -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DLAPACK_INCLUDE_DIRS=/usr/include/openblas \
+        && cmake --build --parallel ${PARALLEL} --preset 'MLX CUDA 13' \
+        && cmake --install build --component MLX --strip --parallel ${PARALLEL}
+COPY . .
+ARG GOFLAGS="'-ldflags=-w -s'"
+ENV CGO_ENABLED=1
+ARG CGO_CFLAGS
+ARG CGO_CXXFLAGS
+# TODO wire up the actual MLX engine here instead of building the main binary...
+RUN mkdir -p dist/bin
+RUN go build -tags mlx -trimpath -buildmode=pie -o dist/bin/imagegen ./x/imagegen/cmd/engine


 FROM base AS build
@@ -143,6 +185,8 @@ FROM --platform=linux/amd64 scratch AS amd64
 COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
 COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
 COPY --from=vulkan  dist/lib/ollama  /lib/ollama/
+COPY --from=mlx     /go/src/github.com/ollama/ollama/dist/lib/ollama /lib/ollama/
+COPY --from=mlx     /go/src/github.com/ollama/ollama/dist/bin/ /bin/

 FROM --platform=linux/arm64 scratch AS arm64
 # COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
--- a/Makefile.sync
+++ b/Makefile.sync
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=3cfa9c3f125763305b4226bc032f1954f08990dc
+FETCH_HEAD=ec98e2002

 .PHONY: help
 help:
@@ -57,7 +57,7 @@ checkout: $(WORKDIR)
 $(WORKDIR):
 	git clone $(UPSTREAM) $(WORKDIR)

-.PHONE: format-patches
+.PHONY: format-patches
 format-patches: llama/patches
 	git -C $(WORKDIR) format-patch \
 		--no-signature \
@@ -66,7 +66,11 @@ format-patches: llama/patches
 		-o $(realpath $<) \
 		$(FETCH_HEAD)

-.PHONE: clean
+.PHONY: clean
 clean: checkout
 	@git -C $(WORKDIR) am --abort || true
 	$(RM) llama/patches/.*.patched
+
+.PHONY: print-base
+print-base:
+	@echo $(FETCH_HEAD)
--- a/README.md
+++ b/README.md
@@ -367,6 +367,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
 - [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
 - [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VS Code extension for multi-file/whole-repo coding
+- [Void](https://github.com/voideditor/void) (Open source AI code editor and Cursor alternative)
 - [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
 - [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
 - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
@@ -554,7 +555,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Parakeet](https://github.com/parakeet-nest/parakeet) is a GoLang library, made to simplify the development of small generative AI applications with Ollama.
 - [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples)
 - [Ollama for Swift](https://github.com/mattt/ollama-swift)
-- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
+- [Swollama for Swift](https://github.com/guitaripod/Swollama) with [DocC](https://guitaripod.github.io/Swollama/documentation/swollama)
 - [GoLamify](https://github.com/prasad89/golamify)
 - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
 - [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
--- a/api/client.go
+++ b/api/client.go
@@ -226,7 +226,14 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f

 		bts := scanner.Bytes()
 		if err := json.Unmarshal(bts, &errorResponse); err != nil {
-			return fmt.Errorf("unmarshal: %w", err)
+			if response.StatusCode >= http.StatusBadRequest {
+				return StatusError{
+					StatusCode:   response.StatusCode,
+					Status:       response.Status,
+					ErrorMessage: string(bts),
+				}
+			}
+			return errors.New(string(bts))
 		}

 		if response.StatusCode == http.StatusUnauthorized {
@@ -340,7 +347,7 @@ type CreateProgressFunc func(ProgressResponse) error
 // Create creates a model from a [Modelfile]. fn is a progress function that
 // behaves similarly to other methods (see [Client.Pull]).
 //
-// [Modelfile]: https://github.com/ollama/ollama/blob/main/docs/modelfile.md
+// [Modelfile]: https://github.com/ollama/ollama/blob/main/docs/modelfile.mdx
 func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgressFunc) error {
 	return c.stream(ctx, http.MethodPost, "/api/create", req, func(bts []byte) error {
 		var resp ProgressResponse
--- a/api/client_test.go
+++ b/api/client_test.go
@@ -55,6 +55,7 @@ func TestClientFromEnvironment(t *testing.T) {
 type testError struct {
 	message    string
 	statusCode int
+	raw        bool // if true, write message as-is instead of JSON encoding
 }

 func (e testError) Error() string {
@@ -111,6 +112,20 @@ func TestClientStream(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "plain text error response",
+			responses: []any{
+				"internal server error",
+			},
+			wantErr: "internal server error",
+		},
+		{
+			name: "HTML error page",
+			responses: []any{
+				"<html><body>404 Not Found</body></html>",
+			},
+			wantErr: "404 Not Found",
+		},
 	}

 	for _, tc := range testCases {
@@ -135,6 +150,12 @@ func TestClientStream(t *testing.T) {
 						return
 					}

+					if str, ok := resp.(string); ok {
+						fmt.Fprintln(w, str)
+						flusher.Flush()
+						continue
+					}
+
 					if err := json.NewEncoder(w).Encode(resp); err != nil {
 						t.Fatalf("failed to encode response: %v", err)
 					}
@@ -173,9 +194,10 @@ func TestClientStream(t *testing.T) {

 func TestClientDo(t *testing.T) {
 	testCases := []struct {
-		name     string
-		response any
-		wantErr  string
+		name           string
+		response       any
+		wantErr        string
+		wantStatusCode int
 	}{
 		{
 			name: "immediate error response",
@@ -183,7 +205,8 @@ func TestClientDo(t *testing.T) {
 				message:    "test error message",
 				statusCode: http.StatusBadRequest,
 			},
-			wantErr: "test error message",
+			wantErr:        "test error message",
+			wantStatusCode: http.StatusBadRequest,
 		},
 		{
 			name: "server error response",
@@ -191,7 +214,8 @@ func TestClientDo(t *testing.T) {
 				message:    "internal error",
 				statusCode: http.StatusInternalServerError,
 			},
-			wantErr: "internal error",
+			wantErr:        "internal error",
+			wantStatusCode: http.StatusInternalServerError,
 		},
 		{
 			name: "successful response",
@@ -203,6 +227,26 @@ func TestClientDo(t *testing.T) {
 				Success: true,
 			},
 		},
+		{
+			name: "plain text error response",
+			response: testError{
+				message:    "internal server error",
+				statusCode: http.StatusInternalServerError,
+				raw:        true,
+			},
+			wantErr:        "internal server error",
+			wantStatusCode: http.StatusInternalServerError,
+		},
+		{
+			name: "HTML error page",
+			response: testError{
+				message:    "<html><body>404 Not Found</body></html>",
+				statusCode: http.StatusNotFound,
+				raw:        true,
+			},
+			wantErr:        "<html><body>404 Not Found</body></html>",
+			wantStatusCode: http.StatusNotFound,
+		},
 	}

 	for _, tc := range testCases {
@@ -210,11 +254,16 @@ func TestClientDo(t *testing.T) {
 			ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 				if errResp, ok := tc.response.(testError); ok {
 					w.WriteHeader(errResp.statusCode)
-					err := json.NewEncoder(w).Encode(map[string]string{
-						"error": errResp.message,
-					})
-					if err != nil {
-						t.Fatal("failed to encode error response:", err)
+					if !errResp.raw {
+						err := json.NewEncoder(w).Encode(map[string]string{
+							"error": errResp.message,
+						})
+						if err != nil {
+							t.Fatal("failed to encode error response:", err)
+						}
+					} else {
+						// Write raw message (simulates non-JSON error responses)
+						fmt.Fprint(w, errResp.message)
 					}
 					return
 				}
@@ -241,6 +290,15 @@ func TestClientDo(t *testing.T) {
 				if err.Error() != tc.wantErr {
 					t.Errorf("error message mismatch: got %q, want %q", err.Error(), tc.wantErr)
 				}
+				if tc.wantStatusCode != 0 {
+					if statusErr, ok := err.(StatusError); ok {
+						if statusErr.StatusCode != tc.wantStatusCode {
+							t.Errorf("status code mismatch: got %d, want %d", statusErr.StatusCode, tc.wantStatusCode)
+						}
+					} else {
+						t.Errorf("expected StatusError, got %T", err)
+					}
+				}
 				return
 			}

--- a/api/examples/chat/main.go
+++ b/api/examples/chat/main.go
@@ -15,19 +15,19 @@ func main() {
 	}

 	messages := []api.Message{
-		api.Message{
+		{
 			Role:    "system",
 			Content: "Provide very brief, concise responses",
 		},
-		api.Message{
+		{
 			Role:    "user",
 			Content: "Name some unusual animals",
 		},
-		api.Message{
+		{
 			Role:    "assistant",
 			Content: "Monotreme, platypus, echidna",
 		},
-		api.Message{
+		{
 			Role:    "user",
 			Content: "which of these is the most dangerous?",
 		},
--- a/api/types.go
+++ b/api/types.go
@@ -3,6 +3,7 @@ package api
 import (
 	"encoding/json"
 	"fmt"
+	"iter"
 	"log/slog"
 	"math"
 	"os"
@@ -14,6 +15,7 @@ import (
 	"github.com/google/uuid"

 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/internal/orderedmap"
 	"github.com/ollama/ollama/types/model"
 )

@@ -227,13 +229,79 @@ type ToolCallFunction struct {
 	Arguments ToolCallFunctionArguments `json:"arguments"`
 }

-type ToolCallFunctionArguments map[string]any
+// ToolCallFunctionArguments holds tool call arguments in insertion order.
+type ToolCallFunctionArguments struct {
+	om *orderedmap.Map[string, any]
+}
+
+// NewToolCallFunctionArguments creates a new empty ToolCallFunctionArguments.
+func NewToolCallFunctionArguments() ToolCallFunctionArguments {
+	return ToolCallFunctionArguments{om: orderedmap.New[string, any]()}
+}
+
+// Get retrieves a value by key.
+func (t *ToolCallFunctionArguments) Get(key string) (any, bool) {
+	if t == nil || t.om == nil {
+		return nil, false
+	}
+	return t.om.Get(key)
+}
+
+// Set sets a key-value pair, preserving insertion order.
+func (t *ToolCallFunctionArguments) Set(key string, value any) {
+	if t == nil {
+		return
+	}
+	if t.om == nil {
+		t.om = orderedmap.New[string, any]()
+	}
+	t.om.Set(key, value)
+}
+
+// Len returns the number of arguments.
+func (t *ToolCallFunctionArguments) Len() int {
+	if t == nil || t.om == nil {
+		return 0
+	}
+	return t.om.Len()
+}
+
+// All returns an iterator over all key-value pairs in insertion order.
+func (t *ToolCallFunctionArguments) All() iter.Seq2[string, any] {
+	if t == nil || t.om == nil {
+		return func(yield func(string, any) bool) {}
+	}
+	return t.om.All()
+}
+
+// ToMap returns a regular map (order not preserved).
+func (t *ToolCallFunctionArguments) ToMap() map[string]any {
+	if t == nil || t.om == nil {
+		return nil
+	}
+	return t.om.ToMap()
+}

 func (t *ToolCallFunctionArguments) String() string {
-	bts, _ := json.Marshal(t)
+	if t == nil || t.om == nil {
+		return "{}"
+	}
+	bts, _ := json.Marshal(t.om)
 	return string(bts)
 }

+func (t *ToolCallFunctionArguments) UnmarshalJSON(data []byte) error {
+	t.om = orderedmap.New[string, any]()
+	return json.Unmarshal(data, t.om)
+}
+
+func (t ToolCallFunctionArguments) MarshalJSON() ([]byte, error) {
+	if t.om == nil {
+		return []byte("{}"), nil
+	}
+	return json.Marshal(t.om)
+}
+
 type Tool struct {
 	Type     string       `json:"type"`
 	Items    any          `json:"items,omitempty"`
@@ -282,12 +350,78 @@ func (pt PropertyType) String() string {
 	return fmt.Sprintf("%v", []string(pt))
 }

+// ToolPropertiesMap holds tool properties in insertion order.
+// It wraps a pointer to an ordered map from property name to ToolProperty.
+// The read methods (Get, Len, All, ToMap) treat a nil receiver or a nil
+// underlying map as having no properties.
+type ToolPropertiesMap struct {
+	om *orderedmap.Map[string, ToolProperty]
+}
+
+// NewToolPropertiesMap returns a pointer to an empty ToolPropertiesMap whose
+// underlying ordered map is already allocated.
+func NewToolPropertiesMap() *ToolPropertiesMap {
+	props := new(ToolPropertiesMap)
+	props.om = orderedmap.New[string, ToolProperty]()
+	return props
+}
+
+// Get looks up a property by name and reports whether it was present. A nil
+// receiver or an unallocated map yields the zero ToolProperty and false.
+func (t *ToolPropertiesMap) Get(key string) (ToolProperty, bool) {
+	if t != nil && t.om != nil {
+		return t.om.Get(key)
+	}
+	return ToolProperty{}, false
+}
+
+// Set stores value under key, preserving insertion order. The underlying map
+// is allocated on first use; calling Set on a nil receiver does nothing.
+func (t *ToolPropertiesMap) Set(key string, value ToolProperty) {
+	switch {
+	case t == nil:
+		return
+	case t.om == nil:
+		t.om = orderedmap.New[string, ToolProperty]()
+	}
+	t.om.Set(key, value)
+}
+
+// Len reports how many properties are stored; it is 0 for a nil receiver or
+// an unallocated map.
+func (t *ToolPropertiesMap) Len() int {
+	if t != nil && t.om != nil {
+		return t.om.Len()
+	}
+	return 0
+}
+
+// All returns an iterator over every property in insertion order.
+func (t *ToolPropertiesMap) All() iter.Seq2[string, ToolProperty] {
+	if t != nil && t.om != nil {
+		return t.om.All()
+	}
+	// Nothing to iterate: hand back a sequence that never yields.
+	return func(func(string, ToolProperty) bool) {}
+}
+
+// ToMap converts the properties to a plain map (order not preserved). It
+// returns nil for a nil receiver or an unallocated map.
+func (t *ToolPropertiesMap) ToMap() map[string]ToolProperty {
+	if t != nil && t.om != nil {
+		return t.om.ToMap()
+	}
+	return nil
+}
+
+// MarshalJSON encodes the properties via the underlying ordered map. A value
+// with no underlying map encodes as JSON null.
+func (t ToolPropertiesMap) MarshalJSON() ([]byte, error) {
+	if t.om != nil {
+		return json.Marshal(t.om)
+	}
+	return []byte("null"), nil
+}
+
+// UnmarshalJSON decodes data into a freshly allocated ordered map. Any
+// previous contents are replaced before decoding starts, so they are gone
+// even if decoding fails.
+func (t *ToolPropertiesMap) UnmarshalJSON(data []byte) error {
+	fresh := orderedmap.New[string, ToolProperty]()
+	t.om = fresh
+	return json.Unmarshal(data, fresh)
+}
+
+// ToolProperty describes a single property in a tool's parameter schema.
+// Properties carries the property's own nested properties in insertion
+// order; like the other optional fields it is omitted from JSON when unset.
 type ToolProperty struct {
-	AnyOf       []ToolProperty `json:"anyOf,omitempty"`
-	Type        PropertyType   `json:"type,omitempty"`
-	Items       any            `json:"items,omitempty"`
-	Description string         `json:"description,omitempty"`
-	Enum        []any          `json:"enum,omitempty"`
+	AnyOf       []ToolProperty     `json:"anyOf,omitempty"`
+	Type        PropertyType       `json:"type,omitempty"`
+	Items       any                `json:"items,omitempty"`
+	Description string             `json:"description,omitempty"`
+	Enum        []any              `json:"enum,omitempty"`
+	Properties  *ToolPropertiesMap `json:"properties,omitempty"`
 }

 // ToTypeScriptType converts a ToolProperty to a TypeScript type string
@@ -336,11 +470,11 @@ func mapToTypeScriptType(jsonType string) string {
 }

+// ToolFunctionParameters describes the parameters a tool function accepts.
+// Properties has no omitempty tag, so a nil pointer is still emitted (as
+// JSON null) rather than dropped.
 type ToolFunctionParameters struct {
-	Type       string                  `json:"type"`
-	Defs       any                     `json:"$defs,omitempty"`
-	Items      any                     `json:"items,omitempty"`
-	Required   []string                `json:"required,omitempty"`
-	Properties map[string]ToolProperty `json:"properties"`
+	Type       string             `json:"type"`
+	Defs       any                `json:"$defs,omitempty"`
+	Items      any                `json:"items,omitempty"`
+	Required   []string           `json:"required,omitempty"`
+	Properties *ToolPropertiesMap `json:"properties"`
 }

 func (t *ToolFunctionParameters) String() string {
@@ -553,6 +687,9 @@ type CreateRequest struct {
 	Renderer string `json:"renderer,omitempty"`
 	Parser   string `json:"parser,omitempty"`

+	// Requires is the minimum version of Ollama required by the model.
+	Requires string `json:"requires,omitempty"`
+
 	// Info is a map of additional information for the model
 	Info map[string]any `json:"info,omitempty"`

@@ -603,6 +740,7 @@ type ShowResponse struct {
 	Tensors       []Tensor           `json:"tensors,omitempty"`
 	Capabilities  []model.Capability `json:"capabilities,omitempty"`
 	ModifiedAt    time.Time          `json:"modified_at,omitempty"`
+	Requires      string             `json:"requires,omitempty"`
 }

 // CopyRequest is the request passed to [Client.Copy].
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -11,6 +11,24 @@ import (
 	"github.com/stretchr/testify/require"
 )

+// testPropsMap builds a ToolPropertiesMap from a plain map. It is a test
+// convenience only: entries are inserted in map-iteration order, so the
+// resulting order is not deterministic.
+func testPropsMap(m map[string]ToolProperty) *ToolPropertiesMap {
+	out := NewToolPropertiesMap()
+	for name, prop := range m {
+		out.Set(name, prop)
+	}
+	return out
+}
+
+// testArgs builds ToolCallFunctionArguments from a plain map. It is a test
+// convenience only: entries are inserted in map-iteration order, so the
+// resulting order is not deterministic.
+func testArgs(m map[string]any) ToolCallFunctionArguments {
+	out := NewToolCallFunctionArguments()
+	for name, val := range m {
+		out.Set(name, val)
+	}
+	return out
+}
+
 func TestKeepAliveParsingFromJSON(t *testing.T) {
 	tests := []struct {
 		name string
@@ -309,9 +327,9 @@ func TestToolFunctionParameters_MarshalJSON(t *testing.T) {
 			input: ToolFunctionParameters{
 				Type:     "object",
 				Required: []string{"name"},
-				Properties: map[string]ToolProperty{
+				Properties: testPropsMap(map[string]ToolProperty{
 					"name": {Type: PropertyType{"string"}},
-				},
+				}),
 			},
 			expected: `{"type":"object","required":["name"],"properties":{"name":{"type":"string"}}}`,
 		},
@@ -319,9 +337,9 @@ func TestToolFunctionParameters_MarshalJSON(t *testing.T) {
 			name: "no required",
 			input: ToolFunctionParameters{
 				Type: "object",
-				Properties: map[string]ToolProperty{
+				Properties: testPropsMap(map[string]ToolProperty{
 					"name": {Type: PropertyType{"string"}},
-				},
+				}),
 			},
 			expected: `{"type":"object","properties":{"name":{"type":"string"}}}`,
 		},
@@ -339,7 +357,7 @@ func TestToolFunctionParameters_MarshalJSON(t *testing.T) {
 func TestToolCallFunction_IndexAlwaysMarshals(t *testing.T) {
 	fn := ToolCallFunction{
 		Name:      "echo",
-		Arguments: ToolCallFunctionArguments{"message": "hi"},
+		Arguments: testArgs(map[string]any{"message": "hi"}),
 	}

 	data, err := json.Marshal(fn)
@@ -504,6 +522,116 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
 	}
 }

+// TestToolPropertyNestedProperties checks that ToolProperty decodes nested
+// "properties" objects (one level and three levels deep) and that the decoded
+// value survives a marshal/unmarshal round trip. Comparisons are made on the
+// JSON encodings with JSONEq, so key order is not part of what is asserted.
+func TestToolPropertyNestedProperties(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected ToolProperty
+	}{
+		{
+			name: "nested object properties",
+			input: `{
+				"type": "object",
+				"description": "Location details",
+				"properties": {
+					"address": {
+						"type": "string",
+						"description": "Street address"
+					},
+					"city": {
+						"type": "string",
+						"description": "City name"
+					}
+				}
+			}`,
+			expected: ToolProperty{
+				Type:        PropertyType{"object"},
+				Description: "Location details",
+				Properties: testPropsMap(map[string]ToolProperty{
+					"address": {
+						Type:        PropertyType{"string"},
+						Description: "Street address",
+					},
+					"city": {
+						Type:        PropertyType{"string"},
+						Description: "City name",
+					},
+				}),
+			},
+		},
+		{
+			name: "deeply nested properties",
+			input: `{
+				"type": "object",
+				"description": "Event",
+				"properties": {
+					"location": {
+						"type": "object",
+						"description": "Location",
+						"properties": {
+							"coordinates": {
+								"type": "object",
+								"description": "GPS coordinates",
+								"properties": {
+									"lat": {"type": "number", "description": "Latitude"},
+									"lng": {"type": "number", "description": "Longitude"}
+								}
+							}
+						}
+					}
+				}
+			}`,
+			expected: ToolProperty{
+				Type:        PropertyType{"object"},
+				Description: "Event",
+				Properties: testPropsMap(map[string]ToolProperty{
+					"location": {
+						Type:        PropertyType{"object"},
+						Description: "Location",
+						Properties: testPropsMap(map[string]ToolProperty{
+							"coordinates": {
+								Type:        PropertyType{"object"},
+								Description: "GPS coordinates",
+								Properties: testPropsMap(map[string]ToolProperty{
+									"lat": {Type: PropertyType{"number"}, Description: "Latitude"},
+									"lng": {Type: PropertyType{"number"}, Description: "Longitude"},
+								}),
+							},
+						}),
+					},
+				}),
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			var prop ToolProperty
+			err := json.Unmarshal([]byte(tt.input), &prop)
+			require.NoError(t, err)
+
+			// Compare JSON representations since pointer comparison doesn't work
+			expectedJSON, err := json.Marshal(tt.expected)
+			require.NoError(t, err)
+			actualJSON, err := json.Marshal(prop)
+			require.NoError(t, err)
+			assert.JSONEq(t, string(expectedJSON), string(actualJSON))
+
+			// Round-trip test: marshal and unmarshal again
+			data, err := json.Marshal(prop)
+			require.NoError(t, err)
+
+			var prop2 ToolProperty
+			err = json.Unmarshal(data, &prop2)
+			require.NoError(t, err)
+
+			prop2JSON, err := json.Marshal(prop2)
+			require.NoError(t, err)
+			assert.JSONEq(t, string(expectedJSON), string(prop2JSON))
+		})
+	}
+}
+
 func TestToolFunctionParameters_String(t *testing.T) {
 	tests := []struct {
 		name     string
@@ -515,12 +643,12 @@ func TestToolFunctionParameters_String(t *testing.T) {
 			params: ToolFunctionParameters{
 				Type:     "object",
 				Required: []string{"name"},
-				Properties: map[string]ToolProperty{
+				Properties: testPropsMap(map[string]ToolProperty{
 					"name": {
 						Type:        PropertyType{"string"},
 						Description: "The name of the person",
 					},
-				},
+				}),
 			},
 			expected: `{"type":"object","required":["name"],"properties":{"name":{"type":"string","description":"The name of the person"}}}`,
 		},
@@ -537,7 +665,7 @@ func TestToolFunctionParameters_String(t *testing.T) {
 					s.Self = s
 					return s
 				}(),
-				Properties: map[string]ToolProperty{},
+				Properties: testPropsMap(map[string]ToolProperty{}),
 			},
 			expected: "",
 		},
@@ -550,3 +678,235 @@ func TestToolFunctionParameters_String(t *testing.T) {
 		})
 	}
 }
+
+// TestToolCallFunctionArguments_OrderPreservation covers the ordered-map
+// behaviour of ToolCallFunctionArguments: insertion-order marshalling,
+// JSON-order decoding, round trips, String, Get, Len, and the encoding of
+// empty and zero values.
+func TestToolCallFunctionArguments_OrderPreservation(t *testing.T) {
+	t.Run("marshal preserves insertion order", func(t *testing.T) {
+		args := NewToolCallFunctionArguments()
+		args.Set("zebra", "z")
+		args.Set("apple", "a")
+		args.Set("mango", "m")
+
+		data, err := json.Marshal(args)
+		require.NoError(t, err)
+
+		// Should preserve insertion order, not alphabetical
+		assert.Equal(t, `{"zebra":"z","apple":"a","mango":"m"}`, string(data))
+	})
+
+	t.Run("unmarshal preserves JSON order", func(t *testing.T) {
+		jsonData := `{"zebra":"z","apple":"a","mango":"m"}`
+
+		var args ToolCallFunctionArguments
+		err := json.Unmarshal([]byte(jsonData), &args)
+		require.NoError(t, err)
+
+		// Verify iteration order matches JSON order
+		var keys []string
+		for k := range args.All() {
+			keys = append(keys, k)
+		}
+		assert.Equal(t, []string{"zebra", "apple", "mango"}, keys)
+	})
+
+	t.Run("round trip preserves order", func(t *testing.T) {
+		original := `{"z":1,"a":2,"m":3,"b":4}`
+
+		var args ToolCallFunctionArguments
+		err := json.Unmarshal([]byte(original), &args)
+		require.NoError(t, err)
+
+		data, err := json.Marshal(args)
+		require.NoError(t, err)
+
+		assert.Equal(t, original, string(data))
+	})
+
+	t.Run("String method returns ordered JSON", func(t *testing.T) {
+		args := NewToolCallFunctionArguments()
+		args.Set("c", 3)
+		args.Set("a", 1)
+		args.Set("b", 2)
+
+		assert.Equal(t, `{"c":3,"a":1,"b":2}`, args.String())
+	})
+
+	t.Run("Get retrieves correct values", func(t *testing.T) {
+		args := NewToolCallFunctionArguments()
+		args.Set("key1", "value1")
+		args.Set("key2", 42)
+
+		v, ok := args.Get("key1")
+		assert.True(t, ok)
+		assert.Equal(t, "value1", v)
+
+		v, ok = args.Get("key2")
+		assert.True(t, ok)
+		assert.Equal(t, 42, v)
+
+		_, ok = args.Get("nonexistent")
+		assert.False(t, ok)
+	})
+
+	t.Run("Len returns correct count", func(t *testing.T) {
+		args := NewToolCallFunctionArguments()
+		assert.Equal(t, 0, args.Len())
+
+		args.Set("a", 1)
+		assert.Equal(t, 1, args.Len())
+
+		args.Set("b", 2)
+		assert.Equal(t, 2, args.Len())
+	})
+
+	t.Run("empty args marshal to empty object", func(t *testing.T) {
+		args := NewToolCallFunctionArguments()
+		data, err := json.Marshal(args)
+		require.NoError(t, err)
+		assert.Equal(t, `{}`, string(data))
+	})
+
+	t.Run("zero value args marshal to empty object", func(t *testing.T) {
+		var args ToolCallFunctionArguments
+		assert.Equal(t, "{}", args.String())
+
+		// The subtest name promises marshalling, so exercise MarshalJSON on
+		// the zero value as well as String.
+		data, err := json.Marshal(args)
+		require.NoError(t, err)
+		assert.Equal(t, `{}`, string(data))
+	})
+}
+
+// TestToolPropertiesMap_OrderPreservation covers the ordered-map behaviour of
+// ToolPropertiesMap: insertion-order marshalling, JSON-order decoding, round
+// trips, Get, Len, the encoding of a nil pointer, and ToMap.
+func TestToolPropertiesMap_OrderPreservation(t *testing.T) {
+	t.Run("marshal preserves insertion order", func(t *testing.T) {
+		props := NewToolPropertiesMap()
+		props.Set("zebra", ToolProperty{Type: PropertyType{"string"}})
+		props.Set("apple", ToolProperty{Type: PropertyType{"number"}})
+		props.Set("mango", ToolProperty{Type: PropertyType{"boolean"}})
+
+		data, err := json.Marshal(props)
+		require.NoError(t, err)
+
+		// Should preserve insertion order, not alphabetical
+		expected := `{"zebra":{"type":"string"},"apple":{"type":"number"},"mango":{"type":"boolean"}}`
+		assert.Equal(t, expected, string(data))
+	})
+
+	t.Run("unmarshal preserves JSON order", func(t *testing.T) {
+		jsonData := `{"zebra":{"type":"string"},"apple":{"type":"number"},"mango":{"type":"boolean"}}`
+
+		var props ToolPropertiesMap
+		err := json.Unmarshal([]byte(jsonData), &props)
+		require.NoError(t, err)
+
+		// Verify iteration order matches JSON order
+		var keys []string
+		for k := range props.All() {
+			keys = append(keys, k)
+		}
+		assert.Equal(t, []string{"zebra", "apple", "mango"}, keys)
+	})
+
+	t.Run("round trip preserves order", func(t *testing.T) {
+		original := `{"z":{"type":"string"},"a":{"type":"number"},"m":{"type":"boolean"}}`
+
+		var props ToolPropertiesMap
+		err := json.Unmarshal([]byte(original), &props)
+		require.NoError(t, err)
+
+		data, err := json.Marshal(props)
+		require.NoError(t, err)
+
+		assert.Equal(t, original, string(data))
+	})
+
+	t.Run("Get retrieves correct values", func(t *testing.T) {
+		props := NewToolPropertiesMap()
+		props.Set("name", ToolProperty{Type: PropertyType{"string"}, Description: "The name"})
+		props.Set("age", ToolProperty{Type: PropertyType{"integer"}, Description: "The age"})
+
+		v, ok := props.Get("name")
+		assert.True(t, ok)
+		assert.Equal(t, "The name", v.Description)
+
+		v, ok = props.Get("age")
+		assert.True(t, ok)
+		assert.Equal(t, "The age", v.Description)
+
+		_, ok = props.Get("nonexistent")
+		assert.False(t, ok)
+	})
+
+	t.Run("Len returns correct count", func(t *testing.T) {
+		props := NewToolPropertiesMap()
+		assert.Equal(t, 0, props.Len())
+
+		props.Set("a", ToolProperty{})
+		assert.Equal(t, 1, props.Len())
+
+		props.Set("b", ToolProperty{})
+		assert.Equal(t, 2, props.Len())
+	})
+
+	t.Run("nil props marshal to null", func(t *testing.T) {
+		var props *ToolPropertiesMap
+		data, err := json.Marshal(props)
+		require.NoError(t, err)
+		assert.Equal(t, `null`, string(data))
+	})
+
+	t.Run("ToMap returns regular map", func(t *testing.T) {
+		props := NewToolPropertiesMap()
+		props.Set("a", ToolProperty{Type: PropertyType{"string"}})
+		props.Set("b", ToolProperty{Type: PropertyType{"number"}})
+
+		m := props.ToMap()
+		assert.Equal(t, 2, len(m))
+		assert.Equal(t, PropertyType{"string"}, m["a"].Type)
+		assert.Equal(t, PropertyType{"number"}, m["b"].Type)
+	})
+}
+
+// TestToolCallFunctionArguments_ComplexValues exercises arguments whose
+// values are objects or slices rather than scalars.
+func TestToolCallFunctionArguments_ComplexValues(t *testing.T) {
+	// NOTE(review): despite its name this subtest only asserts the order of
+	// the top-level keys; the key order inside "outer" is not checked.
+	t.Run("nested objects preserve order", func(t *testing.T) {
+		jsonData := `{"outer":{"z":1,"a":2},"simple":"value"}`
+
+		var args ToolCallFunctionArguments
+		err := json.Unmarshal([]byte(jsonData), &args)
+		require.NoError(t, err)
+
+		// Outer keys should be in order
+		var keys []string
+		for k := range args.All() {
+			keys = append(keys, k)
+		}
+		assert.Equal(t, []string{"outer", "simple"}, keys)
+	})
+
+	t.Run("arrays as values", func(t *testing.T) {
+		args := NewToolCallFunctionArguments()
+		args.Set("items", []string{"a", "b", "c"})
+		args.Set("numbers", []int{1, 2, 3})
+
+		data, err := json.Marshal(args)
+		require.NoError(t, err)
+
+		assert.Equal(t, `{"items":["a","b","c"],"numbers":[1,2,3]}`, string(data))
+	})
+}
+
+// TestToolPropertiesMap_NestedProperties checks that insertion order is kept
+// both for the outer map and for a ToolPropertiesMap nested inside one of its
+// properties when the whole structure is marshalled.
+func TestToolPropertiesMap_NestedProperties(t *testing.T) {
+	t.Run("nested properties preserve order", func(t *testing.T) {
+		props := NewToolPropertiesMap()
+
+		nestedProps := NewToolPropertiesMap()
+		nestedProps.Set("z_field", ToolProperty{Type: PropertyType{"string"}})
+		nestedProps.Set("a_field", ToolProperty{Type: PropertyType{"number"}})
+
+		props.Set("outer", ToolProperty{
+			Type:       PropertyType{"object"},
+			Properties: nestedProps,
+		})
+
+		data, err := json.Marshal(props)
+		require.NoError(t, err)
+
+		// Both outer and inner should preserve order
+		expected := `{"outer":{"type":"object","properties":{"z_field":{"type":"string"},"a_field":{"type":"number"}}}}`
+		assert.Equal(t, expected, string(data))
+	})
+}
--- a/app/cmd/app/app.go
+++ b/app/cmd/app/app.go
@@ -273,10 +273,6 @@ func main() {
 		Handler: uiServer.Handler(),
 	}

-	if _, err := uiServer.UserData(ctx); err != nil {
-		slog.Warn("failed to load user data", "error", err)
-	}
-
 	// Start the UI server
 	slog.Info("starting ui server", "port", port)
 	go func() {
@@ -320,6 +316,17 @@ func main() {
 		slog.Debug("no URL scheme request to handle")
 	}

+	go func() {
+		slog.Debug("waiting for ollama server to be ready")
+		if err := ui.WaitForServer(ctx, 10*time.Second); err != nil {
+			slog.Warn("ollama server not ready, continuing anyway", "error", err)
+		}
+
+		if _, err := uiServer.UserData(ctx); err != nil {
+			slog.Warn("failed to load user data", "error", err)
+		}
+	}()
+
 	osRun(cancel, hasCompletedFirstRun, startHidden)

 	slog.Info("shutting down desktop server")
@@ -361,7 +368,7 @@ func checkUserLoggedIn(uiServerPort int) bool {
 		return false
 	}

-	resp, err := http.Get(fmt.Sprintf("http://127.0.0.1:%d/api/v1/me", uiServerPort))
+	resp, err := http.Post(fmt.Sprintf("http://127.0.0.1:%d/api/me", uiServerPort), "application/json", nil)
 	if err != nil {
 		slog.Debug("failed to call local auth endpoint", "error", err)
 		return false
@@ -397,8 +404,8 @@ func checkUserLoggedIn(uiServerPort int) bool {
 // handleConnectURLScheme fetches the connect URL and opens it in the browser
 func handleConnectURLScheme() {
 	if checkUserLoggedIn(uiServerPort) {
-		slog.Info("user is already logged in, opening settings instead")
-		sendUIRequestMessage("/")
+		slog.Info("user is already logged in, opening app instead")
+		showWindow(wv.webview.Window())
 		return
 	}

@@ -434,37 +441,30 @@ func openInBrowser(url string) {
 	}
 }

-// parseURLScheme parses an ollama:// URL and returns whether it's a connect URL and the UI path
-func parseURLScheme(urlSchemeRequest string) (isConnect bool, uiPath string, err error) {
+// parseURLScheme parses an ollama:// URL and validates it.
+// Supports: ollama:// (open app) and ollama://connect (OAuth).
+//
+// It returns isConnect=true for a connect URL, isConnect=false with a nil
+// error for a URL that should just open the app, and a non-nil error when the
+// URL cannot be parsed or is not one of the supported forms.
+func parseURLScheme(urlSchemeRequest string) (isConnect bool, err error) {
 	parsedURL, err := url.Parse(urlSchemeRequest)
 	if err != nil {
-		return false, "", err
+		return false, fmt.Errorf("invalid URL: %w", err)
 	}

 	// Check if this is a connect URL
 	if parsedURL.Host == "connect" || strings.TrimPrefix(parsedURL.Path, "/") == "connect" {
-		return true, "", nil
+		return true, nil
 	}

-	// Extract the UI path
-	path := "/"
-	if parsedURL.Path != "" && parsedURL.Path != "/" {
-		// For URLs like ollama:///settings, use the path directly
-		path = parsedURL.Path
-	} else if parsedURL.Host != "" {
-		// For URLs like ollama://settings (without triple slash),
-		// the "settings" part is parsed as the host, not the path.
-		// We need to convert it to a path by prepending "/"
-		// This also handles ollama://settings/ where Windows adds a trailing slash
-		path = "/" + parsedURL.Host
+	// Allow bare ollama:// or ollama:/// to open the app.
+	// NOTE(review): the second operand has no host check, so any URL whose
+	// path is exactly "/" (e.g. ollama://settings/) is also accepted here,
+	// while ollama://settings is rejected below - confirm this is intended.
+	if (parsedURL.Host == "" && parsedURL.Path == "") || parsedURL.Path == "/" {
+		return false, nil
 	}

-	return false, path, nil
+	return false, fmt.Errorf("unsupported ollama:// URL path: %s", urlSchemeRequest)
 }

 // handleURLSchemeInCurrentInstance processes URL scheme requests in the current instance
 func handleURLSchemeInCurrentInstance(urlSchemeRequest string) {
-	isConnect, uiPath, err := parseURLScheme(urlSchemeRequest)
+	isConnect, err := parseURLScheme(urlSchemeRequest)
 	if err != nil {
 		slog.Error("failed to parse URL scheme request", "url", urlSchemeRequest, "error", err)
 		return
@@ -473,6 +473,8 @@ func handleURLSchemeInCurrentInstance(urlSchemeRequest string) {
 	if isConnect {
 		handleConnectURLScheme()
 	} else {
-		sendUIRequestMessage(uiPath)
+		if wv.webview != nil {
+			showWindow(wv.webview.Window())
+		}
 	}
 }
--- a/app/cmd/app/app_darwin.go
+++ b/app/cmd/app/app_darwin.go
@@ -191,13 +191,6 @@ func LaunchNewApp() {
 	C.launchApp(appName)
 }

-// Send a request to the main app thread to load a UI page
-func sendUIRequestMessage(path string) {
-	p := C.CString(path)
-	defer C.free(unsafe.Pointer(p))
-	C.uiRequest(p)
-}
-
 func registerLaunchAgent(hasCompletedFirstRun bool) {
 	// Remove any stale Login Item registrations
 	C.unregisterSelfFromLoginItem()
--- a/app/cmd/app/app_darwin.m
+++ b/app/cmd/app/app_darwin.m
@@ -24,27 +24,14 @@ bool firstTimeRun,startHidden; // Set in run before initialization
    for (NSURL *url in urls) {
        if ([url.scheme isEqualToString:@"ollama"]) {
            NSString *path = url.path;
-            if (!path || [path isEqualToString:@""]) {
-                // For URLs like ollama://settings (without triple slash),
-                // the "settings" part is parsed as the host, not the path.
-                // We need to convert it to a path by prepending "/"
-                if (url.host && ![url.host isEqualToString:@""]) {
-                    path = [@"/" stringByAppendingString:url.host];
-                } else {
-                    path = @"/";
-                }
-            }
-            
-            if ([path isEqualToString:@"/connect"] || [url.host isEqualToString:@"connect"]) {
+
+            if (path && ([path isEqualToString:@"/connect"] || [url.host isEqualToString:@"connect"])) {
                // Special case: handle connect by opening browser instead of app
                handleConnectURL();
            } else {
                // Set app to be active and visible
                [NSApp setActivationPolicy:NSApplicationActivationPolicyRegular];
                [NSApp activateIgnoringOtherApps:YES];
-                
-                // Open the path with the UI
-                [self uiRequest:path];
            }
            
            break;
@@ -260,7 +247,7 @@ bool firstTimeRun,startHidden; // Set in run before initialization
 }

 - (void)openHelp:(id)sender {
-    NSURL *url = [NSURL URLWithString:@"https://github.com/ollama/ollama/tree/main/docs"];
+    NSURL *url = [NSURL URLWithString:@"https://docs.ollama.com/"];
    [[NSWorkspace sharedWorkspace] openURL:url];
 }

--- a/app/cmd/app/app_windows.go
+++ b/app/cmd/app/app_windows.go
@@ -138,7 +138,7 @@ func (app *appCallbacks) HandleURLScheme(urlScheme string) {

 // handleURLSchemeRequest processes URL scheme requests from other instances
 func handleURLSchemeRequest(urlScheme string) {
-	isConnect, uiPath, err := parseURLScheme(urlScheme)
+	isConnect, err := parseURLScheme(urlScheme)
 	if err != nil {
 		slog.Error("failed to parse URL scheme request", "url", urlScheme, "error", err)
 		return
@@ -147,7 +147,9 @@ func handleURLSchemeRequest(urlScheme string) {
 	if isConnect {
 		handleConnectURLScheme()
 	} else {
-		sendUIRequestMessage(uiPath)
+		if wv.webview != nil {
+			showWindow(wv.webview.Window())
+		}
 	}
 }

@@ -261,11 +263,6 @@ func createLoginShortcut() error {
 	return nil
 }

-// Send a request to the main app thread to load a UI page
-func sendUIRequestMessage(path string) {
-	wintray.SendUIRequestMessage(path)
-}
-
 func LaunchNewApp() {
 }

--- a/app/dialog/cocoa/dlg.m
+++ b/app/dialog/cocoa/dlg.m
@@ -169,37 +169,47 @@ DlgResult fileDlg(FileDlgParams* params) {
 	}
 	
 	NSArray* urls = [panel URLs];
-	if(self->params->allowMultiple && [urls count] >= 1) {
+	if([urls count] == 0) {
+		return DLG_CANCEL;
+	}
+	
+	if(self->params->allowMultiple) {
 		// For multiple files, we need to return all paths separated by null bytes
 		char* bufPtr = self->params->buf;
 		int remainingBuf = self->params->nbuf;
 		
-  // Calculate total required buffer size first
-  int totalSize = 0;
-  for(NSURL* url in urls) {
-      char tempBuf[PATH_MAX];
-      if(![url getFileSystemRepresentation:tempBuf maxLength:PATH_MAX]) {
-          return DLG_URLFAIL;
-      }
-      totalSize += strlen(tempBuf) + 1; // +1 for null terminator
-  }
-  totalSize += 1; // Final null terminator
+		// Calculate total required buffer size first
+		int totalSize = 0;
+		for(NSURL* url in urls) {
+			char tempBuf[PATH_MAX];
+			if(![url getFileSystemRepresentation:tempBuf maxLength:PATH_MAX]) {
+				return DLG_URLFAIL;
+			}
+			totalSize += strlen(tempBuf) + 1; // +1 for null terminator
+		}
+		totalSize += 1; // Final null terminator

-  if(totalSize > self->params->nbuf) {
-      // Not enough buffer space
-      return DLG_URLFAIL;
-  }
+		if(totalSize > self->params->nbuf) {
+			// Not enough buffer space
+			return DLG_URLFAIL;
+		}

-  // Now actually copy the paths (we know we have space)
-  bufPtr = self->params->buf;
-  for(NSURL* url in urls) {
-      char tempBuf[PATH_MAX];
-      [url getFileSystemRepresentation:tempBuf maxLength:PATH_MAX];
-      int pathLen = strlen(tempBuf);
-      strcpy(bufPtr, tempBuf);
-      bufPtr += pathLen + 1;
-  }
-  *bufPtr = '\0'; // Final null terminator
+		// Now actually copy the paths (we know we have space)
+		bufPtr = self->params->buf;
+		for(NSURL* url in urls) {
+			char tempBuf[PATH_MAX];
+			[url getFileSystemRepresentation:tempBuf maxLength:PATH_MAX];
+			int pathLen = strlen(tempBuf);
+			strcpy(bufPtr, tempBuf);
+			bufPtr += pathLen + 1;
+		}
+		*bufPtr = '\0'; // Final null terminator
+	} else {
+		// Single file/directory selection - write path to buffer
+		NSURL* url = [urls firstObject];
+		if(![url getFileSystemRepresentation:self->params->buf maxLength:self->params->nbuf]) {
+			return DLG_URLFAIL;
+		}
 	}
 	
 	return DLG_OK;
--- a/app/dialog/dlgs_windows.go
+++ b/app/dialog/dlgs_windows.go
@@ -15,7 +15,7 @@ const multiFileBufferSize = w32.MAX_PATH * 10
 type WinDlgError int

+// Error formats the dialog error code as hexadecimal. The receiver is
+// converted to int first so that fmt formats the number itself instead of
+// invoking this Error method again on an error-typed operand.
 func (e WinDlgError) Error() string {
-	return fmt.Sprintf("CommDlgExtendedError: %#x", e)
+	return fmt.Sprintf("CommDlgExtendedError: %#x", int(e))
 }

 func err() error {
--- a/app/server/server.go
+++ b/app/server/server.go
@@ -224,9 +224,7 @@ func (s *Server) cmd(ctx context.Context) (*exec.Cmd, error) {
 		if _, err := os.Stat(settings.Models); err == nil {
 			env["OLLAMA_MODELS"] = settings.Models
 		} else {
-			slog.Warn("models path not accessible, clearing models setting", "path", settings.Models, "err", err)
-			settings.Models = ""
-			s.store.SetSettings(settings)
+			slog.Warn("models path not accessible, using default", "path", settings.Models, "err", err)
 		}
 	}
 	if settings.ContextLength > 0 {
--- a/app/ui/app/codegen/gotypes.gen.ts
+++ b/app/ui/app/codegen/gotypes.gen.ts
@@ -469,26 +469,24 @@ export class HealthResponse {
 }
 export class User {
    id: string;
-    name: string;
    email: string;
-    avatarURL: string;
-    plan: string;
-    bio: string;
-    firstName: string;
-    lastName: string;
-    overThreshold: boolean;
+    name: string;
+    bio?: string;
+    avatarurl?: string;
+    firstname?: string;
+    lastname?: string;
+    plan?: string;

    constructor(source: any = {}) {
        if ('string' === typeof source) source = JSON.parse(source);
        this.id = source["id"];
-        this.name = source["name"];
        this.email = source["email"];
-        this.avatarURL = source["avatarURL"];
-        this.plan = source["plan"];
+        this.name = source["name"];
        this.bio = source["bio"];
-        this.firstName = source["firstName"];
-        this.lastName = source["lastName"];
-        this.overThreshold = source["overThreshold"];
+        this.avatarurl = source["avatarurl"];
+        this.firstname = source["firstname"];
+        this.lastname = source["lastname"];
+        this.plan = source["plan"];
    }
 }
 export class Attachment {
--- a/app/ui/app/src/api.ts
+++ b/app/ui/app/src/api.ts
@@ -15,7 +15,7 @@ import {
 import { parseJsonlFromResponse } from "./util/jsonl-parsing";
 import { ollamaClient as ollama } from "./lib/ollama-client";
 import type { ModelResponse } from "ollama/browser";
-import { API_BASE } from "./lib/config";
+import { API_BASE, OLLAMA_DOT_COM } from "./lib/config";

 // Extend Model class with utility methods
 declare module "@/gotypes" {
@@ -27,7 +27,6 @@ declare module "@/gotypes" {
 Model.prototype.isCloud = function (): boolean {
  return this.model.endsWith("cloud");
 };
-
 // Helper function to convert Uint8Array to base64
 function uint8ArrayToBase64(uint8Array: Uint8Array): string {
  const chunkSize = 0x8000; // 32KB chunks to avoid stack overflow
@@ -42,44 +41,50 @@ function uint8ArrayToBase64(uint8Array: Uint8Array): string {
 }

 export async function fetchUser(): Promise<User | null> {
-  try {
-    const response = await fetch(`${API_BASE}/api/v1/me`, {
-      method: "GET",
-      headers: {
-        "Content-Type": "application/json",
-      },
-    });
-
-    if (response.ok) {
-      const userData: User = await response.json();
-      return userData;
-    }
-
-    return null;
-  } catch (error) {
-    console.error("Error fetching user:", error);
-    return null;
-  }
-}
-
-export async function fetchConnectUrl(): Promise<string> {
-  const response = await fetch(`${API_BASE}/api/v1/connect`, {
-    method: "GET",
+  const response = await fetch(`${API_BASE}/api/me`, {
+    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
  });

-  if (!response.ok) {
-    throw new Error("Failed to fetch connect URL");
+  if (response.ok) {
+    const userData: User = await response.json();
+
+    if (userData.avatarurl && !userData.avatarurl.startsWith("http")) {
+      userData.avatarurl = `${OLLAMA_DOT_COM}${userData.avatarurl}`;
+    }
+
+    return userData;
  }

-  const data = await response.json();
-  return data.connect_url;
+  if (response.status === 401 || response.status === 403) {
+    return null;
+  }
+
+  throw new Error(`Failed to fetch user: ${response.status}`);
+}
+
+export async function fetchConnectUrl(): Promise<string> {
+  const response = await fetch(`${API_BASE}/api/me`, {
+    method: "POST",
+    headers: {
+      "Content-Type": "application/json",
+    },
+  });
+
+  if (response.status === 401) {
+    const data = await response.json();
+    if (data.signin_url) {
+      return data.signin_url;
+    }
+  }
+
+  throw new Error("Failed to fetch connect URL");
 }

 export async function disconnectUser(): Promise<void> {
-  const response = await fetch(`${API_BASE}/api/v1/disconnect`, {
+  const response = await fetch(`${API_BASE}/api/signout`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
@@ -204,12 +209,10 @@ export async function* sendMessage(
    data: uint8ArrayToBase64(att.data),
  }));

-  // Only send think parameter when actually requesting thinking
-  // Don't send false as it causes issues with some providers
+  // Send think parameter when it's explicitly set (true, false, or a non-empty string).
  const shouldSendThink =
    think !== undefined &&
-    ((typeof think === "boolean" && think) ||
-      (typeof think === "string" && think !== ""));
+    (typeof think === "boolean" || (typeof think === "string" && think !== ""));

  const response = await fetch(`${API_BASE}/api/v1/chat/${chatId}`, {
    method: "POST",
@@ -391,7 +394,8 @@ export async function getInferenceCompute(): Promise<InferenceCompute[]> {

 export async function fetchHealth(): Promise<boolean> {
  try {
-    const response = await fetch(`${API_BASE}/api/v1/health`, {
+    // Use the /api/version endpoint as a health check
+    const response = await fetch(`${API_BASE}/api/version`, {
      method: "GET",
      headers: {
        "Content-Type": "application/json",
@@ -400,7 +404,8 @@ export async function fetchHealth(): Promise<boolean> {

    if (response.ok) {
      const data = await response.json();
-      return data.healthy || false;
+      // If we get a version back, the server is healthy
+      return !!data.version;
    }

    return false;
--- a/app/ui/app/src/components/Settings.tsx
+++ b/app/ui/app/src/components/Settings.tsx
@@ -299,9 +299,9 @@ export default function Settings() {
                        </Button>
                      </div>
                    </div>
-                    {user?.avatarURL && (
+                    {user?.avatarurl && (
                      <img
-                        src={user.avatarURL}
+                        src={user.avatarurl}
                        alt={user?.name}
                        className="h-10 w-10 rounded-full bg-neutral-200 dark:bg-neutral-700 flex-shrink-0"
                        onError={(e) => {
--- a/app/ui/app/src/components/Thinking.tsx
+++ b/app/ui/app/src/components/Thinking.tsx
@@ -50,21 +50,33 @@ export default function Thinking({
  // Position content to show bottom when collapsed
  useEffect(() => {
    if (isCollapsed && contentRef.current && wrapperRef.current) {
-      const contentHeight = contentRef.current.scrollHeight;
-      const wrapperHeight = wrapperRef.current.clientHeight;
-      if (contentHeight > wrapperHeight) {
-        const translateY = -(contentHeight - wrapperHeight);
-        contentRef.current.style.transform = `translateY(${translateY}px)`;
-        setHasOverflow(true);
-      } else {
-        setHasOverflow(false);
-      }
+      requestAnimationFrame(() => {
+        if (!contentRef.current || !wrapperRef.current) return;
+
+        const contentHeight = contentRef.current.scrollHeight;
+        const wrapperHeight = wrapperRef.current.clientHeight;
+        if (contentHeight > wrapperHeight) {
+          const translateY = -(contentHeight - wrapperHeight);
+          contentRef.current.style.transform = `translateY(${translateY}px)`;
+          setHasOverflow(true);
+        } else {
+          contentRef.current.style.transform = "translateY(0)";
+          setHasOverflow(false);
+        }
+      });
    } else if (contentRef.current) {
      contentRef.current.style.transform = "translateY(0)";
      setHasOverflow(false);
    }
  }, [thinking, isCollapsed]);

+  useEffect(() => {
+    if (activelyThinking && wrapperRef.current && !isCollapsed) {
+      // When expanded and actively thinking, scroll to bottom
+      wrapperRef.current.scrollTop = wrapperRef.current.scrollHeight;
+    }
+  }, [thinking, activelyThinking, isCollapsed]);
+
  const handleToggle = () => {
    setIsCollapsed(!isCollapsed);
    setHasUserInteracted(true);
--- a/app/ui/app/src/hooks/useChats.ts
+++ b/app/ui/app/src/hooks/useChats.ts
@@ -7,6 +7,7 @@ import { createQueryBatcher } from "./useQueryBatcher";
 import { useRefetchModels } from "./useModels";
 import { useStreamingContext } from "@/contexts/StreamingContext";
 import { useSettings } from "./useSettings";
+import { getModelCapabilities } from "@/api";

 export const useChats = () => {
  return useQuery({
@@ -606,6 +607,24 @@ export const useSendMessage = (chatId: string) => {
              queryClient.setQueryData(["staleModels"], newStaleMap);

              queryClient.invalidateQueries({ queryKey: ["models"] });
+
+              // Fetch fresh capabilities for the downloaded model
+              getModelCapabilities(selectedModel.model)
+                .then((capabilities) => {
+                  queryClient.setQueryData(
+                    ["modelCapabilities", selectedModel.model],
+                    capabilities,
+                  );
+                })
+                .catch((error) => {
+                  console.error(
+                    "Failed to fetch capabilities after download:",
+                    error,
+                  );
+                  queryClient.invalidateQueries({
+                    queryKey: ["modelCapabilities", selectedModel.model],
+                  });
+                });
            }
            break;
          }
--- a/app/ui/app/src/hooks/useDownloadModel.ts
+++ b/app/ui/app/src/hooks/useDownloadModel.ts
@@ -1,114 +0,0 @@
-import { useMutation, useQueryClient } from "@tanstack/react-query";
-import { useState } from "react";
-import { pullModel } from "@/api";
-import { useSelectedModel } from "./useSelectedModel";
-import { useSettings } from "./useSettings";
-
-interface DownloadProgress {
-  status: string;
-  digest?: string;
-  total?: number;
-  completed?: number;
-  done?: boolean;
-}
-
-export function useDownloadModel(chatId?: string) {
-  const queryClient = useQueryClient();
-  const { selectedModel } = useSelectedModel(chatId);
-  const { setSettings } = useSettings();
-  const [downloadProgress, setDownloadProgress] =
-    useState<DownloadProgress | null>(null);
-  const [abortController, setAbortController] =
-    useState<AbortController | null>(null);
-  const [downloadingChatIds, setDownloadingChatIds] = useState<Set<string>>(
-    new Set(),
-  );
-
-  const mutation = useMutation({
-    mutationFn: async (modelName: string) => {
-      const controller = new AbortController();
-      setAbortController(controller);
-      setDownloadProgress({ status: "Starting download..." });
-      if (chatId) {
-        setDownloadingChatIds((prev) => new Set(prev).add(chatId));
-      }
-
-      try {
-        for await (const progress of pullModel(modelName, controller.signal)) {
-          setDownloadProgress(progress);
-
-          if (progress.status === "success") {
-            // Update selected model to indicate it's now available locally
-            if (selectedModel && selectedModel.model === modelName) {
-              setSettings({ SelectedModel: modelName });
-            }
-            // Invalidate models query to refresh the list
-            await queryClient.invalidateQueries({ queryKey: ["models"] });
-            break;
-          }
-        }
-      } finally {
-        setAbortController(null);
-        if (chatId) {
-          setDownloadingChatIds((prev) => {
-            const newSet = new Set(prev);
-            newSet.delete(chatId);
-            return newSet;
-          });
-        }
-      }
-    },
-    onSuccess: () => {
-      setDownloadProgress(null);
-      if (chatId) {
-        setDownloadingChatIds((prev) => {
-          const newSet = new Set(prev);
-          newSet.delete(chatId);
-          return newSet;
-        });
-      }
-    },
-    onError: (error: Error) => {
-      const status =
-        error.name === "AbortError" ? "Download cancelled" : "Download failed";
-      setDownloadProgress({ status, done: true });
-
-      // Clear error message after delay
-      const delay = error.name === "AbortError" ? 1500 : 3000;
-      setTimeout(() => {
-        setDownloadProgress(null);
-        if (chatId) {
-          setDownloadingChatIds((prev) => {
-            const newSet = new Set(prev);
-            newSet.delete(chatId);
-            return newSet;
-          });
-        }
-      }, delay);
-    },
-  });
-
-  const cancelDownload = () => {
-    if (abortController) {
-      abortController.abort();
-      setAbortController(null);
-      if (chatId) {
-        setDownloadingChatIds((prev) => {
-          const newSet = new Set(prev);
-          newSet.delete(chatId);
-          return newSet;
-        });
-      }
-    }
-  };
-
-  return {
-    downloadModel: mutation.mutate,
-    isDownloading:
-      mutation.isPending && chatId ? downloadingChatIds.has(chatId) : false,
-    downloadProgress:
-      chatId && downloadingChatIds.has(chatId) ? downloadProgress : null,
-    error: mutation.error,
-    cancelDownload,
-  };
-}
--- a/app/ui/app/src/hooks/useUser.ts
+++ b/app/ui/app/src/hooks/useUser.ts
@@ -1,29 +1,20 @@
 import { useQuery, useMutation, useQueryClient } from "@tanstack/react-query";
-import { useEffect, useState } from "react";
 import { fetchUser, fetchConnectUrl, disconnectUser } from "@/api";

 export function useUser() {
  const queryClient = useQueryClient();
-  const [initialDataLoaded, setInitialDataLoaded] = useState(false);
-
-  // Wait for initial data to be loaded
-  useEffect(() => {
-    const initialPromise = window.__initialUserDataPromise;
-    if (initialPromise) {
-      initialPromise.finally(() => {
-        setInitialDataLoaded(true);
-      });
-    } else {
-      setInitialDataLoaded(true);
-    }
-  }, []);

  const userQuery = useQuery({
    queryKey: ["user"],
-    queryFn: () => fetchUser(),
+    queryFn: async () => {
+      const result = await fetchUser();
+      return result;
+    },
    staleTime: 5 * 60 * 1000, // Consider data stale after 5 minutes
    gcTime: 10 * 60 * 1000, // Keep in cache for 10 minutes
-    initialData: null, // Start with null to prevent flashing
+    retry: 10,
+    retryDelay: (attemptIndex) => Math.min(500 * (attemptIndex + 1), 2000),
+    refetchOnMount: true, // Refetch on mount when the cached data is stale
  });

  // Mutation to refresh user data
@@ -49,14 +40,15 @@ export function useUser() {
    },
  });

+  const isLoading = userQuery.isLoading || userQuery.isFetching;
+  const isAuthenticated = Boolean(userQuery.data?.name);
+
  return {
    user: userQuery.data,
-    isLoading:
-      !initialDataLoaded ||
-      (userQuery.isLoading && userQuery.data === undefined), // Show loading until initial data is loaded
+    isLoading,
    isError: userQuery.isError,
    error: userQuery.error,
-    isAuthenticated: Boolean(userQuery.data?.name),
+    isAuthenticated,
    refreshUser: refreshUser.mutate,
    isRefreshing: refreshUser.isPending,
    refetchUser: userQuery.refetch,
--- a/app/ui/app/src/lib/config.ts
+++ b/app/ui/app/src/lib/config.ts
@@ -8,3 +8,6 @@ export const API_BASE = import.meta.env.DEV ? DEV_API_URL : "";
 export const OLLAMA_HOST = import.meta.env.DEV
  ? DEV_API_URL
  : window.location.origin;
+
+export const OLLAMA_DOT_COM =
+  import.meta.env.VITE_OLLAMA_DOT_COM_URL || "https://ollama.com";
--- a/app/ui/app/src/lib/highlighter.ts
+++ b/app/ui/app/src/lib/highlighter.ts
@@ -147,6 +147,7 @@ export const highlighterPromise = createHighlighter({
    "c",
    "cpp",
    "sql",
+    "swift",
    "yaml",
    "markdown",
  ],
--- a/app/ui/app/src/main.tsx
+++ b/app/ui/app/src/main.tsx
@@ -5,13 +5,6 @@ import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
 import { routeTree } from "./routeTree.gen";
 import { fetchUser } from "./api";
 import { StreamingProvider } from "./contexts/StreamingContext";
-import { User } from "@/gotypes";
-
-declare global {
-  interface Window {
-    __initialUserDataPromise?: Promise<User | null>;
-  }
-}

 const queryClient = new QueryClient({
  defaultOptions: {
@@ -24,27 +17,11 @@ const queryClient = new QueryClient({
  },
 });

-// Track initial user data fetch
-let initialUserDataPromise: Promise<User | null> | null = null;
-
-// Initialize user data on app startup
-const initializeUserData = async () => {
-  try {
-    const userData = await fetchUser();
+fetchUser().then((userData) => {
+  if (userData) {
    queryClient.setQueryData(["user"], userData);
-    return userData;
-  } catch (error) {
-    console.error("Error initializing user data:", error);
-    queryClient.setQueryData(["user"], null);
-    return null;
  }
-};
-
-// Start initialization immediately and track the promise
-initialUserDataPromise = initializeUserData();
-
-// Export the promise so hooks can await it
-window.__initialUserDataPromise = initialUserDataPromise;
+}).catch((error) => console.error("Error initializing user data:", error));

 const router = createRouter({
  routeTree,
--- a/app/ui/responses/types.go
+++ b/app/ui/responses/types.go
@@ -101,15 +101,14 @@ type HealthResponse struct {
 }

 type User struct {
-	ID            string `json:"id"`
-	Name          string `json:"name"`
-	Email         string `json:"email"`
-	AvatarURL     string `json:"avatarURL"`
-	Plan          string `json:"plan"`
-	Bio           string `json:"bio"`
-	FirstName     string `json:"firstName"`
-	LastName      string `json:"lastName"`
-	OverThreshold bool   `json:"overThreshold"`
+	ID        string `json:"id"`
+	Email     string `json:"email"`
+	Name      string `json:"name"`
+	Bio       string `json:"bio,omitempty"`
+	AvatarURL string `json:"avatarurl,omitempty"`
+	FirstName string `json:"firstname,omitempty"`
+	LastName  string `json:"lastname,omitempty"`
+	Plan      string `json:"plan,omitempty"`
 }

 type Attachment struct {
--- a/app/ui/ui.go
+++ b/app/ui/ui.go
@@ -12,18 +12,17 @@ import (
 	"log/slog"
 	"net/http"
 	"net/http/httputil"
-	"net/url"
 	"os"
 	"runtime"
 	"runtime/debug"
 	"slices"
 	"strconv"
 	"strings"
+	"sync"
 	"time"

 	"github.com/google/uuid"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/app/auth"
 	"github.com/ollama/ollama/app/server"
 	"github.com/ollama/ollama/app/store"
 	"github.com/ollama/ollama/app/tools"
@@ -118,40 +117,66 @@ func (s *Server) log() *slog.Logger {

 // ollamaProxy creates a reverse proxy handler to the Ollama server
 func (s *Server) ollamaProxy() http.Handler {
-	ollamaHost := os.Getenv("OLLAMA_HOST")
-	if ollamaHost == "" {
-		ollamaHost = "http://127.0.0.1:11434"
-	}
+	var (
+		proxy   http.Handler
+		proxyMu sync.Mutex
+	)

-	if !strings.HasPrefix(ollamaHost, "http://") && !strings.HasPrefix(ollamaHost, "https://") {
-		ollamaHost = "http://" + ollamaHost
-	}
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		proxyMu.Lock()
+		p := proxy
+		proxyMu.Unlock()

-	target, err := url.Parse(ollamaHost)
-	if err != nil {
-		s.log().Error("failed to parse OLLAMA_HOST", "error", err, "host", ollamaHost)
-		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			http.Error(w, "failed to configure proxy", http.StatusInternalServerError)
-		})
-	}
+		if p == nil {
+			proxyMu.Lock()
+			if proxy == nil {
+				var err error
+				for i := range 2 {
+					if i > 0 {
+						s.log().Warn("ollama server not ready, retrying", "attempt", i+1)
+						time.Sleep(1 * time.Second)
+					}

-	s.log().Info("configuring ollama proxy", "target", target.String())
+					err = WaitForServer(context.Background(), 10*time.Second)
+					if err == nil {
+						break
+					}
+				}

-	proxy := httputil.NewSingleHostReverseProxy(target)
+				if err != nil {
+					proxyMu.Unlock()
+					s.log().Error("ollama server not ready after retries", "error", err)
+					http.Error(w, "Ollama server is not ready", http.StatusServiceUnavailable)
+					return
+				}

-	originalDirector := proxy.Director
-	proxy.Director = func(req *http.Request) {
-		originalDirector(req)
-		req.Host = target.Host
-		s.log().Debug("proxying request", "method", req.Method, "path", req.URL.Path, "target", target.Host)
-	}
+				target := envconfig.Host()
+				s.log().Info("configuring ollama proxy", "target", target.String())

-	proxy.ErrorHandler = func(w http.ResponseWriter, r *http.Request, err error) {
-		s.log().Error("proxy error", "error", err, "path", r.URL.Path, "target", target.String())
-		http.Error(w, "proxy error: "+err.Error(), http.StatusBadGateway)
-	}
+				newProxy := httputil.NewSingleHostReverseProxy(target)

-	return proxy
+				originalDirector := newProxy.Director
+				newProxy.Director = func(req *http.Request) {
+					originalDirector(req)
+					req.Host = target.Host
+					s.log().Debug("proxying request", "method", req.Method, "path", req.URL.Path, "target", target.Host)
+				}
+
+				newProxy.ErrorHandler = func(w http.ResponseWriter, r *http.Request, err error) {
+					s.log().Error("proxy error", "error", err, "path", r.URL.Path, "target", target.String())
+					http.Error(w, "proxy error: "+err.Error(), http.StatusBadGateway)
+				}
+
+				proxy = newProxy
+				p = newProxy
+			} else {
+				p = proxy
+			}
+			proxyMu.Unlock()
+		}
+
+		p.ServeHTTP(w, r)
+	})
 }

 type errHandlerFunc func(http.ResponseWriter, *http.Request) error
@@ -264,11 +289,10 @@ func (s *Server) Handler() http.Handler {
 	ollamaProxy := s.ollamaProxy()
 	mux.Handle("GET /api/tags", ollamaProxy)
 	mux.Handle("POST /api/show", ollamaProxy)
-
-	mux.Handle("GET /api/v1/me", handle(s.me))
-	mux.Handle("POST /api/v1/disconnect", handle(s.disconnect))
-	mux.Handle("GET /api/v1/connect", handle(s.connectURL))
-	mux.Handle("GET /api/v1/health", handle(s.health))
+	mux.Handle("GET /api/version", ollamaProxy)
+	mux.Handle("HEAD /api/version", ollamaProxy)
+	mux.Handle("POST /api/me", ollamaProxy)
+	mux.Handle("POST /api/signout", ollamaProxy)

 	// React app - catch all non-API routes and serve the React app
 	mux.Handle("GET /", s.appHandler())
@@ -338,7 +362,7 @@ func (s *Server) doSelfSigned(ctx context.Context, method, path string) (*http.R
 }

 // UserData fetches user data from ollama.com API for the current ollama key
-func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
+func (s *Server) UserData(ctx context.Context) (*api.UserResponse, error) {
 	resp, err := s.doSelfSigned(ctx, http.MethodPost, "/api/me")
 	if err != nil {
 		return nil, fmt.Errorf("failed to call ollama.com/api/me: %w", err)
@@ -349,7 +373,7 @@ func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
 		return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
 	}

-	var user responses.User
+	var user api.UserResponse
 	if err := json.NewDecoder(resp.Body).Decode(&user); err != nil {
 		return nil, fmt.Errorf("failed to parse user response: %w", err)
 	}
@@ -368,29 +392,27 @@ func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
 	return &user, nil
 }

-func waitForServer(ctx context.Context) error {
-	timeout := time.Now().Add(10 * time.Second)
-	// TODO: this avoids an error on first load of the app
-	// however we should either show a loading state or
-	// wait for the Ollama server to be ready before redirecting
-	for {
+// WaitForServer polls the Ollama server until it responds; it errors once ctx is done or timeout elapses.
+func WaitForServer(ctx context.Context, timeout time.Duration) error {
+	deadline := time.Now().Add(timeout)
+	for ctx.Err() == nil && time.Now().Before(deadline) {
 		c, err := api.ClientFromEnvironment()
 		if err != nil {
 			return err
 		}
 		if _, err := c.Version(ctx); err == nil {
-			break
-		}
-		if time.Now().After(timeout) {
-			return fmt.Errorf("timeout waiting for Ollama server to be ready")
+			slog.Debug("ollama server is ready")
+			return nil
 		}
 		time.Sleep(10 * time.Millisecond)
 	}
-	return nil
+	return errors.New("timeout waiting for Ollama server to be ready")
 }

 func (s *Server) createChat(w http.ResponseWriter, r *http.Request) error {
-	waitForServer(r.Context())
+	if err := WaitForServer(r.Context(), 10*time.Second); err != nil {
+		return err
+	}

 	id, err := uuid.NewV7()
 	if err != nil {
@@ -975,7 +997,7 @@ func (s *Server) chat(w http.ResponseWriter, r *http.Request) error {
 				for _, toolCall := range res.Message.ToolCalls {
 					// continues loop as tools were executed
 					toolsExecuted = true
-					result, content, err := registry.Execute(ctx, toolCall.Function.Name, toolCall.Function.Arguments)
+					result, content, err := registry.Execute(ctx, toolCall.Function.Name, toolCall.Function.Arguments.ToMap())
 					if err != nil {
 						errContent := fmt.Sprintf("Error: %v", err)
 						toolErrMsg := store.NewMessage("tool", errContent, nil)
@@ -1438,129 +1460,6 @@ func (s *Server) settings(w http.ResponseWriter, r *http.Request) error {
 	})
 }

-func (s *Server) me(w http.ResponseWriter, r *http.Request) error {
-	if r.Method != http.MethodGet {
-		http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
-		return nil
-	}
-
-	user, err := s.UserData(r.Context())
-	if err != nil {
-		// If fetching from API fails, try to return cached user data if available
-		if cachedUser, cacheErr := s.Store.User(); cacheErr == nil && cachedUser != nil {
-			s.log().Info("API request failed, returning cached user data", "error", err)
-			responseUser := &responses.User{
-				Name:  cachedUser.Name,
-				Email: cachedUser.Email,
-				Plan:  cachedUser.Plan,
-			}
-			w.Header().Set("Content-Type", "application/json")
-			w.WriteHeader(http.StatusOK)
-			return json.NewEncoder(w).Encode(responseUser)
-		}
-
-		s.log().Error("failed to get user data", "error", err)
-		w.WriteHeader(http.StatusInternalServerError)
-		return json.NewEncoder(w).Encode(responses.Error{
-			Error: "failed to get user data",
-		})
-	}
-
-	w.Header().Set("Content-Type", "application/json")
-	w.WriteHeader(http.StatusOK)
-	return json.NewEncoder(w).Encode(user)
-}
-
-func (s *Server) disconnect(w http.ResponseWriter, r *http.Request) error {
-	if r.Method != http.MethodPost {
-		http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
-		return nil
-	}
-
-	if err := s.Store.ClearUser(); err != nil {
-		s.log().Warn("failed to clear cached user data", "error", err)
-	}
-
-	// Get the SSH public key to encode for the delete request
-	pubKey, err := ollamaAuth.GetPublicKey()
-	if err != nil {
-		s.log().Error("failed to get public key", "error", err)
-		w.WriteHeader(http.StatusInternalServerError)
-		return json.NewEncoder(w).Encode(responses.Error{
-			Error: "failed to get public key",
-		})
-	}
-
-	// Encode the key using base64 URL encoding
-	encodedKey := base64.RawURLEncoding.EncodeToString([]byte(pubKey))
-
-	// Call the /api/user/keys/{encodedKey} endpoint with DELETE
-	resp, err := s.doSelfSigned(r.Context(), http.MethodDelete, fmt.Sprintf("/api/user/keys/%s", encodedKey))
-	if err != nil {
-		s.log().Error("failed to call ollama.com/api/user/keys", "error", err)
-		w.WriteHeader(http.StatusInternalServerError)
-		return json.NewEncoder(w).Encode(responses.Error{
-			Error: "failed to disconnect from ollama.com",
-		})
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		s.log().Error("disconnect request failed", "status", resp.StatusCode)
-		w.WriteHeader(http.StatusInternalServerError)
-		return json.NewEncoder(w).Encode(responses.Error{
-			Error: "failed to disconnect from ollama.com",
-		})
-	}
-
-	w.Header().Set("Content-Type", "application/json")
-	w.WriteHeader(http.StatusOK)
-	return json.NewEncoder(w).Encode(map[string]string{"status": "disconnected"})
-}
-
-func (s *Server) connectURL(w http.ResponseWriter, r *http.Request) error {
-	if r.Method != http.MethodGet {
-		http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
-		return nil
-	}
-
-	connectURL, err := auth.BuildConnectURL(OllamaDotCom)
-	if err != nil {
-		s.log().Error("failed to build connect URL", "error", err)
-		w.WriteHeader(http.StatusInternalServerError)
-		return json.NewEncoder(w).Encode(responses.Error{
-			Error: "failed to build connect URL",
-		})
-	}
-
-	w.Header().Set("Content-Type", "application/json")
-	w.WriteHeader(http.StatusOK)
-	return json.NewEncoder(w).Encode(map[string]string{
-		"connect_url": connectURL,
-	})
-}
-
-func (s *Server) health(w http.ResponseWriter, r *http.Request) error {
-	if r.Method != http.MethodGet {
-		http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
-		return nil
-	}
-
-	healthy := false
-	c, err := api.ClientFromEnvironment()
-	if err == nil {
-		if _, err := c.Version(r.Context()); err == nil {
-			healthy = true
-		}
-	}
-
-	w.Header().Set("Content-Type", "application/json")
-	w.WriteHeader(http.StatusOK)
-	return json.NewEncoder(w).Encode(responses.HealthResponse{
-		Healthy: healthy,
-	})
-}
-
 func (s *Server) getInferenceCompute(w http.ResponseWriter, r *http.Request) error {
 	ctx, cancel := context.WithTimeout(r.Context(), 500*time.Millisecond)
 	defer cancel()
@@ -1659,13 +1558,13 @@ func convertToOllamaTool(toolSchema map[string]any) api.Tool {

 	tool.Function.Parameters.Type = "object"
 	tool.Function.Parameters.Required = []string{}
-	tool.Function.Parameters.Properties = make(map[string]api.ToolProperty)
+	tool.Function.Parameters.Properties = api.NewToolPropertiesMap()

 	if schemaProps, ok := toolSchema["schema"].(map[string]any); ok {
 		tool.Function.Parameters.Type = getStringFromMap(schemaProps, "type", "object")

 		if props, ok := schemaProps["properties"].(map[string]any); ok {
-			tool.Function.Parameters.Properties = make(map[string]api.ToolProperty)
+			tool.Function.Parameters.Properties = api.NewToolPropertiesMap()

 			for propName, propDef := range props {
 				if propMap, ok := propDef.(map[string]any); ok {
@@ -1673,7 +1572,7 @@ func convertToOllamaTool(toolSchema map[string]any) api.Tool {
 						Type:        api.PropertyType{getStringFromMap(propMap, "type", "string")},
 						Description: getStringFromMap(propMap, "description", ""),
 					}
-					tool.Function.Parameters.Properties[propName] = prop
+					tool.Function.Parameters.Properties.Set(propName, prop)
 				}
 			}
 		}
--- a/app/wintray/eventloop.go
+++ b/app/wintray/eventloop.go
@@ -158,16 +158,16 @@ func (t *winTray) wndProc(hWnd windows.Handle, message uint32, wParam, lParam ui
 	case uint32(UI_REQUEST_MSG_ID):
 		// Requests for the UI must always come from the main event thread
 		l := int(wParam)
-		path := unsafe.String((*byte)(unsafe.Pointer(lParam)), l)
+		path := unsafe.String((*byte)(unsafe.Pointer(lParam)), l) //nolint:govet,gosec
 		t.app.UIRun(path)
 	case WM_COPYDATA:
 		// Handle URL scheme requests from other instances
 		if lParam != 0 {
-			cds := (*COPYDATASTRUCT)(unsafe.Pointer(lParam))
-			if cds.DwData == 1 { // Our identifier for URL scheme messages
+			cds := (*COPYDATASTRUCT)(unsafe.Pointer(lParam)) //nolint:govet,gosec
+			if cds.DwData == 1 {                             // Our identifier for URL scheme messages
 				// Convert the data back to string
 				data := make([]byte, cds.CbData)
-				copy(data, (*[1 << 30]byte)(unsafe.Pointer(cds.LpData))[:cds.CbData:cds.CbData])
+				copy(data, (*[1 << 30]byte)(unsafe.Pointer(cds.LpData))[:cds.CbData:cds.CbData]) //nolint:govet,gosec
 				urlScheme := string(data)
 				handleURLSchemeRequest(urlScheme)
 				lResult = 1 // Return non-zero to indicate success
--- a/cmd/bench/README.md
+++ b/cmd/bench/README.md
@@ -15,7 +15,7 @@ A Go-based command-line tool for benchmarking Ollama models with configurable pa

 ```
 go build -o ollama-bench bench.go
-./bench -model gpt-oss:20b -epochs 6 -format csv
+./ollama-bench -model gpt-oss:20b -epochs 6 -format csv
 ```

 Using Go Run (without building)
@@ -29,31 +29,32 @@ go run bench.go -model gpt-oss:20b -epochs 3
 ### Basic Example

 ```
-./bench -model gemma3 -epochs 6
+./ollama-bench -model gemma3 -epochs 6
 ```

 ### Benchmark Multiple Models

 ```
-./bench -model gemma3,gemma3n -epochs 6 -max-tokens 100 -p "Write me a short story" | tee gemma.bench
+./ollama-bench -model gemma3,gemma3n -epochs 6 -max-tokens 100 -p "Write me a short story" | tee gemma.bench
 benchstat -col /name gemma.bench
 ```

 ### With Image Prompt

 ```
-./bench -model qwen3-vl -image photo.jpg -epochs 6 -max-tokens 100 -p "Describe this image"
+./ollama-bench -model qwen3-vl -image photo.jpg -epochs 6 -max-tokens 100 -p "Describe this image"
 ```

 ### Advanced Example

 ```
-./bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -format csv -output results.csv
+./ollama-bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -format csv -output results.csv
 ```

 ## Command Line Options

 | Option  	| Description | Default |
+|----------|-------------|---------|
 | -model	| Comma-separated list of models to benchmark	| (required)		|
 | -epochs	| Number of iterations per model		| 1			|
 | -max-tokens	| Maximum tokens for model response		| 0 (unlimited)		|
--- a/cmd/bench/bench.go
+++ b/cmd/bench/bench.go
@@ -48,8 +48,8 @@ func OutputMetrics(w io.Writer, format string, metrics []Metrics, verbose bool)
 	case "benchstat":
 		if verbose {
 			printHeader := func() {
-				fmt.Printf("sysname: %s\n", runtime.GOOS)
-				fmt.Printf("machine: %s\n", runtime.GOARCH)
+				fmt.Fprintf(w, "sysname: %s\n", runtime.GOOS)
+				fmt.Fprintf(w, "machine: %s\n", runtime.GOARCH)
 			}
 			once.Do(printHeader)
 		}
@@ -147,6 +147,17 @@ func BenchmarkChat(fOpt flagOptions) error {
 		return err
 	}

+	var out io.Writer = os.Stdout // metrics go to stdout unless an output file is given
+	if fOpt.outputFile != nil && *fOpt.outputFile != "" {
+		f, err := os.OpenFile(*fOpt.outputFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o644) // truncate: no stale bytes from an older file
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "ERROR: cannot open output file %s: %v\n", *fOpt.outputFile, err)
+			return err
+		}
+		defer f.Close()
+		out = f
+	}
+
 	for _, model := range models {
 		for range *fOpt.epochs {
 			options := make(map[string]interface{})
@@ -241,13 +252,14 @@ func BenchmarkChat(fOpt flagOptions) error {
 				},
 			}

-			OutputMetrics(os.Stdout, *fOpt.format, metrics, *fOpt.verbose)
+			OutputMetrics(out, *fOpt.format, metrics, *fOpt.verbose)

 			if *fOpt.keepAlive > 0 {
 				time.Sleep(time.Duration(*fOpt.keepAlive*float64(time.Second)) + 200*time.Millisecond)
 			}
 		}
 	}
+
 	return nil
 }

--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -45,6 +45,7 @@ import (
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/types/syncmap"
 	"github.com/ollama/ollama/version"
+	xcmd "github.com/ollama/ollama/x/cmd"
 )

 const ConnectInstructions = "To sign in, navigate to:\n    %s\n\n"
@@ -517,6 +518,10 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		return generateEmbedding(cmd, name, opts.Prompt, opts.KeepAlive, truncate, dimensions)
 	}

+	// Check for experimental flag
+	isExperimental, _ := cmd.Flags().GetBool("experimental")
+	yoloMode, _ := cmd.Flags().GetBool("yolo")
+
 	if interactive {
 		if err := loadOrUnloadModel(cmd, &opts); err != nil {
 			var sErr api.AuthorizationError
@@ -543,6 +548,11 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 			}
 		}

+		// Use experimental agent loop with tools
+		if isExperimental {
+			return xcmd.GenerateInteractive(cmd, opts.Model, opts.WordWrap, opts.Options, opts.Think, opts.HideThinking, opts.KeepAlive, yoloMode)
+		}
+
 		return generateInteractive(cmd, opts)
 	}
 	return generate(cmd, opts)
@@ -943,6 +953,9 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 			rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
 		}
 		rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel})
+		if resp.Requires != "" {
+			rows = append(rows, []string{"", "requires", resp.Requires})
+		}
 		return
 	})

@@ -1430,7 +1443,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		latest.Summary()
 	}

-	return &api.Message{Role: role, Content: fullResponse.String()}, nil
+	return &api.Message{Role: role, Thinking: thinkingContent.String(), Content: fullResponse.String()}, nil
 }

 func generate(cmd *cobra.Command, opts runOptions) error {
@@ -1751,6 +1764,8 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("hidethinking", false, "Hide thinking output (if provided)")
 	runCmd.Flags().Bool("truncate", false, "For embedding models: truncate inputs exceeding context length (default: true). Set --truncate=false to error instead")
 	runCmd.Flags().Int("dimensions", 0, "Truncate output embeddings to specified dimension (embedding models only)")
+	runCmd.Flags().Bool("experimental", false, "Enable experimental agent loop with tools")
+	runCmd.Flags().BoolP("yolo", "y", false, "Skip all tool approval prompts (use with caution)")

 	stopCmd := &cobra.Command{
 		Use:     "stop MODEL",
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -291,6 +291,31 @@ Weigh anchor!
 			t.Errorf("unexpected output (-want +got):\n%s", diff)
 		}
 	})
+
+	t.Run("min version", func(t *testing.T) {
+		var b bytes.Buffer
+		if err := showInfo(&api.ShowResponse{
+			Details: api.ModelDetails{
+				Family:            "test",
+				ParameterSize:     "7B",
+				QuantizationLevel: "FP16",
+			},
+			Requires: "0.14.0",
+		}, false, &b); err != nil {
+			t.Fatal(err)
+		}
+
+		expect := `  Model
+    architecture    test      
+    parameters      7B        
+    quantization    FP16      
+    requires        0.14.0    
+
+`
+		if diff := cmp.Diff(expect, b.String()); diff != "" {
+			t.Errorf("unexpected output (-want +got):\n%s", diff)
+		}
+	})
 }

 func TestDeleteHandler(t *testing.T) {
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -40,6 +40,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /bye            Exit")
 		fmt.Fprintln(os.Stderr, "  /?, /help       Help for a command")
 		fmt.Fprintln(os.Stderr, "  /? shortcuts    Help for keyboard shortcuts")
+
 		fmt.Fprintln(os.Stderr, "")
 		fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")

--- a/convert/convert.go
+++ b/convert/convert.go
@@ -6,11 +6,14 @@ import (
 	"errors"
 	"fmt"
 	"io/fs"
+	"iter"
 	"log/slog"
+	"maps"
 	"os"
 	"slices"
 	"strings"

+	ofs "github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/fs/ggml"
 )

@@ -18,8 +21,13 @@ type ModelParameters struct {
 	Architectures []string `json:"architectures"`
 	VocabSize     uint32   `json:"vocab_size"`

+	// TODO is this needed?
+	ModelType string `json:"model_type"`
+
 	TextModel struct {
-		VocabSize uint32 `json:"vocab_size"`
+		VocabSize  uint32 `json:"vocab_size"`
+		HiddenSize uint32 `json:"hidden_size"`
+		ModelType  string `json:"model_type"`
 	} `json:"text_config"`
 }

@@ -33,8 +41,94 @@ type AdapterParameters struct {
 	} `json:"lora_parameters"`
 }

-func (ModelParameters) KV(t *Tokenizer) ggml.KV {
-	kv := ggml.KV{
+// KV is the flat key/value metadata produced by a converter.
+//
+// The typed accessors below accept short keys: any key that does not start
+// with "tokenizer." or "general." is looked up as "<architecture>.<key>",
+// where <architecture> is the value returned by Architecture.
+type KV map[string]any
+
+// Architecture returns the "general.architecture" entry, or "unknown" when the
+// entry is missing or is not a string.
+func (kv KV) Architecture() string {
+	return kv.String("general.architecture", "unknown")
+}
+
+// valueTypes lists the scalar types a KV entry can be read as.
+type valueTypes interface {
+	uint8 | int8 | uint16 | int16 |
+		uint32 | int32 | uint64 | int64 |
+		string | float32 | float64 | bool
+}
+
+// arrayValueTypes lists the slice types a KV entry can be read as.
+type arrayValueTypes interface {
+	[]uint8 | []int8 | []uint16 | []int16 |
+		[]uint32 | []int32 | []uint64 | []int64 |
+		[]string | []float32 | []float64 | []bool
+}
+
+// keyValue looks up key (architecture-prefixed as described on KV) and returns
+// its value together with true when the stored value has exactly type T.
+// Otherwise it returns the first defaultValue and false; when no default is
+// supplied it returns the zero value of T instead of panicking.
+func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
+	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
+		key = kv.Architecture() + "." + key
+	}
+
+	if val, ok := kv[key].(T); ok {
+		return val, true
+	}
+	if len(defaultValue) == 0 {
+		// Callers in this file always append a fallback, but the variadic
+		// signature allows none; do not index an empty slice.
+		var zero T
+		return zero, false
+	}
+	return defaultValue[0], false
+}
+
+// String returns the string stored under key, or the first defaultValue
+// (falling back to "") when the key is missing or holds another type.
+func (kv KV) String(key string, defaultValue ...string) string {
+	val, _ := keyValue(kv, key, append(defaultValue, "")...)
+	return val
+}
+
+// Uint returns the uint32 stored under key, or the first defaultValue
+// (falling back to 0) when the key is missing or holds another type.
+func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
+	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
+	return val
+}
+
+// Float returns the float32 stored under key, or the first defaultValue
+// (falling back to 0) when the key is missing or holds another type.
+func (kv KV) Float(key string, defaultValue ...float32) float32 {
+	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
+	return val
+}
+
+// Bool returns the bool stored under key, or the first defaultValue
+// (falling back to false) when the key is missing or holds another type.
+func (kv KV) Bool(key string, defaultValue ...bool) bool {
+	val, _ := keyValue(kv, key, append(defaultValue, false)...)
+	return val
+}
+
+// Strings returns the []string stored under key, or the first defaultValue
+// when the key is missing or holds another type. With no default the fallback
+// is a one-element slice holding "", not an empty slice.
+func (kv KV) Strings(key string, defaultValue ...[]string) []string {
+	val, _ := keyValue(kv, key, append(defaultValue, []string{""})...)
+	return val
+}
+
+// Ints returns the []int32 stored under key, or the first defaultValue when
+// the key is missing or holds another type. With no default the fallback is
+// []int32{0}.
+func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
+	val, _ := keyValue(kv, key, append(defaultValue, []int32{0})...)
+	return val
+}
+
+// Uints returns the []uint32 stored under key, or the first defaultValue when
+// the key is missing or holds another type. With no default the fallback is
+// []uint32{0}.
+func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
+	val, _ := keyValue(kv, key, append(defaultValue, []uint32{0})...)
+	return val
+}
+
+// Floats returns the []float32 stored under key, or the first defaultValue
+// when the key is missing or holds another type. With no default the fallback
+// is []float32{0}.
+func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
+	val, _ := keyValue(kv, key, append(defaultValue, []float32{0})...)
+	return val
+}
+
+// Bools returns the []bool stored under key, or the first defaultValue when
+// the key is missing or holds another type. With no default the fallback is
+// []bool{false}.
+func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
+	val, _ := keyValue(kv, key, append(defaultValue, []bool{false})...)
+	return val
+}
+
+// Len returns the number of entries.
+func (kv KV) Len() int {
+	return len(kv)
+}
+
+// Keys returns an iterator over the stored keys, in map (unspecified) order.
+func (kv KV) Keys() iter.Seq[string] {
+	return maps.Keys(kv)
+}
+
+// Value returns the raw entry stored under key, or nil when it is absent.
+// Unlike the typed accessors, key is used verbatim: no architecture prefix is
+// added.
+func (kv KV) Value(key string) any {
+	return kv[key]
+}
+
+func (ModelParameters) KV(t *Tokenizer) KV {
+	kv := KV{
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
 		"tokenizer.ggml.pre":           t.Pre,
@@ -63,7 +157,7 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (p AdapterParameters) KV() ggml.KV {
+func (p AdapterParameters) KV() KV {
 	var alpha float32
 	if p.LoraParameters.Alpha == 0 {
 		alpha = float32(p.Alpha)
@@ -71,7 +165,7 @@ func (p AdapterParameters) KV() ggml.KV {
 		alpha = p.LoraParameters.Alpha
 	}

-	kv := ggml.KV{
+	kv := KV{
 		"adapter.lora.alpha": alpha,
 		"adapter.type":       "lora",
 		"general.file_type":  uint32(1),
@@ -88,9 +182,14 @@ func (ModelParameters) specialTokenTypes() []string {
 	}
 }

-type ModelConverter interface {
+type ModelKV interface {
 	// KV maps parameters to LLM key-values
-	KV(*Tokenizer) ggml.KV
+	KV(*Tokenizer) KV
+}
+
+type ModelConverter interface {
+	ModelKV
+
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
 	Tensors([]Tensor) []*ggml.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
@@ -107,7 +206,7 @@ type moreParser interface {

 type AdapterConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(ggml.KV) ggml.KV
+	KV(ofs.Config) KV
 	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
 	Tensors([]Tensor) []*ggml.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
@@ -115,7 +214,7 @@ type AdapterConverter interface {
 	Replacements() []string
 }

-func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
+func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ofs.Config) error {
 	bts, err := fs.ReadFile(fsys, "adapter_config.json")
 	if err != nil {
 		return err
@@ -126,8 +225,8 @@ func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
 		return err
 	}

-	arch, ok := baseKV["general.architecture"]
-	if !ok {
+	arch := baseKV.Architecture()
+	if arch == "" {
 		return errors.New("architecture not set for the base model")
 	}

@@ -153,23 +252,19 @@ func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
 	return writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
 }

-// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
-// and files it finds in the input path.
-// Supported input model formats include safetensors.
-// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func ConvertModel(fsys fs.FS, f *os.File) error {
+func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
 	bts, err := fs.ReadFile(fsys, "config.json")
 	if err != nil {
-		return err
+		return nil, nil, err
 	}

 	var p ModelParameters
 	if err := json.Unmarshal(bts, &p); err != nil {
-		return err
+		return nil, nil, err
 	}

 	if len(p.Architectures) < 1 {
-		return errors.New("unknown architecture")
+		return nil, nil, errors.New("unknown architecture")
 	}

 	var conv ModelConverter
@@ -182,6 +277,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &llama4Model{}
 	case "Mistral3ForConditionalGeneration":
 		conv = &mistral3Model{}
+	case "Ministral3ForCausalLM":
+		conv = &mistral3CausalModel{}
 	case "MixtralForCausalLM":
 		conv = &mixtralModel{}
 	case "GemmaForCausalLM":
@@ -200,29 +297,37 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &qwen25VLModel{}
 	case "Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration":
 		conv = &qwen3VLModel{}
+	case "Olmo3ForCausalLM":
+		conv = &olmoModel{}
 	case "BertModel":
 		conv = &bertModel{}
+	case "NomicBertModel", "NomicBertMoEModel":
+		conv = &nomicbertModel{}
 	case "CohereForCausalLM":
 		conv = &commandrModel{}
 	case "GptOssForCausalLM":
 		conv = &gptossModel{}
+	case "DeepseekOCRForCausalLM":
+		conv = &deepseekocr{}
+	case "DeepseekV3ForCausalLM":
+		conv = &deepseek2Model{}
 	default:
-		return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
+		return nil, nil, fmt.Errorf("unsupported architecture %q", p.Architectures[0])
 	}

 	if err := json.Unmarshal(bts, conv); err != nil {
-		return err
+		return nil, nil, err
 	}

 	if t, ok := conv.(moreParser); ok {
 		if err := t.parseMore(fsys); err != nil {
-			return err
+			return nil, nil, err
 		}
 	}

 	t, err := parseTokenizer(fsys, conv.specialTokenTypes())
 	if err != nil {
-		return err
+		return nil, nil, err
 	}

 	vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
@@ -244,6 +349,19 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 	default:
 		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 	}
+	return conv, t, nil
+}
+
+// ConvertModel writes an Ollama compatible model to the provided file based on the
+// configuration and files it finds in the input filesystem.
+// Supported input model formats include safetensors.
+// Supported input tokenizer files include tokenizer.json (preferred) and tokenizer.model.
+func ConvertModel(fsys fs.FS, f *os.File) error {
+	kv, t, err := LoadModelMetadata(fsys)
+	if err != nil {
+		return err
+	}
+	conv := kv.(ModelConverter)

 	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
 	if err != nil {
@@ -253,7 +371,7 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 	return writeFile(f, conv.KV(t), conv.Tensors(ts))
 }

-func writeFile(f *os.File, kv ggml.KV, ts []*ggml.Tensor) error {
+func writeFile(f *os.File, kv KV, ts []*ggml.Tensor) error {
 	for i := range ts {
 		ts[i].Shape = slices.Clone(ts[i].Shape)
 		slices.Reverse(ts[i].Shape)
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@@ -88,7 +88,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
 	return nil
 }

-func (p *bertModel) KV(t *Tokenizer) ggml.KV {
+func (p *bertModel) KV(t *Tokenizer) KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "bert"
 	kv["bert.attention.causal"] = false
--- a/convert/convert_commandr.go
+++ b/convert/convert_commandr.go
@@ -24,7 +24,7 @@ type commandrModel struct {

 var _ ModelConverter = (*commandrModel)(nil)

-func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
+func (p *commandrModel) KV(t *Tokenizer) KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "command-r"
 	kv["general.name"] = "command-r"
--- a/convert/convert_deepseek2.go
+++ b/convert/convert_deepseek2.go
@@ -0,0 +1,173 @@
+package convert
+
+import (
+	"cmp"
+	"fmt"
+	"log/slog"
+	"regexp"
+	"strconv"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+// deepseek2Model holds the config.json fields needed to emit "deepseek2"
+// metadata and tensors. Fields are filled by JSON decoding using the tags
+// below; the embedded ModelParameters supplies the shared fields.
+type deepseek2Model struct {
+	ModelParameters               // architectures, vocab_size
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	HiddenSize            uint32  `json:"hidden_size"`
+	HiddenLayers          uint32  `json:"num_hidden_layers"`
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	NumAttentionHeads     uint32  `json:"num_attention_heads"`
+	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
+	RMSNormEPS            float32 `json:"rms_norm_eps"`
+
+	// Attention and RoPE dimensions; KV writes these to the
+	// deepseek2.attention.* and deepseek2.rope.* keys.
+	RopeTheta     float32 `json:"rope_theta"`
+	QKNopeHeadDim uint32  `json:"qk_nope_head_dim"`
+	QKRopeHeadDim uint32  `json:"qk_rope_head_dim"`
+	KVLoraRank    uint32  `json:"kv_lora_rank"`
+	QLoraRank     uint32  `json:"q_lora_rank"`
+	VHeadDim      uint32  `json:"v_head_dim"`
+
+	// Expert settings; KV writes these to the deepseek2.expert_* keys.
+	ExpertCount            uint32  `json:"n_routed_experts"`
+	ExpertSharedCount      uint32  `json:"n_shared_experts"`
+	ExpertIntermediateSize uint32  `json:"moe_intermediate_size"`
+	ExpertUsedCount        uint32  `json:"num_experts_per_tok"`
+	ExpertWeightsNorm      bool    `json:"norm_topk_prob"`
+	ExpertWeightsScale     float32 `json:"routed_scaling_factor"`
+
+	// ScoringFunc is translated by KV into a numeric gating id
+	// ("softmax" -> 1, "sigmoid" -> 2, anything else -> 0).
+	ScoringFunc            string `json:"scoring_func"`
+	LeadingDenseBlockCount uint32 `json:"first_k_dense_replace"`
+
+	// RopeScaling feeds the deepseek2.rope.scaling.* keys; MScaleAllDim is
+	// multiplied by 0.1 to form yarn_log_multiplier.
+	RopeScaling struct {
+		Factor                        float32 `json:"factor"`
+		OriginalMaxPositionEmbeddings uint32  `json:"original_max_position_embeddings"`
+		Type                          string  `json:"type"`
+		MScaleAllDim                  float32 `json:"mscale_all_dim"`
+	} `json:"rope_scaling"`
+
+	// NOTE(review): Architecture has no json tag and is not read by any
+	// method in this file — confirm whether it is still needed.
+	Architecture string
+}
+
+// KV builds the metadata for a "deepseek2" model: it starts from the shared
+// ModelParameters key/values and adds the deepseek2.* entries derived from
+// the parsed configuration. The tokenizer pre-tokenizer is forced to
+// "deepseek-v3".
+func (p *deepseek2Model) KV(t *Tokenizer) KV {
+	meta := p.ModelParameters.KV(t)
+
+	// identity
+	meta["general.architecture"] = "deepseek2"
+	meta["general.type"] = "model"
+	meta["tokenizer.ggml.pre"] = "deepseek-v3"
+
+	// overall shape
+	meta["deepseek2.block_count"] = p.HiddenLayers
+	meta["deepseek2.context_length"] = p.MaxPositionEmbeddings
+	meta["deepseek2.embedding_length"] = p.HiddenSize
+	meta["deepseek2.feed_forward_length"] = p.IntermediateSize
+	meta["deepseek2.leading_dense_block_count"] = p.LeadingDenseBlockCount
+
+	// attention
+	meta["deepseek2.attention.head_count"] = p.NumAttentionHeads
+	meta["deepseek2.attention.head_count_kv"] = p.NumKeyValueHeads
+	meta["deepseek2.attention.key_length"] = p.QKNopeHeadDim + p.QKRopeHeadDim
+	meta["deepseek2.attention.value_length"] = p.VHeadDim
+	meta["deepseek2.attention.kv_lora_rank"] = p.KVLoraRank
+	meta["deepseek2.attention.q_lora_rank"] = p.QLoraRank
+	meta["deepseek2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
+
+	// experts
+	//
+	// Gating function id: softmax (1) is not currently supported in the
+	// model, but needed for Deepseek-OCR; sigmoid is 2; any other scoring
+	// function name maps to 0.
+	gatingFunc := map[string]uint32{"softmax": 1, "sigmoid": 2}[p.ScoringFunc]
+	meta["deepseek2.expert_count"] = p.ExpertCount
+	meta["deepseek2.expert_shared_count"] = p.ExpertSharedCount
+	meta["deepseek2.expert_used_count"] = p.ExpertUsedCount
+	meta["deepseek2.expert_feed_forward_length"] = p.ExpertIntermediateSize
+	meta["deepseek2.expert_gating_func"] = gatingFunc
+	meta["deepseek2.expert_weights_norm"] = p.ExpertWeightsNorm
+	meta["deepseek2.expert_weights_scale"] = p.ExpertWeightsScale
+
+	// rotary embeddings
+	meta["deepseek2.rope.dimension_count"] = p.QKRopeHeadDim
+	meta["deepseek2.rope.freq_base"] = cmp.Or(p.RopeTheta, 10000.0)
+	meta["deepseek2.rope.scaling.type"] = p.RopeScaling.Type
+	meta["deepseek2.rope.scaling.factor"] = p.RopeScaling.Factor
+	meta["deepseek2.rope.scaling.original_context_length"] = p.RopeScaling.OriginalMaxPositionEmbeddings
+	meta["deepseek2.rope.scaling.yarn_log_multiplier"] = 0.1 * p.RopeScaling.MScaleAllDim
+
+	return meta
+}
+
+// Replacements returns a flat list of old/new substring pairs
+// (old0, new0, old1, new1, ...) used to rename source tensor names to their
+// converted names.
+//
+// Order matters when the list is handed to strings.NewReplacer: where several
+// old strings match at the same position, the one listed first wins, so
+// longer, more specific names must come before their prefixes.
+func (p *deepseek2Model) Replacements() []string {
+	return []string{
+		"lm_head", "output",
+		"model.embed_tokens", "token_embd",
+		"model.norm", "output_norm",
+		// strip the prefix entirely
+		"language_model.", "",
+		"model.layers", "blk",
+		"input_layernorm", "attn_norm",
+		"self_attn.kv_a_proj_with_mqa", "attn_kv_a_mqa",
+		"self_attn.kv_a_layernorm", "attn_kv_a_norm",
+		"self_attn.kv_b_proj", "attn_kv_b",
+		"self_attn.q_a_proj", "attn_q_a",
+		"self_attn.q_a_layernorm", "attn_q_a_norm",
+		"self_attn.q_b_proj", "attn_q_b",
+		"self_attn.o_proj", "attn_output",
+		"post_attention_layernorm", "ffn_norm",
+		"mlp.shared_experts.down_proj", "ffn_down_shexp",
+		"mlp.shared_experts.gate_proj", "ffn_gate_shexp",
+		"mlp.shared_experts.up_proj", "ffn_up_shexp",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+		// both "mlp.gate_proj" above and the bias entry below share the
+		// "mlp.gate" prefix and therefore have to precede it
+		"mlp.gate.e_score_correction_bias", "exp_probs_b.bias",
+		"mlp.gate", "ffn_gate_inp",
+	}
+}
+
+// Tensors maps the (already renamed) input tensors to output tensors.
+//
+// For every block index below HiddenLayers it registers three merge rules that
+// collect the per-expert gate/up/down projection weights
+// ("blk.N.mlp.experts.*.<proj>.weight") under a single
+// "blk.N.ffn_<proj>_exps.weight" name; the merging itself is done by
+// mergeTensors. Tensors not consumed by a merge are passed through unchanged,
+// except those whose name starts with "blk.<n>" where n >= HiddenLayers, which
+// are dropped.
+func (p *deepseek2Model) Tensors(s []Tensor) (out []*ggml.Tensor) {
+	merges := make([]merge, p.HiddenLayers*3)
+	for i := range p.HiddenLayers {
+		merges[i*3+0] = merge{
+			fmt.Sprintf("blk.%d.mlp.experts.*.gate_proj.weight", i),
+			fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
+		}
+		merges[i*3+1] = merge{
+			fmt.Sprintf("blk.%d.mlp.experts.*.up_proj.weight", i),
+			fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
+		}
+		merges[i*3+2] = merge{
+			fmt.Sprintf("blk.%d.mlp.experts.*.down_proj.weight", i),
+			fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
+		}
+	}
+
+	// Compiled once per call; skipLayer runs for every remaining tensor, so
+	// compiling inside the closure would repeat the work per tensor.
+	blkRe := regexp.MustCompile(`^blk\.(\d+)`)
+
+	// skipLayer reports whether n names a tensor in block minValue or later.
+	// Names that do not start with "blk.<number>" are never skipped.
+	skipLayer := func(n string, minValue uint32) bool {
+		matches := blkRe.FindStringSubmatch(n)
+		if matches == nil {
+			return false
+		}
+
+		blkNum, err := strconv.Atoi(matches[1])
+		if err != nil {
+			return false
+		}
+
+		return uint32(blkNum) >= minValue
+	}
+
+	out, s = mergeTensors(s, merges...)
+	for _, t := range s {
+		// skip any additional layers (such as the Multi-Token Prediction layer)
+		if skipLayer(t.Name(), p.HiddenLayers) {
+			slog.Debug("skipping layer", "name", t.Name())
+			continue
+		}
+		out = append(out, &ggml.Tensor{
+			Name:     t.Name(),
+			Kind:     t.Kind(),
+			Shape:    t.Shape(),
+			WriterTo: t,
+		})
+	}
+	return out
+}
--- a/convert/convert_deepseekocr.go
+++ b/convert/convert_deepseekocr.go
@@ -0,0 +1,136 @@
+package convert
+
+import (
+	"fmt"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+// deepseekocr holds the config.json fields needed to emit "deepseekocr"
+// metadata and tensors. Fields are filled by JSON decoding using the tags
+// below; the embedded ModelParameters supplies the shared fields.
+type deepseekocr struct {
+	ModelParameters
+	// LanguageConfig is the text model configuration ("language_config").
+	LanguageConfig struct {
+		MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
+		HiddenSize            uint32 `json:"hidden_size"`
+		HiddenLayers          uint32 `json:"num_hidden_layers"`
+		IntermediateSize      uint32 `json:"intermediate_size"`
+		NumAttentionHeads     uint32 `json:"num_attention_heads"`
+		NumKeyValueHeads      uint32 `json:"num_key_value_heads"`
+		NumRoutedExperts      uint32 `json:"n_routed_experts"`
+		NumSharedExperts      uint32 `json:"n_shared_experts"`
+		NumExpertsPerToken    uint32 `json:"num_experts_per_tok"`
+		FirstKDenseReplace    uint32 `json:"first_k_dense_replace"`
+	} `json:"language_config"`
+
+	// VisionConfig is the image side configuration ("vision_config").
+	// Width carries no json tag, so encoding/json matches it by field name
+	// (case-insensitively). It holds one entry per encoder.
+	VisionConfig struct {
+		ImageSize uint32 `json:"image_size"`
+		Width     struct {
+			// Read from "clip-l-14-224"; exported by KV as vision.* keys.
+			Vision struct {
+				Heads     uint32 `json:"heads"`
+				ImageSize uint32 `json:"image_size"`
+				Layers    uint32 `json:"layers"`
+				PatchSize uint32 `json:"patch_size"`
+				Width     uint32 `json:"width"`
+			} `json:"clip-l-14-224"`
+			// Read from "sam_vit_b"; exported by KV as sam.* keys.
+			Sam struct {
+				GlobalAttentionIndexes []int32 `json:"global_attn_indexes"`
+				Heads                  uint32  `json:"heads"`
+				Layers                 uint32  `json:"layers"`
+				Width                  uint32  `json:"width"`
+			} `json:"sam_vit_b"`
+		}
+	} `json:"vision_config"`
+}
+
+// KV builds the metadata for a "deepseekocr" model: the shared
+// ModelParameters key/values plus the text, vision and sam settings taken
+// from the parsed configuration.
+//
+// NOTE(review): apart from general.architecture the keys are written without
+// an architecture prefix; presumably it is added when the file is written —
+// confirm against the writer.
+func (m *deepseekocr) KV(t *Tokenizer) KV {
+	meta := m.ModelParameters.KV(t)
+	meta["general.architecture"] = "deepseekocr"
+
+	// text model
+	text := &m.LanguageConfig
+	meta["block_count"] = text.HiddenLayers
+	meta["context_length"] = text.MaxPositionEmbeddings
+	meta["embedding_length"] = text.HiddenSize
+	meta["feed_forward_length"] = text.IntermediateSize
+	meta["attention.head_count"] = text.NumAttentionHeads
+	meta["attention.head_count_kv"] = text.NumKeyValueHeads
+	meta["expert_count"] = text.NumRoutedExperts
+	meta["expert_used_count"] = text.NumExpertsPerToken
+	meta["leading_dense_block_count"] = text.FirstKDenseReplace
+
+	// vision encoder
+	vision := &m.VisionConfig.Width.Vision
+	meta["vision.block_count"] = vision.Layers
+	meta["vision.embedding_length"] = vision.Width
+	meta["vision.head_count"] = vision.Heads
+	meta["vision.image_size"] = vision.ImageSize
+	meta["vision.patch_size"] = vision.PatchSize
+
+	// sam encoder
+	sam := &m.VisionConfig.Width.Sam
+	meta["sam.block_count"] = sam.Layers
+	meta["sam.embedding_length"] = sam.Width
+	meta["sam.head_count"] = sam.Heads
+	meta["sam.global_attention_indexes"] = sam.GlobalAttentionIndexes
+
+	return meta
+}
+
+// Tensors maps the (already renamed) input tensors to output tensors. Each
+// text block contributes three merge rules that collect the per-expert
+// gate/up/down projection weights under one "blk.N.ffn_<proj>_exps.weight"
+// name; every tensor left over after mergeTensors is passed through as is.
+func (m *deepseekocr) Tensors(s []Tensor) (out []*ggml.Tensor) {
+	layers := m.LanguageConfig.HiddenLayers
+
+	// three rules (gate, up, down) per block, in block order
+	rules := make([]merge, 0, layers*3)
+	for blk := range layers {
+		rules = append(rules,
+			merge{
+				fmt.Sprintf("blk.%d.mlp.experts.*.gate_proj.weight", blk),
+				fmt.Sprintf("blk.%d.ffn_gate_exps.weight", blk),
+			},
+			merge{
+				fmt.Sprintf("blk.%d.mlp.experts.*.up_proj.weight", blk),
+				fmt.Sprintf("blk.%d.ffn_up_exps.weight", blk),
+			},
+			merge{
+				fmt.Sprintf("blk.%d.mlp.experts.*.down_proj.weight", blk),
+				fmt.Sprintf("blk.%d.ffn_down_exps.weight", blk),
+			},
+		)
+	}
+
+	merged, rest := mergeTensors(s, rules...)
+	out = merged
+	for _, src := range rest {
+		out = append(out, &ggml.Tensor{
+			Name:     src.Name(),
+			Kind:     src.Kind(),
+			Shape:    src.Shape(),
+			WriterTo: src,
+		})
+	}
+	return out
+}
+
+// Replacements returns a flat list of old/new substring pairs
+// (old0, new0, old1, new1, ...) used to rename source tensor names to their
+// converted names. The groups below cover the text model, the vision model,
+// the projector and the sam model.
+//
+// Order matters when the list is handed to strings.NewReplacer: where several
+// old strings match at the same position, the one listed first wins.
+func (m *deepseekocr) Replacements() []string {
+	return []string{
+		"model.embed_tokens", "token_embd",
+		"model.layers", "blk",
+		"input_layernorm", "attn_norm",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"post_attention_layernorm", "ffn_norm",
+		// "mlp.gate_proj" shares the "mlp.gate" prefix and has to precede it
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.up_proj", "ffn_up",
+		"mlp.down_proj", "ffn_down",
+		"mlp.gate", "ffn_gate_inp",
+		"mlp.shared_experts.gate_proj", "ffn_gate_shexp",
+		"mlp.shared_experts.up_proj", "ffn_up_shexp",
+		"mlp.shared_experts.down_proj", "ffn_down_shexp",
+		"model.norm", "output_norm",
+		"lm_head", "output",
+
+		"model.vision_model", "v",
+		"embeddings.patch_embedding", "patch_embd",
+		"embeddings.class_embedding", "class_embd",
+		"embeddings.position_embedding", "position_embd",
+		"transformer.layers", "blk",
+
+		"model.projector", "mm",
+		"model.image_newline", "mm.image_newline",
+		//nolint:misspell // this misspelling is upstream. fixing it breaks the model
+		"model.view_seperator", "mm.view_seperator",
+
+		"model.sam_model.patch_embed.proj", "s.patch_embd",
+		"model.sam_model.pos_embed", "s.position_embd",
+		"model.sam_model.blocks", "s.blk",
+		"model.sam_model.neck", "s.neck",
+		"model.sam_model.net_", "s.net_",
+	}
+}
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@@ -23,7 +23,7 @@ type gemmaModel struct {

 var _ ModelConverter = (*gemmaModel)(nil)

-func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
+func (p *gemmaModel) KV(t *Tokenizer) KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma"
 	kv["gemma.context_length"] = p.MaxPositionEmbeddings
--- a/convert/convert_gemma2.go
+++ b/convert/convert_gemma2.go
@@ -1,7 +1,5 @@
 package convert

-import "github.com/ollama/ollama/fs/ggml"
-
 type gemma2Model struct {
 	gemmaModel
 	SlidingWindow         uint32  `json:"sliding_window"`
@@ -9,7 +7,7 @@ type gemma2Model struct {
 	FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
 }

-func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
+func (p *gemma2Model) KV(t *Tokenizer) KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma2"
 	kv["gemma2.context_length"] = p.MaxPositionEmbeddings
--- a/convert/convert_gemma2_adapter.go
+++ b/convert/convert_gemma2_adapter.go
@@ -6,6 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/fs/ggml"
 )

@@ -15,7 +16,7 @@ type gemma2Adapter struct {

 var _ AdapterConverter = (*gemma2Adapter)(nil)

-func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
+func (p *gemma2Adapter) KV(baseKV fs.Config) KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "gemma2"
 	return kv
--- a/convert/convert_gemma3.go
+++ b/convert/convert_gemma3.go
@@ -2,8 +2,7 @@ package convert

 import (
 	"cmp"
-
-	"github.com/ollama/ollama/fs/ggml"
+	"slices"
 )

 type gemma3Model struct {
@@ -26,16 +25,26 @@ type gemma3Model struct {
 		NumChannels       uint32  `json:"num_channels"`        // num_channels 3
 		PatchSize         uint32  `json:"patch_size"`          // patch_size 14
 	} `json:"vision_config"`
-	MaxPositionEmbeddings    uint32  `json:"max_position_embeddings"`
-	NumAttentionHeads        uint32  `json:"num_attention_heads"`
-	NumKeyValueHeads         uint32  `json:"num_key_value_heads"`
-	RMSNormEPS               float32 `json:"rms_norm_eps"`
-	HeadDim                  uint32  `json:"head_dim"`
-	FinalLogitSoftcap        float32 `json:"final_logit_softcapping"`
-	RopeLocalTheta           float32 `json:"rope_local_base_freq"`
-	RopeGlobalTheta          float32 `json:"rope_global_base_freq"`
-	SlidingWindow            uint32  `json:"sliding_window"`
-	MultiModalTokensPerImage uint32  `json:"mm_tokens_per_image"`
+	MaxPositionEmbeddings    uint32   `json:"max_position_embeddings"`
+	NumAttentionHeads        uint32   `json:"num_attention_heads"`
+	NumKeyValueHeads         uint32   `json:"num_key_value_heads"`
+	RMSNormEPS               float32  `json:"rms_norm_eps"`
+	HeadDim                  uint32   `json:"head_dim"`
+	FinalLogitSoftcap        float32  `json:"final_logit_softcapping"`
+	RopeLocalTheta           float32  `json:"rope_local_base_freq"`
+	RopeTheta                float32  `json:"rope_theta"`
+	SlidingWindow            uint32   `json:"sliding_window"`
+	SlidingWindowPattern     *uint32  `json:"sliding_window_pattern"`
+	LayerTypes               []string `json:"layer_types"`
+	MultiModalTokensPerImage uint32   `json:"mm_tokens_per_image"`
+	RopeScaling              *struct {
+		Type                          string  `json:"rope_type"`
+		Factor                        float32 `json:"factor"`
+		OriginalMaxPositionEmbeddings uint32  `json:"original_max_position_embeddings"`
+		ExtrapolationFactor           float32 `json:"extrapolation_factor"`
+		BetaFast                      float32 `json:"beta_fast"`
+		BetaSlow                      float32 `json:"beta_slow"`
+	} `json:"rope_scaling"`
 }

 const (
@@ -44,7 +53,7 @@ const (
 	gemma27BLayerCount = 62
 )

-func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
+func (p *gemma3Model) KV(t *Tokenizer) KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma3"

@@ -81,9 +90,38 @@ func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
 		kv["gemma3.attention.key_length"] = p.HeadDim
 		kv["gemma3.attention.value_length"] = p.HeadDim
 		kv["gemma3.attention.sliding_window"] = p.SlidingWindow
-		kv["gemma3.final_logit_softcapping"] = cmp.Or(p.FinalLogitSoftcap, 30)
+
+		// The sliding window pattern is either provided as the sliding_window_pattern
+		// key (an int) or as the layer_types key (a list of strings).
+		if p.SlidingWindowPattern != nil || len(p.LayerTypes) > 0 {
+			kv["gemma3.attention.sliding_window_pattern"] = slices.Collect(func(yield func(bool) bool) {
+				for i := range numBlocks {
+					var isLocal bool
+					if len(p.LayerTypes) > 0 && int(i) < len(p.LayerTypes) {
+						isLocal = p.LayerTypes[i] == "sliding_attention"
+					} else if p.SlidingWindowPattern != nil && *p.SlidingWindowPattern > 0 {
+						isLocal = (i+1)%*p.SlidingWindowPattern != 0
+					}
+					if !yield(isLocal) {
+						break
+					}
+				}
+			})
+		}
+		if p.FinalLogitSoftcap > 0 {
+			kv["gemma3.final_logit_softcapping"] = p.FinalLogitSoftcap
+		}
 		kv["gemma3.rope.local.freq_base"] = cmp.Or(p.RopeLocalTheta, 10000.0)
-		kv["gemma3.rope.global.freq_base"] = cmp.Or(p.RopeGlobalTheta, 1000000.0)
+		kv["gemma3.rope.freq_base"] = cmp.Or(p.RopeTheta, 1000000.0)
+		if p.RopeScaling != nil && p.RopeScaling.Type == "yarn" && p.RopeScaling.Factor > 0 {
+			kv["gemma3.rope.scaling.type"] = "yarn"
+			kv["gemma3.rope.scaling.factor"] = p.RopeScaling.Factor
+			kv["gemma3.rope.scaling.original_context_length"] = p.RopeScaling.OriginalMaxPositionEmbeddings
+			kv["gemma3.rope.scaling.extrapolation_factor"] = cmp.Or(p.RopeScaling.ExtrapolationFactor, float32(1.0))
+			kv["gemma3.rope.scaling.beta_fast"] = cmp.Or(p.RopeScaling.BetaFast, float32(64.0))
+			kv["gemma3.rope.scaling.beta_slow"] = cmp.Or(p.RopeScaling.BetaSlow, float32(1.0))
+		}
+
 		kv["gemma3.embedding_length"] = p.HiddenSize
 		kv["gemma3.feed_forward_length"] = p.IntermediateSize
 	default:
--- a/convert/convert_gemma3n.go
+++ b/convert/convert_gemma3n.go
@@ -38,7 +38,7 @@ type gemma3nModel struct {
 	VisionModel struct{} `json:"vision_config"`
 }

-func (m *gemma3nModel) KV(t *Tokenizer) ggml.KV {
+func (m *gemma3nModel) KV(t *Tokenizer) KV {
 	kv := m.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma3n"
 	kv["gemma3n.activation_sparsity_scale"] = slices.Collect(func(yield func(float32) bool) {
--- a/convert/convert_gptoss.go
+++ b/convert/convert_gptoss.go
@@ -37,7 +37,7 @@ type gptossModel struct {

 var _ ModelConverter = (*gptossModel)(nil)

-func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
+func (m *gptossModel) KV(t *Tokenizer) KV {
 	kv := m.ModelParameters.KV(t)
 	kv["general.architecture"] = "gptoss"
 	kv["general.file_type"] = uint32(4)
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -48,7 +48,7 @@ type llamaModel struct {

 var _ ModelConverter = (*llamaModel)(nil)

-func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
+func (p *llamaModel) KV(t *Tokenizer) KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama"
 	kv["llama.vocab_size"] = p.VocabSize
--- a/convert/convert_llama4.go
+++ b/convert/convert_llama4.go
@@ -35,7 +35,7 @@ type llama4Model struct {
 }

 // KV implements ModelConverter.
-func (p *llama4Model) KV(t *Tokenizer) ggml.KV {
+func (p *llama4Model) KV(t *Tokenizer) KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama4"

--- a/convert/convert_llama_adapter.go
+++ b/convert/convert_llama_adapter.go
@@ -7,6 +7,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/fs/ggml"
 )

@@ -18,13 +19,13 @@ type llamaAdapter struct {

 var _ AdapterConverter = (*llamaAdapter)(nil)

-func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
+func (p *llamaAdapter) KV(baseKV fs.Config) KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "llama"
-	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
-	kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"]
+	kv["llama.attention.head_count"] = baseKV.Value("llama.attention.head_count")
+	kv["llama.attention.head_count_kv"] = baseKV.Value("llama.attention.head_count_kv")

-	p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32)
+	p.NumAttentionHeads = baseKV.Value("llama.attention.head_count").(uint32)

 	return kv
 }
--- a/convert/convert_mistral.go
+++ b/convert/convert_mistral.go
@@ -29,6 +29,17 @@ type mistral3Model struct {
 		SlidingWindow         *uint32 `json:"sliding_window"`
 		HiddenAct             string  `json:"hidden_act"`
 		VocabSize             uint32  `json:"vocab_size"`
+		RopeParameters        struct {
+			BetaFast                  float32  `json:"beta_fast"`
+			BetaSlow                  float32  `json:"beta_slow"`
+			Factor                    float32  `json:"factor"`
+			Llama4ScalingBeta         *float32 `json:"llama_4_scaling_beta"`
+			OrigMaxPositionEmbeddings uint32   `json:"original_max_position_embeddings"`
+			RopeType                  string   `json:"rope_type"`
+			RopeTheta                 float32  `json:"rope_theta"`
+			Mscale                    *float32 `json:"mscale"`
+			MscaleAllDim              *float32 `json:"mscale_all_dim"`
+		} `json:"rope_parameters"`
 	} `json:"text_config"`
 	VisionModel struct {
 		NumAttentionHeads uint32  `json:"num_attention_heads"`
@@ -41,12 +52,15 @@ type mistral3Model struct {
 		HeadDim           uint32  `json:"head_dim"`
 		HiddenAct         string  `json:"hidden_act"`
 		RopeTheta         float32 `json:"rope_theta"`
+		RopeParameters    struct {
+			RopeTheta float32 `json:"rope_theta"`
+		} `json:"rope_parameters"`
 	} `json:"vision_config"`
 	MultiModalProjectorBias bool   `json:"multimodal_projector_bias"`
 	ProjectorHiddenAct      string `json:"projector_hidden_act"`
 }

-func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
+func (p *mistral3Model) KV(t *Tokenizer) KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "mistral3"
 	kv["mistral3.vocab_size"] = p.TextModel.VocabSize
@@ -61,8 +75,25 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	kv["mistral3.attention.layer_norm_rms_epsilon"] = p.TextModel.RMSNormEPS
 	kv["mistral3.attention.key_length"] = p.TextModel.HeadDim
 	kv["mistral3.attention.value_length"] = p.TextModel.HeadDim
-	kv["mistral3.rope.dimension_count"] = p.TextModel.HiddenSize / p.TextModel.NumHiddenLayers
-	kv["mistral3.rope.freq_base"] = p.TextModel.RopeTheta
+	kv["mistral3.rope.dimension_count"] = cmp.Or(p.TextModel.HeadDim, p.TextModel.HiddenSize/p.TextModel.NumAttentionHeads)
+	kv["mistral3.rope.freq_base"] = cmp.Or(p.TextModel.RopeTheta, p.TextModel.RopeParameters.RopeTheta)
+	kv["mistral3.rope.scaling.factor"] = p.TextModel.RopeParameters.Factor
+	kv["mistral3.rope.scaling.type"] = p.TextModel.RopeParameters.RopeType
+	kv["mistral3.rope.scaling.beta_fast"] = p.TextModel.RopeParameters.BetaFast
+	kv["mistral3.rope.scaling.beta_slow"] = p.TextModel.RopeParameters.BetaSlow
+
+	if p.TextModel.RopeParameters.Mscale != nil {
+		kv["mistral3.rope.scaling.mscale"] = *p.TextModel.RopeParameters.Mscale
+	}
+	if p.TextModel.RopeParameters.MscaleAllDim != nil {
+		kv["mistral3.rope.scaling.mscale_all_dim"] = *p.TextModel.RopeParameters.MscaleAllDim
+	}
+	if p.TextModel.RopeParameters.OrigMaxPositionEmbeddings > 0 {
+		kv["mistral3.rope.scaling.original_context_length"] = p.TextModel.RopeParameters.OrigMaxPositionEmbeddings
+	}
+	if p.TextModel.RopeParameters.Llama4ScalingBeta != nil {
+		kv["mistral3.rope.scaling_beta"] = *p.TextModel.RopeParameters.Llama4ScalingBeta
+	}

 	// Vision configuration
 	kv["mistral3.vision.block_count"] = p.VisionModel.NumHiddenLayers
@@ -74,7 +105,7 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	kv["mistral3.vision.patch_size"] = p.VisionModel.PatchSize
 	kv["mistral3.vision.num_channels"] = p.VisionModel.NumChannels
 	// kv["mistral3.vision.attention.layer_norm_epsilon"] = 1e-05 // Default value
-	kv["mistral3.vision.rope.freq_base"] = p.VisionModel.RopeTheta
+	kv["mistral3.vision.rope.freq_base"] = cmp.Or(p.VisionModel.RopeTheta, p.VisionModel.RopeParameters.RopeTheta)

 	// Multimodal configuration
 	kv["mistral3.image_token_index"] = p.ImageTokenIndex
--- a/convert/convert_mistral_causal.go
+++ b/convert/convert_mistral_causal.go
@@ -0,0 +1,181 @@
+package convert
+
+import (
+	"cmp"
+	"fmt"
+	"strings"
+
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+// mistral3CausalModel holds the configuration fields decoded from JSON for
+// the converter whose KV method writes them under "mistral3.*" keys.
+type mistral3CausalModel struct {
+	ModelParameters
+
+	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	HiddenSize            uint32  `json:"hidden_size"`
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	NumAttentionHeads     uint32  `json:"num_attention_heads"`
+	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
+	RopeTheta             float32 `json:"rope_theta"`
+	RMSNormEPS            float32 `json:"rms_norm_eps"`
+	HeadDim               uint32  `json:"head_dim"`
+	SlidingWindow         *uint32 `json:"sliding_window"`
+	HiddenAct             string  `json:"hidden_act"`
+	VocabSize             uint32  `json:"vocab_size"`
+	// RopeParameters is the nested "rope_parameters" object. Its pointer
+	// fields are nil when the key is absent; KV writes them only when set.
+	RopeParameters        struct {
+		BetaFast                  float32  `json:"beta_fast"`
+		BetaSlow                  float32  `json:"beta_slow"`
+		Factor                    float32  `json:"factor"`
+		Llama4ScalingBeta         *float32 `json:"llama_4_scaling_beta"`
+		OrigMaxPositionEmbeddings uint32   `json:"original_max_position_embeddings"`
+		RopeType                  string   `json:"rope_type"`
+		RopeTheta                 float32  `json:"rope_theta"`
+		Mscale                    *float32 `json:"mscale"`
+		MscaleAllDim              *float32 `json:"mscale_all_dim"`
+	} `json:"rope_parameters"`
+}
+
+// KV builds the key/value metadata for the model. It starts from the keys
+// produced by the embedded ModelParameters and adds the "mistral3.*" text
+// settings. The optional rope parameters (mscale, mscale_all_dim, original
+// context length and llama_4_scaling_beta) are written only when the config
+// supplies them.
+func (p *mistral3CausalModel) KV(t *Tokenizer) KV {
+	kv := p.ModelParameters.KV(t)
+	kv["general.architecture"] = "mistral3"
+	kv["mistral3.vocab_size"] = p.VocabSize
+
+	// Text configuration
+	kv["mistral3.block_count"] = p.NumHiddenLayers
+	kv["mistral3.context_length"] = p.MaxPositionEmbeddings
+	kv["mistral3.embedding_length"] = p.HiddenSize
+	kv["mistral3.feed_forward_length"] = p.IntermediateSize
+	kv["mistral3.attention.head_count"] = p.NumAttentionHeads
+	kv["mistral3.attention.head_count_kv"] = p.NumKeyValueHeads
+	kv["mistral3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
+	kv["mistral3.attention.key_length"] = p.HeadDim
+	kv["mistral3.attention.value_length"] = p.HeadDim
+	// Prefer the explicit values; fall back to the derived / nested ones.
+	kv["mistral3.rope.dimension_count"] = cmp.Or(p.HeadDim, p.HiddenSize/p.NumAttentionHeads)
+	kv["mistral3.rope.freq_base"] = cmp.Or(p.RopeTheta, p.RopeParameters.RopeTheta)
+	kv["mistral3.rope.scaling.factor"] = p.RopeParameters.Factor
+	kv["mistral3.rope.scaling.type"] = p.RopeParameters.RopeType
+	kv["mistral3.rope.scaling.beta_fast"] = p.RopeParameters.BetaFast
+	kv["mistral3.rope.scaling.beta_slow"] = p.RopeParameters.BetaSlow
+
+	if p.RopeParameters.Mscale != nil {
+		kv["mistral3.rope.scaling.mscale"] = *p.RopeParameters.Mscale
+	}
+
+	if p.RopeParameters.MscaleAllDim != nil {
+		kv["mistral3.rope.scaling.mscale_all_dim"] = *p.RopeParameters.MscaleAllDim
+	}
+
+	if p.RopeParameters.OrigMaxPositionEmbeddings > 0 {
+		kv["mistral3.rope.scaling.original_context_length"] = p.RopeParameters.OrigMaxPositionEmbeddings
+		// Llama4ScalingBeta may be nil here; it is handled by the nil-checked branch below.
+	}
+
+	if p.RopeParameters.Llama4ScalingBeta != nil {
+		kv["mistral3.rope.scaling_beta"] = *p.RopeParameters.Llama4ScalingBeta
+	}
+
+	return kv
+}
+
+// Tensors converts the source tensors into ggml tensors, keeping name, kind
+// and shape as reported by each source. Query and key attention weights
+// outside the "v." namespace get p.repack installed as their repacker.
+func (p *mistral3CausalModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var converted []*ggml.Tensor
+
+	for _, src := range ts {
+		name := src.Name()
+		isQK := strings.HasSuffix(name, ".attn_q.weight") || strings.HasSuffix(name, ".attn_k.weight")
+		if isQK && !strings.HasPrefix(name, "v.") {
+			src.SetRepacker(p.repack)
+		}
+
+		converted = append(converted, &ggml.Tensor{
+			Name:     name,
+			Kind:     src.Kind(),
+			Shape:    src.Shape(),
+			WriterTo: src,
+		})
+	}
+
+	return converted
+}
+
+// Replacements returns a flat list of alternating (old, new) substrings that
+// map checkpoint tensor names onto the names used in the converted model.
+// NOTE(review): presumably consumed pairwise (e.g. by strings.NewReplacer), in
+// which case earlier pairs take precedence — confirm before reordering.
+func (p *mistral3CausalModel) Replacements() []string {
+	return []string{
+		"model.norm", "output_norm",
+		"model.", "",
+		"layers", "blk",
+		"transformer.layers", "blk",
+		"vision_tower", "v",
+		"ln_pre", "encoder_norm",
+		"input_layernorm", "attn_norm",
+		"post_attention_layernorm", "ffn_norm",
+		"embed_tokens", "token_embd",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.down_proj", "ffn_down",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.up_proj", "ffn_up",
+		"attention.q_proj", "attn_q",
+		"attention.k_proj", "attn_k",
+		"attention.v_proj", "attn_v",
+		"attention.o_proj", "attn_output",
+		"attention_norm", "attn_norm",
+		"feed_forward.gate_proj", "ffn_gate",
+		"feed_forward.down_proj", "ffn_down",
+		"feed_forward.up_proj", "ffn_up",
+		"multi_modal_projector", "mm",
+		"ffn_norm", "ffn_norm",
+		"lm_head", "output",
+	}
+}
+
+// repack reorders the rows of a query or key attention weight. The data is
+// viewed as (heads, 2, rows/heads/2, ...), axes 1 and 2 are swapped, the
+// original shape is restored, the tensor is transposed and the result is
+// returned as one flat slice. The head count comes from NumAttentionHeads for
+// ".attn_q.weight" and from NumKeyValueHeads (falling back to
+// NumAttentionHeads) for ".attn_k.weight"; any other name is an error.
+func (p *mistral3CausalModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	var dims []int
+	for i := range shape {
+		dims = append(dims, int(shape[i]))
+	}
+
+	var heads uint32
+	switch {
+	case strings.HasSuffix(name, ".attn_q.weight"):
+		heads = p.NumAttentionHeads
+	case strings.HasSuffix(name, ".attn_k.weight"):
+		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
+	default:
+		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
+	}
+
+	weights := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	// Split the leading dimension into (heads, 2, rows per half-head).
+	split := append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)
+	if err := weights.Reshape(split...); err != nil {
+		return nil, err
+	}
+
+	if err := weights.T(0, 2, 1, 3); err != nil {
+		return nil, err
+	}
+
+	if err := weights.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := weights.Transpose(); err != nil {
+		return nil, err
+	}
+
+	rows, err := native.SelectF32(weights, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	var flat []float32
+	for i := range rows {
+		flat = append(flat, rows[i]...)
+	}
+
+	return flat, nil
+}
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -12,7 +12,7 @@ type mixtralModel struct {
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }

-func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
+func (p *mixtralModel) KV(t *Tokenizer) KV {
 	kv := p.llamaModel.KV(t)

 	if p.NumLocalExperts > 0 {
--- a/convert/convert_mllama.go
+++ b/convert/convert_mllama.go
@@ -34,7 +34,7 @@ type mllamaModel struct {
 	} `json:"vision_config"`
 }

-func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
+func (m *mllamaModel) KV(t *Tokenizer) KV {
 	kv := m.ModelParameters.KV(t)
 	kv["general.architecture"] = "mllama"

--- a/convert/convert_nomicbert.go
+++ b/convert/convert_nomicbert.go
@@ -0,0 +1,213 @@
+package convert
+
+import (
+	"cmp"
+	"encoding/json"
+	"io/fs"
+	"path/filepath"
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+// nomicbertModel holds the configuration fields decoded from JSON for the
+// nomic-bert converter, plus the pooling and normalization settings that
+// parseMore fills in.
+type nomicbertModel struct {
+	ModelParameters
+	// NLayers and NumHiddenLayers are alternatives; KV uses the first non-zero one.
+	NLayers               uint32  `json:"n_layers"`
+	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	HiddenSize            uint32  `json:"hidden_size"`
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	NumAttentionHeads     uint32  `json:"num_attention_heads"`
+	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
+	// LayerNormEPS and LayerNormEpsilon are alternatives; KV uses the first non-zero one.
+	LayerNormEPS          float32 `json:"layer_norm_eps"`
+	LayerNormEpsilon      float32 `json:"layer_norm_epsilon"`
+	RopeFreqBase          float32 `json:"rope_theta"`
+	// normalizeEmbeddings is set by parseMore when a Normalize module is listed.
+	normalizeEmbeddings   bool
+	// PoolingType is set by parseMore: 1 for mean-token pooling, 2 for CLS-token pooling.
+	PoolingType           uint32
+
+	// MoE parameters (only present in v2 models)
+	NumExperts      uint32 `json:"num_local_experts"`
+	NumExpertsUsed  uint32 `json:"num_experts_per_tok"`
+	MoEEveryNLayers uint32 `json:"moe_every_n_layers"`
+}
+
+// Compile-time checks that *nomicbertModel satisfies both interfaces.
+var (
+	_ ModelConverter = (*nomicbertModel)(nil)
+	_ moreParser     = (*nomicbertModel)(nil)
+)
+
+// parseMore reads the sentence-transformers metadata from fsys. modules.json
+// lists the pipeline modules; a Normalize module sets normalizeEmbeddings, and
+// a Pooling module names the directory whose config.json selects the pooling
+// mode. PoolingType becomes 1 for mean-token pooling or 2 for CLS-token
+// pooling. Any read or JSON decoding failure is returned.
+func (p *nomicbertModel) parseMore(fsys fs.FS) error {
+	bts, err := fs.ReadFile(fsys, "modules.json")
+	if err != nil {
+		return err
+	}
+
+	var modules []struct {
+		Type string `json:"type"`
+		Path string `json:"path"`
+	}
+
+	if err := json.Unmarshal(bts, &modules); err != nil {
+		return err
+	}
+
+	var pooling string
+	for _, m := range modules {
+		switch m.Type {
+		case "sentence_transformers.models.Pooling":
+			pooling = m.Path
+		case "sentence_transformers.models.Normalize":
+			p.normalizeEmbeddings = true
+		}
+	}
+
+	if pooling != "" {
+		// fs.FS names must be slash-separated. filepath.Join uses the OS
+		// separator (a backslash on Windows), so normalise the result first.
+		bts, err := fs.ReadFile(fsys, filepath.ToSlash(filepath.Join(pooling, "config.json")))
+		if err != nil {
+			return err
+		}
+
+		var pc struct {
+			PoolingModeCLSToken   bool `json:"pooling_mode_cls_token"`
+			PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
+		}
+
+		if err := json.Unmarshal(bts, &pc); err != nil {
+			return err
+		}
+
+		// Mean pooling takes precedence when both flags are set.
+		if pc.PoolingModeMeanTokens {
+			p.PoolingType = 1
+		} else if pc.PoolingModeCLSToken {
+			p.PoolingType = 2
+		}
+	}
+
+	return nil
+}
+
+// KV builds the key/value metadata for the model on top of the keys produced
+// by the embedded ModelParameters. Optional numeric settings are written only
+// when non-zero. It also rewrites t.Tokens in place: bracketed tokens are
+// kept, a leading "##" is stripped, and every other token is prefixed with
+// U+2581.
+func (p *nomicbertModel) KV(t *Tokenizer) KV {
+	kv := p.ModelParameters.KV(t)
+
+	// A non-zero MoE interval selects the "-moe" architecture variant.
+	arch := "nomic-bert"
+	if p.MoEEveryNLayers > 0 {
+		arch += "-moe"
+	}
+
+	kv["general.architecture"] = arch
+	kv["attention.causal"] = false
+	kv["pooling_type"] = p.PoolingType
+	kv["normalize_embeddings"] = p.normalizeEmbeddings
+	kv["block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers)
+
+	// Integer settings that are emitted only when the config supplied them.
+	optional := []struct {
+		key   string
+		value uint32
+	}{
+		{"context_length", p.MaxPositionEmbeddings},
+		{"embedding_length", p.HiddenSize},
+		{"feed_forward_length", p.IntermediateSize},
+		{"attention.head_count", p.NumAttentionHeads},
+		{"attention.head_count_kv", p.NumKeyValueHeads},
+		// MoE specific parameters (only if MoE is enabled)
+		{"expert_count", p.NumExperts},
+		{"expert_used_count", p.NumExpertsUsed},
+		{"moe_every_n_layers", p.MoEEveryNLayers},
+	}
+	for _, opt := range optional {
+		if opt.value > 0 {
+			kv[opt.key] = opt.value
+		}
+	}
+
+	if eps := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon); eps > 0 {
+		kv["attention.layer_norm_epsilon"] = eps
+	}
+
+	if p.RopeFreqBase > 0 {
+		kv["rope.freq_base"] = p.RopeFreqBase
+	}
+
+	kv["tokenizer.ggml.model"] = "bert"
+	kv["tokenizer.ggml.token_type_count"] = uint32(2)
+
+	// convert to phantom space tokens
+	for i, tok := range t.Tokens {
+		if strings.HasPrefix(tok, "[") && strings.HasSuffix(tok, "]") {
+			// special tokens are kept as-is
+			continue
+		}
+		if rest, ok := strings.CutPrefix(tok, "##"); ok {
+			t.Tokens[i] = rest
+			continue
+		}
+		t.Tokens[i] = "\u2581" + tok
+	}
+
+	kv["tokenizer.ggml.tokens"] = t.Tokens
+
+	return kv
+}
+
+// Tensors converts the source tensors into ggml tensors, dropping the
+// position-id buffer and the pooler dense weight and bias.
+func (p *nomicbertModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	skipped := []string{
+		"embeddings.position_ids",
+		"pooler.dense.weight",
+		"pooler.dense.bias",
+	}
+
+	kept := make([]*ggml.Tensor, 0, len(ts))
+	for _, src := range ts {
+		if !slices.Contains(skipped, src.Name()) {
+			kept = append(kept, &ggml.Tensor{
+				Name:     src.Name(),
+				Kind:     src.Kind(),
+				Shape:    src.Shape(),
+				WriterTo: src,
+			})
+		}
+	}
+
+	return kept
+}
+
+// Replacements returns a flat list of alternating (old, new) substrings that
+// map checkpoint tensor names onto the names used in the converted model.
+//
+// NOTE(review): presumably consumed pairwise by strings.NewReplacer, where
+// earlier pairs win. A key that is a prefix of another key must therefore
+// come after the longer one: "encoder.layers" is listed before
+// "encoder.layer" so that "encoder.layers.N" becomes "blk.N", not "blks.N".
+func (nomicbertModel) Replacements() []string {
+	return []string{
+		"encoder.layers", "blk",
+		"encoder.layer", "blk",
+		"embeddings.word_embeddings", "token_embd",
+		"embeddings.token_type_embeddings", "token_types",
+		"embeddings.LayerNorm", "token_embd_norm",
+
+		"attention.self.qkv", "attn_qkv",
+
+		"attention.output.dense", "attn_output",
+		"attention.output.LayerNorm", "attn_output_norm",
+
+		"mlp.up", "ffn_up",
+		"mlp.down", "ffn_down",
+
+		"mlp.router", "ffn_gate_inp",
+		"mlp.experts.up", "ffn_up_exps",
+		"mlp.experts.down", "ffn_down_exps",
+
+		"intermediate.dense", "ffn_up",
+		"output.dense", "ffn_down",
+		"output.LayerNorm", "layer_output_norm",
+	}
+}
--- a/convert/convert_olmo.go
+++ b/convert/convert_olmo.go
@@ -0,0 +1,117 @@
+package convert
+
+import (
+	"cmp"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+// ropeScaling mirrors the "rope_scaling" object of the model configuration.
+// olmoModel.KV writes Factor, OriginalMaxPositionEmbeds, AttentionFactor and
+// RopeType; the remaining fields are decoded but not emitted there.
+type ropeScaling struct {
+	Factor                    float32 `json:"factor"`
+	OriginalMaxPositionEmbeds uint32  `json:"original_max_position_embeddings"`
+	AttentionFactor           float32 `json:"attention_factor"`
+	BetaFast                  float32 `json:"beta_fast"`
+	BetaSlow                  float32 `json:"beta_slow"`
+	RopeType                  string  `json:"rope_type"`
+	ExtrapolationFactor       float32 `json:"extrapolation_factor"`
+}
+
+// olmoModel holds the configuration fields decoded from JSON for the
+// converter whose KV method writes them under "olmo3.*" keys.
+type olmoModel struct {
+	ModelParameters
+
+	HiddenSize            uint32       `json:"hidden_size"`
+	NumHiddenLayers       uint32       `json:"num_hidden_layers"`
+	IntermediateSize      uint32       `json:"intermediate_size"`
+	NumAttentionHeads     uint32       `json:"num_attention_heads"`
+	NumKeyValueHeads      uint32       `json:"num_key_value_heads"`
+	MaxPositionEmbeddings uint32       `json:"max_position_embeddings"`
+	RMSNormEPS            float32      `json:"rms_norm_eps"`
+	RopeTheta             float32      `json:"rope_theta"`
+	// RopeScaling is nil when the config has no "rope_scaling" object.
+	RopeScaling           *ropeScaling `json:"rope_scaling"`
+	SlidingWindow         uint32       `json:"sliding_window"`
+	// LayerTypes lists one type per layer; KV treats "sliding_attention" as sliding-window.
+	LayerTypes            []string     `json:"layer_types"`
+}
+
+// Compile-time check that *olmoModel satisfies ModelConverter.
+var _ ModelConverter = (*olmoModel)(nil)
+
+// KV builds the key/value metadata for the model on top of the keys produced
+// by the embedded ModelParameters. Core dimensions are always written; rope,
+// normalization and sliding-window settings are written only when the config
+// supplies a non-zero (or non-empty) value. The KV head count falls back to
+// the attention head count when unset.
+func (p *olmoModel) KV(t *Tokenizer) KV {
+	kv := p.ModelParameters.KV(t)
+	kv["general.architecture"] = "olmo3"
+	kv["olmo3.block_count"] = p.NumHiddenLayers
+	kv["olmo3.context_length"] = p.MaxPositionEmbeddings
+	kv["olmo3.embedding_length"] = p.HiddenSize
+	kv["olmo3.feed_forward_length"] = p.IntermediateSize
+	kv["olmo3.attention.head_count"] = p.NumAttentionHeads
+	kv["olmo3.attention.head_count_kv"] = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
+
+	if theta := p.RopeTheta; theta > 0 {
+		kv["olmo3.rope.freq_base"] = theta
+	}
+
+	if rs := p.RopeScaling; rs != nil {
+		if rs.Factor > 0 {
+			kv["olmo3.rope.scaling.factor"] = rs.Factor
+		}
+		if rs.OriginalMaxPositionEmbeds > 0 {
+			kv["olmo3.rope.scaling.original_context_length"] = rs.OriginalMaxPositionEmbeds
+		}
+		if rs.AttentionFactor > 0 {
+			kv["olmo3.rope.scaling.attn_factor"] = rs.AttentionFactor
+		}
+		if rs.RopeType != "" {
+			kv["olmo3.rope.scaling.type"] = rs.RopeType
+		}
+	}
+
+	if eps := p.RMSNormEPS; eps > 0 {
+		kv["olmo3.attention.layer_norm_rms_epsilon"] = eps
+	}
+
+	if window := p.SlidingWindow; window > 0 {
+		kv["olmo3.attention.sliding_window"] = window
+	}
+
+	if n := len(p.LayerTypes); n > 0 {
+		// One flag per layer: true where the layer uses sliding attention.
+		pattern := make([]bool, 0, n)
+		for _, kind := range p.LayerTypes {
+			pattern = append(pattern, kind == "sliding_attention")
+		}
+		kv["olmo3.attention.sliding_window_pattern"] = pattern
+	}
+
+	return kv
+}
+
+// Tensors wraps every source tensor in a ggml tensor, keeping the name, kind
+// and shape each source reports and using the source itself as the writer.
+func (p *olmoModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	converted := make([]*ggml.Tensor, len(ts))
+	for i, src := range ts {
+		converted[i] = &ggml.Tensor{
+			Name:     src.Name(),
+			Kind:     src.Kind(),
+			Shape:    src.Shape(),
+			WriterTo: src,
+		}
+	}
+
+	return converted
+}
+
+// Replacements returns a flat list of alternating (old, new) substrings that
+// map checkpoint tensor names onto the names used in the converted model.
+// NOTE(review): presumably consumed pairwise (e.g. by strings.NewReplacer) —
+// confirm before reordering.
+func (p *olmoModel) Replacements() []string {
+	return []string{
+		"lm_head", "output",
+		"model.embed_tokens", "token_embd",
+		"model.layers", "blk",
+		"model.norm", "output_norm",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"self_attn.q_norm", "attn_q_norm",
+		"self_attn.k_norm", "attn_k_norm",
+		"post_attention_layernorm", "post_attention_norm",
+		"post_feedforward_layernorm", "post_ffw_norm",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+	}
+}
--- a/convert/convert_phi3.go
+++ b/convert/convert_phi3.go
@@ -37,7 +37,7 @@ type phi3Model struct {

 var _ ModelConverter = (*phi3Model)(nil)

-func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
+func (p *phi3Model) KV(t *Tokenizer) KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "phi3"
 	kv["phi3.context_length"] = p.MaxPositionEmbeddings
--- a/convert/convert_qwen2.go
+++ b/convert/convert_qwen2.go
@@ -22,7 +22,7 @@ type qwen2Model struct {

 var _ ModelConverter = (*qwen2Model)(nil)

-func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
+func (q *qwen2Model) KV(t *Tokenizer) KV {
 	kv := q.ModelParameters.KV(t)
 	kv["general.architecture"] = "qwen2"
 	kv["qwen2.block_count"] = q.HiddenLayers
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@@ -29,7 +29,7 @@ type qwen25VLModel struct {

 var _ ModelConverter = (*qwen25VLModel)(nil)

-func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
+func (q *qwen25VLModel) KV(t *Tokenizer) KV {
 	kv := q.ModelParameters.KV(t)
 	kv["general.architecture"] = "qwen25vl"

--- a/convert/convert_qwen3.go
+++ b/convert/convert_qwen3.go
@@ -32,7 +32,7 @@ type qwen3Model struct {
 }

 // KV implements ModelConverter.
-func (q *qwen3Model) KV(t *Tokenizer) ggml.KV {
+func (q *qwen3Model) KV(t *Tokenizer) KV {
 	arch := "qwen3"
 	if q.NumExperts > 0 {
 		arch += "moe"
--- a/convert/convert_qwen3vl.go
+++ b/convert/convert_qwen3vl.go
@@ -45,7 +45,7 @@ func (m *qwen3VLModel) parseMore(fsys fs.FS) error {
 	return json.Unmarshal(bts, &m.VisionModel)
 }

-func (m *qwen3VLModel) KV(t *Tokenizer) ggml.KV {
+func (m *qwen3VLModel) KV(t *Tokenizer) KV {
 	kv := m.qwen3Model.KV(t)

 	arch := "qwen3vl"
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -19,6 +19,7 @@ import (
 	"testing"

 	"github.com/google/go-cmp/cmp"
+	fsc "github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/fs/ggml"
 )

@@ -28,7 +29,7 @@ type tensorData struct {
 	Shape   []int  `json:"shape"`
 }

-func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
+func convertFull(t *testing.T, fsys fs.FS) (*os.File, fsc.Config, ggml.Tensors) {
 	t.Helper()

 	f, err := os.CreateTemp(t.TempDir(), "f16")
@@ -59,9 +60,10 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 	return r, m.KV(), m.Tensors()
 }

-func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string {
+func generateResultsJSON(t *testing.T, f *os.File, kv fsc.Config, tensors ggml.Tensors) map[string]string {
 	actual := make(map[string]string)
-	for k, v := range kv {
+	for k := range kv.Keys() {
+		v := kv.Value(k)
 		if s, ok := v.(json.Marshaler); !ok {
 			actual[k] = fmt.Sprintf("%v", v)
 		} else {
@@ -277,7 +279,7 @@ func generateSafetensorTestData(t *testing.T, tempDir string, tensorData map[str
 func TestConvertAdapter(t *testing.T) {
 	type AdapterCase struct {
 		Name     string
-		BaseKV   map[string]any
+		BaseKV   KV
 		Expected map[string]string
 	}

--- a/convert/reader.go
+++ b/convert/reader.go
@@ -44,7 +44,10 @@ func (t tensorBase) Kind() uint32 {
 		t.name == "v.positional_embedding_vlm" ||
 		t.name == "v.tile_position_embd.weight" ||
 		t.name == "v.pre_tile_position_embd.weight" ||
-		t.name == "v.post_tile_position_embd.weight" {
+		t.name == "v.post_tile_position_embd.weight" ||
+		t.name == "s.position_embd" ||
+		strings.HasSuffix(t.name, "rel_pos_h") ||
+		strings.HasSuffix(t.name, "rel_pos_w") {
 		// these tensors are always F32
 		return tensorKindFP32
 	}
--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -96,7 +96,10 @@ type safetensor struct {

 func (st safetensor) Kind() uint32 {
 	kind := st.tensorBase.Kind()
-	if !strings.HasPrefix(st.name, "v.") && st.dtype == "BF16" && kind != tensorKindFP32 {
+	if st.dtype == "BF16" &&
+		!strings.HasPrefix(st.name, "v.") &&
+		!strings.HasPrefix(st.name, "s.") &&
+		kind != tensorKindFP32 {
 		kind = tensorKindBF16
 	}

--- a/convert/tokenizer_spm.go
+++ b/convert/tokenizer_spm.go
@@ -49,7 +49,8 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 			tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)

 			// temporary fix to handle gemma3 broken configs
-			if slices.Contains([]string{"<end_of_turn>", "<start_of_turn>"}, piece.GetPiece()) {
+			// TODO(parthsareen): allow reading of tokenizer.json to allow managing special tokens when using spm
+			if slices.Contains([]string{"<end_of_turn>", "<start_of_turn>", "<start_function_declaration>", "<end_function_declaration>", "<start_function_call>", "<end_function_call>", "<start_function_response>", "<end_function_response>", "<escape>"}, piece.GetPiece()) {
 				tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
 			}

--- a/discover/cpu_linux.go
+++ b/discover/cpu_linux.go
@@ -2,6 +2,7 @@ package discover

 import (
 	"bufio"
+	"errors"
 	"fmt"
 	"io"
 	"log/slog"
@@ -10,12 +11,21 @@ import (
 	"reflect"
 	"regexp"
 	"sort"
+	"strconv"
 	"strings"

 	"github.com/ollama/ollama/format"
 )

+// GetCPUMem returns the memory information gathered by getCPUMem after
+// passing it through getCPUMemByCgroups, which adjusts it using the cgroup
+// memory files when they can be read. An error from getCPUMem is returned
+// together with a zero memInfo.
 func GetCPUMem() (memInfo, error) {
+	mem, err := getCPUMem()
+	if err != nil {
+		return memInfo{}, err
+	}
+	return getCPUMemByCgroups(mem), nil
+}
+
+func getCPUMem() (memInfo, error) {
 	var mem memInfo
 	var total, available, free, buffers, cached, freeSwap uint64
 	f, err := os.Open("/proc/meminfo")
@@ -56,6 +66,32 @@ func GetCPUMem() (memInfo, error) {
 	return mem, nil
 }

+// getCPUMemByCgroups adjusts mem using the cgroup memory files under
+// /sys/fs/cgroup. A readable numeric memory.max lowers TotalMemory to the
+// cgroup limit, and a readable memory.current sets FreeMemory to what is
+// left of TotalMemory. Files that are missing or do not hold a number (for
+// example "max" when no limit is configured) leave mem unchanged.
+func getCPUMemByCgroups(mem memInfo) memInfo {
+	if limit, err := getUint64ValueFromFile("/sys/fs/cgroup/memory.max"); err == nil {
+		// A limit above the detected total cannot add memory, so only ever lower it.
+		if mem.TotalMemory == 0 || limit < mem.TotalMemory {
+			mem.TotalMemory = limit
+		}
+	}
+	if used, err := getUint64ValueFromFile("/sys/fs/cgroup/memory.current"); err == nil {
+		// Saturate at zero: the subtraction is unsigned and would wrap around.
+		if used < mem.TotalMemory {
+			mem.FreeMemory = mem.TotalMemory - used
+		} else {
+			mem.FreeMemory = 0
+		}
+	}
+	return mem
+}
+
+// getUint64ValueFromFile parses the first line of the file at path as a
+// base-10 unsigned integer, ignoring surrounding whitespace. It returns an
+// error when the file cannot be opened or read, is empty, or when the first
+// line is not a valid number.
+func getUint64ValueFromFile(path string) (uint64, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return 0, err
+	}
+	defer f.Close()
+
+	s := bufio.NewScanner(f)
+	if !s.Scan() {
+		// Scan returns false both on EOF and on a read error; tell them apart.
+		if err := s.Err(); err != nil {
+			return 0, err
+		}
+		return 0, errors.New("empty file content")
+	}
+	return strconv.ParseUint(strings.TrimSpace(s.Text()), 10, 64)
+}
+
 const CpuInfoFilename = "/proc/cpuinfo"

 type linuxCpuInfo struct {
@@ -74,7 +110,41 @@ func GetCPUDetails() []CPU {
 		return nil
 	}
 	defer file.Close()
-	return linuxCPUDetails(file)
+	cpus := linuxCPUDetails(file)
+	return overwriteThreadCountByLinuxCgroups(cpus)
+}
+
+// overwriteThreadCountByLinuxCgroups applies the CPU quota from
+// /sys/fs/cgroup/cpu.max to the detected CPUs. The file holds
+// "<quota> <period>" in microseconds. When a numeric quota is present, the
+// first CPU entry is returned alone with CoreCount and ThreadCount set to
+// quota/period (at least 1). cpus is returned unchanged when it is empty,
+// the file is missing, no quota is configured ("max"), or the values cannot
+// be used.
+func overwriteThreadCountByLinuxCgroups(cpus []CPU) []CPU {
+	if len(cpus) == 0 {
+		// Nothing to overwrite; also protects the cpus[0] access below.
+		return cpus
+	}
+
+	file, err := os.Open("/sys/fs/cgroup/cpu.max")
+	if err != nil {
+		return cpus
+	}
+	defer file.Close()
+
+	scanner := bufio.NewScanner(file)
+	for scanner.Scan() {
+		fields := strings.Fields(scanner.Text())
+		if len(fields) != 2 {
+			continue
+		}
+		if fields[0] == "max" {
+			// No quota configured: keep the detected topology without warning.
+			return cpus
+		}
+
+		allowedUs, err := strconv.ParseInt(fields[0], 10, 64)
+		if err != nil {
+			slog.Warn("failed to parse CPU allowed micro secs", "error", err)
+			return cpus
+		}
+		unitUs, err := strconv.ParseInt(fields[1], 10, 64)
+		if err != nil {
+			slog.Warn("failed to parse CPU unit micro secs", "error", err)
+			return cpus
+		}
+		if unitUs <= 0 {
+			// Avoid dividing by zero on a malformed period.
+			slog.Warn("invalid CPU unit micro secs", "value", unitUs)
+			return cpus
+		}
+
+		threads := int(max(allowedUs/unitUs, 1))
+
+		cpu := cpus[0]
+		cpu.CoreCount = threads
+		cpu.ThreadCount = threads
+		return []CPU{cpu}
+	}
+	return cpus
 }

 func linuxCPUDetails(file io.Reader) []CPU {
--- a/discover/runner.go
+++ b/discover/runner.go
@@ -65,6 +65,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 		}

 		slog.Info("discovering available GPUs...")
+		detectIncompatibleLibraries()

 		// Warn if any user-overrides are set which could lead to incorrect GPU discovery
 		overrideWarnings()
@@ -98,6 +99,9 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 					continue
 				} else if jetpack != "" && filepath.Base(dir) != "cuda_"+jetpack {
 					continue
+				} else if jetpack == "" && strings.Contains(filepath.Base(dir), "cuda_jetpack") {
+					slog.Debug("jetpack not detected (set JETSON_JETPACK or OLLAMA_LLM_LIBRARY to override), skipping", "libDir", dir)
+					continue
 				} else if !envconfig.EnableVulkan() && strings.Contains(filepath.Base(dir), "vulkan") {
 					slog.Info("experimental Vulkan support disabled.  To enable, set OLLAMA_VULKAN=1")
 					continue
@@ -125,15 +129,25 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 		supportedMu := sync.Mutex{}
 		supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index
 		for i := range devices {
+			libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
 			if !devices[i].NeedsInitValidation() {
+				// No need to validate, add to the supported map
+				supportedMu.Lock()
+				if _, ok := supported[devices[i].Library]; !ok {
+					supported[devices[i].Library] = make(map[string]map[string]int)
+				}
+				if _, ok := supported[devices[i].Library][libDir]; !ok {
+					supported[devices[i].Library][libDir] = make(map[string]int)
+				}
+				supported[devices[i].Library][libDir][devices[i].ID] = i
+				supportedMu.Unlock()
 				continue
 			}
-			libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
 			slog.Debug("verifying if device is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
 			wg.Add(1)
 			go func(i int) {
 				defer wg.Done()
-				extraEnvs := ml.GetVisibleDevicesEnv(devices[i : i+1])
+				extraEnvs := ml.GetVisibleDevicesEnv(devices[i:i+1], true)
 				devices[i].AddInitValidation(extraEnvs)
 				if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
 					slog.Debug("filtering device which didn't fully initialize",
@@ -319,7 +333,8 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 			defer cancel()

 			// Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct
-			devFilter := ml.GetVisibleDevicesEnv(devices)
+			// We avoid CUDA filters here to keep ROCm from failing to discover GPUs in a mixed environment
+			devFilter := ml.GetVisibleDevicesEnv(devices, false)

 			for dir := range libDirs {
 				updatedDevices := bootstrapDevices(ctx, []string{ml.LibOllamaPath, dir}, devFilter)
@@ -474,3 +489,16 @@ func overrideWarnings() {
 		slog.Warn("if GPUs are not correctly discovered, unset and try again")
 	}
 }
+
+// detectIncompatibleLibraries warns, on Windows only, when a ggml-base.dll
+// found through PATH lookup lives outside ml.LibOllamaPath. It does nothing
+// on other platforms or when the lookup finds no such file.
+func detectIncompatibleLibraries() {
+	if runtime.GOOS != "windows" {
+		return
+	}
+
+	dllPath, lookErr := exec.LookPath("ggml-base.dll")
+	found := lookErr == nil && dllPath != ""
+	if found && !strings.HasPrefix(dllPath, ml.LibOllamaPath) {
+		slog.Warn("potentially incompatible library detected in PATH", "location", dllPath)
+	}
+}
--- a/docs/api.md
+++ b/docs/api.md
@@ -50,7 +50,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
 Advanced parameters (optional):

 - `format`: the format to return a response in. Format can be `json` or a JSON schema
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
 - `system`: system message to (overrides what is defined in the `Modelfile`)
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
@@ -507,7 +507,7 @@ The `message` object has the following fields:
 Advanced parameters (optional):

 - `format`: the format to return a response in. Format can be `json` or a JSON schema.
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

@@ -895,11 +895,11 @@ curl http://localhost:11434/api/chat -d '{
      "tool_calls": [
        {
          "function": {
-            "name": "get_temperature",
+            "name": "get_weather",
            "arguments": {
              "city": "Toronto"
            }
-          },
+          }
        }
      ]
    },
@@ -907,7 +907,7 @@ curl http://localhost:11434/api/chat -d '{
    {
      "role": "tool",
      "content": "11 degrees celsius",
-      "tool_name": "get_temperature",
+      "tool_name": "get_weather"
    }
  ],
  "stream": false,
@@ -1189,7 +1189,7 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo
 - `template`: (optional) the prompt template for the model
 - `license`: (optional) a string or list of strings containing the license or licenses for the model
 - `system`: (optional) a string containing the system prompt for the model
- `parameters`: (optional) a dictionary of parameters for the model (see [Modelfile](./modelfile.md#valid-parameters-and-values) for a list of parameters)
+- `parameters`: (optional) a dictionary of parameters for the model (see [Modelfile](./modelfile.mdx#valid-parameters-and-values) for a list of parameters)
 - `messages`: (optional) a list of message objects used to create a conversation
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 - `quantize` (optional): quantize a non-quantized (e.g. float16) model
@@ -1698,7 +1698,7 @@ Generate embeddings from a model
 Advanced parameters:

 - `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 - `dimensions`: number of dimensions for the embedding

@@ -1817,7 +1817,7 @@ Generate embeddings from a model

 Advanced parameters:

- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

 ### Examples
--- a/docs/api/openai-compatibility.mdx
+++ b/docs/api/openai-compatibility.mdx
--- a/docs/capabilities/tool-calling.mdx
+++ b/docs/capabilities/tool-calling.mdx
@@ -15,7 +15,7 @@ Also known as "single-shot" tool calling.
    ```shell
    curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
      "model": "qwen3",
-      "messages": [{"role": "user", "content": "What's the temperature in New York?"}],
+      "messages": [{"role": "user", "content": "What is the temperature in New York?"}],
      "stream": false,
      "tools": [
        {
@@ -41,7 +41,7 @@ Also known as "single-shot" tool calling.
    curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
      "model": "qwen3",
      "messages": [
-        {"role": "user", "content": "What's the temperature in New York?"},
+        {"role": "user", "content": "What is the temperature in New York?"},
        {
          "role": "assistant",
          "tool_calls": [
@@ -90,7 +90,7 @@ Also known as "single-shot" tool calling.
      }
      return temperatures.get(city, "Unknown")

-    messages = [{"role": "user", "content": "What's the temperature in New York?"}]
+    messages = [{"role": "user", "content": "What is the temperature in New York?"}]

    # pass functions directly as tools in the tools list or as a JSON schema
    response = chat(model="qwen3", messages=messages, tools=[get_temperature], think=True)
@@ -146,7 +146,7 @@ Also known as "single-shot" tool calling.
      },
    ]

-    const messages = [{ role: 'user', content: "What's the temperature in New York?" }]
+    const messages = [{ role: 'user', content: "What is the temperature in New York?" }]

    const response = await ollama.chat({
      model: 'qwen3',
@@ -609,7 +609,7 @@ def get_temperature(city: str) -> str:
  return temperatures.get(city, 'Unknown')


-messages = [{'role': 'user', 'content': "What's the temperature in New York?"}]
+messages = [{'role': 'user', 'content': "What is the temperature in New York?"}]

 while True:
  stream = chat(
@@ -684,7 +684,7 @@ const getTemperatureTool = {
 }

 async function agentLoop() {
-  const messages = [{ role: 'user', content: "What's the temperature in New York?" }]
+  const messages = [{ role: 'user', content: "What is the temperature in New York?" }]

  while (true) {
    const stream = await ollama.chat({
--- a/docs/capabilities/vision.mdx
+++ b/docs/capabilities/vision.mdx
@@ -36,7 +36,6 @@ Provide an `images` array. SDKs accept file paths, URLs or raw bytes while the R
        }],
        "stream": false
    }'
-    "
    ```
  </Tab>
  <Tab title="Python">
--- a/docs/development.md
+++ b/docs/development.md
@@ -49,6 +49,8 @@ Install prerequisites:
    - [Ninja](https://github.com/ninja-build/ninja/releases)
 - (Optional) NVIDIA GPU support
    - [CUDA SDK](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_network)
+- (Optional) Vulkan GPU support
+    - [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) - useful for AMD/Intel GPUs

 Then, configure and build the project:

@@ -57,6 +59,17 @@ cmake -B build
 cmake --build build --config Release
 ```

+> Building for Vulkan requires the `VULKAN_SDK` environment variable to be set:
+> 
+> PowerShell
+> ```powershell
+> $env:VULKAN_SDK="C:\VulkanSDK\<version>"
+> ```
+> CMD
+> ```cmd
+> set VULKAN_SDK=C:\VulkanSDK\<version>
+> ```
+
 > [!IMPORTANT]
 > Building for ROCm requires additional flags:
 > ```
@@ -65,6 +78,7 @@ cmake --build build --config Release
 > ```


+
 Lastly, run Ollama:

 ```shell
@@ -84,7 +98,9 @@ Install prerequisites:
    - [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html)
 - (Optional) NVIDIA GPU support
    - [CUDA SDK](https://developer.nvidia.com/cuda-downloads)
-
+- (Optional) Vulkan GPU support
+    - [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) - useful for AMD/Intel GPUs
+    - Or install via package manager: `sudo apt install vulkan-sdk` (Ubuntu/Debian) or `sudo dnf install vulkan-sdk` (Fedora/CentOS)
 > [!IMPORTANT]
 > Ensure prerequisites are in `PATH` before running CMake.

--- a/docs/faq.mdx
+++ b/docs/faq.mdx
@@ -14,11 +14,11 @@ curl -fsSL https://ollama.com/install.sh | sh

 ## How can I view the logs?

-Review the [Troubleshooting](./troubleshooting.md) docs for more about using logs.
+Review the [Troubleshooting](./troubleshooting) docs for more about using logs.

 ## Is my GPU compatible with Ollama?

-Please refer to the [GPU docs](./gpu.md).
+Please refer to the [GPU docs](./gpu).

 ## How can I specify the context window size?

@@ -57,8 +57,13 @@ ollama ps
 ```

 <Info>
-  **Output**: ``` NAME ID SIZE PROCESSOR UNTIL llama3:70b bcfb190ca3a7 42 GB
-  100% GPU 4 minutes from now ```
+
+**Output**:
+
+```
+NAME        ID            SIZE    PROCESSOR   UNTIL
+llama3:70b  bcfb190ca3a7  42 GB   100% GPU    4 minutes from now
+```
 </Info>

 The `Processor` column will show which memory the model was loaded in to:
@@ -385,4 +390,4 @@ Ollama for Windows and macOS register as a login item during installation.  You
 - In `Task Manager` go to the `Startup apps` tab, search for `ollama` then click `Disable`

 **MacOS**
- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable.
+- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background", then click the slider to disable.
--- a/docs/gpu.mdx
+++ b/docs/gpu.mdx
@@ -33,7 +33,7 @@ Check your compute compatibility to see if your card is supported:
 | 5.0                | GeForce GTX         | `GTX 750 Ti` `GTX 750` `NVS 810`                                                                                               |
 |                    | Quadro              | `K2200` `K1200` `K620` `M1200` `M520` `M5000M` `M4000M` `M3000M` `M2000M` `M1000M` `K620M` `M600M` `M500M`                     |

-For building locally to support older GPUs, see [developer.md](./development.md#linux-cuda-nvidia)
+For building locally to support older GPUs, see the [development guide](./development#linux-cuda-nvidia)

 ### GPU Selection

@@ -54,7 +54,7 @@ sudo modprobe nvidia_uvm`

 Ollama supports the following AMD GPUs via the ROCm library:

-> [!NOTE]
+> **NOTE:**
 > Additional AMD GPU support is provided by the Vulkan Library - see below.


@@ -132,9 +132,9 @@ Ollama supports GPU acceleration on Apple devices via the Metal API.

 ## Vulkan GPU Support

-> [!NOTE]
+> **NOTE:**
 > Vulkan is currently an Experimental feature.  To enable, you must set OLLAMA_VULKAN=1 for the Ollama server as
-described in the [FAQ](faq.md#how-do-i-configure-ollama-server)
+> described in the [FAQ](faq#how-do-i-configure-ollama-server).

 Additional GPU support on Windows and Linux is provided via
 [Vulkan](https://www.vulkan.org/). On Windows most GPU vendors drivers come
@@ -161,6 +161,6 @@ sudo setcap cap_perfmon+ep /usr/local/bin/ollama

 To select specific Vulkan GPU(s), you can set the environment variable
 `GGML_VK_VISIBLE_DEVICES` to one or more numeric IDs on the Ollama server as
-described in the [FAQ](faq.md#how-do-i-configure-ollama-server). If you
+described in the [FAQ](faq#how-do-i-configure-ollama-server). If you
 encounter any problems with Vulkan based GPUs, you can disable all Vulkan GPUs
 by setting `GGML_VK_VISIBLE_DEVICES=-1` 
--- a/docs/integrations/vscode.mdx
+++ b/docs/integrations/vscode.mdx
@@ -1,34 +1,34 @@
 ---
-title: VS Code 
+title: VS Code
 ---

 ## Install

-Install [VS Code](https://code.visualstudio.com/download). 
+Install [VS Code](https://code.visualstudio.com/download).

-## Usage with Ollama 
+## Usage with Ollama

 1. Open Copilot side bar found in top right window
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/vscode-sidebar.png" 
-    alt="VS Code chat Sidebar"
-    width="75%"
-  />
-</div>
-2. Select the model drowpdown > **Manage models**
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/vscode-models.png" 
-    alt="VS Code model picker"
-    width="75%"
-  />
-</div>
+   <div style={{ display: "flex", justifyContent: "center" }}>
+     <img
+       src="/images/vscode-sidebar.png"
+       alt="VS Code chat Sidebar"
+       width="75%"
+     />
+   </div>
+2. Select the model dropdown > **Manage models**
+   <div style={{ display: "flex", justifyContent: "center" }}>
+     <img
+       src="/images/vscode-models.png"
+       alt="VS Code model picker"
+       width="75%"
+     />
+   </div>
 3. Enter **Ollama** under **Provider Dropdown** and select desired models (e.g `qwen3, qwen3-coder:480b-cloud`)
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/vscode-model-options.png" 
-    alt="VS Code model options dropdown"
-    width="75%"
-  />
-</div>
+   <div style={{ display: "flex", justifyContent: "center" }}>
+     <img
+       src="/images/vscode-model-options.png"
+       alt="VS Code model options dropdown"
+       width="75%"
+     />
+   </div>
--- a/docs/linux.mdx
+++ b/docs/linux.mdx
@@ -20,8 +20,8 @@ curl -fsSL https://ollama.com/install.sh | sh
 Download and extract the package:

 ```shell
-curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \
-    | sudo tar zx -C /usr
+curl -fsSL https://ollama.com/download/ollama-linux-amd64.tar.zst \
+    | sudo tar x -C /usr
 ```

 Start Ollama:
@@ -41,8 +41,8 @@ ollama -v
 If you have an AMD GPU, also download and extract the additional ROCm package:

 ```shell
-curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tgz \
-    | sudo tar zx -C /usr
+curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tar.zst \
+    | sudo tar x -C /usr
 ```

 ### ARM64 install
@@ -50,8 +50,8 @@ curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tgz \
 Download and extract the ARM64-specific package:

 ```shell
-curl -fsSL https://ollama.com/download/ollama-linux-arm64.tgz \
-    | sudo tar zx -C /usr
+curl -fsSL https://ollama.com/download/ollama-linux-arm64.tar.zst \
+    | sudo tar x -C /usr
 ```

 ### Adding Ollama as a startup service (recommended)
@@ -146,8 +146,8 @@ curl -fsSL https://ollama.com/install.sh | sh
 Or by re-downloading Ollama:

 ```shell
-curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \
-    | sudo tar zx -C /usr
+curl -fsSL https://ollama.com/download/ollama-linux-amd64.tar.zst \
+    | sudo tar x -C /usr
 ```

 ## Installing specific versions
--- a/docs/modelfile.mdx
+++ b/docs/modelfile.mdx
@@ -41,6 +41,7 @@ INSTRUCTION arguments
 | [`ADAPTER`](#adapter)               | Defines the (Q)LoRA adapters to apply to the model.            |
 | [`LICENSE`](#license)               | Specifies the legal license.                                   |
 | [`MESSAGE`](#message)               | Specify message history.                                       |
+| [`REQUIRES`](#requires)             | Specify the minimum version of Ollama required by the model.   |

 ## Examples

@@ -149,9 +150,6 @@ PARAMETER <parameter> <parametervalue>

 | Parameter      | Description                                                                                                                                                                                                                                                                                                                                                                     | Value Type | Example Usage        |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
-| mirostat       | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)                                                                                                                                                                                                                                                                 | int        | mirostat 0           |
-| mirostat_eta   | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1)                                                                                                                                                | float      | mirostat_eta 0.1     |
-| mirostat_tau   | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0)                                                                                                                                                                                                                                 | float      | mirostat_tau 5.0     |
 | num_ctx        | Sets the size of the context window used to generate the next token. (Default: 2048)                                                                                                                                                                                                                                                                                            | int        | num_ctx 4096         |
 | repeat_last_n  | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)                                                                                                                                                                                                                                                                   | int        | repeat_last_n 64     |
 | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)                                                                                                                                                                                             | float      | repeat_penalty 1.1   |
@@ -251,6 +249,16 @@ MESSAGE user Is Ontario in Canada?
 MESSAGE assistant yes
 ```

+### REQUIRES
+
+The `REQUIRES` instruction allows you to specify the minimum version of Ollama required by the model.
+
+```
+REQUIRES <version>
+```
+
+The version should be a valid Ollama version (e.g. 0.14.0).
+
 ## Notes

 - the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments.
--- a/docs/tools/extract-examples/README.md
+++ b/docs/tools/extract-examples/README.md
@@ -0,0 +1,46 @@
+# extract-examples
+
+Extracts code examples from MDX files to a temp directory so you can run them.
+
+## Usage
+
+```shell
+go run docs/tools/extract-examples/main.go <mdx-file>
+```
+
+## Example
+
+```shell
+go run docs/tools/extract-examples/main.go docs/api/openai-compatibility.mdx
+```
+
+Output:
+
+```
+Extracting code examples to: /var/folders/vq/wfm2g6k917d3ldzpjdxc8ph00000gn/T/mdx-examples-3271754368
+
+  - 01_basic.py
+  - 01_basic.js
+  - 01_basic.sh
+  - 02_responses.py
+  - 02_responses.js
+  - 02_responses.sh
+  - 03_vision.py
+  - 03_vision.js
+  - 03_vision.sh
+
+Extracted 9 file(s) to /var/folders/vq/wfm2g6k917d3ldzpjdxc8ph00000gn/T/mdx-examples-3271754368
+
+To run examples:
+
+  cd /var/folders/vq/wfm2g6k917d3ldzpjdxc8ph00000gn/T/mdx-examples-3271754368
+  npm install   # for JS examples
+
+then run individual files with `node file.js`, `python file.py`, `bash file.sh`
+```
+
+## How it works
+
+- Parses MDX files looking for fenced code blocks with filenames (e.g., ` ```python basic.py `)
+- Groups examples by their `<CodeGroup>` and prefixes filenames with `01_`, `02_`, etc.
+- Writes all extracted files to a temp directory
--- a/docs/tools/extract-examples/main.go
+++ b/docs/tools/extract-examples/main.go
@@ -0,0 +1,137 @@
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+)
+
+// main extracts the named fenced code blocks from the MDX file given as the
+// first command-line argument into a fresh temp directory, then prints how to
+// run them.
+//
+// Only fences whose opening line has the form "```<lang> <filename>" are
+// extracted. Files found inside a <CodeGroup> are prefixed with that group's
+// two-digit ordinal (01_, 02_, ...). A package.json and a pyproject.toml
+// listing the openai and ollama packages are written next to the examples.
+// The process exits with status 1 on a usage error, or when the input cannot
+// be opened or read, or the temp directory cannot be created; failures to
+// write individual files are reported on stderr and extraction continues.
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Fprintln(os.Stderr, "Usage: go run docs/tools/extract-examples/main.go <mdx-file>")
+		os.Exit(1)
+	}
+
+	mdxFile := os.Args[1]
+
+	f, err := os.Open(mdxFile)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Error: %v\n", err)
+		os.Exit(1)
+	}
+	defer f.Close()
+
+	// Create temp directory
+	tempDir, err := os.MkdirTemp("", "mdx-examples-*")
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Error creating temp dir: %v\n", err)
+		os.Exit(1)
+	}
+
+	fmt.Printf("Extracting code examples to: %s\n\n", tempDir)
+
+	// An opening fence must carry both a language and a filename, e.g.
+	// "```python basic.py"; fences without a filename are ignored.
+	codeBlockStart := regexp.MustCompile("^```([a-zA-Z0-9_-]+)\\s+([^\\s]+)$")
+
+	scanner := bufio.NewScanner(f)
+	// Raise the per-line limit above bufio.Scanner's 64 KiB default so that
+	// examples with long single-line payloads (such as inline base64 images)
+	// do not abort the scan with bufio.ErrTooLong.
+	scanner.Buffer(make([]byte, 0, 64*1024), 4*1024*1024)
+
+	inCodeBlock := false
+	inCodeGroup := false
+	var currentFile string
+	var content strings.Builder
+	count := 0
+	codeGroupNum := 0
+
+	for scanner.Scan() {
+		line := scanner.Text()
+
+		// Track CodeGroup boundaries. Each opening tag starts a new numbered
+		// group; the number is used as the filename prefix below.
+		if strings.HasPrefix(line, "<CodeGroup") {
+			inCodeGroup = true
+			codeGroupNum++
+			continue
+		}
+		if strings.HasPrefix(line, "</CodeGroup>") {
+			inCodeGroup = false
+			continue
+		}
+
+		if !inCodeBlock {
+			matches := codeBlockStart.FindStringSubmatch(line)
+			if matches == nil {
+				continue
+			}
+			inCodeBlock = true
+			// Keep only the final path element so path separators in a fence
+			// filename cannot steer the write into another directory.
+			filename := filepath.Base(matches[2])
+			// Prefix with CodeGroup number if inside a CodeGroup
+			if inCodeGroup {
+				currentFile = fmt.Sprintf("%02d_%s", codeGroupNum, filename)
+			} else {
+				currentFile = filename
+			}
+			content.Reset()
+			continue
+		}
+
+		if line != "```" {
+			content.WriteString(line)
+			content.WriteString("\n")
+			continue
+		}
+
+		// End of code block - write file
+		outPath := filepath.Join(tempDir, currentFile)
+		if err := os.WriteFile(outPath, []byte(content.String()), 0o644); err != nil {
+			fmt.Fprintf(os.Stderr, "Error writing %s: %v\n", currentFile, err)
+		} else {
+			fmt.Printf("  - %s\n", currentFile)
+			count++
+		}
+		inCodeBlock = false
+		currentFile = ""
+		content.Reset()
+	}
+
+	if err := scanner.Err(); err != nil {
+		fmt.Fprintf(os.Stderr, "Error reading file: %v\n", err)
+		os.Exit(1)
+	}
+
+	if inCodeBlock {
+		// The input ended before the closing fence, so nothing was written
+		// for this block; say so instead of dropping it silently.
+		fmt.Fprintf(os.Stderr, "Warning: unterminated code block for %s was skipped\n", currentFile)
+	}
+
+	// Write package.json for JavaScript dependencies
+	packageJSON := `{
+  "name": "mdx-examples",
+  "type": "module",
+  "dependencies": {
+    "openai": "^4",
+    "ollama": "^0.5"
+  }
+}
+`
+	if err := os.WriteFile(filepath.Join(tempDir, "package.json"), []byte(packageJSON), 0o644); err != nil {
+		fmt.Fprintf(os.Stderr, "Error writing package.json: %v\n", err)
+	}
+
+	// Write pyproject.toml for Python dependencies
+	pyprojectTOML := `[project]
+name = "mdx-examples"
+version = "0.0.0"
+dependencies = [
+    "openai",
+    "ollama",
+]
+`
+	if err := os.WriteFile(filepath.Join(tempDir, "pyproject.toml"), []byte(pyprojectTOML), 0o644); err != nil {
+		fmt.Fprintf(os.Stderr, "Error writing pyproject.toml: %v\n", err)
+	}
+
+	fmt.Printf("\n")
+	fmt.Printf("Extracted %d file(s) to %s\n", count, tempDir)
+	fmt.Printf("\n")
+	fmt.Printf("To run examples:\n")
+	fmt.Printf("\n")
+	fmt.Printf("  cd %s\n  npm install   # for JS examples\n", tempDir)
+	fmt.Printf("\n")
+	fmt.Printf("then run individual files with `node file.js`, `python file.py`, `bash file.sh`\n")
+}
--- a/docs/troubleshooting.mdx
+++ b/docs/troubleshooting.mdx
@@ -87,7 +87,7 @@ When Ollama starts up, it takes inventory of the GPUs present in the system to d

 ### Linux NVIDIA Troubleshooting

-If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker.md](./docker.md)
+If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker](./docker)

 Sometimes the Ollama can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem

--- a/fs/config.go
+++ b/fs/config.go
@@ -1,5 +1,7 @@
 package fs

+import "iter"
+
 type Config interface {
 	Architecture() string
 	String(string, ...string) string
@@ -11,4 +13,8 @@ type Config interface {
 	Ints(string, ...[]int32) []int32
 	Floats(string, ...[]float32) []float32
 	Bools(string, ...[]bool) []bool
+
+	Len() int
+	Keys() iter.Seq[string]
+	Value(key string) any
 }
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -6,13 +6,16 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"iter"
 	"log/slog"
+	"maps"
 	"math"
 	"slices"
 	"strings"

 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/util/bufioutil"
+	"github.com/ollama/ollama/ml"
 )

 type GGML struct {
@@ -238,14 +241,31 @@ func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
 	return val.values
 }

+// Len returns the number of entries in kv.
+func (kv KV) Len() int {
+	return len(kv)
+}
+
+// Keys returns an iterator over the keys of kv, as produced by maps.Keys.
+// The iteration order is not specified; sort the result when a stable order
+// is required.
+func (kv KV) Keys() iter.Seq[string] {
+	return maps.Keys(kv)
+}
+
+// Value returns the entry stored in kv under key, looked up exactly as given.
+// NOTE(review): a missing key presumably yields nil, since this is a plain
+// index into kv — confirm against KV's definition.
+func (kv KV) Value(key string) any {
+	return kv[key]
+}
+
 func (kv KV) OllamaEngineRequired() bool {
 	return slices.Contains([]string{
+		"bert",
+		"deepseek2",
+		"deepseekocr",
 		"gemma3",
 		"gemma3n",
 		"gptoss", "gpt-oss",
 		"llama4",
 		"mistral3",
 		"mllama",
+		"nomic-bert",
+		"olmo3",
 		"qwen25vl",
 		"qwen3", "qwen3moe",
 		"qwen3vl", "qwen3vlmoe",
@@ -547,7 +567,7 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
 	}, nil
 }

-func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
+func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) {
 	context *= uint64(numParallel)

 	embedding := f.KV().EmbeddingLength()
@@ -788,7 +808,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 		}

 		partialOffload = 2 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
-		if useFlashAttention {
+		if useFlashAttention == ml.FlashAttentionEnabled {
 			// rough estimate of graph size with flash attention on
 			partialOffload = (4*uint64(numParallel) + context>>10 + 110) * format.MebiByte
 		}
@@ -806,6 +826,14 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
 	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
 }

+// KVCacheTypeIsQuantized reports whether cacheType names a quantized KV cache
+// format. The empty string and the float types "f16", "f32" and "bf16" are not
+// quantized; every other value is treated as quantized.
+func (f GGML) KVCacheTypeIsQuantized(cacheType string) bool {
+	switch cacheType {
+	case "", "f16", "f32", "bf16":
+		return false
+	default:
+		return true
+	}
+}
+
 // SupportsFlashAttention checks if the model supports flash attention
 func (f GGML) SupportsFlashAttention() bool {
 	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
@@ -826,8 +854,11 @@ func (f GGML) SupportsFlashAttention() bool {
 // FlashAttention checks if the model should enable flash attention
 func (f GGML) FlashAttention() bool {
 	return slices.Contains([]string{
+		"bert",
 		"gemma3",
 		"gptoss", "gpt-oss",
+		"mistral3",
+		"olmo3",
 		"qwen3", "qwen3moe",
 		"qwen3vl", "qwen3vlmoe",
 	}, f.KV().String("general.architecture"))
--- a/fs/ggml/gguf.go
+++ b/fs/ggml/gguf.go
@@ -8,12 +8,12 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
-	"maps"
 	"os"
 	"runtime"
 	"slices"
 	"strings"

+	"github.com/ollama/ollama/fs"
 	"golang.org/x/sync/errgroup"
 )

@@ -305,7 +305,7 @@ func readGGUFV1StringsData(llm *gguf, r io.Reader, a *array[string]) (any, error

 			a.values[i] = e
 		} else {
-			discardGGUFString(llm, r)
+			_ = discardGGUFString(llm, r)
 		}
 	}

@@ -508,7 +508,7 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
 	return binary.Write(w, binary.LittleEndian, s)
 }

-func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
+func WriteGGUF(f *os.File, kv fs.Config, ts []*Tensor) error {
 	arch := kv.String("general.architecture")
 	if arch == "" {
 		return fmt.Errorf("architecture not set")
@@ -526,12 +526,12 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
 		return err
 	}

-	if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil {
+	if err := binary.Write(f, binary.LittleEndian, uint64(kv.Len())); err != nil {
 		return err
 	}

-	for _, key := range slices.Sorted(maps.Keys(kv)) {
-		if err := ggufWriteKV(f, arch, key, kv[key]); err != nil {
+	for _, key := range slices.Sorted(kv.Keys()) {
+		if err := ggufWriteKV(f, arch, key, kv.Value(key)); err != nil {
 			return err
 		}
 	}
@@ -568,7 +568,6 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
 	g.SetLimit(runtime.GOMAXPROCS(0))
 	// TODO consider reducing if tensors size * gomaxprocs is larger than free memory
 	for _, t := range ts {
-		t := t
 		w := io.NewOffsetWriter(f, offset+int64(t.Offset))
 		g.Go(func() error {
 			_, err := t.WriteTo(w)
@@ -598,6 +597,10 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {

 	var err error
 	switch v := v.(type) {
+	case int32:
+		err = writeGGUF(ws, ggufTypeInt32, v)
+	case int64:
+		err = writeGGUF(ws, ggufTypeInt64, v)
 	case uint32, FileType:
 		err = writeGGUF(ws, ggufTypeUint32, v)
 	case uint64:
@@ -612,6 +615,10 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
 		err = writeGGUFArray(ws, ggufTypeInt32, v)
 	case *array[int32]:
 		err = writeGGUFArray(ws, ggufTypeInt32, v.values)
+	case []int64:
+		err = writeGGUFArray(ws, ggufTypeInt64, v)
+	case *array[int64]:
+		err = writeGGUFArray(ws, ggufTypeInt64, v.values)
 	case []uint32:
 		err = writeGGUFArray(ws, ggufTypeUint32, v)
 	case *array[uint32]:
--- a/fs/ggml/gguf_test.go
+++ b/fs/ggml/gguf_test.go
@@ -42,6 +42,10 @@ func TestWriteGGUF(t *testing.T) {
 				"general.architecture": "test",
 				"general.alignment":    uint32(16),
 				"test.key":             "value",
+				"test.int32_key":       int32(-42),
+				"test.int64_key":       int64(-9223372036854775808),
+				"test.int32_array":     []int32{-1, 0, 1, 2147483647, -2147483648},
+				"test.int64_array":     []int64{-1, 0, 1, 9223372036854775807, -9223372036854775808},
 				"attention.key":        "value2",
 				"tokenizer.key":        "value3",
 				"adapter.key":          "value4",
@@ -55,7 +59,7 @@ func TestWriteGGUF(t *testing.T) {
 			}
 			defer r.Close()

-			ff, err := Decode(r, 0)
+			ff, err := Decode(r, -1)
 			if err != nil {
 				t.Fatal(err)
 			}
@@ -65,15 +69,19 @@ func TestWriteGGUF(t *testing.T) {
 				"general.alignment":       uint32(16),
 				"general.parameter_count": uint64(54),
 				"test.key":                "value",
+				"test.int32_key":          int32(-42),
+				"test.int64_key":          int64(-9223372036854775808),
+				"test.int32_array":        &array[int32]{size: 5, values: []int32{-1, 0, 1, 2147483647, -2147483648}},
+				"test.int64_array":        &array[int64]{size: 5, values: []int64{-1, 0, 1, 9223372036854775807, -9223372036854775808}},
 				"test.attention.key":      "value2",
 				"tokenizer.key":           "value3",
 				"adapter.key":             "value4",
-			}, ff.KV()); diff != "" {
+			}, ff.KV(), cmp.AllowUnexported(array[int32]{}, array[int64]{})); diff != "" {
 				t.Errorf("Mismatch (-want +got):\n%s", diff)
 			}

 			if diff := cmp.Diff(Tensors{
-				Offset: 800,
+				Offset: 992,
 				items: []*Tensor{
 					{Name: "blk.0.attn_k.weight", Offset: 0, Shape: []uint64{2, 3}},
 					{Name: "blk.0.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
--- a/go.mod
+++ b/go.mod
@@ -15,9 +15,8 @@ require (
 	github.com/spf13/cobra v1.7.0
 	github.com/stretchr/testify v1.9.0
 	github.com/x448/float16 v0.8.4
-	golang.org/x/sync v0.12.0
-	golang.org/x/sys v0.36.0
-
+	golang.org/x/sync v0.17.0
+	golang.org/x/sys v0.37.0
 )

 require (
@@ -29,13 +28,17 @@ require (
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
 	github.com/tkrajina/typescriptify-golang-structs v0.2.0
+	github.com/wk8/go-ordered-map/v2 v2.1.8
 	golang.org/x/image v0.22.0
-	golang.org/x/tools v0.30.0
+	golang.org/x/mod v0.30.0
+	golang.org/x/tools v0.38.0
 	gonum.org/v1/gonum v0.15.0
 )

 require (
 	github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect
+	github.com/bahlo/generic-list-go v0.2.0 // indirect
+	github.com/buger/jsonparser v1.1.1 // indirect
 	github.com/bytedance/sonic/loader v0.1.1 // indirect
 	github.com/chewxy/hm v1.0.0 // indirect
 	github.com/chewxy/math32 v1.11.0 // indirect
@@ -45,6 +48,7 @@ require (
 	github.com/gogo/protobuf v1.3.2 // indirect
 	github.com/google/flatbuffers v24.3.25+incompatible // indirect
 	github.com/kr/text v0.2.0 // indirect
+	github.com/mailru/easyjson v0.7.7 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/rivo/uniseg v0.2.0 // indirect
@@ -77,11 +81,11 @@ require (
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
 	github.com/ugorji/go/codec v1.2.12 // indirect
 	golang.org/x/arch v0.8.0 // indirect
-	golang.org/x/crypto v0.36.0
+	golang.org/x/crypto v0.43.0
 	golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect
-	golang.org/x/net v0.38.0 // indirect
-	golang.org/x/term v0.30.0
-	golang.org/x/text v0.23.0
+	golang.org/x/net v0.46.0 // indirect
+	golang.org/x/term v0.36.0
+	golang.org/x/text v0.30.0
 	google.golang.org/protobuf v1.34.1
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -14,7 +14,11 @@ github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 h1:q4dksr6IC
 github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40/go.mod h1:Q7yQnSMnLvcXlZ8RV+jwz/6y1rQTqbX6C82SndT52Zs=
 github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q=
 github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE=
+github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk=
+github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg=
 github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8=
+github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
+github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
 github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0=
 github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4=
 github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
@@ -123,6 +127,7 @@ github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+
 github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
 github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
+github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
 github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
@@ -143,6 +148,8 @@ github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+
 github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4=
 github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
 github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
+github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
+github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
@@ -207,6 +214,8 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
 github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
 github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
 github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
+github.com/wk8/go-ordered-map/v2 v2.1.8 h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/fJgbpc=
+github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw=
 github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
 github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
 github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
@@ -224,8 +233,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
 golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
-golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
+golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04=
+golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0=
 golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
@@ -255,6 +264,8 @@ golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzB
 golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
 golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
 golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk=
+golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc=
 golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -267,8 +278,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
 golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
 golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
 golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
-golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
-golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
+golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
+golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -278,8 +289,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
-golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
+golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
+golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
 golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -295,17 +306,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
 golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k=
-golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
+golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
-golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
+golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q=
+golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
-golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
+golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
+golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -319,8 +330,8 @@ golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapK
 golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
 golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
 golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
-golang.org/x/tools v0.30.0 h1:BgcpHewrV5AUp2G9MebG4XPFI1E2W41zU1SaqVA9vJY=
-golang.org/x/tools v0.30.0/go.mod h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY=
+golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ=
+golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
--- a/harmony/harmonyparser.go
+++ b/harmony/harmonyparser.go
@@ -388,9 +388,9 @@ func NewFunctionNameMap() *FunctionNameMap {
 	}
 }

-// Init initializes the handler with tools and optional last message
+// Init initializes the handler with tools, optional last message, and think value
 // Implements the Parser interface
-func (h *HarmonyMessageHandler) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
+func (h *HarmonyMessageHandler) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
 	// Initialize the harmony parser
 	if h.HarmonyParser == nil {
 		h.HarmonyParser = &HarmonyParser{
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -4,7 +4,9 @@ package integration

 import (
 	"context"
+	"errors"
 	"math"
+	"strings"
 	"testing"
 	"time"

@@ -204,8 +206,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected %v, got %v (similarity: %f)", expected[0:5], res.Embeddings[0][0:5], sim)
 	}

-	if res.PromptEvalCount != 6 {
-		t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 8 {
+		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }

@@ -251,8 +253,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected %v, got %v (similarity: %f)", expected[1][0:5], res.Embeddings[1][0:5], sim)
 	}

-	if res.PromptEvalCount != 12 {
-		t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 16 {
+		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }

@@ -275,7 +277,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
 	cases := []struct {
 		name    string
 		request api.EmbedRequest
-		check   func(*api.EmbedResponse, error)
+		check   func(*testing.T, *api.EmbedResponse, error)
 	}{
 		{
 			name: "target truncation",
@@ -283,7 +285,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
 				Model: "all-minilm",
 				Input: "why",
 			},
-			check: func(got *api.EmbedResponse, err error) {
+			check: func(t *testing.T, got *api.EmbedResponse, err error) {
 				if err != nil {
 					t.Fatal(err)
 				}
@@ -300,10 +302,11 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
 				Input:   "why is the sky blue?",
 				Options: map[string]any{"num_ctx": 3},
 			},
-			check: func(got *api.EmbedResponse, err error) {
+			check: func(t *testing.T, got *api.EmbedResponse, err error) {
 				if err != nil {
 					t.Fatal(err)
 				}
+				t.Logf("PromptEvalCount: want=%d got=%d", want.PromptEvalCount, got.PromptEvalCount)
 				if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
 					t.Errorf("embedding mismatch (-want +got):\n%s", diff)
 				}
@@ -317,10 +320,11 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
 				Truncate: &truncTrue,
 				Options:  map[string]any{"num_ctx": 3},
 			},
-			check: func(got *api.EmbedResponse, err error) {
+			check: func(t *testing.T, got *api.EmbedResponse, err error) {
 				if err != nil {
 					t.Fatal(err)
 				}
+				t.Logf("PromptEvalCount: want=%d got=%d", want.PromptEvalCount, got.PromptEvalCount)
 				if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
 					t.Errorf("embedding mismatch (-want +got):\n%s", diff)
 				}
@@ -334,21 +338,21 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
 				Truncate: &truncFalse,
 				Options:  map[string]any{"num_ctx": 3},
 			},
-			check: func(res *api.EmbedResponse, err error) {
-				if err.Error() != "input exceeds maximum context length" {
+			check: func(t *testing.T, res *api.EmbedResponse, err error) {
+				if err.Error() != "the input length exceeds the context length" {
 					t.Fatalf("expected truncation error, got: %v", err)
 				}
 			},
 		},
 		{
-			name: "input after truncate error",
+			name: "input after truncate error with context length of 1",
 			request: api.EmbedRequest{
 				Model:    "all-minilm",
 				Input:    "why is the sky blue?",
 				Truncate: &truncTrue,
 				Options:  map[string]any{"num_ctx": 1},
 			},
-			check: func(res *api.EmbedResponse, err error) {
+			check: func(t *testing.T, res *api.EmbedResponse, err error) {
 				if err.Error() != "input after truncation exceeds maximum context length" {
 					t.Fatalf("expected truncation error, got: %v", err)
 				}
@@ -362,7 +366,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
 				Truncate: &truncTrue,
 				Options:  map[string]any{"num_ctx": 0},
 			},
-			check: func(res *api.EmbedResponse, err error) {
+			check: func(t *testing.T, res *api.EmbedResponse, err error) {
 				if err.Error() != "input after truncation exceeds maximum context length" {
 					t.Fatalf("expected truncation error, got: %v", err)
 				}
@@ -375,7 +379,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
 				Input:   "why is the sky blue? Why is the sky blue? hi there my",
 				Options: map[string]any{"num_ctx": 16},
 			},
-			check: func(res *api.EmbedResponse, err error) {
+			check: func(t *testing.T, res *api.EmbedResponse, err error) {
 				if err != nil {
 					t.Fatal(err)
 				}
@@ -385,7 +389,8 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {

 	for _, req := range cases {
 		t.Run(req.name, func(t *testing.T) {
-			req.check(embedTestHelper(ctx, client, t, req.request))
+			resp, err := embedTestHelper(ctx, client, t, req.request)
+			req.check(t, resp, err)
 		})
 	}
 }
@@ -409,3 +414,230 @@ func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req

 	return client.Embed(ctx, &req)
 }
+
+// TestEmbedTruncation runs two embedding checks against every model in
+// libraryEmbedModels: a batched request with truncation enabled and a small
+// num_ctx, and a comparison of the token count of one input against the
+// token count of the same input sent three times in a batch.
+func TestEmbedTruncation(t *testing.T) {
+	// Use test deadline if set, otherwise default to 2 minutes
+	timeout := 2 * time.Minute
+	if deadline, ok := t.Deadline(); ok {
+		// NOTE(review): if the deadline is less than 10s away this value is
+		// zero or negative and ctx below is already expired.
+		timeout = time.Until(deadline) - 10*time.Second // Reserve 10s buffer
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	for _, model := range libraryEmbedModels {
+		model := model
+		t.Run(model, func(t *testing.T) {
+			// Check if we're running out of time (reserve 20s for current model)
+			if deadline, ok := t.Deadline(); ok && time.Until(deadline) < 20*time.Second {
+				t.Skip("skipping remaining tests to avoid timeout")
+			}
+
+			// Give each model its own budget to account for first-time pulls/loads
+			// (mctx is derived from ctx, so it expires at whichever deadline comes first)
+			mctx, mcancel := context.WithTimeout(ctx, 3*time.Minute)
+			defer mcancel()
+
+			t.Run("truncation batch", func(t *testing.T) {
+				truncTrue := true
+				req := api.EmbedRequest{
+					Model:    model,
+					Input:    []string{"short", strings.Repeat("long ", 100), "medium text"},
+					Truncate: &truncTrue,
+					Options:  map[string]any{"num_ctx": 30},
+				}
+
+				res, err := embedTestHelper(mctx, client, t, req)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				if len(res.Embeddings) != 3 {
+					t.Fatalf("expected 3 embeddings, got %d", len(res.Embeddings))
+				}
+
+				// Three inputs, each limited to num_ctx (30) tokens.
+				if res.PromptEvalCount > 90 {
+					t.Fatalf("expected tokens <= 90 (3 × 30 max), got %d", res.PromptEvalCount)
+				}
+			})
+
+			t.Run("runner token count accuracy", func(t *testing.T) {
+				baseline := api.EmbedRequest{Model: model, Input: "test"}
+				baseRes, err := embedTestHelper(mctx, client, t, baseline)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				batch := api.EmbedRequest{
+					Model: model,
+					Input: []string{"test", "test", "test"},
+				}
+				batchRes, err := embedTestHelper(mctx, client, t, batch)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				// Accept the batched count within ±2 tokens of three times the
+				// single-input count.
+				expectedCount := baseRes.PromptEvalCount * 3
+				if batchRes.PromptEvalCount < expectedCount-2 || batchRes.PromptEvalCount > expectedCount+2 {
+					t.Fatalf("expected ~%d tokens (3 × %d), got %d",
+						expectedCount, baseRes.PromptEvalCount, batchRes.PromptEvalCount)
+				}
+			})
+		})
+	}
+}
+
+// TestEmbedLargeInput tests that embedding models can handle large inputs that would exceed typical batch sizes.
+// For every model in libraryEmbedModels it embeds inputs of 256, 512 and 800
+// repetitions of "word" and requires exactly one non-empty embedding each time.
+func TestEmbedLargeInput(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	for _, model := range libraryEmbedModels {
+		model := model
+		t.Run(model, func(t *testing.T) {
+			// Per-model budget; mctx is derived from ctx, so it also ends when ctx does.
+			mctx, mcancel := context.WithTimeout(ctx, 2*time.Minute)
+			defer mcancel()
+
+			// Test with progressively larger inputs
+			testCases := []struct {
+				name       string
+				inputWords int
+			}{
+				{"medium_input_256_words", 256},
+				{"large_input_512_words", 512},
+				{"very_large_input_800_words", 800},
+			}
+
+			for _, tc := range testCases {
+				t.Run(tc.name, func(t *testing.T) {
+					// Build the input: tc.inputWords copies of "word" joined by single spaces.
+					words := make([]string, tc.inputWords)
+					for i := range words {
+						words[i] = "word"
+					}
+					input := strings.Join(words, " ")
+
+					req := api.EmbedRequest{
+						Model:     model,
+						Input:     input,
+						KeepAlive: &api.Duration{Duration: 30 * time.Second},
+					}
+
+					res, err := embedTestHelper(mctx, client, t, req)
+					if err != nil {
+						t.Fatalf("embedding failed for %d words: %v", tc.inputWords, err)
+					}
+
+					if len(res.Embeddings) != 1 {
+						t.Fatalf("expected 1 embedding, got %d", len(res.Embeddings))
+					}
+
+					if len(res.Embeddings[0]) == 0 {
+						t.Fatal("expected non-empty embedding")
+					}
+
+					t.Logf("Successfully embedded %d words (%d tokens)", tc.inputWords, res.PromptEvalCount)
+				})
+			}
+		})
+	}
+}
+
+// TestEmbedStatusCode tests that errors from the embedding endpoint
+// properly preserve their HTTP status codes when returned to the client.
+// This test specifically checks the error handling path in EmbedHandler
+// where api.StatusError errors should maintain their original status code.
+// Both subtests send input longer than num_ctx with truncation disabled and
+// expect an api.StatusError whose status code is in the 4xx range.
+func TestEmbedStatusCode(t *testing.T) {
+	// Use test deadline if set, otherwise default to 2 minutes
+	timeout := 2 * time.Minute
+	if deadline, ok := t.Deadline(); ok {
+		// NOTE(review): if the deadline is less than 10s away this value is
+		// zero or negative and ctx below is already expired.
+		timeout = time.Until(deadline) - 10*time.Second // Reserve 10s buffer
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	for _, model := range libraryEmbedModels {
+		model := model
+		t.Run(model, func(t *testing.T) {
+			// Check if we're running out of time (reserve 20s for current model)
+			if deadline, ok := t.Deadline(); ok && time.Until(deadline) < 20*time.Second {
+				t.Skip("skipping remaining tests to avoid timeout")
+			}
+
+			// mctx is derived from ctx, so it expires at whichever deadline comes first.
+			mctx, mcancel := context.WithTimeout(ctx, 3*time.Minute)
+			defer mcancel()
+
+			// Pull the model if needed
+			if err := PullIfMissing(mctx, client, model); err != nil {
+				t.Fatal(err)
+			}
+
+			t.Run("truncation error status code", func(t *testing.T) {
+				truncFalse := false
+				longInput := strings.Repeat("word ", 100)
+
+				req := api.EmbedRequest{
+					Model:    model,
+					Input:    longInput,
+					Truncate: &truncFalse,
+					Options:  map[string]any{"num_ctx": 10},
+				}
+
+				_, err := embedTestHelper(mctx, client, t, req)
+				if err == nil {
+					t.Fatal("expected error when truncate=false with long input")
+				}
+
+				// Check that it's a StatusError with the correct status code
+				var statusErr api.StatusError
+				if !errors.As(err, &statusErr) {
+					t.Fatalf("expected api.StatusError, got %T: %v", err, err)
+				}
+
+				// The error should be a 4xx client error (likely 400 Bad Request)
+				// not a 500 Internal Server Error
+				if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
+					t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
+				}
+
+				// Verify the error message is meaningful
+				if !strings.Contains(err.Error(), "context length") {
+					t.Errorf("expected error message to mention context length, got: %v", err)
+				}
+			})
+
+			t.Run("batch truncation error status code", func(t *testing.T) {
+				truncFalse := false
+				// Only the middle input is long; the other two are short.
+				req := api.EmbedRequest{
+					Model: model,
+					Input: []string{
+						"short input",
+						strings.Repeat("very long input ", 100),
+						"another short input",
+					},
+					Truncate: &truncFalse,
+					Options:  map[string]any{"num_ctx": 10},
+				}
+
+				_, err := embedTestHelper(mctx, client, t, req)
+				if err == nil {
+					t.Fatal("expected error when one input exceeds context with truncate=false")
+				}
+
+				// Check that it's a StatusError with the correct status code
+				var statusErr api.StatusError
+				if !errors.As(err, &statusErr) {
+					t.Fatalf("expected api.StatusError, got %T: %v", err, err)
+				}
+
+				// The error should be a 4xx client error, not a 500 Internal Server Error
+				if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
+					t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
+				}
+			})
+		})
+	}
+}
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@@ -33,6 +33,9 @@ func TestVisionModels(t *testing.T) {
 			// Qwen 3 VL mixture of experts
 			model: "qwen3-vl:30b",
 		},
+		{
+			model: "ministral-3",
+		},
 	}

 	for _, v := range testCases {
--- a/integration/tools_test.go
+++ b/integration/tools_test.go
@@ -11,6 +11,15 @@ import (
 	"github.com/ollama/ollama/api"
 )

+// testPropsMap is a test helper that copies every entry of m into a freshly
+// created api.ToolPropertiesMap. Entries are added while ranging over a Go
+// map, so the order in which they are set is unspecified.
+func testPropsMap(m map[string]api.ToolProperty) *api.ToolPropertiesMap {
+	result := api.NewToolPropertiesMap()
+	for name, prop := range m {
+		result.Set(name, prop)
+	}
+	return result
+}
+
 func TestAPIToolCalling(t *testing.T) {
 	initialTimeout := 60 * time.Second
 	streamTimeout := 60 * time.Second
@@ -30,6 +39,7 @@ func TestAPIToolCalling(t *testing.T) {
 		"mistral":       6,
 		"qwen2.5":       6,
 		"qwen2":         6,
+		"ministral-3":   20,
 		"mistral-nemo":  9,
 		"mistral-small": 16,
 		"mixtral:8x22b": 80,
@@ -56,12 +66,12 @@ func TestAPIToolCalling(t *testing.T) {
 						Parameters: api.ToolFunctionParameters{
 							Type:     "object",
 							Required: []string{"location"},
-							Properties: map[string]api.ToolProperty{
+							Properties: testPropsMap(map[string]api.ToolProperty{
 								"location": {
 									Type:        api.PropertyType{"string"},
 									Description: "The city and state, e.g. San Francisco, CA",
 								},
-							},
+							}),
 						},
 					},
 				},
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -38,6 +38,7 @@ var (

 	// Note: add newer models at the top of the list to test them first
 	ollamaEngineChatModels = []string{
+		"ministral-3",
 		"qwen3-coder:30b",
 		"gpt-oss:20b",
 		"gemma3n:e2b",
@@ -167,6 +168,7 @@ var (
 		"medllama2",
 		"megadolphin",
 		"minicpm-v",
+		"ministral-3",
 		"mistral-large",
 		"mistral-nemo",
 		"mistral-openorca",
@@ -270,6 +272,7 @@ var (
 		"mistral",
 		"qwen2.5",
 		"qwen2",
+		"ministral-3",
 		"mistral-nemo",
 		"mistral-small",
 		"mixtral:8x22b",
--- a/internal/orderedmap/orderedmap.go
+++ b/internal/orderedmap/orderedmap.go
@@ -0,0 +1,94 @@
+// Package orderedmap provides a generic ordered map that maintains insertion order.
+// It wraps github.com/wk8/go-ordered-map/v2 to encapsulate the dependency.
+package orderedmap
+
+import (
+	"encoding/json"
+	"iter"
+
+	orderedmap "github.com/wk8/go-ordered-map/v2"
+)
+
+// Map is a generic ordered map that maintains insertion order.
+// It is a thin wrapper around orderedmap.OrderedMap. Get, Len, All, ToMap and
+// MarshalJSON treat a nil *Map or a nil om as an empty map.
+type Map[K comparable, V any] struct {
+	// om is the wrapped map; it may be nil, in which case Set allocates it
+	// before storing the first entry.
+	om *orderedmap.OrderedMap[K, V]
+}
+
+// New returns a pointer to an empty ordered map whose backing map is
+// already allocated.
+func New[K comparable, V any]() *Map[K, V] {
+	m := new(Map[K, V])
+	m.om = orderedmap.New[K, V]()
+	return m
+}
+
+// Get looks up key and reports whether it was present. A nil or
+// uninitialized map yields the zero value of V and false.
+func (m *Map[K, V]) Get(key K) (V, bool) {
+	if m != nil && m.om != nil {
+		return m.om.Get(key)
+	}
+	var zero V
+	return zero, false
+}
+
+// Set stores value under key. An existing key keeps its place in the
+// iteration order and only has its value replaced; a new key is appended to
+// the end. Calling Set on a nil *Map does nothing.
+func (m *Map[K, V]) Set(key K, value V) {
+	switch {
+	case m == nil:
+		return
+	case m.om == nil:
+		// Allocate the backing map on first use.
+		m.om = orderedmap.New[K, V]()
+	}
+	m.om.Set(key, value)
+}
+
+// Len reports how many entries the map holds; a nil or uninitialized map
+// has length 0.
+func (m *Map[K, V]) Len() int {
+	if m != nil && m.om != nil {
+		return m.om.Len()
+	}
+	return 0
+}
+
+// All returns an iterator that yields every key-value pair in insertion
+// order and stops as soon as yield returns false. The nil checks run when the
+// iterator is invoked, so a nil or uninitialized map yields nothing.
+func (m *Map[K, V]) All() iter.Seq2[K, V] {
+	return func(yield func(K, V) bool) {
+		if m == nil || m.om == nil {
+			return
+		}
+		pair := m.om.Oldest()
+		for pair != nil && yield(pair.Key, pair.Value) {
+			pair = pair.Next()
+		}
+	}
+}
+
+// ToMap copies the entries into a built-in Go map, which does not keep the
+// insertion order. It returns nil for a nil or uninitialized map.
+func (m *Map[K, V]) ToMap() map[K]V {
+	if m == nil || m.om == nil {
+		return nil
+	}
+	out := make(map[K]V, m.Len())
+	for k, v := range m.All() {
+		out[k] = v
+	}
+	return out
+}
+
+// MarshalJSON implements json.Marshaler by encoding the wrapped ordered map,
+// so the JSON output preserves key order. A nil or uninitialized map is
+// encoded as null.
+func (m *Map[K, V]) MarshalJSON() ([]byte, error) {
+	if m != nil && m.om != nil {
+		return json.Marshal(m.om)
+	}
+	return []byte("null"), nil
+}
+
+// UnmarshalJSON implements json.Unmarshaler. The insertion order matches the
+// order of keys in the JSON input.
+//
+// The data is decoded into a fresh map that replaces the current contents
+// only when decoding succeeds; on error the receiver is left untouched.
+// Calling it on a nil *Map returns a *json.InvalidUnmarshalError instead of
+// panicking.
+func (m *Map[K, V]) UnmarshalJSON(data []byte) error {
+	if m == nil {
+		// There is no receiver to store the result in.
+		return &json.InvalidUnmarshalError{}
+	}
+	// Decode into a scratch map so a failed decode cannot clobber or
+	// partially overwrite the existing entries.
+	om := orderedmap.New[K, V]()
+	if err := json.Unmarshal(data, &om); err != nil {
+		return err
+	}
+	m.om = om
+	return nil
+}
--- a/internal/orderedmap/orderedmap_test.go
+++ b/internal/orderedmap/orderedmap_test.go
@@ -0,0 +1,348 @@
+package orderedmap
+
+import (
+	"encoding/json"
+	"slices"
+	"testing"
+)
+
+// TestMap_BasicOperations covers Len, Get and Set on a freshly created map,
+// including overwriting an existing key.
+func TestMap_BasicOperations(t *testing.T) {
+	m := New[string, int]()
+
+	// A new map is empty and lookups miss with the zero value.
+	if n := m.Len(); n != 0 {
+		t.Errorf("expected Len() = 0, got %d", n)
+	}
+	v, ok := m.Get("a")
+	if ok {
+		t.Error("expected Get on empty map to return false")
+	}
+	if v != 0 {
+		t.Errorf("expected zero value, got %d", v)
+	}
+
+	// Populate three keys and read each one back.
+	m.Set("a", 1)
+	m.Set("b", 2)
+	m.Set("c", 3)
+
+	if n := m.Len(); n != 3 {
+		t.Errorf("expected Len() = 3, got %d", n)
+	}
+
+	if v, ok = m.Get("a"); !ok || v != 1 {
+		t.Errorf("expected Get(a) = (1, true), got (%d, %v)", v, ok)
+	}
+
+	if v, ok = m.Get("b"); !ok || v != 2 {
+		t.Errorf("expected Get(b) = (2, true), got (%d, %v)", v, ok)
+	}
+
+	if v, ok = m.Get("c"); !ok || v != 3 {
+		t.Errorf("expected Get(c) = (3, true), got (%d, %v)", v, ok)
+	}
+
+	// Overwriting a key changes its value without adding an entry.
+	m.Set("a", 10)
+	if v, ok = m.Get("a"); !ok || v != 10 {
+		t.Errorf("expected Get(a) = (10, true), got (%d, %v)", v, ok)
+	}
+	if n := m.Len(); n != 3 {
+		t.Errorf("expected Len() = 3 after update, got %d", n)
+	}
+}
+
+// TestMap_InsertionOrderPreserved checks that All yields keys and values in
+// the order they were inserted rather than in sorted order.
+func TestMap_InsertionOrderPreserved(t *testing.T) {
+	wantKeys := []string{"z", "a", "m", "b"}
+	wantValues := []int{1, 2, 3, 4}
+
+	// Fill the map in a deliberately non-alphabetical order.
+	m := New[string, int]()
+	for i, k := range wantKeys {
+		m.Set(k, wantValues[i])
+	}
+
+	var gotKeys []string
+	var gotValues []int
+	for k, v := range m.All() {
+		gotKeys = append(gotKeys, k)
+		gotValues = append(gotValues, v)
+	}
+
+	if !slices.Equal(gotKeys, wantKeys) {
+		t.Errorf("expected keys %v, got %v", wantKeys, gotKeys)
+	}
+	if !slices.Equal(gotValues, wantValues) {
+		t.Errorf("expected values %v, got %v", wantValues, gotValues)
+	}
+}
+
+// TestMap_UpdatePreservesPosition checks that overwriting an existing key
+// leaves the iteration order unchanged.
+func TestMap_UpdatePreservesPosition(t *testing.T) {
+	m := New[string, int]()
+	for i, k := range []string{"first", "second", "third"} {
+		m.Set(k, i+1)
+	}
+
+	// Overwrite the middle entry.
+	m.Set("second", 20)
+
+	var got []string
+	for k := range m.All() {
+		got = append(got, k)
+	}
+
+	// The key order must be the original insertion order.
+	want := []string{"first", "second", "third"}
+	if !slices.Equal(got, want) {
+		t.Errorf("expected keys %v, got %v", want, got)
+	}
+}
+
+// TestMap_MarshalJSON_PreservesOrder checks that the encoded JSON lists keys
+// in insertion order, not alphabetically.
+func TestMap_MarshalJSON_PreservesOrder(t *testing.T) {
+	const want = `{"z":1,"a":2,"m":3}`
+
+	// Insert in non-alphabetical order.
+	m := New[string, int]()
+	m.Set("z", 1)
+	m.Set("a", 2)
+	m.Set("m", 3)
+
+	got, err := json.Marshal(m)
+	if err != nil {
+		t.Fatalf("Marshal failed: %v", err)
+	}
+	if string(got) != want {
+		t.Errorf("expected %s, got %s", want, string(got))
+	}
+}
+
+// TestMap_UnmarshalJSON_PreservesOrder checks that decoding keeps the key
+// order found in the JSON document.
+func TestMap_UnmarshalJSON_PreservesOrder(t *testing.T) {
+	// The keys in the input are deliberately not in alphabetical order.
+	input := []byte(`{"z":1,"a":2,"m":3}`)
+
+	m := New[string, int]()
+	err := json.Unmarshal(input, m)
+	if err != nil {
+		t.Fatalf("Unmarshal failed: %v", err)
+	}
+
+	var got []string
+	for k := range m.All() {
+		got = append(got, k)
+	}
+
+	want := []string{"z", "a", "m"}
+	if !slices.Equal(got, want) {
+		t.Errorf("expected keys %v, got %v", want, got)
+	}
+}
+
+// TestMap_JSONRoundTrip checks that decoding a document and encoding it
+// again reproduces the input byte for byte.
+func TestMap_JSONRoundTrip(t *testing.T) {
+	const original = `{"zebra":"z","apple":"a","mango":"m","banana":"b"}`
+
+	m := New[string, string]()
+	err := json.Unmarshal([]byte(original), m)
+	if err != nil {
+		t.Fatalf("Unmarshal failed: %v", err)
+	}
+
+	encoded, err := json.Marshal(m)
+	if err != nil {
+		t.Fatalf("Marshal failed: %v", err)
+	}
+
+	if got := string(encoded); got != original {
+		t.Errorf("round trip failed: expected %s, got %s", original, got)
+	}
+}
+
+// TestMap_ToMap checks that ToMap returns a built-in map holding every entry.
+func TestMap_ToMap(t *testing.T) {
+	m := New[string, int]()
+	m.Set("a", 1)
+	m.Set("b", 2)
+
+	plain := m.ToMap()
+
+	if n := len(plain); n != 2 {
+		t.Errorf("expected len 2, got %d", n)
+	}
+	if got := plain["a"]; got != 1 {
+		t.Errorf("expected regular[a] = 1, got %d", got)
+	}
+	if got := plain["b"]; got != 2 {
+		t.Errorf("expected regular[b] = 2, got %d", got)
+	}
+}
+
+// TestMap_NilSafety checks that Len, Get, Set, All, ToMap and JSON encoding
+// all work on a nil *Map without panicking.
+func TestMap_NilSafety(t *testing.T) {
+	var m *Map[string, int]
+
+	// Len on nil reports zero.
+	if n := m.Len(); n != 0 {
+		t.Errorf("expected Len() = 0 on nil map, got %d", n)
+	}
+
+	// Get on nil misses with the zero value.
+	got, found := m.Get("a")
+	if found {
+		t.Error("expected Get on nil map to return false")
+	}
+	if got != 0 {
+		t.Errorf("expected zero value from nil map, got %d", got)
+	}
+
+	// Set on nil is a no-op
+	m.Set("a", 1)
+	if n := m.Len(); n != 0 {
+		t.Errorf("expected Len() = 0 after Set on nil, got %d", n)
+	}
+
+	// All yields nothing.
+	var seen []string
+	for k := range m.All() {
+		seen = append(seen, k)
+	}
+	if len(seen) != 0 {
+		t.Errorf("expected empty iteration on nil map, got %v", seen)
+	}
+
+	// ToMap returns nil
+	if plain := m.ToMap(); plain != nil {
+		t.Error("expected ToMap to return nil on nil map")
+	}
+
+	// Encoding a nil *Map produces JSON null.
+	encoded, err := json.Marshal(m)
+	if err != nil {
+		t.Fatalf("Marshal failed: %v", err)
+	}
+	if s := string(encoded); s != "null" {
+		t.Errorf("expected null, got %s", s)
+	}
+}
+
+// TestMap_EmptyMapMarshal checks that an empty, initialized map encodes as
+// an empty JSON object.
+func TestMap_EmptyMapMarshal(t *testing.T) {
+	encoded, err := json.Marshal(New[string, int]())
+	if err != nil {
+		t.Fatalf("Marshal failed: %v", err)
+	}
+	if got := string(encoded); got != "{}" {
+		t.Errorf("expected {}, got %s", got)
+	}
+}
+
+// TestMap_NestedValues checks JSON encoding of a map whose values have mixed
+// types, including a nested built-in map.
+func TestMap_NestedValues(t *testing.T) {
+	const want = `{"string":"hello","number":42,"bool":true,"nested":{"x":1}}`
+
+	m := New[string, any]()
+	m.Set("string", "hello")
+	m.Set("number", 42)
+	m.Set("bool", true)
+	m.Set("nested", map[string]int{"x": 1})
+
+	encoded, err := json.Marshal(m)
+	if err != nil {
+		t.Fatalf("Marshal failed: %v", err)
+	}
+
+	if got := string(encoded); got != want {
+		t.Errorf("expected %s, got %s", want, got)
+	}
+}
+
+// TestMap_AllIteratorEarlyExit checks that breaking out of a range over All
+// stops the iteration after the entries consumed so far.
+func TestMap_AllIteratorEarlyExit(t *testing.T) {
+	m := New[string, int]()
+	for i, k := range []string{"a", "b", "c", "d"} {
+		m.Set(k, i+1)
+	}
+
+	// Stop after the first two keys.
+	var got []string
+	for k := range m.All() {
+		if got = append(got, k); len(got) == 2 {
+			break
+		}
+	}
+
+	want := []string{"a", "b"}
+	if !slices.Equal(got, want) {
+		t.Errorf("expected %v, got %v", want, got)
+	}
+}
+
+// TestMap_IntegerKeys checks that integer keys are iterated in insertion
+// order rather than numerical order.
+func TestMap_IntegerKeys(t *testing.T) {
+	m := New[int, string]()
+	m.Set(3, "three")
+	m.Set(1, "one")
+	m.Set(2, "two")
+
+	var got []int
+	for k := range m.All() {
+		got = append(got, k)
+	}
+
+	want := []int{3, 1, 2}
+	if !slices.Equal(got, want) {
+		t.Errorf("expected %v, got %v", want, got)
+	}
+}
+
+// TestMap_UnmarshalIntoExisting checks that decoding into a populated map
+// replaces its previous contents.
+func TestMap_UnmarshalIntoExisting(t *testing.T) {
+	m := New[string, int]()
+	m.Set("existing", 999)
+
+	// Decode a document that does not contain the pre-existing key.
+	err := json.Unmarshal([]byte(`{"new":1}`), m)
+	if err != nil {
+		t.Fatalf("Unmarshal failed: %v", err)
+	}
+
+	if _, stillThere := m.Get("existing"); stillThere {
+		t.Error("existing key should be gone after unmarshal")
+	}
+
+	if v, ok := m.Get("new"); !ok || v != 1 {
+		t.Errorf("expected Get(new) = (1, true), got (%d, %v)", v, ok)
+	}
+}
+
+// TestMap_LargeOrderPreservation inserts 100 distinct keys and checks that
+// All returns them in exactly the order they were inserted.
+func TestMap_LargeOrderPreservation(t *testing.T) {
+	m := New[string, int]()
+
+	// Build 100 unique keys in an order that is not sorted.
+	keys := make([]string, 100)
+	for i := range keys {
+		if i < 26 {
+			// Single-rune keys in descending code-point order, starting at rune('a'+99).
+			keys[i] = string(rune('a' + (99 - i)))
+			continue
+		}
+		// Two-rune keys; the first rune is different for every i, so no key repeats.
+		keys[i] = string(rune('A'+i-26)) + string(rune('a'+i%26))
+	}
+
+	for i, k := range keys {
+		m.Set(k, i)
+	}
+
+	// Verify order preserved
+	var resultKeys []string
+	for k := range m.All() {
+		resultKeys = append(resultKeys, k)
+	}
+
+	if !slices.Equal(keys, resultKeys) {
+		t.Error("large map should preserve insertion order")
+	}
+}
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -3,7 +3,6 @@ package kvcache
 import (
 	"errors"
 	"fmt"
-	"log/slog"
 	"math"
 	"slices"

@@ -40,18 +39,18 @@ type Causal struct {

 	// ** current forward pass **

-	// the active layer for Get and Put
-	curLayer int
-
-	// starting location for data storage for this batch
-	curLoc int
-
 	// size of the current batch
 	curBatchSize int

+	// locations for data storage for this batch
+	curLoc ml.Tensor
+
 	// mask of the cache as used by this batch
 	curMask ml.Tensor

+	// the active layer for Get and Put
+	curLayer int
+
 	// locations in the cache that are needed for this batch
 	curCellRange cellRange

@@ -141,10 +140,6 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
 		c.config.CachePadding = 1
 	}

-	if c.config.MaskBatchPadding == 0 {
-		c.config.MaskBatchPadding = 1
-	}
-
 	if c.config.MaskDType == ml.DTypeOther {
 		c.config.MaskDType = ml.DTypeF32
 	}
@@ -206,45 +201,47 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 	c.curPositions = batch.Positions
 	c.opts.Except = nil

+	var locs []int32
 	if !reserve {
 		c.updateSlidingWindow()

 		var err error
-		c.curLoc, err = c.findStartLoc()
-		if errors.Is(err, ErrKvCacheFull) {
-			c.defrag()
-			c.curLoc, err = c.findStartLoc()
-		}
+		locs, err = c.findLocs()
 		if err != nil {
 			return err
 		}

 		for i, pos := range batch.Positions {
 			seq := batch.Sequences[i]
+			loc := int(locs[i])

-			c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}}
+			c.cells[loc] = cacheCell{pos: pos, sequences: []int{seq}}

 			seqRange, ok := c.cellRanges[seq]
 			if !ok {
 				seqRange = newRange()
 			}

-			seqRange.min = min(seqRange.min, c.curLoc+i)
-			c.curCellRange.min = min(c.curCellRange.min, c.curLoc+i)
+			seqRange.min = min(seqRange.min, loc)
+			c.curCellRange.min = min(c.curCellRange.min, loc)

-			seqRange.max = max(seqRange.max, c.curLoc+i)
-			c.curCellRange.max = max(c.curCellRange.max, c.curLoc+i)
+			seqRange.max = max(seqRange.max, loc)
+			c.curCellRange.max = max(c.curCellRange.max, loc)

 			c.cellRanges[seq] = seqRange
 		}
 	} else {
 		// If we are reserving memory, don't update any of the cache metadata but set the size
 		// to the worst case.
-		c.curLoc = 0
+		locs = make([]int32, c.curBatchSize)
+		for i := range locs {
+			locs[i] = int32(i)
+		}
 		c.curCellRange.min = 0
 		c.curCellRange.max = len(c.cells) - 1
 	}

+	c.curLoc = ctx.Input().FromInts(locs, len(locs))
 	c.curMask = c.buildMask(ctx)

 	return nil
@@ -257,22 +254,20 @@ func newRange() cellRange {
 	}
 }

-// Find the first contiguous block of at least curBatchSize
-func (c *Causal) findStartLoc() (int, error) {
-	var start, count int
+// findLocs returns the cache cell index where each token in the batch should be stored; the cells need not be contiguous
+func (c *Causal) findLocs() ([]int32, error) {
+	loc := make([]int32, 0, c.curBatchSize)
+
 	for i := range c.cells {
 		if len(c.cells[i].sequences) == 0 {
-			count++
-			if count >= c.curBatchSize {
-				return start, nil
+			loc = append(loc, int32(i))
+			if len(loc) >= c.curBatchSize {
+				return loc, nil
 			}
-		} else {
-			start = i + 1
-			count = 0
 		}
 	}

-	return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
+	return nil, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
 }

 func (c *Causal) updateSlidingWindow() {
@@ -365,15 +360,12 @@ func roundUp(length, pad int) int {
 // token in the history should apply. This is based on both the sequence and causality (the
 // position of the history is not ahead of the token in the batch).
 func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
-	// Align and pad the two dimensions as required by the backend
-	batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
-
 	c.curCellRange.min = roundDown(c.curCellRange.min, c.config.CachePadding)
 	c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1

 	length := c.curCellRange.max - c.curCellRange.min + 1

-	mask := make([]float32, batchSize*length)
+	mask := make([]float32, c.curBatchSize*length)

 	for i := range c.curBatchSize {
 		enabled := !slices.Contains(c.opts.Except, i)
@@ -387,13 +379,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 		}
 	}

-	// Mask out any padding tokens we added. For padding that we added to the cache history, this
-	// has already been masked out because the sequence doesn't match.
-	for i := c.curBatchSize * length; i < len(mask); i++ {
-		mask[i] = float32(math.Inf(-1))
-	}
-
-	maskTensor := ctx.Input().FromFloats(mask, length, batchSize)
+	maskTensor := ctx.Input().FromFloats(mask, length, c.curBatchSize)

 	if c.config.MaskDType != ml.DTypeF32 {
 		maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
@@ -402,145 +388,6 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 	return maskTensor
 }

-func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
-	for i, key := range c.keys {
-		if key == nil {
-			continue
-		}
-
-		kHeadDim := key.Dim(0)
-		numKVHeads := key.Dim(1)
-		rowSize := key.Stride(2)
-
-		kSrcView := key.View(ctx, rowSize*src, kHeadDim*numKVHeads*length)
-		kDstView := key.View(ctx, rowSize*dst, kHeadDim*numKVHeads*length)
-
-		value := c.values[i]
-		var vSrcView, vDstView ml.Tensor
-		if c.config.PermutedV {
-			vHeadDim := value.Dim(1)
-			elemSize := value.Stride(0)
-
-			vSrcView = value.View(ctx, elemSize*src, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
-			vDstView = value.View(ctx, elemSize*dst, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
-		} else {
-			vHeadDim := value.Dim(0)
-			rowSize := value.Stride(2)
-
-			vSrcView = value.View(ctx, rowSize*src, vHeadDim*numKVHeads*length)
-			vDstView = value.View(ctx, rowSize*dst, vHeadDim*numKVHeads*length)
-		}
-
-		ctx.Forward(
-			kSrcView.Copy(ctx, kDstView),
-			vSrcView.Copy(ctx, vDstView),
-		)
-	}
-}
-
-func (c *Causal) defrag() {
-	slog.Debug("defragmenting kv cache")
-
-	// Defrag strategy:
-	// - Search for empty holes at the beginning of the cache,
-	//   filling them with active data starting at the end
-	// - If there are contiguous elements that need to be moved,
-	//   combine them into a single operation by holding new moves
-	//   until we see that the next one is non-contiguous
-	// - Fill up the context with the maximum number of operations it
-	//   can hold then compute that and continue with a new context
-	//
-	// We could try to optimize placement by grouping blocks from
-	// the same sequences together but most likely the next forward
-	// pass will disrupt this anyways, so the real world benefit
-	// seems limited as this time.
-
-	ctx := c.backend.NewContext()
-
-	// For every move, 6 tensors are required per layer (2 views and a
-	// copy for each of k and v). We also need to refer to the original
-	// k and v cache tensors - once per layer, not per move.
-	layers := 0
-	for _, key := range c.keys {
-		if key == nil {
-			continue
-		}
-		layers++
-	}
-
-	maxMoves := (ctx.MaxGraphNodes() - 2*layers) / (6 * layers)
-	moves := 0
-
-	var pendingSrc, pendingDst, pendingLen int
-	src := len(c.cells) - 1
-
-	for dst := 0; dst < src; dst++ {
-		if len(c.cells[dst].sequences) == 0 {
-			for ; src > dst; src-- {
-				if len(c.cells[src].sequences) != 0 {
-					c.cells[dst] = c.cells[src]
-					c.cells[src] = cacheCell{}
-
-					if pendingLen > 0 {
-						if src == pendingSrc-pendingLen && dst == pendingDst+pendingLen {
-							pendingSrc = src
-							pendingLen++
-							break
-						} else {
-							c.moveCells(ctx, pendingSrc, pendingDst, pendingLen)
-							moves++
-						}
-					}
-
-					pendingSrc = src
-					pendingDst = dst
-					pendingLen = 1
-
-					break
-				}
-			}
-		}
-
-		if moves >= maxMoves {
-			ctx.Compute()
-			ctx.Close()
-			ctx = c.backend.NewContext()
-
-			moves = 0
-		}
-	}
-
-	if pendingLen > 0 {
-		c.moveCells(ctx, pendingSrc, pendingDst, pendingLen)
-		moves++
-	}
-
-	if moves > 0 {
-		ctx.Compute()
-	}
-	ctx.Close()
-
-	// Reset range metadata
-	for seq := range c.cellRanges {
-		seqRange := newRange()
-
-		for i, cell := range c.cells {
-			if slices.Contains(cell.sequences, seq) {
-				if i < seqRange.min {
-					seqRange.min = i
-				}
-				if i > seqRange.max {
-					seqRange.max = i
-				}
-			}
-		}
-
-		c.cellRanges[seq] = seqRange
-	}
-
-	c.updateSlidingWindow()
-}
-
 func (c *Causal) SetLayer(layer int) {
 	c.curLayer = layer
 }
@@ -625,18 +472,25 @@ func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
 		}
 	}

-	rowSize := c.keys[c.curLayer].Stride(2)
-	ctx.Forward(key.Copy(ctx, c.keys[c.curLayer].View(ctx, rowSize*c.curLoc, kHeadDim*numKVHeads*batchSize)))
+	key = key.Reshape(ctx, kHeadDim*numKVHeads, batchSize)
+	keyCache := c.keys[c.curLayer]
+	keyCache = keyCache.Reshape(ctx, kHeadDim*numKVHeads, len(c.cells))
+	ctx.Forward(keyCache.SetRows(ctx, key, c.curLoc))

 	if c.config.PermutedV {
-		elemSize := c.values[c.curLayer].Stride(0)
+		value = value.Reshape(ctx, vHeadDim*numKVHeads, 1, batchSize)
+		value = value.Permute(ctx, 2, 0, 1, 3)

-		value = value.Permute(ctx, 1, 2, 0, 3)
-		ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, elemSize*c.curLoc, batchSize, len(c.cells)*elemSize, vHeadDim*numKVHeads)))
+		valueCache := c.values[c.curLayer]
+		valueCache = valueCache.Reshape(ctx, 1, len(c.cells), vHeadDim*numKVHeads)
+
+		ctx.Forward(valueCache.SetRows(ctx, value, c.curLoc))
 	} else {
-		rowSize := c.values[c.curLayer].Stride(2)
+		value = value.Reshape(ctx, vHeadDim*numKVHeads, batchSize)
+		valueCache := c.values[c.curLayer]
+		valueCache = valueCache.Reshape(ctx, vHeadDim*numKVHeads, len(c.cells))

-		ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, rowSize*c.curLoc, vHeadDim*numKVHeads*batchSize)))
+		ctx.Forward(valueCache.SetRows(ctx, value, c.curLoc))
 	}
 }

--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "3cfa9c3f125763305b4226bc032f1954f08990dc";
+char const *LLAMA_COMMIT = "ec98e2002";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
--- a/llama/llama.cpp/.rsync-filter
+++ b/llama/llama.cpp/.rsync-filter
@@ -17,11 +17,17 @@ include /tools/mtmd/clip.cpp
 include /tools/mtmd/mtmd.cpp
 include /tools/mtmd/mtmd-audio.cpp
 include /tools/mtmd/mtmd-helper.cpp
+include /tools/mtmd/models/
+include /tools/mtmd/models/*.h
+include /tools/mtmd/models/*.cpp
 include /src/
 include /src/llama.*
 include /src/llama-*.*
 include /src/unicode-data.*
 include /src/unicode.*
+include /src/models/
+include /src/models/*.h
+include /src/models/*.cpp
 include /vendor/
 include /vendor/miniaudio/
 include /vendor/miniaudio/*.h
--- a/Show More
+++ b/Show More