mlx: update upstream mlx version

docs: nemoclaw integration (#14962)
--------- Co-authored-by: ParthSareen <parth.sareen@ollama.com>
2026-04-21 00:05:40 +02:00 · 2026-03-22 13:39:52 -07:00 · 2026-03-20 15:27:37 -07:00 · 2026-03-20 15:20:14 -07:00 · 2026-03-19 17:44:28 -07:00 · 2026-03-19 17:44:28 -07:00
329 changed files with 31207 additions and 12594 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -117,6 +117,25 @@ jobs:
            install: https://sdk.lunarg.com/sdk/download/1.4.321.1/windows/vulkansdk-windows-X64-1.4.321.1.exe
            flags: ''
            runner_dir: 'vulkan'
+          - os: windows
+            arch: amd64
+            preset: 'MLX CUDA 13'
+            install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
+            cudnn-install: https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.18.1.3_cuda13-archive.zip
+            cuda-components:
+              - '"cudart"'
+              - '"nvcc"'
+              - '"cublas"'
+              - '"cublas_dev"'
+              - '"cufft"'
+              - '"cufft_dev"'
+              - '"nvrtc"'
+              - '"nvrtc_dev"'
+              - '"crt"'
+              - '"nvvm"'
+              - '"nvptxcompiler"'
+            cuda-version: '13.0'
+            flags: ''
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    env:
@@ -125,8 +144,10 @@ jobs:
      - name: Install system dependencies
        run: |
          choco install -y --no-progress ccache ninja
-          ccache -o cache_dir=${{ github.workspace }}\.ccache
-      - if: startsWith(matrix.preset, 'CUDA ') || startsWith(matrix.preset, 'ROCm ') || startsWith(matrix.preset, 'Vulkan')
+          if (Get-Command ccache -ErrorAction SilentlyContinue) {
+            ccache -o cache_dir=${{ github.workspace }}\.ccache
+          }
+      - if: startsWith(matrix.preset, 'CUDA ') || startsWith(matrix.preset, 'ROCm ') || startsWith(matrix.preset, 'Vulkan') || startsWith(matrix.preset, 'MLX ')
        id: cache-install
        uses: actions/cache/restore@v4
        with:
@@ -134,8 +155,9 @@ jobs:
            C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
            C:\Program Files\AMD\ROCm
            C:\VulkanSDK
-          key: ${{ matrix.install }}
-      - if: startsWith(matrix.preset, 'CUDA ')
+            C:\Program Files\NVIDIA\CUDNN
+          key: ${{ matrix.install }}-${{ matrix.cudnn-install }}
+      - if: startsWith(matrix.preset, 'CUDA ') || startsWith(matrix.preset, 'MLX ')
        name: Install CUDA ${{ matrix.cuda-version }}
        run: |
          $ErrorActionPreference = "Stop"
@@ -179,6 +201,23 @@ jobs:
        run: |
          echo "CC=clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "CXX=clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
+      - if: startsWith(matrix.preset, 'MLX ')
+        name: Install cuDNN for MLX
+        run: |
+          $ErrorActionPreference = "Stop"
+          $cudnnRoot = "C:\Program Files\NVIDIA\CUDNN"
+          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
+            Invoke-WebRequest -Uri "${{ matrix.cudnn-install }}" -OutFile "cudnn.zip"
+            Expand-Archive -Path cudnn.zip -DestinationPath cudnn-extracted
+            $cudnnDir = (Get-ChildItem -Path cudnn-extracted -Directory)[0].FullName
+            New-Item -ItemType Directory -Force -Path $cudnnRoot
+            Copy-Item -Path "$cudnnDir\*" -Destination "$cudnnRoot\" -Recurse
+          }
+
+          echo "CUDNN_ROOT_DIR=$cudnnRoot" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "CUDNN_INCLUDE_PATH=$cudnnRoot\include" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "CUDNN_LIBRARY_PATH=$cudnnRoot\lib\x64" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "$cudnnRoot\bin\x64" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }}
        uses: actions/cache/save@v4
        with:
@@ -186,7 +225,8 @@ jobs:
            C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
            C:\Program Files\AMD\ROCm
            C:\VulkanSDK
-          key: ${{ matrix.install }}
+            C:\Program Files\NVIDIA\CUDNN
+          key: ${{ matrix.install }}-${{ matrix.cudnn-install }}
      - uses: actions/checkout@v4
      - uses: actions/cache@v4
        with:
@@ -198,7 +238,7 @@ jobs:
          Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} --install-prefix "$((pwd).Path)\dist\${{ matrix.os }}-${{ matrix.arch }}"
          cmake --build --parallel ([Environment]::ProcessorCount) --preset "${{ matrix.preset }}"
-          cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || startsWith(matrix.preset, 'Vulkan') && 'Vulkan' || 'CPU' }}" --strip
+          cmake --install build --component "${{ startsWith(matrix.preset, 'MLX ') && 'MLX' || startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || startsWith(matrix.preset, 'Vulkan') && 'Vulkan' || 'CPU' }}" --strip
          Remove-Item -Path dist\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
        env:
          CMAKE_GENERATOR: Ninja
@@ -543,11 +583,19 @@ jobs:
          for payload in dist/*.txt dist/*.zip dist/*.tgz dist/*.tar.zst dist/*.exe dist/*.dmg dist/*.ps1 dist/*.sh ; do
            echo "Uploading $payload"
            gh release upload ${GITHUB_REF_NAME} $payload --clobber &
-            pids[$!]=$!
+            pids+=($!)
            sleep 1
          done
          echo "Waiting for uploads to complete"
-          for pid in "${pids[*]}"; do
-            wait $pid
+          failed=0
+          for pid in "${pids[@]}"; do
+            if ! wait $pid; then
+              echo "::error::Upload failed (pid $pid)"
+              failed=1
+            fi
          done
+          if [ $failed -ne 0 ]; then
+            echo "One or more uploads failed"
+            exit 1
+          fi
          echo "done"
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -37,7 +37,7 @@ jobs:
              | xargs python3 -c "import sys; from pathlib import Path; print(any(Path(x).match(glob) for x in sys.argv[1:] for glob in '$*'.split(' ')))"
          }

-          echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*') | tee -a $GITHUB_OUTPUT
+          echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*' '.github/**/*') | tee -a $GITHUB_OUTPUT
          echo vendorsha=$(make -f Makefile.sync print-base) | tee -a $GITHUB_OUTPUT

  linux:
@@ -51,7 +51,7 @@ jobs:
            container: nvidia/cuda:13.0.0-devel-ubuntu22.04
            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
          - preset: ROCm
-            container: rocm/dev-ubuntu-22.04:6.1.2
+            container: rocm/dev-ubuntu-22.04:7.2
            extra-packages: rocm-libs
            flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_PREFIX_PATH=/opt/rocm'
          - preset: Vulkan
@@ -60,6 +60,10 @@ jobs:
              mesa-vulkan-drivers vulkan-tools
              libvulkan1 libvulkan-dev
              vulkan-sdk cmake ccache g++ make
+          - preset: 'MLX CUDA 13'
+            container: nvidia/cuda:13.0.0-devel-ubuntu22.04
+            extra-packages: libcudnn9-dev-cuda-13 libopenblas-dev liblapack-dev liblapacke-dev git curl
+            flags: '-DCMAKE_CUDA_ARCHITECTURES=87 -DBLAS_INCLUDE_DIRS=/usr/include/x86_64-linux-gnu -DLAPACK_INCLUDE_DIRS=/usr/include/x86_64-linux-gnu'
    runs-on: linux
    container: ${{ matrix.container }}
    steps:
@@ -76,6 +80,10 @@ jobs:
            $sudo apt-get update
          fi
          $sudo apt-get install -y cmake ccache ${{ matrix.extra-packages }}
+          # MLX requires CMake 3.25+, install from official releases
+          if [ "${{ matrix.preset }}" = "MLX CUDA 13" ]; then
+            curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.31.2/cmake-3.31.2-linux-$(uname -m).tar.gz | $sudo tar xz -C /usr/local --strip-components 1
+          fi
          # Export VULKAN_SDK if provided by LunarG package (defensive)
          if [ -d "/usr/lib/x86_64-linux-gnu/vulkan" ] && [ "${{ matrix.preset }}" = "Vulkan" ]; then
            echo "VULKAN_SDK=/usr" >> $GITHUB_ENV
@@ -87,8 +95,8 @@ jobs:
          path: /github/home/.cache/ccache
          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }}
      - run: |
-          cmake --preset ${{ matrix.preset }} ${{ matrix.flags }}
-          cmake --build --preset ${{ matrix.preset }} --parallel
+          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
+          cmake --build --preset "${{ matrix.preset }}" --parallel

  windows:
    needs: [changes]
@@ -114,12 +122,31 @@ jobs:
            flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
          - preset: Vulkan
            install: https://sdk.lunarg.com/sdk/download/1.4.321.1/windows/vulkansdk-windows-X64-1.4.321.1.exe
+          - preset: 'MLX CUDA 13'
+            install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
+            cudnn-install: https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.18.1.3_cuda13-archive.zip
+            flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
+            cuda-components:
+              - '"cudart"'
+              - '"nvcc"'
+              - '"cublas"'
+              - '"cublas_dev"'
+              - '"cufft"'
+              - '"cufft_dev"'
+              - '"nvrtc"'
+              - '"nvrtc_dev"'
+              - '"crt"'
+              - '"nvvm"'
+              - '"nvptxcompiler"'
+            cuda-version: '13.0'
    runs-on: windows
    steps:
      - run: |
          choco install -y --no-progress ccache ninja
-          ccache -o cache_dir=${{ github.workspace }}\.ccache
-      - if: matrix.preset == 'CUDA' || matrix.preset == 'ROCm' || matrix.preset == 'Vulkan'
+          if (Get-Command ccache -ErrorAction SilentlyContinue) {
+            ccache -o cache_dir=${{ github.workspace }}\.ccache
+          }
+      - if: matrix.preset == 'CUDA' || matrix.preset == 'ROCm' || matrix.preset == 'Vulkan' || matrix.preset == 'MLX CUDA 13'
        id: cache-install
        uses: actions/cache/restore@v4
        with:
@@ -127,8 +154,9 @@ jobs:
            C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
            C:\Program Files\AMD\ROCm
            C:\VulkanSDK
-          key: ${{ matrix.install }}
-      - if: matrix.preset == 'CUDA'
+            C:\Program Files\NVIDIA\CUDNN
+          key: ${{ matrix.install }}-${{ matrix.cudnn-install }}
+      - if: matrix.preset == 'CUDA' || matrix.preset == 'MLX CUDA 13'
        name: Install CUDA ${{ matrix.cuda-version }}
        run: |
          $ErrorActionPreference = "Stop"
@@ -164,10 +192,27 @@ jobs:
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
            Start-Process -FilePath .\install.exe -ArgumentList "-c","--am","--al","in" -NoNewWindow -Wait
          }
-          
+
          $vulkanPath = (Resolve-Path "C:\VulkanSDK\*").path
          echo "$vulkanPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "VULKAN_SDK=$vulkanPath" >> $env:GITHUB_ENV
+      - if: matrix.preset == 'MLX CUDA 13'
+        name: Install cuDNN for MLX
+        run: |
+          $ErrorActionPreference = "Stop"
+          $cudnnRoot = "C:\Program Files\NVIDIA\CUDNN"
+          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
+            Invoke-WebRequest -Uri "${{ matrix.cudnn-install }}" -OutFile "cudnn.zip"
+            Expand-Archive -Path cudnn.zip -DestinationPath cudnn-extracted
+            $cudnnDir = (Get-ChildItem -Path cudnn-extracted -Directory)[0].FullName
+            New-Item -ItemType Directory -Force -Path $cudnnRoot
+            Copy-Item -Path "$cudnnDir\*" -Destination "$cudnnRoot\" -Recurse
+          }
+
+          echo "CUDNN_ROOT_DIR=$cudnnRoot" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "CUDNN_INCLUDE_PATH=$cudnnRoot\include" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "CUDNN_LIBRARY_PATH=$cudnnRoot\lib\x64" | Out-File -FilePath $env:GITHUB_ENV -Append
+          echo "$cudnnRoot\bin\x64" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }}
        uses: actions/cache/save@v4
        with:
@@ -175,7 +220,8 @@ jobs:
            C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
            C:\Program Files\AMD\ROCm
            C:\VulkanSDK
-          key: ${{ matrix.install }}
+            C:\Program Files\NVIDIA\CUDNN
+          key: ${{ matrix.install }}-${{ matrix.cudnn-install }}
      - uses: actions/checkout@v4
      - uses: actions/cache@v4
        with:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,10 +64,15 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY         ${OLLAMA_BUILD_DIR})
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG   ${OLLAMA_BUILD_DIR})
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE ${OLLAMA_BUILD_DIR})

-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)
+# Store ggml include paths for use with target_include_directories later.
+# We avoid global include_directories() to prevent polluting the include path
+# for other projects like MLX (whose openblas dependency has its own common.h).
+set(GGML_INCLUDE_DIRS
+    ${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src
+    ${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu
+    ${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx
+)

 add_compile_definitions(NDEBUG GGML_VERSION=0x0 GGML_COMMIT=0x0)

@@ -87,6 +92,14 @@ if(NOT CPU_VARIANTS)
    set(CPU_VARIANTS "ggml-cpu")
 endif()

+# Apply ggml include directories to ggml targets only (not globally)
+target_include_directories(ggml-base PRIVATE ${GGML_INCLUDE_DIRS})
+foreach(variant ${CPU_VARIANTS})
+    if(TARGET ${variant})
+        target_include_directories(${variant} PRIVATE ${GGML_INCLUDE_DIRS})
+    endif()
+endforeach()
+
 install(TARGETS ggml-base ${CPU_VARIANTS}
    RUNTIME_DEPENDENCIES
        PRE_EXCLUDE_REGEXES ".*"
@@ -103,6 +116,7 @@ if(CMAKE_CUDA_COMPILER)

    find_package(CUDAToolkit)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
+    target_include_directories(ggml-cuda PRIVATE ${GGML_INCLUDE_DIRS})
    install(TARGETS ggml-cuda
        RUNTIME_DEPENDENCIES
            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
@@ -134,6 +148,7 @@ if(CMAKE_HIP_COMPILER)
    if(AMDGPU_TARGETS)
        find_package(hip REQUIRED)
        add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
+        target_include_directories(ggml-hip PRIVATE ${GGML_INCLUDE_DIRS})

        if (WIN32)
            target_compile_definitions(ggml-hip PRIVATE GGML_CUDA_NO_PEER_COPY)
@@ -148,7 +163,7 @@ if(CMAKE_HIP_COMPILER)
        )
        install(RUNTIME_DEPENDENCY_SET rocm
                DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
-                PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
+                PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register roctx64 rocroller drm drm_amdgpu numa elf
                PRE_EXCLUDE_REGEXES ".*"
                POST_EXCLUDE_REGEXES "system32"
            RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
@@ -168,6 +183,7 @@ if(NOT APPLE)
    find_package(Vulkan)
    if(Vulkan_FOUND)
        add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-vulkan)
+        target_include_directories(ggml-vulkan PRIVATE ${GGML_INCLUDE_DIRS})
        install(TARGETS ggml-vulkan
            RUNTIME_DEPENDENCIES
                PRE_INCLUDE_REGEXES vulkan
@@ -179,7 +195,6 @@ if(NOT APPLE)
 endif()

 option(MLX_ENGINE "Enable MLX backend" OFF)
-
 if(MLX_ENGINE)
    message(STATUS "Setting up MLX (this takes a while...)")
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/x/imagegen/mlx)
@@ -187,10 +202,36 @@ if(MLX_ENGINE)
    # Find CUDA toolkit if MLX is built with CUDA support
    find_package(CUDAToolkit)

+    # Build list of directories for runtime dependency resolution
+    set(MLX_RUNTIME_DIRS ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR})
+    # Add cuDNN bin paths for DLLs (Windows MLX CUDA builds)
+    # CUDNN_ROOT_DIR is the standard CMake variable for cuDNN location
+    if(DEFINED ENV{CUDNN_ROOT_DIR})
+        # cuDNN 9.x has versioned subdirectories under bin/ (e.g., bin/13.0/)
+        file(GLOB CUDNN_BIN_SUBDIRS "$ENV{CUDNN_ROOT_DIR}/bin/*")
+        list(APPEND MLX_RUNTIME_DIRS ${CUDNN_BIN_SUBDIRS})
+    endif()
+    # Add build output directory and MLX dependency build directories
+    list(APPEND MLX_RUNTIME_DIRS ${OLLAMA_BUILD_DIR})
+    # OpenBLAS DLL location (pre-built zip extracts into openblas-src/bin/)
+    list(APPEND MLX_RUNTIME_DIRS ${CMAKE_BINARY_DIR}/_deps/openblas-src/bin)
+    # NCCL: on Linux, if a real NCCL is found, CMake bundles libnccl.so via the
+    # regex below. If NCCL is not found, MLX links a static stub (OBJECT lib),
+    # so there is no runtime dependency. On Windows, this path covers the stub
+    # build directory so that the stub DLL is included in our dependencies.
+    list(APPEND MLX_RUNTIME_DIRS ${CMAKE_BINARY_DIR}/_deps/mlx-build/mlx/distributed/nccl/nccl_stub-prefix/src/nccl_stub-build/Release)
+
+    # Base regexes for runtime dependencies (cross-platform)
+    set(MLX_INCLUDE_REGEXES cublas cublasLt cudart cufft nvrtc nvrtc-builtins cudnn nccl openblas gfortran)
+    # On Windows, also include dl.dll (dlfcn-win32 POSIX emulation layer)
+    if(WIN32)
+        list(APPEND MLX_INCLUDE_REGEXES "^dl\\.dll$")
+    endif()
+
    install(TARGETS mlx mlxc
        RUNTIME_DEPENDENCIES
-            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
-            PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc nvrtc-builtins cudnn nccl openblas gfortran
+            DIRECTORIES ${MLX_RUNTIME_DIRS}
+            PRE_INCLUDE_REGEXES ${MLX_INCLUDE_REGEXES}
            PRE_EXCLUDE_REGEXES ".*"
        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
@@ -205,13 +246,54 @@ if(MLX_ENGINE)
            COMPONENT MLX)
    endif()

-    # Manually install cudart and cublas since they might not be picked up as direct dependencies
+    # Install CCCL headers for NVRTC JIT compilation at runtime. MLX's own install
+    # rules use the default component, so --component MLX skips them; the headers
+    # are installed alongside libmlx in OLLAMA_INSTALL_DIR. On Linux, MLX's
+    # jit_module.cpp resolves CCCL via current_binary_dir().parent_path() /
+    # "include" / "cccl", so we symlink lib/ollama/include -> ${OLLAMA_RUNNER_DIR}/include.
+    # The link path is escaped so it resolves at install time (--prefix, DESTDIR).
+    # This will need refinement if we add multiple CUDA versions for MLX in the future.
+    if(EXISTS ${CMAKE_BINARY_DIR}/_deps/cccl-src/include/cuda)
+        install(DIRECTORY ${CMAKE_BINARY_DIR}/_deps/cccl-src/include/cuda
+            DESTINATION ${OLLAMA_INSTALL_DIR}/include/cccl
+            COMPONENT MLX)
+        install(DIRECTORY ${CMAKE_BINARY_DIR}/_deps/cccl-src/include/nv
+            DESTINATION ${OLLAMA_INSTALL_DIR}/include/cccl
+            COMPONENT MLX)
+        if(NOT WIN32 AND NOT APPLE)
+            install(CODE "
+                set(_link \"\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/lib/ollama/include\")
+                set(_target \"${OLLAMA_RUNNER_DIR}/include\")
+                if(NOT EXISTS \${_link})
+                    execute_process(COMMAND \${CMAKE_COMMAND} -E create_symlink \${_target} \${_link})
+                endif()
+            " COMPONENT MLX)
+        endif()
+    endif()
+
+    # On Windows, explicitly install dl.dll (dlfcn-win32 POSIX dlopen emulation)
+    # RUNTIME_DEPENDENCIES auto-excludes it via POST_EXCLUDE_FILES_STRICT because
+    # dlfcn-win32 is a known CMake target with its own install rules (which install
+    # to the wrong destination). We must install it explicitly here.
+    if(WIN32)
+        install(FILES ${OLLAMA_BUILD_DIR}/dl.dll
+            DESTINATION ${OLLAMA_INSTALL_DIR}
+            COMPONENT MLX)
+    endif()
+
+    # Manually install CUDA runtime libraries that MLX loads via dlopen
+    # (not detected by RUNTIME_DEPENDENCIES since they aren't link-time deps)
    if(CUDAToolkit_FOUND)
-        file(GLOB CUDART_LIBS
+        file(GLOB MLX_CUDA_LIBS
            "${CUDAToolkit_LIBRARY_DIR}/libcudart.so*"
-            "${CUDAToolkit_LIBRARY_DIR}/libcublas.so*")
-        if(CUDART_LIBS)
-            install(FILES ${CUDART_LIBS}
+            "${CUDAToolkit_LIBRARY_DIR}/libcublas.so*"
+            "${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so*"
+            "${CUDAToolkit_LIBRARY_DIR}/libnvrtc.so*"
+            "${CUDAToolkit_LIBRARY_DIR}/libnvrtc-builtins.so*"
+            "${CUDAToolkit_LIBRARY_DIR}/libcufft.so*"
+            "${CUDAToolkit_LIBRARY_DIR}/libcudnn.so*")
+        if(MLX_CUDA_LIBS)
+            install(FILES ${MLX_CUDA_LIBS}
                DESTINATION ${OLLAMA_INSTALL_DIR}
                COMPONENT MLX)
        endif()
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -77,6 +77,15 @@
        "OLLAMA_RUNNER_DIR": "rocm"
      }
    },
+    {
+      "name": "ROCm 7",
+      "inherits": [ "ROCm" ],
+      "cacheVariables": {
+        "CMAKE_HIP_FLAGS": "-parallel-jobs=4",
+        "AMDGPU_TARGETS": "gfx942;gfx950;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151;gfx1200;gfx1201;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-",
+        "OLLAMA_RUNNER_DIR": "rocm"
+      }
+    },
    {
      "name": "Vulkan",
      "inherits": [ "Default" ],
@@ -103,6 +112,7 @@
      "name": "MLX CUDA 13",
      "inherits": [ "MLX", "CUDA 13" ],
      "cacheVariables": {
+        "MLX_CUDA_ARCHITECTURES": "86;89;90;90a;100;103;75-virtual;80-virtual;110-virtual;120-virtual;121-virtual",
        "OLLAMA_RUNNER_DIR": "mlx_cuda_v13"
      }
    }
@@ -158,6 +168,11 @@
      "inherits": [ "ROCm" ],
      "configurePreset": "ROCm 6"
    },
+    {
+      "name": "ROCm 7",
+      "inherits": [ "ROCm" ],
+      "configurePreset": "ROCm 7"
+    },
    {
      "name": "Vulkan",
      "targets": [ "ggml-vulkan" ],
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,28 +1,23 @@
 # vim: filetype=dockerfile

 ARG FLAVOR=${TARGETARCH}
-ARG PARALLEL=8

-ARG ROCMVERSION=6.3.3
+ARG ROCMVERSION=7.2
 ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2
+ARG NINJAVERSION=1.12.1
 ARG VULKANVERSION=1.4.321.1

+# Default empty stages for local MLX source overrides.
+# Override with: docker build --build-context local-mlx=../mlx --build-context local-mlx-c=../mlx-c
+FROM scratch AS local-mlx
+FROM scratch AS local-mlx-c
+
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
 RUN dnf install -y yum-utils ccache gcc-toolset-11-gcc gcc-toolset-11-gcc-c++ gcc-toolset-11-binutils \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
 ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
-ARG VULKANVERSION
-RUN wget https://sdk.lunarg.com/sdk/download/${VULKANVERSION}/linux/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz -O /tmp/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz \
-    && tar xvf /tmp/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz \
-    && dnf -y install ninja-build \
-    && ln -s /usr/bin/python3 /usr/bin/python \  
-    && /${VULKANVERSION}/vulkansdk -j 8 vulkan-headers \
-    && /${VULKANVERSION}/vulkansdk -j 8 shaderc
-RUN cp -r /${VULKANVERSION}/x86_64/include/* /usr/local/include/ \
-    && cp -r /${VULKANVERSION}/x86_64/lib/* /usr/local/lib
-ENV PATH=/${VULKANVERSION}/x86_64/bin:$PATH

 FROM --platform=linux/arm64 almalinux:8 AS base-arm64
 # install epel-release for ccache
@@ -33,100 +28,119 @@ ENV CC=clang CXX=clang++

 FROM base-${TARGETARCH} AS base
 ARG CMAKEVERSION
+ARG NINJAVERSION
 RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
+RUN dnf install -y unzip \
+    && curl -fsSL -o /tmp/ninja.zip https://github.com/ninja-build/ninja/releases/download/v${NINJAVERSION}/ninja-linux$([ "$(uname -m)" = "aarch64" ] && echo "-aarch64").zip \
+    && unzip /tmp/ninja.zip -d /usr/local/bin \
+    && rm /tmp/ninja.zip
+ENV CMAKE_GENERATOR=Ninja
 ENV LDFLAGS=-s

 FROM base AS cpu
 RUN dnf install -y gcc-toolset-11-gcc gcc-toolset-11-gcc-c++
 ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
-ARG PARALLEL
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CPU' \
-        && cmake --build --parallel ${PARALLEL} --preset 'CPU' \
-        && cmake --install build --component CPU --strip --parallel ${PARALLEL}
+        && cmake --build --preset 'CPU' -- -l $(nproc) \
+        && cmake --install build --component CPU --strip

 FROM base AS cuda-11
 ARG CUDA11VERSION=11.8
 RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
 ENV PATH=/usr/local/cuda-11/bin:$PATH
-ARG PARALLEL
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CUDA 11' \
-        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 11' \
-        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+        && cmake --build --preset 'CUDA 11' -- -l $(nproc) \
+        && cmake --install build --component CUDA --strip

 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
 ENV PATH=/usr/local/cuda-12/bin:$PATH
-ARG PARALLEL
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CUDA 12' \
-        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 12' \
-        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+        && cmake --build --preset 'CUDA 12' -- -l $(nproc) \
+        && cmake --install build --component CUDA --strip


 FROM base AS cuda-13
 ARG CUDA13VERSION=13.0
 RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-}
 ENV PATH=/usr/local/cuda-13/bin:$PATH
-ARG PARALLEL
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CUDA 13' \
-        && cmake --build --parallel ${PARALLEL} --preset 'CUDA 13' \
-        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+        && cmake --build --preset 'CUDA 13' -- -l $(nproc) \
+        && cmake --install build --component CUDA --strip


-FROM base AS rocm-6
+FROM base AS rocm-7
 ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH
-ARG PARALLEL
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'ROCm 6' \
-        && cmake --build --parallel ${PARALLEL} --preset 'ROCm 6' \
-        && cmake --install build --component HIP --strip --parallel ${PARALLEL}
+    cmake --preset 'ROCm 7' \
+        && cmake --build --preset 'ROCm 7' -- -l $(nproc) \
+        && cmake --install build --component HIP --strip
 RUN rm -f dist/lib/ollama/rocm/rocblas/library/*gfx90[06]*

 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK5VERSION} AS jetpack-5
 ARG CMAKEVERSION
-RUN apt-get update && apt-get install -y curl ccache \
-    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
+ARG NINJAVERSION
+RUN apt-get update && apt-get install -y curl ccache unzip \
+    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1 \
+    && curl -fsSL -o /tmp/ninja.zip https://github.com/ninja-build/ninja/releases/download/v${NINJAVERSION}/ninja-linux-aarch64.zip \
+    && unzip /tmp/ninja.zip -d /usr/local/bin \
+    && rm /tmp/ninja.zip
+ENV CMAKE_GENERATOR=Ninja
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
-ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'JetPack 5' \
-        && cmake --build --parallel ${PARALLEL} --preset 'JetPack 5' \
-        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+        && cmake --build --preset 'JetPack 5' -- -l $(nproc) \
+        && cmake --install build --component CUDA --strip

 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK6VERSION} AS jetpack-6
 ARG CMAKEVERSION
-RUN apt-get update && apt-get install -y curl ccache \
-    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
+ARG NINJAVERSION
+RUN apt-get update && apt-get install -y curl ccache unzip \
+    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1 \
+    && curl -fsSL -o /tmp/ninja.zip https://github.com/ninja-build/ninja/releases/download/v${NINJAVERSION}/ninja-linux-aarch64.zip \
+    && unzip /tmp/ninja.zip -d /usr/local/bin \
+    && rm /tmp/ninja.zip
+ENV CMAKE_GENERATOR=Ninja
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
-ARG PARALLEL
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'JetPack 6' \
-        && cmake --build --parallel ${PARALLEL} --preset 'JetPack 6' \
-        && cmake --install build --component CUDA --strip --parallel ${PARALLEL}
+        && cmake --build --preset 'JetPack 6' -- -l $(nproc) \
+        && cmake --install build --component CUDA --strip

 FROM base AS vulkan
+ARG VULKANVERSION
+RUN ln -s /usr/bin/python3 /usr/bin/python \
+    && wget https://sdk.lunarg.com/sdk/download/${VULKANVERSION}/linux/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz -O /tmp/vulkansdk.tar.xz \
+    && tar xvf /tmp/vulkansdk.tar.xz -C /tmp \
+    && /tmp/${VULKANVERSION}/vulkansdk -j 8 vulkan-headers \
+    && /tmp/${VULKANVERSION}/vulkansdk -j 8 shaderc \
+    && cp -r /tmp/${VULKANVERSION}/x86_64/include/* /usr/local/include/ \
+    && cp -r /tmp/${VULKANVERSION}/x86_64/lib/* /usr/local/lib \
+    && cp -r /tmp/${VULKANVERSION}/x86_64/bin/* /usr/local/bin/ \
+    && rm -rf /tmp/${VULKANVERSION} /tmp/vulkansdk.tar.xz
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'Vulkan' \
-        && cmake --build --parallel --preset 'Vulkan' \
-        && cmake --install build --component Vulkan --strip --parallel 8
+        && cmake --build --preset 'Vulkan' -- -l $(nproc) \
+        && cmake --install build --component Vulkan --strip

 FROM base AS mlx
 ARG CUDA13VERSION=13.0
@@ -138,20 +152,27 @@ ENV PATH=/usr/local/cuda-13/bin:$PATH
 ENV BLAS_INCLUDE_DIRS=/usr/include/openblas
 ENV LAPACK_INCLUDE_DIRS=/usr/include/openblas
 ENV CGO_LDFLAGS="-L/usr/local/cuda-13/lib64 -L/usr/local/cuda-13/targets/x86_64-linux/lib/stubs"
-ARG PARALLEL
 WORKDIR /go/src/github.com/ollama/ollama
 COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 COPY x/imagegen/mlx x/imagegen/mlx
 COPY go.mod go.sum .
-COPY MLX_VERSION .
+COPY MLX_VERSION MLX_CORE_VERSION .
 RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
 ENV PATH=/usr/local/go/bin:$PATH
 RUN go mod download
 RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'MLX CUDA 13' -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DLAPACK_INCLUDE_DIRS=/usr/include/openblas \
-        && cmake --build --parallel ${PARALLEL} --preset 'MLX CUDA 13' \
-        && cmake --install build --component MLX --strip --parallel ${PARALLEL}
+    --mount=type=bind,from=local-mlx,target=/tmp/local-mlx \
+    --mount=type=bind,from=local-mlx-c,target=/tmp/local-mlx-c \
+    if [ -f /tmp/local-mlx/CMakeLists.txt ]; then \
+        export OLLAMA_MLX_SOURCE=/tmp/local-mlx; \
+    fi \
+    && if [ -f /tmp/local-mlx-c/CMakeLists.txt ]; then \
+        export OLLAMA_MLX_C_SOURCE=/tmp/local-mlx-c; \
+    fi \
+    && cmake --preset 'MLX CUDA 13' -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DLAPACK_INCLUDE_DIRS=/usr/include/openblas \
+        && cmake --build --preset 'MLX CUDA 13' -- -l $(nproc) \
+        && cmake --install build --component MLX --strip

 FROM base AS build
 WORKDIR /go/src/github.com/ollama/ollama
@@ -160,16 +181,14 @@ RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-
 ENV PATH=/usr/local/go/bin:$PATH
 RUN go mod download
 COPY . .
-# Clone mlx-c headers for CGO (version from MLX_VERSION file)
-RUN git clone --depth 1 --branch "$(cat MLX_VERSION)" https://github.com/ml-explore/mlx-c.git build/_deps/mlx-c-src
 ARG GOFLAGS="'-ldflags=-w -s'"
 ENV CGO_ENABLED=1
 ARG CGO_CFLAGS
 ARG CGO_CXXFLAGS
-ENV CGO_CFLAGS="${CGO_CFLAGS} -I/go/src/github.com/ollama/ollama/build/_deps/mlx-c-src"
+ENV CGO_CFLAGS="${CGO_CFLAGS}"
 ENV CGO_CXXFLAGS="${CGO_CXXFLAGS}"
 RUN --mount=type=cache,target=/root/.cache/go-build \
-    go build -tags mlx -trimpath -buildmode=pie -o /bin/ollama .
+    go build -trimpath -buildmode=pie -o /bin/ollama .

 FROM --platform=linux/amd64 scratch AS amd64
 # COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
@@ -186,10 +205,9 @@ COPY --from=jetpack-5 dist/lib/ollama/ /lib/ollama/
 COPY --from=jetpack-6 dist/lib/ollama/ /lib/ollama/

 FROM scratch AS rocm
-COPY --from=rocm-6 dist/lib/ollama /lib/ollama
+COPY --from=rocm-7 dist/lib/ollama /lib/ollama

 FROM ${FLAVOR} AS archive
-ARG VULKANVERSION
 COPY --from=cpu dist/lib/ollama /lib/ollama
 COPY --from=build /bin/ollama /bin/ollama

--- a/1
+++ b/1
@@ -0,0 +1 @@
+v0.31.1
--- a/2
+++ b/2
@@ -1 +1 @@
-v0.5.0
+v0.6.0
--- a/anthropic/anthropic.go
+++ b/anthropic/anthropic.go
@@ -852,6 +852,19 @@ func (c *StreamConverter) Process(r api.ChatResponse) []StreamEvent {
 			continue
 		}

+		// Close thinking block if still open (thinking → tool_use without text in between)
+		if c.thinkingStarted && !c.thinkingDone {
+			c.thinkingDone = true
+			events = append(events, StreamEvent{
+				Event: "content_block_stop",
+				Data: ContentBlockStopEvent{
+					Type:  "content_block_stop",
+					Index: c.contentIndex,
+				},
+			})
+			c.contentIndex++
+		}
+
 		if c.textStarted {
 			events = append(events, StreamEvent{
 				Event: "content_block_stop",
--- a/anthropic/anthropic_test.go
+++ b/anthropic/anthropic_test.go
@@ -799,6 +799,107 @@ func TestStreamConverter_WithToolCalls(t *testing.T) {
 	}
 }

+// TestStreamConverter_ThinkingDirectlyFollowedByToolCall verifies that when a
+// model emits a thinking block followed directly by a tool_use block (with no
+// text block in between), the streaming converter correctly closes the thinking
+// block and increments the content index before opening the tool_use block.
+// Previously, the converter reused contentIndex=0 for the tool_use block,
+// which caused "Content block not found" errors in clients. See #14816.
+func TestStreamConverter_ThinkingDirectlyFollowedByToolCall(t *testing.T) {
+	conv := NewStreamConverter("msg_123", "test-model", 0)
+
+	// First chunk: thinking content (no text)
+	resp1 := api.ChatResponse{
+		Model: "test-model",
+		Message: api.Message{
+			Role:     "assistant",
+			Thinking: "I should call the tool.",
+		},
+	}
+	events1 := conv.Process(resp1)
+
+	// Should have: message_start, content_block_start(thinking), content_block_delta(thinking)
+	if len(events1) < 3 {
+		t.Fatalf("expected at least 3 events for thinking chunk, got %d", len(events1))
+	}
+	if events1[0].Event != "message_start" {
+		t.Errorf("expected first event 'message_start', got %q", events1[0].Event)
+	}
+	thinkingStart, ok := events1[1].Data.(ContentBlockStartEvent)
+	if !ok || thinkingStart.ContentBlock.Type != "thinking" {
+		t.Errorf("expected content_block_start(thinking) as second event, got %+v", events1[1])
+	}
+	if thinkingStart.Index != 0 {
+		t.Errorf("expected thinking block at index 0, got %d", thinkingStart.Index)
+	}
+
+	// Second chunk: tool call (no text between thinking and tool)
+	resp2 := api.ChatResponse{
+		Model: "test-model",
+		Message: api.Message{
+			Role: "assistant",
+			ToolCalls: []api.ToolCall{
+				{
+					ID: "call_abc",
+					Function: api.ToolCallFunction{
+						Name:      "ask_user",
+						Arguments: testArgs(map[string]any{"question": "cats or dogs?"}),
+					},
+				},
+			},
+		},
+		Done:       true,
+		DoneReason: "stop",
+		Metrics:    api.Metrics{PromptEvalCount: 10, EvalCount: 5},
+	}
+	events2 := conv.Process(resp2)
+
+	// Expect: content_block_stop(index=0), content_block_start(tool_use, index=1),
+	//         content_block_delta(input_json_delta, index=1), content_block_stop(index=1),
+	//         message_delta, message_stop
+	var thinkingStop, toolStart, toolDelta, toolStop *StreamEvent
+	for i := range events2 {
+		e := &events2[i]
+		switch e.Event {
+		case "content_block_stop":
+			if stop, ok := e.Data.(ContentBlockStopEvent); ok {
+				if stop.Index == 0 && thinkingStop == nil {
+					thinkingStop = e
+				} else if stop.Index == 1 {
+					toolStop = e
+				}
+			}
+		case "content_block_start":
+			if start, ok := e.Data.(ContentBlockStartEvent); ok && start.ContentBlock.Type == "tool_use" {
+				toolStart = e
+			}
+		case "content_block_delta":
+			if delta, ok := e.Data.(ContentBlockDeltaEvent); ok && delta.Delta.Type == "input_json_delta" {
+				toolDelta = e
+			}
+		}
+	}
+
+	if thinkingStop == nil {
+		t.Error("expected content_block_stop for thinking block (index 0)")
+	}
+	if toolStart == nil {
+		t.Fatal("expected content_block_start for tool_use block")
+	}
+	if start, ok := toolStart.Data.(ContentBlockStartEvent); !ok || start.Index != 1 {
+		t.Errorf("expected tool_use block at index 1, got %+v", toolStart.Data)
+	}
+	if toolDelta == nil {
+		t.Fatal("expected input_json_delta event for tool call")
+	}
+	if delta, ok := toolDelta.Data.(ContentBlockDeltaEvent); !ok || delta.Index != 1 {
+		t.Errorf("expected tool delta at index 1, got %+v", toolDelta.Data)
+	}
+	if toolStop == nil {
+		t.Error("expected content_block_stop for tool_use block (index 1)")
+	}
+}
+
 func TestStreamConverter_ToolCallWithUnmarshalableArgs(t *testing.T) {
 	// Test that unmarshalable arguments (like channels) are handled gracefully
 	// and don't cause a panic or corrupt stream
--- a/api/client.go
+++ b/api/client.go
@@ -476,25 +476,3 @@ func (c *Client) Whoami(ctx context.Context) (*UserResponse, error) {
 	}
 	return &resp, nil
 }
-
-// AliasRequest is the request body for creating or updating a model alias.
-type AliasRequest struct {
-	Alias          string `json:"alias"`
-	Target         string `json:"target"`
-	PrefixMatching bool   `json:"prefix_matching,omitempty"`
-}
-
-// SetAliasExperimental creates or updates a model alias via the experimental aliases API.
-func (c *Client) SetAliasExperimental(ctx context.Context, req *AliasRequest) error {
-	return c.do(ctx, http.MethodPost, "/api/experimental/aliases", req, nil)
-}
-
-// AliasDeleteRequest is the request body for deleting a model alias.
-type AliasDeleteRequest struct {
-	Alias string `json:"alias"`
-}
-
-// DeleteAliasExperimental deletes a model alias via the experimental aliases API.
-func (c *Client) DeleteAliasExperimental(ctx context.Context, req *AliasDeleteRequest) error {
-	return c.do(ctx, http.MethodDelete, "/api/experimental/aliases", req, nil)
-}
--- a/app/cmd/app/app.go
+++ b/app/cmd/app/app.go
@@ -35,6 +35,7 @@ import (
 var (
 	wv           = &Webview{}
 	uiServerPort int
+	appStore     *store.Store
 )

 var debug = strings.EqualFold(os.Getenv("OLLAMA_DEBUG"), "true") || os.Getenv("OLLAMA_DEBUG") == "1"
@@ -208,6 +209,7 @@ func main() {
 	uiServerPort = port

 	st := &store.Store{}
+	appStore = st

 	// Enable CORS in development mode
 	if devMode {
@@ -294,8 +296,15 @@ func main() {

 	// Check for pending updates on startup (show tray notification if update is ready)
 	if updater.IsUpdatePending() {
-		slog.Debug("update pending on startup, showing tray notification")
-		UpdateAvailable("")
+		// On Windows, the tray is initialized in osRun(). Calling UpdateAvailable
+		// before that would dereference a nil tray callback.
+		// TODO: refactor so the update check runs after platform init on all platforms.
+		if runtime.GOOS == "windows" {
+			slog.Debug("update pending on startup, deferring tray notification until tray initialization")
+		} else {
+			slog.Debug("update pending on startup, showing tray notification")
+			UpdateAvailable("")
+		}
 	}

 	hasCompletedFirstRun, err := st.HasCompletedFirstRun()
@@ -360,8 +369,7 @@ func startHiddenTasks() {
 			slog.Info("deferring pending update for fast startup")
 		} else {
 			// Check if auto-update is enabled before automatically upgrading
-			st := &store.Store{}
-			settings, err := st.Settings()
+			settings, err := appStore.Settings()
 			if err != nil {
 				slog.Warn("failed to load settings for upgrade check", "error", err)
 			} else if !settings.AutoUpdateEnabled {
--- a/app/cmd/app/app_windows.go
+++ b/app/cmd/app/app_windows.go
@@ -154,6 +154,10 @@ func handleURLSchemeRequest(urlScheme string) {
 }

 func UpdateAvailable(ver string) error {
+	if app.t == nil {
+		slog.Debug("tray not yet initialized, skipping update notification")
+		return nil
+	}
 	return app.t.UpdateAvailable(ver)
 }

@@ -165,6 +169,14 @@ func osRun(shutdown func(), hasCompletedFirstRun, startHidden bool) {
 		log.Fatalf("Failed to start: %s", err)
 	}

+	// Check for pending updates now that the tray is initialized.
+	// The platform-independent check in app.go fires before osRun,
+	// when app.t is still nil, so we must re-check here.
+	if updater.IsUpdatePending() {
+		slog.Debug("update pending on startup, showing tray notification")
+		UpdateAvailable("")
+	}
+
 	signals := make(chan os.Signal, 1)
 	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)

--- a/app/ui/app/src/components/Settings.tsx
+++ b/app/ui/app/src/components/Settings.tsx
@@ -214,6 +214,7 @@ export default function Settings() {
        Agent: false,
        Tools: false,
        ContextLength: 0,
+        AutoUpdateEnabled: true,
      });
      updateSettingsMutation.mutate(defaultSettings);
    }
--- a/app/ui/ui.go
+++ b/app/ui/ui.go
@@ -155,7 +155,7 @@ func (s *Server) ollamaProxy() http.Handler {
 					return
 				}

-				target := envconfig.Host()
+				target := envconfig.ConnectableHost()
 				s.log().Info("configuring ollama proxy", "target", target.String())

 				newProxy := httputil.NewSingleHostReverseProxy(target)
--- a/app/updater/updater.go
+++ b/app/updater/updater.go
@@ -289,6 +289,7 @@ func (u *Updater) TriggerImmediateCheck() {

 func (u *Updater) StartBackgroundUpdaterChecker(ctx context.Context, cb func(string) error) {
 	u.checkNow = make(chan struct{}, 1)
+	u.checkNow <- struct{}{} // Trigger first check after initial delay
 	go func() {
 		// Don't blast an update message immediately after startup
 		time.Sleep(UpdateCheckInitialDelay)
@@ -333,7 +334,7 @@ func (u *Updater) StartBackgroundUpdaterChecker(ctx context.Context, cb func(str
 				continue
 			}

-			// Download successful - show tray notification (regardless of toggle state)
+			// Download successful - show tray notification
 			err = cb(resp.UpdateVersion)
 			if err != nil {
 				slog.Warn("failed to register update available with tray", "error", err)
--- a/app/updater/updater_test.go
+++ b/app/updater/updater_test.go
@@ -351,10 +351,13 @@ func TestTriggerImmediateCheck(t *testing.T) {

 	updater.StartBackgroundUpdaterChecker(ctx, cb)

-	// Wait for goroutine to start and pass initial delay
-	time.Sleep(10 * time.Millisecond)
+	// Wait for the initial check that fires after the initial delay
+	select {
+	case <-checkDone:
+	case <-time.After(2 * time.Second):
+		t.Fatal("initial check did not happen")
+	}

-	// With 1 hour interval, no check should have happened yet
 	initialCount := checkCount.Load()

 	// Trigger immediate check
--- a/cmd/bench/README.md
+++ b/cmd/bench/README.md
@@ -1,27 +1,31 @@
 Ollama Benchmark Tool
 ---------------------

-A Go-based command-line tool for benchmarking Ollama models with configurable parameters and multiple output formats.
+A Go-based command-line tool for benchmarking Ollama models with configurable parameters, warmup phases, TTFT tracking, VRAM monitoring, and benchstat/CSV output.

 ## Features

 * Benchmark multiple models in a single run
 * Support for both text and image prompts
 * Configurable generation parameters (temperature, max tokens, seed, etc.)
- * Supports benchstat and CSV output formats
- * Detailed performance metrics (prefill, generate, load, total durations)
+ * Warmup phase before timed epochs to stabilize measurements
+ * Time-to-first-token (TTFT) tracking per epoch
+ * Model metadata display (parameter size, quantization level, family)
+ * VRAM and CPU memory usage tracking via running process info
+ * Controlled prompt token length for reproducible benchmarks
+ * Benchstat and CSV output formats

 ## Building from Source

 ```
-go build -o ollama-bench bench.go
-./ollama-bench -model gpt-oss:20b -epochs 6 -format csv
+go build -o ollama-bench ./cmd/bench
+./ollama-bench -model gemma3 -epochs 6 -format csv
 ```

 Using Go Run (without building)

 ```
-go run bench.go -model gpt-oss:20b -epochs 3
+go run ./cmd/bench -model gemma3 -epochs 3
 ```

 ## Usage
@@ -45,10 +49,16 @@ benchstat -col /name gemma.bench
 ./ollama-bench -model qwen3-vl -image photo.jpg -epochs 6 -max-tokens 100 -p "Describe this image"
 ```

+### Controlled Prompt Length
+
+```
+./ollama-bench -model gemma3 -epochs 6 -prompt-tokens 512
+```
+
 ### Advanced Example

 ```
-./ollama-bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -format csv -output results.csv
+./ollama-bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -warmup 2 -format csv -output results.csv
 ```

 ## Command Line Options
@@ -56,41 +66,48 @@ benchstat -col /name gemma.bench
 | Option  	| Description | Default |
 |----------|-------------|---------|
 | -model	| Comma-separated list of models to benchmark	| (required)		|
-| -epochs	| Number of iterations per model		| 1			|
-| -max-tokens	| Maximum tokens for model response		| 0 (unlimited)		|
+| -epochs	| Number of iterations per model		| 6			|
+| -max-tokens	| Maximum tokens for model response		| 200			|
 | -temperature	| Temperature parameter				| 0.0			|
 | -seed		| Random seed					| 0 (random)		|
 | -timeout	| Timeout in seconds				| 300			|
-| -p		| Prompt text					| "Write a long story."	|
+| -p		| Prompt text					| (default story prompt)	|
 | -image	| Image file to include in prompt		| 			|
 | -k		| Keep-alive duration in seconds		| 0			|
 | -format	| Output format (benchstat, csv)		| benchstat		|
 | -output	| Output file for results			| "" (stdout)		|
+| -warmup	| Number of warmup requests before timing	| 1			|
+| -prompt-tokens	| Generate prompt targeting ~N tokens (0 = use -p)	| 0		|
 | -v		| Verbose mode					| false			|
 | -debug	| Show debug information			| false			|

 ## Output Formats

-### Markdown Format
+### Benchstat Format (default)

-The default markdown format is suitable for copying and pasting into a GitHub issue and will look like:
-```
- Model | Step | Count | Duration | nsPerToken | tokensPerSec |
-|-------|------|-------|----------|------------|--------------|
-| gpt-oss:20b | prefill | 124 | 30.006458ms | 241987.56 | 4132.44 |
-| gpt-oss:20b | generate | 200 | 2.646843954s | 13234219.77 | 75.56 |
-| gpt-oss:20b | load | 1 | 121.674208ms | - | - |
-| gpt-oss:20b | total | 1 | 2.861047625s | - | - |
-```
-
-### Benchstat Format
-
-Compatible with Go's benchstat tool for statistical analysis:
+Compatible with Go's benchstat tool for statistical analysis. The per-token steps (`prefill`, `generate`) report `ns/token` followed by `token/sec` on one line; the timing steps (`ttft`, `load`, `total`) report a single value in the standard `ns/op` unit. Each epoch produces one set of lines -- benchstat aggregates across repeated runs to compute statistics.

 ```
-BenchmarkModel/name=gpt-oss:20b/step=prefill 128 78125.00 ns/token 12800.00 token/sec
-BenchmarkModel/name=gpt-oss:20b/step=generate 512 19531.25 ns/token 51200.00 token/sec
-BenchmarkModel/name=gpt-oss:20b/step=load 1 1500000000 ns/request
+# Model: gemma3 | Params: 4.3B | Quant: Q4_K_M | Family: gemma3 | Size: 4080218931 | VRAM: 4080218931
+BenchmarkModel/name=gemma3/step=prefill 1 78125.00 ns/token 12800.00 token/sec
+BenchmarkModel/name=gemma3/step=generate 1 19531.25 ns/token 51200.00 token/sec
+BenchmarkModel/name=gemma3/step=ttft 1 45123000 ns/op
+BenchmarkModel/name=gemma3/step=load 1 1500000000 ns/op
+BenchmarkModel/name=gemma3/step=total 1 2861047625 ns/op
+```
+
+Use with benchstat:
+```
+./ollama-bench -model gemma3 -epochs 6 > gemma3.bench
+benchstat -col /step gemma3.bench
+```
+
+Compare two runs:
+```
+./ollama-bench -model gemma3 -epochs 6 > before.bench
+# ... make changes ...
+./ollama-bench -model gemma3 -epochs 6 > after.bench
+benchstat before.bench after.bench
 ```

 ### CSV Format
@@ -99,17 +116,28 @@ Machine-readable comma-separated values:

 ```
 NAME,STEP,COUNT,NS_PER_COUNT,TOKEN_PER_SEC
-gpt-oss:20b,prefill,128,78125.00,12800.00
-gpt-oss:20b,generate,512,19531.25,51200.00
-gpt-oss:20b,load,1,1500000000,0
+# Model: gemma3 | Params: 4.3B | Quant: Q4_K_M | Family: gemma3 | Size: 4080218931 | VRAM: 4080218931
+gemma3,prefill,128,78125.00,12800.00
+gemma3,generate,512,19531.25,51200.00
+gemma3,ttft,1,45123000,0
+gemma3,load,1,1500000000,0
+gemma3,total,1,2861047625,0
 ```

 ## Metrics Explained

-The tool reports four types of metrics for each model:
+The tool reports the following metrics for each epoch:

- * prefill: Time spent processing the prompt
- * generate: Time spent generating the response
- * load: Model loading time (one-time cost)
- * total: Total request duration
+ * **prefill**: Time spent processing the prompt (ns/token)
+ * **generate**: Time spent generating the response (ns/token)
+ * **ttft**: Time to first token -- latency from request start to first response content
+ * **load**: Model loading time (one-time cost)
+ * **total**: Total request duration

+Additionally, the model info comment line (displayed once per model before epochs) includes:
+
+ * **Params**: Model parameter count (e.g., 4.3B)
+ * **Quant**: Quantization level (e.g., Q4_K_M)
+ * **Family**: Model family (e.g., gemma3)
+ * **Size**: Total model memory in bytes
+ * **VRAM**: GPU memory used by the loaded model (when Size > VRAM, the difference is CPU spill)
--- a/cmd/bench/bench.go
+++ b/cmd/bench/bench.go
@@ -17,19 +17,21 @@ import (
 )

 type flagOptions struct {
-	models      *string
-	epochs      *int
-	maxTokens   *int
-	temperature *float64
-	seed        *int
-	timeout     *int
-	prompt      *string
-	imageFile   *string
-	keepAlive   *float64
-	format      *string
-	outputFile  *string
-	debug       *bool
-	verbose     *bool
+	models       *string
+	epochs       *int
+	maxTokens    *int
+	temperature  *float64
+	seed         *int
+	timeout      *int
+	prompt       *string
+	imageFile    *string
+	keepAlive    *float64
+	format       *string
+	outputFile   *string
+	debug        *bool
+	verbose      *bool
+	warmup       *int
+	promptTokens *int
 }

 type Metrics struct {
@@ -39,48 +41,169 @@ type Metrics struct {
 	Duration time.Duration
 }

-var once sync.Once
+type ModelInfo struct {
+	Name              string
+	ParameterSize     string
+	QuantizationLevel string
+	Family            string
+	SizeBytes         int64
+	VRAMBytes         int64
+}

 const DefaultPrompt = `Please write a descriptive story about a llama named Alonso who grows up to be President of the Land of Llamas. Include details about Alonso's childhood, adolescent years, and how he grew up to be a political mover and shaker. Write the story with a sense of whimsy.`

+// Word list for generating prompts targeting a specific token count.
+var promptWordList = []string{
+	"the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
+	"a", "bright", "sunny", "day", "in", "the", "meadow", "where",
+	"flowers", "bloom", "and", "birds", "sing", "their", "morning",
+	"songs", "while", "gentle", "breeze", "carries", "sweet", "scent",
+	"of", "pine", "trees", "across", "rolling", "hills", "toward",
+	"distant", "mountains", "covered", "with", "fresh", "snow",
+	"beneath", "clear", "blue", "sky", "children", "play", "near",
+	"old", "stone", "bridge", "that", "crosses", "winding", "river",
+}
+
+func generatePromptForTokenCount(targetTokens int, epoch int) string {
+	// ~1.3 tokens per word heuristic
+	targetWords := int(float64(targetTokens) / 1.3)
+	if targetWords < 1 {
+		targetWords = 1
+	}
+
+	// Vary the starting offset by epoch to defeat KV cache prefix matching
+	offset := epoch * 7 // stride by a prime to get good distribution
+	n := len(promptWordList)
+	words := make([]string, targetWords)
+	for i := range words {
+		words[i] = promptWordList[((i+offset)%n+n)%n]
+	}
+	return strings.Join(words, " ")
+}
+
+func buildGenerateRequest(model string, fOpt flagOptions, imgData api.ImageData, epoch int) *api.GenerateRequest {
+	options := make(map[string]interface{})
+	if *fOpt.maxTokens > 0 {
+		options["num_predict"] = *fOpt.maxTokens
+	}
+	options["temperature"] = *fOpt.temperature
+	if fOpt.seed != nil && *fOpt.seed > 0 {
+		options["seed"] = *fOpt.seed
+	}
+
+	var keepAliveDuration *api.Duration
+	if *fOpt.keepAlive > 0 {
+		duration := api.Duration{Duration: time.Duration(*fOpt.keepAlive * float64(time.Second))}
+		keepAliveDuration = &duration
+	}
+
+	prompt := *fOpt.prompt
+	if *fOpt.promptTokens > 0 {
+		prompt = generatePromptForTokenCount(*fOpt.promptTokens, epoch)
+	} else {
+		// Vary the prompt per epoch to defeat KV cache prefix matching
+		prompt = fmt.Sprintf("[%d] %s", epoch, prompt)
+	}
+
+	req := &api.GenerateRequest{
+		Model:     model,
+		Prompt:    prompt,
+		Raw:       true,
+		Options:   options,
+		KeepAlive: keepAliveDuration,
+	}
+
+	if imgData != nil {
+		req.Images = []api.ImageData{imgData}
+	}
+
+	return req
+}
+
+func fetchModelInfo(ctx context.Context, client *api.Client, model string) ModelInfo {
+	info := ModelInfo{Name: model}
+	resp, err := client.Show(ctx, &api.ShowRequest{Model: model})
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "WARNING: Could not fetch model info for '%s': %v\n", model, err)
+		return info
+	}
+	info.ParameterSize = resp.Details.ParameterSize
+	info.QuantizationLevel = resp.Details.QuantizationLevel
+	info.Family = resp.Details.Family
+	return info
+}
+
+func fetchMemoryUsage(ctx context.Context, client *api.Client, model string) (size, vram int64) {
+	resp, err := client.ListRunning(ctx)
+	if err != nil {
+		if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+			fmt.Fprintf(os.Stderr, "WARNING: Could not fetch memory usage: %v\n", err)
+		}
+		return 0, 0
+	}
+	for _, m := range resp.Models {
+		if m.Name == model || m.Model == model {
+			return m.Size, m.SizeVRAM
+		}
+	}
+	// Try prefix match (model names may include :latest or tags)
+	for _, m := range resp.Models {
+		if strings.HasPrefix(m.Name, model) || strings.HasPrefix(m.Model, model) {
+			return m.Size, m.SizeVRAM
+		}
+	}
+	return 0, 0
+}
+
+func outputFormatHeader(w io.Writer, format string, verbose bool) {
+	switch format {
+	case "benchstat":
+		if verbose {
+			fmt.Fprintf(w, "goos: %s\n", runtime.GOOS)
+			fmt.Fprintf(w, "goarch: %s\n", runtime.GOARCH)
+		}
+	case "csv":
+		headings := []string{"NAME", "STEP", "COUNT", "NS_PER_COUNT", "TOKEN_PER_SEC"}
+		fmt.Fprintln(w, strings.Join(headings, ","))
+	}
+}
+
+func outputModelInfo(w io.Writer, format string, info ModelInfo) {
+	params := cmp.Or(info.ParameterSize, "unknown")
+	quant := cmp.Or(info.QuantizationLevel, "unknown")
+	family := cmp.Or(info.Family, "unknown")
+
+	memStr := ""
+	if info.SizeBytes > 0 {
+		memStr = fmt.Sprintf(" | Size: %d | VRAM: %d", info.SizeBytes, info.VRAMBytes)
+	}
+	fmt.Fprintf(w, "# Model: %s | Params: %s | Quant: %s | Family: %s%s\n",
+		info.Name, params, quant, family, memStr)
+}
+
 func OutputMetrics(w io.Writer, format string, metrics []Metrics, verbose bool) {
 	switch format {
 	case "benchstat":
-		if verbose {
-			printHeader := func() {
-				fmt.Fprintf(w, "sysname: %s\n", runtime.GOOS)
-				fmt.Fprintf(w, "machine: %s\n", runtime.GOARCH)
-			}
-			once.Do(printHeader)
-		}
 		for _, m := range metrics {
 			if m.Step == "generate" || m.Step == "prefill" {
 				if m.Count > 0 {
 					nsPerToken := float64(m.Duration.Nanoseconds()) / float64(m.Count)
 					tokensPerSec := float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9
-
-					fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s %d %.2f ns/token %.2f token/sec\n",
-						m.Model, m.Step, m.Count, nsPerToken, tokensPerSec)
+					fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s 1 %.2f ns/token %.2f token/sec\n",
+						m.Model, m.Step, nsPerToken, tokensPerSec)
 				} else {
-					fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s %d 0 ns/token 0 token/sec\n",
-						m.Model, m.Step, m.Count)
+					fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s 1 0 ns/token 0 token/sec\n",
+						m.Model, m.Step)
 				}
+			} else if m.Step == "ttft" {
+				fmt.Fprintf(w, "BenchmarkModel/name=%s/step=ttft 1 %d ns/op\n",
+					m.Model, m.Duration.Nanoseconds())
 			} else {
-				var suffix string
-				if m.Step == "load" {
-					suffix = "/step=load"
-				}
-				fmt.Fprintf(w, "BenchmarkModel/name=%s%s 1 %d ns/request\n",
-					m.Model, suffix, m.Duration.Nanoseconds())
+				fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s 1 %d ns/op\n",
+					m.Model, m.Step, m.Duration.Nanoseconds())
 			}
 		}
 	case "csv":
-		printHeader := func() {
-			headings := []string{"NAME", "STEP", "COUNT", "NS_PER_COUNT", "TOKEN_PER_SEC"}
-			fmt.Fprintln(w, strings.Join(headings, ","))
-		}
-		once.Do(printHeader)
-
 		for _, m := range metrics {
 			if m.Step == "generate" || m.Step == "prefill" {
 				var nsPerToken float64
@@ -94,39 +217,14 @@ func OutputMetrics(w io.Writer, format string, metrics []Metrics, verbose bool)
 				fmt.Fprintf(w, "%s,%s,1,%d,0\n", m.Model, m.Step, m.Duration.Nanoseconds())
 			}
 		}
-	case "markdown":
-		printHeader := func() {
-			fmt.Fprintln(w, "| Model | Step | Count | Duration | nsPerToken | tokensPerSec |")
-			fmt.Fprintln(w, "|-------|------|-------|----------|------------|--------------|")
-		}
-		once.Do(printHeader)
-
-		for _, m := range metrics {
-			var nsPerToken, tokensPerSec float64
-			var nsPerTokenStr, tokensPerSecStr string
-
-			if m.Step == "generate" || m.Step == "prefill" {
-				nsPerToken = float64(m.Duration.Nanoseconds()) / float64(m.Count)
-				tokensPerSec = float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9
-				nsPerTokenStr = fmt.Sprintf("%.2f", nsPerToken)
-				tokensPerSecStr = fmt.Sprintf("%.2f", tokensPerSec)
-			} else {
-				nsPerTokenStr = "-"
-				tokensPerSecStr = "-"
-			}
-
-			fmt.Fprintf(w, "| %s | %s | %d | %v | %s | %s |\n",
-				m.Model, m.Step, m.Count, m.Duration, nsPerTokenStr, tokensPerSecStr)
-		}
 	default:
 		fmt.Fprintf(os.Stderr, "Unknown output format '%s'\n", format)
 	}
 }

-func BenchmarkChat(fOpt flagOptions) error {
+func BenchmarkModel(fOpt flagOptions) error {
 	models := strings.Split(*fOpt.models, ",")

-	// todo - add multi-image support
 	var imgData api.ImageData
 	var err error
 	if *fOpt.imageFile != "" {
@@ -158,71 +256,124 @@ func BenchmarkChat(fOpt flagOptions) error {
 		out = f
 	}

+	outputFormatHeader(out, *fOpt.format, *fOpt.verbose)
+
+	// Log prompt-tokens info in debug mode
+	if *fOpt.debug && *fOpt.promptTokens > 0 {
+		prompt := generatePromptForTokenCount(*fOpt.promptTokens, 0)
+		wordCount := len(strings.Fields(prompt))
+		fmt.Fprintf(os.Stderr, "Generated prompt targeting ~%d tokens (%d words, varied per epoch)\n", *fOpt.promptTokens, wordCount)
+	}
+
 	for _, model := range models {
-		for range *fOpt.epochs {
-			options := make(map[string]interface{})
-			if *fOpt.maxTokens > 0 {
-				options["num_predict"] = *fOpt.maxTokens
-			}
-			options["temperature"] = *fOpt.temperature
-			if fOpt.seed != nil && *fOpt.seed > 0 {
-				options["seed"] = *fOpt.seed
-			}
-
-			var keepAliveDuration *api.Duration
-			if *fOpt.keepAlive > 0 {
-				duration := api.Duration{Duration: time.Duration(*fOpt.keepAlive * float64(time.Second))}
-				keepAliveDuration = &duration
-			}
-
-			req := &api.ChatRequest{
-				Model: model,
-				Messages: []api.Message{
-					{
-						Role:    "user",
-						Content: *fOpt.prompt,
-					},
-				},
-				Options:   options,
-				KeepAlive: keepAliveDuration,
-			}
-
-			if imgData != nil {
-				req.Messages[0].Images = []api.ImageData{imgData}
-			}
-
-			var responseMetrics *api.Metrics
+		// Fetch model info
+		infoCtx, infoCancel := context.WithTimeout(context.Background(), 10*time.Second)
+		info := fetchModelInfo(infoCtx, client, model)
+		infoCancel()

+		// Warmup phase (uses negative epoch numbers to avoid colliding with timed epochs)
+		for i := range *fOpt.warmup {
+			req := buildGenerateRequest(model, fOpt, imgData, -(i + 1))
 			ctx, cancel := context.WithTimeout(context.Background(), time.Duration(*fOpt.timeout)*time.Second)
-			defer cancel()

-			err = client.Chat(ctx, req, func(resp api.ChatResponse) error {
-				if *fOpt.debug {
-					fmt.Fprintf(os.Stderr, "%s", cmp.Or(resp.Message.Thinking, resp.Message.Content))
-				}
-
-				if resp.Done {
-					responseMetrics = &resp.Metrics
-				}
+			err = client.Generate(ctx, req, func(resp api.GenerateResponse) error {
 				return nil
 			})
-
-			if *fOpt.debug {
-				fmt.Fprintln(os.Stderr)
-			}
+			cancel()

 			if err != nil {
-				if ctx.Err() == context.DeadlineExceeded {
-					fmt.Fprintf(os.Stderr, "ERROR: Chat request timed out with model '%s' after %vs\n", model, 1)
-					continue
+				fmt.Fprintf(os.Stderr, "WARNING: Warmup %d/%d for %s failed: %v\n", i+1, *fOpt.warmup, model, err)
+			} else if *fOpt.debug {
+				fmt.Fprintf(os.Stderr, "Warmup %d/%d for %s complete\n", i+1, *fOpt.warmup, model)
+			}
+		}
+
+		// Fetch memory usage once after warmup (model is loaded and stable)
+		memCtx, memCancel := context.WithTimeout(context.Background(), 5*time.Second)
+		info.SizeBytes, info.VRAMBytes = fetchMemoryUsage(memCtx, client, model)
+		memCancel()
+
+		outputModelInfo(out, *fOpt.format, info)
+
+		// Timed epoch loop
+		shortCount := 0
+		for epoch := range *fOpt.epochs {
+			var responseMetrics *api.Metrics
+			var ttft time.Duration
+			short := false
+
+			// Retry loop: if the model hits a stop token before max-tokens,
+			// retry with a different prompt (up to maxRetries times).
+			const maxRetries = 3
+			for attempt := range maxRetries + 1 {
+				responseMetrics = nil
+				ttft = 0
+				var ttftOnce sync.Once
+
+				req := buildGenerateRequest(model, fOpt, imgData, epoch+attempt*1000)
+				requestStart := time.Now()
+
+				ctx, cancel := context.WithTimeout(context.Background(), time.Duration(*fOpt.timeout)*time.Second)
+
+				err = client.Generate(ctx, req, func(resp api.GenerateResponse) error {
+					if *fOpt.debug {
+						fmt.Fprintf(os.Stderr, "%s", cmp.Or(resp.Thinking, resp.Response))
+					}
+
+					// Capture TTFT on first content
+					ttftOnce.Do(func() {
+						if resp.Response != "" || resp.Thinking != "" {
+							ttft = time.Since(requestStart)
+						}
+					})
+
+					if resp.Done {
+						responseMetrics = &resp.Metrics
+					}
+					return nil
+				})
+				cancel()
+
+				if *fOpt.debug {
+					fmt.Fprintln(os.Stderr)
 				}
-				fmt.Fprintf(os.Stderr, "ERROR: Couldn't chat with model '%s': %v\n", model, err)
+
+				if err != nil {
+					if ctx.Err() == context.DeadlineExceeded {
+						fmt.Fprintf(os.Stderr, "ERROR: Request timed out with model '%s' after %vs\n", model, *fOpt.timeout)
+					} else {
+						fmt.Fprintf(os.Stderr, "ERROR: Couldn't generate with model '%s': %v\n", model, err)
+					}
+					break
+				}
+
+				if responseMetrics == nil {
+					fmt.Fprintf(os.Stderr, "ERROR: No metrics received for model '%s'\n", model)
+					break
+				}
+
+				// Check if the response was shorter than requested
+				short = *fOpt.maxTokens > 0 && responseMetrics.EvalCount < *fOpt.maxTokens
+				if !short || attempt == maxRetries {
+					break
+				}
+
+				if *fOpt.debug {
+					fmt.Fprintf(os.Stderr, "Short response (%d/%d tokens), retrying with different prompt (attempt %d/%d)\n",
+						responseMetrics.EvalCount, *fOpt.maxTokens, attempt+1, maxRetries)
+				}
+			}
+
+			if err != nil || responseMetrics == nil {
 				continue
 			}

-			if responseMetrics == nil {
-				fmt.Fprintf(os.Stderr, "ERROR: No metrics received for model '%s'\n", model)
-				continue
+			if short {
+				shortCount++
+				if *fOpt.debug {
+					fmt.Fprintf(os.Stderr, "WARNING: Short response (%d/%d tokens) after %d retries for epoch %d\n",
+						responseMetrics.EvalCount, *fOpt.maxTokens, maxRetries, epoch+1)
+				}
 			}

 			metrics := []Metrics{
@@ -238,6 +389,12 @@ func BenchmarkChat(fOpt flagOptions) error {
 					Count:    responseMetrics.EvalCount,
 					Duration: responseMetrics.EvalDuration,
 				},
+				{
+					Model:    model,
+					Step:     "ttft",
+					Count:    1,
+					Duration: ttft,
+				},
 				{
 					Model:    model,
 					Step:     "load",
@@ -254,15 +411,42 @@ func BenchmarkChat(fOpt flagOptions) error {

 			OutputMetrics(out, *fOpt.format, metrics, *fOpt.verbose)

+			if *fOpt.debug && *fOpt.promptTokens > 0 {
+				fmt.Fprintf(os.Stderr, "Generated prompt targeting ~%d tokens (actual: %d)\n",
+					*fOpt.promptTokens, responseMetrics.PromptEvalCount)
+			}
+
 			if *fOpt.keepAlive > 0 {
 				time.Sleep(time.Duration(*fOpt.keepAlive*float64(time.Second)) + 200*time.Millisecond)
 			}
 		}
+
+		if shortCount > 0 {
+			fmt.Fprintf(os.Stderr, "WARNING: %d/%d epochs for '%s' had short responses (<%d tokens). Generation metrics may be unreliable.\n",
+				shortCount, *fOpt.epochs, model, *fOpt.maxTokens)
+		}
+
+		// Unload model before moving to the next one
+		unloadModel(client, model, *fOpt.timeout)
 	}

 	return nil
 }

+// unloadModel asks the server to evict model (no prompt, zero keep-alive); a failure only warns.
+func unloadModel(client *api.Client, model string, timeout int) {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second) // timeout is in seconds
+	defer cancel()
+
+	zero := api.Duration{Duration: 0}
+	req := &api.GenerateRequest{Model: model, KeepAlive: &zero}
+	discard := func(api.GenerateResponse) error { return nil } // the response content is not needed
+	if err := client.Generate(ctx, req, discard); err != nil {
+		// Best effort: keep going, but report it since the next model would start with this one still loaded.
+		fmt.Fprintf(os.Stderr, "WARNING: Couldn't unload model '%s': %v\n", model, err)
+	}
+}
+
 func readImage(filePath string) (api.ImageData, error) {
 	file, err := os.Open(filePath)
 	if err != nil {
@@ -280,19 +464,21 @@ func readImage(filePath string) (api.ImageData, error) {

 func main() {
 	fOpt := flagOptions{
-		models:      flag.String("model", "", "Model to benchmark"),
-		epochs:      flag.Int("epochs", 6, "Number of epochs (iterations) per model"),
-		maxTokens:   flag.Int("max-tokens", 200, "Maximum tokens for model response"),
-		temperature: flag.Float64("temperature", 0, "Temperature parameter"),
-		seed:        flag.Int("seed", 0, "Random seed"),
-		timeout:     flag.Int("timeout", 60*5, "Timeout in seconds (default 300s)"),
-		prompt:      flag.String("p", DefaultPrompt, "Prompt to use"),
-		imageFile:   flag.String("image", "", "Filename for an image to include"),
-		keepAlive:   flag.Float64("k", 0, "Keep alive duration in seconds"),
-		format:      flag.String("format", "markdown", "Output format [benchstat|csv] (default benchstat)"),
-		outputFile:  flag.String("output", "", "Output file for results (stdout if empty)"),
-		verbose:     flag.Bool("v", false, "Show system information"),
-		debug:       flag.Bool("debug", false, "Show debug information"),
+		models:       flag.String("model", "", "Model to benchmark"),
+		epochs:       flag.Int("epochs", 6, "Number of epochs (iterations) per model"),
+		maxTokens:    flag.Int("max-tokens", 200, "Maximum tokens for model response"),
+		temperature:  flag.Float64("temperature", 0, "Temperature parameter"),
+		seed:         flag.Int("seed", 0, "Random seed"),
+		timeout:      flag.Int("timeout", 60*5, "Timeout in seconds (default 300s)"),
+		prompt:       flag.String("p", DefaultPrompt, "Prompt to use"),
+		imageFile:    flag.String("image", "", "Filename for an image to include"),
+		keepAlive:    flag.Float64("k", 0, "Keep alive duration in seconds"),
+		format:       flag.String("format", "benchstat", "Output format [benchstat|csv]"),
+		outputFile:   flag.String("output", "", "Output file for results (stdout if empty)"),
+		verbose:      flag.Bool("v", false, "Show system information"),
+		debug:        flag.Bool("debug", false, "Show debug information"),
+		warmup:       flag.Int("warmup", 1, "Number of warmup requests before timing"),
+		promptTokens: flag.Int("prompt-tokens", 0, "Generate prompt targeting ~N tokens (0 = use -p prompt)"),
 	}

 	flag.Usage = func() {
@@ -302,11 +488,12 @@ func main() {
 		fmt.Fprintf(os.Stderr, "Options:\n")
 		flag.PrintDefaults()
 		fmt.Fprintf(os.Stderr, "\nExamples:\n")
-		fmt.Fprintf(os.Stderr, "  bench -model gpt-oss:20b -epochs 3 -temperature 0.7\n")
+		fmt.Fprintf(os.Stderr, "  bench -model gemma3,llama3 -epochs 6\n")
+		fmt.Fprintf(os.Stderr, "  bench -model gemma3 -epochs 6 -prompt-tokens 512 -format csv\n")
 	}
 	flag.Parse()

-	if !slices.Contains([]string{"markdown", "benchstat", "csv"}, *fOpt.format) {
+	if !slices.Contains([]string{"benchstat", "csv"}, *fOpt.format) {
 		fmt.Fprintf(os.Stderr, "ERROR: Unknown format '%s'\n", *fOpt.format)
 		os.Exit(1)
 	}
@@ -317,5 +504,5 @@ func main() {
 		return
 	}

-	BenchmarkChat(fOpt)
+	BenchmarkModel(fOpt)
 }
--- a/cmd/bench/bench_test.go
+++ b/cmd/bench/bench_test.go
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -11,6 +11,7 @@ import (
 	"fmt"
 	"io"
 	"log"
+	"log/slog"
 	"math"
 	"net"
 	"net/http"
@@ -38,9 +39,12 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/cmd/config"
+	"github.com/ollama/ollama/cmd/launch"
 	"github.com/ollama/ollama/cmd/tui"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/internal/modelref"
+	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/readline"
@@ -57,36 +61,42 @@ import (

 func init() {
 	// Override default selectors to use Bubbletea TUI instead of raw terminal I/O.
-	config.DefaultSingleSelector = func(title string, items []config.ModelItem, current string) (string, error) {
+	launch.DefaultSingleSelector = func(title string, items []launch.ModelItem, current string) (string, error) {
+		if !term.IsTerminal(int(os.Stdin.Fd())) || !term.IsTerminal(int(os.Stdout.Fd())) {
+			return "", fmt.Errorf("model selection requires an interactive terminal; use --model to run in headless mode")
+		}
 		tuiItems := tui.ReorderItems(tui.ConvertItems(items))
 		result, err := tui.SelectSingle(title, tuiItems, current)
 		if errors.Is(err, tui.ErrCancelled) {
-			return "", config.ErrCancelled
+			return "", launch.ErrCancelled
 		}
 		return result, err
 	}

-	config.DefaultMultiSelector = func(title string, items []config.ModelItem, preChecked []string) ([]string, error) {
+	launch.DefaultMultiSelector = func(title string, items []launch.ModelItem, preChecked []string) ([]string, error) {
+		if !term.IsTerminal(int(os.Stdin.Fd())) || !term.IsTerminal(int(os.Stdout.Fd())) {
+			return nil, fmt.Errorf("model selection requires an interactive terminal; use --model to run in headless mode")
+		}
 		tuiItems := tui.ReorderItems(tui.ConvertItems(items))
 		result, err := tui.SelectMultiple(title, tuiItems, preChecked)
 		if errors.Is(err, tui.ErrCancelled) {
-			return nil, config.ErrCancelled
+			return nil, launch.ErrCancelled
 		}
 		return result, err
 	}

-	config.DefaultSignIn = func(modelName, signInURL string) (string, error) {
+	launch.DefaultSignIn = func(modelName, signInURL string) (string, error) {
 		userName, err := tui.RunSignIn(modelName, signInURL)
 		if errors.Is(err, tui.ErrCancelled) {
-			return "", config.ErrCancelled
+			return "", launch.ErrCancelled
 		}
 		return userName, err
 	}

-	config.DefaultConfirmPrompt = func(prompt string) (bool, error) {
+	launch.DefaultConfirmPrompt = func(prompt string) (bool, error) {
 		ok, err := tui.RunConfirm(prompt)
 		if errors.Is(err, tui.ErrCancelled) {
-			return false, config.ErrCancelled
+			return false, launch.ErrCancelled
 		}
 		return ok, err
 	}
@@ -131,6 +141,17 @@ func getModelfileName(cmd *cobra.Command) (string, error) {
 	return absName, nil
 }

+// isLocalhost returns true if the configured Ollama host is a loopback or unspecified address.
+func isLocalhost() bool {
+	host := envconfig.Host()
+	h, _, _ := net.SplitHostPort(host.Host) // error ignored: on failure h is "" and both checks below report false
+	if h == "localhost" { // exact, case-sensitive match on the literal name
+		return true
+	}
+	ip := net.ParseIP(h) // nil when h is not an IP literal
+	return ip != nil && (ip.IsLoopback() || ip.IsUnspecified())
+}
+
 func CreateHandler(cmd *cobra.Command, args []string) error {
 	p := progress.NewProgress(os.Stderr)
 	defer p.Stop()
@@ -145,6 +166,9 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	// Check for --experimental flag for safetensors model creation
 	experimental, _ := cmd.Flags().GetBool("experimental")
 	if experimental {
+		if !isLocalhost() {
+			return errors.New("remote safetensor model creation not yet supported")
+		}
 		// Get Modelfile content - either from -f flag or default to "FROM ."
 		var reader io.Reader
 		filename, err := getModelfileName(cmd)
@@ -168,29 +192,9 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 			return fmt.Errorf("failed to parse Modelfile: %w", err)
 		}

-		// Extract FROM path and configuration
-		var modelDir string
-		mfConfig := &xcreateclient.ModelfileConfig{}
-
-		for _, cmd := range modelfile.Commands {
-			switch cmd.Name {
-			case "model":
-				modelDir = cmd.Args
-			case "template":
-				mfConfig.Template = cmd.Args
-			case "system":
-				mfConfig.System = cmd.Args
-			case "license":
-				mfConfig.License = cmd.Args
-			case "parser":
-				mfConfig.Parser = cmd.Args
-			case "renderer":
-				mfConfig.Renderer = cmd.Args
-			}
-		}
-
-		if modelDir == "" {
-			modelDir = "."
+		modelDir, mfConfig, err := xcreateclient.ConfigFromModelfile(modelfile)
+		if err != nil {
+			return err
 		}

 		// Resolve relative paths based on Modelfile location
@@ -214,6 +218,9 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		if filename == "" {
 			// No Modelfile found - check if current directory is an image gen model
 			if create.IsTensorModelDir(".") {
+				if !isLocalhost() {
+					return errors.New("remote safetensor model creation not yet supported")
+				}
 				quantize, _ := cmd.Flags().GetString("quantize")
 				return xcreateclient.CreateModel(xcreateclient.CreateOptions{
 					ModelName: modelName,
@@ -406,12 +413,14 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
 		return err
 	}

+	requestedCloud := modelref.HasExplicitCloudSource(opts.Model)
+
 	if info, err := client.Show(cmd.Context(), &api.ShowRequest{Model: opts.Model}); err != nil {
 		return err
-	} else if info.RemoteHost != "" {
+	} else if info.RemoteHost != "" || requestedCloud {
 		// Cloud model, no need to load/unload

-		isCloud := strings.HasPrefix(info.RemoteHost, "https://ollama.com")
+		isCloud := requestedCloud || strings.HasPrefix(info.RemoteHost, "https://ollama.com")

 		// Check if user is signed in for ollama.com cloud models
 		if isCloud {
@@ -422,10 +431,14 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {

 		if opts.ShowConnect {
 			p.StopAndClear()
+			remoteModel := info.RemoteModel
+			if remoteModel == "" {
+				remoteModel = opts.Model
+			}
 			if isCloud {
-				fmt.Fprintf(os.Stderr, "Connecting to '%s' on 'ollama.com' ⚡\n", info.RemoteModel)
+				fmt.Fprintf(os.Stderr, "Connecting to '%s' on 'ollama.com' ⚡\n", remoteModel)
 			} else {
-				fmt.Fprintf(os.Stderr, "Connecting to '%s' on '%s'\n", info.RemoteModel, info.RemoteHost)
+				fmt.Fprintf(os.Stderr, "Connecting to '%s' on '%s'\n", remoteModel, info.RemoteHost)
 			}
 		}

@@ -497,6 +510,64 @@ func generateEmbedding(cmd *cobra.Command, modelName, input string, keepAlive *a
 	return nil
 }

+// TODO(parthsareen): consolidate with TUI signin flow
+func handleCloudAuthorizationError(err error) bool { // reports whether err was a 401 authorization error that has been explained to the user
+	var authErr api.AuthorizationError
+	if errors.As(err, &authErr) && authErr.StatusCode == http.StatusUnauthorized {
+		fmt.Printf("You need to be signed in to Ollama to run Cloud models.\n\n") // written to stdout
+		if authErr.SigninURL != "" { // connect instructions are printed only when a sign-in URL was supplied
+			fmt.Printf(ConnectInstructions, authErr.SigninURL)
+		}
+		return true
+	}
+
+	return false // anything else is left for the caller to handle
+}
+
+// TEMP(drifkin): To match legacy `ollama run some-model:cloud` behavior, we
+// best-effort pull cloud stub files for any explicit cloud source models.
+// Remove this once `/api/tags` is cloud-aware.
+func ensureCloudStub(ctx context.Context, client *api.Client, modelName string) {
+	if !modelref.HasExplicitCloudSource(modelName) { // only names with an explicit cloud source are handled
+		return
+	}
+
+	normalizedName, _, err := modelref.NormalizePullName(modelName) // second return value is not used here
+	if err != nil {
+		slog.Warn("failed to normalize pull name", "model", modelName, "error", err, "normalizedName", normalizedName)
+		return
+	}
+
+	listResp, err := client.List(ctx)
+	if err != nil {
+		slog.Warn("failed to list models", "error", err)
+		return
+	}
+
+	if hasListedModelName(listResp.Models, modelName) || hasListedModelName(listResp.Models, normalizedName) { // already listed under either spelling: skip the pull
+		return
+	}
+
+	logutil.Trace("pulling cloud stub", "model", modelName, "normalizedName", normalizedName)
+	err = client.Pull(ctx, &api.PullRequest{
+		Model: normalizedName,
+	}, func(api.ProgressResponse) error { // progress updates are discarded
+		return nil
+	})
+	if err != nil {
+		slog.Warn("failed to pull cloud stub", "model", modelName, "error", err) // logged only; nothing is returned to the caller
+	}
+}
+
+func hasListedModelName(models []api.ListModelResponse, name string) bool { // reports whether any entry's Name or Model equals name, ignoring case
+	for _, m := range models {
+		if strings.EqualFold(m.Name, name) || strings.EqualFold(m.Model, name) { // either field may carry the match
+			return true
+		}
+	}
+	return false // also the result for an empty or nil slice
+}
+
 func RunHandler(cmd *cobra.Command, args []string) error {
 	interactive := true

@@ -585,17 +656,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}
 	opts.WordWrap = !nowrap

-	useImagegen := false
-	if cmd.Flags().Lookup("imagegen") != nil {
-		useImagegen, err = cmd.Flags().GetBool("imagegen")
-		if err != nil {
-			return err
-		}
-	}
-	if useImagegen {
-		opts.Options["use_imagegen_runner"] = true
-	}
-
 	// Fill out the rest of the options based on information about the
 	// model.
 	client, err := api.ClientFromEnvironment()
@@ -604,12 +664,16 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}

 	name := args[0]
+	requestedCloud := modelref.HasExplicitCloudSource(name)

 	info, err := func() (*api.ShowResponse, error) {
 		showReq := &api.ShowRequest{Name: name}
 		info, err := client.Show(cmd.Context(), showReq)
 		var se api.StatusError
 		if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
+			if requestedCloud {
+				return nil, err
+			}
 			if err := PullHandler(cmd, []string{name}); err != nil {
 				return nil, err
 			}
@@ -618,9 +682,14 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		return info, err
 	}()
 	if err != nil {
+		if handleCloudAuthorizationError(err) {
+			return nil
+		}
 		return err
 	}

+	ensureCloudStub(cmd.Context(), client, name)
+
 	opts.Think, err = inferThinkingOption(&info.Capabilities, &opts, thinkFlag.Changed)
 	if err != nil {
 		return err
@@ -712,7 +781,13 @@ func RunHandler(cmd *cobra.Command, args []string) error {

 		return generateInteractive(cmd, opts)
 	}
-	return generate(cmd, opts)
+	if err := generate(cmd, opts); err != nil {
+		if handleCloudAuthorizationError(err) {
+			return nil
+		}
+		return err
+	}
+	return nil
 }

 func SigninHandler(cmd *cobra.Command, args []string) error {
@@ -1892,6 +1967,24 @@ func ensureServerRunning(ctx context.Context) error {
 	}
 }

+func launchInteractiveModel(cmd *cobra.Command, modelName string) error { // loads modelName, then runs an interactive session; failures come back wrapped
+	opts := runOptions{
+		Model:       modelName,
+		WordWrap:    os.Getenv("TERM") == "xterm-256color", // enabled only for this exact TERM value
+		Options:     map[string]any{},
+		ShowConnect: true,
+	}
+	// loadOrUnloadModel is cloud-safe here: remote/cloud models skip local preload
+	// and only validate auth/connectivity before interactive chat starts.
+	if err := loadOrUnloadModel(cmd, &opts); err != nil {
+		return fmt.Errorf("error loading model: %w", err)
+	}
+	if err := generateInteractive(cmd, opts); err != nil { // opts is passed by value here, by pointer above
+		return fmt.Errorf("error running model: %w", err)
+	}
+	return nil
+}
+
 // runInteractiveTUI runs the main interactive TUI menu.
 func runInteractiveTUI(cmd *cobra.Command) {
 	// Ensure the server is running before showing the TUI
@@ -1900,175 +1993,81 @@ func runInteractiveTUI(cmd *cobra.Command) {
 		return
 	}

-	// Selector adapters for tui
-	singleSelector := func(title string, items []config.ModelItem, current string) (string, error) {
-		tuiItems := tui.ReorderItems(tui.ConvertItems(items))
-		result, err := tui.SelectSingle(title, tuiItems, current)
-		if errors.Is(err, tui.ErrCancelled) {
-			return "", config.ErrCancelled
-		}
-		return result, err
-	}
-
-	multiSelector := func(title string, items []config.ModelItem, preChecked []string) ([]string, error) {
-		tuiItems := tui.ReorderItems(tui.ConvertItems(items))
-		result, err := tui.SelectMultiple(title, tuiItems, preChecked)
-		if errors.Is(err, tui.ErrCancelled) {
-			return nil, config.ErrCancelled
-		}
-		return result, err
+	deps := launcherDeps{
+		buildState:        launch.BuildLauncherState,
+		runMenu:           tui.RunMenu,
+		resolveRunModel:   launch.ResolveRunModel,
+		launchIntegration: launch.LaunchIntegration,
+		runModel:          launchInteractiveModel,
 	}

 	for {
-		result, err := tui.Run()
+		continueLoop, err := runInteractiveTUIStep(cmd, deps)
 		if err != nil {
 			fmt.Fprintf(os.Stderr, "Error: %v\n", err)
+		}
+		if !continueLoop {
 			return
 		}
+	}
+}

-		runModel := func(modelName string) {
-			client, err := api.ClientFromEnvironment()
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "Error: %v\n", err)
-				return
-			}
-			if err := config.ShowOrPull(cmd.Context(), client, modelName); err != nil {
-				if errors.Is(err, config.ErrCancelled) {
-					return
-				}
-				fmt.Fprintf(os.Stderr, "Error: %v\n", err)
-				return
-			}
-			_ = config.SetLastModel(modelName)
-			opts := runOptions{
-				Model:       modelName,
-				WordWrap:    os.Getenv("TERM") == "xterm-256color",
-				Options:     map[string]any{},
-				ShowConnect: true,
-			}
-			if err := loadOrUnloadModel(cmd, &opts); err != nil {
-				fmt.Fprintf(os.Stderr, "Error loading model: %v\n", err)
-				return
-			}
-			if err := generateInteractive(cmd, opts); err != nil {
-				fmt.Fprintf(os.Stderr, "Error running model: %v\n", err)
-			}
-		}
+type launcherDeps struct { // collaborators of the launcher loop, held as function values so callers can swap implementations
+	buildState        func(context.Context) (*launch.LauncherState, error) // produces the state handed to runMenu
+	runMenu           func(*launch.LauncherState) (tui.TUIAction, error) // returns the action chosen for that state
+	resolveRunModel   func(context.Context, launch.RunModelRequest) (string, error) // turns a run-model request into a model name
+	launchIntegration func(context.Context, launch.IntegrationLaunchRequest) error // handles an integration launch request
+	runModel          func(*cobra.Command, string) error // runs the model with the given name
+}

-		launchIntegration := func(name string) bool {
-			if err := config.EnsureInstalled(name); err != nil {
-				fmt.Fprintf(os.Stderr, "Error: %v\n", err)
-				return true
-			}
-			// If not configured or model no longer exists, prompt for model selection
-			configuredModel := config.IntegrationModel(name)
-			if configuredModel == "" || !config.ModelExists(cmd.Context(), configuredModel) || config.IsCloudModelDisabled(cmd.Context(), configuredModel) {
-				err := config.ConfigureIntegrationWithSelectors(cmd.Context(), name, singleSelector, multiSelector)
-				if errors.Is(err, config.ErrCancelled) {
-					return false // Return to main menu
-				}
-				if err != nil {
-					fmt.Fprintf(os.Stderr, "Error configuring %s: %v\n", name, err)
-					return true
-				}
-			}
-			if err := config.LaunchIntegration(name); err != nil {
-				fmt.Fprintf(os.Stderr, "Error launching %s: %v\n", name, err)
-			}
-			return true
-		}
+func runInteractiveTUIStep(cmd *cobra.Command, deps launcherDeps) (bool, error) {
+	state, err := deps.buildState(cmd.Context())
+	if err != nil {
+		return false, fmt.Errorf("build launcher state: %w", err)
+	}

-		switch result.Selection {
-		case tui.SelectionNone:
-			// User quit
-			return
-		case tui.SelectionRunModel:
-			_ = config.SetLastSelection("run")
-			if modelName := config.LastModel(); modelName != "" && !config.IsCloudModelDisabled(cmd.Context(), modelName) {
-				runModel(modelName)
-			} else {
-				modelName, err := config.SelectModelWithSelector(cmd.Context(), singleSelector)
-				if errors.Is(err, config.ErrCancelled) {
-					continue // Return to main menu
-				}
-				if err != nil {
-					fmt.Fprintf(os.Stderr, "Error selecting model: %v\n", err)
-					continue
-				}
-				runModel(modelName)
-			}
-		case tui.SelectionChangeRunModel:
-			_ = config.SetLastSelection("run")
-			// Use model from modal if selected, otherwise show picker
-			modelName := result.Model
-			if modelName == "" {
-				var err error
-				modelName, err = config.SelectModelWithSelector(cmd.Context(), singleSelector)
-				if errors.Is(err, config.ErrCancelled) {
-					continue // Return to main menu
-				}
-				if err != nil {
-					fmt.Fprintf(os.Stderr, "Error selecting model: %v\n", err)
-					continue
-				}
-			}
-			if config.IsCloudModelDisabled(cmd.Context(), modelName) {
-				continue // Return to main menu
-			}
-			runModel(modelName)
-		case tui.SelectionIntegration:
-			_ = config.SetLastSelection(result.Integration)
-			if !launchIntegration(result.Integration) {
-				continue // Return to main menu
-			}
-		case tui.SelectionChangeIntegration:
-			_ = config.SetLastSelection(result.Integration)
-			if len(result.Models) > 0 {
-				// Filter out cloud-disabled models
-				var filtered []string
-				for _, m := range result.Models {
-					if !config.IsCloudModelDisabled(cmd.Context(), m) {
-						filtered = append(filtered, m)
-					}
-				}
-				if len(filtered) == 0 {
-					continue
-				}
-				result.Models = filtered
-				// Multi-select from modal (Editor integrations)
-				if err := config.SaveAndEditIntegration(result.Integration, result.Models); err != nil {
-					fmt.Fprintf(os.Stderr, "Error configuring %s: %v\n", result.Integration, err)
-					continue
-				}
-				if err := config.LaunchIntegrationWithModel(result.Integration, result.Models[0]); err != nil {
-					fmt.Fprintf(os.Stderr, "Error launching %s: %v\n", result.Integration, err)
-				}
-			} else if result.Model != "" {
-				if config.IsCloudModelDisabled(cmd.Context(), result.Model) {
-					continue
-				}
-				// Single-select from modal - save and launch
-				if err := config.SaveIntegration(result.Integration, []string{result.Model}); err != nil {
-					fmt.Fprintf(os.Stderr, "Error saving config: %v\n", err)
-					continue
-				}
-				if err := config.LaunchIntegrationWithModel(result.Integration, result.Model); err != nil {
-					fmt.Fprintf(os.Stderr, "Error launching %s: %v\n", result.Integration, err)
-				}
-			} else {
-				err := config.ConfigureIntegrationWithSelectors(cmd.Context(), result.Integration, singleSelector, multiSelector)
-				if errors.Is(err, config.ErrCancelled) {
-					continue // Return to main menu
-				}
-				if err != nil {
-					fmt.Fprintf(os.Stderr, "Error configuring %s: %v\n", result.Integration, err)
-					continue
-				}
-				if err := config.LaunchIntegration(result.Integration); err != nil {
-					fmt.Fprintf(os.Stderr, "Error launching %s: %v\n", result.Integration, err)
-				}
-			}
+	action, err := deps.runMenu(state)
+	if err != nil {
+		return false, fmt.Errorf("run launcher menu: %w", err)
+	}
+
+	return runLauncherAction(cmd, action, deps)
+}
+
+func saveLauncherSelection(action tui.TUIAction) { // stores action.LastSelection() through config.SetLastSelection
+	// Best effort only: this affects menu recall, not launch correctness.
+	_ = config.SetLastSelection(action.LastSelection()) // error deliberately discarded, per the note above
+}
+
+func runLauncherAction(cmd *cobra.Command, action tui.TUIAction, deps launcherDeps) (bool, error) { // bool: whether the menu loop should keep running
+	switch action.Kind {
+	case tui.TUIActionNone: // no action chosen: stop the loop without an error
+		return false, nil
+	case tui.TUIActionRunModel:
+		saveLauncherSelection(action)
+		modelName, err := deps.resolveRunModel(cmd.Context(), action.RunModelRequest())
+		if errors.Is(err, launch.ErrCancelled) { // cancellation is not an error; continue the loop
+			return true, nil
 		}
+		if err != nil {
+			return true, fmt.Errorf("selecting model: %w", err)
+		}
+		if err := deps.runModel(cmd, modelName); err != nil { // returned unwrapped; the loop still continues
+			return true, err
+		}
+		return true, nil
+	case tui.TUIActionLaunchIntegration:
+		saveLauncherSelection(action)
+		err := deps.launchIntegration(cmd.Context(), action.IntegrationLaunchRequest())
+		if errors.Is(err, launch.ErrCancelled) { // cancellation is not an error; continue the loop
+			return true, nil
+		}
+		if err != nil {
+			return true, fmt.Errorf("launching %s: %w", action.Integration, err)
+		}
+		return true, nil
+	default: // unrecognised kind: the only error path that also stops the loop
+		return false, fmt.Errorf("unknown launcher action: %d", action.Kind)
 	}
 }

@@ -2338,7 +2337,7 @@ func NewCLI() *cobra.Command {
 		copyCmd,
 		deleteCmd,
 		runnerCmd,
-		config.LaunchCmd(checkServerHeartbeat, runInteractiveTUI),
+		launch.LaunchCmd(checkServerHeartbeat, runInteractiveTUI),
 	)

 	return rootCmd
--- a/cmd/cmd_launcher_test.go
+++ b/cmd/cmd_launcher_test.go
@@ -0,0 +1,233 @@
+package cmd
+
+import (
+	"context"
+	"testing"
+
+	"github.com/spf13/cobra"
+
+	"github.com/ollama/ollama/cmd/config"
+	"github.com/ollama/ollama/cmd/launch"
+	"github.com/ollama/ollama/cmd/tui"
+)
+
+func setCmdTestHome(t *testing.T, dir string) { // points both home-directory environment variables at dir for this test
+	t.Helper()
+	t.Setenv("HOME", dir)
+	t.Setenv("USERPROFILE", dir) // presumably for Windows home-directory lookups; confirm against the config package
+}
+
+func unexpectedRunModelResolution(t *testing.T) func(context.Context, launch.RunModelRequest) (string, error) { // stub that fails the test if called
+	t.Helper()
+	return func(ctx context.Context, req launch.RunModelRequest) (string, error) {
+		t.Fatalf("did not expect run-model resolution: %+v", req)
+		return "", nil // only present to satisfy the signature; t.Fatalf stops the test first
+	}
+}
+
+func unexpectedIntegrationLaunch(t *testing.T) func(context.Context, launch.IntegrationLaunchRequest) error { // stub that fails the test if called
+	t.Helper()
+	return func(ctx context.Context, req launch.IntegrationLaunchRequest) error {
+		t.Fatalf("did not expect integration launch: %+v", req)
+		return nil // only present to satisfy the signature; t.Fatalf stops the test first
+	}
+}
+
+func unexpectedModelLaunch(t *testing.T) func(*cobra.Command, string) error { // stub that fails the test if called
+	t.Helper()
+	return func(cmd *cobra.Command, model string) error {
+		t.Fatalf("did not expect chat launch: %s", model)
+		return nil // only present to satisfy the signature; t.Fatalf stops the test first
+	}
+}
+
+func TestRunInteractiveTUI_RunModelActionsUseResolveRunModel(t *testing.T) { // run-model actions must go through resolveRunModel and then runModel
+	tests := []struct {
+		name      string
+		action    tui.TUIAction
+		wantForce bool
+		wantModel string
+	}{
+		{
+			name:      "enter uses saved model flow",
+			action:    tui.TUIAction{Kind: tui.TUIActionRunModel},
+			wantModel: "qwen3:8b",
+		},
+		{
+			name:      "right forces picker",
+			action:    tui.TUIAction{Kind: tui.TUIActionRunModel, ForceConfigure: true}, // expected to surface as ForcePicker on the request
+			wantForce: true,
+			wantModel: "glm-5:cloud",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			setCmdTestHome(t, t.TempDir()) // fresh home per subtest
+
+			var menuCalls int
+			runMenu := func(state *launch.LauncherState) (tui.TUIAction, error) {
+				menuCalls++
+				if menuCalls == 1 { // first call yields the action under test
+					return tt.action, nil
+				}
+				return tui.TUIAction{Kind: tui.TUIActionNone}, nil // later calls end the loop
+			}
+
+			var gotReq launch.RunModelRequest
+			var launched string
+			deps := launcherDeps{
+				buildState: func(ctx context.Context) (*launch.LauncherState, error) {
+					return &launch.LauncherState{}, nil
+				},
+				runMenu: runMenu,
+				resolveRunModel: func(ctx context.Context, req launch.RunModelRequest) (string, error) {
+					gotReq = req // captured for the ForcePicker assertion below
+					return tt.wantModel, nil
+				},
+				launchIntegration: unexpectedIntegrationLaunch(t),
+				runModel: func(cmd *cobra.Command, model string) error {
+					launched = model
+					return nil
+				},
+			}
+
+			cmd := &cobra.Command{}
+			cmd.SetContext(context.Background())
+			for { // same stepping loop shape as runInteractiveTUI, but any step error fails the test
+				continueLoop, err := runInteractiveTUIStep(cmd, deps)
+				if err != nil {
+					t.Fatalf("unexpected step error: %v", err)
+				}
+				if !continueLoop {
+					break
+				}
+			}
+
+			if gotReq.ForcePicker != tt.wantForce {
+				t.Fatalf("expected ForcePicker=%v, got %v", tt.wantForce, gotReq.ForcePicker)
+			}
+			if launched != tt.wantModel {
+				t.Fatalf("expected interactive launcher to run %q, got %q", tt.wantModel, launched)
+			}
+			if got := config.LastSelection(); got != "run" { // the selection saved for a run-model action
+				t.Fatalf("expected last selection to be run, got %q", got)
+			}
+		})
+	}
+}
+
+func TestRunInteractiveTUI_IntegrationActionsUseLaunchIntegration(t *testing.T) { // integration actions must go through launchIntegration only
+	tests := []struct {
+		name      string
+		action    tui.TUIAction
+		wantForce bool
+	}{
+		{
+			name:   "enter launches integration",
+			action: tui.TUIAction{Kind: tui.TUIActionLaunchIntegration, Integration: "claude"},
+		},
+		{
+			name:      "right forces configure",
+			action:    tui.TUIAction{Kind: tui.TUIActionLaunchIntegration, Integration: "claude", ForceConfigure: true},
+			wantForce: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			setCmdTestHome(t, t.TempDir()) // fresh home per subtest
+
+			var menuCalls int
+			runMenu := func(state *launch.LauncherState) (tui.TUIAction, error) {
+				menuCalls++
+				if menuCalls == 1 { // first call yields the action under test
+					return tt.action, nil
+				}
+				return tui.TUIAction{Kind: tui.TUIActionNone}, nil // later calls end the loop
+			}
+
+			var gotReq launch.IntegrationLaunchRequest
+			deps := launcherDeps{
+				buildState: func(ctx context.Context) (*launch.LauncherState, error) {
+					return &launch.LauncherState{}, nil
+				},
+				runMenu:         runMenu,
+				resolveRunModel: unexpectedRunModelResolution(t),
+				launchIntegration: func(ctx context.Context, req launch.IntegrationLaunchRequest) error {
+					gotReq = req // captured for the assertions below
+					return nil
+				},
+				runModel: unexpectedModelLaunch(t),
+			}
+
+			cmd := &cobra.Command{}
+			cmd.SetContext(context.Background())
+			for { // step until the fake menu returns TUIActionNone
+				continueLoop, err := runInteractiveTUIStep(cmd, deps)
+				if err != nil {
+					t.Fatalf("unexpected step error: %v", err)
+				}
+				if !continueLoop {
+					break
+				}
+			}
+
+			if gotReq.Name != "claude" {
+				t.Fatalf("expected integration name to be passed through, got %q", gotReq.Name)
+			}
+			if gotReq.ForceConfigure != tt.wantForce {
+				t.Fatalf("expected ForceConfigure=%v, got %v", tt.wantForce, gotReq.ForceConfigure)
+			}
+			if got := config.LastSelection(); got != "claude" { // the selection saved for this integration action
+				t.Fatalf("expected last selection to be claude, got %q", got)
+			}
+		})
+	}
+}
+
+func TestRunLauncherAction_RunModelContinuesAfterCancellation(t *testing.T) { // a cancelled model pick must keep the loop running with no error
+	setCmdTestHome(t, t.TempDir())
+
+	cmd := &cobra.Command{}
+	cmd.SetContext(context.Background())
+
+	continueLoop, err := runLauncherAction(cmd, tui.TUIAction{Kind: tui.TUIActionRunModel}, launcherDeps{
+		buildState: nil, // not called by runLauncherAction
+		runMenu:    nil, // not called by runLauncherAction
+		resolveRunModel: func(ctx context.Context, req launch.RunModelRequest) (string, error) {
+			return "", launch.ErrCancelled
+		},
+		launchIntegration: unexpectedIntegrationLaunch(t),
+		runModel:          unexpectedModelLaunch(t), // must not run after a cancelled resolution
+	})
+	if err != nil {
+		t.Fatalf("expected nil error on cancellation, got %v", err)
+	}
+	if !continueLoop {
+		t.Fatal("expected cancellation to continue the menu loop")
+	}
+}
+
+func TestRunLauncherAction_IntegrationContinuesAfterCancellation(t *testing.T) { // a cancelled integration launch must keep the loop running with no error
+	setCmdTestHome(t, t.TempDir())
+
+	cmd := &cobra.Command{}
+	cmd.SetContext(context.Background())
+
+	continueLoop, err := runLauncherAction(cmd, tui.TUIAction{Kind: tui.TUIActionLaunchIntegration, Integration: "claude"}, launcherDeps{
+		buildState:      nil, // not called by runLauncherAction
+		runMenu:         nil, // not called by runLauncherAction
+		resolveRunModel: unexpectedRunModelResolution(t),
+		launchIntegration: func(ctx context.Context, req launch.IntegrationLaunchRequest) error {
+			return launch.ErrCancelled
+		},
+		runModel: unexpectedModelLaunch(t),
+	})
+	if err != nil {
+		t.Fatalf("expected nil error on cancellation, got %v", err)
+	}
+	if !continueLoop {
+		t.Fatal("expected cancellation to continue the menu loop")
+	}
+}
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -705,6 +705,347 @@ func TestRunEmbeddingModelNoInput(t *testing.T) {
 	}
 }

+// TestRunHandler_CloudAuthErrorOnShow_PrintsSigninMessage checks that a 401
+// from /api/show makes RunHandler print sign-in guidance and skip /api/generate.
+func TestRunHandler_CloudAuthErrorOnShow_PrintsSigninMessage(t *testing.T) {
+	var generateCalled bool
+
+	mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.URL.Path == "/api/show" && r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusUnauthorized)
+			if err := json.NewEncoder(w).Encode(map[string]string{
+				"error":      "unauthorized",
+				"signin_url": "https://ollama.com/signin",
+			}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		case r.URL.Path == "/api/generate" && r.Method == http.MethodPost:
+			generateCalled = true
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.GenerateResponse{Done: true}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+
+	t.Setenv("OLLAMA_HOST", mockServer.URL)
+	t.Cleanup(mockServer.Close)
+
+	cmd := &cobra.Command{}
+	cmd.SetContext(t.Context())
+	cmd.Flags().String("keepalive", "", "")
+	cmd.Flags().Bool("truncate", false, "")
+	cmd.Flags().Int("dimensions", 0, "")
+	cmd.Flags().Bool("verbose", false, "")
+	cmd.Flags().Bool("insecure", false, "")
+	cmd.Flags().Bool("nowordwrap", false, "")
+	cmd.Flags().String("format", "", "")
+	cmd.Flags().String("think", "", "")
+	cmd.Flags().Bool("hidethinking", false, "")
+
+	oldStdout := os.Stdout
+	readOut, writeOut, err := os.Pipe()
+	if err != nil {
+		t.Fatalf("failed to create stdout pipe: %v", err)
+	}
+	os.Stdout = writeOut // capture what the handler prints to stdout
+	t.Cleanup(func() { os.Stdout = oldStdout })
+	t.Cleanup(func() { _ = readOut.Close() })
+
+	err = RunHandler(cmd, []string{"gpt-oss:20b:cloud", "hi"})
+	_ = writeOut.Close() // lets the io.Copy below reach EOF
+	var out bytes.Buffer
+	_, _ = io.Copy(&out, readOut)
+
+	if err != nil {
+		t.Fatalf("RunHandler returned error: %v", err)
+	}
+	if generateCalled {
+		t.Fatal("expected run to stop before /api/generate after unauthorized /api/show")
+	}
+	if !strings.Contains(out.String(), "You need to be signed in to Ollama to run Cloud models.") {
+		t.Fatalf("expected sign-in guidance message, got %q", out.String())
+	}
+	if !strings.Contains(out.String(), "https://ollama.com/signin") {
+		t.Fatalf("expected signin_url in output, got %q", out.String())
+	}
+}
+
+// TestRunHandler_CloudAuthErrorOnGenerate_PrintsSigninMessage checks that a 401
+// from /api/generate makes RunHandler print sign-in guidance and return nil.
+func TestRunHandler_CloudAuthErrorOnGenerate_PrintsSigninMessage(t *testing.T) {
+	mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.URL.Path == "/api/show" && r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.ShowResponse{
+				Capabilities: []model.Capability{model.CapabilityCompletion},
+			}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		case r.URL.Path == "/api/generate" && r.Method == http.MethodPost:
+			w.WriteHeader(http.StatusUnauthorized)
+			if err := json.NewEncoder(w).Encode(map[string]string{
+				"error":      "unauthorized",
+				"signin_url": "https://ollama.com/signin",
+			}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+
+	t.Setenv("OLLAMA_HOST", mockServer.URL)
+	t.Cleanup(mockServer.Close)
+
+	cmd := &cobra.Command{}
+	cmd.SetContext(t.Context())
+	cmd.Flags().String("keepalive", "", "")
+	cmd.Flags().Bool("truncate", false, "")
+	cmd.Flags().Int("dimensions", 0, "")
+	cmd.Flags().Bool("verbose", false, "")
+	cmd.Flags().Bool("insecure", false, "")
+	cmd.Flags().Bool("nowordwrap", false, "")
+	cmd.Flags().String("format", "", "")
+	cmd.Flags().String("think", "", "")
+	cmd.Flags().Bool("hidethinking", false, "")
+
+	oldStdout := os.Stdout
+	readOut, writeOut, err := os.Pipe()
+	if err != nil {
+		t.Fatalf("failed to create stdout pipe: %v", err)
+	}
+	os.Stdout = writeOut // capture what the handler prints to stdout
+	t.Cleanup(func() { os.Stdout = oldStdout })
+	t.Cleanup(func() { _ = readOut.Close() })
+
+	err = RunHandler(cmd, []string{"gpt-oss:20b:cloud", "hi"})
+	_ = writeOut.Close() // lets the io.Copy below reach EOF
+	var out bytes.Buffer
+	_, _ = io.Copy(&out, readOut)
+	if err != nil {
+		t.Fatalf("RunHandler returned error: %v", err)
+	}
+	if !strings.Contains(out.String(), "You need to be signed in to Ollama to run Cloud models.") {
+		t.Fatalf("expected sign-in guidance message, got %q", out.String())
+	}
+	if !strings.Contains(out.String(), "https://ollama.com/signin") {
+		t.Fatalf("expected signin_url in output, got %q", out.String())
+	}
+}
+
+// TestRunHandler_ExplicitCloudStubMissing_PullsNormalizedNameTEMP serves an
+// empty /api/tags list and expects RunHandler to pull "gpt-oss:20b-cloud" for
+// the "gpt-oss:20b:cloud" argument and to still call /api/generate.
+func TestRunHandler_ExplicitCloudStubMissing_PullsNormalizedNameTEMP(t *testing.T) {
+	var pulledModel string
+	var generateCalled bool
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Route on "METHOD path" so each endpoint is a single case label.
+		switch r.Method + " " + r.URL.Path {
+		case http.MethodPost + " /api/show":
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.ShowResponse{
+				Capabilities: []model.Capability{model.CapabilityCompletion},
+				RemoteModel:  "gpt-oss:20b",
+			}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		case http.MethodGet + " /api/tags":
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.ListResponse{Models: nil}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		case http.MethodPost + " /api/pull":
+			var req api.PullRequest
+			if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+				http.Error(w, err.Error(), http.StatusBadRequest)
+				return
+			}
+			pulledModel = req.Model
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.ProgressResponse{Status: "success"}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		case http.MethodPost + " /api/generate":
+			generateCalled = true
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.GenerateResponse{Done: true}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+
+	t.Setenv("OLLAMA_HOST", srv.URL)
+	t.Cleanup(srv.Close)
+
+	cmd := &cobra.Command{}
+	cmd.SetContext(t.Context())
+	flags := cmd.Flags()
+	flags.String("keepalive", "", "")
+	flags.Bool("truncate", false, "")
+	flags.Int("dimensions", 0, "")
+	flags.Bool("verbose", false, "")
+	flags.Bool("insecure", false, "")
+	flags.Bool("nowordwrap", false, "")
+	flags.String("format", "", "")
+	flags.String("think", "", "")
+	flags.Bool("hidethinking", false, "")
+
+	if err := RunHandler(cmd, []string{"gpt-oss:20b:cloud", "hi"}); err != nil {
+		t.Fatalf("RunHandler returned error: %v", err)
+	}
+
+	if pulledModel != "gpt-oss:20b-cloud" {
+		t.Fatalf("expected normalized pull model %q, got %q", "gpt-oss:20b-cloud", pulledModel)
+	}
+
+	if !generateCalled {
+		t.Fatal("expected /api/generate to be called")
+	}
+}
+
+// TestRunHandler_ExplicitCloudStubPresent_SkipsPullTEMP lists
+// "gpt-oss:20b-cloud" from /api/tags and expects RunHandler to go to
+// /api/generate for "gpt-oss:20b:cloud" without ever calling /api/pull.
+func TestRunHandler_ExplicitCloudStubPresent_SkipsPullTEMP(t *testing.T) {
+	var pullCalled bool
+	var generateCalled bool
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Route on "METHOD path" so each endpoint is a single case label.
+		switch r.Method + " " + r.URL.Path {
+		case http.MethodPost + " /api/show":
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.ShowResponse{
+				Capabilities: []model.Capability{model.CapabilityCompletion},
+				RemoteModel:  "gpt-oss:20b",
+			}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		case http.MethodGet + " /api/tags":
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.ListResponse{
+				Models: []api.ListModelResponse{{Name: "gpt-oss:20b-cloud"}},
+			}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		case http.MethodPost + " /api/pull":
+			pullCalled = true
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.ProgressResponse{Status: "success"}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		case http.MethodPost + " /api/generate":
+			generateCalled = true
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.GenerateResponse{Done: true}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+
+	t.Setenv("OLLAMA_HOST", srv.URL)
+	t.Cleanup(srv.Close)
+
+	cmd := &cobra.Command{}
+	cmd.SetContext(t.Context())
+	flags := cmd.Flags()
+	flags.String("keepalive", "", "")
+	flags.Bool("truncate", false, "")
+	flags.Int("dimensions", 0, "")
+	flags.Bool("verbose", false, "")
+	flags.Bool("insecure", false, "")
+	flags.Bool("nowordwrap", false, "")
+	flags.String("format", "", "")
+	flags.String("think", "", "")
+	flags.Bool("hidethinking", false, "")
+
+	if err := RunHandler(cmd, []string{"gpt-oss:20b:cloud", "hi"}); err != nil {
+		t.Fatalf("RunHandler returned error: %v", err)
+	}
+
+	if pullCalled {
+		t.Fatal("expected /api/pull not to be called when cloud stub already exists")
+	}
+
+	if !generateCalled {
+		t.Fatal("expected /api/generate to be called")
+	}
+}
+
+// TestRunHandler_ExplicitCloudStubPullFailure_IsBestEffortTEMP answers
+// /api/pull with a 500 and expects RunHandler to return nil and still call
+// /api/generate for the "gpt-oss:20b:cloud" argument.
+func TestRunHandler_ExplicitCloudStubPullFailure_IsBestEffortTEMP(t *testing.T) {
+	var generateCalled bool
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Route on "METHOD path" so each endpoint is a single case label.
+		switch r.Method + " " + r.URL.Path {
+		case http.MethodPost + " /api/show":
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.ShowResponse{
+				Capabilities: []model.Capability{model.CapabilityCompletion},
+				RemoteModel:  "gpt-oss:20b",
+			}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		case http.MethodGet + " /api/tags":
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.ListResponse{Models: nil}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		case http.MethodPost + " /api/pull":
+			w.WriteHeader(http.StatusInternalServerError)
+			if err := json.NewEncoder(w).Encode(map[string]string{"error": "pull failed"}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		case http.MethodPost + " /api/generate":
+			generateCalled = true
+			w.WriteHeader(http.StatusOK)
+			if err := json.NewEncoder(w).Encode(api.GenerateResponse{Done: true}); err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+			}
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+
+	t.Setenv("OLLAMA_HOST", srv.URL)
+	t.Cleanup(srv.Close)
+
+	cmd := &cobra.Command{}
+	cmd.SetContext(t.Context())
+	flags := cmd.Flags()
+	flags.String("keepalive", "", "")
+	flags.Bool("truncate", false, "")
+	flags.Int("dimensions", 0, "")
+	flags.Bool("verbose", false, "")
+	flags.Bool("insecure", false, "")
+	flags.Bool("nowordwrap", false, "")
+	flags.String("format", "", "")
+	flags.String("think", "", "")
+	flags.Bool("hidethinking", false, "")
+
+	if err := RunHandler(cmd, []string{"gpt-oss:20b:cloud", "hi"}); err != nil {
+		t.Fatalf("RunHandler returned error: %v", err)
+	}
+
+	if !generateCalled {
+		t.Fatal("expected /api/generate to be called despite pull failure")
+	}
+}
+
 func TestGetModelfileName(t *testing.T) {
 	tests := []struct {
 		name          string
@@ -1212,6 +1553,20 @@ func TestNewCreateRequest(t *testing.T) {
 				Model: "newmodel",
 			},
 		},
+		{
+			"explicit cloud model preserves source when parent lacks it",
+			"newmodel",
+			runOptions{
+				Model:       "qwen3.5:cloud",
+				ParentModel: "qwen3.5",
+				Messages:    []api.Message{},
+				WordWrap:    true,
+			},
+			&api.CreateRequest{
+				From:  "qwen3.5:cloud",
+				Model: "newmodel",
+			},
+		},
 		{
 			"parent model as filepath test",
 			"newmodel",
@@ -1663,31 +2018,81 @@ func TestRunOptions_Copy_Independence(t *testing.T) {

 func TestLoadOrUnloadModel_CloudModelAuth(t *testing.T) {
 	tests := []struct {
-		name          string
-		remoteHost    string
-		whoamiStatus  int
-		whoamiResp    any
-		expectedError string
+		name            string
+		model           string
+		showStatus      int
+		remoteHost      string
+		remoteModel     string
+		whoamiStatus    int
+		whoamiResp      any
+		expectWhoami    bool
+		expectedError   string
+		expectAuthError bool
 	}{
 		{
 			name:         "ollama.com cloud model - user signed in",
+			model:        "test-cloud-model",
 			remoteHost:   "https://ollama.com",
+			remoteModel:  "test-model",
 			whoamiStatus: http.StatusOK,
 			whoamiResp:   api.UserResponse{Name: "testuser"},
+			expectWhoami: true,
 		},
 		{
 			name:         "ollama.com cloud model - user not signed in",
+			model:        "test-cloud-model",
 			remoteHost:   "https://ollama.com",
+			remoteModel:  "test-model",
 			whoamiStatus: http.StatusUnauthorized,
 			whoamiResp: map[string]string{
 				"error":      "unauthorized",
 				"signin_url": "https://ollama.com/signin",
 			},
-			expectedError: "unauthorized",
+			expectWhoami:    true,
+			expectedError:   "unauthorized",
+			expectAuthError: true,
 		},
 		{
 			name:         "non-ollama.com remote - no auth check",
+			model:        "test-cloud-model",
 			remoteHost:   "https://other-remote.com",
+			remoteModel:  "test-model",
+			whoamiStatus: http.StatusUnauthorized, // should not be called
+			whoamiResp:   nil,
+		},
+		{
+			name:         "explicit :cloud model - auth check without remote metadata",
+			model:        "kimi-k2.5:cloud",
+			remoteHost:   "",
+			remoteModel:  "",
+			whoamiStatus: http.StatusOK,
+			whoamiResp:   api.UserResponse{Name: "testuser"},
+			expectWhoami: true,
+		},
+		{
+			name:            "explicit :cloud model without local stub returns not found by default",
+			model:           "minimax-m2.7:cloud",
+			showStatus:      http.StatusNotFound,
+			whoamiStatus:    http.StatusOK,
+			whoamiResp:      api.UserResponse{Name: "testuser"},
+			expectedError:   "not found",
+			expectWhoami:    false,
+			expectAuthError: false,
+		},
+		{
+			name:         "explicit -cloud model - auth check without remote metadata",
+			model:        "kimi-k2.5:latest-cloud",
+			remoteHost:   "",
+			remoteModel:  "",
+			whoamiStatus: http.StatusOK,
+			whoamiResp:   api.UserResponse{Name: "testuser"},
+			expectWhoami: true,
+		},
+		{
+			name:         "dash cloud-like name without explicit source does not require auth",
+			model:        "test-cloud-model",
+			remoteHost:   "",
+			remoteModel:  "",
 			whoamiStatus: http.StatusUnauthorized, // should not be called
 			whoamiResp:   nil,
 		},
@@ -1699,10 +2104,15 @@ func TestLoadOrUnloadModel_CloudModelAuth(t *testing.T) {
 			mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 				switch r.URL.Path {
 				case "/api/show":
+					if tt.showStatus != 0 && tt.showStatus != http.StatusOK {
+						w.WriteHeader(tt.showStatus)
+						_ = json.NewEncoder(w).Encode(map[string]string{"error": "not found"})
+						return
+					}
 					w.Header().Set("Content-Type", "application/json")
 					if err := json.NewEncoder(w).Encode(api.ShowResponse{
 						RemoteHost:  tt.remoteHost,
-						RemoteModel: "test-model",
+						RemoteModel: tt.remoteModel,
 					}); err != nil {
 						http.Error(w, err.Error(), http.StatusInternalServerError)
 					}
@@ -1715,6 +2125,8 @@ func TestLoadOrUnloadModel_CloudModelAuth(t *testing.T) {
 							http.Error(w, err.Error(), http.StatusInternalServerError)
 						}
 					}
+				case "/api/generate":
+					w.WriteHeader(http.StatusOK)
 				default:
 					http.NotFound(w, r)
 				}
@@ -1727,29 +2139,28 @@ func TestLoadOrUnloadModel_CloudModelAuth(t *testing.T) {
 			cmd.SetContext(t.Context())

 			opts := &runOptions{
-				Model:       "test-cloud-model",
+				Model:       tt.model,
 				ShowConnect: false,
 			}

 			err := loadOrUnloadModel(cmd, opts)

-			if strings.HasPrefix(tt.remoteHost, "https://ollama.com") {
-				if !whoamiCalled {
-					t.Error("expected whoami to be called for ollama.com cloud model")
-				}
-			} else {
-				if whoamiCalled {
-					t.Error("whoami should not be called for non-ollama.com remote")
-				}
+			if whoamiCalled != tt.expectWhoami {
+				t.Errorf("whoami called = %v, want %v", whoamiCalled, tt.expectWhoami)
 			}

 			if tt.expectedError != "" {
 				if err == nil {
 					t.Errorf("expected error containing %q, got nil", tt.expectedError)
 				} else {
-					var authErr api.AuthorizationError
-					if !errors.As(err, &authErr) {
-						t.Errorf("expected AuthorizationError, got %T: %v", err, err)
+					if !tt.expectAuthError && !strings.Contains(strings.ToLower(err.Error()), strings.ToLower(tt.expectedError)) {
+						t.Errorf("expected error containing %q, got %v", tt.expectedError, err)
+					}
+					if tt.expectAuthError {
+						var authErr api.AuthorizationError
+						if !errors.As(err, &authErr) {
+							t.Errorf("expected AuthorizationError, got %T: %v", err, err)
+						}
 					}
 				}
 			} else {
@@ -1760,3 +2171,38 @@ func TestLoadOrUnloadModel_CloudModelAuth(t *testing.T) {
 		})
 	}
 }
+
+// TestIsLocalhost checks isLocalhost for loopback and remote OLLAMA_HOST values.
+func TestIsLocalhost(t *testing.T) {
+	cases := []struct {
+		name string
+		host string
+		want bool
+	}{
+		{"default empty", "", true},
+		{"localhost no port", "localhost", true},
+		{"localhost with port", "localhost:11435", true},
+		{"127.0.0.1 no port", "127.0.0.1", true},
+		{"127.0.0.1 with port", "127.0.0.1:11434", true},
+		{"0.0.0.0 no port", "0.0.0.0", true},
+		{"0.0.0.0 with port", "0.0.0.0:11434", true},
+		{"::1 no port", "::1", true},
+		{"[::1] with port", "[::1]:11434", true},
+		{"loopback with scheme", "http://localhost:11434", true},
+		{"remote hostname", "example.com", false},
+		{"remote hostname with port", "example.com:11434", false},
+		{"remote IP", "192.168.1.1", false},
+		{"remote IP with port", "192.168.1.1:11434", false},
+		{"remote with scheme", "http://example.com:11434", false},
+		{"https remote", "https://example.com:443", false},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Setenv("OLLAMA_HOST", tc.host)
+			if got := isLocalhost(); got != tc.want {
+				t.Errorf("isLocalhost() with OLLAMA_HOST=%q = %v, want %v", tc.host, got, tc.want)
+			}
+		})
+	}
+}
--- a/cmd/config/claude.go
+++ b/cmd/config/claude.go
@@ -1,192 +0,0 @@
-package config
-
-import (
-	"context"
-	"fmt"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"runtime"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
-)
-
-// Claude implements Runner and AliasConfigurer for Claude Code integration
-type Claude struct{}
-
-// Compile-time check that Claude implements AliasConfigurer
-var _ AliasConfigurer = (*Claude)(nil)
-
-func (c *Claude) String() string { return "Claude Code" }
-
-func (c *Claude) args(model string, extra []string) []string {
-	var args []string
-	if model != "" {
-		args = append(args, "--model", model)
-	}
-	args = append(args, extra...)
-	return args
-}
-
-func (c *Claude) findPath() (string, error) {
-	if p, err := exec.LookPath("claude"); err == nil {
-		return p, nil
-	}
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return "", err
-	}
-	name := "claude"
-	if runtime.GOOS == "windows" {
-		name = "claude.exe"
-	}
-	fallback := filepath.Join(home, ".claude", "local", name)
-	if _, err := os.Stat(fallback); err != nil {
-		return "", err
-	}
-	return fallback, nil
-}
-
-func (c *Claude) Run(model string, args []string) error {
-	claudePath, err := c.findPath()
-	if err != nil {
-		return fmt.Errorf("claude is not installed, install from https://code.claude.com/docs/en/quickstart")
-	}
-
-	cmd := exec.Command(claudePath, c.args(model, args)...)
-	cmd.Stdin = os.Stdin
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-
-	env := append(os.Environ(),
-		"ANTHROPIC_BASE_URL="+envconfig.Host().String(),
-		"ANTHROPIC_API_KEY=",
-		"ANTHROPIC_AUTH_TOKEN=ollama",
-	)
-
-	env = append(env, c.modelEnvVars(model)...)
-
-	cmd.Env = env
-	return cmd.Run()
-}
-
-// modelEnvVars returns Claude Code env vars that route all model tiers through Ollama.
-func (c *Claude) modelEnvVars(model string) []string {
-	primary := model
-	fast := model
-	if cfg, err := loadIntegration("claude"); err == nil && cfg.Aliases != nil {
-		if p := cfg.Aliases["primary"]; p != "" {
-			primary = p
-		}
-		if f := cfg.Aliases["fast"]; f != "" {
-			fast = f
-		}
-	}
-	return []string{
-		"ANTHROPIC_DEFAULT_OPUS_MODEL=" + primary,
-		"ANTHROPIC_DEFAULT_SONNET_MODEL=" + primary,
-		"ANTHROPIC_DEFAULT_HAIKU_MODEL=" + fast,
-		"CLAUDE_CODE_SUBAGENT_MODEL=" + primary,
-	}
-}
-
-// ConfigureAliases sets up model aliases for Claude Code.
-// model: the model to use (if empty, user will be prompted to select)
-// aliases: existing alias configuration to preserve/update
-// Cloud-only: subagent routing (fast model) is gated to cloud models only until
-// there is a better strategy for prompt caching on local models.
-func (c *Claude) ConfigureAliases(ctx context.Context, model string, existingAliases map[string]string, force bool) (map[string]string, bool, error) {
-	aliases := make(map[string]string)
-	for k, v := range existingAliases {
-		aliases[k] = v
-	}
-
-	if model != "" {
-		aliases["primary"] = model
-	}
-
-	if !force && aliases["primary"] != "" {
-		client, _ := api.ClientFromEnvironment()
-		if isCloudModel(ctx, client, aliases["primary"]) {
-			if isCloudModel(ctx, client, aliases["fast"]) {
-				return aliases, false, nil
-			}
-		} else {
-			delete(aliases, "fast")
-			return aliases, false, nil
-		}
-	}
-
-	items, existingModels, cloudModels, client, err := listModels(ctx)
-	if err != nil {
-		return nil, false, err
-	}
-
-	fmt.Fprintf(os.Stderr, "\n%sModel Configuration%s\n\n", ansiBold, ansiReset)
-
-	if aliases["primary"] == "" || force {
-		primary, err := DefaultSingleSelector("Select model:", items, aliases["primary"])
-		if err != nil {
-			return nil, false, err
-		}
-		if err := pullIfNeeded(ctx, client, existingModels, primary); err != nil {
-			return nil, false, err
-		}
-		if err := ensureAuth(ctx, client, cloudModels, []string{primary}); err != nil {
-			return nil, false, err
-		}
-		aliases["primary"] = primary
-	}
-
-	if isCloudModel(ctx, client, aliases["primary"]) {
-		if aliases["fast"] == "" || !isCloudModel(ctx, client, aliases["fast"]) {
-			aliases["fast"] = aliases["primary"]
-		}
-	} else {
-		delete(aliases, "fast")
-	}
-
-	return aliases, true, nil
-}
-
-// SetAliases syncs the configured aliases to the Ollama server using prefix matching.
-// Cloud-only: for local models (fast is empty), we delete any existing aliases to
-// prevent stale routing to a previous cloud model.
-func (c *Claude) SetAliases(ctx context.Context, aliases map[string]string) error {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return err
-	}
-
-	prefixes := []string{"claude-sonnet-", "claude-haiku-"}
-
-	if aliases["fast"] == "" {
-		for _, prefix := range prefixes {
-			_ = client.DeleteAliasExperimental(ctx, &api.AliasDeleteRequest{Alias: prefix})
-		}
-		return nil
-	}
-
-	prefixAliases := map[string]string{
-		"claude-sonnet-": aliases["primary"],
-		"claude-haiku-":  aliases["fast"],
-	}
-
-	var errs []string
-	for prefix, target := range prefixAliases {
-		req := &api.AliasRequest{
-			Alias:          prefix,
-			Target:         target,
-			PrefixMatching: true,
-		}
-		if err := client.SetAliasExperimental(ctx, req); err != nil {
-			errs = append(errs, prefix)
-		}
-	}
-
-	if len(errs) > 0 {
-		return fmt.Errorf("failed to set aliases: %v", errs)
-	}
-	return nil
-}
--- a/cmd/config/config.go
+++ b/cmd/config/config.go
@@ -3,7 +3,6 @@
 package config

 import (
-	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -11,7 +10,7 @@ import (
 	"path/filepath"
 	"strings"

-	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/cmd/internal/fileutil"
 )

 type integration struct {
@@ -20,6 +19,9 @@ type integration struct {
 	Onboarded bool              `json:"onboarded,omitempty"`
 }

+// IntegrationConfig is the persisted config for one integration.
+type IntegrationConfig = integration
+
 type config struct {
 	Integrations  map[string]*integration `json:"integrations"`
 	LastModel     string                  `json:"last_model,omitempty"`
@@ -124,7 +126,7 @@ func save(cfg *config) error {
 		return err
 	}

-	return writeWithBackup(path, data)
+	return fileutil.WriteWithBackup(path, data)
 }

 func SaveIntegration(appName string, models []string) error {
@@ -155,8 +157,8 @@ func SaveIntegration(appName string, models []string) error {
 	return save(cfg)
 }

-// integrationOnboarded marks an integration as onboarded in ollama's config.
-func integrationOnboarded(appName string) error {
+// MarkIntegrationOnboarded marks an integration as onboarded in Ollama's config.
+func MarkIntegrationOnboarded(appName string) error {
 	cfg, err := load()
 	if err != nil {
 		return err
@@ -174,7 +176,7 @@ func integrationOnboarded(appName string) error {

 // IntegrationModel returns the first configured model for an integration, or empty string if not configured.
 func IntegrationModel(appName string) string {
-	integrationConfig, err := loadIntegration(appName)
+	integrationConfig, err := LoadIntegration(appName)
 	if err != nil || len(integrationConfig.Models) == 0 {
 		return ""
 	}
@@ -183,7 +185,7 @@ func IntegrationModel(appName string) string {

 // IntegrationModels returns all configured models for an integration, or nil.
 func IntegrationModels(appName string) []string {
-	integrationConfig, err := loadIntegration(appName)
+	integrationConfig, err := LoadIntegration(appName)
 	if err != nil || len(integrationConfig.Models) == 0 {
 		return nil
 	}
@@ -228,28 +230,8 @@ func SetLastSelection(selection string) error {
 	return save(cfg)
 }

-// ModelExists checks if a model exists on the Ollama server.
-func ModelExists(ctx context.Context, name string) bool {
-	if name == "" {
-		return false
-	}
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return false
-	}
-	models, err := client.List(ctx)
-	if err != nil {
-		return false
-	}
-	for _, m := range models.Models {
-		if m.Name == name || strings.HasPrefix(m.Name, name+":") {
-			return true
-		}
-	}
-	return false
-}
-
-func loadIntegration(appName string) (*integration, error) {
+// LoadIntegration returns the saved config for one integration.
+func LoadIntegration(appName string) (*integration, error) {
 	cfg, err := load()
 	if err != nil {
 		return nil, err
@@ -263,7 +245,8 @@ func loadIntegration(appName string) (*integration, error) {
 	return integrationConfig, nil
 }

-func saveAliases(appName string, aliases map[string]string) error {
+// SaveAliases replaces the saved aliases for one integration.
+func SaveAliases(appName string, aliases map[string]string) error {
 	if appName == "" {
 		return errors.New("app name cannot be empty")
 	}
--- a/cmd/config/config_cloud_test.go
+++ b/cmd/config/config_cloud_test.go
@@ -1,7 +1,6 @@
 package config

 import (
-	"context"
 	"errors"
 	"os"
 	"path/filepath"
@@ -45,12 +44,12 @@ func TestSaveAliases_ReplacesNotMerges(t *testing.T) {
 		"primary": "cloud-model",
 		"fast":    "cloud-model",
 	}
-	if err := saveAliases("claude", initial); err != nil {
+	if err := SaveAliases("claude", initial); err != nil {
 		t.Fatalf("failed to save initial aliases: %v", err)
 	}

 	// Verify both are saved
-	loaded, err := loadIntegration("claude")
+	loaded, err := LoadIntegration("claude")
 	if err != nil {
 		t.Fatalf("failed to load: %v", err)
 	}
@@ -63,12 +62,12 @@ func TestSaveAliases_ReplacesNotMerges(t *testing.T) {
 		"primary": "local-model",
 		// fast intentionally missing
 	}
-	if err := saveAliases("claude", updated); err != nil {
+	if err := SaveAliases("claude", updated); err != nil {
 		t.Fatalf("failed to save updated aliases: %v", err)
 	}

 	// Verify fast is GONE (not merged/preserved)
-	loaded, err = loadIntegration("claude")
+	loaded, err = LoadIntegration("claude")
 	if err != nil {
 		t.Fatalf("failed to load after update: %v", err)
 	}
@@ -91,12 +90,12 @@ func TestSaveAliases_PreservesModels(t *testing.T) {

 	// Then update aliases
 	aliases := map[string]string{"primary": "new-model"}
-	if err := saveAliases("claude", aliases); err != nil {
+	if err := SaveAliases("claude", aliases); err != nil {
 		t.Fatalf("failed to save aliases: %v", err)
 	}

 	// Verify models are preserved
-	loaded, err := loadIntegration("claude")
+	loaded, err := LoadIntegration("claude")
 	if err != nil {
 		t.Fatalf("failed to load: %v", err)
 	}
@@ -111,16 +110,16 @@ func TestSaveAliases_EmptyMap(t *testing.T) {
 	setTestHome(t, tmpDir)

 	// Save with aliases
-	if err := saveAliases("claude", map[string]string{"primary": "model", "fast": "model"}); err != nil {
+	if err := SaveAliases("claude", map[string]string{"primary": "model", "fast": "model"}); err != nil {
 		t.Fatalf("failed to save: %v", err)
 	}

 	// Save empty map
-	if err := saveAliases("claude", map[string]string{}); err != nil {
+	if err := SaveAliases("claude", map[string]string{}); err != nil {
 		t.Fatalf("failed to save empty: %v", err)
 	}

-	loaded, err := loadIntegration("claude")
+	loaded, err := LoadIntegration("claude")
 	if err != nil {
 		t.Fatalf("failed to load: %v", err)
 	}
@@ -135,16 +134,16 @@ func TestSaveAliases_NilMap(t *testing.T) {
 	setTestHome(t, tmpDir)

 	// Save with aliases first
-	if err := saveAliases("claude", map[string]string{"primary": "model"}); err != nil {
+	if err := SaveAliases("claude", map[string]string{"primary": "model"}); err != nil {
 		t.Fatalf("failed to save: %v", err)
 	}

 	// Save nil map - should clear aliases
-	if err := saveAliases("claude", nil); err != nil {
+	if err := SaveAliases("claude", nil); err != nil {
 		t.Fatalf("failed to save nil: %v", err)
 	}

-	loaded, err := loadIntegration("claude")
+	loaded, err := LoadIntegration("claude")
 	if err != nil {
 		t.Fatalf("failed to load: %v", err)
 	}
@@ -155,7 +154,7 @@ func TestSaveAliases_NilMap(t *testing.T) {

 // TestSaveAliases_EmptyAppName returns error
 func TestSaveAliases_EmptyAppName(t *testing.T) {
-	err := saveAliases("", map[string]string{"primary": "model"})
+	err := SaveAliases("", map[string]string{"primary": "model"})
 	if err == nil {
 		t.Error("expected error for empty app name")
 	}
@@ -165,12 +164,12 @@ func TestSaveAliases_CaseInsensitive(t *testing.T) {
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)

-	if err := saveAliases("Claude", map[string]string{"primary": "model1"}); err != nil {
+	if err := SaveAliases("Claude", map[string]string{"primary": "model1"}); err != nil {
 		t.Fatalf("failed to save: %v", err)
 	}

 	// Load with different case
-	loaded, err := loadIntegration("claude")
+	loaded, err := LoadIntegration("claude")
 	if err != nil {
 		t.Fatalf("failed to load: %v", err)
 	}
@@ -179,11 +178,11 @@ func TestSaveAliases_CaseInsensitive(t *testing.T) {
 	}

 	// Update with different case
-	if err := saveAliases("CLAUDE", map[string]string{"primary": "model2"}); err != nil {
+	if err := SaveAliases("CLAUDE", map[string]string{"primary": "model2"}); err != nil {
 		t.Fatalf("failed to update: %v", err)
 	}

-	loaded, err = loadIntegration("claude")
+	loaded, err = LoadIntegration("claude")
 	if err != nil {
 		t.Fatalf("failed to load after update: %v", err)
 	}
@@ -198,11 +197,11 @@ func TestSaveAliases_CreatesIntegration(t *testing.T) {
 	setTestHome(t, tmpDir)

 	// Save aliases for non-existent integration
-	if err := saveAliases("newintegration", map[string]string{"primary": "model"}); err != nil {
+	if err := SaveAliases("newintegration", map[string]string{"primary": "model"}); err != nil {
 		t.Fatalf("failed to save: %v", err)
 	}

-	loaded, err := loadIntegration("newintegration")
+	loaded, err := LoadIntegration("newintegration")
 	if err != nil {
 		t.Fatalf("failed to load: %v", err)
 	}
@@ -371,12 +370,12 @@ func TestAtomicUpdate_ServerSucceedsConfigSaved(t *testing.T) {
 		t.Fatal("server should succeed")
 	}

-	if err := saveAliases("claude", map[string]string{"primary": "model"}); err != nil {
+	if err := SaveAliases("claude", map[string]string{"primary": "model"}); err != nil {
 		t.Fatalf("saveAliases failed: %v", err)
 	}

 	// Verify it was actually saved
-	loaded, err := loadIntegration("claude")
+	loaded, err := LoadIntegration("claude")
 	if err != nil {
 		t.Fatalf("failed to load: %v", err)
 	}
@@ -408,7 +407,7 @@ func TestConfigFile_PreservesUnknownFields(t *testing.T) {
 	os.WriteFile(configPath, []byte(initialConfig), 0o644)

 	// Update aliases
-	if err := saveAliases("claude", map[string]string{"primary": "model2"}); err != nil {
+	if err := SaveAliases("claude", map[string]string{"primary": "model2"}); err != nil {
 		t.Fatalf("failed to save: %v", err)
 	}

@@ -440,11 +439,6 @@ func containsHelper(s, substr string) bool {
 	return false
 }

-func TestClaudeImplementsAliasConfigurer(t *testing.T) {
-	c := &Claude{}
-	var _ AliasConfigurer = c // Compile-time check
-}
-
 func TestModelNameEdgeCases(t *testing.T) {
 	testCases := []struct {
 		name  string
@@ -464,11 +458,11 @@ func TestModelNameEdgeCases(t *testing.T) {
 			setTestHome(t, tmpDir)

 			aliases := map[string]string{"primary": tc.model}
-			if err := saveAliases("claude", aliases); err != nil {
+			if err := SaveAliases("claude", aliases); err != nil {
 				t.Fatalf("failed to save model %q: %v", tc.model, err)
 			}

-			loaded, err := loadIntegration("claude")
+			loaded, err := LoadIntegration("claude")
 			if err != nil {
 				t.Fatalf("failed to load: %v", err)
 			}
@@ -485,7 +479,7 @@ func TestSwitchingScenarios(t *testing.T) {
 		setTestHome(t, tmpDir)

 		// Initial cloud config
-		if err := saveAliases("claude", map[string]string{
+		if err := SaveAliases("claude", map[string]string{
 			"primary": "cloud-model",
 			"fast":    "cloud-model",
 		}); err != nil {
@@ -493,13 +487,13 @@ func TestSwitchingScenarios(t *testing.T) {
 		}

 		// Switch to local (no fast)
-		if err := saveAliases("claude", map[string]string{
+		if err := SaveAliases("claude", map[string]string{
 			"primary": "local-model",
 		}); err != nil {
 			t.Fatal(err)
 		}

-		loaded, _ := loadIntegration("claude")
+		loaded, _ := LoadIntegration("claude")
 		if loaded.Aliases["fast"] != "" {
 			t.Errorf("fast should be removed, got %q", loaded.Aliases["fast"])
 		}
@@ -513,21 +507,21 @@ func TestSwitchingScenarios(t *testing.T) {
 		setTestHome(t, tmpDir)

 		// Initial local config
-		if err := saveAliases("claude", map[string]string{
+		if err := SaveAliases("claude", map[string]string{
 			"primary": "local-model",
 		}); err != nil {
 			t.Fatal(err)
 		}

 		// Switch to cloud (with fast)
-		if err := saveAliases("claude", map[string]string{
+		if err := SaveAliases("claude", map[string]string{
 			"primary": "cloud-model",
 			"fast":    "cloud-model",
 		}); err != nil {
 			t.Fatal(err)
 		}

-		loaded, _ := loadIntegration("claude")
+		loaded, _ := LoadIntegration("claude")
 		if loaded.Aliases["fast"] != "cloud-model" {
 			t.Errorf("fast should be cloud-model, got %q", loaded.Aliases["fast"])
 		}
@@ -538,7 +532,7 @@ func TestSwitchingScenarios(t *testing.T) {
 		setTestHome(t, tmpDir)

 		// Initial cloud config
-		if err := saveAliases("claude", map[string]string{
+		if err := SaveAliases("claude", map[string]string{
 			"primary": "cloud-model-1",
 			"fast":    "cloud-model-1",
 		}); err != nil {
@@ -546,14 +540,14 @@ func TestSwitchingScenarios(t *testing.T) {
 		}

 		// Switch to different cloud
-		if err := saveAliases("claude", map[string]string{
+		if err := SaveAliases("claude", map[string]string{
 			"primary": "cloud-model-2",
 			"fast":    "cloud-model-2",
 		}); err != nil {
 			t.Fatal(err)
 		}

-		loaded, _ := loadIntegration("claude")
+		loaded, _ := LoadIntegration("claude")
 		if loaded.Aliases["primary"] != "cloud-model-2" {
 			t.Errorf("primary should be cloud-model-2, got %q", loaded.Aliases["primary"])
 		}
@@ -563,43 +557,13 @@ func TestSwitchingScenarios(t *testing.T) {
 	})
 }

-func TestToolCapabilityFiltering(t *testing.T) {
-	t.Run("all models checked for tool capability", func(t *testing.T) {
-		// Both cloud and local models are checked for tool capability via Show API
-		// Only models with "tools" in capabilities are included
-		m := modelInfo{Name: "tool-model", Remote: false, ToolCapable: true}
-		if !m.ToolCapable {
-			t.Error("tool capable model should be marked as such")
-		}
-	})
-
-	t.Run("modelInfo includes ToolCapable field", func(t *testing.T) {
-		m := modelInfo{Name: "test", Remote: true, ToolCapable: true}
-		if !m.ToolCapable {
-			t.Error("ToolCapable field should be accessible")
-		}
-	})
-}
-
-func TestIsCloudModel_RequiresClient(t *testing.T) {
-	t.Run("nil client always returns false", func(t *testing.T) {
-		// isCloudModel now only uses Show API, no suffix detection
-		if isCloudModel(context.Background(), nil, "model:cloud") {
-			t.Error("nil client should return false regardless of suffix")
-		}
-		if isCloudModel(context.Background(), nil, "local-model") {
-			t.Error("nil client should return false")
-		}
-	})
-}
-
 func TestModelsAndAliasesMustStayInSync(t *testing.T) {
 	t.Run("saveAliases followed by saveIntegration keeps them in sync", func(t *testing.T) {
 		tmpDir := t.TempDir()
 		setTestHome(t, tmpDir)

 		// Save aliases with one model
-		if err := saveAliases("claude", map[string]string{"primary": "model-a"}); err != nil {
+		if err := SaveAliases("claude", map[string]string{"primary": "model-a"}); err != nil {
 			t.Fatal(err)
 		}

@@ -608,7 +572,7 @@ func TestModelsAndAliasesMustStayInSync(t *testing.T) {
 			t.Fatal(err)
 		}

-		loaded, _ := loadIntegration("claude")
+		loaded, _ := LoadIntegration("claude")
 		if loaded.Aliases["primary"] != loaded.Models[0] {
 			t.Errorf("aliases.primary (%q) != models[0] (%q)", loaded.Aliases["primary"], loaded.Models[0])
 		}
@@ -622,11 +586,11 @@ func TestModelsAndAliasesMustStayInSync(t *testing.T) {
 		if err := SaveIntegration("claude", []string{"old-model"}); err != nil {
 			t.Fatal(err)
 		}
-		if err := saveAliases("claude", map[string]string{"primary": "new-model"}); err != nil {
+		if err := SaveAliases("claude", map[string]string{"primary": "new-model"}); err != nil {
 			t.Fatal(err)
 		}

-		loaded, _ := loadIntegration("claude")
+		loaded, _ := LoadIntegration("claude")

 		// They should be different (this is the bug state)
 		if loaded.Models[0] == loaded.Aliases["primary"] {
@@ -638,7 +602,7 @@ func TestModelsAndAliasesMustStayInSync(t *testing.T) {
 			t.Fatal(err)
 		}

-		loaded, _ = loadIntegration("claude")
+		loaded, _ = LoadIntegration("claude")
 		if loaded.Models[0] != loaded.Aliases["primary"] {
 			t.Errorf("after fix: models[0] (%q) should equal aliases.primary (%q)",
 				loaded.Models[0], loaded.Aliases["primary"])
@@ -653,20 +617,20 @@ func TestModelsAndAliasesMustStayInSync(t *testing.T) {
 		if err := SaveIntegration("claude", []string{"initial-model"}); err != nil {
 			t.Fatal(err)
 		}
-		if err := saveAliases("claude", map[string]string{"primary": "initial-model"}); err != nil {
+		if err := SaveAliases("claude", map[string]string{"primary": "initial-model"}); err != nil {
 			t.Fatal(err)
 		}

 		// Update aliases AND models together
 		newAliases := map[string]string{"primary": "updated-model"}
-		if err := saveAliases("claude", newAliases); err != nil {
+		if err := SaveAliases("claude", newAliases); err != nil {
 			t.Fatal(err)
 		}
 		if err := SaveIntegration("claude", []string{newAliases["primary"]}); err != nil {
 			t.Fatal(err)
 		}

-		loaded, _ := loadIntegration("claude")
+		loaded, _ := LoadIntegration("claude")
 		if loaded.Models[0] != "updated-model" {
 			t.Errorf("models[0] should be updated-model, got %q", loaded.Models[0])
 		}
--- a/cmd/config/config_test.go
+++ b/cmd/config/config_test.go
@@ -10,17 +10,10 @@ import (
 // setTestHome sets both HOME (Unix) and USERPROFILE (Windows) for cross-platform tests
 func setTestHome(t *testing.T, dir string) {
 	t.Setenv("HOME", dir)
+	t.Setenv("TMPDIR", dir)
 	t.Setenv("USERPROFILE", dir)
 }

-// editorPaths is a test helper that safely calls Paths if the runner implements Editor
-func editorPaths(r Runner) []string {
-	if editor, ok := r.(Editor); ok {
-		return editor.Paths()
-	}
-	return nil
-}
-
 func TestIntegrationConfig(t *testing.T) {
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)
@@ -31,7 +24,7 @@ func TestIntegrationConfig(t *testing.T) {
 			t.Fatal(err)
 		}

-		config, err := loadIntegration("claude")
+		config, err := LoadIntegration("claude")
 		if err != nil {
 			t.Fatal(err)
 		}
@@ -55,11 +48,11 @@ func TestIntegrationConfig(t *testing.T) {
 			"primary": "llama3.2:70b",
 			"fast":    "llama3.2:8b",
 		}
-		if err := saveAliases("claude", aliases); err != nil {
+		if err := SaveAliases("claude", aliases); err != nil {
 			t.Fatal(err)
 		}

-		config, err := loadIntegration("claude")
+		config, err := LoadIntegration("claude")
 		if err != nil {
 			t.Fatal(err)
 		}
@@ -77,14 +70,14 @@ func TestIntegrationConfig(t *testing.T) {
 		if err := SaveIntegration("claude", []string{"model-a"}); err != nil {
 			t.Fatal(err)
 		}
-		if err := saveAliases("claude", map[string]string{"primary": "model-a", "fast": "model-small"}); err != nil {
+		if err := SaveAliases("claude", map[string]string{"primary": "model-a", "fast": "model-small"}); err != nil {
 			t.Fatal(err)
 		}

 		if err := SaveIntegration("claude", []string{"model-b"}); err != nil {
 			t.Fatal(err)
 		}
-		config, err := loadIntegration("claude")
+		config, err := LoadIntegration("claude")
 		if err != nil {
 			t.Fatal(err)
 		}
@@ -96,7 +89,7 @@ func TestIntegrationConfig(t *testing.T) {
 	t.Run("defaultModel returns first model", func(t *testing.T) {
 		SaveIntegration("codex", []string{"model-a", "model-b"})

-		config, _ := loadIntegration("codex")
+		config, _ := LoadIntegration("codex")
 		defaultModel := ""
 		if len(config.Models) > 0 {
 			defaultModel = config.Models[0]
@@ -120,7 +113,7 @@ func TestIntegrationConfig(t *testing.T) {
 	t.Run("app name is case-insensitive", func(t *testing.T) {
 		SaveIntegration("Claude", []string{"model-x"})

-		config, err := loadIntegration("claude")
+		config, err := LoadIntegration("claude")
 		if err != nil {
 			t.Fatal(err)
 		}
@@ -137,8 +130,8 @@ func TestIntegrationConfig(t *testing.T) {
 		SaveIntegration("app1", []string{"model-1"})
 		SaveIntegration("app2", []string{"model-2"})

-		config1, _ := loadIntegration("app1")
-		config2, _ := loadIntegration("app2")
+		config1, _ := LoadIntegration("app1")
+		config2, _ := LoadIntegration("app2")

 		defaultModel1 := ""
 		if len(config1.Models) > 0 {
@@ -185,64 +178,6 @@ func TestListIntegrations(t *testing.T) {
 	})
 }

-func TestEditorPaths(t *testing.T) {
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	t.Run("returns empty for claude (no Editor)", func(t *testing.T) {
-		r := integrations["claude"]
-		paths := editorPaths(r)
-		if len(paths) != 0 {
-			t.Errorf("expected no paths for claude, got %v", paths)
-		}
-	})
-
-	t.Run("returns empty for codex (no Editor)", func(t *testing.T) {
-		r := integrations["codex"]
-		paths := editorPaths(r)
-		if len(paths) != 0 {
-			t.Errorf("expected no paths for codex, got %v", paths)
-		}
-	})
-
-	t.Run("returns empty for droid when no config exists", func(t *testing.T) {
-		r := integrations["droid"]
-		paths := editorPaths(r)
-		if len(paths) != 0 {
-			t.Errorf("expected no paths, got %v", paths)
-		}
-	})
-
-	t.Run("returns path for droid when config exists", func(t *testing.T) {
-		settingsDir, _ := os.UserHomeDir()
-		settingsDir = filepath.Join(settingsDir, ".factory")
-		os.MkdirAll(settingsDir, 0o755)
-		os.WriteFile(filepath.Join(settingsDir, "settings.json"), []byte(`{}`), 0o644)
-
-		r := integrations["droid"]
-		paths := editorPaths(r)
-		if len(paths) != 1 {
-			t.Errorf("expected 1 path, got %d", len(paths))
-		}
-	})
-
-	t.Run("returns paths for opencode when configs exist", func(t *testing.T) {
-		home, _ := os.UserHomeDir()
-		configDir := filepath.Join(home, ".config", "opencode")
-		stateDir := filepath.Join(home, ".local", "state", "opencode")
-		os.MkdirAll(configDir, 0o755)
-		os.MkdirAll(stateDir, 0o755)
-		os.WriteFile(filepath.Join(configDir, "opencode.json"), []byte(`{}`), 0o644)
-		os.WriteFile(filepath.Join(stateDir, "model.json"), []byte(`{}`), 0o644)
-
-		r := integrations["opencode"]
-		paths := editorPaths(r)
-		if len(paths) != 2 {
-			t.Errorf("expected 2 paths, got %d: %v", len(paths), paths)
-		}
-	})
-}
-
 func TestLoadIntegration_CorruptedJSON(t *testing.T) {
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)
@@ -251,7 +186,7 @@ func TestLoadIntegration_CorruptedJSON(t *testing.T) {
 	os.MkdirAll(dir, 0o755)
 	os.WriteFile(filepath.Join(dir, "config.json"), []byte(`{corrupted json`), 0o644)

-	_, err := loadIntegration("test")
+	_, err := LoadIntegration("test")
 	if err == nil {
 		t.Error("expected error for nonexistent integration in corrupted file")
 	}
@@ -265,7 +200,7 @@ func TestSaveIntegration_NilModels(t *testing.T) {
 		t.Fatalf("saveIntegration with nil models failed: %v", err)
 	}

-	config, err := loadIntegration("test")
+	config, err := LoadIntegration("test")
 	if err != nil {
 		t.Fatalf("loadIntegration failed: %v", err)
 	}
@@ -294,7 +229,7 @@ func TestLoadIntegration_NonexistentIntegration(t *testing.T) {
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)

-	_, err := loadIntegration("nonexistent")
+	_, err := LoadIntegration("nonexistent")
 	if err == nil {
 		t.Error("expected error for nonexistent integration, got nil")
 	}
--- a/cmd/config/integrations.go
+++ b/cmd/config/integrations.go
--- a/cmd/config/selector.go
+++ b/cmd/config/selector.go
@@ -1,59 +0,0 @@
-package config
-
-import (
-	"errors"
-	"fmt"
-	"os"
-
-	"golang.org/x/term"
-)
-
-// ANSI escape sequences for terminal formatting.
-const (
-	ansiBold   = "\033[1m"
-	ansiReset  = "\033[0m"
-	ansiGray   = "\033[37m"
-	ansiGreen  = "\033[32m"
-	ansiYellow = "\033[33m"
-)
-
-// ErrCancelled is returned when the user cancels a selection.
-var ErrCancelled = errors.New("cancelled")
-
-// errCancelled is kept as an alias for backward compatibility within the package.
-var errCancelled = ErrCancelled
-
-// DefaultConfirmPrompt provides a TUI-based confirmation prompt.
-// When set, confirmPrompt delegates to it instead of using raw terminal I/O.
-var DefaultConfirmPrompt func(prompt string) (bool, error)
-
-func confirmPrompt(prompt string) (bool, error) {
-	if DefaultConfirmPrompt != nil {
-		return DefaultConfirmPrompt(prompt)
-	}
-
-	fd := int(os.Stdin.Fd())
-	oldState, err := term.MakeRaw(fd)
-	if err != nil {
-		return false, err
-	}
-	defer term.Restore(fd, oldState)
-
-	fmt.Fprintf(os.Stderr, "%s (\033[1my\033[0m/n) ", prompt)
-
-	buf := make([]byte, 1)
-	for {
-		if _, err := os.Stdin.Read(buf); err != nil {
-			return false, err
-		}
-
-		switch buf[0] {
-		case 'Y', 'y', 13:
-			fmt.Fprintf(os.Stderr, "yes\r\n")
-			return true, nil
-		case 'N', 'n', 27, 3:
-			fmt.Fprintf(os.Stderr, "no\r\n")
-			return false, nil
-		}
-	}
-}
--- a/cmd/config/selector_test.go
+++ b/cmd/config/selector_test.go
@@ -1,19 +0,0 @@
-package config
-
-import (
-	"testing"
-)
-
-func TestErrCancelled(t *testing.T) {
-	t.Run("NotNil", func(t *testing.T) {
-		if errCancelled == nil {
-			t.Error("errCancelled should not be nil")
-		}
-	})
-
-	t.Run("Message", func(t *testing.T) {
-		if errCancelled.Error() != "cancelled" {
-			t.Errorf("expected 'cancelled', got %q", errCancelled.Error())
-		}
-	})
-}
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -17,6 +17,7 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/internal/modelref"
 	"github.com/ollama/ollama/readline"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
@@ -540,6 +541,13 @@ func NewCreateRequest(name string, opts runOptions) *api.CreateRequest {
 		parentModel = ""
 	}

+	// Preserve explicit cloud intent for sessions started with `:cloud`.
+	// Cloud model metadata can return a source-less parent_model (for example
+	// "qwen3.5"), which would otherwise make `/save` create a local derivative.
+	if modelref.HasExplicitCloudSource(opts.Model) && !modelref.HasExplicitCloudSource(parentModel) {
+		parentModel = ""
+	}
+
 	req := &api.CreateRequest{
 		Model: name,
 		From:  cmp.Or(parentModel, opts.Model),
--- a/cmd/internal/fileutil/files.go
+++ b/cmd/internal/fileutil/files.go
@@ -1,4 +1,6 @@
-package config
+// Package fileutil provides small shared helpers for reading JSON files
+// and writing config files with backup-on-overwrite semantics.
+package fileutil

 import (
 	"bytes"
@@ -9,7 +11,8 @@ import (
 	"time"
 )

-func readJSONFile(path string) (map[string]any, error) {
+// ReadJSON reads a JSON object file into a generic map.
+func ReadJSON(path string) (map[string]any, error) {
 	data, err := os.ReadFile(path)
 	if err != nil {
 		return nil, err
@@ -33,12 +36,13 @@ func copyFile(src, dst string) error {
 	return os.WriteFile(dst, data, info.Mode().Perm())
 }

-func backupDir() string {
+// BackupDir returns the shared backup directory, "ollama-backups" under os.TempDir(), used before overwriting files.
+func BackupDir() string {
 	return filepath.Join(os.TempDir(), "ollama-backups")
 }

 func backupToTmp(srcPath string) (string, error) {
-	dir := backupDir()
+	dir := BackupDir()
 	if err := os.MkdirAll(dir, 0o755); err != nil {
 		return "", err
 	}
@@ -50,8 +54,8 @@ func backupToTmp(srcPath string) (string, error) {
 	return backupPath, nil
 }

-// writeWithBackup writes data to path via temp file + rename, backing up any existing file first
-func writeWithBackup(path string, data []byte) error {
+// WriteWithBackup writes data to path via temp file + rename, backing up any existing file first.
+func WriteWithBackup(path string, data []byte) error {
 	var backupPath string
 	// backup must be created before any writes to the target file
 	if existingContent, err := os.ReadFile(path); err == nil {
--- a/cmd/internal/fileutil/files_test.go
+++ b/cmd/internal/fileutil/files_test.go
@@ -1,4 +1,4 @@
-package config
+package fileutil

 import (
 	"encoding/json"
@@ -9,6 +9,21 @@ import (
 	"testing"
 )

+// TestMain points TMPDIR at a throwaway root for the whole test run and removes that root afterwards.
+// NOTE(review): on Windows os.TempDir reads TMP/TEMP rather than TMPDIR — confirm whether isolation is needed there.
+func TestMain(m *testing.M) {
+	tmpRoot, err := os.MkdirTemp("", "fileutil-test-*")
+	if err != nil {
+		panic(err)
+	}
+	if err := os.Setenv("TMPDIR", tmpRoot); err != nil {
+		panic(err)
+	}
+	code := m.Run()
+	_ = os.RemoveAll(tmpRoot)
+	os.Exit(code)
+}
+
 func mustMarshal(t *testing.T, v any) []byte {
 	t.Helper()
 	data, err := json.MarshalIndent(v, "", "  ")
@@ -18,14 +33,19 @@ func mustMarshal(t *testing.T, v any) []byte {
 	return data
 }

+func isolatedTempDir(t *testing.T) string {
+	t.Helper() // marks this as a test helper so failure locations skip this function
+	return t.TempDir() // fresh per-test directory, removed automatically when the test completes
+}
+
 func TestWriteWithBackup(t *testing.T) {
-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)

 	t.Run("creates file", func(t *testing.T) {
 		path := filepath.Join(tmpDir, "new.json")
 		data := mustMarshal(t, map[string]string{"key": "value"})

-		if err := writeWithBackup(path, data); err != nil {
+		if err := WriteWithBackup(path, data); err != nil {
 			t.Fatal(err)
 		}

@@ -43,17 +63,17 @@ func TestWriteWithBackup(t *testing.T) {
 		}
 	})

-	t.Run("creates backup in /tmp/ollama-backups", func(t *testing.T) {
+	t.Run("creates backup in the temp backup directory", func(t *testing.T) {
 		path := filepath.Join(tmpDir, "backup.json")

 		os.WriteFile(path, []byte(`{"original": true}`), 0o644)

 		data := mustMarshal(t, map[string]bool{"updated": true})
-		if err := writeWithBackup(path, data); err != nil {
+		if err := WriteWithBackup(path, data); err != nil {
 			t.Fatal(err)
 		}

-		entries, err := os.ReadDir(backupDir())
+		entries, err := os.ReadDir(BackupDir())
 		if err != nil {
 			t.Fatal("backup directory not created")
 		}
@@ -63,7 +83,7 @@ func TestWriteWithBackup(t *testing.T) {
 			if filepath.Ext(entry.Name()) != ".json" {
 				name := entry.Name()
 				if len(name) > len("backup.json.") && name[:len("backup.json.")] == "backup.json." {
-					backupPath := filepath.Join(backupDir(), name)
+					backupPath := filepath.Join(BackupDir(), name)
 					backup, err := os.ReadFile(backupPath)
 					if err == nil {
 						var backupData map[string]bool
@@ -79,7 +99,7 @@ func TestWriteWithBackup(t *testing.T) {
 		}

 		if !foundBackup {
-			t.Error("backup file not created in /tmp/ollama-backups")
+			t.Error("backup file not created in backup directory")
 		}

 		current, _ := os.ReadFile(path)
@@ -94,11 +114,11 @@ func TestWriteWithBackup(t *testing.T) {
 		path := filepath.Join(tmpDir, "nobak.json")

 		data := mustMarshal(t, map[string]string{"new": "file"})
-		if err := writeWithBackup(path, data); err != nil {
+		if err := WriteWithBackup(path, data); err != nil {
 			t.Fatal(err)
 		}

-		entries, _ := os.ReadDir(backupDir())
+		entries, _ := os.ReadDir(BackupDir())
 		for _, entry := range entries {
 			if len(entry.Name()) > len("nobak.json.") && entry.Name()[:len("nobak.json.")] == "nobak.json." {
 				t.Error("backup should not exist for new file")
@@ -111,11 +131,11 @@ func TestWriteWithBackup(t *testing.T) {

 		data := mustMarshal(t, map[string]string{"key": "value"})

-		if err := writeWithBackup(path, data); err != nil {
+		if err := WriteWithBackup(path, data); err != nil {
 			t.Fatal(err)
 		}

-		entries1, _ := os.ReadDir(backupDir())
+		entries1, _ := os.ReadDir(BackupDir())
 		countBefore := 0
 		for _, e := range entries1 {
 			if len(e.Name()) > len("unchanged.json.") && e.Name()[:len("unchanged.json.")] == "unchanged.json." {
@@ -123,11 +143,11 @@ func TestWriteWithBackup(t *testing.T) {
 			}
 		}

-		if err := writeWithBackup(path, data); err != nil {
+		if err := WriteWithBackup(path, data); err != nil {
 			t.Fatal(err)
 		}

-		entries2, _ := os.ReadDir(backupDir())
+		entries2, _ := os.ReadDir(BackupDir())
 		countAfter := 0
 		for _, e := range entries2 {
 			if len(e.Name()) > len("unchanged.json.") && e.Name()[:len("unchanged.json.")] == "unchanged.json." {
@@ -145,11 +165,11 @@ func TestWriteWithBackup(t *testing.T) {

 		os.WriteFile(path, []byte(`{"v": 1}`), 0o644)
 		data := mustMarshal(t, map[string]int{"v": 2})
-		if err := writeWithBackup(path, data); err != nil {
+		if err := WriteWithBackup(path, data); err != nil {
 			t.Fatal(err)
 		}

-		entries, _ := os.ReadDir(backupDir())
+		entries, _ := os.ReadDir(BackupDir())
 		var found bool
 		for _, entry := range entries {
 			name := entry.Name()
@@ -161,7 +181,7 @@ func TestWriteWithBackup(t *testing.T) {
 					}
 				}
 				found = true
-				os.Remove(filepath.Join(backupDir(), name))
+				os.Remove(filepath.Join(BackupDir(), name))
 				break
 			}
 		}
@@ -180,7 +200,7 @@ func TestWriteWithBackup_FailsIfBackupFails(t *testing.T) {
 		t.Skip("permission tests unreliable on Windows")
 	}

-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)
 	path := filepath.Join(tmpDir, "config.json")

 	// Create original file
@@ -188,13 +208,13 @@ func TestWriteWithBackup_FailsIfBackupFails(t *testing.T) {
 	os.WriteFile(path, originalContent, 0o644)

 	// Make backup directory read-only to force backup failure
-	backupDir := backupDir()
+	backupDir := BackupDir()
 	os.MkdirAll(backupDir, 0o755)
 	os.Chmod(backupDir, 0o444) // Read-only
 	defer os.Chmod(backupDir, 0o755)

 	newContent := []byte(`{"updated": true}`)
-	err := writeWithBackup(path, newContent)
+	err := WriteWithBackup(path, newContent)

 	// Should fail because backup couldn't be created
 	if err == nil {
@@ -215,7 +235,7 @@ func TestWriteWithBackup_PermissionDenied(t *testing.T) {
 		t.Skip("permission tests unreliable on Windows")
 	}

-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)

 	// Create a read-only directory
 	readOnlyDir := filepath.Join(tmpDir, "readonly")
@@ -224,7 +244,7 @@ func TestWriteWithBackup_PermissionDenied(t *testing.T) {
 	defer os.Chmod(readOnlyDir, 0o755)

 	path := filepath.Join(readOnlyDir, "config.json")
-	err := writeWithBackup(path, []byte(`{"test": true}`))
+	err := WriteWithBackup(path, []byte(`{"test": true}`))

 	if err == nil {
 		t.Error("expected permission error, got nil")
@@ -234,10 +254,10 @@ func TestWriteWithBackup_PermissionDenied(t *testing.T) {
 // TestWriteWithBackup_DirectoryDoesNotExist verifies behavior when target directory doesn't exist.
-// writeWithBackup doesn't create directories - caller is responsible.
+// WriteWithBackup doesn't create directories - caller is responsible.
 func TestWriteWithBackup_DirectoryDoesNotExist(t *testing.T) {
-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)
 	path := filepath.Join(tmpDir, "nonexistent", "subdir", "config.json")

-	err := writeWithBackup(path, []byte(`{"test": true}`))
+	err := WriteWithBackup(path, []byte(`{"test": true}`))

 	// Should fail because directory doesn't exist
 	if err == nil {
@@ -252,7 +272,7 @@ func TestWriteWithBackup_SymlinkTarget(t *testing.T) {
 		t.Skip("symlink tests may require admin on Windows")
 	}

-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)
 	realFile := filepath.Join(tmpDir, "real.json")
 	symlink := filepath.Join(tmpDir, "link.json")

@@ -261,7 +281,7 @@ func TestWriteWithBackup_SymlinkTarget(t *testing.T) {
 	os.Symlink(realFile, symlink)

 	// Write through symlink
-	err := writeWithBackup(symlink, []byte(`{"v": 2}`))
+	err := WriteWithBackup(symlink, []byte(`{"v": 2}`))
 	if err != nil {
 		t.Fatalf("writeWithBackup through symlink failed: %v", err)
 	}
@@ -276,7 +296,7 @@ func TestWriteWithBackup_SymlinkTarget(t *testing.T) {
 // TestBackupToTmp_SpecialCharsInFilename verifies backup works with special characters.
 // User may have config files with unusual names.
 func TestBackupToTmp_SpecialCharsInFilename(t *testing.T) {
-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)

 	// File with spaces and special chars
 	path := filepath.Join(tmpDir, "my config (backup).json")
@@ -305,7 +325,7 @@ func TestCopyFile_PreservesPermissions(t *testing.T) {
 		t.Skip("permission preservation tests unreliable on Windows")
 	}

-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)
 	src := filepath.Join(tmpDir, "src.json")
 	dst := filepath.Join(tmpDir, "dst.json")

@@ -327,7 +347,7 @@ func TestCopyFile_PreservesPermissions(t *testing.T) {

 // TestCopyFile_SourceNotFound verifies clear error when source doesn't exist.
 func TestCopyFile_SourceNotFound(t *testing.T) {
-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)
 	src := filepath.Join(tmpDir, "nonexistent.json")
 	dst := filepath.Join(tmpDir, "dst.json")

@@ -339,11 +359,11 @@ func TestCopyFile_SourceNotFound(t *testing.T) {

 // TestWriteWithBackup_TargetIsDirectory verifies error when path points to a directory.
 func TestWriteWithBackup_TargetIsDirectory(t *testing.T) {
-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)
 	dirPath := filepath.Join(tmpDir, "actualdir")
 	os.MkdirAll(dirPath, 0o755)

-	err := writeWithBackup(dirPath, []byte(`{"test": true}`))
+	err := WriteWithBackup(dirPath, []byte(`{"test": true}`))
 	if err == nil {
 		t.Error("expected error when target is a directory, got nil")
 	}
@@ -351,10 +371,10 @@ func TestWriteWithBackup_TargetIsDirectory(t *testing.T) {

 // TestWriteWithBackup_EmptyData verifies writing zero bytes works correctly.
 func TestWriteWithBackup_EmptyData(t *testing.T) {
-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)
 	path := filepath.Join(tmpDir, "empty.json")

-	err := writeWithBackup(path, []byte{})
+	err := WriteWithBackup(path, []byte{})
 	if err != nil {
 		t.Fatalf("writeWithBackup with empty data failed: %v", err)
 	}
@@ -375,7 +395,7 @@ func TestWriteWithBackup_FileUnreadableButDirWritable(t *testing.T) {
 		t.Skip("permission tests unreliable on Windows")
 	}

-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)
 	path := filepath.Join(tmpDir, "unreadable.json")

 	// Create file and make it unreadable
@@ -384,7 +404,7 @@ func TestWriteWithBackup_FileUnreadableButDirWritable(t *testing.T) {
 	defer os.Chmod(path, 0o644)

 	// Should fail because we can't read the file to compare/backup
-	err := writeWithBackup(path, []byte(`{"updated": true}`))
+	err := WriteWithBackup(path, []byte(`{"updated": true}`))
 	if err == nil {
 		t.Error("expected error when file is unreadable, got nil")
 	}
@@ -393,7 +413,7 @@ func TestWriteWithBackup_FileUnreadableButDirWritable(t *testing.T) {
 // TestWriteWithBackup_RapidSuccessiveWrites verifies backup works with multiple writes
 // within the same second (timestamp collision scenario).
 func TestWriteWithBackup_RapidSuccessiveWrites(t *testing.T) {
-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)
 	path := filepath.Join(tmpDir, "rapid.json")

 	// Create initial file
@@ -402,7 +422,7 @@ func TestWriteWithBackup_RapidSuccessiveWrites(t *testing.T) {
 	// Rapid successive writes
 	for i := 1; i <= 3; i++ {
 		data := []byte(fmt.Sprintf(`{"v": %d}`, i))
-		if err := writeWithBackup(path, data); err != nil {
+		if err := WriteWithBackup(path, data); err != nil {
 			t.Fatalf("write %d failed: %v", i, err)
 		}
 	}
@@ -414,7 +434,7 @@ func TestWriteWithBackup_RapidSuccessiveWrites(t *testing.T) {
 	}

 	// Verify at least one backup exists
-	entries, _ := os.ReadDir(backupDir())
+	entries, _ := os.ReadDir(BackupDir())
 	var backupCount int
 	for _, e := range entries {
 		if len(e.Name()) > len("rapid.json.") && e.Name()[:len("rapid.json.")] == "rapid.json." {
@@ -432,8 +452,9 @@ func TestWriteWithBackup_BackupDirIsFile(t *testing.T) {
 		t.Skip("test modifies system temp directory")
 	}

+	tmpDir := isolatedTempDir(t)
 	// Create a file at the backup directory path
-	backupPath := backupDir()
+	backupPath := BackupDir()
 	// Clean up any existing directory first
 	os.RemoveAll(backupPath)
 	// Create a file instead of directory
@@ -443,11 +464,10 @@ func TestWriteWithBackup_BackupDirIsFile(t *testing.T) {
 		os.MkdirAll(backupPath, 0o755)
 	}()

-	tmpDir := t.TempDir()
 	path := filepath.Join(tmpDir, "test.json")
 	os.WriteFile(path, []byte(`{"original": true}`), 0o644)

-	err := writeWithBackup(path, []byte(`{"updated": true}`))
+	err := WriteWithBackup(path, []byte(`{"updated": true}`))
 	if err == nil {
 		t.Error("expected error when backup dir is a file, got nil")
 	}
@@ -459,7 +479,7 @@ func TestWriteWithBackup_NoOrphanTempFiles(t *testing.T) {
 		t.Skip("permission tests unreliable on Windows")
 	}

-	tmpDir := t.TempDir()
+	tmpDir := isolatedTempDir(t)

 	// Count existing temp files
 	countTempFiles := func() int {
@@ -493,7 +513,7 @@ func TestWriteWithBackup_NoOrphanTempFiles(t *testing.T) {
 	badPath := filepath.Join(tmpDir, "isdir")
 	os.MkdirAll(badPath, 0o755)

-	_ = writeWithBackup(badPath, []byte(`{"test": true}`))
+	_ = WriteWithBackup(badPath, []byte(`{"test": true}`))

 	after := countTempFiles()
 	if after > before {
--- a/cmd/launch/claude.go
+++ b/cmd/launch/claude.go
@@ -0,0 +1,87 @@
+package launch
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strconv"
+
+	"github.com/ollama/ollama/envconfig"
+)
+
+// Claude implements Runner for Claude Code integration.
+// The type carries no fields; each method works from its arguments alone.
+type Claude struct{}
+
+// String returns the display name of the integration.
+func (c *Claude) String() string {
+	return "Claude Code"
+}
+
+// args assembles the claude command line: "--model <model>" when a model is
+// given, followed by the caller's extra arguments in their original order.
+func (c *Claude) args(model string, extra []string) []string {
+	if model == "" {
+		// No model flag to add; append onto a nil slice so the result is a
+		// fresh copy of extra (or nil when extra is empty).
+		return append([]string(nil), extra...)
+	}
+
+	argv := make([]string, 0, len(extra)+2)
+	argv = append(argv, "--model", model)
+	return append(argv, extra...)
+}
+
+// findPath locates the claude executable. It prefers whatever exec.LookPath
+// resolves for "claude" and otherwise falls back to
+// <home>/.claude/local/claude (claude.exe on Windows). An error is returned
+// when the home directory cannot be determined or os.Stat fails on the
+// fallback path.
+func (c *Claude) findPath() (string, error) {
+	onPath, lookErr := exec.LookPath("claude")
+	if lookErr == nil {
+		return onPath, nil
+	}
+
+	homeDir, err := os.UserHomeDir()
+	if err != nil {
+		return "", err
+	}
+
+	binary := "claude"
+	if runtime.GOOS == "windows" {
+		binary = "claude.exe"
+	}
+
+	candidate := filepath.Join(homeDir, ".claude", "local", binary)
+	if _, statErr := os.Stat(candidate); statErr != nil {
+		return "", statErr
+	}
+	return candidate, nil
+}
+
+// Run launches Claude Code with the given model and extra arguments, wiring
+// the child process to this process's stdin/stdout/stderr. The child inherits
+// the current environment plus the ANTHROPIC_*/CLAUDE_CODE_* variables set
+// below, with ANTHROPIC_BASE_URL taken from envconfig.Host(). It returns an
+// installation hint when the binary cannot be located, otherwise the result
+// of running the command.
+func (c *Claude) Run(model string, args []string) error {
+	binary, err := c.findPath()
+	if err != nil {
+		return fmt.Errorf("claude is not installed, install from https://code.claude.com/docs/en/quickstart")
+	}
+
+	// Start from the inherited environment; later entries are appended after it.
+	environ := os.Environ()
+	environ = append(environ,
+		"ANTHROPIC_BASE_URL="+envconfig.Host().String(),
+		"ANTHROPIC_API_KEY=",
+		"ANTHROPIC_AUTH_TOKEN=ollama",
+		"CLAUDE_CODE_ATTRIBUTION_HEADER=0",
+	)
+	environ = append(environ, c.modelEnvVars(model)...)
+
+	proc := exec.Command(binary, c.args(model, args)...)
+	proc.Stdin, proc.Stdout, proc.Stderr = os.Stdin, os.Stdout, os.Stderr
+	proc.Env = environ
+	return proc.Run()
+}
+
+// modelEnvVars returns Claude Code env vars that route all model tiers through Ollama.
+// Every tier variable (opus, sonnet, haiku, subagent) is set to model. When
+// isCloudModelName reports true and lookupCloudModelLimit knows the model,
+// CLAUDE_CODE_AUTO_COMPACT_WINDOW is added with that limit's Context value.
+func (c *Claude) modelEnvVars(model string) []string {
+	tierPrefixes := [...]string{
+		"ANTHROPIC_DEFAULT_OPUS_MODEL=",
+		"ANTHROPIC_DEFAULT_SONNET_MODEL=",
+		"ANTHROPIC_DEFAULT_HAIKU_MODEL=",
+		"CLAUDE_CODE_SUBAGENT_MODEL=",
+	}
+
+	vars := make([]string, 0, len(tierPrefixes)+1)
+	for _, prefix := range tierPrefixes {
+		vars = append(vars, prefix+model)
+	}
+
+	if !isCloudModelName(model) {
+		return vars
+	}
+	if limit, found := lookupCloudModelLimit(model); found {
+		vars = append(vars, "CLAUDE_CODE_AUTO_COMPACT_WINDOW="+strconv.Itoa(limit.Context))
+	}
+	return vars
+}
--- a/cmd/launch/claude_test.go
+++ b/cmd/launch/claude_test.go
@@ -1,4 +1,4 @@
-package config
+package launch

 import (
 	"os"
@@ -117,10 +117,7 @@ func TestClaudeModelEnvVars(t *testing.T) {
 		return m
 	}

-	t.Run("falls back to model param when no aliases saved", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-
+	t.Run("maps all Claude model env vars to the provided model", func(t *testing.T) {
 		got := envMap(c.modelEnvVars("llama3.2"))
 		if got["ANTHROPIC_DEFAULT_OPUS_MODEL"] != "llama3.2" {
 			t.Errorf("OPUS = %q, want llama3.2", got["ANTHROPIC_DEFAULT_OPUS_MODEL"])
@@ -134,65 +131,41 @@ func TestClaudeModelEnvVars(t *testing.T) {
 		if got["CLAUDE_CODE_SUBAGENT_MODEL"] != "llama3.2" {
 			t.Errorf("SUBAGENT = %q, want llama3.2", got["CLAUDE_CODE_SUBAGENT_MODEL"])
 		}
-	})
-
-	t.Run("uses primary alias for opus sonnet and subagent", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-
-		SaveIntegration("claude", []string{"qwen3:8b"})
-		saveAliases("claude", map[string]string{"primary": "qwen3:8b"})
-
-		got := envMap(c.modelEnvVars("qwen3:8b"))
-		if got["ANTHROPIC_DEFAULT_OPUS_MODEL"] != "qwen3:8b" {
-			t.Errorf("OPUS = %q, want qwen3:8b", got["ANTHROPIC_DEFAULT_OPUS_MODEL"])
-		}
-		if got["ANTHROPIC_DEFAULT_SONNET_MODEL"] != "qwen3:8b" {
-			t.Errorf("SONNET = %q, want qwen3:8b", got["ANTHROPIC_DEFAULT_SONNET_MODEL"])
-		}
-		if got["ANTHROPIC_DEFAULT_HAIKU_MODEL"] != "qwen3:8b" {
-			t.Errorf("HAIKU = %q, want qwen3:8b (no fast alias)", got["ANTHROPIC_DEFAULT_HAIKU_MODEL"])
-		}
-		if got["CLAUDE_CODE_SUBAGENT_MODEL"] != "qwen3:8b" {
-			t.Errorf("SUBAGENT = %q, want qwen3:8b", got["CLAUDE_CODE_SUBAGENT_MODEL"])
+		if got["CLAUDE_CODE_AUTO_COMPACT_WINDOW"] != "" {
+			t.Errorf("AUTO_COMPACT_WINDOW = %q, want empty for local models", got["CLAUDE_CODE_AUTO_COMPACT_WINDOW"])
 		}
 	})

-	t.Run("uses fast alias for haiku", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-
-		SaveIntegration("claude", []string{"llama3.2:70b"})
-		saveAliases("claude", map[string]string{
-			"primary": "llama3.2:70b",
-			"fast":    "llama3.2:8b",
-		})
-
-		got := envMap(c.modelEnvVars("llama3.2:70b"))
-		if got["ANTHROPIC_DEFAULT_OPUS_MODEL"] != "llama3.2:70b" {
-			t.Errorf("OPUS = %q, want llama3.2:70b", got["ANTHROPIC_DEFAULT_OPUS_MODEL"])
+	t.Run("supports empty model", func(t *testing.T) {
+		got := envMap(c.modelEnvVars(""))
+		if got["ANTHROPIC_DEFAULT_OPUS_MODEL"] != "" {
+			t.Errorf("OPUS = %q, want empty", got["ANTHROPIC_DEFAULT_OPUS_MODEL"])
 		}
-		if got["ANTHROPIC_DEFAULT_SONNET_MODEL"] != "llama3.2:70b" {
-			t.Errorf("SONNET = %q, want llama3.2:70b", got["ANTHROPIC_DEFAULT_SONNET_MODEL"])
+		if got["ANTHROPIC_DEFAULT_SONNET_MODEL"] != "" {
+			t.Errorf("SONNET = %q, want empty", got["ANTHROPIC_DEFAULT_SONNET_MODEL"])
 		}
-		if got["ANTHROPIC_DEFAULT_HAIKU_MODEL"] != "llama3.2:8b" {
-			t.Errorf("HAIKU = %q, want llama3.2:8b", got["ANTHROPIC_DEFAULT_HAIKU_MODEL"])
+		if got["ANTHROPIC_DEFAULT_HAIKU_MODEL"] != "" {
+			t.Errorf("HAIKU = %q, want empty", got["ANTHROPIC_DEFAULT_HAIKU_MODEL"])
 		}
-		if got["CLAUDE_CODE_SUBAGENT_MODEL"] != "llama3.2:70b" {
-			t.Errorf("SUBAGENT = %q, want llama3.2:70b", got["CLAUDE_CODE_SUBAGENT_MODEL"])
+		if got["CLAUDE_CODE_SUBAGENT_MODEL"] != "" {
+			t.Errorf("SUBAGENT = %q, want empty", got["CLAUDE_CODE_SUBAGENT_MODEL"])
+		}
+		if got["CLAUDE_CODE_AUTO_COMPACT_WINDOW"] != "" {
+			t.Errorf("AUTO_COMPACT_WINDOW = %q, want empty", got["CLAUDE_CODE_AUTO_COMPACT_WINDOW"])
 		}
 	})

-	t.Run("alias primary overrides model param", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
+	t.Run("sets auto compact window for known cloud models", func(t *testing.T) {
+		got := envMap(c.modelEnvVars("glm-5:cloud"))
+		if got["CLAUDE_CODE_AUTO_COMPACT_WINDOW"] != "202752" {
+			t.Errorf("AUTO_COMPACT_WINDOW = %q, want 202752", got["CLAUDE_CODE_AUTO_COMPACT_WINDOW"])
+		}
+	})

-		SaveIntegration("claude", []string{"saved-model"})
-		saveAliases("claude", map[string]string{"primary": "saved-model"})
-
-		got := envMap(c.modelEnvVars("different-model"))
-		if got["ANTHROPIC_DEFAULT_OPUS_MODEL"] != "saved-model" {
-			t.Errorf("OPUS = %q, want saved-model", got["ANTHROPIC_DEFAULT_OPUS_MODEL"])
+	t.Run("does not set auto compact window for unknown cloud models", func(t *testing.T) {
+		got := envMap(c.modelEnvVars("unknown-model:cloud"))
+		if got["CLAUDE_CODE_AUTO_COMPACT_WINDOW"] != "" {
+			t.Errorf("AUTO_COMPACT_WINDOW = %q, want empty", got["CLAUDE_CODE_AUTO_COMPACT_WINDOW"])
 		}
 	})
 }
--- a/cmd/launch/cline.go
+++ b/cmd/launch/cline.go
@@ -1,14 +1,13 @@
-package config
+package launch

 import (
-	"context"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"os"
 	"os/exec"
 	"path/filepath"

+	"github.com/ollama/ollama/cmd/internal/fileutil"
 	"github.com/ollama/ollama/envconfig"
 )

@@ -22,24 +21,6 @@ func (c *Cline) Run(model string, args []string) error {
 		return fmt.Errorf("cline is not installed, install with: npm install -g cline")
 	}

-	models := []string{model}
-	if config, err := loadIntegration("cline"); err == nil && len(config.Models) > 0 {
-		models = config.Models
-	}
-	var err error
-	models, err = resolveEditorModels("cline", models, func() ([]string, error) {
-		return selectModels(context.Background(), "cline", "")
-	})
-	if errors.Is(err, errCancelled) {
-		return nil
-	}
-	if err != nil {
-		return err
-	}
-	if err := c.Edit(models); err != nil {
-		return fmt.Errorf("setup failed: %w", err)
-	}
-
 	cmd := exec.Command("cline", args...)
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
@@ -97,7 +78,7 @@ func (c *Cline) Edit(models []string) error {
 	if err != nil {
 		return err
 	}
-	return writeWithBackup(configPath, data)
+	return fileutil.WriteWithBackup(configPath, data)
 }

 func (c *Cline) Models() []string {
@@ -106,7 +87,7 @@ func (c *Cline) Models() []string {
 		return nil
 	}

-	config, err := readJSONFile(filepath.Join(home, ".cline", "data", "globalState.json"))
+	config, err := fileutil.ReadJSON(filepath.Join(home, ".cline", "data", "globalState.json"))
 	if err != nil {
 		return nil
 	}
--- a/cmd/launch/cline_test.go
+++ b/cmd/launch/cline_test.go
@@ -1,4 +1,4 @@
-package config
+package launch

 import (
 	"encoding/json"
--- a/cmd/launch/codex.go
+++ b/cmd/launch/codex.go
@@ -1,4 +1,4 @@
-package config
+package launch

 import (
 	"fmt"
--- a/cmd/launch/codex_test.go
+++ b/cmd/launch/codex_test.go
@@ -1,4 +1,4 @@
-package config
+package launch

 import (
 	"slices"
@@ -16,7 +16,7 @@ func TestCodexArgs(t *testing.T) {
 	}{
 		{"with model", "llama3.2", nil, []string{"--oss", "-m", "llama3.2"}},
 		{"empty model", "", nil, []string{"--oss"}},
-		{"with model and profile", "qwen3-coder", []string{"-p", "myprofile"}, []string{"--oss", "-m", "qwen3-coder", "-p", "myprofile"}},
+		{"with model and profile", "qwen3.5", []string{"-p", "myprofile"}, []string{"--oss", "-m", "qwen3.5", "-p", "myprofile"}},
 		{"with sandbox flag", "llama3.2", []string{"--sandbox", "workspace-write"}, []string{"--oss", "-m", "llama3.2", "--sandbox", "workspace-write"}},
 	}

--- a/cmd/launch/command_test.go
+++ b/cmd/launch/command_test.go
@@ -0,0 +1,598 @@
+package launch
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/ollama/ollama/cmd/config"
+	"github.com/spf13/cobra"
+)
+
+// captureStderr runs fn with os.Stderr redirected into a pipe and returns
+// everything written to it while fn ran.
+//
+// Restoring os.Stderr, closing the pipe's write end and draining the reader
+// all happen in a deferred function. That way the reader goroutine sees EOF
+// and terminates even when fn leaves early through t.Fatal (runtime.Goexit)
+// or a panic, instead of blocking on the pipe forever.
+func captureStderr(t *testing.T, fn func()) (output string) {
+	t.Helper()
+
+	r, w, err := os.Pipe()
+	if err != nil {
+		t.Fatalf("failed to create stderr pipe: %v", err)
+	}
+
+	oldStderr := os.Stderr
+	os.Stderr = w
+
+	// Buffered so the reader goroutine can always deliver its result and exit.
+	done := make(chan string, 1)
+	go func() {
+		// Release the read end once the writer side has been closed and drained.
+		defer r.Close()
+		var buf bytes.Buffer
+		_, _ = io.Copy(&buf, r)
+		done <- buf.String()
+	}()
+
+	defer func() {
+		os.Stderr = oldStderr
+		// Closing the write end is what lets io.Copy in the goroutine return.
+		_ = w.Close()
+		output = <-done
+	}()
+
+	fn()
+	return output
+}
+
+// TestLaunchCmd checks the static shape of the command returned by LaunchCmd:
+// its usage string, non-empty descriptions, the model/config/yes flags, and
+// that a PreRunE hook is installed when a heartbeat check is supplied.
+func TestLaunchCmd(t *testing.T) {
+	noopCheck := func(*cobra.Command, []string) error { return nil }
+	noopTUI := func(*cobra.Command) {}
+	cmd := LaunchCmd(noopCheck, noopTUI)
+
+	t.Run("command structure", func(t *testing.T) {
+		const wantUse = "launch [INTEGRATION] [-- [EXTRA_ARGS...]]"
+		if cmd.Use != wantUse {
+			t.Errorf("Use = %q, want %q", cmd.Use, wantUse)
+		}
+		if cmd.Short == "" {
+			t.Error("Short description should not be empty")
+		}
+		if cmd.Long == "" {
+			t.Error("Long description should not be empty")
+		}
+	})
+
+	t.Run("flags exist", func(t *testing.T) {
+		flags := cmd.Flags()
+		for _, name := range []string{"model", "config", "yes"} {
+			if flags.Lookup(name) == nil {
+				t.Errorf("--%s flag should exist", name)
+			}
+		}
+	})
+
+	t.Run("PreRunE is set", func(t *testing.T) {
+		if cmd.PreRunE == nil {
+			t.Error("PreRunE should be set to checkServerHeartbeat")
+		}
+	})
+}
+
+// TestLaunchCmdTUICallback checks when the launch command hands control to
+// the TUI callback: it must fire when no arguments are given, must not fire
+// when an integration name is given, and flags or extra args without an
+// integration name must fail with "require an integration name" guidance
+// without invoking the TUI.
+func TestLaunchCmdTUICallback(t *testing.T) {
+	heartbeatOK := func(*cobra.Command, []string) error { return nil }
+
+	// run executes a fresh launch command with argv and reports whether the
+	// TUI callback fired, together with the command's error.
+	run := func(argv []string) (bool, error) {
+		fired := false
+		cmd := LaunchCmd(heartbeatOK, func(*cobra.Command) { fired = true })
+		cmd.SetArgs(argv)
+		err := cmd.Execute()
+		return fired, err
+	}
+
+	t.Run("no args calls TUI", func(t *testing.T) {
+		if fired, _ := run([]string{}); !fired {
+			t.Error("TUI callback should be called when no args provided")
+		}
+	})
+
+	t.Run("integration arg bypasses TUI", func(t *testing.T) {
+		// Point the client at a server that answers 404 to everything.
+		srv := httptest.NewServer(http.NotFoundHandler())
+		defer srv.Close()
+		t.Setenv("OLLAMA_HOST", srv.URL)
+
+		if fired, _ := run([]string{"claude"}); fired {
+			t.Error("TUI callback should NOT be called when integration arg provided")
+		}
+	})
+
+	// Invocations that lack an integration name and therefore must be rejected.
+	rejected := []struct {
+		name    string
+		argv    []string
+		failMsg string
+		tuiMsg  string
+	}{
+		{
+			name:    "--model flag without integration returns error",
+			argv:    []string{"--model", "test-model"},
+			failMsg: "expected --model without an integration to fail",
+			tuiMsg:  "TUI callback should NOT be called when --model is provided without an integration",
+		},
+		{
+			name:    "--config flag without integration returns error",
+			argv:    []string{"--config"},
+			failMsg: "expected --config without an integration to fail",
+			tuiMsg:  "TUI callback should NOT be called when --config is provided without an integration",
+		},
+		{
+			name:    "--yes flag without integration returns error",
+			argv:    []string{"--yes"},
+			failMsg: "expected --yes without an integration to fail",
+			tuiMsg:  "TUI callback should NOT be called when --yes is provided without an integration",
+		},
+		{
+			name:    "extra args without integration return error",
+			argv:    []string{"--model", "test-model", "--", "--sandbox", "workspace-write"},
+			failMsg: "expected flags and extra args without an integration to fail",
+			tuiMsg:  "TUI callback should NOT be called when flags or extra args are provided without an integration",
+		},
+	}
+
+	for _, tc := range rejected {
+		t.Run(tc.name, func(t *testing.T) {
+			fired, err := run(tc.argv)
+
+			if err == nil {
+				t.Fatal(tc.failMsg)
+			}
+			if !strings.Contains(err.Error(), "require an integration name") {
+				t.Fatalf("expected integration-name guidance, got %v", err)
+			}
+			if fired {
+				t.Error(tc.tuiMsg)
+			}
+		})
+	}
+}
+
+// TestLaunchCmdNilHeartbeat verifies that LaunchCmd still returns a command
+// when both callbacks are nil. A non-nil PreRunE in that case is only logged,
+// not treated as a failure.
+func TestLaunchCmdNilHeartbeat(t *testing.T) {
+	cmd := LaunchCmd(nil, nil)
+	switch {
+	case cmd == nil:
+		t.Fatal("LaunchCmd returned nil")
+	case cmd.PreRunE != nil:
+		t.Log("Note: PreRunE is set even when nil is passed (acceptable)")
+	}
+}
+
+// TestLaunchCmdModelFlagFiltersDisabledCloudFromSavedConfig seeds the saved
+// "stubeditor" config with a :cloud model, has the server report cloud as
+// disabled, and launches with --model llama3.2. Afterwards the saved models,
+// the models handed to the editor, and the model that was run must all be
+// llama3.2 only.
+func TestLaunchCmdModelFlagFiltersDisabledCloudFromSavedConfig(t *testing.T) {
+	setLaunchTestHome(t, t.TempDir())
+
+	if err := config.SaveIntegration("stubeditor", []string{"glm-5:cloud"}); err != nil {
+		t.Fatalf("failed to seed saved config: %v", err)
+	}
+
+	// Canned responses; any other path is answered with 404.
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/status":
+			fmt.Fprint(w, `{"cloud":{"disabled":true,"source":"config"}}`)
+		case "/api/show":
+			fmt.Fprint(w, `{"model":"llama3.2"}`)
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}
+	server := httptest.NewServer(http.HandlerFunc(handler))
+	defer server.Close()
+	t.Setenv("OLLAMA_HOST", server.URL)
+
+	editor := &launcherEditorRunner{}
+	// OverrideIntegration runs now; the restore func it returns is deferred.
+	defer OverrideIntegration("stubeditor", editor)()
+
+	heartbeatOK := func(*cobra.Command, []string) error { return nil }
+	cmd := LaunchCmd(heartbeatOK, func(*cobra.Command) {})
+	cmd.SetArgs([]string{"stubeditor", "--model", "llama3.2"})
+	if err := cmd.Execute(); err != nil {
+		t.Fatalf("launch command failed: %v", err)
+	}
+
+	saved, err := config.LoadIntegration("stubeditor")
+	if err != nil {
+		t.Fatalf("failed to reload integration config: %v", err)
+	}
+
+	wantModels := []string{"llama3.2"}
+	if diff := cmp.Diff(wantModels, saved.Models); diff != "" {
+		t.Fatalf("saved models mismatch (-want +got):\n%s", diff)
+	}
+	if diff := cmp.Diff([][]string{wantModels}, editor.edited); diff != "" {
+		t.Fatalf("editor models mismatch (-want +got):\n%s", diff)
+	}
+	if editor.ranModel != "llama3.2" {
+		t.Fatalf("expected launch to run with llama3.2, got %q", editor.ranModel)
+	}
+}
+
+// TestLaunchCmdModelFlagClearsDisabledCloudOverride launches "stubapp" with
+// --model glm-5:cloud while the server reports cloud as disabled. It expects
+// a warning on stderr, exactly one call to the single-model selector with an
+// empty current value, the selector's choice (llama3.2) to be run, and that
+// choice to be persisted in the saved config.
+func TestLaunchCmdModelFlagClearsDisabledCloudOverride(t *testing.T) {
+	setLaunchTestHome(t, t.TempDir())
+
+	// Canned responses keyed by request path; unknown paths get a 404.
+	bodies := map[string]string{
+		"/api/status": `{"cloud":{"disabled":true,"source":"config"}}`,
+		"/api/tags":   `{"models":[{"name":"llama3.2"}]}`,
+		"/api/show":   `{"model":"llama3.2"}`,
+	}
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		body, known := bodies[r.URL.Path]
+		if !known {
+			w.WriteHeader(http.StatusNotFound)
+			return
+		}
+		fmt.Fprint(w, body)
+	}))
+	defer server.Close()
+	t.Setenv("OLLAMA_HOST", server.URL)
+
+	runner := &launcherSingleRunner{}
+	defer OverrideIntegration("stubapp", runner)()
+
+	previousSelector := DefaultSingleSelector
+	defer func() { DefaultSingleSelector = previousSelector }()
+
+	// Record how the selector is invoked and always pick the local model.
+	calls := 0
+	seenCurrent := ""
+	DefaultSingleSelector = func(title string, items []ModelItem, current string) (string, error) {
+		calls++
+		seenCurrent = current
+		return "llama3.2", nil
+	}
+
+	heartbeatOK := func(*cobra.Command, []string) error { return nil }
+	cmd := LaunchCmd(heartbeatOK, func(*cobra.Command) {})
+	cmd.SetArgs([]string{"stubapp", "--model", "glm-5:cloud"})
+	stderr := captureStderr(t, func() {
+		if err := cmd.Execute(); err != nil {
+			t.Fatalf("launch command failed: %v", err)
+		}
+	})
+
+	if calls != 1 {
+		t.Fatalf("expected disabled cloud override to fall back to selector, got %d calls", calls)
+	}
+	if seenCurrent != "" {
+		t.Fatalf("expected disabled override to be cleared before selection, got current %q", seenCurrent)
+	}
+	if runner.ranModel != "llama3.2" {
+		t.Fatalf("expected launch to run with replacement local model, got %q", runner.ranModel)
+	}
+	if !strings.Contains(stderr, "Warning: ignoring --model glm-5:cloud because cloud is disabled") {
+		t.Fatalf("expected disabled-cloud warning, got stderr: %q", stderr)
+	}
+
+	saved, err := config.LoadIntegration("stubapp")
+	if err != nil {
+		t.Fatalf("failed to reload integration config: %v", err)
+	}
+	if diff := cmp.Diff([]string{"llama3.2"}, saved.Models); diff != "" {
+		t.Fatalf("saved models mismatch (-want +got):\n%s", diff)
+	}
+}
+
+// TestLaunchCmdYes_AutoConfirmsLaunchPromptPath runs
+// "stubeditor --model llama3.2 --yes" and expects the editor to be edited
+// and run with llama3.2 without DefaultConfirmPrompt ever being called.
+func TestLaunchCmdYes_AutoConfirmsLaunchPromptPath(t *testing.T) {
+	setLaunchTestHome(t, t.TempDir())
+	// NOTE(review): presumably withLauncherHooks saves/restores the package
+	// hooks overridden below — confirm against the helper.
+	withLauncherHooks(t)
+	// NOTE(review): presumably marks the session as non-interactive — confirm.
+	withInteractiveSession(t, false)
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/api/show" {
+			fmt.Fprint(w, `{"model":"llama3.2"}`)
+			return
+		}
+		// Everything else is a 404; only /api/status carries an error body.
+		w.WriteHeader(http.StatusNotFound)
+		if r.URL.Path == "/api/status" {
+			fmt.Fprint(w, `{"error":"not found"}`)
+		}
+	}))
+	defer server.Close()
+	t.Setenv("OLLAMA_HOST", server.URL)
+
+	editor := &launcherEditorRunner{paths: []string{"/tmp/stubeditor.json"}}
+	defer OverrideIntegration("stubeditor", editor)()
+
+	// Any confirmation prompt is a failure when --yes is given.
+	DefaultConfirmPrompt = func(prompt string) (bool, error) {
+		t.Fatalf("unexpected prompt with --yes: %q", prompt)
+		return false, nil
+	}
+
+	heartbeatOK := func(*cobra.Command, []string) error { return nil }
+	cmd := LaunchCmd(heartbeatOK, func(*cobra.Command) {})
+	cmd.SetArgs([]string{"stubeditor", "--model", "llama3.2", "--yes"})
+	if err := cmd.Execute(); err != nil {
+		t.Fatalf("launch command with --yes failed: %v", err)
+	}
+
+	wantEdits := [][]string{{"llama3.2"}}
+	if diff := cmp.Diff(wantEdits, editor.edited); diff != "" {
+		t.Fatalf("editor models mismatch (-want +got):\n%s", diff)
+	}
+	if editor.ranModel != "llama3.2" {
+		t.Fatalf("expected launch to run with llama3.2, got %q", editor.ranModel)
+	}
+}
+
+// TestLaunchCmdHeadlessWithYes_AutoPullsMissingLocalModel runs
+// "stubapp --model missing-model --yes" against a server whose /api/show
+// answers 404. It expects /api/pull to be hit, no confirmation prompt, and
+// the launch to run with missing-model.
+func TestLaunchCmdHeadlessWithYes_AutoPullsMissingLocalModel(t *testing.T) {
+	setLaunchTestHome(t, t.TempDir())
+	withLauncherHooks(t)
+	// NOTE(review): presumably marks the session as non-interactive — confirm.
+	withInteractiveSession(t, false)
+
+	pulled := false
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/pull":
+			// Remember that a pull was requested and report success.
+			pulled = true
+			w.WriteHeader(http.StatusOK)
+			fmt.Fprint(w, `{"status":"success"}`)
+		case "/api/show":
+			w.WriteHeader(http.StatusNotFound)
+			fmt.Fprint(w, `{"error":"model not found"}`)
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}))
+	defer server.Close()
+	t.Setenv("OLLAMA_HOST", server.URL)
+
+	runner := &launcherSingleRunner{}
+	defer OverrideIntegration("stubapp", runner)()
+
+	DefaultConfirmPrompt = func(prompt string) (bool, error) {
+		t.Fatalf("unexpected prompt with --yes in headless autopull path: %q", prompt)
+		return false, nil
+	}
+
+	heartbeatOK := func(*cobra.Command, []string) error { return nil }
+	cmd := LaunchCmd(heartbeatOK, func(*cobra.Command) {})
+	cmd.SetArgs([]string{"stubapp", "--model", "missing-model", "--yes"})
+	if err := cmd.Execute(); err != nil {
+		t.Fatalf("launch command with --yes failed: %v", err)
+	}
+
+	if !pulled {
+		t.Fatal("expected missing local model to be auto-pulled with --yes in headless mode")
+	}
+	if got := runner.ranModel; got != "missing-model" {
+		t.Fatalf("expected launch to run with pulled model, got %q", got)
+	}
+}
+
+// TestLaunchCmdHeadlessWithoutYes_ReturnsActionableConfirmError runs
+// "stubeditor --model llama3.2" without --yes. It expects an error mentioning
+// "re-run with --yes", no confirmation prompt, no editor writes and no run.
+func TestLaunchCmdHeadlessWithoutYes_ReturnsActionableConfirmError(t *testing.T) {
+	setLaunchTestHome(t, t.TempDir())
+	withLauncherHooks(t)
+	// NOTE(review): presumably marks the session as non-interactive — confirm.
+	withInteractiveSession(t, false)
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/api/show" {
+			fmt.Fprint(w, `{"model":"llama3.2"}`)
+			return
+		}
+		// Everything else is a 404; only /api/status carries an error body.
+		w.WriteHeader(http.StatusNotFound)
+		if r.URL.Path == "/api/status" {
+			fmt.Fprint(w, `{"error":"not found"}`)
+		}
+	}))
+	defer server.Close()
+	t.Setenv("OLLAMA_HOST", server.URL)
+
+	editor := &launcherEditorRunner{paths: []string{"/tmp/stubeditor.json"}}
+	defer OverrideIntegration("stubeditor", editor)()
+
+	DefaultConfirmPrompt = func(prompt string) (bool, error) {
+		t.Fatalf("unexpected prompt in headless non-yes mode: %q", prompt)
+		return false, nil
+	}
+
+	heartbeatOK := func(*cobra.Command, []string) error { return nil }
+	cmd := LaunchCmd(heartbeatOK, func(*cobra.Command) {})
+	cmd.SetArgs([]string{"stubeditor", "--model", "llama3.2"})
+
+	err := cmd.Execute()
+	switch {
+	case err == nil:
+		t.Fatal("expected launch command to fail without --yes in headless mode")
+	case !strings.Contains(err.Error(), "re-run with --yes"):
+		t.Fatalf("expected actionable --yes guidance, got %v", err)
+	}
+
+	if len(editor.edited) != 0 {
+		t.Fatalf("expected no editor writes when confirmation is blocked, got %v", editor.edited)
+	}
+	if editor.ranModel != "" {
+		t.Fatalf("expected launch to abort before run, got %q", editor.ranModel)
+	}
+}
+
+// TestLaunchCmdIntegrationArgPromptsForModelWithSavedSelection seeds a saved
+// model (llama3.2) for "stubapp" and launches with only the integration name.
+// The selector must be offered the saved model as current, and the model it
+// returns (qwen3:8b) must be both run and persisted.
+func TestLaunchCmdIntegrationArgPromptsForModelWithSavedSelection(t *testing.T) {
+	setLaunchTestHome(t, t.TempDir())
+
+	if err := config.SaveIntegration("stubapp", []string{"llama3.2"}); err != nil {
+		t.Fatalf("failed to seed saved config: %v", err)
+	}
+
+	// Canned responses keyed by request path; unknown paths get a 404.
+	bodies := map[string]string{
+		"/api/tags": `{"models":[{"name":"llama3.2"},{"name":"qwen3:8b"}]}`,
+		"/api/show": `{"model":"qwen3:8b"}`,
+	}
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		body, known := bodies[r.URL.Path]
+		if !known {
+			w.WriteHeader(http.StatusNotFound)
+			return
+		}
+		fmt.Fprint(w, body)
+	}))
+	defer server.Close()
+	t.Setenv("OLLAMA_HOST", server.URL)
+
+	runner := &launcherSingleRunner{}
+	defer OverrideIntegration("stubapp", runner)()
+
+	previousSelector := DefaultSingleSelector
+	defer func() { DefaultSingleSelector = previousSelector }()
+
+	// Capture the "current" value the selector is offered, then pick qwen3:8b.
+	offered := ""
+	DefaultSingleSelector = func(title string, items []ModelItem, current string) (string, error) {
+		offered = current
+		return "qwen3:8b", nil
+	}
+
+	heartbeatOK := func(*cobra.Command, []string) error { return nil }
+	cmd := LaunchCmd(heartbeatOK, func(*cobra.Command) {})
+	cmd.SetArgs([]string{"stubapp"})
+	if err := cmd.Execute(); err != nil {
+		t.Fatalf("launch command failed: %v", err)
+	}
+
+	if offered != "llama3.2" {
+		t.Fatalf("expected selector current model to be saved model llama3.2, got %q", offered)
+	}
+	if runner.ranModel != "qwen3:8b" {
+		t.Fatalf("expected launch to run selected model qwen3:8b, got %q", runner.ranModel)
+	}
+
+	saved, err := config.LoadIntegration("stubapp")
+	if err != nil {
+		t.Fatalf("failed to reload integration config: %v", err)
+	}
+	if diff := cmp.Diff([]string{"qwen3:8b"}, saved.Models); diff != "" {
+		t.Fatalf("saved models mismatch (-want +got):\n%s", diff)
+	}
+}
+
+// TestLaunchCmdHeadlessYes_IntegrationRequiresModelEvenWhenSaved runs
+// "stubapp --yes" with a saved model already on disk. It expects an error
+// mentioning "requires --model <model>", no selector call and no run.
+func TestLaunchCmdHeadlessYes_IntegrationRequiresModelEvenWhenSaved(t *testing.T) {
+	setLaunchTestHome(t, t.TempDir())
+	withLauncherHooks(t)
+	// NOTE(review): presumably marks the session as non-interactive — confirm.
+	withInteractiveSession(t, false)
+
+	if err := config.SaveIntegration("stubapp", []string{"llama3.2"}); err != nil {
+		t.Fatalf("failed to seed saved config: %v", err)
+	}
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Only /api/show is served; every other path is a 404.
+		if r.URL.Path != "/api/show" {
+			w.WriteHeader(http.StatusNotFound)
+			return
+		}
+		fmt.Fprint(w, `{"model":"llama3.2"}`)
+	}))
+	defer server.Close()
+	t.Setenv("OLLAMA_HOST", server.URL)
+
+	runner := &launcherSingleRunner{}
+	defer OverrideIntegration("stubapp", runner)()
+
+	previousSelector := DefaultSingleSelector
+	defer func() { DefaultSingleSelector = previousSelector }()
+	DefaultSingleSelector = func(title string, items []ModelItem, current string) (string, error) {
+		t.Fatal("selector should not be called for headless --yes saved-model launch")
+		return "", nil
+	}
+
+	heartbeatOK := func(*cobra.Command, []string) error { return nil }
+	cmd := LaunchCmd(heartbeatOK, func(*cobra.Command) {})
+	cmd.SetArgs([]string{"stubapp", "--yes"})
+
+	err := cmd.Execute()
+	switch {
+	case err == nil:
+		t.Fatal("expected launch command to fail when --yes is used headlessly without --model")
+	case !strings.Contains(err.Error(), "requires --model <model>"):
+		t.Fatalf("expected actionable --model guidance, got %v", err)
+	}
+	if runner.ranModel != "" {
+		t.Fatalf("expected launch to abort before run, got %q", runner.ranModel)
+	}
+}
+
+// TestLaunchCmdHeadlessYes_IntegrationWithoutSavedModelReturnsError runs
+// "stubapp --yes" with no saved config and a server that answers 404 to
+// everything. It expects an error mentioning "requires --model <model>",
+// no selector call and no run.
+func TestLaunchCmdHeadlessYes_IntegrationWithoutSavedModelReturnsError(t *testing.T) {
+	setLaunchTestHome(t, t.TempDir())
+	withLauncherHooks(t)
+	// NOTE(review): presumably marks the session as non-interactive — confirm.
+	withInteractiveSession(t, false)
+
+	alwaysNotFound := func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusNotFound)
+	}
+	server := httptest.NewServer(http.HandlerFunc(alwaysNotFound))
+	defer server.Close()
+	t.Setenv("OLLAMA_HOST", server.URL)
+
+	runner := &launcherSingleRunner{}
+	defer OverrideIntegration("stubapp", runner)()
+
+	previousSelector := DefaultSingleSelector
+	defer func() { DefaultSingleSelector = previousSelector }()
+	DefaultSingleSelector = func(title string, items []ModelItem, current string) (string, error) {
+		t.Fatal("selector should not be called for headless --yes without saved model")
+		return "", nil
+	}
+
+	heartbeatOK := func(*cobra.Command, []string) error { return nil }
+	cmd := LaunchCmd(heartbeatOK, func(*cobra.Command) {})
+	cmd.SetArgs([]string{"stubapp", "--yes"})
+
+	err := cmd.Execute()
+	switch {
+	case err == nil:
+		t.Fatal("expected launch command to fail when --yes is used headlessly without --model")
+	case !strings.Contains(err.Error(), "requires --model <model>"):
+		t.Fatalf("expected actionable --model guidance, got %v", err)
+	}
+	if runner.ranModel != "" {
+		t.Fatalf("expected launch to abort before run, got %q", runner.ranModel)
+	}
+}
--- a/cmd/launch/droid.go
+++ b/cmd/launch/droid.go
@@ -1,16 +1,14 @@
-package config
+package launch

 import (
-	"context"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"slices"

-	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/cmd/internal/fileutil"
 	"github.com/ollama/ollama/envconfig"
 )

@@ -47,25 +45,6 @@ func (d *Droid) Run(model string, args []string) error {
 		return fmt.Errorf("droid is not installed, install from https://docs.factory.ai/cli/getting-started/quickstart")
 	}

-	// Call Edit() to ensure config is up-to-date before launch
-	models := []string{model}
-	if config, err := loadIntegration("droid"); err == nil && len(config.Models) > 0 {
-		models = config.Models
-	}
-	var err error
-	models, err = resolveEditorModels("droid", models, func() ([]string, error) {
-		return selectModels(context.Background(), "droid", "")
-	})
-	if errors.Is(err, errCancelled) {
-		return nil
-	}
-	if err != nil {
-		return err
-	}
-	if err := d.Edit(models); err != nil {
-		return fmt.Errorf("setup failed: %w", err)
-	}
-
 	cmd := exec.Command("droid", args...)
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
@@ -111,6 +90,16 @@ func (d *Droid) Edit(models []string) error {
 		json.Unmarshal(data, &settings) // ignore error, zero values are fine
 	}

+	settingsMap = updateDroidSettings(settingsMap, settings, models)
+
+	data, err := json.MarshalIndent(settingsMap, "", "  ")
+	if err != nil {
+		return err
+	}
+	return fileutil.WriteWithBackup(settingsPath, data)
+}
+
+func updateDroidSettings(settingsMap map[string]any, settings droidSettings, models []string) map[string]any {
 	// Keep only non-Ollama models from the raw map (preserves extra fields)
 	// Rebuild Ollama models
 	var nonOllamaModels []any
@@ -125,13 +114,12 @@ func (d *Droid) Edit(models []string) error {
 	}

 	// Build new Ollama model entries with sequential indices (0, 1, 2, ...)
-	client, _ := api.ClientFromEnvironment()

 	var newModels []any
 	var defaultModelID string
 	for i, model := range models {
 		maxOutput := 64000
-		if isCloudModel(context.Background(), client, model) {
+		if isCloudModelName(model) {
 			if l, ok := lookupCloudModelLimit(model); ok {
 				maxOutput = l.Output
 			}
@@ -167,12 +155,7 @@ func (d *Droid) Edit(models []string) error {
 	}

 	settingsMap["sessionDefaultSettings"] = sessionSettings
-
-	data, err := json.MarshalIndent(settingsMap, "", "  ")
-	if err != nil {
-		return err
-	}
-	return writeWithBackup(settingsPath, data)
+	return settingsMap
 }

 func (d *Droid) Models() []string {
--- a/cmd/launch/droid_test.go
+++ b/cmd/launch/droid_test.go
@@ -1,4 +1,4 @@
-package config
+package launch

 import (
 	"encoding/json"
@@ -6,6 +6,8 @@ import (
 	"os"
 	"path/filepath"
 	"testing"
+
+	"github.com/ollama/ollama/cmd/internal/fileutil"
 )

 func TestDroidIntegration(t *testing.T) {
@@ -362,7 +364,7 @@ func TestDroidEdit_DuplicateModels(t *testing.T) {
 		t.Fatalf("Edit with duplicates failed: %v", err)
 	}

-	settings, err := readJSONFile(settingsPath)
+	settings, err := fileutil.ReadJSON(settingsPath)
 	if err != nil {
 		t.Fatalf("readJSONFile failed: %v", err)
 	}
@@ -392,7 +394,7 @@ func TestDroidEdit_MalformedModelEntry(t *testing.T) {
 	}

 	// Malformed entries (non-object) are dropped - only valid model objects are preserved
-	settings, _ := readJSONFile(settingsPath)
+	settings, _ := fileutil.ReadJSON(settingsPath)
 	customModels, _ := settings["customModels"].([]any)

 	// Should have: 1 new Ollama model only (malformed entries dropped)
@@ -419,7 +421,7 @@ func TestDroidEdit_WrongTypeSessionSettings(t *testing.T) {
 	}

 	// Should create proper sessionDefaultSettings
-	settings, _ := readJSONFile(settingsPath)
+	settings, _ := fileutil.ReadJSON(settingsPath)
 	session, ok := settings["sessionDefaultSettings"].(map[string]any)
 	if !ok {
 		t.Fatalf("sessionDefaultSettings should be map after setup, got %T", settings["sessionDefaultSettings"])
@@ -1008,34 +1010,34 @@ func TestDroidEdit_ModelNamesWithSpecialCharacters(t *testing.T) {
 }

 func TestDroidEdit_MissingCustomModelsKey(t *testing.T) {
-	d := &Droid{}
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	settingsDir := filepath.Join(tmpDir, ".factory")
-	settingsPath := filepath.Join(settingsDir, "settings.json")
-
-	os.MkdirAll(settingsDir, 0o755)
-
 	// No customModels key at all
 	original := `{
 		"diffMode": "github",
 		"sessionDefaultSettings": {"autonomyMode": "auto-high"}
 	}`
-	os.WriteFile(settingsPath, []byte(original), 0o644)

-	if err := d.Edit([]string{"model-a"}); err != nil {
+	var settingsStruct droidSettings
+	var settings map[string]any
+	if err := json.Unmarshal([]byte(original), &settings); err != nil {
+		t.Fatal(err)
+	}
+	if err := json.Unmarshal([]byte(original), &settingsStruct); err != nil {
 		t.Fatal(err)
 	}

-	data, _ := os.ReadFile(settingsPath)
-	var settings map[string]any
-	json.Unmarshal(data, &settings)
+	settings = updateDroidSettings(settings, settingsStruct, []string{"model-a"})

 	// Original fields preserved
 	if settings["diffMode"] != "github" {
 		t.Error("diffMode not preserved")
 	}
+	session, ok := settings["sessionDefaultSettings"].(map[string]any)
+	if !ok {
+		t.Fatal("sessionDefaultSettings not preserved")
+	}
+	if session["autonomyMode"] != "auto-high" {
+		t.Error("sessionDefaultSettings.autonomyMode not preserved")
+	}

 	// customModels created
 	models, ok := settings["customModels"].([]any)
@@ -1276,25 +1278,17 @@ func TestDroidEdit_LocalModelDefaultMaxOutput(t *testing.T) {

 func TestDroidEdit_CloudModelLimitsUsed(t *testing.T) {
 	// Verify that every cloud model in cloudModelLimits has a valid output
-	// value that would be used for maxOutputTokens when isCloudModel returns true.
-	// :cloud suffix stripping must also work since that's how users specify them.
+	// value that would be used for maxOutputTokens when the selected model uses
+	// the explicit :cloud source tag.
 	for name, expected := range cloudModelLimits {
 		t.Run(name, func(t *testing.T) {
-			l, ok := lookupCloudModelLimit(name)
-			if !ok {
-				t.Fatalf("lookupCloudModelLimit(%q) returned false", name)
-			}
-			if l.Output != expected.Output {
-				t.Errorf("output = %d, want %d", l.Output, expected.Output)
-			}
-			// Also verify :cloud suffix lookup
 			cloudName := name + ":cloud"
-			l2, ok := lookupCloudModelLimit(cloudName)
+			l, ok := lookupCloudModelLimit(cloudName)
 			if !ok {
 				t.Fatalf("lookupCloudModelLimit(%q) returned false", cloudName)
 			}
-			if l2.Output != expected.Output {
-				t.Errorf(":cloud output = %d, want %d", l2.Output, expected.Output)
+			if l.Output != expected.Output {
+				t.Errorf("output = %d, want %d", l.Output, expected.Output)
 			}
 		})
 	}
--- a/cmd/launch/integrations_test.go
+++ b/cmd/launch/integrations_test.go
--- a/cmd/launch/launch.go
+++ b/cmd/launch/launch.go
@@ -0,0 +1,840 @@
+package launch
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"os"
+	"strings"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/cmd/config"
+	"github.com/spf13/cobra"
+	"golang.org/x/term"
+)
+
+// LauncherState is the launch-owned snapshot used to render the root launcher menu.
+//
+// buildLauncherState fills LastSelection from config.LastSelection and RunModel
+// from config.LastModel. RunModelUsable reports whether RunModel passed the
+// savedModelUsable check (it is false when that check returns an error).
+// Integrations is keyed by integration name.
+type LauncherState struct {
+	LastSelection  string
+	RunModel       string
+	RunModelUsable bool
+	Integrations   map[string]LauncherIntegrationState
+}
+
+// LauncherIntegrationState is the launch-owned status for one launcher integration.
+//
+// As populated by buildLauncherIntegrationState: Selectable and Changeable are
+// both true when the integration is installed or can be auto-installed.
+// CurrentModel is a saved model for the integration (empty when nothing is
+// saved) and ModelUsable reports whether that model is currently usable.
+// Editor mirrors the integration's editor flag.
+type LauncherIntegrationState struct {
+	Name            string
+	DisplayName     string
+	Description     string
+	Installed       bool
+	AutoInstallable bool
+	Selectable      bool
+	Changeable      bool
+	CurrentModel    string
+	ModelUsable     bool
+	InstallHint     string
+	Editor          bool
+}
+
+// RunModelRequest controls how the root launcher resolves the chat model.
+//
+// ForcePicker skips reuse of the last saved model and goes straight to the
+// model selector. Policy is optional; when nil, ResolveRunModel derives a
+// policy from the session and the ambient confirm policy.
+type RunModelRequest struct {
+	ForcePicker bool
+	Policy      *LaunchPolicy
+}
+
+// LaunchConfirmMode controls confirmation behavior across launch flows.
+// The zero value is LaunchConfirmPrompt.
+type LaunchConfirmMode int
+
+const (
+	// LaunchConfirmPrompt prompts the user for confirmation.
+	LaunchConfirmPrompt LaunchConfirmMode = iota
+	// LaunchConfirmAutoApprove skips prompts and treats confirmation as accepted.
+	LaunchConfirmAutoApprove
+	// LaunchConfirmRequireYes rejects confirmation requests with a --yes hint.
+	LaunchConfirmRequireYes
+)
+
+// LaunchMissingModelMode controls local missing-model handling in launch flows.
+// The zero value is LaunchMissingModelPromptToPull.
+type LaunchMissingModelMode int
+
+const (
+	// LaunchMissingModelPromptToPull prompts to pull a missing local model.
+	LaunchMissingModelPromptToPull LaunchMissingModelMode = iota
+	// LaunchMissingModelAutoPull pulls a missing local model without prompting.
+	LaunchMissingModelAutoPull
+	// LaunchMissingModelFail fails immediately when a local model is missing.
+	LaunchMissingModelFail
+)
+
+// LaunchPolicy controls launch behavior that may vary by caller context.
+//
+// The zero value prompts for confirmation and prompts before pulling a missing
+// model, because both mode types start at their "prompt" constant.
+type LaunchPolicy struct {
+	Confirm      LaunchConfirmMode
+	MissingModel LaunchMissingModelMode
+}
+
+// defaultLaunchPolicy derives the launch policy from the session kind and the
+// --yes flag. The yes flag takes precedence over the interactive check.
+func defaultLaunchPolicy(interactive bool, yes bool) LaunchPolicy {
+	if yes {
+		// if yes flag is set, auto approve and auto pull
+		return LaunchPolicy{
+			Confirm:      LaunchConfirmAutoApprove,
+			MissingModel: LaunchMissingModelAutoPull,
+		}
+	}
+	if !interactive {
+		// otherwise make sure to stop when needed
+		return LaunchPolicy{
+			Confirm:      LaunchConfirmRequireYes,
+			MissingModel: LaunchMissingModelFail,
+		}
+	}
+	// Interactive session without --yes: ask the user.
+	return LaunchPolicy{
+		Confirm:      LaunchConfirmPrompt,
+		MissingModel: LaunchMissingModelPromptToPull,
+	}
+}
+
+func (p LaunchPolicy) confirmPolicy() launchConfirmPolicy {
+	switch p.Confirm {
+	case LaunchConfirmAutoApprove:
+		return launchConfirmPolicy{yes: true}
+	case LaunchConfirmRequireYes:
+		return launchConfirmPolicy{requireYesMessage: true}
+	default:
+		return launchConfirmPolicy{}
+	}
+}
+
+// missingModelPolicy translates the exported missing-model mode into the
+// package-internal missingModelPolicy. Unknown modes fall back to prompting.
+func (p LaunchPolicy) missingModelPolicy() missingModelPolicy {
+	if p.MissingModel == LaunchMissingModelAutoPull {
+		return missingModelAutoPull
+	}
+	if p.MissingModel == LaunchMissingModelFail {
+		return missingModelFail
+	}
+	return missingModelPromptPull
+}
+
+// IntegrationLaunchRequest controls the canonical integration launcher flow.
+//
+// Name is resolved through LookupIntegration. ModelOverride, when non-empty,
+// is used instead of the saved model. ForceConfigure sends the flow through
+// model selection even if a usable model is saved. ConfigureOnly skips the
+// up-front install check and asks before launching once configuration is done.
+// ExtraArgs are handed to the runner unchanged. Policy is optional; when nil,
+// LaunchIntegration falls back to the session default.
+type IntegrationLaunchRequest struct {
+	Name           string
+	ModelOverride  string
+	ForceConfigure bool
+	ConfigureOnly  bool
+	ExtraArgs      []string
+	Policy         *LaunchPolicy
+}
+
+// isInteractiveSession reports whether both stdin and stdout are terminals.
+// NOTE(review): declared as a variable rather than a func, presumably so tests
+// can replace it — confirm against the test files.
+var isInteractiveSession = func() bool {
+	return term.IsTerminal(int(os.Stdin.Fd())) && term.IsTerminal(int(os.Stdout.Fd()))
+}
+
+// Runner executes a model with an integration.
+//
+// Run receives the selected model name and the extra arguments passed through
+// from the launch request. String supplies the name used in prompts and status
+// messages (runners are formatted with %s).
+type Runner interface {
+	Run(model string, args []string) error
+	String() string
+}
+
+// Editor can edit config files for integrations that support model configuration.
+//
+// LaunchIntegration routes any Runner that also implements Editor through the
+// multi-model flow. Edit receives the selected models.
+// NOTE(review): Paths and Models presumably report the config file locations
+// and the currently configured models — confirm against the implementations.
+type Editor interface {
+	Paths() []string
+	Edit(models []string) error
+	Models() []string
+}
+
+// modelInfo is one entry of the launcher's model inventory.
+//
+// loadModelInventoryOnce sets Name from the listed model name and Remote when
+// the listed model has a non-empty RemoteModel; it does not set ToolCapable.
+type modelInfo struct {
+	Name        string
+	Remote      bool
+	ToolCapable bool
+}
+
+// ModelInfo re-exports launcher model inventory details for callers.
+type ModelInfo = modelInfo
+
+// ModelItem represents a model for selection UIs.
+//
+// Name is the model name handed back by a selector, Description is the
+// accompanying text, and Recommended marks entries that come from the
+// recommended list.
+type ModelItem struct {
+	Name        string
+	Description string
+	Recommended bool
+}
+
+// LaunchCmd returns the cobra command for launching integrations.
+// The runTUI callback is called when the root launcher UI should be shown.
+func LaunchCmd(checkServerHeartbeat func(cmd *cobra.Command, args []string) error, runTUI func(cmd *cobra.Command)) *cobra.Command {
+	var modelFlag string
+	var configFlag bool
+	var yesFlag bool
+
+	cmd := &cobra.Command{
+		Use:   "launch [INTEGRATION] [-- [EXTRA_ARGS...]]",
+		Short: "Launch the Ollama menu or an integration",
+		Long: `Launch the Ollama interactive menu, or directly launch a specific integration.
+
+Without arguments, this is equivalent to running 'ollama' directly.
+Flags and extra arguments require an integration name.
+
+Supported integrations:
+  claude    Claude Code
+  cline     Cline
+  codex     Codex
+  droid     Droid
+  opencode  OpenCode
+  openclaw  OpenClaw (aliases: clawdbot, moltbot)
+  pi        Pi
+
+Examples:
+  ollama launch
+  ollama launch claude
+  ollama launch claude --model <model>
+  ollama launch droid --config (does not auto-launch)
+  ollama launch codex -- -p myprofile (pass extra args to integration)
+  ollama launch codex -- --sandbox workspace-write`,
+		Args:    cobra.ArbitraryArgs,
+		PreRunE: checkServerHeartbeat,
+		RunE: func(cmd *cobra.Command, args []string) error {
+			policy := defaultLaunchPolicy(isInteractiveSession(), yesFlag)
+			// reset when done to make sure state doesn't leak between launches
+			restoreConfirmPolicy := withLaunchConfirmPolicy(policy.confirmPolicy())
+			defer restoreConfirmPolicy()
+
+			var name string
+			var passArgs []string
+			dashIdx := cmd.ArgsLenAtDash()
+
+			if dashIdx == -1 {
+				if len(args) > 1 {
+					return fmt.Errorf("unexpected arguments: %v\nUse '--' to pass extra arguments to the integration", args[1:])
+				}
+				if len(args) == 1 {
+					name = args[0]
+				}
+			} else {
+				if dashIdx > 1 {
+					return fmt.Errorf("expected at most 1 integration name before '--', got %d", dashIdx)
+				}
+				if dashIdx == 1 {
+					name = args[0]
+				}
+				passArgs = args[dashIdx:]
+			}
+
+			if name == "" {
+				if cmd.Flags().Changed("model") || cmd.Flags().Changed("config") || cmd.Flags().Changed("yes") || len(passArgs) > 0 {
+					return fmt.Errorf("flags and extra args require an integration name, for example: 'ollama launch claude --model qwen3.5'")
+				}
+				runTUI(cmd)
+				return nil
+			}
+
+			if modelFlag != "" && isCloudModelName(modelFlag) {
+				if client, err := api.ClientFromEnvironment(); err == nil {
+					if disabled, _ := cloudStatusDisabled(cmd.Context(), client); disabled {
+						fmt.Fprintf(os.Stderr, "Warning: ignoring --model %s because cloud is disabled\n", modelFlag)
+						modelFlag = ""
+					}
+				}
+			}
+
+			headlessYes := yesFlag && !isInteractiveSession()
+			err := LaunchIntegration(cmd.Context(), IntegrationLaunchRequest{
+				Name:           name,
+				ModelOverride:  modelFlag,
+				ForceConfigure: configFlag || (modelFlag == "" && !headlessYes),
+				ConfigureOnly:  configFlag,
+				ExtraArgs:      passArgs,
+				Policy:         &policy,
+			})
+			if errors.Is(err, ErrCancelled) {
+				return nil
+			}
+			return err
+		},
+	}
+
+	cmd.Flags().StringVar(&modelFlag, "model", "", "Model to use")
+	cmd.Flags().BoolVar(&configFlag, "config", false, "Configure without launching")
+	cmd.Flags().BoolVarP(&yesFlag, "yes", "y", false, "Automatically answer yes to confirmation prompts")
+	return cmd
+}
+
+// launcherClient bundles the API client with the launch policy and a lazily
+// loaded model inventory. inventoryLoaded is set once modelInventory has been
+// populated successfully, so later calls reuse the cached list.
+type launcherClient struct {
+	apiClient       *api.Client
+	modelInventory  []ModelInfo
+	inventoryLoaded bool
+	policy          LaunchPolicy
+}
+
+func newLauncherClient(policy LaunchPolicy) (*launcherClient, error) {
+	apiClient, err := api.ClientFromEnvironment()
+	if err != nil {
+		return nil, err
+	}
+
+	return &launcherClient{
+		apiClient: apiClient,
+		policy:    policy,
+	}, nil
+}
+
+// BuildLauncherState returns the launch-owned root launcher menu snapshot.
+// It uses the session default policy with the yes flag off.
+func BuildLauncherState(ctx context.Context) (*LauncherState, error) {
+	policy := defaultLaunchPolicy(isInteractiveSession(), false)
+
+	client, err := newLauncherClient(policy)
+	if err != nil {
+		return nil, err
+	}
+	return client.buildLauncherState(ctx)
+}
+
+// ResolveRunModel returns the model that should be used for interactive chat.
+//
+// It is called by the launcher TUI "Run a model" action (cmd/runLauncherAction),
+// which resolves models separately from LaunchIntegration. An explicit
+// req.Policy wins; without one the ambient --yes/session defaults apply.
+func ResolveRunModel(ctx context.Context, req RunModelRequest) (string, error) {
+	effective := defaultLaunchPolicy(isInteractiveSession(), currentLaunchConfirmPolicy.yes)
+	if override := req.Policy; override != nil {
+		effective = *override
+	}
+
+	client, err := newLauncherClient(effective)
+	if err != nil {
+		return "", err
+	}
+	return client.resolveRunModel(ctx, req)
+}
+
+// LaunchIntegration runs the canonical launcher flow for one integration.
+//
+// The request is validated before the install check: a headless --yes launch
+// without an explicit model is rejected right after the integration name has
+// been resolved, so an invocation that cannot succeed does not go through
+// EnsureIntegrationInstalled first. Runners that implement Editor go through
+// the multi-model flow; all others use the single-model flow.
+//
+// It returns the lookup, validation, install, or client-construction error, or
+// whatever the selected launch flow returns.
+func LaunchIntegration(ctx context.Context, req IntegrationLaunchRequest) error {
+	name, runner, err := LookupIntegration(req.Name)
+	if err != nil {
+		return err
+	}
+
+	var policy LaunchPolicy
+	// TUI does not set a policy, whereas ollama launch <app> does as it can have flags which change the behavior
+	if req.Policy == nil {
+		policy = defaultLaunchPolicy(isInteractiveSession(), false)
+	} else {
+		policy = *req.Policy
+	}
+
+	// In headless --yes mode we cannot prompt, so require an explicit --model.
+	if policy.Confirm == LaunchConfirmAutoApprove && !isInteractiveSession() && req.ModelOverride == "" {
+		return fmt.Errorf("headless --yes launch for %s requires --model <model>", name)
+	}
+
+	if !req.ConfigureOnly {
+		if err := EnsureIntegrationInstalled(name, runner); err != nil {
+			return err
+		}
+	}
+
+	launchClient, err := newLauncherClient(policy)
+	if err != nil {
+		return err
+	}
+	// Best-effort: a load error leaves saved nil, which the launch flows treat
+	// as "nothing saved yet".
+	saved, _ := loadStoredIntegrationConfig(name)
+
+	if editor, ok := runner.(Editor); ok {
+		return launchClient.launchEditorIntegration(ctx, name, runner, editor, saved, req)
+	}
+	return launchClient.launchSingleIntegration(ctx, name, runner, saved, req)
+}
+
+// buildLauncherState assembles the menu snapshot: last selection, last run
+// model with its usability, and one state entry per listed integration.
+// An inventory load failure is tolerated here; a failure while building an
+// integration's state aborts the whole snapshot.
+func (c *launcherClient) buildLauncherState(ctx context.Context) (*LauncherState, error) {
+	_ = c.loadModelInventoryOnce(ctx)
+
+	snapshot := &LauncherState{
+		LastSelection: config.LastSelection(),
+		RunModel:      config.LastModel(),
+		Integrations:  make(map[string]LauncherIntegrationState),
+	}
+
+	// A usability check that errors counts as "not usable".
+	usable, err := c.savedModelUsable(ctx, snapshot.RunModel)
+	snapshot.RunModelUsable = err == nil && usable
+
+	for _, info := range ListIntegrationInfos() {
+		entry, err := c.buildLauncherIntegrationState(ctx, info)
+		if err != nil {
+			return nil, err
+		}
+		snapshot.Integrations[info.Name] = entry
+	}
+	return snapshot, nil
+}
+
+func (c *launcherClient) buildLauncherIntegrationState(ctx context.Context, info IntegrationInfo) (LauncherIntegrationState, error) {
+	integration, err := integrationFor(info.Name)
+	if err != nil {
+		return LauncherIntegrationState{}, err
+	}
+	currentModel, usable, err := c.launcherModelState(ctx, info.Name, integration.editor)
+	if err != nil {
+		return LauncherIntegrationState{}, err
+	}
+
+	return LauncherIntegrationState{
+		Name:            info.Name,
+		DisplayName:     info.DisplayName,
+		Description:     info.Description,
+		Installed:       integration.installed,
+		AutoInstallable: integration.autoInstallable,
+		Selectable:      integration.installed || integration.autoInstallable,
+		Changeable:      integration.installed || integration.autoInstallable,
+		CurrentModel:    currentModel,
+		ModelUsable:     usable,
+		InstallHint:     integration.installHint,
+		Editor:          integration.editor,
+	}, nil
+}
+
+// launcherModelState reports the model to show for an integration and whether
+// it is usable. With no readable saved config, or none with models, it returns
+// an empty, unusable result. The error result is always nil.
+func (c *launcherClient) launcherModelState(ctx context.Context, name string, isEditor bool) (string, bool, error) {
+	cfg, err := loadStoredIntegrationConfig(name)
+	if err != nil || len(cfg.Models) == 0 {
+		return "", false, nil
+	}
+
+	if !isEditor {
+		primary := cfg.Models[0]
+		usable, usableErr := c.savedModelUsable(ctx, primary)
+		return primary, usableErr == nil && usable, nil
+	}
+
+	// Editors keep several models: show the first one that survives the cloud
+	// filter, otherwise the first saved model marked as unusable.
+	if remaining := c.filterDisabledCloudModels(ctx, cfg.Models); len(remaining) > 0 {
+		return remaining[0], true, nil
+	}
+	return cfg.Models[0], false, nil
+}
+
+// resolveRunModel picks the chat model: the last used model when it can be
+// reused, otherwise whatever the user selects. A newly selected model is
+// stored as the last model.
+func (c *launcherClient) resolveRunModel(ctx context.Context, req RunModelRequest) (string, error) {
+	current := config.LastModel()
+
+	if !req.ForcePicker {
+		// Headless auto-approve: reuse the last model without consulting the inventory.
+		if current != "" && c.policy.Confirm == LaunchConfirmAutoApprove && !isInteractiveSession() {
+			if err := c.ensureModelsReady(ctx, []string{current}); err != nil {
+				return "", err
+			}
+			fmt.Fprintf(os.Stderr, "Headless mode: auto-selected last used model %q\n", current)
+			return current, nil
+		}
+
+		usable, err := c.savedModelUsable(ctx, current)
+		if err != nil {
+			return "", err
+		}
+		if usable {
+			if err := c.ensureModelsReady(ctx, []string{current}); err != nil {
+				return "", err
+			}
+			return current, nil
+		}
+	}
+
+	chosen, err := c.selectSingleModelWithSelector(ctx, "Select model to run:", current, DefaultSingleSelector)
+	if err != nil {
+		return "", err
+	}
+	if chosen == current {
+		return chosen, nil
+	}
+	if err := config.SetLastModel(chosen); err != nil {
+		return "", err
+	}
+	return chosen, nil
+}
+
+// launchSingleIntegration resolves one model for a non-editor integration,
+// saves it when it differs from the stored primary model, and then launches.
+// An empty result from selection ends the flow without error.
+func (c *launcherClient) launchSingleIntegration(ctx context.Context, name string, runner Runner, saved *config.IntegrationConfig, req IntegrationLaunchRequest) error {
+	current := primaryModelFromConfig(saved)
+	target, needsConfigure := req.ModelOverride, req.ForceConfigure
+
+	if target == "" {
+		// No override: fall back to the saved model and configure if it is unusable.
+		target = current
+		usable, err := c.savedModelUsable(ctx, target)
+		if err != nil {
+			return err
+		}
+		needsConfigure = needsConfigure || !usable
+	}
+
+	if !needsConfigure {
+		if err := c.ensureModelsReady(ctx, []string{target}); err != nil {
+			return err
+		}
+	} else {
+		selected, err := c.selectSingleModelWithSelector(ctx, fmt.Sprintf("Select model for %s:", runner), target, DefaultSingleSelector)
+		if err != nil {
+			return err
+		}
+		target = selected
+	}
+
+	if target == "" {
+		return nil
+	}
+
+	if target != current {
+		if err := config.SaveIntegration(name, []string{target}); err != nil {
+			return fmt.Errorf("failed to save: %w", err)
+		}
+	}
+
+	return launchAfterConfiguration(name, runner, target, req)
+}
+
+// launchEditorIntegration resolves the model list for an editor integration,
+// applies it when it was (re)configured or overridden, and launches with the
+// first model. An empty model list ends the flow without error.
+func (c *launcherClient) launchEditorIntegration(ctx context.Context, name string, runner Runner, editor Editor, saved *config.IntegrationConfig, req IntegrationLaunchRequest) error {
+	models, needsConfigure := c.resolveEditorLaunchModels(ctx, saved, req)
+
+	if !needsConfigure {
+		if err := c.ensureModelsReady(ctx, models); err != nil {
+			return err
+		}
+	} else {
+		picked, err := c.selectMultiModelsForIntegration(ctx, runner, models)
+		if err != nil {
+			return err
+		}
+		models = picked
+	}
+
+	if len(models) == 0 {
+		return nil
+	}
+
+	// Only rewrite the editor's config when the model set may have changed.
+	if applyConfig := needsConfigure || req.ModelOverride != ""; applyConfig {
+		if err := prepareEditorIntegration(name, runner, editor, models); err != nil {
+			return err
+		}
+	}
+
+	return launchAfterConfiguration(name, runner, models[0], req)
+}
+
+// selectSingleModelWithSelector shows the selectable models through selector
+// and makes sure the chosen model is ready before returning it.
+func (c *launcherClient) selectSingleModelWithSelector(ctx context.Context, title, current string, selector SingleSelector) (string, error) {
+	if selector == nil {
+		return "", fmt.Errorf("no selector configured")
+	}
+
+	choices, _, err := c.loadSelectableModels(ctx, nil, current, "no models available, run 'ollama pull <model>' first")
+	if err != nil {
+		return "", err
+	}
+
+	picked, err := selector(title, choices, current)
+	if err != nil {
+		return "", err
+	}
+
+	if readyErr := c.ensureModelsReady(ctx, []string{picked}); readyErr != nil {
+		return "", readyErr
+	}
+	return picked, nil
+}
+
+func (c *launcherClient) selectMultiModelsForIntegration(ctx context.Context, runner Runner, preChecked []string) ([]string, error) {
+	if DefaultMultiSelector == nil {
+		return nil, fmt.Errorf("no selector configured")
+	}
+
+	current := firstModel(preChecked)
+
+	items, orderedChecked, err := c.loadSelectableModels(ctx, preChecked, current, "no models available")
+	if err != nil {
+		return nil, err
+	}
+	if len(preChecked) > 0 {
+		// Keep list order stable in multi-select even when there are existing checks.
+		// checked/default state still comes from orderedChecked.
+		stableItems, _, stableErr := c.loadSelectableModels(ctx, nil, current, "no models available")
+		if stableErr != nil {
+			return nil, stableErr
+		}
+		items = stableItems
+	}
+
+	selected, err := DefaultMultiSelector(fmt.Sprintf("Select models for %s:", runner), items, orderedChecked)
+	if err != nil {
+		return nil, err
+	}
+	if err := c.ensureModelsReady(ctx, selected); err != nil {
+		return nil, err
+	}
+	return selected, nil
+}
+
+// loadSelectableModels returns the picker items and the ordered pre-checked
+// model names for the given selection context.
+//
+// The cloud status is looked up once and that single answer is applied to both
+// the items and the checked list, so the two can never disagree about whether
+// cloud models are hidden. When cloud is disabled, cloud entries are removed
+// from both. A status lookup error is ignored and treated as "not disabled".
+//
+// It returns the inventory load error, or an error carrying emptyMessage when
+// no items remain.
+func (c *launcherClient) loadSelectableModels(ctx context.Context, preChecked []string, current, emptyMessage string) ([]ModelItem, []string, error) {
+	if err := c.loadModelInventoryOnce(ctx); err != nil {
+		return nil, nil, err
+	}
+
+	cloudDisabled, _ := cloudStatusDisabled(ctx, c.apiClient)
+	items, orderedChecked, _, _ := buildModelList(c.modelInventory, preChecked, current)
+	if cloudDisabled {
+		items = filterCloudItems(items)
+
+		// Filter with the status already in hand instead of asking again.
+		localChecked := make([]string, 0, len(orderedChecked))
+		for _, model := range orderedChecked {
+			if !isCloudModelName(model) {
+				localChecked = append(localChecked, model)
+			}
+		}
+		orderedChecked = localChecked
+	}
+	if len(items) == 0 {
+		return nil, nil, errors.New(emptyMessage)
+	}
+	return items, orderedChecked, nil
+}
+
+// ensureModelsReady shows or pulls each distinct, non-empty model according to
+// the missing-model policy and then runs the auth check for the cloud models
+// among them. It stops at the first model that fails.
+func (c *launcherClient) ensureModelsReady(ctx context.Context, models []string) error {
+	// Drop empty names and repeats while keeping first-seen order.
+	visited := make(map[string]bool, len(models))
+	var unique []string
+	for _, name := range models {
+		if name != "" && !visited[name] {
+			visited[name] = true
+			unique = append(unique, name)
+		}
+	}
+	if len(unique) == 0 {
+		return nil
+	}
+
+	cloudModels := make(map[string]bool, len(unique))
+	for _, name := range unique {
+		cloud := isCloudModelName(name)
+		if cloud {
+			cloudModels[name] = true
+		}
+		if err := showOrPullWithPolicy(ctx, c.apiClient, name, c.policy.missingModelPolicy(), cloud); err != nil {
+			return err
+		}
+	}
+	return ensureAuth(ctx, c.apiClient, cloudModels, unique)
+}
+
+// resolveEditorLaunchModels returns the models an editor integration should
+// start from and whether the picker must be shown. The picker is required when
+// configuration is forced, when nothing is saved, or when the cloud filter
+// leaves no model.
+func (c *launcherClient) resolveEditorLaunchModels(ctx context.Context, saved *config.IntegrationConfig, req IntegrationLaunchRequest) ([]string, bool) {
+	switch {
+	case req.ForceConfigure:
+		return editorPreCheckedModels(saved, req.ModelOverride), true
+
+	case req.ModelOverride != "":
+		// The override leads, followed by the other saved models.
+		candidates := append([]string{req.ModelOverride}, additionalSavedModels(saved, req.ModelOverride)...)
+		usable := c.filterDisabledCloudModels(ctx, candidates)
+		return usable, len(usable) == 0
+
+	case saved == nil || len(saved.Models) == 0:
+		return nil, true
+	}
+
+	usable := c.filterDisabledCloudModels(ctx, saved.Models)
+	return usable, len(usable) == 0
+}
+
+// filterDisabledCloudModels returns a copy of models, without cloud models
+// when cloud is disabled. The input slice is never modified.
+func (c *launcherClient) filterDisabledCloudModels(ctx context.Context, models []string) []string {
+	// if connection cannot be established or there is a 404, cloud models will continue to be displayed
+	if disabled, _ := cloudStatusDisabled(ctx, c.apiClient); !disabled {
+		return append([]string(nil), models...)
+	}
+
+	kept := make([]string, 0, len(models))
+	for _, candidate := range models {
+		if isCloudModelName(candidate) {
+			continue
+		}
+		kept = append(kept, candidate)
+	}
+	return kept
+}
+
+// savedModelUsable reports whether a saved model can be used. It answers from
+// the cached inventory and falls back to a per-model show request when the
+// inventory cannot be loaded.
+func (c *launcherClient) savedModelUsable(ctx context.Context, name string) (bool, error) {
+	inventoryErr := c.loadModelInventoryOnce(ctx)
+	if inventoryErr == nil {
+		return c.singleModelUsable(ctx, name), nil
+	}
+	return c.showBasedModelUsable(ctx, name)
+}
+
+// showBasedModelUsable checks usability with a show request. An empty name and
+// a 404 response both mean "not usable" without an error; any other request
+// failure is returned. Cloud and remote models are usable unless cloud is
+// disabled.
+func (c *launcherClient) showBasedModelUsable(ctx context.Context, name string) (bool, error) {
+	if name == "" {
+		return false, nil
+	}
+
+	info, err := c.apiClient.Show(ctx, &api.ShowRequest{Model: name})
+	if err != nil {
+		var statusErr api.StatusError
+		notFound := errors.As(err, &statusErr) && statusErr.StatusCode == http.StatusNotFound
+		if notFound {
+			return false, nil
+		}
+		return false, err
+	}
+
+	if !isCloudModelName(name) && info.RemoteModel == "" {
+		return true, nil
+	}
+
+	disabled, _ := cloudStatusDisabled(ctx, c.apiClient)
+	return !disabled, nil
+}
+
+// singleModelUsable answers from the cached inventory: cloud model names are
+// usable unless cloud is disabled, other names must match a local model.
+func (c *launcherClient) singleModelUsable(ctx context.Context, name string) bool {
+	switch {
+	case name == "":
+		return false
+	case isCloudModelName(name):
+		disabled, _ := cloudStatusDisabled(ctx, c.apiClient)
+		return !disabled
+	default:
+		return c.hasLocalModel(name)
+	}
+}
+
+// hasLocalModel reports whether the cached inventory holds a non-remote model
+// whose name equals name or starts with name followed by ":".
+func (c *launcherClient) hasLocalModel(name string) bool {
+	tagPrefix := name + ":"
+	for _, entry := range c.modelInventory {
+		if !entry.Remote && (entry.Name == name || strings.HasPrefix(entry.Name, tagPrefix)) {
+			return true
+		}
+	}
+	return false
+}
+
+// loadModelInventoryOnce fills the cached model inventory from the list API on
+// first use and is a no-op afterwards. A failed list request is returned and
+// leaves the cache unloaded, so a later call tries again. Cloud entries are
+// filtered out when cloud is disabled.
+func (c *launcherClient) loadModelInventoryOnce(ctx context.Context) error {
+	if c.inventoryLoaded {
+		return nil
+	}
+
+	resp, err := c.apiClient.List(ctx)
+	if err != nil {
+		return err
+	}
+
+	// Reuse the existing backing array for the fresh list.
+	inventory := c.modelInventory[:0]
+	for _, listed := range resp.Models {
+		inventory = append(inventory, ModelInfo{
+			Name:   listed.Name,
+			Remote: listed.RemoteModel != "",
+		})
+	}
+
+	if disabled, _ := cloudStatusDisabled(ctx, c.apiClient); disabled {
+		inventory = filterCloudModels(inventory)
+	}
+
+	c.modelInventory = inventory
+	c.inventoryLoaded = true
+	return nil
+}
+
+// runIntegration prints a launch notice to stderr and hands control to the
+// runner, returning whatever error the runner reports.
+func runIntegration(runner Runner, modelName string, args []string) error {
+	fmt.Fprintf(os.Stderr, "\nLaunching %s with %s...\n", runner, modelName)
+	return runner.Run(modelName, args)
+}
+
+// launchAfterConfiguration starts the integration once configuration is done.
+// In configure-only mode it first asks whether to launch; a "no" ends the flow
+// without error. The install check runs right before the launch.
+func launchAfterConfiguration(name string, runner Runner, model string, req IntegrationLaunchRequest) error {
+	if req.ConfigureOnly {
+		if launch, err := ConfirmPrompt(fmt.Sprintf("Launch %s now?", runner)); err != nil {
+			return err
+		} else if !launch {
+			return nil
+		}
+	}
+
+	if installErr := EnsureIntegrationInstalled(name, runner); installErr != nil {
+		return installErr
+	}
+	return runIntegration(runner, model, req.ExtraArgs)
+}
+
+// loadStoredIntegrationConfig loads the saved config for an integration. When
+// none exists under the given name it tries the integration's aliases in
+// order; the first alias with a config is migrated to the canonical name and
+// returned. If nothing is found, the original not-exist error is returned.
+func loadStoredIntegrationConfig(name string) (*config.IntegrationConfig, error) {
+	cfg, err := config.LoadIntegration(name)
+	switch {
+	case err == nil:
+		return cfg, nil
+	case !errors.Is(err, os.ErrNotExist):
+		return nil, err
+	}
+
+	spec, specErr := LookupIntegrationSpec(name)
+	if specErr != nil {
+		return nil, err
+	}
+
+	for _, alias := range spec.Aliases {
+		legacy, legacyErr := config.LoadIntegration(alias)
+		if legacyErr != nil {
+			if errors.Is(legacyErr, os.ErrNotExist) {
+				continue
+			}
+			return nil, legacyErr
+		}
+
+		migrateLegacyIntegrationConfig(spec.Name, legacy)
+		// Prefer the freshly migrated copy; fall back to the legacy one if it
+		// cannot be read back.
+		if migrated, migratedErr := config.LoadIntegration(spec.Name); migratedErr == nil {
+			return migrated, nil
+		}
+		return legacy, nil
+	}
+
+	return nil, err
+}
+
+// migrateLegacyIntegrationConfig copies a config stored under a legacy alias
+// to the canonical integration name: models always, aliases when present, and
+// the onboarded mark when set. It is best-effort — every save error is
+// ignored — and a nil legacy config is a no-op.
+func migrateLegacyIntegrationConfig(canonical string, legacy *config.IntegrationConfig) {
+	if legacy == nil {
+		return
+	}
+
+	// Copies of the slice and map are saved rather than the legacy values themselves.
+	_ = config.SaveIntegration(canonical, append([]string(nil), legacy.Models...))
+	if len(legacy.Aliases) > 0 {
+		_ = config.SaveAliases(canonical, cloneAliases(legacy.Aliases))
+	}
+	if legacy.Onboarded {
+		_ = config.MarkIntegrationOnboarded(canonical)
+	}
+}
+
+// primaryModelFromConfig returns the first saved model, or "" when the config
+// is nil or has no models.
+func primaryModelFromConfig(cfg *config.IntegrationConfig) string {
+	if cfg != nil && len(cfg.Models) > 0 {
+		return cfg.Models[0]
+	}
+	return ""
+}
+
+// cloneAliases returns an independent copy of aliases. The result is never
+// nil: a nil or empty input yields an empty map.
+func cloneAliases(aliases map[string]string) map[string]string {
+	copied := make(map[string]string, len(aliases))
+	for alias, target := range aliases {
+		copied[alias] = target
+	}
+	return copied
+}
+
+// singleModelPrechecked wraps a non-empty model name in a one-element slice
+// and returns nil for the empty string.
+func singleModelPrechecked(current string) []string {
+	if current != "" {
+		return []string{current}
+	}
+	return nil
+}
+
+// firstModel returns the first entry of models, or "" when the slice is empty.
+func firstModel(models []string) string {
+	if len(models) > 0 {
+		return models[0]
+	}
+	return ""
+}
+
+// editorPreCheckedModels returns the models to pre-check in the editor picker.
+// With an override, the override comes first, followed by the other saved
+// models. Without one, the result is a copy of the saved models (nil when
+// nothing is saved).
+func editorPreCheckedModels(saved *config.IntegrationConfig, override string) []string {
+	if override != "" {
+		return append([]string{override}, additionalSavedModels(saved, override)...)
+	}
+	if saved == nil {
+		return nil
+	}
+	return append([]string(nil), saved.Models...)
+}
+
+// additionalSavedModels returns the saved models other than exclude, in saved
+// order. It returns nil when saved is nil or nothing remains.
+func additionalSavedModels(saved *config.IntegrationConfig, exclude string) []string {
+	var others []string
+	if saved == nil {
+		return others
+	}
+
+	for _, candidate := range saved.Models {
+		if candidate == exclude {
+			continue
+		}
+		others = append(others, candidate)
+	}
+	return others
+}
--- a/cmd/launch/launch_test.go
+++ b/cmd/launch/launch_test.go
--- a/cmd/launch/models.go
+++ b/cmd/launch/models.go
@@ -0,0 +1,494 @@
+package launch
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"os"
+	"os/exec"
+	"runtime"
+	"slices"
+	"strings"
+	"time"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/cmd/config"
+	"github.com/ollama/ollama/cmd/internal/fileutil"
+	internalcloud "github.com/ollama/ollama/internal/cloud"
+	"github.com/ollama/ollama/internal/modelref"
+	"github.com/ollama/ollama/progress"
+)
+
+// recommendedModels lists the models that buildModelList offers as
+// recommendations. Slice order matters: buildModelList derives each model's
+// recommendation rank from its index here.
+var recommendedModels = []ModelItem{
+	{Name: "kimi-k2.5:cloud", Description: "Multimodal reasoning with subagents", Recommended: true},
+	{Name: "qwen3.5:cloud", Description: "Reasoning, coding, and agentic tool use with vision", Recommended: true},
+	{Name: "glm-5:cloud", Description: "Reasoning and code generation", Recommended: true},
+	{Name: "minimax-m2.7:cloud", Description: "Fast, efficient coding and real-world productivity", Recommended: true},
+	{Name: "glm-4.7-flash", Description: "Reasoning and code generation locally", Recommended: true},
+	{Name: "qwen3.5", Description: "Reasoning, coding, and visual understanding locally", Recommended: true},
+}
+
+// recommendedVRAM maps a recommended model name to a size label that
+// buildModelList appends to the description of models that are not yet
+// downloaded.
+var recommendedVRAM = map[string]string{
+	"glm-4.7-flash": "~25GB",
+	"qwen3.5":       "~11GB",
+}
+
+// cloudModelLimit holds context and output token limits for a cloud model.
+type cloudModelLimit struct {
+	Context int
+	Output  int
+}
+
+// cloudModelLimits maps cloud model base names to their token limits.
+// Keys are the names that remain after lookupCloudModelLimit strips the
+// explicit cloud source tag from a model name.
+// TODO(parthsareen): grab context/output limits from model info instead of hardcoding
+var cloudModelLimits = map[string]cloudModelLimit{
+	"minimax-m2.7":        {Context: 204_800, Output: 128_000},
+	"cogito-2.1:671b":     {Context: 163_840, Output: 65_536},
+	"deepseek-v3.1:671b":  {Context: 163_840, Output: 163_840},
+	"deepseek-v3.2":       {Context: 163_840, Output: 65_536},
+	"glm-4.6":             {Context: 202_752, Output: 131_072},
+	"glm-4.7":             {Context: 202_752, Output: 131_072},
+	"glm-5":               {Context: 202_752, Output: 131_072},
+	"gpt-oss:120b":        {Context: 131_072, Output: 131_072},
+	"gpt-oss:20b":         {Context: 131_072, Output: 131_072},
+	"kimi-k2:1t":          {Context: 262_144, Output: 262_144},
+	"kimi-k2.5":           {Context: 262_144, Output: 262_144},
+	"kimi-k2-thinking":    {Context: 262_144, Output: 262_144},
+	"nemotron-3-nano:30b": {Context: 1_048_576, Output: 131_072},
+	"qwen3-coder:480b":    {Context: 262_144, Output: 65_536},
+	"qwen3-coder-next":    {Context: 262_144, Output: 32_768},
+	"qwen3-next:80b":      {Context: 262_144, Output: 32_768},
+	"qwen3.5":             {Context: 262_144, Output: 32_768},
+}
+
+// lookupCloudModelLimit reports the token limits registered for a cloud model.
+// The name must carry an explicit cloud source tag; the tag is stripped and
+// the remaining base name is looked up in cloudModelLimits. Names without a
+// tag, and unknown base names, yield the zero limit and false.
+func lookupCloudModelLimit(name string) (cloudModelLimit, bool) {
+	base, stripped := modelref.StripCloudSourceTag(name)
+	if !stripped {
+		return cloudModelLimit{}, false
+	}
+	limit, found := cloudModelLimits[base]
+	return limit, found
+}
+
+// missingModelPolicy controls how model-not-found errors should be handled.
+// showOrPullWithPolicy switches on this value for missing local models.
+type missingModelPolicy int
+
+const (
+	// missingModelPromptPull prompts the user to download missing local models.
+	// It is the zero value, and showOrPullWithPolicy also falls back to this
+	// behavior for any unrecognized policy value.
+	missingModelPromptPull missingModelPolicy = iota
+	// missingModelAutoPull downloads missing local models without prompting.
+	missingModelAutoPull
+	// missingModelFail returns an error for missing local models without prompting.
+	missingModelFail
+)
+
+// OpenBrowser asks the operating system to open url in the user's browser.
+// It is best-effort: launch errors are ignored, and nothing happens on
+// platforms other than macOS, Linux and Windows.
+func OpenBrowser(url string) {
+	var opener *exec.Cmd
+
+	switch runtime.GOOS {
+	case "darwin":
+		opener = exec.Command("open", url)
+	case "linux":
+		// Without a display server (headless machine) there is no browser
+		// to open, so do nothing.
+		noDisplay := os.Getenv("DISPLAY") == "" && os.Getenv("WAYLAND_DISPLAY") == ""
+		if noDisplay {
+			return
+		}
+		opener = exec.Command("xdg-open", url)
+	case "windows":
+		opener = exec.Command("rundll32", "url.dll,FileProtocolHandler", url)
+	default:
+		return
+	}
+
+	_ = opener.Start()
+}
+
+// ensureAuth ensures the user is signed in before cloud-backed models run.
+//
+// Only the entries of selected that are flagged in cloudModels matter; when
+// there are none the function returns nil without contacting the server.
+// Otherwise it:
+//  1. fails if the server reports that cloud usage is disabled,
+//  2. returns nil if Whoami already reports a named user,
+//  3. returns the Whoami error unless it is an api.AuthorizationError that
+//     carries a sign-in URL,
+//  4. delegates to DefaultSignIn when that hook is set, or else
+//  5. asks for confirmation, prints and opens the sign-in URL, and polls
+//     Whoami until a named user appears or ctx is done.
+//
+// It returns ErrCancelled when the user declines or cancels.
+func ensureAuth(ctx context.Context, client *api.Client, cloudModels map[string]bool, selected []string) error {
+	// Keep only the selected models that are cloud-backed.
+	var selectedCloudModels []string
+	for _, m := range selected {
+		if cloudModels[m] {
+			selectedCloudModels = append(selectedCloudModels, m)
+		}
+	}
+	if len(selectedCloudModels) == 0 {
+		return nil
+	}
+	if disabled, known := cloudStatusDisabled(ctx, client); known && disabled {
+		return errors.New(internalcloud.DisabledError("remote inference is unavailable"))
+	}
+
+	user, err := client.Whoami(ctx)
+	if err == nil && user != nil && user.Name != "" {
+		return nil
+	}
+
+	// Anything other than an authorization error with a sign-in URL cannot be
+	// resolved interactively, so it is returned as-is.
+	// NOTE(review): if Whoami returns a nil error together with a nil or
+	// unnamed user, errors.As fails and the nil err is returned here, so the
+	// caller sees success — confirm this is intended.
+	var aErr api.AuthorizationError
+	if !errors.As(err, &aErr) || aErr.SigninURL == "" {
+		return err
+	}
+
+	modelList := strings.Join(selectedCloudModels, ", ")
+
+	// A registered sign-in hook replaces the terminal flow below entirely.
+	if DefaultSignIn != nil {
+		_, err := DefaultSignIn(modelList, aErr.SigninURL)
+		if errors.Is(err, ErrCancelled) {
+			return ErrCancelled
+		}
+		if err != nil {
+			return fmt.Errorf("%s requires sign in", modelList)
+		}
+		return nil
+	}
+
+	yes, err := ConfirmPrompt(fmt.Sprintf("sign in to use %s?", modelList))
+	if errors.Is(err, ErrCancelled) {
+		return ErrCancelled
+	}
+	if err != nil {
+		return err
+	}
+	if !yes {
+		return ErrCancelled
+	}
+
+	fmt.Fprintf(os.Stderr, "\nTo sign in, navigate to:\n    %s\n\n", aErr.SigninURL)
+	OpenBrowser(aErr.SigninURL)
+
+	spinnerFrames := []string{"|", "/", "-", "\\"}
+	frame := 0
+	fmt.Fprintf(os.Stderr, "\033[90mwaiting for sign in to complete... %s\033[0m", spinnerFrames[0])
+
+	// The ticker drives the spinner animation; sign-in status is checked only
+	// on every tenth tick (see below).
+	ticker := time.NewTicker(200 * time.Millisecond)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			// Clear the spinner line before reporting cancellation.
+			fmt.Fprintf(os.Stderr, "\r\033[K")
+			return ctx.Err()
+		case <-ticker.C:
+			frame++
+			fmt.Fprintf(os.Stderr, "\r\033[90mwaiting for sign in to complete... %s\033[0m", spinnerFrames[frame%len(spinnerFrames)])
+
+			// Poll every 10 ticks (about every 2 seconds at 200ms per tick).
+			// Whoami errors are ignored here and polling simply continues.
+			if frame%10 == 0 {
+				u, err := client.Whoami(ctx)
+				if err == nil && u != nil && u.Name != "" {
+					fmt.Fprintf(os.Stderr, "\r\033[K\033[A\r\033[K\033[1msigned in:\033[0m %s\n", u.Name)
+					return nil
+				}
+			}
+		}
+	}
+}
+
+// showOrPullWithPolicy verifies that model is known to the server and, when it
+// is not, applies policy to decide what to do.
+//
+// Errors from the show request other than HTTP 404 are returned unchanged.
+// A missing cloud model is never pulled: the call fails, with a dedicated
+// error when cloud usage is known to be disabled. A missing local model is
+// pulled automatically, rejected, or pulled after confirmation depending on
+// policy.
+func showOrPullWithPolicy(ctx context.Context, client *api.Client, model string, policy missingModelPolicy, isCloudModel bool) error {
+	_, showErr := client.Show(ctx, &api.ShowRequest{Model: model})
+	if showErr == nil {
+		return nil
+	}
+
+	var statusErr api.StatusError
+	notFound := errors.As(showErr, &statusErr) && statusErr.StatusCode == http.StatusNotFound
+	if !notFound {
+		return showErr
+	}
+
+	if isCloudModel {
+		disabled, known := cloudStatusDisabled(ctx, client)
+		if known && disabled {
+			return errors.New(internalcloud.DisabledError("remote inference is unavailable"))
+		}
+		return fmt.Errorf("model %q not found", model)
+	}
+
+	if policy == missingModelAutoPull {
+		return pullMissingModel(ctx, client, model)
+	}
+	if policy == missingModelFail {
+		return fmt.Errorf("model %q not found; run 'ollama pull %s' first, or use --yes to auto-pull", model, model)
+	}
+	return confirmAndPull(ctx, client, model)
+}
+
+// confirmAndPull asks the user whether to download model and pulls it on
+// approval. It returns the prompt error if prompting fails and errCancelled
+// when the user declines.
+func confirmAndPull(ctx context.Context, client *api.Client, model string) error {
+	approved, err := ConfirmPrompt(fmt.Sprintf("Download %s?", model))
+	if err != nil {
+		return err
+	}
+	if !approved {
+		return errCancelled
+	}
+
+	fmt.Fprintf(os.Stderr, "\n")
+	return pullMissingModel(ctx, client, model)
+}
+
+// pullMissingModel pulls model (with insecure transfers disabled) and wraps
+// any failure with the model name.
+func pullMissingModel(ctx context.Context, client *api.Client, model string) error {
+	pullErr := pullModel(ctx, client, model, false)
+	if pullErr == nil {
+		return nil
+	}
+	return fmt.Errorf("failed to pull %s: %w", model, pullErr)
+}
+
+// prepareEditorIntegration applies models to an editor-managed integration.
+// It first asks the user to confirm the file changes, then lets the editor
+// rewrite its configuration, and finally saves the model list under name.
+// It returns errCancelled when the user declines.
+func prepareEditorIntegration(name string, runner Runner, editor Editor, models []string) error {
+	proceed, err := confirmEditorEdit(runner, editor)
+	if err != nil {
+		return err
+	}
+	if !proceed {
+		return errCancelled
+	}
+
+	if editErr := editor.Edit(models); editErr != nil {
+		return fmt.Errorf("setup failed: %w", editErr)
+	}
+
+	if saveErr := config.SaveIntegration(name, models); saveErr != nil {
+		return fmt.Errorf("failed to save: %w", saveErr)
+	}
+	return nil
+}
+
+// confirmEditorEdit lists the configuration files the editor reports via
+// Paths and asks the user to confirm modifying them. When the editor reports
+// no paths there is nothing to confirm and it returns true without prompting.
+func confirmEditorEdit(runner Runner, editor Editor) (bool, error) {
+	targets := editor.Paths()
+	if len(targets) == 0 {
+		return true, nil
+	}
+
+	fmt.Fprintf(os.Stderr, "This will modify your %s configuration:\n", runner)
+	for i := 0; i < len(targets); i++ {
+		fmt.Fprintf(os.Stderr, "  %s\n", targets[i])
+	}
+	backupDir := fileutil.BackupDir()
+	fmt.Fprintf(os.Stderr, "Backups will be saved to %s/\n\n", backupDir)
+
+	return ConfirmPrompt("Proceed?")
+}
+
+// buildModelList merges existing models with recommendations for selection UIs.
+//
+// Parameters:
+//   - existing: models already known to the server.
+//   - preChecked: model names that should start out selected.
+//   - current: the currently configured model, or "" for none.
+//
+// Returns:
+//   - items: one entry per existing model (":latest" trimmed from the name)
+//     followed by every recommended model that is not already present,
+//     sorted when at least one existing model was supplied.
+//   - orderedChecked: preChecked with current moved to the front when current
+//     is one of the pre-checked names. The caller's preChecked slice is never
+//     modified.
+//   - existingModels: set of existing model names, both raw and with
+//     ":latest" trimmed.
+//   - cloudModels: set of existing models flagged Remote plus missing
+//     recommended models whose name has an explicit cloud source.
+func buildModelList(existing []modelInfo, preChecked []string, current string) (items []ModelItem, orderedChecked []string, existingModels, cloudModels map[string]bool) {
+	existingModels = make(map[string]bool)
+	cloudModels = make(map[string]bool)
+	recommended := make(map[string]bool)
+	var hasLocalModel, hasCloudModel bool
+
+	recDesc := make(map[string]string)
+	for _, rec := range recommendedModels {
+		recommended[rec.Name] = true
+		recDesc[rec.Name] = rec.Description
+	}
+
+	for _, m := range existing {
+		existingModels[m.Name] = true
+		if m.Remote {
+			cloudModels[m.Name] = true
+			hasCloudModel = true
+		} else {
+			hasLocalModel = true
+		}
+		// Items are shown without the ":latest" suffix; both spellings are
+		// recorded as existing.
+		displayName := strings.TrimSuffix(m.Name, ":latest")
+		existingModels[displayName] = true
+		item := ModelItem{Name: displayName, Recommended: recommended[displayName], Description: recDesc[displayName]}
+		items = append(items, item)
+	}
+
+	// Append the recommendations that are not already present.
+	for _, rec := range recommendedModels {
+		if existingModels[rec.Name] || existingModels[rec.Name+":latest"] {
+			continue
+		}
+		items = append(items, rec)
+		if isCloudModelName(rec.Name) {
+			cloudModels[rec.Name] = true
+		}
+	}
+
+	checked := make(map[string]bool, len(preChecked))
+	for _, n := range preChecked {
+		checked[n] = true
+	}
+
+	// Resolve current to an item name: an exact match wins; otherwise the
+	// first item named "<current>:<tag>" is used.
+	if current != "" {
+		matchedCurrent := false
+		for _, item := range items {
+			if item.Name == current {
+				current = item.Name
+				matchedCurrent = true
+				break
+			}
+		}
+		if !matchedCurrent {
+			for _, item := range items {
+				if strings.HasPrefix(item.Name, current+":") {
+					current = item.Name
+					break
+				}
+			}
+		}
+	}
+
+	// Move current to the front of the pre-checked list. slices.DeleteFunc
+	// compacts its argument in place, so it is given a clone to keep the
+	// caller's slice intact.
+	if checked[current] {
+		preChecked = append([]string{current}, slices.DeleteFunc(slices.Clone(preChecked), func(m string) bool { return m == current })...)
+	}
+
+	// Annotate models that are neither installed nor cloud-backed.
+	notInstalled := make(map[string]bool)
+	for i := range items {
+		if !existingModels[items[i].Name] && !cloudModels[items[i].Name] {
+			notInstalled[items[i].Name] = true
+			var parts []string
+			if items[i].Description != "" {
+				parts = append(parts, items[i].Description)
+			}
+			if vram := recommendedVRAM[items[i].Name]; vram != "" {
+				parts = append(parts, vram)
+			}
+			parts = append(parts, "(not downloaded)")
+			items[i].Description = strings.Join(parts, ", ")
+		}
+	}
+
+	// Ranks start at 1 so that a zero lookup means "not recommended".
+	recRank := make(map[string]int)
+	for i, rec := range recommendedModels {
+		recRank[rec.Name] = i + 1
+	}
+
+	onlyLocal := hasLocalModel && !hasCloudModel
+
+	// With no existing models the list keeps recommendation order. Otherwise
+	// sort by: pre-checked first, then recommended, then (within
+	// recommended) cloud before local unless only local models exist, then
+	// recommendation rank; remaining models put installed before
+	// not-installed, then order by case-insensitive name.
+	if hasLocalModel || hasCloudModel {
+		slices.SortStableFunc(items, func(a, b ModelItem) int {
+			ac, bc := checked[a.Name], checked[b.Name]
+			aNew, bNew := notInstalled[a.Name], notInstalled[b.Name]
+			aRec, bRec := recRank[a.Name] > 0, recRank[b.Name] > 0
+			aCloud, bCloud := cloudModels[a.Name], cloudModels[b.Name]
+
+			if ac != bc {
+				if ac {
+					return -1
+				}
+				return 1
+			}
+			if aRec != bRec {
+				if aRec {
+					return -1
+				}
+				return 1
+			}
+			if aRec && bRec {
+				if aCloud != bCloud {
+					if onlyLocal {
+						if aCloud {
+							return 1
+						}
+						return -1
+					}
+					if aCloud {
+						return -1
+					}
+					return 1
+				}
+				return recRank[a.Name] - recRank[b.Name]
+			}
+			if aNew != bNew {
+				if aNew {
+					return 1
+				}
+				return -1
+			}
+			return strings.Compare(strings.ToLower(a.Name), strings.ToLower(b.Name))
+		})
+	}
+
+	return items, preChecked, existingModels, cloudModels
+}
+
+// isCloudModelName reports whether the model name has an explicit cloud source.
+// The decision is made from the name alone by modelref.HasExplicitCloudSource;
+// no server request is made (contrast with isCloudModel).
+func isCloudModelName(name string) bool {
+	return modelref.HasExplicitCloudSource(name)
+}
+
+// filterCloudModels returns the entries of existing that are not flagged
+// Remote, in their original order.
+//
+// The result is built in freshly allocated storage, so the caller's slice is
+// left untouched. A nil input yields nil; a non-nil input always yields a
+// non-nil (possibly empty) slice.
+func filterCloudModels(existing []modelInfo) []modelInfo {
+	if existing == nil {
+		return nil
+	}
+	// Do not reuse existing's backing array (existing[:0]): appending into it
+	// would overwrite elements the caller may still be reading.
+	filtered := make([]modelInfo, 0, len(existing))
+	for _, m := range existing {
+		if !m.Remote {
+			filtered = append(filtered, m)
+		}
+	}
+	return filtered
+}
+
+// filterCloudItems returns the entries of items whose name does not carry an
+// explicit cloud source, in their original order.
+//
+// The result is built in freshly allocated storage, so the caller's slice is
+// left untouched. A nil input yields nil; a non-nil input always yields a
+// non-nil (possibly empty) slice.
+func filterCloudItems(items []ModelItem) []ModelItem {
+	if items == nil {
+		return nil
+	}
+	// Do not reuse items' backing array (items[:0]): appending into it would
+	// overwrite elements the caller may still be reading.
+	filtered := make([]ModelItem, 0, len(items))
+	for _, item := range items {
+		if !isCloudModelName(item.Name) {
+			filtered = append(filtered, item)
+		}
+	}
+	return filtered
+}
+
+// isCloudModel asks the server to describe name and reports whether the
+// response names a remote model. It returns false when client is nil or the
+// show request fails.
+func isCloudModel(ctx context.Context, client *api.Client, name string) bool {
+	if client != nil {
+		if info, err := client.Show(ctx, &api.ShowRequest{Model: name}); err == nil {
+			return info.RemoteModel != ""
+		}
+	}
+	return false
+}
+
+// cloudStatusDisabled returns whether cloud usage is currently disabled.
+//
+// known is false when the status could not be determined: the client is nil
+// or the status request failed for any reason (including a server that does
+// not expose the endpoint). In that case disabled is always false and callers
+// should not treat cloud usage as disabled.
+func cloudStatusDisabled(ctx context.Context, client *api.Client) (disabled bool, known bool) {
+	if client == nil {
+		return false, false
+	}
+	status, err := client.CloudStatusExperimental(ctx)
+	if err != nil {
+		// Every failure, 404 or otherwise, means "unknown".
+		return false, false
+	}
+	return status.Cloud.Disabled, true
+}
+
+// pullModel downloads model through client while rendering progress on
+// stderr: a spinner for each new status message and one bar per layer digest.
+//
+// TODO(parthsareen): this duplicates the pull progress UI in cmd.PullHandler.
+// Move the shared pull rendering to a small utility once the package boundary settles.
+func pullModel(ctx context.Context, client *api.Client, model string, insecure bool) error {
+	display := progress.NewProgress(os.Stderr)
+	defer display.Stop()
+
+	var (
+		lastStatus    string
+		activeSpinner *progress.Spinner
+	)
+	digestBars := make(map[string]*progress.Bar)
+
+	stopSpinner := func() {
+		if activeSpinner != nil {
+			activeSpinner.Stop()
+		}
+	}
+
+	onProgress := func(resp api.ProgressResponse) error {
+		// Status-only update: start a new spinner when the status changes.
+		if resp.Digest == "" {
+			if resp.Status == lastStatus {
+				return nil
+			}
+			stopSpinner()
+			lastStatus = resp.Status
+			activeSpinner = progress.NewSpinner(lastStatus)
+			display.Add(lastStatus, activeSpinner)
+			return nil
+		}
+
+		// Layer update: ignore it until some bytes have been transferred.
+		if resp.Completed == 0 {
+			return nil
+		}
+		stopSpinner()
+
+		bar, seen := digestBars[resp.Digest]
+		if !seen {
+			// Label the bar with at most the first 12 characters of the hash.
+			label, isDigest := strings.CutPrefix(resp.Digest, "sha256:")
+			label = strings.TrimSpace(label)
+			if isDigest {
+				label = label[:min(12, len(label))]
+			}
+			bar = progress.NewBar(fmt.Sprintf("pulling %s:", label), resp.Total, resp.Completed)
+			digestBars[resp.Digest] = bar
+			display.Add(resp.Digest, bar)
+		}
+		bar.Set(resp.Completed)
+		return nil
+	}
+
+	return client.Pull(ctx, &api.PullRequest{Name: model, Insecure: insecure}, onProgress)
+}
--- a/cmd/launch/openclaw.go
+++ b/cmd/launch/openclaw.go
@@ -1,4 +1,4 @@
-package config
+package launch

 import (
 	"context"
@@ -14,7 +14,10 @@ import (
 	"strings"
 	"time"

+	"golang.org/x/mod/semver"
+
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/cmd/internal/fileutil"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/types/model"
 )
@@ -24,6 +27,9 @@ const defaultGatewayPort = 18789
 // Bound model capability probing so launch/config cannot hang on slow/unreachable API calls.
 var openclawModelShowTimeout = 5 * time.Second

+// openclawFreshInstall is set to true when ensureOpenclawInstalled performs an install.
+// Run consults it on first launch and skips the best-effort `update` command
+// when it is set, presumably because a just-installed copy is already current.
+var openclawFreshInstall bool
+
 type Openclaw struct{}

 func (c *Openclaw) String() string { return "OpenClaw" }
@@ -34,10 +40,7 @@ func (c *Openclaw) Run(model string, args []string) error {
 		return err
 	}

-	firstLaunch := true
-	if integrationConfig, err := loadIntegration("openclaw"); err == nil {
-		firstLaunch = !integrationConfig.Onboarded
-	}
+	firstLaunch := !c.onboarded()

 	if firstLaunch {
 		fmt.Fprintf(os.Stderr, "\n%sSecurity%s\n\n", ansiBold, ansiReset)
@@ -45,28 +48,46 @@ func (c *Openclaw) Run(model string, args []string) error {
 		fmt.Fprintf(os.Stderr, "  A bad prompt can trick it into doing unsafe things.\n\n")
 		fmt.Fprintf(os.Stderr, "%s  Learn more: https://docs.openclaw.ai/gateway/security%s\n\n", ansiGray, ansiReset)

-		ok, err := confirmPrompt("I understand the risks. Continue?")
+		ok, err := ConfirmPrompt("I understand the risks. Continue?")
 		if err != nil {
 			return err
 		}
 		if !ok {
 			return nil
 		}
-	}

-	if !c.onboarded() {
+		// Ensure the latest version is installed before onboarding so we get
+		// the newest wizard flags (e.g. --auth-choice ollama).
+		if !openclawFreshInstall {
+			update := exec.Command(bin, "update")
+			update.Stdout = os.Stdout
+			update.Stderr = os.Stderr
+			_ = update.Run() // best-effort; continue even if update fails
+		}
+
 		fmt.Fprintf(os.Stderr, "\n%sSetting up OpenClaw with Ollama...%s\n", ansiGreen, ansiReset)
 		fmt.Fprintf(os.Stderr, "%s  Model: %s%s\n\n", ansiGray, model, ansiReset)

-		cmd := exec.Command(bin, "onboard",
+		onboardArgs := []string{
+			"onboard",
 			"--non-interactive",
 			"--accept-risk",
-			"--auth-choice", "skip",
-			"--gateway-token", "ollama",
-			"--install-daemon",
+			"--auth-choice", "ollama",
+			"--custom-base-url", envconfig.Host().String(),
+			"--custom-model-id", model,
 			"--skip-channels",
 			"--skip-skills",
-		)
+		}
+		if canInstallDaemon() {
+			onboardArgs = append(onboardArgs, "--install-daemon")
+		} else {
+			// When we can't install a daemon (e.g. no systemd, sudo dropped
+			// XDG_RUNTIME_DIR, or container environment), skip the gateway
+			// health check so non-interactive onboarding completes. The
+			// gateway is started as a foreground child process after onboarding.
+			onboardArgs = append(onboardArgs, "--skip-health")
+		}
+		cmd := exec.Command(bin, onboardArgs...)
 		cmd.Stdin = os.Stdin
 		cmd.Stdout = os.Stdout
 		cmd.Stderr = os.Stderr
@@ -75,25 +96,13 @@ func (c *Openclaw) Run(model string, args []string) error {
 		}

 		patchDeviceScopes()
-
-		// Onboarding overwrites openclaw.json, so re-apply the model config
-		// that Edit() wrote before Run() was called.
-		if err := c.Edit([]string{model}); err != nil {
-			fmt.Fprintf(os.Stderr, "%s  Warning: could not re-apply model config: %v%s\n", ansiYellow, err, ansiReset)
-		}
 	}

-	if strings.HasSuffix(model, ":cloud") || strings.HasSuffix(model, "-cloud") {
-		if ensureWebSearchPlugin() {
-			registerWebSearchPlugin()
-		}
+	if ensureWebSearchPlugin() {
+		registerWebSearchPlugin()
 	}

-	if firstLaunch {
-		fmt.Fprintf(os.Stderr, "\n%sPreparing your assistant — this may take a moment...%s\n\n", ansiGray, ansiReset)
-	} else {
-		fmt.Fprintf(os.Stderr, "\n%sStarting your assistant — this may take a moment...%s\n\n", ansiGray, ansiReset)
-	}
+	fmt.Fprintf(os.Stderr, "\n%sStarting your assistant — this may take a moment...%s\n\n", ansiGray, ansiReset)

 	// When extra args are passed through, run exactly what the user asked for
 	// after setup and skip the built-in gateway+TUI convenience flow.
@@ -106,11 +115,6 @@ func (c *Openclaw) Run(model string, args []string) error {
 		if err := cmd.Run(); err != nil {
 			return windowsHint(err)
 		}
-		if firstLaunch {
-			if err := integrationOnboarded("openclaw"); err != nil {
-				return fmt.Errorf("failed to save onboarding state: %w", err)
-			}
-		}
 		return nil
 	}

@@ -118,7 +122,7 @@ func (c *Openclaw) Run(model string, args []string) error {
 	addr := fmt.Sprintf("localhost:%d", port)

 	// If the gateway is already running (e.g. via the daemon), restart it
-	// so it picks up any config changes from Edit() above (model, provider, etc.).
+	// so it picks up any config changes (model, provider, etc.).
 	if portOpen(addr) {
 		restart := exec.Command(bin, "daemon", "restart")
 		restart.Env = openclawEnv()
@@ -165,11 +169,6 @@ func (c *Openclaw) Run(model string, args []string) error {
 		return windowsHint(err)
 	}

-	if firstLaunch {
-		if err := integrationOnboarded("openclaw"); err != nil {
-			return fmt.Errorf("failed to save onboarding state: %w", err)
-		}
-	}
 	return nil
 }

@@ -409,6 +408,25 @@ func patchScopes(obj map[string]any, key string, required []string) bool {
 	return added
 }

+// canInstallDaemon reports whether the openclaw daemon can be installed as a
+// background service. Every platform other than Linux reports true. On Linux
+// it requires both a systemd init system and a set XDG_RUNTIME_DIR; when
+// either is missing (e.g. in containers) it reports false so that callers
+// omit --install-daemon and start the gateway in the foreground instead.
+func canInstallDaemon() bool {
+	if runtime.GOOS == "linux" {
+		// /run/systemd/system is a directory when systemd is the init
+		// system; most containers do not have it.
+		info, statErr := os.Stat("/run/systemd/system")
+		systemdPresent := statErr == nil && info.IsDir()
+		if !systemdPresent {
+			return false
+		}
+		// User services additionally need a user manager instance, for
+		// which a set XDG_RUNTIME_DIR is a prerequisite.
+		runtimeDir := os.Getenv("XDG_RUNTIME_DIR")
+		return runtimeDir != ""
+	}
+	return true
+}
+
 func ensureOpenclawInstalled() (string, error) {
 	if _, err := exec.LookPath("openclaw"); err == nil {
 		return "openclaw", nil
@@ -417,16 +435,20 @@ func ensureOpenclawInstalled() (string, error) {
 		return "clawdbot", nil
 	}

-	if _, err := exec.LookPath("npm"); err != nil {
-		return "", fmt.Errorf("openclaw is not installed and npm was not found\n\n" +
-			"Install Node.js first:\n" +
-			"  https://nodejs.org/\n\n" +
-			"Then rerun:\n" +
-			"  ollama launch\n" +
-			"and select OpenClaw")
+	_, npmErr := exec.LookPath("npm")
+	_, gitErr := exec.LookPath("git")
+	if npmErr != nil || gitErr != nil {
+		var missing []string
+		if npmErr != nil {
+			missing = append(missing, "npm (Node.js): https://nodejs.org/")
+		}
+		if gitErr != nil {
+			missing = append(missing, "git: https://git-scm.com/")
+		}
+		return "", fmt.Errorf("openclaw is not installed and required dependencies are missing\n\nInstall the following first:\n  %s", strings.Join(missing, "\n  "))
 	}

-	ok, err := confirmPrompt("OpenClaw is not installed. Install with npm?")
+	ok, err := ConfirmPrompt("OpenClaw is not installed. Install with npm?")
 	if err != nil {
 		return "", err
 	}
@@ -448,6 +470,7 @@ func ensureOpenclawInstalled() (string, error) {
 	}

 	fmt.Fprintf(os.Stderr, "%sOpenClaw installed successfully%s\n\n", ansiGreen, ansiReset)
+	openclawFreshInstall = true
 	return "openclaw", nil
 }

@@ -502,7 +525,7 @@ func (c *Openclaw) Edit(models []string) error {
 		ollama = make(map[string]any)
 	}

-	ollama["baseUrl"] = envconfig.Host().String() + "/v1"
+	ollama["baseUrl"] = envconfig.Host().String()
 	// needed to register provider
 	ollama["apiKey"] = "ollama-local"
 	ollama["api"] = "ollama"
@@ -561,7 +584,7 @@ func (c *Openclaw) Edit(models []string) error {
 	if err != nil {
 		return err
 	}
-	if err := writeWithBackup(configPath, data); err != nil {
+	if err := fileutil.WriteWithBackup(configPath, data); err != nil {
 		return err
 	}

@@ -592,6 +615,8 @@ func clearSessionModelOverride(primary string) {
 		if override, _ := sess["modelOverride"].(string); override != "" && override != primary {
 			delete(sess, "modelOverride")
 			delete(sess, "providerOverride")
+		}
+		if model, _ := sess["model"].(string); model != "" && model != primary {
 			sess["model"] = primary
 			changed = true
 		}
@@ -606,11 +631,15 @@ func clearSessionModelOverride(primary string) {
 	_ = os.WriteFile(path, out, 0o600)
 }

-const webSearchNpmPackage = "@ollama/openclaw-web-search"
+const (
+	webSearchNpmPackage = "@ollama/openclaw-web-search"
+	webSearchMinVersion = "0.2.1"
+)

 // ensureWebSearchPlugin installs the openclaw-web-search extension into the
 // user-level extensions directory (~/.openclaw/extensions/) if it isn't already
-// present. Returns true if the extension is available.
+// present, or re-installs if the installed version is older than webSearchMinVersion.
+// Returns true if the extension is available.
 func ensureWebSearchPlugin() bool {
 	home, err := os.UserHomeDir()
 	if err != nil {
@@ -618,8 +647,8 @@ func ensureWebSearchPlugin() bool {
 	}

 	pluginDir := filepath.Join(home, ".openclaw", "extensions", "openclaw-web-search")
-	if _, err := os.Stat(filepath.Join(pluginDir, "index.ts")); err == nil {
-		return true // already installed
+	if webSearchPluginUpToDate(pluginDir) {
+		return true
 	}

 	npmBin, err := exec.LookPath("npm")
@@ -653,6 +682,34 @@ func ensureWebSearchPlugin() bool {
 	return true
 }

+// webSearchPluginUpToDate reports whether pluginDir contains a package.json
+// whose "version" field is at least webSearchMinVersion. A missing or
+// unreadable file, invalid JSON, or an empty version all count as not up to
+// date.
+func webSearchPluginUpToDate(pluginDir string) bool {
+	manifestPath := filepath.Join(pluginDir, "package.json")
+	raw, readErr := os.ReadFile(manifestPath)
+	if readErr != nil {
+		return false
+	}
+
+	var manifest struct {
+		Version string `json:"version"`
+	}
+	if parseErr := json.Unmarshal(raw, &manifest); parseErr != nil {
+		return false
+	}
+	if manifest.Version == "" {
+		return false
+	}
+
+	outdated := versionLessThan(manifest.Version, webSearchMinVersion)
+	return !outdated
+}
+
+// versionLessThan reports whether semver version a sorts before b.
+// Either input may be written with or without the leading "v" that
+// semver.Compare expects; it is normalized before comparing.
+func versionLessThan(a, b string) bool {
+	// Stripping an optional "v" and re-adding it leaves prefixed inputs
+	// unchanged and prefixes the rest.
+	left := "v" + strings.TrimPrefix(a, "v")
+	right := "v" + strings.TrimPrefix(b, "v")
+	return semver.Compare(left, right) < 0
+}
+
 // registerWebSearchPlugin adds plugins.entries.openclaw-web-search to the OpenClaw
 // config so the gateway activates it on next start. Best-effort; silently returns
 // on any error.
@@ -679,23 +736,67 @@ func registerWebSearchPlugin() {
 	if entries == nil {
 		entries = make(map[string]any)
 	}
-	if _, ok := entries["openclaw-web-search"]; ok {
-		return // already registered
-	}
 	entries["openclaw-web-search"] = map[string]any{"enabled": true}
 	plugins["entries"] = entries
+
+	// Pin trust so the gateway doesn't warn about untracked plugins.
+	allow, _ := plugins["allow"].([]any)
+	hasAllow := false
+	for _, v := range allow {
+		if s, ok := v.(string); ok && s == "openclaw-web-search" {
+			hasAllow = true
+			break
+		}
+	}
+	if !hasAllow {
+		allow = append(allow, "openclaw-web-search")
+	}
+	plugins["allow"] = allow
+
+	// Record install provenance so the loader can verify the plugin origin.
+	installs, _ := plugins["installs"].(map[string]any)
+	if installs == nil {
+		installs = make(map[string]any)
+	}
+	pluginDir := filepath.Join(home, ".openclaw", "extensions", "openclaw-web-search")
+	installs["openclaw-web-search"] = map[string]any{
+		"source":      "npm",
+		"spec":        webSearchNpmPackage,
+		"installPath": pluginDir,
+	}
+	plugins["installs"] = installs
+
 	config["plugins"] = plugins

-	// Disable the built-in web search since our plugin replaces it.
+	// Add plugin tools to tools.alsoAllow so they survive the coding profile's
+	// policy pipeline (which has an explicit allow list of core tools only).
 	tools, _ := config["tools"].(map[string]any)
 	if tools == nil {
 		tools = make(map[string]any)
 	}
+
+	alsoAllow, _ := tools["alsoAllow"].([]any)
+	needed := []string{"ollama_web_search", "ollama_web_fetch"}
+	have := make(map[string]bool, len(alsoAllow))
+	for _, v := range alsoAllow {
+		if s, ok := v.(string); ok {
+			have[s] = true
+		}
+	}
+	for _, name := range needed {
+		if !have[name] {
+			alsoAllow = append(alsoAllow, name)
+		}
+	}
+	tools["alsoAllow"] = alsoAllow
+
+	// Disable built-in web search/fetch since our plugin replaces them.
 	web, _ := tools["web"].(map[string]any)
 	if web == nil {
 		web = make(map[string]any)
 	}
 	web["search"] = map[string]any{"enabled": false}
+	web["fetch"] = map[string]any{"enabled": false}
 	tools["web"] = web
 	config["tools"] = tools

@@ -776,9 +877,9 @@ func (c *Openclaw) Models() []string {
 		return nil
 	}

-	config, err := readJSONFile(filepath.Join(home, ".openclaw", "openclaw.json"))
+	config, err := fileutil.ReadJSON(filepath.Join(home, ".openclaw", "openclaw.json"))
 	if err != nil {
-		config, err = readJSONFile(filepath.Join(home, ".clawdbot", "clawdbot.json"))
+		config, err = fileutil.ReadJSON(filepath.Join(home, ".clawdbot", "clawdbot.json"))
 		if err != nil {
 			return nil
 		}
--- a/cmd/launch/openclaw_test.go
+++ b/cmd/launch/openclaw_test.go
@@ -1,4 +1,4 @@
-package config
+package launch

 import (
 	"bytes"
@@ -82,78 +82,6 @@ func TestOpenclawRunPassthroughArgs(t *testing.T) {
 	}
 }

-func TestOpenclawRunFirstLaunchPersistence(t *testing.T) {
-	if runtime.GOOS == "windows" {
-		t.Skip("uses a POSIX shell test binary")
-	}
-
-	oldHook := DefaultConfirmPrompt
-	DefaultConfirmPrompt = func(prompt string) (bool, error) {
-		return true, nil
-	}
-	defer func() { DefaultConfirmPrompt = oldHook }()
-
-	t.Run("success persists onboarding flag", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		t.Setenv("PATH", tmpDir)
-
-		configDir := filepath.Join(tmpDir, ".openclaw")
-		if err := os.MkdirAll(configDir, 0o755); err != nil {
-			t.Fatal(err)
-		}
-		// Mark OpenClaw onboarding complete so Run takes passthrough path directly.
-		if err := os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{
-			"wizard": {"lastRunAt": "2026-01-01T00:00:00Z"}
-		}`), 0o644); err != nil {
-			t.Fatal(err)
-		}
-		if err := os.WriteFile(filepath.Join(tmpDir, "openclaw"), []byte("#!/bin/sh\nexit 0\n"), 0o755); err != nil {
-			t.Fatal(err)
-		}
-
-		c := &Openclaw{}
-		if err := c.Run("llama3.2", []string{"gateway", "--status"}); err != nil {
-			t.Fatalf("Run() error = %v", err)
-		}
-		integrationConfig, err := loadIntegration("openclaw")
-		if err != nil {
-			t.Fatalf("loadIntegration() error = %v", err)
-		}
-		if !integrationConfig.Onboarded {
-			t.Fatal("expected onboarding flag to be persisted after successful run")
-		}
-	})
-
-	t.Run("failure does not persist onboarding flag", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		t.Setenv("PATH", tmpDir)
-
-		configDir := filepath.Join(tmpDir, ".openclaw")
-		if err := os.MkdirAll(configDir, 0o755); err != nil {
-			t.Fatal(err)
-		}
-		if err := os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{
-			"wizard": {"lastRunAt": "2026-01-01T00:00:00Z"}
-		}`), 0o644); err != nil {
-			t.Fatal(err)
-		}
-		if err := os.WriteFile(filepath.Join(tmpDir, "openclaw"), []byte("#!/bin/sh\nexit 1\n"), 0o755); err != nil {
-			t.Fatal(err)
-		}
-
-		c := &Openclaw{}
-		if err := c.Run("llama3.2", []string{"gateway", "--status"}); err == nil {
-			t.Fatal("expected run failure")
-		}
-		integrationConfig, err := loadIntegration("openclaw")
-		if err == nil && integrationConfig.Onboarded {
-			t.Fatal("expected onboarding flag to remain unset after failed run")
-		}
-	})
-}
-
 func TestOpenclawEdit(t *testing.T) {
 	c := &Openclaw{}
 	tmpDir := t.TempDir()
@@ -589,7 +517,7 @@ const testOpenclawFixture = `{
    "providers": {
      "anthropic": {"apiKey": "xxx"},
      "ollama": {
-        "baseUrl": "http://127.0.0.1:11434/v1",
+        "baseUrl": "http://127.0.0.1:11434",
        "models": [{"id": "old-model", "customField": "preserved"}]
      }
    }
@@ -1448,7 +1376,7 @@ func TestOpenclawModelConfig(t *testing.T) {
 		// report it as a remote/cloud model
 		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			if r.URL.Path == "/api/show" {
-				fmt.Fprintf(w, `{"capabilities":[],"model_info":{},"remote_model":"minimax-m2.5"}`)
+				fmt.Fprintf(w, `{"capabilities":[],"model_info":{},"remote_model":"minimax-m2.7"}`)
 				return
 			}
 			w.WriteHeader(http.StatusNotFound)
@@ -1458,7 +1386,7 @@ func TestOpenclawModelConfig(t *testing.T) {
 		u, _ := url.Parse(srv.URL)
 		client := api.NewClient(u, srv.Client())

-		cfg, isCloud := openclawModelConfig(context.Background(), client, "minimax-m2.5:cloud")
+		cfg, isCloud := openclawModelConfig(context.Background(), client, "minimax-m2.7:cloud")

 		if !isCloud {
 			t.Error("expected isCloud = true for cloud model")
@@ -1528,7 +1456,7 @@ func TestIntegrationOnboarded(t *testing.T) {
 		tmpDir := t.TempDir()
 		setTestHome(t, tmpDir)

-		integrationConfig, err := loadIntegration("openclaw")
+		integrationConfig, err := LoadIntegration("openclaw")
 		if err == nil && integrationConfig.Onboarded {
 			t.Error("expected false for fresh config")
 		}
@@ -1542,7 +1470,7 @@ func TestIntegrationOnboarded(t *testing.T) {
 		if err := integrationOnboarded("openclaw"); err != nil {
 			t.Fatal(err)
 		}
-		integrationConfig, err := loadIntegration("openclaw")
+		integrationConfig, err := LoadIntegration("openclaw")
 		if err != nil || !integrationConfig.Onboarded {
 			t.Error("expected true after integrationOnboarded")
 		}
@@ -1556,7 +1484,7 @@ func TestIntegrationOnboarded(t *testing.T) {
 		if err := integrationOnboarded("OpenClaw"); err != nil {
 			t.Fatal(err)
 		}
-		integrationConfig, err := loadIntegration("openclaw")
+		integrationConfig, err := LoadIntegration("openclaw")
 		if err != nil || !integrationConfig.Onboarded {
 			t.Error("expected true when set with different case")
 		}
@@ -1575,7 +1503,7 @@ func TestIntegrationOnboarded(t *testing.T) {
 		}

 		// Verify onboarded is set
-		integrationConfig, err := loadIntegration("openclaw")
+		integrationConfig, err := LoadIntegration("openclaw")
 		if err != nil || !integrationConfig.Onboarded {
 			t.Error("expected true after integrationOnboarded")
 		}
@@ -1587,3 +1515,377 @@ func TestIntegrationOnboarded(t *testing.T) {
 		}
 	})
 }
+
+// TestVersionLessThan checks versionLessThan ordering, with and without a "v" prefix.
+func TestVersionLessThan(t *testing.T) {
+	for _, tc := range []struct {
+		a, b string
+		want bool
+	}{
+		{a: "0.1.7", b: "0.2.1", want: true},
+		{a: "0.2.0", b: "0.2.1", want: true},
+		{a: "0.2.1", b: "0.2.1", want: false},
+		{a: "0.2.2", b: "0.2.1", want: false},
+		{a: "1.0.0", b: "0.2.1", want: false},
+		{a: "0.2.1", b: "1.0.0", want: true},
+		{a: "v0.1.7", b: "0.2.1", want: true},
+		{a: "0.2.1", b: "v0.2.1", want: false},
+	} {
+		t.Run(tc.a+"_vs_"+tc.b, func(t *testing.T) {
+			if got := versionLessThan(tc.a, tc.b); got != tc.want {
+				t.Errorf("versionLessThan(%q, %q) = %v, want %v", tc.a, tc.b, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestWebSearchPluginUpToDate covers webSearchPluginUpToDate for a missing
+// plugin directory, a missing or unparsable package.json, an empty version,
+// and versions below, equal to, and above 0.2.1.
+func TestWebSearchPluginUpToDate(t *testing.T) {
+	// pluginDir returns a fresh temp directory whose package.json holds manifest.
+	pluginDir := func(t *testing.T, manifest string) string {
+		t.Helper()
+		dir := t.TempDir()
+		if err := os.WriteFile(filepath.Join(dir, "package.json"), []byte(manifest), 0o644); err != nil {
+			t.Fatal(err)
+		}
+		return dir
+	}
+
+	t.Run("missing directory", func(t *testing.T) {
+		absent := filepath.Join(t.TempDir(), "nonexistent")
+		if webSearchPluginUpToDate(absent) {
+			t.Error("expected false for missing directory")
+		}
+	})
+
+	t.Run("missing package.json", func(t *testing.T) {
+		empty := t.TempDir()
+		if webSearchPluginUpToDate(empty) {
+			t.Error("expected false for missing package.json")
+		}
+	})
+
+	// The remaining cases vary only the package.json contents.
+	t.Run("old version", func(t *testing.T) {
+		dir := pluginDir(t, `{"version":"0.1.7"}`)
+		if webSearchPluginUpToDate(dir) {
+			t.Error("expected false for old version 0.1.7")
+		}
+	})
+
+	t.Run("exact minimum version", func(t *testing.T) {
+		dir := pluginDir(t, `{"version":"0.2.1"}`)
+		if !webSearchPluginUpToDate(dir) {
+			t.Error("expected true for exact minimum version 0.2.1")
+		}
+	})
+
+	t.Run("newer version", func(t *testing.T) {
+		dir := pluginDir(t, `{"version":"1.0.0"}`)
+		if !webSearchPluginUpToDate(dir) {
+			t.Error("expected true for newer version 1.0.0")
+		}
+	})
+
+	t.Run("invalid json", func(t *testing.T) {
+		dir := pluginDir(t, `not json`)
+		if webSearchPluginUpToDate(dir) {
+			t.Error("expected false for invalid json")
+		}
+	})
+
+	t.Run("empty version", func(t *testing.T) {
+		dir := pluginDir(t, `{"version":""}`)
+		if webSearchPluginUpToDate(dir) {
+			t.Error("expected false for empty version")
+		}
+	})
+}
+
+func TestRegisterWebSearchPlugin(t *testing.T) { // checks the "plugins" section registerWebSearchPlugin writes into openclaw.json
+	home := t.TempDir()
+	setTestHome(t, home) // NOTE(review): presumably redirects the home directory to the temp dir — confirm in setTestHome
+
+	configDir := filepath.Join(home, ".openclaw")
+	if err := os.MkdirAll(configDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	configPath := filepath.Join(configDir, "openclaw.json") // shared by all subtests; each one rewrites it before registering
+
+	t.Run("fresh config", func(t *testing.T) {
+		if err := os.WriteFile(configPath, []byte(`{}`), 0o644); err != nil { // start from an empty JSON object
+			t.Fatal(err)
+		}
+
+		registerWebSearchPlugin()
+
+		data, err := os.ReadFile(configPath)
+		if err != nil {
+			t.Fatal(err)
+		}
+		var config map[string]any
+		if err := json.Unmarshal(data, &config); err != nil {
+			t.Fatal(err)
+		}
+
+		plugins, _ := config["plugins"].(map[string]any) // a failed assertion leaves plugins nil, caught just below
+		if plugins == nil {
+			t.Fatal("plugins section missing")
+		}
+
+		// plugins.entries must mark the plugin as enabled.
+		entries, _ := plugins["entries"].(map[string]any)
+		entry, _ := entries["openclaw-web-search"].(map[string]any) // reading a nil map is safe; a missing entry fails the check below
+		if enabled, _ := entry["enabled"].(bool); !enabled {
+			t.Error("expected entries.openclaw-web-search.enabled = true")
+		}
+
+		// plugins.allow must list the plugin ID.
+		allow, _ := plugins["allow"].([]any)
+		found := false
+		for _, v := range allow {
+			if s, ok := v.(string); ok && s == "openclaw-web-search" {
+				found = true
+			}
+		}
+		if !found {
+			t.Error("expected plugins.allow to contain openclaw-web-search")
+		}
+
+		// plugins.installs must record source, spec and installPath for the plugin.
+		installs, _ := plugins["installs"].(map[string]any)
+		record, _ := installs["openclaw-web-search"].(map[string]any)
+		if record == nil {
+			t.Fatal("expected plugins.installs.openclaw-web-search")
+		}
+		if source, _ := record["source"].(string); source != "npm" {
+			t.Errorf("install source = %q, want %q", source, "npm")
+		}
+		if spec, _ := record["spec"].(string); spec != webSearchNpmPackage {
+			t.Errorf("install spec = %q, want %q", spec, webSearchNpmPackage)
+		}
+		expectedPath := filepath.Join(home, ".openclaw", "extensions", "openclaw-web-search")
+		if installPath, _ := record["installPath"].(string); installPath != expectedPath {
+			t.Errorf("installPath = %q, want %q", installPath, expectedPath)
+		}
+	})
+
+	t.Run("idempotent", func(t *testing.T) { // registering twice must leave exactly one allow entry
+		if err := os.WriteFile(configPath, []byte(`{}`), 0o644); err != nil {
+			t.Fatal(err)
+		}
+
+		registerWebSearchPlugin()
+		registerWebSearchPlugin()
+
+		data, err := os.ReadFile(configPath)
+		if err != nil {
+			t.Fatal(err)
+		}
+		var config map[string]any
+		if err := json.Unmarshal(data, &config); err != nil {
+			t.Fatal(err)
+		}
+
+		plugins, _ := config["plugins"].(map[string]any)
+		allow, _ := plugins["allow"].([]any) // nil when the section is absent, which yields count 0 and fails below
+		count := 0
+		for _, v := range allow {
+			if s, ok := v.(string); ok && s == "openclaw-web-search" {
+				count++
+			}
+		}
+		if count != 1 {
+			t.Errorf("expected exactly 1 openclaw-web-search in allow, got %d", count)
+		}
+	})
+
+	t.Run("preserves existing config", func(t *testing.T) { // unrelated keys and another plugin's records must survive
+		initial := map[string]any{
+			"plugins": map[string]any{
+				"allow": []any{"some-other-plugin"},
+				"entries": map[string]any{
+					"some-other-plugin": map[string]any{"enabled": true},
+				},
+				"installs": map[string]any{
+					"some-other-plugin": map[string]any{
+						"source":      "npm",
+						"installPath": "/some/path",
+					},
+				},
+			},
+			"customField": "preserved",
+		}
+		data, _ := json.Marshal(initial) // error ignored: initial holds only strings, bools, maps and slices
+		if err := os.WriteFile(configPath, data, 0o644); err != nil {
+			t.Fatal(err)
+		}
+
+		registerWebSearchPlugin()
+
+		out, err := os.ReadFile(configPath)
+		if err != nil {
+			t.Fatal(err)
+		}
+		var config map[string]any
+		if err := json.Unmarshal(out, &config); err != nil {
+			t.Fatal(err)
+		}
+
+		if config["customField"] != "preserved" {
+			t.Error("customField was not preserved")
+		}
+
+		plugins, _ := config["plugins"].(map[string]any)
+		entries, _ := plugins["entries"].(map[string]any)
+		if entries["some-other-plugin"] == nil {
+			t.Error("existing plugin entry was lost")
+		}
+
+		installs, _ := plugins["installs"].(map[string]any)
+		if installs["some-other-plugin"] == nil {
+			t.Error("existing install record was lost")
+		}
+
+		allow, _ := plugins["allow"].([]any)
+		hasOther, hasWebSearch := false, false
+		for _, v := range allow {
+			s, _ := v.(string) // non-string elements become "" and match neither ID
+			if s == "some-other-plugin" {
+				hasOther = true
+			}
+			if s == "openclaw-web-search" {
+				hasWebSearch = true
+			}
+		}
+		if !hasOther {
+			t.Error("existing allow entry was lost")
+		}
+		if !hasWebSearch {
+			t.Error("openclaw-web-search not added to allow")
+		}
+	})
+}
+
+func TestClearSessionModelOverride(t *testing.T) { // checks how clearSessionModelOverride rewrites sessions.json under the test home
+	tmpDir := t.TempDir()
+	setTestHome(t, tmpDir) // NOTE(review): presumably redirects the home directory to tmpDir — confirm in setTestHome
+
+	sessionsDir := filepath.Join(tmpDir, ".openclaw", "agents", "main", "sessions")
+	sessionsPath := filepath.Join(sessionsDir, "sessions.json") // one file shared by all subtests; each overwrites it
+
+	writeSessionsFile := func(t *testing.T, sessions map[string]map[string]any) { // helper: (re)creates sessionsDir and writes sessions as JSON
+		t.Helper()
+		if err := os.MkdirAll(sessionsDir, 0o755); err != nil {
+			t.Fatal(err)
+		}
+		data, err := json.Marshal(sessions)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if err := os.WriteFile(sessionsPath, data, 0o600); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	readSessionsFile := func(t *testing.T) map[string]map[string]any { // helper: parses sessions.json, failing the test on any error
+		t.Helper()
+		data, err := os.ReadFile(sessionsPath)
+		if err != nil {
+			t.Fatalf("reading sessions file: %v", err)
+		}
+		var sessions map[string]map[string]any
+		if err := json.Unmarshal(data, &sessions); err != nil {
+			t.Fatalf("parsing sessions file: %v", err)
+		}
+		return sessions
+	}
+
+	t.Run("clears modelOverride and updates model", func(t *testing.T) {
+		writeSessionsFile(t, map[string]map[string]any{
+			"sess1": {"model": "ollama/old-model", "modelOverride": "old-model", "providerOverride": "ollama"},
+		})
+		clearSessionModelOverride("new-model")
+		sessions := readSessionsFile(t)
+		sess := sessions["sess1"]
+		if _, ok := sess["modelOverride"]; ok { // the key itself must be gone, not merely emptied
+			t.Error("modelOverride should have been deleted")
+		}
+		if _, ok := sess["providerOverride"]; ok {
+			t.Error("providerOverride should have been deleted")
+		}
+		if sess["model"] != "new-model" {
+			t.Errorf("model = %q, want %q", sess["model"], "new-model")
+		}
+	})
+
+	t.Run("updates model field in sessions without modelOverride", func(t *testing.T) {
+		// Regression case: the session's model still names the old primary
+		// but carries no explicit modelOverride. Once the primary changes,
+		// the session's model field has to be rewritten as well.
+		writeSessionsFile(t, map[string]map[string]any{
+			"sess1": {"model": "ollama/old-model"},
+		})
+		clearSessionModelOverride("new-model")
+		sessions := readSessionsFile(t)
+		if sessions["sess1"]["model"] != "new-model" {
+			t.Errorf("model = %q, want %q", sessions["sess1"]["model"], "new-model")
+		}
+	})
+
+	t.Run("does not update session already using primary", func(t *testing.T) {
+		writeSessionsFile(t, map[string]map[string]any{
+			"sess1": {"model": "current-model"},
+		})
+		clearSessionModelOverride("current-model")
+		sessions := readSessionsFile(t)
+		if sessions["sess1"]["model"] != "current-model" { // only the final value is checked, not whether the file was rewritten
+			t.Errorf("model = %q, want %q", sessions["sess1"]["model"], "current-model")
+		}
+	})
+
+	t.Run("does not update session with empty model field", func(t *testing.T) {
+		writeSessionsFile(t, map[string]map[string]any{
+			"sess1": {"other": "data"},
+		})
+		clearSessionModelOverride("new-model")
+		sessions := readSessionsFile(t)
+		if _, ok := sessions["sess1"]["model"]; ok {
+			t.Error("model field should not have been added to session with no model")
+		}
+	})
+
+	t.Run("handles multiple sessions mixed", func(t *testing.T) { // combines the four cases above in a single file
+		writeSessionsFile(t, map[string]map[string]any{
+			"with-override":    {"model": "old", "modelOverride": "old", "providerOverride": "ollama"},
+			"without-override": {"model": "old"},
+			"already-current":  {"model": "new-model"},
+			"no-model":         {"other": "data"},
+		})
+		clearSessionModelOverride("new-model")
+		sessions := readSessionsFile(t)
+
+		if sessions["with-override"]["model"] != "new-model" {
+			t.Errorf("with-override model = %q, want %q", sessions["with-override"]["model"], "new-model")
+		}
+		if _, ok := sessions["with-override"]["modelOverride"]; ok {
+			t.Error("with-override: modelOverride should be deleted")
+		}
+		if sessions["without-override"]["model"] != "new-model" {
+			t.Errorf("without-override model = %q, want %q", sessions["without-override"]["model"], "new-model")
+		}
+		if sessions["already-current"]["model"] != "new-model" {
+			t.Errorf("already-current model = %q, want %q", sessions["already-current"]["model"], "new-model")
+		}
+		if _, ok := sessions["no-model"]["model"]; ok {
+			t.Error("no-model: model should not have been added")
+		}
+	})
+
+	t.Run("no-op when sessions file missing", func(t *testing.T) {
+		os.RemoveAll(sessionsDir) // removes the file written by earlier subtests; the error is deliberately ignored
+		clearSessionModelOverride("new-model") // should not panic or error
+	})
+}
--- a/cmd/launch/opencode.go
+++ b/cmd/launch/opencode.go
@@ -1,9 +1,7 @@
-package config
+package launch

 import (
-	"context"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"maps"
 	"os"
@@ -12,34 +10,13 @@ import (
 	"slices"
 	"strings"

-	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/cmd/internal/fileutil"
 	"github.com/ollama/ollama/envconfig"
 )

 // OpenCode implements Runner and Editor for OpenCode integration
 type OpenCode struct{}

-// cloudModelLimit holds context and output token limits for a cloud model.
-type cloudModelLimit struct {
-	Context int
-	Output  int
-}
-
-// lookupCloudModelLimit returns the token limits for a cloud model.
-// It tries the exact name first, then strips the ":cloud" suffix.
-func lookupCloudModelLimit(name string) (cloudModelLimit, bool) {
-	if l, ok := cloudModelLimits[name]; ok {
-		return l, true
-	}
-	base := strings.TrimSuffix(name, ":cloud")
-	if base != name {
-		if l, ok := cloudModelLimits[base]; ok {
-			return l, true
-		}
-	}
-	return cloudModelLimit{}, false
-}
-
 func (o *OpenCode) String() string { return "OpenCode" }

 func (o *OpenCode) Run(model string, args []string) error {
@@ -47,25 +24,6 @@ func (o *OpenCode) Run(model string, args []string) error {
 		return fmt.Errorf("opencode is not installed, install from https://opencode.ai")
 	}

-	// Call Edit() to ensure config is up-to-date before launch
-	models := []string{model}
-	if config, err := loadIntegration("opencode"); err == nil && len(config.Models) > 0 {
-		models = config.Models
-	}
-	var err error
-	models, err = resolveEditorModels("opencode", models, func() ([]string, error) {
-		return selectModels(context.Background(), "opencode", "")
-	})
-	if errors.Is(err, errCancelled) {
-		return nil
-	}
-	if err != nil {
-		return err
-	}
-	if err := o.Edit(models); err != nil {
-		return fmt.Errorf("setup failed: %w", err)
-	}
-
 	cmd := exec.Command("opencode", args...)
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
@@ -122,13 +80,18 @@ func (o *OpenCode) Edit(modelList []string) error {
 	if !ok {
 		ollama = map[string]any{
 			"npm":  "@ai-sdk/openai-compatible",
-			"name": "Ollama (local)",
+			"name": "Ollama",
 			"options": map[string]any{
 				"baseURL": envconfig.Host().String() + "/v1",
 			},
 		}
 	}

+	// Migrate legacy provider name
+	if name, _ := ollama["name"].(string); name == "Ollama (local)" {
+		ollama["name"] = "Ollama"
+	}
+
 	models, ok := ollama["models"].(map[string]any)
 	if !ok {
 		models = make(map[string]any)
@@ -147,8 +110,6 @@ func (o *OpenCode) Edit(modelList []string) error {
 		}
 	}

-	client, _ := api.ClientFromEnvironment()
-
 	for _, model := range modelList {
 		if existing, ok := models[model].(map[string]any); ok {
 			// migrate existing models without _launch marker
@@ -158,7 +119,7 @@ func (o *OpenCode) Edit(modelList []string) error {
 					existing["name"] = strings.TrimSuffix(name, " [Ollama]")
 				}
 			}
-			if isCloudModel(context.Background(), client, model) {
+			if isCloudModelName(model) {
 				if l, ok := lookupCloudModelLimit(model); ok {
 					existing["limit"] = map[string]any{
 						"context": l.Context,
@@ -172,7 +133,7 @@ func (o *OpenCode) Edit(modelList []string) error {
 			"name":    model,
 			"_launch": true,
 		}
-		if isCloudModel(context.Background(), client, model) {
+		if isCloudModelName(model) {
 			if l, ok := lookupCloudModelLimit(model); ok {
 				entry["limit"] = map[string]any{
 					"context": l.Context,
@@ -191,7 +152,7 @@ func (o *OpenCode) Edit(modelList []string) error {
 	if err != nil {
 		return err
 	}
-	if err := writeWithBackup(configPath, configData); err != nil {
+	if err := fileutil.WriteWithBackup(configPath, configData); err != nil {
 		return err
 	}

@@ -243,7 +204,7 @@ func (o *OpenCode) Edit(modelList []string) error {
 	if err != nil {
 		return err
 	}
-	return writeWithBackup(statePath, stateData)
+	return fileutil.WriteWithBackup(statePath, stateData)
 }

 func (o *OpenCode) Models() []string {
@@ -251,7 +212,7 @@ func (o *OpenCode) Models() []string {
 	if err != nil {
 		return nil
 	}
-	config, err := readJSONFile(filepath.Join(home, ".config", "opencode", "opencode.json"))
+	config, err := fileutil.ReadJSON(filepath.Join(home, ".config", "opencode", "opencode.json"))
 	if err != nil {
 		return nil
 	}
--- a/cmd/launch/opencode_test.go
+++ b/cmd/launch/opencode_test.go
@@ -1,8 +1,10 @@
-package config
+package launch

 import (
 	"encoding/json"
 	"fmt"
+	"net/http"
+	"net/http/httptest"
 	"os"
 	"path/filepath"
 	"testing"
@@ -232,6 +234,44 @@ func TestOpenCodeEdit(t *testing.T) {
 		}
 	})

+	t.Run("migrate Ollama (local) provider name", func(t *testing.T) {
+		cleanup()
+		os.MkdirAll(configDir, 0o755)
+		os.WriteFile(configPath, []byte(`{"provider":{"ollama":{"name":"Ollama (local)","npm":"@ai-sdk/openai-compatible","options":{"baseURL":"http://localhost:11434/v1"}}}}`), 0o644)
+
+		if err := o.Edit([]string{"llama3.2"}); err != nil {
+			t.Fatal(err)
+		}
+
+		data, _ := os.ReadFile(configPath)
+		var cfg map[string]any
+		json.Unmarshal(data, &cfg)
+		provider := cfg["provider"].(map[string]any)
+		ollama := provider["ollama"].(map[string]any)
+		if ollama["name"] != "Ollama" {
+			t.Errorf("provider name not migrated: got %q, want %q", ollama["name"], "Ollama")
+		}
+	})
+
+	t.Run("preserve custom provider name", func(t *testing.T) {
+		cleanup()
+		os.MkdirAll(configDir, 0o755)
+		os.WriteFile(configPath, []byte(`{"provider":{"ollama":{"name":"My Custom Ollama","npm":"@ai-sdk/openai-compatible","options":{"baseURL":"http://localhost:11434/v1"}}}}`), 0o644)
+
+		if err := o.Edit([]string{"llama3.2"}); err != nil {
+			t.Fatal(err)
+		}
+
+		data, _ := os.ReadFile(configPath)
+		var cfg map[string]any
+		json.Unmarshal(data, &cfg)
+		provider := cfg["provider"].(map[string]any)
+		ollama := provider["ollama"].(map[string]any)
+		if ollama["name"] != "My Custom Ollama" {
+			t.Errorf("custom provider name was changed: got %q, want %q", ollama["name"], "My Custom Ollama")
+		}
+	})
+
 	t.Run("remove model preserves non-ollama models", func(t *testing.T) {
 		cleanup()
 		os.MkdirAll(configDir, 0o755)
@@ -619,6 +659,54 @@ func TestOpenCodeEdit_CloudModelLimitStructure(t *testing.T) {
 	}
 }

+// TestOpenCodeEdit_BackfillsCloudModelLimitOnExistingEntry re-edits a managed
+// cloud entry that has no "limit" and expects Edit to add the known limits.
+func TestOpenCodeEdit_BackfillsCloudModelLimitOnExistingEntry(t *testing.T) {
+	home := t.TempDir()
+	setTestHome(t, home)
+	showServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/api/show" {
+			w.WriteHeader(http.StatusNotFound)
+			return
+		}
+		fmt.Fprintf(w, `{"capabilities":[],"model_info":{},"remote_model":"glm-5"}`)
+	}))
+	defer showServer.Close()
+	t.Setenv("OLLAMA_HOST", showServer.URL)
+
+	dir := filepath.Join(home, ".config", "opencode")
+	path := filepath.Join(dir, "opencode.json")
+	os.MkdirAll(dir, 0o755)
+	os.WriteFile(path, []byte(`{
+		"provider": {
+			"ollama": {
+				"models": {
+					"glm-5:cloud": {
+						"name": "glm-5:cloud",
+						"_launch": true
+					}
+				}
+			}
+		}
+	}`), 0o644)
+
+	editor := &OpenCode{}
+	if err := editor.Edit([]string{"glm-5:cloud"}); err != nil {
+		t.Fatal(err)
+	}
+
+	limit, ok := readOpenCodeModel(t, path, "glm-5:cloud")["limit"].(map[string]any)
+	if !ok {
+		t.Fatal("cloud model limit was not added on re-edit")
+	}
+	if got := limit["context"]; got != float64(202_752) {
+		t.Errorf("context = %v, want 202752", got)
+	}
+	if got := limit["output"]; got != float64(131_072) {
+		t.Errorf("output = %v, want 131072", got)
+	}
+}
+
 func TestLookupCloudModelLimit(t *testing.T) {
 	tests := []struct {
 		name        string
@@ -626,13 +714,19 @@ func TestLookupCloudModelLimit(t *testing.T) {
 		wantContext int
 		wantOutput  int
 	}{
-		{"glm-4.7", true, 202_752, 131_072},
+		{"glm-4.7", false, 0, 0},
 		{"glm-4.7:cloud", true, 202_752, 131_072},
-		{"kimi-k2.5", true, 262_144, 262_144},
+		{"glm-5:cloud", true, 202_752, 131_072},
+		{"gpt-oss:120b-cloud", true, 131_072, 131_072},
+		{"gpt-oss:20b-cloud", true, 131_072, 131_072},
+		{"kimi-k2.5", false, 0, 0},
 		{"kimi-k2.5:cloud", true, 262_144, 262_144},
-		{"deepseek-v3.2", true, 163_840, 65_536},
+		{"deepseek-v3.2", false, 0, 0},
 		{"deepseek-v3.2:cloud", true, 163_840, 65_536},
-		{"qwen3-coder:480b", true, 262_144, 65_536},
+		{"qwen3.5", false, 0, 0},
+		{"qwen3.5:cloud", true, 262_144, 32_768},
+		{"qwen3-coder:480b", false, 0, 0},
+		{"qwen3-coder:480b:cloud", true, 262_144, 65_536},
 		{"qwen3-coder-next:cloud", true, 262_144, 32_768},
 		{"llama3.2", false, 0, 0},
 		{"unknown-model:cloud", false, 0, 0},
--- a/cmd/launch/pi.go
+++ b/cmd/launch/pi.go
@@ -1,4 +1,4 @@
-package config
+package launch

 import (
 	"context"
@@ -12,6 +12,7 @@ import (
 	"strings"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/cmd/internal/fileutil"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/types/model"
 )
@@ -26,15 +27,6 @@ func (p *Pi) Run(model string, args []string) error {
 		return fmt.Errorf("pi is not installed, install with: npm install -g @mariozechner/pi-coding-agent")
 	}

-	// Call Edit() to ensure config is up-to-date before launch
-	models := []string{model}
-	if config, err := loadIntegration("pi"); err == nil && len(config.Models) > 0 {
-		models = config.Models
-	}
-	if err := p.Edit(models); err != nil {
-		return fmt.Errorf("setup failed: %w", err)
-	}
-
 	cmd := exec.Command("pi", args...)
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
@@ -107,7 +99,8 @@ func (p *Pi) Edit(models []string) error {

 	// Build new models list:
 	// 1. Keep user-managed models (no _launch marker) - untouched
-	// 2. Keep ollama-managed models (_launch marker) that are still selected
+	// 2. Keep ollama-managed models (_launch marker) that are still selected,
+	//    except stale cloud entries that should be rebuilt below
 	// 3. Add new ollama-managed models
 	var newModels []any
 	for _, m := range existingModels {
@@ -117,7 +110,13 @@ func (p *Pi) Edit(models []string) error {
 				if !isPiOllamaModel(modelObj) {
 					newModels = append(newModels, m)
 				} else if selectedSet[id] {
-					// Ollama-managed and still selected - keep it
+					// Rebuild stale managed cloud entries so createConfig refreshes
+					// the whole entry instead of patching it in place.
+					if !hasContextWindow(modelObj) {
+						if _, ok := lookupCloudModelLimit(id); ok {
+							continue
+						}
+					}
 					newModels = append(newModels, m)
 					selectedSet[id] = false
 				}
@@ -142,7 +141,7 @@ func (p *Pi) Edit(models []string) error {
 	if err != nil {
 		return err
 	}
-	if err := writeWithBackup(configPath, configData); err != nil {
+	if err := fileutil.WriteWithBackup(configPath, configData); err != nil {
 		return err
 	}

@@ -160,7 +159,7 @@ func (p *Pi) Edit(models []string) error {
 	if err != nil {
 		return err
 	}
-	return writeWithBackup(settingsPath, settingsData)
+	return fileutil.WriteWithBackup(settingsPath, settingsData)
 }

 func (p *Pi) Models() []string {
@@ -170,7 +169,7 @@ func (p *Pi) Models() []string {
 	}

 	configPath := filepath.Join(home, ".pi", "agent", "models.json")
-	config, err := readJSONFile(configPath)
+	config, err := fileutil.ReadJSON(configPath)
 	if err != nil {
 		return nil
 	}
@@ -199,15 +198,38 @@ func isPiOllamaModel(cfg map[string]any) bool {
 	return false
 }

+func hasContextWindow(cfg map[string]any) bool {
+	switch v := cfg["contextWindow"].(type) {
+	case float64:
+		return v > 0
+	case int:
+		return v > 0
+	case int64:
+		return v > 0
+	default:
+		return false
+	}
+}
+
 // createConfig builds Pi model config with capability detection
 func createConfig(ctx context.Context, client *api.Client, modelID string) map[string]any {
 	cfg := map[string]any{
 		"id":      modelID,
 		"_launch": true,
 	}
+	if l, ok := lookupCloudModelLimit(modelID); ok {
+		cfg["contextWindow"] = l.Context
+	}
+
+	applyCloudContextFallback := func() {
+		if l, ok := lookupCloudModelLimit(modelID); ok {
+			cfg["contextWindow"] = l.Context
+		}
+	}

 	resp, err := client.Show(ctx, &api.ShowRequest{Model: modelID})
 	if err != nil {
+		applyCloudContextFallback()
 		return cfg
 	}

@@ -223,15 +245,21 @@ func createConfig(ctx context.Context, client *api.Client, modelID string) map[s
 		cfg["reasoning"] = true
 	}

-	// Extract context window from ModelInfo
+	// Extract context window from ModelInfo. For known cloud models, the
+	// pre-filled shared limit remains unless the server provides a positive value.
+	hasContextWindow := false
 	for key, val := range resp.ModelInfo {
 		if strings.HasSuffix(key, ".context_length") {
 			if ctxLen, ok := val.(float64); ok && ctxLen > 0 {
 				cfg["contextWindow"] = int(ctxLen)
+				hasContextWindow = true
 			}
 			break
 		}
 	}
+	if !hasContextWindow {
+		applyCloudContextFallback()
+	}

 	return cfg
 }
--- a/cmd/launch/pi_test.go
+++ b/cmd/launch/pi_test.go
@@ -1,4 +1,4 @@
-package config
+package launch

 import (
 	"context"
@@ -192,6 +192,48 @@ func TestPiEdit(t *testing.T) {
 		}
 	})

+	t.Run("rebuilds stale existing managed cloud model", func(t *testing.T) {
+		cleanup()
+		os.MkdirAll(configDir, 0o755)
+
+		existingConfig := `{
+			"providers": {
+				"ollama": {
+					"baseUrl": "http://localhost:11434/v1",
+					"api": "openai-completions",
+					"apiKey": "ollama",
+					"models": [
+						{"id": "glm-5:cloud", "_launch": true, "legacyField": "stale"}
+					]
+				}
+			}
+		}`
+		if err := os.WriteFile(configPath, []byte(existingConfig), 0o644); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := pi.Edit([]string{"glm-5:cloud"}); err != nil {
+			t.Fatalf("Edit() error = %v", err)
+		}
+
+		cfg := readConfig()
+		providers := cfg["providers"].(map[string]any)
+		ollama := providers["ollama"].(map[string]any)
+		modelsArray := ollama["models"].([]any)
+		modelEntry := modelsArray[0].(map[string]any)
+
+		if modelEntry["contextWindow"] != float64(202_752) {
+			t.Errorf("contextWindow = %v, want 202752", modelEntry["contextWindow"])
+		}
+		input, ok := modelEntry["input"].([]any)
+		if !ok || len(input) != 1 || input[0] != "text" {
+			t.Errorf("input = %v, want [text]", modelEntry["input"])
+		}
+		if _, ok := modelEntry["legacyField"]; ok {
+			t.Error("legacyField should be removed when stale managed cloud entry is rebuilt")
+		}
+	})
+
 	t.Run("replaces old models with new ones", func(t *testing.T) {
 		cleanup()
 		os.MkdirAll(configDir, 0o755)
@@ -798,6 +840,59 @@ func TestCreateConfig(t *testing.T) {
 		}
 	})

+	t.Run("cloud model falls back to hardcoded context when show fails", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.WriteHeader(http.StatusNotFound)
+			fmt.Fprintf(w, `{"error":"model not found"}`)
+		}))
+		defer srv.Close()
+
+		u, _ := url.Parse(srv.URL)
+		client := api.NewClient(u, srv.Client())
+
+		cfg := createConfig(context.Background(), client, "kimi-k2.5:cloud")
+
+		if cfg["contextWindow"] != 262_144 {
+			t.Errorf("contextWindow = %v, want 262144", cfg["contextWindow"])
+		}
+	})
+
+	t.Run("cloud model falls back to hardcoded context when show omits model info", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.URL.Path == "/api/show" {
+				fmt.Fprintf(w, `{"capabilities":[],"model_info":{}}`)
+				return
+			}
+			w.WriteHeader(http.StatusNotFound)
+		}))
+		defer srv.Close()
+
+		u, _ := url.Parse(srv.URL)
+		client := api.NewClient(u, srv.Client())
+
+		cfg := createConfig(context.Background(), client, "glm-5:cloud")
+
+		if cfg["contextWindow"] != 202_752 {
+			t.Errorf("contextWindow = %v, want 202752", cfg["contextWindow"])
+		}
+	})
+
+	t.Run("cloud model with dash suffix falls back to hardcoded context", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.WriteHeader(http.StatusNotFound)
+			fmt.Fprintf(w, `{"error":"model not found"}`)
+		}))
+		defer srv.Close()
+
+		u, _ := url.Parse(srv.URL)
+		client := api.NewClient(u, srv.Client())
+
+		cfg := createConfig(context.Background(), client, "gpt-oss:120b-cloud")
+
+		if cfg["contextWindow"] != 131_072 {
+			t.Errorf("contextWindow = %v, want 131072", cfg["contextWindow"])
+		}
+	})
 	t.Run("skips zero context length", func(t *testing.T) {
 		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			if r.URL.Path == "/api/show" {
--- a/cmd/launch/registry.go
+++ b/cmd/launch/registry.go
@@ -0,0 +1,355 @@
+package launch
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"slices"
+	"strings"
+)
+
+// IntegrationInstallSpec describes how launcher should detect and guide installation.
+// Every field is optional: the code in this file treats a nil func or an empty
+// value as "not provided".
+type IntegrationInstallSpec struct {
+	// CheckInstalled reports whether the integration is present. When nil,
+	// integrationFor treats the integration as installed.
+	CheckInstalled  func() bool
+	// EnsureInstalled performs the installation. A non-nil value marks the
+	// integration as auto-installable.
+	EnsureInstalled func() error
+	// URL points at installation instructions. It takes precedence over
+	// Command when hints and "not installed" errors are built.
+	URL             string
+	// Command is the argv of a manual install command; it is joined with
+	// spaces when shown to the user.
+	Command         []string
+}
+
+// IntegrationSpec is the canonical registry entry for one integration.
+type IntegrationSpec struct {
+	// Name is the canonical name; lookups match it case-insensitively.
+	Name        string
+	// Runner is the implementation returned by LookupIntegration; its
+	// String() value is used as the display name.
+	Runner      Runner
+	// Aliases are alternative names that resolve to this spec. They must not
+	// collide with any canonical name or with another spec's alias.
+	Aliases     []string
+	// Hidden excludes the spec from ListVisibleIntegrationSpecs; hidden specs
+	// may not appear in launcherIntegrationOrder.
+	Hidden      bool
+	// Description is the short text surfaced through IntegrationInfo.
+	Description string
+	// Install holds detection and installation guidance.
+	Install     IntegrationInstallSpec
+}
+
+// IntegrationInfo contains display information about a registered integration.
+// Values are produced by ListIntegrationInfos.
+type IntegrationInfo struct {
+	// Name is the canonical integration name (IntegrationSpec.Name).
+	Name        string
+	// DisplayName is the result of the integration Runner's String method.
+	DisplayName string
+	// Description is copied from IntegrationSpec.Description.
+	Description string
+}
+
+// launcherIntegrationOrder pins the relative order of the listed integrations
+// in ListVisibleIntegrationSpecs. Listed integrations are sorted after every
+// unlisted one. rebuildIntegrationSpecIndexes panics if an entry is duplicated,
+// unknown, an alias, or refers to a hidden integration.
+var launcherIntegrationOrder = []string{"opencode", "droid", "pi", "cline"}
+
+// integrationSpecs is the registry of known integrations. Each entry supplies
+// the canonical name, the Runner, a short description, and install detection
+// or guidance. Lookup indexes over this slice are built by
+// rebuildIntegrationSpecIndexes.
+var integrationSpecs = []*IntegrationSpec{
+	{
+		Name:        "claude",
+		Runner:      &Claude{},
+		Description: "Anthropic's coding tool with subagents",
+		Install: IntegrationInstallSpec{
+			// Detection is delegated to Claude.findPath rather than a plain PATH lookup.
+			CheckInstalled: func() bool {
+				_, err := (&Claude{}).findPath()
+				return err == nil
+			},
+			URL: "https://code.claude.com/docs/en/quickstart",
+		},
+	},
+	{
+		Name:        "cline",
+		Runner:      &Cline{},
+		Description: "Autonomous coding agent with parallel execution",
+		Install: IntegrationInstallSpec{
+			CheckInstalled: func() bool {
+				_, err := exec.LookPath("cline")
+				return err == nil
+			},
+			Command: []string{"npm", "install", "-g", "cline"},
+		},
+	},
+	{
+		Name:        "codex",
+		Runner:      &Codex{},
+		Description: "OpenAI's open-source coding agent",
+		Install: IntegrationInstallSpec{
+			CheckInstalled: func() bool {
+				_, err := exec.LookPath("codex")
+				return err == nil
+			},
+			// Both are set; URL wins when hints and errors are formatted.
+			URL:     "https://developers.openai.com/codex/cli/",
+			Command: []string{"npm", "install", "-g", "@openai/codex"},
+		},
+	},
+	{
+		Name:        "droid",
+		Runner:      &Droid{},
+		Description: "Factory's coding agent across terminal and IDEs",
+		Install: IntegrationInstallSpec{
+			CheckInstalled: func() bool {
+				_, err := exec.LookPath("droid")
+				return err == nil
+			},
+			URL: "https://docs.factory.ai/cli/getting-started/quickstart",
+		},
+	},
+	{
+		Name:        "opencode",
+		Runner:      &OpenCode{},
+		Description: "Anomaly's open-source coding agent",
+		Install: IntegrationInstallSpec{
+			CheckInstalled: func() bool {
+				_, err := exec.LookPath("opencode")
+				return err == nil
+			},
+			URL: "https://opencode.ai",
+		},
+	},
+	{
+		Name:        "openclaw",
+		Runner:      &Openclaw{},
+		Aliases:     []string{"clawdbot", "moltbot"},
+		Description: "Personal AI with 100+ skills",
+		Install: IntegrationInstallSpec{
+			// Either an "openclaw" or a "clawdbot" binary on PATH counts as installed.
+			CheckInstalled: func() bool {
+				if _, err := exec.LookPath("openclaw"); err == nil {
+					return true
+				}
+				if _, err := exec.LookPath("clawdbot"); err == nil {
+					return true
+				}
+				return false
+			},
+			// The only entry with EnsureInstalled, i.e. the only auto-installable one.
+			EnsureInstalled: func() error {
+				_, err := ensureOpenclawInstalled()
+				return err
+			},
+			URL: "https://docs.openclaw.ai",
+		},
+	},
+	{
+		Name:        "pi",
+		Runner:      &Pi{},
+		Description: "Minimal AI agent toolkit with plugin support",
+		Install: IntegrationInstallSpec{
+			CheckInstalled: func() bool {
+				_, err := exec.LookPath("pi")
+				return err == nil
+			},
+			Command: []string{"npm", "install", "-g", "@mariozechner/pi-coding-agent"},
+		},
+	},
+}
+
+// integrationSpecsByName maps lower-cased canonical names and lower-cased
+// aliases to their spec. It is (re)built by rebuildIntegrationSpecIndexes.
+var integrationSpecsByName map[string]*IntegrationSpec
+
+// init builds the lookup index and validates the registry at package load;
+// an invalid registry panics.
+func init() {
+	rebuildIntegrationSpecIndexes()
+}
+
+func hyperlink(url, text string) string {
+	return fmt.Sprintf("\033]8;;%s\033\\%s\033]8;;\033\\", url, text)
+}
+
+// rebuildIntegrationSpecIndexes rebuilds integrationSpecsByName from
+// integrationSpecs and validates the registry. It panics on the first problem
+// found: an empty or duplicate canonical name, an empty alias, an alias that
+// collides with a canonical name or another alias, or a launcherIntegrationOrder
+// entry that is duplicated, unknown, an alias, or hidden.
+func rebuildIntegrationSpecIndexes() {
+	integrationSpecsByName = make(map[string]*IntegrationSpec, len(integrationSpecs))
+
+	// Pass 1: index canonical names (case-insensitively).
+	canonical := make(map[string]bool, len(integrationSpecs))
+	for _, spec := range integrationSpecs {
+		key := strings.ToLower(spec.Name)
+		if key == "" {
+			panic("launch: integration spec missing name")
+		}
+		if canonical[key] {
+			panic(fmt.Sprintf("launch: duplicate integration name %q", key))
+		}
+		canonical[key] = true
+		integrationSpecsByName[key] = spec
+	}
+
+	// Pass 2: index aliases. This runs after all canonical names are known so
+	// that an alias can be checked against every canonical name.
+	seenAliases := make(map[string]string) // alias -> owning spec name
+	for _, spec := range integrationSpecs {
+		for _, alias := range spec.Aliases {
+			key := strings.ToLower(alias)
+			if key == "" {
+				panic(fmt.Sprintf("launch: integration %q has empty alias", spec.Name))
+			}
+			if canonical[key] {
+				panic(fmt.Sprintf("launch: alias %q collides with canonical integration name", key))
+			}
+			if owner, exists := seenAliases[key]; exists {
+				panic(fmt.Sprintf("launch: alias %q collides between %q and %q", key, owner, spec.Name))
+			}
+			seenAliases[key] = spec.Name
+			integrationSpecsByName[key] = spec
+		}
+	}
+
+	// Pass 3: validate the pinned launcher order against the finished index.
+	orderSeen := make(map[string]bool, len(launcherIntegrationOrder))
+	for _, name := range launcherIntegrationOrder {
+		key := strings.ToLower(name)
+		if orderSeen[key] {
+			panic(fmt.Sprintf("launch: duplicate launcher order entry %q", key))
+		}
+		orderSeen[key] = true
+
+		spec, ok := integrationSpecsByName[key]
+		if !ok {
+			panic(fmt.Sprintf("launch: unknown launcher order entry %q", key))
+		}
+		// The index also holds aliases, so a hit whose Name differs from the
+		// key was resolved through an alias. The comparison is case-sensitive,
+		// so a spec Name containing upper-case letters would trip it as well.
+		if spec.Name != key {
+			panic(fmt.Sprintf("launch: launcher order entry %q must use canonical name, not alias", key))
+		}
+		if spec.Hidden {
+			panic(fmt.Sprintf("launch: hidden integration %q cannot appear in launcher order", key))
+		}
+	}
+}
+
+// LookupIntegrationSpec returns the registry spec registered under name, which
+// may be a canonical name or an alias; matching ignores case. An unknown name
+// yields a nil spec and an "unknown integration" error.
+func LookupIntegrationSpec(name string) (*IntegrationSpec, error) {
+	if spec, found := integrationSpecsByName[strings.ToLower(name)]; found {
+		return spec, nil
+	}
+	return nil, fmt.Errorf("unknown integration: %s", name)
+}
+
+// LookupIntegration resolves name (canonical or alias) and returns the spec's
+// canonical name together with its Runner. On failure it returns "", nil and
+// the error from LookupIntegrationSpec.
+func LookupIntegration(name string) (string, Runner, error) {
+	spec, err := LookupIntegrationSpec(name)
+	if err == nil {
+		return spec.Name, spec.Runner, nil
+	}
+	return "", nil, err
+}
+
+// ListVisibleIntegrationSpecs returns copies of the non-hidden canonical
+// integrations that should appear in interactive UIs.
+//
+// Ordering: integrations not named in launcherIntegrationOrder come first,
+// sorted by name; integrations named there follow, in that list's order.
+func ListVisibleIntegrationSpecs() []IntegrationSpec {
+	visible := make([]IntegrationSpec, 0, len(integrationSpecs))
+	for _, spec := range integrationSpecs {
+		if spec.Hidden {
+			continue
+		}
+		visible = append(visible, *spec)
+	}
+
+	// Ranks are 1-based so that the map's zero value means "not pinned".
+	// Keys are lower-cased because rebuildIntegrationSpecIndexes validates the
+	// lower-cased order entry against the canonical name; keying by the raw
+	// entry would let a mixed-case entry pass validation yet never match here.
+	orderRank := make(map[string]int, len(launcherIntegrationOrder))
+	for i, name := range launcherIntegrationOrder {
+		orderRank[strings.ToLower(name)] = i + 1
+	}
+
+	slices.SortFunc(visible, func(a, b IntegrationSpec) int {
+		aRank, bRank := orderRank[a.Name], orderRank[b.Name]
+		if aRank > 0 && bRank > 0 {
+			return aRank - bRank
+		}
+		// Exactly one side is pinned: the pinned one sorts after the other.
+		if aRank > 0 {
+			return 1
+		}
+		if bRank > 0 {
+			return -1
+		}
+		return strings.Compare(a.Name, b.Name)
+	})
+
+	return visible
+}
+
+// ListIntegrationInfos converts the visible integration specs, in the order
+// produced by ListVisibleIntegrationSpecs, into display-oriented
+// IntegrationInfo values.
+func ListIntegrationInfos() []IntegrationInfo {
+	specs := ListVisibleIntegrationSpecs()
+	infos := make([]IntegrationInfo, len(specs))
+	for i := range specs {
+		infos[i] = IntegrationInfo{
+			Name:        specs[i].Name,
+			DisplayName: specs[i].Runner.String(),
+			Description: specs[i].Description,
+		}
+	}
+	return infos
+}
+
+// IntegrationSelectionItems builds one ModelItem per visible integration for
+// launcher selection UIs. The description is the runner's display name; when a
+// stored config with at least one model can be loaded, the first model is
+// appended in parentheses. It returns an error if nothing is visible.
+func IntegrationSelectionItems() ([]ModelItem, error) {
+	specs := ListVisibleIntegrationSpecs()
+	if len(specs) == 0 {
+		return nil, fmt.Errorf("no integrations available")
+	}
+
+	items := make([]ModelItem, 0, len(specs))
+	for _, spec := range specs {
+		item := ModelItem{Name: spec.Name, Description: spec.Runner.String()}
+		// A load error simply leaves the plain display name in place.
+		stored, err := loadStoredIntegrationConfig(spec.Name)
+		if err == nil && len(stored.Models) > 0 {
+			item.Description = fmt.Sprintf("%s (%s)", spec.Runner.String(), stored.Models[0])
+		}
+		items = append(items, item)
+	}
+	return items, nil
+}
+
+// IsIntegrationInstalled reports whether the named integration is installed.
+// An unknown name is reported on stderr and treated as not installed.
+func IsIntegrationInstalled(name string) bool {
+	resolved, err := integrationFor(name)
+	if err == nil {
+		return resolved.installed
+	}
+	fmt.Fprintf(os.Stderr, "Ollama couldn't find integration %q, so it'll show up as not installed.\n", name)
+	return false
+}
+
+// integration is resolved registry metadata used by launcher state and install checks.
+// It combines immutable registry spec data with computed runtime traits.
+// Values are produced by integrationFor.
+type integration struct {
+	// spec is the registry entry the name resolved to.
+	spec            *IntegrationSpec
+	// installed is the result of spec.Install.CheckInstalled, or true when no
+	// check is registered.
+	installed       bool
+	// autoInstallable is true when spec.Install.EnsureInstalled is set.
+	autoInstallable bool
+	// editor is true when spec.Runner implements the Editor interface.
+	editor          bool
+	// installHint is a human-readable install instruction built from the
+	// spec's URL (preferred) or Command; empty when neither is set.
+	installHint     string
+}
+
+// integrationFor looks up name (canonical or alias) and returns its spec
+// together with traits derived at call time: whether it is installed, whether
+// it can be auto-installed, whether its runner is an Editor, and an install
+// hint. An unknown name returns the zero integration and the lookup error.
+func integrationFor(name string) (integration, error) {
+	spec, err := LookupIntegrationSpec(name)
+	if err != nil {
+		return integration{}, err
+	}
+
+	resolved := integration{
+		spec:            spec,
+		installed:       true, // assumed present unless a check says otherwise
+		autoInstallable: spec.Install.EnsureInstalled != nil,
+	}
+	if check := spec.Install.CheckInstalled; check != nil {
+		resolved.installed = check()
+	}
+	_, resolved.editor = spec.Runner.(Editor)
+
+	// The URL is preferred over the command when both are configured.
+	switch install := spec.Install; {
+	case install.URL != "":
+		resolved.installHint = "Install from " + hyperlink(install.URL, install.URL)
+	case len(install.Command) > 0:
+		resolved.installHint = "Install with: " + strings.Join(install.Command, " ")
+	}
+
+	return resolved, nil
+}
+
+// EnsureIntegrationInstalled returns nil when the named integration is already
+// installed, runs its EnsureInstalled hook when it is auto-installable, and
+// otherwise returns a "not installed" error carrying the install URL or
+// command when one is configured. An unknown name also yields a plain
+// "not installed" error that names runner.
+func EnsureIntegrationInstalled(name string, runner Runner) error {
+	resolved, err := integrationFor(name)
+	if err != nil {
+		return fmt.Errorf("%s is not installed", runner)
+	}
+	if resolved.installed {
+		return nil
+	}
+
+	install := resolved.spec.Install
+	if resolved.autoInstallable {
+		return install.EnsureInstalled()
+	}
+
+	// Manual installation: prefer the URL, then the command, then no guidance.
+	if install.URL != "" {
+		return fmt.Errorf("%s is not installed, install from %s", resolved.spec.Name, install.URL)
+	}
+	if len(install.Command) > 0 {
+		return fmt.Errorf("%s is not installed, install with: %s", resolved.spec.Name, strings.Join(install.Command, " "))
+	}
+	return fmt.Errorf("%s is not installed", runner)
+}
--- a/cmd/launch/registry_test_helpers_test.go
+++ b/cmd/launch/registry_test_helpers_test.go
@@ -0,0 +1,21 @@
+package launch
+
+import "strings"
+
+// OverrideIntegration swaps the runner of the registry entry that name resolves
+// to and returns a function that puts the original runner back. When name is
+// not registered, a temporary entry keyed by the lower-cased name is added to
+// the index instead, and the returned function removes it again.
+func OverrideIntegration(name string, runner Runner) func() {
+	if spec, err := LookupIntegrationSpec(name); err == nil {
+		previous := spec.Runner
+		spec.Runner = runner
+		return func() { spec.Runner = previous }
+	}
+
+	key := strings.ToLower(name)
+	integrationSpecsByName[key] = &IntegrationSpec{Name: key, Runner: runner}
+	return func() { delete(integrationSpecsByName, key) }
+}
--- a/cmd/launch/runner_exec_only_test.go
+++ b/cmd/launch/runner_exec_only_test.go
@@ -0,0 +1,68 @@
+package launch
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// TestEditorRunsDoNotRewriteConfig runs each listed runner against a fake
+// binary in an otherwise empty home directory and asserts that Run succeeds
+// without creating the runner's config file.
+func TestEditorRunsDoNotRewriteConfig(t *testing.T) {
+	// under returns a checkPath func that joins the given elements onto home.
+	under := func(elems ...string) func(home string) string {
+		return func(home string) string {
+			return filepath.Join(append([]string{home}, elems...)...)
+		}
+	}
+
+	cases := []struct {
+		name      string
+		binary    string
+		runner    Runner
+		checkPath func(home string) string
+	}{
+		{name: "droid", binary: "droid", runner: &Droid{}, checkPath: under(".factory", "settings.json")},
+		{name: "opencode", binary: "opencode", runner: &OpenCode{}, checkPath: under(".config", "opencode", "opencode.json")},
+		{name: "cline", binary: "cline", runner: &Cline{}, checkPath: under(".cline", "data", "globalState.json")},
+		{name: "pi", binary: "pi", runner: &Pi{}, checkPath: under(".pi", "agent", "models.json")},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			home := t.TempDir()
+			setTestHome(t, home)
+
+			// PATH holds only the fake binary, so that is what Run resolves.
+			binDir := t.TempDir()
+			writeFakeBinary(t, binDir, tc.binary)
+			t.Setenv("PATH", binDir)
+
+			configPath := tc.checkPath(home)
+			if err := tc.runner.Run("llama3.2", nil); err != nil {
+				t.Fatalf("Run returned error: %v", err)
+			}
+
+			_, statErr := os.Stat(configPath)
+			if !os.IsNotExist(statErr) {
+				t.Fatalf("expected Run to leave %s untouched, got err=%v", configPath, statErr)
+			}
+		})
+	}
+}
--- a/cmd/launch/selector_hooks.go
+++ b/cmd/launch/selector_hooks.go
@@ -0,0 +1,103 @@
+package launch
+
+import (
+	"errors"
+	"fmt"
+	"os"
+
+	"golang.org/x/term"
+)
+
+// ANSI escape sequences for terminal formatting.
+// Each is an SGR ("Select Graphic Rendition") sequence: ESC [ <code> m.
+const (
+	ansiBold   = "\033[1m"  // SGR 1: bold
+	ansiReset  = "\033[0m"  // SGR 0: reset all attributes
+	ansiGray   = "\033[37m" // SGR 37 is the standard "white" foreground. NOTE(review): confirm gray is the intended rendering
+	ansiGreen  = "\033[32m" // SGR 32: green foreground
+	ansiYellow = "\033[33m" // SGR 33: yellow foreground
+)
+
+// ErrCancelled is returned when the user cancels a selection.
+var ErrCancelled = errors.New("cancelled")
+
+// errCancelled is kept as an internal alias for existing call sites.
+// It is the same error value as ErrCancelled.
+var errCancelled = ErrCancelled
+
+// DefaultConfirmPrompt provides a TUI-based confirmation prompt.
+// When set, ConfirmPrompt delegates to it instead of using raw terminal I/O.
+// ConfirmPrompt consults it only after the current launch confirm policy has
+// neither auto-approved nor rejected the prompt.
+var DefaultConfirmPrompt func(prompt string) (bool, error)
+
+// SingleSelector is a function type for single item selection.
+// current is the name of the previously selected item to highlight; empty means no pre-selection.
+type SingleSelector func(title string, items []ModelItem, current string) (string, error)
+
+// MultiSelector is a function type for multi item selection.
+// preChecked names the items that start out selected.
+type MultiSelector func(title string, items []ModelItem, preChecked []string) ([]string, error)
+
+// DefaultSingleSelector is the default single-select implementation.
+// It is nil until assigned.
+var DefaultSingleSelector SingleSelector
+
+// DefaultMultiSelector is the default multi-select implementation.
+// It is nil until assigned.
+var DefaultMultiSelector MultiSelector
+
+// DefaultSignIn provides a TUI-based sign-in flow.
+// When set, ensureAuth uses it instead of plain text prompts.
+// Returns the signed-in username or an error.
+var DefaultSignIn func(modelName, signInURL string) (string, error)
+
+// launchConfirmPolicy controls how ConfirmPrompt answers without asking the user.
+type launchConfirmPolicy struct {
+	// yes makes ConfirmPrompt return true immediately; it is checked first.
+	yes               bool
+	// requireYesMessage makes ConfirmPrompt fail with an error telling the
+	// user to re-run with --yes instead of prompting.
+	requireYesMessage bool
+}
+
+// currentLaunchConfirmPolicy is the package-level policy read by ConfirmPrompt.
+// The zero value means "prompt normally". Use withLaunchConfirmPolicy to
+// change it for a scope.
+var currentLaunchConfirmPolicy launchConfirmPolicy
+
+// withLaunchConfirmPolicy installs policy as the current launch confirm policy
+// and returns a function that reinstates the policy that was active before the
+// call. Nested calls are unwound by calling the restore functions in reverse
+// order.
+func withLaunchConfirmPolicy(policy launchConfirmPolicy) func() {
+	previous := currentLaunchConfirmPolicy
+	restore := func() { currentLaunchConfirmPolicy = previous }
+	currentLaunchConfirmPolicy = policy
+	return restore
+}
+
+// ConfirmPrompt is the shared confirmation gate for launch flows (integration
+// edits, missing-model pulls, sign-in prompts, OpenClaw install/security, etc).
+// Behavior is controlled by currentLaunchConfirmPolicy, typically scoped by
+// withLaunchConfirmPolicy in LaunchCmd (e.g. auto-approve with --yes).
+func ConfirmPrompt(prompt string) (bool, error) {
+	if currentLaunchConfirmPolicy.yes {
+		return true, nil
+	}
+	if currentLaunchConfirmPolicy.requireYesMessage {
+		return false, fmt.Errorf("%s requires confirmation; re-run with --yes to continue", prompt)
+	}
+
+	if DefaultConfirmPrompt != nil {
+		return DefaultConfirmPrompt(prompt)
+	}
+
+	fd := int(os.Stdin.Fd())
+	oldState, err := term.MakeRaw(fd)
+	if err != nil {
+		return false, err
+	}
+	defer term.Restore(fd, oldState)
+
+	fmt.Fprintf(os.Stderr, "%s (\033[1my\033[0m/n) ", prompt)
+
+	buf := make([]byte, 1)
+	for {
+		if _, err := os.Stdin.Read(buf); err != nil {
+			return false, err
+		}
+
+		switch buf[0] {
+		case 'Y', 'y', 13:
+			fmt.Fprintf(os.Stderr, "yes\r\n")
+			return true, nil
+		case 'N', 'n', 27, 3:
+			fmt.Fprintf(os.Stderr, "no\r\n")
+			return false, nil
+		}
+	}
+}
--- a/cmd/launch/selector_test.go
+++ b/cmd/launch/selector_test.go
@@ -0,0 +1,76 @@
+package launch
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestErrCancelled checks that the errCancelled alias is set and carries the
+// expected message.
+func TestErrCancelled(t *testing.T) {
+	t.Run("NotNil", func(t *testing.T) {
+		if errCancelled != nil {
+			return
+		}
+		t.Error("errCancelled should not be nil")
+	})
+
+	t.Run("Message", func(t *testing.T) {
+		if got := errCancelled.Error(); got != "cancelled" {
+			t.Errorf("expected 'cancelled', got %q", got)
+		}
+	})
+}
+
+// TestWithLaunchConfirmPolicy_ScopesAndRestores nests two policies and unwinds
+// them one at a time, checking after each step that ConfirmPrompt follows the
+// policy that is currently in effect: auto-accept, then blocked, then the
+// DefaultConfirmPrompt hook.
+func TestWithLaunchConfirmPolicy_ScopesAndRestores(t *testing.T) {
+	// Save and restore the package-level state this test mutates.
+	oldPolicy := currentLaunchConfirmPolicy
+	oldHook := DefaultConfirmPrompt
+	t.Cleanup(func() {
+		currentLaunchConfirmPolicy = oldPolicy
+		DefaultConfirmPrompt = oldHook
+	})
+
+	currentLaunchConfirmPolicy = launchConfirmPolicy{}
+	var hookCalls int
+	DefaultConfirmPrompt = func(prompt string) (bool, error) {
+		hookCalls++
+		return true, nil
+	}
+
+	// Outer policy blocks; inner policy (applied last) auto-approves.
+	restoreOuter := withLaunchConfirmPolicy(launchConfirmPolicy{requireYesMessage: true})
+	restoreInner := withLaunchConfirmPolicy(launchConfirmPolicy{yes: true})
+
+	// Phase 1: inner "yes" policy is active.
+	ok, err := ConfirmPrompt("test prompt")
+	if err != nil {
+		t.Fatalf("expected --yes policy to allow prompt, got error: %v", err)
+	}
+	if !ok {
+		t.Fatal("expected --yes policy to auto-accept prompt")
+	}
+	if hookCalls != 0 {
+		t.Fatalf("expected --yes to skip hook, got %d hook calls", hookCalls)
+	}
+
+	restoreInner()
+
+	// Phase 2: back to the outer blocking policy.
+	_, err = ConfirmPrompt("test prompt")
+	if err == nil {
+		t.Fatal("expected requireYesMessage policy to block prompt")
+	}
+	if !strings.Contains(err.Error(), "re-run with --yes") {
+		t.Fatalf("expected actionable --yes error, got: %v", err)
+	}
+	if hookCalls != 0 {
+		t.Fatalf("expected blocking policy to skip hook, got %d hook calls", hookCalls)
+	}
+
+	restoreOuter()
+
+	// Phase 3: zero-value policy again, so the hook decides.
+	ok, err = ConfirmPrompt("test prompt")
+	if err != nil {
+		t.Fatalf("expected restored default behavior to use hook, got error: %v", err)
+	}
+	if !ok {
+		t.Fatal("expected hook to return true")
+	}
+	if hookCalls != 1 {
+		t.Fatalf("expected one hook call after restore, got %d", hookCalls)
+	}
+}
--- a/cmd/launch/test_config_helpers_test.go
+++ b/cmd/launch/test_config_helpers_test.go
@@ -0,0 +1,82 @@
+package launch
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/ollama/ollama/cmd/config"
+)
+
+// Test-only views of the registry, derived once at package init.
+var (
+	// integrations maps every index key (canonical names and aliases) to its Runner.
+	integrations       map[string]Runner
+	// integrationAliases is the set of lower-cased alias names.
+	integrationAliases map[string]bool
+	// integrationOrder shares its backing array with launcherIntegrationOrder.
+	integrationOrder   = launcherIntegrationOrder
+)
+
+// init snapshots the registry into the maps above. It reads
+// integrationSpecsByName, which registry.go fills in its own init.
+// NOTE(review): relies on registry.go's init running before this one — confirm.
+func init() {
+	integrations = buildTestIntegrations()
+	integrationAliases = buildTestIntegrationAliases()
+}
+
+// buildTestIntegrations returns a map from each lower-cased key of
+// integrationSpecsByName (canonical names and aliases) to that spec's Runner.
+func buildTestIntegrations() map[string]Runner {
+	runners := make(map[string]Runner, len(integrationSpecsByName))
+	for key := range integrationSpecsByName {
+		runners[strings.ToLower(key)] = integrationSpecsByName[key].Runner
+	}
+	return runners
+}
+
+// buildTestIntegrationAliases returns the set of lower-cased aliases declared
+// by any spec in integrationSpecs.
+func buildTestIntegrationAliases() map[string]bool {
+	aliases := map[string]bool{}
+	for i := range integrationSpecs {
+		for _, alias := range integrationSpecs[i].Aliases {
+			aliases[strings.ToLower(alias)] = true
+		}
+	}
+	return aliases
+}
+
+// setTestHome is a thin alias for setLaunchTestHome, marked as a test helper
+// so failures are attributed to the calling test.
+func setTestHome(t *testing.T, dir string) {
+	t.Helper()
+	setLaunchTestHome(t, dir)
+}
+
+// The functions below forward unchanged to the cmd/config package so tests in
+// this package can call them unqualified.
+
+// SaveIntegration forwards to config.SaveIntegration.
+func SaveIntegration(appName string, models []string) error {
+	return config.SaveIntegration(appName, models)
+}
+
+// LoadIntegration forwards to config.LoadIntegration.
+func LoadIntegration(appName string) (*config.IntegrationConfig, error) {
+	return config.LoadIntegration(appName)
+}
+
+// SaveAliases forwards to config.SaveAliases.
+func SaveAliases(appName string, aliases map[string]string) error {
+	return config.SaveAliases(appName, aliases)
+}
+
+// LastModel forwards to config.LastModel.
+func LastModel() string {
+	return config.LastModel()
+}
+
+// SetLastModel forwards to config.SetLastModel.
+func SetLastModel(model string) error {
+	return config.SetLastModel(model)
+}
+
+// LastSelection forwards to config.LastSelection.
+func LastSelection() string {
+	return config.LastSelection()
+}
+
+// SetLastSelection forwards to config.SetLastSelection.
+func SetLastSelection(selection string) error {
+	return config.SetLastSelection(selection)
+}
+
+// IntegrationModel forwards to config.IntegrationModel.
+func IntegrationModel(appName string) string {
+	return config.IntegrationModel(appName)
+}
+
+// IntegrationModels forwards to config.IntegrationModels.
+func IntegrationModels(appName string) []string {
+	return config.IntegrationModels(appName)
+}
+
+// integrationOnboarded forwards to config.MarkIntegrationOnboarded. Despite the
+// predicate-like name it performs the marking and returns that call's error.
+func integrationOnboarded(appName string) error {
+	return config.MarkIntegrationOnboarded(appName)
+}
--- a/cmd/tui/selector.go
+++ b/cmd/tui/selector.go
@@ -7,7 +7,7 @@ import (

 	tea "github.com/charmbracelet/bubbletea"
 	"github.com/charmbracelet/lipgloss"
-	"github.com/ollama/ollama/cmd/config"
+	"github.com/ollama/ollama/cmd/launch"
 )

 var (
@@ -64,8 +64,8 @@ type SelectItem struct {
 	Recommended bool
 }

-// ConvertItems converts config.ModelItem slice to SelectItem slice.
-func ConvertItems(items []config.ModelItem) []SelectItem {
+// ConvertItems converts launch.ModelItem slice to SelectItem slice.
+func ConvertItems(items []launch.ModelItem) []SelectItem {
 	out := make([]SelectItem, len(items))
 	for i, item := range items {
 		out[i] = SelectItem{Name: item.Name, Description: item.Description, Recommended: item.Recommended}
@@ -101,6 +101,16 @@ type selectorModel struct {
 	width        int
 }

+// selectorModelWithCurrent builds a selectorModel whose cursor starts on the
+// item matching current (per cursorForCurrent) and then calls updateScroll so
+// the scroll state reflects that cursor position.
+func selectorModelWithCurrent(title string, items []SelectItem, current string) selectorModel {
+	m := selectorModel{title: title, items: items}
+	m.cursor = cursorForCurrent(items, current)
+	m.updateScroll(m.otherStart())
+	return m
+}
+
 func (m selectorModel) filteredItems() []SelectItem {
 	if m.filter == "" {
 		return m.items
@@ -367,13 +377,24 @@ func (m selectorModel) View() string {

 // cursorForCurrent returns the item index matching current, or 0 if not found.
 func cursorForCurrent(items []SelectItem, current string) int {
-	if current != "" {
-		for i, item := range items {
-			if item.Name == current || strings.HasPrefix(item.Name, current+":") || strings.HasPrefix(current, item.Name+":") {
-				return i
-			}
+	if current == "" {
+		return 0
+	}
+
+	// Prefer exact name matches before tag-prefix fallback so "qwen3.5" does not
+	// incorrectly select "qwen3.5:cloud" (and vice versa) based on list order.
+	for i, item := range items {
+		if item.Name == current {
+			return i
 		}
 	}
+
+	for i, item := range items {
+		if strings.HasPrefix(item.Name, current+":") || strings.HasPrefix(current, item.Name+":") {
+			return i
+		}
+	}
+
 	return 0
 }

@@ -382,11 +403,7 @@ func SelectSingle(title string, items []SelectItem, current string) (string, err
 		return "", fmt.Errorf("no items to select from")
 	}

-	m := selectorModel{
-		title:  title,
-		items:  items,
-		cursor: cursorForCurrent(items, current),
-	}
+	m := selectorModelWithCurrent(title, items, current)

 	p := tea.NewProgram(m)
 	finalModel, err := p.Run()
@@ -523,6 +540,7 @@ func (m *multiSelectorModel) toggleItem() {
 	origIdx := m.itemIndex[item.Name]

 	if m.checked[origIdx] {
+		wasDefault := len(m.checkOrder) > 0 && m.checkOrder[len(m.checkOrder)-1] == origIdx
 		delete(m.checked, origIdx)
 		for i, idx := range m.checkOrder {
 			if idx == origIdx {
@@ -530,6 +548,34 @@ func (m *multiSelectorModel) toggleItem() {
 				break
 			}
 		}
+		if wasDefault {
+			// When removing the default, pick the nearest checked model above it
+			// (or below if none above) so default fallback follows list order.
+			newDefault := -1
+			for i := origIdx - 1; i >= 0; i-- {
+				if m.checked[i] {
+					newDefault = i
+					break
+				}
+			}
+			if newDefault == -1 {
+				for i := origIdx + 1; i < len(m.items); i++ {
+					if m.checked[i] {
+						newDefault = i
+						break
+					}
+				}
+			}
+			if newDefault != -1 {
+				for i, idx := range m.checkOrder {
+					if idx == newDefault {
+						m.checkOrder = append(m.checkOrder[:i], m.checkOrder[i+1:]...)
+						break
+					}
+				}
+				m.checkOrder = append(m.checkOrder, newDefault)
+			}
+		}
 	} else {
 		m.checked[origIdx] = true
 		m.checkOrder = append(m.checkOrder, origIdx)
--- a/cmd/tui/selector_test.go
+++ b/cmd/tui/selector_test.go
@@ -216,6 +216,41 @@ func TestUpdateScroll(t *testing.T) {
 	}
 }

+// TestSelectorModelWithCurrent_ScrollsToCurrentInMoreSection checks that
+// constructing the model with current "other-10" puts the cursor at index 11,
+// moves scrollOffset off zero, and renders that item with the highlight marker.
+func TestSelectorModelWithCurrent_ScrollsToCurrentInMoreSection(t *testing.T) {
+	m := selectorModelWithCurrent("Pick:", mixedItems(), "other-10")
+
+	if m.cursor != 11 {
+		t.Fatalf("cursor = %d, want 11", m.cursor)
+	}
+	if m.scrollOffset == 0 {
+		t.Fatal("scrollOffset should move to reveal current item in More section")
+	}
+
+	content := m.renderContent()
+	if !strings.Contains(content, "▸ other-10") {
+		t.Fatalf("expected current item to be visible and highlighted\n%s", content)
+	}
+}
+
+// TestSelectorModelWithCurrent_HighlightsExactLocalWhenCloudVariantExists checks
+// that current "qwen3.5" selects the exact-name item even though the
+// tag-suffixed "qwen3.5:cloud" appears earlier in the list.
+func TestSelectorModelWithCurrent_HighlightsExactLocalWhenCloudVariantExists(t *testing.T) {
+	m := selectorModelWithCurrent("Pick:", []SelectItem{
+		{Name: "qwen3.5:cloud", Recommended: true},
+		{Name: "qwen3.5", Recommended: true},
+	}, "qwen3.5")
+
+	if m.cursor != 1 {
+		t.Fatalf("cursor = %d, want 1", m.cursor)
+	}
+
+	content := m.renderContent()
+	if !strings.Contains(content, "▸ qwen3.5") {
+		t.Fatalf("expected local qwen3.5 to be highlighted\n%s", content)
+	}
+	// "▸ qwen3.5" is a prefix of "▸ qwen3.5:cloud", so the check above alone
+	// would also pass if the cloud item were highlighted.
+	if strings.Contains(content, "▸ qwen3.5:cloud") {
+		t.Fatalf("did not expect cloud qwen3.5:cloud to be highlighted\n%s", content)
+	}
+}
+
 func TestRenderContent_SectionHeaders(t *testing.T) {
 	m := selectorModel{
 		title: "Pick:",
@@ -418,6 +453,28 @@ func TestCursorForCurrent(t *testing.T) {
 	}
 }

+// TestCursorForCurrent_PrefersExactLocalOverCloudPrefix checks that an exact
+// match at index 1 wins over an earlier item that only matches by "name:" prefix.
+func TestCursorForCurrent_PrefersExactLocalOverCloudPrefix(t *testing.T) {
+	testItems := []SelectItem{
+		{Name: "qwen3.5:cloud", Recommended: true},
+		{Name: "qwen3.5", Recommended: true},
+	}
+
+	if got := cursorForCurrent(testItems, "qwen3.5"); got != 1 {
+		t.Errorf("cursorForCurrent(%q) = %d, want %d", "qwen3.5", got, 1)
+	}
+}
+
+// TestCursorForCurrent_PrefersExactCloudOverLocalPrefix is the mirror case: the
+// exact "qwen3.5:cloud" item at index 1 wins over the earlier untagged
+// "qwen3.5", which would match only through the prefix fallback.
+func TestCursorForCurrent_PrefersExactCloudOverLocalPrefix(t *testing.T) {
+	testItems := []SelectItem{
+		{Name: "qwen3.5", Recommended: true},
+		{Name: "qwen3.5:cloud", Recommended: true},
+	}
+
+	if got := cursorForCurrent(testItems, "qwen3.5:cloud"); got != 1 {
+		t.Errorf("cursorForCurrent(%q) = %d, want %d", "qwen3.5:cloud", got, 1)
+	}
+}
+
 // --- ReorderItems ---

 func TestReorderItems(t *testing.T) {
@@ -783,6 +840,34 @@ func TestMulti_LastCheckedIsDefault(t *testing.T) {
 	}
 }

+// TestMulti_UncheckingDefaultFallsBackToNearestCheckedAbove toggles off the
+// item under the cursor ("b") and expects the last entry of checkOrder, which
+// the test treats as the default, to become "a".
+func TestMulti_UncheckingDefaultFallsBackToNearestCheckedAbove(t *testing.T) {
+	// Default is "b", and checked models are "a", "b", "c".
+	// Unticking default should make "a" (the nearest checked item above) default.
+	m := newMultiSelectorModel("Pick:", items("a", "b", "c"), []string{"b", "c", "a"})
+	m.multi = true
+	m.cursor = 1 // "b"
+	m.toggleItem()
+
+	lastIdx := m.checkOrder[len(m.checkOrder)-1]
+	if m.items[lastIdx].Name != "a" {
+		t.Fatalf("expected default to fall back to 'a', got %q", m.items[lastIdx].Name)
+	}
+}
+
+// TestMulti_UncheckingTopDefaultFallsBackToNearestCheckedBelow toggles off the
+// first item ("a") and expects the last entry of checkOrder, which the test
+// treats as the default, to become "b".
+func TestMulti_UncheckingTopDefaultFallsBackToNearestCheckedBelow(t *testing.T) {
+	// Default is top item "a". With no checked item above, fallback should pick
+	// the nearest checked item below ("b").
+	m := newMultiSelectorModel("Pick:", items("a", "b", "c"), []string{"a", "c", "b"})
+	m.multi = true
+	m.cursor = 0 // "a"
+	m.toggleItem()
+
+	lastIdx := m.checkOrder[len(m.checkOrder)-1]
+	if m.items[lastIdx].Name != "b" {
+		t.Fatalf("expected default to fall back to 'b', got %q", m.items[lastIdx].Name)
+	}
+}
+
 // Key message helpers for testing

 type keyType = int
--- a/cmd/tui/signin.go
+++ b/cmd/tui/signin.go
@@ -1,15 +1,24 @@
 package tui

 import (
+	"context"
 	"fmt"
 	"strings"
 	"time"

 	tea "github.com/charmbracelet/bubbletea"
 	"github.com/charmbracelet/lipgloss"
-	"github.com/ollama/ollama/cmd/config"
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/cmd/launch"
 )

+// signInTickMsg is an empty message type for the sign-in model.
+// NOTE(review): presumably drives the spinner/poll timer — confirm at the use site.
+type signInTickMsg struct{}
+
+// signInCheckMsg carries the outcome of one checkSignIn call.
+type signInCheckMsg struct {
+	// signedIn is true when Whoami returned a user with a non-empty name.
+	signedIn bool
+	// userName is that user's name; empty when signedIn is false.
+	userName string
+}
+
 type signInModel struct {
 	modelName string
 	signInURL string
@@ -88,11 +97,8 @@ func renderSignIn(modelName, signInURL string, spinner, width int) string {

 	fmt.Fprintf(&s, "To use %s, please sign in.\n\n", selectorSelectedItemStyle.Render(modelName))

-	// Wrap in OSC 8 hyperlink so the entire URL is clickable even when wrapped.
-	// Padding is outside the hyperlink so spaces don't get underlined.
-	link := fmt.Sprintf("\033]8;;%s\033\\%s\033]8;;\033\\", signInURL, urlColor.Render(signInURL))
 	s.WriteString("Navigate to:\n")
-	s.WriteString(urlWrap.Render(link))
+	s.WriteString(urlWrap.Render(urlColor.Render(signInURL)))
 	s.WriteString("\n\n")

 	s.WriteString(lipgloss.NewStyle().Foreground(lipgloss.AdaptiveColor{Light: "242", Dark: "246"}).Render(
@@ -104,9 +110,21 @@ func renderSignIn(modelName, signInURL string, spinner, width int) string {
 	return lipgloss.NewStyle().PaddingLeft(2).Render(s.String())
 }

+// checkSignIn asks the server who the current user is and reports the result
+// as a signInCheckMsg. Any failure (no client, Whoami error, nil user, or an
+// empty name) is reported as "not signed in"; no error is surfaced.
+func checkSignIn() tea.Msg {
+	notSignedIn := signInCheckMsg{signedIn: false}
+
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return notSignedIn
+	}
+
+	user, err := client.Whoami(context.Background())
+	if err != nil || user == nil || user.Name == "" {
+		return notSignedIn
+	}
+	return signInCheckMsg{signedIn: true, userName: user.Name}
+}
+
 // RunSignIn shows a bubbletea sign-in dialog and polls until the user signs in or cancels.
 func RunSignIn(modelName, signInURL string) (string, error) {
-	config.OpenBrowser(signInURL)
+	launch.OpenBrowser(signInURL)

 	m := signInModel{
 		modelName: modelName,
--- a/cmd/tui/signin_test.go
+++ b/cmd/tui/signin_test.go
@@ -25,22 +25,6 @@ func TestRenderSignIn_ContainsURL(t *testing.T) {
 	}
 }

-func TestRenderSignIn_OSC8Hyperlink(t *testing.T) {
-	url := "https://ollama.com/connect?key=abc123"
-	got := renderSignIn("test:cloud", url, 0, 120)
-
-	// Should contain OSC 8 open sequence with the URL
-	osc8Open := "\033]8;;" + url + "\033\\"
-	if !strings.Contains(got, osc8Open) {
-		t.Error("should contain OSC 8 open sequence with URL")
-	}
-
-	// Should contain OSC 8 close sequence
-	osc8Close := "\033]8;;\033\\"
-	if !strings.Contains(got, osc8Close) {
-		t.Error("should contain OSC 8 close sequence")
-	}
-}

 func TestRenderSignIn_ContainsSpinner(t *testing.T) {
 	got := renderSignIn("test:cloud", "https://example.com", 0, 80)
--- a/cmd/tui/tui.go
+++ b/cmd/tui/tui.go
@@ -1,16 +1,11 @@
 package tui

 import (
-	"context"
-	"errors"
 	"fmt"
-	"strings"
-	"time"

 	tea "github.com/charmbracelet/bubbletea"
 	"github.com/charmbracelet/lipgloss"
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/cmd/config"
+	"github.com/ollama/ollama/cmd/launch"
 	"github.com/ollama/ollama/version"
 )

@@ -45,7 +40,7 @@ var (
 type menuItem struct {
 	title       string
 	description string
-	integration string // integration name for loading model config, empty if not an integration
+	integration string
 	isRunModel  bool
 	isOthers    bool
 }
@@ -57,18 +52,12 @@ var mainMenuItems = []menuItem{
 		isRunModel:  true,
 	},
 	{
-		title:       "Launch Claude Code",
-		description: "Agentic coding across large codebases",
 		integration: "claude",
 	},
 	{
-		title:       "Launch Codex",
-		description: "OpenAI's open-source coding agent",
 		integration: "codex",
 	},
 	{
-		title:       "Launch OpenClaw",
-		description: "Personal AI with 100+ skills",
 		integration: "openclaw",
 	},
 }
@@ -79,277 +68,106 @@ var othersMenuItem = menuItem{
 	isOthers:    true,
 }

-// getOtherIntegrations dynamically builds the "Others" list from the integration
-// registry, excluding any integrations already present in the pinned mainMenuItems.
-func getOtherIntegrations() []menuItem {
-	pinned := map[string]bool{
-		"run": true, // not an integration but in the pinned list
+// model is the Bubble Tea model for the top-level launcher menu.
+type model struct {
+	// state is the launcher state the menu rows are built from.
+	state      *launch.LauncherState
+	// items are the rows currently displayed, in display order.
+	items      []menuItem
+	// cursor is the index into items of the highlighted row.
+	cursor     int
+	// showOthers reports whether the overflow integrations are listed in
+	// place of the collapsed othersMenuItem row.
+	showOthers bool
+	// width is the terminal width from tea.WindowSizeMsg; View only applies
+	// a MaxWidth when it is greater than zero.
+	width      int
+	// quitting is set just before tea.Quit is returned; View renders an
+	// empty string once it is true.
+	quitting   bool
+	// selected is true when the user chose an action; RunMenu returns
+	// TUIActionNone when it is false.
+	selected   bool
+	// action is the choice RunMenu returns when selected is true.
+	action     TUIAction
+}
+
+// newModel builds the initial menu model for the given launcher state.
+//
+// It decides whether the overflow ("others") section starts expanded, builds
+// the visible rows, and places the cursor on the row matching the last
+// selection.
+//
+// A nil state is replaced with an empty LauncherState. shouldExpandOthers and
+// initialCursor tolerate nil on their own, but buildMenuItems and the render
+// and selection helpers dereference the state directly, so normalizing here
+// keeps every later access safe.
+func newModel(state *launch.LauncherState) model {
+	if state == nil {
+		state = &launch.LauncherState{}
+	}
+
+	m := model{
+		state: state,
+	}
+	m.showOthers = shouldExpandOthers(state)
+	m.items = buildMenuItems(state, m.showOthers)
+	m.cursor = initialCursor(state, m.items)
+	return m
+}
+
+// shouldExpandOthers reports whether the overflow section must start
+// expanded, which is the case when the last selection is one of the
+// integrations returned by otherIntegrationItems. A nil state never expands.
+func shouldExpandOthers(state *launch.LauncherState) bool {
+	if state == nil {
+		return false
+	}
+
+	overflow := otherIntegrationItems(state)
+	for idx := range overflow {
+		if overflow[idx].integration == state.LastSelection {
+			return true
+		}
+	}
+	return false
+}
+
+func buildMenuItems(state *launch.LauncherState, showOthers bool) []menuItem {
+	items := make([]menuItem, 0, len(mainMenuItems)+1)
 	for _, item := range mainMenuItems {
-		if item.integration != "" {
-			pinned[item.integration] = true
+		if item.integration == "" {
+			items = append(items, item)
+			continue
+		}
+		if integrationState, ok := state.Integrations[item.integration]; ok {
+			items = append(items, integrationMenuItem(integrationState))
 		}
 	}

-	var others []menuItem
-	for _, info := range config.ListIntegrationInfos() {
+	if showOthers {
+		items = append(items, otherIntegrationItems(state)...)
+	} else {
+		items = append(items, othersMenuItem)
+	}
+
+	return items
+}
+
+// integrationMenuItem converts one integration's launcher state into a menu
+// row. The title is "Launch <DisplayName>"; when the integration has no
+// description, a generic "Open <DisplayName> integration" line is used.
+func integrationMenuItem(state launch.LauncherIntegrationState) menuItem {
+	row := menuItem{
+		title:       "Launch " + state.DisplayName,
+		description: state.Description,
+		integration: state.Name,
+	}
+	if row.description == "" {
+		row.description = "Open " + state.DisplayName + " integration"
+	}
+	return row
+}
+
+func otherIntegrationItems(state *launch.LauncherState) []menuItem {
+	pinned := map[string]bool{
+		"claude":   true,
+		"codex":    true,
+		"openclaw": true,
+	}
+
+	var items []menuItem
+	for _, info := range launch.ListIntegrationInfos() {
 		if pinned[info.Name] {
 			continue
 		}
-		desc := info.Description
-		if desc == "" {
-			desc = "Open " + info.DisplayName + " integration"
-		}
-		others = append(others, menuItem{
-			title:       "Launch " + info.DisplayName,
-			description: desc,
-			integration: info.Name,
-		})
-	}
-	return others
-}
-
-type model struct {
-	items           []menuItem
-	cursor          int
-	quitting        bool
-	selected        bool
-	changeModel     bool
-	changeModels    []string // multi-select result for Editor integrations
-	showOthers      bool
-	availableModels map[string]bool
-	err             error
-
-	showingModal  bool
-	modalSelector selectorModel
-	modalItems    []SelectItem
-
-	showingMultiModal  bool
-	multiModalSelector multiSelectorModel
-
-	showingSignIn   bool
-	signInURL       string
-	signInModel     string
-	signInSpinner   int
-	signInFromModal bool // true if sign-in was triggered from modal (not main menu)
-
-	width     int    // terminal width from WindowSizeMsg
-	statusMsg string // temporary status message shown near help text
-}
-
-type signInTickMsg struct{}
-
-type signInCheckMsg struct {
-	signedIn bool
-	userName string
-}
-
-type clearStatusMsg struct{}
-
-func (m *model) modelExists(name string) bool {
-	if m.availableModels == nil || name == "" {
-		return false
-	}
-	if m.availableModels[name] {
-		return true
-	}
-	// Check for prefix match (e.g., "llama2" matches "llama2:latest")
-	for modelName := range m.availableModels {
-		if strings.HasPrefix(modelName, name+":") {
-			return true
-		}
-	}
-	return false
-}
-
-func (m *model) buildModalItems() []SelectItem {
-	modelItems, _ := config.GetModelItems(context.Background())
-	return ReorderItems(ConvertItems(modelItems))
-}
-
-func (m *model) openModelModal(currentModel string) {
-	m.modalItems = m.buildModalItems()
-	cursor := 0
-	if currentModel != "" {
-		for i, item := range m.modalItems {
-			if item.Name == currentModel || strings.HasPrefix(item.Name, currentModel+":") || strings.HasPrefix(currentModel, item.Name+":") {
-				cursor = i
-				break
-			}
-		}
-	}
-	m.modalSelector = selectorModel{
-		title:    "Select model:",
-		items:    m.modalItems,
-		cursor:   cursor,
-		helpText: "↑/↓ navigate • enter select • ← back",
-	}
-	m.modalSelector.updateScroll(m.modalSelector.otherStart())
-	m.showingModal = true
-}
-
-func (m *model) openMultiModelModal(integration string) {
-	items := m.buildModalItems()
-	var preChecked []string
-	if models := config.IntegrationModels(integration); len(models) > 0 {
-		preChecked = models
-	}
-	m.multiModalSelector = newMultiSelectorModel("Select models:", items, preChecked)
-	// Set cursor to the first pre-checked (last used) model
-	if len(preChecked) > 0 {
-		for i, item := range items {
-			if item.Name == preChecked[0] {
-				m.multiModalSelector.cursor = i
-				m.multiModalSelector.updateScroll(m.multiModalSelector.otherStart())
-				break
-			}
-		}
-	}
-	m.showingMultiModal = true
-}
-
-func isCloudModel(name string) bool {
-	return strings.HasSuffix(name, ":cloud") || strings.HasSuffix(name, "-cloud")
-}
-
-func cloudStatusDisabled(client *api.Client) bool {
-	status, err := client.CloudStatusExperimental(context.Background())
-	if err != nil {
-		return false
-	}
-	return status.Cloud.Disabled
-}
-
-func cloudModelDisabled(name string) bool {
-	if !isCloudModel(name) {
-		return false
-	}
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return false
-	}
-	return cloudStatusDisabled(client)
-}
-
-// checkCloudSignIn checks if a cloud model needs sign-in.
-// Returns a command to start sign-in if needed, or nil if already signed in.
-func (m *model) checkCloudSignIn(modelName string, fromModal bool) tea.Cmd {
-	if modelName == "" || !isCloudModel(modelName) {
-		return nil
-	}
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return nil
-	}
-	if cloudStatusDisabled(client) {
-		return nil
-	}
-	user, err := client.Whoami(context.Background())
-	if err == nil && user != nil && user.Name != "" {
-		return nil
-	}
-	var aErr api.AuthorizationError
-	if errors.As(err, &aErr) && aErr.SigninURL != "" {
-		return m.startSignIn(modelName, aErr.SigninURL, fromModal)
-	}
-	return nil
-}
-
-// startSignIn initiates the sign-in flow for a cloud model.
-// fromModal indicates if this was triggered from the model picker modal.
-func (m *model) startSignIn(modelName, signInURL string, fromModal bool) tea.Cmd {
-	m.showingModal = false
-	m.showingSignIn = true
-	m.signInURL = signInURL
-	m.signInModel = modelName
-	m.signInSpinner = 0
-	m.signInFromModal = fromModal
-
-	config.OpenBrowser(signInURL)
-
-	return tea.Tick(200*time.Millisecond, func(t time.Time) tea.Msg {
-		return signInTickMsg{}
-	})
-}
-
-func checkSignIn() tea.Msg {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return signInCheckMsg{signedIn: false}
-	}
-	user, err := client.Whoami(context.Background())
-	if err == nil && user != nil && user.Name != "" {
-		return signInCheckMsg{signedIn: true, userName: user.Name}
-	}
-	return signInCheckMsg{signedIn: false}
-}
-
-func (m *model) loadAvailableModels() {
-	m.availableModels = make(map[string]bool)
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return
-	}
-	models, err := client.List(context.Background())
-	if err != nil {
-		return
-	}
-	cloudDisabled := cloudStatusDisabled(client)
-	for _, mdl := range models.Models {
-		if cloudDisabled && mdl.RemoteModel != "" {
+		integrationState, ok := state.Integrations[info.Name]
+		if !ok {
 			continue
 		}
-		m.availableModels[mdl.Name] = true
+		items = append(items, integrationMenuItem(integrationState))
 	}
+	return items
 }

-func (m *model) buildItems() {
-	others := getOtherIntegrations()
-	m.items = make([]menuItem, 0, len(mainMenuItems)+1+len(others))
-	m.items = append(m.items, mainMenuItems...)
-
-	if m.showOthers {
-		m.items = append(m.items, others...)
-	} else {
-		m.items = append(m.items, othersMenuItem)
+func initialCursor(state *launch.LauncherState, items []menuItem) int {
+	if state == nil || state.LastSelection == "" {
+		return 0
 	}
-}
-
-func isOthersIntegration(name string) bool {
-	for _, item := range getOtherIntegrations() {
-		if item.integration == name {
-			return true
+	for i, item := range items {
+		if state.LastSelection == "run" && item.isRunModel {
+			return i
+		}
+		if item.integration == state.LastSelection {
+			return i
 		}
 	}
-	return false
-}
-
-func initialModel() model {
-	m := model{
-		cursor: 0,
-	}
-	m.loadAvailableModels()
-
-	lastSelection := config.LastSelection()
-	if isOthersIntegration(lastSelection) {
-		m.showOthers = true
-	}
-
-	m.buildItems()
-
-	if lastSelection != "" {
-		for i, item := range m.items {
-			if lastSelection == "run" && item.isRunModel {
-				m.cursor = i
-				break
-			} else if item.integration == lastSelection {
-				m.cursor = i
-				break
-			}
-		}
-	}
-
-	return m
+	return 0
 }

 func (m model) Init() tea.Cmd {
@@ -357,143 +175,11 @@ func (m model) Init() tea.Cmd {
 }

 func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
-	if wmsg, ok := msg.(tea.WindowSizeMsg); ok {
-		wasSet := m.width > 0
-		m.width = wmsg.Width
-		if wasSet {
-			return m, tea.EnterAltScreen
-		}
-		return m, nil
-	}
-
-	if _, ok := msg.(clearStatusMsg); ok {
-		m.statusMsg = ""
-		return m, nil
-	}
-
-	if m.showingSignIn {
-		switch msg := msg.(type) {
-		case tea.KeyMsg:
-			switch msg.Type {
-			case tea.KeyCtrlC, tea.KeyEsc:
-				m.showingSignIn = false
-				if m.signInFromModal {
-					m.showingModal = true
-				}
-				return m, nil
-			}
-
-		case signInTickMsg:
-			m.signInSpinner++
-			// Check sign-in status every 5th tick (~1 second)
-			if m.signInSpinner%5 == 0 {
-				return m, tea.Batch(
-					tea.Tick(200*time.Millisecond, func(t time.Time) tea.Msg {
-						return signInTickMsg{}
-					}),
-					checkSignIn,
-				)
-			}
-			return m, tea.Tick(200*time.Millisecond, func(t time.Time) tea.Msg {
-				return signInTickMsg{}
-			})
-
-		case signInCheckMsg:
-			if msg.signedIn {
-				if m.signInFromModal {
-					m.modalSelector.selected = m.signInModel
-					m.changeModel = true
-				} else {
-					m.selected = true
-				}
-				m.quitting = true
-				return m, tea.Quit
-			}
-		}
-		return m, nil
-	}
-
-	if m.showingMultiModal {
-		switch msg := msg.(type) {
-		case tea.KeyMsg:
-			if msg.Type == tea.KeyLeft {
-				m.showingMultiModal = false
-				return m, nil
-			}
-			updated, cmd := m.multiModalSelector.Update(msg)
-			m.multiModalSelector = updated.(multiSelectorModel)
-
-			if m.multiModalSelector.cancelled {
-				m.showingMultiModal = false
-				return m, nil
-			}
-			if m.multiModalSelector.confirmed {
-				var selected []string
-				if m.multiModalSelector.singleAdd != "" {
-					// Single-add mode: prepend picked model, keep existing deduped
-					selected = []string{m.multiModalSelector.singleAdd}
-					for _, name := range config.IntegrationModels(m.items[m.cursor].integration) {
-						if name != m.multiModalSelector.singleAdd {
-							selected = append(selected, name)
-						}
-					}
-				} else {
-					// Last checked is default (first in result)
-					co := m.multiModalSelector.checkOrder
-					last := co[len(co)-1]
-					selected = []string{m.multiModalSelector.items[last].Name}
-					for _, idx := range co {
-						if idx != last {
-							selected = append(selected, m.multiModalSelector.items[idx].Name)
-						}
-					}
-				}
-				if len(selected) > 0 {
-					m.changeModels = selected
-					m.changeModel = true
-					m.quitting = true
-					return m, tea.Quit
-				}
-				m.multiModalSelector.confirmed = false
-				return m, nil
-			}
-			return m, cmd
-		}
-		return m, nil
-	}
-
-	if m.showingModal {
-		switch msg := msg.(type) {
-		case tea.KeyMsg:
-			switch msg.Type {
-			case tea.KeyCtrlC, tea.KeyEsc, tea.KeyLeft:
-				m.showingModal = false
-				return m, nil
-
-			case tea.KeyEnter:
-				filtered := m.modalSelector.filteredItems()
-				if len(filtered) > 0 && m.modalSelector.cursor < len(filtered) {
-					m.modalSelector.selected = filtered[m.modalSelector.cursor].Name
-				}
-				if m.modalSelector.selected != "" {
-					if cmd := m.checkCloudSignIn(m.modalSelector.selected, true); cmd != nil {
-						return m, cmd
-					}
-					m.changeModel = true
-					m.quitting = true
-					return m, tea.Quit
-				}
-				return m, nil
-
-			default:
-				// Delegate navigation (up/down/pgup/pgdown/filter/backspace) to selectorModel
-				m.modalSelector.updateNavigation(msg)
-			}
-		}
-		return m, nil
-	}
-
 	switch msg := msg.(type) {
+	case tea.WindowSizeMsg:
+		m.width = msg.Width
+		return m, nil
+
 	case tea.KeyMsg:
 		switch msg.String() {
 		case "ctrl+c", "q", "esc":
@@ -504,162 +190,78 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
 			if m.cursor > 0 {
 				m.cursor--
 			}
-			// Auto-collapse "Others" when cursor moves back into pinned items
 			if m.showOthers && m.cursor < len(mainMenuItems) {
 				m.showOthers = false
-				m.buildItems()
+				m.items = buildMenuItems(m.state, false)
+				m.cursor = min(m.cursor, len(m.items)-1)
 			}
+			return m, nil

 		case "down", "j":
 			if m.cursor < len(m.items)-1 {
 				m.cursor++
 			}
-			// Auto-expand "Others..." when cursor lands on it
 			if m.cursor < len(m.items) && m.items[m.cursor].isOthers && !m.showOthers {
 				m.showOthers = true
-				m.buildItems()
-				// cursor now points at the first "other" integration
+				m.items = buildMenuItems(m.state, true)
 			}
+			return m, nil

 		case "enter", " ":
-			item := m.items[m.cursor]
-
-			if item.integration != "" && !config.IsIntegrationInstalled(item.integration) && !config.AutoInstallable(item.integration) {
-				return m, nil
+			if m.selectableItem(m.items[m.cursor]) {
+				m.selected = true
+				m.action = actionForMenuItem(m.items[m.cursor], false)
+				m.quitting = true
+				return m, tea.Quit
 			}
-
-			var configuredModel string
-			if item.isRunModel {
-				configuredModel = config.LastModel()
-			} else if item.integration != "" {
-				configuredModel = config.IntegrationModel(item.integration)
-			}
-			if cmd := m.checkCloudSignIn(configuredModel, false); cmd != nil {
-				return m, cmd
-			}
-
-			if configuredModel != "" && isCloudModel(configuredModel) && cloudModelDisabled(configuredModel) {
-				if item.integration != "" && config.IsEditorIntegration(item.integration) {
-					m.openMultiModelModal(item.integration)
-				} else {
-					m.openModelModal(configuredModel)
-				}
-				return m, nil
-			}
-
-			m.selected = true
-			m.quitting = true
-			return m, tea.Quit
+			return m, nil

 		case "right", "l":
 			item := m.items[m.cursor]
-			if item.integration != "" || item.isRunModel {
-				if item.integration != "" && !config.IsIntegrationInstalled(item.integration) {
-					if config.AutoInstallable(item.integration) {
-						// Auto-installable: select to trigger install flow
-						m.selected = true
-						m.quitting = true
-						return m, tea.Quit
-					}
-					return m, nil
-				}
-				if item.integration != "" && config.IsEditorIntegration(item.integration) {
-					m.openMultiModelModal(item.integration)
-				} else {
-					var currentModel string
-					if item.isRunModel {
-						currentModel = config.LastModel()
-					} else if item.integration != "" {
-						currentModel = config.IntegrationModel(item.integration)
-					}
-					m.openModelModal(currentModel)
-				}
+			if item.isRunModel || m.changeableItem(item) {
+				m.selected = true
+				m.action = actionForMenuItem(item, true)
+				m.quitting = true
+				return m, tea.Quit
 			}
+			return m, nil
 		}
 	}

 	return m, nil
 }

+// selectableItem reports whether pressing enter on item should produce an
+// action. The run-model row is always selectable; the collapsed others row
+// and rows without an integration never are; an integration row is
+// selectable only when its launcher state exists and is marked Selectable.
+func (m model) selectableItem(item menuItem) bool {
+	switch {
+	case item.isRunModel:
+		return true
+	case item.integration == "", item.isOthers:
+		return false
+	}
+
+	if entry, found := m.state.Integrations[item.integration]; found {
+		return entry.Selectable
+	}
+	return false
+}
+
+// changeableItem reports whether item is an integration row whose launcher
+// state exists and is marked Changeable. Rows without an integration and the
+// others row are never changeable.
+func (m model) changeableItem(item menuItem) bool {
+	if item.isOthers || item.integration == "" {
+		return false
+	}
+
+	entry, found := m.state.Integrations[item.integration]
+	if !found {
+		return false
+	}
+	return entry.Changeable
+}
+
 func (m model) View() string {
 	if m.quitting {
 		return ""
 	}

-	if m.showingSignIn {
-		return m.renderSignInDialog()
-	}
-
-	if m.showingMultiModal {
-		return m.multiModalSelector.View()
-	}
-
-	if m.showingModal {
-		return m.renderModal()
-	}
-
 	s := selectorTitleStyle.Render("Ollama "+versionStyle.Render(version.Version)) + "\n\n"

 	for i, item := range m.items {
-		cursor := ""
-		style := menuItemStyle
-		isInstalled := true
-
-		if item.integration != "" {
-			isInstalled = config.IsIntegrationInstalled(item.integration)
-		}
-
-		if m.cursor == i {
-			cursor = "▸ "
-			if isInstalled {
-				style = menuSelectedItemStyle
-			} else {
-				style = greyedSelectedStyle
-			}
-		} else if !isInstalled && item.integration != "" {
-			style = greyedStyle
-		}
-
-		title := item.title
-		var modelSuffix string
-		if item.integration != "" {
-			if !isInstalled {
-				if config.AutoInstallable(item.integration) {
-					title += " " + notInstalledStyle.Render("(install)")
-				} else {
-					title += " " + notInstalledStyle.Render("(not installed)")
-				}
-			} else if m.cursor == i {
-				if mdl := config.IntegrationModel(item.integration); mdl != "" && m.modelExists(mdl) {
-					modelSuffix = " " + modelStyle.Render("("+mdl+")")
-				}
-			}
-		} else if item.isRunModel && m.cursor == i {
-			if mdl := config.LastModel(); mdl != "" && m.modelExists(mdl) {
-				modelSuffix = " " + modelStyle.Render("("+mdl+")")
-			}
-		}
-
-		s += style.Render(cursor+title) + modelSuffix + "\n"
-
-		desc := item.description
-		if !isInstalled && item.integration != "" && m.cursor == i {
-			if config.AutoInstallable(item.integration) {
-				desc = "Press enter to install"
-			} else if hint := config.IntegrationInstallHint(item.integration); hint != "" {
-				desc = hint
-			} else {
-				desc = "not installed"
-			}
-		}
-		s += menuDescStyle.Render(desc) + "\n\n"
+		s += m.renderMenuItem(i, item)
 	}

-	if m.statusMsg != "" {
-		s += "\n" + lipgloss.NewStyle().Foreground(lipgloss.AdaptiveColor{Light: "124", Dark: "210"}).Render(m.statusMsg) + "\n"
-	}
-
-	s += "\n" + selectorHelpStyle.Render("↑/↓ navigate • enter launch • → change model • esc quit")
+	s += "\n" + selectorHelpStyle.Render("↑/↓ navigate • enter launch • → configure • esc quit")

 	if m.width > 0 {
 		return lipgloss.NewStyle().MaxWidth(m.width).Render(s)
@@ -667,80 +269,125 @@ func (m model) View() string {
 	return s
 }

-func (m model) renderModal() string {
-	modalStyle := lipgloss.NewStyle().
-		PaddingBottom(1).
-		PaddingRight(2)
+func (m model) renderMenuItem(index int, item menuItem) string {
+	cursor := ""
+	style := menuItemStyle
+	title := item.title
+	description := item.description
+	modelSuffix := ""

-	s := modalStyle.Render(m.modalSelector.renderContent())
-	if m.width > 0 {
-		return lipgloss.NewStyle().MaxWidth(m.width).Render(s)
-	}
-	return s
-}
-
-func (m model) renderSignInDialog() string {
-	return renderSignIn(m.signInModel, m.signInURL, m.signInSpinner, m.width)
-}
-
-type Selection int
-
-const (
-	SelectionNone Selection = iota
-	SelectionRunModel
-	SelectionChangeRunModel
-	SelectionIntegration       // Generic integration selection
-	SelectionChangeIntegration // Generic change model for integration
-)
-
-type Result struct {
-	Selection   Selection
-	Integration string   // integration name if applicable
-	Model       string   // model name if selected from single-select modal
-	Models      []string // models selected from multi-select modal (Editor integrations)
-}
-
-func Run() (Result, error) {
-	m := initialModel()
-	p := tea.NewProgram(m)
-
-	finalModel, err := p.Run()
-	if err != nil {
-		return Result{Selection: SelectionNone}, fmt.Errorf("error running TUI: %w", err)
-	}
-
-	fm := finalModel.(model)
-	if fm.err != nil {
-		return Result{Selection: SelectionNone}, fm.err
-	}
-
-	if !fm.selected && !fm.changeModel {
-		return Result{Selection: SelectionNone}, nil
-	}
-
-	item := fm.items[fm.cursor]
-
-	if fm.changeModel {
-		if item.isRunModel {
-			return Result{
-				Selection: SelectionChangeRunModel,
-				Model:     fm.modalSelector.selected,
-			}, nil
-		}
-		return Result{
-			Selection:   SelectionChangeIntegration,
-			Integration: item.integration,
-			Model:       fm.modalSelector.selected,
-			Models:      fm.changeModels,
-		}, nil
+	if m.cursor == index {
+		cursor = "▸ "
 	}

 	if item.isRunModel {
-		return Result{Selection: SelectionRunModel}, nil
+		if m.cursor == index && m.state.RunModel != "" {
+			modelSuffix = " " + modelStyle.Render("("+m.state.RunModel+")")
+		}
+		if m.cursor == index {
+			style = menuSelectedItemStyle
+		}
+	} else if item.isOthers {
+		if m.cursor == index {
+			style = menuSelectedItemStyle
+		}
+	} else {
+		integrationState := m.state.Integrations[item.integration]
+		if !integrationState.Selectable {
+			if m.cursor == index {
+				style = greyedSelectedStyle
+			} else {
+				style = greyedStyle
+			}
+		} else if m.cursor == index {
+			style = menuSelectedItemStyle
+		}
+
+		if m.cursor == index && integrationState.CurrentModel != "" {
+			modelSuffix = " " + modelStyle.Render("("+integrationState.CurrentModel+")")
+		}
+
+		if !integrationState.Installed {
+			if integrationState.AutoInstallable {
+				title += " " + notInstalledStyle.Render("(install)")
+			} else {
+				title += " " + notInstalledStyle.Render("(not installed)")
+			}
+			if m.cursor == index {
+				if integrationState.AutoInstallable {
+					description = "Press enter to install"
+				} else if integrationState.InstallHint != "" {
+					description = integrationState.InstallHint
+				} else {
+					description = "not installed"
+				}
+			}
+		}
 	}

-	return Result{
-		Selection:   SelectionIntegration,
-		Integration: item.integration,
-	}, nil
+	return style.Render(cursor+title) + modelSuffix + "\n" + menuDescStyle.Render(description) + "\n\n"
+}
+
+// TUIActionKind identifies which kind of action the menu produced.
+type TUIActionKind int
+
+const (
+	// TUIActionNone is the zero value; RunMenu returns it when nothing was
+	// selected or the program failed.
+	TUIActionNone TUIActionKind = iota
+	// TUIActionRunModel is produced for the run-model row.
+	TUIActionRunModel
+	// TUIActionLaunchIntegration is produced for a row naming an integration.
+	TUIActionLaunchIntegration
+)
+
+// TUIAction is the result of the launcher menu: what was chosen and whether
+// the user asked to configure it first.
+type TUIAction struct {
+	// Kind is the type of action selected.
+	Kind           TUIActionKind
+	// Integration is the integration name; actionForMenuItem sets it only
+	// for TUIActionLaunchIntegration.
+	Integration    string
+	// ForceConfigure is forwarded as RunModelRequest.ForcePicker or
+	// IntegrationLaunchRequest.ForceConfigure by the request helpers.
+	ForceConfigure bool
+}
+
+// LastSelection returns the identifier for this action in the form
+// initialCursor matches against: "run" for the run-model action, the
+// integration name for an integration launch, and "" otherwise.
+func (a TUIAction) LastSelection() string {
+	if a.Kind == TUIActionRunModel {
+		return "run"
+	}
+	if a.Kind == TUIActionLaunchIntegration {
+		return a.Integration
+	}
+	return ""
+}
+
+func (a TUIAction) RunModelRequest() launch.RunModelRequest {
+	return launch.RunModelRequest{ForcePicker: a.ForceConfigure}
+}
+
+func (a TUIAction) IntegrationLaunchRequest() launch.IntegrationLaunchRequest {
+	return launch.IntegrationLaunchRequest{
+		Name:           a.Integration,
+		ForceConfigure: a.ForceConfigure,
+	}
+}
+
+// actionForMenuItem maps a menu row to the action it triggers. The run-model
+// row takes precedence, then any row naming an integration; every other row
+// yields TUIActionNone. forceConfigure is copied into the resulting action.
+func actionForMenuItem(item menuItem, forceConfigure bool) TUIAction {
+	if item.isRunModel {
+		return TUIAction{Kind: TUIActionRunModel, ForceConfigure: forceConfigure}
+	}
+	if item.integration == "" {
+		return TUIAction{Kind: TUIActionNone}
+	}
+	return TUIAction{
+		Kind:           TUIActionLaunchIntegration,
+		Integration:    item.integration,
+		ForceConfigure: forceConfigure,
+	}
+}
+
+// RunMenu shows the launcher menu for state and blocks until the program
+// exits. It returns the chosen action, or a TUIActionNone action when the
+// user quit without selecting. A failure to run the program is returned
+// wrapped, together with a TUIActionNone action.
+func RunMenu(state *launch.LauncherState) (TUIAction, error) {
+	none := TUIAction{Kind: TUIActionNone}
+
+	result, err := tea.NewProgram(newModel(state)).Run()
+	if err != nil {
+		return none, fmt.Errorf("error running TUI: %w", err)
+	}
+
+	if final := result.(model); final.selected {
+		return final.action, nil
+	}
+	return none, nil
 }
--- a/cmd/tui/tui_test.go
+++ b/cmd/tui/tui_test.go
@@ -0,0 +1,178 @@
+package tui
+
+import (
+	"strings"
+	"testing"
+
+	tea "github.com/charmbracelet/bubbletea"
+	"github.com/ollama/ollama/cmd/launch"
+)
+
+// launcherTestState returns a fresh launcher state fixture for the menu
+// tests: the last selection is "run", a run model is configured, and the
+// three pinned integrations plus two overflow integrations are present.
+//
+// Every integration except openclaw is marked Installed so that tests which
+// flip Installed to false exercise a real state change. openclaw is left
+// uninstalled but AutoInstallable to cover the "(install)" rendering path.
+func launcherTestState() *launch.LauncherState {
+	return &launch.LauncherState{
+		LastSelection: "run",
+		RunModel:      "qwen3:8b",
+		Integrations: map[string]launch.LauncherIntegrationState{
+			"claude": {
+				Name:         "claude",
+				DisplayName:  "Claude Code",
+				Description:  "Anthropic's coding tool with subagents",
+				Installed:    true,
+				Selectable:   true,
+				Changeable:   true,
+				CurrentModel: "glm-5:cloud",
+			},
+			"codex": {
+				Name:        "codex",
+				DisplayName: "Codex",
+				Description: "OpenAI's open-source coding agent",
+				Installed:   true,
+				Selectable:  true,
+				Changeable:  true,
+			},
+			"openclaw": {
+				Name:            "openclaw",
+				DisplayName:     "OpenClaw",
+				Description:     "Personal AI with 100+ skills",
+				Selectable:      true,
+				Changeable:      true,
+				AutoInstallable: true,
+			},
+			"droid": {
+				Name:        "droid",
+				DisplayName: "Droid",
+				Description: "Factory's coding agent across terminal and IDEs",
+				Installed:   true,
+				Selectable:  true,
+				Changeable:  true,
+			},
+			"pi": {
+				Name:        "pi",
+				DisplayName: "Pi",
+				Description: "Minimal AI agent toolkit with plugin support",
+				Installed:   true,
+				Selectable:  true,
+				Changeable:  true,
+			},
+		},
+	}
+}
+
+// TestMenuRendersPinnedItemsAndMore checks that the collapsed menu shows the
+// run row, the three pinned integrations, and the "More..." row.
+func TestMenuRendersPinnedItemsAndMore(t *testing.T) {
+	rendered := newModel(launcherTestState()).View()
+
+	expected := []string{"Run a model", "Launch Claude Code", "Launch Codex", "Launch OpenClaw", "More..."}
+	for idx := range expected {
+		if strings.Contains(rendered, expected[idx]) {
+			continue
+		}
+		t.Fatalf("expected menu view to contain %q\n%s", expected[idx], rendered)
+	}
+}
+
+// TestMenuExpandsOthersFromLastSelection checks that the overflow section
+// starts expanded, replacing the "More..." row, when the last selection is
+// an overflow integration.
+func TestMenuExpandsOthersFromLastSelection(t *testing.T) {
+	fixture := launcherTestState()
+	fixture.LastSelection = "pi"
+	m := newModel(fixture)
+
+	if expanded := m.showOthers; !expanded {
+		t.Fatal("expected others section to expand when last selection is in the overflow list")
+	}
+
+	rendered := m.View()
+	if hasOverflow := strings.Contains(rendered, "Launch Pi"); !hasOverflow {
+		t.Fatalf("expected expanded view to contain overflow integration\n%s", rendered)
+	}
+	if hasMore := strings.Contains(rendered, "More..."); hasMore {
+		t.Fatalf("expected expanded view to replace More... item\n%s", rendered)
+	}
+}
+
+// TestMenuEnterOnRunSelectsRun checks that enter on the run row selects a
+// plain run-model action.
+func TestMenuEnterOnRunSelectsRun(t *testing.T) {
+	expected := TUIAction{Kind: TUIActionRunModel}
+
+	next, _ := newModel(launcherTestState()).Update(tea.KeyMsg{Type: tea.KeyEnter})
+	result := next.(model)
+
+	if matched := result.selected && result.action == expected; !matched {
+		t.Fatalf("expected enter on run to select run action, got selected=%v action=%v", result.selected, result.action)
+	}
+}
+
+// TestMenuRightOnRunSelectsChangeRun checks that right on the run row
+// selects a run-model action with ForceConfigure set.
+func TestMenuRightOnRunSelectsChangeRun(t *testing.T) {
+	expected := TUIAction{Kind: TUIActionRunModel, ForceConfigure: true}
+
+	next, _ := newModel(launcherTestState()).Update(tea.KeyMsg{Type: tea.KeyRight})
+	result := next.(model)
+
+	if matched := result.selected && result.action == expected; !matched {
+		t.Fatalf("expected right on run to select change-run action, got selected=%v action=%v", result.selected, result.action)
+	}
+}
+
+// TestMenuEnterOnIntegrationSelectsLaunch checks that enter on the first
+// integration row selects a launch action for that integration.
+func TestMenuEnterOnIntegrationSelectsLaunch(t *testing.T) {
+	expected := TUIAction{Kind: TUIActionLaunchIntegration, Integration: "claude"}
+
+	m := newModel(launcherTestState())
+	m.cursor = 1
+	next, _ := m.Update(tea.KeyMsg{Type: tea.KeyEnter})
+	result := next.(model)
+
+	if matched := result.selected && result.action == expected; !matched {
+		t.Fatalf("expected enter on integration to launch, got selected=%v action=%v", result.selected, result.action)
+	}
+}
+
+// TestMenuRightOnIntegrationSelectsConfigure checks that right on the first
+// integration row selects a launch action with ForceConfigure set.
+func TestMenuRightOnIntegrationSelectsConfigure(t *testing.T) {
+	expected := TUIAction{Kind: TUIActionLaunchIntegration, Integration: "claude", ForceConfigure: true}
+
+	m := newModel(launcherTestState())
+	m.cursor = 1
+	next, _ := m.Update(tea.KeyMsg{Type: tea.KeyRight})
+	result := next.(model)
+
+	if matched := result.selected && result.action == expected; !matched {
+		t.Fatalf("expected right on integration to configure, got selected=%v action=%v", result.selected, result.action)
+	}
+}
+
+// TestMenuIgnoresDisabledActions checks that enter and right do nothing on
+// an integration that is neither selectable nor changeable.
+func TestMenuIgnoresDisabledActions(t *testing.T) {
+	fixture := launcherTestState()
+	disabled := fixture.Integrations["claude"]
+	disabled.Selectable, disabled.Changeable = false, false
+	fixture.Integrations["claude"] = disabled
+
+	m := newModel(fixture)
+	m.cursor = 1
+
+	afterEnter, _ := m.Update(tea.KeyMsg{Type: tea.KeyEnter})
+	if enterResult := afterEnter.(model); enterResult.selected {
+		t.Fatal("expected non-selectable integration to ignore enter")
+	}
+
+	afterRight, _ := m.Update(tea.KeyMsg{Type: tea.KeyRight})
+	if rightResult := afterRight.(model); rightResult.selected {
+		t.Fatal("expected non-changeable integration to ignore right")
+	}
+}
+
+// TestMenuShowsCurrentModelSuffixes checks that the highlighted row shows
+// its current model in parentheses, for both the run row and an integration.
+func TestMenuShowsCurrentModelSuffixes(t *testing.T) {
+	m := newModel(launcherTestState())
+
+	onRun := m.View()
+	if found := strings.Contains(onRun, "(qwen3:8b)"); !found {
+		t.Fatalf("expected run row to show current model suffix\n%s", onRun)
+	}
+
+	m.cursor = 1
+	onIntegration := m.View()
+	if found := strings.Contains(onIntegration, "(glm-5:cloud)"); !found {
+		t.Fatalf("expected integration row to show current model suffix\n%s", onIntegration)
+	}
+}
+
+// TestMenuShowsInstallStatusAndHint checks that a highlighted integration
+// that is not installed shows the "(not installed)" marker and its install
+// hint as the description.
+func TestMenuShowsInstallStatusAndHint(t *testing.T) {
+	fixture := launcherTestState()
+	missing := fixture.Integrations["codex"]
+	missing.Installed = false
+	missing.Selectable, missing.Changeable = false, false
+	missing.InstallHint = "Install from https://example.com/codex"
+	fixture.Integrations["codex"] = missing
+
+	m := newModel(fixture)
+	m.cursor = 2
+	rendered := m.View()
+
+	if hasMarker := strings.Contains(rendered, "(not installed)"); !hasMarker {
+		t.Fatalf("expected not-installed marker\n%s", rendered)
+	}
+	if hasHint := strings.Contains(rendered, missing.InstallHint); !hasHint {
+		t.Fatalf("expected install hint in description\n%s", rendered)
+	}
+}
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -320,7 +320,7 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
 		conv = &lfm2Model{}
 	case "Lfm2VlForConditionalGeneration":
 		conv = &lfm2VLTextModel{}
-	case "Qwen3NextForCausalLM":
+	case "Qwen3NextForCausalLM", "Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration":
 		conv = &qwen3NextModel{}
 	case "NemotronHForCausalLM":
 		conv = &nemotronHModel{}
--- a/convert/convert_qwen3next.go
+++ b/convert/convert_qwen3next.go
@@ -1,6 +1,7 @@
 package convert

 import (
+	"encoding/json"
 	"fmt"
 	"io/fs"
 	"math"
@@ -13,8 +14,21 @@ import (
 	"github.com/ollama/ollama/fs/ggml"
 )

-type qwen3NextModel struct {
-	ModelParameters
+// qwen3NextRopeScaling is the decoded form of the "rope_scaling" object in
+// the text configuration.
+type qwen3NextRopeScaling struct {
+	// Type is read from the "type" key.
+	Type         string     `json:"type"`
+	// Factor is read from the "factor" key.
+	Factor       ropeFactor `json:"factor"`
+	// MropeSection is read from "mrope_section"; the same key also appears
+	// in qwen3NextRopeParams.
+	MropeSection []int32    `json:"mrope_section"`
+}
+
+// qwen3NextRopeParams is the decoded form of the "rope_parameters" object in
+// the text configuration.
+type qwen3NextRopeParams struct {
+	// MRopeInterleaved is read from "mrope_interleaved".
+	MRopeInterleaved    bool    `json:"mrope_interleaved"`
+	// MropeSection is read from "mrope_section".
+	MropeSection        []int32 `json:"mrope_section"`
+	// RopeType is read from "rope_type".
+	RopeType            string  `json:"rope_type"`
+	// RopeTheta is read from "rope_theta"; parseMore uses it as the fallback
+	// when the model's own RopeTheta is zero.
+	RopeTheta           float32 `json:"rope_theta"`
+	// PartialRotaryFactor is read from "partial_rotary_factor".
+	PartialRotaryFactor float32 `json:"partial_rotary_factor"`
+}
+
+type qwen3NextTextConfig struct {
 	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
 	HiddenSize            uint32  `json:"hidden_size"`
 	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
@@ -28,12 +42,13 @@ type qwen3NextModel struct {
 	// MoE config
 	NumExperts             uint32 `json:"num_experts"`
 	NumExpertsPerToken     uint32 `json:"num_experts_per_tok"`
-	NormTopkProb           bool   `json:"norm_topk_prob"`
+	NormTopkProb           *bool  `json:"norm_topk_prob"`
 	MoEIntermediateSize    uint32 `json:"moe_intermediate_size"`
 	SharedExpertIntermSize uint32 `json:"shared_expert_intermediate_size"`

 	// Hybrid attention config
-	FullAttentionInterval uint32 `json:"full_attention_interval"`
+	FullAttentionInterval uint32   `json:"full_attention_interval"`
+	LayerTypes            []string `json:"layer_types"`

 	// Linear attention (Gated Delta Net) config
 	LinearConvKernelDim uint32 `json:"linear_conv_kernel_dim"`
@@ -43,16 +58,102 @@ type qwen3NextModel struct {
 	LinearValueHeadDim  uint32 `json:"linear_value_head_dim"`

 	// RoPE config
-	PartialRotaryFactor float32 `json:"partial_rotary_factor"`
-	RopeScaling         struct {
-		Type   string     `json:"type"`
-		Factor ropeFactor `json:"factor"`
-	} `json:"rope_scaling"`
+	PartialRotaryFactor float32              `json:"partial_rotary_factor"`
+	RopeScaling         qwen3NextRopeScaling `json:"rope_scaling"`
+	RopeParameters      qwen3NextRopeParams  `json:"rope_parameters"`
+}
+
+type qwen3NextVisionConfig struct { // decoded from "vision_config"; all vision handling is gated on Depth > 0
+	Depth                  uint32  `json:"depth"`                    // vision.block_count
+	HiddenSize             uint32  `json:"hidden_size"`              // vision.embedding_length
+	NumHeads               uint32  `json:"num_heads"`                // vision.attention.head_count
+	InChannels             uint32  `json:"in_channels"`              // vision.num_channels
+	PatchSize              uint32  `json:"patch_size"`               // vision.patch_size; parseMore fills it from preprocessor_config.json when 0
+	SpatialMergeSize       uint32  `json:"spatial_merge_size"`       // vision.spatial_merge_size; falls back to the preprocessor's merge_size
+	RMSNormEps             float32 `json:"layer_norm_epsilon"`       // vision.attention.layer_norm_epsilon, emitted only when > 0
+	RopeTheta              float32 `json:"rope_theta"`               // vision.rope.freq_base, emitted only when > 0
+	TemporalPatchSize      uint32  `json:"temporal_patch_size"`      // vision.temporal_patch_size; falls back to the preprocessor value
+	DeepstackVisualIndexes []int32 `json:"deepstack_visual_indexes"` // written unconditionally (possibly nil) whenever Depth > 0
+
+	Size struct { // image size bounds; each edge falls back to preprocessor_config.json when 0
+		ShortestEdge uint32 `json:"shortest_edge"` // vision.shortest_edge, emitted only when > 0
+		LongestEdge  uint32 `json:"longest_edge"`  // vision.longest_edge, emitted only when > 0
+	} `json:"size"`
+
+	ImageMean []float32 `json:"image_mean"` // vision.image_mean; taken from the preprocessor config when empty
+	ImageStd  []float32 `json:"image_std"`  // vision.image_std; taken from the preprocessor config when empty
+}
+
+type qwen3NextModel struct { // converter state shared by the qwen3next and qwen3.5 architectures
+	ModelParameters
+	qwen3NextTextConfig // effective text settings; presumably decoded from top-level config keys for legacy checkpoints (confirm against the loader)
+
+	TextConfig  *qwen3NextTextConfig  `json:"text_config"`   // when non-nil, parseMore overwrites the embedded text config with it
+	VisionModel qwen3NextVisionConfig `json:"vision_config"` // vision KV entries are emitted only when VisionModel.Depth > 0
+
+	ImageTokenID       uint32 `json:"image_token_id"`        // emitted by KV() only when > 0
+	VisionStartTokenID uint32 `json:"vision_start_token_id"` // emitted by KV() only when > 0
+	VisionEndTokenID   uint32 `json:"vision_end_token_id"`   // emitted by KV() only when > 0
 }

 var _ ModelConverter = (*qwen3NextModel)(nil)

-func (q *qwen3NextModel) parseMore(_ fs.FS) error {
+func (q *qwen3NextModel) parseMore(fsys fs.FS) error {
+	if q.TextConfig != nil {
+		q.qwen3NextTextConfig = *q.TextConfig
+	}
+
+	if q.RopeTheta == 0 {
+		q.RopeTheta = q.RopeParameters.RopeTheta
+	}
+	if q.PartialRotaryFactor == 0 {
+		q.PartialRotaryFactor = q.RopeParameters.PartialRotaryFactor
+	}
+
+	if q.RopeScaling.Type == "" && q.RopeParameters.RopeType != "" {
+		q.RopeScaling.Type = q.RopeParameters.RopeType
+	}
+
+	// Pull vision preprocessing fields when present.
+	if q.VisionModel.Depth > 0 {
+		if bts, err := fs.ReadFile(fsys, "preprocessor_config.json"); err == nil {
+			var pre struct {
+				Size struct {
+					ShortestEdge uint32 `json:"shortest_edge"`
+					LongestEdge  uint32 `json:"longest_edge"`
+				} `json:"size"`
+				PatchSize         uint32    `json:"patch_size"`
+				TemporalPatchSize uint32    `json:"temporal_patch_size"`
+				MergeSize         uint32    `json:"merge_size"`
+				ImageMean         []float32 `json:"image_mean"`
+				ImageStd          []float32 `json:"image_std"`
+			}
+			if json.Unmarshal(bts, &pre) == nil {
+				if q.VisionModel.PatchSize == 0 {
+					q.VisionModel.PatchSize = pre.PatchSize
+				}
+				if q.VisionModel.TemporalPatchSize == 0 {
+					q.VisionModel.TemporalPatchSize = pre.TemporalPatchSize
+				}
+				if q.VisionModel.SpatialMergeSize == 0 {
+					q.VisionModel.SpatialMergeSize = pre.MergeSize
+				}
+				if q.VisionModel.Size.ShortestEdge == 0 {
+					q.VisionModel.Size.ShortestEdge = pre.Size.ShortestEdge
+				}
+				if q.VisionModel.Size.LongestEdge == 0 {
+					q.VisionModel.Size.LongestEdge = pre.Size.LongestEdge
+				}
+				if len(q.VisionModel.ImageMean) == 0 {
+					q.VisionModel.ImageMean = pre.ImageMean
+				}
+				if len(q.VisionModel.ImageStd) == 0 {
+					q.VisionModel.ImageStd = pre.ImageStd
+				}
+			}
+		}
+	}
+
 	if q.NumHiddenLayers == 0 {
 		return fmt.Errorf("qwen3next: num_hidden_layers must be set")
 	}
@@ -74,36 +175,96 @@ func (q *qwen3NextModel) parseMore(_ fs.FS) error {
 	if q.LinearNumKeyHeads == 0 || q.LinearNumValueHeads == 0 || q.LinearKeyHeadDim == 0 || q.LinearValueHeadDim == 0 {
 		return fmt.Errorf("qwen3next: linear attention config must be set (linear_num_key_heads, linear_num_value_heads, linear_key_head_dim, linear_value_head_dim)")
 	}
-	if q.FullAttentionInterval == 0 {
-		return fmt.Errorf("qwen3next: full_attention_interval must be set")
-	}
-	if q.FullAttentionInterval > q.NumHiddenLayers {
-		return fmt.Errorf("qwen3next: full_attention_interval (%d) exceeds num_hidden_layers (%d)", q.FullAttentionInterval, q.NumHiddenLayers)
-	}
-
-	hasFull := false
-	for i := range q.NumHiddenLayers {
-		if (i+1)%q.FullAttentionInterval == 0 {
-			hasFull = true
-			break
-		}
-	}
-	if !hasFull {
-		return fmt.Errorf("qwen3next: head_count_kv would be all zeros (full_attention_interval=%d, num_hidden_layers=%d)", q.FullAttentionInterval, q.NumHiddenLayers)
+	if _, err := q.kvHeadCounts(); err != nil {
+		return err
 	}

 	return nil
 }

+func (q *qwen3NextModel) kvHeadCounts() ([]uint32, error) {
+	if len(q.LayerTypes) > 0 {
+		kv := make([]uint32, q.NumHiddenLayers)
+		hasFull := false
+		hasRecurrent := false
+		for i := range q.NumHiddenLayers {
+			layerType := ""
+			if i < uint32(len(q.LayerTypes)) {
+				layerType = q.LayerTypes[i]
+			}
+			if layerType == "full_attention" {
+				kv[i] = q.NumKeyValueHeads
+				hasFull = true
+			} else {
+				hasRecurrent = true
+			}
+		}
+		if !hasFull || !hasRecurrent {
+			return nil, fmt.Errorf("qwen3next: layer_types must include both full_attention and linear_attention")
+		}
+		return kv, nil
+	}
+
+	if q.FullAttentionInterval == 0 {
+		return nil, fmt.Errorf("qwen3next: full_attention_interval must be set")
+	}
+	if q.FullAttentionInterval > q.NumHiddenLayers {
+		return nil, fmt.Errorf("qwen3next: full_attention_interval (%d) exceeds num_hidden_layers (%d)", q.FullAttentionInterval, q.NumHiddenLayers)
+	}
+
+	kv := make([]uint32, q.NumHiddenLayers)
+	hasFull := false
+	for i := range q.NumHiddenLayers {
+		if (i+1)%q.FullAttentionInterval == 0 {
+			kv[i] = q.NumKeyValueHeads
+			hasFull = true
+		}
+	}
+	if !hasFull {
+		return nil, fmt.Errorf("qwen3next: head_count_kv would be all zeros (full_attention_interval=%d, num_hidden_layers=%d)", q.FullAttentionInterval, q.NumHiddenLayers)
+	}
+	return kv, nil
+}
+
+// ropeSections returns the M-RoPE section sizes, taking them from
+// rope_parameters when that list is non-empty and from rope_scaling otherwise.
+func (q *qwen3NextModel) ropeSections() []int32 {
+	sections := q.RopeScaling.MropeSection
+	if preferred := q.RopeParameters.MropeSection; len(preferred) > 0 {
+		sections = preferred
+	}
+	return sections
+}
+
+// shouldReorderVHeads reports whether value-head tensors need to be reordered
+// during conversion. It is false when the model type or any architecture name
+// mentions qwen3next / qwen3_next (case-insensitive), and true for everything
+// else handled by this converter.
+func (q *qwen3NextModel) shouldReorderVHeads() bool {
+	isLegacyName := func(name string) bool {
+		lowered := strings.ToLower(name)
+		return strings.Contains(lowered, "qwen3_next") || strings.Contains(lowered, "qwen3next")
+	}
+
+	if isLegacyName(q.ModelType) {
+		return false
+	}
+
+	// Anything that is not explicitly legacy uses the qwen3.5 layout.
+	return !slices.ContainsFunc(q.Architectures, isLegacyName)
+}
+
 func (q *qwen3NextModel) KV(t *Tokenizer) KV {
 	kv := q.ModelParameters.KV(t)
-	kv["general.architecture"] = "qwen3next"
-	kv["tokenizer.ggml.pre"] = "qwen2"
+
+	arch := "qwen35"
+	if q.NumExperts > 0 {
+		arch = "qwen35moe"
+	}
+	kv["general.architecture"] = arch
+	kv["tokenizer.ggml.pre"] = "qwen35"
 	kv["block_count"] = q.NumHiddenLayers
 	kv["context_length"] = q.MaxPositionEmbeddings
 	kv["embedding_length"] = q.HiddenSize
 	kv["feed_forward_length"] = q.IntermediateSize
 	kv["attention.head_count"] = q.NumAttentionHeads
+
 	headDim := q.HeadDim
 	if headDim == 0 && q.NumAttentionHeads > 0 {
 		headDim = q.HiddenSize / q.NumAttentionHeads
@@ -113,18 +274,31 @@ func (q *qwen3NextModel) KV(t *Tokenizer) KV {
 	kv["attention.layer_norm_rms_epsilon"] = q.RMSNormEPS
 	kv["rope.freq_base"] = q.RopeTheta

-	// RoPE dimension count (partial rotary)
-	// partial_rotary_factor = 0.25 means only 25% of head_dim uses RoPE
 	partialRotary := q.PartialRotaryFactor
 	if partialRotary > 0 && partialRotary <= 1 {
 		kv["rope.dimension_count"] = uint32(float32(headDim) * partialRotary)
 	}

-	// MoE config
+	if sections := q.ropeSections(); len(sections) > 0 {
+		kv["mrope_sections"] = sections
+		kv["rope.mrope_section"] = sections
+		kv["rope.dimension_sections"] = sections
+	}
+	if q.RopeParameters.MRopeInterleaved {
+		kv["rope.mrope_interleaved"] = true
+	}
+
+	if q.RopeScaling.Type != "" && q.RopeScaling.Type != "default" {
+		kv["rope.scaling.type"] = q.RopeScaling.Type
+		kv["rope.scaling.factor"] = q.RopeScaling.Factor
+	}
+
 	if q.NumExperts > 0 {
 		kv["expert_count"] = q.NumExperts
 		kv["expert_used_count"] = q.NumExpertsPerToken
-		kv["norm_top_k_prob"] = q.NormTopkProb
+		if q.NormTopkProb != nil {
+			kv["norm_top_k_prob"] = *q.NormTopkProb
+		}
 		if q.MoEIntermediateSize > 0 {
 			kv["expert_feed_forward_length"] = q.MoEIntermediateSize
 		}
@@ -133,33 +307,66 @@ func (q *qwen3NextModel) KV(t *Tokenizer) KV {
 		}
 	}

-	// SSM/Linear attention config
-	// d_inner = linear_value_head_dim * linear_num_value_heads
 	dInner := q.LinearValueHeadDim * q.LinearNumValueHeads
 	kv["ssm.inner_size"] = dInner
-	kv["ssm.state_size"] = q.LinearKeyHeadDim        // head_k_dim
-	kv["ssm.group_count"] = q.LinearNumKeyHeads      // num_k_heads
-	kv["ssm.time_step_rank"] = q.LinearNumValueHeads // num_v_heads
+	kv["ssm.state_size"] = q.LinearKeyHeadDim
+	kv["ssm.group_count"] = q.LinearNumKeyHeads
+	kv["ssm.time_step_rank"] = q.LinearNumValueHeads
 	kv["ssm.conv_kernel"] = q.LinearConvKernelDim
-	interval := q.FullAttentionInterval
-	kv["full_attention_interval"] = interval
-
-	// Build per-layer KV head count array to identify layer types
-	// 0 = recurrent (linear attention), non-zero = full attention
-	kvHeadCounts := make([]uint32, q.NumHiddenLayers)
-	for i := range q.NumHiddenLayers {
-		// Full attention every full_attention_interval layers (starting at interval-1)
-		if interval > 0 && (i+1)%interval == 0 {
-			kvHeadCounts[i] = q.NumKeyValueHeads
-		}
-		// else stays 0 (recurrent layer)
+	if q.shouldReorderVHeads() {
+		kv["ssm.v_head_reordered"] = true
+	}
+	if q.FullAttentionInterval > 0 {
+		kv["full_attention_interval"] = q.FullAttentionInterval
 	}
-	kv["attention.head_count_kv"] = kvHeadCounts

-	// RoPE scaling
-	if q.RopeScaling.Type != "" {
-		kv["rope.scaling.type"] = q.RopeScaling.Type
-		kv["rope.scaling.factor"] = q.RopeScaling.Factor
+	if headCounts, err := q.kvHeadCounts(); err == nil {
+		kv["attention.head_count_kv"] = headCounts
+	}
+
+	if q.VisionModel.Depth > 0 {
+		kv["vision.block_count"] = q.VisionModel.Depth
+		kv["vision.embedding_length"] = q.VisionModel.HiddenSize
+		kv["vision.attention.head_count"] = q.VisionModel.NumHeads
+		kv["vision.num_channels"] = q.VisionModel.InChannels
+		if q.VisionModel.PatchSize > 0 {
+			kv["vision.patch_size"] = q.VisionModel.PatchSize
+		}
+		if q.VisionModel.SpatialMergeSize > 0 {
+			kv["vision.spatial_merge_size"] = q.VisionModel.SpatialMergeSize
+		}
+		if q.VisionModel.RMSNormEps > 0 {
+			kv["vision.attention.layer_norm_epsilon"] = q.VisionModel.RMSNormEps
+		}
+		if q.VisionModel.RopeTheta > 0 {
+			kv["vision.rope.freq_base"] = q.VisionModel.RopeTheta
+		}
+		if q.VisionModel.TemporalPatchSize > 0 {
+			kv["vision.temporal_patch_size"] = q.VisionModel.TemporalPatchSize
+		}
+		kv["vision.deepstack_visual_indexes"] = q.VisionModel.DeepstackVisualIndexes
+		if q.VisionModel.Size.ShortestEdge > 0 {
+			kv["vision.shortest_edge"] = q.VisionModel.Size.ShortestEdge
+		}
+		if q.VisionModel.Size.LongestEdge > 0 {
+			kv["vision.longest_edge"] = q.VisionModel.Size.LongestEdge
+		}
+		if len(q.VisionModel.ImageMean) > 0 {
+			kv["vision.image_mean"] = q.VisionModel.ImageMean
+		}
+		if len(q.VisionModel.ImageStd) > 0 {
+			kv["vision.image_std"] = q.VisionModel.ImageStd
+		}
+	}
+
+	if q.ImageTokenID > 0 {
+		kv["image_token_id"] = q.ImageTokenID
+	}
+	if q.VisionStartTokenID > 0 {
+		kv["vision_start_token_id"] = q.VisionStartTokenID
+	}
+	if q.VisionEndTokenID > 0 {
+		kv["vision_end_token_id"] = q.VisionEndTokenID
 	}

 	return kv
@@ -168,7 +375,6 @@ func (q *qwen3NextModel) KV(t *Tokenizer) KV {
 func (q *qwen3NextModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	var out []*ggml.Tensor

-	// Create merges for expert tensors - stack individual experts into batched tensors
 	merges := make([]merge, q.NumHiddenLayers*3)
 	for i := range q.NumHiddenLayers {
 		merges[i*3+0] = merge{
@@ -185,16 +391,13 @@ func (q *qwen3NextModel) Tensors(ts []Tensor) []*ggml.Tensor {
 		}
 	}

-	// Merge expert tensors
 	merged, remaining := mergeTensors(ts, merges...)
 	out = append(out, merged...)

-	// Process remaining tensors
 	for _, t := range remaining {
 		name := t.Name()
 		shape := t.Shape()

-		// Split linear_attn.in_proj_qkvz (ssm_in) into attn_qkv + attn_gate when possible
 		if strings.HasSuffix(name, ".ssm_in.weight") {
 			if qkv, gate, ok := q.splitQKVZTensor(t); ok {
 				out = append(out, qkv, gate)
@@ -204,84 +407,299 @@ func (q *qwen3NextModel) Tensors(ts []Tensor) []*ggml.Tensor {
 		}

 		switch {
-		// Add 1 to norm weights (except ssm_norm which is linear_attn.norm)
-		// This matches the Python converter behavior for qwen3next
+		case strings.Contains(name, ".mlp.experts.gate_up_proj"):
+			out = append(out, slices.Collect(splitDim(t, 1,
+				split{Replacer: strings.NewReplacer(".mlp.experts.gate_up_proj", ".ffn_gate_exps.weight")},
+				split{Replacer: strings.NewReplacer(".mlp.experts.gate_up_proj", ".ffn_up_exps.weight")},
+			))...)
+
+		case strings.Contains(name, ".mlp.experts.down_proj"):
+			out = append(out, &ggml.Tensor{
+				Name:     strings.NewReplacer(".mlp.experts.down_proj", ".ffn_down_exps.weight").Replace(name),
+				Kind:     t.Kind(),
+				Shape:    slices.Clone(shape),
+				WriterTo: t,
+			})
+
+		case strings.HasPrefix(name, "v.blk.") && strings.Contains(name, ".attn_qkv"):
+			out = append(out, slices.Collect(splitDim(t, 0,
+				split{Replacer: strings.NewReplacer("attn_qkv", "attn_q")},
+				split{Replacer: strings.NewReplacer("attn_qkv", "attn_k")},
+				split{Replacer: strings.NewReplacer("attn_qkv", "attn_v")},
+			))...)
+
+		case strings.Contains(name, "patch_embed") && strings.HasSuffix(name, "weight"):
+			out = append(out, &ggml.Tensor{
+				Name:     name,
+				Kind:     t.Kind(),
+				Shape:    append([]uint64{shape[0] * shape[1]}, shape[2:]...),
+				WriterTo: t,
+			})
+
 		case strings.HasSuffix(name, "_norm.weight") && !strings.HasSuffix(name, ".ssm_norm.weight"):
 			t.SetRepacker(q.addOne)
-			out = append(out, &ggml.Tensor{
-				Name:     name,
-				Kind:     t.Kind(),
-				Shape:    slices.Clone(shape),
-				WriterTo: t,
-			})
+			out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})

-		// Handle linear attention A_log -> ssm_a (negate and exp)
-		// Note: name has already been transformed by Replacements at this point
 		case strings.HasSuffix(name, ".ssm_a"):
-			t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
-				// Compute -exp(A_log)
-				result := make([]float32, len(data))
-				for i, v := range data {
-					// -exp(v)
-					result[i] = -float32(math.Exp(float64(v)))
-				}
-				return result, nil
-			})
-			out = append(out, &ggml.Tensor{
-				Name:     name,
-				Kind:     t.Kind(),
-				Shape:    slices.Clone(shape),
-				WriterTo: t,
-			})
+			t.SetRepacker(q.repackSSMA())
+			out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
+
+		case strings.HasSuffix(name, ".attn_qkv.weight"):
+			if q.shouldReorderVHeads() {
+				t.SetRepacker(q.repackAttnQKV())
+			}
+			out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
+
+		case strings.HasSuffix(name, ".attn_gate.weight"):
+			if q.shouldReorderVHeads() {
+				// HF tensor layout is [out_features, in_features]; reorder rows.
+				t.SetRepacker(q.repackReorderDim(0, int(q.LinearValueHeadDim)))
+			}
+			out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
+
+		case strings.HasSuffix(name, ".ssm_beta.weight"), strings.HasSuffix(name, ".ssm_alpha.weight"):
+			if q.shouldReorderVHeads() {
+				// HF tensor layout is [out_features, in_features]; reorder rows.
+				t.SetRepacker(q.repackReorderDim(0, 1))
+			}
+			out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
+
+		case strings.HasSuffix(name, ".ssm_dt"):
+			if q.shouldReorderVHeads() {
+				t.SetRepacker(q.repackReorderDim(0, 1))
+			}
+			out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
+
+		case strings.HasSuffix(name, ".ssm_out.weight"):
+			if q.shouldReorderVHeads() {
+				// HF out_proj layout is [out_features, in_features]; reorder columns.
+				t.SetRepacker(q.repackReorderDim(1, int(q.LinearValueHeadDim)))
+			}
+			out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})

-		// Squeeze conv1d weights: [1, D, K] or [D, 1, K] -> [D, K]
 		case strings.HasSuffix(name, ".ssm_conv1d.weight"):
 			newShape := slices.Clone(shape)
 			if len(shape) == 3 {
 				if shape[0] == 1 {
-					// [1, D, K] -> [D, K]
 					newShape = []uint64{shape[1], shape[2]}
 				} else if shape[1] == 1 {
-					// [D, 1, K] -> [D, K]
 					newShape = []uint64{shape[0], shape[2]}
 				}
 			}
-			out = append(out, &ggml.Tensor{
-				Name:     name,
-				Kind:     t.Kind(),
-				Shape:    newShape,
-				WriterTo: t,
-			})
-		// Squeeze shared expert gate: [D, 1] or [1, D] -> [D]
-		case strings.HasSuffix(name, ".ffn_gate_inp_shexp.weight"):
-			newShape := slices.Clone(shape)
-			if len(shape) == 2 {
-				if shape[0] == 1 && shape[1] > 1 {
-					newShape = []uint64{shape[1]}
-				} else if shape[1] == 1 && shape[0] > 1 {
-					newShape = []uint64{shape[0]}
-				}
+			if q.shouldReorderVHeads() {
+				t.SetRepacker(q.repackConv1D())
 			}
-			out = append(out, &ggml.Tensor{
-				Name:     name,
-				Kind:     t.Kind(),
-				Shape:    newShape,
-				WriterTo: t,
-			})
+			out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: newShape, WriterTo: t})

 		default:
-			out = append(out, &ggml.Tensor{
-				Name:     name,
-				Kind:     t.Kind(),
-				Shape:    slices.Clone(shape),
-				WriterTo: t,
-			})
+			out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
 		}
 	}

 	return out
 }

+// repackReorderDim returns a Repacker that reorders the value heads stored
+// along axis dim of a tensor. The axis is viewed as
+// (LinearNumKeyHeads, value heads per key head, headDim) and the first two
+// factors are swapped by reorderHeadLayout.
+//
+// The data is returned untouched when reordering is disabled for this model
+// or when no key heads are configured.
+func (q *qwen3NextModel) repackReorderDim(dim, headDim int) Repacker {
+	return func(_ string, data []float32, shape []uint64) ([]float32, error) {
+		// A zero key-head count must be rejected here: the integer division
+		// below would panic before reorderHeadLayout's own pass-through
+		// guard for non-positive head counts could run.
+		if !q.shouldReorderVHeads() || q.LinearNumKeyHeads == 0 {
+			return data, nil
+		}
+		numK := int(q.LinearNumKeyHeads)
+		numVPerK := int(q.LinearNumValueHeads / q.LinearNumKeyHeads)
+		return reorderHeadLayout(data, shape, dim, numK, numVPerK, headDim)
+	}
+}
+
+// repackAttnQKV returns a Repacker for the fused linear-attention QKV
+// projection. The Q and K portions are copied through unchanged; only the V
+// portion has its heads reordered via reorderHeadLayout.
+//
+// Both the [qkv, in] and the transposed [in, qkv] layouts are recognised by
+// matching a dimension against the expected fused size. Tensors that are not
+// 2-D, match neither layout, or belong to a model without key heads are
+// returned unchanged.
+func (q *qwen3NextModel) repackAttnQKV() Repacker {
+	return func(_ string, data []float32, shape []uint64) ([]float32, error) {
+		if !q.shouldReorderVHeads() || len(shape) != 2 {
+			return data, nil
+		}
+
+		rows := int(shape[0])
+		cols := int(shape[1])
+		numK := int(q.LinearNumKeyHeads)
+		numV := int(q.LinearNumValueHeads)
+		headK := int(q.LinearKeyHeadDim)
+		headV := int(q.LinearValueHeadDim)
+
+		// numV/numK below is an integer division; with no key heads it would
+		// panic, so treat that configuration as "nothing to reorder".
+		if numK <= 0 {
+			return data, nil
+		}
+		vPerK := numV / numK
+
+		qDim := headK * numK
+		kDim := headK * numK
+		vDim := headV * numV
+		qkvDim := qDim + kDim + vDim
+
+		switch {
+		case rows == qkvDim:
+			// HF layout: [out_features, in_features]. Keep Q/K rows unchanged and
+			// reorder only V rows from grouped -> tiled head layout.
+			out := make([]float32, len(data))
+			qkRows := qDim + kDim
+			qkSize := qkRows * cols
+			copy(out[:qkSize], data[:qkSize])
+
+			vStart := qkSize
+			vEnd := vStart + vDim*cols
+			reorderedV, err := reorderHeadLayout(data[vStart:vEnd], []uint64{uint64(vDim), uint64(cols)}, 0, numK, vPerK, headV)
+			if err != nil {
+				return nil, err
+			}
+			copy(out[vStart:vEnd], reorderedV)
+			// Carry over anything that follows the V rows.
+			copy(out[vEnd:], data[vEnd:])
+			return out, nil
+
+		case cols == qkvDim:
+			// Fallback for already-transposed [in_features, out_features] tensors:
+			// each row holds one Q|K|V vector, so reorder the V tail row by row.
+			out := make([]float32, len(data))
+			copy(out, data)
+			for r := range rows {
+				base := r * cols
+				vStart := base + qDim + kDim
+				vEnd := vStart + vDim
+				reorderedV, err := reorderHeadLayout(out[vStart:vEnd], []uint64{uint64(vDim)}, 0, numK, vPerK, headV)
+				if err != nil {
+					return nil, err
+				}
+				copy(out[vStart:vEnd], reorderedV)
+			}
+			return out, nil
+
+		default:
+			return data, nil
+		}
+	}
+}
+
+// repackConv1D returns a Repacker for the linear-attention conv1d weight. The
+// Q/K channels are left as they are and only the V channels are reordered via
+// reorderHeadLayout. A 3-D shape with a unit first or second axis is treated
+// as its squeezed 2-D form; both [channels, kernel] and [kernel, channels]
+// layouts are handled. Any other input is returned unchanged.
+func (q *qwen3NextModel) repackConv1D() Repacker {
+	return func(_ string, data []float32, shape []uint64) ([]float32, error) {
+		if !q.shouldReorderVHeads() {
+			return data, nil
+		}
+
+		// Drop a singleton leading or middle axis so the rest works in 2-D.
+		squeezed := slices.Clone(shape)
+		if len(shape) == 3 {
+			switch {
+			case shape[0] == 1:
+				squeezed = []uint64{shape[1], shape[2]}
+			case shape[1] == 1:
+				squeezed = []uint64{shape[0], shape[2]}
+			}
+		}
+		if len(squeezed) != 2 {
+			return data, nil
+		}
+
+		kHeads, vHeads := int(q.LinearNumKeyHeads), int(q.LinearNumValueHeads)
+		kHeadDim, vHeadDim := int(q.LinearKeyHeadDim), int(q.LinearValueHeadDim)
+		qkChannels := 2 * kHeadDim * kHeads
+		if qkChannels <= 0 {
+			return data, nil
+		}
+		vChannels := vHeadDim * vHeads
+		totalChannels := qkChannels + vChannels
+		height, width := int(squeezed[0]), int(squeezed[1])
+
+		if height == totalChannels {
+			// HF layout after squeeze: [channels, kernel]
+			reordered := make([]float32, len(data))
+			boundary := qkChannels * width
+			copy(reordered[:boundary], data[:boundary])
+			vPart, err := reorderHeadLayout(data[boundary:], []uint64{uint64(vChannels), uint64(width)}, 0, kHeads, vHeads/kHeads, vHeadDim)
+			if err != nil {
+				return nil, err
+			}
+			copy(reordered[boundary:], vPart)
+			return reordered, nil
+		}
+
+		if width == totalChannels {
+			// Fallback for transposed [kernel, channels]
+			reordered := make([]float32, len(data))
+			copy(reordered, data)
+			for line := range height {
+				from := line*width + qkChannels
+				to := from + vChannels
+				vPart, err := reorderHeadLayout(reordered[from:to], []uint64{uint64(vChannels)}, 0, kHeads, vHeads/kHeads, vHeadDim)
+				if err != nil {
+					return nil, err
+				}
+				copy(reordered[from:to], vPart)
+			}
+			return reordered, nil
+		}
+
+		return data, nil
+	}
+}
+
+// repackSSMA returns a Repacker for the ssm_a tensor. Every element v is
+// replaced by -exp(v); when value-head reordering applies to this model the
+// transformed values are then reordered along axis 0 with a head size of 1.
+func (q *qwen3NextModel) repackSSMA() Repacker {
+	return func(_ string, data []float32, shape []uint64) ([]float32, error) {
+		result := make([]float32, len(data))
+		for i, v := range data {
+			result[i] = -float32(math.Exp(float64(v)))
+		}
+		// Skip the reorder when it is disabled or when there are no key heads;
+		// the latter would make the division below panic.
+		if !q.shouldReorderVHeads() || q.LinearNumKeyHeads == 0 {
+			return result, nil
+		}
+		numK := int(q.LinearNumKeyHeads)
+		numVPerK := int(q.LinearNumValueHeads / q.LinearNumKeyHeads)
+		return reorderHeadLayout(result, shape, 0, numK, numVPerK, 1)
+	}
+}
+
+// reorderHeadLayout views axis dim of data (laid out according to shape) as
+// (numKHeads, numVPerK, headDim), swaps the first two of those factors, and
+// returns the result flattened to one dimension. A negative dim counts from
+// the last axis.
+//
+// The input is returned as-is when shape is empty, any head parameter is
+// non-positive, dim is out of range, or the axis length is not exactly
+// numKHeads*numVPerK*headDim.
+func reorderHeadLayout(data []float32, shape []uint64, dim int, numKHeads, numVPerK, headDim int) ([]float32, error) {
+	rank := len(shape)
+	if rank == 0 || numKHeads <= 0 || numVPerK <= 0 || headDim <= 0 {
+		return data, nil
+	}
+	if dim < 0 {
+		dim += rank
+	}
+	if dim < 0 || dim >= rank {
+		return data, nil
+	}
+
+	extents := make([]int, rank)
+	elements := 1
+	for axis, extent := range shape {
+		extents[axis] = int(extent)
+		elements *= extents[axis]
+	}
+	if extents[dim] != numKHeads*numVPerK*headDim {
+		return data, nil
+	}
+
+	// Same shape, but with the target axis expanded into its three factors.
+	expanded := append(slices.Clone(extents[:dim]), numKHeads, numVPerK, headDim)
+	expanded = append(expanded, extents[dim+1:]...)
+
+	var grouped tensor.Tensor = tensor.New(tensor.WithShape(extents...), tensor.WithBacking(data))
+	if err := grouped.Reshape(expanded...); err != nil {
+		return nil, err
+	}
+
+	// Identity permutation with the two head axes exchanged.
+	order := make([]int, len(expanded))
+	for axis := range order {
+		order[axis] = axis
+	}
+	order[dim], order[dim+1] = order[dim+1], order[dim]
+
+	tiled, err := tensor.Transpose(grouped, order...)
+	if err != nil {
+		return nil, err
+	}
+	tiled = tensor.Materialize(tiled)
+
+	if err := tiled.Reshape(elements); err != nil {
+		return nil, err
+	}
+	return native.VectorF32(tiled.(*tensor.Dense))
+}
+
 type qkvzSplitSpec struct {
 	hidden    int
 	headKDim  int
@@ -369,7 +787,6 @@ func (q *qwen3NextModel) repackQKVZ(spec qkvzSplitSpec, extractGate bool) Repack
 		var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
 		var err error

-		// Convert to [hidden, out_features] layout for slicing
 		tt, err = tensor.Transpose(tt, 1, 0)
 		if err != nil {
 			return nil, err
@@ -444,7 +861,6 @@ func (q *qwen3NextModel) repackQKVZ(spec qkvzSplitSpec, extractGate bool) Repack
 	}
 }

-// addOne adds 1.0 to all elements in the tensor (for norm weights)
 func (*qwen3NextModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
 	n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
 	ones := tensor.Ones(tensor.Float32, int(shape[0]))
@@ -471,10 +887,21 @@ func (q *qwen3NextModel) Replacements() []string {
 	return []string{
 		// Embeddings and output
 		"lm_head", "output",
+		"model.language_model.embed_tokens", "token_embd",
+		"model.language_model.norm", "output_norm",
+		"model.language_model.layers", "blk",
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
 		"model.layers", "blk",

+		// Vision
+		"model.visual", "v",
+		"patch_embed.proj", "patch_embed",
+		"blocks", "blk",
+		"attn.qkv", "attn_qkv",
+		"attn.proj", "attn_out",
+		"deepstack_merger_list", "deepstack_merger",
+
 		// Layer norms
 		"input_layernorm", "attn_norm",
 		"post_attention_layernorm", "post_attention_norm",
@@ -487,9 +914,16 @@ func (q *qwen3NextModel) Replacements() []string {
 		"self_attn.v_proj", "attn_v",
 		"self_attn.o_proj", "attn_output",

-		// Linear attention (Gated Delta Net)
+		// Linear attention (legacy qwen3next)
 		"linear_attn.in_proj_qkvz", "ssm_in",
 		"linear_attn.in_proj_ba", "ssm_ba",
+
+		// Linear attention (qwen35)
+		"linear_attn.in_proj_qkv", "attn_qkv",
+		"linear_attn.in_proj_z", "attn_gate",
+		"linear_attn.in_proj_a", "ssm_alpha",
+		"linear_attn.in_proj_b", "ssm_beta",
+
 		"linear_attn.conv1d", "ssm_conv1d",
 		"linear_attn.dt_bias", "ssm_dt",
 		"linear_attn.dt_proj", "ssm_dt",
@@ -497,14 +931,14 @@ func (q *qwen3NextModel) Replacements() []string {
 		"linear_attn.norm", "ssm_norm",
 		"linear_attn.out_proj", "ssm_out",

-		// MoE (experts are stacked via mergeTensors, not replaced here)
+		// MoE
 		"mlp.gate.weight", "ffn_gate_inp.weight",
 		"mlp.shared_expert.down_proj", "ffn_down_shexp",
 		"mlp.shared_expert.gate_proj", "ffn_gate_shexp",
 		"mlp.shared_expert.up_proj", "ffn_up_shexp",
 		"mlp.shared_expert_gate", "ffn_gate_inp_shexp",

-		// Dense FFN (if any layers use it)
+		// Dense FFN
 		"mlp.down_proj", "ffn_down",
 		"mlp.gate_proj", "ffn_gate",
 		"mlp.up_proj", "ffn_up",
--- a/convert/convert_qwen3next_test.go
+++ b/convert/convert_qwen3next_test.go
@@ -0,0 +1,563 @@
+package convert
+
+import (
+	"bytes"
+	"encoding/binary"
+	"os"
+	"slices"
+	"strings"
+	"testing"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+// boolPtr returns a pointer to a fresh bool holding v.
+func boolPtr(v bool) *bool {
+	p := new(bool)
+	*p = v
+	return p
+}
+
+func readTensorData(t *testing.T, tensor *ggml.Tensor) []float32 {
+	t.Helper()
+
+	var b bytes.Buffer
+	if _, err := tensor.WriteTo(&b); err != nil {
+		t.Fatal(err)
+	}
+
+	numel := 1
+	for _, d := range tensor.Shape {
+		numel *= int(d)
+	}
+
+	values := make([]float32, numel)
+	if err := binary.Read(&b, binary.LittleEndian, &values); err != nil {
+		t.Fatal(err)
+	}
+
+	return values
+}
+
+func TestQwen3NextLegacyModelTypeDisablesReorder(t *testing.T) {
+	m := &qwen3NextModel{
+		ModelParameters: ModelParameters{
+			ModelType: "qwen3_next",
+		},
+	}
+
+	if m.shouldReorderVHeads() {
+		t.Fatalf("legacy qwen3_next model_type should not reorder v-head layout")
+	}
+}
+
+func TestQwen3NextLegacyArchitectureDisablesReorder(t *testing.T) {
+	m := &qwen3NextModel{
+		ModelParameters: ModelParameters{
+			Architectures: []string{"Qwen3NextForCausalLM"},
+		},
+	}
+
+	if m.shouldReorderVHeads() {
+		t.Fatalf("legacy Qwen3Next architecture should not reorder v-head layout")
+	}
+}
+
+// Converts a legacy qwen3_next MoE config whose text fields sit directly on
+// the model and checks the architecture, tokenizer pre-processor, per-layer
+// KV head counts, the absence of the reorder flag, and norm_top_k_prob.
+func TestQwen3NextKVLegacyConfig(t *testing.T) {
+	text := qwen3NextTextConfig{
+		MaxPositionEmbeddings: 8192,
+		HiddenSize:            512,
+		NumHiddenLayers:       4,
+		IntermediateSize:      2048,
+		NumAttentionHeads:     8,
+		NumKeyValueHeads:      2,
+		HeadDim:               64,
+		RopeTheta:             1_000_000,
+		RMSNormEPS:            1e-6,
+
+		NumExperts:             8,
+		NumExpertsPerToken:     2,
+		NormTopkProb:           boolPtr(true),
+		MoEIntermediateSize:    256,
+		SharedExpertIntermSize: 512,
+
+		FullAttentionInterval: 2,
+
+		LinearConvKernelDim: 4,
+		LinearKeyHeadDim:    64,
+		LinearNumKeyHeads:   2,
+		LinearNumValueHeads: 4,
+		LinearValueHeadDim:  64,
+
+		PartialRotaryFactor: 0.25,
+	}
+	model := &qwen3NextModel{
+		ModelParameters:     ModelParameters{ModelType: "qwen3_next"},
+		qwen3NextTextConfig: text,
+	}
+
+	if err := model.parseMore(os.DirFS(t.TempDir())); err != nil {
+		t.Fatal(err)
+	}
+
+	kv := model.KV(&Tokenizer{Vocabulary: &Vocabulary{}})
+
+	if arch := kv["general.architecture"]; arch != "qwen35moe" {
+		t.Fatalf("unexpected architecture: got %v want %v", arch, "qwen35moe")
+	}
+	if pre := kv["tokenizer.ggml.pre"]; pre != "qwen35" {
+		t.Fatalf("unexpected tokenizer pre: got %v want %v", pre, "qwen35")
+	}
+
+	heads, isSlice := kv["attention.head_count_kv"].([]uint32)
+	if !isSlice {
+		t.Fatalf("attention.head_count_kv has unexpected type: %T", kv["attention.head_count_kv"])
+	}
+	if expected := []uint32{0, 2, 0, 2}; !slices.Equal(heads, expected) {
+		t.Fatalf("unexpected attention.head_count_kv: got %v want %v", heads, expected)
+	}
+
+	if _, present := kv["ssm.v_head_reordered"]; present {
+		t.Fatalf("legacy qwen3next should not enable ssm.v_head_reordered")
+	}
+	if normTopK := kv["norm_top_k_prob"]; normTopK != true {
+		t.Fatalf("unexpected norm_top_k_prob: got %v want %v", normTopK, true)
+	}
+}
+
+// With NormTopkProb left nil, a MoE config must not produce a
+// norm_top_k_prob entry at all.
+func TestQwen35MoeOmitsNormTopKProbWhenUnset(t *testing.T) {
+	text := qwen3NextTextConfig{
+		MaxPositionEmbeddings: 4096,
+		HiddenSize:            512,
+		NumHiddenLayers:       4,
+		IntermediateSize:      2048,
+		NumAttentionHeads:     8,
+		NumKeyValueHeads:      2,
+		HeadDim:               64,
+		RopeTheta:             1_000_000,
+		RMSNormEPS:            1e-6,
+		NumExperts:            8,
+		NumExpertsPerToken:    2,
+		FullAttentionInterval: 2,
+		LinearConvKernelDim:   4,
+		LinearKeyHeadDim:      64,
+		LinearNumKeyHeads:     2,
+		LinearNumValueHeads:   4,
+		LinearValueHeadDim:    64,
+		PartialRotaryFactor:   0.25,
+	}
+	model := &qwen3NextModel{
+		ModelParameters:     ModelParameters{ModelType: "qwen3_5"},
+		qwen3NextTextConfig: text,
+	}
+
+	if err := model.parseMore(os.DirFS(t.TempDir())); err != nil {
+		t.Fatal(err)
+	}
+
+	kv := model.KV(&Tokenizer{Vocabulary: &Vocabulary{}})
+	if _, present := kv["norm_top_k_prob"]; present {
+		t.Fatalf("expected norm_top_k_prob to be omitted when not set in config")
+	}
+}
+
+func TestQwen35KVFromTextConfig(t *testing.T) { // KV() output when text settings arrive via the nested TextConfig pointer plus a vision config
+	m := &qwen3NextModel{
+		ModelParameters: ModelParameters{
+			ModelType: "qwen3_5",
+		},
+		TextConfig: &qwen3NextTextConfig{
+			MaxPositionEmbeddings: 16384,
+			HiddenSize:            1024,
+			NumHiddenLayers:       4,
+			IntermediateSize:      4096,
+			NumAttentionHeads:     8,
+			NumKeyValueHeads:      4,
+			HeadDim:               128,
+			RMSNormEPS:            1e-6,
+
+			LayerTypes: []string{ // one entry per layer; compared position-by-position with head_count_kv below
+				"linear_attention",
+				"full_attention",
+				"linear_attention",
+				"full_attention",
+			},
+
+			LinearConvKernelDim: 4,
+			LinearKeyHeadDim:    128,
+			LinearNumKeyHeads:   2,
+			LinearNumValueHeads: 4,
+			LinearValueHeadDim:  128,
+
+			RopeParameters: qwen3NextRopeParams{
+				MRopeInterleaved:    true,
+				MropeSection:        []int32{11, 11, 10},
+				RopeType:            "default",
+				RopeTheta:           10_000_000,
+				PartialRotaryFactor: 0.25,
+			},
+		},
+		VisionModel: qwen3NextVisionConfig{
+			Depth:                  2,
+			HiddenSize:             128,
+			NumHeads:               4,
+			InChannels:             3,
+			PatchSize:              16,
+			SpatialMergeSize:       2,
+			RMSNormEps:             1e-6,
+			RopeTheta:              10_000,
+			TemporalPatchSize:      2,
+			DeepstackVisualIndexes: []int32{1},
+		},
+		ImageTokenID:       1001,
+		VisionStartTokenID: 1002,
+		VisionEndTokenID:   1003,
+	}
+	m.VisionModel.Size.ShortestEdge = 224
+	m.VisionModel.Size.LongestEdge = 4096
+	m.VisionModel.ImageMean = []float32{0.5, 0.5, 0.5}
+	m.VisionModel.ImageStd = []float32{0.2, 0.2, 0.2}
+
+	if err := m.parseMore(os.DirFS(t.TempDir())); err != nil { // parseMore is given a fresh, empty temp directory
+		t.Fatal(err)
+	}
+
+	kv := m.KV(&Tokenizer{Vocabulary: &Vocabulary{}})
+	if got, want := kv["general.architecture"], "qwen35"; got != want {
+		t.Fatalf("unexpected architecture: got %v want %v", got, want)
+	}
+
+	headCountKV, ok := kv["attention.head_count_kv"].([]uint32)
+	if !ok {
+		t.Fatalf("attention.head_count_kv has unexpected type: %T", kv["attention.head_count_kv"])
+	}
+	if got, want := headCountKV, []uint32{0, 4, 0, 4}; !slices.Equal(got, want) { // 0 at the linear_attention positions, NumKeyValueHeads (4) at the full_attention ones
+		t.Fatalf("unexpected attention.head_count_kv: got %v want %v", got, want)
+	}
+
+	if got, ok := kv["ssm.v_head_reordered"].(bool); !ok || !got {
+		t.Fatalf("expected ssm.v_head_reordered=true, got %v (%T)", kv["ssm.v_head_reordered"], kv["ssm.v_head_reordered"])
+	}
+
+	mrope, ok := kv["mrope_sections"].([]int32) // same sections are expected under two keys: this one and rope.dimension_sections
+	if !ok {
+		t.Fatalf("mrope_sections has unexpected type: %T", kv["mrope_sections"])
+	}
+	if got, want := mrope, []int32{11, 11, 10}; !slices.Equal(got, want) {
+		t.Fatalf("unexpected mrope_sections: got %v want %v", got, want)
+	}
+	ropeSections, ok := kv["rope.dimension_sections"].([]int32)
+	if !ok {
+		t.Fatalf("rope.dimension_sections has unexpected type: %T", kv["rope.dimension_sections"])
+	}
+	if got, want := ropeSections, []int32{11, 11, 10}; !slices.Equal(got, want) {
+		t.Fatalf("unexpected rope.dimension_sections: got %v want %v", got, want)
+	}
+
+	if got, ok := kv["rope.mrope_interleaved"].(bool); !ok || !got {
+		t.Fatalf("expected rope.mrope_interleaved=true, got %v (%T)", kv["rope.mrope_interleaved"], kv["rope.mrope_interleaved"])
+	}
+
+	if got, want := kv["vision.block_count"], uint32(2); got != want { // same value as VisionModel.Depth set above
+		t.Fatalf("unexpected vision.block_count: got %v want %v", got, want)
+	}
+}
+
+func TestQwen3NextReplacements(t *testing.T) { // spot-checks tensor-name rewriting driven by Replacements()
+	r := strings.NewReplacer((&qwen3NextModel{}).Replacements()...) // zero-value model: only the replacement pairs are exercised
+
+	if got, want := r.Replace("model.language_model.layers.1.linear_attn.in_proj_qkv.weight"), "blk.1.attn_qkv.weight"; got != want { // "model.language_model." prefixed name
+		t.Fatalf("unexpected language-model replacement: got %q want %q", got, want)
+	}
+	if got, want := r.Replace("model.visual.blocks.0.attn.qkv.weight"), "v.blk.0.attn_qkv.weight"; got != want { // "model.visual." prefixed name maps under "v."
+		t.Fatalf("unexpected vision replacement: got %q want %q", got, want)
+	}
+	if got, want := r.Replace("model.layers.1.linear_attn.in_proj_qkvz.weight"), "blk.1.ssm_in.weight"; got != want { // name without the language_model segment
+		t.Fatalf("unexpected legacy replacement: got %q want %q", got, want)
+	}
+}
+
+func TestQwen35ReordersVHeads(t *testing.T) { // qwen3_5 attn_gate weights come out of Tensors() in a permuted order
+	m := &qwen3NextModel{
+		ModelParameters: ModelParameters{
+			ModelType: "qwen3_5",
+		},
+		qwen3NextTextConfig: qwen3NextTextConfig{
+			LinearNumKeyHeads:   2,
+			LinearNumValueHeads: 4,
+			LinearValueHeadDim:  1,
+		},
+	}
+
+	out := m.Tensors([]Tensor{
+		&fakeTensor{
+			name:  "blk.0.attn_gate.weight",
+			shape: []uint64{4, 2},
+			data:  []float32{0, 1, 2, 3, 4, 5, 6, 7}, // four consecutive pairs: {0,1} {2,3} {4,5} {6,7}
+		},
+	})
+	if len(out) != 1 {
+		t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
+	}
+
+	if got, want := readTensorData(t, out[0]), []float32{0, 1, 4, 5, 2, 3, 6, 7}; !slices.Equal(got, want) { // the two middle pairs trade places
+		t.Fatalf("unexpected data: got %v want %v", got, want)
+	}
+}
+
+func TestQwen35ReordersAttnQKVOutputDim(t *testing.T) { // only the v portion of a fused qkv weight is expected to be permuted
+	m := &qwen3NextModel{
+		ModelParameters: ModelParameters{
+			ModelType: "qwen3_5",
+		},
+		qwen3NextTextConfig: qwen3NextTextConfig{
+			LinearNumKeyHeads:   2,
+			LinearNumValueHeads: 4,
+			LinearKeyHeadDim:    1,
+			LinearValueHeadDim:  1,
+		},
+	}
+
+	out := m.Tensors([]Tensor{
+		&fakeTensor{
+			name:  "blk.0.attn_qkv.weight",
+			shape: []uint64{8, 2}, // [out_features, in_features] (HF layout)
+			data: []float32{
+				0, 1, // q0
+				2, 3, // q1
+				4, 5, // k0
+				6, 7, // k1
+				10, 11, // v(k0,v0)
+				12, 13, // v(k0,v1)
+				20, 21, // v(k1,v0)
+				22, 23, // v(k1,v1)
+			},
+		},
+	})
+	if len(out) != 1 {
+		t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
+	}
+
+	if got, want := readTensorData(t, out[0]), []float32{
+		0, 1, 2, 3, 4, 5, 6, 7,         // q0, q1, k0, k1 in their input order
+		10, 11, 20, 21, 12, 13, 22, 23, // v(k0,v0), v(k1,v0), v(k0,v1), v(k1,v1)
+	}; !slices.Equal(got, want) {
+		t.Fatalf("unexpected qkv data: got %v want %v", got, want)
+	}
+}
+
+func TestQwen35ReordersSsmOutInputDim(t *testing.T) { // ssm_out is expected to be permuted inside each run of four values
+	m := &qwen3NextModel{
+		ModelParameters: ModelParameters{
+			ModelType: "qwen3_5",
+		},
+		qwen3NextTextConfig: qwen3NextTextConfig{
+			LinearNumKeyHeads:   2,
+			LinearNumValueHeads: 4,
+			LinearValueHeadDim:  1,
+		},
+	}
+
+	out := m.Tensors([]Tensor{
+		&fakeTensor{
+			name:  "blk.0.ssm_out.weight",
+			shape: []uint64{2, 4},
+			data:  []float32{0, 1, 2, 3, 4, 5, 6, 7}, // two runs of four: {0,1,2,3} {4,5,6,7}
+		},
+	})
+	if len(out) != 1 {
+		t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
+	}
+
+	if got, want := readTensorData(t, out[0]), []float32{0, 2, 1, 3, 4, 6, 5, 7}; !slices.Equal(got, want) { // middle two values of each run swap
+		t.Fatalf("unexpected ssm_out data: got %v want %v", got, want)
+	}
+}
+
+func TestQwen35ReordersSsmBetaRows(t *testing.T) { // ssm_beta is expected to be permuted with only the two head counts configured
+	m := &qwen3NextModel{
+		ModelParameters: ModelParameters{
+			ModelType: "qwen3_5",
+		},
+		qwen3NextTextConfig: qwen3NextTextConfig{
+			LinearNumKeyHeads:   2,
+			LinearNumValueHeads: 4,
+		},
+	}
+
+	out := m.Tensors([]Tensor{
+		&fakeTensor{
+			name:  "blk.0.ssm_beta.weight",
+			shape: []uint64{4, 2},
+			data:  []float32{0, 1, 2, 3, 4, 5, 6, 7}, // four consecutive pairs: {0,1} {2,3} {4,5} {6,7}
+		},
+	})
+	if len(out) != 1 {
+		t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
+	}
+
+	if got, want := readTensorData(t, out[0]), []float32{0, 1, 4, 5, 2, 3, 6, 7}; !slices.Equal(got, want) { // the two middle pairs trade places
+		t.Fatalf("unexpected ssm_beta data: got %v want %v", got, want)
+	}
+}
+
+func TestQwen35ReordersConv1DChannelDim(t *testing.T) { // same input and expectation as the attn_qkv test, applied to ssm_conv1d
+	m := &qwen3NextModel{
+		ModelParameters: ModelParameters{
+			ModelType: "qwen3_5",
+		},
+		qwen3NextTextConfig: qwen3NextTextConfig{
+			LinearNumKeyHeads:   2,
+			LinearNumValueHeads: 4,
+			LinearKeyHeadDim:    1,
+			LinearValueHeadDim:  1,
+		},
+	}
+
+	out := m.Tensors([]Tensor{
+		&fakeTensor{
+			name:  "blk.0.ssm_conv1d.weight",
+			shape: []uint64{8, 2}, // [channels, kernel] after squeeze
+			data: []float32{
+				0, 1, // q0
+				2, 3, // q1
+				4, 5, // k0
+				6, 7, // k1
+				10, 11, // v(k0,v0)
+				12, 13, // v(k0,v1)
+				20, 21, // v(k1,v0)
+				22, 23, // v(k1,v1)
+			},
+		},
+	})
+	if len(out) != 1 {
+		t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
+	}
+
+	if got, want := readTensorData(t, out[0]), []float32{
+		0, 1, 2, 3, 4, 5, 6, 7,         // q0, q1, k0, k1 in their input order
+		10, 11, 20, 21, 12, 13, 22, 23, // v(k0,v0), v(k1,v0), v(k0,v1), v(k1,v1)
+	}; !slices.Equal(got, want) {
+		t.Fatalf("unexpected conv1d data: got %v want %v", got, want)
+	}
+}
+
+func TestLegacyQwen3NextDoesNotReorderVHeads(t *testing.T) { // counterpart to the qwen3_5 reorder tests: data must pass through untouched
+	m := &qwen3NextModel{
+		ModelParameters: ModelParameters{
+			ModelType: "qwen3_next", // the only ModelType in these tests for which no permutation is expected
+		},
+		qwen3NextTextConfig: qwen3NextTextConfig{
+			LinearNumKeyHeads:   2,
+			LinearNumValueHeads: 4,
+			LinearValueHeadDim:  1,
+		},
+	}
+
+	out := m.Tensors([]Tensor{
+		&fakeTensor{
+			name:  "blk.0.attn_gate.weight",
+			shape: []uint64{4, 1},
+			data:  []float32{0, 1, 2, 3},
+		},
+	})
+	if len(out) != 1 {
+		t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
+	}
+
+	if got, want := readTensorData(t, out[0]), []float32{0, 1, 2, 3}; !slices.Equal(got, want) { // identical to the input data
+		t.Fatalf("unexpected data for legacy qwen3next: got %v want %v", got, want)
+	}
+}
+
+func TestQwen35MoePackedExperts(t *testing.T) { // packed gate_up_proj must be split into separate gate and up expert tensors
+	m := &qwen3NextModel{
+		qwen3NextTextConfig: qwen3NextTextConfig{
+			NumHiddenLayers: 1,
+		},
+	}
+
+	out := m.Tensors([]Tensor{
+		&fakeTensor{
+			name:  "blk.0.mlp.experts.gate_up_proj",
+			shape: []uint64{2, 4, 3}, // the middle dimension (4) is the one halved in the expected gate/up shapes below
+			data: []float32{
+				0, 1, 2,
+				3, 4, 5,
+				6, 7, 8,
+				9, 10, 11,
+				12, 13, 14,
+				15, 16, 17,
+				18, 19, 20,
+				21, 22, 23,
+			},
+		},
+		&fakeTensor{
+			name:  "blk.0.mlp.experts.down_proj",
+			shape: []uint64{2, 5, 3},
+			data:  make([]float32, 2*5*3), // all zeros; only the name and shape of the result are asserted
+		},
+	})
+
+	get := func(name string) *ggml.Tensor { // first output tensor with the given name, or nil when absent
+		for _, tensor := range out {
+			if tensor.Name == name {
+				return tensor
+			}
+		}
+		return nil
+	}
+
+	gate := get("blk.0.ffn_gate_exps.weight")
+	if gate == nil {
+		t.Fatalf("missing tensor %q", "blk.0.ffn_gate_exps.weight")
+	}
+	if got, want := gate.Shape, []uint64{2, 2, 3}; !slices.Equal(got, want) {
+		t.Fatalf("unexpected gate shape: got %v want %v", got, want)
+	}
+	if got, want := readTensorData(t, gate), []float32{ // first six values of each 12-value half of the packed input
+		0, 1, 2, 3, 4, 5,
+		12, 13, 14, 15, 16, 17,
+	}; !slices.Equal(got, want) {
+		t.Fatalf("unexpected gate values: got %v want %v", got, want)
+	}
+
+	up := get("blk.0.ffn_up_exps.weight")
+	if up == nil {
+		t.Fatalf("missing tensor %q", "blk.0.ffn_up_exps.weight")
+	}
+	if got, want := up.Shape, []uint64{2, 2, 3}; !slices.Equal(got, want) {
+		t.Fatalf("unexpected up shape: got %v want %v", got, want)
+	}
+	if got, want := readTensorData(t, up), []float32{ // last six values of each 12-value half of the packed input
+		6, 7, 8, 9, 10, 11,
+		18, 19, 20, 21, 22, 23,
+	}; !slices.Equal(got, want) {
+		t.Fatalf("unexpected up values: got %v want %v", got, want)
+	}
+
+	down := get("blk.0.ffn_down_exps.weight") // down_proj is looked up under its output name
+	if down == nil {
+		t.Fatalf("missing tensor %q", "blk.0.ffn_down_exps.weight")
+	}
+	if got, want := down.Shape, []uint64{2, 5, 3}; !slices.Equal(got, want) { // same shape as the input down_proj
+		t.Fatalf("unexpected down shape: got %v want %v", got, want)
+	}
+}
+
+func TestQwen35SharedExpertGateKeepsMatrixShape(t *testing.T) { // the shared-expert gate must keep its 2-D shape through Tensors()
+	m := &qwen3NextModel{} // zero-value model: no config fields are set for this check
+
+	out := m.Tensors([]Tensor{
+		&fakeTensor{
+			name:  "blk.0.ffn_gate_inp_shexp.weight",
+			shape: []uint64{1, 4},
+			data:  []float32{0, 1, 2, 3},
+		},
+	})
+	if len(out) != 1 {
+		t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
+	}
+
+	if got, want := out[0].Shape, []uint64{1, 4}; !slices.Equal(got, want) { // the leading 1 must not be dropped
+		t.Fatalf("unexpected shared gate shape: got %v want %v", got, want)
+	}
+}
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -101,6 +101,8 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 			t.Pre = "deepseek-coder"
 		case "1ff7f41064896984db5d1bb6ff64fa4bc29007d08c1b439e505b7392777a319e":
 			t.Pre = "qwen2"
+		case "00431aed57e696b747435f734d1e3b9b1bfd931a121fb5cac7129e97c181e9ba":
+			t.Pre = "qwen35"
 		case "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855":
 			// noop, empty pretokenizer
 		default:
--- a/convert/tokenizer_test.go
+++ b/convert/tokenizer_test.go
@@ -386,6 +386,28 @@ func TestParseTokenizer(t *testing.T) {
 				Pre: "default",
 			},
 		},
+		{
+			name: "qwen35 pretokenizer",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"pre_tokenizer": {
+						"type": "Sequence",
+						"pretokenizers": [
+							{
+								"type": "Split",
+								"pattern": {
+									"Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+								}
+							}
+						]
+					}
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{Model: "gpt2"},
+				Pre:        "qwen35",
+			},
+		},
 	}

 	for _, tt := range cases {
--- a/docs/api/anthropic-compatibility.mdx
+++ b/docs/api/anthropic-compatibility.mdx
@@ -12,7 +12,6 @@ To use Ollama with tools that expect the Anthropic API (like Claude Code), set t

 ```shell
 export ANTHROPIC_AUTH_TOKEN=ollama  # required but ignored
-export ANTHROPIC_API_KEY="" # required but ignored
 export ANTHROPIC_BASE_URL=http://localhost:11434
 ```

@@ -269,7 +268,7 @@ ollama launch claude --config
 Set the environment variables and run Claude Code:

 ```shell
-ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY="" claude --model qwen3-coder
+ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 claude --model qwen3-coder
 ```

 Or set the environment variables in your shell profile:
@@ -277,7 +276,6 @@ Or set the environment variables in your shell profile:
 ```shell
 export ANTHROPIC_AUTH_TOKEN=ollama
 export ANTHROPIC_BASE_URL=http://localhost:11434
-export ANTHROPIC_API_KEY=""
 ```

 Then run Claude Code with any Ollama model:
--- a/docs/api/openai-compatibility.mdx
+++ b/docs/api/openai-compatibility.mdx
@@ -6,7 +6,7 @@ Ollama provides compatibility with parts of the [OpenAI API](https://platform.op

 ## Usage

-### Simple `v1/chat/completions` example
+### Simple `/v1/chat/completions` example

 <CodeGroup dropdown>

@@ -57,7 +57,7 @@ curl -X POST http://localhost:11434/v1/chat/completions \

 </CodeGroup>

-### Simple `v1/responses` example
+### Simple `/v1/responses` example

 <CodeGroup dropdown>

@@ -103,7 +103,7 @@ curl -X POST http://localhost:11434/v1/responses \

 </CodeGroup>

-### v1/chat/completions with vision example
+### `/v1/chat/completions` with vision example

 <CodeGroup dropdown>

@@ -184,6 +184,7 @@ curl -X POST http://localhost:11434/v1/chat/completions \
 - [x] Reproducible outputs
 - [x] Vision
 - [x] Tools
+- [x] Reasoning/thinking control (for thinking models)
 - [ ] Logprobs

 #### Supported request fields
@@ -207,6 +208,9 @@ curl -X POST http://localhost:11434/v1/chat/completions \
 - [x] `top_p`
 - [x] `max_tokens`
 - [x] `tools`
+- [x] `reasoning_effort` (`"high"`, `"medium"`, `"low"`, `"none"`)
+- [x] `reasoning`
+  - [x] `effort` (`"high"`, `"medium"`, `"low"`, `"none"`)
 - [ ] `tool_choice`
 - [ ] `logit_bias`
 - [ ] `user`
--- a/docs/cli.mdx
+++ b/docs/cli.mdx
@@ -40,7 +40,7 @@ ollama launch claude
 Launch with a specific model:

 ```
-ollama launch claude --model qwen3-coder
+ollama launch claude --model qwen3.5
 ```

 Configure without launching:
--- a/docs/development.md
+++ b/docs/development.md
@@ -51,6 +51,9 @@ Install prerequisites:
    - [CUDA SDK](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_network)
 - (Optional) VULKAN GPU support
    - [VULKAN SDK](https://vulkan.lunarg.com/sdk/home) - useful for AMD/Intel GPUs
+- (Optional) MLX engine support
+    - [CUDA 13+ SDK](https://developer.nvidia.com/cuda-downloads)
+    - [cuDNN 9+](https://developer.nvidia.com/cudnn)

 Then, configure and build the project:

@@ -101,6 +104,10 @@ Install prerequisites:
 - (Optional) VULKAN GPU support
    - [VULKAN SDK](https://vulkan.lunarg.com/sdk/home) - useful for AMD/Intel GPUs
    - Or install via package manager: `sudo apt install vulkan-sdk` (Ubuntu/Debian) or `sudo dnf install vulkan-sdk` (Fedora/CentOS)
+- (Optional) MLX engine support
+    - [CUDA 13+ SDK](https://developer.nvidia.com/cuda-downloads)
+    - [cuDNN 9+](https://developer.nvidia.com/cudnn)
+    - OpenBLAS/LAPACK: `sudo apt install libopenblas-dev liblapack-dev liblapacke-dev` (Ubuntu/Debian)
 > [!IMPORTANT]
 > Ensure prerequisites are in `PATH` before running CMake.

@@ -118,6 +125,67 @@ Lastly, run Ollama:
 go run . serve
 ```

+## MLX Engine (Optional)
+
+The MLX engine enables running safetensors-based models. It requires building the [MLX](https://github.com/ml-explore/mlx) and [MLX-C](https://github.com/ml-explore/mlx-c) shared libraries separately via CMake. On macOS, MLX uses the Metal library to run on the GPU; on Windows and Linux, it runs on NVIDIA GPUs via CUDA v13.
+
+### macOS (Apple Silicon)
+
+Requires the Metal toolchain. Install [Xcode](https://developer.apple.com/xcode/) first, then:
+
+```shell
+xcodebuild -downloadComponent MetalToolchain
+```
+
+Verify it's installed correctly (should print "no input files"):
+
+```shell
+xcrun metal
+```
+
+Then build:
+
+```shell
+cmake -B build --preset MLX
+cmake --build build --preset MLX --parallel
+cmake --install build --component MLX
+```
+
+> [!NOTE]
+> Without the Metal toolchain, CMake will silently complete with Metal disabled. Check the CMake output for `Setting MLX_BUILD_METAL=OFF`, which indicates the toolchain is missing.
+
+### Windows / Linux (CUDA)
+
+Requires CUDA 13+ and [cuDNN](https://developer.nvidia.com/cudnn) 9+.
+
+```shell
+cmake -B build --preset "MLX CUDA 13"
+cmake --build build --target mlx --target mlxc --config Release --parallel
+cmake --install build --component MLX --strip
+```
+
+### Local MLX source overrides
+
+To build against a local checkout of MLX and/or MLX-C (useful for development), set environment variables before running CMake:
+
+```shell
+export OLLAMA_MLX_SOURCE=/path/to/mlx
+export OLLAMA_MLX_C_SOURCE=/path/to/mlx-c
+```
+
+For example, using the helper scripts with local mlx and mlx-c repos:
+```shell
+OLLAMA_MLX_SOURCE=../mlx OLLAMA_MLX_C_SOURCE=../mlx-c ./scripts/build_linux.sh
+
+OLLAMA_MLX_SOURCE=../mlx OLLAMA_MLX_C_SOURCE=../mlx-c ./scripts/build_darwin.sh
+```
+
+```powershell
+$env:OLLAMA_MLX_SOURCE="../mlx"
+$env:OLLAMA_MLX_C_SOURCE="../mlx-c"
+./scripts/build_windows.ps1
+```
+
 ## Docker

 ```shell
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -160,6 +160,12 @@
            "group": "More information",
            "pages": [
              "/cli",
+              {
+                "group": "Assistant Sandboxing",
+                "pages": [
+                  "/integrations/nemoclaw"
+                ]
+              },
              "/modelfile",
              "/context-length",
              "/linux",
--- a/docs/gpu.mdx
+++ b/docs/gpu.mdx
@@ -61,11 +61,17 @@ Ollama supports the following AMD GPUs via the ROCm library:

 ### Linux Support

-| Family         | Cards and accelerators                                                                                                                         |
-| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| AMD Radeon RX  | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64`                  |
-| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `SSG`       |
-| AMD Instinct   | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60`                                                                      |
+Ollama requires the AMD ROCm v7 driver on Linux. You can install or upgrade
+using the `amdgpu-install` utility from
+[AMD's ROCm documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/).
+
+| Family         | Cards and accelerators                                                                                                                                         |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| AMD Radeon RX  | `9070 XT` `9070 GRE` `9070` `9060 XT` `9060 XT LP` `9060` `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7700` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `5700 XT` `5700` `5600 XT` `5500 XT` |
+| AMD Radeon AI PRO | `R9700` `R9600D`                                                                                                                                            |
+| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620`                                                                        |
+| AMD Ryzen AI   | `Ryzen AI Max+ 395` `Ryzen AI Max 390` `Ryzen AI Max 385` `Ryzen AI 9 HX 475` `Ryzen AI 9 HX 470` `Ryzen AI 9 465` `Ryzen AI 9 HX 375` `Ryzen AI 9 HX 370` `Ryzen AI 9 365` |
+| AMD Instinct   | `MI350X` `MI300X` `MI300A` `MI250X` `MI250` `MI210` `MI100`                                                                                                   |

 ### Windows Support

@@ -97,17 +103,20 @@ This table shows some example GPUs that map to these LLVM targets:
 | **LLVM Target** | **An Example GPU** |
 |-----------------|---------------------|
 | gfx908 | Radeon Instinct MI100 |
-| gfx90a | Radeon Instinct MI210 |
-| gfx940 | Radeon Instinct MI300 |
-| gfx941 | |
-| gfx942 | |
+| gfx90a | Radeon Instinct MI210/MI250 |
+| gfx942 | Radeon Instinct MI300X/MI300A |
+| gfx950 | Radeon Instinct MI350X |
+| gfx1010 | Radeon RX 5700 XT |
+| gfx1012 | Radeon RX 5500 XT |
 | gfx1030 | Radeon PRO V620 |
 | gfx1100 | Radeon PRO W7900 |
 | gfx1101 | Radeon PRO W7700 |
 | gfx1102 | Radeon RX 7600 |
-
-AMD is working on enhancing ROCm v6 to broaden support for families of GPUs in a
-future release which should increase support for more GPUs.
+| gfx1103 | Radeon 780M |
+| gfx1150 | Ryzen AI 9 HX 375 |
+| gfx1151 | Ryzen AI Max+ 395 |
+| gfx1200 | Radeon RX 9070 |
+| gfx1201 | Radeon RX 9070 XT |

 Reach out on [Discord](https://discord.gg/ollama) or file an
 [issue](https://github.com/ollama/ollama/issues) for additional help.
--- a/docs/integrations/claude-code.mdx
+++ b/docs/integrations/claude-code.mdx
@@ -4,7 +4,7 @@ title: Claude Code

 Claude Code is Anthropic's agentic coding tool that can read, modify, and execute code in your working directory. 

-Open models can be used with Claude Code through Ollama's Anthropic-compatible API, enabling you to use models such as `glm-4.7`, `qwen3-coder`, `gpt-oss`. 
+Open models can be used with Claude Code through Ollama's Anthropic-compatible API, enabling you to use models such as `qwen3.5`, `glm-5:cloud`, `kimi-k2.5:cloud`.

 ![Claude Code with Ollama](https://files.ollama.com/claude-code.png)

@@ -32,13 +32,71 @@ irm https://claude.ai/install.ps1 | iex
 ollama launch claude
 ```

-To configure without launching:
-
+### Run directly with a model
 ```shell
-ollama launch claude --config
+ollama launch claude --model kimi-k2.5:cloud
 ```

-### Manual setup
+## Recommended Models
+
+- `kimi-k2.5:cloud`
+- `glm-5:cloud`
+- `minimax-m2.7:cloud`
+- `qwen3.5:cloud`
+- `glm-4.7-flash`
+- `qwen3.5`
+
+Cloud models are also available at [ollama.com/search?c=cloud](https://ollama.com/search?c=cloud).
+
+## Non-interactive (headless) mode
+
+Run Claude Code without interaction for use in Docker, CI/CD, or scripts:
+
+```shell
+ollama launch claude --model kimi-k2.5:cloud --yes -- -p "how does this repository work?"
+```
+
+The `--yes` flag auto-pulls the model, skips selectors, and requires `--model` to be specified. Arguments after `--` are passed directly to Claude Code.
+
+## Web search
+
+Claude Code can search the web through Ollama's web search API. See the [web search documentation](/capabilities/web-search) for setup and usage.
+
+## Scheduled Tasks with `/loop`
+
+The `/loop` command runs a prompt or slash command on a recurring schedule inside Claude Code. This is useful for automating repetitive tasks like checking PRs, running research, or setting reminders.
+
+```
+/loop <interval> <prompt or /command>
+```
+
+### Examples
+
+**Check in on your PRs**
+
+```
+/loop 30m Check my open PRs and summarize their status
+```
+
+**Automate research tasks**
+
+```
+/loop 1h Research the latest AI news and summarize key developments
+```
+
+**Automate bug reporting and triaging**
+
+```
+/loop 15m Check for new GitHub issues and triage by priority
+```
+
+**Set reminders**
+
+```
+/loop 1h Remind me to review the deploy status
+```
+
+## Manual setup

 Claude Code connects to Ollama using the Anthropic-compatible API.

@@ -53,23 +111,14 @@ export ANTHROPIC_BASE_URL=http://localhost:11434
 2. Run Claude Code with an Ollama model:

 ```shell
-claude --model gpt-oss:20b
+claude --model qwen3.5
 ```

 Or run with environment variables inline:

 ```shell
-ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY="" claude --model qwen3-coder 
+ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY="" claude --model glm-5:cloud
 ```

 **Note:** Claude Code requires a large context window. We recommend at least 64k tokens. See the [context length documentation](/context-length) for how to adjust context length in Ollama.

-## Recommended Models
-
- `qwen3-coder` 
- `glm-4.7`
- `gpt-oss:20b`
- `gpt-oss:120b`
-
-Cloud models are also available at [ollama.com/search?c=cloud](https://ollama.com/search?c=cloud).
-
--- a/docs/integrations/nemoclaw.mdx
+++ b/docs/integrations/nemoclaw.mdx
@@ -0,0 +1,67 @@
+---
+title: NemoClaw
+---
+
+NemoClaw is NVIDIA's open source security stack for [OpenClaw](/integrations/openclaw). It wraps OpenClaw with the NVIDIA OpenShell runtime to provide kernel-level sandboxing, network policy controls, and audit trails for AI agents. 
+
+## Quick start
+
+Pull a model:
+
+```bash
+ollama pull nemotron-3-nano:30b
+```
+
+Run the installer:
+
+```bash
+curl -fsSL https://www.nvidia.com/nemoclaw.sh | \
+  NEMOCLAW_NON_INTERACTIVE=1 \
+  NEMOCLAW_PROVIDER=ollama \
+  NEMOCLAW_MODEL=nemotron-3-nano:30b \
+  bash
+```
+
+Connect to your sandbox:
+
+```bash
+nemoclaw my-assistant connect
+```
+
+Open the TUI:
+
+```bash
+openclaw tui
+```
+
+<Note>Ollama support in NemoClaw is still experimental.</Note>
+
+## Platform support
+
+| Platform | Runtime | Status |
+|----------|---------|--------|
+| Linux (Ubuntu 22.04+) | Docker | Primary |
+| macOS (Apple Silicon) | Colima or Docker Desktop | Supported |
+| Windows | WSL2 with Docker Desktop | Supported |
+
+CMD and PowerShell are not supported on Windows — WSL2 is required.
+
+<Note>Ollama must be installed and running before the installer runs. When running inside WSL2 or a container, ensure Ollama is reachable from the sandbox (e.g. `OLLAMA_HOST=0.0.0.0`).</Note>
+
+## System requirements
+
+- CPU: 4 vCPU minimum
+- RAM: 8 GB minimum (16 GB recommended)
+- Disk: 20 GB free (40 GB recommended for local models)
+- Node.js 20+ and npm 10+
+- Container runtime (Docker preferred)
+
+## Recommended models
+
+- `nemotron-3-super:cloud` — Strong reasoning and coding
+- `qwen3.5:cloud` — 397B; reasoning and code generation
+- `nemotron-3-nano:30b` — Recommended local model; fits in 24 GB VRAM
+- `qwen3.5:27b` — Fast local reasoning (~18 GB VRAM)
+- `glm-4.7-flash` — Reasoning and code generation (~25 GB VRAM)
+
+More models at [ollama.com/search](https://ollama.com/search).
--- a/docs/integrations/openclaw.mdx
+++ b/docs/integrations/openclaw.mdx
@@ -15,13 +15,29 @@ Ollama handles everything automatically:
 1. **Install** — If OpenClaw isn't installed, Ollama prompts to install it via npm
 2. **Security** — On the first launch, a security notice explains the risks of tool access
 3. **Model** — Pick a model from the selector (local or cloud)
-4. **Onboarding** — Ollama configures the provider, installs the gateway daemon, and sets your model as the primary
+4. **Onboarding** — Ollama configures the provider, installs the gateway daemon, sets your model as the primary, and installs the web search and fetch plugin
 5. **Gateway** — Starts in the background and opens the OpenClaw TUI

 <Note>OpenClaw requires a larger context window. It is recommended to use a context window of at least 64k tokens if using local models. See [Context length](/context-length) for more information.</Note>

 <Note>Previously known as Clawdbot. `ollama launch clawdbot` still works as an alias.</Note>

+## Web search and fetch
+
+OpenClaw ships with a web search and fetch plugin that gives local or cloud models the ability to search the web and extract readable page content.
+
+```bash
+ollama launch openclaw
+```
+
+Web search and fetch is enabled automatically when launching OpenClaw through Ollama. To install the plugin directly:
+
+```bash
+openclaw plugins install @ollama/openclaw-web-search
+```
+
+<Note>Web search for local models requires `ollama signin`.</Note>
+
 ## Configure without launching

 To change the model without starting the gateway and TUI:
@@ -43,7 +59,7 @@ If the gateway is already running, it restarts automatically to pick up the new
 **Cloud models**:

 - `kimi-k2.5:cloud` — Multimodal reasoning with subagents
- `minimax-m2.5:cloud` — Fast, efficient coding and real-world productivity
+- `minimax-m2.7:cloud` — Fast, efficient coding and real-world productivity
 - `glm-5:cloud` — Reasoning and code generation

 **Local models:**
@@ -52,6 +68,16 @@ If the gateway is already running, it restarts automatically to pick up the new

 More models at [ollama.com/search](https://ollama.com/search?c=cloud).

+## Non-interactive (headless) mode
+
+Run OpenClaw without interaction for use in Docker, CI/CD, or scripts:
+
+```bash
+ollama launch openclaw --model kimi-k2.5:cloud --yes
+```
+
+The `--yes` flag auto-pulls the model, skips selectors, and requires `--model` to be specified.
+
 ## Connect messaging apps

 ```bash
--- a/docs/linux.mdx
+++ b/docs/linux.mdx
@@ -101,7 +101,7 @@ nvidia-smi

 ### Install AMD ROCm drivers (optional)

-[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html) ROCm v6.
+[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html) ROCm v7.

 ### Start Ollama

--- a/docs/troubleshooting.mdx
+++ b/docs/troubleshooting.mdx
@@ -114,6 +114,25 @@ If you are experiencing problems getting Ollama to correctly discover or use you
 - `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
 - Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`

+### AMD Driver Version Mismatch
+
+Your AMD GPU may go undetected on Linux while the server logs contain messages like:
+
+```
+msg="failure during GPU discovery" ... error="failed to finish discovery before timeout"
+msg="bootstrap discovery took" duration=30s ...
+```
+
+This typically means the system's AMD GPU driver is too old. Ollama bundles
+ROCm 7 Linux libraries, which require a compatible ROCm 7 kernel driver. If the
+system is running an older driver (ROCm 6.x or earlier), GPU initialization
+will hang during device discovery and eventually time out, causing Ollama to
+fall back to CPU.
+
+To resolve this, upgrade to the ROCm v7 driver using the `amdgpu-install`
+utility from [AMD's ROCm documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/).
+After upgrading, reboot and restart Ollama.
+
 ## Multiple AMD GPUs

 If you experience gibberish responses when models load across multiple AMD GPUs on Linux, see the following guide.
--- a/docs/windows.mdx
+++ b/docs/windows.mdx
@@ -80,9 +80,13 @@ help you keep up to date.

 If you'd like to install or integrate Ollama as a service, a standalone
 `ollama-windows-amd64.zip` zip file is available containing only the Ollama CLI
-and GPU library dependencies for Nvidia. If you have an AMD GPU, also download
-and extract the additional ROCm package `ollama-windows-amd64-rocm.zip` into the
-same directory. This allows for embedding Ollama in existing applications, or
+and GPU library dependencies for Nvidia. Depending on your hardware, you may also
+need to download and extract additional packages into the same directory:
+
+- **AMD GPU**: `ollama-windows-amd64-rocm.zip`
+- **MLX (CUDA)**: `ollama-windows-amd64-mlx.zip`
+
+This allows for embedding Ollama in existing applications, or
 running it as a system service via `ollama serve` with tools such as
 [NSSM](https://nssm.cc/).

--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -59,6 +59,29 @@ func Host() *url.URL {
 	}
 }

+// ConnectableHost is Host() made usable as a client dial target: a wildcard
+// bind address (0.0.0.0 or ::) is swapped for the loopback address of the same
+// family (127.0.0.1 or ::1). Wildcard addresses can be listened on but not
+// connected to as a client, which fails on Windows.
+func ConnectableHost() *url.URL {
+	target := Host()
+	name, port, splitErr := net.SplitHostPort(target.Host)
+	if splitErr != nil {
+		return target
+	}
+
+	addr := net.ParseIP(name)
+	if addr == nil || !addr.IsUnspecified() {
+		return target
+	}
+	loopback := "::1"
+	if addr.To4() != nil {
+		loopback = "127.0.0.1"
+	}
+	target.Host = net.JoinHostPort(loopback, port)
+	return target
+}
+
 // AllowedOrigins returns a list of allowed origins. AllowedOrigins can be configured via the OLLAMA_ORIGINS environment variable.
 func AllowedOrigins() (origins []string) {
 	if s := Var("OLLAMA_ORIGINS"); s != "" {
@@ -191,6 +214,8 @@ func LogLevel() slog.Level {
 var (
 	// FlashAttention enables the experimental flash attention feature.
 	FlashAttention = BoolWithDefault("OLLAMA_FLASH_ATTENTION")
+	// DebugLogRequests logs inference requests to disk for replay/debugging.
+	DebugLogRequests = Bool("OLLAMA_DEBUG_LOG_REQUESTS")
 	// KvCacheType is the quantization type for the K/V cache.
 	KvCacheType = String("OLLAMA_KV_CACHE_TYPE")
 	// NoHistory disables readline history.
@@ -279,28 +304,29 @@ type EnvVar struct {

 func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
-		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", LogLevel(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
-		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(false), "Enabled flash attention"},
-		"OLLAMA_KV_CACHE_TYPE":     {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
-		"OLLAMA_GPU_OVERHEAD":      {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
-		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
-		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
-		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
-		"OLLAMA_LOAD_TIMEOUT":      {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
-		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
-		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
-		"OLLAMA_MODELS":            {"OLLAMA_MODELS", Models(), "The path to the models directory"},
-		"OLLAMA_NO_CLOUD":          {"OLLAMA_NO_CLOUD", NoCloud(), "Disable Ollama cloud features (remote inference and web search)"},
-		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
-		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
-		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
-		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
-		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
-		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4k/32k/256k based on VRAM)"},
-		"OLLAMA_EDITOR":            {"OLLAMA_EDITOR", Editor(), "Path to editor for interactive prompt editing (Ctrl+G)"},
-		"OLLAMA_NEW_ENGINE":        {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
-		"OLLAMA_REMOTES":           {"OLLAMA_REMOTES", Remotes(), "Allowed hosts for remote models (default \"ollama.com\")"},
+		"OLLAMA_DEBUG":              {"OLLAMA_DEBUG", LogLevel(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
+		"OLLAMA_DEBUG_LOG_REQUESTS": {"OLLAMA_DEBUG_LOG_REQUESTS", DebugLogRequests(), "Log inference request bodies and replay curl commands to a temp directory"},
+		"OLLAMA_FLASH_ATTENTION":    {"OLLAMA_FLASH_ATTENTION", FlashAttention(false), "Enabled flash attention"},
+		"OLLAMA_KV_CACHE_TYPE":      {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
+		"OLLAMA_GPU_OVERHEAD":       {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
+		"OLLAMA_HOST":               {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
+		"OLLAMA_KEEP_ALIVE":         {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
+		"OLLAMA_LLM_LIBRARY":        {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
+		"OLLAMA_LOAD_TIMEOUT":       {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
+		"OLLAMA_MAX_LOADED_MODELS":  {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
+		"OLLAMA_MAX_QUEUE":          {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
+		"OLLAMA_MODELS":             {"OLLAMA_MODELS", Models(), "The path to the models directory"},
+		"OLLAMA_NO_CLOUD":           {"OLLAMA_NO_CLOUD", NoCloud(), "Disable Ollama cloud features (remote inference and web search)"},
+		"OLLAMA_NOHISTORY":          {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
+		"OLLAMA_NOPRUNE":            {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
+		"OLLAMA_NUM_PARALLEL":       {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
+		"OLLAMA_ORIGINS":            {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
+		"OLLAMA_SCHED_SPREAD":       {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
+		"OLLAMA_MULTIUSER_CACHE":    {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
+		"OLLAMA_CONTEXT_LENGTH":     {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4k/32k/256k based on VRAM)"},
+		"OLLAMA_EDITOR":             {"OLLAMA_EDITOR", Editor(), "Path to editor for interactive prompt editing (Ctrl+G)"},
+		"OLLAMA_NEW_ENGINE":         {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
+		"OLLAMA_REMOTES":            {"OLLAMA_REMOTES", Remotes(), "Allowed hosts for remote models (default \"ollama.com\")"},

 		// Informational
 		"HTTP_PROXY":  {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -52,6 +52,37 @@ func TestHost(t *testing.T) {
 	}
 }

+func TestConnectableHost(t *testing.T) {
+	cases := map[string]struct {
+		input string
+		want  string
+	}{
+		"empty":                    {"", "http://127.0.0.1:11434"},
+		"localhost":                {"127.0.0.1", "http://127.0.0.1:11434"},
+		"localhost and port":       {"127.0.0.1:1234", "http://127.0.0.1:1234"},
+		"ipv4 unspecified":         {"0.0.0.0", "http://127.0.0.1:11434"},
+		"ipv4 unspecified + port":  {"0.0.0.0:1234", "http://127.0.0.1:1234"},
+		"ipv6 unspecified":         {"[::]", "http://[::1]:11434"},
+		"ipv6 unspecified + port":  {"[::]:1234", "http://[::1]:1234"},
+		"ipv6 localhost":           {"[::1]", "http://[::1]:11434"},
+		"ipv6 localhost + port":    {"[::1]:1234", "http://[::1]:1234"},
+		"specific address":         {"192.168.1.5", "http://192.168.1.5:11434"},
+		"specific address + port":  {"192.168.1.5:8080", "http://192.168.1.5:8080"},
+		"hostname":                 {"example.com", "http://example.com:11434"},
+		"hostname and port":        {"example.com:1234", "http://example.com:1234"},
+		"https unspecified + port": {"https://0.0.0.0:4321", "https://127.0.0.1:4321"},
+	}
+
+	for name, tc := range cases {
+		t.Run(name, func(t *testing.T) {
+			t.Setenv("OLLAMA_HOST", tc.input)
+			if got := ConnectableHost().String(); got != tc.want {
+				t.Errorf("%s: expected %s, got %s", name, tc.want, got)
+			}
+		})
+	}
+}
+
 func TestOrigins(t *testing.T) {
 	cases := []struct {
 		value  string
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -290,6 +290,7 @@ func (kv KV) OllamaEngineRequired() bool {
 		"olmo3",
 		"qwen25vl",
 		"qwen3", "qwen3moe",
+		"qwen35", "qwen35moe",
 		"qwen3next",
 		"qwen3vl", "qwen3vlmoe",
 		"glm4moelite",
@@ -868,7 +869,12 @@ func (f GGML) SupportsFlashAttention() bool {
 		return false
 	}

-	if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
+	arch := f.KV().Architecture()
+	if slices.Contains([]string{"qwen35", "qwen35moe", "qwen3next"}, arch) {
+		return true
+	}
+
+	if slices.Contains([]string{"gemma2"}, arch) {
 		return false
 	}

@@ -892,6 +898,7 @@ func (f GGML) FlashAttention() bool {
 		"nemotron_h", "nemotron_h_moe",
 		"olmo3",
 		"qwen3", "qwen3moe",
+		"qwen35", "qwen35moe",
 		"qwen3next",
 		"qwen3vl", "qwen3vlmoe",
 	}, f.KV().String("general.architecture"))
--- a/fs/ggml/gguf.go
+++ b/fs/ggml/gguf.go
@@ -245,7 +245,22 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	padding := ggufPadding(offset, int64(alignment))
 	llm.tensorOffset = uint64(offset + padding)

+	// get file size to validate tensor bounds
+	fileSize, err := rs.Seek(0, io.SeekEnd)
+	if err != nil {
+		return fmt.Errorf("failed to determine file size: %w", err)
+	}
+
+	if _, err := rs.Seek(offset, io.SeekStart); err != nil {
+		return fmt.Errorf("failed to seek back after size check: %w", err)
+	}
+
 	for _, tensor := range llm.tensors {
+		tensorEnd := llm.tensorOffset + tensor.Offset + tensor.Size()
+		if tensorEnd > uint64(fileSize) {
+			return fmt.Errorf("tensor %q offset+size (%d) exceeds file size (%d)", tensor.Name, tensorEnd, fileSize)
+		}
+
 		offset, err := rs.Seek(0, io.SeekCurrent)
 		if err != nil {
 			return fmt.Errorf("failed to get current offset: %w", err)
--- a/fs/ggml/gguf_test.go
+++ b/fs/ggml/gguf_test.go
@@ -11,21 +11,21 @@ import (
 )

 func TestWriteGGUF(t *testing.T) {
-	b := bytes.NewBuffer(make([]byte, 2*3))
+	tensorData := make([]byte, 2*3*4) // 6 F32 elements = 24 bytes
 	for range 8 {
 		t.Run("shuffle", func(t *testing.T) {
 			t.Parallel()

 			ts := []*Tensor{
-				{Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: b},
-				{Name: "blk.0.ffn_norm.weight", Shape: []uint64{2, 3}, WriterTo: b},
-				{Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: b},
-				{Name: "blk.1.ffn_up.weight", Shape: []uint64{2, 3}, WriterTo: b},
-				{Name: "blk.2.ffn_norm.weight", Shape: []uint64{2, 3}, WriterTo: b},
-				{Name: "blk.1.ffn_down.weight", Shape: []uint64{2, 3}, WriterTo: b},
-				{Name: "blk.0.attn_k.weight", Shape: []uint64{2, 3}, WriterTo: b},
-				{Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: b},
-				{Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: b},
+				{Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
+				{Name: "blk.0.ffn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
+				{Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
+				{Name: "blk.1.ffn_up.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
+				{Name: "blk.2.ffn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
+				{Name: "blk.1.ffn_down.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
+				{Name: "blk.0.attn_k.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
+				{Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewReader(tensorData)},
+				{Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewReader(tensorData)},
 			}

 			rand.Shuffle(len(ts), func(i, j int) {
@@ -98,4 +98,32 @@ func TestWriteGGUF(t *testing.T) {
 			}
 		})
 	}
+
+	t.Run("truncated_tensor_data", func(t *testing.T) {
+		t.Parallel()
+
+		ts := []*Tensor{
+			{Name: "blk.0.attn.weight", Kind: 0, Shape: []uint64{512, 2}, WriterTo: bytes.NewBuffer(make([]byte, 32))},
+		}
+
+		w, err := os.CreateTemp(t.TempDir(), "truncated_*.bin")
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer w.Close()
+
+		if err := WriteGGUF(w, KV{"general.architecture": "test"}, ts); err != nil {
+			t.Fatal(err)
+		}
+
+		r, err := os.Open(w.Name())
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer r.Close()
+
+		if _, err := Decode(r, -1); err == nil {
+			t.Error("Decode should reject GGUF files where tensor data extends beyond file size")
+		}
+	})
 }
--- a/internal/modelref/modelref.go
+++ b/internal/modelref/modelref.go
@@ -0,0 +1,115 @@
+package modelref
+
+import (
+	"errors"
+	"fmt"
+	"strings"
+)
+
+type ModelSource uint8
+
+const (
+	ModelSourceUnspecified ModelSource = iota // no source suffix recognized
+	ModelSourceLocal                          // selected by a ":local" suffix
+	ModelSourceCloud                          // selected by ":cloud" or a legacy "-cloud" tag ending
+)
+
+var (
+	ErrConflictingSourceSuffix = errors.New("use either :local or :cloud, not both")
+	ErrModelRequired           = errors.New("model is required")
+)
+
+type ParsedRef struct {
+	Original string      // whitespace-trimmed input as given to ParseRef
+	Base     string      // input with the source suffix removed
+	Source   ModelSource // source selected by the suffix, if any
+}
+
+// ParseRef trims raw and splits off at most one source suffix (see
+// parseSourceSuffix), returning the remaining base name and the source.
+func ParseRef(raw string) (ParsedRef, error) {
+	trimmed := strings.TrimSpace(raw)
+	if trimmed == "" {
+		return ParsedRef{}, ErrModelRequired
+	}
+
+	// Peel one source suffix; a second one underneath means the caller
+	// stacked two (e.g. "foo:cloud:local"), which is rejected.
+	base, source, explicit := parseSourceSuffix(trimmed)
+	if explicit {
+		_, _, nested := parseSourceSuffix(base)
+		if nested {
+			return ParsedRef{}, fmt.Errorf("%w: %q", ErrConflictingSourceSuffix, trimmed)
+		}
+	}
+
+	ref := ParsedRef{Original: trimmed, Base: base, Source: source}
+	return ref, nil
+}
+
+func HasExplicitCloudSource(raw string) bool {
+	ref, parseErr := ParseRef(raw)
+	return parseErr == nil && ref.Source == ModelSourceCloud
+}
+
+func HasExplicitLocalSource(raw string) bool {
+	ref, parseErr := ParseRef(raw)
+	return parseErr == nil && ref.Source == ModelSourceLocal
+}
+
+// StripCloudSourceTag returns raw's base name and true when raw carries a cloud
+// source suffix; otherwise it returns raw, whitespace-trimmed, and false.
+func StripCloudSourceTag(raw string) (string, bool) {
+	if ref, err := ParseRef(raw); err == nil && ref.Source == ModelSourceCloud {
+		return ref.Base, true
+	}
+	return strings.TrimSpace(raw), false
+}
+
+// NormalizePullName maps raw to a pull name: cloud refs become the legacy cloud
+// tag form (with true); other refs yield their base name (with false).
+func NormalizePullName(raw string) (string, bool, error) {
+	ref, err := ParseRef(raw)
+	switch {
+	case err != nil:
+		return "", false, err
+	case ref.Source == ModelSourceCloud:
+		return toLegacyCloudPullName(ref.Base), true, nil
+	}
+	return ref.Base, false, nil
+}
+
+// toLegacyCloudPullName appends "-cloud" to an existing tag, or adds a ":cloud"
+// tag when base has none.
+func toLegacyCloudPullName(base string) string {
+	if !hasExplicitTag(base) {
+		return base + ":cloud"
+	}
+	return base + "-cloud"
+}
+
+// hasExplicitTag reports whether a ":" appears after the last "/" in name, so
+// a colon in an earlier path segment (e.g. a host:port) does not count as a tag.
+func hasExplicitTag(name string) bool {
+	return strings.LastIndex(name, ":") > strings.LastIndex(name, "/")
+}
+
+// parseSourceSuffix inspects the text after the last ":" in raw. A
+// case-insensitive "cloud"/"local" there is stripped along with the colon; a
+// slash-free tag ending in "-cloud" (legacy form) keeps the tag minus that ending.
+func parseSourceSuffix(raw string) (string, ModelSource, bool) {
+	colon := strings.LastIndex(raw, ":")
+	if colon < 0 {
+		return raw, ModelSourceUnspecified, false
+	}
+	tag := strings.TrimSpace(raw[colon+1:])
+	lowered := strings.ToLower(tag)
+	switch {
+	case lowered == "cloud":
+		return raw[:colon], ModelSourceCloud, true
+	case lowered == "local":
+		return raw[:colon], ModelSourceLocal, true
+	case !strings.Contains(tag, "/") && strings.HasSuffix(lowered, "-cloud"):
+		return raw[:colon+1] + tag[:len(tag)-len("-cloud")], ModelSourceCloud, true
+	}
+	return raw, ModelSourceUnspecified, false
+}
--- a/internal/modelref/modelref_test.go
+++ b/internal/modelref/modelref_test.go
@@ -0,0 +1,268 @@
+package modelref
+
+import (
+	"errors"
+	"testing"
+)
+
+func TestParseRef(t *testing.T) {
+	tests := []struct {
+		name         string
+		input        string
+		wantBase     string
+		wantSource   ModelSource
+		wantErr      error
+		wantCloud    bool
+		wantLocal    bool
+		wantStripped string
+		wantStripOK  bool
+	}{
+		{
+			name:         "cloud suffix",
+			input:        "gpt-oss:20b:cloud",
+			wantBase:     "gpt-oss:20b",
+			wantSource:   ModelSourceCloud,
+			wantCloud:    true,
+			wantStripped: "gpt-oss:20b",
+			wantStripOK:  true,
+		},
+		{
+			name:         "legacy cloud suffix",
+			input:        "gpt-oss:20b-cloud",
+			wantBase:     "gpt-oss:20b",
+			wantSource:   ModelSourceCloud,
+			wantCloud:    true,
+			wantStripped: "gpt-oss:20b",
+			wantStripOK:  true,
+		},
+		{
+			name:         "local suffix",
+			input:        "qwen3:8b:local",
+			wantBase:     "qwen3:8b",
+			wantSource:   ModelSourceLocal,
+			wantLocal:    true,
+			wantStripped: "qwen3:8b:local",
+		},
+		{
+			name:         "no source suffix",
+			input:        "llama3.2",
+			wantBase:     "llama3.2",
+			wantSource:   ModelSourceUnspecified,
+			wantStripped: "llama3.2",
+		},
+		{
+			name:         "bare cloud name is not explicit cloud",
+			input:        "my-cloud-model",
+			wantBase:     "my-cloud-model",
+			wantSource:   ModelSourceUnspecified,
+			wantStripped: "my-cloud-model",
+		},
+		{
+			name:         "slash in suffix blocks legacy cloud parsing",
+			input:        "foo:bar-cloud/baz",
+			wantBase:     "foo:bar-cloud/baz",
+			wantSource:   ModelSourceUnspecified,
+			wantStripped: "foo:bar-cloud/baz",
+		},
+		{
+			name:       "conflicting source suffixes",
+			input:      "foo:cloud:local",
+			wantErr:    ErrConflictingSourceSuffix,
+			wantSource: ModelSourceUnspecified,
+		},
+		{
+			name:       "empty input",
+			input:      "   ",
+			wantErr:    ErrModelRequired,
+			wantSource: ModelSourceUnspecified,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			ref, err := ParseRef(tc.input)
+			if tc.wantErr != nil {
+				if !errors.Is(err, tc.wantErr) {
+					t.Fatalf("ParseRef(%q) error = %v, want %v", tc.input, err, tc.wantErr)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("ParseRef(%q) returned error: %v", tc.input, err)
+			}
+			// Parsed fields.
+			if ref.Base != tc.wantBase {
+				t.Fatalf("base = %q, want %q", ref.Base, tc.wantBase)
+			}
+			if ref.Source != tc.wantSource {
+				t.Fatalf("source = %v, want %v", ref.Source, tc.wantSource)
+			}
+
+			// Convenience predicates built on ParseRef.
+			if isCloud := HasExplicitCloudSource(tc.input); isCloud != tc.wantCloud {
+				t.Fatalf("HasExplicitCloudSource(%q) = %v, want %v", tc.input, isCloud, tc.wantCloud)
+			}
+			if isLocal := HasExplicitLocalSource(tc.input); isLocal != tc.wantLocal {
+				t.Fatalf("HasExplicitLocalSource(%q) = %v, want %v", tc.input, isLocal, tc.wantLocal)
+			}
+
+			// Stripping reports ok only for cloud refs.
+			base, stripOK := StripCloudSourceTag(tc.input)
+			if stripOK != tc.wantStripOK {
+				t.Fatalf("StripCloudSourceTag(%q) ok = %v, want %v", tc.input, stripOK, tc.wantStripOK)
+			}
+			if base != tc.wantStripped {
+				t.Fatalf("StripCloudSourceTag(%q) base = %q, want %q", tc.input, base, tc.wantStripped)
+			}
+		})
+	}
+}
+
+func TestNormalizePullName(t *testing.T) {
+	tests := []struct {
+		name      string
+		input     string
+		wantName  string
+		wantCloud bool
+		wantErr   error
+	}{
+		{
+			name:     "explicit local strips source",
+			input:    "gpt-oss:20b:local",
+			wantName: "gpt-oss:20b",
+		},
+		{
+			name:      "explicit cloud with size maps to legacy dash cloud tag",
+			input:     "gpt-oss:20b:cloud",
+			wantName:  "gpt-oss:20b-cloud",
+			wantCloud: true,
+		},
+		{
+			name:      "legacy cloud with size remains stable",
+			input:     "gpt-oss:20b-cloud",
+			wantName:  "gpt-oss:20b-cloud",
+			wantCloud: true,
+		},
+		{
+			name:      "explicit cloud without tag maps to cloud tag",
+			input:     "qwen3:cloud",
+			wantName:  "qwen3:cloud",
+			wantCloud: true,
+		},
+		{
+			name:      "host port without tag keeps host port and appends cloud tag",
+			input:     "localhost:11434/library/foo:cloud",
+			wantName:  "localhost:11434/library/foo:cloud",
+			wantCloud: true,
+		},
+		{
+			name:    "conflicting source suffixes fail",
+			input:   "foo:cloud:local",
+			wantErr: ErrConflictingSourceSuffix,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			pullName, isCloud, err := NormalizePullName(tc.input)
+			if tc.wantErr != nil {
+				if !errors.Is(err, tc.wantErr) {
+					t.Fatalf("NormalizePullName(%q) error = %v, want %v", tc.input, err, tc.wantErr)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("NormalizePullName(%q) returned error: %v", tc.input, err)
+			}
+			// Success path: check both the rewritten name and the cloud flag.
+			if pullName != tc.wantName {
+				t.Fatalf("normalized name = %q, want %q", pullName, tc.wantName)
+			}
+			if isCloud != tc.wantCloud {
+				t.Fatalf("cloud = %v, want %v", isCloud, tc.wantCloud)
+			}
+		})
+	}
+}
+
+func TestParseSourceSuffix(t *testing.T) {
+	tests := []struct {
+		name         string
+		input        string
+		wantBase     string
+		wantSource   ModelSource
+		wantExplicit bool
+	}{
+		{
+			name:         "explicit cloud suffix",
+			input:        "gpt-oss:20b:cloud",
+			wantBase:     "gpt-oss:20b",
+			wantSource:   ModelSourceCloud,
+			wantExplicit: true,
+		},
+		{
+			name:         "explicit local suffix",
+			input:        "qwen3:8b:local",
+			wantBase:     "qwen3:8b",
+			wantSource:   ModelSourceLocal,
+			wantExplicit: true,
+		},
+		{
+			name:         "legacy cloud suffix on tag",
+			input:        "gpt-oss:20b-cloud",
+			wantBase:     "gpt-oss:20b",
+			wantSource:   ModelSourceCloud,
+			wantExplicit: true,
+		},
+		{
+			name:         "legacy cloud suffix does not match model segment",
+			input:        "my-cloud-model",
+			wantBase:     "my-cloud-model",
+			wantSource:   ModelSourceUnspecified,
+			wantExplicit: false,
+		},
+		{
+			name:         "legacy cloud suffix blocked when suffix includes slash",
+			input:        "foo:bar-cloud/baz",
+			wantBase:     "foo:bar-cloud/baz",
+			wantSource:   ModelSourceUnspecified,
+			wantExplicit: false,
+		},
+		{
+			name:         "unknown suffix is not explicit source",
+			input:        "gpt-oss:clod",
+			wantBase:     "gpt-oss:clod",
+			wantSource:   ModelSourceUnspecified,
+			wantExplicit: false,
+		},
+		{
+			name:         "uppercase suffix is accepted",
+			input:        "gpt-oss:20b:CLOUD",
+			wantBase:     "gpt-oss:20b",
+			wantSource:   ModelSourceCloud,
+			wantExplicit: true,
+		},
+		{
+			name:         "no suffix",
+			input:        "llama3.2",
+			wantBase:     "llama3.2",
+			wantSource:   ModelSourceUnspecified,
+			wantExplicit: false,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			base, source, explicit := parseSourceSuffix(tc.input)
+			if base != tc.wantBase {
+				t.Fatalf("base = %q, want %q", base, tc.wantBase)
+			}
+			if source != tc.wantSource {
+				t.Fatalf("source = %v, want %v", source, tc.wantSource)
+			}
+			if explicit != tc.wantExplicit {
+				t.Fatalf("explicit = %v, want %v", explicit, tc.wantExplicit)
+			}
+		})
+	}
+}
--- a/kvcache/recurrent.go
+++ b/kvcache/recurrent.go
@@ -11,9 +11,9 @@ import (
 )

 const (
-	DefaultCheckpointCount    = 32
+	DefaultCheckpointCount    = 24
 	DefaultCheckpointMinPos   = int32(16)
-	DefaultCheckpointInterval = int32(1280)
+	DefaultCheckpointInterval = int32(1664)
 )

 var ErrInvalidRecurrentShape = errors.New("kvcache: invalid recurrent state shape")
--- a/llm/server.go
+++ b/llm/server.go
@@ -74,8 +74,7 @@ type LlamaServer interface {
 	Tokenize(ctx context.Context, content string) ([]int, error)
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
-	VRAMSize() uint64 // Total VRAM across all GPUs
-	TotalSize() uint64
+	MemorySize() (total, vram uint64)
 	VRAMByGPU(id ml.DeviceID) uint64
 	Pid() int
 	GetPort() int
@@ -88,7 +87,8 @@ type LlamaServer interface {
 type llmServer struct {
 	port      int
 	cmd       *exec.Cmd
-	done      chan error // Channel to signal when the process exits
+	done      chan struct{} // closed when the process exits
+	doneErr   error         // valid after done is closed
 	status    *StatusWriter
 	options   api.Options
 	modelPath string
@@ -281,7 +281,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 		sem:            semaphore.NewWeighted(int64(numParallel)),
 		totalLayers:    f.KV().BlockCount() + 1,
 		loadStart:      time.Now(),
-		done:           make(chan error, 1),
+		done:           make(chan struct{}),
 	}

 	if err != nil {
@@ -305,10 +305,11 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 			if strings.Contains(s.status.LastErrMsg, "unknown model") {
 				s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
 			}
-			s.done <- errors.New(s.status.LastErrMsg)
+			s.doneErr = errors.New(s.status.LastErrMsg)
 		} else {
-			s.done <- err
+			s.doneErr = err
 		}
+		close(s.done)
 	}()

 	if tok != nil {
@@ -685,8 +686,9 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
 	// Windows CUDA should not use mmap for best performance
 	// Linux  with a model larger than free space, mmap leads to thrashing
 	// For CPU loads we want the memory to be allocated, not FS cache
+	totalSize, _ := s.MemorySize()
 	if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
-		(runtime.GOOS == "linux" && systemInfo.FreeMemory < s.TotalSize() && s.options.UseMMap == nil) ||
+		(runtime.GOOS == "linux" && systemInfo.FreeMemory < totalSize && s.options.UseMMap == nil) ||
 		(len(gpus) == 0 && s.options.UseMMap == nil) ||
 		(len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||
 		(s.options.UseMMap != nil && !*s.options.UseMMap) {
@@ -1356,8 +1358,8 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 		case <-ctx.Done():
 			slog.Warn("client connection closed before server finished loading, aborting load")
 			return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
-		case err := <-s.done:
-			return fmt.Errorf("llama runner process has terminated: %w", err)
+		case <-s.done:
+			return fmt.Errorf("llama runner process has terminated: %w", s.doneErr)
 		default:
 		}
 		if time.Now().After(stallTimer) {
@@ -1848,17 +1850,17 @@ func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
 	return nil
 }

-func (s *llmServer) VRAMSize() uint64 {
+func (s *llmServer) MemorySize() (total, vram uint64) {
 	if s.mem == nil {
-		return 0
+		return 0, 0
 	}

-	var mem uint64
-
 	for _, g := range s.mem.GPUs {
-		mem += g.Size()
+		vram += g.Size()
 	}

+	total = s.mem.InputWeights + s.mem.CPU.Size() + vram
+
 	// Some elements are always on CPU. However, if we have allocated all layers
 	// on the GPU then include the CPU components as well, to represent complete offloading.
 	noCPULayers := true
@@ -1869,25 +1871,11 @@ func (s *llmServer) VRAMSize() uint64 {
 		}
 	}
 	if noCPULayers {
-		mem += s.mem.InputWeights
-		mem += s.mem.CPU.Graph
+		vram += s.mem.InputWeights
+		vram += s.mem.CPU.Graph
 	}

-	return mem
-}
-
-func (s *llmServer) TotalSize() uint64 {
-	if s.mem == nil {
-		return 0
-	}
-
-	mem := s.mem.InputWeights
-	mem += s.mem.CPU.Size()
-	for _, g := range s.mem.GPUs {
-		mem += g.Size()
-	}
-
-	return mem
+	return total, vram
 }

 func (s *llmServer) VRAMByGPU(id ml.DeviceID) uint64 {
--- a/middleware/anthropic.go
+++ b/middleware/anthropic.go
@@ -17,6 +17,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 	internalcloud "github.com/ollama/ollama/internal/cloud"
+	"github.com/ollama/ollama/internal/modelref"
 	"github.com/ollama/ollama/logutil"
 )

@@ -33,12 +34,13 @@ func (w *AnthropicWriter) writeError(data []byte) (int, error) {
 		Error string `json:"error"`
 	}
 	if err := json.Unmarshal(data, &errData); err != nil {
-		return 0, err
+		// If the error response isn't valid JSON, use the raw bytes as the
+		// error message rather than surfacing a confusing JSON parse error.
+		errData.Error = string(data)
 	}

 	w.ResponseWriter.Header().Set("Content-Type", "application/json")
-	err := json.NewEncoder(w.ResponseWriter).Encode(anthropic.NewError(w.Status(), errData.Error))
-	if err != nil {
+	if err := json.NewEncoder(w.ResponseWriter).Encode(anthropic.NewError(w.Status(), errData.Error)); err != nil {
 		return 0, err
 	}

@@ -239,15 +241,6 @@ func (w *WebSearchAnthropicWriter) runWebSearchLoop(ctx context.Context, initial

 	var serverContent []anthropic.ContentBlock

-	if !isCloudModelName(w.req.Model) {
-		logutil.TraceContext(ctx, "anthropic middleware: web_search execution blocked", "reason", "non_cloud_model")
-		return anthropic.MessagesResponse{}, &webSearchLoopError{
-			code:  "web_search_not_supported_for_local_models",
-			query: extractQueryFromToolCall(&initialToolCall),
-			usage: usage,
-		}
-	}
-
 	for loop := 1; loop <= maxWebSearchLoops; loop++ {
 		query := extractQueryFromToolCall(&currentToolCall)
 		logutil.TraceContext(ctx, "anthropic middleware: web_search loop iteration",
@@ -919,7 +912,7 @@ func hasWebSearchTool(tools []anthropic.Tool) bool {
 }

 func isCloudModelName(name string) bool {
-	return strings.HasSuffix(name, ":cloud") || strings.HasSuffix(name, "-cloud")
+	return modelref.HasExplicitCloudSource(name)
 }

 // extractQueryFromToolCall extracts the search query from a web_search tool call
--- a/middleware/anthropic_test.go
+++ b/middleware/anthropic_test.go
@@ -1644,7 +1644,35 @@ func TestWebSearchCloudModelGating(t *testing.T) {
 		}
 	})

-	t.Run("local model emits web_search and gets structured error", func(t *testing.T) {
+	t.Run("local model emits web_search and gets results", func(t *testing.T) {
+		// Mock followup server for the model's response after receiving search results
+		followupServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			resp := api.ChatResponse{
+				Model:      "llama3.2",
+				Message:    api.Message{Role: "assistant", Content: "Based on search results, here is the answer."},
+				Done:       true,
+				DoneReason: "stop",
+				Metrics:    api.Metrics{PromptEvalCount: 20, EvalCount: 10},
+			}
+			_ = json.NewEncoder(w).Encode(resp)
+		}))
+		defer followupServer.Close()
+		t.Setenv("OLLAMA_HOST", followupServer.URL)
+
+		// Mock search server
+		searchServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			resp := anthropic.OllamaWebSearchResponse{
+				Results: []anthropic.OllamaWebSearchResult{
+					{Title: "Result", URL: "https://example.com", Content: "content"},
+				},
+			}
+			_ = json.NewEncoder(w).Encode(resp)
+		}))
+		defer searchServer.Close()
+		originalEndpoint := anthropic.WebSearchEndpoint
+		anthropic.WebSearchEndpoint = searchServer.URL
+		defer func() { anthropic.WebSearchEndpoint = originalEndpoint }()
+
 		router := gin.New()
 		router.Use(AnthropicMessagesMiddleware())
 		router.POST("/v1/messages", func(c *gin.Context) {
@@ -1685,16 +1713,23 @@ func TestWebSearchCloudModelGating(t *testing.T) {
 		if err := json.Unmarshal(resp.Body.Bytes(), &result); err != nil {
 			t.Fatalf("unmarshal error: %v", err)
 		}
-		if len(result.Content) != 2 {
-			t.Fatalf("expected 2 content blocks for local model web_search error, got %d", len(result.Content))
+
+		// Should have search result content blocks and the final text response
+		hasText := false
+		hasSearchResult := false
+		for _, block := range result.Content {
+			if block.Type == "text" {
+				hasText = true
+			}
+			if block.Type == "web_search_tool_result" {
+				hasSearchResult = true
+			}
 		}
-		contentJSON, _ := json.Marshal(result.Content[1].Content)
-		var errContent anthropic.WebSearchToolResultError
-		if err := json.Unmarshal(contentJSON, &errContent); err != nil {
-			t.Fatalf("failed to parse web_search error content: %v", err)
+		if !hasText {
+			t.Fatal("expected text content block in response")
 		}
-		if errContent.ErrorCode != "web_search_not_supported_for_local_models" {
-			t.Fatalf("expected web_search_not_supported_for_local_models, got %q", errContent.ErrorCode)
+		if !hasSearchResult {
+			t.Fatal("expected web_search_tool_result content block in response")
 		}
 	})

--- a/middleware/openai.go
+++ b/middleware/openai.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
+	"log/slog"
 	"math/rand"
 	"net/http"
 	"strings"
@@ -17,6 +18,9 @@ import (
 	"github.com/ollama/ollama/openai"
 )

+// maxDecompressedBodySize caps the size of a decompressed request body at 20 MiB.
+const maxDecompressedBodySize = 20 << 20
+
 type BaseWriter struct {
 	gin.ResponseWriter
 }
@@ -53,14 +57,14 @@ type EmbedWriter struct {

 func (w *BaseWriter) writeError(data []byte) (int, error) {
 	var serr api.StatusError
-	err := json.Unmarshal(data, &serr)
-	if err != nil {
-		return 0, err
+	if err := json.Unmarshal(data, &serr); err != nil {
+		// If the error response isn't valid JSON, use the raw bytes as the
+		// error message rather than surfacing a confusing JSON parse error.
+		serr.ErrorMessage = string(data)
 	}

 	w.ResponseWriter.Header().Set("Content-Type", "application/json")
-	err = json.NewEncoder(w.ResponseWriter).Encode(openai.NewError(http.StatusInternalServerError, serr.Error()))
-	if err != nil {
+	if err := json.NewEncoder(w.ResponseWriter).Encode(openai.NewError(w.ResponseWriter.Status(), serr.Error())); err != nil {
 		return 0, err
 	}

@@ -76,22 +80,29 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {

 	// chat chunk
 	if w.stream {
-		c := openai.ToChunk(w.id, chatResponse, w.toolCallSent)
-		d, err := json.Marshal(c)
-		if err != nil {
-			return 0, err
-		}
-		if !w.toolCallSent && len(c.Choices) > 0 && len(c.Choices[0].Delta.ToolCalls) > 0 {
-			w.toolCallSent = true
-		}
-
+		chunks := openai.ToChunks(w.id, chatResponse, w.toolCallSent)
 		w.ResponseWriter.Header().Set("Content-Type", "text/event-stream")
-		_, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
-		if err != nil {
-			return 0, err
+		for _, c := range chunks {
+			d, err := json.Marshal(c)
+			if err != nil {
+				return 0, err
+			}
+			if !w.toolCallSent && len(c.Choices) > 0 && len(c.Choices[0].Delta.ToolCalls) > 0 {
+				w.toolCallSent = true
+			}
+			_, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
+			if err != nil {
+				return 0, err
+			}
 		}

 		if chatResponse.Done {
+			c := openai.ToChunk(w.id, chatResponse, w.toolCallSent)
+			if len(chunks) > 0 {
+				c = chunks[len(chunks)-1]
+			} else {
+				slog.Warn("ToChunks returned no chunks; falling back to ToChunk for usage chunk", "id", w.id, "model", chatResponse.Model)
+			}
 			if w.streamOptions != nil && w.streamOptions.IncludeUsage {
 				u := openai.ToUsage(chatResponse)
 				c.Usage = &u
@@ -504,7 +515,7 @@ func ResponsesMiddleware() gin.HandlerFunc {
 				return
 			}
 			defer reader.Close()
-			c.Request.Body = io.NopCloser(reader)
+			c.Request.Body = http.MaxBytesReader(c.Writer, io.NopCloser(reader), maxDecompressedBodySize)
 			c.Request.Header.Del("Content-Encoding")
 		}

--- a/middleware/openai_test.go
+++ b/middleware/openai_test.go
@@ -76,6 +76,299 @@ func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc {
 	}
 }

+// sseDataFrames returns the payload of every "data: " frame in an SSE body.
+func sseDataFrames(body string) []string {
+	const prefix = "data: "
+	parts := strings.Split(body, "\n\n")
+	payloads := make([]string, 0, len(parts))
+	for _, part := range parts {
+		if part = strings.TrimSpace(part); strings.HasPrefix(part, prefix) {
+			payloads = append(payloads, part[len(prefix):])
+		}
+	}
+	return payloads
+}
+
+func TestChatWriter_StreamMixedThinkingAndContentEmitsSplitChunks(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	rec := httptest.NewRecorder()
+	ginCtx, _ := gin.CreateTestContext(rec)
+	// Streaming writer that is asked to report usage.
+	cw := &ChatWriter{
+		id:            "chatcmpl-test",
+		stream:        true,
+		streamOptions: &openai.StreamOptions{IncludeUsage: true},
+		BaseWriter:    BaseWriter{ResponseWriter: ginCtx.Writer},
+	}
+	// A single final response that carries both thinking and content.
+	final := api.ChatResponse{
+		Model: "test-model",
+		Message: api.Message{
+			Thinking: "reasoning",
+			Content:  "final answer",
+		},
+		Done:       true,
+		DoneReason: "stop",
+		Metrics: api.Metrics{
+			PromptEvalCount: 3,
+			EvalCount:       2,
+		},
+	}
+
+	payload, err := json.Marshal(final)
+	if err != nil {
+		t.Fatalf("marshal response: %v", err)
+	}
+
+	if _, err = cw.Write(payload); err != nil {
+		t.Fatalf("write response: %v", err)
+	}
+
+	if ct := rec.Header().Get("Content-Type"); ct != "text/event-stream" {
+		t.Fatalf("expected Content-Type text/event-stream, got %q", ct)
+	}
+	// Expected order: reasoning chunk, content chunk, usage chunk, [DONE].
+	events := sseDataFrames(rec.Body.String())
+	if len(events) != 4 {
+		t.Fatalf("expected 4 SSE data frames (2 chunks + usage + [DONE]), got %d:\n%s", len(events), rec.Body.String())
+	}
+	if events[3] != "[DONE]" {
+		t.Fatalf("expected final frame [DONE], got %q", events[3])
+	}
+
+	var reasoning openai.ChatCompletionChunk
+	if err := json.Unmarshal([]byte(events[0]), &reasoning); err != nil {
+		t.Fatalf("unmarshal reasoning chunk: %v", err)
+	}
+
+	var content openai.ChatCompletionChunk
+	if err := json.Unmarshal([]byte(events[1]), &content); err != nil {
+		t.Fatalf("unmarshal content chunk: %v", err)
+	}
+
+	var usage openai.ChatCompletionChunk
+	if err := json.Unmarshal([]byte(events[2]), &usage); err != nil {
+		t.Fatalf("unmarshal usage chunk: %v", err)
+	}
+	// First frame: reasoning only, with no finish reason yet.
+	if n := len(reasoning.Choices); n != 1 {
+		t.Fatalf("expected 1 reasoning choice, got %d", n)
+	}
+	if got := reasoning.Choices[0].Delta.Reasoning; got != "reasoning" {
+		t.Fatalf("expected reasoning chunk reasoning %q, got %q", "reasoning", got)
+	}
+	if got := reasoning.Choices[0].Delta.Content; got != "" {
+		t.Fatalf("expected reasoning chunk content to be empty, got %v", got)
+	}
+	if fr := reasoning.Choices[0].FinishReason; fr != nil {
+		t.Fatalf("expected reasoning chunk finish reason nil, got %v", fr)
+	}
+	// Second frame: content only, carrying the "stop" finish reason.
+	if n := len(content.Choices); n != 1 {
+		t.Fatalf("expected 1 content choice, got %d", n)
+	}
+	if got := content.Choices[0].Delta.Reasoning; got != "" {
+		t.Fatalf("expected content chunk reasoning to be empty, got %q", got)
+	}
+	if got := content.Choices[0].Delta.Content; got != "final answer" {
+		t.Fatalf("expected content chunk content %q, got %v", "final answer", got)
+	}
+	if fr := content.Choices[0].FinishReason; fr == nil || *fr != "stop" {
+		t.Fatalf("expected content chunk finish reason %q, got %v", "stop", fr)
+	}
+	// Third frame: usage totals (3 prompt + 2 eval tokens) and no choices.
+	if usage.Usage == nil {
+		t.Fatal("expected usage chunk to include usage")
+	}
+	if total := usage.Usage.TotalTokens; total != 5 {
+		t.Fatalf("expected usage total tokens 5, got %d", total)
+	}
+	if n := len(usage.Choices); n != 0 {
+		t.Fatalf("expected usage chunk choices to be empty, got %d", n)
+	}
+}
+
+func TestChatWriter_StreamSingleChunkPathStillEmitsOneChunk(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	rec := httptest.NewRecorder()
+	ginCtx, _ := gin.CreateTestContext(rec)
+	// No streamOptions are set, so the test expects no usage frame.
+	cw := &ChatWriter{
+		id:         "chatcmpl-test",
+		stream:     true,
+		BaseWriter: BaseWriter{ResponseWriter: ginCtx.Writer},
+	}
+	// Content-only final response: the single-chunk path.
+	final := api.ChatResponse{
+		Model: "test-model",
+		Message: api.Message{
+			Content: "single chunk",
+		},
+		Done:       true,
+		DoneReason: "stop",
+	}
+
+	payload, err := json.Marshal(final)
+	if err != nil {
+		t.Fatalf("marshal response: %v", err)
+	}
+
+	if _, err = cw.Write(payload); err != nil {
+		t.Fatalf("write response: %v", err)
+	}
+
+	events := sseDataFrames(rec.Body.String())
+	if len(events) != 2 {
+		t.Fatalf("expected 2 SSE data frames (1 chunk + [DONE]), got %d:\n%s", len(events), rec.Body.String())
+	}
+	if events[1] != "[DONE]" {
+		t.Fatalf("expected final frame [DONE], got %q", events[1])
+	}
+
+	var only openai.ChatCompletionChunk
+	if err := json.Unmarshal([]byte(events[0]), &only); err != nil {
+		t.Fatalf("unmarshal chunk: %v", err)
+	}
+	if n := len(only.Choices); n != 1 {
+		t.Fatalf("expected 1 chunk choice, got %d", n)
+	}
+	if got := only.Choices[0].Delta.Content; got != "single chunk" {
+		t.Fatalf("expected chunk content %q, got %v", "single chunk", got)
+	}
+}
+
+func TestChatWriter_StreamMixedThinkingAndToolCallsWithoutDoneEmitsChunksOnly(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	rec := httptest.NewRecorder()
+	ginCtx, _ := gin.CreateTestContext(rec)
+	// Usage reporting is enabled, but the response below is not final.
+	cw := &ChatWriter{
+		id:            "chatcmpl-test",
+		stream:        true,
+		streamOptions: &openai.StreamOptions{IncludeUsage: true},
+		BaseWriter:    BaseWriter{ResponseWriter: ginCtx.Writer},
+	}
+	// Non-final response carrying thinking plus one tool call.
+	partial := api.ChatResponse{
+		Model: "test-model",
+		Message: api.Message{
+			Thinking: "reasoning",
+			ToolCalls: []api.ToolCall{
+				{
+					ID: "call_234",
+					Function: api.ToolCallFunction{
+						Index: 0,
+						Name:  "get_weather",
+						Arguments: testArgs(map[string]any{
+							"location": "Portland",
+						}),
+					},
+				},
+			},
+		},
+		Done: false,
+	}
+
+	payload, err := json.Marshal(partial)
+	if err != nil {
+		t.Fatalf("marshal response: %v", err)
+	}
+
+	if _, err = cw.Write(payload); err != nil {
+		t.Fatalf("write response: %v", err)
+	}
+	// Only the two split chunks are expected: no usage frame and no [DONE].
+	events := sseDataFrames(rec.Body.String())
+	if len(events) != 2 {
+		t.Fatalf("expected 2 SSE data frames (reasoning + tool-calls), got %d:\n%s", len(events), rec.Body.String())
+	}
+	if last := events[len(events)-1]; last == "[DONE]" {
+		t.Fatalf("did not expect [DONE] frame for non-final chunk: %s", rec.Body.String())
+	}
+
+	var reasoning openai.ChatCompletionChunk
+	if err := json.Unmarshal([]byte(events[0]), &reasoning); err != nil {
+		t.Fatalf("unmarshal reasoning chunk: %v", err)
+	}
+
+	var toolCalls openai.ChatCompletionChunk
+	if err := json.Unmarshal([]byte(events[1]), &toolCalls); err != nil {
+		t.Fatalf("unmarshal tool-call chunk: %v", err)
+	}
+
+	if !(len(reasoning.Choices) == 1 && reasoning.Choices[0].Delta.Reasoning == "reasoning") {
+		t.Fatalf("expected first chunk to be reasoning-only, got %+v", reasoning.Choices)
+	}
+	if !(len(toolCalls.Choices) == 1 && len(toolCalls.Choices[0].Delta.ToolCalls) == 1) {
+		t.Fatalf("expected second chunk to contain tool calls, got %+v", toolCalls.Choices)
+	}
+	if fr := toolCalls.Choices[0].FinishReason; fr != nil {
+		t.Fatalf("expected nil finish reason for non-final tool-call chunk, got %v", fr)
+	}
+	if sent := cw.toolCallSent; !sent {
+		t.Fatal("expected toolCallSent to be tracked after tool-call chunk emission")
+	}
+}
+
+func TestChatWriter_StreamMixedThinkingAndContentWithoutDoneEmitsChunksOnly(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	rec := httptest.NewRecorder()
+	ginCtx, _ := gin.CreateTestContext(rec)
+	// Usage reporting is enabled, but the response below is not final.
+	cw := &ChatWriter{
+		id:            "chatcmpl-test",
+		stream:        true,
+		streamOptions: &openai.StreamOptions{IncludeUsage: true},
+		BaseWriter:    BaseWriter{ResponseWriter: ginCtx.Writer},
+	}
+	// Non-final response carrying both thinking and content.
+	partial := api.ChatResponse{
+		Model: "test-model",
+		Message: api.Message{
+			Thinking: "reasoning",
+			Content:  "partial content",
+		},
+		Done: false,
+	}
+
+	payload, err := json.Marshal(partial)
+	if err != nil {
+		t.Fatalf("marshal response: %v", err)
+	}
+
+	if _, err = cw.Write(payload); err != nil {
+		t.Fatalf("write response: %v", err)
+	}
+	// Only the two split chunks are expected: no usage frame and no [DONE].
+	events := sseDataFrames(rec.Body.String())
+	if len(events) != 2 {
+		t.Fatalf("expected 2 SSE data frames (reasoning + content), got %d:\n%s", len(events), rec.Body.String())
+	}
+	if last := events[len(events)-1]; last == "[DONE]" {
+		t.Fatalf("did not expect [DONE] frame for non-final chunk: %s", rec.Body.String())
+	}
+
+	var reasoning openai.ChatCompletionChunk
+	if err := json.Unmarshal([]byte(events[0]), &reasoning); err != nil {
+		t.Fatalf("unmarshal reasoning chunk: %v", err)
+	}
+
+	var content openai.ChatCompletionChunk
+	if err := json.Unmarshal([]byte(events[1]), &content); err != nil {
+		t.Fatalf("unmarshal content chunk: %v", err)
+	}
+
+	if !(len(reasoning.Choices) == 1 && reasoning.Choices[0].Delta.Reasoning == "reasoning") {
+		t.Fatalf("expected first chunk to be reasoning-only, got %+v", reasoning.Choices)
+	}
+	if !(len(content.Choices) == 1 && content.Choices[0].Delta.Content == "partial content") {
+		t.Fatalf("expected second chunk to contain content, got %+v", content.Choices)
+	}
+	if fr := content.Choices[0].FinishReason; fr != nil {
+		t.Fatalf("expected nil finish reason for non-final content chunk, got %v", fr)
+	}
+}
+
 func TestChatMiddleware(t *testing.T) {
 	type testCase struct {
 		name string
@@ -929,7 +1222,7 @@ func TestRetrieveMiddleware(t *testing.T) {
 				  "code": null,
 				  "message": "model not found",
 				  "param": null,
-				  "type": "api_error"
+				  "type": "invalid_request_error"
 				}
 			}`,
 		},
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -195,6 +195,7 @@ type Tensor interface {
 	Concat(ctx Context, t2 Tensor, dim int) Tensor
 	Rows(ctx Context, t2 Tensor) Tensor
 	SetRows(ctx Context, src Tensor, idxs Tensor) Tensor
+	SetInplace(ctx Context, src Tensor, nb1, nb2, nb3, offset int) Tensor
 	Copy(ctx Context, t2 Tensor) Tensor
 	Duplicate(ctx Context) Tensor

--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -1345,6 +1345,21 @@ func (t *Tensor) SetRows(ctx ml.Context, src ml.Tensor, idxs ml.Tensor) ml.Tenso
 	}
 }

+// SetInplace calls ggml_set_inplace with t and src as its two tensor arguments,
+// passing nb1, nb2, nb3 and offset through as size_t; the result keeps t's backend.
+// NOTE(review): nb1..nb3/offset look like byte strides and a byte offset — confirm in ggml.h.
+func (t *Tensor) SetInplace(ctx ml.Context, src ml.Tensor, nb1, nb2, nb3, offset int) ml.Tensor {
+	owner := t.b
+	cctx := ctx.(*Context).ctx
+	dst, from := t.t, src.(*Tensor).t
+	node := C.ggml_set_inplace(
+		cctx, dst, from,
+		C.size_t(nb1), C.size_t(nb2), C.size_t(nb3),
+		C.size_t(offset),
+	)
+	return &Tensor{b: owner, t: node}
+}
+
 func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	return &Tensor{
 		b: t.b,
--- a/model/models/qwen3next/cache.go
+++ b/model/models/qwen3next/cache.go
@@ -2,595 +2,58 @@ package qwen3next

 import (
 	"math"
-	"slices"

 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/model/input"
 )

-var _ kvcache.Cache = (*HybridCache)(nil)
+var (
+	_ kvcache.Cache           = (*HybridCache)(nil)
+	_ kvcache.CheckpointCache = (*HybridCache)(nil)
+)

-// HybridCache stores:
-// - a standard causal KV cache for full attention layers
-// - per-sequence conv state for linear attention layers
-// - per-sequence delta state for linear attention layers
-//
-// Conv state shape (per layer, per sequence): [convKernelSize-1, convChannels]
-// Delta state shape (per layer, per sequence): [headVDim, headVDim * numVHeads]
+// HybridCache adapts the shared recurrent cache base for Qwen3-Next naming.
 type HybridCache struct {
-	kv *kvcache.Causal
-
-	backend      ml.Backend
-	dtype        ml.DType
-	maxSequences int
-
-	// Conv state dimensions
-	convDim      int // convKernelSize - 1
-	convChannels int // d_inner + 2 * num_k_heads * head_k_dim
-
-	// Delta state dimensions
-	deltaStateSize int // headVDim * headVDim * numVHeads
-
-	// slot mapping for recurrent state (copy-on-write)
-	slotForSeq map[int]int
-	refCount   []int
-	freeSlots  []int
-
-	// per-layer conv state buffers (allocated lazily)
-	convCtxs   map[int]ml.Context
-	convStates map[int]ml.Tensor // [convDim*convChannels, maxSlots]
-
-	// per-layer delta state buffers (allocated lazily)
-	deltaCtxs   map[int]ml.Context
-	deltaStates map[int]ml.Tensor // [deltaStateSize, maxSlots]
-
-	// recurrent checkpoints (per slot)
-	checkpointCount     int
-	checkpointMinPos    int32
-	checkpointInterval  int32
-	checkpointCtxSize   int
-	checkpoints         map[int]*slotCheckpointStore
-	pendingRestore      map[int]checkpointRestore
-	curCheckpointPos    []int32
-	curCheckpointSlots  map[int]int
-	reserveCheckpoints  bool
-	checkpointConvCtxs  map[int]ml.Context
-	checkpointDeltaCtxs map[int]ml.Context
-	checkpointReserved  map[int]struct{}
-
-	// current forward batch (derived in StartForward)
-	curSeqs       []int
-	curSlots      []int
-	curSlotsInput ml.Tensor
-	curSeqTokens  int
-
-	// track if EnsureWritable has been called for this forward pass
-	writableEnsured bool
-	writableError   error
+	*kvcache.Recurrent
 }

 func NewHybridCache(
 	shift func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error),
 	convDim, convChannels, deltaStateSize int,
 ) *HybridCache {
-	return &HybridCache{
-		kv:                  kvcache.NewCausalCache(shift),
-		convDim:             convDim,
-		convChannels:        convChannels,
-		deltaStateSize:      deltaStateSize,
-		slotForSeq:          make(map[int]int),
-		convCtxs:            make(map[int]ml.Context),
-		convStates:          make(map[int]ml.Tensor),
-		deltaCtxs:           make(map[int]ml.Context),
-		deltaStates:         make(map[int]ml.Tensor),
-		checkpointCount:     checkpointCountDefault,
-		checkpointMinPos:    checkpointMinPosDefault,
-		checkpointInterval:  checkpointIntervalDefault,
-		checkpoints:         make(map[int]*slotCheckpointStore),
-		pendingRestore:      make(map[int]checkpointRestore),
-		curCheckpointSlots:  make(map[int]int),
-		checkpointConvCtxs:  make(map[int]ml.Context),
-		checkpointDeltaCtxs: make(map[int]ml.Context),
-		checkpointReserved:  make(map[int]struct{}),
-	}
+	base := kvcache.NewRecurrentCache(kvcache.RecurrentConfig{
+		Shift:               shift,
+		ConvDim:             convDim,
+		ConvChannels:        convChannels,
+		RecurrentStateSize:  deltaStateSize,
+		CheckpointLogPrefix: "qwen3next",
+	})
+	return &HybridCache{Recurrent: base}
 }

-func (c *HybridCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
-	c.backend = backend
-	c.dtype = dtype
-	c.maxSequences = maxSequences
-	c.checkpoints = make(map[int]*slotCheckpointStore)
-	c.pendingRestore = make(map[int]checkpointRestore)
-	c.curCheckpointPos = c.curCheckpointPos[:0]
-	c.curCheckpointSlots = make(map[int]int)
-	c.checkpointReserved = make(map[int]struct{})
-	c.checkpointCtxSize = c.checkpointCount * c.maxSequences
-	if c.checkpointCtxSize < 8 {
-		c.checkpointCtxSize = 8
-	}
-
-	// initialize slot allocator
-	c.refCount = make([]int, maxSequences)
-	c.freeSlots = c.freeSlots[:0]
-	for i := maxSequences - 1; i >= 0; i-- {
-		c.freeSlots = append(c.freeSlots, i)
-	}
-
-	c.kv.Init(backend, dtype, maxSequences, capacity, maxBatch)
-}
-
-func (c *HybridCache) Close() {
-	for _, ctx := range c.convCtxs {
-		ctx.Close()
-	}
-	for _, ctx := range c.deltaCtxs {
-		ctx.Close()
-	}
-	for _, ctx := range c.checkpointConvCtxs {
-		ctx.Close()
-	}
-	for _, ctx := range c.checkpointDeltaCtxs {
-		ctx.Close()
-	}
-	c.kv.Close()
-}
-
-func (c *HybridCache) SetConfig(config ml.CacheConfig) {
-	c.kv.SetConfig(config)
-}
-
-func (c *HybridCache) SetLayer(layer int) {
-	c.kv.SetLayer(layer)
-}
-
-func (c *HybridCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
-	return c.kv.Get(ctx)
-}
-
-func (c *HybridCache) Put(ctx ml.Context, key, value ml.Tensor) {
-	c.kv.Put(ctx, key, value)
-}
-
-func (c *HybridCache) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
-	if err := c.kv.StartForward(ctx, batch, reserve); err != nil {
-		return err
-	}
-
-	// Derive equal-length sequence layout for recurrent layers
-	seqCounts := make(map[int]int)
-	c.curSeqs = c.curSeqs[:0]
-	for _, s := range batch.Sequences {
-		if _, ok := seqCounts[s]; !ok {
-			c.curSeqs = append(c.curSeqs, s)
-		}
-		seqCounts[s]++
-	}
-
-	if len(c.curSeqs) == 0 {
-		return nil
-	}
-
-	nTokens := len(batch.Sequences)
-	nSeqs := len(c.curSeqs)
-	want := nTokens / nSeqs
-	for _, s := range c.curSeqs {
-		if seqCounts[s] != want {
-			return kvcache.ErrNotSupported
-		}
-	}
-
-	c.curSeqTokens = want
-
-	// When reserving memory for estimation, use fake slot assignments
-	if reserve {
-		c.curSlots = c.curSlots[:0]
-		slots := make([]int32, nSeqs)
-		for i := range nSeqs {
-			c.curSlots = append(c.curSlots, i)
-			slots[i] = int32(i)
-		}
-		c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
-		c.reserveCheckpoints = true
-		c.planCheckpoints(batch)
-		return nil
-	}
-
-	// Ensure slots exist for sequences in this batch
-	c.curSlots = c.curSlots[:0]
-	var newSlots []int
-	for _, s := range c.curSeqs {
-		slot, ok := c.slotForSeq[s]
-		if !ok {
-			var err error
-			slot, err = c.allocSlot()
-			if err != nil {
-				return err
-			}
-			c.slotForSeq[s] = slot
-			c.refCount[slot] = 1
-			newSlots = append(newSlots, slot)
-		}
-		c.curSlots = append(c.curSlots, slot)
-	}
-
-	// Zero state for newly allocated slots
-	if len(newSlots) > 0 {
-		c.zeroSlots(ctx, newSlots)
-	}
-
-	// Create a tensor for the current slots
-	slots := make([]int32, len(c.curSlots))
-	for i, v := range c.curSlots {
-		slots[i] = int32(v)
-	}
-	c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
-
-	// Reset writable state for new forward pass
-	c.writableEnsured = false
-	c.writableError = nil
-	c.reserveCheckpoints = false
-	c.planCheckpoints(batch)
-
-	return nil
-}
-
-func (c *HybridCache) allocSlot() (int, error) {
-	if len(c.freeSlots) == 0 {
-		return 0, kvcache.ErrKvCacheFull
-	}
-	slot := c.freeSlots[len(c.freeSlots)-1]
-	c.freeSlots = c.freeSlots[:len(c.freeSlots)-1]
-	return slot, nil
-}
-
-func (c *HybridCache) freeSlot(slot int) {
-	if slot >= 0 && slot < c.maxSequences {
-		c.freeSlots = append(c.freeSlots, slot)
-	}
-}
-
-// zeroSlots zeros the recurrent state for the given slots across all layers.
-func (c *HybridCache) zeroSlots(ctx ml.Context, slots []int) {
-	if len(slots) == 0 {
-		return
-	}
-
-	inputCtx := ctx.Input()
-
-	slotIndices := make([]int32, len(slots))
-	for i, s := range slots {
-		slotIndices[i] = int32(s)
-	}
-	slotsTensor := inputCtx.FromInts(slotIndices, len(slotIndices))
-
-	// Zero conv states
-	if len(c.convStates) > 0 {
-		zeros := inputCtx.Zeros(ml.DTypeF32, c.convDim*c.convChannels, len(slots))
-		for _, buf := range c.convStates {
-			ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
-		}
-	}
-
-	// Zero delta states
-	if len(c.deltaStates) > 0 {
-		zeros := inputCtx.Zeros(ml.DTypeF32, c.deltaStateSize, len(slots))
-		for _, buf := range c.deltaStates {
-			ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
-		}
-	}
-}
-
-// EnsureWritable ensures sequences have private slots (copy-on-write).
-func (c *HybridCache) EnsureWritable(ctx ml.Context) error {
-	for i, seq := range c.curSeqs {
-		slot, ok := c.slotForSeq[seq]
-		if !ok {
-			continue
-		}
-
-		if slot < 0 || slot >= len(c.refCount) {
-			continue
-		}
-
-		if c.refCount[slot] <= 1 {
-			continue
-		}
-
-		newSlot, err := c.allocSlot()
-		if err != nil {
-			return err
-		}
-		c.refCount[slot]--
-		c.refCount[newSlot] = 1
-		c.slotForSeq[seq] = newSlot
-		c.curSlots[i] = newSlot
-
-		c.copyRecurrentState(ctx, slot, newSlot)
-		c.copyCheckpoints(ctx, slot, newSlot)
-	}
-
-	// Rebuild current slots tensor
-	slots := make([]int32, len(c.curSlots))
-	for i, v := range c.curSlots {
-		slots[i] = int32(v)
-	}
-	c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
-
-	return nil
-}
-
-func (c *HybridCache) copyRecurrentState(ctx ml.Context, srcSlot, dstSlot int) {
-	src := ctx.Input().FromInts([]int32{int32(srcSlot)}, 1)
-	dst := ctx.Input().FromInts([]int32{int32(dstSlot)}, 1)
-
-	for _, buf := range c.convStates {
-		rows := buf.Rows(ctx, src)
-		rowsF32 := rows.Cast(ctx, ml.DTypeF32)
-		ctx.Forward(buf.SetRows(ctx, rowsF32, dst))
-	}
-
-	for _, buf := range c.deltaStates {
-		rows := buf.Rows(ctx, src)
-		rowsF32 := rows.Cast(ctx, ml.DTypeF32)
-		ctx.Forward(buf.SetRows(ctx, rowsF32, dst))
-	}
-}
-
-func (c *HybridCache) CopyPrefix(srcSeq, dstSeq int, prefixLen int32) {
-	c.kv.CopyPrefix(srcSeq, dstSeq, prefixLen)
-
-	// Copy-on-write for recurrent state
-	if dstSlot, ok := c.slotForSeq[dstSeq]; ok {
-		if c.validSlot(dstSlot) {
-			c.refCount[dstSlot]--
-			if c.refCount[dstSlot] <= 0 {
-				c.refCount[dstSlot] = 0
-				c.freeSlot(dstSlot)
-			}
-		}
-		delete(c.slotForSeq, dstSeq)
-	}
-
-	srcSlot, ok := c.slotForSeq[srcSeq]
-	if !ok {
-		return
-	}
-
-	if c.validSlot(srcSlot) {
-		c.slotForSeq[dstSeq] = srcSlot
-		c.refCount[srcSlot]++
-	}
-}
-
-func (c *HybridCache) CanResume(seq int, pos int32) bool {
-	if !c.kv.CanResume(seq, pos) {
-		return false
-	}
-	if pos == 0 {
-		return true
-	}
-	return c.hasCheckpoint(seq, pos)
-}
-
-func (c *HybridCache) Remove(seq int, beginIndex, endIndex int32) error {
-	if beginIndex > 0 && endIndex != math.MaxInt32 {
-		return kvcache.ErrNotSupported
-	}
-
-	if beginIndex > 0 {
-		restore, ok := c.pendingRestore[seq]
-		if !ok || restore.pos+1 != beginIndex {
-			return kvcache.ErrNotSupported
-		}
-		if !c.restoreComplete(restore) {
-			return kvcache.ErrNotSupported
-		}
-		// If the recurrent slot is shared, detach it before applying a restore.
-		if slot, ok := c.slotForSeq[seq]; ok && c.validSlot(slot) && c.refCount[slot] > 1 {
-			newSlot, err := c.allocSlot()
-			if err != nil {
-				return err
-			}
-			ctx := c.backend.NewContext()
-			c.copyRecurrentState(ctx, slot, newSlot)
-			c.copyCheckpoints(ctx, slot, newSlot)
-			if len(c.convStates) > 0 || len(c.deltaStates) > 0 {
-				ctx.Compute()
-			}
-			ctx.Close()
-
-			c.refCount[slot]--
-			c.refCount[newSlot] = 1
-			c.slotForSeq[seq] = newSlot
-
-			restore.slot = newSlot
-			c.pendingRestore[seq] = restore
-		}
-	}
-
-	if err := c.kv.Remove(seq, beginIndex, endIndex); err != nil {
-		return err
-	}
-
-	if beginIndex > 0 {
-		restore := c.pendingRestore[seq]
-		delete(c.pendingRestore, seq)
-		return c.applyCheckpointRestore(restore)
-	}
-
-	// Removal invalidates recurrent state
-	slot, ok := c.slotForSeq[seq]
-	delete(c.pendingRestore, seq)
-	if !ok {
-		return nil
-	}
-
-	if !c.validSlot(slot) {
-		delete(c.slotForSeq, seq)
-		return nil
-	}
-
-	c.refCount[slot]--
-	if c.refCount[slot] <= 0 {
-		c.refCount[slot] = 0
-		c.clearCheckpoints(slot)
-		c.freeSlot(slot)
-	}
-	delete(c.slotForSeq, seq)
-
-	return nil
-}
-
-func (c *HybridCache) validSlot(slot int) bool {
-	return slot >= 0 && slot < len(c.refCount)
-}
-
-func (c *HybridCache) slotsTensor() ml.Tensor {
-	return c.curSlotsInput
-}
-
-// contiguousSlots returns the starting slot if current slots are contiguous and ordered.
-func (c *HybridCache) contiguousSlots() (int, bool) {
-	if len(c.curSlots) == 0 {
-		return 0, false
-	}
-	start := c.curSlots[0]
-	for i, s := range c.curSlots {
-		if s != start+i {
-			return 0, false
-		}
-	}
-	return start, true
-}
-
-func (c *HybridCache) seqTokens() int {
-	return c.curSeqTokens
-}
-
-func (c *HybridCache) numSeqs() int {
-	return len(c.curSeqs)
-}
-
-func (c *HybridCache) convBuffer(ctx ml.Context, layer int) ml.Tensor {
-	if buf, ok := c.convStates[layer]; ok {
-		return buf
-	}
-
-	if _, ok := c.convCtxs[layer]; !ok {
-		c.convCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
-	}
-
-	// Recurrent state must stay in F32 (ssm_conv kernels are F32-only).
-	buf := c.convCtxs[layer].Zeros(ml.DTypeF32, c.convDim*c.convChannels, c.maxSequences)
-	c.convStates[layer] = buf
-	return buf
-}
-
-func (c *HybridCache) deltaBuffer(ctx ml.Context, layer int) ml.Tensor {
-	if buf, ok := c.deltaStates[layer]; ok {
-		return buf
-	}
-
-	if _, ok := c.deltaCtxs[layer]; !ok {
-		c.deltaCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
-	}
-
-	// Recurrent delta state must stay in F32.
-	buf := c.deltaCtxs[layer].Zeros(ml.DTypeF32, c.deltaStateSize, c.maxSequences)
-	c.deltaStates[layer] = buf
-	return buf
-}
-
-func (c *HybridCache) ensureWritableOnce(ctx ml.Context) {
-	if !c.writableEnsured {
-		needsWritable := false
-		for _, seq := range c.curSeqs {
-			slot, ok := c.slotForSeq[seq]
-			if !ok {
-				continue
-			}
-			if slot >= 0 && slot < len(c.refCount) && c.refCount[slot] > 1 {
-				needsWritable = true
-				break
-			}
-		}
-
-		if needsWritable {
-			if err := c.EnsureWritable(ctx); err != nil {
-				c.writableError = err
-			}
-		}
-		c.writableEnsured = true
-	}
-}
-
-// ConvState returns the conv state for current batch sequences as [convDim, convChannels, nSeqs].
-func (c *HybridCache) ConvState(ctx ml.Context, layer int) (ml.Tensor, error) {
-	c.ensureWritableOnce(ctx)
-
-	if c.writableError != nil {
-		return nil, c.writableError
-	}
-
-	buf := c.convBuffer(ctx, layer)
-	cur := buf.Rows(ctx, c.slotsTensor())
-	return cur.Reshape(ctx, c.convDim, c.convChannels, c.numSeqs()), nil
-}
-
-// UpdateConvState writes a new conv state for current batch sequences.
-func (c *HybridCache) UpdateConvState(ctx ml.Context, layer int, newState ml.Tensor) {
-	buf := c.convBuffer(ctx, layer)
-	src := newState.Reshape(ctx, c.convDim*c.convChannels, c.numSeqs())
-	srcF32 := src.Cast(ctx, ml.DTypeF32)
-	if start, ok := c.contiguousSlots(); ok {
-		// Fast path: contiguous slots allow a single view + copy
-		offset := start * buf.Stride(1)
-		view := buf.View(ctx, offset, c.convDim*c.convChannels, buf.Stride(1), c.numSeqs())
-		ctx.Forward(srcF32.Copy(ctx, view))
-	} else {
-		ctx.Forward(buf.SetRows(ctx, srcF32, c.slotsTensor()))
-	}
-
-	c.captureConvCheckpoint(ctx, layer, srcF32)
-}
-
-// DeltaState returns the delta state for current batch sequences as [headVDim, headVDim*numVHeads, nSeqs].
+// DeltaState returns the delta state for current batch sequences as
+// [headVDim, headVDim*numVHeads, nSeqs].
 func (c *HybridCache) DeltaState(ctx ml.Context, layer int, headVDim, numVHeads int) (ml.Tensor, error) {
-	c.ensureWritableOnce(ctx)
-
-	if c.writableError != nil {
-		return nil, c.writableError
-	}
-
-	buf := c.deltaBuffer(ctx, layer)
-	cur := buf.Rows(ctx, c.slotsTensor())
-	return cur.Reshape(ctx, headVDim, headVDim*numVHeads, c.numSeqs()), nil
+	return c.RecurrentState(ctx, layer, headVDim, headVDim*numVHeads)
 }

 // UpdateDeltaState writes a new delta state for current batch sequences.
 func (c *HybridCache) UpdateDeltaState(ctx ml.Context, layer int, newState ml.Tensor) {
-	buf := c.deltaBuffer(ctx, layer)
-	src := newState.Reshape(ctx, c.deltaStateSize, c.numSeqs())
-	srcF32 := src.Cast(ctx, ml.DTypeF32)
-	if start, ok := c.contiguousSlots(); ok {
-		// Fast path: contiguous slots allow a single view + copy
-		offset := start * buf.Stride(1)
-		view := buf.View(ctx, offset, c.deltaStateSize, buf.Stride(1), c.numSeqs())
-		ctx.Forward(srcF32.Copy(ctx, view))
-	} else {
-		ctx.Forward(buf.SetRows(ctx, srcF32, c.slotsTensor()))
+	c.UpdateRecurrentState(ctx, layer, newState)
+}
+
+func (c *HybridCache) seqTokens() int {
+	return c.SeqTokens() // unexported alias; SeqTokens is presumably promoted from the embedded *kvcache.Recurrent
+}
+
+func (c *HybridCache) numSeqs() int {
+	return c.NumSeqs() // unexported alias; NumSeqs is presumably promoted from the embedded *kvcache.Recurrent
+}
+
+// Remove keeps the qwen3next behavior of rejecting partial mid-sequence removals; all other removals are delegated to Recurrent.
+func (c *HybridCache) Remove(seq int, beginIndex, endIndex int32) error {
+	if beginIndex > 0 && endIndex != math.MaxInt32 {
+		return kvcache.ErrNotSupported
 	}
-
-	c.captureDeltaCheckpoint(ctx, layer, srcF32)
-}
-
-// IsSupportedForBatch returns true if the current batch layout supports recurrent layers.
-func (c *HybridCache) IsSupportedForBatch() bool {
-	return c.curSeqTokens > 0 && len(c.curSeqs) > 0
-}
-
-// Seqs returns the ordered unique sequences for the current forward pass.
-func (c *HybridCache) Seqs() []int {
-	return slices.Clone(c.curSeqs)
+	return c.Recurrent.Remove(seq, beginIndex, endIndex)
 }
--- a/model/models/qwen3next/checkpoints.go
+++ b/model/models/qwen3next/checkpoints.go
@@ -1,498 +0,0 @@
-package qwen3next
-
-import (
-	"log/slog"
-	"math"
-
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/model/input"
-)
-
-const (
-	checkpointCountDefault    = 32
-	checkpointMinPosDefault   = int32(16)
-	checkpointIntervalDefault = int32(1280)
-)
-
-// TODO(jmorganca): Add byte-serialized host-RAM checkpoints to reduce GPU
-// memory usage while preserving prefix reuse for recurrent state.
-
-type checkpointEntry struct {
-	pos   int32
-	conv  map[int]ml.Tensor
-	delta map[int]ml.Tensor
-}
-
-type slotCheckpointStore struct {
-	entries []checkpointEntry
-	size    int
-	next    int
-	lastPos int32
-}
-
-type checkpointRestore struct {
-	slot int
-	idx  int
-	pos  int32
-}
-
-func newSlotCheckpointStore(n int) *slotCheckpointStore {
-	entries := make([]checkpointEntry, n)
-	for i := range entries {
-		entries[i].pos = -1
-	}
-	return &slotCheckpointStore{
-		entries: entries,
-		lastPos: -1,
-	}
-}
-
-func (s *slotCheckpointStore) reset() {
-	s.size = 0
-	s.next = 0
-	s.lastPos = -1
-	for i := range s.entries {
-		s.entries[i].pos = -1
-	}
-}
-
-func (s *slotCheckpointStore) record(pos int32) int {
-	if len(s.entries) == 0 {
-		return -1
-	}
-	idx := s.next
-	s.next = (s.next + 1) % len(s.entries)
-	if s.size < len(s.entries) {
-		s.size++
-	}
-	s.entries[idx].pos = pos
-	s.lastPos = pos
-	return idx
-}
-
-func (s *slotCheckpointStore) bestIndex(targetPos int32) (int, int32, bool) {
-	bestIdx := -1
-	bestPos := int32(-1)
-	for i := range s.entries {
-		pos := s.entries[i].pos
-		if pos < 0 || pos >= targetPos {
-			continue
-		}
-		if pos > bestPos {
-			bestPos = pos
-			bestIdx = i
-		}
-	}
-	if bestIdx < 0 {
-		return -1, -1, false
-	}
-	return bestIdx, bestPos, true
-}
-
-func (s *slotCheckpointStore) pruneAfter(pos int32) {
-	if len(s.entries) == 0 {
-		s.size = 0
-		s.next = 0
-		s.lastPos = -1
-		return
-	}
-
-	size := 0
-	next := -1
-	minPos := int32(math.MaxInt32)
-	minIdx := 0
-	for i := range s.entries {
-		if s.entries[i].pos > pos {
-			s.entries[i].pos = -1
-		}
-		if s.entries[i].pos >= 0 {
-			size++
-			if s.entries[i].pos < minPos {
-				minPos = s.entries[i].pos
-				minIdx = i
-			}
-		} else if next == -1 {
-			next = i
-		}
-	}
-
-	s.size = size
-	if size == 0 {
-		s.next = 0
-		s.lastPos = -1
-		return
-	}
-	if next != -1 {
-		s.next = next
-	} else {
-		// Full ring: overwrite the oldest checkpoint next.
-		s.next = minIdx
-	}
-	s.lastPos = pos
-}
-
-func (s *slotCheckpointStore) window() (size int, minPos, maxPos, lastPos int32) {
-	minPos = int32(math.MaxInt32)
-	maxPos = int32(-1)
-	for i := range s.entries {
-		pos := s.entries[i].pos
-		if pos < 0 {
-			continue
-		}
-		size++
-		if pos < minPos {
-			minPos = pos
-		}
-		if pos > maxPos {
-			maxPos = pos
-		}
-	}
-	if size == 0 {
-		minPos = -1
-		maxPos = -1
-	}
-	return size, minPos, maxPos, s.lastPos
-}
-
-func (c *HybridCache) planCheckpoints(batch input.Batch) {
-	if c.checkpointCount == 0 || len(c.curSeqs) == 0 {
-		c.curCheckpointPos = c.curCheckpointPos[:0]
-		for k := range c.curCheckpointSlots {
-			delete(c.curCheckpointSlots, k)
-		}
-		return
-	}
-
-	if cap(c.curCheckpointPos) < len(c.curSeqs) {
-		c.curCheckpointPos = make([]int32, len(c.curSeqs))
-	} else {
-		c.curCheckpointPos = c.curCheckpointPos[:len(c.curSeqs)]
-	}
-	for i := range c.curCheckpointPos {
-		c.curCheckpointPos[i] = -1
-	}
-	for k := range c.curCheckpointSlots {
-		delete(c.curCheckpointSlots, k)
-	}
-
-	posMax := make(map[int]int32, len(c.curSeqs))
-	for i, seq := range batch.Sequences {
-		pos := batch.Positions[i]
-		if cur, ok := posMax[seq]; !ok || pos > cur {
-			posMax[seq] = pos
-		}
-	}
-
-	for i, seq := range c.curSeqs {
-		pos, ok := posMax[seq]
-		if !ok {
-			continue
-		}
-		if pos < c.checkpointMinPos {
-			continue
-		}
-		slot := c.curSlots[i]
-		store := c.checkpointStore(slot)
-		lastPos := store.lastPos
-		if lastPos < 0 || pos-lastPos >= c.checkpointInterval {
-			c.curCheckpointPos[i] = pos
-		}
-	}
-}
-
-func (c *HybridCache) checkpointStore(slot int) *slotCheckpointStore {
-	store, ok := c.checkpoints[slot]
-	if ok {
-		return store
-	}
-	store = newSlotCheckpointStore(c.checkpointCount)
-	c.checkpoints[slot] = store
-	return store
-}
-
-func (c *HybridCache) checkpointIndexForSlot(slot int, pos int32) int {
-	if c.checkpointCount == 0 {
-		return -1
-	}
-	if idx, ok := c.curCheckpointSlots[slot]; ok {
-		return idx
-	}
-	store := c.checkpointStore(slot)
-	idx := store.record(pos)
-	if idx >= 0 {
-		c.curCheckpointSlots[slot] = idx
-	}
-	return idx
-}
-
-func (c *HybridCache) hasCheckpoint(seq int, pos int32) bool {
-	if pos <= 0 {
-		return false
-	}
-	slot, ok := c.slotForSeq[seq]
-	if !ok {
-		return false
-	}
-	store, ok := c.checkpoints[slot]
-	if !ok {
-		return false
-	}
-	_, _, ok = store.bestIndex(pos)
-	return ok
-}
-
-func (c *HybridCache) PrepareRestore(seq int, targetPos int32) (int32, bool) {
-	if targetPos <= 0 {
-		return 0, false
-	}
-	slot, ok := c.slotForSeq[seq]
-	if !ok {
-		return 0, false
-	}
-	store, ok := c.checkpoints[slot]
-	if !ok {
-		slog.Debug("qwen3next: checkpoint miss", "seq", seq, "slot", slot, "target", targetPos, "size", 0)
-		return 0, false
-	}
-	idx, pos, ok := store.bestIndex(targetPos)
-	if !ok {
-		size, minPos, maxPos, lastPos := store.window()
-		slog.Debug("qwen3next: checkpoint miss", "seq", seq, "slot", slot, "target", targetPos, "size", size,
-			"min", minPos, "max", maxPos, "last", lastPos)
-		return 0, false
-	}
-	c.pendingRestore[seq] = checkpointRestore{
-		slot: slot,
-		idx:  idx,
-		pos:  pos,
-	}
-	return pos + 1, true
-}
-
-func (c *HybridCache) applyCheckpointRestore(restore checkpointRestore) error {
-	entry, ok := c.restoreEntry(restore)
-	if !ok {
-		return kvcache.ErrNotSupported
-	}
-
-	ctx := c.backend.NewContext()
-	defer ctx.Close()
-
-	slotIdx := ctx.Input().FromInts([]int32{int32(restore.slot)}, 1)
-	for layer, src := range entry.conv {
-		buf := c.convBuffer(ctx, layer)
-		ctx.Forward(buf.SetRows(ctx, src, slotIdx))
-	}
-	for layer, src := range entry.delta {
-		buf := c.deltaBuffer(ctx, layer)
-		ctx.Forward(buf.SetRows(ctx, src, slotIdx))
-	}
-
-	if len(entry.conv) > 0 || len(entry.delta) > 0 {
-		ctx.Compute()
-	}
-	store := c.checkpoints[restore.slot]
-	store.pruneAfter(restore.pos)
-	return nil
-}
-
-func (c *HybridCache) restoreComplete(restore checkpointRestore) bool {
-	_, ok := c.restoreEntry(restore)
-	return ok
-}
-
-func (c *HybridCache) restoreEntry(restore checkpointRestore) (*checkpointEntry, bool) {
-	store, ok := c.checkpoints[restore.slot]
-	if !ok || restore.idx < 0 || restore.idx >= len(store.entries) {
-		return nil, false
-	}
-	entry := &store.entries[restore.idx]
-	if entry.pos < 0 {
-		return nil, false
-	}
-	if !c.entryComplete(entry) {
-		return nil, false
-	}
-	return entry, true
-}
-
-func (c *HybridCache) entryComplete(entry *checkpointEntry) bool {
-	for layer := range c.convStates {
-		if entry.conv == nil || entry.conv[layer] == nil {
-			return false
-		}
-	}
-	for layer := range c.deltaStates {
-		if entry.delta == nil || entry.delta[layer] == nil {
-			return false
-		}
-	}
-	return true
-}
-
-func (c *HybridCache) clearCheckpoints(slot int) {
-	if store, ok := c.checkpoints[slot]; ok {
-		store.reset()
-	}
-}
-
-func (c *HybridCache) copyCheckpoints(ctx ml.Context, srcSlot, dstSlot int) {
-	if c.checkpointCount == 0 {
-		return
-	}
-	srcStore, ok := c.checkpoints[srcSlot]
-	if !ok || srcStore.size == 0 {
-		return
-	}
-	dstStore := c.checkpointStore(dstSlot)
-	dstStore.size = srcStore.size
-	dstStore.next = srcStore.next
-	dstStore.lastPos = srcStore.lastPos
-
-	for i := range srcStore.entries {
-		srcEntry := &srcStore.entries[i]
-		dstEntry := &dstStore.entries[i]
-		dstEntry.pos = srcEntry.pos
-		if srcEntry.conv != nil {
-			if dstEntry.conv == nil {
-				dstEntry.conv = make(map[int]ml.Tensor)
-			}
-			for layer, src := range srcEntry.conv {
-				dst := c.ensureCheckpointConv(layer, dstEntry)
-				ctx.Forward(src.Copy(ctx, dst))
-			}
-		}
-		if srcEntry.delta != nil {
-			if dstEntry.delta == nil {
-				dstEntry.delta = make(map[int]ml.Tensor)
-			}
-			for layer, src := range srcEntry.delta {
-				dst := c.ensureCheckpointDelta(layer, dstEntry)
-				ctx.Forward(src.Copy(ctx, dst))
-			}
-		}
-	}
-}
-
-func (c *HybridCache) captureConvCheckpoint(ctx ml.Context, layer int, src ml.Tensor) {
-	if c.checkpointCount == 0 {
-		return
-	}
-	if c.reserveCheckpoints {
-		c.reserveCheckpointConv(layer)
-		return
-	}
-	if len(c.curCheckpointPos) == 0 {
-		return
-	}
-	for i, pos := range c.curCheckpointPos {
-		if pos < 0 {
-			continue
-		}
-		slot := c.curSlots[i]
-		idx := c.checkpointIndexForSlot(slot, pos)
-		if idx < 0 {
-			continue
-		}
-		entry := &c.checkpoints[slot].entries[idx]
-		dst := c.ensureCheckpointConv(layer, entry)
-		seqSlice := src.Slice(ctx, 1, i, i+1, 1)
-		ctx.Forward(seqSlice.Copy(ctx, dst))
-	}
-}
-
-func (c *HybridCache) captureDeltaCheckpoint(ctx ml.Context, layer int, src ml.Tensor) {
-	if c.checkpointCount == 0 {
-		return
-	}
-	if c.reserveCheckpoints {
-		c.reserveCheckpointDelta(layer)
-		return
-	}
-	if len(c.curCheckpointPos) == 0 {
-		return
-	}
-	for i, pos := range c.curCheckpointPos {
-		if pos < 0 {
-			continue
-		}
-		slot := c.curSlots[i]
-		idx := c.checkpointIndexForSlot(slot, pos)
-		if idx < 0 {
-			continue
-		}
-		entry := &c.checkpoints[slot].entries[idx]
-		dst := c.ensureCheckpointDelta(layer, entry)
-		seqSlice := src.Slice(ctx, 1, i, i+1, 1)
-		ctx.Forward(seqSlice.Copy(ctx, dst))
-	}
-}
-
-func (c *HybridCache) ensureCheckpointConv(layer int, entry *checkpointEntry) ml.Tensor {
-	if entry.conv == nil {
-		entry.conv = make(map[int]ml.Tensor)
-	}
-	if t, ok := entry.conv[layer]; ok {
-		return t
-	}
-	ctx, ok := c.checkpointConvCtxs[layer]
-	if !ok {
-		ctx = c.backend.NewContextSize(c.checkpointCtxSize).Layer(layer)
-		c.checkpointConvCtxs[layer] = ctx
-	}
-	t := ctx.Zeros(ml.DTypeF32, c.convDim*c.convChannels, 1)
-	entry.conv[layer] = t
-	return t
-}
-
-func (c *HybridCache) ensureCheckpointDelta(layer int, entry *checkpointEntry) ml.Tensor {
-	if entry.delta == nil {
-		entry.delta = make(map[int]ml.Tensor)
-	}
-	if t, ok := entry.delta[layer]; ok {
-		return t
-	}
-	ctx, ok := c.checkpointDeltaCtxs[layer]
-	if !ok {
-		ctx = c.backend.NewContextSize(c.checkpointCtxSize).Layer(layer)
-		c.checkpointDeltaCtxs[layer] = ctx
-	}
-	t := ctx.Zeros(ml.DTypeF32, c.deltaStateSize, 1)
-	entry.delta[layer] = t
-	return t
-}
-
-func (c *HybridCache) reserveCheckpointConv(layer int) {
-	key := checkpointReserveKey(layer, 0)
-	if _, ok := c.checkpointReserved[key]; ok {
-		return
-	}
-	for slot := range c.maxSequences {
-		store := c.checkpointStore(slot)
-		for i := range store.entries {
-			entry := &store.entries[i]
-			_ = c.ensureCheckpointConv(layer, entry)
-		}
-	}
-	c.checkpointReserved[key] = struct{}{}
-}
-
-func (c *HybridCache) reserveCheckpointDelta(layer int) {
-	key := checkpointReserveKey(layer, 1)
-	if _, ok := c.checkpointReserved[key]; ok {
-		return
-	}
-	for slot := range c.maxSequences {
-		store := c.checkpointStore(slot)
-		for i := range store.entries {
-			entry := &store.entries[i]
-			_ = c.ensureCheckpointDelta(layer, entry)
-		}
-	}
-	c.checkpointReserved[key] = struct{}{}
-}
-
-func checkpointReserveKey(layer int, kind int) int {
-	return layer*2 + kind
-}
--- a/model/models/qwen3next/checkpoints_test.go
+++ b/model/models/qwen3next/checkpoints_test.go
@@ -1,300 +0,0 @@
-package qwen3next
-
-import (
-	"errors"
-	"math"
-	"os"
-	"testing"
-
-	"github.com/ollama/ollama/fs/ggml"
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-)
-
-func newTestBackend(tb testing.TB) ml.Backend {
-	tb.Helper()
-
-	f, err := os.CreateTemp(tb.TempDir(), "*.gguf")
-	if err != nil {
-		tb.Fatal(err)
-	}
-	if err := ggml.WriteGGUF(f, ggml.KV{"general.architecture": "test"}, nil); err != nil {
-		_ = f.Close()
-		tb.Fatal(err)
-	}
-	if err := f.Close(); err != nil {
-		tb.Fatal(err)
-	}
-
-	b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true})
-	if err != nil {
-		tb.Fatal(err)
-	}
-	tb.Cleanup(func() {
-		b.Close()
-	})
-
-	return b
-}
-
-func TestSlotCheckpointStoreBestIndex(t *testing.T) {
-	store := newSlotCheckpointStore(2)
-	store.record(10)
-	store.record(20)
-
-	_, pos, ok := store.bestIndex(15)
-	if !ok || pos != 10 {
-		t.Fatalf("expected best pos 10, got pos=%d ok=%v", pos, ok)
-	}
-
-	store.record(30) // overwrite oldest (10)
-
-	if _, _, ok := store.bestIndex(15); ok {
-		t.Fatalf("expected no checkpoint for targetPos=15 after overwrite")
-	}
-
-	_, pos, ok = store.bestIndex(40)
-	if !ok || pos != 30 {
-		t.Fatalf("expected best pos 30, got pos=%d ok=%v", pos, ok)
-	}
-}
-
-func TestHybridCachePrepareRestore(t *testing.T) {
-	cache := NewHybridCache(nil, 1, 1, 1)
-	cache.checkpointCount = 3
-	cache.checkpoints = make(map[int]*slotCheckpointStore)
-	cache.pendingRestore = make(map[int]checkpointRestore)
-
-	cache.slotForSeq[1] = 0
-	store := cache.checkpointStore(0)
-	store.record(5)
-	store.record(9)
-	store.record(15)
-
-	restorePos, ok := cache.PrepareRestore(1, 12)
-	if !ok {
-		t.Fatalf("expected restore ok")
-	}
-	if restorePos != 10 {
-		t.Fatalf("expected restorePos 10, got %d", restorePos)
-	}
-	rest, ok := cache.pendingRestore[1]
-	if !ok {
-		t.Fatalf("expected pending restore entry")
-	}
-	if rest.pos != 9 {
-		t.Fatalf("expected pending restore pos 9, got %d", rest.pos)
-	}
-}
-
-func TestSlotCheckpointStorePruneAfter(t *testing.T) {
-	store := newSlotCheckpointStore(3)
-	store.record(10)
-	store.record(20)
-	store.record(30)
-
-	store.pruneAfter(20)
-
-	if store.lastPos != 20 {
-		t.Fatalf("expected lastPos 20, got %d", store.lastPos)
-	}
-
-	_, pos, ok := store.bestIndex(25)
-	if !ok || pos != 20 {
-		t.Fatalf("expected best pos 20 after prune, got pos=%d ok=%v", pos, ok)
-	}
-
-	_, pos, ok = store.bestIndex(35)
-	if !ok || pos != 20 {
-		t.Fatalf("expected pruned best pos 20 for targetPos=35, got pos=%d ok=%v", pos, ok)
-	}
-}
-
-func TestHybridCacheRestoreDetachesSharedSlot(t *testing.T) {
-	backend := newTestBackend(t)
-
-	cache := NewHybridCache(nil, 1, 2, 2)
-	cache.Init(backend, ml.DTypeF16, 2, 8, 2)
-
-	cache.slotForSeq[1] = 0
-	cache.slotForSeq[2] = 0
-	cache.refCount[0] = 2
-	cache.refCount[1] = 0
-	cache.freeSlots = []int{1}
-
-	store := cache.checkpointStore(0)
-	idx := store.record(9)
-	cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: idx, pos: 9}
-
-	if err := cache.Remove(1, 10, math.MaxInt32); err != nil {
-		t.Fatalf("Remove failed: %v", err)
-	}
-
-	if cache.slotForSeq[1] == cache.slotForSeq[2] {
-		t.Fatalf("expected restore to detach shared slot, got same slot %d", cache.slotForSeq[1])
-	}
-	if cache.slotForSeq[1] != 1 {
-		t.Fatalf("expected seq 1 to move to slot 1, got %d", cache.slotForSeq[1])
-	}
-	if cache.slotForSeq[2] != 0 {
-		t.Fatalf("expected seq 2 to remain on slot 0, got %d", cache.slotForSeq[2])
-	}
-	if cache.refCount[0] != 1 || cache.refCount[1] != 1 {
-		t.Fatalf("unexpected refCounts: slot0=%d slot1=%d", cache.refCount[0], cache.refCount[1])
-	}
-	if _, ok := cache.pendingRestore[1]; ok {
-		t.Fatalf("expected pending restore to be cleared")
-	}
-}
-
-func TestHybridCacheRestoreRejectsIncompleteCheckpoint(t *testing.T) {
-	cache := NewHybridCache(nil, 1, 2, 2)
-	cache.checkpointCount = 3
-	cache.checkpoints = make(map[int]*slotCheckpointStore)
-	cache.pendingRestore = make(map[int]checkpointRestore)
-
-	cache.slotForSeq[1] = 0
-	cache.refCount = []int{1}
-	cache.freeSlots = nil
-
-	// Simulate that layer 0 has both conv and delta state (so entryComplete expects both)
-	cache.convStates[0] = nil  // placeholder to indicate layer 0 exists
-	cache.deltaStates[0] = nil // placeholder to indicate layer 0 exists
-
-	store := cache.checkpointStore(0)
-	idx := store.record(9)
-	entry := &store.entries[idx]
-	// Only set conv checkpoint, not delta - making it incomplete
-	entry.conv = map[int]ml.Tensor{0: nil}
-	// entry.delta is not set, so checkpoint is incomplete
-
-	cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: idx, pos: 9}
-
-	err := cache.Remove(1, 10, math.MaxInt32)
-	if !errors.Is(err, kvcache.ErrNotSupported) {
-		t.Fatalf("expected ErrNotSupported for incomplete checkpoint, got %v", err)
-	}
-}
-
-func TestHybridCacheRestoreAcceptsCompleteCheckpoint(t *testing.T) {
-	cache := NewHybridCache(nil, 1, 2, 2)
-	cache.checkpointCount = 3
-	cache.checkpoints = make(map[int]*slotCheckpointStore)
-	cache.pendingRestore = make(map[int]checkpointRestore)
-
-	cache.slotForSeq[1] = 0
-	cache.refCount = []int{1}
-	cache.freeSlots = nil
-
-	// Don't set convStates/deltaStates - with no layers to check,
-	// entryComplete will return true as long as entry.pos >= 0
-
-	store := cache.checkpointStore(0)
-	idx := store.record(9)
-
-	cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: idx, pos: 9}
-
-	// Test that restoreComplete returns true when no layers need checkpoints
-	restore := cache.pendingRestore[1]
-	if !cache.restoreComplete(restore) {
-		t.Fatalf("expected restoreComplete to return true for complete checkpoint")
-	}
-}
-
-func TestSlotCheckpointStoreRingBufferWrapAround(t *testing.T) {
-	// Test that ring buffer wrap-around reuses entries without clearing maps.
-	store := newSlotCheckpointStore(3)
-
-	// Fill the buffer
-	store.record(10)
-	store.record(20)
-	store.record(30)
-
-	// Create fake tensor data in the first entry's maps
-	store.entries[0].conv = make(map[int]ml.Tensor)
-	store.entries[0].conv[0] = nil // Simulated tensor reference
-	store.entries[0].delta = make(map[int]ml.Tensor)
-	store.entries[0].delta[0] = nil // Simulated tensor reference
-
-	// Record another entry, which should wrap around and overwrite entry 0
-	store.record(40)
-
-	// Verify the maps are still present (we reuse tensors)
-	if store.entries[0].conv == nil {
-		t.Fatalf("expected conv map to be preserved on reuse")
-	}
-	if store.entries[0].delta == nil {
-		t.Fatalf("expected delta map to be preserved on reuse")
-	}
-
-	// Verify the new position was recorded
-	if store.entries[0].pos != 40 {
-		t.Fatalf("expected entry 0 pos to be 40, got %d", store.entries[0].pos)
-	}
-}
-
-func TestSlotCheckpointStoreFullCapacity(t *testing.T) {
-	// Test behavior when buffer is exactly at capacity
-	store := newSlotCheckpointStore(2)
-
-	idx1 := store.record(10)
-	idx2 := store.record(20)
-
-	if idx1 != 0 || idx2 != 1 {
-		t.Fatalf("expected indices 0, 1, got %d, %d", idx1, idx2)
-	}
-
-	if store.size != 2 {
-		t.Fatalf("expected size 2, got %d", store.size)
-	}
-
-	// Verify both checkpoints are accessible
-	_, pos1, ok1 := store.bestIndex(15)
-	_, pos2, ok2 := store.bestIndex(25)
-
-	if !ok1 || pos1 != 10 {
-		t.Fatalf("expected best pos 10 for target 15, got pos=%d ok=%v", pos1, ok1)
-	}
-	if !ok2 || pos2 != 20 {
-		t.Fatalf("expected best pos 20 for target 25, got pos=%d ok=%v", pos2, ok2)
-	}
-}
-
-func TestSlotCheckpointStoreEmptyBuffer(t *testing.T) {
-	// Test behavior with zero-size buffer
-	store := newSlotCheckpointStore(0)
-
-	idx := store.record(10)
-	if idx != -1 {
-		t.Fatalf("expected record to return -1 for empty buffer, got %d", idx)
-	}
-
-	_, _, ok := store.bestIndex(15)
-	if ok {
-		t.Fatalf("expected no checkpoint for empty buffer")
-	}
-}
-
-func TestSlotCheckpointStorePruneAfterAll(t *testing.T) {
-	// Test pruning that removes all checkpoints
-	store := newSlotCheckpointStore(3)
-	store.record(10)
-	store.record(20)
-	store.record(30)
-
-	// Prune everything by setting threshold below all positions
-	store.pruneAfter(5)
-
-	if store.size != 0 {
-		t.Fatalf("expected size 0 after pruning all, got %d", store.size)
-	}
-	// When all checkpoints are pruned, lastPos is reset to -1
-	if store.lastPos != -1 {
-		t.Fatalf("expected lastPos -1 after pruning all, got %d", store.lastPos)
-	}
-
-	_, _, ok := store.bestIndex(100)
-	if ok {
-		t.Fatalf("expected no checkpoint after pruning all")
-	}
-}
--- a/model/models/qwen3next/deltanet.go
+++ b/model/models/qwen3next/deltanet.go
@@ -37,10 +37,12 @@ type GatedDeltaNet struct {
 	// Optimized path: pre-split QKV and gate
 	SSMQKV       *nn.Linear  `gguf:"attn_qkv"`  // -> Q, K, V (concatenated)
 	SSMQKVGate   *nn.Linear  `gguf:"attn_gate"` // -> Z gate
-	SSMBetaAlpha *nn.Linear  `gguf:"ssm_ba"`    // -> beta, alpha
+	SSMBetaAlpha *nn.Linear  `gguf:"ssm_ba"`    // -> beta, alpha (legacy qwen3next)
+	SSMBeta      *nn.Linear  `gguf:"ssm_beta"`  // -> beta (qwen35)
+	SSMAlpha     *nn.Linear  `gguf:"ssm_alpha"` // -> alpha (qwen35)
 	SSMConv1D    *convKernel `gguf:"ssm_conv1d"`
-	SSMDT        ml.Tensor   `gguf:"ssm_dt"` // alpha bias
-	SSMA         ml.Tensor   `gguf:"ssm_a"`  // -A_log.exp()
+	SSMDT        ml.Tensor   `gguf:"ssm_dt,alt:ssm_dt.bias"` // alpha bias
+	SSMA         ml.Tensor   `gguf:"ssm_a"`                  // -A_log.exp()
 	SSMNorm      *nn.RMSNorm `gguf:"ssm_norm"`
 	SSMOut       *nn.Linear  `gguf:"ssm_out"`

@@ -96,7 +98,6 @@ func (gdn *GatedDeltaNet) Forward(ctx ml.Context, hiddenStates, _ ml.Tensor, cac
 	headVDim := opts.ssmDInner / numVHeads
 	convKernelSize := opts.convKernelSize

-	mixedBA := gdn.SSMBetaAlpha.Forward(ctx, hiddenStates)
 	qkvDim := headKDim*numKHeads*2 + headVDim*numVHeads

 	if gdn.SSMQKV == nil || gdn.SSMQKVGate == nil {
@@ -106,24 +107,52 @@ func (gdn *GatedDeltaNet) Forward(ctx ml.Context, hiddenStates, _ ml.Tensor, cac
 	qkvMixed := gdn.SSMQKV.Forward(ctx, hiddenStates).Reshape(ctx, qkvDim, nSeqTokens, nSeqs)
 	z := gdn.SSMQKVGate.Forward(ctx, hiddenStates)

-	baNewDim := 2 * numVHeads / numKHeads
-	mixedBAReshaped := mixedBA.Reshape(ctx, baNewDim, numKHeads, nSeqTokens, nSeqs)
+	var beta ml.Tensor
+	var alpha ml.Tensor
+	switch {
+	case gdn.SSMBetaAlpha != nil:
+		// Legacy qwen3next path: in_proj_ba packs beta/alpha grouped by K-head.
+		mixedBA := gdn.SSMBetaAlpha.Forward(ctx, hiddenStates)
+		baNewDim := 2 * numVHeads / numKHeads
+		mixedBAReshaped := mixedBA.Reshape(ctx, baNewDim, numKHeads, nSeqTokens, nSeqs)

-	// Split beta and alpha
-	betaSize := numVHeads / numKHeads
-	alphaSize := numVHeads / numKHeads
+		betaSize := numVHeads / numKHeads
+		alphaSize := numVHeads / numKHeads

-	b := mixedBAReshaped.Slice(ctx, 0, 0, betaSize, 1)
-	a := mixedBAReshaped.Slice(ctx, 0, betaSize, betaSize+alphaSize, 1)
+		b := mixedBAReshaped.Slice(ctx, 0, 0, betaSize, 1)
+		a := mixedBAReshaped.Slice(ctx, 0, betaSize, betaSize+alphaSize, 1)

-	// Reshape to merge head dimensions
-	beta := b.Contiguous(ctx, numVHeads, 1, nSeqTokens, nSeqs)
-	alpha := a.Contiguous(ctx, numVHeads, nSeqTokens, nSeqs)
+		// Keep beta layout consistent with qwen35.
+		// [1, numVHeads, nSeqTokens, nSeqs]
+		beta = b.Contiguous(ctx, 1, numVHeads, nSeqTokens, nSeqs)
+		alpha = a.Contiguous(ctx, numVHeads, nSeqTokens, nSeqs)
+
+	case gdn.SSMBeta != nil && gdn.SSMAlpha != nil:
+		// qwen35 path: beta/alpha are separate projections.
+		beta = gdn.SSMBeta.Forward(ctx, hiddenStates).Reshape(ctx, 1, numVHeads, nSeqTokens, nSeqs)
+		alpha = gdn.SSMAlpha.Forward(ctx, hiddenStates).Reshape(ctx, numVHeads, nSeqTokens, nSeqs)
+
+	default:
+		return nil, errors.New("qwen3next: missing linear attention beta/alpha projections")
+	}
+	if gdn.SSMDT == nil {
+		return nil, errors.New("qwen3next: missing linear attention ssm_dt tensor")
+	}
+	if gdn.SSMA == nil {
+		return nil, errors.New("qwen3next: missing linear attention ssm_a tensor")
+	}
+	if gdn.SSMConv1D == nil || gdn.SSMConv1D.Weight == nil {
+		return nil, errors.New("qwen3next: missing linear attention ssm_conv1d tensor")
+	}
+	if gdn.SSMNorm == nil || gdn.SSMOut == nil {
+		return nil, errors.New("qwen3next: missing linear attention ssm_norm/ssm_out projections")
+	}

 	// Compute gate: softplus(alpha + dt_bias) * -A
 	alphaBiased := alpha.Add(ctx, gdn.SSMDT)
 	alphaSoftplus := alphaBiased.Softplus(ctx)
 	gate := alphaSoftplus.Mul(ctx, gdn.SSMA)
+	gate = gate.Reshape(ctx, 1, numVHeads, nSeqTokens, nSeqs)
 	qkvMixed = qkvMixed.Permute(ctx, 1, 0, 2, 3)

 	// Get conv state from cache
@@ -172,16 +201,20 @@ func (gdn *GatedDeltaNet) Forward(ctx ml.Context, hiddenStates, _ ml.Tensor, cac

 	// Repeat interleave Q and K if numKHeads != numVHeads
 	if numKHeads != numVHeads {
-		repeatFactor := numVHeads / numKHeads
+		if opts.vHeadReordered {
+			qConv = qConv.Repeat4D(ctx, headKDim, numVHeads, nSeqTokens, nSeqs)
+			kConv = kConv.Repeat4D(ctx, headKDim, numVHeads, nSeqTokens, nSeqs)
+		} else {
+			repeatFactor := numVHeads / numKHeads
+			qReshaped := qConv.Reshape(ctx, headKDim, 1, numKHeads*nSeqTokens*nSeqs)
+			kReshaped := kConv.Reshape(ctx, headKDim, 1, numKHeads*nSeqTokens*nSeqs)

-		qReshaped := qConv.Reshape(ctx, headKDim, 1, numKHeads*nSeqTokens*nSeqs)
-		kReshaped := kConv.Reshape(ctx, headKDim, 1, numKHeads*nSeqTokens*nSeqs)
+			qRepeated := qReshaped.Repeat4D(ctx, headKDim, repeatFactor, numKHeads*nSeqTokens*nSeqs, 1)
+			kRepeated := kReshaped.Repeat4D(ctx, headKDim, repeatFactor, numKHeads*nSeqTokens*nSeqs, 1)

-		qRepeated := qReshaped.Repeat4D(ctx, headKDim, repeatFactor, numKHeads*nSeqTokens*nSeqs, 1)
-		kRepeated := kReshaped.Repeat4D(ctx, headKDim, repeatFactor, numKHeads*nSeqTokens*nSeqs, 1)
-
-		qConv = qRepeated.Reshape(ctx, headKDim, numKHeads*repeatFactor, nSeqTokens, nSeqs)
-		kConv = kRepeated.Reshape(ctx, headKDim, numKHeads*repeatFactor, nSeqTokens, nSeqs)
+			qConv = qRepeated.Reshape(ctx, headKDim, numKHeads*repeatFactor, nSeqTokens, nSeqs)
+			kConv = kRepeated.Reshape(ctx, headKDim, numKHeads*repeatFactor, nSeqTokens, nSeqs)
+		}
 	}

 	// Choose computation mode based on sequence length
@@ -189,7 +222,9 @@ func (gdn *GatedDeltaNet) Forward(ctx ml.Context, hiddenStates, _ ml.Tensor, cac
 	if nSeqTokens == 1 {
 		attnOut = gdn.deltaNetAutoregressive(ctx, qConv, kConv, vConv, gate, beta, state, opts, layer, cache)
 	} else {
-		// Use pre-computed masks from opts (created once in Model.Forward)
+		if opts.masks == nil {
+			opts.masks = createMasks(ctx)
+		}
 		attnOut = gdn.deltaNetChunked(ctx, qConv, kConv, vConv, gate, beta, state, opts.masks, opts, layer, cache)
 	}

@@ -310,9 +345,9 @@ func (gdn *GatedDeltaNet) deltaNetChunked(
 	q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, headKDim, nTokens, numVHeads, nSeqs)
 	k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, headKDim, nTokens, numVHeads, nSeqs)
 	v = v.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, headVDim, nTokens, numVHeads, nSeqs)
-	gate = gate.Permute(ctx, 2, 0, 3, 1).Contiguous(ctx, nTokens, 1, numVHeads, nSeqs)
-
-	beta = beta.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
+	// gate/beta: [1, numVHeads, nTokens, nSeqs] -> [1, nTokens, numVHeads, nSeqs]
+	gate = gate.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, 1, nTokens, numVHeads, nSeqs)
+	beta = beta.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, 1, nTokens, numVHeads, nSeqs)
 	state = state.Reshape(ctx, headVDim, headVDim, numVHeads, nSeqs)

 	// Compute padding
@@ -324,7 +359,7 @@ func (gdn *GatedDeltaNet) deltaNetChunked(
 		q = q.Pad(ctx, 0, pad, 0, 0)
 		k = k.Pad(ctx, 0, pad, 0, 0)
 		v = v.Pad(ctx, 0, pad, 0, 0)
-		gate = gate.Pad(ctx, pad, 0, 0, 0)
+		gate = gate.Pad(ctx, 0, pad, 0, 0)
 		beta = beta.Pad(ctx, 0, pad, 0, 0)
 	}

@@ -344,10 +379,12 @@ func (gdn *GatedDeltaNet) deltaNetChunked(
 	kBeta = kBeta.Reshape(ctx, headKDim, chunkSize, nChunks, numVHeads*nSeqs)
 	vBeta = vBeta.Reshape(ctx, headVDim, chunkSize, nChunks, numVHeads*nSeqs)

-	gate = gate.Reshape(ctx, chunkSize, 1, nChunks, numVHeads*nSeqs)
+	// Reshape gate and cumsum over chunk axis.
+	// [1, chunkSize, nChunks, H*nSeqs] -> transpose -> [chunkSize, 1, nChunks, H*nSeqs]
+	gate = gate.Reshape(ctx, 1, chunkSize, nChunks, numVHeads*nSeqs)

 	// g_cumsum = cumsum(gate)
-	gCumsum := gate.CumSum(ctx)
+	gCumsum := gate.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx, chunkSize, 1, nChunks, numVHeads*nSeqs).CumSum(ctx)

 	// Compute decay mask
 	gcsI := gCumsum.Reshape(ctx, chunkSize, 1, nChunks, numVHeads*nSeqs)
@@ -411,60 +448,75 @@ func (gdn *GatedDeltaNet) deltaNetChunked(
 	keyGDiff := k.Mul(ctx, gDiffExpReshaped)
 	keyGDiffT := keyGDiff.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

-	// Process chunks and update state
-	var coreAttnOut ml.Tensor
-	newState := state
+	// Process chunks and update state.
+	// Keep transposed, contiguous copies of v and the recurrent state across chunks so
+	// the chunk loop does not need extra transpose+contiguous nodes.
+	vT := v.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx, chunkSize, headVDim, nChunks, numVHeads*nSeqs)
+	stateT := state.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx, headVDim, headVDim, 1, numVHeads*nSeqs)
+
+	// Collect chunk outputs and concatenate at the end.
+	// Avoids SET on buffer-less intermediates under partial offload.
+	chunks := make([]ml.Tensor, nChunks)

 	for chunk := range nChunks {
 		qChunk := q.Slice(ctx, 2, chunk, chunk+1, 1)
-		vChunk := v.Slice(ctx, 2, chunk, chunk+1, 1)
+		vTChunk := vT.Slice(ctx, 2, chunk, chunk+1, 1)
 		gExpChunk := gExp.Slice(ctx, 2, chunk, chunk+1, 1)
 		kCumdecayChunk := kCumdecay.Slice(ctx, 2, chunk, chunk+1, 1)
 		attnChunk := attnKQ.Slice(ctx, 2, chunk, chunk+1, 1) // Pre-computed!

-		// state^T - permute is needed but Contiguous creates a copy
-		stateT := newState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx, headVDim, headVDim, 1, numVHeads*nSeqs)
+		// v'_t = k_cumdecay @ state_t
+		vTPrime := kCumdecayChunk.Mulmat(ctx, stateT)

-		// v_prime = k_cumdecay @ state
-		vPrime := stateT.Mulmat(ctx, kCumdecayChunk)
-
-		// v_new = v - v_prime
-		vNew := vChunk.Sub(ctx, vPrime)
-		vNewT := vNew.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+		// v_t_new = v_t - v'_t
+		vTNewChunk := vTChunk.Sub(ctx, vTPrime)

 		// attn_inter = (q * g_exp) @ state
 		qGExp := qChunk.Mul(ctx, gExpChunk)
 		attnInter := stateT.Mulmat(ctx, qGExp)

 		// core_attn_out = attn_inter + attn @ v_new
-		vAttn := vNewT.Mulmat(ctx, attnChunk)
+		vAttn := vTNewChunk.Mulmat(ctx, attnChunk)
 		coreAttnOutChunk := attnInter.Add(ctx, vAttn)

-		if coreAttnOut == nil {
-			coreAttnOut = coreAttnOutChunk
-		} else {
-			coreAttnOut = coreAttnOut.Concat(ctx, coreAttnOutChunk, 1)
-		}
+		chunks[chunk] = coreAttnOutChunk

 		// Update state for next chunk
 		gExpLastChunk := gLastExp.Slice(ctx, 2, chunk, chunk+1, 1)
 		kGDiffChunkT := keyGDiffT.Slice(ctx, 2, chunk, chunk+1, 1)
-		kgdMulVNew := vNewT.Mulmat(ctx, kGDiffChunkT)
+		// kgdmulvnew = key_gdiff_t @ v_new_t
+		kgdMulVNew := kGDiffChunkT.Mulmat(ctx, vTNewChunk)

-		// state = state * g_last + kgdmulvnew
-		gExpLastReshaped := gExpLastChunk.Contiguous(ctx).Reshape(ctx, 1, 1, numVHeads, nSeqs)
-		newState = newState.Mul(ctx, gExpLastReshaped)
-		newState = newState.Add(ctx, kgdMulVNew.Reshape(ctx, headVDim, headVDim, numVHeads, nSeqs))
+		// stateT = stateT * g_last + kgdmulvnew
+		stateT = stateT.Mul(ctx, gExpLastChunk)
+		stateT = stateT.Add(ctx, kgdMulVNew)
 	}

+	// Use a balanced concat tree so concat work does not balloon on long prompts.
+	for len(chunks) > 1 {
+		merged := make([]ml.Tensor, 0, (len(chunks)+1)/2)
+		for i := 0; i < len(chunks); i += 2 {
+			if i+1 < len(chunks) {
+				merged = append(merged, chunks[i].Concat(ctx, chunks[i+1], 2))
+			} else {
+				merged = append(merged, chunks[i])
+			}
+		}
+		chunks = merged
+	}
+	v = chunks[0]
+
 	// Final reshape
-	coreAttnOut = coreAttnOut.Contiguous(ctx, headVDim, chunkSize*nChunks, numVHeads, nSeqs)
+	coreAttnOut := v.Contiguous(ctx, headVDim, chunkSize*nChunks, numVHeads, nSeqs)

 	// Slice to remove padding
 	if pad > 0 {
 		coreAttnOut = coreAttnOut.Slice(ctx, 1, 0, nTokens, 1)
 	}

+	// Convert stateT back to cache layout [S_v, S_v, H_v, nSeqs]
+	newState := stateT.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx, headVDim, headVDim, numVHeads, nSeqs)
+
 	// Update delta state in cache
 	cache.UpdateDeltaState(ctx, layer, newState.Reshape(ctx, headVDim, headVDim*numVHeads, nSeqs))

--- a/model/models/qwen3next/model.go
+++ b/model/models/qwen3next/model.go
@@ -1,9 +1,12 @@
 package qwen3next

 import (
+	"bytes"
 	"cmp"
 	"fmt"
+	"image"
 	"math"
+	"slices"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/ml"
@@ -11,6 +14,7 @@ import (
 	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
+	"github.com/ollama/ollama/model/models/qwen3vl"
 	"github.com/ollama/ollama/tokenizer"
 )

@@ -41,10 +45,15 @@ type Options struct {
 	ssmNGroup      int // num_k_heads
 	ssmDtRank      int // num_v_heads
 	convKernelSize int // SSM conv kernel size
+	vHeadReordered bool

 	// Per-layer type from GGUF metadata
 	isRecurrent []bool

+	// RoPE mode config (used by qwen35/qwen35moe)
+	mropeSections    []int
+	mropeInterleaved bool
+
 	// Pre-computed masks for chunked attention (created once per forward pass)
 	masks *Masks
 }
@@ -54,7 +63,17 @@ func (o Options) headDim() int {
 }

 func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
-	opts := []func(*rope.Options){rope.WithTypeNeoX()}
+	var opts []func(*rope.Options)
+	if len(o.mropeSections) > 0 {
+		if o.mropeInterleaved {
+			opts = append(opts, rope.WithInterleaveMRoPE(o.mropeSections))
+		} else {
+			opts = append(opts, rope.WithMRoPE(o.mropeSections))
+		}
+	} else {
+		opts = append(opts, rope.WithTypeNeoX())
+	}
+
 	if o.ropeType == "yarn" {
 		attnFactor := float32(1.0 / (1.0 + 0.1*math.Log(float64(o.ropeScale))))
 		opts = append(opts,
@@ -214,20 +233,190 @@ type Model struct {
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
 	Output         *nn.Linear    `gguf:"output,alt:token_embd"`

-	Layers []Layer `gguf:"blk"`
+	Layers []Layer              `gguf:"blk"`
+	Vision *qwen3vl.VisionModel `gguf:"v"`
+
+	ImageProcessor *qwen3vl.ImageProcessor

 	*Options
+
+	positionCache    []int32
+	imageToken       int32
+	visionStart      int32
+	visionEnd        int32
+	spatialMergeSize uint32
+}
+
+func (m *Model) mapPosition(id int32) int32 { // mapPosition translates a batch position id into the position recorded in m.positionCache.
+	if id < int32(len(m.positionCache)) { // inside the cached range: return the stored position. NOTE(review): a negative id also passes this test and would panic on the index; presumably callers never pass one.
+		return m.positionCache[id]
+	}
+	if len(m.positionCache) > 0 { // past the cache: keep counting by one from the last cached position
+		return id - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
+	}
+	return id // empty cache: identity mapping
+}
+
+func (m *Model) buildPositions(ctx ml.Context, batch input.Batch) ml.Tensor { // buildPositions produces the position tensor that Forward passes to every layer.
+	if len(m.mropeSections) == 0 { // no MRoPE sections configured: pass batch.Positions through unchanged
+		return ctx.Input().FromInts(batch.Positions, len(batch.Positions))
+	}
+
+	// ggml MRoPE expects [time, height, width, extra] for each token.
+	positionSlice := [][]int32{
+		make([]int32, len(batch.Positions)),
+		make([]int32, len(batch.Positions)),
+		make([]int32, len(batch.Positions)),
+		make([]int32, len(batch.Positions)), // fourth row is never written below, so it stays all zero
+	}
+
+	for i, id := range batch.Positions { // seed the first three rows with the same mapped position
+		p := m.mapPosition(id)
+		positionSlice[0][i] = p
+		positionSlice[1][i] = p
+		positionSlice[2][i] = p
+	}
+
+	if m.Vision != nil { // image tokens additionally get per-row/per-column offsets
+		for _, mi := range batch.Multimodal {
+			grid, ok := mi.Multimodal[0].Data.(*qwen3vl.Grid)
+			if !ok { // entries whose first item carries no grid are left with the plain positions
+				continue
+			}
+			w := max(1, grid.Width/int(m.spatialMergeSize)) // merged grid width in tokens; clamped to 1 so the / and % below cannot divide by zero
+			for i := range mi.Multimodal[0].Tensor.Dim(1) { // one iteration per image token, starting at batch index mi.Index
+				positionSlice[1][mi.Index+i] += int32(i / w) // row index goes into the height row
+				positionSlice[2][mi.Index+i] += int32(i % w) // column index goes into the width row
+			}
+		}
+	}
+
+	return ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice)) // rows are laid out back to back as one flat slice of 4*len(batch.Positions) values
+}
+
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) { // EncodeMultimodal decodes image bytes, runs the image processor and vision model, and returns the outputs as multimodal inputs.
+	if m.Vision == nil || m.ImageProcessor == nil || len(m.Vision.Layers) == 0 { // no usable vision stack was loaded for this model
+		return nil, model.ErrNoVisionModel
+	}
+
+	img, _, err := image.Decode(bytes.NewReader(multimodalData)) // the detected format name is discarded
+	if err != nil {
+		return nil, err
+	}
+
+	pixelValues, grid, err := m.ImageProcessor.ProcessImage(ctx, img)
+	if err != nil {
+		return nil, err
+	}
+
+	visionOutputs, deepstackVisualEmbeds := m.Vision.Forward(ctx, pixelValues, grid)
+	mm := []input.Multimodal{{Tensor: visionOutputs, Data: grid}} // entry 0 carries the vision output plus the grid; PostTokenize and buildPositions read the grid from here
+	for i := range deepstackVisualEmbeds { // deepstack embeddings follow as entries 1..n, without Data
+		mm = append(mm, input.Multimodal{Tensor: deepstackVisualEmbeds[i]})
+	}
+
+	return mm, nil
+}
+
+func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) { // PostTokenize expands each image input into visionStart + image tokens + visionEnd and records a position for every emitted input in m.positionCache.
+	m.positionCache = m.positionCache[:0] // reset the cache while keeping its backing array
+	var result []*input.Input
+	appendInput := func(inp *input.Input, position int32) { // keeps result and positionCache index-aligned
+		result = append(result, inp)
+		m.positionCache = append(m.positionCache, position)
+	}
+
+	var p int32 // running position counter
+	for _, inp := range inputs {
+		if inp.Multimodal == nil { // plain text token: takes position p, then p advances by one
+			appendInput(inp, p)
+			p++
+			continue
+		}
+
+		grid := inp.Multimodal[0].Data.(*qwen3vl.Grid) // NOTE(review): unchecked assertion; panics if Data is not *qwen3vl.Grid or Multimodal is empty, whereas buildPositions uses the checked form
+		tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1) // used below as the number of image tokens to emit
+
+		appendInput(&input.Input{
+			Token:     m.visionStart,
+			SameBatch: tokensPerGrid + 1, // presumably keeps the image tokens and visionEnd in the same batch as visionStart; confirm against input.Input
+		}, p)
+		p++
+
+		appendInput(&input.Input{ // first image token carries the multimodal payload and its hash
+			Token:          m.imageToken,
+			Multimodal:     inp.Multimodal,
+			MultimodalHash: inp.MultimodalHash,
+		}, p)
+
+		for range tokensPerGrid - 1 { // remaining image tokens are bare placeholders; p is not advanced, so all image tokens share one cached position
+			appendInput(&input.Input{
+				Token: m.imageToken,
+			}, p)
+		}
+
+		gridSpan := max(grid.Width/int(m.spatialMergeSize), grid.Height/int(m.spatialMergeSize)) // larger of the merged grid width and height
+		p = p + int32(gridSpan) // skip past the image's span before placing visionEnd
+		appendInput(&input.Input{
+			Token: m.visionEnd,
+		}, p)
+		p++
+	}
+
+	return result, nil // the error result is always nil in this implementation
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
+	positions := m.buildPositions(ctx, batch)

 	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
+	if len(batch.Multimodal) > 0 {
+		hiddenStates = hiddenStates.Duplicate(ctx)
+
+		var deepstackVisualEmbeds []ml.Tensor
+		for _, mi := range batch.Multimodal {
+			visionOutputs := mi.Multimodal[0].Tensor
+			ctx.Forward(visionOutputs.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
+
+			if len(mi.Multimodal[1:]) > len(deepstackVisualEmbeds) {
+				deepstackVisualEmbeds = append(deepstackVisualEmbeds, make([]ml.Tensor, len(mi.Multimodal[1:])-len(deepstackVisualEmbeds))...)
+			}
+			for i, mm := range mi.Multimodal[1:] {
+				if deepstackVisualEmbeds[i] == nil {
+					deepstackVisualEmbeds[i] = ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...)
+				}
+				ctx.Forward(mm.Tensor.Copy(ctx, deepstackVisualEmbeds[i].View(ctx, mi.Index*deepstackVisualEmbeds[i].Stride(1), mm.Tensor.Dim(0)*mm.Tensor.Dim(1))))
+			}
+		}
+
+		cache := m.Cache.(*HybridCache)
+		m.Options.masks = nil
+		for i, layer := range m.Layers {
+			cache.SetLayer(i)
+
+			var outputs ml.Tensor
+			if i == len(m.Layers)-1 {
+				outputs = batch.Outputs
+			}
+
+			var err error
+			hiddenStates, err = layer.Forward(ctx, i, hiddenStates, positions, outputs, cache, m.Options)
+			if err != nil {
+				return nil, err
+			}
+			if i < len(deepstackVisualEmbeds) {
+				hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i])
+			}
+		}
+
+		hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
+		return m.Output.Forward(ctx, hiddenStates), nil
+	}

 	cache := m.Cache.(*HybridCache)

-	// Create masks once per forward pass
-	m.Options.masks = createMasks(ctx)
+	// Masks are allocated lazily only for chunked recurrent prefill.
+	m.Options.masks = nil

 	for i, layer := range m.Layers {
 		cache.SetLayer(i)
@@ -248,11 +437,116 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	return m.Output.Forward(ctx, hiddenStates), nil
 }

+func (m *Model) Validate() error { // Validate checks that options are present, that there is one recurrent flag per layer, and that every recurrent layer has the tensors listed below.
+	if m.Options == nil {
+		return fmt.Errorf("qwen3next: missing model options")
+	}
+	if len(m.Layers) != len(m.Options.isRecurrent) { // also guarantees the isRecurrent[i] index below is in range
+		return fmt.Errorf("qwen3next: layer config mismatch: have %d layers, %d recurrent flags", len(m.Layers), len(m.Options.isRecurrent))
+	}
+
+	for i, layer := range m.Layers {
+		if !m.Options.isRecurrent[i] { // non-recurrent layers are not inspected here
+			continue
+		}
+
+		gdn, ok := layer.Operator.(*GatedDeltaNet)
+		if !ok || gdn == nil { // the assertion can succeed on a typed nil pointer, hence the extra nil check
+			return fmt.Errorf("qwen3next: layer %d expected recurrent operator", i)
+		}
+		if gdn.SSMQKV == nil || gdn.SSMQKVGate == nil {
+			return fmt.Errorf("qwen3next: layer %d missing attn_qkv/attn_gate projections", i)
+		}
+		if gdn.SSMBetaAlpha == nil && (gdn.SSMBeta == nil || gdn.SSMAlpha == nil) { // accepts either SSMBetaAlpha alone or both SSMBeta and SSMAlpha
+			return fmt.Errorf("qwen3next: layer %d missing linear attention beta/alpha projections", i)
+		}
+		if gdn.SSMDT == nil {
+			return fmt.Errorf("qwen3next: layer %d missing ssm_dt tensor", i)
+		}
+		if gdn.SSMA == nil {
+			return fmt.Errorf("qwen3next: layer %d missing ssm_a tensor", i)
+		}
+		if gdn.SSMConv1D == nil || gdn.SSMConv1D.Weight == nil {
+			return fmt.Errorf("qwen3next: layer %d missing ssm_conv1d tensor", i)
+		}
+		if gdn.SSMNorm == nil || gdn.SSMOut == nil {
+			return fmt.Errorf("qwen3next: layer %d missing ssm_norm/ssm_out projections", i)
+		}
+	}
+
+	return nil // every recurrent layer passed all checks
+}
+
 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	m.positionCache = nil // drop the position mapping recorded by PostTokenize; mapPosition then falls back to the identity mapping
+	if len(m.mropeSections) > 0 { // MRoPE uses four position values per token (see buildPositions)
+		shift = shift.Repeat(ctx, 1, 4).Reshape(ctx, -1) // presumably replicates the shift once per MRoPE row and flattens it; confirm Repeat's dimension semantics
+	}
 	return m.applyRotaryPositionEmbeddings(ctx, key, shift), nil
 }

-var _ model.Model = (*Model)(nil)
+var ( // compile-time checks that *Model implements the interfaces below
+	_ model.Model               = (*Model)(nil)
+	_ model.MultimodalProcessor = (*Model)(nil)
+)
+
+func defaultVHeadReordered(arch string) bool { // defaultVHeadReordered reports whether arch is "qwen35" or "qwen35moe"; New passes the result as the default for ssm.v_head_reordered.
+	return arch == "qwen35" || arch == "qwen35moe"
+}
+
+func inferRecurrentLayers(headCountKV []uint64, numLayers int, fullAttentionInterval uint32) ([]bool, error) { // inferRecurrentLayers returns one flag per layer (true = recurrent), taken from per-layer KV head counts or, failing that, from fullAttentionInterval.
+	isRecurrent := make([]bool, numLayers)
+
+	hasZero := false // saw a layer with a zero KV head count
+	hasFull := false // saw a layer with a non-zero KV head count
+	for i := range numLayers {
+		if i >= len(headCountKV) { // layers without an entry keep the default false and set neither flag
+			continue
+		}
+
+		if headCountKV[i] == 0 { // a zero KV head count marks the layer as recurrent
+			isRecurrent[i] = true
+			hasZero = true
+		} else {
+			hasFull = true
+		}
+	}
+	if hasZero && hasFull { // the array already describes a mixed layout: use it as is
+		return isRecurrent, nil
+	}
+	if !hasFull { // all zeros, or no entries at all
+		return nil, fmt.Errorf("qwen3next: attention.head_count_kv must include at least one non-zero value")
+	}
+
+	// Compatibility path: older imports store a scalar KV head count and omit
+	// per-layer recurrent flags. Derive the hybrid layout from the interval.
+	interval := int(fullAttentionInterval)
+	if interval == 0 { // interval not provided: default to 4, capped at the layer count
+		interval = min(4, numLayers)
+	}
+	if interval <= 0 { // defensive guard; also protects the modulo below from a zero divisor
+		return nil, fmt.Errorf("qwen3next: invalid block_count (%d)", numLayers)
+	}
+	if interval > numLayers {
+		return nil, fmt.Errorf("qwen3next: full_attention_interval (%d) exceeds block_count (%d)", interval, numLayers)
+	}
+
+	hasZero = false
+	hasFull = false
+	for i := range numLayers {
+		isRecurrent[i] = (i+1)%interval != 0 // every interval-th layer (counting from 1) is full attention; the rest are recurrent
+		if isRecurrent[i] {
+			hasZero = true
+		} else {
+			hasFull = true
+		}
+	}
+	if !hasZero || !hasFull { // e.g. interval == 1 makes every layer full attention, which is rejected
+		return nil, fmt.Errorf("qwen3next: full_attention_interval (%d) does not produce a mixed recurrent/full layout", interval)
+	}
+
+	return isRecurrent, nil
+}

 func New(c fs.Config) (model.Model, error) {
 	numLayers := int(c.Uint("block_count"))
@@ -264,26 +558,14 @@ func New(c fs.Config) (model.Model, error) {
 		HeadCountKV() []uint64
 	}

-	var isRecurrent []bool
 	var headCountKV []uint64
 	if hc, ok := c.(headCounts); ok {
 		headCountKV = hc.HeadCountKV()
 	}

-	isRecurrent = make([]bool, numLayers)
-	hasZero := false
-	hasFull := false
-	for i := range numLayers {
-		// If KV head count is 0, it's a recurrent layer
-		if i < len(headCountKV) && headCountKV[i] == 0 {
-			isRecurrent[i] = true
-			hasZero = true
-		} else if i < len(headCountKV) && headCountKV[i] > 0 {
-			hasFull = true
-		}
-	}
-	if !hasZero || !hasFull {
-		return nil, fmt.Errorf("qwen3next: invalid attention.head_count_kv array; expected mix of zero and non-zero values")
+	isRecurrent, err := inferRecurrentLayers(headCountKV, numLayers, c.Uint("full_attention_interval"))
+	if err != nil {
+		return nil, err
 	}

 	// Determine if MoE
@@ -303,6 +585,22 @@ func New(c fs.Config) (model.Model, error) {
 		}
 	}

+	mropeSections := c.Ints("mrope_sections", nil)
+	if len(mropeSections) == 0 {
+		mropeSections = c.Ints("rope.mrope_section", nil)
+	}
+	if len(mropeSections) == 0 {
+		mropeSections = c.Ints("rope.dimension_sections", nil)
+	}
+	if len(mropeSections) > 4 {
+		mropeSections = mropeSections[:4]
+	}
+
+	ropeType := c.String("rope.scaling.type")
+	if ropeType == "" {
+		ropeType = c.String("rope.type")
+	}
+
 	opts := &Options{
 		hiddenSize: int(c.Uint("embedding_length")),
 		numHeads:   int(c.Uint("attention.head_count")),
@@ -318,7 +616,7 @@ func New(c fs.Config) (model.Model, error) {
 		valueLength:           int(c.Uint("attention.value_length")),
 		ropeDim:               int(c.Uint("rope.dimension_count")),
 		eps:                   c.Float("attention.layer_norm_rms_epsilon"),
-		ropeType:              c.String("rope.scaling.type"),
+		ropeType:              ropeType,
 		ropeBase:              c.Float("rope.freq_base"),
 		ropeScale:             c.Float("rope.scaling.factor", 1),
 		originalContextLength: int(c.Uint("rope.scaling.original_context_length")),
@@ -331,10 +629,19 @@ func New(c fs.Config) (model.Model, error) {
 		ssmNGroup:             int(c.Uint("ssm.group_count")),
 		ssmDtRank:             int(c.Uint("ssm.time_step_rank")),
 		convKernelSize:        int(c.Uint("ssm.conv_kernel")),
+		vHeadReordered:        c.Bool("ssm.v_head_reordered", defaultVHeadReordered(c.Architecture())),
 		isRecurrent:           isRecurrent,
+		mropeSections: slices.Collect(func(yield func(int) bool) {
+			for _, section := range mropeSections {
+				if !yield(int(section)) {
+					return
+				}
+			}
+		}),
+		mropeInterleaved: c.Bool("rope.mrope_interleaved", c.Bool("mrope_interleaved", false)),
 	}
 	if opts.numKVHeads == 0 {
-		return nil, fmt.Errorf("qwen3next: attention.head_count_kv array must include at least one non-zero value")
+		return nil, fmt.Errorf("qwen3next: attention.head_count_kv must include at least one non-zero value")
 	}

 	// Calculate cache dimensions
@@ -353,6 +660,19 @@ func New(c fs.Config) (model.Model, error) {
 		return nil, fmt.Errorf("qwen3next: headKDim (%d) != headVDim (%d) not supported; state computations require equal dimensions", headKDim, headVDim)
 	}

+	var vision *qwen3vl.VisionModel
+	var imageProcessor *qwen3vl.ImageProcessor
+	if c.Uint("vision.block_count", 0) > 0 {
+		vision = qwen3vl.NewVisionModel(c)
+		processor := qwen3vl.NewImageProcessor(c)
+		imageProcessor = &processor
+	}
+
+	spatialMergeSize := c.Uint("vision.spatial_merge_size", 2)
+	if spatialMergeSize == 0 {
+		spatialMergeSize = 2
+	}
+
 	m := Model{
 		Tokenizer: tokenizer.NewBytePairEncoding(
 			&tokenizer.Vocabulary{
@@ -371,8 +691,14 @@ func New(c fs.Config) (model.Model, error) {
 			},
 			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
 		),
-		Layers:  layers,
-		Options: opts,
+		Layers:           layers,
+		Vision:           vision,
+		ImageProcessor:   imageProcessor,
+		Options:          opts,
+		imageToken:       int32(c.Uint("image_token_id", 151655)),
+		visionStart:      int32(c.Uint("vision_start_token_id", 151652)),
+		visionEnd:        int32(c.Uint("vision_end_token_id", 151653)),
+		spatialMergeSize: spatialMergeSize,
 	}

 	m.Cache = NewHybridCache(m.Shift, convDim, convChannels, deltaStateSize)
@@ -380,5 +706,7 @@ func New(c fs.Config) (model.Model, error) {
 }

 func init() {
+	model.Register("qwen35", New) // qwen35 uses the same constructor as qwen3next
+	model.Register("qwen35moe", New) // qwen35moe uses the same constructor as qwen3next
 	model.Register("qwen3next", New)
 }
--- a/Show More
+++ b/Show More