ci: include mlx jit headers on linux (#15083)

* ci: include mlx jit headers on linux * handle CUDA JIT headers
2026-04-17 21:54:08 +02:00 · 2026-03-26 23:10:07 -07:00
parent f567abc63f
commit 516ebd8548
3 changed files with 83 additions and 4 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -424,6 +424,7 @@ jobs:
              lib/ollama/cuda_v*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/vulkan*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/mlx*)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/include*)       echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_jetpack5)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
              lib/ollama/rocm)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -246,13 +246,21 @@ if(MLX_ENGINE)
            COMPONENT MLX)
    endif()

-    # Install CCCL headers for NVRTC JIT compilation at runtime.
+    # Install headers for NVRTC JIT compilation at runtime.
    # MLX's own install rules use the default component so they get skipped by
    # --component MLX. Headers are installed alongside libmlx in OLLAMA_INSTALL_DIR.
+    #
+    # Layout:
+    #   ${OLLAMA_INSTALL_DIR}/include/cccl/{cuda,nv}/  — CCCL headers
+    #   ${OLLAMA_INSTALL_DIR}/include/*.h               — CUDA toolkit headers
+    #
+    # MLX's jit_module.cpp resolves CCCL via
+    #   current_binary_dir()[.parent_path()] / "include" / "cccl"
    # On Linux, MLX's jit_module.cpp resolves CCCL via
-    # current_binary_dir().parent_path() / "include" / "cccl", so we create a
-    # symlink from lib/ollama/include -> ${OLLAMA_RUNNER_DIR}/include
-    # This will need refinement if we add multiple CUDA versions for MLX in the future.
+    #   current_binary_dir().parent_path() / "include" / "cccl", so we create a
+    #   symlink from lib/ollama/include -> ${OLLAMA_RUNNER_DIR}/include
+    #   This will need refinement if we add multiple CUDA versions for MLX in the future.
+    # CUDA runtime headers are found via CUDA_PATH env var (set by mlxrunner).
    if(EXISTS ${CMAKE_BINARY_DIR}/_deps/cccl-src/include/cuda)
        install(DIRECTORY ${CMAKE_BINARY_DIR}/_deps/cccl-src/include/cuda
            DESTINATION ${OLLAMA_INSTALL_DIR}/include/cccl
@@ -271,6 +279,48 @@ if(MLX_ENGINE)
        endif()
    endif()

+    # Install minimal CUDA toolkit headers needed by MLX JIT kernels.
+    # These are the transitive closure of includes from mlx/backend/cuda/device/*.cuh.
+    # The Go mlxrunner sets CUDA_PATH to OLLAMA_INSTALL_DIR so MLX finds them at
+    # $CUDA_PATH/include/*.h via NVRTC --include-path.
+    if(CUDAToolkit_FOUND)
+        # CUDAToolkit_INCLUDE_DIRS is a *list* and may hold more than one
+        # directory. Pasting the whole list into a path would yield
+        # "dirA;dirB/header.h", so pick the entry that actually contains the
+        # CUDA runtime headers.
+        set(_cuda_inc "")
+        foreach(_dir IN LISTS CUDAToolkit_INCLUDE_DIRS)
+            if(EXISTS "${_dir}/cuda_runtime_api.h")
+                set(_cuda_inc "${_dir}")
+                break()
+            endif()
+        endforeach()
+
+        set(_dst "${OLLAMA_INSTALL_DIR}/include")
+        # Top-level toolkit headers copied verbatim into ${_dst}.
+        set(_MLX_JIT_CUDA_HEADERS
+            builtin_types.h
+            cooperative_groups.h
+            cuda_bf16.h
+            cuda_bf16.hpp
+            cuda_device_runtime_api.h
+            cuda_fp16.h
+            cuda_fp16.hpp
+            cuda_fp8.h
+            cuda_fp8.hpp
+            cuda_runtime_api.h
+            device_types.h
+            driver_types.h
+            math_constants.h
+            surface_types.h
+            texture_types.h
+            vector_functions.h
+            vector_functions.hpp
+            vector_types.h
+        )
+
+        if("${_cuda_inc}" STREQUAL "")
+            # Say so at configure time rather than generating install rules
+            # that point at paths which cannot exist.
+            message(WARNING
+                "CUDA toolkit found, but cuda_runtime_api.h is not in any of "
+                "CUDAToolkit_INCLUDE_DIRS (${CUDAToolkit_INCLUDE_DIRS}); "
+                "MLX JIT CUDA headers will not be installed.")
+        else()
+            foreach(_hdr IN LISTS _MLX_JIT_CUDA_HEADERS)
+                install(FILES "${_cuda_inc}/${_hdr}"
+                    DESTINATION "${_dst}"
+                    COMPONENT MLX)
+            endforeach()
+            # Subdirectory headers: the cooperative_groups/ tree (only *.h
+            # files) and the single crt/ header, keeping their relative layout.
+            install(DIRECTORY "${_cuda_inc}/cooperative_groups"
+                DESTINATION "${_dst}"
+                COMPONENT MLX
+                FILES_MATCHING PATTERN "*.h")
+            install(FILES "${_cuda_inc}/crt/host_defines.h"
+                DESTINATION "${_dst}/crt"
+                COMPONENT MLX)
+        endif()
+    endif()
+
+
    # On Windows, explicitly install dl.dll (dlfcn-win32 POSIX dlopen emulation)
    # RUNTIME_DEPENDENCIES auto-excludes it via POST_EXCLUDE_FILES_STRICT because
    # dlfcn-win32 is a known CMake target with its own install rules (which install
--- a/x/mlxrunner/client.go
+++ b/x/mlxrunner/client.go
@@ -400,6 +400,21 @@ func (c *Client) Load(ctx context.Context, _ ml.SystemInfo, gpus []ml.DeviceInfo
 		slog.Debug("mlx subprocess library path", libPathEnvVar, pathEnvVal)
 	}

+	// Point MLX's JIT compiler at our bundled CUDA runtime headers.
+	// MLX resolves headers via $CUDA_PATH/include/*.h (and checks CUDA_HOME first).
+	// Always use bundled headers to avoid version mismatches with any
+	// system-installed CUDA toolkit.
+	if mlxDirs, err := filepath.Glob(filepath.Join(ml.LibOllamaPath, "mlx_cuda_*")); err == nil {
+		for _, d := range mlxDirs {
+			if _, err := os.Stat(filepath.Join(d, "include")); err == nil {
+				setEnv(cmd, "CUDA_PATH", d)
+				setEnv(cmd, "CUDA_HOME", d)
+				slog.Debug("mlx subprocess CUDA headers", "CUDA_PATH", d)
+				break
+			}
+		}
+	}
+
 	c.cmd = cmd

 	// Forward subprocess stdout/stderr to server logs
@@ -519,3 +534,16 @@ func (c *Client) VRAMByGPU(id ml.DeviceID) uint64 {
 }

 var _ llm.LlamaServer = (*Client)(nil)
+
+// setEnv sets key to value in cmd.Env so that the child process sees exactly
+// one entry for key.
+//
+// Every existing entry for key is removed (keys are compared
+// case-insensitively) and the new entry is appended at the end. Replacing only
+// the first match is not enough: os/exec uses the last value when Env contains
+// duplicate keys, so a later stale entry would silently override ours.
+//
+// A nil cmd.Env means "inherit the parent's environment"; it is expanded to
+// os.Environ() first so that setting one variable does not strip the rest.
+func setEnv(cmd *exec.Cmd, key, value string) {
+	if cmd.Env == nil {
+		cmd.Env = os.Environ()
+	}
+	prefix := key + "="
+	// Build a fresh slice instead of editing in place so a backing array
+	// shared with another slice is never rewritten.
+	kept := make([]string, 0, len(cmd.Env)+1)
+	for _, e := range cmd.Env {
+		if len(e) >= len(prefix) && strings.EqualFold(e[:len(prefix)], prefix) {
+			continue // existing entry for key: drop it
+		}
+		kept = append(kept, e)
+	}
+	cmd.Env = append(kept, prefix+value)
+}