mirror of
https://github.com/ollama/ollama.git
synced 2026-04-17 21:54:08 +02:00
ci: include mlx jit headers on linux (#15083)
* ci: include mlx jit headers on linux * handle CUDA JIT headers
This commit is contained in:
1
.github/workflows/release.yaml
vendored
1
.github/workflows/release.yaml
vendored
@@ -424,6 +424,7 @@ jobs:
|
||||
lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
|
||||
lib/ollama/vulkan*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
|
||||
lib/ollama/mlx*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
|
||||
lib/ollama/include*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
|
||||
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
|
||||
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
|
||||
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
|
||||
|
||||
@@ -246,13 +246,21 @@ if(MLX_ENGINE)
|
||||
COMPONENT MLX)
|
||||
endif()
|
||||
|
||||
# Install CCCL headers for NVRTC JIT compilation at runtime.
|
||||
# Install headers for NVRTC JIT compilation at runtime.
|
||||
# MLX's own install rules use the default component so they get skipped by
|
||||
# --component MLX. Headers are installed alongside libmlx in OLLAMA_INSTALL_DIR.
|
||||
#
|
||||
# Layout:
|
||||
# ${OLLAMA_INSTALL_DIR}/include/cccl/{cuda,nv}/ — CCCL headers
|
||||
# ${OLLAMA_INSTALL_DIR}/include/*.h — CUDA toolkit headers
|
||||
#
|
||||
# MLX's jit_module.cpp resolves CCCL via
|
||||
# current_binary_dir()[.parent_path()] / "include" / "cccl"
|
||||
# On Linux, MLX's jit_module.cpp resolves CCCL via
|
||||
# current_binary_dir().parent_path() / "include" / "cccl", so we create a
|
||||
# symlink from lib/ollama/include -> ${OLLAMA_RUNNER_DIR}/include
|
||||
# This will need refinement if we add multiple CUDA versions for MLX in the future.
|
||||
# current_binary_dir().parent_path() / "include" / "cccl", so we create a
|
||||
# symlink from lib/ollama/include -> ${OLLAMA_RUNNER_DIR}/include
|
||||
# This will need refinement if we add multiple CUDA versions for MLX in the future.
|
||||
# CUDA runtime headers are found via CUDA_PATH env var (set by mlxrunner).
|
||||
if(EXISTS ${CMAKE_BINARY_DIR}/_deps/cccl-src/include/cuda)
|
||||
install(DIRECTORY ${CMAKE_BINARY_DIR}/_deps/cccl-src/include/cuda
|
||||
DESTINATION ${OLLAMA_INSTALL_DIR}/include/cccl
|
||||
@@ -271,6 +279,48 @@ if(MLX_ENGINE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Install minimal CUDA toolkit headers needed by MLX JIT kernels.
|
||||
# These are the transitive closure of includes from mlx/backend/cuda/device/*.cuh.
|
||||
# The Go mlxrunner sets CUDA_PATH to OLLAMA_INSTALL_DIR so MLX finds them at
|
||||
# $CUDA_PATH/include/*.h via NVRTC --include-path.
|
||||
if(CUDAToolkit_FOUND)
|
||||
set(_cuda_inc "${CUDAToolkit_INCLUDE_DIRS}")
|
||||
set(_dst "${OLLAMA_INSTALL_DIR}/include")
|
||||
set(_MLX_JIT_CUDA_HEADERS
|
||||
builtin_types.h
|
||||
cooperative_groups.h
|
||||
cuda_bf16.h
|
||||
cuda_bf16.hpp
|
||||
cuda_device_runtime_api.h
|
||||
cuda_fp16.h
|
||||
cuda_fp16.hpp
|
||||
cuda_fp8.h
|
||||
cuda_fp8.hpp
|
||||
cuda_runtime_api.h
|
||||
device_types.h
|
||||
driver_types.h
|
||||
math_constants.h
|
||||
surface_types.h
|
||||
texture_types.h
|
||||
vector_functions.h
|
||||
vector_functions.hpp
|
||||
vector_types.h
|
||||
)
|
||||
foreach(_hdr ${_MLX_JIT_CUDA_HEADERS})
|
||||
install(FILES "${_cuda_inc}/${_hdr}"
|
||||
DESTINATION ${_dst}
|
||||
COMPONENT MLX)
|
||||
endforeach()
|
||||
# Subdirectory headers
|
||||
install(DIRECTORY "${_cuda_inc}/cooperative_groups"
|
||||
DESTINATION ${_dst}
|
||||
COMPONENT MLX
|
||||
FILES_MATCHING PATTERN "*.h")
|
||||
install(FILES "${_cuda_inc}/crt/host_defines.h"
|
||||
DESTINATION "${_dst}/crt"
|
||||
COMPONENT MLX)
|
||||
endif()
|
||||
|
||||
# On Windows, explicitly install dl.dll (dlfcn-win32 POSIX dlopen emulation)
|
||||
# RUNTIME_DEPENDENCIES auto-excludes it via POST_EXCLUDE_FILES_STRICT because
|
||||
# dlfcn-win32 is a known CMake target with its own install rules (which install
|
||||
|
||||
@@ -400,6 +400,21 @@ func (c *Client) Load(ctx context.Context, _ ml.SystemInfo, gpus []ml.DeviceInfo
|
||||
slog.Debug("mlx subprocess library path", libPathEnvVar, pathEnvVal)
|
||||
}
|
||||
|
||||
// Point MLX's JIT compiler at our bundled CUDA runtime headers.
|
||||
// MLX resolves headers via $CUDA_PATH/include/*.h (and checks CUDA_HOME first).
|
||||
// Always use bundled headers to avoid version mismatches with any
|
||||
// system-installed CUDA toolkit.
|
||||
if mlxDirs, err := filepath.Glob(filepath.Join(ml.LibOllamaPath, "mlx_cuda_*")); err == nil {
|
||||
for _, d := range mlxDirs {
|
||||
if _, err := os.Stat(filepath.Join(d, "include")); err == nil {
|
||||
setEnv(cmd, "CUDA_PATH", d)
|
||||
setEnv(cmd, "CUDA_HOME", d)
|
||||
slog.Debug("mlx subprocess CUDA headers", "CUDA_PATH", d)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.cmd = cmd
|
||||
|
||||
// Forward subprocess stdout/stderr to server logs
|
||||
@@ -519,3 +534,16 @@ func (c *Client) VRAMByGPU(id ml.DeviceID) uint64 {
|
||||
}
|
||||
|
||||
var _ llm.LlamaServer = (*Client)(nil)
|
||||
|
||||
// setEnv sets or replaces an environment variable in cmd.Env.
|
||||
func setEnv(cmd *exec.Cmd, key, value string) {
|
||||
entry := key + "=" + value
|
||||
prefix := strings.ToUpper(key + "=")
|
||||
for i, e := range cmd.Env {
|
||||
if strings.HasPrefix(strings.ToUpper(e), prefix) {
|
||||
cmd.Env[i] = entry
|
||||
return
|
||||
}
|
||||
}
|
||||
cmd.Env = append(cmd.Env, entry)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user