Files
ollama/discover/runner.go
Daniel Hiltgen 56c735d871 runner: Remove CGO engines, use llama-server exclusively for GGML models
Remove the vendored GGML and llama.cpp backend, CGO runner, Go model
implementations, and sample.  llama-server (built from upstream llama.cpp via
FetchContent) is now the sole inference engine for GGUF-based models.
(Safetensors-based models continue to run on the new MLX engine.)  This allows
us to more rapidly pick up new capabilities and fixes from llama.cpp as they
come out.

On Windows, this now requires a recent AMD driver version that supports ROCm v7,
as llama.cpp currently does not support building against v6.
2026-04-20 08:44:02 -07:00

286 lines
7.7 KiB
Go

package discover
// GPU discovery via llama-server --list-devices
import (
"context"
"log/slog"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"time"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/ml"
)
// Package-level discovery state shared by every call to GPUDevices.
var (
// deviceMu is held by GPUDevices for the full duration of each call, which
// serializes access to the variables below.
deviceMu sync.Mutex
// devices is the cached device list: filled during the first (bootstrap)
// call to GPUDevices and only updated in place (FreeMemory) afterwards.
devices []ml.DeviceInfo
// libDirs is the set of backend library directories. During bootstrap it
// holds every directory matched under ml.LibOllamaPath; once bootstrap is
// finished it is rebuilt to contain only directories used by the retained
// devices. The empty-string key is a placeholder meaning "no backend
// specific directory".
libDirs map[string]struct{}
// bootstrapped is set to true at the end of the first GPUDevices call and
// switches later calls over to the free-memory refresh path.
bootstrapped bool
)
// GPUDevices returns a copy of the package-level list of discovered devices.
//
// The first call performs bootstrap discovery: it globs for GPU backend
// directories under ml.LibOllamaPath, calls bootstrapDevices once per
// directory that is not filtered out, removes duplicate devices, renumbers
// device IDs per library, and records which backend directories are still in
// use. Every later call only refreshes FreeMemory: first from the supplied
// runners, then via bootstrapDevices for any device no runner reported. On
// darwin/arm64 the refresh is skipped entirely.
//
// deviceMu is held for the whole call, so concurrent callers are serialized.
// ctx bounds every discovery/refresh sub-call (each gets its own timeout
// derived from ctx). nil entries in runners are ignored.
func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
	deviceMu.Lock()
	defer deviceMu.Unlock()
	startDiscovery := time.Now()
	msg := "overall device VRAM discovery took"
	defer func() {
		// msg is read when the deferred func runs, so the bootstrap path can
		// swap in its own label below.
		slog.Debug(msg, "duration", time.Since(startDiscovery))
	}()

	if !bootstrapped {
		msg = "GPU bootstrap discovery took"
		libDirs = make(map[string]struct{})
		files, err := filepath.Glob(filepath.Join(ml.LibOllamaPath, "*", "*ggml-*"))
		if err != nil {
			slog.Debug("unable to lookup GPU backend directories", "error", err)
		}
		for _, file := range files {
			libDirs[filepath.Dir(file)] = struct{}{}
		}
		if len(libDirs) == 0 {
			// No backend-specific directory found: still run one discovery
			// pass using only ml.LibOllamaPath.
			libDirs[""] = struct{}{}
		}

		slog.Info("discovering available GPUs...")
		detectIncompatibleLibraries()
		detectOldAMDDriverWindows()
		overrideWarnings()

		requested := envconfig.LLMLibrary()
		jetpack := cudaJetpack()

		// The timeout does not depend on the directory, so compute it once.
		bootstrapTimeout := 30 * time.Second
		if runtime.GOOS == "windows" {
			// Windows Defender AV scanning of DLLs can be slow on first load
			bootstrapTimeout = 90 * time.Second
		}

		// Discover GPUs through each detected GPU backend library.
		// llama-server --list-devices enumerates devices for the loaded backend.
		for dir := range libDirs {
			var dirs []string
			if dir != "" {
				base := filepath.Base(dir)
				switch {
				case requested != "" && !strings.HasPrefix(requested, "mlx_") && base != requested:
					slog.Debug("skipping available library at user's request", "requested", requested, "libDir", dir)
					continue
				case jetpack != "" && base != "cuda_"+jetpack:
					continue
				case jetpack == "" && strings.Contains(base, "cuda_jetpack"):
					slog.Debug("jetpack not detected (set JETSON_JETPACK or OLLAMA_LLM_LIBRARY to override), skipping", "libDir", dir)
					continue
				case !envconfig.EnableVulkan() && strings.Contains(base, "vulkan"):
					slog.Info("experimental Vulkan support disabled. To enable, set OLLAMA_VULKAN=1")
					continue
				}
				dirs = []string{ml.LibOllamaPath, dir}
			} else {
				dirs = []string{ml.LibOllamaPath}
			}

			// Release the timeout context as soon as this pass finishes
			// rather than deferring until GPUDevices returns.
			ctx1stPass, cancel := context.WithTimeout(ctx, bootstrapTimeout)
			found := bootstrapDevices(ctx1stPass, dirs, nil)
			cancel()
			devices = append(devices, found...)
		}

		// Filter duplicate devices:
		// - Same backend (e.g., cuda_v12 and cuda_v13 both see same GPU): keep newer version
		// - Different backends (e.g., CUDA and Vulkan see same GPU): prefer CUDA/HIP
		for i := 0; i < len(devices); i++ {
			for j := i + 1; j < len(devices); j++ {
				switch devices[i].Compare(devices[j]) {
				case ml.SameBackendDevice:
					// Same library, different version — keep the better one
					if devices[i].IsBetter(devices[j]) {
						devices[i] = devices[j]
					}
					devices = append(devices[:j], devices[j+1:]...)
					// Element j was removed; re-examine the element that
					// shifted into its slot.
					j--
					continue
				case ml.DuplicateDevice:
					var droppedDevice ml.DeviceInfo
					if devices[i].PreferredLibrary(devices[j]) {
						droppedDevice = devices[j]
					} else {
						droppedDevice = devices[i]
						devices[i] = devices[j]
					}
					devices = append(devices[:j], devices[j+1:]...)
					j--
					typeStr := "discrete"
					if droppedDevice.Integrated {
						typeStr = "iGPU"
					}
					slog.Debug("dropping duplicate device",
						"id", droppedDevice.ID,
						"library", droppedDevice.Library,
						"name", droppedDevice.Name,
						"pci_id", droppedDevice.PCIID,
						"type", typeStr,
						"total", format.HumanBytes2(droppedDevice.TotalMemory),
					)
					continue
				}
			}
		}

		// Renumber device IDs after filtering. Numeric IDs are replaced by a
		// per-library running index and the original value is preserved in
		// FilterID; non-numeric IDs are left untouched but still advance the
		// index.
		postFilteredID := map[string]int{}
		for i := range devices {
			lib := devices[i].Library
			if _, err := strconv.Atoi(devices[i].ID); err == nil {
				devices[i].FilterID = devices[i].ID
				devices[i].ID = strconv.Itoa(postFilteredID[lib])
			}
			postFilteredID[lib]++
		}

		// Record which lib dirs are actually in use for VRAM refresh
		libDirs = make(map[string]struct{})
		for _, dev := range devices {
			if len(dev.LibraryPath) == 0 {
				// Nothing to record; avoid indexing an empty slice.
				continue
			}
			dir := dev.LibraryPath[len(dev.LibraryPath)-1]
			if dir != ml.LibOllamaPath {
				libDirs[dir] = struct{}{}
			}
		}
		if len(libDirs) == 0 {
			libDirs[""] = struct{}{}
		}
		bootstrapped = true
	} else {
		if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
			// Metal never updates free VRAM
			return append([]ml.DeviceInfo{}, devices...)
		}

		// Refresh free memory from running llama-server instances if possible,
		// otherwise re-run llama-server --list-devices.
		slog.Debug("refreshing free memory")
		updated := make([]bool, len(devices))
		for _, runner := range runners {
			if runner == nil {
				continue
			}
			deviceIDs := runner.GetActiveDeviceIDs()
			if len(deviceIDs) == 0 {
				continue
			}
			// Only query this runner if it covers at least one device that
			// has not been refreshed yet.
			skip := true
			for _, dev := range deviceIDs {
				for i := range devices {
					if dev == devices[i].DeviceID && !updated[i] {
						skip = false
						break
					}
				}
				if !skip {
					break
				}
			}
			if skip {
				continue
			}

			// Scope the timeout context to this runner so it is cancelled at
			// the end of the iteration instead of piling up deferred cancels
			// until GPUDevices returns.
			func() {
				rctx, cancel := context.WithTimeout(ctx, 3*time.Second)
				defer cancel()
				for _, u := range runner.GetDeviceInfos(rctx) {
					for i := range devices {
						if u.DeviceID == devices[i].DeviceID {
							updated[i] = true
							devices[i].FreeMemory = u.FreeMemory
							break
						}
					}
				}
			}()
		}

		// Fall back to bootstrap discovery for any devices not refreshed
		allDone := true
		for _, done := range updated {
			if !done {
				allDone = false
				break
			}
		}
		if !allDone {
			slog.Debug("refreshing remaining GPUs via llama-server --list-devices")
			// A single 3s budget is shared by all directories below.
			rctx, cancel := context.WithTimeout(ctx, 3*time.Second)
			defer cancel()
			devFilter := ml.GetVisibleDevicesEnv(devices, false)
			for dir := range libDirs {
				// Mirror the bootstrap path: the "" placeholder means "no
				// backend directory", so do not pass an empty path entry.
				dirs := []string{ml.LibOllamaPath}
				if dir != "" {
					dirs = append(dirs, dir)
				}
				for _, u := range bootstrapDevices(rctx, dirs, devFilter) {
					for i := range devices {
						if u.DeviceID == devices[i].DeviceID && u.PCIID == devices[i].PCIID {
							updated[i] = true
							devices[i].FreeMemory = u.FreeMemory
							break
						}
					}
				}
			}
		}
	}

	// Hand back a copy so callers cannot mutate the cached slice.
	return append([]ml.DeviceInfo{}, devices...)
}
// bootstrapDevices enumerates devices for the given library directories by
// delegating to llamaServerBootstrapDevices, forwarding ctx, ollamaLibDirs and
// extraEnvs unchanged.
func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map[string]string) []ml.DeviceInfo {
	discovered := llamaServerBootstrapDevices(ctx, ollamaLibDirs, extraEnvs)
	return discovered
}
// overrideWarnings logs one warning per device-selection environment variable
// that has a non-empty value in envconfig.AsMap(), followed by a single hint
// about unsetting them if at least one was found.
func overrideWarnings() {
	watched := []string{
		"CUDA_VISIBLE_DEVICES",
		"HIP_VISIBLE_DEVICES",
		"ROCR_VISIBLE_DEVICES",
		"GGML_VK_VISIBLE_DEVICES",
		"GPU_DEVICE_ORDINAL",
		"HSA_OVERRIDE_GFX_VERSION",
	}
	settings := envconfig.AsMap()

	overridden := 0
	for _, name := range watched {
		entry, ok := settings[name]
		if !ok || entry.Value == "" {
			continue
		}
		overridden++
		slog.Warn("user overrode visible devices", name, entry.Value)
	}

	if overridden > 0 {
		slog.Warn("if GPUs are not correctly discovered, unset and try again")
	}
}
// detectIncompatibleLibraries warns when a ggml-base.dll resolvable through
// PATH is located outside ml.LibOllamaPath. It does nothing on non-Windows
// systems or when the DLL cannot be resolved.
func detectIncompatibleLibraries() {
	if runtime.GOOS != "windows" {
		return
	}
	basePath, err := exec.LookPath("ggml-base.dll")
	if err != nil || basePath == "" {
		return
	}
	// Windows file paths are case-insensitive, so compare case-folded copies;
	// a PATH entry that differs from ml.LibOllamaPath only in letter case
	// refers to the same directory and must not trigger the warning.
	found := strings.ToLower(basePath)
	bundled := strings.ToLower(ml.LibOllamaPath)
	if !strings.HasPrefix(found, bundled) {
		// Log the path exactly as resolved, not the case-folded copy.
		slog.Warn("potentially incompatible library detected in PATH", "location", basePath)
	}
}
// detectOldAMDDriverWindows warns when amdhip64_6.dll can be resolved through
// PATH but amdhip64_7.dll cannot. It does nothing on non-Windows systems.
func detectOldAMDDriverWindows() {
	if runtime.GOOS != "windows" {
		return
	}

	// onPath reports whether exec.LookPath resolves the named DLL.
	onPath := func(dll string) bool {
		_, err := exec.LookPath(dll)
		return err == nil
	}

	hasV6 := onPath("amdhip64_6.dll")
	hasV7 := onPath("amdhip64_7.dll")
	if !hasV6 || hasV7 {
		return
	}
	slog.Warn("AMD driver is too old. Update your AMD driver to enable GPU inference.")
}