mirror of
https://github.com/ollama/ollama.git
synced 2026-04-24 17:55:43 +02:00
Remove the vendored GGML and llama.cpp backend, CGO runner, Go model implementations, and sample. llama-server (built from upstream llama.cpp via FetchContent) is now the sole inference engine for GGUF-based models. (Safetensor based models continue to run on the new MLX engine.) This allows us to more rapidly pick up new capabilities and fixes from llama.cpp as they come out. On windows this now requires recent AMD driver versions to support ROCm v7 as llama.cpp currently does not support building against v6.
413 lines
13 KiB
Go
413 lines
13 KiB
Go
package discover
|
|
|
|
import (
|
|
"bufio"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"regexp"
|
|
"runtime"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/ollama/ollama/llm"
|
|
"github.com/ollama/ollama/logutil"
|
|
"github.com/ollama/ollama/ml"
|
|
)
|
|
|
|
// llamaServerDiscoverDevices spawns llama-server briefly (without a model) to
|
|
// discover GPU devices and their capabilities. The server prints device info
|
|
// and system_info (including compiled CUDA architectures) on startup before
|
|
// any model load, then we kill it.
|
|
//
|
|
// Captured from combined stderr output:
|
|
//
|
|
// Device 0: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB
|
|
// Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
|
|
//
|
|
// Captured from stdout device list:
|
|
//
|
|
// CUDA0: NVIDIA GeForce RTX 4060 Ti (16379 MiB, 14900 MiB free)
|
|
// Metal: Apple M3 Max (98304 MiB, 98303 MiB free)
|
|
//
|
|
// Captured from system_info line:
|
|
//
|
|
// system_info: ... | CUDA : ARCHS = 750,800,860,890,... |
|
|
func llamaServerDiscoverDevices(ctx context.Context, libDirs []string, extraEnvs map[string]string) []ml.DeviceInfo {
|
|
llamaServer, err := llm.FindLlamaServer()
|
|
if err != nil {
|
|
slog.Debug("llama-server not available for device discovery", "error", err)
|
|
return nil
|
|
}
|
|
|
|
start := time.Now()
|
|
defer func() {
|
|
slog.Debug("llama-server device discovery took", "duration", time.Since(start), "libDirs", libDirs)
|
|
}()
|
|
|
|
// Use a random port to avoid conflicts. The server will start listening
|
|
// but we kill it as soon as we have the system_info output.
|
|
port := 49152 + time.Now().UnixNano()%16383
|
|
cmd := exec.CommandContext(ctx, llamaServer,
|
|
"--port", strconv.FormatInt(port, 10),
|
|
"--host", "127.0.0.1",
|
|
"--no-webui",
|
|
"--offline",
|
|
)
|
|
cmd.Env = os.Environ()
|
|
|
|
setupLibraryEnv(cmd, libDirs, extraEnvs)
|
|
|
|
logutil.Trace("running llama-server for discovery", "cmd", cmd.Path, "libDirs", libDirs)
|
|
|
|
// Capture stderr (device info + system_info) via pipe so we can
|
|
// read it line-by-line and kill the server as soon as we have what we need.
|
|
stderrPipe, err := cmd.StderrPipe()
|
|
if err != nil {
|
|
slog.Debug("llama-server discovery: failed to create stderr pipe", "error", err)
|
|
return nil
|
|
}
|
|
// Capture stdout (device list) into a buffer
|
|
var stdoutBuf strings.Builder
|
|
cmd.Stdout = &stdoutBuf
|
|
|
|
if err := cmd.Start(); err != nil {
|
|
slog.Debug("llama-server discovery: failed to start", "error", err)
|
|
return nil
|
|
}
|
|
|
|
// Read stderr until we see system_info or timeout
|
|
var stderrLines []string
|
|
gotSystemInfo := false
|
|
done := make(chan struct{})
|
|
go func() {
|
|
scanner := bufio.NewScanner(stderrPipe)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
stderrLines = append(stderrLines, line)
|
|
if strings.Contains(line, "system_info:") {
|
|
gotSystemInfo = true
|
|
break
|
|
}
|
|
}
|
|
close(done)
|
|
}()
|
|
|
|
select {
|
|
case <-done:
|
|
case <-ctx.Done():
|
|
}
|
|
|
|
// Kill the server — we have what we need (or timed out)
|
|
cmd.Process.Kill()
|
|
cmd.Wait()
|
|
|
|
if !gotSystemInfo {
|
|
slog.Warn("llama-server discovery: system_info line not found in output — "+
|
|
"CUDA architecture filtering will be disabled. If GPU inference fails, "+
|
|
"this may indicate an incompatible llama-server version.",
|
|
"libDirs", libDirs, "lines_captured", len(stderrLines))
|
|
}
|
|
|
|
// Also run --list-devices to get the stdout device list with free memory
|
|
// (the brief server startup doesn't print that)
|
|
cmd2 := exec.CommandContext(ctx, llamaServer, "--list-devices", "--offline")
|
|
cmd2.Env = cmd.Env // reuse same environment
|
|
listOutput, err := cmd2.CombinedOutput()
|
|
if err != nil {
|
|
slog.Debug("llama-server --list-devices failed", "error", err)
|
|
return nil
|
|
}
|
|
|
|
combined := string(listOutput) + "\n" + strings.Join(stderrLines, "\n")
|
|
return parseLlamaServerDevices(combined, libDirs)
|
|
}
|
|
|
|
// setupLibraryEnv configures the command environment for GPU backend discovery:
|
|
// sets library paths and GGML_BACKEND_PATH for the GPU backend .so/.dll.
|
|
func setupLibraryEnv(cmd *exec.Cmd, libDirs []string, extraEnvs map[string]string) {
|
|
var pathEnv string
|
|
switch runtime.GOOS {
|
|
case "windows":
|
|
pathEnv = "PATH"
|
|
case "darwin":
|
|
pathEnv = "DYLD_LIBRARY_PATH"
|
|
default:
|
|
pathEnv = "LD_LIBRARY_PATH"
|
|
}
|
|
|
|
libraryPaths := append([]string{}, libDirs...)
|
|
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
|
|
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
|
|
}
|
|
pathVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
|
|
|
|
pathSet := false
|
|
for i := range cmd.Env {
|
|
if key, _, ok := strings.Cut(cmd.Env[i], "="); ok && strings.EqualFold(key, pathEnv) {
|
|
cmd.Env[i] = pathEnv + "=" + pathVal
|
|
pathSet = true
|
|
}
|
|
}
|
|
if !pathSet {
|
|
cmd.Env = append(cmd.Env, pathEnv+"="+pathVal)
|
|
}
|
|
|
|
for k, v := range extraEnvs {
|
|
found := false
|
|
for i := range cmd.Env {
|
|
if key, _, ok := strings.Cut(cmd.Env[i], "="); ok && strings.EqualFold(key, k) {
|
|
cmd.Env[i] = k + "=" + v
|
|
found = true
|
|
}
|
|
}
|
|
if !found {
|
|
cmd.Env = append(cmd.Env, k+"="+v)
|
|
}
|
|
}
|
|
|
|
// Find GPU backend .so/.dll and set GGML_BACKEND_PATH
|
|
backendPatterns := []string{
|
|
"libggml-cuda*", "ggml-cuda*.dll",
|
|
"libggml-hip*", "ggml-hip*.dll",
|
|
"libggml-vulkan*", "ggml-vulkan*.dll",
|
|
}
|
|
for _, dir := range libDirs {
|
|
for _, pattern := range backendPatterns {
|
|
matches, _ := filepath.Glob(filepath.Join(dir, pattern))
|
|
if len(matches) > 0 {
|
|
cmd.Env = append(cmd.Env, "GGML_BACKEND_PATH="+matches[0])
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// deviceLineRegex matches stdout lines like:
|
|
//
|
|
// CUDA0: NVIDIA GeForce RTX 4060 Ti (16379 MiB, 14900 MiB free)
|
|
// Metal: Apple M3 Max (98304 MiB, 98303 MiB free)
|
|
var deviceLineRegex = regexp.MustCompile(
|
|
`^\s+(.+?):\s+(.+?)\s+\((\d+)\s+MiB,\s+(\d+)\s+MiB\s+free\)`,
|
|
)
|
|
|
|
// gfxTargetRegex matches ROCm stderr lines like:
|
|
//
|
|
// Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
|
|
// Device 1: AMD Radeon Pro VII, gfx906:sramecc+:xnack- (0x906), VMM: no, Wave Size: 64, VRAM: 16368 MiB
|
|
var gfxTargetRegex = regexp.MustCompile(
|
|
`Device\s+(\d+):.*,\s+(gfx[0-9a-f]+)[\s:(]`,
|
|
)
|
|
|
|
// cudaCCRegex matches CUDA stderr lines like:
|
|
//
|
|
// Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
|
|
var cudaCCRegex = regexp.MustCompile(
|
|
`Device\s+(\d+):.*compute capability\s+(\d+)\.(\d+)`,
|
|
)
|
|
|
|
// cudaArchsRegex matches the CUDA architecture list from system_info like:
|
|
//
|
|
// CUDA : ARCHS = 750,800,860,890,900,1000,1030,1100,1200,1210
|
|
var cudaArchsRegex = regexp.MustCompile(
|
|
`CUDA\s*:\s*ARCHS\s*=\s*([\d,]+)`,
|
|
)
|
|
|
|
// parseLlamaServerDevices parses the combined output of llama-server discovery.
|
|
// It extracts device info, ROCm gfx targets, CUDA compute capabilities, and
|
|
// CUDA compiled architecture lists.
|
|
func parseLlamaServerDevices(output string, libDirs []string) []ml.DeviceInfo {
|
|
// Extract per-device metadata from stderr
|
|
gfxByIndex := make(map[int]string)
|
|
ccByIndex := make(map[int]string) // "610", "890", etc.
|
|
var cudaArchs []string // compiled architectures for this variant
|
|
|
|
scanner := bufio.NewScanner(strings.NewReader(output))
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
if matches := gfxTargetRegex.FindStringSubmatch(line); matches != nil {
|
|
idx, _ := strconv.Atoi(matches[1])
|
|
gfxByIndex[idx] = matches[2]
|
|
}
|
|
if matches := cudaCCRegex.FindStringSubmatch(line); matches != nil {
|
|
idx, _ := strconv.Atoi(matches[1])
|
|
major, _ := strconv.Atoi(matches[2])
|
|
minor, _ := strconv.Atoi(matches[3])
|
|
ccByIndex[idx] = fmt.Sprintf("%d%d0", major, minor)
|
|
}
|
|
if matches := cudaArchsRegex.FindStringSubmatch(line); matches != nil {
|
|
cudaArchs = strings.Split(matches[1], ",")
|
|
}
|
|
}
|
|
|
|
// Validate CUDA devices against compiled architectures
|
|
cudaArchSet := make(map[string]bool, len(cudaArchs))
|
|
for _, arch := range cudaArchs {
|
|
cudaArchSet[strings.TrimSpace(arch)] = true
|
|
}
|
|
|
|
// Parse stdout device lines
|
|
var devices []ml.DeviceInfo
|
|
deviceIndex := 0
|
|
scanner = bufio.NewScanner(strings.NewReader(output))
|
|
for scanner.Scan() {
|
|
matches := deviceLineRegex.FindStringSubmatch(scanner.Text())
|
|
if matches == nil {
|
|
continue
|
|
}
|
|
|
|
name := matches[1]
|
|
description := matches[2]
|
|
totalMiB, _ := strconv.ParseUint(matches[3], 10, 64)
|
|
freeMiB, _ := strconv.ParseUint(matches[4], 10, 64)
|
|
library := inferLibrary(name, description)
|
|
|
|
// Skip pseudo-devices like BLAS/Accelerate that report zero memory.
|
|
// These are CPU math libraries, not real GPUs — they shouldn't appear
|
|
// as inference compute devices or inflate the scheduler's GPU count.
|
|
if totalMiB == 0 {
|
|
slog.Debug("skipping pseudo-device with zero memory", "name", name, "description", description)
|
|
deviceIndex++
|
|
continue
|
|
}
|
|
|
|
// For CUDA devices, check if this variant supports the device's CC
|
|
if library == "CUDA" {
|
|
cc := ccByIndex[deviceIndex]
|
|
if cc != "" && len(cudaArchSet) > 0 {
|
|
if !cudaArchSet[cc] {
|
|
slog.Info("skipping CUDA device — compute capability not in compiled architectures",
|
|
"device", description, "cc", cc, "archs", cudaArchs,
|
|
"libDirs", libDirs)
|
|
deviceIndex++
|
|
continue
|
|
}
|
|
} else if cc == "" {
|
|
slog.Warn("llama-server discovery: could not determine compute capability for CUDA device — "+
|
|
"architecture filtering disabled for this device. If inference crashes, "+
|
|
"check that the CUDA backend supports this GPU.",
|
|
"device", description, "libDirs", libDirs)
|
|
} else if len(cudaArchSet) == 0 {
|
|
slog.Warn("llama-server discovery: could not determine compiled CUDA architectures — "+
|
|
"architecture filtering disabled. If inference crashes on older GPUs, "+
|
|
"check llama-server system_info output for ARCHS.",
|
|
"device", description, "libDirs", libDirs)
|
|
}
|
|
}
|
|
|
|
dev := ml.DeviceInfo{
|
|
DeviceID: ml.DeviceID{
|
|
ID: strconv.Itoa(deviceIndex),
|
|
Library: library,
|
|
},
|
|
Name: name,
|
|
Description: description,
|
|
TotalMemory: totalMiB * 1024 * 1024,
|
|
FreeMemory: freeMiB * 1024 * 1024,
|
|
LibraryPath: libDirs,
|
|
GFXTarget: gfxByIndex[deviceIndex],
|
|
}
|
|
|
|
devices = append(devices, dev)
|
|
deviceIndex++
|
|
}
|
|
|
|
return devices
|
|
}
|
|
|
|
// inferLibrary determines the GPU library type from the llama-server device name and description.
|
|
func inferLibrary(name, description string) string {
|
|
combined := strings.ToLower(name + " " + description)
|
|
switch {
|
|
case strings.Contains(combined, "cuda"):
|
|
return "CUDA"
|
|
case strings.Contains(combined, "rocm") || strings.Contains(combined, "hip"):
|
|
return "ROCm"
|
|
case strings.Contains(combined, "metal") || strings.Contains(combined, "apple"):
|
|
return "Metal"
|
|
case strings.Contains(combined, "vulkan"):
|
|
return "Vulkan"
|
|
default:
|
|
return description
|
|
}
|
|
}
|
|
|
|
// llamaServerBootstrapDevices discovers GPU devices using llama-server.
|
|
// CUDA devices are validated against compiled architectures (from system_info).
|
|
// ROCm devices are validated against bundled rocblas kernels.
|
|
func llamaServerBootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map[string]string) []ml.DeviceInfo {
|
|
devices := llamaServerDiscoverDevices(ctx, ollamaLibDirs, extraEnvs)
|
|
if runtime.GOOS != "linux" {
|
|
return devices
|
|
}
|
|
|
|
hasROCm := false
|
|
for _, d := range devices {
|
|
if d.Library == "ROCm" {
|
|
hasROCm = true
|
|
break
|
|
}
|
|
}
|
|
if !hasROCm {
|
|
return devices
|
|
}
|
|
|
|
return filterUnsupportedROCmDevices(devices, ollamaLibDirs)
|
|
}
|
|
|
|
// rocblasGFXTargets scans the rocblas library directory for supported gfx targets
|
|
// by looking for TensileLibrary_lazy_gfxNNNN.dat files.
|
|
func rocblasGFXTargets(libDirs []string) map[string]bool {
|
|
targets := make(map[string]bool)
|
|
for _, dir := range libDirs {
|
|
files, _ := filepath.Glob(filepath.Join(dir, "rocblas", "library", "TensileLibrary_lazy_gfx*.dat"))
|
|
for _, f := range files {
|
|
base := filepath.Base(f)
|
|
if t, ok := strings.CutPrefix(base, "TensileLibrary_lazy_"); ok {
|
|
if t, ok = strings.CutSuffix(t, ".dat"); ok {
|
|
targets[t] = true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return targets
|
|
}
|
|
|
|
// filterUnsupportedROCmDevices removes ROCm devices whose gfx target doesn't have
|
|
// matching rocblas kernels bundled.
|
|
func filterUnsupportedROCmDevices(devices []ml.DeviceInfo, libDirs []string) []ml.DeviceInfo {
|
|
supported := rocblasGFXTargets(libDirs)
|
|
if len(supported) == 0 {
|
|
return devices
|
|
}
|
|
|
|
var filtered []ml.DeviceInfo
|
|
for _, dev := range devices {
|
|
if dev.Library != "ROCm" {
|
|
filtered = append(filtered, dev)
|
|
continue
|
|
}
|
|
gfx := dev.GFXTarget
|
|
if gfx == "" {
|
|
filtered = append(filtered, dev)
|
|
continue
|
|
}
|
|
if supported[gfx] {
|
|
filtered = append(filtered, dev)
|
|
} else {
|
|
slog.Warn("dropping ROCm device — no rocblas support for gfx target",
|
|
"device", dev.Name, "gfx_target", gfx, "supported", supported,
|
|
"hint", "set HSA_OVERRIDE_GFX_VERSION to map to a supported target")
|
|
}
|
|
}
|
|
return filtered
|
|
}
|
|
|
|
// Ensure stderrPipe is fully consumed to avoid blocking
|
|
var _ io.Reader
|