Files
ollama/discover/llama_server_test.go
Daniel Hiltgen 56c735d871 runner: Remove CGO engines, use llama-server exclusively for GGML models
Remove the vendored GGML and llama.cpp backend, CGO runner, Go model
implementations, and sample.  llama-server (built from upstream llama.cpp via
FetchContent) is now the sole inference engine for GGUF-based models.
(Safetensor-based models continue to run on the new MLX engine.) This allows
us to more rapidly pick up new capabilities and fixes from llama.cpp as they
come out.

On Windows, this now requires recent AMD driver versions to support ROCm v7, as
llama.cpp currently does not support building against v6.
2026-04-20 08:44:02 -07:00

299 lines
9.4 KiB
Go

package discover
import (
"fmt"
"strconv"
"strings"
"testing"
)
// TestParseLlamaServerDevices runs parseLlamaServerDevices over canned
// device-listing output and verifies the number of devices returned. When a
// case supplies them, it also checks the name, library, and total memory
// (given in MiB, compared in bytes) of the first returned device.
func TestParseLlamaServerDevices(t *testing.T) {
	// Expected memory sizes are written in MiB in the table and converted to
	// bytes before being compared against TotalMemory.
	const bytesPerMiB = 1024 * 1024

	cases := []struct {
		name     string
		output   string
		libDirs  []string
		wantLen  int
		wantName string
		wantLib  string
		wantMiB  uint64
	}{
		{
			name: "NVIDIA CUDA",
			output: `load_backend: loaded CUDA backend from /lib/ollama/cuda_v12/libggml-cuda.so
Available devices:
NVIDIA GeForce RTX 4090: NVIDIA CUDA (24564 MiB, 23592 MiB free)
`,
			libDirs:  []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
			wantLen:  1,
			wantName: "NVIDIA GeForce RTX 4090",
			wantLib:  "CUDA",
			wantMiB:  24564,
		},
		{
			name: "Metal",
			output: `Available devices:
Metal: Apple M3 Max (98304 MiB, 98303 MiB free)
`,
			libDirs:  []string{"/lib/ollama"},
			wantLen:  1,
			wantName: "Metal",
			wantLib:  "Metal",
			wantMiB:  98304,
		},
		{
			name: "ROCm with gfx target",
			output: ` Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
Available devices:
ROCm0: AMD Radeon RX 6700 XT (12272 MiB, 12248 MiB free)
`,
			libDirs:  []string{"/lib/ollama", "/lib/ollama/rocm"},
			wantLen:  1,
			wantName: "ROCm0",
			wantLib:  "ROCm",
			wantMiB:  12272,
		},
		{
			// Only the device count is checked for this case.
			name: "multi GPU",
			output: `Available devices:
CUDA0: NVIDIA GeForce RTX 4090 (24564 MiB, 23592 MiB free)
CUDA1: NVIDIA GeForce RTX 3060 (12288 MiB, 11500 MiB free)
`,
			libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
			wantLen: 2,
		},
		{
			name:    "no devices",
			output:  "Available devices:\n",
			libDirs: []string{"/lib/ollama"},
			wantLen: 0,
		},
		{
			name:    "empty output",
			output:  "",
			libDirs: []string{"/lib/ollama"},
			wantLen: 0,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := parseLlamaServerDevices(tc.output, tc.libDirs)
			if n := len(got); n != tc.wantLen {
				t.Fatalf("got %d devices, want %d", n, tc.wantLen)
			}
			// Nothing further to inspect when no devices are expected.
			if tc.wantLen <= 0 {
				return
			}

			// Empty/zero expectations mean "not checked" for this case.
			if tc.wantName != "" && got[0].Name != tc.wantName {
				t.Errorf("name = %q, want %q", got[0].Name, tc.wantName)
			}
			if tc.wantLib != "" && got[0].Library != tc.wantLib {
				t.Errorf("library = %q, want %q", got[0].Library, tc.wantLib)
			}
			if tc.wantMiB == 0 {
				return
			}
			wantBytes := tc.wantMiB * bytesPerMiB
			if got[0].TotalMemory != wantBytes {
				t.Errorf("total memory = %d, want %d", got[0].TotalMemory, wantBytes)
			}
		})
	}
}
// TestCUDADeviceFilteredByArchs expects parseLlamaServerDevices to return no
// devices when the only CUDA device reports a compute capability that is
// missing from the ARCHS list on the system_info line.
func TestCUDADeviceFilteredByArchs(t *testing.T) {
	// The GTX 1060 reports compute capability 6.1 (610); the cuda_v13 ARCHS
	// list below does not contain 610.
	const serverOutput = `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
load_backend: loaded CUDA backend from /lib/ollama/cuda_v13/libggml-cuda.so
system_info: n_threads = 4 | CUDA : ARCHS = 750,800,860,890,900,1000,1030,1100,1200,1210 |
Available devices:
CUDA0: NVIDIA GeForce GTX 1060 6GB (6063 MiB, 5900 MiB free)
`
	libDirs := []string{"/lib/ollama", "/lib/ollama/cuda_v13"}

	got := parseLlamaServerDevices(serverOutput, libDirs)
	if n := len(got); n != 0 {
		t.Fatalf("expected 0 devices (GTX 1060 CC 610 not in ARCHS), got %d", n)
	}
}
// TestCUDADeviceKeptByArchs expects parseLlamaServerDevices to keep a CUDA
// device whose compute capability appears in the ARCHS list on the
// system_info line.
func TestCUDADeviceKeptByArchs(t *testing.T) {
	// The RTX 4060 Ti reports compute capability 8.9 (890), which is present
	// in the ARCHS list below.
	const serverOutput = `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 16379 MiB):
Device 0: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB
system_info: n_threads = 16 | CUDA : ARCHS = 750,800,860,890,900,1000,1030,1100,1200,1210 |
Available devices:
CUDA0: NVIDIA GeForce RTX 4060 Ti (16379 MiB, 14900 MiB free)
`
	libDirs := []string{"/lib/ollama"}

	got := parseLlamaServerDevices(serverOutput, libDirs)
	if n := len(got); n != 1 {
		t.Fatalf("expected 1 device (CC 890 in ARCHS), got %d", n)
	}
}
// TestCUDANoArchsFailOpen expects parseLlamaServerDevices to keep the device
// when the output carries no system_info/ARCHS line to filter against.
func TestCUDANoArchsFailOpen(t *testing.T) {
	// There is deliberately no system_info line here: with no ARCHS list the
	// device should still be reported (fail open).
	const serverOutput = `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
Available devices:
CUDA0: NVIDIA GeForce GTX 1060 6GB (6063 MiB, 5900 MiB free)
`
	libDirs := []string{"/lib/ollama"}

	got := parseLlamaServerDevices(serverOutput, libDirs)
	if n := len(got); n != 1 {
		t.Fatalf("expected 1 device (no ARCHS = fail open), got %d", n)
	}
}
// TestCUDANoCCFailOpen expects parseLlamaServerDevices to keep a device when
// an ARCHS list is present but no "Device N: ... compute capability" line
// was emitted for it.
func TestCUDANoCCFailOpen(t *testing.T) {
	// No per-device compute capability line precedes the listing, so the
	// device should still be reported (fail open).
	const serverOutput = `system_info: n_threads = 4 | CUDA : ARCHS = 750,800 |
Available devices:
CUDA0: Some Future GPU (8192 MiB, 8000 MiB free)
`
	libDirs := []string{"/lib/ollama"}

	got := parseLlamaServerDevices(serverOutput, libDirs)
	if n := len(got); n != 1 {
		t.Fatalf("expected 1 device (no CC = fail open), got %d", n)
	}
}
// TestCUDAMultiDeviceMixedFilter expects parseLlamaServerDevices to drop only
// the device whose compute capability is missing from ARCHS and to keep the
// other, identified here by its name "CUDA1".
func TestCUDAMultiDeviceMixedFilter(t *testing.T) {
	// Device 0 is CC 6.1 (610, not in ARCHS); device 1 is CC 8.9 (890, in ARCHS).
	const serverOutput = `ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce GTX 1060, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
Device 1: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB
system_info: n_threads = 8 | CUDA : ARCHS = 750,800,860,890 |
Available devices:
CUDA0: NVIDIA GeForce GTX 1060 (6063 MiB, 5900 MiB free)
CUDA1: NVIDIA GeForce RTX 4060 Ti (16379 MiB, 14900 MiB free)
`
	libDirs := []string{"/lib/ollama"}

	got := parseLlamaServerDevices(serverOutput, libDirs)
	if n := len(got); n != 1 {
		t.Fatalf("expected 1 device (only RTX 4060 Ti), got %d", n)
	}
	// The Fatalf above guarantees exactly one element, so index 0 is safe.
	if name := got[0].Name; name != "CUDA1" {
		t.Errorf("expected CUDA1, got %s", name)
	}
}
// TestROCmDeviceGFXTarget expects the gfx identifier printed on a ROCm
// "Device N:" line to be reported as the GFXTarget of the parsed device.
func TestROCmDeviceGFXTarget(t *testing.T) {
	const serverOutput = `ggml_cuda_init: found 1 ROCm devices (Total VRAM: 12272 MiB):
Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
Available devices:
ROCm0: AMD Radeon RX 6700 XT (12272 MiB, 12248 MiB free)
`
	libDirs := []string{"/lib/ollama"}

	got := parseLlamaServerDevices(serverOutput, libDirs)
	if n := len(got); n != 1 {
		t.Fatalf("expected 1 device, got %d", n)
	}
	if target := got[0].GFXTarget; target != "gfx1031" {
		t.Errorf("expected gfx1031, got %s", target)
	}
}
// TestROCmDeviceGFXTargetWithXnack expects the GFXTarget of each parsed ROCm
// device to be the bare gfx identifier, with any ":sramecc+:xnack-" style
// feature suffix from the "Device N:" line left out.
func TestROCmDeviceGFXTargetWithXnack(t *testing.T) {
	// Device 1 (e.g., Radeon Pro VII) reports gfx906 with a
	// :sramecc+:xnack- suffix; device 0 reports a plain gfx1030.
	const serverOutput = `ggml_cuda_init: found 2 ROCm devices (Total VRAM: 32736 MiB):
Device 0: AMD Radeon RX 6800, gfx1030 (0x1030), VMM: no, Wave Size: 32, VRAM: 16368 MiB
Device 1: AMD Radeon Pro VII, gfx906:sramecc+:xnack- (0x906), VMM: no, Wave Size: 64, VRAM: 16368 MiB
Available devices:
ROCm0: AMD Radeon RX 6800 (16368 MiB, 16342 MiB free)
ROCm1: AMD Radeon Pro VII (16368 MiB, 16348 MiB free)
`
	libDirs := []string{"/lib/ollama"}

	got := parseLlamaServerDevices(serverOutput, libDirs)
	if n := len(got); n != 2 {
		t.Fatalf("expected 2 devices, got %d", n)
	}
	if target := got[0].GFXTarget; target != "gfx1030" {
		t.Errorf("device 0: expected gfx1030, got %s", target)
	}
	if target := got[1].GFXTarget; target != "gfx906" {
		t.Errorf("device 1: expected gfx906, got %s", target)
	}
}
// TestInferLibrary checks the library string inferLibrary returns for a set
// of device name / description pairs, including a pair that matches no known
// backend.
func TestInferLibrary(t *testing.T) {
	cases := []struct {
		name string
		desc string
		want string
	}{
		{"NVIDIA CUDA", "NVIDIA GeForce RTX 4090", "CUDA"},
		{"CUDA0", "NVIDIA GeForce RTX 4090", "CUDA"},
		{"AMD ROCm", "AMD Radeon RX 6700 XT", "ROCm"},
		{"ROCm0", "AMD Radeon RX 6700 XT", "ROCm"},
		{"Metal", "Apple M3 Max", "Metal"},
		{"Vulkan0", "NVIDIA GeForce RTX 4090 (Vulkan)", "Vulkan"},
		// No recognised backend: the expected result equals the description.
		{"Unknown", "Unknown Backend", "Unknown Backend"},
	}

	for _, tc := range cases {
		if got := inferLibrary(tc.name, tc.desc); got != tc.want {
			t.Errorf("inferLibrary(%q, %q) = %q, want %q", tc.name, tc.desc, got, tc.want)
		}
	}
}
// TestCudaCCRegex checks that cudaCCRegex matches CUDA "Device N: ..." lines
// and that its first three capture groups hold the device index and the
// major and minor compute capability. Major and minor are combined as
// "<major><minor>0" (e.g. 8.9 -> "890") and compared against the expected
// string.
func TestCudaCCRegex(t *testing.T) {
	tests := []struct {
		line    string
		wantIdx int
		wantCC  string
	}{
		{" Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB", 0, "610"},
		{" Device 1: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB", 1, "890"},
		{" Device 0: NVIDIA RTX PRO 6000, compute capability 12.0, VMM: yes, VRAM: 97250 MiB", 0, "1200"},
		{" Device 0: Tesla V100-PCIE-16GB, compute capability 7.0, VMM: yes, VRAM: 16160 MiB", 0, "700"},
	}
	for _, tt := range tests {
		matches := cudaCCRegex.FindStringSubmatch(tt.line)
		if matches == nil {
			t.Errorf("expected match for %q", tt.line)
			continue
		}
		// Guard the indexing below so a regex with fewer capture groups
		// produces a test failure instead of an index-out-of-range panic.
		if len(matches) < 4 {
			t.Errorf("for %q: got %d submatches, want at least 4", tt.line, len(matches))
			continue
		}

		// Conversion errors are reported explicitly: a silently ignored
		// failure yields 0, which would wrongly satisfy the wantIdx == 0 cases.
		idx, err := strconv.Atoi(matches[1])
		if err != nil {
			t.Errorf("for %q: device index %q is not an integer: %v", tt.line, matches[1], err)
			continue
		}
		major, err := strconv.Atoi(matches[2])
		if err != nil {
			t.Errorf("for %q: CC major %q is not an integer: %v", tt.line, matches[2], err)
			continue
		}
		minor, err := strconv.Atoi(matches[3])
		if err != nil {
			t.Errorf("for %q: CC minor %q is not an integer: %v", tt.line, matches[3], err)
			continue
		}

		cc := fmt.Sprintf("%d%d0", major, minor)
		if idx != tt.wantIdx {
			t.Errorf("for %q: got idx %d, want %d", tt.line, idx, tt.wantIdx)
		}
		if cc != tt.wantCC {
			t.Errorf("for %q: got CC %s, want %s", tt.line, cc, tt.wantCC)
		}
	}
}
// TestCudaArchsRegex checks that cudaArchsRegex captures the comma-separated
// ARCHS list from a system_info line and does not match a line without one.
// The first capture group is split on "," and compared with the expected
// list, both in length and element by element.
func TestCudaArchsRegex(t *testing.T) {
	tests := []struct {
		line string
		want []string // nil means the line must not match at all
	}{
		{
			"system_info: n_threads = 16 | CUDA : ARCHS = 750,800,860,890 | USE_GRAPHS = 1 |",
			[]string{"750", "800", "860", "890"},
		},
		{
			"system_info: | CUDA : ARCHS = 500,520,600,610,700,750,800,860,890,900,1200 |",
			[]string{"500", "520", "600", "610", "700", "750", "800", "860", "890", "900", "1200"},
		},
		{
			"no archs here",
			nil,
		},
	}
	for _, tt := range tests {
		matches := cudaArchsRegex.FindStringSubmatch(tt.line)
		if tt.want == nil {
			if matches != nil {
				t.Errorf("expected no match for %q, got %v", tt.line, matches)
			}
			continue
		}
		if matches == nil {
			t.Errorf("expected match for %q, got nil", tt.line)
			continue
		}
		// Guard the indexing below so a regex without a capture group fails
		// the test instead of panicking.
		if len(matches) < 2 {
			t.Errorf("for %q: got %d submatches, want at least 2", tt.line, len(matches))
			continue
		}

		got := strings.Split(matches[1], ",")
		if len(got) != len(tt.want) {
			t.Errorf("for %q: got %v, want %v", tt.line, got, tt.want)
			continue
		}
		// Equal length alone does not prove the right values were captured,
		// so compare each arch in order.
		for i, want := range tt.want {
			if got[i] != want {
				t.Errorf("for %q: arch[%d] = %q, want %q", tt.line, i, got[i], want)
			}
		}
	}
}