Compare commits

...

6 Commits

Author SHA1 Message Date
Bruce MacDonald
057cc54b66 benchmark: compare backend graph computation times
Track execution time of individual tensor operations (views, copies, reshapes, etc.)
during LLM forward passes using CGo bindings to the native graph runtime. This
helps identify performance bottlenecks in the computation graph and optimize memory
operations that can significantly impact inference latency.
2025-02-19 15:22:53 -08:00
Michael Yang
1e438b237c Merge pull request #9203 from ollama/mxyng/sapphirerapids
build: remove backend build for sapphirerapids
2025-02-19 21:42:00 +00:00
yuiseki
d721a02e7d test: add test cases for ListHandler (#9146) 2025-02-19 13:24:27 -08:00
zyxucp
778603a818 docs: Add AntSK to Community Integrations (#9214) 2025-02-19 13:22:48 -08:00
maninhill
3c874df46e docs: Add MaxKB to Community Integrations (#9212) 2025-02-19 13:20:09 -08:00
Michael Yang
5f8c03189e build: remove backend build for sapphirerapids
sapphire rapids has amx support but it ends up having a negative
performance impact.

emerald rapids also has amx support with a positive performance impact;
however, there's no reasonable way in ggml to differentiate between the
two. the impact is small (~6%), so disable amx entirely for simplicity.
2025-02-18 14:47:58 -08:00
9 changed files with 342 additions and 5 deletions

View File

@@ -382,6 +382,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
- [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)
- [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot)
- [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot)
### Cloud

View File

@@ -0,0 +1,86 @@
package backend

import (
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"testing"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
	"github.com/ollama/ollama/server"

	_ "github.com/ollama/ollama/model/models/llama"
)

var modelName = flag.String("m", "", "Name of the model to benchmark")

// suppressOutput silences stdout/stderr for the duration of the benchmark and
// returns a cleanup function that restores them.
func suppressOutput() (cleanup func()) {
	oldStdout, oldStderr := os.Stdout, os.Stderr
	os.Stdout, os.Stderr = nil, nil
	log.SetOutput(io.Discard)

	return func() {
		os.Stdout, os.Stderr = oldStdout, oldStderr
		log.SetOutput(os.Stderr)
	}
}

func setupModel(b *testing.B) model.Model {
	if *modelName == "" {
		b.Fatal("Error: -m flag is required for benchmark tests")
	}

	sm, err := server.GetModel(*modelName)
	if err != nil {
		b.Fatal(err)
	}

	m, err := model.New(sm.ModelPath)
	if err != nil {
		b.Fatal(err)
	}

	m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)
	return m
}

func BenchmarkGGMLOperations(b *testing.B) {
	// loading the GGML back-end logs to stdout, which makes the bench output messy
	cleanup := suppressOutput()
	defer cleanup()

	b.Setenv("OLLAMA_BENCHMARK", "1")
	b.Setenv("OLLAMA_BACKEND", "ggml")

	m := setupModel(b)

	// Sample input data
	inputIDs := []int32{1, 2, 3, 4, 5}
	options := model.Options{
		Inputs:    inputIDs,
		Positions: []int32{1, 2, 3, 4, 5},
		Sequences: []int{1, 1, 1, 1, 1},
		Outputs:   []int32{int32(len(inputIDs) - 1)},
	}

	b.ResetTimer()

	for range b.N {
		ctx := m.Backend().NewContext()

		modelOutput, err := model.Forward(ctx, m, options)
		if err != nil {
			b.Fatal(fmt.Errorf("forward pass failed: %v", err))
		}

		ctx.Compute(modelOutput)

		for _, op := range ctx.Timing() {
			b.ReportMetric(op.Duration, fmt.Sprintf("%s_ms", op.Type))
		}

		// close the context at the end of each iteration rather than deferring:
		// a defer inside the loop would hold every context open until the
		// benchmark function returns
		ctx.Close()
	}
}
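A usage sketch for the new benchmark: the package path and model name below are illustrative, and it assumes the model has already been pulled locally so server.GetModel can resolve it.

go test ./benchmark -bench BenchmarkGGMLOperations -m llama3.2

Note that testing.B.ReportMetric keeps only the last value reported for a given unit, so each per-operation metric (View_ms, Copy_ms, Compute_ms, and so on) reflects the most recent timing for that operation type rather than an average across iterations.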

View File

@@ -10,6 +10,7 @@ import (
	"os"
	"strings"
	"testing"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/spf13/cobra"
@@ -490,6 +491,96 @@ func TestPushHandler(t *testing.T) {
	}
}
func TestListHandler(t *testing.T) {
	tests := []struct {
		name           string
		args           []string
		serverResponse []api.ListModelResponse
		expectedError  string
		expectedOutput string
	}{
		{
			name: "list all models",
			args: []string{},
			serverResponse: []api.ListModelResponse{
				{Name: "model1", Digest: "sha256:abc123", Size: 1024, ModifiedAt: time.Now().Add(-24 * time.Hour)},
				{Name: "model2", Digest: "sha256:def456", Size: 2048, ModifiedAt: time.Now().Add(-48 * time.Hour)},
			},
			expectedOutput: "NAME ID SIZE MODIFIED \n" +
				"model1 sha256:abc12 1.0 KB 24 hours ago \n" +
				"model2 sha256:def45 2.0 KB 2 days ago \n",
		},
		{
			name: "filter models by prefix",
			args: []string{"model1"},
			serverResponse: []api.ListModelResponse{
				{Name: "model1", Digest: "sha256:abc123", Size: 1024, ModifiedAt: time.Now().Add(-24 * time.Hour)},
				{Name: "model2", Digest: "sha256:def456", Size: 2048, ModifiedAt: time.Now().Add(-24 * time.Hour)},
			},
			expectedOutput: "NAME ID SIZE MODIFIED \n" +
				"model1 sha256:abc12 1.0 KB 24 hours ago \n",
		},
		{
			name:          "server error",
			args:          []string{},
			expectedError: "server error",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
				if r.URL.Path != "/api/tags" || r.Method != http.MethodGet {
					t.Errorf("unexpected request to %s %s", r.Method, r.URL.Path)
					http.Error(w, "not found", http.StatusNotFound)
					return
				}

				if tt.expectedError != "" {
					http.Error(w, tt.expectedError, http.StatusInternalServerError)
					return
				}

				response := api.ListResponse{Models: tt.serverResponse}
				if err := json.NewEncoder(w).Encode(response); err != nil {
					t.Fatal(err)
				}
			}))
			defer mockServer.Close()

			t.Setenv("OLLAMA_HOST", mockServer.URL)

			cmd := &cobra.Command{}
			cmd.SetContext(context.TODO())

			// Capture stdout
			oldStdout := os.Stdout
			r, w, _ := os.Pipe()
			os.Stdout = w

			err := ListHandler(cmd, tt.args)

			// Restore stdout and get output
			w.Close()
			os.Stdout = oldStdout
			output, _ := io.ReadAll(r)

			if tt.expectedError == "" {
				if err != nil {
					t.Errorf("expected no error, got %v", err)
				}
				if got := string(output); got != tt.expectedOutput {
					t.Errorf("expected output:\n%s\ngot:\n%s", tt.expectedOutput, got)
				}
			} else {
				if err == nil || !strings.Contains(err.Error(), tt.expectedError) {
					t.Errorf("expected error containing %q, got %v", tt.expectedError, err)
				}
			}
		})
	}
}
func TestCreateHandler(t *testing.T) {
	tests := []struct {
		name string

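To exercise just the new handler test, something along these lines should work, assuming the test file sits in the cmd package:

go test ./cmd -run TestListHandler -v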
View File

@@ -167,6 +167,8 @@ var (
	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
	// Enable the new Ollama engine
	NewEngine = Bool("OLLAMA_NEW_ENGINE")
	// Ollama is running in a benchmark context; additional timing data will be collected.
	Benchmark = Bool("OLLAMA_BENCHMARK")
)

func String(s string) func() string {

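A minimal sketch of how the new setting is meant to be consumed; it mirrors the Compute change further down, and installTimingCallback is a hypothetical stand-in for the cgo wiring there, not a function in this change:

if envconfig.Benchmark() {
	// OLLAMA_BENCHMARK is set: install the eval callback and collect per-op timings
	installTimingCallback() // hypothetical helper; see the ggml Compute diff below
}

Gating on the environment variable keeps the instrumentation entirely out of normal serving paths.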
View File

@@ -352,6 +352,10 @@ func (c *testContext) MaxTensors() int {
	return 10
}

func (c *testContext) Timing() []ml.OpTiming {
	return []ml.OpTiming{}
}

func (c *testContext) Close() {}

type testTensor struct {

View File

@@ -0,0 +1,24 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 18 Feb 2025 14:47:21 -0800
Subject: [PATCH] remove amx
---
ggml/src/CMakeLists.txt | 4 ----
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 72b488dd..50828717 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
     ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
     ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
     ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
-    if (NOT MSVC)
-        # MSVC doesn't support AMX
-        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
-    endif()
 else ()
     ggml_add_cpu_backend_variant_impl("")
 endif()

View File

@@ -2,6 +2,7 @@ package ml
import (
	"bytes"
	"cmp"
	"encoding/binary"
	"fmt"
	"os"
@@ -37,7 +38,7 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
}

func NewBackend(f *os.File) (Backend, error) {
-	if backend, ok := backends["ggml"]; ok {
+	if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok {
		return backend(f)
	}
@@ -53,6 +54,30 @@ type Context interface {
	Compute(...Tensor)
	MaxTensors() int
	Close()
	Timing() []OpTiming
}

// OpType is the type of operation performed during a forward pass.
type OpType string

const (
	View       OpType = "View"
	Copy       OpType = "Copy"
	Reshape    OpType = "Reshape"
	Permute    OpType = "Permute"
	Contiguous OpType = "Contiguous"
	Input      OpType = "Input"
	ComputeOp  OpType = "Compute"
	Transpose  OpType = "Transpose"
)

// OpTiming stores the timing information for a single operation.
type OpTiming struct {
	Type      OpType
	Operation string
	Duration  float64
	Order     int
}

type Tensor interface {

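A small consumer sketch for the new Timing API, assuming a ml.Context on which Compute has already run; the helper below is illustrative, not part of this change, and assumes imports of fmt and github.com/ollama/ollama/ml:

// summarizeTimings aggregates per-operation durations by type after Compute has run.
func summarizeTimings(ctx ml.Context) {
	totals := make(map[ml.OpType]float64)
	for _, op := range ctx.Timing() {
		totals[op.Type] += op.Duration // Duration is reported in milliseconds
	}
	for opType, ms := range totals {
		fmt.Printf("%-12s %10.3f ms\n", opType, ms)
	}
}

Because each OpTiming also records Order, the exact operation sequence of a forward pass can be reconstructed as well, not just per-type totals.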
View File

@@ -4,6 +4,8 @@ package ggml
#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <string.h>
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h"
@@ -21,6 +23,54 @@ COMPILER inline get_compiler() {
#endif
}
// Define a fixed-size struct to store timing data
#define MAX_TENSOR_NAME 256
#define MAX_TIMINGS 1000

typedef struct {
	char tensor_name[MAX_TENSOR_NAME];
	double duration_ms;
} timing_entry;

typedef struct {
	timing_entry entries[MAX_TIMINGS];
	int count;
} timing_data;

// Global timing data structure
timing_data g_timings = {0};

double get_time_ms() {
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
}
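// Note on the callback protocol below: the scheduler invokes it twice per
// node, first with ask=true before the node is evaluated (asking whether it
// should be observed) and again with ask=false once it has run, so the
// interval between the two calls approximates the node's execution time.
// The static locals assume the two calls for a node are never interleaved
// with another node's.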
bool debug_callback(struct ggml_tensor * t, bool ask, void * user_data) {
	static double start_time;
	static char current_tensor[MAX_TENSOR_NAME];

	if (ask) {
		start_time = get_time_ms();
		strncpy(current_tensor, t->name, MAX_TENSOR_NAME - 1);
		current_tensor[MAX_TENSOR_NAME - 1] = '\0';
	} else {
		double end_time = get_time_ms();
		double duration = end_time - start_time;

		if (g_timings.count < MAX_TIMINGS) {
			strncpy(g_timings.entries[g_timings.count].tensor_name, current_tensor, MAX_TENSOR_NAME - 1);
			g_timings.entries[g_timings.count].duration_ms = duration;
			g_timings.count++;
		}
	}
	return true;
}

void clear_timings() {
	g_timings.count = 0;
}

*/
import "C"
@@ -29,9 +79,11 @@ import (
	"io"
	"log/slog"
	"os"
	"strings"
	"sync"
	"unsafe"

	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	fs "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml"
@@ -256,7 +308,62 @@ func (c *Context) Forward(t ml.Tensor) {
	C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
}
// Timing retrieves the collected timing data
func (c *Context) Timing() []ml.OpTiming {
	sequence := make([]ml.OpTiming, C.g_timings.count)

	for i := range int(C.g_timings.count) {
		entry := C.g_timings.entries[i]
		tensorName := C.GoString(&entry.tensor_name[0])

		// Determine operation type and description based on tensor name
		var opType ml.OpType
		var opDesc string
		switch {
		case strings.Contains(tensorName, "(view)"):
			opType, opDesc = ml.View, "Memory view"
		case strings.Contains(tensorName, "(copy)") || strings.Contains(tensorName, "(copy of"):
			opType, opDesc = ml.Copy, "Memory copy"
		case strings.Contains(tensorName, "(reshaped)"):
			opType, opDesc = ml.Reshape, "Reshape"
		case strings.Contains(tensorName, "(permuted)"):
			opType, opDesc = ml.Permute, "Permute dimensions"
		case strings.Contains(tensorName, "(cont)"):
			opType, opDesc = ml.Contiguous, "Make contiguous"
		case strings.Contains(tensorName, "(transposed)"):
			opType, opDesc = ml.Transpose, "Transpose"
		case strings.HasPrefix(tensorName, "leaf_"):
			opType, opDesc = ml.Input, fmt.Sprintf("Input tensor %s", tensorName)
		case strings.HasPrefix(tensorName, "node_"):
			opType, opDesc = ml.ComputeOp, fmt.Sprintf("Computation %s", tensorName)
		default:
			opType, opDesc = "Unknown", tensorName
		}

		sequence[i] = ml.OpTiming{
			Type:      opType,
			Operation: opDesc,
			Duration:  float64(entry.duration_ms),
			Order:     i,
		}
	}

	return sequence
}
func (c *Context) Compute(tensors ...ml.Tensor) {
	if envconfig.Benchmark() {
		// Clear previous timings before new computation
		C.clear_timings()

		C.ggml_backend_sched_set_eval_callback(
			c.sched,
			C.ggml_backend_eval_callback(C.debug_callback),
			nil,
		)
	}

	C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)

	needSync := true

View File

@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
    ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
    ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
    ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
-   if (NOT MSVC)
-       # MSVC doesn't support AMX
-       ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
-   endif()
else ()
    ggml_add_cpu_backend_variant_impl("")
endif()