From dde09129d143c99e3f78859d3829d215501aa584 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 7 Apr 2026 14:54:25 -0700 Subject: [PATCH] gemma4: Disable FA on older GPUs where it doesn't work (#15403) CUDA GPUs with compute capability older than 7.5 lack the support needed to enable flash attention for the model. --- llm/server.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llm/server.go b/llm/server.go index e8fa4cce0..a8104f79f 100644 --- a/llm/server.go +++ b/llm/server.go @@ -210,6 +210,18 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st fa = false } + // Gemma 4's 512-dim attention heads require MMA FA kernels (Turing+, compute >= 7.5). + // Older CUDA GPUs only have tile/vec FA kernels which abort on dk512 non-GQA attention. + if fa && f.KV().Architecture() == "gemma4" { + for _, gpu := range gpus { + if gpu.Library == "CUDA" && (gpu.ComputeMajor < 7 || (gpu.ComputeMajor == 7 && gpu.ComputeMinor < 5)) { + slog.Debug("disabling flash attention for gemma4 on pre-Turing GPU", "compute", fmt.Sprintf("%d.%d", gpu.ComputeMajor, gpu.ComputeMinor)) + fa = false + break + } + } + } + kvct := strings.ToLower(envconfig.KvCacheType()) if tok == nil {