ollama/llama/patches/0035-CUDA-get_rows-q6_k-support.patch

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Fri, 20 Mar 2026 18:50:38 -0700
Subject: [PATCH] CUDA get_rows q6_k support

---
 ggml/src/ggml-cuda/getrows.cu   | 80 ++++++++++++++++++++++++++++++++-
 ggml/src/ggml-cuda/ggml-cuda.cu |  1 +
 2 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index 2fab33243..dc5c4f57a 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -155,6 +155,81 @@ static void get_rows_cuda_float(
         s10, s11, s12/*, s13*/);
 }

+// Specialized GET_ROWS kernel for Q6_K: the k_get_rows template doesn't work for K-quants because
+// they lack the simple dequantize_kernel_t (float2) interface. Based on dequantize_block_q6_K from
+// convert.cu with row selection added. Requires blockDim.x == 64 (each thread writes 4 values).
+// Grid: x = row index into src1; y strides over the Q6_K blocks of a row; z strides over the
+// flattened (i11, i12) index. s1..s3 / s10..s12 are element strides, nb01..nb03 byte strides.
+template<typename dst_t>
+static __global__ void k_get_rows_q6_K(
+        const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
+        const int64_t ne00,
+        const int64_t ne11, const int64_t ne12,
+        const size_t s1, const size_t s2, const size_t s3,
+        const size_t nb01, const size_t nb02, const size_t nb03,
+        const size_t s10, const size_t s11, const size_t s12) {
+
+    const int64_t i10 = blockIdx.x;   // row index into src1
+    const int64_t nb  = ne00 / QK_K;  // number of Q6_K blocks per row
+
+    // Position of this thread inside a Q6_K block; loop-invariant, so computed once (assumes 64 threads).
+    const int64_t tid = threadIdx.x;
+    const int64_t ip  = tid / 32;      // 0 or 1: selects the 128-value half of the block
+    const int64_t il  = tid - 32*ip;   // 0..31
+    const int64_t is  = 8*ip + il/16;  // index of the first scale used by this thread
+
+    // gridDim.z may be smaller than ne11*ne12 (the launcher clamps it), so stride over z.
+    for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
+        const int64_t i11 = z / ne12;
+        const int64_t i12 = z % ne12;
+        const int     i01 = src1[i10*s10 + i11*s11 + i12*s12];  // selected src0 row
+
+        dst_t      * dst_row  = dst + i10*s1 + i11*s2 + i12*s3;
+        const char * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
+
+        for (int64_t iblk = blockIdx.y; iblk < nb; iblk += gridDim.y) {
+            const block_q6_K * x = (const block_q6_K *)src0_row + iblk;
+            const int64_t y_offset = iblk * QK_K + 128*ip + il;
+            const float d = x->d;
+            const uint8_t * ql = x->ql + 64*ip + il;
+            const uint8_t   qh = x->qh[32*ip + il];
+            const int8_t  * sc = x->scales + is;
+
+            if (y_offset + 0  < ne00) dst_row[y_offset +  0] = ggml_cuda_cast<dst_t>(d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32));
+            if (y_offset + 32 < ne00) dst_row[y_offset + 32] = ggml_cuda_cast<dst_t>(d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32));
+            if (y_offset + 64 < ne00) dst_row[y_offset + 64] = ggml_cuda_cast<dst_t>(d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32));
+            if (y_offset + 96 < ne00) dst_row[y_offset + 96] = ggml_cuda_cast<dst_t>(d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32));
+        }
+    }
+}
+
+// Launches k_get_rows_q6_K on `stream`. The nb* arguments are byte strides: those of dst and src1
+// are converted to element strides (dst_t / int32_t) here, those of src0 are passed on in bytes.
+template<typename dst_t>
+static void get_rows_cuda_q6_K(
+        const void * src0_d, const int32_t * src1_d, dst_t * dst_d,
+        const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {
+    GGML_ASSERT(ne00 % QK_K == 0); // ne00 / QK_K truncates: a partial block would be silently skipped
+    const int64_t nb_blocks = ne00 / QK_K;
+    if (nb_blocks == 0 || ne10 == 0 || ne11*ne12 == 0) {
+        return; // nothing to gather, and a grid dimension of 0 is an invalid launch configuration
+    }
+
+    const dim3 block_dims(64, 1, 1); // the kernel's indexing is written for exactly 64 threads
+    const dim3 block_nums(ne10, MIN(nb_blocks, (int64_t)UINT16_MAX), MIN(ne11*ne12, (int64_t)UINT16_MAX));
+    const size_t s1 = nb1 / sizeof(dst_t);
+    const size_t s2 = nb2 / sizeof(dst_t);
+    const size_t s3 = nb3 / sizeof(dst_t);
+    const size_t s10 = nb10 / sizeof(int32_t);
+    const size_t s11 = nb11 / sizeof(int32_t);
+    const size_t s12 = nb12 / sizeof(int32_t);
+    k_get_rows_q6_K<<<block_nums, block_dims, 0, stream>>>(
+        src0_d, src1_d, dst_d, ne00, ne11, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12);
+}
+
 template <typename dst_t>
 static void ggml_cuda_get_rows_switch_src0_type(
         const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d,
@@ -199,8 +274,11 @@ static void ggml_cuda_get_rows_switch_src0_type(
             get_rows_cuda_q<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_d, dst_d,
                 ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
+        case GGML_TYPE_Q6_K:
+            get_rows_cuda_q6_K(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
         default:
-            // TODO: k-quants
             GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type));
             break;
     }
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 5c9dfd032..b8ed3709b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4693,6 +4693,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                     case GGML_TYPE_Q5_0:
                     case GGML_TYPE_Q5_1:
                     case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_Q6_K:
                         return true;
                     default:
                         return false;