Files
ollama/server/quantization.go
Daniel Hiltgen 56c735d871 runner: Remove CGO engines, use llama-server exclusively for GGML models
Remove the vendored GGML and llama.cpp backend, CGO runner, Go model
implementations, and sample.  llama-server (built from upstream llama.cpp via
FetchContent) is now the sole inference engine for GGUF-based models.
(Safetensor-based models continue to run on the new MLX engine.)  This allows
us to more rapidly pick up new capabilities and fixes from llama.cpp as they
come out.

On windows this now requires recent AMD driver versions to support ROCm v7 as
llama.cpp currently does not support building against v6.
2026-04-20 08:44:02 -07:00

170 lines
4.9 KiB
Go

package server
import (
"bufio"
"fmt"
"log/slog"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
fsggml "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml"
)
// findLlamaQuantize locates the llama-quantize binary (installed alongside
// llama-server). Candidate directories are probed in priority order:
// the installed library path, install-relative and cwd-relative build output,
// then cmake dev-build directories. Returns the first candidate that exists,
// or an error listing every path checked.
func findLlamaQuantize() (string, error) {
	binName := "llama-quantize"
	if runtime.GOOS == "windows" {
		binName += ".exe"
	}

	seen := map[string]bool{}
	var candidates []string
	// add records dir/binName as a candidate, skipping duplicates while
	// preserving first-seen priority order.
	add := func(dir string) {
		path := filepath.Join(dir, binName)
		if !seen[path] {
			seen[path] = true
			candidates = append(candidates, path)
		}
	}
	// addGlob adds cmake build output dirs under base (dev builds, before install).
	addGlob := func(base string) {
		matches, _ := filepath.Glob(filepath.Join(base, "build", "llama-server-*", "bin"))
		for _, m := range matches {
			add(m)
		}
	}

	// Resolve the executable's directory once (symlink-resolved) instead of
	// repeating os.Executable + EvalSymlinks per use site.
	var exeDir string
	if exe, err := os.Executable(); err == nil {
		if eval, err := filepath.EvalSymlinks(exe); err == nil {
			exe = eval
		}
		exeDir = filepath.Dir(exe)
	}
	cwd, cwdErr := os.Getwd()

	add(ml.LibOllamaPath)
	if exeDir != "" {
		add(filepath.Join(exeDir, "build", "lib", "ollama"))
	}
	if cwdErr == nil {
		add(filepath.Join(cwd, "build", "lib", "ollama"))
	}
	if exeDir != "" {
		addGlob(exeDir)
	}
	if cwdErr == nil {
		addGlob(cwd)
	}

	for _, path := range candidates {
		if _, err := os.Stat(path); err == nil {
			return path, nil
		}
	}
	return "", fmt.Errorf("llama-quantize binary not found (checked: %s)", strings.Join(candidates, ", "))
}
// progressRegex matches llama-quantize per-tensor progress lines of the form
// "[ 42/ 200]" on stdout; capture group 1 is the current tensor index and
// group 2 the total tensor count. Compiled once at package scope.
var progressRegex = regexp.MustCompile(`\[\s*(\d+)/\s*(\d+)\]`)
// quantize re-quantizes a GGUF model by shelling out to llama-quantize.
// The upstream llama-quantize handles all quantization types and per-tensor
// type selection (mixed quantization for quality).
//
// in and out are the source and destination GGUF files, orig is the parsed
// metadata of the source model (used for architecture-specific tensor
// overrides and for sizing progress), and progressFn, if non-nil, receives
// incremental byte deltas as tensors are quantized. Returns an error if the
// binary cannot be found, started, or exits non-zero.
func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error {
	quantizeExe, err := findLlamaQuantize()
	if err != nil {
		return fmt.Errorf("quantization unavailable: %w", err)
	}

	// Map our FileType to the llama-quantize type name
	typeName := newFileType.String()
	if typeName == "" {
		return fmt.Errorf("unsupported quantization type: %v", newFileType)
	}
	slog.Info("quantizing model", "type", typeName, "input", in.Name(), "output", out.Name())

	args := []string{"--allow-requantize"}
	arch := orig.KV().Architecture()
	// gemma3n's per_layer_token_embd is read on every layer for every token
	// (not just once at input like token_embd), so it's far more quality-sensitive
	// than a normal token embedding. Keep it at F16 on K-quants via an anchored
	// regex so we don't also bump token_embd (which --token-embedding-type would).
	if arch == "gemma3n" {
		switch newFileType {
		case fsggml.FileTypeQ4_K_S, fsggml.FileTypeQ4_K_M:
			args = append(args, "--tensor-type", `^per_layer_token_embd\.weight$=f16`)
		}
	}
	// deepseek2 MLA tensors (attn_k_b / attn_q_a / attn_q_b / attn_v_b /
	// attn_kv_a_mqa) are small, critical matrices in DeepSeek-V2-style multi-head
	// latent attention. Upstream llama-quant.cpp has no special case for these
	// names at b8680 — they fall through to the default Q4_K / Q5_0 for Q4_K_M.
	// Published library/glm-4.7-flash quantizes them at Q8_0 for quality. Force
	// the same on K-quants via --tensor-type regex so we match.
	if arch == "deepseek2" {
		switch newFileType {
		case fsggml.FileTypeQ4_K_S, fsggml.FileTypeQ4_K_M:
			args = append(args,
				"--tensor-type", `attn_k_b\.weight$=q8_0`,
				"--tensor-type", `attn_q_a\.weight$=q8_0`,
				"--tensor-type", `attn_q_b\.weight$=q8_0`,
				"--tensor-type", `attn_v_b\.weight$=q8_0`,
				"--tensor-type", `attn_kv_a_mqa\.weight$=q8_0`,
			)
		}
	}
	args = append(args, in.Name(), out.Name(), typeName)

	cmd := exec.Command(quantizeExe, args...)
	cmd.Env = os.Environ()
	// Parse progress from stdout; pass stderr straight through for diagnostics.
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return fmt.Errorf("failed to create stdout pipe: %w", err)
	}
	cmd.Stderr = os.Stderr
	if err := cmd.Start(); err != nil {
		return fmt.Errorf("failed to start llama-quantize: %w", err)
	}

	// Track total tensor size so tensor-count progress can be scaled to bytes.
	totalSize := uint64(0)
	for _, t := range orig.Tensors().Items() {
		totalSize += t.Size()
	}

	var lastReported uint64
	scanner := bufio.NewScanner(stdout)
	for scanner.Scan() {
		line := scanner.Text()
		if matches := progressRegex.FindStringSubmatch(line); len(matches) == 3 {
			current, _ := strconv.ParseUint(matches[1], 10, 64)
			total, _ := strconv.ParseUint(matches[2], 10, 64)
			if total > 0 && progressFn != nil {
				// progressFn expects incremental byte deltas; scale tensor
				// counts to bytes against the known total size.
				done := totalSize * current / total
				if done > lastReported {
					progressFn(done - lastReported)
					lastReported = done
				}
			}
		}
	}
	// A scanner error means we may have missed progress lines; the process
	// result from Wait below is still authoritative, so only warn here.
	if err := scanner.Err(); err != nil {
		slog.Warn("error reading llama-quantize output", "error", err)
	}

	if err := cmd.Wait(); err != nil {
		return fmt.Errorf("llama-quantize failed: %w", err)
	}
	// On success, flush any remaining bytes so progress always reaches 100%,
	// even if the final progress line was missed or rounding left a gap.
	if progressFn != nil && lastReported < totalSize {
		progressFn(totalSize - lastReported)
	}
	return nil
}