Files
ollama/server/quantization.go
Daniel Hiltgen 56c735d871 runner: Remove CGO engines, use llama-server exclusively for GGML models
Remove the vendored GGML and llama.cpp backend, CGO runner, Go model
implementations, and sample.  llama-server (built from upstream llama.cpp via
FetchContent) is now the sole inference engine for GGUF-based models.
(Safetensor-based models continue to run on the new MLX engine.)  This allows
us to more rapidly pick up new capabilities and fixes from llama.cpp as they
come out.

On windows this now requires recent AMD driver versions to support ROCm v7 as
llama.cpp currently does not support building against v6.
2026-04-20 08:44:02 -07:00

170 lines
4.9 KiB
Go

package server
import (
"bufio"
"fmt"
"log/slog"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
fsggml "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml"
)
// findLlamaQuantize locates the llama-quantize binary (installed alongside
// llama-server). Candidate directories are probed in priority order:
// the installed library path, install-relative and cwd-relative build output,
// then cmake dev-build directories. Returns the first candidate that exists,
// or an error listing every path checked.
func findLlamaQuantize() (string, error) {
	binName := "llama-quantize"
	if runtime.GOOS == "windows" {
		binName += ".exe"
	}

	seen := map[string]bool{}
	var candidates []string
	// add records dir/binName as a candidate, skipping duplicates while
	// preserving first-seen priority order.
	add := func(dir string) {
		path := filepath.Join(dir, binName)
		if !seen[path] {
			seen[path] = true
			candidates = append(candidates, path)
		}
	}
	// addGlob adds cmake build output dirs under base (dev builds, before install).
	addGlob := func(base string) {
		matches, _ := filepath.Glob(filepath.Join(base, "build", "llama-server-*", "bin"))
		for _, m := range matches {
			add(m)
		}
	}

	// Resolve the executable's directory once (symlink-resolved) instead of
	// repeating os.Executable + EvalSymlinks per use site.
	var exeDir string
	if exe, err := os.Executable(); err == nil {
		if eval, err := filepath.EvalSymlinks(exe); err == nil {
			exe = eval
		}
		exeDir = filepath.Dir(exe)
	}
	cwd, cwdErr := os.Getwd()

	add(ml.LibOllamaPath)
	if exeDir != "" {
		add(filepath.Join(exeDir, "build", "lib", "ollama"))
	}
	if cwdErr == nil {
		add(filepath.Join(cwd, "build", "lib", "ollama"))
	}
	if exeDir != "" {
		addGlob(exeDir)
	}
	if cwdErr == nil {
		addGlob(cwd)
	}

	for _, path := range candidates {
		if _, err := os.Stat(path); err == nil {
			return path, nil
		}
	}
	return "", fmt.Errorf("llama-quantize binary not found (checked: %s)", strings.Join(candidates, ", "))
}
// progressRegex matches llama-quantize per-tensor progress lines of the form
// "[ 42/ 200]" on stdout; capture group 1 is the current tensor index and
// group 2 the total tensor count. Compiled once at package scope.
var progressRegex = regexp.MustCompile(`\[\s*(\d+)/\s*(\d+)\]`)
// quantize re-quantizes a GGUF model by shelling out to llama-quantize.
// The upstream llama-quantize handles all quantization types and per-tensor
// type selection (mixed quantization for quality).
//
// in and out are the source and destination GGUF files, orig is the parsed
// metadata of the source model (used for architecture-specific tensor
// overrides and for sizing progress), and progressFn, if non-nil, receives
// incremental byte deltas as tensors are quantized. Returns an error if the
// binary cannot be found, started, or exits non-zero.
func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error {
	quantizeExe, err := findLlamaQuantize()
	if err != nil {
		return fmt.Errorf("quantization unavailable: %w", err)
	}

	// Map our FileType to the llama-quantize type name
	typeName := newFileType.String()
	if typeName == "" {
		return fmt.Errorf("unsupported quantization type: %v", newFileType)
	}
	slog.Info("quantizing model", "type", typeName, "input", in.Name(), "output", out.Name())

	args := []string{"--allow-requantize"}
	arch := orig.KV().Architecture()
	// gemma3n's per_layer_token_embd is read on every layer for every token
	// (not just once at input like token_embd), so it's far more quality-sensitive
	// than a normal token embedding. Keep it at F16 on K-quants via an anchored
	// regex so we don't also bump token_embd (which --token-embedding-type would).
	if arch == "gemma3n" {
		switch newFileType {
		case fsggml.FileTypeQ4_K_S, fsggml.FileTypeQ4_K_M:
			args = append(args, "--tensor-type", `^per_layer_token_embd\.weight$=f16`)
		}
	}
	// deepseek2 MLA tensors (attn_k_b / attn_q_a / attn_q_b / attn_v_b /
	// attn_kv_a_mqa) are small, critical matrices in DeepSeek-V2-style multi-head
	// latent attention. Upstream llama-quant.cpp has no special case for these
	// names at b8680 — they fall through to the default Q4_K / Q5_0 for Q4_K_M.
	// Published library/glm-4.7-flash quantizes them at Q8_0 for quality. Force
	// the same on K-quants via --tensor-type regex so we match.
	if arch == "deepseek2" {
		switch newFileType {
		case fsggml.FileTypeQ4_K_S, fsggml.FileTypeQ4_K_M:
			args = append(args,
				"--tensor-type", `attn_k_b\.weight$=q8_0`,
				"--tensor-type", `attn_q_a\.weight$=q8_0`,
				"--tensor-type", `attn_q_b\.weight$=q8_0`,
				"--tensor-type", `attn_v_b\.weight$=q8_0`,
				"--tensor-type", `attn_kv_a_mqa\.weight$=q8_0`,
			)
		}
	}
	args = append(args, in.Name(), out.Name(), typeName)

	cmd := exec.Command(quantizeExe, args...)
	cmd.Env = os.Environ()
	// Parse progress from stdout; pass stderr straight through for diagnostics.
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return fmt.Errorf("failed to create stdout pipe: %w", err)
	}
	cmd.Stderr = os.Stderr
	if err := cmd.Start(); err != nil {
		return fmt.Errorf("failed to start llama-quantize: %w", err)
	}

	// Track total tensor size so tensor-count progress can be scaled to bytes.
	totalSize := uint64(0)
	for _, t := range orig.Tensors().Items() {
		totalSize += t.Size()
	}

	var lastReported uint64
	scanner := bufio.NewScanner(stdout)
	for scanner.Scan() {
		line := scanner.Text()
		if matches := progressRegex.FindStringSubmatch(line); len(matches) == 3 {
			current, _ := strconv.ParseUint(matches[1], 10, 64)
			total, _ := strconv.ParseUint(matches[2], 10, 64)
			if total > 0 && progressFn != nil {
				// progressFn expects incremental byte deltas; scale tensor
				// counts to bytes against the known total size.
				done := totalSize * current / total
				if done > lastReported {
					progressFn(done - lastReported)
					lastReported = done
				}
			}
		}
	}
	// A scanner error means we may have missed progress lines; the process
	// result from Wait below is still authoritative, so only warn here.
	if err := scanner.Err(); err != nil {
		slog.Warn("error reading llama-quantize output", "error", err)
	}

	if err := cmd.Wait(); err != nil {
		return fmt.Errorf("llama-quantize failed: %w", err)
	}
	// On success, flush any remaining bytes so progress always reaches 100%,
	// even if the final progress line was missed or rounding left a gap.
	if progressFn != nil && lastReported < totalSize {
		progressFn(totalSize - lastReported)
	}
	return nil
}