mirror of
https://github.com/ollama/ollama.git
synced 2026-04-23 01:05:47 +02:00
Remove the vendored GGML and llama.cpp backend, CGO runner, Go model implementations, and sample. llama-server (built from upstream llama.cpp via FetchContent) is now the sole inference engine for GGUF-based models. (Safetensors-based models continue to run on the new MLX engine.) This allows us to pick up new capabilities and fixes from llama.cpp more rapidly as they come out. On Windows, this now requires a recent AMD driver version that supports ROCm v7, because llama.cpp currently does not support building against v6.
170 lines
4.9 KiB
Go
170 lines
4.9 KiB
Go
package server
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"regexp"
|
|
"runtime"
|
|
"strconv"
|
|
"strings"
|
|
|
|
fsggml "github.com/ollama/ollama/fs/ggml"
|
|
"github.com/ollama/ollama/ml"
|
|
)
|
|
|
|
// findLlamaQuantize locates the llama-quantize binary (installed alongside llama-server).
|
|
func findLlamaQuantize() (string, error) {
|
|
suffix := "llama-quantize"
|
|
if runtime.GOOS == "windows" {
|
|
suffix += ".exe"
|
|
}
|
|
|
|
seen := map[string]bool{}
|
|
var candidates []string
|
|
add := func(dir string) {
|
|
path := filepath.Join(dir, suffix)
|
|
if !seen[path] {
|
|
seen[path] = true
|
|
candidates = append(candidates, path)
|
|
}
|
|
}
|
|
|
|
add(ml.LibOllamaPath)
|
|
|
|
exe, err := os.Executable()
|
|
if err == nil {
|
|
if eval, err := filepath.EvalSymlinks(exe); err == nil {
|
|
exe = eval
|
|
}
|
|
add(filepath.Join(filepath.Dir(exe), "build", "lib", "ollama"))
|
|
}
|
|
if cwd, err := os.Getwd(); err == nil {
|
|
add(filepath.Join(cwd, "build", "lib", "ollama"))
|
|
}
|
|
|
|
// Dev build paths (cmake build output, before install)
|
|
addGlob := func(base string) {
|
|
matches, _ := filepath.Glob(filepath.Join(base, "build", "llama-server-*", "bin"))
|
|
for _, m := range matches {
|
|
add(m)
|
|
}
|
|
}
|
|
if exe, err := os.Executable(); err == nil {
|
|
if eval, err := filepath.EvalSymlinks(exe); err == nil {
|
|
exe = eval
|
|
}
|
|
addGlob(filepath.Dir(exe))
|
|
}
|
|
if cwd, err := os.Getwd(); err == nil {
|
|
addGlob(cwd)
|
|
}
|
|
|
|
for _, path := range candidates {
|
|
if _, err := os.Stat(path); err == nil {
|
|
return path, nil
|
|
}
|
|
}
|
|
|
|
return "", fmt.Errorf("llama-quantize binary not found (checked: %s)", strings.Join(candidates, ", "))
|
|
}
|
|
|
|
var (
	// progressRegex picks the tensor counter out of a llama-quantize output
	// line such as "[ 42/ 200]". Submatch 1 is the current index and submatch 2
	// the total; optional whitespace is accepted after "[" and after "/".
	progressRegex = regexp.MustCompile(`\[\s*(\d+)/\s*(\d+)\]`)
)
|
|
|
|
// quantize re-quantizes a GGUF model by shelling out to llama-quantize.
|
|
// The upstream llama-quantize handles all quantization types and per-tensor
|
|
// type selection (mixed quantization for quality).
|
|
func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error {
|
|
quantizeExe, err := findLlamaQuantize()
|
|
if err != nil {
|
|
return fmt.Errorf("quantization unavailable: %w", err)
|
|
}
|
|
|
|
// Map our FileType to the llama-quantize type name
|
|
typeName := newFileType.String()
|
|
if typeName == "" {
|
|
return fmt.Errorf("unsupported quantization type: %v", newFileType)
|
|
}
|
|
|
|
slog.Info("quantizing model", "type", typeName, "input", in.Name(), "output", out.Name())
|
|
|
|
args := []string{"--allow-requantize"}
|
|
arch := orig.KV().Architecture()
|
|
// gemma3n's per_layer_token_embd is read on every layer for every token
|
|
// (not just once at input like token_embd), so it's far more quality-sensitive
|
|
// than a normal token embedding. Keep it at F16 on K-quants via an anchored
|
|
// regex so we don't also bump token_embd (which --token-embedding-type would).
|
|
if arch == "gemma3n" {
|
|
switch newFileType {
|
|
case fsggml.FileTypeQ4_K_S, fsggml.FileTypeQ4_K_M:
|
|
args = append(args, "--tensor-type", `^per_layer_token_embd\.weight$=f16`)
|
|
}
|
|
}
|
|
// deepseek2 MLA tensors (attn_k_b / attn_q_a / attn_q_b / attn_v_b /
|
|
// attn_kv_a_mqa) are small, critical matrices in DeepSeek-V2-style multi-head
|
|
// latent attention. Upstream llama-quant.cpp has no special case for these
|
|
// names at b8680 — they fall through to the default Q4_K / Q5_0 for Q4_K_M.
|
|
// Published library/glm-4.7-flash quantizes them at Q8_0 for quality. Force
|
|
// the same on K-quants via --tensor-type regex so we match.
|
|
if arch == "deepseek2" {
|
|
switch newFileType {
|
|
case fsggml.FileTypeQ4_K_S, fsggml.FileTypeQ4_K_M:
|
|
args = append(args,
|
|
"--tensor-type", `attn_k_b\.weight$=q8_0`,
|
|
"--tensor-type", `attn_q_a\.weight$=q8_0`,
|
|
"--tensor-type", `attn_q_b\.weight$=q8_0`,
|
|
"--tensor-type", `attn_v_b\.weight$=q8_0`,
|
|
"--tensor-type", `attn_kv_a_mqa\.weight$=q8_0`,
|
|
)
|
|
}
|
|
}
|
|
args = append(args, in.Name(), out.Name(), typeName)
|
|
cmd := exec.Command(quantizeExe, args...)
|
|
cmd.Env = os.Environ()
|
|
|
|
// Parse progress from stdout
|
|
stdout, err := cmd.StdoutPipe()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create stdout pipe: %w", err)
|
|
}
|
|
cmd.Stderr = os.Stderr
|
|
|
|
if err := cmd.Start(); err != nil {
|
|
return fmt.Errorf("failed to start llama-quantize: %w", err)
|
|
}
|
|
|
|
// Track total tensor size for progress reporting
|
|
totalSize := uint64(0)
|
|
for _, t := range orig.Tensors().Items() {
|
|
totalSize += t.Size()
|
|
}
|
|
|
|
var lastReported uint64
|
|
scanner := bufio.NewScanner(stdout)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
if matches := progressRegex.FindStringSubmatch(line); len(matches) == 3 {
|
|
current, _ := strconv.ParseUint(matches[1], 10, 64)
|
|
total, _ := strconv.ParseUint(matches[2], 10, 64)
|
|
if total > 0 && progressFn != nil {
|
|
// progressFn expects incremental byte deltas
|
|
done := totalSize * current / total
|
|
if done > lastReported {
|
|
progressFn(done - lastReported)
|
|
lastReported = done
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if err := cmd.Wait(); err != nil {
|
|
return fmt.Errorf("llama-quantize failed: %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|