diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 902fa9ccc..fc3cde9c9 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -65,6 +65,11 @@ jobs:
             arch: amd64
             preset: 'CUDA 12'
             install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
+            cuda-components:
+              - '"cudart"'
+              - '"nvcc"'
+              - '"cublas"'
+              - '"cublas_dev"'
             cuda-version: '12.8'
             flags: ''
             runner_dir: 'cuda_v12'
@@ -72,6 +77,14 @@ jobs:
             arch: amd64
             preset: 'CUDA 13'
             install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
+            cuda-components:
+              - '"cudart"'
+              - '"nvcc"'
+              - '"cublas"'
+              - '"cublas_dev"'
+              - '"crt"'
+              - '"nvvm"'
+              - '"nvptxcompiler"'
             cuda-version: '13.0'
             flags: ''
             runner_dir: 'cuda_v13'
@@ -105,7 +118,7 @@ jobs:
         $ErrorActionPreference = "Stop"
         if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
           Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-          $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
+          $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
           Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
         }
 
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index a10ad37a9..e470540a2 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -80,6 +80,15 @@ jobs:
           - preset: CUDA
             install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
             flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
+            cuda-components:
+              - '"cudart"'
+              - '"nvcc"'
+              - '"cublas"'
+              - '"cublas_dev"'
+              - '"crt"'
+              - '"nvvm"'
+              - '"nvptxcompiler"'
+            cuda-version: '13.0'
           - preset: ROCm
             install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
             flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
@@ -102,7 +111,8 @@ jobs:
         $ErrorActionPreference = "Stop"
         if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
           Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-          Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_13.0", "nvcc_13.0", "cublas_13.0", "cublas_dev_13.0")) -NoNewWindow -Wait
+          $subpackages = @(${{ join(matrix.cuda-components, ', ') }}) | Foreach-Object {"${_}_${{ matrix.cuda-version }}"}
+          Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
         }
 
         $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8503aa80e..29fbd00cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,7 +25,7 @@ set(GGML_LLAMAFILE ON)
 set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
 set(GGML_CUDA_GRAPHS ON)
 set(GGML_CUDA_FA ON)
-set(GGML_CUDA_COMPRESSION_MODE size)
+set(GGML_CUDA_COMPRESSION_MODE default)
 
 if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
diff --git a/api/types.go b/api/types.go
index d3f6fc5a4..a7ddbc373 100644
--- a/api/types.go
+++ b/api/types.go
@@ -388,8 +388,12 @@ type EmbedRequest struct {
 	// this request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`
 
+	// Truncate truncates the input to fit the model's max sequence length.
 	Truncate *bool `json:"truncate,omitempty"`
 
+	// Dimensions truncates the output embedding to the specified dimension.
+	Dimensions int `json:"dimensions,omitempty"`
+
 	// Options lists model-specific options.
 	Options map[string]any `json:"options"`
 }
diff --git a/cmd/cmd.go b/cmd/cmd.go
index 8fe068655..19f1e192f 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -56,10 +56,8 @@ func ensureThinkingSupport(ctx context.Context, client *api.Client, name string)
 	if err != nil {
 		return
 	}
-	for _, cap := range resp.Capabilities {
-		if cap == model.CapabilityThinking {
-			return
-		}
+	if slices.Contains(resp.Capabilities, model.CapabilityThinking) {
+		return
 	}
 	fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
 }
diff --git a/docs/api.md b/docs/api.md
index f11d59ed1..f47af63c6 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -1708,6 +1708,7 @@ Advanced parameters:
 - `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
+- `dimensions`: number of dimensions for the embedding
 
 ### Examples
diff --git a/envconfig/config.go b/envconfig/config.go
index c1342c917..aa5fc4d04 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -185,8 +185,6 @@ var (
 	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
 	// Auth enables authentication between the Ollama client and server
 	UseAuth = Bool("OLLAMA_AUTH")
-	// Enable the new memory estimation logic
-	NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES")
 )
 
 func String(s string) func() string {
@@ -273,7 +271,6 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
 		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
 		"OLLAMA_NEW_ENGINE":        {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
-		"OLLAMA_NEW_ESTIMATES":     {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"},
 
 		// Informational
 		"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
diff --git a/llm/server.go b/llm/server.go
index 09987f6f6..2af82fa04 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -149,7 +149,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 	var textProcessor model.TextProcessor
 	var err error
 	if envconfig.NewEngine() || f.KV().OllamaEngineRequired() {
-		textProcessor, err = model.NewTextProcessor(modelPath)
+		if len(projectors) == 0 {
+			textProcessor, err = model.NewTextProcessor(modelPath)
+		} else {
+			err = errors.New("split vision models aren't supported")
+		}
 		if err != nil {
 			// To prepare for opt-out mode, instead of treating this as an error, we fallback to the old runner
 			slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err)
@@ -162,11 +166,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		}
 	}
 
-	newEstimates := textProcessor != nil && envconfig.NewMemoryEstimates()
-	if newEstimates {
-		slog.Info("enabling new memory estimates")
-	}
-
 	// Verify the requested context size is <= the model training size
 	trainCtx := f.KV().ContextLength()
 	if opts.NumCtx > int(trainCtx) && trainCtx > 0 {
@@ -434,7 +433,7 @@
 		}
 	}()
 
-	if newEstimates {
+	if textProcessor != nil {
 		return &ollamaServer{llmServer: s}, nil
 	} else {
 		return &llamaServer{llmServer: s, ggml: f}, nil
diff --git a/openai/openai.go b/openai/openai.go
index 9c7c41cb4..b6a8a95e2 100644
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -76,8 +76,9 @@ type JsonSchema struct {
 }
 
 type EmbedRequest struct {
-	Input any    `json:"input"`
-	Model string `json:"model"`
+	Input      any    `json:"input"`
+	Model      string `json:"model"`
+	Dimensions int    `json:"dimensions,omitempty"`
 }
 
 type StreamOptions struct {
@@ -1005,7 +1006,7 @@ func EmbeddingsMiddleware() gin.HandlerFunc {
 		}
 
 		var b bytes.Buffer
-		if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input}); err != nil {
+		if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input, Dimensions: req.Dimensions}); err != nil {
 			c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
 			return
 		}
diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go
index 201d55a16..676e5186f 100644
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -18,7 +18,6 @@ import (
 	"reflect"
 	"regexp"
 	"runtime"
-	"runtime/debug"
 	"strconv"
 	"strings"
 	"sync"
@@ -1101,9 +1100,13 @@ func (s *Server) allocModel(
 	// Convert memory allocation panics to errors
 	defer func() {
 		if r := recover(); r != nil {
-			debug.PrintStack()
 			if err, ok := r.(error); ok {
-				panicErr = err
+				var noMem ml.ErrNoMem
+				if errors.As(err, &noMem) {
+					panicErr = noMem
+				} else {
+					panic(r)
+				}
 			} else {
 				panic(r)
 			}
diff --git a/server/routes.go b/server/routes.go
index ac4df4a46..8dd1b217a 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -558,7 +558,12 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 			if err != nil {
 				return err
 			}
-			embeddings[i] = normalize(embedding)
+			// TODO: this first normalization should be done by the model
+			embedding = normalize(embedding)
+			if req.Dimensions > 0 && req.Dimensions < len(embedding) {
+				embedding = normalize(embedding[:req.Dimensions])
+			}
+			embeddings[i] = embedding
 			return nil
 		})
 	}
@@ -584,11 +589,7 @@ func normalize(vec []float32) []float32 {
 		sum += v * v
 	}
 
-	norm := float32(0.0)
-	if sum > 0 {
-		norm = float32(1.0 / math.Sqrt(float64(sum)))
-	}
-
+	norm := float32(1.0 / max(math.Sqrt(float64(sum)), 1e-12))
 	for i := range vec {
 		vec[i] *= norm
 	}
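
Usage note (illustrative, not part of the patch): the handling added in server/routes.go truncates each embedding to req.Dimensions and re-normalizes it, so results stay unit-length; requests that omit dimensions, or ask for at least the model's native size, are unchanged. A minimal sketch of exercising the new field through the Go API client follows. It assumes a running Ollama server; the model name "all-minilm" and the value 256 are placeholder choices, not taken from this change.

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Dimensions is the new field from api/types.go; when it is smaller than the
	// model's native embedding size, the server truncates the vector and
	// re-normalizes it before returning.
	resp, err := client.Embed(context.Background(), &api.EmbedRequest{
		Model:      "all-minilm", // placeholder model name
		Input:      "why is the sky blue?",
		Dimensions: 256, // placeholder target size
	})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println(len(resp.Embeddings[0])) // expected: 256 for a model with more than 256 native dimensions
}

The same parameter is accepted on the OpenAI-compatible /v1/embeddings endpoint, since EmbeddingsMiddleware now forwards req.Dimensions.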