Mirror of https://github.com/ollama/ollama.git (synced 2026-04-27 11:15:40 +02:00)
Compare commits
22 Commits
pdevine/sh...v0.1.33-rc
SHA1:
4e1ff6dcbb
8589d752ac
de4ded68b0
9b5a3c5991
00b0699c75
993cf8bf55
7bb7cb8a60
b123be5b71
ddf5c09a9b
5f73c08729
f503a848c2
36a6daccab
ceb0e26e5e
284e02bed0
3450a57d4a
592dae31c8
2010cbc5fa
ac0801eced
ad66e5b060
ade4b55520
a6d62e0617
0d6687f84c
11 .github/workflows/release.yaml vendored
@@ -311,29 +311,18 @@ jobs:
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-cpu
          path: |
            llm/build
            dist/windows-amd64
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-cuda
          path: |
            llm/build
            dist/windows-amd64
      - uses: actions/download-artifact@v4
        with:
          name: windows-cuda-deps
          path: dist/deps
      - uses: actions/download-artifact@v4
        with:
          name: windows-rocm-deps
          path: dist/deps
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-rocm
          path: |
            llm/build
            dist/windows-amd64
      - run: dir llm/build
      - run: |
          $gopath=(get-command go).source | split-path -parent
@@ -396,8 +396,10 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
 func DefaultOptions() Options {
 	return Options{
 		// options set on request to runner
-		NumPredict: -1,
-		NumKeep:    0,
+		NumPredict: -1,
+
+		// set a minimal num_keep to avoid issues on context shifts
+		NumKeep:     4,
 		Temperature: 0.8,
 		TopK:        40,
 		TopP:        0.9,
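A note on what this buys: num_keep pins tokens at the head of the window when the runner shifts context. A minimal sketch of that truncation logic (hypothetical helper, not code from this change; assumes the shift keeps the first numKeep tokens plus the most recent remainder):

    package main

    import "fmt"

    // contextShift sketches how a runner might truncate an overflowing token
    // window: the first numKeep tokens are pinned, the newest tokens fill the rest.
    // With num_keep=0 a shift could drop the entire preamble (e.g. the BOS token
    // and the start of the system prompt); num_keep=4 keeps at least the head intact.
    func contextShift(tokens []int, numCtx, numKeep int) []int {
        if len(tokens) <= numCtx {
            return tokens
        }
        out := append([]int{}, tokens[:numKeep]...)
        return append(out, tokens[len(tokens)-(numCtx-numKeep):]...)
    }

    func main() {
        fmt.Println(contextShift([]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 8, 4))
        // [1 2 3 4 7 8 9 10]
    }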
183 cmd/cmd.go
@@ -17,6 +17,7 @@ import (
 	"os"
 	"os/signal"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"strings"
 	"syscall"
@@ -53,8 +54,6 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	p := progress.NewProgress(os.Stderr)
 	defer p.Stop()

-	bars := make(map[string]*progress.Bar)
-
 	modelfile, err := os.ReadFile(filename)
 	if err != nil {
 		return err
@@ -95,95 +94,16 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}

-	// TODO make this work w/ adapters
 	if fi.IsDir() {
-		tf, err := os.CreateTemp("", "ollama-tf")
+		// this is likely a safetensors or pytorch directory
+		// TODO make this work w/ adapters
+		tempfile, err := tempZipFiles(path)
 		if err != nil {
 			return err
 		}
-		defer os.RemoveAll(tf.Name())
+		defer os.RemoveAll(tempfile)
-
-		zf := zip.NewWriter(tf)
-
-		files := []string{}
-
-		tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
-		if err != nil {
-			return err
-		} else if len(tfiles) == 0 {
-			tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
-			if err != nil {
-				return err
-			}
-		}
-
-		files = append(files, tfiles...)
-
-		if len(files) == 0 {
-			return fmt.Errorf("no models were found in '%s'", path)
-		}
-
-		// add the safetensor/torch config file + tokenizer
-		files = append(files, filepath.Join(path, "config.json"))
-		files = append(files, filepath.Join(path, "params.json"))
-		files = append(files, filepath.Join(path, "added_tokens.json"))
-		files = append(files, filepath.Join(path, "tokenizer.model"))
-
-		for _, fn := range files {
-			f, err := os.Open(fn)
-
-			// just skip whatever files aren't there
-			if os.IsNotExist(err) {
-				if strings.HasSuffix(fn, "tokenizer.model") {
-					// try the parent dir before giving up
-					parentDir := filepath.Dir(path)
-					newFn := filepath.Join(parentDir, "tokenizer.model")
-					f, err = os.Open(newFn)
-					if os.IsNotExist(err) {
-						continue
-					} else if err != nil {
-						return err
-					}
-				} else {
-					continue
-				}
-			} else if err != nil {
-				return err
-			}
-
-			fi, err := f.Stat()
-			if err != nil {
-				return err
-			}
-
-			h, err := zip.FileInfoHeader(fi)
-			if err != nil {
-				return err
-			}
-
-			h.Name = filepath.Base(fn)
-			h.Method = zip.Store
-
-			w, err := zf.CreateHeader(h)
-			if err != nil {
-				return err
-			}
-
-			_, err = io.Copy(w, f)
-			if err != nil {
-				return err
-			}
-		}
-
-		if err := zf.Close(); err != nil {
-			return err
-		}
-
-		if err := tf.Close(); err != nil {
-			return err
-		}
-		path = tf.Name()
+		path = tempfile
 	}

 	digest, err := createBlob(cmd, client, path)
@@ -191,10 +111,17 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 			return err
 		}

-		modelfile = bytes.ReplaceAll(modelfile, []byte(c.Args), []byte("@"+digest))
+		name := c.Name
+		if c.Name == "model" {
+			name = "from"
+		}
+
+		re := regexp.MustCompile(fmt.Sprintf(`(?im)^(%s)\s+%s\s*$`, name, c.Args))
+		modelfile = re.ReplaceAll(modelfile, []byte("$1 @"+digest))
 		}
 	}

+	bars := make(map[string]*progress.Bar)
 	fn := func(resp api.ProgressResponse) error {
 		if resp.Digest != "" {
 			spinner.Stop()
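The move from bytes.ReplaceAll to an anchored regular expression means only a whole "FROM <arg>" (or other command) line is rewritten to reference the uploaded blob, not every incidental occurrence of the argument elsewhere in the Modelfile. A standalone sketch (made-up Modelfile content; regexp.QuoteMeta is added here for safety and is not in the diff, which interpolates c.Args directly):

    package main

    import (
        "fmt"
        "regexp"
    )

    func main() {
        modelfile := []byte("FROM ./model\n# from ./model, see notes\n")
        // (?im): case-insensitive and multi-line, so ^...$ anchor to each line.
        re := regexp.MustCompile(fmt.Sprintf(`(?im)^(%s)\s+%s\s*$`, "from", regexp.QuoteMeta("./model")))
        fmt.Printf("%s", re.ReplaceAll(modelfile, []byte("$1 @sha256:abc123")))
        // Output:
        // FROM @sha256:abc123
        // # from ./model, see notes
    }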
@@ -228,6 +155,88 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	return nil
 }

+func tempZipFiles(path string) (string, error) {
+	tempfile, err := os.CreateTemp("", "ollama-tf")
+	if err != nil {
+		return "", err
+	}
+	defer tempfile.Close()
+
+	zipfile := zip.NewWriter(tempfile)
+	defer zipfile.Close()
+
+	tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
+	if err != nil {
+		return "", err
+	} else if len(tfiles) == 0 {
+		tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
+		if err != nil {
+			return "", err
+		}
+	}
+
+	files := []string{}
+	files = append(files, tfiles...)
+
+	if len(files) == 0 {
+		return "", fmt.Errorf("no models were found in '%s'", path)
+	}
+
+	// add the safetensor/torch config file + tokenizer
+	files = append(files, filepath.Join(path, "config.json"))
+	files = append(files, filepath.Join(path, "params.json"))
+	files = append(files, filepath.Join(path, "added_tokens.json"))
+	files = append(files, filepath.Join(path, "tokenizer.model"))
+
+	for _, fn := range files {
+		f, err := os.Open(fn)
+
+		// just skip whatever files aren't there
+		if os.IsNotExist(err) {
+			if strings.HasSuffix(fn, "tokenizer.model") {
+				// try the parent dir before giving up
+				parentDir := filepath.Dir(path)
+				newFn := filepath.Join(parentDir, "tokenizer.model")
+				f, err = os.Open(newFn)
+				if os.IsNotExist(err) {
+					continue
+				} else if err != nil {
+					return "", err
+				}
+			} else {
+				continue
+			}
+		} else if err != nil {
+			return "", err
+		}
+
+		fi, err := f.Stat()
+		if err != nil {
+			return "", err
+		}
+
+		h, err := zip.FileInfoHeader(fi)
+		if err != nil {
+			return "", err
+		}
+
+		h.Name = filepath.Base(fn)
+		h.Method = zip.Store
+
+		w, err := zipfile.CreateHeader(h)
+		if err != nil {
+			return "", err
+		}
+
+		_, err = io.Copy(w, f)
+		if err != nil {
+			return "", err
+		}
+	}
+
+	return tempfile.Name(), nil
+}
+
 func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
 	bin, err := os.Open(path)
 	if err != nil {
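Worth noting in the extracted helper: h.Method = zip.Store writes each entry uncompressed. That is presumably deliberate — model weights are high-entropy and barely compress, and the archive is only a container for upload. A tiny standalone demonstration of a stored (uncompressed) entry:

    package main

    import (
        "archive/zip"
        "fmt"
        "os"
    )

    func main() {
        f, err := os.CreateTemp("", "store-demo-*.zip")
        if err != nil {
            panic(err)
        }
        defer os.Remove(f.Name())

        zw := zip.NewWriter(f)
        // zip.Store skips deflate entirely, matching tempZipFiles above.
        w, err := zw.CreateHeader(&zip.FileHeader{Name: "weights.bin", Method: zip.Store})
        if err != nil {
            panic(err)
        }
        if _, err := w.Write(make([]byte, 1<<20)); err != nil {
            panic(err)
        }
        if err := zw.Close(); err != nil {
            panic(err)
        }
        fmt.Println("wrote uncompressed archive:", f.Name())
    }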
@@ -140,7 +140,7 @@ func AMDGetGPUInfo() []GpuInfo {
 	}

 	if int(major) < RocmComputeMin {
-		slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%d", major, minor, patch), "gpu", gpuID)
+		slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID)
 		continue
 	}
@@ -266,7 +266,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		}
 		slog.Debug("rocm supported GPUs", "types", supported)
 	}
-	gfx := fmt.Sprintf("gfx%d%d%d", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)
+	gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)
 	if !slices.Contains[[]string, string](supported, gfx) {
 		slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
 		// TODO - consider discrete markdown just for ROCM troubleshooting?
@@ -149,13 +149,16 @@ func AMDGetGPUInfo() []GpuInfo {
 		}
 	}
 	if patch != "" {
-		gpuInfo.Patch, err = strconv.Atoi(patch)
+		// Patch rev is hex; e.g. gfx90a
+		p, err := strconv.ParseInt(patch, 16, 0)
 		if err != nil {
 			slog.Info("failed to parse version", "version", gfx, "error", err)
+		} else {
+			gpuInfo.Patch = int(p)
 		}
 	}
 	if gpuInfo.Major < RocmComputeMin {
-		slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%d", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
+		slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
 		continue
 	}
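The hex distinction matters for parts like gfx90a, where the patch revision is a hex digit rather than a decimal one. A standalone sketch contrasting the two parses:

    package main

    import (
        "fmt"
        "strconv"
    )

    func main() {
        patch := "a" // patch revision as reported for a gfx90a GPU

        // Old behavior: decimal parse fails on hex digits.
        if _, err := strconv.Atoi(patch); err != nil {
            fmt.Println("Atoi:", err) // Atoi: strconv.Atoi: parsing "a": invalid syntax
        }

        // New behavior: parse as base-16, then print with %x to round-trip "gfx90a".
        p, err := strconv.ParseInt(patch, 16, 0)
        if err != nil {
            panic(err)
        }
        fmt.Printf("gfx%d%d%x\n", 9, 0, p) // gfx90a
    }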
@@ -21,7 +21,7 @@ init_vars() {
 	# TODO - add additional optimization flags...
 	CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
 fi
 case $(uname -s) in
 	"Darwin")
 		LIB_EXT="dylib"
 		WHOLE_ARCHIVE="-Wl,-force_load"
@@ -165,11 +165,11 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
 fi
 if [ "${ARCH}" == "arm64" ]; then
     echo "ARM CPU detected - disabling unsupported AVX instructions"

+    # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
+    #
     # CUDA compute < 6.0 lacks proper FP16 support on ARM.
     # Disabling has minimal performance effect while maintaining compatibility.
     ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
 fi
 # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
@@ -5,7 +5,6 @@ import (
 	"log/slog"
 	"os"
 	"strconv"
-	"strings"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
@@ -100,8 +99,22 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		return 0, 0
 	}

-	var layerCount int
 	layers := ggml.Tensors().Layers()
+
+	var memoryLayerOutput uint64
+	for k, v := range layers {
+		if k == "output" || k == "output_norm" {
+			memoryLayerOutput += v.size()
+		}
+	}
+
+	if gpus[0].Library == "metal" && opts.UseMMap {
+		// memory is preallocated for output tensors
+		memoryRequiredTotal += memoryLayerOutput
+		memoryRequiredPartial += memoryLayerOutput
+	}
+
+	var layerCount int
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
 		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
@@ -115,15 +128,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		}
 	}

-	var memoryLayerOutput uint64
-	for k, v := range layers {
-		if !strings.HasPrefix(k, "blk.") {
-			memoryLayerOutput += v.size()
-		}
+	if gpus[0].Library != "metal" || !opts.UseMMap {
+		// memory was not preallocated for output tensors
+		memoryRequiredTotal += memoryLayerOutput
 	}

-	memoryRequiredTotal += memoryLayerOutput
-
 	if memoryAvailable > memoryRequiredTotal {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal
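Net effect of these two memory.go hunks: output-tensor memory is identified by the explicit "output"/"output_norm" keys instead of "anything not prefixed blk.", the accounting is hoisted above the per-block loop, and on Metal with mmap enabled the preallocated output tensors are charged to both the total and partial estimates up front rather than added to the total only at the end.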
45 llm/patches/04-metal.diff Normal file
@@ -0,0 +1,45 @@
diff --git a/ggml-metal.m b/ggml-metal.m
index 0207b787..b5e9884b 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
                 // to the matrix-vector kernel
                 int ne11_mm_min = 1;

-#if 0
                 // the numbers below are measured on M2 Ultra for 7B and 13B models
                 // these numbers do not translate to other devices or model sizes
                 // TODO: need to find a better approach
-                if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
-                    switch (src0t) {
-                        case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-                        case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-                        case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-                        case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-                        case GGML_TYPE_Q4_0:
-                        case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-                        case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-                        case GGML_TYPE_Q5_0: // not tested yet
-                        case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-                        case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-                        case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-                        default:             ne11_mm_min = 1;  break;
-                    }
+                switch (src0t) {
+                    case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
+                    case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
+                    case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+                    case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+                    case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+                    case GGML_TYPE_Q5_0: // not tested yet
+                    case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+                    case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
+                    case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
+                    default:             ne11_mm_min = 1;  break;
                 }
-#endif

                 // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                 // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
@@ -560,6 +560,13 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 		return err
 	}
 	defer s.sem.Release(1)

+	// only allow maximum 10 "context shifts" to avoid infinite generation
+	if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
+		req.Options.NumPredict = 10 * s.options.NumCtx
+		slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict)
+	}
+
 	request := map[string]any{
 		"prompt": req.Prompt,
 		"stream": true,
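Worked example: with the default num_ctx of 2048, a request that asks for unlimited generation (num_predict = -1) is now capped at 10 × 2048 = 20480 tokens, so a model stuck repeating itself eventually stops instead of generating forever.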
@@ -29,6 +29,7 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
+	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@@ -701,36 +702,32 @@ func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string
 	return path, nil
 }

-func CopyModel(src, dest string) error {
-	srcModelPath := ParseModelPath(src)
-	srcPath, err := srcModelPath.GetManifestPath()
+func CopyModel(src, dst model.Name) error {
+	manifests, err := GetManifestPath()
 	if err != nil {
 		return err
 	}

-	destModelPath := ParseModelPath(dest)
-	destPath, err := destModelPath.GetManifestPath()
-	if err != nil {
-		return err
-	}
-	if err := os.MkdirAll(filepath.Dir(destPath), 0o755); err != nil {
+	dstpath := filepath.Join(manifests, dst.FilepathNoBuild())
+	if err := os.MkdirAll(filepath.Dir(dstpath), 0o755); err != nil {
 		return err
 	}

-	// copy the file
-	input, err := os.ReadFile(srcPath)
+	srcpath := filepath.Join(manifests, src.FilepathNoBuild())
+	srcfile, err := os.Open(srcpath)
 	if err != nil {
-		fmt.Println("Error reading file:", err)
 		return err
 	}
+	defer srcfile.Close()

-	err = os.WriteFile(destPath, input, 0o644)
+	dstfile, err := os.Create(dstpath)
 	if err != nil {
-		fmt.Println("Error reading file:", err)
 		return err
 	}
+	defer dstfile.Close()

-	return nil
+	_, err = io.Copy(dstfile, srcfile)
+	return err
 }

 func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, dryRun bool) error {
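Design note on the rewrite: the old implementation buffered the whole manifest with os.ReadFile and printed a misleading "Error reading file" message even for write failures; streaming with io.Copy avoids the intermediate buffer and returns the error unmodified, and addressing manifests via model.Name.FilepathNoBuild drops the ParseModelPath round-trip entirely.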
@@ -29,6 +29,7 @@ import (
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
+	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@@ -788,34 +789,34 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
 }

 func (s *Server) CopyModelHandler(c *gin.Context) {
-	var req api.CopyRequest
-	err := c.ShouldBindJSON(&req)
-	switch {
-	case errors.Is(err, io.EOF):
+	var r api.CopyRequest
+	if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
 		return
-	case err != nil:
+	} else if err != nil {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}

-	if req.Source == "" || req.Destination == "" {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "source add destination are required"})
+	src := model.ParseName(r.Source)
+	if !src.IsValid() {
+		_ = c.Error(fmt.Errorf("source %q is invalid", r.Source))
+	}
+
+	dst := model.ParseName(r.Destination)
+	if !dst.IsValid() {
+		_ = c.Error(fmt.Errorf("destination %q is invalid", r.Destination))
+	}
+
+	if len(c.Errors) > 0 {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": c.Errors.Errors()})
 		return
 	}

-	if err := ParseModelPath(req.Destination).Validate(); err != nil {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-		return
-	}
-
-	if err := CopyModel(req.Source, req.Destination); err != nil {
-		if os.IsNotExist(err) {
-			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Source)})
-		} else {
-			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		}
-		return
+	if err := CopyModel(src, dst); errors.Is(err, os.ErrNotExist) {
+		c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model %q not found", r.Source)})
+	} else if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 	}
 }
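The handler now accumulates validation failures on gin's per-request error list instead of bailing at the first one, so a request with both a bad source and a bad destination reports both problems at once. A minimal standalone sketch of that pattern (hypothetical route, port, and messages):

    package main

    import (
        "fmt"
        "net/http"

        "github.com/gin-gonic/gin"
    )

    func main() {
        r := gin.New()
        r.POST("/api/copy", func(c *gin.Context) {
            // Record each validation failure...
            _ = c.Error(fmt.Errorf("source %q is invalid", "bad//name"))
            _ = c.Error(fmt.Errorf("destination %q is invalid", ""))

            // ...then report them all together, as the new handler does.
            if len(c.Errors) > 0 {
                c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": c.Errors.Errors()})
                return
            }
            c.Status(http.StatusOK)
        })
        _ = r.Run(":8080")
    }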
@@ -23,7 +23,6 @@ import (
 type LlmRequest struct {
 	ctx             context.Context //nolint:containedctx
 	model           *Model
-	ggml            *llm.GGML // TODO - how large is this, and do we need to free it after we've finished loading?
 	opts            api.Options
 	sessionDuration time.Duration
 	successCh       chan *runnerRef
@@ -39,7 +38,7 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex

-	loadFn      func(req *LlmRequest, gpus gpu.GpuInfoList)
+	loadFn      func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
 	newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
 	getGpuFn    func() gpu.GpuInfoList
 }
@@ -47,6 +46,7 @@ type Scheduler struct {
 // TODO set this to zero after a release or two, to enable multiple models by default
 var loadedMax = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
 var maxQueuedRequests = 10 // TODO configurable
+var numParallel = 1

 func InitScheduler(ctx context.Context) *Scheduler {
 	maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
@@ -58,6 +58,14 @@ func InitScheduler(ctx context.Context) *Scheduler {
 			loadedMax = m
 		}
 	}
+	if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
+		p, err := strconv.Atoi(onp)
+		if err != nil || p <= 0 {
+			slog.Error("invalid parallel setting, must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
+		} else {
+			numParallel = p
+		}
+	}

 	sched := &Scheduler{
 		pendingReqCh: make(chan *LlmRequest, maxQueuedRequests),
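Usage note: starting the server with OLLAMA_NUM_PARALLEL=4 makes each loaded runner serve four requests concurrently; as GetRunner below shows, the KV cache is then sized at numParallel × num_ctx, so parallelism trades VRAM for throughput.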
@@ -74,20 +82,16 @@ func InitScheduler(ctx context.Context) *Scheduler {
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
-	ggml, err := llm.LoadModel(model.ModelPath)
+	// context split across parallel threads
+	opts.NumCtx = opts.NumCtx * numParallel
 	req := &LlmRequest{
 		ctx:             c,
 		model:           model,
-		ggml:            ggml,
 		opts:            opts,
 		sessionDuration: sessionDuration,
 		successCh:       make(chan *runnerRef),
 		errCh:           make(chan error, 1),
 	}
-	if err != nil {
-		req.errCh <- err
-		return req.successCh, req.errCh
-	}
 	select {
 	case s.pendingReqCh <- req:
 	default:
@@ -130,28 +134,39 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				pending.useLoadedRunner(runner, s.finishedReqCh)
 				break
 			}
-		} else if loadedCount == 0 {
-			slog.Debug("loading first model", "model", pending.model.ModelPath)
-			gpus := s.getGpuFn()
-			g := pickBestFitGPUs(pending, gpus)
-			if g != nil {
-				gpus = g
-			}
-			s.loadFn(pending, gpus)
-			break
 		} else if loadedMax > 0 && loadedCount >= loadedMax {
 			slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 			runnerToExpire = s.findRunnerToUnload(pending)
 		} else {
-			// More than one loaded model, so we have to see if the new one fits
+			// Either no models are loaded or below loadedMax
 			// Get a refreshed GPU list
 			gpus := s.getGpuFn()
+
+			// Load model for fitting
+			ggml, err := llm.LoadModel(pending.model.ModelPath)
+			if err != nil {
+				pending.errCh <- err
+				break
+			}
+
+			// No models loaded. Load the model but prefer the best fit.
+			if loadedCount == 0 {
+				slog.Debug("loading first model", "model", pending.model.ModelPath)
+				g := pickBestFitGPUs(pending, ggml, gpus)
+				if g != nil {
+					gpus = g
+				}
+				s.loadFn(pending, ggml, gpus)
+				break
+			}
+
+			// More than one loaded model, so we have to see if the new one fits
 			// Update free memory from currently loaded models
 			s.updateFreeSpace(gpus)
-			gpus = pickBestFitGPUs(pending, gpus)
+			gpus = pickBestFitGPUs(pending, ggml, gpus)
 			if gpus != nil {
 				slog.Debug("new model fits with existing models, loading")
-				s.loadFn(pending, gpus)
+				s.loadFn(pending, ggml, gpus)
 				break
 			}
 			runnerToExpire = s.findRunnerToUnload(pending)
@@ -282,8 +297,8 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }

-func (s *Scheduler) load(req *LlmRequest, gpus gpu.GpuInfoList) {
-	llama, err := s.newServerFn(gpus, req.model.ModelPath, req.ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) {
+	llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
 	if err != nil {
 		// some older models are not compatible with newer versions of llama.cpp
 		// show a generalized compatibility error until there is a better way to
@@ -417,16 +432,21 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 	slog.Debug("evaluating already loaded", "model", req.model.ModelPath)
 	runner.refMu.Lock()
 	defer runner.refMu.Unlock()
-	// Ignore the NumGPU settings for comparison
-	optsExisting := runner.Options.Runner
-	optsExisting.NumGPU = -1
-	optsNew := req.opts.Runner
-	optsNew.NumGPU = -1

 	timeout := 10 * time.Second
 	if runner.loading {
 		timeout = 2 * time.Minute // Initial load can take a long time for big models on slow systems...
 	}
-	ctx, cancel := context.WithTimeout(ctx, timeout) // BUG -
+
+	// Don't reload runner if num_gpu=-1 was provided
+	optsExisting := runner.Options.Runner
+	optsNew := req.opts.Runner
+	if optsNew.NumGPU < 0 {
+		optsExisting.NumGPU = -1
+		optsNew.NumGPU = -1
+	}
+
+	ctx, cancel := context.WithTimeout(ctx, timeout)
 	defer cancel()
 	if !reflect.DeepEqual(runner.adapters, req.model.AdapterPaths) || // have the adapters changed?
 		!reflect.DeepEqual(runner.projectors, req.model.ProjectorPaths) || // have the projectors changed?
@@ -434,6 +454,7 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 		runner.llama.Ping(ctx) != nil {
 		return true
 	}

 	return false
 }
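Behavioral consequence (exercised by the TestNeedsReload additions further down): an explicit num_gpu in a new request is now compared against the loaded runner's setting and can trigger a reload, while num_gpu=-1 (auto) is still normalized away so it matches whatever is already loaded.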
@@ -454,7 +475,7 @@ func (a ByDuration) Less(i, j int) bool {
 // pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
 // If the model can not be fit fully within the available GPU(s) nil is returned
-func pickBestFitGPUs(req *LlmRequest, gpus gpu.GpuInfoList) gpu.GpuInfoList {
+func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.GpuInfoList {
 	var estimatedVRAM uint64
 	for _, gl := range gpus.ByLibrary() {
 		var ok bool
@@ -466,7 +487,7 @@ func pickBestFitGPUs(req *LlmRequest, gpus gpu.GpuInfoList) gpu.GpuInfoList {
 		// First attempt to fit the model into a single GPU
 		for _, g := range sgl {
-			if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, req.ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+			if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 				slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 				return []gpu.GpuInfo{g}
 			}
@@ -477,7 +498,7 @@ func pickBestFitGPUs(req *LlmRequest, gpus gpu.GpuInfoList) gpu.GpuInfoList {
 		// - try subsets of GPUs instead of just falling back to 1 or all in a family

 		// Now try all the GPUs
-		if ok, estimatedVRAM = llm.PredictServerFit(gl, req.ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+		if ok, estimatedVRAM = llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 			slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", gl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
 			return gl
 		}
@@ -47,6 +47,7 @@ func TestLoad(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
+	var ggml *llm.GGML // value not used in tests
 	req := &LlmRequest{
 		ctx:   ctx,
 		model: &Model{ModelPath: "foo"},
@@ -59,7 +60,7 @@ func TestLoad(t *testing.T) {
 		return nil, fmt.Errorf("something failed to load model blah")
 	}
 	gpus := gpu.GpuInfoList{}
-	s.load(req, gpus)
+	s.load(req, ggml, gpus)
 	require.Len(t, req.successCh, 0)
 	require.Len(t, req.errCh, 1)
 	require.Len(t, s.loaded, 0)
@@ -70,7 +71,7 @@ func TestLoad(t *testing.T) {
 	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
 		return server, nil
 	}
-	s.load(req, gpus)
+	s.load(req, ggml, gpus)
 	select {
 	case err := <-req.errCh:
 		require.NoError(t, err)
@@ -82,7 +83,7 @@ func TestLoad(t *testing.T) {
 	req.model.ModelPath = "dummy_model_path"
 	server.waitResp = fmt.Errorf("wait failure")
-	s.load(req, gpus)
+	s.load(req, ggml, gpus)
 	select {
 	case err := <-req.errCh:
 		require.Contains(t, err.Error(), "wait failure")
@@ -101,6 +102,7 @@ type bundle struct {
 	ctxDone func()
 	srv     *mockLlm
 	req     *LlmRequest
+	ggml    *llm.GGML
 }

 func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
@@ -132,14 +134,15 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 	})
 	assert.Nil(t, err)

 	fname := f.Name()
 	model := &Model{Name: modelName, ModelPath: fname}
-	ggml, err := llm.LoadModel(model.ModelPath)
+	scenario.ggml, err = llm.LoadModel(model.ModelPath)
 	require.NoError(t, err)

 	scenario.req = &LlmRequest{
 		ctx:             scenario.ctx,
 		model:           model,
-		ggml:            ggml,
 		sessionDuration: 5 * time.Millisecond,
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
@@ -157,13 +160,13 @@ func TestRequests(t *testing.T) {
 	scenario1a.req.sessionDuration = 0
 	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
 	scenario1b.req.model = scenario1a.req.model
-	scenario1b.req.ggml = scenario1a.req.ggml
+	scenario1b.ggml = scenario1a.ggml
 	scenario1b.req.sessionDuration = 0

 	// simple reload of same model
 	scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
 	scenario2a.req.model = scenario1a.req.model
-	scenario2a.req.ggml = scenario1a.req.ggml
+	scenario2a.ggml = scenario1a.ggml

 	// Multiple loaded models
 	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
@@ -322,13 +325,14 @@ func TestGetRunner(t *testing.T) {
 	successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
 	require.Len(t, s.pendingReqCh, 0)
 	require.Len(t, successCh1c, 0)
-	require.Len(t, errCh1c, 0)
-
-	time.Sleep(5 * time.Millisecond)
-	require.Len(t, s.loaded, 0)
+	require.Len(t, errCh1c, 1)
 	err = <-errCh1c
 	require.Contains(t, err.Error(), "bad path")
 	scenario1b.ctxDone()
+
+	time.Sleep(5 * time.Millisecond)
+	require.Len(t, s.loaded, 0)
 }

 // TODO - add one scenario that triggers the bogus finished event with positive ref count
@@ -366,7 +370,9 @@ func TestPrematureExpired(t *testing.T) {
 	require.LessOrEqual(t, len(s.finishedReqCh), 1)
 	time.Sleep(10 * time.Millisecond)
 	require.Len(t, s.finishedReqCh, 0)
+	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 0)
+	s.loadedMu.Unlock()

 	// also shouldn't happen in real life
 	s.finishedReqCh <- scenario1a.req
@@ -426,7 +432,6 @@ func TestUpdateFreeSpace(t *testing.T) {
 	s.updateFreeSpace(gpus)
 	require.Equal(t, uint64(850), gpus[0].FreeMemory)
 	require.Equal(t, uint64(1850), gpus[1].FreeMemory)
-
 }

 func TestFindRunnerToUnload(t *testing.T) {
@@ -485,6 +490,9 @@ func TestNeedsReload(t *testing.T) {
 	require.False(t, resp)
 	req.opts.NumGPU = 99
 	resp = runner.needsReload(ctx, req)
 	require.True(t, resp)
+	req.opts.NumGPU = -1
+	resp = runner.needsReload(ctx, req)
+	require.False(t, resp)
 }
@@ -150,7 +150,7 @@ type Name struct {
 // For any valid s, the fill string is used to fill in missing parts of the
 // Name. The fill string must be a valid Name with the exception that any part
 // may be the string ("?"), which will not be considered for filling.
-func ParseName(s, fill string) Name {
+func ParseNameFill(s, fill string) Name {
 	var r Name
 	parts(s)(func(kind PartKind, part string) bool {
 		if kind == PartDigest && !ParseDigest(part).IsValid() {
@@ -170,6 +170,13 @@ func ParseName(s, fill string) Name {
 		return Name{}
 	}

+// ParseName parses s into a Name, and returns the result of filling it
+// with FillDefault. The input string must be a valid string representation
+// of a model
+func ParseName(s string) Name {
+	return ParseNameFill(s, "")
+}
+
 func parseMask(s string) Name {
 	var r Name
 	parts(s)(func(kind PartKind, part string) bool {
@@ -187,7 +194,7 @@ func parseMask(s string) Name {
 }

 func MustParseName(s, fill string) Name {
-	r := ParseName(s, fill)
+	r := ParseNameFill(s, fill)
 	if !r.IsValid() {
 		panic("invalid Name: " + s)
 	}
@@ -579,7 +586,11 @@ func (r Name) IsValid() bool {
 // it trims any leading "/" and then calls [ParseName] with fill.
 func ParseNameFromURLPath(s, fill string) Name {
 	s = strings.TrimPrefix(s, "/")
-	return ParseName(s, fill)
+	return ParseNameFill(s, fill)
 }

+func ParseNameFromURLPathFill(s, fill string) Name {
+	return ParseNameFill(s, fill)
+}
+
 // URLPath returns a complete, canonicalized, relative URL path using the parts of a
@@ -119,11 +119,11 @@ func TestNameConsecutiveDots(t *testing.T) {
 	for i := 1; i < 10; i++ {
 		s := strings.Repeat(".", i)
 		if i > 1 {
-			if g := ParseName(s, FillNothing).DisplayLong(); g != "" {
+			if g := ParseNameFill(s, FillNothing).DisplayLong(); g != "" {
 				t.Errorf("ParseName(%q) = %q; want empty string", s, g)
 			}
 		} else {
-			if g := ParseName(s, FillNothing).DisplayLong(); g != s {
+			if g := ParseNameFill(s, FillNothing).DisplayLong(); g != s {
 				t.Errorf("ParseName(%q) = %q; want %q", s, g, s)
 			}
 		}
@@ -156,14 +156,14 @@ func TestParseName(t *testing.T) {
 		s := prefix + baseName

 		t.Run(s, func(t *testing.T) {
-			name := ParseName(s, FillNothing)
+			name := ParseNameFill(s, FillNothing)
 			got := fieldsFromName(name)
 			if got != want {
 				t.Errorf("ParseName(%q) = %q; want %q", s, got, want)
 			}

 			// test round-trip
-			if !ParseName(name.DisplayLong(), FillNothing).EqualFold(name) {
+			if !ParseNameFill(name.DisplayLong(), FillNothing).EqualFold(name) {
 				t.Errorf("ParseName(%q).String() = %s; want %s", s, name.DisplayLong(), baseName)
 			}
 		})
@@ -188,7 +188,7 @@ func TestParseNameFill(t *testing.T) {
 	for _, tt := range cases {
 		t.Run(tt.in, func(t *testing.T) {
-			name := ParseName(tt.in, tt.fill)
+			name := ParseNameFill(tt.in, tt.fill)
 			if g := name.DisplayLong(); g != tt.want {
 				t.Errorf("ParseName(%q, %q) = %q; want %q", tt.in, tt.fill, g, tt.want)
 			}
@@ -201,7 +201,7 @@ func TestParseNameFill(t *testing.T) {
 				t.Fatal("expected panic")
 			}
 		}()
-		ParseName("x", "^")
+		ParseNameFill("x", "^")
 	})
 }
@@ -212,7 +212,7 @@ func TestParseNameHTTPDoublePrefixStrip(t *testing.T) {
 	}
 	for _, s := range cases {
 		t.Run(s, func(t *testing.T) {
-			name := ParseName(s, FillNothing)
+			name := ParseNameFill(s, FillNothing)
 			if name.IsValid() {
 				t.Errorf("expected invalid path; got %#v", name)
 			}
@@ -237,7 +237,7 @@ func TestCompleteWithAndWithoutBuild(t *testing.T) {
 	for _, tt := range cases {
 		t.Run(tt.in, func(t *testing.T) {
-			p := ParseName(tt.in, FillNothing)
+			p := ParseNameFill(tt.in, FillNothing)
 			t.Logf("ParseName(%q) = %#v", tt.in, p)
 			if g := p.IsComplete(); g != tt.complete {
 				t.Errorf("Complete(%q) = %v; want %v", tt.in, g, tt.complete)
@@ -252,7 +252,7 @@ func TestCompleteWithAndWithoutBuild(t *testing.T) {
 	// inlined when used in Complete, preventing any allocations or
 	// escaping to the heap.
 	allocs := testing.AllocsPerRun(1000, func() {
-		keep(ParseName("complete.com/x/mistral:latest+Q4_0", FillNothing).IsComplete())
+		keep(ParseNameFill("complete.com/x/mistral:latest+Q4_0", FillNothing).IsComplete())
 	})
 	if allocs > 0 {
 		t.Errorf("Complete allocs = %v; want 0", allocs)
@@ -269,7 +269,7 @@ func TestNameLogValue(t *testing.T) {
 		t.Run(s, func(t *testing.T) {
 			var b bytes.Buffer
 			log := slog.New(slog.NewTextHandler(&b, nil))
-			name := ParseName(s, FillNothing)
+			name := ParseNameFill(s, FillNothing)
 			log.Info("", "name", name)
 			want := fmt.Sprintf("name=%s", name.GoString())
 			got := b.String()
@@ -316,7 +316,7 @@ func TestNameGoString(t *testing.T) {
 	for _, tt := range cases {
 		t.Run(tt.name, func(t *testing.T) {
-			p := ParseName(tt.in, FillNothing)
+			p := ParseNameFill(tt.in, FillNothing)
 			tt.wantGoString = cmp.Or(tt.wantGoString, tt.in)
 			if g := fmt.Sprintf("%#v", p); g != tt.wantGoString {
 				t.Errorf("GoString() = %q; want %q", g, tt.wantGoString)
@@ -326,7 +326,7 @@ func TestNameGoString(t *testing.T) {
 }

 func TestDisplayLongest(t *testing.T) {
-	g := ParseName("example.com/library/mistral:latest+Q4_0", FillNothing).DisplayLongest()
+	g := ParseNameFill("example.com/library/mistral:latest+Q4_0", FillNothing).DisplayLongest()
 	if g != "example.com/library/mistral:latest" {
 		t.Errorf("got = %q; want %q", g, "example.com/library/mistral:latest")
 	}
@@ -377,7 +377,7 @@ func TestDisplayShortest(t *testing.T) {
 		}
 	}()

-	p := ParseName(tt.in, FillNothing)
+	p := ParseNameFill(tt.in, FillNothing)
 	t.Logf("ParseName(%q) = %#v", tt.in, p)
 	if g := p.DisplayShortest(tt.mask); g != tt.want {
 		t.Errorf("got = %q; want %q", g, tt.want)
@@ -388,7 +388,7 @@ func TestDisplayShortest(t *testing.T) {
 func TestParseNameAllocs(t *testing.T) {
 	allocs := testing.AllocsPerRun(1000, func() {
-		keep(ParseName("example.com/mistral:7b+Q4_0", FillNothing))
+		keep(ParseNameFill("example.com/mistral:7b+Q4_0", FillNothing))
 	})
 	if allocs > 0 {
 		t.Errorf("ParseName allocs = %v; want 0", allocs)
@@ -399,7 +399,7 @@ func BenchmarkParseName(b *testing.B) {
 	b.ReportAllocs()

 	for range b.N {
-		keep(ParseName("example.com/mistral:7b+Q4_0", FillNothing))
+		keep(ParseNameFill("example.com/mistral:7b+Q4_0", FillNothing))
 	}
 }
@@ -430,7 +430,7 @@ func FuzzParseName(f *testing.F) {
 	f.Add(":@!@")
 	f.Add("...")
 	f.Fuzz(func(t *testing.T, s string) {
-		r0 := ParseName(s, FillNothing)
+		r0 := ParseNameFill(s, FillNothing)

 		if strings.Contains(s, "..") && !r0.IsZero() {
 			t.Fatalf("non-zero value for path with '..': %q", s)
@@ -453,7 +453,7 @@ func FuzzParseName(f *testing.F) {
 			t.Errorf("String() did not round-trip with case insensitivity: %q\ngot = %q\nwant = %q", s, r0.DisplayLong(), s)
 		}

-		r1 := ParseName(r0.DisplayLong(), FillNothing)
+		r1 := ParseNameFill(r0.DisplayLong(), FillNothing)
 		if !r0.EqualFold(r1) {
 			t.Errorf("round-trip mismatch: %+v != %+v", r0, r1)
 		}
@@ -461,7 +461,7 @@ func FuzzParseName(f *testing.F) {
 }

 func TestNameStringAllocs(t *testing.T) {
-	name := ParseName("example.com/ns/mistral:latest+Q4_0", FillNothing)
+	name := ParseNameFill("example.com/ns/mistral:latest+Q4_0", FillNothing)
 	allocs := testing.AllocsPerRun(1000, func() {
 		keep(name.DisplayLong())
 	})
@@ -483,7 +483,7 @@ func TestNamePath(t *testing.T) {
 	}
 	for _, tt := range cases {
 		t.Run(tt.in, func(t *testing.T) {
-			p := ParseName(tt.in, FillNothing)
+			p := ParseNameFill(tt.in, FillNothing)
 			t.Logf("ParseName(%q) = %#v", tt.in, p)
 			if g := p.DisplayURLPath(); g != tt.want {
 				t.Errorf("got = %q; want %q", g, tt.want)
@@ -526,7 +526,7 @@ func TestNameFilepath(t *testing.T) {
 	}
 	for _, tt := range cases {
 		t.Run(tt.in, func(t *testing.T) {
-			p := ParseName(tt.in, FillNothing)
+			p := ParseNameFill(tt.in, FillNothing)
 			t.Logf("ParseName(%q) = %#v", tt.in, p)
 			g := p.Filepath()
 			g = filepath.ToSlash(g)
@@ -587,7 +587,7 @@ func TestParseNameFilepath(t *testing.T) {
 		t.Run(tt.in, func(t *testing.T) {
 			in := strings.ReplaceAll(tt.in, "/", string(filepath.Separator))
 			fill := cmp.Or(tt.fill, FillNothing)
-			want := ParseName(tt.want, fill)
+			want := ParseNameFill(tt.want, fill)
 			if g := ParseNameFromFilepath(in, fill); !g.EqualFold(want) {
 				t.Errorf("got = %q; want %q", g.DisplayLong(), tt.want)
 			}
@@ -645,12 +645,12 @@ func ExampleName_MapHash() {
 	m := map[uint64]bool{}

 	// key 1
-	m[ParseName("mistral:latest+q4", FillNothing).MapHash()] = true
-	m[ParseName("miSTRal:latest+Q4", FillNothing).MapHash()] = true
-	m[ParseName("mistral:LATest+Q4", FillNothing).MapHash()] = true
+	m[ParseNameFill("mistral:latest+q4", FillNothing).MapHash()] = true
+	m[ParseNameFill("miSTRal:latest+Q4", FillNothing).MapHash()] = true
+	m[ParseNameFill("mistral:LATest+Q4", FillNothing).MapHash()] = true

 	// key 2
-	m[ParseName("mistral:LATest", FillNothing).MapHash()] = true
+	m[ParseNameFill("mistral:LATest", FillNothing).MapHash()] = true

 	fmt.Println(len(m))
 	// Output:
@@ -659,9 +659,9 @@ func ExampleName_MapHash() {
 func ExampleName_CompareFold_sort() {
 	names := []Name{
-		ParseName("mistral:latest", FillNothing),
-		ParseName("mistRal:7b+q4", FillNothing),
-		ParseName("MIstral:7b", FillNothing),
+		ParseNameFill("mistral:latest", FillNothing),
+		ParseNameFill("mistRal:7b+q4", FillNothing),
+		ParseNameFill("MIstral:7b", FillNothing),
 	}

 	slices.SortFunc(names, Name.CompareFold)
@@ -682,7 +682,7 @@ func ExampleName_completeAndResolved() {
 		"x/y/z:latest+q4_0",
 		"@sha123-abc",
 	} {
-		name := ParseName(s, FillNothing)
+		name := ParseNameFill(s, FillNothing)
 		fmt.Printf("complete:%v resolved:%v digest:%s\n", name.IsComplete(), name.IsResolved(), name.Digest())
 	}
@@ -693,7 +693,7 @@ func ExampleName_completeAndResolved() {
 }

 func ExampleName_DisplayShortest() {
-	name := ParseName("example.com/jmorganca/mistral:latest+Q4_0", FillNothing)
+	name := ParseNameFill("example.com/jmorganca/mistral:latest+Q4_0", FillNothing)

 	fmt.Println(name.DisplayShortest("example.com/jmorganca/_:latest"))
 	fmt.Println(name.DisplayShortest("example.com/_/_:latest"))
@@ -701,7 +701,7 @@ func ExampleName_DisplayShortest() {
 	fmt.Println(name.DisplayShortest("_/_/_:_"))

 	// Default
-	name = ParseName("registry.ollama.ai/library/mistral:latest+Q4_0", FillNothing)
+	name = ParseNameFill("registry.ollama.ai/library/mistral:latest+Q4_0", FillNothing)
 	fmt.Println(name.DisplayShortest(""))

 	// Output: