handle intermediate blobs

split model layer into metadata and data layers
s/DisplayLongest/String/
2026-04-26 18:55:53 +02:00 · 2024-05-02 17:05:49 -07:00 · 2024-05-02 17:05:49 -07:00 · 2024-05-02 17:05:26 -07:00 · 2024-05-02 17:05:26 -07:00 · 2024-05-02 17:05:26 -07:00
57 changed files with 3048 additions and 2679 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -331,8 +331,6 @@ jobs:
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$env:PATH"
          $env:OLLAMA_SKIP_GENERATE="1"
          $env:NVIDIA_DIR=$(resolve-path ".\dist\deps")
          $env:HIP_PATH=$(resolve-path ".\dist\deps")
          & .\scripts\build_windows.ps1
      - uses: actions/upload-artifact@v4
        with:
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -1,5 +1,15 @@
 name: test
 concurrency:
  # For PRs, later CI runs preempt previous ones. e.g. a force push on a PR
  # cancels running CI jobs and starts all new ones.
  #
  # For non-PR pushes, concurrency.group needs to be unique for every distinct
  # CI run we want to have happen. Use run_id, which in practice means all
  # non-PR CI runs will be allowed to run without preempting each other.
  group: ${{ github.workflow }}-$${{ github.pull_request.number || github.run_id }}
  cancel-in-progress: true
 on:
  pull_request:
    paths:
@@ -21,7 +31,9 @@ jobs:
      - id: changes
        run: |
          changed() {
-            git diff-tree -r --no-commit-id --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
+            git diff-tree -r --no-commit-id --name-only \
              $(git merge-base ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }}) \
              ${{ github.event.pull_request.head.sha }} \
              | xargs python3 -c "import sys; print(any([x.startswith('$1') for x in sys.argv[1:]]))"
          }
@@ -283,7 +295,6 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
      - run: go get
      - run: |
          case ${{ matrix.arch }} in
            amd64) echo ARCH=x86_64 ;;
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,5 @@ ggml-metal.metal
 .idea
 test_data
 *.crt
-llm/build
+llm/build
 __debug_bin*
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 <div align="center">
-  <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
+ <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
 </div>
 # Ollama
@@ -51,7 +51,7 @@ Here are some example models that can be downloaded:
 | ------------------ | ---------- | ----- | ------------------------------ |
 | Llama 3            | 8B         | 4.7GB | `ollama run llama3`            |
 | Llama 3            | 70B        | 40GB  | `ollama run llama3:70b`        |
-| Phi-3              | 3,8B       | 2.3GB | `ollama run phi3`              |
+| Phi-3              | 3.8B       | 2.3GB | `ollama run phi3`              |
 | Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
 | Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`       |
 | Starling           | 7B         | 4.1GB | `ollama run starling-lm`       |
@@ -173,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
 The image features a yellow smiley face, which is likely the central focus of the picture.
 ```
-### Pass in prompt as arguments
+### Pass the prompt as an argument
 ```
 $ ollama run llama3 "Summarize this file: $(cat README.md)"
@@ -294,7 +294,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [RAGFlow: Open-source Retrieval-Augmented Generation engine based on deep document understanding](https://github.com/infiniflow/ragflow)
 - [chat: chat web app for teams](https://github.com/swuecho/chat)
 - [Lobe Chat](https://github.com/lobehub/lobe-chat) with [Integrating Doc](https://lobehub.com/docs/self-hosting/examples/ollama)
- [Ollama RAG Chatbot: Local Chat with multiples PDFs using Ollama and RAG.](https://github.com/datvodinh/rag-chatbot.git)
+- [Ollama RAG Chatbot: Local Chat with multiple PDFs using Ollama and RAG.](https://github.com/datvodinh/rag-chatbot.git)
 ### Terminal
@@ -384,4 +384,4 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 ### Supported backends 
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov. 
+- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov. 
--- a/api/client.go
+++ b/api/client.go
@@ -18,6 +18,7 @@ import (
 	"net/url"
 	"os"
 	"runtime"
 	"strconv"
 	"strings"
 	"github.com/ollama/ollama/format"
@@ -57,12 +58,36 @@ func checkError(resp *http.Response, body []byte) error {
 // If the variable is not specified, a default ollama host and port will be
 // used.
 func ClientFromEnvironment() (*Client, error) {
 	// Resolve scheme, host and port via GetOllamaHost; a bad value aborts here.
 	resolved, err := GetOllamaHost()
 	if err != nil {
 		return nil, err
 	}
 	// Assemble the base URL first, then wrap it in a client that reuses the
 	// shared default HTTP client.
 	base := url.URL{
 		Scheme: resolved.Scheme,
 		Host:   net.JoinHostPort(resolved.Host, resolved.Port),
 	}
 	return &Client{base: &base, http: http.DefaultClient}, nil
 }
 // OllamaHost holds the scheme, host and port parts of an Ollama server
 // address, each kept as a plain string.
 type OllamaHost struct {
 	Scheme, Host, Port string
 }
 func GetOllamaHost() (OllamaHost, error) {
 	defaultPort := "11434"
-	scheme, hostport, ok := strings.Cut(os.Getenv("OLLAMA_HOST"), "://")
+	hostVar := os.Getenv("OLLAMA_HOST")
 	hostVar = strings.TrimSpace(strings.Trim(strings.TrimSpace(hostVar), "\"'"))
 	scheme, hostport, ok := strings.Cut(hostVar, "://")
 	switch {
 	case !ok:
-		scheme, hostport = "http", os.Getenv("OLLAMA_HOST")
+		scheme, hostport = "http", hostVar
 	case scheme == "http":
 		defaultPort = "80"
 	case scheme == "https":
@@ -82,12 +107,14 @@ func ClientFromEnvironment() (*Client, error) {
 		}
 	}
-	return &Client{
+	if portNum, err := strconv.ParseInt(port, 10, 32); err != nil || portNum > 65535 || portNum < 0 {
-		base: &url.URL{
+		return OllamaHost{}, ErrInvalidHostPort
-			Scheme: scheme,
+	}
-			Host:   net.JoinHostPort(host, port),
+
-		},
+	return OllamaHost{
-		http: http.DefaultClient,
+		Scheme: scheme,
 		Host:   host,
 		Port:   port,
 	}, nil
 }
--- a/api/client_test.go
+++ b/api/client_test.go
@@ -1,6 +1,12 @@
 package api
-import "testing"
+import (
 	"fmt"
 	"net"
 	"testing"
 	"github.com/stretchr/testify/assert"
 )
 func TestClientFromEnvironment(t *testing.T) {
 	type testCase struct {
@@ -40,4 +46,40 @@ func TestClientFromEnvironment(t *testing.T) {
 			}
 		})
 	}
 	hostTestCases := map[string]*testCase{
 		"empty":               {value: "", expect: "127.0.0.1:11434"},
 		"only address":        {value: "1.2.3.4", expect: "1.2.3.4:11434"},
 		"only port":           {value: ":1234", expect: ":1234"},
 		"address and port":    {value: "1.2.3.4:1234", expect: "1.2.3.4:1234"},
 		"hostname":            {value: "example.com", expect: "example.com:11434"},
 		"hostname and port":   {value: "example.com:1234", expect: "example.com:1234"},
 		"zero port":           {value: ":0", expect: ":0"},
 		"too large port":      {value: ":66000", err: ErrInvalidHostPort},
 		"too small port":      {value: ":-1", err: ErrInvalidHostPort},
 		"ipv6 localhost":      {value: "[::1]", expect: "[::1]:11434"},
 		"ipv6 world open":     {value: "[::]", expect: "[::]:11434"},
 		"ipv6 no brackets":    {value: "::1", expect: "[::1]:11434"},
 		"ipv6 + port":         {value: "[::1]:1337", expect: "[::1]:1337"},
 		"extra space":         {value: " 1.2.3.4 ", expect: "1.2.3.4:11434"},
 		"extra quotes":        {value: "\"1.2.3.4\"", expect: "1.2.3.4:11434"},
 		"extra space+quotes":  {value: " \" 1.2.3.4 \" ", expect: "1.2.3.4:11434"},
 		"extra single quotes": {value: "'1.2.3.4'", expect: "1.2.3.4:11434"},
 	}
 	for k, v := range hostTestCases {
 		t.Run(k, func(t *testing.T) {
 			t.Setenv("OLLAMA_HOST", v.value)
 			oh, err := GetOllamaHost()
 			if err != v.err {
 				t.Fatalf("expected %s, got %s", v.err, err)
 			}
 			if err == nil {
 				host := net.JoinHostPort(oh.Host, oh.Port)
 				assert.Equal(t, v.expect, host, fmt.Sprintf("%s: expected %s, got %s", k, v.expect, host))
 			}
 		})
 	}
 }
--- a/api/types.go
+++ b/api/types.go
@@ -309,6 +309,7 @@ func (m *Metrics) Summary() {
 }
 var ErrInvalidOpts = errors.New("invalid options")
 var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
 func (opts *Options) FromMap(m map[string]interface{}) error {
 	valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
--- a/app/lifecycle/server.go
+++ b/app/lifecycle/server.go
@@ -43,37 +43,36 @@ func getCLIFullPath(command string) string {
 	return command
 }
-func SpawnServer(ctx context.Context, command string) (chan int, error) {
+func start(ctx context.Context, command string) (*exec.Cmd, error) {
 	done := make(chan int)
 	logDir := filepath.Dir(ServerLogFile)
 	_, err := os.Stat(logDir)
 	if errors.Is(err, os.ErrNotExist) {
 		if err := os.MkdirAll(logDir, 0o755); err != nil {
 			return done, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
 		}
 	}
 	cmd := getCmd(ctx, getCLIFullPath(command))
 	// send stdout and stderr to a file
 	stdout, err := cmd.StdoutPipe()
 	if err != nil {
-		return done, fmt.Errorf("failed to spawn server stdout pipe %s", err)
+		return nil, fmt.Errorf("failed to spawn server stdout pipe: %w", err)
 	}
 	stderr, err := cmd.StderrPipe()
 	if err != nil {
-		return done, fmt.Errorf("failed to spawn server stderr pipe %s", err)
+		return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
 	}
 	stdin, err := cmd.StdinPipe()
 	if err != nil {
 		return done, fmt.Errorf("failed to spawn server stdin pipe %s", err)
 	}
 	// TODO - rotation
 	logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
 	if err != nil {
-		return done, fmt.Errorf("failed to create server log %w", err)
+		return nil, fmt.Errorf("failed to create server log: %w", err)
 	}
 	logDir := filepath.Dir(ServerLogFile)
 	_, err = os.Stat(logDir)
 	if err != nil {
 		if !errors.Is(err, os.ErrNotExist) {
 			return nil, fmt.Errorf("stat ollama server log dir %s: %v", logDir, err)
 		}
 		if err := os.MkdirAll(logDir, 0o755); err != nil {
 			return nil, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
 		}
 	}
 	go func() {
 		defer logFile.Close()
 		io.Copy(logFile, stdout) //nolint:errcheck
@@ -117,19 +116,33 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
 	// run the command and wait for it to finish
 	if err := cmd.Start(); err != nil {
-		return done, fmt.Errorf("failed to start server %w", err)
+		return nil, fmt.Errorf("failed to start server %w", err)
 	}
 	if cmd.Process != nil {
 		slog.Info(fmt.Sprintf("started ollama server with pid %d", cmd.Process.Pid))
 	}
 	slog.Info(fmt.Sprintf("ollama server logs %s", ServerLogFile))
 	return cmd, nil
 }
 func SpawnServer(ctx context.Context, command string) (chan int, error) {
 	done := make(chan int)
 	go func() {
 		// Keep the server running unless we're shutting down the app
 		crashCount := 0
 		for {
 			slog.Info("starting server...")
 			cmd, err := start(ctx, command)
 			if err != nil {
 				crashCount++
 				slog.Error(fmt.Sprintf("failed to start server %s", err))
 				time.Sleep(500 * time.Millisecond * time.Duration(crashCount))
 				continue
 			}
 			cmd.Wait() //nolint:errcheck
 			stdin.Close()
 			var code int
 			if cmd.ProcessState != nil {
 				code = cmd.ProcessState.ExitCode()
@@ -143,15 +156,12 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
 			default:
 				crashCount++
 				slog.Warn(fmt.Sprintf("server crash %d - exit code %d - respawning", crashCount, code))
-				time.Sleep(500 * time.Millisecond)
+				time.Sleep(500 * time.Millisecond * time.Duration(crashCount))
-				if err := cmd.Start(); err != nil {
+				break
 					slog.Error(fmt.Sprintf("failed to restart server %s", err))
 					// Keep trying, but back off if we keep failing
 					time.Sleep(time.Duration(crashCount) * time.Second)
 				}
 			}
 		}
 	}()
 	return done, nil
 }
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -88,16 +88,12 @@ DialogFontSize=12
 [Files]
 Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
 Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-amd64\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
+Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-amd64\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
+Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
-; Assumes v5.7, may need adjustments for v6
+#if DirExists("..\dist\windows-amd64\rocm")
-#if GetEnv("HIP_PATH") != ""
+  Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
  Source: "{#GetEnv('HIP_PATH')}\bin\hipblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
  Source: "{#GetEnv('HIP_PATH')}\bin\rocblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
  ; amdhip64.dll dependency comes from the driver and must be installed already
  Source: "{#GetEnv('HIP_PATH')}\bin\rocblas\library\*"; DestDir: "{app}\rocm\rocblas\library\"; Flags: ignoreversion
 #endif
@@ -133,7 +129,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi
 ;FinishedHeadingLabel=Run your first model
-;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n    ollama run llama2
+;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n    ollama run llama3
 ;ClickFinish=%n
 [Registry]
--- a/auth/auth.go
+++ b/auth/auth.go
@@ -10,12 +10,44 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
 	"strings"
 	"golang.org/x/crypto/ssh"
 )
 const defaultPrivateKey = "id_ed25519"
 func keyPath() (string, error) {
 	// The key file sits in the ".ollama" directory under the user's home.
 	homeDir, err := os.UserHomeDir()
 	if err == nil {
 		return filepath.Join(homeDir, ".ollama", defaultPrivateKey), nil
 	}
 	return "", err
 }
 func GetPublicKey() (string, error) {
 	path, err := keyPath()
 	if err != nil {
 		return "", err
 	}
 	// Read the raw private key; a read failure is logged before it is returned.
 	rawKey, err := os.ReadFile(path)
 	if err != nil {
 		slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
 		return "", err
 	}
 	signer, err := ssh.ParsePrivateKey(rawKey)
 	if err != nil {
 		return "", err
 	}
 	// Marshal the matching public key and strip surrounding whitespace.
 	authorized := string(ssh.MarshalAuthorizedKey(signer.PublicKey()))
 	return strings.TrimSpace(authorized), nil
 }
 func NewNonce(r io.Reader, length int) (string, error) {
 	nonce := make([]byte, length)
 	if _, err := io.ReadFull(r, nonce); err != nil {
@@ -26,13 +58,11 @@ func NewNonce(r io.Reader, length int) (string, error) {
 }
 func Sign(ctx context.Context, bts []byte) (string, error) {
-	home, err := os.UserHomeDir()
+	keyPath, err := keyPath()
 	if err != nil {
 		return "", err
 	}
 	keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
 	privateKeyFile, err := os.ReadFile(keyPath)
 	if err != nil {
 		slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -32,10 +32,13 @@ import (
 	"golang.org/x/term"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/server"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@@ -54,12 +57,13 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	p := progress.NewProgress(os.Stderr)
 	defer p.Stop()
-	modelfile, err := os.ReadFile(filename)
+	modelfile, err := os.Open(filename)
 	if err != nil {
 		return err
 	}
 	defer modelfile.Close()
-	commands, err := parser.Parse(bytes.NewReader(modelfile))
+	commands, err := parser.Parse(modelfile)
 	if err != nil {
 		return err
 	}
@@ -73,10 +77,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	spinner := progress.NewSpinner(status)
 	p.Add(status, spinner)
-	for _, c := range commands {
+	for i := range commands {
-		switch c.Name {
+		switch commands[i].Name {
 		case "model", "adapter":
-			path := c.Args
+			path := commands[i].Args
 			if path == "~" {
 				path = home
 			} else if strings.HasPrefix(path, "~/") {
@@ -88,7 +92,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 			}
 			fi, err := os.Stat(path)
-			if errors.Is(err, os.ErrNotExist) && c.Name == "model" {
+			if errors.Is(err, os.ErrNotExist) && commands[i].Name == "model" {
 				continue
 			} else if err != nil {
 				return err
@@ -111,13 +115,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 				return err
 			}
-			name := c.Name
+			commands[i].Args = "@"+digest
 			if c.Name == "model" {
 				name = "from"
 			}
 			re := regexp.MustCompile(fmt.Sprintf(`(?im)^(%s)\s+%s\s*$`, name, c.Args))
 			modelfile = re.ReplaceAll(modelfile, []byte("$1 @"+digest))
 		}
 	}
@@ -147,7 +145,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	quantization, _ := cmd.Flags().GetString("quantization")
-	request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile), Quantization: quantization}
+	request := api.CreateRequest{Name: args[0], Modelfile: parser.Format(commands), Quantization: quantization}
 	if err := client.Create(cmd.Context(), &request, fn); err != nil {
 		return err
 	}
@@ -165,71 +163,97 @@ func tempZipFiles(path string) (string, error) {
 	zipfile := zip.NewWriter(tempfile)
 	defer zipfile.Close()
-	tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
+	detectContentType := func(path string) (string, error) {
-	if err != nil {
+		f, err := os.Open(path)
 		return "", err
 	} else if len(tfiles) == 0 {
 		tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
 		if err != nil {
 			return "", err
 		}
-	}
+		defer f.Close()
-	files := []string{}
+		var b bytes.Buffer
-	files = append(files, tfiles...)
+		b.Grow(512)
-	if len(files) == 0 {
+		if _, err := io.CopyN(&b, f, 512); err != nil && !errors.Is(err, io.EOF) {
 		return "", fmt.Errorf("no models were found in '%s'", path)
 	}
 	// add the safetensor/torch config file + tokenizer
 	files = append(files, filepath.Join(path, "config.json"))
 	files = append(files, filepath.Join(path, "params.json"))
 	files = append(files, filepath.Join(path, "added_tokens.json"))
 	files = append(files, filepath.Join(path, "tokenizer.model"))
 	for _, fn := range files {
 		f, err := os.Open(fn)
 		// just skip whatever files aren't there
 		if os.IsNotExist(err) {
 			if strings.HasSuffix(fn, "tokenizer.model") {
 				// try the parent dir before giving up
 				parentDir := filepath.Dir(path)
 				newFn := filepath.Join(parentDir, "tokenizer.model")
 				f, err = os.Open(newFn)
 				if os.IsNotExist(err) {
 					continue
 				} else if err != nil {
 					return "", err
 				}
 			} else {
 				continue
 			}
 		} else if err != nil {
 			return "", err
 		}
 		contentType, _, _ := strings.Cut(http.DetectContentType(b.Bytes()), ";")
 		return contentType, nil
 	}
 	// glob returns every path matching pattern, provided each match's detected
 	// content type equals contentType. The first detection error or mismatch
 	// aborts the scan and yields a nil slice with the error.
 	glob := func(pattern, contentType string) ([]string, error) {
 		matches, err := filepath.Glob(pattern)
 		if err != nil {
 			return nil, err
 		}
 		for _, match := range matches {
 			ct, err := detectContentType(match)
 			if err != nil {
 				return nil, err
 			}
 			if ct != contentType {
 				// Report the type the caller asked for; the detected type (ct)
 				// was previously printed under the "expected" label.
 				return nil, fmt.Errorf("invalid content type: expected %s for %s", contentType, match)
 			}
 		}
 		return matches, nil
 	}
 	var files []string
 	if st, _ := glob(filepath.Join(path, "model*.safetensors"), "application/octet-stream"); len(st) > 0 {
 		// safetensors files might be unresolved git lfs references; skip if they are
 		// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
 		files = append(files, st...)
 	} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
 		// pytorch files might also be unresolved git lfs references; skip if they are
 		// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
 		files = append(files, pt...)
 	} else if pt, _ := glob(filepath.Join(path, "consolidated*.pth"), "application/octet-stream"); len(pt) > 0 {
 		// pytorch files might also be unresolved git lfs references; skip if they are
 		// covers consolidated.x.pth, consolidated.pth
 		files = append(files, pt...)
 	} else {
 		return "", errors.New("no safetensors or torch files found")
 	}
 	// add configuration files, json files are detected as text/plain
 	js, err := glob(filepath.Join(path, "*.json"), "text/plain")
 	if err != nil {
 		return "", err
 	}
 	files = append(files, js...)
 	if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
 		// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
 		// tokenizer.model might be an unresolved git lfs reference; error if it is
 		files = append(files, tks...)
 	} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
 		// sometimes tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
 		files = append(files, tks...)
 	}
 	for _, file := range files {
 		f, err := os.Open(file)
 		if err != nil {
 			return "", err
 		}
 		defer f.Close()
 		fi, err := f.Stat()
 		if err != nil {
 			return "", err
 		}
-		h, err := zip.FileInfoHeader(fi)
+		zfi, err := zip.FileInfoHeader(fi)
 		if err != nil {
 			return "", err
 		}
-		h.Name = filepath.Base(fn)
+		zf, err := zipfile.CreateHeader(zfi)
 		h.Method = zip.Store
 		w, err := zipfile.CreateHeader(h)
 		if err != nil {
 			return "", err
 		}
-		_, err = io.Copy(w, f)
+		if _, err := io.Copy(zf, f); err != nil {
 		if err != nil {
 			return "", err
 		}
 	}
@@ -331,6 +355,47 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	return generateInteractive(cmd, opts)
 }
 func errFromUnknownKey(unknownKeyErr error) error {
 	// Pull the SSH public key, if any, out of the error text.
 	keyRe := regexp.MustCompile(`ssh-\w+ [^\s"]+`)
 	found := keyRe.FindStringSubmatch(unknownKeyErr.Error())
 	if len(found) == 0 {
 		return unknownKeyErr
 	}
 	serverPubKey := found[0]
 	localPubKey, err := auth.GetPublicKey()
 	if err != nil {
 		return unknownKeyErr
 	}
 	// On Linux, fall back to the ollama service public key when the current
 	// user's key is not the one reported in the error.
 	if runtime.GOOS == "linux" && serverPubKey != localPubKey {
 		svcPubKey, err := os.ReadFile("/usr/share/ollama/.ollama/id_ed25519.pub")
 		if err != nil {
 			return unknownKeyErr
 		}
 		localPubKey = strings.TrimSpace(string(svcPubKey))
 	}
 	// Only continue when the reported key is one held locally; this prevents
 	// adding a remote key to the user's account.
 	if serverPubKey != localPubKey {
 		return unknownKeyErr
 	}
 	// Append the local key and the page where it can be registered.
 	return errors.New(unknownKeyErr.Error() +
 		"\n\nYour ollama key is:\n" +
 		localPubKey +
 		"\nAdd your key at:\n" +
 		"https://ollama.com/settings/keys")
 }
 func PushHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
@@ -378,6 +443,20 @@ func PushHandler(cmd *cobra.Command, args []string) error {
 	request := api.PushRequest{Name: args[0], Insecure: insecure}
 	if err := client.Push(cmd.Context(), &request, fn); err != nil {
 		if spinner != nil {
 			spinner.Stop()
 		}
 		if strings.Contains(err.Error(), "access denied") {
 			return errors.New("you are not authorized to push to this namespace, create the model under a namespace you own")
 		}
 		host := model.ParseName(args[0]).Host
 		isOllamaHost := strings.HasSuffix(host, ".ollama.ai") || strings.HasSuffix(host, ".ollama.com")
 		if strings.Contains(err.Error(), errtypes.UnknownOllamaKeyErrMsg) && isOllamaHost {
 			// the user has not added their ollama key to ollama.com
 			// re-throw an error with a more user-friendly message
 			return errFromUnknownKey(err)
 		}
 		return err
 	}
@@ -805,19 +884,17 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 }
 func RunServer(cmd *cobra.Command, _ []string) error {
-	host, port, err := net.SplitHostPort(strings.Trim(os.Getenv("OLLAMA_HOST"), "\"'"))
+	// retrieve the OLLAMA_HOST environment variable
 	ollamaHost, err := api.GetOllamaHost()
 	if err != nil {
-		host, port = "127.0.0.1", "11434"
+		return err
 		if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil {
 			host = ip.String()
 		}
 	}
 	if err := initializeKeypair(); err != nil {
 		return err
 	}
-	ln, err := net.Listen("tcp", net.JoinHostPort(host, port))
+	ln, err := net.Listen("tcp", net.JoinHostPort(ollamaHost.Host, ollamaHost.Port))
 	if err != nil {
 		return err
 	}
@@ -1043,7 +1120,7 @@ Environment Variables:
 		RunE:    ListHandler,
 	}
 	copyCmd := &cobra.Command{
-		Use:     "cp SOURCE TARGET",
+		Use:     "cp SOURCE DESTINATION",
 		Short:   "Copy a model",
 		Args:    cobra.ExactArgs(2),
 		PreRunE: checkServerHeartbeat,
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -94,6 +94,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /show           Show model information")
 		fmt.Fprintln(os.Stderr, "  /load <model>   Load a session or model")
 		fmt.Fprintln(os.Stderr, "  /save <model>   Save your current session")
 		fmt.Fprintln(os.Stderr, "  /clear          Clear session context")
 		fmt.Fprintln(os.Stderr, "  /bye            Exit")
 		fmt.Fprintln(os.Stderr, "  /?, /help       Help for a command")
 		fmt.Fprintln(os.Stderr, "  /? shortcuts    Help for keyboard shortcuts")
@@ -280,6 +281,10 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			}
 			fmt.Printf("Created new model '%s'\n", args[1])
 			continue
 		case strings.HasPrefix(line, "/clear"):
 			opts.Messages = []api.Message{}
 			fmt.Println("Cleared session context")
 			continue
 		case strings.HasPrefix(line, "/set"):
 			args := strings.Fields(line)
 			if len(args) > 1 {
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -5,6 +5,7 @@ import (
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
 	"io"
 	"log/slog"
 	"os"
 	"path/filepath"
@@ -47,7 +48,7 @@ type ByteOrder interface {
 type ModelArch interface {
 	GetTensors() error
 	LoadVocab() error
-	WriteGGUF() (string, error)
+	WriteGGUF(io.WriteSeeker) error
 }
 type ModelFormat interface {
--- a/convert/gemma.go
+++ b/convert/gemma.go
@@ -94,7 +94,7 @@ func (m *GemmaModel) LoadVocab() error {
 	return nil
 }
-func (m *GemmaModel) WriteGGUF() (string, error) {
+func (m *GemmaModel) WriteGGUF(ws io.WriteSeeker) error {
 	kv := llm.KV{
 		"general.architecture":                   "gemma",
 		"general.name":                           m.Name,
@@ -122,16 +122,5 @@ func (m *GemmaModel) WriteGGUF() (string, error) {
 		"tokenizer.ggml.add_eos_token":    false,
 	}
-	f, err := os.CreateTemp("", "ollama-gguf")
+	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 	if err != nil {
 		return "", err
 	}
 	defer f.Close()
 	mod := llm.NewGGUFV3(m.Params.ByteOrder)
 	if err := mod.Encode(f, kv, m.Tensors); err != nil {
 		return "", err
 	}
 	return f.Name(), nil
 }
--- a/convert/llama.go
+++ b/convert/llama.go
@@ -132,7 +132,7 @@ func (m *LlamaModel) LoadVocab() error {
 	return nil
 }
-func (m *LlamaModel) WriteGGUF() (string, error) {
+func (m *LlamaModel) WriteGGUF(ws io.WriteSeeker) error {
 	kv := llm.KV{
 		"general.architecture":                   "llama",
 		"general.name":                           m.Name,
@@ -161,16 +161,9 @@ func (m *LlamaModel) WriteGGUF() (string, error) {
 	f, err := os.CreateTemp("", "ollama-gguf")
 	if err != nil {
-		return "", err
+		return err
 	}
 	defer f.Close()
-	mod := llm.NewGGUFV3(m.Params.ByteOrder)
+	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(f, kv, m.Tensors)
 	if err := mod.Encode(f, kv, m.Tensors); err != nil {
 		return "", err
 	}
 	slog.Debug(fmt.Sprintf("gguf file = %s", f.Name()))
 	return f.Name(), nil
 }
--- a/convert/mistral.go
+++ b/convert/mistral.go
@@ -132,7 +132,7 @@ func (m *MistralModel) LoadVocab() error {
 	return nil
 }
-func (m *MistralModel) WriteGGUF() (string, error) {
+func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error {
 	kv := llm.KV{
 		"general.architecture":                   "llama",
 		"general.name":                           m.Name,
@@ -158,16 +158,5 @@ func (m *MistralModel) WriteGGUF() (string, error) {
 		"tokenizer.ggml.unknown_token_id": uint32(0),
 	}
-	f, err := os.CreateTemp("", "ollama-gguf")
+	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 	if err != nil {
 		return "", err
 	}
 	defer f.Close()
 	mod := llm.NewGGUFV3(m.Params.ByteOrder)
 	if err := mod.Encode(f, kv, m.Tensors); err != nil {
 		return "", err
 	}
 	return f.Name(), nil
 }
--- a/convert/mixtral.go
+++ b/convert/mixtral.go
@@ -1,7 +1,7 @@
 package convert
 import (
-	"os"
+	"io"
 	"regexp"
 	"github.com/ollama/ollama/llm"
@@ -47,7 +47,7 @@ func (m *MixtralModel) LoadVocab() error {
 	return nil
 }
-func (m *MixtralModel) WriteGGUF() (string, error) {
+func (m *MixtralModel) WriteGGUF(ws io.WriteSeeker) error {
 	kv := llm.KV{
 		"general.architecture":          "llama",
 		"general.name":                  m.Name,
@@ -81,16 +81,5 @@ func (m *MixtralModel) WriteGGUF() (string, error) {
 		"tokenizer.ggml.add_eos_token":    false,
 	}
-	f, err := os.CreateTemp("", "ollama-gguf")
+	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 	if err != nil {
 		return "", err
 	}
 	defer f.Close()
 	mod := llm.NewGGUFV3(m.Params.ByteOrder)
 	if err := mod.Encode(f, kv, m.Tensors); err != nil {
 		return "", err
 	}
 	return f.Name(), nil
 }
--- a/docs/development.md
+++ b/docs/development.md
@@ -51,7 +51,7 @@ Typically the build scripts will auto-detect CUDA, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
 specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
 libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
-set set of target CUDA architectues by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
+the set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
 Then generate dependencies:
@@ -142,4 +142,4 @@ In addition to the common Windows development tools described above, install AMD
 - [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
 - [Strawberry Perl](https://strawberryperl.com/)
-Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
+Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
--- a/docs/tutorials/langchainpy.md
+++ b/docs/tutorials/langchainpy.md
@@ -17,10 +17,12 @@ Let's start by asking a simple question that we can get an answer to from the **
 Then we can create a model and ask the question:
 ```python
-from langchain.llms import Ollama
+from langchain_community.llms import Ollama
-ollama = Ollama(base_url='http://localhost:11434',
+ollama = Ollama(
-model="llama2")
+    base_url='http://localhost:11434',
-print(ollama("why is the sky blue"))
+    model="llama3"
 )
 print(ollama.invoke("why is the sky blue"))
 ```
 Notice that we are defining the model and the base URL for Ollama.
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -1,47 +1,47 @@
-# Ollama Windows Preview
+# Ollama Windows Preview
-
+
-Welcome to the Ollama Windows preview.
+Welcome to the Ollama Windows preview.
-
+
-No more WSL required!
+No more WSL required!
-
+
-Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
+Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
-After installing Ollama Windows Preview, Ollama will run in the background and
+After installing Ollama Windows Preview, Ollama will run in the background and
-the `ollama` command line is available in `cmd`, `powershell` or your favorite
+the `ollama` command line is available in `cmd`, `powershell` or your favorite
-terminal application. As usual the Ollama [api](./api.md) will be served on
+terminal application. As usual the Ollama [api](./api.md) will be served on
-`http://localhost:11434`.
+`http://localhost:11434`.
-
+
-As this is a preview release, you should expect a few bugs here and there.  If
+As this is a preview release, you should expect a few bugs here and there.  If
-you run into a problem you can reach out on
+you run into a problem you can reach out on
-[Discord](https://discord.gg/ollama), or file an 
+[Discord](https://discord.gg/ollama), or file an
-[issue](https://github.com/ollama/ollama/issues).
+[issue](https://github.com/ollama/ollama/issues).
-Logs will often be helpful in dianosing the problem (see
+Logs will often be helpful in diagnosing the problem (see
-[Troubleshooting](#troubleshooting) below)
+[Troubleshooting](#troubleshooting) below)
-
+
-## System Requirements
+## System Requirements
-
+
-* Windows 10 or newer, Home or Pro
+* Windows 10 or newer, Home or Pro
-* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
+* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
-* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
+* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
-
+
-## API Access
+## API Access
-
+
-Here's a quick example showing API access from `powershell`
+Here's a quick example showing API access from `powershell`
-```powershell
+```powershell
-(Invoke-WebRequest -method POST -Body '{"model":"llama2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
+(Invoke-WebRequest -method POST -Body '{"model":"llama2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
-```
+```
-
+
-## Troubleshooting
+## Troubleshooting
-
+
-While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
+While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
-a "view logs" menu item to the app, and increses logging for the GUI app and
+a "view logs" menu item to the app, and increases logging for the GUI app and
-server.
+server.
-
+
-Ollama on Windows stores files in a few different locations.  You can view them in
+Ollama on Windows stores files in a few different locations.  You can view them in
-the explorer window by hitting `<cmd>+R` and type in:
+the explorer window by hitting `<cmd>+R` and typing in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
+- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
-    - *app.log* contains logs from the GUI application
+    - *app.log* contains logs from the GUI application
-    - *server.log* contains the server logs
+    - *server.log* contains the server logs
-    - *upgrade.log* contains log output for upgrades
+    - *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
+- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration
+- `explorer %HOMEPATH%\.ollama` contains models and configuration
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
+- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -32,9 +32,25 @@ func PayloadsDir() (string, error) {
 				slog.Error("failed to lookup executable path", "error", err)
 				return "", err
 			}
 			cwd, err := os.Getwd()
 			if err != nil {
 				slog.Error("failed to lookup working directory", "error", err)
 				return "", err
 			}
 			var paths []string
 			for _, root := range []string{filepath.Dir(appExe), cwd} {
 				paths = append(paths,
 					filepath.Join(root),
 					filepath.Join(root, "windows-"+runtime.GOARCH),
 					filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
 				)
 			}
 			// Try a few variations to improve developer experience when building from source in the local tree
-			for _, d := range []string{".", "windows-" + runtime.GOARCH, "dist\\windows-" + runtime.GOARCH} {
+			for _, p := range paths {
-				candidate := filepath.Join(filepath.Dir(appExe), d, "ollama_runners")
+				candidate := filepath.Join(p, "ollama_runners")
 				_, err := os.Stat(candidate)
 				if err == nil {
 					runnersDir = candidate
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -10,6 +10,12 @@ package gpu
 import "C"
 import (
 	"runtime"
 	"github.com/ollama/ollama/format"
 )
 const (
 	metalMinimumMemory = 512 * format.MebiByte
 )
 func GetGPUInfo() GpuInfoList {
@@ -32,7 +38,7 @@ func GetGPUInfo() GpuInfoList {
 	// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
 	info.FreeMemory = info.TotalMemory
-	info.MinimumMemory = 0
+	info.MinimumMemory = metalMinimumMemory
 	return []GpuInfo{info}
 }
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -107,7 +107,7 @@ func startServer(ctx context.Context, ollamaHost string) error {
 	if tmp := os.Getenv("OLLAMA_HOST"); tmp != ollamaHost {
 		slog.Info("setting env", "OLLAMA_HOST", ollamaHost)
-		os.Setenv("OLLAMA_HOST", ollamaHost)
+		t.Setenv("OLLAMA_HOST", ollamaHost)
 	}
 	slog.Info("starting server", "url", ollamaHost)
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1032,7 +1032,7 @@ struct llama_server_context
            slot.has_next_token = false;
        }
-        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
+        if (!slot.cache_tokens.empty() && llama_token_is_eog(model, result.tok))
        {
            slot.stopped_eos = true;
            slot.has_next_token = false;
@@ -1144,12 +1144,15 @@ struct llama_server_context
        res.result_json = json
        {
            {"content",    tkn.text_to_send},
            {"stop",       false},
            {"slot_id",    slot.id},
            {"multimodal", multimodal}
        };
        if (!llama_token_is_eog(model, tkn.tok)) {
            res.result_json["content"] = tkn.text_to_send;
        }
        if (slot.sparams.n_probs > 0)
        {
            std::vector<completion_token_output> probs_output = {};
@@ -2644,18 +2647,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            if (strncmp(sep, "int:", 4) == 0) {
                sep += 4;
                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-                kvo.int_value = std::atol(sep);
+                kvo.val_i64 = std::atol(sep);
            } else if (strncmp(sep, "float:", 6) == 0) {
                sep += 6;
                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-                kvo.float_value = std::atof(sep);
+                kvo.val_f64 = std::atof(sep);
            } else if (strncmp(sep, "bool:", 5) == 0) {
                sep += 5;
                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
                if (std::strcmp(sep, "true") == 0) {
-                    kvo.bool_value = true;
+                    kvo.val_bool = true;
                } else if (std::strcmp(sep, "false") == 0) {
-                    kvo.bool_value = false;
+                    kvo.val_bool = false;
                } else {
                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
                    invalid_param = true;
--- a/llm/filetype.go
+++ b/llm/filetype.go
@@ -0,0 +1,140 @@
 package llm
 import "fmt"
 // fileType is the numeric code of a model file format (F32, F16, Q4_0, ...).
 type fileType uint32
 // The codes are assigned consecutively from zero by iota, so entries must not
 // be reordered or removed; the "unused" entries only hold their slot.
 const (
 	fileTypeF32 fileType = iota
 	fileTypeF16
 	fileTypeQ4_0
 	fileTypeQ4_1
 	fileTypeQ4_1_F16
 	fileTypeQ4_2 // unused
 	fileTypeQ4_3 // unused
 	fileTypeQ8_0
 	fileTypeQ5_0
 	fileTypeQ5_1
 	fileTypeQ2_K
 	fileTypeQ3_K_S
 	fileTypeQ3_K_M
 	fileTypeQ3_K_L
 	fileTypeQ4_K_S
 	fileTypeQ4_K_M
 	fileTypeQ5_K_S
 	fileTypeQ5_K_M
 	fileTypeQ6_K
 	fileTypeIQ2_XXS
 	fileTypeIQ2_XS
 	fileTypeQ2_K_S
 	fileTypeQ3_K_XS
 	fileTypeIQ3_XXS
 	fileTypeUnknown
 )
 // fileTypeNames holds the textual form of every named fileType, indexed by
 // the constant's value. Codes without a name (the unused Q4_2 and Q4_3 slots)
 // are left as the empty string; fileTypeUnknown lies past the end of the table.
 var fileTypeNames = []string{
 	fileTypeF32:      "F32",
 	fileTypeF16:      "F16",
 	fileTypeQ4_0:     "Q4_0",
 	fileTypeQ4_1:     "Q4_1",
 	fileTypeQ4_1_F16: "Q4_1_F16",
 	fileTypeQ8_0:     "Q8_0",
 	fileTypeQ5_0:     "Q5_0",
 	fileTypeQ5_1:     "Q5_1",
 	fileTypeQ2_K:     "Q2_K",
 	fileTypeQ3_K_S:   "Q3_K_S",
 	fileTypeQ3_K_M:   "Q3_K_M",
 	fileTypeQ3_K_L:   "Q3_K_L",
 	fileTypeQ4_K_S:   "Q4_K_S",
 	fileTypeQ4_K_M:   "Q4_K_M",
 	fileTypeQ5_K_S:   "Q5_K_S",
 	fileTypeQ5_K_M:   "Q5_K_M",
 	fileTypeQ6_K:     "Q6_K",
 	fileTypeIQ2_XXS:  "IQ2_XXS",
 	fileTypeIQ2_XS:   "IQ2_XS",
 	fileTypeQ2_K_S:   "Q2_K_S",
 	fileTypeQ3_K_XS:  "Q3_K_XS",
 	fileTypeIQ3_XXS:  "IQ3_XXS",
 }
 // ParseFileType converts a format name such as "Q4_0" into its fileType.
 // The match is exact (case-sensitive). Any other input yields
 // fileTypeUnknown together with a non-nil error naming the input.
 func ParseFileType(s string) (fileType, error) {
 	// The empty string marks unnamed slots in the table and must never match.
 	if s != "" {
 		for code, name := range fileTypeNames {
 			if name == s {
 				return fileType(code), nil
 			}
 		}
 	}
 	return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
 }
 // String returns the format name for t, or "unknown" when t has no name
 // (the unused codes, fileTypeUnknown, and any value outside the table).
 func (t fileType) String() string {
 	if int64(t) < int64(len(fileTypeNames)) {
 		if name := fileTypeNames[t]; name != "" {
 			return name
 		}
 	}
 	return "unknown"
 }
 // Value returns t as a plain uint32.
 func (t fileType) Value() uint32 {
 	return uint32(t)
 }
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -26,16 +26,25 @@ function amdGPUs {
    $GPU_LIST -join ';'
 }
 function init_vars {
-    $script:SRC_DIR = $(resolve-path "..\..\")
+    if (!$script:SRC_DIR) {
-    $script:llamacppDir = "../llama.cpp"
+        $script:SRC_DIR = $(resolve-path "..\..\")
    }
    if (!$script:llamacppDir) {
        $script:llamacppDir = "../llama.cpp"
    }
    if (!$script:cmakeTargets) {
        $script:cmakeTargets = @("ollama_llama_server")
    }
    $script:cmakeDefs = @(
        "-DBUILD_SHARED_LIBS=on",
        "-DLLAMA_NATIVE=off"
        )
-    $script:cmakeTargets = @("ollama_llama_server")
+    $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
-    $script:ARCH = "amd64" # arm not yet supported.
+    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
    md "$script:DIST_BASE" -ea 0 > $null
    if ($env:CGO_CFLAGS -contains "-g") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
        $script:config = "RelWithDebInfo"
@@ -166,137 +175,195 @@ function cleanup {
    }
 }
 init_vars
 git_module_setup
 apply_patches
 # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
 # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
 # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
 $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
-if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
+function build_static() {
     # Builds the "llama" and "ggml" cmake targets as static (non-shared)
     # libraries using the MinGW gcc toolchain, into
     # ../build/windows/<ARCH>_static.
     # Skipped when OLLAMA_SKIP_STATIC_GENERATE is set, or when OLLAMA_CPU_TARGET
     # is set to anything other than "static".
     if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
         # GCC build for direct linking into the Go binary
         init_vars
         # cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
         # as we need this to be compiled by gcc for golang to be able to link with it
         write-host "Checking for MinGW..."
         # error action ensures we exit on failure
         get-command gcc
         get-command mingw32-make
         # Remember the current targets so they can be restored once the static build is done
         $oldTargets = $script:cmakeTargets
         $script:cmakeTargets = @("llama", "ggml")
         # All optional CPU instruction sets are switched off for the static library
         $script:cmakeDefs = @(
             "-G", "MinGW Makefiles"
             "-DCMAKE_C_COMPILER=gcc.exe",
             "-DCMAKE_CXX_COMPILER=g++.exe",
             "-DBUILD_SHARED_LIBS=off",
             "-DLLAMA_NATIVE=off",
             "-DLLAMA_AVX=off",
             "-DLLAMA_AVX2=off",
             "-DLLAMA_AVX512=off",
             "-DLLAMA_F16C=off",
             "-DLLAMA_FMA=off")
         $script:buildDir="../build/windows/${script:ARCH}_static"
         write-host "Building static library"
         build
         $script:cmakeTargets = $oldTargets
     } else {
         # This branch is reached via the static-specific switches above, so say so
         write-host "Skipping static generation step as requested"
     }
 }
 # Builds the baseline CPU runner with AVX, AVX2, AVX512, FMA and F16C all
 # disabled, staging it under "$script:DIST_BASE\cpu".
 # $gen_arch is passed to cmake as the "-A" (generator platform) value.
 # Skipped when OLLAMA_SKIP_CPU_GENERATE is set, or when OLLAMA_CPU_TARGET is set
 # to anything other than "cpu".
 # NOTE(review): build, sign and install are helpers defined outside this view.
 function build_cpu($gen_arch) {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
         # remaining llama.cpp builds use MSVC
         init_vars
         # Put the common CPU flags and this variant's flags ahead of the defaults set by init_vars
         $script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
         $script:buildDir="../build/windows/${script:ARCH}/cpu"
         $script:distDir="$script:DIST_BASE\cpu"
         write-host "Building LCD CPU"
         build
         sign
         install
     } else {
         write-host "Skipping CPU generation step as requested"
     }
 }
 # Builds the x64 CPU runner with AVX enabled (AVX2, AVX512, FMA and F16C off),
 # staging it under "$script:DIST_BASE\cpu_avx".
 # Skipped when OLLAMA_SKIP_CPU_GENERATE is set, or when OLLAMA_CPU_TARGET is set
 # to anything other than "cpu_avx".
 # NOTE(review): build, sign and install are helpers defined outside this view.
 function build_cpu_avx() {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
         init_vars
         # Put the common CPU flags and this variant's flags ahead of the defaults set by init_vars
         $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
         $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
         $script:distDir="$script:DIST_BASE\cpu_avx"
         write-host "Building AVX CPU"
         build
         sign
         install
     } else {
         write-host "Skipping CPU AVX generation step as requested"
     }
 }
 # Builds the x64 CPU runner with AVX, AVX2, FMA and F16C enabled (AVX512 off),
 # staging it under "$script:DIST_BASE\cpu_avx2".
 # Skipped when OLLAMA_SKIP_CPU_GENERATE is set, or when OLLAMA_CPU_TARGET is set
 # to anything other than "cpu_avx2".
 # NOTE(review): build, sign and install are helpers defined outside this view.
 function build_cpu_avx2() {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
         init_vars
         # Put the common CPU flags and this variant's flags ahead of the defaults set by init_vars
         $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
         $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
         $script:distDir="$script:DIST_BASE\cpu_avx2"
         write-host "Building AVX2 CPU"
         build
         sign
         install
     } else {
         write-host "Skipping CPU AVX2 generation step as requested"
     }
 }
 # Builds the CUDA runner (x64, LLAMA_CUDA on, AVX on, AVX2 off) and copies the
 # cudart/cublas/cublasLt DLLs from $script:CUDA_LIB_DIR into the dist directory.
 # Runs only when OLLAMA_SKIP_CUDA_GENERATE is unset and $script:CUDA_LIB_DIR is
 # non-empty. Extra cmake definitions can be supplied via OLLAMA_CUSTOM_CUDA_DEFS.
 # NOTE(review): build, sign and install are helpers defined outside this view.
 function build_cuda() {
     if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
         # Then build cuda as a dynamically loaded library
         $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
         # The variant suffix is the name of the directory two levels above nvcc.exe
         $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
         if ($null -ne $script:CUDA_VERSION) {
             $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
         }
         init_vars
         $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
         $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
         $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
         if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
             write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
             # Appended last, after the definitions set above
             $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
             write-host "building custom CUDA GPU"
         }
         build
         sign
         install
         write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
         cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
         cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
         cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
     } else {
         write-host "Skipping CUDA generation step"
     }
 }
 # Builds the ROCm runner with Ninja and clang (LLAMA_HIPBLAS on, AVX on,
 # AVX2 off), then copies hipblas.dll, rocblas.dll and the rocblas library
 # directory from HIP_PATH into dist\windows-<ARCH>\rocm.
 # Runs only when OLLAMA_SKIP_ROCM_GENERATE is unset and HIP_PATH is non-empty.
 # Extra cmake definitions can be supplied via OLLAMA_CUSTOM_ROCM_DEFS.
 # Side effects on the calling session: prepends "$env:HIP_PATH\bin" to PATH and
 # clears the LIB environment variable.
 # NOTE(review): build, sign, install and amdGPUs are helpers defined outside
 # this function.
 function build_rocm() {
     if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
         # The variant suffix is derived from the name of the HIP_PATH directory
         $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
         if ($null -ne $script:ROCM_VERSION) {
             $script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
         }
         init_vars
         $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
         $script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
         $script:cmakeDefs += @(
             "-G", "Ninja", 
             "-DCMAKE_C_COMPILER=clang.exe",
             "-DCMAKE_CXX_COMPILER=clang++.exe",
             "-DLLAMA_HIPBLAS=on",
             "-DHIP_PLATFORM=amd",
             "-DLLAMA_AVX=on",
             "-DLLAMA_AVX2=off",
             "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
             "-DAMDGPU_TARGETS=$(amdGPUs)",
             "-DGPU_TARGETS=$(amdGPUs)"
             )
         # Make sure the ROCm binary dir is first in the path
         $env:PATH="$env:HIP_PATH\bin;$env:PATH"
         # We have to clobber the LIB var from the developer shell for clang to work properly
         $env:LIB=""
         if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
             write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
             # Appended last, after the definitions set above
             $script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
             write-host "building custom ROCM GPU"
         }
         write-host "Building ROCm"
         build
         # Ninja doesn't prefix with config name
         ${script:config}=""
         # When a dumpbin path is configured, list the DLLs the built server depends on
         if ($null -ne $script:DUMPBIN) {
             & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
         }
         sign
         install
         # Assumes v5.7, may need adjustments for v6
         # Start from an empty rocm dist directory before copying the runtime libraries
         rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
         md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
         cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
         cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
         # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
         cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
     } else {
         write-host "Skipping ROCm generation step"
     }
 }
 # GCC build for direct linking into the Go binary
 init_vars
-# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
+if ($($args.count) -eq 0) {
-# as we need this to be compiled by gcc for golang to be able to link with itx
+    git_module_setup
-write-host "Checking for MinGW..."
+    apply_patches
-# error action ensures we exit on failure
+    build_static
-get-command gcc
+    if ($script:ARCH -eq "arm64") {
-get-command mingw32-make
+        build_cpu("ARM64")
-$script:cmakeTargets = @("llama", "ggml")
+    } else { # amd64
-$script:cmakeDefs = @(
+        build_cpu("x64")
-    "-G", "MinGW Makefiles"
+        build_cpu_avx
-    "-DCMAKE_C_COMPILER=gcc.exe",
+        build_cpu_avx2
-    "-DCMAKE_CXX_COMPILER=g++.exe",
+        build_cuda
-    "-DBUILD_SHARED_LIBS=off",
+        build_rocm
-    "-DLLAMA_NATIVE=off",
+    }
    "-DLLAMA_AVX=off",
    "-DLLAMA_AVX2=off",
    "-DLLAMA_AVX512=off",
    "-DLLAMA_F16C=off",
    "-DLLAMA_FMA=off")
 $script:buildDir="../build/windows/${script:ARCH}_static"
 write-host "Building static library"
 build
-# remaining llama.cpp builds use MSVC 
+    cleanup
-    init_vars
+    write-host "`ngo generate completed.  LLM runners: $(get-childitem -path $script:DIST_BASE)"
    $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
    $script:buildDir="../build/windows/${script:ARCH}/cpu"
    $script:distDir="$script:DIST_BASE\cpu"
    write-host "Building LCD CPU"
    build
    sign
    install
    init_vars
    $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
    $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
    $script:distDir="$script:DIST_BASE\cpu_avx"
    write-host "Building AVX CPU"
    build
    sign
    install
    init_vars
    $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
    $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
    $script:distDir="$script:DIST_BASE\cpu_avx2"
    write-host "Building AVX2 CPU"
    build
    sign
    install
 } else {
-    write-host "Skipping CPU generation step as requested"
+    for ( $i = 0; $i -lt $args.count; $i++ ) {
-}
+        write-host "performing $($args[$i])"
-
+        & $($args[$i])
-if ($null -ne $script:CUDA_LIB_DIR) {
+    } 
-    # Then build cuda as a dynamically loaded library
+}
    $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
    $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
    if ($null -ne $script:CUDA_VERSION) {
        $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
    }
    init_vars
    $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
    $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
    $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
    if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
        write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
        $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
        write-host "building custom CUDA GPU"
    }
    build
    sign
    install
 }
 if ($null -ne $env:HIP_PATH) {
    $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
    if ($null -ne $script:ROCM_VERSION) {
        $script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
    }
    init_vars
    $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
    $script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
    $script:cmakeDefs += @(
        "-G", "Ninja", 
        "-DCMAKE_C_COMPILER=clang.exe",
        "-DCMAKE_CXX_COMPILER=clang++.exe",
        "-DLLAMA_HIPBLAS=on",
        "-DHIP_PLATFORM=amd",
        "-DLLAMA_AVX=on",
        "-DLLAMA_AVX2=off",
        "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
        "-DAMDGPU_TARGETS=$(amdGPUs)",
        "-DGPU_TARGETS=$(amdGPUs)"
        )
    # Make sure the ROCm binary dir is first in the path
    $env:PATH="$env:HIP_PATH\bin;$env:PATH"
    # We have to clobber the LIB var from the developer shell for clang to work properly
    $env:LIB=""
    if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
        write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
        $script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
        write-host "building custom ROCM GPU"
    }
    write-host "Building ROCm"
    build
    # Ninja doesn't prefix with config name
    ${script:config}=""
    if ($null -ne $script:DUMPBIN) {
        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
    }
    sign
    install
 }
 cleanup
 write-host "`ngo generate completed.  LLM runners: $(get-childitem -path $script:DIST_BASE)"
--- a/llm/ggla.go
+++ b/llm/ggla.go
@@ -33,6 +33,7 @@ func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
 type ggla struct {
 	*containerGGLA
 	offset int64
 	kv      KV
 	tensors []*Tensor
@@ -53,6 +54,10 @@ func (llm *ggla) Tensors() Tensors {
 	return llm.tensors
 }
 // Offset returns the stream position that decode recorded after reading the
 // header fields (through "alpha") and before it started reading tensor records.
 func (llm *ggla) Offset() int64 {
 	return llm.offset
 }
 func (llm *ggla) decode(rs io.ReadSeeker) error {
 	var r uint32
 	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
@@ -66,6 +71,13 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
 	}
 	llm.kv["alpha"] = alpha
 	offset, err := rs.Seek(0, io.SeekCurrent)
 	if err != nil {
 		return err
 	}
 	llm.offset = offset
 	for {
 		var dims uint32
 		if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -13,85 +13,10 @@ type GGML struct {
 	model
 }
 const (
 	fileTypeF32 uint32 = iota
 	fileTypeF16
 	fileTypeQ4_0
 	fileTypeQ4_1
 	fileTypeQ4_1_F16
 	fileTypeQ8_0 uint32 = iota + 2
 	fileTypeQ5_0
 	fileTypeQ5_1
 	fileTypeQ2_K
 	fileTypeQ3_K_S
 	fileTypeQ3_K_M
 	fileTypeQ3_K_L
 	fileTypeQ4_K_S
 	fileTypeQ4_K_M
 	fileTypeQ5_K_S
 	fileTypeQ5_K_M
 	fileTypeQ6_K
 	fileTypeIQ2_XXS
 	fileTypeIQ2_XS
 	fileTypeQ2_K_S
 	fileTypeQ3_K_XS
 	fileTypeIQ3_XXS
 )
 func fileType(fileType uint32) string {
 	switch fileType {
 	case fileTypeF32:
 		return "F32"
 	case fileTypeF16:
 		return "F16"
 	case fileTypeQ4_0:
 		return "Q4_0"
 	case fileTypeQ4_1:
 		return "Q4_1"
 	case fileTypeQ4_1_F16:
 		return "Q4_1_F16"
 	case fileTypeQ8_0:
 		return "Q8_0"
 	case fileTypeQ5_0:
 		return "Q5_0"
 	case fileTypeQ5_1:
 		return "Q5_1"
 	case fileTypeQ2_K:
 		return "Q2_K"
 	case fileTypeQ3_K_S:
 		return "Q3_K_S"
 	case fileTypeQ3_K_M:
 		return "Q3_K_M"
 	case fileTypeQ3_K_L:
 		return "Q3_K_L"
 	case fileTypeQ4_K_S:
 		return "Q4_K_S"
 	case fileTypeQ4_K_M:
 		return "Q4_K_M"
 	case fileTypeQ5_K_S:
 		return "Q5_K_S"
 	case fileTypeQ5_K_M:
 		return "Q5_K_M"
 	case fileTypeQ6_K:
 		return "Q6_K"
 	case fileTypeIQ2_XXS:
 		return "IQ2_XXS"
 	case fileTypeIQ2_XS:
 		return "IQ2_XS"
 	case fileTypeQ2_K_S:
 		return "Q2_K_S"
 	case fileTypeQ3_K_XS:
 		return "Q3_K_XS"
 	case fileTypeIQ3_XXS:
 		return "IQ3_XXS"
 	default:
 		return "unknown"
 	}
 }
 // model is the set of accessors a decoded model exposes: its key/value
 // metadata, its tensors, and the offset recorded while decoding.
 type model interface {
 	KV() KV
 	Tensors() Tensors
 	Offset() int64
 }
 type KV map[string]any
@@ -123,7 +48,7 @@ func (kv KV) ParameterCount() uint64 {
 func (kv KV) FileType() string {
 	if u64 := kv.u64("general.file_type"); u64 > 0 {
-		return fileType(uint32(u64))
+		return fileType(uint32(u64)).String()
 	}
 	return "unknown"
@@ -286,6 +211,23 @@ const (
 var ErrUnsupportedFormat = errors.New("unsupported model format")
 // DetectGGMLType reports the container format named by the magic number in
 // the first four bytes of b, read as a little-endian uint32. It returns one of
 // "ggml", "ggmf", "ggjt", "ggla" or "gguf", or "" when the magic is not
 // recognized or b holds fewer than four bytes.
 func DetectGGMLType(b []byte) string {
 	// b[:4] is only valid on a buffer of at least four bytes; a shorter
 	// input cannot carry a magic number, so report it as undetected.
 	if len(b) < 4 {
 		return ""
 	}
 	switch binary.LittleEndian.Uint32(b[:4]) {
 	case FILE_MAGIC_GGML:
 		return "ggml"
 	case FILE_MAGIC_GGMF:
 		return "ggmf"
 	case FILE_MAGIC_GGJT:
 		return "ggjt"
 	case FILE_MAGIC_GGLA:
 		return "ggla"
 	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
 		return "gguf"
 	default:
 		return ""
 	}
 }
 func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -55,7 +55,7 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
 	model := newGGUF(c)
 	slog.Debug(fmt.Sprintf("model = %#v", model))
-	if err := model.Decode(rs); err != nil {
+	if err := model.decode(rs); err != nil {
 		return nil, err
 	}
@@ -90,6 +90,7 @@ const (
 type gguf struct {
 	*containerGGUF
 	offset int64
 	kv      KV
 	tensors []*Tensor
@@ -116,6 +117,10 @@ func (llm *gguf) Tensors() Tensors {
 	return llm.tensors
 }
 // Offset returns the value stored by decode, which sets it to a stream offset
 // plus padding.
 // NOTE(review): presumably this is where tensor data begins — confirm against decode.
 func (llm *gguf) Offset() int64 {
 	return llm.offset
 }
 func (llm *gguf) numTensor() uint64 {
 	switch llm.Version {
 	case 1:
@@ -138,7 +143,7 @@ func (llm *gguf) numKV() uint64 {
 	}
 }
-func (llm *gguf) Decode(rs io.ReadSeeker) error {
+func (llm *gguf) decode(rs io.ReadSeeker) error {
 	// decode key-values
 	for i := 0; uint64(i) < llm.numKV(); i++ {
 		k, err := readGGUFString(llm, rs)
@@ -250,6 +255,8 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 		return err
 	}
 	llm.offset = offset + padding
 	for _, tensor := range llm.tensors {
 		if _, err := rs.Seek(int64(tensor.size()), io.SeekCurrent); err != nil {
 			return err
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -4,6 +4,7 @@ package llm
 // #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
 // #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
 // #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
 // #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++
 // #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
 // #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
 // #include <stdlib.h>
@@ -19,7 +20,7 @@ func SystemInfo() string {
 	return C.GoString(C.llama_print_system_info())
 }
-func Quantize(infile, outfile, filetype string) error {
+func Quantize(infile, outfile string, ftype fileType) error {
 	cinfile := C.CString(infile)
 	defer C.free(unsafe.Pointer(cinfile))
@@ -28,58 +29,10 @@ func Quantize(infile, outfile, filetype string) error {
 	params := C.llama_model_quantize_default_params()
 	params.nthread = -1
 	params.ftype = ftype.Value()
-	switch filetype {
+	if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
-	case "F32":
+		return fmt.Errorf("llama_model_quantize: %d", rc)
 		params.ftype = fileTypeF32
 	case "F16":
 		params.ftype = fileTypeF16
 	case "Q4_0":
 		params.ftype = fileTypeQ4_0
 	case "Q4_1":
 		params.ftype = fileTypeQ4_1
 	case "Q4_1_F16":
 		params.ftype = fileTypeQ4_1_F16
 	case "Q8_0":
 		params.ftype = fileTypeQ8_0
 	case "Q5_0":
 		params.ftype = fileTypeQ5_0
 	case "Q5_1":
 		params.ftype = fileTypeQ5_1
 	case "Q2_K":
 		params.ftype = fileTypeQ2_K
 	case "Q3_K_S":
 		params.ftype = fileTypeQ3_K_S
 	case "Q3_K_M":
 		params.ftype = fileTypeQ3_K_M
 	case "Q3_K_L":
 		params.ftype = fileTypeQ3_K_L
 	case "Q4_K_S":
 		params.ftype = fileTypeQ4_K_S
 	case "Q4_K_M":
 		params.ftype = fileTypeQ4_K_M
 	case "Q5_K_S":
 		params.ftype = fileTypeQ5_K_S
 	case "Q5_K_M":
 		params.ftype = fileTypeQ5_K_M
 	case "Q6_K":
 		params.ftype = fileTypeQ6_K
 	case "IQ2_XXS":
 		params.ftype = fileTypeIQ2_XXS
 	case "IQ2_XS":
 		params.ftype = fileTypeIQ2_XS
 	case "Q2_K_S":
 		params.ftype = fileTypeQ2_K_S
 	case "Q3_K_XS":
 		params.ftype = fileTypeQ3_K_XS
 	case "IQ3_XXS":
 		params.ftype = fileTypeIQ3_XXS
 	default:
 		return fmt.Errorf("unknown filetype: %s", filetype)
 	}
 	if retval := C.llama_model_quantize(cinfile, coutfile, &params); retval != 0 {
 		return fmt.Errorf("llama_model_quantize: %d", retval)
 	}
 	return nil
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -88,6 +88,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	graphFullOffload *= uint64(len(gpus))
 	graphPartialOffload *= uint64(len(gpus))
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
 	}
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
 	memoryRequiredTotal := memoryMinimum + graphFullOffload
@@ -102,10 +107,14 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	layers := ggml.Tensors().Layers()
 	var memoryLayerOutput uint64
-	for k, v := range layers {
+	if layer, ok := layers["output_norm"]; ok {
-		if k == "output" || k == "output_norm" {
+		memoryLayerOutput += layer.size()
-			memoryLayerOutput += v.size()
+	}
-		}
+
 	if layer, ok := layers["output"]; ok {
 		memoryLayerOutput += layer.size()
 	} else if layer, ok := layers["token_embd"]; ok {
 		memoryLayerOutput += layer.size()
 	}
 	if gpus[0].Library == "metal" && opts.UseMMap {
--- a/llm/patches/02-clip-log.diff
+++ b/llm/patches/02-clip-log.diff
@@ -0,0 +1,12 @@
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
 index e431c7f7..f077e688 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
 +#include "common.h"
 #include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
--- a/llm/server.go
+++ b/llm/server.go
@@ -73,8 +73,7 @@ func LoadModel(model string) (*GGML, error) {
 func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
 	var err error
 	if opts.NumCtx > int(ggml.KV().ContextLength()) {
-		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
+		slog.Warn("requested context length is greater than the model's training context window size", "requested", opts.NumCtx, "training size", ggml.KV().ContextLength())
 		opts.NumCtx = int(ggml.KV().ContextLength())
 	}
 	if opts.NumCtx < 4 {
@@ -301,12 +300,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			continue
 		}
 		// reap subprocess when it exits
 		go func() {
 			// Exit status managed via getServerStatus
 			_ = s.cmd.Wait()
 		}()
 		// TODO - make sure this is all wired up correctly
 		// if err = s.WaitUntilRunning(); err != nil {
 		// 	slog.Error("error starting llama server", "server", servers[i], "error", err)
@@ -442,7 +435,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 		select {
 		case <-ctx.Done():
 			slog.Info("context expired before server started")
-			return fmt.Errorf("timed out waiting for llama runner to start")
+			return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
 		case err := <-s.done:
 			msg := ""
 			if s.status != nil && s.status.LastErrMsg != "" {
@@ -900,7 +893,13 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
 func (s *llmServer) Close() error {
 	if s.cmd != nil {
 		slog.Debug("stopping llama server")
-		return s.cmd.Process.Kill()
+		if err := s.cmd.Process.Kill(); err != nil {
 			return err
 		}
 		_ = s.cmd.Wait()
 		slog.Debug("llama server stopped")
 	}
 	return nil
--- a/macapp/src/app.tsx
+++ b/macapp/src/app.tsx
@@ -19,7 +19,7 @@ export default function () {
  const [step, setStep] = useState<Step>(Step.WELCOME)
  const [commandCopied, setCommandCopied] = useState<boolean>(false)
-  const command = 'ollama run llama2'
+  const command = 'ollama run llama3'
  return (
    <div className='drag'>
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -6,8 +6,8 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"log/slog"
+	"strconv"
-	"slices"
+	"strings"
 )
 type Command struct {
@@ -15,118 +15,283 @@ type Command struct {
 	Args string
 }
-func (c *Command) Reset() {
+type state int
 	c.Name = ""
 	c.Args = ""
 }
-func Parse(reader io.Reader) ([]Command, error) {
+const (
-	var commands []Command
+	stateNil state = iota
-	var command, modelCommand Command
+	stateName
 	stateValue
 	stateParameter
 	stateMessage
 	stateComment
 )
-	scanner := bufio.NewScanner(reader)
+var (
-	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), bufio.MaxScanTokenSize)
+	errMissingFrom        = errors.New("no FROM line")
-	scanner.Split(scanModelfile)
+	errInvalidMessageRole = errors.New("message role must be one of \"system\", \"user\", or \"assistant\"")
-	for scanner.Scan() {
+	errInvalidCommand     = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"")
-		line := scanner.Bytes()
+)
-		fields := bytes.SplitN(line, []byte(" "), 2)
+func Format(cmds []Command) string {
-		if len(fields) == 0 || len(fields[0]) == 0 {
+	var sb strings.Builder
-			continue
+	for _, cmd := range cmds {
-		}
+		name := cmd.Name
 		args := cmd.Args
-		switch string(bytes.ToUpper(fields[0])) {
+		switch cmd.Name {
-		case "FROM":
+		case "model":
-			command.Name = "model"
+			name = "from"
-			command.Args = string(bytes.TrimSpace(fields[1]))
+			args = cmd.Args
-			// copy command for validation
+		case "license", "template", "system", "adapter":
-			modelCommand = command
+			args = quote(args)
-		case "ADAPTER":
+		case "message":
-			command.Name = string(bytes.ToLower(fields[0]))
+			role, message, _ := strings.Cut(cmd.Args, ": ")
-			command.Args = string(bytes.TrimSpace(fields[1]))
+			args = role + " " + quote(message)
 		case "LICENSE", "TEMPLATE", "SYSTEM", "PROMPT":
 			command.Name = string(bytes.ToLower(fields[0]))
 			command.Args = string(fields[1])
 		case "PARAMETER":
 			fields = bytes.SplitN(fields[1], []byte(" "), 2)
 			if len(fields) < 2 {
 				return nil, fmt.Errorf("missing value for %s", fields)
 			}
 			command.Name = string(fields[0])
 			command.Args = string(bytes.TrimSpace(fields[1]))
 		case "EMBED":
 			return nil, fmt.Errorf("deprecated command: EMBED is no longer supported, use the /embed API endpoint instead")
 		case "MESSAGE":
 			command.Name = string(bytes.ToLower(fields[0]))
 			fields = bytes.SplitN(fields[1], []byte(" "), 2)
 			if len(fields) < 2 {
 				return nil, fmt.Errorf("should be in the format <role> <message>")
 			}
 			if !slices.Contains([]string{"system", "user", "assistant"}, string(bytes.ToLower(fields[0]))) {
 				return nil, fmt.Errorf("role must be one of \"system\", \"user\", or \"assistant\"")
 			}
 			command.Args = fmt.Sprintf("%s: %s", string(bytes.ToLower(fields[0])), string(fields[1]))
 		default:
-			if !bytes.HasPrefix(fields[0], []byte("#")) {
+			name = "parameter"
-				// log a warning for unknown commands
+			args = cmd.Name + " " + quote(cmd.Args)
 				slog.Warn(fmt.Sprintf("Unknown command: %s", fields[0]))
 			}
 			continue
 		}
-		commands = append(commands, command)
+		fmt.Fprintln(&sb, strings.ToUpper(name), args)
 		command.Reset()
 	}
-	if modelCommand.Args == "" {
+	return sb.String()
 		return nil, errors.New("no FROM line for the model was specified")
 	}
 	return commands, scanner.Err()
 }
-func scanModelfile(data []byte, atEOF bool) (advance int, token []byte, err error) {
+func Parse(r io.Reader) (cmds []Command, err error) {
-	advance, token, err = scan([]byte(`"""`), []byte(`"""`), data, atEOF)
+	var cmd Command
-	if err != nil {
+	var curr state
-		return 0, nil, err
+	var b bytes.Buffer
-	}
+	var role string
-	if advance > 0 && token != nil {
+	br := bufio.NewReader(r)
-		return advance, token, nil
+	for {
-	}
+		r, _, err := br.ReadRune()
-
+		if errors.Is(err, io.EOF) {
-	advance, token, err = scan([]byte(`"`), []byte(`"`), data, atEOF)
+			break
-	if err != nil {
+		} else if err != nil {
-		return 0, nil, err
+			return nil, err
 	}
 	if advance > 0 && token != nil {
 		return advance, token, nil
 	}
 	return bufio.ScanLines(data, atEOF)
 }
 func scan(openBytes, closeBytes, data []byte, atEOF bool) (advance int, token []byte, err error) {
 	newline := bytes.IndexByte(data, '\n')
 	if start := bytes.Index(data, openBytes); start >= 0 && start < newline {
 		end := bytes.Index(data[start+len(openBytes):], closeBytes)
 		if end < 0 {
 			if atEOF {
 				return 0, nil, fmt.Errorf("unterminated %s: expecting %s", openBytes, closeBytes)
 			} else {
 				return 0, nil, nil
 			}
 		}
-		n := start + len(openBytes) + end + len(closeBytes)
+		next, r, err := parseRuneForState(r, curr)
 		if errors.Is(err, io.ErrUnexpectedEOF) {
 			return nil, fmt.Errorf("%w: %s", err, b.String())
 		} else if err != nil {
 			return nil, err
 		}
-		newData := data[:start]
+		// process the state transition, some transitions need to be intercepted and redirected
-		newData = append(newData, data[start+len(openBytes):n-len(closeBytes)]...)
+		if next != curr {
-		return n, newData, nil
+			switch curr {
 			case stateName:
 				if !isValidCommand(b.String()) {
 					return nil, errInvalidCommand
 				}
 				// next state sometimes depends on the current buffer value
 				switch s := strings.ToLower(b.String()); s {
 				case "from":
 					cmd.Name = "model"
 				case "parameter":
 					// transition to stateParameter which sets command name
 					next = stateParameter
 				case "message":
 					// transition to stateMessage which validates the message role
 					next = stateMessage
 					fallthrough
 				default:
 					cmd.Name = s
 				}
 			case stateParameter:
 				cmd.Name = b.String()
 			case stateMessage:
 				if !isValidMessageRole(b.String()) {
 					return nil, errInvalidMessageRole
 				}
 				role = b.String()
 			case stateComment, stateNil:
 				// pass
 			case stateValue:
 				s, ok := unquote(b.String())
 				if !ok || isSpace(r) {
 					if _, err := b.WriteRune(r); err != nil {
 						return nil, err
 					}
 					continue
 				}
 				if role != "" {
 					s = role + ": " + s
 					role = ""
 				}
 				cmd.Args = s
 				cmds = append(cmds, cmd)
 			}
 			b.Reset()
 			curr = next
 		}
 		if strconv.IsPrint(r) {
 			if _, err := b.WriteRune(r); err != nil {
 				return nil, err
 			}
 		}
 	}
-	return 0, nil, nil
+	// flush the buffer
 	switch curr {
 	case stateComment, stateNil:
 		// pass; nothing to flush
 	case stateValue:
 		s, ok := unquote(b.String())
 		if !ok {
 			return nil, io.ErrUnexpectedEOF
 		}
 		if role != "" {
 			s = role + ": " + s
 		}
 		cmd.Args = s
 		cmds = append(cmds, cmd)
 	default:
 		return nil, io.ErrUnexpectedEOF
 	}
 	for _, cmd := range cmds {
 		if cmd.Name == "model" {
 			return cmds, nil
 		}
 	}
 	return nil, errMissingFrom
 }
 // parseRuneForState advances the Modelfile state machine by a single rune.
 //
 // Given the rune r read while in state cs, it returns the state to move to,
 // the rune the caller should consider for buffering (0 when r carries no
 // content of its own, e.g. separators and comment text), and an error when r
 // is not allowed in cs.
 func parseRuneForState(r rune, cs state) (state, rune, error) {
 	switch cs {
 	case stateNil:
 		switch {
 		case r == '#':
 			return stateComment, 0, nil
 		case isSpace(r), isNewline(r):
 			// skip leading whitespace and blank lines
 			return stateNil, 0, nil
 		default:
 			return stateName, r, nil
 		}
 	case stateName:
 		switch {
 		case isAlpha(r):
 			return stateName, r, nil
 		case isSpace(r):
 			return stateValue, 0, nil
 		default:
 			return stateNil, 0, errInvalidCommand
 		}
 	case stateValue:
 		switch {
 		case isNewline(r), isSpace(r):
 			// whitespace may end the value; the rune is handed back so the
 			// caller can keep it when the value is still inside quotes
 			return stateNil, r, nil
 		default:
 			return stateValue, r, nil
 		}
 	case stateParameter:
 		switch {
 		case isAlpha(r), isNumber(r), r == '_':
 			return stateParameter, r, nil
 		case isSpace(r):
 			return stateValue, 0, nil
 		default:
 			return stateNil, 0, io.ErrUnexpectedEOF
 		}
 	case stateMessage:
 		switch {
 		case isAlpha(r):
 			return stateMessage, r, nil
 		case isSpace(r):
 			return stateValue, 0, nil
 		default:
 			return stateNil, 0, io.ErrUnexpectedEOF
 		}
 	case stateComment:
 		switch {
 		case isNewline(r):
 			return stateNil, 0, nil
 		default:
 			return stateComment, 0, nil
 		}
 	default:
 		// an unknown state is a programming error; report it with a usable
 		// message instead of an empty error string
 		return stateNil, 0, fmt.Errorf("invalid parser state: %d", cs)
 	}
 }
 // quote returns s in a form that can be written as a Modelfile value and
 // read back unchanged by unquote.
 //
 // Values are left bare unless they are empty, contain a newline, start or
 // end with a space, or start with a double quote; those would otherwise be
 // lost or misread when parsed again. Quoted values use triple quotes when s
 // itself contains a double quote, and single double quotes otherwise.
 func quote(s string) string {
 	needsQuotes := s == "" ||
 		strings.Contains(s, "\n") ||
 		strings.HasPrefix(s, " ") ||
 		strings.HasSuffix(s, " ") ||
 		strings.HasPrefix(s, "\"")
 	if !needsQuotes {
 		return s
 	}
 	if strings.Contains(s, "\"") {
 		return `"""` + s + `"""`
 	}
 	return `"` + s + `"`
 }
 // unquote strips one layer of surrounding double quotes from s.
 //
 // A value opened with """ must also end with """, and a value opened with "
 // must also end with "; otherwise, or when s is empty, it returns "" and
 // false. A value that does not start with a double quote is returned as is.
 func unquote(s string) (string, bool) {
 	const triple = `"""`
 	n := len(s)
 	// TODO: single quotes
 	switch {
 	case n == 0:
 		return "", false
 	case n >= len(triple) && s[:len(triple)] == triple:
 		if n < 2*len(triple) || s[n-len(triple):] != triple {
 			return "", false
 		}
 		return s[len(triple) : n-len(triple)], true
 	case s[0] == '"':
 		if n < 2 || s[n-1] != '"' {
 			return "", false
 		}
 		return s[1 : n-1], true
 	default:
 		return s, true
 	}
 }
 // isAlpha reports whether r is an ASCII letter (a-z or A-Z).
 func isAlpha(r rune) bool {
 	switch {
 	case 'a' <= r && r <= 'z':
 		return true
 	case 'A' <= r && r <= 'Z':
 		return true
 	}
 	return false
 }
 // isNumber reports whether r is an ASCII decimal digit.
 func isNumber(r rune) bool {
 	if r < '0' {
 		return false
 	}
 	return r <= '9'
 }
 // isSpace reports whether r is a space or a horizontal tab.
 func isSpace(r rune) bool {
 	switch r {
 	case ' ', '\t':
 		return true
 	}
 	return false
 }
 // isNewline reports whether r is a carriage return or a line feed.
 func isNewline(r rune) bool {
 	switch r {
 	case '\r', '\n':
 		return true
 	}
 	return false
 }
 // isValidMessageRole reports whether role is exactly "system", "user", or
 // "assistant"; the comparison is case-sensitive.
 func isValidMessageRole(role string) bool {
 	switch role {
 	case "system", "user", "assistant":
 		return true
 	default:
 		return false
 	}
 }
 // isValidCommand reports whether cmd, compared after lowercasing, is one of
 // the known Modelfile command names.
 func isValidCommand(cmd string) bool {
 	lowered := strings.ToLower(cmd)
 	for _, known := range [...]string{"from", "license", "template", "system", "adapter", "parameter", "message"} {
 		if lowered == known {
 			return true
 		}
 	}
 	return false
 }
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -1,14 +1,16 @@
 package parser
 import (
 	"bytes"
 	"fmt"
 	"io"
 	"strings"
 	"testing"
 	"github.com/stretchr/testify/assert"
 )
-func Test_Parser(t *testing.T) {
+func TestParser(t *testing.T) {
 	input := `
 FROM model1
 ADAPTER adapter1
@@ -35,21 +37,62 @@ TEMPLATE template1
 	assert.Equal(t, expectedCommands, commands)
 }
-func Test_Parser_NoFromLine(t *testing.T) {
+func TestParserFrom(t *testing.T) {
 	var cases = []struct {
 		input    string
 		expected []Command
 		err      error
 	}{
 		{
 			"FROM foo",
 			[]Command{{Name: "model", Args: "foo"}},
 			nil,
 		},
 		{
 			"FROM /path/to/model",
 			[]Command{{Name: "model", Args: "/path/to/model"}},
 			nil,
 		},
 		{
 			"FROM /path/to/model/fp16.bin",
 			[]Command{{Name: "model", Args: "/path/to/model/fp16.bin"}},
 			nil,
 		},
 		{
 			"FROM llama3:latest",
 			[]Command{{Name: "model", Args: "llama3:latest"}},
 			nil,
 		},
 		{
 			"FROM llama3:7b-instruct-q4_K_M",
 			[]Command{{Name: "model", Args: "llama3:7b-instruct-q4_K_M"}},
 			nil,
 		},
 		{
 			"", nil, errMissingFrom,
 		},
 		{
 			"PARAMETER param1 value1",
 			nil,
 			errMissingFrom,
 		},
 		{
 			"PARAMETER param1 value1\nFROM foo",
 			[]Command{{Name: "param1", Args: "value1"}, {Name: "model", Args: "foo"}},
 			nil,
 		},
 	}
-	input := `
+	for _, c := range cases {
-PARAMETER param1 value1
+		t.Run("", func(t *testing.T) {
-PARAMETER param2 value2
+			commands, err := Parse(strings.NewReader(c.input))
-`
+			assert.ErrorIs(t, err, c.err)
-
+			assert.Equal(t, c.expected, commands)
-	reader := strings.NewReader(input)
+		})
-
+	}
 	_, err := Parse(reader)
 	assert.ErrorContains(t, err, "no FROM line")
 }
-func Test_Parser_MissingValue(t *testing.T) {
+func TestParserParametersMissingValue(t *testing.T) {
 	input := `
 FROM foo
 PARAMETER param1
@@ -58,41 +101,401 @@ PARAMETER param1
 	reader := strings.NewReader(input)
 	_, err := Parse(reader)
-	assert.ErrorContains(t, err, "missing value for [param1]")
+	assert.ErrorIs(t, err, io.ErrUnexpectedEOF)
 }
 // TestParserBadCommand checks that a line starting with an unknown command
 // name makes Parse fail with errInvalidCommand.
 func TestParserBadCommand(t *testing.T) {
 	input := `
 FROM foo
 BADCOMMAND param1 value1
 `
 	_, err := Parse(strings.NewReader(input))
 	assert.ErrorIs(t, err, errInvalidCommand)
 }
-func Test_Parser_Messages(t *testing.T) {
+func TestParserMessages(t *testing.T) {
-
+	var cases = []struct {
-	input := `
+		input    string
 		expected []Command
 		err      error
 	}{
 		{
 			`
 FROM foo
 MESSAGE system You are a Parser. Always Parse things.
 `,
 			[]Command{
 				{Name: "model", Args: "foo"},
 				{Name: "message", Args: "system: You are a Parser. Always Parse things."},
 			},
 			nil,
 		},
 		{
 			`
 FROM foo
 MESSAGE system You are a Parser. Always Parse things.`,
 			[]Command{
 				{Name: "model", Args: "foo"},
 				{Name: "message", Args: "system: You are a Parser. Always Parse things."},
 			},
 			nil,
 		},
 		{
 			`
 FROM foo
 MESSAGE system You are a Parser. Always Parse things.
 MESSAGE user Hey there!
 MESSAGE assistant Hello, I want to parse all the things!
-`
+`,
-
+			[]Command{
-	reader := strings.NewReader(input)
+				{Name: "model", Args: "foo"},
-	commands, err := Parse(reader)
+				{Name: "message", Args: "system: You are a Parser. Always Parse things."},
-	assert.Nil(t, err)
+				{Name: "message", Args: "user: Hey there!"},
-
+				{Name: "message", Args: "assistant: Hello, I want to parse all the things!"},
-	expectedCommands := []Command{
+			},
-		{Name: "model", Args: "foo"},
+			nil,
-		{Name: "message", Args: "system: You are a Parser. Always Parse things."},
+		},
-		{Name: "message", Args: "user: Hey there!"},
+		{
-		{Name: "message", Args: "assistant: Hello, I want to parse all the things!"},
+			`
-	}
+FROM foo
-
+MESSAGE system """
-	assert.Equal(t, expectedCommands, commands)
+You are a multiline Parser. Always Parse things.
-}
+"""
-
+			`,
-func Test_Parser_Messages_BadRole(t *testing.T) {
+			[]Command{
-
+				{Name: "model", Args: "foo"},
-	input := `
+				{Name: "message", Args: "system: \nYou are a multiline Parser. Always Parse things.\n"},
 			},
 			nil,
 		},
 		{
 			`
 FROM foo
 MESSAGE badguy I'm a bad guy!
-`
+`,
 			nil,
 			errInvalidMessageRole,
 		},
 		{
 			`
 FROM foo
 MESSAGE system
 `,
 			nil,
 			io.ErrUnexpectedEOF,
 		},
 		{
 			`
 FROM foo
 MESSAGE system`,
 			nil,
 			io.ErrUnexpectedEOF,
 		},
 	}
 	for _, c := range cases {
 		t.Run("", func(t *testing.T) {
 			commands, err := Parse(strings.NewReader(c.input))
 			assert.ErrorIs(t, err, c.err)
 			assert.Equal(t, c.expected, commands)
 		})
 	}
 }
 // TestParserQuoted checks how Parse handles values wrapped in " and """:
 // multiline values, values with embedded quotes, empty quoted values, and
 // unterminated quotes, which must fail with io.ErrUnexpectedEOF.
 func TestParserQuoted(t *testing.T) {
 	var cases = []struct {
 		multiline string
 		expected  []Command
 		err       error
 	}{
 		{
 			`
 FROM foo
 SYSTEM """
 This is a
 multiline system.
 """
 			`,
 			[]Command{
 				{Name: "model", Args: "foo"},
 				{Name: "system", Args: "\nThis is a\nmultiline system.\n"},
 			},
 			nil,
 		},
 		{
 			`
 FROM foo
 SYSTEM """
 This is a
 multiline system."""
 			`,
 			[]Command{
 				{Name: "model", Args: "foo"},
 				{Name: "system", Args: "\nThis is a\nmultiline system."},
 			},
 			nil,
 		},
 		{
 			`
 FROM foo
 SYSTEM """This is a
 multiline system."""
 			`,
 			[]Command{
 				{Name: "model", Args: "foo"},
 				{Name: "system", Args: "This is a\nmultiline system."},
 			},
 			nil,
 		},
 		{
 			`
 FROM foo
 SYSTEM """This is a multiline system."""
 			`,
 			[]Command{
 				{Name: "model", Args: "foo"},
 				{Name: "system", Args: "This is a multiline system."},
 			},
 			nil,
 		},
 		{
 			// closing delimiter is one quote short
 			`
 FROM foo
 SYSTEM """This is a multiline system.""
 			`,
 			nil,
 			io.ErrUnexpectedEOF,
 		},
 		{
 			// opening quote is never closed
 			`
 FROM foo
 SYSTEM "
 			`,
 			nil,
 			io.ErrUnexpectedEOF,
 		},
 		{
 			`
 FROM foo
 SYSTEM """
 This is a multiline system with "quotes".
 """
 `,
 			[]Command{
 				{Name: "model", Args: "foo"},
 				{Name: "system", Args: "\nThis is a multiline system with \"quotes\".\n"},
 			},
 			nil,
 		},
 		{
 			`
 FROM foo
 SYSTEM """"""
 `,
 			[]Command{
 				{Name: "model", Args: "foo"},
 				{Name: "system", Args: ""},
 			},
 			nil,
 		},
 		{
 			`
 FROM foo
 SYSTEM ""
 `,
 			[]Command{
 				{Name: "model", Args: "foo"},
 				{Name: "system", Args: ""},
 			},
 			nil,
 		},
 		{
 			`
 FROM foo
 SYSTEM "'"
 `,
 			[]Command{
 				{Name: "model", Args: "foo"},
 				{Name: "system", Args: "'"},
 			},
 			nil,
 		},
 		{
 			`
 FROM foo
 SYSTEM """''"'""'""'"'''''""'""'"""
 `,
 			[]Command{
 				{Name: "model", Args: "foo"},
 				{Name: "system", Args: `''"'""'""'"'''''""'""'`},
 			},
 			nil,
 		},
 		{
 			`
 FROM foo
 TEMPLATE """
 {{ .Prompt }}
 """`,
 			[]Command{
 				{Name: "model", Args: "foo"},
 				{Name: "template", Args: "\n{{ .Prompt }}\n"},
 			},
 			nil,
 		},
 	}
 	for _, c := range cases {
 		t.Run("", func(t *testing.T) {
 			commands, err := Parse(strings.NewReader(c.multiline))
 			assert.ErrorIs(t, err, c.err)
 			assert.Equal(t, c.expected, commands)
 		})
 	}
 }
 // TestParserParameters checks that each "PARAMETER <name> <value>" line is
 // parsed into a Command named after the parameter. Each map key is the text
 // written after PARAMETER; the map value is the expected name and value.
 func TestParserParameters(t *testing.T) {
 	var cases = map[string]struct {
 		name, value string
 	}{
 		"numa true":                    {"numa", "true"},
 		"num_ctx 1":                    {"num_ctx", "1"},
 		"num_batch 1":                  {"num_batch", "1"},
 		"num_gqa 1":                    {"num_gqa", "1"},
 		"num_gpu 1":                    {"num_gpu", "1"},
 		"main_gpu 1":                   {"main_gpu", "1"},
 		"low_vram true":                {"low_vram", "true"},
 		"f16_kv true":                  {"f16_kv", "true"},
 		"logits_all true":              {"logits_all", "true"},
 		"vocab_only true":              {"vocab_only", "true"},
 		"use_mmap true":                {"use_mmap", "true"},
 		"use_mlock true":               {"use_mlock", "true"},
 		"num_thread 1":                 {"num_thread", "1"},
 		"num_keep 1":                   {"num_keep", "1"},
 		"seed 1":                       {"seed", "1"},
 		"num_predict 1":                {"num_predict", "1"},
 		"top_k 1":                      {"top_k", "1"},
 		"top_p 1.0":                    {"top_p", "1.0"},
 		"tfs_z 1.0":                    {"tfs_z", "1.0"},
 		"typical_p 1.0":                {"typical_p", "1.0"},
 		"repeat_last_n 1":              {"repeat_last_n", "1"},
 		"temperature 1.0":              {"temperature", "1.0"},
 		"repeat_penalty 1.0":           {"repeat_penalty", "1.0"},
 		"presence_penalty 1.0":         {"presence_penalty", "1.0"},
 		"frequency_penalty 1.0":        {"frequency_penalty", "1.0"},
 		"mirostat 1":                   {"mirostat", "1"},
 		"mirostat_tau 1.0":             {"mirostat_tau", "1.0"},
 		"mirostat_eta 1.0":             {"mirostat_eta", "1.0"},
 		"penalize_newline true":        {"penalize_newline", "true"},
 		"stop ### User:":               {"stop", "### User:"},
 		"stop ### User: ":              {"stop", "### User: "},
 		"stop \"### User:\"":           {"stop", "### User:"},
 		"stop \"### User: \"":          {"stop", "### User: "},
 		"stop \"\"\"### User:\"\"\"":   {"stop", "### User:"},
 		"stop \"\"\"### User:\n\"\"\"": {"stop", "### User:\n"},
 		"stop <|endoftext|>":           {"stop", "<|endoftext|>"},
 		"stop <|eot_id|>":              {"stop", "<|eot_id|>"},
 		"stop </s>":                    {"stop", "</s>"},
 	}
 	for k, v := range cases {
 		t.Run(k, func(t *testing.T) {
 			// build a two-line Modelfile: a FROM line followed by the parameter under test
 			var b bytes.Buffer
 			fmt.Fprintln(&b, "FROM foo")
 			fmt.Fprintln(&b, "PARAMETER", k)
 			commands, err := Parse(&b)
 			assert.Nil(t, err)
 			assert.Equal(t, []Command{
 				{Name: "model", Args: "foo"},
 				{Name: v.name, Args: v.value},
 			}, commands)
 		})
 	}
 }
 // TestParserComments checks that a line starting with '#' is ignored by
 // Parse and produces no Command.
 func TestParserComments(t *testing.T) {
 	var cases = []struct {
 		input    string
 		expected []Command
 	}{
 		{
 			`
 # comment
 FROM foo
 	`,
 			[]Command{
 				{Name: "model", Args: "foo"},
 			},
 		},
 	}
 	for _, c := range cases {
 		t.Run("", func(t *testing.T) {
 			commands, err := Parse(strings.NewReader(c.input))
 			assert.Nil(t, err)
 			assert.Equal(t, c.expected, commands)
 		})
 	}
 }
 func TestParseFormatParse(t *testing.T) {
 	var cases = []string{
 		`
 FROM foo
 ADAPTER adapter1
 LICENSE MIT
 PARAMETER param1 value1
 PARAMETER param2 value2
 TEMPLATE template1
 MESSAGE system You are a Parser. Always Parse things.
 MESSAGE user Hey there!
 MESSAGE assistant Hello, I want to parse all the things!
 `,
 		`
 FROM foo
 ADAPTER adapter1
 LICENSE MIT
 PARAMETER param1 value1
 PARAMETER param2 value2
 TEMPLATE template1
 MESSAGE system """
 You are a store greeter. Always responsed with "Hello!".
 """
 MESSAGE user Hey there!
 MESSAGE assistant Hello, I want to parse all the things!
 `,
 		`
 FROM foo
 ADAPTER adapter1
 LICENSE """
 Very long and boring legal text.
 Blah blah blah.
 "Oh look, a quote!"
 """
 PARAMETER param1 value1
 PARAMETER param2 value2
 TEMPLATE template1
 MESSAGE system """
 You are a store greeter. Always responsed with "Hello!".
 """
 MESSAGE user Hey there!
 MESSAGE assistant Hello, I want to parse all the things!
 `,
 	}
 	for _, c := range cases {
 		t.Run("", func(t *testing.T) {
 			commands, err := Parse(strings.NewReader(c))
 			assert.NoError(t, err)
 			commands2, err := Parse(strings.NewReader(Format(commands)))
 			assert.NoError(t, err)
 			assert.Equal(t, commands, commands2)
 		})
 	}
 	reader := strings.NewReader(input)
 	_, err := Parse(reader)
 	assert.ErrorContains(t, err, "role must be one of \"system\", \"user\", or \"assistant\"")
 }
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -7,6 +7,8 @@
 $ErrorActionPreference = "Stop"
 function checkEnv() {
    $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower()
    Write-host "Building for ${script:TARGET_ARCH}"
    write-host "Locating required tools and paths"
    $script:SRC_DIR=$PWD
    if (!$env:VCToolsRedistDir) {
@@ -30,7 +32,7 @@ function checkEnv() {
    $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0]
-    $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-amd64"
+    $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
    $env:CGO_ENABLED="1"
    echo "Checking version"
    if (!$env:VERSION) {
@@ -81,8 +83,8 @@ function buildOllama() {
            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    }
-    New-Item -ItemType Directory -Path .\dist\windows-amd64\ -Force
+    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
-    cp .\ollama.exe .\dist\windows-amd64\ollama-windows-amd64.exe
+    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
 }
 function buildApp() {
@@ -109,9 +111,6 @@ function gatherDependencies() {
    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
    cp "${script:NVIDIA_DIR}\cudart64_*.dll" "${script:DEPS_DIR}\"
    cp "${script:NVIDIA_DIR}\cublas64_*.dll" "${script:DEPS_DIR}\"
    cp "${script:NVIDIA_DIR}\cublasLt64_*.dll" "${script:DEPS_DIR}\"
    cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
    if ("${env:KEY_CONTAINER}") {
@@ -123,15 +122,6 @@ function gatherDependencies() {
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
        }
    }
    if ($null -ne $env:HIP_PATH) {
        # Assumes v5.7, may need adjustments for v6
        rm -ea 0 -recurse -force -path "${script:DEPS_DIR}\rocm\"
        md "${script:DEPS_DIR}\rocm\rocblas\library\" -ea 0 > $null
        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:DEPS_DIR}\rocm\"
        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:DEPS_DIR}\rocm\"
        # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:DEPS_DIR}\rocm\rocblas\library\"
    }
 }
 function buildInstaller() {
@@ -139,16 +129,16 @@ function buildInstaller() {
    cd "${script:SRC_DIR}\app"
    $env:PKG_VERSION=$script:PKG_VERSION
    if ("${env:KEY_CONTAINER}") {
-        & "${script:INNO_SETUP_DIR}\ISCC.exe" /SMySignTool="${script:SignTool} sign /fd sha256 /t http://timestamp.digicert.com /f ${script:OLLAMA_CERT} /csp `$qGoogle Cloud KMS Provider`$q /kc ${env:KEY_CONTAINER} `$f" .\ollama.iss
+        & "${script:INNO_SETUP_DIR}\ISCC.exe" /DARCH=$script:TARGET_ARCH /SMySignTool="${script:SignTool} sign /fd sha256 /t http://timestamp.digicert.com /f ${script:OLLAMA_CERT} /csp `$qGoogle Cloud KMS Provider`$q /kc ${env:KEY_CONTAINER} `$f" .\ollama.iss
    } else {
-        & "${script:INNO_SETUP_DIR}\ISCC.exe" .\ollama.iss
+        & "${script:INNO_SETUP_DIR}\ISCC.exe" /DARCH=$script:TARGET_ARCH .\ollama.iss
    }
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
 }
 function distZip() {
-    write-host "Generating stand-alone distribution zip file ${script:SRC_DIR}\dist\ollama-windows-amd64.zip"
+    write-host "Generating stand-alone distribution zip file ${script:SRC_DIR}\dist\ollama-windows-${script:TARGET_ARCH}.zip"
-    Compress-Archive -Path "${script:SRC_DIR}\dist\windows-amd64\*" -DestinationPath "${script:SRC_DIR}\dist\ollama-windows-amd64.zip" -Force
+    Compress-Archive -Path "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}\*" -DestinationPath "${script:SRC_DIR}\dist\ollama-windows-${script:TARGET_ARCH}.zip" -Force
 }
 try {
--- a/server/images.go
+++ b/server/images.go
--- a/server/layers.go
+++ b/server/layers.go
@@ -5,39 +5,18 @@ import (
 	"fmt"
 	"io"
 	"os"
 	"strings"
 	"golang.org/x/exp/slices"
 )
 // Layers is a collection of layers that is modified through Add and Replace.
 type Layers struct {
 	// items holds the collected layers in the order they were appended
 	items []*Layer
 }
 // Add appends layer to the collection. Layers whose Size is not positive
 // are ignored.
 func (ls *Layers) Add(layer *Layer) {
 	if layer.Size <= 0 {
 		return
 	}
 	ls.items = append(ls.items, layer)
 }
 // Replace removes every layer that has the same MediaType as layer and then
 // appends layer. Layers whose Size is not positive are ignored and leave the
 // collection unchanged.
 func (ls *Layers) Replace(layer *Layer) {
 	if layer.Size <= 0 {
 		return
 	}
 	sameMediaType := func(other *Layer) bool {
 		return other.MediaType == layer.MediaType
 	}
 	ls.items = append(slices.DeleteFunc(ls.items, sameMediaType), layer)
 }
 type Layer struct {
 	MediaType string `json:"mediaType"`
 	Digest    string `json:"digest"`
 	Size      int64  `json:"size"`
 	From      string `json:"from,omitempty"`
-	tempFileName string
+	Intermediate bool   `json:"intermediate,omitempty"`
 	MergeBase    string `json:"merge_base,omitempty"`
 	message string
 }
 func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
@@ -46,14 +25,12 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
 		return nil, err
 	}
-	const delimiter = "-"
+	temp, err := os.CreateTemp(blobs, "sha256-")
 	pattern := strings.Join([]string{"sha256", "*-partial"}, delimiter)
 	temp, err := os.CreateTemp(blobs, pattern)
 	if err != nil {
 		return nil, err
 	}
 	defer temp.Close()
 	defer os.Remove(temp.Name())
 	sha256sum := sha256.New()
 	n, err := io.Copy(io.MultiWriter(temp, sha256sum), r)
@@ -61,11 +38,29 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
 		return nil, err
 	}
 	if err := temp.Close(); err != nil {
 		return nil, err
 	}
 	digest := fmt.Sprintf("sha256:%x", sha256sum.Sum(nil))
 	blob, err := GetBlobsPath(digest)
 	if err != nil {
 		return nil, err
 	}
 	status := "using existing layer"
 	if _, err := os.Stat(blob); err != nil {
 		status = "creating new layer"
 		if err := os.Rename(temp.Name(), blob); err != nil {
 			return nil, err
 		}
 	}
 	return &Layer{
-		MediaType:    mediatype,
+		MediaType: mediatype,
-		Digest:       fmt.Sprintf("sha256:%x", sha256sum.Sum(nil)),
+		Digest:    digest,
-		Size:         n,
+		Size:      n,
-		tempFileName: temp.Name(),
+		message:   fmt.Sprintf("%s %s", status, digest),
 	}, nil
 }
@@ -85,21 +80,15 @@ func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) {
 		Digest:    digest,
 		Size:      fi.Size(),
 		From:      from,
 		message:   fmt.Sprintf("using existing layer %s", digest),
 	}, nil
 }
-func (l *Layer) Commit() (bool, error) {
+func (l *Layer) Open() (*os.File, error) {
 	// always remove temp
 	defer os.Remove(l.tempFileName)
 	blob, err := GetBlobsPath(l.Digest)
 	if err != nil {
-		return false, err
+		return nil, err
 	}
-	if _, err := os.Stat(blob); err != nil {
+	return os.Open(blob)
 		return true, os.Rename(l.tempFileName, blob)
 	}
 	return false, nil
 }
--- a/server/model.go
+++ b/server/model.go
@@ -0,0 +1,259 @@
 package server
 import (
 	"archive/zip"
 	"bytes"
 	"context"
 	"errors"
 	"fmt"
 	"io"
 	"net/http"
 	"os"
 	"path/filepath"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/convert"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/types/model"
 )
 // layerWithGGML pairs a Layer with the GGML value decoded from that layer's
 // blob. Both are embedded, so their fields and methods are promoted.
 //
 // The GGML pointer may be nil: parseFromModel appends layers with a nil GGML
 // for every media type other than model, projector and adapter.
 type layerWithGGML struct {
 	*Layer
 	*llm.GGML
 }
 // parseFromModel builds the layer list for the model identified by name.
 //
 // The manifest is looked up first; if it does not exist (os.ErrNotExist) the
 // model is pulled, with progress reported through fn, and the manifest is read
 // again. Every manifest layer is re-created with NewLayerFromLayer. Layers
 // whose media type is model, projector or adapter additionally have their blob
 // decoded with llm.DecodeGGML; all other layers carry a nil GGML.
 //
 // Any error from the manifest lookup, the pull, blob resolution, opening a blob
 // or decoding it aborts the whole call and is returned unchanged.
 func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerWithGGML, err error) {
 	modelpath := ParseModelPath(name.String())
 	manifest, _, err := GetManifest(modelpath)
 	switch {
 	case errors.Is(err, os.ErrNotExist):
 		if err := PullModel(ctx, name.String(), &registryOptions{}, fn); err != nil {
 			return nil, err
 		}
 		modelpath = ParseModelPath(name.String())
 		manifest, _, err = GetManifest(modelpath)
 		if err != nil {
 			return nil, err
 		}
 	case err != nil:
 		return nil, err
 	}
 	for _, layer := range manifest.Layers {
 		layer, err := NewLayerFromLayer(layer.Digest, layer.MediaType, modelpath.GetShortTagname())
 		if err != nil {
 			return nil, err
 		}
 		switch layer.MediaType {
 		case "application/vnd.ollama.image.model",
 			"application/vnd.ollama.image.projector",
 			"application/vnd.ollama.image.adapter":
 			blobpath, err := GetBlobsPath(layer.Digest)
 			if err != nil {
 				return nil, err
 			}
 			blob, err := os.Open(blobpath)
 			if err != nil {
 				return nil, err
 			}
 			ggml, _, err := llm.DecodeGGML(blob)
 			// Close as soon as decoding is done. A defer here would only run when
 			// the function returns, keeping one descriptor open per layer for the
 			// duration of the loop. The handle is read-only, so the Close error
 			// carries no useful information.
 			blob.Close()
 			if err != nil {
 				return nil, err
 			}
 			layers = append(layers, &layerWithGGML{layer, ggml})
 		default:
 			layers = append(layers, &layerWithGGML{layer, nil})
 		}
 	}
 	return layers, nil
 }
 // parseFromZipFile converts a zipped model into a single model layer.
 //
 // The archive is unpacked into a temporary directory created next to file
 // (removed again on return), the model format, parameters and architecture are
 // resolved through the convert package, and the result is written out as GGUF
 // into a temporary file. That file is stored as a blob via NewLayer, decoded
 // with llm.DecodeGGML, and returned as the only element of layers.
 //
 // Progress is reported through fn. The context parameter is unused.
 //
 // Archive entries whose name is not a local path (absolute, or escaping the
 // extraction directory through "..") are rejected with an error instead of
 // being written outside the temporary directory.
 func parseFromZipFile(_ context.Context, file *os.File, fn func(api.ProgressResponse)) (layers []*layerWithGGML, err error) {
 	stat, err := file.Stat()
 	if err != nil {
 		return nil, err
 	}
 	r, err := zip.NewReader(file, stat.Size())
 	if err != nil {
 		return nil, err
 	}
 	tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
 	if err != nil {
 		return nil, err
 	}
 	defer os.RemoveAll(tempdir)
 	// extract copies one archive entry into tempdir and always closes both
 	// handles, reporting the first error encountered.
 	extract := func(f *zip.File) error {
 		// Entry names come from the archive and are untrusted; refuse anything
 		// that could resolve outside tempdir once joined.
 		if !filepath.IsLocal(f.Name) {
 			return fmt.Errorf("zip: invalid file path %q", f.Name)
 		}
 		outfile, err := os.Create(filepath.Join(tempdir, f.Name))
 		if err != nil {
 			return err
 		}
 		infile, err := f.Open()
 		if err != nil {
 			outfile.Close()
 			return err
 		}
 		_, err = io.Copy(outfile, infile)
 		if cerr := outfile.Close(); err == nil {
 			err = cerr
 		}
 		if cerr := infile.Close(); err == nil {
 			err = cerr
 		}
 		return err
 	}
 	fn(api.ProgressResponse{Status: "unpacking model metadata"})
 	for _, f := range r.File {
 		// TODO(mxyng): this should not write out all files to disk
 		if err := extract(f); err != nil {
 			return nil, err
 		}
 	}
 	mf, err := convert.GetModelFormat(tempdir)
 	if err != nil {
 		return nil, err
 	}
 	params, err := mf.GetParams(tempdir)
 	if err != nil {
 		return nil, err
 	}
 	mArch, err := mf.GetModelArch("", tempdir, params)
 	if err != nil {
 		return nil, err
 	}
 	fn(api.ProgressResponse{Status: "processing tensors"})
 	if err := mArch.GetTensors(); err != nil {
 		return nil, err
 	}
 	if err := mArch.LoadVocab(); err != nil {
 		return nil, err
 	}
 	fn(api.ProgressResponse{Status: "converting model"})
 	// TODO(mxyng): this should write directly into a layer
 	// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
 	temp, err := os.CreateTemp(tempdir, "fp16")
 	if err != nil {
 		return nil, err
 	}
 	defer temp.Close()
 	defer os.Remove(temp.Name())
 	if err = mArch.WriteGGUF(temp); err != nil {
 		return nil, err
 	}
 	// Rewind so NewLayer reads the GGUF data from the beginning.
 	if _, err := temp.Seek(0, io.SeekStart); err != nil {
 		return nil, err
 	}
 	layer, err := NewLayer(temp, "application/vnd.ollama.image.model")
 	if err != nil {
 		return nil, fmt.Errorf("creating model layer: %w", err)
 	}
 	blobpath, err := GetBlobsPath(layer.Digest)
 	if err != nil {
 		return nil, err
 	}
 	bin, err := os.Open(blobpath)
 	if err != nil {
 		return nil, err
 	}
 	defer bin.Close()
 	ggml, _, err := llm.DecodeGGML(bin)
 	if err != nil {
 		return nil, err
 	}
 	layer, err = NewLayerFromLayer(layer.Digest, layer.MediaType, "")
 	if err != nil {
 		return nil, err
 	}
 	layers = append(layers, &layerWithGGML{layer, ggml})
 	return layers, nil
 }
 // parseFromFile turns a local file into layers.
 //
 // The first 512 bytes are sniffed with detectContentType: zip archives are
 // handed to parseFromZipFile, "gguf" and "ggla" content is decoded in place,
 // and anything else is rejected as unsupported. For GGML content the file is
 // decoded repeatedly with llm.DecodeGGML until the recorded offset reaches the
 // file size or decoding reports io.EOF; each decoded section becomes a layer
 // whose media type is adapter for "ggla", projector for the "clip"
 // architecture, and model otherwise.
 //
 // NOTE(review): n is passed as the section length and then assigned (not
 // added) to offset. Those two uses only agree for the first section; confirm
 // against llm.DecodeGGML whether n is an absolute position or a byte count.
 func parseFromFile(ctx context.Context, file *os.File, fn func(api.ProgressResponse)) (layers []*layerWithGGML, err error) {
 	contentType, err := detectContentType(io.NewSectionReader(file, 0, 512))
 	if err != nil {
 		return nil, err
 	}
 	if contentType == "application/zip" {
 		return parseFromZipFile(ctx, file, fn)
 	}
 	if contentType != "gguf" && contentType != "ggla" {
 		return nil, fmt.Errorf("unsupported content type: %s", contentType)
 	}
 	info, err := file.Stat()
 	if err != nil {
 		return nil, err
 	}
 	size := info.Size()
 	for offset := int64(0); offset < size; {
 		ggml, n, err := llm.DecodeGGML(file)
 		if err != nil {
 			if errors.Is(err, io.EOF) {
 				break
 			}
 			return nil, err
 		}
 		var mediatype string
 		switch {
 		case ggml.Name() == "ggla":
 			mediatype = "application/vnd.ollama.image.adapter"
 		case ggml.KV().Architecture() == "clip":
 			mediatype = "application/vnd.ollama.image.projector"
 		default:
 			mediatype = "application/vnd.ollama.image.model"
 		}
 		section := io.NewSectionReader(file, offset, n)
 		layer, err := NewLayer(section, mediatype)
 		if err != nil {
 			return nil, err
 		}
 		layers = append(layers, &layerWithGGML{layer, ggml})
 		offset = n
 	}
 	return layers, nil
 }
 // detectContentType reads r to the end and classifies the bytes it yielded.
 //
 // A non-empty result from llm.DetectGGMLType wins. Otherwise the result of
 // http.DetectContentType is returned, except that its generic
 // "application/octet-stream" answer is reported as "unknown". The only error
 // returned is a failure to read from r.
 func detectContentType(r io.Reader) (string, error) {
 	var buf bytes.Buffer
 	_, err := io.Copy(&buf, r)
 	if err != nil {
 		return "", err
 	}
 	data := buf.Bytes()
 	if ggmlType := llm.DetectGGMLType(data); ggmlType != "" {
 		return ggmlType, nil
 	}
 	sniffed := http.DetectContentType(data)
 	if sniffed == "application/octet-stream" {
 		return "unknown", nil
 	}
 	return sniffed, nil
 }
--- a/server/routes.go
+++ b/server/routes.go
@@ -146,6 +146,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	select {
 	case runner = <-rCh:
 	case err = <-eCh:
 		if errors.Is(err, context.Canceled) {
 			c.JSON(499, gin.H{"error": "request canceled"})
 			return
 		}
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -389,6 +394,11 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 	select {
 	case runner = <-rCh:
 	case err = <-eCh:
 		if errors.Is(err, context.Canceled) {
 			c.JSON(499, gin.H{"error": "request canceled"})
 			return
 		}
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -570,7 +580,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
 		ctx, cancel := context.WithCancel(c.Request.Context())
 		defer cancel()
-		if err := CreateModel(ctx, model, filepath.Dir(req.Path), req.Quantization, commands, fn); err != nil {
+		if err := CreateModel(ctx, model, filepath.Dir(req.Path), strings.ToUpper(req.Quantization), commands, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -718,12 +728,12 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 		}
 	}
-	mf, err := ShowModelfile(model)
+	var sb strings.Builder
-	if err != nil {
+	fmt.Fprintln(&sb, "# Modelfile generate by \"ollama show\"")
-		return nil, err
+	fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
-	}
+	fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName)
-
+	fmt.Fprint(&sb, parser.Format(model.Commands()))
-	resp.Modelfile = mf
+	resp.Modelfile = sb.String()
 	return resp, nil
 }
@@ -800,16 +810,13 @@ func (s *Server) CopyModelHandler(c *gin.Context) {
 	src := model.ParseName(r.Source)
 	if !src.IsValid() {
-		_ = c.Error(fmt.Errorf("source %q is invalid", r.Source))
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("source %q is invalid", r.Source)})
 		return
 	}
 	dst := model.ParseName(r.Destination)
 	if !dst.IsValid() {
-		_ = c.Error(fmt.Errorf("destination %q is invalid", r.Destination))
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("destination %q is invalid", r.Source)})
 	}
 	if len(c.Errors) > 0 {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": c.Errors.Errors()})
 		return
 	}
@@ -865,11 +872,6 @@ func (s *Server) CreateBlobHandler(c *gin.Context) {
 		return
 	}
 	if _, err := layer.Commit(); err != nil {
 		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
 	c.Status(http.StatusCreated)
 }
@@ -1216,6 +1218,11 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	select {
 	case runner = <-rCh:
 	case err = <-eCh:
 		if errors.Is(err, context.Canceled) {
 			c.JSON(499, gin.H{"error": "request canceled"})
 			return
 		}
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -124,14 +124,12 @@ func Test_Routes(t *testing.T) {
 			Method: http.MethodPost,
 			Path:   "/api/create",
 			Setup: func(t *testing.T, req *http.Request) {
-				f, err := os.CreateTemp(t.TempDir(), "ollama-model")
+				fname := createTestFile(t, "ollama-model")
 				assert.Nil(t, err)
 				defer f.Close()
 				stream := false
 				createReq := api.CreateRequest{
 					Name:      "t-bone",
-					Modelfile: fmt.Sprintf("FROM %s", f.Name()),
+					Modelfile: fmt.Sprintf("FROM %s", fname),
 					Stream:    &stream,
 				}
 				jsonData, err := json.Marshal(createReq)
@@ -216,28 +214,25 @@ func Test_Routes(t *testing.T) {
 	httpSrv := httptest.NewServer(router)
 	t.Cleanup(httpSrv.Close)
-	workDir, err := os.MkdirTemp("", "ollama-test")
+	t.Setenv("OLLAMA_MODELS", t.TempDir())
 	assert.Nil(t, err)
 	defer os.RemoveAll(workDir)
 	os.Setenv("OLLAMA_MODELS", workDir)
 	for _, tc := range testCases {
-		t.Logf("Running Test: [%s]", tc.Name)
+		t.Run(tc.Name, func(t *testing.T) {
-		u := httpSrv.URL + tc.Path
+			u := httpSrv.URL + tc.Path
-		req, err := http.NewRequestWithContext(context.TODO(), tc.Method, u, nil)
+			req, err := http.NewRequestWithContext(context.TODO(), tc.Method, u, nil)
-		assert.Nil(t, err)
+			assert.Nil(t, err)
-		if tc.Setup != nil {
+			if tc.Setup != nil {
-			tc.Setup(t, req)
+				tc.Setup(t, req)
-		}
+			}
-		resp, err := httpSrv.Client().Do(req)
+			resp, err := httpSrv.Client().Do(req)
-		assert.Nil(t, err)
+			assert.Nil(t, err)
-		defer resp.Body.Close()
+			defer resp.Body.Close()
 		if tc.Expected != nil {
 			tc.Expected(t, resp)
 		}
 			if tc.Expected != nil {
 				tc.Expected(t, resp)
 			}
 		})
 	}
 }
--- a/server/sched.go
+++ b/server/sched.go
@@ -149,6 +149,14 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}
 					// If we're CPU only mode, just limit by loadedMax above
 					// TODO handle system memory exhaustion
 					if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
 						slog.Debug("cpu mode with existing models, loading")
 						s.loadFn(pending, ggml, gpus)
 						break
 					}
 					// No models loaded. Load the model but prefer the best fit.
 					if loadedCount == 0 {
 						slog.Debug("loading first model", "model", pending.model.ModelPath)
@@ -242,6 +250,7 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 						defer runner.refMu.Unlock()
 						if runner.expireTimer != nil {
 							runner.expireTimer.Stop()
 							runner.expireTimer = nil
 						}
 						s.expiredCh <- runner
 					})
@@ -288,6 +297,10 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	runner.refMu.Lock()
 	defer runner.refMu.Unlock()
 	runner.refCount++
 	if runner.expireTimer != nil {
 		runner.expireTimer.Stop()
 		runner.expireTimer = nil
 	}
 	runner.sessionDuration = pending.sessionDuration
 	pending.successCh <- runner
 	go func() {
@@ -418,6 +431,10 @@ type runnerRef struct {
 // The refMu must already be held when calling unload
 func (runner *runnerRef) unload() {
 	if runner.expireTimer != nil {
 		runner.expireTimer.Stop()
 		runner.expireTimer = nil
 	}
 	if runner.llama != nil {
 		runner.llama.Close()
 	}
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -28,19 +28,33 @@ func TestInitScheduler(t *testing.T) {
 	ctx, done := context.WithCancel(context.Background())
 	defer done()
 	initialMax := loadedMax
 	initialParallel := numParallel
 	s := InitScheduler(ctx)
 	require.Equal(t, initialMax, loadedMax)
 	s.loadedMu.Lock()
 	require.NotNil(t, s.loaded)
 	s.loadedMu.Unlock()
 	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "blue")
 	s = InitScheduler(ctx)
 	require.Equal(t, initialMax, loadedMax)
 	s.loadedMu.Lock()
 	require.NotNil(t, s.loaded)
 	s.loadedMu.Unlock()
 	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
 	s = InitScheduler(ctx)
 	require.Equal(t, 0, loadedMax)
 	s.loadedMu.Lock()
 	require.NotNil(t, s.loaded)
 	s.loadedMu.Unlock()
 	os.Setenv("OLLAMA_NUM_PARALLEL", "blue")
 	_ = InitScheduler(ctx)
 	require.Equal(t, initialParallel, numParallel)
 	os.Setenv("OLLAMA_NUM_PARALLEL", "10")
 	_ = InitScheduler(ctx)
 	require.Equal(t, 10, numParallel)
 }
 func TestLoad(t *testing.T) {
@@ -51,6 +65,7 @@ func TestLoad(t *testing.T) {
 	req := &LlmRequest{
 		ctx:             ctx,
 		model:           &Model{ModelPath: "foo"},
 		opts:            api.DefaultOptions(),
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 		sessionDuration: 2,
@@ -63,7 +78,9 @@ func TestLoad(t *testing.T) {
 	s.load(req, ggml, gpus)
 	require.Len(t, req.successCh, 0)
 	require.Len(t, req.errCh, 1)
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 0)
 	s.loadedMu.Unlock()
 	err := <-req.errCh
 	require.Contains(t, err.Error(), "this model may be incompatible")
@@ -78,7 +95,9 @@ func TestLoad(t *testing.T) {
 	case resp := <-req.successCh:
 		require.Equal(t, uint64(10), resp.estimatedVRAM)
 		require.Equal(t, uint(1), resp.refCount)
 		s.loadedMu.Lock()
 		require.Len(t, s.loaded, 1)
 		s.loadedMu.Unlock()
 	}
 	req.model.ModelPath = "dummy_model_path"
@@ -90,7 +109,9 @@ func TestLoad(t *testing.T) {
 	case resp := <-req.successCh:
 		t.Errorf("unexpected success %v", resp)
 	}
 	s.loadedMu.Lock()
 	runner := s.loaded["dummy_model_path"]
 	s.loadedMu.Unlock()
 	require.NotNil(t, runner)
 	require.Equal(t, uint(0), runner.refCount)
 	time.Sleep(1 * time.Millisecond)
@@ -143,6 +164,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 	scenario.req = &LlmRequest{
 		ctx:             scenario.ctx,
 		model:           model,
 		opts:            api.DefaultOptions(),
 		sessionDuration: 5 * time.Millisecond,
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
@@ -171,7 +193,9 @@ func TestRequests(t *testing.T) {
 	// Multiple loaded models
 	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
 	scenario3b := newScenario(t, ctx, "ollama-model-3b", 24*format.GigaByte)
-	scenario3c := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior unloaded
+	scenario3c := newScenario(t, ctx, "ollama-model-4a", 30)
 	scenario3c.req.opts.NumGPU = 0                           // CPU load, will be allowed
 	scenario3d := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior unloaded
 	s := InitScheduler(ctx)
 	s.getGpuFn = func() gpu.GpuInfoList {
@@ -240,7 +264,9 @@ func TestRequests(t *testing.T) {
 	case <-ctx.Done():
 		t.Errorf("timeout")
 	}
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 1)
 	s.loadedMu.Unlock()
 	loadedMax = 0
 	s.newServerFn = scenario3b.newServer
@@ -254,19 +280,14 @@ func TestRequests(t *testing.T) {
 	case <-ctx.Done():
 		t.Errorf("timeout")
 	}
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 2)
 	s.loadedMu.Unlock()
-	// Try to load a model that wont fit
+	// This is a CPU load with NumGPU = 0 so it should load
 	s.newServerFn = scenario3c.newServer
 	slog.Info("scenario3c")
 	require.Len(t, s.loaded, 2)
 	scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
 	time.Sleep(2 * time.Millisecond)
 	s.pendingReqCh <- scenario3c.req
 	// finish prior request, so new model can load
 	time.Sleep(6 * time.Millisecond)
 	require.Len(t, s.loaded, 1)
 	scenario3b.ctxDone()
 	select {
 	case resp := <-scenario3c.req.successCh:
 		require.Equal(t, resp.llama, scenario3c.srv)
@@ -275,7 +296,36 @@ func TestRequests(t *testing.T) {
 	case <-ctx.Done():
 		t.Errorf("timeout")
 	}
-	require.Len(t, s.loaded, 1)
+	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 3)
 	s.loadedMu.Unlock()
 	// Try to load a model that wont fit
 	s.newServerFn = scenario3d.newServer
 	slog.Info("scenario3d")
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 3)
 	s.loadedMu.Unlock()
 	scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
 	time.Sleep(2 * time.Millisecond)
 	s.pendingReqCh <- scenario3d.req
 	// finish prior request, so new model can load
 	time.Sleep(6 * time.Millisecond)
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 2)
 	s.loadedMu.Unlock()
 	scenario3b.ctxDone()
 	select {
 	case resp := <-scenario3d.req.successCh:
 		require.Equal(t, resp.llama, scenario3d.srv)
 		require.Len(t, s.pendingReqCh, 0)
 		require.Len(t, scenario3d.req.errCh, 0)
 	case <-ctx.Done():
 		t.Errorf("timeout")
 	}
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 2)
 	s.loadedMu.Unlock()
 }
 func TestGetRunner(t *testing.T) {
@@ -318,7 +368,9 @@ func TestGetRunner(t *testing.T) {
 		t.Errorf("timeout")
 	}
 	scenario1a.ctxDone()
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 1)
 	s.loadedMu.Unlock()
 	scenario1c.req.model.ModelPath = "bad path"
 	slog.Info("scenario1c")
@@ -328,7 +380,9 @@ func TestGetRunner(t *testing.T) {
 	require.Len(t, errCh1c, 0)
 	time.Sleep(5 * time.Millisecond)
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 0)
 	s.loadedMu.Unlock()
 	require.Len(t, errCh1c, 1)
 	err = <-errCh1c
 	require.Contains(t, err.Error(), "bad path")
@@ -358,7 +412,9 @@ func TestPrematureExpired(t *testing.T) {
 		require.Equal(t, resp.llama, scenario1a.srv)
 		require.Len(t, s.pendingReqCh, 0)
 		require.Len(t, errCh1a, 0)
 		s.loadedMu.Lock()
 		require.Len(t, s.loaded, 1)
 		s.loadedMu.Unlock()
 		slog.Info("sending premature expired event now")
 		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
 	case <-ctx.Done():
@@ -383,6 +439,7 @@ func TestUseLoadedRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
 	req := &LlmRequest{
 		ctx:             ctx,
 		opts:            api.DefaultOptions(),
 		successCh:       make(chan *runnerRef, 1),
 		sessionDuration: 2,
 	}
@@ -426,8 +483,10 @@ func TestUpdateFreeSpace(t *testing.T) {
 	r2 := &runnerRef{llama: llm2, gpus: gpus}
 	s := InitScheduler(ctx)
 	s.loadedMu.Lock()
 	s.loaded["a"] = r1
 	s.loaded["b"] = r2
 	s.loadedMu.Unlock()
 	s.updateFreeSpace(gpus)
 	require.Equal(t, uint64(850), gpus[0].FreeMemory)
@@ -437,13 +496,18 @@ func TestUpdateFreeSpace(t *testing.T) {
 func TestFindRunnerToUnload(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
 	defer done()
-	req := &LlmRequest{ctx: ctx}
+	req := &LlmRequest{
 		ctx:  ctx,
 		opts: api.DefaultOptions(),
 	}
 	r1 := &runnerRef{refCount: 1, sessionDuration: 1}
 	r2 := &runnerRef{sessionDuration: 2}
 	s := InitScheduler(ctx)
 	s.loadedMu.Lock()
 	s.loaded["a"] = r1
 	s.loaded["b"] = r2
 	s.loadedMu.Unlock()
 	resp := s.findRunnerToUnload(req)
 	require.Equal(t, r2, resp)
@@ -458,10 +522,11 @@ func TestNeedsReload(t *testing.T) {
 	defer done()
 	llm := &mockLlm{}
 	do := api.DefaultOptions()
 	runner := &runnerRef{
 		adapters:   []string{"adapter1"},
 		projectors: []string{"projector1"},
-		Options:    &api.Options{},
+		Options:    &do,
 		llama:      llm,
 	}
 	req := &LlmRequest{
@@ -469,7 +534,7 @@ func TestNeedsReload(t *testing.T) {
 			AdapterPaths:   []string{"adapter2"},
 			ProjectorPaths: []string{"projector2"},
 		},
-		opts: api.Options{},
+		opts: api.DefaultOptions(),
 	}
 	resp := runner.needsReload(ctx, req)
 	require.True(t, resp)
@@ -508,8 +573,10 @@ func TestUnloadAllRunners(t *testing.T) {
 	r1 := &runnerRef{llama: llm1}
 	r2 := &runnerRef{llama: llm2}
 	s.loadedMu.Lock()
 	s.loaded["a"] = r1
 	s.loaded["b"] = r2
 	s.loadedMu.Unlock()
 	s.unloadAllRunners()
 	require.True(t, llm1.closeCalled)
--- a/types/errtypes/errtypes.go
+++ b/types/errtypes/errtypes.go
@@ -0,0 +1,18 @@
 // Package errtypes contains custom error types
 package errtypes
 import (
 	"fmt"
 	"strings"
 )
 // UnknownOllamaKeyErrMsg is the fixed phrase embedded in the message of every
 // UnknownOllamaKey error.
 const UnknownOllamaKeyErrMsg = "unknown ollama key"
 // UnknownOllamaKey is an error value carrying the key it refers to.
 //
 // TODO: This should have a structured response from the API
 type UnknownOllamaKey struct {
 	Key string
 }
 // Error implements the error interface. The message has the form
 // `unauthorized: unknown ollama key "<key>"`, where the key has surrounding
 // whitespace trimmed and is rendered as a quoted string.
 func (e *UnknownOllamaKey) Error() string {
 	key := strings.TrimSpace(e.Key)
 	return fmt.Sprintf("unauthorized: %s %q", UnknownOllamaKeyErrMsg, key)
 }
--- a/types/model/digest.go
+++ b/types/model/digest.go
@@ -1,87 +0,0 @@
 package model
 import (
 	"fmt"
 	"log/slog"
 	"strings"
 	"unicode"
 )
 // Digest is the digest of a model Manifest, stored as a single string of the
 // form "<digest-type>-<digest>". It is a comparable, immutable value type.
 //
 // The zero Digest is not valid.
 type Digest struct {
 	s string
 }
 // Split cuts the stored string at the first "-" and returns the part before it
 // as typ and the part after it as digest. When there is no "-", typ is the
 // whole string and digest is empty.
 func (d Digest) Split() (typ, digest string) {
 	kind, value, _ := strings.Cut(d.s, "-")
 	return kind, value
 }
 // String returns the digest as "<digest-type>-<digest>", or the empty string
 // for the zero Digest.
 func (d Digest) String() string {
 	return d.s
 }
 // IsValid reports whether the digest is non-zero.
 //
 // A valid digest may be created only by ParseDigest, or
 // ParseName(name).Digest().
 func (d Digest) IsValid() bool {
 	return len(d.s) > 0
 }
 // LogValue implements slog.LogValuer by logging the digest's string form.
 func (d Digest) LogValue() slog.Value {
 	return slog.StringValue(d.s)
 }
 // Compile-time check that Digest satisfies slog.LogValuer.
 var _ slog.LogValuer = Digest{}
 // ParseDigest parses s into a Digest. The type and value are separated by the
 // first "-"; only when s contains no "-" at all is ":" tried as the separator.
 // The result is always stored in the "<digest-type>-<digest>" form.
 //
 // The zero Digest is returned when no separator is found, when the type fails
 // isValidDigestType, when the value fails isValidHex, or when the value is
 // shorter than two characters.
 func ParseDigest(s string) Digest {
 	typ, hex, found := strings.Cut(s, "-")
 	if !found {
 		typ, hex, found = strings.Cut(s, ":")
 	}
 	if !found || len(hex) < 2 || !isValidDigestType(typ) || !isValidHex(hex) {
 		return Digest{}
 	}
 	return Digest{s: typ + "-" + hex}
 }
 // MustParseDigest is like ParseDigest but panics with a message quoting s when
 // the parsed digest is not valid.
 func MustParseDigest(s string) Digest {
 	if d := ParseDigest(s); d.IsValid() {
 		return d
 	}
 	panic(fmt.Sprintf("invalid digest: %q", s))
 }
 // isValidDigestType reports whether s is non-empty and consists solely of
 // runes that are lowercase letters or digits, as classified by the unicode
 // package.
 func isValidDigestType(s string) bool {
 	if s == "" {
 		return false
 	}
 	for _, r := range s {
 		switch {
 		case unicode.IsLower(r), unicode.IsDigit(r):
 			continue
 		default:
 			return false
 		}
 	}
 	return true
 }
 // isValidHex reports whether s is non-empty and every byte is a lowercase
 // hexadecimal digit ('0'-'9' or 'a'-'f'). Uppercase hex digits are rejected.
 func isValidHex(s string) bool {
 	if s == "" {
 		return false
 	}
 	for i := 0; i < len(s); i++ {
 		c := s[i]
 		isDigit := '0' <= c && c <= '9'
 		isLowerHex := 'a' <= c && c <= 'f'
 		if !isDigit && !isLowerHex {
 			return false
 		}
 	}
 	return true
 }
--- a/types/model/digest_test.go
+++ b/types/model/digest_test.go
@@ -1,46 +0,0 @@
 package model
 import "testing"
 // testDigests maps an input string to the Digest that ParseDigest is expected
 // to return for it. Inputs mapped to the zero Digest are expected to be
 // rejected.
 var testDigests = map[string]Digest{
 	"":                 {},
 	"sha256-1234":      {s: "sha256-1234"},
 	"sha256-5678":      {s: "sha256-5678"},
 	"blake2-9abc":      {s: "blake2-9abc"},
 	"-1234":            {},
 	"sha256-":          {},
 	"sha256-1234-5678": {},
 	"sha256-P":         {}, // invalid hex
 	"sha256-1234P":     {},
 	"---":              {},
 }
 // TestDigestParse checks that ParseDigest returns the expected Digest for
 // every entry in testDigests, logging each parsed value along the way.
 func TestDigestParse(t *testing.T) {
 	for input, expected := range testDigests {
 		parsed := ParseDigest(input)
 		t.Logf("ParseDigest(%q) = %#v", input, parsed)
 		if parsed == expected {
 			continue
 		}
 		t.Errorf("ParseDigest(%q) = %q; want %q", input, parsed, expected)
 	}
 }
 // TestDigestString checks, for every entry in testDigests, that String on the
 // expected Digest and on the freshly parsed input both yield the input itself
 // when the Digest is valid and the empty string otherwise.
 func TestDigestString(t *testing.T) {
 	for input, d := range testDigests {
 		var want string
 		if d.IsValid() {
 			want = input
 		}
 		if got := d.String(); got != want {
 			t.Errorf("ParseDigest(%q).String() = %q; want %q", input, got, want)
 		}
 		if got := ParseDigest(input).String(); got != want {
 			t.Errorf("roundtrip ParseDigest(%q).String() = %q; want %q", input, got, want)
 		}
 	}
 }
--- a/types/model/name.go
+++ b/types/model/name.go
--- a/types/model/name_test.go
+++ b/types/model/name_test.go
--- a/types/model/testdata/fuzz/FuzzParseRef/82c2975c430ac608
+++ b/types/model/testdata/fuzz/FuzzParseRef/82c2975c430ac608
@@ -1,2 +1,2 @@
 go test fuzz v1
-string(":")
+string("00@")
--- a/types/model/testdata/fuzz/FuzzParseRef/1d43ee52085cb4aa
+++ b/types/model/testdata/fuzz/FuzzParseRef/1d43ee52085cb4aa
@@ -1,2 +0,0 @@
 go test fuzz v1
 string("/0")
--- a/types/model/testdata/fuzz/FuzzParseRef/27fd759314f0e6d6
+++ b/types/model/testdata/fuzz/FuzzParseRef/27fd759314f0e6d6
@@ -1,2 +0,0 @@
 go test fuzz v1
 string("0//0")
--- a/types/model/testdata/fuzz/FuzzParseRef/3e3b70dba384074d
+++ b/types/model/testdata/fuzz/FuzzParseRef/3e3b70dba384074d
@@ -1,2 +0,0 @@
 go test fuzz v1
 string("0 /0")
--- a/types/model/testdata/fuzz/FuzzParseRef/71f1fdff711b6dab
+++ b/types/model/testdata/fuzz/FuzzParseRef/71f1fdff711b6dab
@@ -1,2 +0,0 @@
 go test fuzz v1
 string("+0/00000")
--- a/types/model/testdata/fuzz/FuzzParseRef/b51b1c875e61a948
+++ b/types/model/testdata/fuzz/FuzzParseRef/b51b1c875e61a948
@@ -1,2 +0,0 @@
 go test fuzz v1
 string("0+.\xf2\x80\xf6\x9d00000\xe5\x99\xe6\xd900\xd90\xa60\x91\xdc0\xff\xbf\x99\xe800\xb9\xdc\xd6\xc300\x970\xfb\xfd0\xe0\x8a\xe1\xad\xd40\x9700\xa80\x980\xdd0000\xb00\x91000\xfe0\x89\x9b\x90\x93\x9f0\xe60\xf7\x84\xb0\x87\xa5\xff0\xa000\x9a\x85\xf6\x85\xfe\xa9\xf9\xe9\xde00\xf4\xe0\x8f\x81\xad\xde00\xd700\xaa\xe000000\xb1\xee0\x91")
--- a/types/structs/structs.go
+++ b/types/structs/structs.go
@@ -1,15 +0,0 @@
 // Copyright (c) Tailscale Inc & AUTHORS
 // SPDX-License-Identifier: BSD-3-Clause
 // Package structs contains the Incomparable type.
 package structs
 // Incomparable is a zero-width incomparable type. If added as the
 // first field in a struct, it marks that struct as not comparable
 // (can't do == or be a map key) and usually doesn't add any width to
 // the struct (unless the struct has only small fields).
 //
 // By making a struct incomparable, you can prevent misuse (prevent
 // people from using ==), but also you can shrink generated binaries,
 // as the compiler can omit equality funcs from the binary.
 //
 // It is declared as a zero-length array of func(): function values are
 // not comparable in Go, so neither is an array of them, and a length of
 // zero means the array holds no elements.
 type Incomparable [0]func()
Author	SHA1	Message	Date
Michael Yang	dc474f9b83	handle intermediate blobs	2024-05-02 17:05:49 -07:00
Michael Yang	41ae232e10	split model layer into metadata and data layers	2024-05-02 17:05:49 -07:00
Michael Yang	122b35c784	s/DisplayLongest/String/	2024-05-02 17:05:26 -07:00
Michael Yang	3244a25c79	only quantize language models	2024-05-02 17:05:26 -07:00
Michael Yang	b535afe35c	no iterator	2024-05-02 17:05:26 -07:00
Michael Yang	fd071eab8b	rebase	2024-05-02 17:05:26 -07:00
Michael Yang	da0bb5d772	comments	2024-05-02 17:05:26 -07:00
Michael Yang	1909e624ce	update tests	2024-05-02 17:05:26 -07:00
Michael Yang	1d8c850f38	quantize any fp16/fp32 model - FROM /path/to/{safetensors,pytorch} - FROM /path/to/fp{16,32}.bin - FROM model:fp{16,32}	2024-05-02 17:05:26 -07:00
Michael Yang	e9ae607ece	Merge pull request #3892 from ollama/mxyng/parser refactor modelfile parser	2024-05-02 17:04:47 -07:00
Michael Yang	93707fa3f2	Merge pull request #4108 from ollama/mxyng/lf fix line ending	2024-05-02 14:55:15 -07:00
Michael Yang	94c369095f	fix line ending replace CRLF with LF	2024-05-02 14:53:13 -07:00
Jeffrey Morgan	9164b0161b	Update .gitattributes	2024-05-02 14:06:31 -04:00
Bryce Reitano	bf4fc25f7b	Add a /clear command (#3947 ) * Add a /clear command * change help messages --------- Co-authored-by: Patrick Devine <patrick@infrahq.com>	2024-05-01 17:44:36 -04:00
Michael Yang	5b806d8d24	Merge pull request #4089 from ollama/mxyng/target-invalid server: destination invalid	2024-05-01 12:46:35 -07:00
Michael Yang	cb1e072643	Merge pull request #4087 from ollama/mxyng/fix-host-port types/model: fix name for hostport	2024-05-01 12:42:07 -07:00
Michael Yang	45b6a12e45	server: target invalid	2024-05-01 12:40:45 -07:00
alwqx	68755f1f5e	chore: fix typo in docs/development.md (#4073 )	2024-05-01 15:39:11 -04:00
Michael Yang	997a455039	want filepath	2024-05-01 12:33:41 -07:00
Michael Yang	88775e1ff9	strip scheme from name	2024-05-01 12:26:19 -07:00
Michael Yang	8867e744ff	types/model: fix name for hostport	2024-05-01 12:14:53 -07:00
Daniel Hiltgen	4fd064bea6	Merge pull request #4031 from MarkWard0110/fix/issue-3736 Fix/issue 3736: When runners are closing or expiring. Scheduler is getting dirty VRAM size readings.	2024-05-01 12:13:26 -07:00
Jeffrey Morgan	59fbceedcc	use lf for line endings (#4085 )	2024-05-01 15:02:45 -04:00
Mark Ward	321d57e1a0	Removing go routine calling .wait from load.	2024-05-01 18:51:10 +00:00
Mark Ward	ba26c7aa00	it will always return an error due to Kill() discarding Wait() errors	2024-05-01 18:51:10 +00:00
Mark Ward	63c763685f	log when the waiting for the process to stop to help debug when other tasks execute during this wait. expire timer clear the timer reference because it will not be reused. close will clean up expireTimer if calling code has not already done this.	2024-05-01 18:51:10 +00:00
Mark Ward	34a4a94f13	ignore debug bin files	2024-05-01 18:51:10 +00:00
Mark Ward	f4a73d57a4	fix runner expire during active use. Clearing the expire timer as it is used. Allowing the finish to assign an expire timer so that the runner will expire after no use.	2024-05-01 18:51:10 +00:00
Mark Ward	948114e3e3	fix sched to wait for the runner to terminate to ensure following vram check will be more accurate	2024-05-01 18:51:10 +00:00
Arpit Jain	a3e60d9058	README.md: fix typos (#4007 ) Co-authored-by: Blake Mizerany <blake.mizerany@gmail.com>	2024-05-01 10:39:38 -07:00
Michael Yang	5ea844964e	cmd: import regexp	2024-05-01 09:53:45 -07:00
Michael Yang	bd8eed57fc	fix parser name	2024-05-01 09:52:54 -07:00
Michael Yang	9cf0f2e973	use parser.Format instead of templating modelfile	2024-05-01 09:52:54 -07:00
Michael Yang	176ad3aa6e	parser: add commands format	2024-05-01 09:52:54 -07:00
Michael Yang	4d08363580	comments	2024-05-01 09:52:54 -07:00
Michael Yang	8907bf51d2	fix multiline	2024-05-01 09:52:54 -07:00
Michael Yang	abe614c705	tests	2024-05-01 09:52:54 -07:00
Michael Yang	238715037d	linting	2024-05-01 09:52:54 -07:00
Michael Yang	c0a00f68ae	refactor modelfile parser	2024-05-01 09:52:54 -07:00
Jeffrey Morgan	f0c454ab57	gpu: add 512MiB to darwin minimum, metal doesn't have partial offloading overhead (#4068 )	2024-05-01 11:46:03 -04:00
Blake Mizerany	b9f74ff3d6	types/model: reintroduce Digest (#4065 )	2024-04-30 16:38:03 -07:00
jmorganca	fcf4d60eee	llm: add back check for empty token cache	2024-04-30 17:38:44 -04:00
jmorganca	e33d5c2dbc	update llama.cpp commit to `952d03d`	2024-04-30 17:31:20 -04:00
Jeffrey Morgan	18d9a7e1f1	update llama.cpp submodule to `f364eb6` (#4060 )	2024-04-30 17:25:39 -04:00
Michael	8488388cbd	Update README.md	2024-04-30 15:45:56 -04:00
Blake Mizerany	588901f449	types/model: reduce Name.Filepath allocs from 5 to 2 (#4039 )	2024-04-30 11:09:19 -07:00
Bruce MacDonald	0a7fdbe533	prompt to display and add local ollama keys to account (#3717 ) - return descriptive error messages when unauthorized to create blob or push a model - display the local public key associated with the request that was denied	2024-04-30 11:02:08 -07:00
Christian Frantzen	5950c176ca	Update langchainpy.md (#4037 ) Updated the code a bit	2024-04-29 23:19:06 -04:00
Daniel Hiltgen	23d23409a0	Update llama.cpp (#4036 ) * Bump llama.cpp to b2761 * Adjust types for bump	2024-04-29 23:18:48 -04:00
Patrick Devine	9009bedf13	better checking for OLLAMA_HOST variable (#3661 )	2024-04-29 19:14:07 -04:00
Daniel Hiltgen	d4ac57e240	Merge pull request #4035 from dhiltgen/fix_relative_paths Fix relative path lookup	2024-04-29 16:08:06 -07:00
Daniel Hiltgen	7b59d1770f	Fix relative path lookup	2024-04-29 16:00:08 -07:00
Jeffrey Morgan	95ead8ffba	Restart server on failure when running Windows app (#3985 ) * app: restart server on failure * fix linter * address comments * refactor log directory creation to be where logs are written * check all log dir creation errors	2024-04-29 10:07:52 -04:00
Jeffrey Morgan	7aa08a77ca	llm: dont cap context window limit to training context window (#3988 )	2024-04-29 10:07:30 -04:00
Blake Mizerany	7e432cdfac	types/model: remove old comment (#4020 )	2024-04-28 20:52:26 -07:00
Jeffrey Morgan	586672f490	fix copying model to itself (#4019 )	2024-04-28 23:47:49 -04:00
Daniel Hiltgen	b03408de74	Merge pull request #3972 from hmartinez82/win_arm64 Add support for building on Windows ARM64	2024-04-28 14:52:58 -07:00
Daniel Hiltgen	1e6a28bf5b	Merge pull request #4009 from dhiltgen/cpu_concurrency Fix concurrency for CPU mode	2024-04-28 14:20:27 -07:00
Daniel Hiltgen	d6e3b64582	Fix concurrency for CPU mode Prior refactoring passes accidentally removed the logic to bypass VRAM checks for CPU loads. This adds that back, along with test coverage. This also fixes loaded map access in the unit test to be behind the mutex which was likely the cause of various flakes in the tests.	2024-04-28 13:42:39 -07:00
Blake Mizerany	114c932a8e	types/model: allow _ as starter character in Name parts (#3991 )	2024-04-27 21:24:52 -07:00
Jeffrey Morgan	7f7103de06	mac: update setup command to `llama3` (#3986 )	2024-04-27 22:52:10 -04:00
Blake Mizerany	c631a9c726	types/model: relax name length constraint from 2 to 1 (#3984 )	2024-04-27 17:58:41 -07:00
Blake Mizerany	8fd9e56804	types/structs: drop unused structs package (#3981 )	2024-04-27 14:06:11 -07:00
Hernan Martinez	8a65717f55	Do not build AVX runners on ARM64	2024-04-26 23:55:32 -06:00
Hernan Martinez	6d3152a98a	Use architecture specific folders in installer script	2024-04-26 23:35:16 -06:00
Hernan Martinez	b438d485f1	Use architecture specific folders in the generate script	2024-04-26 23:34:12 -06:00
Hernan Martinez	204349b17b	Use architecture specific folders in the build script	2024-04-26 23:26:03 -06:00
Hernan Martinez	86e67fc4a9	Add import declaration for windows,arm64 to llm.go	2024-04-26 23:23:53 -06:00
Blake Mizerany	2bed62926e	types/model: remove Digest (for now) (#3970 ) The Digest type needs more thought and is not necessary at the moment.	2024-04-26 21:14:28 -07:00
Jeffrey Morgan	aad8d128a0	also look at cwd as a root for windows runners (#3959 )	2024-04-26 19:14:08 -04:00
Daniel Hiltgen	ec1acbb867	Merge pull request #3968 from dhiltgen/win_generate Fine grain control over windows generate steps	2024-04-26 16:03:38 -07:00
Daniel Hiltgen	e4859c4563	Fine grain control over windows generate steps This will speed up CI which already tries to only build static for unit tests	2024-04-26 15:49:46 -07:00
Nataly Merezhuk	8e30eb26bd	Updates the setup command to use llama3. (#3962 )	2024-04-26 18:41:01 -04:00
Daniel Hiltgen	0b5c589ca2	Merge pull request #3966 from dhiltgen/bump Fix target in gen_windows.ps1	2024-04-26 15:36:53 -07:00
Michael Yang	65fadddc85	Merge pull request #3964 from ollama/mxyng/weights fix gemma, command-r layer weights	2024-04-26 15:23:33 -07:00
Daniel Hiltgen	ed5fb088c4	Fix target in gen_windows.ps1	2024-04-26 15:10:42 -07:00
Michael Yang	f81f308118	fix gemma, command-r layer weights	2024-04-26 15:00:55 -07:00
Blake Mizerany	b1390a7b37	types/model: export ParseNameBare and Merge (#3957 ) These are useful outside this package.	2024-04-26 14:58:07 -07:00
Michael Yang	11d83386a5	Merge pull request #3951 from ollama/mxyng/zip check file type before zip	2024-04-26 14:51:23 -07:00
Jeffrey Morgan	bb31def011	return code `499` when user cancels request while a model is loading (#3955 )	2024-04-26 17:38:29 -04:00
Michael Yang	41e03ede95	check file type before zip	2024-04-26 14:18:07 -07:00
Michael Yang	7fea1ecdf6	Merge pull request #3958 from ollama/mxyng/fix-workflow use merge base for diff-tree	2024-04-26 14:17:56 -07:00
Blake Mizerany	054894271d	.github/workflows/test.yaml: add in-flight cancellations on new push (#3956 ) Also, remove a superfluous 'go get'	2024-04-26 13:54:24 -07:00
Michael Yang	6fef042f0b	use merge base for diff-tree	2024-04-26 13:54:15 -07:00
Daniel Hiltgen	5c0c2d1d09	Merge pull request #3954 from dhiltgen/ci_fixes Put back non-avx CPU build for windows	2024-04-26 13:09:03 -07:00
Blake Mizerany	37f9c8ad99	types/model: overhaul Name and Digest types (#3924 )	2024-04-26 13:08:32 -07:00
Quinten van Buul	2a80f55e2a	Update windows.md (#3855 ) Fixed a typo	2024-04-26 16:04:15 -04:00
Daniel Hiltgen	421c878a2d	Put back non-avx CPU build for windows	2024-04-26 12:44:07 -07:00
Daniel Hiltgen	36666c2142	Merge pull request #3925 from dhiltgen/bump Bump llama.cpp to b2737	2024-04-26 10:09:38 -07:00
Daniel Hiltgen	85801317d1	Fix clip log import	2024-04-26 09:43:46 -07:00
Daniel Hiltgen	2ed0d65948	Bump llama.cpp to b2737	2024-04-26 09:43:28 -07:00
Daniel Hiltgen	d459dc4ad1	Merge pull request #3950 from dhiltgen/windows_packaging Fix exe name for zip packaging on windows	2024-04-26 09:27:37 -07:00
Daniel Hiltgen	40bc4622ef	Fix exe name for zip packaging on windows The zip file encodes the OS and architecture, so keep the short exe name	2024-04-26 09:18:05 -07:00
Daniel Hiltgen	c0f818a07a	Merge pull request #3948 from dhiltgen/win_generate Refactor windows generate for more modular usage	2024-04-26 09:17:20 -07:00
Daniel Hiltgen	8671fdeda6	Refactor windows generate for more modular usage	2024-04-26 08:35:50 -07:00
Daniel Hiltgen	2619850fb4	Merge pull request #3933 from dhiltgen/ci_fixes Move cuda/rocm dependency gathering into generate script	2024-04-26 07:01:24 -07:00
Daniel Hiltgen	8feb97dc0d	Move cuda/rocm dependency gathering into generate script This will make it simpler for CI to accumulate artifacts from prior steps	2024-04-25 22:38:44 -07:00
`@@ -1,2 +1,2 @@`
	`go test fuzz v1`	`go test fuzz v1`
	`string(":")`	`string("00@")`
		`@@ -1,2 +0,0 @@`
			`go test fuzz v1`
			`string("0+.\xf2\x80\xf6\x9d00000\xe5\x99\xe6\xd900\xd90\xa60\x91\xdc0\xff\xbf\x99\xe800\xb9\xdc\xd6\xc300\x970\xfb\xfd0\xe0\x8a\xe1\xad\xd40\x9700\xa80\x980\xdd0000\xb00\x91000\xfe0\x89\x9b\x90\x93\x9f0\xe60\xf7\x84\xb0\x87\xa5\xff0\xa000\x9a\x85\xf6\x85\xfe\xa9\xf9\xe9\xde00\xf4\xe0\x8f\x81\xad\xde00\xd700\xaa\xe000000\xb1\xee0\x91")`