Compare commits

...

8 Commits

Author SHA1 Message Date
jmorganca
d8b3e09fb7 llm: enable flash attention by default 2024-06-08 22:55:22 -07:00
Nischal Jain
85169e8d6f Added headless-ollama (#4612) 2024-06-08 18:51:16 -07:00
Jeffrey Morgan
34f142797a llm: always add bos token to prompt (#4941)
* fix embedding by adding fixes from llama.cpp upstream

* remove assert

---------

Co-authored-by: Jesper Ek <deadbeef84@gmail.com>
2024-06-08 18:47:10 -07:00
Erhan
46a7f1e74a Update README.md with LangChainRust (#4854) 2024-06-08 17:29:36 -07:00
Daniel Hiltgen
cddc63381c Merge pull request #4909 from dhiltgen/oneapi_disable
Add ability to skip oneapi generate
2024-06-07 14:07:15 -07:00
Michael Yang
385a32ecb5 Merge pull request #4910 from ollama/mxyng/detect-chat-template
fix create model when template detection errors
2024-06-07 11:07:39 -07:00
Michael Yang
030e765e76 fix create model when template detection errors 2024-06-07 10:51:35 -07:00
Daniel Hiltgen
ab8c929e20 Add ability to skip oneapi generate
This follows the same pattern for cuda and rocm to allow
disabling the build even when we detect the dependent libraries
2024-06-07 08:32:49 -07:00
12 changed files with 83 additions and 58 deletions

View File

@@ -326,6 +326,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
+- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
 - [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
@@ -381,6 +382,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
+- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
 
 ### Supported backends

View File

@@ -90,6 +90,7 @@ func init() {
 	NumParallel = 1
 	MaxRunners = 1
 	MaxQueuedRequests = 512
+	FlashAttention = true
 
 	LoadConfig()
 }
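Note: with the default flipped to `true`, disabling flash attention falls to the environment. A minimal sketch of how such an override could look, assuming `LoadConfig` consults the `OLLAMA_FLASH_ATTENTION` variable ollama documents for this setting (the actual parsing code is not shown in this diff):

```go
package envconfig

import (
	"os"
	"strconv"
)

// FlashAttention now defaults to true (the change above); an explicit
// OLLAMA_FLASH_ATTENTION=0 must still be able to disable it.
var FlashAttention = true

// loadFlashAttention is a hypothetical helper sketching the override;
// the repository's actual LoadConfig may differ.
func loadFlashAttention() {
	if raw := os.Getenv("OLLAMA_FLASH_ATTENTION"); raw != "" {
		if v, err := strconv.ParseBool(raw); err == nil {
			FlashAttention = v
		}
	}
}
```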

View File

@@ -835,7 +835,7 @@ struct llama_server_context
             system_tokens.clear();
 
             if (!system_prompt.empty()) {
-                system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+                system_tokens = ::llama_tokenize(ctx, system_prompt, true);
 
                 llama_batch_clear(batch);
@@ -1656,7 +1656,7 @@ struct llama_server_context
                 slot.t_start_process_prompt = ggml_time_us();
                 slot.t_start_genereration = 0;
 
-                prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
+                prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
 
                 slot.n_prompt_tokens = prompt_tokens.size();
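The two hunks enforce one rule: the system prompt, when present, is now always tokenized with BOS, and the per-request prompt adds BOS only when no system prompt exists, so the final token stream starts with exactly one BOS. A toy Go restatement of that decision (illustration only; the authoritative logic is the C++ above):

```go
package main

import "fmt"

// addBOSToPrompt restates the C++ rule: add BOS to the request prompt
// only when there is no system prompt (which otherwise carries the BOS).
func addBOSToPrompt(systemPrompt string) bool {
	return systemPrompt == ""
}

func main() {
	fmt.Println(addBOSToPrompt(""))                  // true: prompt carries the BOS
	fmt.Println(addBOSToPrompt("You are a helper.")) // false: system prompt already has it
}
```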

View File

@@ -211,7 +211,7 @@ if [ -z "${ONEAPI_ROOT}" ]; then
     ONEAPI_ROOT=/opt/intel/oneapi
 fi
 
-if [ -d "${ONEAPI_ROOT}" ]; then
+if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
     echo "OneAPI libraries detected - building dynamic OneAPI library"
     init_vars
     source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI

View File

@@ -290,7 +290,7 @@ function build_cuda() {
 }
 
 function build_oneapi() {
-  if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${env:ONEAPI_ROOT}")) {
+  if ((-not "${env:OLLAMA_SKIP_ONEAPI_GENERATE}") -and ("${env:ONEAPI_ROOT}")) {
     # Get oneAPI version
     $script:ONEAPI_VERSION = icpx --version
     $script:ONEAPI_VERSION = [regex]::Match($script:ONEAPI_VERSION, '(?<=oneAPI DPC\+\+/C\+\+ Compiler )(?<version>\d+\.\d+\.\d+)').Value
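Both scripts now follow the same guard already used for CUDA and ROCm: build a backend only when its toolkit is present and the corresponding skip variable is unset (the Windows hunk also fixes a copy-paste bug that tested `OLLAMA_SKIP_CUDA_GENERATE`). A hedged Go sketch of the combined check; the helper name and structure are illustrative, not repository code:

```go
package main

import (
	"fmt"
	"os"
)

// shouldBuildOneAPI mirrors the shell and PowerShell guards above:
// toolkit directory present, skip variable unset.
func shouldBuildOneAPI() bool {
	if os.Getenv("OLLAMA_SKIP_ONEAPI_GENERATE") != "" {
		return false // explicit opt-out wins even if the toolkit exists
	}
	root := os.Getenv("ONEAPI_ROOT")
	if root == "" {
		root = "/opt/intel/oneapi" // default location tried by gen_linux.sh
	}
	info, err := os.Stat(root)
	return err == nil && info.IsDir()
}

func main() {
	fmt.Println("build oneAPI:", shouldBuildOneAPI())
}
```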

View File

@@ -618,22 +618,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 		}
 	}
 
-	offset, err := ws.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-
 	var alignment int64 = 32
-	padding := llm.padding(offset, alignment)
-	if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil {
-		return err
-	}
 
 	for _, tensor := range tensors {
-		if _, err := tensor.WriteTo(ws); err != nil {
-			return err
-		}
-
 		offset, err := ws.Seek(0, io.SeekCurrent)
 		if err != nil {
 			return err
@@ -643,6 +629,10 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 		if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil {
 			return err
 		}
+
+		if _, err := tensor.WriteTo(ws); err != nil {
+			return err
+		}
 	}
 
 	return nil
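The fix moves the seek/pad sequence in front of each tensor write, so every tensor, not just the first, starts on a 32-byte boundary. `llm.padding` itself is not shown in this diff; the standard formula such a helper would use (an assumption) is:

```go
package main

import "fmt"

// padding returns the number of zero bytes needed to advance offset to
// the next multiple of align; 0 when already aligned. This is a sketch
// of what llm.padding likely computes, not the repository's code.
func padding(offset, align int64) int64 {
	return (align - offset%align) % align
}

func main() {
	fmt.Println(padding(100, 32)) // 28: next 32-byte boundary is 128
	fmt.Println(padding(128, 32)) // 0: already aligned
}
```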

View File

@@ -437,18 +437,17 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		config.ModelFamilies = append(config.ModelFamilies, baseLayer.GGML.KV().Architecture())
 
 		if s := baseLayer.GGML.KV().ChatTemplate(); s != "" {
-			t, err := templates.NamedTemplate(s)
-			if err != nil {
-				return err
-			}
-
-			layer, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template")
-			if err != nil {
-				return err
-			}
-
-			layer.status = fmt.Sprintf("using autodetected template %s", t.Name)
-			layers = append(layers, layer)
+			if t, err := templates.NamedTemplate(s); err != nil {
+				slog.Debug("template detection", "error", err)
+			} else {
+				layer, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template")
+				if err != nil {
+					return err
+				}
+
+				layer.status = fmt.Sprintf("using autodetected template %s", t.Name)
+				layers = append(layers, layer)
+			}
 		}
 	}
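The rewrite uses Go's if-with-initializer form, which scopes `t` and `err` to the if/else chain; a detection failure now logs at debug level and skips the optional template layer instead of failing the whole create. A minimal standalone illustration of the pattern (toy `detect` function, not repository code):

```go
package main

import (
	"errors"
	"fmt"
)

func detect(name string) (string, error) {
	if name == "" {
		return "", errors.New("no template matched")
	}
	return "TEMPLATE " + name, nil
}

func main() {
	// Failure of the optional step is logged and skipped, not fatal,
	// mirroring the CreateModel change above.
	if t, err := detect(""); err != nil {
		fmt.Println("debug: template detection:", err)
	} else {
		fmt.Println("adding layer:", t)
	}
}
```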

View File

@@ -15,11 +15,12 @@ import (
 	"github.com/gin-gonic/gin"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/llm"
 )
 
 var stream bool = false
 
-func createBinFile(t *testing.T) string {
+func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string {
 	t.Helper()
 
 	f, err := os.CreateTemp(t.TempDir(), "")
@@ -28,19 +29,7 @@
 	}
 	defer f.Close()
 
-	if err := binary.Write(f, binary.LittleEndian, []byte("GGUF")); err != nil {
-		t.Fatal(err)
-	}
-
-	if err := binary.Write(f, binary.LittleEndian, uint32(3)); err != nil {
-		t.Fatal(err)
-	}
-
-	if err := binary.Write(f, binary.LittleEndian, uint64(0)); err != nil {
-		t.Fatal(err)
-	}
-
-	if err := binary.Write(f, binary.LittleEndian, uint64(0)); err != nil {
+	if err := llm.NewGGUFV3(binary.LittleEndian).Encode(f, kv, ti); err != nil {
 		t.Fatal(err)
 	}
@@ -101,7 +90,7 @@ func TestCreateFromBin(t *testing.T) {
 	var s Server
 
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -126,7 +115,7 @@ func TestCreateFromModel(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -166,7 +155,7 @@ func TestCreateRemovesLayers(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -186,7 +175,7 @@ func TestCreateRemovesLayers(t *testing.T) {
 	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -212,7 +201,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM Say hi!", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM Say hi!", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -232,7 +221,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
 	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM \"\"", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM \"\"", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -267,7 +256,7 @@ func TestCreateMergeParameters(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nPARAMETER temperature 1\nPARAMETER top_k 10\nPARAMETER stop USER:\nPARAMETER stop ASSISTANT:", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nPARAMETER temperature 1\nPARAMETER top_k 10\nPARAMETER stop USER:\nPARAMETER stop ASSISTANT:", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -369,7 +358,7 @@ func TestCreateReplacesMessages(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nMESSAGE assistant \"What is my purpose?\"\nMESSAGE user \"You run tests.\"\nMESSAGE assistant \"Oh, my god.\"", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nMESSAGE assistant \"What is my purpose?\"\nMESSAGE user \"You run tests.\"\nMESSAGE assistant \"Oh, my god.\"", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -444,7 +433,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}\nSYSTEM Say hello!\nTEMPLATE {{ .System }} {{ .Prompt }}\nSYSTEM Say bye!", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}\nSYSTEM Say hello!\nTEMPLATE {{ .System }} {{ .Prompt }}\nSYSTEM Say bye!", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -489,7 +478,7 @@ func TestCreateLicenses(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nLICENSE MIT\nLICENSE Apache-2.0", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nLICENSE MIT\nLICENSE Apache-2.0", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -526,3 +515,46 @@ func TestCreateLicenses(t *testing.T) {
 		t.Errorf("expected Apache-2.0, actual %s", apache)
 	}
 }
+
+func TestCreateDetectTemplate(t *testing.T) {
+	p := t.TempDir()
+	t.Setenv("OLLAMA_MODELS", p)
+	var s Server
+
+	t.Run("matched", func(t *testing.T) {
+		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+			Name: "test",
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
+				"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+			}, nil)),
+			Stream: &stream,
+		})
+
+		if w.Code != http.StatusOK {
+			t.Fatalf("expected status code 200, actual %d", w.Code)
+		}
+
+		checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
+			filepath.Join(p, "blobs", "sha256-06cd2687a518d624073f125f1db1c5c727f77c75e84a138fe745186dbbbb4cd7"),
+			filepath.Join(p, "blobs", "sha256-542b217f179c7825eeb5bca3c77d2b75ed05bafbd3451d9188891a60a85337c6"),
+			filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"),
+		})
+	})
+
+	t.Run("unmatched", func(t *testing.T) {
+		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+			Name: "test",
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
+			Stream: &stream,
+		})
+
+		if w.Code != http.StatusOK {
+			t.Fatalf("expected status code 200, actual %d", w.Code)
+		}
+
+		checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
+			filepath.Join(p, "blobs", "sha256-a4e5e156ddec27e286f75328784d7106b60a4eb1d246e950a001a3f944fbda99"),
+			filepath.Join(p, "blobs", "sha256-ca239d7bd8ea90e4a5d2e6bf88f8d74a47b14336e73eb4e18bed4dd325018116"),
+		})
+	})
+}

View File

@@ -16,7 +16,7 @@ func TestDelete(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 	})
 
 	if w.Code != http.StatusOK {
@@ -25,7 +25,7 @@ func TestDelete(t *testing.T) {
 	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test2",
-		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)),
 	})
 
 	if w.Code != http.StatusOK {

View File

@@ -29,7 +29,7 @@ func TestList(t *testing.T) {
 	for _, n := range expectNames {
 		createRequest(t, s.CreateModelHandler, api.CreateRequest{
 			Name: n,
-			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t)),
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		})
 	}

View File

@@ -261,7 +261,7 @@ func TestCase(t *testing.T) {
 		t.Run(tt, func(t *testing.T) {
 			w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 				Name: tt,
-				Modelfile: fmt.Sprintf("FROM %s", createBinFile(t)),
+				Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 				Stream: &stream,
 			})
@@ -277,7 +277,7 @@ func TestCase(t *testing.T) {
 			t.Run("create", func(t *testing.T) {
 				w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 					Name: strings.ToUpper(tt),
-					Modelfile: fmt.Sprintf("FROM %s", createBinFile(t)),
+					Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 					Stream: &stream,
 				})

View File

@@ -30,7 +30,8 @@ var templatesOnce = sync.OnceValues(func() ([]*Template, error) {
 			return nil, err
 		}
 
-		t.Bytes = bts
+		// normalize line endings
+		t.Bytes = bytes.ReplaceAll(bts, []byte("\r\n"), []byte("\n"))
 	}
 
 	return templates, nil
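Why the normalization matters here: the embedded templates are compared against model metadata at the byte level, so the same template checked out with CRLF endings would never match its LF counterpart and its digest would differ. A small demonstration, assuming only that matching happens byte-for-byte:

```go
package main

import (
	"bytes"
	"crypto/sha256"
	"fmt"
)

func main() {
	lf := []byte("{{ .System }}\n{{ .Prompt }}")
	crlf := []byte("{{ .System }}\r\n{{ .Prompt }}")

	// Same template, different bytes and different digests until normalized.
	fmt.Println(bytes.Equal(lf, crlf)) // false
	hLF := sha256.Sum256(lf)
	hCRLF := sha256.Sum256(crlf)
	fmt.Printf("%x\n%x\n", hLF[:4], hCRLF[:4]) // differing prefixes

	normalized := bytes.ReplaceAll(crlf, []byte("\r\n"), []byte("\n"))
	fmt.Println(bytes.Equal(lf, normalized)) // true
}
```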