Compare commits

...

8 Commits

Author SHA1 Message Date
jmorganca
d8b3e09fb7 llm: enable flash attention by default 2024-06-08 22:55:22 -07:00
Nischal Jain
85169e8d6f Added headless-ollama (#4612) 2024-06-08 18:51:16 -07:00
Jeffrey Morgan
34f142797a llm: always add bos token to prompt (#4941)
* fix embedding by adding fixes from llama.cpp upstream

* remove assert

---------

Co-authored-by: Jesper Ek <deadbeef84@gmail.com>
2024-06-08 18:47:10 -07:00
Erhan
46a7f1e74a Update README.md with LangChainRust (#4854) 2024-06-08 17:29:36 -07:00
Daniel Hiltgen
cddc63381c Merge pull request #4909 from dhiltgen/oneapi_disable
Add ability to skip oneapi generate
2024-06-07 14:07:15 -07:00
Michael Yang
385a32ecb5 Merge pull request #4910 from ollama/mxyng/detect-chat-template
fix create model when template detection errors
2024-06-07 11:07:39 -07:00
Michael Yang
030e765e76 fix create model when template detection errors 2024-06-07 10:51:35 -07:00
Daniel Hiltgen
ab8c929e20 Add ability to skip oneapi generate
This follows the same pattern for cuda and rocm to allow
disabling the build even when we detect the dependent libraries
2024-06-07 08:32:49 -07:00
12 changed files with 83 additions and 58 deletions

View File

@@ -326,6 +326,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
+- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
 - [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
@@ -381,6 +382,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
+- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
 
 ### Supported backends

View File

@@ -90,6 +90,7 @@ func init() {
 	NumParallel = 1
 	MaxRunners = 1
 	MaxQueuedRequests = 512
+	FlashAttention = true
 
 	LoadConfig()
 }
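Note: with the default flipped to `true`, disabling flash attention falls to the environment. A minimal sketch of how such an override could look, assuming `LoadConfig` consults the `OLLAMA_FLASH_ATTENTION` variable ollama documents for this setting (the actual parsing code is not shown in this diff):

```go
package envconfig

import (
	"os"
	"strconv"
)

// FlashAttention now defaults to true (the change above); an explicit
// OLLAMA_FLASH_ATTENTION=0 must still be able to disable it.
var FlashAttention = true

// loadFlashAttention is a hypothetical helper sketching the override;
// the repository's actual LoadConfig may differ.
func loadFlashAttention() {
	if raw := os.Getenv("OLLAMA_FLASH_ATTENTION"); raw != "" {
		if v, err := strconv.ParseBool(raw); err == nil {
			FlashAttention = v
		}
	}
}
```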

View File

@@ -835,7 +835,7 @@ struct llama_server_context
             system_tokens.clear();
 
             if (!system_prompt.empty()) {
-                system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+                system_tokens = ::llama_tokenize(ctx, system_prompt, true);
 
                 llama_batch_clear(batch);
@@ -1656,7 +1656,7 @@ struct llama_server_context
                 slot.t_start_process_prompt = ggml_time_us();
                 slot.t_start_genereration = 0;
 
-                prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
+                prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
 
                 slot.n_prompt_tokens = prompt_tokens.size();
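The two hunks enforce one rule: the system prompt, when present, is now always tokenized with BOS, and the per-request prompt adds BOS only when no system prompt exists, so the final token stream starts with exactly one BOS. A toy Go restatement of that decision (illustration only; the authoritative logic is the C++ above):

```go
package main

import "fmt"

// addBOSToPrompt restates the C++ rule: add BOS to the request prompt
// only when there is no system prompt (which otherwise carries the BOS).
func addBOSToPrompt(systemPrompt string) bool {
	return systemPrompt == ""
}

func main() {
	fmt.Println(addBOSToPrompt(""))                  // true: prompt carries the BOS
	fmt.Println(addBOSToPrompt("You are a helper.")) // false: system prompt already has it
}
```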

View File

@@ -211,7 +211,7 @@ if [ -z "${ONEAPI_ROOT}" ]; then
     ONEAPI_ROOT=/opt/intel/oneapi
 fi
 
-if [ -d "${ONEAPI_ROOT}" ]; then
+if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
     echo "OneAPI libraries detected - building dynamic OneAPI library"
     init_vars
     source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI

View File

@@ -290,7 +290,7 @@ function build_cuda() {
 }
 
 function build_oneapi() {
-  if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${env:ONEAPI_ROOT}")) {
+  if ((-not "${env:OLLAMA_SKIP_ONEAPI_GENERATE}") -and ("${env:ONEAPI_ROOT}")) {
     # Get oneAPI version
     $script:ONEAPI_VERSION = icpx --version
     $script:ONEAPI_VERSION = [regex]::Match($script:ONEAPI_VERSION, '(?<=oneAPI DPC\+\+/C\+\+ Compiler )(?<version>\d+\.\d+\.\d+)').Value
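Both scripts now follow the same guard already used for CUDA and ROCm: build a backend only when its toolkit is present and the corresponding skip variable is unset (the Windows hunk also fixes a copy-paste bug that tested `OLLAMA_SKIP_CUDA_GENERATE`). A hedged Go sketch of the combined check; the helper name and structure are illustrative, not repository code:

```go
package main

import (
	"fmt"
	"os"
)

// shouldBuildOneAPI mirrors the shell and PowerShell guards above:
// toolkit directory present, skip variable unset.
func shouldBuildOneAPI() bool {
	if os.Getenv("OLLAMA_SKIP_ONEAPI_GENERATE") != "" {
		return false // explicit opt-out wins even if the toolkit exists
	}
	root := os.Getenv("ONEAPI_ROOT")
	if root == "" {
		root = "/opt/intel/oneapi" // default location tried by gen_linux.sh
	}
	info, err := os.Stat(root)
	return err == nil && info.IsDir()
}

func main() {
	fmt.Println("build oneAPI:", shouldBuildOneAPI())
}
```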

View File

@@ -618,22 +618,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 		}
 	}
 
-	offset, err := ws.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-
 	var alignment int64 = 32
-	padding := llm.padding(offset, alignment)
-	if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil {
-		return err
-	}
 
 	for _, tensor := range tensors {
-		if _, err := tensor.WriteTo(ws); err != nil {
-			return err
-		}
-
 		offset, err := ws.Seek(0, io.SeekCurrent)
 		if err != nil {
 			return err
@@ -643,6 +629,10 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 		if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil {
 			return err
 		}
+
+		if _, err := tensor.WriteTo(ws); err != nil {
+			return err
+		}
 	}
 
 	return nil
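The fix moves the seek/pad sequence in front of each tensor write, so every tensor, not just the first, starts on a 32-byte boundary. `llm.padding` itself is not shown in this diff; the standard formula such a helper would use (an assumption) is:

```go
package main

import "fmt"

// padding returns the number of zero bytes needed to advance offset to
// the next multiple of align; 0 when already aligned. This is a sketch
// of what llm.padding likely computes, not the repository's code.
func padding(offset, align int64) int64 {
	return (align - offset%align) % align
}

func main() {
	fmt.Println(padding(100, 32)) // 28: next 32-byte boundary is 128
	fmt.Println(padding(128, 32)) // 0: already aligned
}
```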

View File

@@ -437,18 +437,17 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		config.ModelFamilies = append(config.ModelFamilies, baseLayer.GGML.KV().Architecture())
 
 		if s := baseLayer.GGML.KV().ChatTemplate(); s != "" {
-			t, err := templates.NamedTemplate(s)
-			if err != nil {
-				return err
-			}
-
-			layer, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template")
-			if err != nil {
-				return err
-			}
-
-			layer.status = fmt.Sprintf("using autodetected template %s", t.Name)
-			layers = append(layers, layer)
+			if t, err := templates.NamedTemplate(s); err != nil {
+				slog.Debug("template detection", "error", err)
+			} else {
+				layer, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template")
+				if err != nil {
+					return err
+				}
+
+				layer.status = fmt.Sprintf("using autodetected template %s", t.Name)
+				layers = append(layers, layer)
+			}
 		}
 	}
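The rewrite uses Go's if-with-initializer form, which scopes `t` and `err` to the if/else chain; a detection failure now logs at debug level and skips the optional template layer instead of failing the whole create. A minimal standalone illustration of the pattern (toy `detect` function, not repository code):

```go
package main

import (
	"errors"
	"fmt"
)

func detect(name string) (string, error) {
	if name == "" {
		return "", errors.New("no template matched")
	}
	return "TEMPLATE " + name, nil
}

func main() {
	// Failure of the optional step is logged and skipped, not fatal,
	// mirroring the CreateModel change above.
	if t, err := detect(""); err != nil {
		fmt.Println("debug: template detection:", err)
	} else {
		fmt.Println("adding layer:", t)
	}
}
```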

View File

@@ -15,11 +15,12 @@ import (
 	"github.com/gin-gonic/gin"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/llm"
 )
 
 var stream bool = false
 
-func createBinFile(t *testing.T) string {
+func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string {
 	t.Helper()
 
 	f, err := os.CreateTemp(t.TempDir(), "")
@@ -28,19 +29,7 @@
 	}
 	defer f.Close()
 
-	if err := binary.Write(f, binary.LittleEndian, []byte("GGUF")); err != nil {
-		t.Fatal(err)
-	}
-
-	if err := binary.Write(f, binary.LittleEndian, uint32(3)); err != nil {
-		t.Fatal(err)
-	}
-
-	if err := binary.Write(f, binary.LittleEndian, uint64(0)); err != nil {
-		t.Fatal(err)
-	}
-
-	if err := binary.Write(f, binary.LittleEndian, uint64(0)); err != nil {
+	if err := llm.NewGGUFV3(binary.LittleEndian).Encode(f, kv, ti); err != nil {
 		t.Fatal(err)
 	}
@@ -101,7 +90,7 @@ func TestCreateFromBin(t *testing.T) {
 	var s Server
 
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -126,7 +115,7 @@ func TestCreateFromModel(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -166,7 +155,7 @@ func TestCreateRemovesLayers(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -186,7 +175,7 @@ func TestCreateRemovesLayers(t *testing.T) {
 	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -212,7 +201,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM Say hi!", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM Say hi!", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -232,7 +221,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
 	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM \"\"", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM \"\"", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -267,7 +256,7 @@ func TestCreateMergeParameters(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nPARAMETER temperature 1\nPARAMETER top_k 10\nPARAMETER stop USER:\nPARAMETER stop ASSISTANT:", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nPARAMETER temperature 1\nPARAMETER top_k 10\nPARAMETER stop USER:\nPARAMETER stop ASSISTANT:", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -369,7 +358,7 @@ func TestCreateReplacesMessages(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nMESSAGE assistant \"What is my purpose?\"\nMESSAGE user \"You run tests.\"\nMESSAGE assistant \"Oh, my god.\"", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nMESSAGE assistant \"What is my purpose?\"\nMESSAGE user \"You run tests.\"\nMESSAGE assistant \"Oh, my god.\"", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -444,7 +433,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}\nSYSTEM Say hello!\nTEMPLATE {{ .System }} {{ .Prompt }}\nSYSTEM Say bye!", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}\nSYSTEM Say hello!\nTEMPLATE {{ .System }} {{ .Prompt }}\nSYSTEM Say bye!", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -489,7 +478,7 @@ func TestCreateLicenses(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s\nLICENSE MIT\nLICENSE Apache-2.0", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nLICENSE MIT\nLICENSE Apache-2.0", createBinFile(t, nil, nil)),
 		Stream: &stream,
 	})
@@ -526,3 +515,46 @@ func TestCreateLicenses(t *testing.T) {
 		t.Errorf("expected Apache-2.0, actual %s", apache)
 	}
 }
+
+func TestCreateDetectTemplate(t *testing.T) {
+	p := t.TempDir()
+	t.Setenv("OLLAMA_MODELS", p)
+	var s Server
+
+	t.Run("matched", func(t *testing.T) {
+		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+			Name: "test",
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
+				"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+			}, nil)),
+			Stream: &stream,
+		})
+
+		if w.Code != http.StatusOK {
+			t.Fatalf("expected status code 200, actual %d", w.Code)
+		}
+
+		checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
+			filepath.Join(p, "blobs", "sha256-06cd2687a518d624073f125f1db1c5c727f77c75e84a138fe745186dbbbb4cd7"),
+			filepath.Join(p, "blobs", "sha256-542b217f179c7825eeb5bca3c77d2b75ed05bafbd3451d9188891a60a85337c6"),
+			filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"),
+		})
+	})
+
+	t.Run("unmatched", func(t *testing.T) {
+		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
+			Name: "test",
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
+			Stream: &stream,
+		})
+
+		if w.Code != http.StatusOK {
+			t.Fatalf("expected status code 200, actual %d", w.Code)
+		}
+
+		checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
+			filepath.Join(p, "blobs", "sha256-a4e5e156ddec27e286f75328784d7106b60a4eb1d246e950a001a3f944fbda99"),
+			filepath.Join(p, "blobs", "sha256-ca239d7bd8ea90e4a5d2e6bf88f8d74a47b14336e73eb4e18bed4dd325018116"),
+		})
+	})
+}

View File

@@ -16,7 +16,7 @@ func TestDelete(t *testing.T) {
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test",
-		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 	})
 
 	if w.Code != http.StatusOK {
@@ -25,7 +25,7 @@ func TestDelete(t *testing.T) {
 	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name: "test2",
-		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t)),
+		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)),
 	})
 
 	if w.Code != http.StatusOK {

View File

@@ -29,7 +29,7 @@ func TestList(t *testing.T) {
 	for _, n := range expectNames {
 		createRequest(t, s.CreateModelHandler, api.CreateRequest{
 			Name: n,
-			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t)),
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		})
 	}

View File

@@ -261,7 +261,7 @@ func TestCase(t *testing.T) {
 		t.Run(tt, func(t *testing.T) {
 			w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 				Name: tt,
-				Modelfile: fmt.Sprintf("FROM %s", createBinFile(t)),
+				Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 				Stream: &stream,
 			})
@@ -277,7 +277,7 @@ func TestCase(t *testing.T) {
 			t.Run("create", func(t *testing.T) {
 				w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 					Name: strings.ToUpper(tt),
-					Modelfile: fmt.Sprintf("FROM %s", createBinFile(t)),
+					Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 					Stream: &stream,
 				})

View File

@@ -30,7 +30,8 @@ var templatesOnce = sync.OnceValues(func() ([]*Template, error) {
 			return nil, err
 		}
 
-		t.Bytes = bts
+		// normalize line endings
+		t.Bytes = bytes.ReplaceAll(bts, []byte("\r\n"), []byte("\n"))
 	}
 
 	return templates, nil
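Why the normalization matters here: the embedded templates are compared against model metadata at the byte level, so the same template checked out with CRLF endings would never match its LF counterpart and its digest would differ. A small demonstration, assuming only that matching happens byte-for-byte:

```go
package main

import (
	"bytes"
	"crypto/sha256"
	"fmt"
)

func main() {
	lf := []byte("{{ .System }}\n{{ .Prompt }}")
	crlf := []byte("{{ .System }}\r\n{{ .Prompt }}")

	// Same template, different bytes and different digests until normalized.
	fmt.Println(bytes.Equal(lf, crlf)) // false
	hLF := sha256.Sum256(lf)
	hCRLF := sha256.Sum256(crlf)
	fmt.Printf("%x\n%x\n", hLF[:4], hCRLF[:4]) // differing prefixes

	normalized := bytes.ReplaceAll(crlf, []byte("\r\n"), []byte("\n"))
	fmt.Println(bytes.Equal(lf, normalized)) // true
}
```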