Mirror of https://github.com/ollama/ollama.git, synced 2026-04-21 00:05:40 +02:00

Compare commits: parth/add-... → v0.13.3 (6 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 709f842457 | |
| | 2dfb74410d | |
| | 1eb5e75972 | |
| | 3475d915cb | |
| | 48e78e9be1 | |
| | a838421ea3 | |
@@ -555,7 +555,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Parakeet](https://github.com/parakeet-nest/parakeet) is a GoLang library, made to simplify the development of small generative AI applications with Ollama.
 - [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples)
 - [Ollama for Swift](https://github.com/mattt/ollama-swift)
-- [Swollama for Swift]([https://github.com/marcusziade/Swollama](https://github.com/guitaripod/Swollama) with [DocC]( https://guitaripod.github.io/Swollama/documentation/swollama)
+- [Swollama for Swift](https://github.com/guitaripod/Swollama) with [DocC](https://guitaripod.github.io/Swollama/documentation/swollama)
 - [GoLamify](https://github.com/prasad89/golamify)
 - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
 - [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
@@ -182,6 +182,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &llama4Model{}
 	case "Mistral3ForConditionalGeneration":
 		conv = &mistral3Model{}
+	case "Ministral3ForCausalLM":
+		conv = &mistral3CausalModel{}
 	case "MixtralForCausalLM":
 		conv = &mixtralModel{}
 	case "GemmaForCausalLM":
@@ -33,10 +33,12 @@ type mistral3Model struct {
 			BetaFast                  float32 `json:"beta_fast"`
 			BetaSlow                  float32 `json:"beta_slow"`
 			Factor                    float32 `json:"factor"`
-			ScalingBeta               float32 `json:"llama_4_scaling_beta"`
+			Llama4ScalingBeta         *float32 `json:"llama_4_scaling_beta"`
 			OrigMaxPositionEmbeddings uint32  `json:"original_max_position_embeddings"`
 			RopeType                  string  `json:"rope_type"`
 			RopeTheta                 float32 `json:"rope_theta"`
+			Mscale                    *float32 `json:"mscale"`
+			MscaleAllDim              *float32 `json:"mscale_all_dim"`
 		} `json:"rope_parameters"`
 	} `json:"text_config"`
 	VisionModel struct {
@@ -50,6 +52,9 @@ type mistral3Model struct {
 		HeadDim   uint32  `json:"head_dim"`
 		HiddenAct string  `json:"hidden_act"`
 		RopeTheta float32 `json:"rope_theta"`
+		RopeParameters struct {
+			RopeTheta float32 `json:"rope_theta"`
+		} `json:"rope_parameters"`
 	} `json:"vision_config"`
 	MultiModalProjectorBias bool   `json:"multimodal_projector_bias"`
 	ProjectorHiddenAct      string `json:"projector_hidden_act"`
@@ -72,10 +77,22 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	kv["mistral3.attention.value_length"] = p.TextModel.HeadDim
 	kv["mistral3.rope.dimension_count"] = cmp.Or(p.TextModel.HeadDim, p.TextModel.HiddenSize/p.TextModel.NumAttentionHeads)
 	kv["mistral3.rope.freq_base"] = cmp.Or(p.TextModel.RopeTheta, p.TextModel.RopeParameters.RopeTheta)
+	kv["mistral3.rope.scaling.factor"] = p.TextModel.RopeParameters.Factor
+	kv["mistral3.rope.scaling.type"] = p.TextModel.RopeParameters.RopeType
+	kv["mistral3.rope.scaling.beta_fast"] = p.TextModel.RopeParameters.BetaFast
+	kv["mistral3.rope.scaling.beta_slow"] = p.TextModel.RopeParameters.BetaSlow
+
+	if p.TextModel.RopeParameters.Mscale != nil {
+		kv["mistral3.rope.scaling.mscale"] = *p.TextModel.RopeParameters.Mscale
+	}
+	if p.TextModel.RopeParameters.MscaleAllDim != nil {
+		kv["mistral3.rope.scaling.mscale_all_dim"] = *p.TextModel.RopeParameters.MscaleAllDim
+	}
 	if p.TextModel.RopeParameters.OrigMaxPositionEmbeddings > 0 {
 		kv["mistral3.rope.scaling.original_context_length"] = p.TextModel.RopeParameters.OrigMaxPositionEmbeddings
-		kv["mistral3.rope.scaling_beta"] = p.TextModel.RopeParameters.ScalingBeta
+	}
+	if p.TextModel.RopeParameters.Llama4ScalingBeta != nil {
+		kv["mistral3.rope.scaling_beta"] = *p.TextModel.RopeParameters.Llama4ScalingBeta
 	}

 	// Vision configuration
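The fallback pattern in the hunk above uses cmp.Or from the Go standard library (Go 1.22+), which returns the first of its arguments that is not the zero value. A minimal sketch of the head-dim fallback, with illustrative values:

package main

import (
	"cmp"
	"fmt"
)

func main() {
	// If head_dim is absent (zero), fall back to hidden_size / n_heads.
	var headDim uint32 = 0
	var hiddenSize, numHeads uint32 = 4096, 32
	fmt.Println(cmp.Or(headDim, hiddenSize/numHeads)) // 128
}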
@@ -88,7 +105,7 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	kv["mistral3.vision.patch_size"] = p.VisionModel.PatchSize
 	kv["mistral3.vision.num_channels"] = p.VisionModel.NumChannels
 	// kv["mistral3.vision.attention.layer_norm_epsilon"] = 1e-05 // Default value
-	kv["mistral3.vision.rope.freq_base"] = p.VisionModel.RopeTheta
+	kv["mistral3.vision.rope.freq_base"] = cmp.Or(p.VisionModel.RopeTheta, p.VisionModel.RopeParameters.RopeTheta)

 	// Multimodal configuration
 	kv["mistral3.image_token_index"] = p.ImageTokenIndex
convert/convert_mistral_causal.go (new file, 181 lines)
@@ -0,0 +1,181 @@
+package convert
+
+import (
+	"cmp"
+	"fmt"
+	"strings"
+
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+type mistral3CausalModel struct {
+	ModelParameters
+
+	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	HiddenSize            uint32  `json:"hidden_size"`
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	NumAttentionHeads     uint32  `json:"num_attention_heads"`
+	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
+	RopeTheta             float32 `json:"rope_theta"`
+	RMSNormEPS            float32 `json:"rms_norm_eps"`
+	HeadDim               uint32  `json:"head_dim"`
+	SlidingWindow         *uint32 `json:"sliding_window"`
+	HiddenAct             string  `json:"hidden_act"`
+	VocabSize             uint32  `json:"vocab_size"`
+	RopeParameters        struct {
+		BetaFast                  float32  `json:"beta_fast"`
+		BetaSlow                  float32  `json:"beta_slow"`
+		Factor                    float32  `json:"factor"`
+		Llama4ScalingBeta         *float32 `json:"llama_4_scaling_beta"`
+		OrigMaxPositionEmbeddings uint32   `json:"original_max_position_embeddings"`
+		RopeType                  string   `json:"rope_type"`
+		RopeTheta                 float32  `json:"rope_theta"`
+		Mscale                    *float32 `json:"mscale"`
+		MscaleAllDim              *float32 `json:"mscale_all_dim"`
+	} `json:"rope_parameters"`
+}
+
+func (p *mistral3CausalModel) KV(t *Tokenizer) ggml.KV {
+	kv := p.ModelParameters.KV(t)
+	kv["general.architecture"] = "mistral3"
+	kv["mistral3.vocab_size"] = p.VocabSize
+
+	// Text configuration
+	kv["mistral3.block_count"] = p.NumHiddenLayers
+	kv["mistral3.context_length"] = p.MaxPositionEmbeddings
+	kv["mistral3.embedding_length"] = p.HiddenSize
+	kv["mistral3.feed_forward_length"] = p.IntermediateSize
+	kv["mistral3.attention.head_count"] = p.NumAttentionHeads
+	kv["mistral3.attention.head_count_kv"] = p.NumKeyValueHeads
+	kv["mistral3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
+	kv["mistral3.attention.key_length"] = p.HeadDim
+	kv["mistral3.attention.value_length"] = p.HeadDim
+	kv["mistral3.rope.dimension_count"] = cmp.Or(p.HeadDim, p.HiddenSize/p.NumAttentionHeads)
+	kv["mistral3.rope.freq_base"] = cmp.Or(p.RopeTheta, p.RopeParameters.RopeTheta)
+	kv["mistral3.rope.scaling.factor"] = p.RopeParameters.Factor
+	kv["mistral3.rope.scaling.type"] = p.RopeParameters.RopeType
+	kv["mistral3.rope.scaling.beta_fast"] = p.RopeParameters.BetaFast
+	kv["mistral3.rope.scaling.beta_slow"] = p.RopeParameters.BetaSlow
+
+	if p.RopeParameters.Mscale != nil {
+		kv["mistral3.rope.scaling.mscale"] = *p.RopeParameters.Mscale
+	}
+
+	if p.RopeParameters.MscaleAllDim != nil {
+		kv["mistral3.rope.scaling.mscale_all_dim"] = *p.RopeParameters.MscaleAllDim
+	}
+
+	if p.RopeParameters.OrigMaxPositionEmbeddings > 0 {
+		kv["mistral3.rope.scaling.original_context_length"] = p.RopeParameters.OrigMaxPositionEmbeddings
+	}
+
+	// scaling_beta is set only when present, guarding the pointer dereference.
+	if p.RopeParameters.Llama4ScalingBeta != nil {
+		kv["mistral3.rope.scaling_beta"] = *p.RopeParameters.Llama4ScalingBeta
+	}
+
+	return kv
+}
+
+func (p *mistral3CausalModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
+
+	for _, t := range ts {
+		if !strings.HasPrefix(t.Name(), "v.") {
+			if strings.HasSuffix(t.Name(), ".attn_q.weight") ||
+				strings.HasSuffix(t.Name(), ".attn_k.weight") {
+				t.SetRepacker(p.repack)
+			}
+		}
+
+		out = append(out, &ggml.Tensor{
+			Name:     t.Name(),
+			Kind:     t.Kind(),
+			Shape:    t.Shape(),
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+func (p *mistral3CausalModel) Replacements() []string {
+	return []string{
+		"model.norm", "output_norm",
+		"model.", "",
+		"layers", "blk",
+		"transformer.layers", "blk",
+		"vision_tower", "v",
+		"ln_pre", "encoder_norm",
+		"input_layernorm", "attn_norm",
+		"post_attention_layernorm", "ffn_norm",
+		"embed_tokens", "token_embd",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.down_proj", "ffn_down",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.up_proj", "ffn_up",
+		"attention.q_proj", "attn_q",
+		"attention.k_proj", "attn_k",
+		"attention.v_proj", "attn_v",
+		"attention.o_proj", "attn_output",
+		"attention_norm", "attn_norm",
+		"feed_forward.gate_proj", "ffn_gate",
+		"feed_forward.down_proj", "ffn_down",
+		"feed_forward.up_proj", "ffn_up",
+		"multi_modal_projector", "mm",
+		"ffn_norm", "ffn_norm",
+		"lm_head", "output",
+	}
+}
+
+func (p *mistral3CausalModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	var dims []int
+	for _, dim := range shape {
+		dims = append(dims, int(dim))
+	}
+
+	var heads uint32
+	if strings.HasSuffix(name, ".attn_q.weight") {
+		heads = p.NumAttentionHeads
+	} else if strings.HasSuffix(name, ".attn_k.weight") {
+		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
+	} else {
+		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
+	}
+
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+	if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+		return nil, err
+	}
+
+	if err := n.T(0, 2, 1, 3); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
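The repack helper in the new file reshapes each head's rows into (2, head_dim/2), transposes, and flattens back, which converts between the two common rotary weight orderings (half-split vs. interleaved). On a single head this amounts to interleaving the two halves of the head dimension; a minimal sketch of just that row permutation, separate from the tensor-library mechanics above:

package main

import "fmt"

// interleaveHalves reorders rows [a0..a(n/2-1), b0..b(n/2-1)] into
// [a0, b0, a1, b1, ...], mirroring the reshape/T(0,2,1,3) step in repack.
func interleaveHalves(rows []string) []string {
	half := len(rows) / 2
	out := make([]string, 0, len(rows))
	for i := 0; i < half; i++ {
		out = append(out, rows[i], rows[half+i])
	}
	return out
}

func main() {
	fmt.Println(interleaveHalves([]string{"q0", "q1", "q2", "q3"}))
	// Output: [q0 q2 q1 q3]
}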
@@ -487,6 +487,63 @@ func TestEmbedTruncation(t *testing.T) {
 	}
 }
+
+// TestEmbedLargeInput tests that embedding models can handle large inputs that would exceed typical batch sizes.
+func TestEmbedLargeInput(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	for _, model := range libraryEmbedModels {
+		model := model
+		t.Run(model, func(t *testing.T) {
+			mctx, mcancel := context.WithTimeout(ctx, 2*time.Minute)
+			defer mcancel()
+
+			// Test with progressively larger inputs
+			testCases := []struct {
+				name       string
+				inputWords int
+			}{
+				{"medium_input_256_words", 256},
+				{"large_input_512_words", 512},
+				{"very_large_input_800_words", 800},
+			}
+
+			for _, tc := range testCases {
+				t.Run(tc.name, func(t *testing.T) {
+					words := make([]string, tc.inputWords)
+					for i := range words {
+						words[i] = "word"
+					}
+					input := strings.Join(words, " ")
+
+					req := api.EmbedRequest{
+						Model:     model,
+						Input:     input,
+						KeepAlive: &api.Duration{Duration: 30 * time.Second},
+					}
+
+					res, err := embedTestHelper(mctx, client, t, req)
+					if err != nil {
+						t.Fatalf("embedding failed for %d words: %v", tc.inputWords, err)
+					}
+
+					if len(res.Embeddings) != 1 {
+						t.Fatalf("expected 1 embedding, got %d", len(res.Embeddings))
+					}
+
+					if len(res.Embeddings[0]) == 0 {
+						t.Fatal("expected non-empty embedding")
+					}
+
+					t.Logf("Successfully embedded %d words (%d tokens)", tc.inputWords, res.PromptEvalCount)
+				})
+			}
+		})
+	}
+}
+
 // TestEmbedStatusCode tests that errors from the embedding endpoint
 // properly preserve their HTTP status codes when returned to the client.
 // This test specifically checks the error handling path in EmbedHandler
@@ -121,7 +121,8 @@ type ContextParams struct {
 func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool, kvCacheType string) ContextParams {
 	params := C.llama_context_default_params()
 	params.n_ctx = C.uint(numCtx)
-	params.n_batch = C.uint(batchSize)
+	params.n_batch = C.uint(batchSize * numSeqMax)
+	params.n_ubatch = C.uint(batchSize)
 	params.n_seq_max = C.uint(numSeqMax)
 	params.n_threads = C.int(threads)
 	params.n_threads_batch = params.n_threads
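In llama.cpp, n_batch is the logical batch (the maximum tokens that may be submitted in one llama_decode call) while n_ubatch is the physical micro-batch actually run through the compute graph. A small sketch of the sizing this change produces, with values assumed for illustration:

package main

import "fmt"

func main() {
	batchSize, numSeqMax := 512, 4
	nBatch := batchSize * numSeqMax // logical batch: tokens per decode call, shared across sequences
	nUBatch := batchSize            // physical micro-batch per graph execution
	fmt.Println(nBatch, nUBatch)    // 2048 512
}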
@@ -474,6 +474,13 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
 		s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
 	}

+	// Check if embedding model and adjust batch size accordingly
+	_, isEmbedding := s.ggml.KV()[fmt.Sprintf("%s.pooling_type", s.ggml.KV().Architecture())]
+	if isEmbedding && s.loadRequest.BatchSize < s.options.NumCtx {
+		s.loadRequest.BatchSize = s.options.NumCtx
+		slog.Info("embedding model detected, setting batch size to context length", "batch_size", s.loadRequest.BatchSize)
+	}
+
 	kv, graphPartialOffload, graphFullOffload := s.ggml.GraphSize(uint64(s.options.NumCtx), uint64(s.loadRequest.BatchSize),
 		s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention)

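Embedding models are recognized here by the presence of an "<architecture>.pooling_type" key in the GGUF metadata. A minimal sketch of that detection over a plain map, with hypothetical metadata values:

package main

import "fmt"

func main() {
	kv := map[string]any{
		"general.architecture": "bert",    // hypothetical embedding architecture
		"bert.pooling_type":    uint32(1), // present only for embedding models
	}
	arch := kv["general.architecture"].(string)
	_, isEmbedding := kv[fmt.Sprintf("%s.pooling_type", arch)]
	fmt.Println(isEmbedding) // true
}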
@@ -433,3 +433,111 @@ func ChatMiddleware() gin.HandlerFunc {
 		c.Next()
 	}
 }
+
+type ResponsesWriter struct {
+	BaseWriter
+	converter  *openai.ResponsesStreamConverter
+	model      string
+	stream     bool
+	responseID string
+	itemID     string
+}
+
+func (w *ResponsesWriter) writeEvent(eventType string, data any) error {
+	d, err := json.Marshal(data)
+	if err != nil {
+		return err
+	}
+	_, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("event: %s\ndata: %s\n\n", eventType, d)))
+	if err != nil {
+		return err
+	}
+	if f, ok := w.ResponseWriter.(http.Flusher); ok {
+		f.Flush()
+	}
+	return nil
+}
+
+func (w *ResponsesWriter) writeResponse(data []byte) (int, error) {
+	var chatResponse api.ChatResponse
+	if err := json.Unmarshal(data, &chatResponse); err != nil {
+		return 0, err
+	}
+
+	if w.stream {
+		w.ResponseWriter.Header().Set("Content-Type", "text/event-stream")
+
+		events := w.converter.Process(chatResponse)
+		for _, event := range events {
+			if err := w.writeEvent(event.Event, event.Data); err != nil {
+				return 0, err
+			}
+		}
+		return len(data), nil
+	}
+
+	// Non-streaming response
+	w.ResponseWriter.Header().Set("Content-Type", "application/json")
+	response := openai.ToResponse(w.model, w.responseID, w.itemID, chatResponse)
+	return len(data), json.NewEncoder(w.ResponseWriter).Encode(response)
+}
+
+func (w *ResponsesWriter) Write(data []byte) (int, error) {
+	code := w.ResponseWriter.Status()
+	if code != http.StatusOK {
+		return w.writeError(data)
+	}
+	return w.writeResponse(data)
+}
+
+func ResponsesMiddleware() gin.HandlerFunc {
+	return func(c *gin.Context) {
+		var req openai.ResponsesRequest
+		if err := c.ShouldBindJSON(&req); err != nil {
+			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, err.Error()))
+			return
+		}
+
+		chatReq, err := openai.FromResponsesRequest(req)
+		if err != nil {
+			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, err.Error()))
+			return
+		}
+
+		// Check if client requested streaming (defaults to false)
+		streamRequested := req.Stream != nil && *req.Stream
+
+		// Pass streaming preference to the underlying chat request
+		chatReq.Stream = &streamRequested
+
+		var b bytes.Buffer
+		if err := json.NewEncoder(&b).Encode(chatReq); err != nil {
+			c.AbortWithStatusJSON(http.StatusInternalServerError, openai.NewError(http.StatusInternalServerError, err.Error()))
+			return
+		}
+
+		c.Request.Body = io.NopCloser(&b)
+
+		responseID := fmt.Sprintf("resp_%d", rand.Intn(999999))
+		itemID := fmt.Sprintf("msg_%d", rand.Intn(999999))
+
+		w := &ResponsesWriter{
+			BaseWriter: BaseWriter{ResponseWriter: c.Writer},
+			converter:  openai.NewResponsesStreamConverter(responseID, itemID, req.Model),
+			model:      req.Model,
+			stream:     streamRequested,
+			responseID: responseID,
+			itemID:     itemID,
+		}
+
+		// Set headers based on streaming mode
+		if streamRequested {
+			c.Writer.Header().Set("Content-Type", "text/event-stream")
+			c.Writer.Header().Set("Cache-Control", "no-cache")
+			c.Writer.Header().Set("Connection", "keep-alive")
+		}
+
+		c.Writer = w
+		c.Next()
+	}
+}
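For reference, writeEvent emits standard server-sent-events frames: an "event:" line, a "data:" line carrying the JSON payload, and a blank line, flushed immediately. A hypothetical streamed delta (event names and payload shapes assumed from the OpenAI Responses API, not taken from this diff) would appear on the wire as:

event: response.output_text.delta
data: {"type":"response.output_text.delta","delta":"Hello"}

event: response.completed
data: {"type":"response.completed"}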
@@ -8,6 +8,7 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model/input"
 )

@@ -17,10 +18,30 @@ type TextOptions struct {
 	eps, ropeBase, ropeScale float32
 	ropeOrigPosEmbeddings    int
 	ropeScalingBeta          float32
+	ropeType                 string
+	ropeExtrapolation        float32
+	ropeBetaFast             float32
+	ropeBetaSlow             float32
+	ropeMscale               float32
+	ropeMscaleAllDim         float32
 }

 func (o TextOptions) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
-	return nn.RoPE(ctx, states, positions, o.ropeDim, o.ropeBase, 1./o.ropeScale)
+	var ropeOpts []func(*rope.Options)
+	if o.ropeType == "yarn" {
+		if o.ropeMscale != 0 && o.ropeMscaleAllDim != 0 {
+			ropeOpts = append(ropeOpts, rope.WithAttentionFactor(1.0/float32(0.1*math.Log(float64(o.ropeScale))+1.0)))
+		}
+
+		ropeOpts = append(ropeOpts,
+			rope.WithOriginalContextLength(o.ropeOrigPosEmbeddings),
+			rope.WithExtrapolationFactor(o.ropeExtrapolation),
+			rope.WithBetaFast(o.ropeBetaFast),
+			rope.WithBetaSlow(o.ropeBetaSlow),
+		)
+	}
+
+	return nn.RoPE(ctx, states, positions, o.ropeDim, o.ropeBase, 1./o.ropeScale, ropeOpts...)
 }

 type TextModel struct {
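The attention factor passed above is the reciprocal of YaRN's mscale heuristic, 0.1*ln(s) + 1.0, where s is the context-extension factor (ropeScale). A small worked example of the arithmetic:

package main

import (
	"fmt"
	"math"
)

func main() {
	s := 8.0                        // e.g. context extended 8x beyond the original length
	mscale := 0.1*math.Log(s) + 1.0 // YaRN attention temperature
	fmt.Printf("mscale=%.4f attention_factor=%.4f\n", mscale, 1/mscale)
	// mscale=1.2079 attention_factor=0.8279
}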
@@ -150,9 +171,15 @@ func newTextModel(c fs.Config) *TextModel {
 			ropeDim:               int(c.Uint("rope.dimension_count")),
 			eps:                   c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:              c.Float("rope.freq_base"),
-			ropeScale:             c.Float("rope.scaling.factor", 1),
+			ropeScale:             c.Float("rope.scaling.factor", 1.0),
 			ropeOrigPosEmbeddings: int(c.Uint("rope.scaling.original_context_length")),
-			ropeScalingBeta:       c.Float("rope.scaling_beta"),
+			ropeScalingBeta:       c.Float("rope.scaling_beta", 0.1),
+			ropeBetaFast:          c.Float("rope.scaling.beta_fast", 32.0),
+			ropeBetaSlow:          c.Float("rope.scaling.beta_slow", 1.0),
+			ropeType:              c.String("rope.scaling.type"),
+			ropeMscale:            c.Float("rope.scaling.mscale"),
+			ropeMscaleAllDim:      c.Float("rope.scaling.mscale_all_dim"),
+			ropeExtrapolation:     c.Float("rope.scaling.extrapolation_factor", 1),
 		},
 	}
 }
@@ -487,29 +487,9 @@ func FromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 				}
 			}

-			types := []string{"jpeg", "jpg", "png", "webp"}
-			valid := false
-			// support blank mime type to match api/chat taking just unadorned base64
-			if strings.HasPrefix(url, "data:;base64,") {
-				url = strings.TrimPrefix(url, "data:;base64,")
-				valid = true
-			}
-			for _, t := range types {
-				prefix := "data:image/" + t + ";base64,"
-				if strings.HasPrefix(url, prefix) {
-					url = strings.TrimPrefix(url, prefix)
-					valid = true
-					break
-				}
-			}
-
-			if !valid {
-				return nil, errors.New("invalid image input")
-			}
-
-			img, err := base64.StdEncoding.DecodeString(url)
+			img, err := decodeImageURL(url)
 			if err != nil {
-				return nil, errors.New("invalid message format")
+				return nil, err
 			}

 			messages = append(messages, api.Message{Role: msg.Role, Images: []api.ImageData{img}})
@@ -648,6 +628,35 @@ func nameFromToolCallID(messages []Message, toolCallID string) string {
 	return ""
 }
+
+// decodeImageURL decodes a base64 data URI into raw image bytes.
+func decodeImageURL(url string) (api.ImageData, error) {
+	types := []string{"jpeg", "jpg", "png", "webp"}
+
+	// Support blank mime type to match /api/chat's behavior of taking just unadorned base64
+	if strings.HasPrefix(url, "data:;base64,") {
+		url = strings.TrimPrefix(url, "data:;base64,")
+	} else {
+		valid := false
+		for _, t := range types {
+			prefix := "data:image/" + t + ";base64,"
+			if strings.HasPrefix(url, prefix) {
+				url = strings.TrimPrefix(url, prefix)
+				valid = true
+				break
+			}
+		}
+		if !valid {
+			return nil, errors.New("invalid image input")
+		}
+	}

+	img, err := base64.StdEncoding.DecodeString(url)
+	if err != nil {
+		return nil, errors.New("invalid image input")
+	}
+	return img, nil
+}
+
 // FromCompletionToolCall converts OpenAI ToolCall format to api.ToolCall
 func FromCompletionToolCall(toolCalls []ToolCall) ([]api.ToolCall, error) {
 	apiToolCalls := make([]api.ToolCall, len(toolCalls))
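A short usage sketch of the helper: build a data URI whose payload is standard base64 ("hello" stands in for real image bytes below); the recognized prefix is stripped before decoding, and any other scheme or mime type yields the "invalid image input" error.

package main

import (
	"encoding/base64"
	"fmt"
)

func main() {
	payload := base64.StdEncoding.EncodeToString([]byte("hello"))
	url := "data:image/png;base64," + payload
	fmt.Println(url) // data:image/png;base64,aGVsbG8=
	// decodeImageURL(url) would strip the prefix and return []byte("hello").
}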
openai/responses.go (new file, 1004 lines): file diff suppressed because it is too large
openai/responses_test.go (new file, 1543 lines): file diff suppressed because it is too large
@@ -842,7 +842,7 @@ func (s *Server) loadModel(
 		panic(err)
 	}

-	ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention, kvCacheType)
+	ctxParams := llama.NewContextParams(kvSize, s.batchSize, s.parallel, threads, flashAttention, kvCacheType)
 	s.lc, err = llama.NewContextWithModel(s.model, ctxParams)
 	if err != nil {
 		panic(err)
@@ -1203,16 +1203,22 @@ func (s *Server) allocModel(
 		return errors.New("loras are not yet implemented")
 	}

+	if s.model.Config().Cache == nil {
+		if parallel > 1 {
+			parallel = 1
+			slog.Warn("model does not support caching, disabling parallel processing")
+		}
+		if s.batchSize < kvSize {
+			s.batchSize = kvSize
+			slog.Warn("model does not support caching, setting batch size to context length", "batch_size", kvSize)
+		}
+	}
+
 	s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
 	if err != nil {
 		return err
 	}

-	if !s.cache.enabled && parallel > 1 {
-		parallel = 1
-		slog.Warn("model does not support caching, disabling parallel processing")
-	}
-
 	s.parallel = parallel
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
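The reasoning behind the second guard: a model without a KV cache cannot carry attention state between batches, so the entire context must fit into a single batch. A sketch of the sizing rule with assumed values:

package main

import "fmt"

func main() {
	kvSize, batchSize := 8192, 512
	if batchSize < kvSize {
		batchSize = kvSize // no KV cache: the whole prompt must go through in one batch
	}
	fmt.Println(batchSize) // 8192
}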
@@ -1532,6 +1532,7 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) {
 	r.POST("/v1/embeddings", middleware.EmbeddingsMiddleware(), s.EmbedHandler)
 	r.GET("/v1/models", middleware.ListMiddleware(), s.ListHandler)
 	r.GET("/v1/models/:model", middleware.RetrieveMiddleware(), s.ShowHandler)
+	r.POST("/v1/responses", middleware.ResponsesMiddleware(), s.ChatHandler)

 	if rc != nil {
 		// wrap old with new
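With the route registered, the endpoint can be exercised like any other OpenAI-compatible path. A hypothetical client call (default port, model name, and request fields assumed from the OpenAI Responses API, not taken from this diff):

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	body := []byte(`{"model":"llama3.2","input":"Say hello","stream":false}`)
	resp, err := http.Post("http://localhost:11434/v1/responses", "application/json", bytes.NewReader(body))
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status, string(out))
}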
@@ -2393,3 +2394,4 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 	}
 	return msgs
 }
+
@@ -127,6 +127,9 @@ var funcs = template.FuncMap{
 		// Default format is YYYY-MM-DD
 		return time.Now().Format("2006-01-02")
 	},
+	"yesterdayDate": func(args ...string) string {
+		return time.Now().AddDate(0, 0, -1).Format("2006-01-02")
+	},
 	"toTypeScriptType": func(v any) string {
 		if param, ok := v.(api.ToolProperty); ok {
 			return param.ToTypeScriptType()
@@ -10,6 +10,7 @@ import (
 	"slices"
 	"strings"
 	"testing"
+	"time"

 	"github.com/google/go-cmp/cmp"

@@ -451,6 +452,72 @@ func TestExecuteWithSuffix(t *testing.T) {
 	}
 }
+
+func TestDateFunctions(t *testing.T) {
+	t.Run("currentDate", func(t *testing.T) {
+		tmpl, err := Parse("{{- range .Messages }}{{ .Content }}{{ end }} Today is {{ currentDate }}")
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		var b bytes.Buffer
+		if err := tmpl.Execute(&b, Values{Messages: []api.Message{{Role: "user", Content: "Hello"}}}); err != nil {
+			t.Fatal(err)
+		}
+
+		expected := "Hello Today is " + time.Now().Format("2006-01-02")
+		if b.String() != expected {
+			t.Errorf("got %q, want %q", b.String(), expected)
+		}
+	})
+
+	t.Run("yesterdayDate", func(t *testing.T) {
+		tmpl, err := Parse("{{- range .Messages }}{{ .Content }}{{ end }} Yesterday was {{ yesterdayDate }}")
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		var b bytes.Buffer
+		if err := tmpl.Execute(&b, Values{Messages: []api.Message{{Role: "user", Content: "Hello"}}}); err != nil {
+			t.Fatal(err)
+		}
+
+		expected := "Hello Yesterday was " + time.Now().AddDate(0, 0, -1).Format("2006-01-02")
+		if b.String() != expected {
+			t.Errorf("got %q, want %q", b.String(), expected)
+		}
+	})
+
+	t.Run("yesterdayDate format", func(t *testing.T) {
+		tmpl, err := Parse("{{- range .Messages }}{{ end }}{{ yesterdayDate }}")
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		var b bytes.Buffer
+		if err := tmpl.Execute(&b, Values{Messages: []api.Message{{Role: "user", Content: "Hello"}}}); err != nil {
+			t.Fatal(err)
+		}
+
+		// Verify the format matches YYYY-MM-DD
+		result := b.String()
+		if len(result) != 10 {
+			t.Errorf("expected date length 10, got %d: %q", len(result), result)
+		}
+
+		// Parse and verify it's a valid date
+		parsed, err := time.Parse("2006-01-02", result)
+		if err != nil {
+			t.Errorf("failed to parse date %q: %v", result, err)
+		}
+
+		// Verify it's yesterday
+		yesterday := time.Now().AddDate(0, 0, -1)
+		if parsed.Year() != yesterday.Year() || parsed.Month() != yesterday.Month() || parsed.Day() != yesterday.Day() {
+			t.Errorf("expected yesterday's date, got %v", parsed)
+		}
+	})
+}
+
 func TestCollate(t *testing.T) {
 	cases := []struct {
 		name string