Mirror of https://github.com/ollama/ollama.git, synced 2026-04-21 00:05:40 +02:00

Compare commits: parth/add-... → v0.13.3 (6 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 709f842457 | |
| | 2dfb74410d | |
| | 1eb5e75972 | |
| | 3475d915cb | |
| | 48e78e9be1 | |
| | a838421ea3 | |
@@ -555,7 +555,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Parakeet](https://github.com/parakeet-nest/parakeet) is a GoLang library, made to simplify the development of small generative AI applications with Ollama.
 - [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples)
 - [Ollama for Swift](https://github.com/mattt/ollama-swift)
-- [Swollama for Swift]([https://github.com/marcusziade/Swollama](https://github.com/guitaripod/Swollama) with [DocC]( https://guitaripod.github.io/Swollama/documentation/swollama)
+- [Swollama for Swift](https://github.com/guitaripod/Swollama) with [DocC](https://guitaripod.github.io/Swollama/documentation/swollama)
 - [GoLamify](https://github.com/prasad89/golamify)
 - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
 - [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
@@ -182,6 +182,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &llama4Model{}
 	case "Mistral3ForConditionalGeneration":
 		conv = &mistral3Model{}
+	case "Ministral3ForCausalLM":
+		conv = &mistral3CausalModel{}
 	case "MixtralForCausalLM":
 		conv = &mixtralModel{}
 	case "GemmaForCausalLM":
@@ -33,10 +33,12 @@ type mistral3Model struct {
 			BetaFast                  float32 `json:"beta_fast"`
 			BetaSlow                  float32 `json:"beta_slow"`
 			Factor                    float32 `json:"factor"`
-			ScalingBeta               float32 `json:"llama_4_scaling_beta"`
+			Llama4ScalingBeta         *float32 `json:"llama_4_scaling_beta"`
 			OrigMaxPositionEmbeddings uint32  `json:"original_max_position_embeddings"`
 			RopeType                  string  `json:"rope_type"`
 			RopeTheta                 float32 `json:"rope_theta"`
+			Mscale                    *float32 `json:"mscale"`
+			MscaleAllDim              *float32 `json:"mscale_all_dim"`
 		} `json:"rope_parameters"`
 	} `json:"text_config"`
 	VisionModel struct {
@@ -50,6 +52,9 @@ type mistral3Model struct {
 		HeadDim   uint32  `json:"head_dim"`
 		HiddenAct string  `json:"hidden_act"`
 		RopeTheta float32 `json:"rope_theta"`
+		RopeParameters struct {
+			RopeTheta float32 `json:"rope_theta"`
+		} `json:"rope_parameters"`
 	} `json:"vision_config"`
 	MultiModalProjectorBias bool   `json:"multimodal_projector_bias"`
 	ProjectorHiddenAct      string `json:"projector_hidden_act"`
@@ -72,10 +77,22 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	kv["mistral3.attention.value_length"] = p.TextModel.HeadDim
 	kv["mistral3.rope.dimension_count"] = cmp.Or(p.TextModel.HeadDim, p.TextModel.HiddenSize/p.TextModel.NumAttentionHeads)
 	kv["mistral3.rope.freq_base"] = cmp.Or(p.TextModel.RopeTheta, p.TextModel.RopeParameters.RopeTheta)
+	kv["mistral3.rope.scaling.factor"] = p.TextModel.RopeParameters.Factor
+	kv["mistral3.rope.scaling.type"] = p.TextModel.RopeParameters.RopeType
+	kv["mistral3.rope.scaling.beta_fast"] = p.TextModel.RopeParameters.BetaFast
+	kv["mistral3.rope.scaling.beta_slow"] = p.TextModel.RopeParameters.BetaSlow
+
+	if p.TextModel.RopeParameters.Mscale != nil {
+		kv["mistral3.rope.scaling.mscale"] = *p.TextModel.RopeParameters.Mscale
+	}
+	if p.TextModel.RopeParameters.MscaleAllDim != nil {
+		kv["mistral3.rope.scaling.mscale_all_dim"] = *p.TextModel.RopeParameters.MscaleAllDim
+	}
 	if p.TextModel.RopeParameters.OrigMaxPositionEmbeddings > 0 {
 		kv["mistral3.rope.scaling.original_context_length"] = p.TextModel.RopeParameters.OrigMaxPositionEmbeddings
-		kv["mistral3.rope.scaling_beta"] = p.TextModel.RopeParameters.ScalingBeta
+	}
+	if p.TextModel.RopeParameters.Llama4ScalingBeta != nil {
+		kv["mistral3.rope.scaling_beta"] = *p.TextModel.RopeParameters.Llama4ScalingBeta
 	}

 	// Vision configuration
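The fallback pattern in the hunk above uses cmp.Or from the Go standard library (Go 1.22+), which returns the first of its arguments that is not the zero value. A minimal sketch of the head-dim fallback, with illustrative values:

package main

import (
	"cmp"
	"fmt"
)

func main() {
	// If head_dim is absent (zero), fall back to hidden_size / n_heads.
	var headDim uint32 = 0
	var hiddenSize, numHeads uint32 = 4096, 32
	fmt.Println(cmp.Or(headDim, hiddenSize/numHeads)) // 128
}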
@@ -88,7 +105,7 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	kv["mistral3.vision.patch_size"] = p.VisionModel.PatchSize
 	kv["mistral3.vision.num_channels"] = p.VisionModel.NumChannels
 	// kv["mistral3.vision.attention.layer_norm_epsilon"] = 1e-05 // Default value
-	kv["mistral3.vision.rope.freq_base"] = p.VisionModel.RopeTheta
+	kv["mistral3.vision.rope.freq_base"] = cmp.Or(p.VisionModel.RopeTheta, p.VisionModel.RopeParameters.RopeTheta)

 	// Multimodal configuration
 	kv["mistral3.image_token_index"] = p.ImageTokenIndex
convert/convert_mistral_causal.go (new file, 181 lines)
@@ -0,0 +1,181 @@
+package convert
+
+import (
+	"cmp"
+	"fmt"
+	"strings"
+
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+type mistral3CausalModel struct {
+	ModelParameters
+
+	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	HiddenSize            uint32  `json:"hidden_size"`
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	NumAttentionHeads     uint32  `json:"num_attention_heads"`
+	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
+	RopeTheta             float32 `json:"rope_theta"`
+	RMSNormEPS            float32 `json:"rms_norm_eps"`
+	HeadDim               uint32  `json:"head_dim"`
+	SlidingWindow         *uint32 `json:"sliding_window"`
+	HiddenAct             string  `json:"hidden_act"`
+	VocabSize             uint32  `json:"vocab_size"`
+	RopeParameters        struct {
+		BetaFast                  float32  `json:"beta_fast"`
+		BetaSlow                  float32  `json:"beta_slow"`
+		Factor                    float32  `json:"factor"`
+		Llama4ScalingBeta         *float32 `json:"llama_4_scaling_beta"`
+		OrigMaxPositionEmbeddings uint32   `json:"original_max_position_embeddings"`
+		RopeType                  string   `json:"rope_type"`
+		RopeTheta                 float32  `json:"rope_theta"`
+		Mscale                    *float32 `json:"mscale"`
+		MscaleAllDim              *float32 `json:"mscale_all_dim"`
+	} `json:"rope_parameters"`
+}
+
+func (p *mistral3CausalModel) KV(t *Tokenizer) ggml.KV {
+	kv := p.ModelParameters.KV(t)
+	kv["general.architecture"] = "mistral3"
+	kv["mistral3.vocab_size"] = p.VocabSize
+
+	// Text configuration
+	kv["mistral3.block_count"] = p.NumHiddenLayers
+	kv["mistral3.context_length"] = p.MaxPositionEmbeddings
+	kv["mistral3.embedding_length"] = p.HiddenSize
+	kv["mistral3.feed_forward_length"] = p.IntermediateSize
+	kv["mistral3.attention.head_count"] = p.NumAttentionHeads
+	kv["mistral3.attention.head_count_kv"] = p.NumKeyValueHeads
+	kv["mistral3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
+	kv["mistral3.attention.key_length"] = p.HeadDim
+	kv["mistral3.attention.value_length"] = p.HeadDim
+	kv["mistral3.rope.dimension_count"] = cmp.Or(p.HeadDim, p.HiddenSize/p.NumAttentionHeads)
+	kv["mistral3.rope.freq_base"] = cmp.Or(p.RopeTheta, p.RopeParameters.RopeTheta)
+	kv["mistral3.rope.scaling.factor"] = p.RopeParameters.Factor
+	kv["mistral3.rope.scaling.type"] = p.RopeParameters.RopeType
+	kv["mistral3.rope.scaling.beta_fast"] = p.RopeParameters.BetaFast
+	kv["mistral3.rope.scaling.beta_slow"] = p.RopeParameters.BetaSlow
+
+	if p.RopeParameters.Mscale != nil {
+		kv["mistral3.rope.scaling.mscale"] = *p.RopeParameters.Mscale
+	}
+
+	if p.RopeParameters.MscaleAllDim != nil {
+		kv["mistral3.rope.scaling.mscale_all_dim"] = *p.RopeParameters.MscaleAllDim
+	}
+
+	if p.RopeParameters.OrigMaxPositionEmbeddings > 0 {
+		kv["mistral3.rope.scaling.original_context_length"] = p.RopeParameters.OrigMaxPositionEmbeddings
+	}
+
+	// scaling_beta is set only when present, guarding the pointer dereference.
+	if p.RopeParameters.Llama4ScalingBeta != nil {
+		kv["mistral3.rope.scaling_beta"] = *p.RopeParameters.Llama4ScalingBeta
+	}
+
+	return kv
+}
+
+func (p *mistral3CausalModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
+
+	for _, t := range ts {
+		if !strings.HasPrefix(t.Name(), "v.") {
+			if strings.HasSuffix(t.Name(), ".attn_q.weight") ||
+				strings.HasSuffix(t.Name(), ".attn_k.weight") {
+				t.SetRepacker(p.repack)
+			}
+		}
+
+		out = append(out, &ggml.Tensor{
+			Name:     t.Name(),
+			Kind:     t.Kind(),
+			Shape:    t.Shape(),
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+func (p *mistral3CausalModel) Replacements() []string {
+	return []string{
+		"model.norm", "output_norm",
+		"model.", "",
+		"layers", "blk",
+		"transformer.layers", "blk",
+		"vision_tower", "v",
+		"ln_pre", "encoder_norm",
+		"input_layernorm", "attn_norm",
+		"post_attention_layernorm", "ffn_norm",
+		"embed_tokens", "token_embd",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.down_proj", "ffn_down",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.up_proj", "ffn_up",
+		"attention.q_proj", "attn_q",
+		"attention.k_proj", "attn_k",
+		"attention.v_proj", "attn_v",
+		"attention.o_proj", "attn_output",
+		"attention_norm", "attn_norm",
+		"feed_forward.gate_proj", "ffn_gate",
+		"feed_forward.down_proj", "ffn_down",
+		"feed_forward.up_proj", "ffn_up",
+		"multi_modal_projector", "mm",
+		"ffn_norm", "ffn_norm",
+		"lm_head", "output",
+	}
+}
+
+func (p *mistral3CausalModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	var dims []int
+	for _, dim := range shape {
+		dims = append(dims, int(dim))
+	}
+
+	var heads uint32
+	if strings.HasSuffix(name, ".attn_q.weight") {
+		heads = p.NumAttentionHeads
+	} else if strings.HasSuffix(name, ".attn_k.weight") {
+		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
+	} else {
+		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
+	}
+
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+	if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+		return nil, err
+	}
+
+	if err := n.T(0, 2, 1, 3); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
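The repack helper in the new file reshapes each head's rows into (2, head_dim/2), transposes, and flattens back, which converts between the two common rotary weight orderings (half-split vs. interleaved). On a single head this amounts to interleaving the two halves of the head dimension; a minimal sketch of just that row permutation, separate from the tensor-library mechanics above:

package main

import "fmt"

// interleaveHalves reorders rows [a0..a(n/2-1), b0..b(n/2-1)] into
// [a0, b0, a1, b1, ...], mirroring the reshape/T(0,2,1,3) step in repack.
func interleaveHalves(rows []string) []string {
	half := len(rows) / 2
	out := make([]string, 0, len(rows))
	for i := 0; i < half; i++ {
		out = append(out, rows[i], rows[half+i])
	}
	return out
}

func main() {
	fmt.Println(interleaveHalves([]string{"q0", "q1", "q2", "q3"}))
	// Output: [q0 q2 q1 q3]
}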
@@ -487,6 +487,63 @@ func TestEmbedTruncation(t *testing.T) {
 	}
 }
+
+// TestEmbedLargeInput tests that embedding models can handle large inputs that would exceed typical batch sizes.
+func TestEmbedLargeInput(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	for _, model := range libraryEmbedModels {
+		model := model
+		t.Run(model, func(t *testing.T) {
+			mctx, mcancel := context.WithTimeout(ctx, 2*time.Minute)
+			defer mcancel()
+
+			// Test with progressively larger inputs
+			testCases := []struct {
+				name       string
+				inputWords int
+			}{
+				{"medium_input_256_words", 256},
+				{"large_input_512_words", 512},
+				{"very_large_input_800_words", 800},
+			}
+
+			for _, tc := range testCases {
+				t.Run(tc.name, func(t *testing.T) {
+					words := make([]string, tc.inputWords)
+					for i := range words {
+						words[i] = "word"
+					}
+					input := strings.Join(words, " ")
+
+					req := api.EmbedRequest{
+						Model:     model,
+						Input:     input,
+						KeepAlive: &api.Duration{Duration: 30 * time.Second},
+					}
+
+					res, err := embedTestHelper(mctx, client, t, req)
+					if err != nil {
+						t.Fatalf("embedding failed for %d words: %v", tc.inputWords, err)
+					}
+
+					if len(res.Embeddings) != 1 {
+						t.Fatalf("expected 1 embedding, got %d", len(res.Embeddings))
+					}
+
+					if len(res.Embeddings[0]) == 0 {
+						t.Fatal("expected non-empty embedding")
+					}
+
+					t.Logf("Successfully embedded %d words (%d tokens)", tc.inputWords, res.PromptEvalCount)
+				})
+			}
+		})
+	}
+}
+
 // TestEmbedStatusCode tests that errors from the embedding endpoint
 // properly preserve their HTTP status codes when returned to the client.
 // This test specifically checks the error handling path in EmbedHandler
@@ -121,7 +121,8 @@ type ContextParams struct {
 func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool, kvCacheType string) ContextParams {
 	params := C.llama_context_default_params()
 	params.n_ctx = C.uint(numCtx)
-	params.n_batch = C.uint(batchSize)
+	params.n_batch = C.uint(batchSize * numSeqMax)
+	params.n_ubatch = C.uint(batchSize)
 	params.n_seq_max = C.uint(numSeqMax)
 	params.n_threads = C.int(threads)
 	params.n_threads_batch = params.n_threads
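In llama.cpp, n_batch is the logical batch (the maximum tokens that may be submitted in one llama_decode call) while n_ubatch is the physical micro-batch actually run through the compute graph. A small sketch of the sizing this change produces, with values assumed for illustration:

package main

import "fmt"

func main() {
	batchSize, numSeqMax := 512, 4
	nBatch := batchSize * numSeqMax // logical batch: tokens per decode call, shared across sequences
	nUBatch := batchSize            // physical micro-batch per graph execution
	fmt.Println(nBatch, nUBatch)    // 2048 512
}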
@@ -474,6 +474,13 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
 		s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
 	}

+	// Check if embedding model and adjust batch size accordingly
+	_, isEmbedding := s.ggml.KV()[fmt.Sprintf("%s.pooling_type", s.ggml.KV().Architecture())]
+	if isEmbedding && s.loadRequest.BatchSize < s.options.NumCtx {
+		s.loadRequest.BatchSize = s.options.NumCtx
+		slog.Info("embedding model detected, setting batch size to context length", "batch_size", s.loadRequest.BatchSize)
+	}
+
 	kv, graphPartialOffload, graphFullOffload := s.ggml.GraphSize(uint64(s.options.NumCtx), uint64(s.loadRequest.BatchSize),
 		s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention)

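Embedding models are recognized here by the presence of an "<architecture>.pooling_type" key in the GGUF metadata. A minimal sketch of that detection over a plain map, with hypothetical metadata values:

package main

import "fmt"

func main() {
	kv := map[string]any{
		"general.architecture": "bert",    // hypothetical embedding architecture
		"bert.pooling_type":    uint32(1), // present only for embedding models
	}
	arch := kv["general.architecture"].(string)
	_, isEmbedding := kv[fmt.Sprintf("%s.pooling_type", arch)]
	fmt.Println(isEmbedding) // true
}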
@@ -433,3 +433,111 @@ func ChatMiddleware() gin.HandlerFunc {
 		c.Next()
 	}
 }
+
+type ResponsesWriter struct {
+	BaseWriter
+	converter  *openai.ResponsesStreamConverter
+	model      string
+	stream     bool
+	responseID string
+	itemID     string
+}
+
+func (w *ResponsesWriter) writeEvent(eventType string, data any) error {
+	d, err := json.Marshal(data)
+	if err != nil {
+		return err
+	}
+	_, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("event: %s\ndata: %s\n\n", eventType, d)))
+	if err != nil {
+		return err
+	}
+	if f, ok := w.ResponseWriter.(http.Flusher); ok {
+		f.Flush()
+	}
+	return nil
+}
+
+func (w *ResponsesWriter) writeResponse(data []byte) (int, error) {
+	var chatResponse api.ChatResponse
+	if err := json.Unmarshal(data, &chatResponse); err != nil {
+		return 0, err
+	}
+
+	if w.stream {
+		w.ResponseWriter.Header().Set("Content-Type", "text/event-stream")
+
+		events := w.converter.Process(chatResponse)
+		for _, event := range events {
+			if err := w.writeEvent(event.Event, event.Data); err != nil {
+				return 0, err
+			}
+		}
+		return len(data), nil
+	}
+
+	// Non-streaming response
+	w.ResponseWriter.Header().Set("Content-Type", "application/json")
+	response := openai.ToResponse(w.model, w.responseID, w.itemID, chatResponse)
+	return len(data), json.NewEncoder(w.ResponseWriter).Encode(response)
+}
+
+func (w *ResponsesWriter) Write(data []byte) (int, error) {
+	code := w.ResponseWriter.Status()
+	if code != http.StatusOK {
+		return w.writeError(data)
+	}
+	return w.writeResponse(data)
+}
+
+func ResponsesMiddleware() gin.HandlerFunc {
+	return func(c *gin.Context) {
+		var req openai.ResponsesRequest
+		if err := c.ShouldBindJSON(&req); err != nil {
+			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, err.Error()))
+			return
+		}
+
+		chatReq, err := openai.FromResponsesRequest(req)
+		if err != nil {
+			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, err.Error()))
+			return
+		}
+
+		// Check if client requested streaming (defaults to false)
+		streamRequested := req.Stream != nil && *req.Stream
+
+		// Pass streaming preference to the underlying chat request
+		chatReq.Stream = &streamRequested
+
+		var b bytes.Buffer
+		if err := json.NewEncoder(&b).Encode(chatReq); err != nil {
+			c.AbortWithStatusJSON(http.StatusInternalServerError, openai.NewError(http.StatusInternalServerError, err.Error()))
+			return
+		}
+
+		c.Request.Body = io.NopCloser(&b)
+
+		responseID := fmt.Sprintf("resp_%d", rand.Intn(999999))
+		itemID := fmt.Sprintf("msg_%d", rand.Intn(999999))
+
+		w := &ResponsesWriter{
+			BaseWriter: BaseWriter{ResponseWriter: c.Writer},
+			converter:  openai.NewResponsesStreamConverter(responseID, itemID, req.Model),
+			model:      req.Model,
+			stream:     streamRequested,
+			responseID: responseID,
+			itemID:     itemID,
+		}
+
+		// Set headers based on streaming mode
+		if streamRequested {
+			c.Writer.Header().Set("Content-Type", "text/event-stream")
+			c.Writer.Header().Set("Cache-Control", "no-cache")
+			c.Writer.Header().Set("Connection", "keep-alive")
+		}
+
+		c.Writer = w
+		c.Next()
+	}
+}
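For reference, writeEvent emits standard server-sent-events frames: an "event:" line, a "data:" line carrying the JSON payload, and a blank line, flushed immediately. A hypothetical streamed delta (event names and payload shapes assumed from the OpenAI Responses API, not taken from this diff) would appear on the wire as:

event: response.output_text.delta
data: {"type":"response.output_text.delta","delta":"Hello"}

event: response.completed
data: {"type":"response.completed"}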
@@ -8,6 +8,7 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model/input"
 )

@@ -17,10 +18,30 @@ type TextOptions struct {
 	eps, ropeBase, ropeScale float32
 	ropeOrigPosEmbeddings    int
 	ropeScalingBeta          float32
+	ropeType                 string
+	ropeExtrapolation        float32
+	ropeBetaFast             float32
+	ropeBetaSlow             float32
+	ropeMscale               float32
+	ropeMscaleAllDim         float32
 }

 func (o TextOptions) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
-	return nn.RoPE(ctx, states, positions, o.ropeDim, o.ropeBase, 1./o.ropeScale)
+	var ropeOpts []func(*rope.Options)
+	if o.ropeType == "yarn" {
+		if o.ropeMscale != 0 && o.ropeMscaleAllDim != 0 {
+			ropeOpts = append(ropeOpts, rope.WithAttentionFactor(1.0/float32(0.1*math.Log(float64(o.ropeScale))+1.0)))
+		}
+
+		ropeOpts = append(ropeOpts,
+			rope.WithOriginalContextLength(o.ropeOrigPosEmbeddings),
+			rope.WithExtrapolationFactor(o.ropeExtrapolation),
+			rope.WithBetaFast(o.ropeBetaFast),
+			rope.WithBetaSlow(o.ropeBetaSlow),
+		)
+	}
+
+	return nn.RoPE(ctx, states, positions, o.ropeDim, o.ropeBase, 1./o.ropeScale, ropeOpts...)
 }

 type TextModel struct {
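The attention factor passed above is the reciprocal of YaRN's mscale heuristic, 0.1*ln(s) + 1.0, where s is the context-extension factor (ropeScale). A small worked example of the arithmetic:

package main

import (
	"fmt"
	"math"
)

func main() {
	s := 8.0                        // e.g. context extended 8x beyond the original length
	mscale := 0.1*math.Log(s) + 1.0 // YaRN attention temperature
	fmt.Printf("mscale=%.4f attention_factor=%.4f\n", mscale, 1/mscale)
	// mscale=1.2079 attention_factor=0.8279
}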
@@ -150,9 +171,15 @@ func newTextModel(c fs.Config) *TextModel {
 			ropeDim:               int(c.Uint("rope.dimension_count")),
 			eps:                   c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:              c.Float("rope.freq_base"),
-			ropeScale:             c.Float("rope.scaling.factor", 1),
+			ropeScale:             c.Float("rope.scaling.factor", 1.0),
 			ropeOrigPosEmbeddings: int(c.Uint("rope.scaling.original_context_length")),
-			ropeScalingBeta:       c.Float("rope.scaling_beta"),
+			ropeScalingBeta:       c.Float("rope.scaling_beta", 0.1),
+			ropeBetaFast:          c.Float("rope.scaling.beta_fast", 32.0),
+			ropeBetaSlow:          c.Float("rope.scaling.beta_slow", 1.0),
+			ropeType:              c.String("rope.scaling.type"),
+			ropeMscale:            c.Float("rope.scaling.mscale"),
+			ropeMscaleAllDim:      c.Float("rope.scaling.mscale_all_dim"),
+			ropeExtrapolation:     c.Float("rope.scaling.extrapolation_factor", 1),
 		},
 	}
 }
@@ -487,29 +487,9 @@ func FromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 				}
 			}

-			types := []string{"jpeg", "jpg", "png", "webp"}
-			valid := false
-			// support blank mime type to match api/chat taking just unadorned base64
-			if strings.HasPrefix(url, "data:;base64,") {
-				url = strings.TrimPrefix(url, "data:;base64,")
-				valid = true
-			}
-			for _, t := range types {
-				prefix := "data:image/" + t + ";base64,"
-				if strings.HasPrefix(url, prefix) {
-					url = strings.TrimPrefix(url, prefix)
-					valid = true
-					break
-				}
-			}
-
-			if !valid {
-				return nil, errors.New("invalid image input")
-			}
-
-			img, err := base64.StdEncoding.DecodeString(url)
+			img, err := decodeImageURL(url)
 			if err != nil {
-				return nil, errors.New("invalid message format")
+				return nil, err
 			}

 			messages = append(messages, api.Message{Role: msg.Role, Images: []api.ImageData{img}})
@@ -648,6 +628,35 @@ func nameFromToolCallID(messages []Message, toolCallID string) string {
 	return ""
 }
+
+// decodeImageURL decodes a base64 data URI into raw image bytes.
+func decodeImageURL(url string) (api.ImageData, error) {
+	types := []string{"jpeg", "jpg", "png", "webp"}
+
+	// Support blank mime type to match /api/chat's behavior of taking just unadorned base64
+	if strings.HasPrefix(url, "data:;base64,") {
+		url = strings.TrimPrefix(url, "data:;base64,")
+	} else {
+		valid := false
+		for _, t := range types {
+			prefix := "data:image/" + t + ";base64,"
+			if strings.HasPrefix(url, prefix) {
+				url = strings.TrimPrefix(url, prefix)
+				valid = true
+				break
+			}
+		}
+		if !valid {
+			return nil, errors.New("invalid image input")
+		}
+	}

+	img, err := base64.StdEncoding.DecodeString(url)
+	if err != nil {
+		return nil, errors.New("invalid image input")
+	}
+	return img, nil
+}
+
 // FromCompletionToolCall converts OpenAI ToolCall format to api.ToolCall
 func FromCompletionToolCall(toolCalls []ToolCall) ([]api.ToolCall, error) {
 	apiToolCalls := make([]api.ToolCall, len(toolCalls))
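A short usage sketch of the helper: build a data URI whose payload is standard base64 ("hello" stands in for real image bytes below); the recognized prefix is stripped before decoding, and any other scheme or mime type yields the "invalid image input" error.

package main

import (
	"encoding/base64"
	"fmt"
)

func main() {
	payload := base64.StdEncoding.EncodeToString([]byte("hello"))
	url := "data:image/png;base64," + payload
	fmt.Println(url) // data:image/png;base64,aGVsbG8=
	// decodeImageURL(url) would strip the prefix and return []byte("hello").
}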
openai/responses.go (new file, 1004 lines): file diff suppressed because it is too large
openai/responses_test.go (new file, 1543 lines): file diff suppressed because it is too large
@@ -842,7 +842,7 @@ func (s *Server) loadModel(
 		panic(err)
 	}

-	ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention, kvCacheType)
+	ctxParams := llama.NewContextParams(kvSize, s.batchSize, s.parallel, threads, flashAttention, kvCacheType)
 	s.lc, err = llama.NewContextWithModel(s.model, ctxParams)
 	if err != nil {
 		panic(err)
@@ -1203,16 +1203,22 @@ func (s *Server) allocModel(
 		return errors.New("loras are not yet implemented")
 	}

+	if s.model.Config().Cache == nil {
+		if parallel > 1 {
+			parallel = 1
+			slog.Warn("model does not support caching, disabling parallel processing")
+		}
+		if s.batchSize < kvSize {
+			s.batchSize = kvSize
+			slog.Warn("model does not support caching, setting batch size to context length", "batch_size", kvSize)
+		}
+	}
+
 	s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
 	if err != nil {
 		return err
 	}

-	if !s.cache.enabled && parallel > 1 {
-		parallel = 1
-		slog.Warn("model does not support caching, disabling parallel processing")
-	}
-
 	s.parallel = parallel
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
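The reasoning behind the second guard: a model without a KV cache cannot carry attention state between batches, so the entire context must fit into a single batch. A sketch of the sizing rule with assumed values:

package main

import "fmt"

func main() {
	kvSize, batchSize := 8192, 512
	if batchSize < kvSize {
		batchSize = kvSize // no KV cache: the whole prompt must go through in one batch
	}
	fmt.Println(batchSize) // 8192
}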
@@ -1532,6 +1532,7 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) {
 	r.POST("/v1/embeddings", middleware.EmbeddingsMiddleware(), s.EmbedHandler)
 	r.GET("/v1/models", middleware.ListMiddleware(), s.ListHandler)
 	r.GET("/v1/models/:model", middleware.RetrieveMiddleware(), s.ShowHandler)
+	r.POST("/v1/responses", middleware.ResponsesMiddleware(), s.ChatHandler)

 	if rc != nil {
 		// wrap old with new
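With the route registered, the endpoint can be exercised like any other OpenAI-compatible path. A hypothetical client call (default port, model name, and request fields assumed from the OpenAI Responses API, not taken from this diff):

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	body := []byte(`{"model":"llama3.2","input":"Say hello","stream":false}`)
	resp, err := http.Post("http://localhost:11434/v1/responses", "application/json", bytes.NewReader(body))
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status, string(out))
}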
@@ -2393,3 +2394,4 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 	}
 	return msgs
 }
+
@@ -127,6 +127,9 @@ var funcs = template.FuncMap{
 		// Default format is YYYY-MM-DD
 		return time.Now().Format("2006-01-02")
 	},
+	"yesterdayDate": func(args ...string) string {
+		return time.Now().AddDate(0, 0, -1).Format("2006-01-02")
+	},
 	"toTypeScriptType": func(v any) string {
 		if param, ok := v.(api.ToolProperty); ok {
 			return param.ToTypeScriptType()
@@ -10,6 +10,7 @@ import (
 	"slices"
 	"strings"
 	"testing"
+	"time"

 	"github.com/google/go-cmp/cmp"

@@ -451,6 +452,72 @@ func TestExecuteWithSuffix(t *testing.T) {
 	}
 }
+
+func TestDateFunctions(t *testing.T) {
+	t.Run("currentDate", func(t *testing.T) {
+		tmpl, err := Parse("{{- range .Messages }}{{ .Content }}{{ end }} Today is {{ currentDate }}")
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		var b bytes.Buffer
+		if err := tmpl.Execute(&b, Values{Messages: []api.Message{{Role: "user", Content: "Hello"}}}); err != nil {
+			t.Fatal(err)
+		}
+
+		expected := "Hello Today is " + time.Now().Format("2006-01-02")
+		if b.String() != expected {
+			t.Errorf("got %q, want %q", b.String(), expected)
+		}
+	})
+
+	t.Run("yesterdayDate", func(t *testing.T) {
+		tmpl, err := Parse("{{- range .Messages }}{{ .Content }}{{ end }} Yesterday was {{ yesterdayDate }}")
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		var b bytes.Buffer
+		if err := tmpl.Execute(&b, Values{Messages: []api.Message{{Role: "user", Content: "Hello"}}}); err != nil {
+			t.Fatal(err)
+		}
+
+		expected := "Hello Yesterday was " + time.Now().AddDate(0, 0, -1).Format("2006-01-02")
+		if b.String() != expected {
+			t.Errorf("got %q, want %q", b.String(), expected)
+		}
+	})
+
+	t.Run("yesterdayDate format", func(t *testing.T) {
+		tmpl, err := Parse("{{- range .Messages }}{{ end }}{{ yesterdayDate }}")
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		var b bytes.Buffer
+		if err := tmpl.Execute(&b, Values{Messages: []api.Message{{Role: "user", Content: "Hello"}}}); err != nil {
+			t.Fatal(err)
+		}
+
+		// Verify the format matches YYYY-MM-DD
+		result := b.String()
+		if len(result) != 10 {
+			t.Errorf("expected date length 10, got %d: %q", len(result), result)
+		}
+
+		// Parse and verify it's a valid date
+		parsed, err := time.Parse("2006-01-02", result)
+		if err != nil {
+			t.Errorf("failed to parse date %q: %v", result, err)
+		}
+
+		// Verify it's yesterday
+		yesterday := time.Now().AddDate(0, 0, -1)
+		if parsed.Year() != yesterday.Year() || parsed.Month() != yesterday.Month() || parsed.Day() != yesterday.Day() {
+			t.Errorf("expected yesterday's date, got %v", parsed)
+		}
+	})
+}
+
 func TestCollate(t *testing.T) {
 	cases := []struct {
 		name string