restructure

image processing Update model.go Update model.go Update model.go no projector no projector vision model scaffold ... ... wip ... rebase fix patch merger tidy ... Update model_vision.go server: do not attempt to parse offset file as gguf This logic was causing issues for me when importing a gguf that had some padding at the end of the file. The valid gguf would be read, but then it would try to read the offset as a different gguf file. This does not seem right. Update process_image_test.go apply norm prompt processing prompt processing fix post tokenize fix gguf padding + populate the split patch embeddings ... ... another shot at patch embeddings ... patch embedding Update model_vision.go split pixels
2026-04-23 17:29:54 +02:00 · 2025-04-02 10:41:51 -07:00
parent 198b1e6db9
commit c1f9bcb4dd
17 changed files with 1194 additions and 208 deletions
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@@ -1,10 +1,11 @@
 package qwen25vl

 import (
+	"bytes"
 	"fmt"
-	"math"
-	"strings"
+	"image"

+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
@@ -12,147 +13,151 @@ import (
 	"github.com/ollama/ollama/model/input"
 )

-type Options struct {
-	ctxLen, hiddenSize, numHeads, numKVHeads int
-	eps                                      float32
-	ropeConfig                               ml.RoPEConfig
-}
-
 type Model struct {
 	model.Base
-	model.BytePairEncoding
+	*TextModel
+	*VisionModel `gguf:"v,vision"`
+	*PatchMerger `gguf:"mm"`

-	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
-	Layers         []Layer       `gguf:"blk"`
-	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
-	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
-
-	*Options
+	ImageProcessor
 }

-func New(c ml.Config) (model.Model, error) {
-	if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
-		return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
+// Compile-time check that *Model satisfies model.MultimodalProcessor
+var _ model.MultimodalProcessor = (*Model)(nil)
+
+type PatchMerger struct {
+	MLPLayer1 *nn.Linear `gguf:"0"` // first linear layer, applied before GELU in Forward
+	MLPLayer2 *nn.Linear `gguf:"2"` // second linear layer, applied after GELU in Forward
+}
+
+// Forward reshapes the vision output so 4 positions share one row, then applies Linear -> GELU -> Linear
+func (pm *PatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, eps float32) ml.Tensor {
+	// Read the first three dimensions; eps is accepted but not used anywhere in this body
+	hiddenSize := visionOutputs.Dim(0)
+	numPositions := visionOutputs.Dim(1)
+	batchSize := visionOutputs.Dim(2)
+
+	reshaped := visionOutputs.Reshape(ctx, hiddenSize*4, numPositions/4, batchSize)
+
+	// Apply first linear layer (gguf tag "0" under the "mm" prefix)
+	hidden := pm.MLPLayer1.Forward(ctx, reshaped)
+
+	activated := hidden.GELU(ctx)
+
+	// Apply second linear layer (gguf tag "2" under the "mm" prefix)
+	output := pm.MLPLayer2.Forward(ctx, activated)
+
+	return output
+}
+
+func New(c fs.Config) (model.Model, error) {
+	m := &Model{
+		TextModel:      NewTextModel(c),
+		VisionModel:    newVisionModel(c),
+		ImageProcessor: newImageProcessor(c),
 	}

-	m := Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Uints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-			},
-		),
-		Layers: make([]Layer, c.Uint("block_count")),
-		Options: &Options{
-			ctxLen:     int(c.Uint("context_length")),
-			hiddenSize: int(c.Uint("embedding_length")),
-			numHeads:   int(c.Uint("attention.head_count")),
-			numKVHeads: int(c.Uint("attention.head_count_kv")),
-			eps:        c.Float("attention.layer_norm_rms_epsilon"),
-			ropeConfig: ml.RoPEConfig{
-				Base:       c.Float("rope.freq_base"),
-				Scale:      c.Float("rope.freq_scale", 1),
-				Dim:        c.Uint("rope.dimension_count", 128),
-				Type:       ml.RopeTypeNeox,
-				YarnConfig: ml.DefaultYarnConfig(int32(c.Uint("context_length", 32768))),
-			},
-		},
+	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
+
+	return m, nil
+}
+
+type imageFeatures struct {
+	Tensor ml.Tensor // output of VisionModel.Forward followed by PatchMerger.Forward
+	GridT  int       // temporal grid size returned by ProcessImage
+	GridH  int       // grid height returned by ProcessImage
+	GridW  int       // grid width returned by ProcessImage
+}
+
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+	if len(m.VisionModel.Layers) == 0 {
+		return nil, model.ErrNoVisionModel
 	}

-	m.Cache = kvcache.NewCausalCache(m.Shift)
-
-	return &m, nil
-}
-
-// SelfAttention implements the multi-head self-attention mechanism
-// with separate projections for query, key, value and output transformations
-type SelfAttention struct {
-	Query       *nn.Linear `gguf:"attn_q"`
-	Key         *nn.Linear `gguf:"attn_k"`
-	Value       *nn.Linear `gguf:"attn_v"`
-	Output      *nn.Linear `gguf:"attn_output"`
-	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight"`
-}
-
-func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	batchSize := hiddenState.Dim(1)
-	headDim := opts.hiddenSize / opts.numHeads
-
-	q := sa.Query.Forward(ctx, hiddenState)
-	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeConfig)
-
-	k := sa.Key.Forward(ctx, hiddenState)
-	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeConfig)
-
-	v := sa.Value.Forward(ctx, hiddenState)
-	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-
-	scaleFactor := 1.0 / math.Sqrt(float64(headDim))
-	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
-	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
-
-	return sa.Output.Forward(ctx, kqv)
-}
-
-// Shift applies rotary position embeddings to the key tensor for causal attention caching
-func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, m.ropeConfig), nil
-}
-
-// MLP implements the feed-forward network component with SwiGLU activation
-type MLP struct {
-	Up   *nn.Linear `gguf:"ffn_up"`
-	Down *nn.Linear `gguf:"ffn_down"`
-	Gate *nn.Linear `gguf:"ffn_gate"`
-}
-
-func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
-	// Apply SwiGLU activation gating
-	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
-	// Project back to hidden dimension
-	return mlp.Down.Forward(ctx, hiddenState)
-}
-
-// Layer represents a single transformer layer combining self-attention and feed-forward components
-type Layer struct {
-	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
-	SelfAttention *SelfAttention
-	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
-	MLP           *MLP
-}
-
-func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	// Self-attention branch with residual connection
-	residual := hiddenState
-
-	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
-
-	// In the final layer (outputs != nil), optimize by pruning to just the token positions
-	// we need logits for.
-	if outputs != nil {
-		hiddenState = hiddenState.Rows(ctx, outputs)
-		residual = residual.Rows(ctx, outputs)
+	image, _, err := image.Decode(bytes.NewReader(multimodalData))
+	if err != nil {
+		return nil, err
 	}

-	hiddenState = hiddenState.Add(ctx, residual)
-	// Feed-forward branch with residual connection
-	residual = hiddenState
-	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
-	return hiddenState.Add(ctx, residual)
+	f32s, gridT, gridH, gridW, err := m.ImageProcessor.ProcessImage(image)
+	if err != nil {
+		return nil, err
+	}
+
+	// Calculate tensor dimensions
+	patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
+		m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
+	numPatches := gridT * gridH * gridW
+
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create tensor from image: %w", err)
+	}
+
+	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
+	visionOutputs = m.PatchMerger.Forward(ctx, visionOutputs, m.VisionModel.eps)
+
+	return &imageFeatures{
+		Tensor: visionOutputs,
+		GridT:  gridT,
+		GridH:  gridH,
+		GridW:  gridW,
+	}, nil
+}
+
+// PostTokenize replaces each multimodal input with vision-start, a run of image tokens, and vision-end
+func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
+	var result []input.Input
+
+	// Image and vision boundary token IDs are hard-coded here, not read from config
+	imageToken := 151655
+	visionStartToken := 151652
+	visionEndToken := 151653
+
+	// Merge size as stored on the image processor (newImageProcessor reads vision.spatial_merge_size)
+	mergeSize := m.ImageProcessor.mergeSize
+
+	for _, inp := range inputs {
+		if inp.Multimodal == nil {
+			// If not a multimodal input, add it to the result unchanged
+			result = append(result, inp)
+		} else {
+			// Multimodal input: must hold *imageFeatures, otherwise this unchecked assertion panics
+			features := inp.Multimodal.(*imageFeatures)
+
+			// Get grid dimensions from the features
+			gridT := features.GridT
+			gridH := features.GridH
+			gridW := features.GridW
+
+			// Number of image tokens = grid cells / mergeSize^2 (integer division; mergeSize 0 would panic)
+			mergeLength := mergeSize * mergeSize
+			gridProduct := gridT * gridH * gridW
+			tokensPerGrid := gridProduct / mergeLength
+
+			// First add the vision start token
+			result = append(result, input.Input{Token: int32(visionStartToken)})
+
+			// Add the image token with the multimodal tensor data at the first position
+			result = append(result, input.Input{
+				Token:          int32(imageToken),
+				Multimodal:     features.Tensor,
+				MultimodalHash: inp.MultimodalHash,
+			})
+
+			// Add the placeholder tokens for the remaining positions (tokensPerGrid-1)
+			for range tokensPerGrid - 1 {
+				result = append(result, input.Input{Token: int32(imageToken)})
+			}
+
+			result = append(result, input.Input{Token: int32(visionEndToken)})
+		}
+	}
+
+	return result, nil
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	// Convert input tokens and positions to tensors
 	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
 	if err != nil {
 		return nil, err
@@ -163,25 +168,10 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 		return nil, err
 	}

-	// Initial token embedding
-	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
-
-	// Process through transformer layers
-	for i, layer := range m.Layers {
-		m.Cache.SetLayer(i)
-
-		var lastLayerOutputs ml.Tensor
-		if i == len(m.Layers)-1 {
-			lastLayerOutputs = outputs
-		}
-
-		hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options)
-	}
-
-	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
-	return m.Output.Forward(ctx, hiddenState), nil
+	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
 }

 func init() { // registers New under both the "qwen25vl" and "qwen2vl" names
+	model.Register("qwen25vl", New)
 	model.Register("qwen2vl", New)
 }
--- a/model/models/qwen25vl/model_test.go
+++ b/model/models/qwen25vl/model_test.go
@@ -0,0 +1,59 @@
+package qwen25vl
+
+import (
+	"testing"
+
+	"github.com/ollama/ollama/ml/backend/ggml"
+	"github.com/ollama/ollama/model/input"
+)
+
+func TestPostTokenize(t *testing.T) {
+	// NOTE(review): Model is zero-valued, so ImageProcessor.mergeSize is 0 when PostTokenize runs
+	model := &Model{}
+	mockHash := uint64(12345678)
+
+	inputs := []input.Input{
+		{Token: 123}, // Regular token
+		{Token: 456}, // Regular token
+		{Token: 151655, Multimodal: &ggml.Tensor{}, MultimodalHash: mockHash}, // Image token; NOTE(review): PostTokenize asserts *imageFeatures, so a *ggml.Tensor here looks like it panics — confirm
+		{Token: 789}, // Regular token
+	}
+
+	// Run the function being tested
+	result, err := model.PostTokenize(inputs)
+	if err != nil {
+		t.Fatalf("PostTokenize returned error: %v", err)
+	}
+
+	// Expected length: 3 regular tokens + vision start + 16 image tokens + vision end
+	expectedLength := 21
+	if len(result) != expectedLength {
+		t.Fatalf("Result has wrong length: got %d, expected %d", len(result), expectedLength)
+	}
+
+	// Spot-check selected indices only (index 3, the image token carrying the tensor, is checked via its hash below)
+	checkPositions := map[int]int32{
+		0:  123,    // First regular token
+		1:  456,    // Second regular token
+		2:  151652, // Vision start token
+		4:  151655, // First placeholder token
+		19: 151653, // Vision end token
+		20: 789,    // Final regular token
+	}
+
+	for pos, expectedToken := range checkPositions {
+		if pos >= len(result) {
+			t.Errorf("Position %d is out of bounds (result length: %d)", pos, len(result))
+			continue
+		}
+		if result[pos].Token != expectedToken {
+			t.Errorf("Position %d: expected token %d, got %d", pos, expectedToken, result[pos].Token)
+		}
+	}
+
+	// The hash from the original input must be carried over to the first image token
+	if result[3].MultimodalHash != mockHash {
+		t.Errorf("Multimodal hash not preserved: got %d, expected %d",
+			result[3].MultimodalHash, mockHash)
+	}
+}
--- a/model/models/qwen25vl/model_text.go
+++ b/model/models/qwen25vl/model_text.go
@@ -0,0 +1,165 @@
+package qwen25vl
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type TextOptions struct { // text-model hyperparameters; every field is filled from config in NewTextModel
+	ctxLen, hiddenSize, numHeads, numKVHeads int
+	eps                                      float32
+	ropeConfig                               ml.RoPEConfig
+}
+
+type TextModel struct { // text decoder: tokenizer, token embedding, transformer layers, final norm and output projection
+	model.Base
+	model.BytePairEncoding
+
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	Layers         []Layer       `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
+
+	*TextOptions
+}
+
+func NewTextModel(c fs.Config) *TextModel { // builds the tokenizer, an empty layer slice and TextOptions from config c
+	m := TextModel{
+		BytePairEncoding: model.NewBytePairEncoding(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Uints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+			},
+		),
+		Layers: make([]Layer, c.Uint("block_count")), // one zero-valued Layer per block
+		TextOptions: &TextOptions{
+			ctxLen:     int(c.Uint("context_length")),
+			hiddenSize: int(c.Uint("embedding_length")),
+			numHeads:   int(c.Uint("attention.head_count")),
+			numKVHeads: int(c.Uint("attention.head_count_kv")),
+			eps:        c.Float("attention.layer_norm_rms_epsilon"),
+			ropeConfig: ml.RoPEConfig{
+				Base:       c.Float("rope.freq_base"),
+				Scale:      c.Float("rope.freq_scale", 1),
+				Dim:        c.Uint("rope.dimension_count", 128),
+				Type:       ml.RopeTypeNeox,
+				YarnConfig: ml.DefaultYarnConfig(int32(c.Uint("context_length", 128000))), // default 128000 applies only here; ctxLen above is read without a default
+			},
+		},
+	}
+
+	return &m
+}
+
+// SelfAttention holds the query, key, value and output projections of one text layer,
+// plus the rope factor tensor that Forward and TextModel.Shift pass to RoPE
+type SelfAttention struct {
+	Query       *nn.Linear `gguf:"attn_q"`
+	Key         *nn.Linear `gguf:"attn_k"`
+	Value       *nn.Linear `gguf:"attn_v"`
+	Output      *nn.Linear `gguf:"attn_output"`
+	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight"`
+}
+
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { // Q/K/V projection, RoPE on Q and K, attention with cache, output projection
+	batchSize := hiddenState.Dim(1)
+	headDim := opts.hiddenSize / opts.numHeads // integer division
+
+	q := sa.Query.Forward(ctx, hiddenState)
+	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
+	q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeConfig)
+
+	k := sa.Key.Forward(ctx, hiddenState)
+	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+	k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeConfig)
+
+	v := sa.Value.Forward(ctx, hiddenState)
+	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize) // values are reshaped only; no RoPE is applied
+
+	scaleFactor := 1.0 / math.Sqrt(float64(headDim))
+	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
+	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
+
+	return sa.Output.Forward(ctx, kqv)
+}
+
+// Shift applies RoPE to key using the shift positions and the given layer's rope factors; New passes it to kvcache.NewCausalCache
+func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, m.ropeConfig), nil
+}
+
+// MLP holds the up, down and gate projections of the text feed-forward block (combined in MLP.Forward)
+type MLP struct {
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+	Gate *nn.Linear `gguf:"ffn_gate"`
+}
+
+func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor {
+	// Gating: SILU(Gate(x)) * Up(x); opts is accepted but not read in this body
+	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
+	// Project back to hidden dimension
+	return mlp.Down.Forward(ctx, hiddenState)
+}
+
+// Layer is one text transformer block: a norm before attention, self-attention, a norm before the MLP, and the MLP
+type Layer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	SelfAttention *SelfAttention
+	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP           *MLP
+}
+
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+	// Attention branch: keep the input for the residual add, normalize, then attend
+	residual := hiddenState
+
+	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
+
+	// When outputs is non-nil (TextModel.Forward passes it only for the last layer), keep just
+	// the rows selected by outputs in both the branch result and the residual.
+	if outputs != nil {
+		hiddenState = hiddenState.Rows(ctx, outputs)
+		residual = residual.Rows(ctx, outputs)
+	}
+
+	hiddenState = hiddenState.Add(ctx, residual)
+	// Feed-forward branch: same normalize -> transform -> residual add pattern
+	residual = hiddenState
+	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
+	return hiddenState.Add(ctx, residual)
+}
+
+func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) (ml.Tensor, error) { // NOTE(review): batch is not read in this body
+	// Look up embeddings for the input tokens
+	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
+
+	// Run every layer in order, pointing the cache at the current layer first
+	for i, layer := range m.Layers {
+		cache.SetLayer(i)
+
+		var lastLayerOutputs ml.Tensor
+		if i == len(m.Layers)-1 {
+			lastLayerOutputs = outputs // only the final layer receives outputs; earlier layers get nil
+		}
+
+		hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, cache, m.TextOptions)
+	}
+
+	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
+	return m.Output.Forward(ctx, hiddenState), nil // the error result is always nil
+}
--- a/model/models/qwen25vl/model_vision.go
+++ b/model/models/qwen25vl/model_vision.go
@@ -0,0 +1,260 @@
+package qwen25vl
+
+import (
+	"fmt"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+)
+
+var batchSize int = 1 // fixed trailing dimension used by the reshapes in VisionSelfAttention.Forward
+
+// VisionSelfAttention holds the query, key, value and output projections of one vision encoder layer
+type VisionSelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_out"`
+}
+
+// Forward projects to Q/K/V, applies RoPEMulti to Q and K, runs attention with a nil cache, and projects the result
+func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates ml.Tensor, positionIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	query := sa.Query.Forward(ctx, hiddenStates)
+	key := sa.Key.Forward(ctx, hiddenStates)
+	value := sa.Value.Forward(ctx, hiddenStates)
+
+	query = query.Reshape(ctx, opts.headDim, opts.numHeads, query.Dim(1), batchSize)
+	key = key.Reshape(ctx, opts.headDim, opts.numHeads, key.Dim(1), batchSize)
+	value = value.Reshape(ctx, opts.headDim, opts.numHeads, value.Dim(1), batchSize)
+
+	config := ml.RoPEConfig{
+		Dim:        uint32(opts.headDim / 2),
+		Type:       ml.RopeTypeMRoPE,
+		Base:       opts.ropeTheta,
+		Scale:      1.0,
+		YarnConfig: ml.DefaultYarnConfig(128000), // context length is hard-coded here rather than read from options
+	}
+
+	query = query.RoPEMulti(
+		ctx,
+		positionIDs,
+		nil,
+		[4]int{opts.headDim / 4, opts.headDim / 4, opts.headDim / 4, opts.headDim / 4}, // four equal sections of headDim/4
+		config,
+	)
+	key = key.RoPEMulti(
+		ctx,
+		positionIDs,
+		nil,
+		[4]int{opts.headDim / 4, opts.headDim / 4, opts.headDim / 4, opts.headDim / 4},
+		config,
+	)
+
+	// Scale factor 1/sqrt(headDim) for scaled dot-product attention
+	scale := 1.0 / math.Sqrt(float64(opts.headDim))
+
+	attention := nn.Attention(ctx, query, key, value, scale, nil)
+	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
+
+	return sa.Output.Forward(ctx, attention)
+}
+
+// VisionMLP holds the gate, up and down projections of the vision feed-forward block
+type VisionMLP struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+// Forward computes the MLP for the vision model; opts is accepted but not read in this body
+func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	// Gated GELU: GELU(Gate(x)) * Up(x), then projected by Down
+	gateOutput := mlp.Gate.Forward(ctx, hiddenStates)
+	upOutput := mlp.Up.Forward(ctx, hiddenStates)
+	hiddenStates = gateOutput.GELU(ctx).Mul(ctx, upOutput)
+
+	return mlp.Down.Forward(ctx, hiddenStates)
+}
+
+// VisionEncoderLayer groups two RMSNorms, self-attention and the MLP that make up one vision block
+type VisionEncoderLayer struct {
+	Norm1         *nn.RMSNorm `gguf:"ln1"`
+	Norm2         *nn.RMSNorm `gguf:"ln2"`
+	SelfAttention *VisionSelfAttention
+	MLP           *VisionMLP
+}
+
+// Forward applies Norm1 -> self-attention and Norm2 -> MLP, adding the branch input back after each
+func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates ml.Tensor, positionIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	residual := hiddenStates
+	hiddenStates = e.Norm1.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = e.SelfAttention.Forward(ctx, hiddenStates, positionIDs, opts)
+	hiddenStates = hiddenStates.Add(ctx, residual)
+
+	residual = hiddenStates
+	hiddenStates = e.Norm2.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts)
+	return hiddenStates.Add(ctx, residual)
+}
+
+// VisionModelOptions contains configuration options for the Qwen vision model; newVisionModel fills every field
+type VisionModelOptions struct {
+	hiddenSize       int
+	numHeads         int
+	headDim          int
+	intermediateSize int
+	imageSize        int
+	patchSize        int
+	numChannels      int
+	eps              float32
+	ropeTheta        float32
+	outHiddenSize    int
+}
+
+type PatchEmbedding struct { // two Conv2D layers; Forward applies one to each temporal slice and adds the results
+	PatchConv0 *nn.Conv2D `gguf:"patch_embd_0"`
+	PatchConv1 *nn.Conv2D `gguf:"patch_embd_1"`
+}
+
+func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, patchSize int) ml.Tensor { // convolves two temporal slices separately and sums them into (embedDim, numPatches)
+	shape := pixelValues.Shape()
+	numChannels := 3
+	temporalPatchSize := 2
+	embedDim := 1280 // NOTE(review): channels, temporal patch size and embed dim are hard-coded, not read from VisionModelOptions
+	numPatches := shape[1] / temporalPatchSize
+
+	// Take two views of the input, one per temporal frame, and process each separately
+	// First temporal slice (frame 0)
+	slice0 := pixelValues.View(ctx, 0, patchSize*patchSize*numChannels, 0, numPatches, 0, 1).Contiguous(ctx)
+	reshaped0 := slice0.Reshape(ctx,
+		patchSize,   // height
+		patchSize,   // width
+		numChannels, // channels
+		numPatches)  // batch
+
+	// Second temporal slice (frame 1)
+	slice1 := pixelValues.View(ctx, 0, patchSize*patchSize*numChannels, 0, numPatches, 1, 1).Contiguous(ctx)
+	reshaped1 := slice1.Reshape(ctx,
+		patchSize,   // height
+		patchSize,   // width
+		numChannels, // channels
+		numPatches)  // batch
+
+	// Apply the appropriate convolution to each temporal slice
+	// PatchConv0 corresponds to weights for temporal frame 0
+	// PatchConv1 corresponds to weights for temporal frame 1
+	s0, s1 := patchSize, patchSize // Use full stride as in original
+	p0, p1 := 0, 0                 // padding
+	d0, d1 := 1, 1                 // dilation
+
+	output0 := pe.PatchConv0.Forward(ctx, reshaped0, s0, s1, p0, p1, d0, d1)
+	output1 := pe.PatchConv1.Forward(ctx, reshaped1, s0, s1, p0, p1, d0, d1)
+
+	// Add the outputs from the two temporal convolutions
+	combined := output0.Add(ctx, output1)
+
+	// Reshape to (embedDim, numPatches)
+	result := combined.Reshape(ctx, embedDim, numPatches)
+
+	fmt.Println(ml.Dump(ctx, result)) // NOTE(review): debug dump to stdout on every call; it is the only use of "fmt" in this file, so removing it also requires dropping that import
+
+	return result
+}
+
+// VisionPatchMerger holds an RMSNorm and a linear layer; VisionModel.Forward in this file does not call it
+type VisionPatchMerger struct {
+	LNQ *nn.RMSNorm `gguf:"ln_q"`
+	MLP *nn.Linear  `gguf:"mlp"`
+}
+
+// Forward normalizes x, reshapes it, and applies the linear layer; outDim is accepted but not read in this body
+func (pm *VisionPatchMerger) Forward(ctx ml.Context, x ml.Tensor, outDim, contextDim, spatialMergeSize int) ml.Tensor {
+	hiddenSize := contextDim * (spatialMergeSize * spatialMergeSize)
+
+	// Normalize with a hard-coded eps of 1e-6, then reshape to (-1, hiddenSize)
+	x = pm.LNQ.Forward(ctx, x, 1e-6)
+	x = x.Reshape(ctx, -1, hiddenSize)
+
+	// Apply the linear layer to the reshaped tensor
+	x = pm.MLP.Forward(ctx, x)
+
+	return x
+}
+
+// VisionModel bundles the patch embedding, encoder layers, final layer norm, patch merger and options
+type VisionModel struct {
+	PatchEmbedding *PatchEmbedding
+	Layers         []VisionEncoderLayer `gguf:"blk"`
+	PostLayerNorm  *nn.LayerNorm        `gguf:"post_ln"`
+	PatchMerger    *VisionPatchMerger   `gguf:"patch_merger"`
+
+	*VisionModelOptions
+}
+
+// Forward embeds the patches, runs every encoder layer with generated position IDs, then applies PostLayerNorm
+func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
+	// Derive a patch grid from dims 0 and 1; NOTE(review): EncodeMultimodal builds pixelValues as (patchDim, numPatches) — confirm these divisions give the intended grid
+	numPatchesH := pixelValues.Dim(0) / m.patchSize
+	numPatchesW := pixelValues.Dim(1) / m.patchSize
+	numPatches := numPatchesH * numPatchesW
+
+	// Extract patch embeddings
+	hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize)
+
+	// Create position IDs - for Qwen2VL mRoPE we need 4 values per position
+	// The format needed is specified in the C++ code as "mrope expecting 4 position ids per token"
+	positions := make([]int32, numPatches*4)
+
+	for h := 0; h < numPatchesH; h++ {
+		for w := 0; w < numPatchesW; w++ {
+			idx := h*numPatchesW + w
+			// For each position, store both h and w coordinates twice
+			// This matches the pattern seen in the C++ implementation
+			positions[idx*4] = int32(h)   // y coordinate
+			positions[idx*4+1] = int32(w) // x coordinate
+			positions[idx*4+2] = int32(h) // y coordinate (repeated)
+			positions[idx*4+3] = int32(w) // x coordinate (repeated)
+		}
+	}
+
+	// Create a flat position IDs tensor of length numPatches*4
+	positionIDs, err := ctx.Input().FromIntSlice(positions, numPatches*4)
+	if err != nil {
+		panic(err) // Forward has no error result, so a tensor-creation failure panics
+	}
+
+	// Apply encoder layers
+	for _, layer := range m.Layers {
+		hiddenStates = layer.Forward(ctx, hiddenStates, positionIDs, m.VisionModelOptions)
+	}
+
+	hiddenStates = m.PostLayerNorm.Forward(ctx, hiddenStates, m.eps)
+	return hiddenStates // the PatchMerger field is not applied here
+}
+
+// newVisionModel builds a VisionModel whose options come from vision.* config keys, falling back to the defaults given below
+func newVisionModel(c fs.Config) *VisionModel {
+	patchSize := int(c.Uint("vision.patch_size", 14))
+	hiddenSize := int(c.Uint("vision.embedding_length", 1280))
+	ropeTheta := c.Float("vision.rope_theta", 10000.0)             // not set
+	outHiddenSize := int(c.Uint("vision.out_embedding_length", 0)) // not set
+	numHeads := int(c.Uint("vision.attention.head_count", 16))
+
+	return &VisionModel{
+		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 24)),
+		VisionModelOptions: &VisionModelOptions{
+			hiddenSize:       hiddenSize,
+			numHeads:         numHeads,
+			headDim:          hiddenSize / numHeads,
+			intermediateSize: int(c.Uint("vision.feed_forward_length", 0)),
+			imageSize:        int(c.Uint("vision.image_size", 560)),
+			patchSize:        patchSize,
+			numChannels:      int(c.Uint("vision.num_channels", 3)), // not set
+			eps:              c.Float("vision.attention.layer_norm_epsilon", 1e-6),
+			ropeTheta:        ropeTheta,
+			outHiddenSize:    outHiddenSize,
+		},
+	}
+}
--- a/model/models/qwen25vl/process_image.go
+++ b/model/models/qwen25vl/process_image.go
@@ -0,0 +1,196 @@
+package qwen25vl
+
+import (
+	"fmt"
+	"image"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+// ImageProcessor contains configuration for the Qwen 2.5 VL image processing; newImageProcessor fills every field
+type ImageProcessor struct {
+	imageSize         int
+	numChannels       int
+	patchSize         int
+	temporalPatchSize int
+	mergeSize         int
+	minPixels         int
+	maxPixels         int
+	factor            int
+	rescaleFactor     float32
+	imageMean         []float32
+	imageStd          []float32
+}
+
+// newImageProcessor reads patch size, merge size and image size from config (with defaults); all other fields are hard-coded
+func newImageProcessor(c fs.Config) ImageProcessor {
+
+	patchSize := int(c.Uint("vision.patch_size", 14))
+	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
+
+	return ImageProcessor{
+		imageSize:         int(c.Uint("vision.image_size", 560)),
+		numChannels:       3,
+		patchSize:         patchSize,
+		temporalPatchSize: 2,
+		mergeSize:         mergeSize,
+		minPixels:         56 * 56,
+		maxPixels:         28 * 28 * 4 * 1280,
+		factor:            patchSize * mergeSize,
+		rescaleFactor:     1.0 / 255.0,
+		imageMean:         []float32{0.48145466, 0.4578275, 0.40821073},
+		imageStd:          []float32{0.26862954, 0.26130258, 0.27577711},
+	}
+}
+
+// SmartResize returns (height, width) as multiples of p.factor, rescaled when the rounded area exceeds maxPixels or falls below minPixels
+func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
+	factor := p.factor
+
+	if height < factor || width < factor {
+		panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor)) // NOTE(review): panics on undersized input rather than returning an error
+	} else if float64(max(height, width))/float64(min(height, width)) > 200 {
+		aspectRatio := float64(max(height, width)) / float64(min(height, width))
+		panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %f", aspectRatio))
+	}
+
+	round := func(x float64) int {
+		return int(math.Round(x))
+	}
+	hBar := round(float64(height)/float64(factor)) * factor // nearest multiple of factor
+	wBar := round(float64(width)/float64(factor)) * factor
+
+	if hBar*wBar > p.maxPixels { // too large: scale down by beta and round down to a multiple of factor
+		beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
+
+		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
+		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
+	} else if hBar*wBar < p.minPixels { // too small: scale up by beta and round up to a multiple of factor
+		beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
+
+		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
+		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
+	}
+
+	return hBar, wBar
+}
+
+func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, int, int, int, error) { // returns the flattened patches followed by gridT, gridH, gridW
+	origWidth := img.Bounds().Dx()
+	origHeight := img.Bounds().Dy()
+
+	// Calculate smart resize dimensions (SmartResize panics if a side is below factor or the aspect ratio exceeds 200)
+	resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
+
+	// Resize with imageproc.Resize using the ResizeBilinear mode
+	resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
+
+	normalizedPixels := imageproc.Normalize(
+		resizedImg,
+		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
+		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
+		true, // rescale
+		true, // channelFirst
+	)
+
+	// Grid size in patches along each axis (integer division by patchSize)
+	gridH := resizedHeight / p.patchSize
+	gridW := resizedWidth / p.patchSize
+	gridT := 1 // For single images, temporal dimension is 1
+
+	patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, gridH, gridW, gridT)
+	if err != nil {
+		return nil, 0, 0, 0, fmt.Errorf("failed to create patches: %v", err) // NOTE(review): %v does not wrap err; %w would keep it inspectable with errors.Is/As
+	}
+
+	// Return patches and grid dimensions
+	return patches, gridT, gridH, gridW, nil
+}
+
+func (p *ImageProcessor) createPatches(pixels []float32, height, width, gridH, gridW, gridT int) ([]float32, error) { // copies channel-first pixels into per-patch rows of length patchDim
+	channels := p.numChannels
+	patchSize := p.patchSize
+	mergeSize := p.mergeSize
+	temporalPatchSize := p.temporalPatchSize
+
+	// Calculate output dimensions
+	numPatches := gridT * gridH * gridW
+	patchDim := channels * temporalPatchSize * patchSize * patchSize
+
+	// Output buffer: numPatches rows of patchDim values, zero-initialized
+	result := make([]float32, numPatches*patchDim)
+
+	// Instead of the complex 9D reshape+transpose, directly extract patches
+	// in the format expected by the forward pass
+	patchIndex := 0
+
+	for t := 0; t < gridT; t++ {
+		// Walk the grid in steps of mergeSize so each merge window's patches are emitted consecutively
+		for h := 0; h < gridH; h += mergeSize {
+			for w := 0; w < gridW; w += mergeSize {
+				// Visit the mergeSize x mergeSize patches inside this merge window
+				for mh := 0; mh < mergeSize; mh++ {
+					for mw := 0; mw < mergeSize; mw++ {
+						// For each pixel in the patch
+						for py := 0; py < patchSize; py++ {
+							for px := 0; px < patchSize; px++ {
+								// Calculate source coordinates
+								y := (h+mh)*patchSize + py
+								x := (w+mw)*patchSize + px
+
+								// For each channel
+								for c := 0; c < channels; c++ {
+									// Channel-first format (CHW)
+									srcIdx := c*height*width + y*width + x
+
+									// Destination layout inside a patch row: [channel][temporal frame][py][px]
+									// Only temporal frame 0 is written here; t is not used in either index
+									dstIdx := patchIndex*patchDim +
+										(c * temporalPatchSize * patchSize * patchSize) +
+										(0 * patchSize * patchSize) + // temporal dim
+										(py * patchSize) +
+										px
+
+									if srcIdx < len(pixels) && dstIdx < len(result) { // out-of-range indices are skipped silently
+										result[dstIdx] = pixels[srcIdx]
+									}
+								}
+							}
+						}
+
+						// Fill temporal frames 1..temporalPatchSize-1 by copying frame 0 of the same patch
+						for tp := 1; tp < temporalPatchSize; tp++ {
+							for py := 0; py < patchSize; py++ {
+								for px := 0; px < patchSize; px++ {
+									for c := 0; c < channels; c++ {
+										srcIdx := patchIndex*patchDim +
+											(c * temporalPatchSize * patchSize * patchSize) +
+											(0 * patchSize * patchSize) + // first temporal frame
+											(py * patchSize) +
+											px
+
+										dstIdx := patchIndex*patchDim +
+											(c * temporalPatchSize * patchSize * patchSize) +
+											(tp * patchSize * patchSize) + // current temporal frame
+											(py * patchSize) +
+											px
+
+										if srcIdx < len(result) && dstIdx < len(result) {
+											result[dstIdx] = result[srcIdx] // Copy from first frame
+										}
+									}
+								}
+							}
+						}
+
+						patchIndex++
+					}
+				}
+			}
+		}
+	}
+
+	return result, nil // the error result is always nil in this implementation
+}
--- a/model/models/qwen25vl/process_image_test.go
+++ b/model/models/qwen25vl/process_image_test.go
@@ -0,0 +1,47 @@
+package qwen25vl
+
+import (
+	"image"
+	_ "image/jpeg" // Register JPEG decoder
+	"testing"
+)
+
+func TestSmartResize(t *testing.T) {
+	type smartResizeCase struct {
+		TestImage image.Image
+		Expected  image.Point
+	}
+
+	// Processor under test; maxPixels here (14*14*4*1280) is not the 28*28*4*1280 set by newImageProcessor
+	processor := ImageProcessor{
+		imageSize:   560, // Example value
+		numChannels: 3,
+		factor:      28,
+		minPixels:   56 * 56,
+		maxPixels:   14 * 14 * 4 * 1280,
+	}
+
+	cases := []smartResizeCase{
+		{
+			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)),
+			Expected:  image.Point{980, 980},
+		},
+		{
+			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
+			Expected:  image.Point{1036, 756},
+		},
+		{
+			TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
+			Expected:  image.Point{980, 980},
+		},
+	}
+
+	for _, c := range cases {
+		b := c.TestImage.Bounds().Max
+		x, y := processor.SmartResize(b.X, b.Y) // b.X goes into the height parameter and its result is read back as x, so each axis stays paired with itself
+		actual := image.Point{x, y}
+		if actual != c.Expected {
+			t.Errorf("expected: %v, actual: %v", c.Expected, actual)
+		}
+	}
+}