package lfm2

import (
	"math"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
)

// lfm2VisionBatchSize is the trailing (batch) dimension used when the
// attention tensors are reshaped; it is fixed at 1.
const lfm2VisionBatchSize = 1

// visionPatchGrid is the size of an image's patch grid, counted in patches.
type visionPatchGrid struct {
	Width  int
	Height int
}

// VisionSelfAttention holds the query, key, value and output projections of
// one vision attention block.
type VisionSelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_output,alt:attn_out"`
}

// Forward projects hiddenState into queries, keys and values, splits each into
// opts.numHeads heads of opts.hiddenSize/opts.numHeads elements, runs
// nn.Attention scaled by 1/sqrt(headDim), merges the heads back to
// opts.hiddenSize and applies the output projection.
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	headDim := opts.hiddenSize / opts.numHeads
	scale := 1.0 / math.Sqrt(float64(headDim))

	q := sa.Query.Forward(ctx, hiddenState)
	k := sa.Key.Forward(ctx, hiddenState)
	v := sa.Value.Forward(ctx, hiddenState)

	// Split the projected features into heads, keeping dimension 1 of each
	// projection as the third dimension.
	q = q.Reshape(ctx, headDim, opts.numHeads, q.Dim(1), lfm2VisionBatchSize)
	k = k.Reshape(ctx, headDim, opts.numHeads, k.Dim(1), lfm2VisionBatchSize)
	v = v.Reshape(ctx, headDim, opts.numHeads, v.Dim(1), lfm2VisionBatchSize)

	attended := nn.Attention(ctx, q, k, v, scale, nil)
	merged := attended.Reshape(ctx, opts.hiddenSize, attended.Dim(2), lfm2VisionBatchSize)

	return sa.Output.Forward(ctx, merged)
}

// VisionMLP is the feed-forward block of an encoder layer: an up projection,
// a GELU activation, and a down projection.
type VisionMLP struct {
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
}

// Forward returns Down(GELU(Up(hiddenState))).
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
	expanded := mlp.Up.Forward(ctx, hiddenState)
	activated := expanded.GELU(ctx)

	return mlp.Down.Forward(ctx, activated)
}

// VisionEncoderLayer is one encoder block: layer norm followed by
// self-attention, then layer norm followed by the MLP, each wrapped in a
// residual connection.
type VisionEncoderLayer struct {
	LayerNorm1    *nn.LayerNorm `gguf:"ln1"`
	SelfAttention *VisionSelfAttention
	LayerNorm2    *nn.LayerNorm `gguf:"ln2"`
	MLP           *VisionMLP
}

// Forward runs the attention sub-block and then the MLP sub-block. Each
// sub-block normalizes its input with opts.eps and adds that (un-normalized)
// input back onto its output.
func (l *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	// Attention sub-block with residual.
	normed := l.LayerNorm1.Forward(ctx, hiddenState, opts.eps)
	attnOut := l.SelfAttention.Forward(ctx, normed, opts)
	afterAttn := attnOut.Add(ctx, hiddenState)

	// MLP sub-block with residual.
	normed = l.LayerNorm2.Forward(ctx, afterAttn, opts.eps)
	mlpOut := l.MLP.Forward(ctx, normed)

	return mlpOut.Add(ctx, afterAttn)
}

// VisionModelOptions carries the vision encoder hyperparameters that
// newVisionModel reads from the model configuration.
type VisionModelOptions struct {
	hiddenSize, numHeads int
	imageSize, patchSize int
	eps                  float32
}

// VisionModel is the vision encoder: a patch embedding convolution, an
// optional position embedding table, a stack of encoder layers and a final
// layer norm.
type VisionModel struct {
	PatchEmbedding    *nn.Conv2D    `gguf:"patch_embd"`
	PositionEmbedding *nn.Embedding `gguf:"position_embd"`
	PostLayerNorm     *nn.LayerNorm `gguf:"post_ln"`

	Layers []VisionEncoderLayer `gguf:"blk"`

	*VisionModelOptions
}

// Forward embeds pixelValues into patches.Width*patches.Height patch vectors,
// adds position embeddings when a position table is present, runs every
// encoder layer in order and applies the final layer norm.
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, patches visionPatchGrid) ml.Tensor {
	patchCount := patches.Width * patches.Height

	// Convolve with stride patchSize in both directions, no padding, dilation 1.
	embedded := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
	embedded = embedded.Reshape(ctx, patchCount, m.hiddenSize)
	// Swap the first two dimensions so hiddenSize comes first.
	embedded = embedded.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

	if m.PositionEmbedding != nil {
		positions := m.positionEmbeddings(ctx, patches, embedded.Dim(0))
		embedded = embedded.Add(ctx, positions)
	}

	for i := range m.Layers {
		embedded = m.Layers[i].Forward(ctx, embedded, m.VisionModelOptions)
	}

	return m.PostLayerNorm.Forward(ctx, embedded, m.eps)
}

// positionEmbeddings returns the position embeddings to add to the patch
// embeddings of a patches.Width x patches.Height grid. channels is the
// leading dimension of the patch embeddings and is used as the third entry of
// the interpolation target shape.
//
// When the table's second dimension is a perfect square whose side differs
// from the patch grid in width or height, the whole table is bilinearly
// resized to the grid. Otherwise the first patches.Width*patches.Height
// entries are looked up directly.
func (m *VisionModel) positionEmbeddings(ctx ml.Context, patches visionPatchGrid, channels int) ml.Tensor {
	tableSize := m.PositionEmbedding.Weight.Dim(1)
	side := int(math.Sqrt(float64(tableSize)))
	patchCount := patches.Width * patches.Height

	isSquare := side > 0 && side*side == tableSize
	matchesGrid := side == patches.Width && side == patches.Height
	if !isSquare || matchesGrid {
		ids := ctx.Arange(0, float32(patchCount), 1, ml.DTypeI32)
		return m.PositionEmbedding.Forward(ctx, ids)
	}

	// SigLIP2 NAFlex-style position interpolation for variable image sizes.
	ids := ctx.Arange(0, float32(tableSize), 1, ml.DTypeI32)
	table := m.PositionEmbedding.Forward(ctx, ids)
	table = table.Reshape(ctx, -1, side, side)
	// Move the two side-length dimensions to the front before resizing.
	table = table.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)

	target := [4]int{patches.Width, patches.Height, channels, 1}
	table = table.Interpolate(ctx, target, ml.SamplingModeBilinear)

	// Undo the permutation and flatten the grid back into one position axis.
	table = table.Permute(ctx, 1, 2, 0, 3)

	return table.Contiguous(ctx, -1, patchCount)
}

// newVisionModel builds a VisionModel whose layer count and options come from
// the "vision.*" keys of c. Every key except vision.block_count has a default.
func newVisionModel(c fs.Config) *VisionModel {
	opts := &VisionModelOptions{
		hiddenSize: int(c.Uint("vision.embedding_length", 1152)),
		numHeads:   int(c.Uint("vision.attention.head_count", 16)),
		imageSize:  int(c.Uint("vision.image_size", 256)),
		patchSize:  int(c.Uint("vision.patch_size", 16)),
		eps:        c.Float("vision.attention.layer_norm_epsilon", 1e-6),
	}

	return &VisionModel{
		Layers:             make([]VisionEncoderLayer, c.Uint("vision.block_count")),
		VisionModelOptions: opts,
	}
}

// VisionProjector post-processes vision encoder output with an optional layer
// norm followed by two linear layers with a GELU in between.
type VisionProjector struct {
	LayerNorm *nn.LayerNorm `gguf:"layer_norm"`
	Linear1   *nn.Linear    `gguf:"1"`
	Linear2   *nn.Linear    `gguf:"2"`
}

// VisionProjectorOptions configures VisionProjector.Forward: scaleFactor is
// the patch merge factor (values below 1 are treated as 1) and useLayerNorm
// enables the layer norm step.
type VisionProjectorOptions struct {
	scaleFactor  int
	useLayerNorm bool
}

// Forward merges neighbouring patches when the merge factor exceeds 1, applies
// the layer norm (epsilon 1e-5) when opts.useLayerNorm is set and the norm is
// present, and then returns Linear2(GELU(Linear1(features))).
func (p *VisionProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, patches visionPatchGrid, opts VisionProjectorOptions) ml.Tensor {
	features := visionOutputs

	if factor := max(opts.scaleFactor, 1); factor > 1 {
		features = p.mergePatches(ctx, features, patches, factor)
	}

	if opts.useLayerNorm && p.LayerNorm != nil {
		features = p.LayerNorm.Forward(ctx, features, 1e-5)
	}

	activated := p.Linear1.Forward(ctx, features).GELU(ctx)

	return p.Linear2.Forward(ctx, activated)
}

// mergePatches folds each factor x factor neighbourhood of the patch grid into
// a single vector of features.Dim(0)*factor*factor elements and returns the
// result flattened to two dimensions.
func (p *VisionProjector) mergePatches(ctx ml.Context, features ml.Tensor, patches visionPatchGrid, factor int) ml.Tensor {
	channels := features.Dim(0)
	cols, rows := patches.Width, patches.Height

	merged := features.Reshape(ctx, channels, cols, rows)

	// Match llama.cpp patch merger: pad spatial dims to merge factor.
	padCols := (factor - cols%factor) % factor
	padRows := (factor - rows%factor) % factor
	if padCols != 0 || padRows != 0 {
		merged = merged.Pad(ctx, 0, padCols, padRows, 0)
		cols += padCols
		rows += padRows
	}

	// Fold `factor` adjacent columns into the leading dimension.
	merged = merged.Reshape(ctx, channels*factor, cols/factor, rows)
	// Swap the two grid dimensions, then fold `factor` adjacent rows as well.
	merged = merged.Permute(ctx, 0, 2, 1).Contiguous(ctx, channels*factor*factor, rows/factor, cols/factor)
	// Swap the grid dimensions back.
	merged = merged.Permute(ctx, 0, 2, 1).Contiguous(ctx)

	// Flatten the merged grid into a single axis.
	return merged.Contiguous(ctx, merged.Dim(0), merged.Dim(1)*merged.Dim(2))
}