package glmocr

import (
	"log/slog"
	"math"
	"slices"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/ml/nn/rope"
)

type Grid struct {
	Height      int // Number of patches in height direction
	Width       int // Number of patches in width direction
	Temporal    int // Number of patches in temporal direction
	ImageHeight int // Full image height in pixels
	ImageWidth  int // Full image width in pixels
}

type VisionModelOptions struct {
	hiddenSize        int
	numHeads          int
	headDim           int
	numChannels       int
	patchSize         int
	temporalPatchSize int
	imageSize         int
	spatialMergeSize  int
	outHiddenSize     int
	intermediateSize  int
	eps               float32
}

type VisionPatchEmbed struct {
	Proj  *nn.Conv2D `gguf:"patch_embd_0"`
	Proj1 *nn.Conv2D `gguf:"patch_embd_1"`
	Bias  ml.Tensor  `gguf:"patch_embd.bias"`
}

func (pe *VisionPatchEmbed) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
	_ = grid // patches are already in merge-block order

	// pixelValues shape: [patchDim, numPatches]
	numPatches := pixelValues.Shape()[1]

	// Reshape to [patchSize*patchSize, temporalPatchSize, numChannels, numPatches]
	pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)

	// Permute to [temporalPatchSize, patchSize*patchSize, numChannels, numPatches]
	pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

	// Slice temporal frames for Conv2D (simulating Conv3D)
	in0 := pixelValues.View(ctx, 0,
		1, pixelValues.Stride(1),
		pixelValues.Dim(1), pixelValues.Stride(2),
		pixelValues.Dim(2), pixelValues.Stride(3),
		pixelValues.Dim(3),
	).Contiguous(ctx)
	in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)

	s0, s1 := opts.patchSize, opts.patchSize
	p0, p1 := 0, 0
	d0, d1 := 1, 1
	hiddenStates := pe.Proj.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)

	if pe.Proj1 != nil && opts.temporalPatchSize > 1 {
		in1 := pixelValues.View(ctx, pixelValues.Stride(0),
			1, pixelValues.Stride(1),
			pixelValues.Dim(1), pixelValues.Stride(2),
			pixelValues.Dim(2), pixelValues.Stride(3),
			pixelValues.Dim(3),
		).Contiguous(ctx)
		in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
		out1 := pe.Proj1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)
		hiddenStates = hiddenStates.Add(ctx, out1)
	}

	// Flatten to [hidden_size, num_patches]
	hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, numPatches)

	// Add the patch bias, reshaped from [hidden_size] to [hidden_size, 1] for broadcasting
	if pe.Bias != nil {
		hiddenStates = hiddenStates.Add(ctx, pe.Bias.Reshape(ctx, opts.hiddenSize, 1))
	}

	return hiddenStates
}

type VisionSelfAttention struct {
	QKV    *nn.Linear  `gguf:"attn_qkv"`
	QNorm  *nn.RMSNorm `gguf:"attn_q_norm"`
	KNorm  *nn.RMSNorm `gguf:"attn_k_norm"`
	Output *nn.Linear  `gguf:"attn_out"`
}

func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	batchSize := hiddenStates.Dim(1)

	// Combined QKV projection: [3*hidden_size, batch_size]
	qkv := sa.QKV.Forward(ctx, hiddenStates)

	// Split using ChunkSections along dim 0 (handles byte offsets correctly).
	// ChunkSections returns views, so each chunk must be made contiguous before further operations.
	chunks := qkv.ChunkSections(ctx, 0, opts.hiddenSize, opts.hiddenSize, opts.hiddenSize)
	q := chunks[0].Contiguous(ctx)
	k := chunks[1].Contiguous(ctx)
	v := chunks[2].Contiguous(ctx)
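	// For example (illustrative, with the default hiddenSize of 1024): qkv is
	// [3072, N], and the three equal sections are views over rows 0..1023 (Q),
	// 1024..2047 (K), and 2048..3071 (V).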
	// Reshape for multi-head attention: [hiddenSize, N] -> [headDim, numHeads, N]
	q = q.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
	k = k.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
	v = v.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)

	// Apply Q-norm and K-norm after the head reshape.
	// Norm weights are [headDim] (64 with the defaults); the tensor is [headDim, numHeads, N].
	q = sa.QNorm.Forward(ctx, q, opts.eps)
	k = sa.KNorm.Forward(ctx, k, opts.eps)

	// Apply rotary position embeddings with vision-style 2D positions.
	// ggml's vision RoPE uses two position dimensions (H/W) with half-rotation pairs.
	// We provide H/W sections and leave the remaining sections empty.
	ropeFreqBase := float32(10000.0)
	section := opts.headDim / 4
	if section <= 0 {
		section = 1
	}
	sections := []int{section, section, 0, 0}
	q = nn.RoPE(ctx, q, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))
	k = nn.RoPE(ctx, k, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))

	// Scale factor for scaled dot-product attention
	scale := 1.0 / math.Sqrt(float64(opts.headDim))

	// Try flash attention first (ScaledDotProductAttention); fall back to manual attention otherwise
	if sdpa, ok := q.(ml.ScaledDotProductAttention); ok {
		attention := sdpa.ScaledDotProductAttention(ctx, k, v, nil, nil, nil, scale, false)
		attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
		return sa.Output.Forward(ctx, attention)
	}

	slog.Warn("glmocr: vision attention falling back to manual attention",
		"batchSize", batchSize,
		"numHeads", opts.numHeads,
		"hint", "set OLLAMA_FLASH_ATTENTION=1 to enable flash attention")

	// Manual attention fallback.
	// q, k, v are [headDim, numHeads, batchSize]; GGML treats them as 4D with an implicit dim 3 = 1.
	q = q.Permute(ctx, 0, 2, 1, 3)
	k = k.Permute(ctx, 0, 2, 1, 3)
	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

	// Attention scores
	kq := k.MulmatFullPrec(ctx, q)
	kq = kq.Scale(ctx, scale)
	kq = kq.Softmax(ctx)

	// Attention output: v @ kq (note: v first)
	kqv := v.Mulmat(ctx, kq)
	attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)

	return sa.Output.Forward(ctx, attention)
}

type VisionMLP struct {
	Gate *nn.Linear `gguf:"ffn_gate"`
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
}

func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
	// SwiGLU: down(silu(gate(x)) * up(x))
	gate := mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
	return mlp.Down.Forward(ctx, gate)
}

type VisionBlock struct {
	Norm1         *nn.RMSNorm `gguf:"ln1"`
	SelfAttention *VisionSelfAttention
	Norm2         *nn.RMSNorm `gguf:"ln2"`
	MLP           *VisionMLP
}

func (b *VisionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	// Pre-norm architecture
	residual := hiddenStates
	hiddenStates = b.Norm1.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = b.SelfAttention.Forward(ctx, hiddenStates, positions, opts)
	hiddenStates = hiddenStates.Add(ctx, residual)

	residual = hiddenStates
	hiddenStates = b.Norm2.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = b.MLP.Forward(ctx, hiddenStates)
	hiddenStates = hiddenStates.Add(ctx, residual)

	return hiddenStates
}

type VisionDownsample struct {
	*nn.Conv2D
}

func (d *VisionDownsample) Forward(ctx ml.Context, hiddenStates ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
	// Apply spatial downsampling via Conv2D.
	// Input: [hidden_size, num_patches], where patches are in merge-block order.
	if d.Conv2D == nil || d.Weight == nil {
		slog.Error("VisionDownsample weights not loaded - model may be corrupted or incompatible")
		return hiddenStates // Return input unchanged as fallback
	}

	merge := opts.spatialMergeSize
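	// For example (illustrative, merge=2): a 28x28 patch grid gives
	// numOutputTokens = (28/2)*(28/2) = 196, with each output token pooling one
	// 2x2 block of patches through the stride-2 Conv2D below.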
	numOutputTokens := (grid.Height / merge) * (grid.Width / merge)

	// Step 1: Reshape to [hidden_size, merge, merge, num_output_tokens]
	hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, merge, merge, numOutputTokens)

	// Step 2: Permute to [merge, merge, hidden_size, num_output_tokens].
	// ggml semantics: result.ne[perm[i]] = input.ne[i], so permute(2,0,1,3) on
	// [1024,2,2,N] gives ne[2]=1024, ne[0]=2, ne[1]=2, ne[3]=N -> [2,2,1024,N].
	hiddenStates = hiddenStates.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)

	// Step 3: Apply Conv2D without bias (bias is added after the reshape).
	// Note: ggml_conv_2d takes (kernel, input), so the kernel must be the receiver in ollama.
	s0, s1 := merge, merge
	p0, p1 := 0, 0
	d0, d1 := 1, 1
	hiddenStates = d.Weight.Conv2D(ctx, hiddenStates, s0, s1, p0, p1, d0, d1)

	// Step 4: Reshape to [out_hidden_size, num_output_tokens]
	hiddenStates = hiddenStates.Reshape(ctx, opts.outHiddenSize, numOutputTokens)

	// Step 5: Add bias after the reshape.
	// Reshape bias from [out_hidden_size] to [out_hidden_size, 1] for proper broadcasting.
	if d.Bias != nil {
		hiddenStates = hiddenStates.Add(ctx, d.Bias.Reshape(ctx, opts.outHiddenSize, 1))
	}

	return hiddenStates
}

type PatchMerger struct {
	// GGUF tags align with the mm.* keys used by the model
	Proj     *nn.Linear    `gguf:"model.fc"`  // mm.model.fc.weight
	PostLN   *nn.LayerNorm `gguf:"post_norm"` // mm.post_norm.weight/bias
	GateProj *nn.Linear    `gguf:"gate"`      // mm.gate.weight
	UpProj   *nn.Linear    `gguf:"up"`        // mm.up.weight
	DownProj *nn.Linear    `gguf:"down"`      // mm.down.weight
}

func (m *PatchMerger) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	// Linear projection
	hiddenStates = m.Proj.Forward(ctx, hiddenStates)

	// Post-projection layer norm + GELU (erf)
	hiddenStates = m.PostLN.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = hiddenStates.GELU_ERF(ctx)

	// Force a copy to avoid in-place mutation issues with GELU_ERF
	hiddenStates = hiddenStates.Contiguous(ctx)

	// SwiGLU MLP: down(silu(gate(x)) * up(x))
	gateOut := m.GateProj.Forward(ctx, hiddenStates)
	upOut := m.UpProj.Forward(ctx, hiddenStates)
	gate := gateOut.SILU(ctx, upOut)
	return m.DownProj.Forward(ctx, gate)
}

type VisionModel struct {
	PatchEmbed *VisionPatchEmbed
	Blocks     []VisionBlock `gguf:"blk"`
	PostLN     *nn.RMSNorm   `gguf:"post_ln"`
	// Note: Downsample is applied at the model level so mm.patch_merger stays separate

	*VisionModelOptions
}

func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
	// Extract patch embeddings from flattened patches
	hiddenStates := m.PatchEmbed.Forward(ctx, pixelValues, grid, m.VisionModelOptions)

	// Create position IDs for RoPE (spatial grid).
	// Patches are already in merge-block order from preprocessing.
	positions := m.createPositions(ctx, grid)

	// Process through vision blocks
	for _, block := range m.Blocks {
		hiddenStates = block.Forward(ctx, hiddenStates, positions, m.VisionModelOptions)
	}

	// Post-layernorm
	hiddenStates = m.PostLN.Forward(ctx, hiddenStates, m.eps)

	// Note: Downsample is applied separately in Model.EncodeMultimodal,
	// so mm.patch_merger remains a distinct module.
	return hiddenStates
}

func (m *VisionModel) createPositions(ctx ml.Context, grid *Grid) ml.Tensor {
	// Create spatial position IDs for vision RoPE.
	// Position layout: [height, width, height, width] - 4 sections for mrope.
	// Patches are in MERGE-BLOCK order after VisionPatchEmbed interleaving.
	// This follows the GLM-OCR rot_pos_emb layout.
	numPatches := grid.Height * grid.Width
	mergeRatio := m.spatialMergeSize
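	// For example (illustrative, mergeRatio=2 on a 4x4 grid), the (h, w) pairs
	// are emitted block by block:
	//   (0,0) (0,1) (1,0) (1,1)   <- top-left 2x2 block
	//   (0,2) (0,3) (1,2) (1,3)   <- top-right 2x2 block
	//   (2,0) (2,1) (3,0) (3,1)   ...
	// matching the merge-block patch order produced during preprocessing.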
	// Build position arrays in merge-block order.
	// Each merge_ratio x merge_ratio block of patches is grouped together.
	hpos := make([]int32, numPatches)
	wpos := make([]int32, numPatches)
	ptr := 0
	for y := 0; y < grid.Height; y += mergeRatio {
		for x := 0; x < grid.Width; x += mergeRatio {
			for dy := range mergeRatio {
				for dx := range mergeRatio {
					hpos[ptr] = int32(y + dy)
					wpos[ptr] = int32(x + dx)
					ptr++
				}
			}
		}
	}

	// Build position arrays for 4 sections (mrope). ggml vision RoPE uses only H/W;
	// keep the remaining sections zeroed to match its conventions.
	zeros := make([]int32, numPatches)
	s := [][]int32{
		hpos,  // Section 0: height
		wpos,  // Section 1: width
		zeros, // Section 2: unused
		zeros, // Section 3: unused
	}
	return ctx.Input().FromInts(slices.Concat(s...), numPatches*4)
}

func newVisionModel(c fs.Config) *VisionModel {
	hiddenSize := int(c.Uint("vision.embedding_length", 1024))
	numHeads := int(c.Uint("vision.attention.head_count", 16))
	numChannels := int(c.Uint("vision.num_channels", 3))
	patchSize := int(c.Uint("vision.patch_size", 14))
	temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
	imageSize := int(c.Uint("vision.image_size", 336))
	spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
	outHiddenSize := int(c.Uint("vision.out_hidden_size", 1536))
	intermediateSize := int(c.Uint("vision.intermediate_size", 4096))
	eps := c.Float("vision.attention.layer_norm_rms_epsilon", 1e-5)

	return &VisionModel{
		Blocks: make([]VisionBlock, c.Uint("vision.block_count", 24)),
		VisionModelOptions: &VisionModelOptions{
			hiddenSize:        hiddenSize,
			numHeads:          numHeads,
			headDim:           hiddenSize / numHeads,
			numChannels:       numChannels,
			patchSize:         patchSize,
			temporalPatchSize: temporalPatchSize,
			imageSize:         imageSize,
			spatialMergeSize:  spatialMergeSize,
			outHiddenSize:     outHiddenSize,
			intermediateSize:  intermediateSize,
			eps:               eps,
		},
	}
}
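// Shape walkthrough (a sketch, assuming the defaults above: patchSize=14,
// temporalPatchSize=2, hiddenSize=1024, spatialMergeSize=2, outHiddenSize=1536)
// for a single 448x448 image:
//
//	grid:          32x32 patches (448/14), so numPatches = 1024
//	pixelValues:   [14*14*2*3, 1024] = [1176, 1024]
//	PatchEmbed:    [1024, 1024]   (hiddenSize x numPatches)
//	Blocks/PostLN: [1024, 1024]   (shape-preserving)
//	Downsample:    [1536, 256]    ((32/2)*(32/2) = 256 output tokens)
//
// The PatchMerger then maps the downsampled tokens into the language model's
// embedding space.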