mirror of
https://github.com/ollama/ollama.git
synced 2026-04-21 16:25:42 +02:00
The updated interface supports variadic attention options, which removes the need for individual `AttentionWith...` functions. This means more models can use the attention interface, e.g. models with custom masks, logit softcapping, etc. Additionally, this interface should be less error-prone since there are now reasonable defaults for all optional parameters.
118 lines
3.9 KiB
Go
118 lines
3.9 KiB
Go
package deepseekocr
|
|
|
|
import (
|
|
"math"
|
|
|
|
"github.com/ollama/ollama/ml"
|
|
"github.com/ollama/ollama/ml/nn"
|
|
)
|
|
|
|
// visionModel is the vision tower: a convolutional patch embedding plus a
// learned class token and absolute position embeddings, followed by a stack
// of transformer encoder blocks. Fields are loaded from GGUF via the tags.
type visionModel struct {
	// PatchEmbedding projects fixed-size image patches into the hidden dimension.
	PatchEmbedding *nn.Conv2D `gguf:"patch_embd"`
	// ClassEmbedding is the learned class token prepended to the patch sequence.
	ClassEmbedding ml.Tensor `gguf:"class_embd"`
	// PositionEmbedding holds absolute position embeddings for the class token
	// plus every patch position (interpolated at runtime if sizes differ).
	PositionEmbedding *nn.Embedding `gguf:"position_embd"`

	// PreLayerNorm normalizes embeddings before the encoder stack.
	// NOTE(review): gguf key "pre_layrnorm" is spelled as in the checkpoint.
	PreLayerNorm *nn.LayerNorm `gguf:"pre_layrnorm"`
	Blocks []visionBlock `gguf:"blk"`

	Options visionOptions
}
|
|
|
|
// absolutePositionEmbedding returns the position-embedding tensor to add to
// embeds. When the input's patch grid differs from the grid the table was
// trained on, the patch positions are bilinearly interpolated to match while
// the class-token position (index 0) is carried over unchanged.
func (m *visionModel) absolutePositionEmbedding(ctx ml.Context, embeds ml.Tensor) ml.Tensor {
	// Number of patches at the configured image/patch size; +1 below accounts
	// for the class token occupying position 0.
	numPatches := m.Options.imageSize / m.Options.patchSize * m.Options.imageSize / m.Options.patchSize
	positions := ctx.Arange(0, float32(numPatches+1), 1, ml.DTypeI32)
	positionEmbeds := m.PositionEmbedding.Forward(ctx, positions)

	// Side lengths of the (assumed square) patch grids: stored table vs. input.
	source := int(math.Sqrt(float64(positionEmbeds.Dim(1) - 1)))
	target := int(math.Sqrt(float64(embeds.Dim(1) - 1)))
	if source != target {
		// Drop the class-token position before resampling the spatial grid.
		newPositionEmbeds := positionEmbeds.Slice(ctx, 1, 1, positionEmbeds.Dim(1), 1)
		// Lay the remaining positions out as a source x source grid, move the
		// spatial dims into interpolation order, then resize to target x target.
		newPositionEmbeds = newPositionEmbeds.Reshape(ctx, -1, source, source)
		newPositionEmbeds = newPositionEmbeds.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
		newPositionEmbeds = newPositionEmbeds.Interpolate(ctx, [4]int{target, target, embeds.Dim(0), 1}, ml.SamplingModeBilinear)
		// Undo the permutation and flatten back to (channels, target*target).
		newPositionEmbeds = newPositionEmbeds.Permute(ctx, 1, 2, 0, 3)
		newPositionEmbeds = newPositionEmbeds.Contiguous(ctx, -1, target*target)

		// Re-attach the original class-token position in front of the resized grid.
		positionEmbeds = positionEmbeds.Slice(ctx, 1, 0, 1, 1).Concat(ctx, newPositionEmbeds, 1)
	}

	return positionEmbeds
}
|
|
|
|
func (m *visionModel) Forward(ctx ml.Context, pixelValues, patchEmbeds ml.Tensor) ml.Tensor {
|
|
if patchEmbeds == nil {
|
|
patchEmbeds = m.PatchEmbedding.Forward(ctx, pixelValues, m.Options.patchSize, m.Options.patchSize, 0, 0, 1, 1)
|
|
}
|
|
|
|
patchEmbeds = patchEmbeds.Reshape(ctx, -1, patchEmbeds.Dim(2), patchEmbeds.Dim(3))
|
|
patchEmbeds = patchEmbeds.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
|
|
|
classEmbeds := m.ClassEmbedding.Repeat(ctx, 2, patchEmbeds.Dim(2))
|
|
embeds := classEmbeds.Concat(ctx, patchEmbeds, 1)
|
|
embeds = embeds.Add(ctx, m.absolutePositionEmbedding(ctx, embeds))
|
|
|
|
hiddenStates := m.PreLayerNorm.Forward(ctx, embeds, m.Options.eps)
|
|
for _, block := range m.Blocks {
|
|
hiddenStates = block.Forward(ctx, hiddenStates, m.Options)
|
|
}
|
|
|
|
return hiddenStates
|
|
}
|
|
|
|
// visionOptions carries the vision tower hyperparameters read from model
// metadata: the transformer width and head count, the layer-norm epsilon,
// and the input image / patch geometry.
type visionOptions struct {
	hiddenSize,
	numHeads int
	eps float32

	imageSize, patchSize int
}

// headDim reports the width of a single attention head, i.e. the hidden
// size divided evenly across all heads.
func (o visionOptions) headDim() int {
	return o.hiddenSize / o.numHeads
}
|
|
|
|
// visionBlock is one pre-norm transformer encoder layer:
// LayerNorm -> self-attention -> residual, then LayerNorm -> MLP -> residual.
type visionBlock struct {
	Norm1 *nn.LayerNorm `gguf:"layer_norm1"`
	Attention *visionAttention `gguf:"self_attn"`
	Norm2 *nn.LayerNorm `gguf:"layer_norm2"`
	FeedForward *visionMLP `gguf:"mlp"`
}
|
|
|
|
func (m *visionBlock) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts visionOptions) ml.Tensor {
|
|
residual := hiddenStates
|
|
hiddenStates = m.Norm1.Forward(ctx, hiddenStates, opts.eps)
|
|
hiddenStates = m.Attention.Forward(ctx, hiddenStates, opts)
|
|
hiddenStates = hiddenStates.Add(ctx, residual)
|
|
|
|
residual = hiddenStates
|
|
hiddenStates = m.Norm2.Forward(ctx, hiddenStates, opts.eps)
|
|
hiddenStates = m.FeedForward.Forward(ctx, hiddenStates)
|
|
hiddenStates = hiddenStates.Add(ctx, residual)
|
|
return hiddenStates
|
|
}
|
|
|
|
// visionAttention is multi-head self-attention with a single fused
// query/key/value projection and an output projection.
type visionAttention struct {
	QKV *nn.Linear `gguf:"qkv_proj"`
	Output *nn.Linear `gguf:"out_proj"`
}
|
|
|
|
func (m *visionAttention) Forward(ctx ml.Context, t ml.Tensor, opts visionOptions) ml.Tensor {
|
|
qkv := m.QKV.Forward(ctx, t)
|
|
qkv = qkv.Reshape(ctx, opts.headDim(), -1, qkv.Dim(1), qkv.Dim(2))
|
|
chunks := qkv.Chunk(ctx, 1, opts.numHeads)
|
|
query, key, value := chunks[0], chunks[1], chunks[2]
|
|
|
|
attention := nn.Attention(ctx, query, key, value, nil)
|
|
attention = attention.Reshape(ctx, -1, attention.Dim(2), attention.Dim(3))
|
|
return m.Output.Forward(ctx, attention)
|
|
}
|
|
|
|
// visionMLP is the two-layer feed-forward network of an encoder block:
// fc1 -> QuickGELU -> fc2.
type visionMLP struct {
	FC1 *nn.Linear `gguf:"fc1"`
	FC2 *nn.Linear `gguf:"fc2"`
}
|
|
|
|
func (m *visionMLP) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
|
|
return m.FC2.Forward(ctx, m.FC1.Forward(ctx, t).QuickGELU(ctx))
|
|
}
|