mirror of
https://github.com/ollama/ollama.git
synced 2026-04-18 13:54:11 +02:00
deepseekocr
This commit is contained in:
committed by
Michael Yang
parent
8ed1adf3db
commit
92981ae3f2
192
model/models/deepseekocr/model.go
Normal file
192
model/models/deepseekocr/model.go
Normal file
@@ -0,0 +1,192 @@
|
||||
package deepseekocr
|
||||
|
||||
import (
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.TextProcessor
|
||||
|
||||
Sam *samModel `gguf:"s"`
|
||||
Vision *visionModel `gguf:"v"`
|
||||
Text *textModel
|
||||
|
||||
ImageNewline ml.Tensor `gguf:"mm.image_newline"`
|
||||
//nolint:misspell // this misspelling is upstream. fixing it breaks the model
|
||||
ViewSeperator ml.Tensor `gguf:"mm.view_seperator"`
|
||||
|
||||
Projector *nn.Linear `gguf:"mm.layers"`
|
||||
}
|
||||
|
||||
func (m *Model) EncodeMultimodal(ctx ml.Context, bts []byte) ([]input.Multimodal, error) {
|
||||
patches, original, crop, err := ProcessImage(ctx, bts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var outputs []ml.Tensor
|
||||
if true { // TODO: local features if sum(patches) != 0
|
||||
samOutputs := m.Sam.Forward(ctx, patches)
|
||||
visionOutputs := m.Vision.Forward(ctx, patches, samOutputs)
|
||||
|
||||
samOutputs = samOutputs.Reshape(ctx, -1, samOutputs.Dim(2), samOutputs.Dim(3)).Permute(ctx, 1, 0, 2, 3)
|
||||
visionOutputs = visionOutputs.Slice(ctx, 1, 1, visionOutputs.Dim(1), 1)
|
||||
localOutputs := visionOutputs.Concat(ctx, samOutputs, 0)
|
||||
localOutputs = m.Projector.Forward(ctx, localOutputs)
|
||||
|
||||
hw := int(math.Sqrt(float64(localOutputs.Dim(1))))
|
||||
localOutputs = localOutputs.Reshape(ctx, -1, hw, crop[0], crop[1])
|
||||
localOutputs = localOutputs.Permute(ctx, 0, 2, 1, 3)
|
||||
localOutputs = localOutputs.Contiguous(ctx, -1, crop[0]*hw, crop[1]*hw)
|
||||
localOutputs = localOutputs.Concat(ctx, m.ImageNewline.Repeat(ctx, 2, localOutputs.Dim(2)), 1)
|
||||
localOutputs = localOutputs.Reshape(ctx, localOutputs.Dim(0), -1)
|
||||
|
||||
outputs = append(outputs, localOutputs)
|
||||
}
|
||||
|
||||
samOutputs := m.Sam.Forward(ctx, original)
|
||||
visionOutputs := m.Vision.Forward(ctx, original, samOutputs)
|
||||
|
||||
samOutputs = samOutputs.Reshape(ctx, -1, samOutputs.Dim(2), samOutputs.Dim(3)).Permute(ctx, 1, 0, 2, 3)
|
||||
visionOutputs = visionOutputs.Slice(ctx, 1, 1, visionOutputs.Dim(1), 1)
|
||||
globalOutputs := visionOutputs.Concat(ctx, samOutputs, 0)
|
||||
globalOutputs = m.Projector.Forward(ctx, globalOutputs)
|
||||
|
||||
hw := int(math.Sqrt(float64(globalOutputs.Dim(1))))
|
||||
globalOutputs = globalOutputs.Reshape(ctx, -1, hw, hw)
|
||||
globalOutputs = globalOutputs.Concat(ctx, m.ImageNewline.Repeat(ctx, 2, globalOutputs.Dim(2)), 1)
|
||||
globalOutputs = globalOutputs.Reshape(ctx, globalOutputs.Dim(0), -1)
|
||||
|
||||
outputs = append(outputs, globalOutputs, m.ViewSeperator)
|
||||
return []input.Multimodal{
|
||||
{Tensor: outputs[0].Stack(ctx, 1, outputs[1:]...)},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
|
||||
outputs := make([]*input.Input, 0, len(inputs))
|
||||
for i := range inputs {
|
||||
if inputs[i].Multimodal == nil {
|
||||
outputs = append(outputs, inputs[i])
|
||||
continue
|
||||
}
|
||||
|
||||
t := inputs[i].Multimodal[0].Tensor
|
||||
outputs = append(outputs, &input.Input{
|
||||
Token: 128815,
|
||||
Multimodal: inputs[i].Multimodal,
|
||||
MultimodalHash: inputs[i].MultimodalHash,
|
||||
SameBatch: t.Dim(1) - 1,
|
||||
})
|
||||
|
||||
outputs = slices.Grow(outputs, t.Dim(1)-1)
|
||||
outputs = append(outputs, slices.Repeat([]*input.Input{{Token: 128815}}, t.Dim(1)-1)...)
|
||||
}
|
||||
return outputs, nil
|
||||
}
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
inputsEmbeds := m.Text.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
|
||||
positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
|
||||
|
||||
for _, mm := range batch.Multimodal {
|
||||
t := mm.Multimodal[0].Tensor
|
||||
ctx.Forward(t.Copy(ctx, inputsEmbeds.View(ctx, mm.Index*inputsEmbeds.Stride(1), t.Dim(0)*t.Dim(1))))
|
||||
}
|
||||
|
||||
hiddenStates := inputsEmbeds
|
||||
for i, block := range m.Text.Blocks {
|
||||
if m.Cache != nil {
|
||||
m.Cache.SetLayer(i)
|
||||
}
|
||||
|
||||
var outputs ml.Tensor
|
||||
if i == len(m.Text.Blocks)-1 {
|
||||
outputs = batch.Outputs
|
||||
}
|
||||
|
||||
hiddenStates = block.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Text.Options)
|
||||
}
|
||||
|
||||
hiddenStates = m.Text.OutputNorm.Forward(ctx, hiddenStates, m.Text.Options.eps)
|
||||
return m.Text.Output.Forward(ctx, hiddenStates), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
model.Register("deepseekocr", func(c fs.Config) (model.Model, error) {
|
||||
textBlocks := make([]textBlock, c.Uint("block_count"))
|
||||
leadingDenseBlockCount := int(c.Uint("leading_dense_block_count", 1))
|
||||
for i := range textBlocks {
|
||||
if i >= leadingDenseBlockCount {
|
||||
textBlocks[i].FeedForward = &textMoe{}
|
||||
} else {
|
||||
textBlocks[i].FeedForward = &textMLP{}
|
||||
}
|
||||
}
|
||||
|
||||
m := Model{
|
||||
TextProcessor: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
EOS: append(
|
||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||
),
|
||||
},
|
||||
// Split regex into multiple parts (according to DeepSeek3's regex)
|
||||
"\\p{N}{1,3}",
|
||||
`[一-龥-ゟ゠-ヿ]+`,
|
||||
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
|
||||
),
|
||||
Text: &textModel{
|
||||
Blocks: textBlocks,
|
||||
Options: textOptions{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
numExperts: int(c.Uint("expert_count")),
|
||||
numExpertsUsed: int(c.Uint("expert_used_count")),
|
||||
ropeBase: c.Float("rope.freq_base", 10_000),
|
||||
ropeScale: c.Float("rope.scaling.factor", 1.0),
|
||||
eps: c.Float("attention.layer_norm_rms_epsilon", 1e-6),
|
||||
},
|
||||
},
|
||||
Vision: &visionModel{
|
||||
Blocks: make([]visionBlock, c.Uint("vision.block_count")),
|
||||
Options: visionOptions{
|
||||
hiddenSize: int(c.Uint("vision.embedding_length")),
|
||||
numHeads: int(c.Uint("vision.head_count")),
|
||||
imageSize: int(c.Uint("vision.image_size", 224)),
|
||||
patchSize: int(c.Uint("vision.patch_size", 14)),
|
||||
eps: c.Float("vision.attention.layer_norm_epsilon", 1e-5),
|
||||
},
|
||||
},
|
||||
Sam: &samModel{
|
||||
Blocks: make([]samBlock, c.Uint("sam.block_count")),
|
||||
Options: samOptions{
|
||||
hiddenSize: int(c.Uint("sam.embedding_length")),
|
||||
numHeads: int(c.Uint("sam.head_count")),
|
||||
eps: c.Float("sam.attention.layer_norm_epsilon", 1e-6),
|
||||
globalAttentionLayers: c.Ints("sam.global_attention_indexes"),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
m.Cache = kvcache.NewCausalCache(m.Text.Shift)
|
||||
return &m, nil
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user