mirror of
https://github.com/ollama/ollama.git
synced 2026-04-17 19:54:03 +02:00
627 lines
18 KiB
Go
627 lines
18 KiB
Go
package qwen3next
|
|
|
|
import (
|
|
"bytes"
|
|
"cmp"
|
|
"fmt"
|
|
"image"
|
|
"math"
|
|
"slices"
|
|
|
|
"github.com/ollama/ollama/fs"
|
|
"github.com/ollama/ollama/ml"
|
|
"github.com/ollama/ollama/ml/nn"
|
|
"github.com/ollama/ollama/ml/nn/rope"
|
|
"github.com/ollama/ollama/model"
|
|
"github.com/ollama/ollama/model/input"
|
|
"github.com/ollama/ollama/model/models/qwen3vl"
|
|
"github.com/ollama/ollama/tokenizer"
|
|
)
|
|
|
|
// Options contains model configuration
type Options struct {
	// Standard attention geometry.
	hiddenSize  int
	numHeads    int
	numKVHeads  int // non-zero head_count_kv value shared by all full-attention layers
	keyLength   int
	valueLength int
	ropeDim     int // rotary dims; 0 falls back to headDim() (see headDim)

	eps                   float32 // RMSNorm epsilon
	ropeBase              float32 // RoPE frequency base
	ropeScale             float32 // RoPE context-extension factor (1 = none)
	ropeType              string  // e.g. "yarn"; selects extra RoPE options
	originalContextLength int     // pre-extension context length (YaRN)
	attentionScale        float64

	// MoE config
	numExperts     int
	numExpertsUsed int  // top-k experts evaluated per token
	normTopKProb   bool // renormalize top-k routing weights to sum to 1

	// Linear attention (Gated Delta Net) config
	ssmDInner      int // d_inner = head_v_dim * num_v_heads
	ssmDState      int // head_k_dim
	ssmNGroup      int // num_k_heads
	ssmDtRank      int // num_v_heads
	convKernelSize int // SSM conv kernel size
	vHeadReordered bool

	// Per-layer type from GGUF metadata
	isRecurrent []bool

	// RoPE mode config (used by qwen35/qwen35moe)
	mropeSections    []int
	mropeInterleaved bool

	// Pre-computed masks for chunked attention (created once per forward pass)
	masks *Masks
}
|
|
|
|
func (o Options) headDim() int {
|
|
return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
|
|
}
|
|
|
|
// applyRotaryPositionEmbeddings applies rotary position embeddings to states
// at the given positions. MRoPE (interleaved or sectioned) is used when
// mropeSections is configured; otherwise the NeoX layout. YaRN scaling
// options are added when ropeType == "yarn".
func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
	var opts []func(*rope.Options)
	if len(o.mropeSections) > 0 {
		if o.mropeInterleaved {
			opts = append(opts, rope.WithInterleaveMRoPE(o.mropeSections))
		} else {
			opts = append(opts, rope.WithMRoPE(o.mropeSections))
		}
	} else {
		opts = append(opts, rope.WithTypeNeoX())
	}

	if o.ropeType == "yarn" {
		// NOTE(review): this is the reciprocal of the conventional YaRN
		// mscale (0.1*ln(s) + 1.0) — presumably the rope backend multiplies
		// by the attention factor rather than dividing; confirm against the
		// rope package implementation.
		attnFactor := float32(1.0 / (1.0 + 0.1*math.Log(float64(o.ropeScale))))
		opts = append(opts,
			rope.WithOriginalContextLength(o.originalContextLength),
			rope.WithExtrapolationFactor(1.),
			rope.WithAttentionFactor(attnFactor),
		)
	}
	ropeDim := cmp.Or(o.ropeDim, o.headDim())
	// Frequency scale is the inverse of the context-extension factor.
	return nn.RoPE(ctx, states, positions, ropeDim, o.ropeBase, 1./o.ropeScale, opts...)
}
|
|
|
|
// Operator is the interface for attention-like operators. Implementations
// include full softmax attention and the GatedDeltaNet linear-attention
// recurrence; both read/write layer state through the HybridCache.
type Operator interface {
	Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache *HybridCache, opts *Options) (ml.Tensor, error)
}
|
|
|
|
// MLP is the interface for feedforward networks (dense SwiGLU or sparse MoE).
type MLP interface {
	Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor
}
|
|
|
|
// sparse implements MoE with shared experts
type sparse struct {
	// Routed experts: Router scores every expert per token; the top-k are
	// evaluated with batched linear layers.
	Router *nn.Linear      `gguf:"ffn_gate_inp"`
	Gate   *nn.LinearBatch `gguf:"ffn_gate_exps"`
	Up     *nn.LinearBatch `gguf:"ffn_up_exps"`
	Down   *nn.LinearBatch `gguf:"ffn_down_exps"`

	// Shared experts (optional; nil when absent from the GGUF). Applied to
	// every token and added to the routed output. SharedGateInp, when
	// present, provides a sigmoid gate on the shared-expert output.
	SharedGateInp *nn.Linear `gguf:"ffn_gate_inp_shexp"`
	SharedGate    *nn.Linear `gguf:"ffn_gate_shexp"`
	SharedUp      *nn.Linear `gguf:"ffn_up_shexp"`
	SharedDown    *nn.Linear `gguf:"ffn_down_shexp"`
}
|
|
|
|
// Forward routes each token to the top-k experts, combines the expert
// outputs weighted by the (optionally renormalized) softmax routing
// probabilities, and adds the gated shared-expert path when present.
func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
	hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
	if batchSize == 0 {
		// Guard a missing batch dimension so the flatten below stays valid.
		batchSize = 1
	}
	// Flatten (hidden, seq, batch) -> (hidden, tokens): routing is per token.
	hiddenStates2D := hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)

	// Router logits
	routerLogits := mlp.Router.Forward(ctx, hiddenStates2D)

	// Softmax routing weights
	routingWeights := routerLogits.Softmax(ctx)
	selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
	// Gather the probabilities of only the selected experts.
	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates2D.Dim(1)).Rows(ctx, selectedExperts)
	if opts.normTopKProb {
		// Renormalize the top-k probabilities to sum to 1 per token.
		routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates2D.Dim(1))
		routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
		routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates2D.Dim(1))
	}

	// Add a singleton expert axis so batched expert matmuls broadcast.
	hiddenStates3D := hiddenStates2D.Reshape(ctx, hiddenStates2D.Dim(0), 1, hiddenStates2D.Dim(1))

	// Expert computation with SILU activation
	gateOut := mlp.Gate.Forward(ctx, hiddenStates3D, selectedExperts)
	upOut := mlp.Up.Forward(ctx, hiddenStates3D, selectedExperts)
	experts := gateOut.SILU(ctx, upOut)
	experts = mlp.Down.Forward(ctx, experts, selectedExperts)
	experts = experts.Mul(ctx, routingWeights)

	// Sum over experts: accumulate expert slices via strided views rather
	// than materializing a separate reduction.
	moeOut := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
	for i := 1; i < opts.numExpertsUsed; i++ {
		moeOut = moeOut.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
	}

	// Add shared experts if present
	if mlp.SharedUp != nil {
		sharedGate := mlp.SharedGate.Forward(ctx, hiddenStates2D)
		sharedUp := mlp.SharedUp.Forward(ctx, hiddenStates2D)
		sharedOut := sharedGate.SILU(ctx, sharedUp)
		sharedOut = mlp.SharedDown.Forward(ctx, sharedOut)

		// Apply shared expert gating
		if mlp.SharedGateInp != nil {
			sharedGateVal := mlp.SharedGateInp.Forward(ctx, hiddenStates2D)
			sharedGateVal = sharedGateVal.SigmoidOut(ctx)
			// Broadcast gate to match dimensions
			sharedGateVal = sharedGateVal.Repeat(ctx, 0, sharedOut.Dim(0))
			sharedOut = sharedOut.Mul(ctx, sharedGateVal)
		}

		moeOut = moeOut.Add(ctx, sharedOut)
	}

	return moeOut
}
|
|
|
|
// dense implements standard feedforward (SwiGLU: SILU(gate) * up -> down).
type dense struct {
	Gate *nn.Linear `gguf:"ffn_gate"`
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
}
|
|
|
|
func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *Options) ml.Tensor {
|
|
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
|
|
return mlp.Down.Forward(ctx, hiddenStates)
|
|
}
|
|
|
|
// Layer represents a single transformer layer: pre-norm, attention operator
// (full or linear), post-attention norm, then the feedforward (dense or MoE).
type Layer struct {
	AttentionNorm     *nn.RMSNorm `gguf:"attn_norm"`
	AttentionPostNorm *nn.RMSNorm `gguf:"post_attention_norm"` // Post-attention norm before FFN
	Operator          Operator    // full attention or GatedDeltaNet, chosen in New

	FFNNorm *nn.RMSNorm `gguf:"ffn_norm"`
	MLP     MLP // dense or sparse, chosen in New
}
|
|
|
|
func (l *Layer) Forward(ctx ml.Context, layer int, hiddenStates, positions, outputs ml.Tensor, cache *HybridCache, opts *Options) (ml.Tensor, error) {
|
|
residual := hiddenStates
|
|
|
|
// Pre-attention norm
|
|
hiddenStates = l.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
|
|
|
|
// Attention (full or linear)
|
|
var err error
|
|
hiddenStates, err = l.Operator.Forward(ctx, hiddenStates, positions, cache, opts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Output projection for last layer
|
|
if outputs != nil {
|
|
hiddenStates = hiddenStates.Rows(ctx, outputs)
|
|
residual = residual.Rows(ctx, outputs)
|
|
}
|
|
|
|
// First residual connection
|
|
hiddenStates = hiddenStates.Add(ctx, residual)
|
|
|
|
// Save for FFN residual
|
|
ffnResidual := hiddenStates
|
|
|
|
// Post-attention norm (before FFN)
|
|
hiddenStates = l.AttentionPostNorm.Forward(ctx, hiddenStates, opts.eps)
|
|
|
|
// FFN
|
|
hiddenStates = l.MLP.Forward(ctx, hiddenStates, opts)
|
|
|
|
// Second residual connection
|
|
return hiddenStates.Add(ctx, ffnResidual), nil
|
|
}
|
|
|
|
// Model is the main Qwen3-Next model
type Model struct {
	model.Base
	tokenizer.Tokenizer

	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
	Output         *nn.Linear    `gguf:"output,alt:token_embd"` // falls back to tied embeddings

	Layers []Layer              `gguf:"blk"`
	Vision *qwen3vl.VisionModel `gguf:"v"` // nil for text-only checkpoints

	ImageProcessor *qwen3vl.ImageProcessor // nil for text-only checkpoints

	*Options

	// positionCache maps input index -> MRoPE time position. It is rebuilt
	// by PostTokenize, read by mapPosition/buildPositions, and invalidated
	// by Shift.
	positionCache []int32
	// Special vision token ids from GGUF metadata (with Qwen defaults).
	imageToken       int32
	visionStart      int32
	visionEnd        int32
	spatialMergeSize uint32
}
|
|
|
|
func (m *Model) mapPosition(id int32) int32 {
|
|
if id < int32(len(m.positionCache)) {
|
|
return m.positionCache[id]
|
|
}
|
|
if len(m.positionCache) > 0 {
|
|
return id - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
|
|
}
|
|
return id
|
|
}
|
|
|
|
// buildPositions produces the position tensor for a batch. Without MRoPE it
// is the flat 1-D list of batch positions; with MRoPE it is a 4-section
// [time, height, width, extra] layout with image tokens offset by their
// row/column within the (merged) vision grid.
func (m *Model) buildPositions(ctx ml.Context, batch input.Batch) ml.Tensor {
	if len(m.mropeSections) == 0 {
		return ctx.Input().FromInts(batch.Positions, len(batch.Positions))
	}

	// ggml MRoPE expects [time, height, width, extra] for each token.
	positionSlice := [][]int32{
		make([]int32, len(batch.Positions)),
		make([]int32, len(batch.Positions)),
		make([]int32, len(batch.Positions)),
		make([]int32, len(batch.Positions)),
	}

	// Initialize time/height/width to the remapped scalar position; the
	// fourth ("extra") section stays zero.
	for i, id := range batch.Positions {
		p := m.mapPosition(id)
		positionSlice[0][i] = p
		positionSlice[1][i] = p
		positionSlice[2][i] = p
	}

	if m.Vision != nil {
		for _, mi := range batch.Multimodal {
			grid, ok := mi.Multimodal[0].Data.(*qwen3vl.Grid)
			if !ok {
				continue
			}
			// Grid width in merged-patch units; image tokens appear to be
			// laid out row-major, so i/w is the row and i%w the column.
			w := max(1, grid.Width/int(m.spatialMergeSize))
			for i := range mi.Multimodal[0].Tensor.Dim(1) {
				positionSlice[1][mi.Index+i] += int32(i / w)
				positionSlice[2][mi.Index+i] += int32(i % w)
			}
		}
	}

	// Concatenate the four sections into one flat tensor.
	return ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))
}
|
|
|
|
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
|
if m.Vision == nil || m.ImageProcessor == nil || len(m.Vision.Layers) == 0 {
|
|
return nil, model.ErrNoVisionModel
|
|
}
|
|
|
|
img, _, err := image.Decode(bytes.NewReader(multimodalData))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
pixelValues, grid, err := m.ImageProcessor.ProcessImage(ctx, img)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
visionOutputs, deepstackVisualEmbeds := m.Vision.Forward(ctx, pixelValues, grid)
|
|
mm := []input.Multimodal{{Tensor: visionOutputs, Data: grid}}
|
|
for i := range deepstackVisualEmbeds {
|
|
mm = append(mm, input.Multimodal{Tensor: deepstackVisualEmbeds[i]})
|
|
}
|
|
|
|
return mm, nil
|
|
}
|
|
|
|
// PostTokenize expands each multimodal input into vision_start + N image
// placeholder tokens + vision_end, and records the MRoPE time position of
// every emitted input in m.positionCache (consumed later by buildPositions).
// All image tokens of one grid share a single time position; time then
// advances by the larger merged-grid side before vision_end.
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
	// Reset but keep the backing array across calls.
	m.positionCache = m.positionCache[:0]
	var result []*input.Input
	appendInput := func(inp *input.Input, position int32) {
		result = append(result, inp)
		m.positionCache = append(m.positionCache, position)
	}

	var p int32
	for _, inp := range inputs {
		if inp.Multimodal == nil {
			// Plain text token: consecutive time positions.
			appendInput(inp, p)
			p++
			continue
		}

		grid := inp.Multimodal[0].Data.(*qwen3vl.Grid)
		tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)

		// vision_start; SameBatch keeps the whole image span in one batch.
		appendInput(&input.Input{
			Token:     m.visionStart,
			SameBatch: tokensPerGrid + 1,
		}, p)
		p++

		// First image token carries the multimodal payload and hash...
		appendInput(&input.Input{
			Token:          m.imageToken,
			Multimodal:     inp.Multimodal,
			MultimodalHash: inp.MultimodalHash,
		}, p)

		// ...the remaining tokens are placeholders at the same time position
		// (spatial offsets are added later in buildPositions).
		for range tokensPerGrid - 1 {
			appendInput(&input.Input{
				Token: m.imageToken,
			}, p)
		}

		// Advance time by max(width, height) of the merged grid before the
		// closing token (Qwen VL MRoPE convention).
		gridSpan := max(grid.Width/int(m.spatialMergeSize), grid.Height/int(m.spatialMergeSize))
		p = p + int32(gridSpan)
		appendInput(&input.Input{
			Token: m.visionEnd,
		}, p)
		p++
	}

	return result, nil
}
|
|
|
|
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
|
positions := m.buildPositions(ctx, batch)
|
|
|
|
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
|
if len(batch.Multimodal) > 0 {
|
|
hiddenStates = hiddenStates.Duplicate(ctx)
|
|
|
|
var deepstackVisualEmbeds []ml.Tensor
|
|
for _, mi := range batch.Multimodal {
|
|
visionOutputs := mi.Multimodal[0].Tensor
|
|
ctx.Forward(visionOutputs.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
|
|
|
|
if len(mi.Multimodal[1:]) > len(deepstackVisualEmbeds) {
|
|
deepstackVisualEmbeds = append(deepstackVisualEmbeds, make([]ml.Tensor, len(mi.Multimodal[1:])-len(deepstackVisualEmbeds))...)
|
|
}
|
|
for i, mm := range mi.Multimodal[1:] {
|
|
if deepstackVisualEmbeds[i] == nil {
|
|
deepstackVisualEmbeds[i] = ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...)
|
|
}
|
|
ctx.Forward(mm.Tensor.Copy(ctx, deepstackVisualEmbeds[i].View(ctx, mi.Index*deepstackVisualEmbeds[i].Stride(1), mm.Tensor.Dim(0)*mm.Tensor.Dim(1))))
|
|
}
|
|
}
|
|
|
|
cache := m.Cache.(*HybridCache)
|
|
m.Options.masks = nil
|
|
for i, layer := range m.Layers {
|
|
cache.SetLayer(i)
|
|
|
|
var outputs ml.Tensor
|
|
if i == len(m.Layers)-1 {
|
|
outputs = batch.Outputs
|
|
}
|
|
|
|
var err error
|
|
hiddenStates, err = layer.Forward(ctx, i, hiddenStates, positions, outputs, cache, m.Options)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if i < len(deepstackVisualEmbeds) {
|
|
hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i])
|
|
}
|
|
}
|
|
|
|
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
|
|
return m.Output.Forward(ctx, hiddenStates), nil
|
|
}
|
|
|
|
cache := m.Cache.(*HybridCache)
|
|
|
|
// Masks are allocated lazily only for chunked recurrent prefill.
|
|
m.Options.masks = nil
|
|
|
|
for i, layer := range m.Layers {
|
|
cache.SetLayer(i)
|
|
|
|
var outputs ml.Tensor
|
|
if i == len(m.Layers)-1 {
|
|
outputs = batch.Outputs
|
|
}
|
|
|
|
var err error
|
|
hiddenStates, err = layer.Forward(ctx, i, hiddenStates, positions, outputs, cache, m.Options)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
|
|
return m.Output.Forward(ctx, hiddenStates), nil
|
|
}
|
|
|
|
// Shift re-applies rotary position embeddings to cached keys when the KV
// cache is shifted.
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
	// Invalidate the MRoPE position cache: after a shift the recorded
	// positions no longer match the cache contents, so buildPositions falls
	// back to raw batch positions. NOTE(review): confirm this is the
	// intended recovery path for multimodal prompts after a shift.
	m.positionCache = nil
	if len(m.mropeSections) > 0 {
		// MRoPE carries 4 position components per token; replicate the
		// scalar shift across all four sections.
		shift = shift.Repeat(ctx, 1, 4).Reshape(ctx, -1)
	}
	return m.applyRotaryPositionEmbeddings(ctx, key, shift), nil
}
|
|
|
|
// Compile-time interface conformance checks.
var (
	_ model.Model               = (*Model)(nil)
	_ model.MultimodalProcessor = (*Model)(nil)
)
|
|
|
|
// New constructs a qwen3next/qwen35/qwen35moe model from GGUF config: it
// classifies each layer as recurrent (GatedDeltaNet) or full attention from
// the per-layer KV head counts, picks dense vs MoE feedforward, reads RoPE /
// SSM / tokenizer metadata, optionally builds the vision tower, and wires up
// the hybrid cache.
func New(c fs.Config) (model.Model, error) {
	numLayers := int(c.Uint("block_count"))
	layers := make([]Layer, numLayers)

	// Get per-layer head counts (for detecting layer type)
	type headCounts interface {
		HeadCount() []uint64
		HeadCountKV() []uint64
	}

	var isRecurrent []bool
	var headCountKV []uint64
	if hc, ok := c.(headCounts); ok {
		headCountKV = hc.HeadCountKV()
	}

	isRecurrent = make([]bool, numLayers)
	hasZero := false
	hasFull := false
	for i := range numLayers {
		// If KV head count is 0, it's a recurrent layer
		if i < len(headCountKV) && headCountKV[i] == 0 {
			isRecurrent[i] = true
			hasZero = true
		} else if i < len(headCountKV) && headCountKV[i] > 0 {
			hasFull = true
		}
	}
	// Hybrid architecture requires both layer kinds to be present.
	if !hasZero || !hasFull {
		return nil, fmt.Errorf("qwen3next: invalid attention.head_count_kv array; expected mix of zero and non-zero values")
	}

	// Determine if MoE
	isMoE := c.Uint("expert_count") > 0

	for i := range layers {
		if isRecurrent[i] {
			layers[i].Operator = &GatedDeltaNet{Layer: i}
		} else {
			layers[i].Operator = &FullAttention{}
		}

		if isMoE {
			layers[i].MLP = &sparse{}
		} else {
			layers[i].MLP = &dense{}
		}
	}

	// MRoPE sections may live under several keys depending on the converter
	// version; first match wins. Only the first 4 sections are used.
	mropeSections := c.Ints("mrope_sections", nil)
	if len(mropeSections) == 0 {
		mropeSections = c.Ints("rope.mrope_section", nil)
	}
	if len(mropeSections) == 0 {
		mropeSections = c.Ints("rope.dimension_sections", nil)
	}
	if len(mropeSections) > 4 {
		mropeSections = mropeSections[:4]
	}

	// Prefer the scaling type; fall back to the generic rope type key.
	ropeType := c.String("rope.scaling.type")
	if ropeType == "" {
		ropeType = c.String("rope.type")
	}

	opts := &Options{
		hiddenSize: int(c.Uint("embedding_length")),
		numHeads:   int(c.Uint("attention.head_count")),
		// Use the first non-zero per-layer KV head count (full-attention
		// layers all share it; recurrent layers report 0).
		numKVHeads: func() int {
			for _, v := range headCountKV {
				if v > 0 {
					return int(v)
				}
			}
			return 0
		}(),
		keyLength:             int(c.Uint("attention.key_length")),
		valueLength:           int(c.Uint("attention.value_length")),
		ropeDim:               int(c.Uint("rope.dimension_count")),
		eps:                   c.Float("attention.layer_norm_rms_epsilon"),
		ropeType:              ropeType,
		ropeBase:              c.Float("rope.freq_base"),
		ropeScale:             c.Float("rope.scaling.factor", 1),
		originalContextLength: int(c.Uint("rope.scaling.original_context_length")),
		attentionScale:        float64(c.Float("attention.scale")),
		numExperts:            int(c.Uint("expert_count")),
		numExpertsUsed:        int(c.Uint("expert_used_count")),
		normTopKProb:          c.Bool("norm_top_k_prob", true),
		ssmDInner:             int(c.Uint("ssm.inner_size")),
		ssmDState:             int(c.Uint("ssm.state_size")),
		ssmNGroup:             int(c.Uint("ssm.group_count")),
		ssmDtRank:             int(c.Uint("ssm.time_step_rank")),
		convKernelSize:        int(c.Uint("ssm.conv_kernel")),
		vHeadReordered:        c.Bool("ssm.v_head_reordered", false),
		isRecurrent:           isRecurrent,
		// Convert []int32 metadata to []int.
		mropeSections: slices.Collect(func(yield func(int) bool) {
			for _, section := range mropeSections {
				if !yield(int(section)) {
					return
				}
			}
		}),
		mropeInterleaved: c.Bool("rope.mrope_interleaved", c.Bool("mrope_interleaved", false)),
	}
	if opts.numKVHeads == 0 {
		return nil, fmt.Errorf("qwen3next: attention.head_count_kv array must include at least one non-zero value")
	}

	// Calculate cache dimensions
	convDim := max(0, opts.convKernelSize-1)
	convChannels := opts.ssmDInner + 2*opts.ssmNGroup*opts.ssmDState
	headVDim := 0
	numVHeads := opts.ssmDtRank
	if numVHeads > 0 {
		headVDim = opts.ssmDInner / numVHeads
	}
	deltaStateSize := headVDim * headVDim * numVHeads

	// Validate dimension assumption: headKDim == headVDim is required for state computations
	headKDim := opts.ssmDState
	if headKDim != headVDim && headKDim > 0 && headVDim > 0 {
		return nil, fmt.Errorf("qwen3next: headKDim (%d) != headVDim (%d) not supported; state computations require equal dimensions", headKDim, headVDim)
	}

	// Vision tower is optional; only built when vision metadata is present.
	var vision *qwen3vl.VisionModel
	var imageProcessor *qwen3vl.ImageProcessor
	if c.Uint("vision.block_count", 0) > 0 {
		vision = qwen3vl.NewVisionModel(c)
		processor := qwen3vl.NewImageProcessor(c)
		imageProcessor = &processor
	}

	// Guard against an explicit zero (used as a divisor downstream).
	spatialMergeSize := c.Uint("vision.spatial_merge_size", 2)
	if spatialMergeSize == 0 {
		spatialMergeSize = 2
	}

	m := Model{
		Tokenizer: tokenizer.NewBytePairEncoding(
			&tokenizer.Vocabulary{
				Values: c.Strings("tokenizer.ggml.tokens"),
				Types:  c.Ints("tokenizer.ggml.token_type"),
				Merges: c.Strings("tokenizer.ggml.merges"),
				// Qwen3 tokenizers typically set add_bos_token=false and bos_token=null.
				// Default to false when the GGUF key is missing to avoid injecting a spurious BOS.
				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
				EOS: append(
					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
					c.Ints("tokenizer.ggml.eos_token_ids")...,
				),
			},
			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
		),
		Layers:         layers,
		Vision:         vision,
		ImageProcessor: imageProcessor,
		Options:        opts,
		// Defaults are the standard Qwen vision special-token ids.
		imageToken:       int32(c.Uint("image_token_id", 151655)),
		visionStart:      int32(c.Uint("vision_start_token_id", 151652)),
		visionEnd:        int32(c.Uint("vision_end_token_id", 151653)),
		spatialMergeSize: spatialMergeSize,
	}

	m.Cache = NewHybridCache(m.Shift, convDim, convChannels, deltaStateSize)
	return &m, nil
}
|
|
|
|
func init() {
|
|
model.Register("qwen35", New)
|
|
model.Register("qwen35moe", New)
|
|
model.Register("qwen3next", New)
|
|
}
|