mirror of
https://github.com/ollama/ollama.git
synced 2026-04-23 01:05:47 +02:00
Remove the vendored GGML and llama.cpp backend, CGO runner, Go model implementations, and sample. llama-server (built from upstream llama.cpp via FetchContent) is now the sole inference engine for GGUF-based models. (Safetensors-based models continue to run on the new MLX engine.) This allows us to more rapidly pick up new capabilities and fixes from llama.cpp as they come out. On Windows, this now requires recent AMD driver versions that support ROCm v7, as llama.cpp currently does not support building against v6.
227 lines
7.0 KiB
Go
227 lines
7.0 KiB
Go
package convert
|
|
|
|
import (
|
|
"cmp"
|
|
"encoding/json"
|
|
"io/fs"
|
|
"slices"
|
|
"strings"
|
|
|
|
"github.com/ollama/ollama/fs/ggml"
|
|
)
|
|
|
|
type qwen25VLModel struct {
|
|
qwen2Model
|
|
|
|
Preprocessor struct {
|
|
ImageMean []float32 `json:"image_mean"`
|
|
ImageStd []float32 `json:"image_std"`
|
|
MinPixels uint32 `json:"min_pixels"`
|
|
MaxPixels uint32 `json:"max_pixels"`
|
|
} `json:"-"`
|
|
|
|
VisionModel struct {
|
|
Depth uint32 `json:"depth"`
|
|
HiddenSize uint32 `json:"hidden_size"`
|
|
NumHeads uint32 `json:"num_heads"`
|
|
InChannels uint32 `json:"in_chans"`
|
|
PatchSize uint32 `json:"patch_size"`
|
|
SpatialMergeSize uint32 `json:"spatial_merge_size"`
|
|
SpatialPatchSize uint32 `json:"spatial_patch_size"`
|
|
WindowSize uint32 `json:"window_size"`
|
|
RMSNormEps float32 `json:"layer_norm_epsilon"`
|
|
RopeTheta float32 `json:"rope_theta"`
|
|
FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
|
|
TemporalPatchSize uint32 `json:"temporal_patch_size"`
|
|
IntermediateSize uint32 `json:"intermediate_size"`
|
|
ImageSize uint32 `json:"image_size"`
|
|
} `json:"vision_config"`
|
|
}
|
|
|
|
// Compile-time check that *qwen25VLModel satisfies MultimodalConverter.
var _ MultimodalConverter = (*qwen25VLModel)(nil)
|
|
|
|
func (q *qwen25VLModel) parseMore(fsys fs.FS) error {
|
|
bts, err := fs.ReadFile(fsys, "preprocessor_config.json")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return json.Unmarshal(bts, &q.Preprocessor)
|
|
}
|
|
|
|
func (q *qwen25VLModel) KV(t *Tokenizer) KV {
|
|
kv := q.ModelParameters.KV(t)
|
|
kv["general.architecture"] = "qwen2vl"
|
|
|
|
for k, v := range q.qwen2Model.KV(t) {
|
|
if strings.HasPrefix(k, "qwen2.") {
|
|
kv[strings.Replace(k, "qwen2.", "qwen2vl.", 1)] = v
|
|
}
|
|
}
|
|
|
|
// rope.dimension_sections — required by llama-server for M-RoPE
|
|
if len(q.RopeScaling.MropeSection) > 0 {
|
|
sections := make([]int32, 4)
|
|
copy(sections, q.RopeScaling.MropeSection)
|
|
kv["rope.dimension_sections"] = sections
|
|
}
|
|
|
|
return kv
|
|
}
|
|
|
|
// ProjectorKV returns KV metadata for the qwen2.5vl vision projector.
|
|
func (q *qwen25VLModel) ProjectorKV(t *Tokenizer) KV {
|
|
kv := KV{
|
|
"general.architecture": "clip",
|
|
"clip.projector_type": "qwen2.5vl_merger",
|
|
"clip.has_vision_encoder": true,
|
|
|
|
"clip.vision.block_count": cmp.Or(q.VisionModel.Depth, 32),
|
|
"clip.vision.embedding_length": q.VisionModel.HiddenSize,
|
|
"clip.vision.feed_forward_length": cmp.Or(q.VisionModel.IntermediateSize, q.VisionModel.HiddenSize*4),
|
|
"clip.vision.attention.head_count": cmp.Or(q.VisionModel.NumHeads, 16),
|
|
"clip.vision.attention.layer_norm_epsilon": cmp.Or(q.VisionModel.RMSNormEps, 1e-6),
|
|
"clip.vision.num_channels": q.VisionModel.InChannels,
|
|
"clip.vision.patch_size": cmp.Or(q.VisionModel.PatchSize, 14),
|
|
"clip.vision.spatial_merge_size": cmp.Or(q.VisionModel.SpatialMergeSize, 2),
|
|
"clip.vision.image_size": cmp.Or(q.VisionModel.ImageSize, 560),
|
|
"clip.vision.projection_dim": q.HiddenSize, // text model hidden_size
|
|
"clip.vision.temporal_patch_size": cmp.Or(q.VisionModel.TemporalPatchSize, 2),
|
|
"clip.vision.n_wa_pattern": cmp.Or(q.VisionModel.WindowSize, 112) / cmp.Or(q.VisionModel.PatchSize, 14),
|
|
"clip.use_silu": true,
|
|
"clip.vision.fullatt_block_indexes": q.VisionModel.FullAttentionBlocks,
|
|
"clip.vision.rope.freq_base": cmp.Or(q.VisionModel.RopeTheta, 1e4),
|
|
}
|
|
|
|
if q.VisionModel.FullAttentionBlocks == nil {
|
|
kv["clip.vision.fullatt_block_indexes"] = []int32{7, 15, 23, 31}
|
|
}
|
|
|
|
if len(q.Preprocessor.ImageMean) == 3 {
|
|
kv["clip.vision.image_mean"] = q.Preprocessor.ImageMean
|
|
}
|
|
if len(q.Preprocessor.ImageStd) == 3 {
|
|
kv["clip.vision.image_std"] = q.Preprocessor.ImageStd
|
|
}
|
|
if q.Preprocessor.MinPixels > 0 {
|
|
kv["clip.vision.min_pixels"] = q.Preprocessor.MinPixels
|
|
}
|
|
if q.Preprocessor.MaxPixels > 0 {
|
|
kv["clip.vision.max_pixels"] = q.Preprocessor.MaxPixels
|
|
}
|
|
|
|
return kv
|
|
}
|
|
|
|
// isQwen25VLVisionTensor reports whether name starts with "v." or "mm.", the
// prefixes this converter treats as vision/merger tensors rather than text
// tensors.
func isQwen25VLVisionTensor(name string) bool {
	for _, prefix := range [...]string{"v.", "mm."} {
		if strings.HasPrefix(name, prefix) {
			return true
		}
	}
	return false
}
|
|
|
|
// TextTensors returns only text model tensors (no vision/merger).
|
|
func (q *qwen25VLModel) TextTensors(ts []Tensor, t *Tokenizer) []*ggml.Tensor {
|
|
var textOnly []Tensor
|
|
for _, tensor := range ts {
|
|
if !isQwen25VLVisionTensor(tensor.Name()) {
|
|
textOnly = append(textOnly, tensor)
|
|
}
|
|
}
|
|
return q.qwen2Model.Tensors(textOnly)
|
|
}
|
|
|
|
// ProjectorTensors returns only vision/merger tensors.
|
|
func (q *qwen25VLModel) ProjectorTensors(ts []Tensor) []*ggml.Tensor {
|
|
var out []*ggml.Tensor
|
|
|
|
for _, t := range ts {
|
|
if !isQwen25VLVisionTensor(t.Name()) {
|
|
continue
|
|
}
|
|
|
|
name := t.Name()
|
|
|
|
// Split patch_embed.proj along temporal dimension into two 4D tensors
|
|
// First: v.patch_embd.weight, Second: v.patch_embd.weight.1
|
|
if strings.Contains(name, "patch_embed.proj") {
|
|
idx := 0
|
|
for t := range splitDim(t, 2,
|
|
split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd")},
|
|
split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd")},
|
|
) {
|
|
t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
|
|
if idx == 1 {
|
|
// Second temporal slice: append .1 before extension
|
|
// v.patch_embd.weight → v.patch_embd.weight.1
|
|
t.Name = t.Name + ".1"
|
|
}
|
|
out = append(out, t)
|
|
idx++
|
|
}
|
|
continue
|
|
}
|
|
|
|
// Split fused qkv into separate q, k, v
|
|
if strings.Contains(name, "attn.qkv") {
|
|
out = append(out, slices.Collect(splitDim(t, 0,
|
|
split{Replacer: strings.NewReplacer("attn.qkv", "attn_q")},
|
|
split{Replacer: strings.NewReplacer("attn.qkv", "attn_k")},
|
|
split{Replacer: strings.NewReplacer("attn.qkv", "attn_v")},
|
|
))...)
|
|
continue
|
|
}
|
|
|
|
out = append(out, &ggml.Tensor{
|
|
Name: name,
|
|
Kind: t.Kind(),
|
|
Shape: t.Shape(),
|
|
WriterTo: t,
|
|
})
|
|
}
|
|
|
|
return out
|
|
}
|
|
|
|
func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|
var out []*ggml.Tensor
|
|
|
|
for _, t := range ts {
|
|
if strings.Contains(t.Name(), "patch_embed.proj") {
|
|
for t := range splitDim(t, 2,
|
|
split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_0")},
|
|
split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_1")},
|
|
) {
|
|
t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
|
|
out = append(out, t)
|
|
}
|
|
} else if strings.Contains(t.Name(), "attn.qkv") {
|
|
out = append(out, slices.Collect(splitDim(t, 0,
|
|
split{Replacer: strings.NewReplacer("attn.qkv", "attn_q")},
|
|
split{Replacer: strings.NewReplacer("attn.qkv", "attn_k")},
|
|
split{Replacer: strings.NewReplacer("attn.qkv", "attn_v")},
|
|
))...)
|
|
} else {
|
|
out = append(out, &ggml.Tensor{
|
|
Name: t.Name(),
|
|
Kind: t.Kind(),
|
|
Shape: t.Shape(),
|
|
WriterTo: t,
|
|
})
|
|
}
|
|
}
|
|
|
|
return out
|
|
}
|
|
|
|
func (p *qwen25VLModel) Replacements() []string {
|
|
return append(
|
|
p.qwen2Model.Replacements(),
|
|
// Merger (multimodal projector) — must come before "visual" → "v" to match full path
|
|
"visual.merger.mlp", "mm",
|
|
"visual.merger.ln_q", "v.post_ln",
|
|
// Vision encoder
|
|
"visual", "v",
|
|
"blocks", "blk",
|
|
"attn.proj", "attn_out",
|
|
"norm1", "ln1",
|
|
"norm2", "ln2",
|
|
)
|
|
}
|