mirror of
https://github.com/ollama/ollama.git
synced 2026-04-23 17:29:54 +02:00
Remove the vendored GGML and llama.cpp backend, CGO runner, Go model implementations, and sample. llama-server (built from upstream llama.cpp via FetchContent) is now the sole inference engine for GGUF-based models. (Safetensors-based models continue to run on the new MLX engine.) This allows us to pick up new capabilities and fixes from llama.cpp more rapidly as they come out. On Windows, this now requires recent AMD driver versions that support ROCm v7, as llama.cpp currently does not support building against v6.
445 lines
13 KiB
Go
445 lines
13 KiB
Go
package convert
|
|
|
|
import (
|
|
"cmp"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io/fs"
|
|
"iter"
|
|
"log/slog"
|
|
"maps"
|
|
"os"
|
|
"slices"
|
|
"strings"
|
|
|
|
ofs "github.com/ollama/ollama/fs"
|
|
"github.com/ollama/ollama/fs/ggml"
|
|
)
|
|
|
|
// ModelParameters holds the subset of a model's config.json that the
// converter entry points in this file read directly: the architecture list
// used to pick a converter and the vocabulary sizes used to reconcile the
// tokenizer with the model.
type ModelParameters struct {
	// Architectures is the "architectures" list; its first entry selects the converter.
	Architectures []string `json:"architectures"`

	// VocabSize is the top-level "vocab_size".
	VocabSize uint32 `json:"vocab_size"`

	// TODO is this needed?
	ModelType string `json:"model_type"`

	// TextModel mirrors the nested "text_config" object. Its VocabSize is
	// consulted when the top-level VocabSize is zero.
	TextModel struct {
		VocabSize  uint32 `json:"vocab_size"`
		HiddenSize uint32 `json:"hidden_size"`
		ModelType  string `json:"model_type"`
	} `json:"text_config"`
}
|
|
|
|
// AdapterParameters holds the LoRA settings decoded from an adapter's
// adapter_config.json. Alpha can appear either at the top level
// ("lora_alpha") or inside "lora_parameters"; the KV method prefers the
// nested value when it is non-zero.
type AdapterParameters struct {
	// Alpha is the top-level "lora_alpha" value.
	Alpha uint32 `json:"lora_alpha"`

	// LoraLayers is the "lora_layers" value.
	LoraLayers uint32 `json:"lora_layers"`

	// LoraParameters mirrors the nested "lora_parameters" object.
	LoraParameters struct {
		Rank  uint32  `json:"rank"`
		Alpha float32 `json:"alpha"`
		Scale float32 `json:"scale"`
	} `json:"lora_parameters"`
}
|
|
|
|
type KV map[string]any
|
|
|
|
func (kv KV) Architecture() string {
|
|
return kv.String("general.architecture", "unknown")
|
|
}
|
|
|
|
type valueTypes interface {
|
|
uint8 | int8 | uint16 | int16 |
|
|
uint32 | int32 | uint64 | int64 |
|
|
string | float32 | float64 | bool
|
|
}
|
|
|
|
type arrayValueTypes interface {
|
|
[]uint8 | []int8 | []uint16 | []int16 |
|
|
[]uint32 | []int32 | []uint64 | []int64 |
|
|
[]string | []float32 | []float64 | []bool
|
|
}
|
|
|
|
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
|
|
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
|
|
key = kv.Architecture() + "." + key
|
|
}
|
|
|
|
if val, ok := kv[key].(T); ok {
|
|
return val, true
|
|
}
|
|
return defaultValue[0], false
|
|
}
|
|
|
|
func (kv KV) String(key string, defaultValue ...string) string {
|
|
val, _ := keyValue(kv, key, append(defaultValue, "")...)
|
|
return val
|
|
}
|
|
|
|
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
|
|
val, _ := keyValue(kv, key, append(defaultValue, 0)...)
|
|
return val
|
|
}
|
|
|
|
func (kv KV) Float(key string, defaultValue ...float32) float32 {
|
|
val, _ := keyValue(kv, key, append(defaultValue, 0)...)
|
|
return val
|
|
}
|
|
|
|
func (kv KV) Bool(key string, defaultValue ...bool) bool {
|
|
val, _ := keyValue(kv, key, append(defaultValue, false)...)
|
|
return val
|
|
}
|
|
|
|
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
|
|
val, _ := keyValue(kv, key, append(defaultValue, []string{""})...)
|
|
return val
|
|
}
|
|
|
|
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
|
|
val, _ := keyValue(kv, key, append(defaultValue, []int32{0})...)
|
|
return val
|
|
}
|
|
|
|
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
|
|
val, _ := keyValue(kv, key, append(defaultValue, []uint32{0})...)
|
|
return val
|
|
}
|
|
|
|
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
|
|
val, _ := keyValue(kv, key, append(defaultValue, []float32{0})...)
|
|
return val
|
|
}
|
|
|
|
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
|
|
val, _ := keyValue(kv, key, append(defaultValue, []bool{false})...)
|
|
return val
|
|
}
|
|
|
|
func (kv KV) Len() int {
|
|
return len(kv)
|
|
}
|
|
|
|
func (kv KV) Keys() iter.Seq[string] {
|
|
return maps.Keys(kv)
|
|
}
|
|
|
|
func (kv KV) Value(key string) any {
|
|
return kv[key]
|
|
}
|
|
|
|
func (ModelParameters) KV(t *Tokenizer) KV {
|
|
kv := KV{
|
|
"general.file_type": uint32(1),
|
|
"general.quantization_version": uint32(2),
|
|
"tokenizer.ggml.pre": t.Pre,
|
|
"tokenizer.ggml.model": t.Vocabulary.Model,
|
|
"tokenizer.ggml.tokens": t.Vocabulary.Tokens,
|
|
"tokenizer.ggml.scores": t.Vocabulary.Scores,
|
|
"tokenizer.ggml.token_type": t.Vocabulary.Types,
|
|
}
|
|
|
|
if len(t.Merges) > 0 {
|
|
kv["tokenizer.ggml.merges"] = t.Merges
|
|
}
|
|
|
|
if t.Template != "" {
|
|
kv["tokenizer.chat_template"] = t.Template
|
|
}
|
|
|
|
for _, sv := range t.SpecialVocabulary {
|
|
kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
|
|
kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
|
|
if len(sv.IDs) > 0 {
|
|
kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
|
|
}
|
|
}
|
|
|
|
return kv
|
|
}
|
|
|
|
func (p AdapterParameters) KV() KV {
|
|
var alpha float32
|
|
if p.LoraParameters.Alpha == 0 {
|
|
alpha = float32(p.Alpha)
|
|
} else {
|
|
alpha = p.LoraParameters.Alpha
|
|
}
|
|
|
|
kv := KV{
|
|
"adapter.lora.alpha": alpha,
|
|
"adapter.type": "lora",
|
|
"general.file_type": uint32(1),
|
|
"general.type": "adapter",
|
|
"general.version": "v0.2",
|
|
}
|
|
|
|
return kv
|
|
}
|
|
|
|
func (ModelParameters) specialTokenTypes() []string {
|
|
return []string{
|
|
"bos", "eos", "unk", "sep", "pad", "cls", "mask",
|
|
}
|
|
}
|
|
|
|
type ModelKV interface {
|
|
// KV maps parameters to LLM key-values
|
|
KV(*Tokenizer) KV
|
|
}
|
|
|
|
type ModelConverter interface {
|
|
ModelKV
|
|
|
|
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
|
|
Tensors([]Tensor) []*ggml.Tensor
|
|
// Replacements returns a list of string pairs to replace in tensor names.
|
|
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
|
|
Replacements() []string
|
|
|
|
// specialTokenTypes returns any special token types the model uses
|
|
specialTokenTypes() []string
|
|
}
|
|
|
|
// MultimodalConverter is an optional interface for models with embedded vision
|
|
// projectors. When implemented, ConvertModel splits the output into a text
|
|
// model GGUF and a separate projector GGUF.
|
|
type MultimodalConverter interface {
|
|
ModelConverter
|
|
// ProjectorKV returns KV metadata for the projector GGUF.
|
|
ProjectorKV(*Tokenizer) KV
|
|
// ProjectorTensors filters the full tensor list to only vision/projector tensors.
|
|
ProjectorTensors([]Tensor) []*ggml.Tensor
|
|
// TextTensors filters the full tensor list to only text model tensors (excluding vision).
|
|
// The tokenizer is provided so implementations can truncate embeddings to the actual vocab size.
|
|
TextTensors([]Tensor, *Tokenizer) []*ggml.Tensor
|
|
}
|
|
|
|
// vocabSizer is an optional converter interface reporting the largest
// vocabulary size the model supports. LoadModelMetadata truncates the
// tokenizer vocabulary to this size when the reported value is positive and
// smaller than the number of tokens.
type vocabSizer interface {
	VocabSize() int
}
|
|
|
|
// moreParser is an optional converter interface. When a converter implements
// it, LoadModelMetadata calls parseMore with the model's file system after
// config.json has been decoded into the converter.
type moreParser interface {
	parseMore(fs.FS) error
}
|
|
|
|
type AdapterConverter interface {
|
|
// KV maps parameters to LLM key-values
|
|
KV(ofs.Config) KV
|
|
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
|
|
Tensors([]Tensor) []*ggml.Tensor
|
|
// Replacements returns a list of string pairs to replace in tensor names.
|
|
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
|
|
Replacements() []string
|
|
}
|
|
|
|
func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ofs.Config) error {
|
|
bts, err := fs.ReadFile(fsys, "adapter_config.json")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var p AdapterParameters
|
|
if err := json.Unmarshal(bts, &p); err != nil {
|
|
return err
|
|
}
|
|
|
|
arch := baseKV.Architecture()
|
|
if arch == "" {
|
|
return errors.New("architecture not set for the base model")
|
|
}
|
|
|
|
var conv AdapterConverter
|
|
switch arch {
|
|
case "llama":
|
|
conv = &llamaAdapter{}
|
|
case "gemma2":
|
|
conv = &gemma2Adapter{}
|
|
default:
|
|
return errors.New("unsupported architecture")
|
|
}
|
|
|
|
ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := json.Unmarshal(bts, conv); err != nil {
|
|
return err
|
|
}
|
|
|
|
return writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
|
|
}
|
|
|
|
func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
|
|
bts, err := fs.ReadFile(fsys, "config.json")
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
bts = sanitizeNonFiniteJSON(bts)
|
|
|
|
var p ModelParameters
|
|
if err := json.Unmarshal(bts, &p); err != nil {
|
|
return nil, nil, fmt.Errorf("parse config.json: %w", err)
|
|
}
|
|
|
|
if len(p.Architectures) < 1 {
|
|
return nil, nil, errors.New("unknown architecture")
|
|
}
|
|
|
|
var conv ModelConverter
|
|
switch p.Architectures[0] {
|
|
case "LlamaForCausalLM":
|
|
conv = &llamaModel{}
|
|
case "MllamaForConditionalGeneration":
|
|
conv = &mllamaModel{}
|
|
case "Llama4ForConditionalGeneration":
|
|
conv = &llama4Model{}
|
|
case "Mistral3ForConditionalGeneration":
|
|
conv = &mistral3Model{}
|
|
case "Ministral3ForCausalLM":
|
|
conv = &mistral3CausalModel{}
|
|
case "MixtralForCausalLM":
|
|
conv = &mixtralModel{}
|
|
case "GemmaForCausalLM":
|
|
conv = &gemmaModel{}
|
|
case "Gemma2ForCausalLM":
|
|
conv = &gemma2Model{}
|
|
case "Gemma3ForCausalLM", "Gemma3ForConditionalGeneration", "Gemma3TextModel":
|
|
conv = &gemma3Model{Architecture: p.Architectures[0]}
|
|
case "Gemma3nForConditionalGeneration":
|
|
conv = &gemma3nModel{}
|
|
case "Gemma4ForCausalLM", "Gemma4ForConditionalGeneration":
|
|
conv = &gemma4Model{Architecture: p.Architectures[0]}
|
|
case "Phi3ForCausalLM":
|
|
conv = &phi3Model{}
|
|
case "Qwen2ForCausalLM":
|
|
conv = &qwen2Model{}
|
|
case "Qwen2_5_VLForConditionalGeneration":
|
|
conv = &qwen25VLModel{}
|
|
case "Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration":
|
|
conv = &qwen3VLModel{}
|
|
case "Olmo3ForCausalLM":
|
|
conv = &olmoModel{}
|
|
case "BertModel":
|
|
conv = &bertModel{}
|
|
case "NomicBertModel", "NomicBertMoEModel":
|
|
conv = &nomicbertModel{}
|
|
case "CohereForCausalLM":
|
|
conv = &commandrModel{}
|
|
case "GptOssForCausalLM":
|
|
conv = &gptossModel{}
|
|
case "DeepseekOCRForCausalLM":
|
|
conv = &deepseekocr{}
|
|
case "DeepseekV3ForCausalLM":
|
|
conv = &deepseek2Model{}
|
|
case "Glm4MoeLiteForCausalLM":
|
|
conv = &glm4MoeLiteModel{}
|
|
case "GlmOcrForConditionalGeneration":
|
|
conv = &glmOcrModel{}
|
|
case "Lfm2ForCausalLM", "Lfm2MoeForCausalLM":
|
|
conv = &lfm2Model{}
|
|
case "Lfm2VlForConditionalGeneration":
|
|
conv = &lfm2VLTextModel{}
|
|
case "Qwen3NextForCausalLM", "Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration":
|
|
conv = &qwen3NextModel{}
|
|
case "NemotronHForCausalLM":
|
|
conv = &nemotronHModel{}
|
|
default:
|
|
return nil, nil, fmt.Errorf("unsupported architecture %q", p.Architectures[0])
|
|
}
|
|
|
|
if err := json.Unmarshal(bts, conv); err != nil {
|
|
return nil, nil, fmt.Errorf("parse config.json for %q: %w", p.Architectures[0], err)
|
|
}
|
|
|
|
if t, ok := conv.(moreParser); ok {
|
|
if err := t.parseMore(fsys); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
}
|
|
|
|
t, err := parseTokenizer(fsys, conv.specialTokenTypes())
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
|
|
// Allow converters to override the vocab size (e.g., to exclude multimodal tokens
|
|
// from text-only models where tokenizer.json includes them but the model doesn't use them).
|
|
// NOTE: if multiple models need this, consider making truncation the default behavior
|
|
// in the vocabSize < len(tokens) case below instead of per-model opt-in.
|
|
if vs, ok := conv.(vocabSizer); ok {
|
|
if maxVocab := vs.VocabSize(); maxVocab > 0 && maxVocab < len(t.Vocabulary.Tokens) {
|
|
slog.Debug("converter requested vocab truncation", "from", len(t.Vocabulary.Tokens), "to", maxVocab)
|
|
t.Vocabulary.Tokens = t.Vocabulary.Tokens[:maxVocab]
|
|
t.Vocabulary.Scores = t.Vocabulary.Scores[:maxVocab]
|
|
t.Vocabulary.Types = t.Vocabulary.Types[:maxVocab]
|
|
// Also update config so the padding logic below doesn't re-add the truncated tokens
|
|
p.VocabSize = uint32(maxVocab)
|
|
p.TextModel.VocabSize = uint32(maxVocab)
|
|
}
|
|
}
|
|
|
|
vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
|
|
|
|
switch {
|
|
case vocabSize == 0:
|
|
slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
|
|
case vocabSize > len(t.Vocabulary.Tokens):
|
|
slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
|
|
for i := range vocabSize - len(t.Vocabulary.Tokens) {
|
|
t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
|
|
t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
|
|
t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
|
|
}
|
|
case vocabSize < len(t.Vocabulary.Tokens):
|
|
slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
|
|
p.VocabSize = uint32(len(t.Vocabulary.Tokens))
|
|
p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
|
|
default:
|
|
slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
|
|
}
|
|
return conv, t, nil
|
|
}
|
|
|
|
// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
|
|
// and files it finds in the input path.
|
|
// Supported input model formats include safetensors.
|
|
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
|
|
func ConvertModel(fsys fs.FS, f *os.File, projectorFiles ...*os.File) error {
|
|
kv, t, err := LoadModelMetadata(fsys)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
conv := kv.(ModelConverter)
|
|
|
|
ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// If the converter supports multimodal and a projector file is provided,
|
|
// split into text model + projector
|
|
if mc, ok := conv.(MultimodalConverter); ok && len(projectorFiles) > 0 && projectorFiles[0] != nil {
|
|
projTensors := mc.ProjectorTensors(ts)
|
|
if len(projTensors) > 0 {
|
|
slog.Info("splitting multimodal model into text + projector")
|
|
if err := writeFile(f, mc.KV(t), mc.TextTensors(ts, t)); err != nil {
|
|
return err
|
|
}
|
|
return writeFile(projectorFiles[0], mc.ProjectorKV(t), projTensors)
|
|
}
|
|
}
|
|
|
|
return writeFile(f, conv.KV(t), conv.Tensors(ts))
|
|
}
|
|
|
|
func writeFile(f *os.File, kv KV, ts []*ggml.Tensor) error {
|
|
for i := range ts {
|
|
ts[i].Shape = slices.Clone(ts[i].Shape)
|
|
slices.Reverse(ts[i].Shape)
|
|
}
|
|
return ggml.WriteGGUF(f, kv, ts)
|
|
}
|