ollama/convert/convert.go

package convert

import (
	"cmp"
	"encoding/json"
	"errors"
	"fmt"
	"io/fs"
	"iter"
	"log/slog"
	"maps"
	"os"
	"slices"
	"strings"

	ofs "github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/fs/ggml"
)

// ModelParameters holds the config.json fields that LoadModelMetadata reads
// itself, independently of the model-specific converter it decodes the same
// file into.
type ModelParameters struct {
	// Architectures is the "architectures" list; LoadModelMetadata selects the
	// converter from its first entry.
	Architectures []string `json:"architectures"`
	// VocabSize is the top-level "vocab_size". When it is zero,
	// TextModel.VocabSize is used as the expected vocabulary size instead.
	VocabSize     uint32   `json:"vocab_size"`

	// TODO is this needed?
	ModelType string `json:"model_type"`

	// TextModel mirrors the nested "text_config" object.
	TextModel struct {
		VocabSize  uint32 `json:"vocab_size"`
		HiddenSize uint32 `json:"hidden_size"`
		ModelType  string `json:"model_type"`
	} `json:"text_config"`
}

// AdapterParameters holds the fields decoded from an adapter's
// adapter_config.json.
type AdapterParameters struct {
	// Alpha is "lora_alpha"; KV uses it only when LoraParameters.Alpha is zero.
	Alpha          uint32 `json:"lora_alpha"`
	LoraLayers     uint32 `json:"lora_layers"`
	// LoraParameters mirrors the nested "lora_parameters" object.
	LoraParameters struct {
		Rank  uint32  `json:"rank"`
		Alpha float32 `json:"alpha"`
		Scale float32 `json:"scale"`
	} `json:"lora_parameters"`
}

// KV is a set of GGUF metadata entries keyed by name.
type KV map[string]any

// Architecture returns the string stored under "general.architecture", or
// "unknown" when the key is absent or does not hold a string.
func (kv KV) Architecture() string {
	return kv.String("general.architecture", "unknown")
}

// valueTypes lists the scalar types that keyValue can look up.
type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

// arrayValueTypes lists the slice types that keyValue can look up.
type arrayValueTypes interface {
	[]uint8 | []int8 | []uint16 | []int16 |
		[]uint32 | []int32 | []uint64 | []int64 |
		[]string | []float32 | []float64 | []bool
}

// keyValue looks up key in kv and returns its value when it is stored with
// exactly type T; no conversion between types is attempted.
//
// Keys that do not start with "tokenizer." or "general." are first prefixed
// with the architecture name and a dot, so "block_count" resolves to
// "<architecture>.block_count".
//
// When the key is absent or holds a different type, the first defaultValue is
// returned, or the zero value of T when no default was supplied. The boolean
// reports whether the key was found with the requested type.
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

	if val, ok := kv[key].(T); ok {
		return val, true
	}

	// Guard the variadic default: indexing an empty slice would panic.
	if len(defaultValue) == 0 {
		var zero T
		return zero, false
	}
	return defaultValue[0], false
}

// String returns the string stored under key (see keyValue for key
// resolution). On a miss it returns the first defaultValue, or "" when none
// is given.
func (kv KV) String(key string, defaultValue ...string) string {
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
}

// Uint returns the uint32 stored under key. On a miss it returns the first
// defaultValue, or 0 when none is given.
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
}

// Float returns the float32 stored under key. On a miss it returns the first
// defaultValue, or 0 when none is given.
func (kv KV) Float(key string, defaultValue ...float32) float32 {
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
}

// Bool returns the bool stored under key. On a miss it returns the first
// defaultValue, or false when none is given.
func (kv KV) Bool(key string, defaultValue ...bool) bool {
	val, _ := keyValue(kv, key, append(defaultValue, false)...)
	return val
}

// Strings returns the []string stored under key. On a miss it returns the
// first defaultValue, or a one-element slice holding "" when none is given.
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
	val, _ := keyValue(kv, key, append(defaultValue, []string{""})...)
	return val
}

// Ints returns the []int32 stored under key. On a miss it returns the first
// defaultValue, or a one-element slice holding 0 when none is given.
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
	val, _ := keyValue(kv, key, append(defaultValue, []int32{0})...)
	return val
}

// Uints returns the []uint32 stored under key. On a miss it returns the first
// defaultValue, or a one-element slice holding 0 when none is given.
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
	val, _ := keyValue(kv, key, append(defaultValue, []uint32{0})...)
	return val
}

// Floats returns the []float32 stored under key. On a miss it returns the
// first defaultValue, or a one-element slice holding 0 when none is given.
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
	val, _ := keyValue(kv, key, append(defaultValue, []float32{0})...)
	return val
}

// Bools returns the []bool stored under key. On a miss it returns the first
// defaultValue, or a one-element slice holding false when none is given.
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	val, _ := keyValue(kv, key, append(defaultValue, []bool{false})...)
	return val
}

// Len reports how many entries kv holds.
func (kv KV) Len() int {
	count := len(kv)
	return count
}

// Keys returns an iterator over the keys of kv, as produced by maps.Keys.
func (kv KV) Keys() iter.Seq[string] {
	keys := maps.Keys(kv)
	return keys
}

// Value returns the raw entry stored under key exactly as given (no
// architecture prefix is applied), or nil when the key is absent.
func (kv KV) Value(key string) any {
	entry := kv[key]
	return entry
}

// KV builds the general and tokenizer key-values derived from t.
//
// The merges and chat template entries are added only when t provides them.
// For every special vocabulary entry the add_<key>_token and <key>_token_id
// entries are set, plus <key>_token_ids when the entry lists several IDs.
func (ModelParameters) KV(t *Tokenizer) KV {
	vocab := t.Vocabulary
	out := KV{
		"general.file_type":            uint32(1),
		"general.quantization_version": uint32(2),
		"tokenizer.ggml.pre":           t.Pre,
		"tokenizer.ggml.model":         vocab.Model,
		"tokenizer.ggml.tokens":        vocab.Tokens,
		"tokenizer.ggml.scores":        vocab.Scores,
		"tokenizer.ggml.token_type":    vocab.Types,
	}

	if merges := t.Merges; len(merges) > 0 {
		out["tokenizer.ggml.merges"] = merges
	}

	if tmpl := t.Template; tmpl != "" {
		out["tokenizer.chat_template"] = tmpl
	}

	for _, special := range t.SpecialVocabulary {
		addKey := fmt.Sprintf("tokenizer.ggml.add_%s_token", special.Key())
		out[addKey] = special.AddToken

		idKey := fmt.Sprintf("tokenizer.ggml.%s_token_id", special.Key())
		out[idKey] = uint32(special.ID)

		if len(special.IDs) == 0 {
			continue
		}
		idsKey := fmt.Sprintf("tokenizer.ggml.%s_token_ids", special.Key())
		out[idsKey] = special.IDs
	}

	return out
}

// KV returns the key-values describing a LoRA adapter. The alpha value comes
// from LoraParameters.Alpha, falling back to the top-level Alpha when the
// nested value is zero.
func (p AdapterParameters) KV() KV {
	alpha := p.LoraParameters.Alpha
	if alpha == 0 {
		alpha = float32(p.Alpha)
	}

	return KV{
		"adapter.lora.alpha": alpha,
		"adapter.type":       "lora",
		"general.file_type":  uint32(1),
		"general.type":       "adapter",
		"general.version":    "v0.2",
	}
}

// specialTokenTypes returns the special token type names bos, eos, unk, sep,
// pad, cls and mask, in that order.
func (ModelParameters) specialTokenTypes() []string {
	names := []string{
		"bos",
		"eos",
		"unk",
		"sep",
		"pad",
		"cls",
		"mask",
	}
	return names
}

// ModelKV is implemented by types that can produce key-values from a parsed
// tokenizer. LoadModelMetadata returns the selected converter as a ModelKV.
type ModelKV interface {
	// KV maps parameters to LLM key-values
	KV(*Tokenizer) KV
}

// ModelConverter is the full set of methods a model-specific converter must
// provide. LoadModelMetadata decodes config.json into an implementation and
// ConvertModel uses it to produce the key-values and tensors that are written.
type ModelConverter interface {
	ModelKV

	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
	Tensors([]Tensor) []*ggml.Tensor
	// Replacements returns a list of string pairs to replace in tensor names.
	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
	Replacements() []string

	// specialTokenTypes returns any special token types the model uses
	specialTokenTypes() []string
}

// MultimodalConverter is an optional interface for models with embedded vision
// projectors. When a converter implements it, ConvertModel is given a non-nil
// projector file, and ProjectorTensors returns at least one tensor,
// ConvertModel splits the output into a text model GGUF and a separate
// projector GGUF.
type MultimodalConverter interface {
	ModelConverter
	// ProjectorKV returns KV metadata for the projector GGUF.
	ProjectorKV(*Tokenizer) KV
	// ProjectorTensors filters the full tensor list to only vision/projector tensors.
	ProjectorTensors([]Tensor) []*ggml.Tensor
	// TextTensors filters the full tensor list to only text model tensors (excluding vision).
	// The tokenizer is provided so implementations can truncate embeddings to the actual vocab size.
	TextTensors([]Tensor, *Tokenizer) []*ggml.Tensor
}

// vocabSizer optionally returns the maximum vocabulary size for a model.
// When a converter implements it, LoadModelMetadata truncates the tokenizer
// vocabulary to this size, but only if the size is positive and smaller than
// the number of tokens the tokenizer provides.
type vocabSizer interface {
	VocabSize() int
}

// moreParser is an optional interface for converters. When a converter
// implements it, LoadModelMetadata calls parseMore with the model file system
// after decoding config.json and before parsing the tokenizer; a returned
// error aborts loading.
type moreParser interface {
	parseMore(fs.FS) error
}

// AdapterConverter is the set of methods an architecture-specific adapter
// converter must provide. ConvertAdapter picks an implementation from the base
// model's architecture and decodes adapter_config.json into it.
type AdapterConverter interface {
	// KV maps parameters to LLM key-values
	KV(ofs.Config) KV
	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
	Tensors([]Tensor) []*ggml.Tensor
	// Replacements returns a list of string pairs to replace in tensor names.
	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
	Replacements() []string
}

// ConvertAdapter converts the adapter found in fsys and writes the result to f.
//
// adapter_config.json is read from fsys and must decode as AdapterParameters.
// The converter is chosen from baseKV.Architecture(); "llama" and "gemma2" are
// supported. Tensors are parsed from fsys with the converter's name
// replacements applied, the config is decoded into the converter, and the
// resulting key-values and tensors are written with writeFile.
//
// An error is returned when the config cannot be read or parsed, when the base
// architecture is empty or unsupported, or when parsing the tensors or writing
// the output fails.
func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ofs.Config) error {
	// The read error is returned as-is so callers can still inspect it directly.
	bts, err := fs.ReadFile(fsys, "adapter_config.json")
	if err != nil {
		return err
	}

	// Decode into the shared parameters first so a malformed config is
	// reported before any tensors are parsed.
	var p AdapterParameters
	if err := json.Unmarshal(bts, &p); err != nil {
		return fmt.Errorf("parse adapter_config.json: %w", err)
	}

	arch := baseKV.Architecture()
	if arch == "" {
		return errors.New("architecture not set for the base model")
	}

	var conv AdapterConverter
	switch arch {
	case "llama":
		conv = &llamaAdapter{}
	case "gemma2":
		conv = &gemma2Adapter{}
	default:
		// Name the offending architecture, matching LoadModelMetadata.
		return fmt.Errorf("unsupported architecture %q", arch)
	}

	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
	if err != nil {
		return err
	}

	if err := json.Unmarshal(bts, conv); err != nil {
		return fmt.Errorf("parse adapter_config.json for %q: %w", arch, err)
	}

	return writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
}

// LoadModelMetadata reads config.json from fsys, selects the converter that
// matches the first entry of its "architectures" list, decodes the config into
// that converter, and parses the tokenizer.
//
// The tokenizer vocabulary is then reconciled with the configured vocab size:
// it is truncated when the converter implements vocabSizer and asks for fewer
// tokens than the tokenizer holds, and padded with "[PAD<i>]" tokens when the
// config declares more tokens than the tokenizer provides.
//
// It returns the converter (as a ModelKV) and the tokenizer. An error is
// returned when config.json cannot be read or parsed, when no architecture is
// listed or the architecture is unsupported, or when parseMore or
// parseTokenizer fails.
func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
	bts, err := fs.ReadFile(fsys, "config.json")
	if err != nil {
		return nil, nil, err
	}
	// NOTE(review): presumably rewrites non-finite number literals so that
	// encoding/json accepts the file — confirm against sanitizeNonFiniteJSON.
	bts = sanitizeNonFiniteJSON(bts)

	// First pass: decode only the generic fields needed to pick a converter
	// and to know the expected vocabulary size.
	var p ModelParameters
	if err := json.Unmarshal(bts, &p); err != nil {
		return nil, nil, fmt.Errorf("parse config.json: %w", err)
	}

	if len(p.Architectures) < 1 {
		return nil, nil, errors.New("unknown architecture")
	}

	// Only the first listed architecture is considered.
	var conv ModelConverter
	switch p.Architectures[0] {
	case "LlamaForCausalLM":
		conv = &llamaModel{}
	case "MllamaForConditionalGeneration":
		conv = &mllamaModel{}
	case "Llama4ForConditionalGeneration":
		conv = &llama4Model{}
	case "Mistral3ForConditionalGeneration":
		conv = &mistral3Model{}
	case "Ministral3ForCausalLM":
		conv = &mistral3CausalModel{}
	case "MixtralForCausalLM":
		conv = &mixtralModel{}
	case "GemmaForCausalLM":
		conv = &gemmaModel{}
	case "Gemma2ForCausalLM":
		conv = &gemma2Model{}
	case "Gemma3ForCausalLM", "Gemma3ForConditionalGeneration", "Gemma3TextModel":
		conv = &gemma3Model{Architecture: p.Architectures[0]}
	case "Gemma3nForConditionalGeneration":
		conv = &gemma3nModel{}
	case "Gemma4ForCausalLM", "Gemma4ForConditionalGeneration":
		conv = &gemma4Model{Architecture: p.Architectures[0]}
	case "Phi3ForCausalLM":
		conv = &phi3Model{}
	case "Qwen2ForCausalLM":
		conv = &qwen2Model{}
	case "Qwen2_5_VLForConditionalGeneration":
		conv = &qwen25VLModel{}
	case "Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration":
		conv = &qwen3VLModel{}
	case "Olmo3ForCausalLM":
		conv = &olmoModel{}
	case "BertModel":
		conv = &bertModel{}
	case "NomicBertModel", "NomicBertMoEModel":
		conv = &nomicbertModel{}
	case "CohereForCausalLM":
		conv = &commandrModel{}
	case "GptOssForCausalLM":
		conv = &gptossModel{}
	case "DeepseekOCRForCausalLM":
		conv = &deepseekocr{}
	case "DeepseekV3ForCausalLM":
		conv = &deepseek2Model{}
	case "Glm4MoeLiteForCausalLM":
		conv = &glm4MoeLiteModel{}
	case "GlmOcrForConditionalGeneration":
		conv = &glmOcrModel{}
	case "Lfm2ForCausalLM", "Lfm2MoeForCausalLM":
		conv = &lfm2Model{}
	case "Lfm2VlForConditionalGeneration":
		conv = &lfm2VLTextModel{}
	case "Qwen3NextForCausalLM", "Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration":
		conv = &qwen3NextModel{}
	case "NemotronHForCausalLM":
		conv = &nemotronHModel{}
	default:
		return nil, nil, fmt.Errorf("unsupported architecture %q", p.Architectures[0])
	}

	// Second pass: decode the same bytes into the model-specific converter.
	if err := json.Unmarshal(bts, conv); err != nil {
		return nil, nil, fmt.Errorf("parse config.json for %q: %w", p.Architectures[0], err)
	}

	// Give converters that implement moreParser a chance to read further
	// files from fsys.
	if t, ok := conv.(moreParser); ok {
		if err := t.parseMore(fsys); err != nil {
			return nil, nil, err
		}
	}

	t, err := parseTokenizer(fsys, conv.specialTokenTypes())
	if err != nil {
		return nil, nil, err
	}

	// Allow converters to override the vocab size (e.g., to exclude multimodal tokens
	// from text-only models where tokenizer.json includes them but the model doesn't use them).
	// NOTE: if multiple models need this, consider making truncation the default behavior
	// in the vocabSize < len(tokens) case below instead of per-model opt-in.
	if vs, ok := conv.(vocabSizer); ok {
		if maxVocab := vs.VocabSize(); maxVocab > 0 && maxVocab < len(t.Vocabulary.Tokens) {
			slog.Debug("converter requested vocab truncation", "from", len(t.Vocabulary.Tokens), "to", maxVocab)
			// NOTE(review): slicing Scores and Types assumes they are at least
			// maxVocab long — confirm parseTokenizer keeps them the same length
			// as Tokens.
			t.Vocabulary.Tokens = t.Vocabulary.Tokens[:maxVocab]
			t.Vocabulary.Scores = t.Vocabulary.Scores[:maxVocab]
			t.Vocabulary.Types = t.Vocabulary.Types[:maxVocab]
			// Also update config so the padding logic below doesn't re-add the truncated tokens
			p.VocabSize = uint32(maxVocab)
			p.TextModel.VocabSize = uint32(maxVocab)
		}
	}

	// The top-level vocab_size wins; text_config.vocab_size is used only when
	// the top-level value is zero.
	vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))

	switch {
	case vocabSize == 0:
		slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
	case vocabSize > len(t.Vocabulary.Tokens):
		slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
		// Append placeholder tokens named [PAD0], [PAD1], ... until the
		// vocabulary reaches the configured size.
		for i := range vocabSize - len(t.Vocabulary.Tokens) {
			t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
			t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
			t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
		}
	case vocabSize < len(t.Vocabulary.Tokens):
		slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
		// NOTE(review): p is a local value that is not read after this switch,
		// so these two assignments appear to have no effect — confirm.
		p.VocabSize = uint32(len(t.Vocabulary.Tokens))
		p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
	default:
		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
	}
	return conv, t, nil
}

// ConvertModel writes an Ollama compatible model to f based on the
// configuration and files it finds in fsys.
//
// Metadata and the tokenizer come from LoadModelMetadata, and tensors are
// parsed from fsys with the converter's name replacements applied. When the
// converter implements MultimodalConverter, the first projectorFiles entry is
// non-nil, and the converter reports at least one projector tensor, the text
// model is written to f and the projector to that file. In every other case
// the whole model is written to f and any projector file is left untouched.
//
// Supported input model formats include safetensors.
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
func ConvertModel(fsys fs.FS, f *os.File, projectorFiles ...*os.File) error {
	kv, t, err := LoadModelMetadata(fsys)
	if err != nil {
		return err
	}

	// Use the checked form so a ModelKV that is not a full converter yields
	// an error instead of a panic.
	conv, ok := kv.(ModelConverter)
	if !ok {
		return fmt.Errorf("converter %T does not implement ModelConverter", kv)
	}

	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
	if err != nil {
		return err
	}

	// If the converter supports multimodal and a projector file is provided,
	// split into text model + projector
	if mc, ok := conv.(MultimodalConverter); ok && len(projectorFiles) > 0 && projectorFiles[0] != nil {
		projTensors := mc.ProjectorTensors(ts)
		if len(projTensors) > 0 {
			slog.Info("splitting multimodal model into text + projector")
			if err := writeFile(f, mc.KV(t), mc.TextTensors(ts, t)); err != nil {
				return err
			}
			return writeFile(projectorFiles[0], mc.ProjectorKV(t), projTensors)
		}
	}

	return writeFile(f, conv.KV(t), conv.Tensors(ts))
}

// writeFile reverses the dimension order of every tensor's shape and then
// writes kv and ts to f with ggml.WriteGGUF. Each tensor's Shape field is
// replaced by a reversed copy, so the slice it previously referenced is left
// unmodified.
func writeFile(f *os.File, kv KV, ts []*ggml.Tensor) error {
	for _, tensor := range ts {
		reversed := slices.Clone(tensor.Shape)
		slices.Reverse(reversed)
		tensor.Shape = reversed
	}
	return ggml.WriteGGUF(f, kv, ts)
}