ollama/x/create/create.go
Daniel Hiltgen 30fdd229a4 create: Clean up experimental paths, fix create from existing safetensors model (#14679)
* create: Clean up experimental paths

This cleans up the experimental features and adds both unit and integration test coverage to verify no regressions.

* create: preserve config and layer names when creating from safetensors models

When creating a model FROM an existing safetensors model, ModelFormat,
Capabilities, and layer Name fields were lost. ModelFormat stayed empty
because it's only set from GGML layers (which safetensors models lack),
and layer names weren't copied in parseFromModel. This caused derived
models to fail loading ("config.json not found in manifest").
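
A minimal sketch of the shape of the fix, assuming simplified names (the real patch lives in parseFromModel and the create path; src, dst, and the config variables below are illustrative, not the exact code):

	// Illustrative only: when deriving from an existing model, copy the
	// layer Name fields and config metadata instead of dropping them.
	for _, l := range src.Layers {
		dst.Layers = append(dst.Layers, ManifestLayer{
			MediaType: l.MediaType,
			Digest:    l.Digest,
			Size:      l.Size,
			Name:      l.Name, // previously lost, breaking the config.json lookup
		})
	}
	dstConfig.ModelFormat = srcConfig.ModelFormat // no GGML layer to derive this from
	dstConfig.Capabilities = srcConfig.Capabilities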

* review comments
2026-04-07 08:12:57 -07:00


package create

import (
	"encoding/json"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"regexp"
	"slices"
	"sort"
	"strconv"
	"strings"

	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/x/safetensors"
)

// ModelConfig represents the config blob stored with a model.
type ModelConfig struct {
	ModelFormat  string   `json:"model_format"`
	Capabilities []string `json:"capabilities"`
}

// Manifest represents the manifest JSON structure.
type Manifest struct {
	SchemaVersion int             `json:"schemaVersion"`
	MediaType     string          `json:"mediaType"`
	Config        ManifestLayer   `json:"config"`
	Layers        []ManifestLayer `json:"layers"`
}

// ManifestLayer represents a layer in the manifest.
type ManifestLayer struct {
	MediaType string `json:"mediaType"`
	Digest    string `json:"digest"`
	Size      int64  `json:"size"`
	Name      string `json:"name,omitempty"`
}

// defaultManifestDir returns the manifest storage directory.
func defaultManifestDir() string {
	return filepath.Join(envconfig.Models(), "manifests")
}

// defaultBlobDir returns the blob storage directory.
func defaultBlobDir() string {
	return filepath.Join(envconfig.Models(), "blobs")
}

// resolveManifestPath converts a model name to a manifest file path.
host := "registry.ollama.ai"
namespace := "library"
name := modelName
tag := "latest"
if idx := strings.LastIndex(name, ":"); idx != -1 {
tag = name[idx+1:]
name = name[:idx]
}
parts := strings.Split(name, "/")
switch len(parts) {
case 3:
host = parts[0]
namespace = parts[1]
name = parts[2]
case 2:
namespace = parts[0]
name = parts[1]
}
return filepath.Join(defaultManifestDir(), host, namespace, name, tag)
}
// loadManifest loads a manifest for the given model name.
func loadManifest(modelName string) (*Manifest, error) {
manifestPath := resolveManifestPath(modelName)
data, err := os.ReadFile(manifestPath)
if err != nil {
return nil, err
}
var manifest Manifest
if err := json.Unmarshal(data, &manifest); err != nil {
return nil, err
}
return &manifest, nil
}
// loadModelConfig loads the config blob for a model.
func loadModelConfig(modelName string) (*ModelConfig, error) {
manifest, err := loadManifest(modelName)
if err != nil {
return nil, err
}
// Read the config blob
blobName := strings.Replace(manifest.Config.Digest, ":", "-", 1)
blobPath := filepath.Join(defaultBlobDir(), blobName)
data, err := os.ReadFile(blobPath)
if err != nil {
return nil, err
}
var config ModelConfig
if err := json.Unmarshal(data, &config); err != nil {
return nil, err
}
return &config, nil
}
// IsSafetensorsModel checks if a model was created with the experimental
// safetensors builder by checking the model format in the config.
func IsSafetensorsModel(modelName string) bool {
config, err := loadModelConfig(modelName)
if err != nil {
return false
}
return config.ModelFormat == "safetensors"
}
// IsSafetensorsLLMModel checks if a model is a safetensors LLM model
// (has completion capability, not image generation).
func IsSafetensorsLLMModel(modelName string) bool {
config, err := loadModelConfig(modelName)
if err != nil {
return false
}
return config.ModelFormat == "safetensors" && slices.Contains(config.Capabilities, "completion")
}
// IsImageGenModel checks if a model is an image generation model
// (has image capability).
func IsImageGenModel(modelName string) bool {
config, err := loadModelConfig(modelName)
if err != nil {
return false
}
return config.ModelFormat == "safetensors" && slices.Contains(config.Capabilities, "image")
}
// GetModelArchitecture returns the architecture from the model's config.json layer.
func GetModelArchitecture(modelName string) (string, error) {
manifest, err := loadManifest(modelName)
if err != nil {
return "", err
}
// Find the config.json layer
for _, layer := range manifest.Layers {
if layer.Name == "config.json" && layer.MediaType == "application/vnd.ollama.image.json" {
blobName := strings.Replace(layer.Digest, ":", "-", 1)
blobPath := filepath.Join(defaultBlobDir(), blobName)
data, err := os.ReadFile(blobPath)
if err != nil {
return "", err
}
var cfg struct {
Architectures []string `json:"architectures"`
ModelType string `json:"model_type"`
}
if err := json.Unmarshal(data, &cfg); err != nil {
return "", err
}
// Prefer model_type, fall back to first architecture
if cfg.ModelType != "" {
return cfg.ModelType, nil
}
if len(cfg.Architectures) > 0 {
return cfg.Architectures[0], nil
}
}
}
return "", fmt.Errorf("architecture not found in model config")
}
// IsTensorModelDir checks if the directory contains a diffusers-style tensor model
// by looking for model_index.json, which is the standard diffusers pipeline config.
func IsTensorModelDir(dir string) bool {
_, err := os.Stat(filepath.Join(dir, "model_index.json"))
return err == nil
}
// IsSafetensorsModelDir checks if the directory contains a standard safetensors model
// by looking for config.json and at least one .safetensors file.
func IsSafetensorsModelDir(dir string) bool {
// Must have config.json
if _, err := os.Stat(filepath.Join(dir, "config.json")); err != nil {
return false
}
// Must have at least one .safetensors file
entries, err := os.ReadDir(dir)
if err != nil {
return false
}
for _, entry := range entries {
if strings.HasSuffix(entry.Name(), ".safetensors") {
return true
}
}
return false
}
// LayerInfo holds metadata for a created layer.
type LayerInfo struct {
Digest string
Size int64
MediaType string
Name string // Path-style name: "component/tensor" or "path/to/config.json"
}
// LayerCreator is called to create a blob layer.
// name is the path-style name (e.g., "tokenizer/tokenizer.json")
type LayerCreator func(r io.Reader, mediaType, name string) (LayerInfo, error)
// TensorLayerCreator creates a tensor blob layer with metadata.
// name is the path-style name including component (e.g., "text_encoder/model.embed_tokens.weight")
type TensorLayerCreator func(r io.Reader, name, dtype string, shape []int32) (LayerInfo, error)
// QuantizingTensorLayerCreator creates tensor layers with optional quantization.
// When quantize is non-empty (e.g., "int8"), returns multiple layers (weight + scales + biases).
type QuantizingTensorLayerCreator func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error)
// ManifestWriter writes the manifest file.
type ManifestWriter func(modelName string, config LayerInfo, layers []LayerInfo) error
// ShouldQuantize returns true if a tensor should be quantized.
// For image gen models (component non-empty): quantizes linear weights, skipping VAE, embeddings, norms.
// For LLM models (component empty): quantizes linear weights, skipping embeddings, norms, and small tensors.
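//
// For example (illustrative):
//
//	ShouldQuantize("model.layers.0.self_attn.q_proj.weight", "") // true: linear weight
//	ShouldQuantize("model.embed_tokens.weight", "")              // false: embedding
//	ShouldQuantize("model.layers.0.input_layernorm.weight", "")  // false: norm
//	ShouldQuantize("decoder.mid_block.proj.weight", "vae")       // false: VAE skipped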
func ShouldQuantize(name, component string) bool {
	// Image gen specific: skip VAE entirely
	if component == "vae" {
		return false
	}
	// Skip embeddings
	if strings.Contains(name, "embed") {
		return false
	}
	// Skip layer norms and RMS norms
	if strings.Contains(name, "norm") || strings.Contains(name, "ln_") || strings.Contains(name, "layernorm") {
		return false
	}
	// Skip biases
	if strings.HasSuffix(name, ".bias") {
		return false
	}
	// Only quantize weights
	return strings.HasSuffix(name, ".weight")
}

// ShouldQuantizeTensor returns true if a tensor should be quantized based on name, shape, and quantize type.
// This is a more detailed check that also considers tensor dimensions.
// The quantize parameter specifies the quantization type (e.g., "int4", "nvfp4", "mxfp4", "int8", "mxfp8").
func ShouldQuantizeTensor(name string, shape []int32, quantize string) bool {
	return GetTensorQuantization(name, shape, quantize) != ""
}

// normalizeQuantType converts various quantization type aliases to canonical forms.
// Supports: q4/Q4/int4/INT4/fp4/FP4 -> int4, q8/Q8/int8/INT8/fp8/FP8 -> int8, nvfp4/NVFP4, mxfp4/MXFP4, mxfp8/MXFP8
func normalizeQuantType(quantize string) string {
	switch strings.ToUpper(quantize) {
	case "Q4", "INT4", "FP4":
		return "int4"
	case "Q8", "INT8", "FP8":
		return "int8"
	case "NVFP4":
		return "nvfp4"
	case "MXFP4":
		return "mxfp4"
	case "MXFP8":
		return "mxfp8"
	default:
		return quantize
	}
}

func isStackedExpertWeight(name string) bool {
	// Combined/stacked expert tensors may be emitted either as "...proj.weight" (per-expert)
	// or "...proj" (pre-stacked packed tensor).
	if strings.HasSuffix(name, ".bias") || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".qbias") {
		return false
	}
	return strings.Contains(name, ".mlp.switch_mlp.") ||
		strings.Contains(name, ".mlp.experts.") ||
		strings.Contains(name, ".mlp.shared_experts.")
}

// GetTensorQuantization returns the appropriate quantization type for a tensor.
// Returns "" if the tensor should not be quantized.
// This implements mixed-precision quantization:
// - Attention MLA weights (q_a, q_b, kv_a, kv_b): unquantized (most sensitive)
// - Output projection, gate/up weights: int4 (less sensitive)
// - Down projection weights: int8 (more sensitive, would be Q6 in GGML but no MLX kernel)
// - Norms, embeddings, biases, routing gates: no quantization
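//
// For example, with quantize="int4" and a large 2D shape such as [4096, 4096]
// (illustrative; all of these pass the size and group-size checks):
//
//	"...self_attn.kv_b_proj.weight" -> ""     (MLA weight, keep bf16)
//	"...mlp.down_proj.weight"       -> "int8"
//	"...mlp.gate_proj.weight"       -> "int4"
//	"...mlp.gate.weight"            -> ""     (routing gate)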
func GetTensorQuantization(name string, shape []int32, quantize string) string {
	stackedExpert := isStackedExpertWeight(name)
	// Use basic name-based check first
	if !stackedExpert && !ShouldQuantize(name, "") {
		return ""
	}
	// Quantize standard linear weights (2D). Also allow stacked expert weights (3D),
	// e.g. qwen switch_mlp / experts combined tensors.
	if len(shape) != 2 && !(len(shape) == 3 && stackedExpert) {
		return ""
	}
	// Skip small tensors (less than 1024 elements) - not worth quantizing
	var elems int64 = 1
	for _, d := range shape {
		elems *= int64(d)
	}
	if elems < 1024 {
		return ""
	}
	// Normalize quantization type to canonical form
	quantNorm := normalizeQuantType(quantize)
	// MLX quantization requires last dimension to be divisible by group size
	// nvfp4: 16, mxfp4/mxfp8: 32, int4/int8: 64
	groupSize := int32(32)
	switch quantNorm {
	case "nvfp4":
		groupSize = 16
	case "int4", "int8":
		groupSize = 64
	}
	if shape[len(shape)-1]%groupSize != 0 {
		return ""
	}
	// Skip routing gate weights (should stay high precision)
	// In safetensors these are: mlp.gate.weight (not mlp.gate_proj.weight)
	if strings.Contains(name, "mlp.gate.weight") && !strings.Contains(name, "_proj") {
		return ""
	}
	// For non-affine modes, use the same quantization for all eligible tensors.
	if quantNorm == "nvfp4" || quantNorm == "mxfp4" || quantNorm == "mxfp8" {
		return quantNorm
	}
	// Attention MLA weights - keep unquantized (bf16)
	// These are highly sensitive: errors accumulate in the KV cache over time
	// q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj
	if strings.Contains(name, "q_a_proj") ||
		strings.Contains(name, "q_b_proj") ||
		strings.Contains(name, "kv_a_proj") ||
		strings.Contains(name, "kv_b_proj") {
		return "" // No quantization - keep bf16
	}
	// Down projection weights - use INT8 (would be Q6_K in GGML, but MLX has no Q6 kernel)
	// mlp.down_proj, mlp.experts.X.down_proj, mlp.shared_experts.down_proj
	if strings.Contains(name, "down_proj") {
		return "int8"
	}
	// Output projection, gate/up weights - use requested quantization (INT4)
	// o_proj, gate_proj, up_proj
	if strings.Contains(name, "o_proj") ||
		strings.Contains(name, "gate_proj") ||
		strings.Contains(name, "up_proj") {
		return quantNorm
	}
	// LM head - use requested quantization
	if strings.Contains(name, "lm_head") {
		return quantNorm
	}
	// Default to requested quantization for other weights
	return quantNorm
}

var expertLayerPrefixRegexp = regexp.MustCompile(`^(?:model\.language_model\.|language_model(?:\.model)?\.|model\.)?layers\.\d+$`)

// ExpertGroupPrefix returns the group prefix for expert tensors that should be packed together.
// For example:
// - "model.layers.1.mlp.experts.0.down_proj.weight" -> "model.layers.1.mlp.experts"
// - "model.layers.1.mlp.shared_experts.down_proj.weight" -> "model.layers.1.mlp.shared_experts"
// - "language_model.model.layers.1.mlp.switch_mlp.down_proj.weight" -> "language_model.model.layers.1.mlp.switch_mlp"
// - "model.layers.0.mlp.down_proj.weight" -> "" (dense layer, no experts)
// - "model.layers.1.mlp.gate.weight" -> "" (routing gate, not an expert)
func ExpertGroupPrefix(tensorName string) string {
	if !strings.HasSuffix(tensorName, ".weight") {
		return ""
	}
	for _, marker := range []string{
		".mlp.experts.",
		".mlp.shared_experts.",
		".mlp.switch_mlp.",
	} {
		idx := strings.Index(tensorName, marker)
		if idx == -1 {
			continue
		}
		layerPrefix := tensorName[:idx]
		if !expertLayerPrefixRegexp.MatchString(layerPrefix) {
			continue
		}
		return layerPrefix + strings.TrimSuffix(marker, ".")
	}
	return ""
}

// PackedTensorInput holds metadata for a tensor that will be packed into a multi-tensor blob.
type PackedTensorInput struct {
	Name     string
	Dtype    string
	Shape    []int32
	Quantize string    // per-tensor quantization type (may differ within group)
	Reader   io.Reader // safetensors-wrapped tensor data
}

// PackedTensorLayerCreator creates a single blob layer containing multiple packed tensors.
// groupName is the group prefix (e.g., "model.layers.1.mlp.experts").
type PackedTensorLayerCreator func(groupName string, tensors []PackedTensorInput) (LayerInfo, error)

type sourceQuantization struct {
	Bits            int     `json:"bits"`
	GroupSize       int     `json:"group_size"`
	Mode            string  `json:"mode"`
	QuantMethod     string  `json:"quant_method"`
	WeightBlockSize []int32 `json:"weight_block_size"`
}

type sourceModelConfig struct {
	ModelType          string             `json:"model_type"`
	Architectures      []string           `json:"architectures"`
	Quantization       sourceQuantization `json:"quantization"`
	QuantizationConfig sourceQuantization `json:"quantization_config"`
	TextConfig         struct {
		ModelType          string             `json:"model_type"`
		Quantization       sourceQuantization `json:"quantization"`
		QuantizationConfig sourceQuantization `json:"quantization_config"`
	} `json:"text_config"`
}

func readSourceModelConfig(modelDir string) (sourceModelConfig, error) {
	configPath := filepath.Join(modelDir, "config.json")
	data, err := os.ReadFile(configPath)
	if err != nil {
		return sourceModelConfig{}, err
	}
	var cfg sourceModelConfig
	if err := json.Unmarshal(data, &cfg); err != nil {
		return sourceModelConfig{}, err
	}
	return cfg, nil
}

func (cfg sourceModelConfig) Architecture() string {
	if len(cfg.Architectures) > 0 && cfg.Architectures[0] != "" {
		return cfg.Architectures[0]
	}
	if cfg.ModelType != "" {
		return cfg.ModelType
	}
	return cfg.TextConfig.ModelType
}

func (cfg sourceModelConfig) QuantMetadata() map[string]string {
	// Use the first non-empty quantization config found
	var q sourceQuantization
	for _, candidate := range []sourceQuantization{
		cfg.Quantization,
		cfg.QuantizationConfig,
		cfg.TextConfig.Quantization,
		cfg.TextConfig.QuantizationConfig,
	} {
		if candidate.Bits != 0 {
			q = candidate
			break
		}
	}
	quantType := sourceQuantType(q.Mode, q.Bits)
	if quantType == "" {
		return nil
	}
	metadata := map[string]string{"quant_type": quantType}
	if q.GroupSize > 0 {
		metadata["group_size"] = strconv.Itoa(q.GroupSize)
	}
	return metadata
}

type sourceQuantizedKind string

const (
	sourceQuantizedKindNone         sourceQuantizedKind = ""
	sourceQuantizedKindPrequantized sourceQuantizedKind = "prequantized"
	sourceQuantizedKindHFFP8        sourceQuantizedKind = "hf_fp8"
)

func (cfg sourceModelConfig) quantizationConfigs() []sourceQuantization {
	return []sourceQuantization{
		cfg.Quantization,
		cfg.QuantizationConfig,
		cfg.TextConfig.Quantization,
		cfg.TextConfig.QuantizationConfig,
	}
}

func (cfg sourceModelConfig) HFFP8WeightBlockSize() (rows, cols int32, ok bool) {
	for _, q := range cfg.quantizationConfigs() {
		if !strings.EqualFold(q.QuantMethod, "fp8") || len(q.WeightBlockSize) != 2 {
			continue
		}
		return q.WeightBlockSize[0], q.WeightBlockSize[1], true
	}
	return 0, 0, false
}
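
// inspectSourceQuantization scans the model's safetensors files for
// quantization companions: any ".scales" tensor marks the source as
// pre-quantized, while ".weight_scale_inv" tensors combined with an fp8
// weight_block_size in config.json mark an HF-style fp8 checkpoint.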
func inspectSourceQuantization(modelDir string, cfg sourceModelConfig) (sourceQuantizedKind, error) {
	entries, err := os.ReadDir(modelDir)
	if err != nil {
		return sourceQuantizedKindNone, err
	}
	hasScaleInv := false
	for _, entry := range entries {
		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".safetensors") {
			continue
		}
		extractor, err := safetensors.OpenForExtraction(filepath.Join(modelDir, entry.Name()))
		if err != nil {
			return sourceQuantizedKindNone, err
		}
		for _, name := range extractor.ListTensors() {
			switch {
			case strings.HasSuffix(name, ".scales"):
				extractor.Close()
				return sourceQuantizedKindPrequantized, nil
			case strings.HasSuffix(name, ".weight_scale_inv"):
				hasScaleInv = true
			}
		}
		extractor.Close()
	}
	if hasScaleInv {
		if _, _, ok := cfg.HFFP8WeightBlockSize(); ok {
			return sourceQuantizedKindHFFP8, nil
		}
	}
	return sourceQuantizedKindNone, nil
}
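
// resolveEffectiveQuantization decides which quantization to apply at import
// time: unquantized sources use the requested type as-is, pre-quantized
// sources reject --quantize and import verbatim, and HF fp8 sources with
// 128x128 block scales are converted to mxfp8.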
func resolveEffectiveQuantization(cfg sourceModelConfig, sourceKind sourceQuantizedKind, requested string) (string, error) {
	switch sourceKind {
	case sourceQuantizedKindNone:
		return requested, nil
	case sourceQuantizedKindPrequantized:
		if requested != "" {
			return "", fmt.Errorf("cannot requantize already-quantized source model with --quantize %q", requested)
		}
		return "", nil
	case sourceQuantizedKindHFFP8:
		if requested != "" {
			return "", fmt.Errorf("cannot requantize already-quantized fp8 source model with --quantize %q", requested)
		}
		rows, cols, ok := cfg.HFFP8WeightBlockSize()
		if !ok {
			return "", fmt.Errorf("fp8 source model missing weight_block_size metadata")
		}
		if rows != 128 || cols != 128 {
			return "", fmt.Errorf("unsupported fp8 source block size %dx%d", rows, cols)
		}
		return "mxfp8", nil
	default:
		return "", fmt.Errorf("unsupported source quantization kind %q", sourceKind)
	}
}
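
// tensorImportTransform lets architecture-specific importers skip tensors,
// rewrite a source tensor into one or more output tensors, and override the
// per-tensor quantization choice. noopImportTransform passes everything
// through unchanged.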
type tensorImportTransform interface {
	skipTensor(name string) bool
	transformTensor(td *safetensors.TensorData) ([]*safetensors.TensorData, error)
	quantizationType(name string, shape []int32, quantize string) string
}

type noopImportTransform struct{}

func (noopImportTransform) skipTensor(string) bool { return false }

func (noopImportTransform) transformTensor(td *safetensors.TensorData) ([]*safetensors.TensorData, error) {
	if td == nil {
		return nil, nil
	}
	return []*safetensors.TensorData{td}, nil
}

func (noopImportTransform) quantizationType(name string, shape []int32, quantize string) string {
	return GetTensorQuantization(name, shape, quantize)
}

type tensorImportTransformFactory func(modelDir string, cfg sourceModelConfig) (tensorImportTransform, error)

var tensorImportTransformRegistry = map[string]tensorImportTransformFactory{
	"Qwen3_5ForCausalLM":                   newQwen35ImportTransform,
	"Qwen3_5ForConditionalGeneration":      newQwen35ImportTransform,
	"Qwen3NextForCausalLM":                 newQwen35ImportTransform,
	"Qwen3NextForConditionalGeneration":    newQwen35ImportTransform,
	"Qwen3_5MoeForCausalLM":                newQwen35ImportTransform,
	"Qwen3_5MoeForConditionalGeneration":   newQwen35ImportTransform,
	"Qwen3NextMoeForCausalLM":              newQwen35ImportTransform,
	"Qwen3NextMoeForConditionalGeneration": newQwen35ImportTransform,
}

func newTensorImportTransform(modelDir string, cfg sourceModelConfig) (tensorImportTransform, error) {
	if factory, ok := tensorImportTransformRegistry[cfg.Architecture()]; ok {
		return factory(modelDir, cfg)
	}
	return noopImportTransform{}, nil
}

// CreateSafetensorsModel imports a standard safetensors model from a directory.
// This handles Hugging Face style models with config.json and *.safetensors files.
// Stores each tensor as a separate blob for fine-grained deduplication.
// Expert tensors are packed into per-layer blobs when createPackedLayer is non-nil.
// If quantize is non-empty (e.g., "int8"), eligible tensors will be quantized.
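//
// A typical call wires in storage callbacks and a progress function; the
// callback values here are placeholders for implementations supplied by the
// caller, not part of this file:
//
//	err := CreateSafetensorsModel("my-model", "/path/to/hf-model", "int4",
//		createLayer, createTensorLayer, writeManifest,
//		func(status string) { fmt.Println(status) })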
func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer LayerCreator, createTensorLayer QuantizingTensorLayerCreator, writeManifest ManifestWriter, fn func(status string), createPackedLayer ...PackedTensorLayerCreator) error {
	var layers []LayerInfo
	var configLayer LayerInfo
	sourceConfig, err := readSourceModelConfig(modelDir)
	if err != nil {
		return fmt.Errorf("failed to read source config.json: %w", err)
	}
	sourceQuantKind, err := inspectSourceQuantization(modelDir, sourceConfig)
	if err != nil {
		return fmt.Errorf("failed to inspect source quantization: %w", err)
	}
	effectiveQuantize, err := resolveEffectiveQuantization(sourceConfig, sourceQuantKind, quantize)
	if err != nil {
		return err
	}
	sourceQuantMetadata := sourceConfig.QuantMetadata()
	importTransform, err := newTensorImportTransform(modelDir, sourceConfig)
	if err != nil {
		return fmt.Errorf("failed to construct import transform for architecture %q: %w", sourceConfig.Architecture(), err)
	}
	// Resolve the optional packed layer creator
	var packedCreator PackedTensorLayerCreator
	if len(createPackedLayer) > 0 {
		packedCreator = createPackedLayer[0]
	}
	// Accumulate expert tensors by group prefix for packing.
	// Readers reference file-backed SectionReaders, so we keep extractors
	// open until each group is flushed to avoid buffering tensor data in memory.
	expertGroups := make(map[string][]PackedTensorInput)
	var expertGroupOrder []string
	// Track open extractors so we can close them after flushing groups
	var openExtractors []*safetensors.TensorExtractor
	closeExtractors := func() {
		for _, ext := range openExtractors {
			ext.Close()
		}
		openExtractors = nil
	}
	entries, err := os.ReadDir(modelDir)
	if err != nil {
		return fmt.Errorf("failed to read directory: %w", err)
	}
	// Process all safetensors files
	for _, entry := range entries {
		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".safetensors") {
			continue
		}
		stPath := filepath.Join(modelDir, entry.Name())
		// Extract individual tensors from safetensors file
		extractor, err := safetensors.OpenForExtraction(stPath)
		if err != nil {
			closeExtractors()
			return fmt.Errorf("failed to open %s: %w", stPath, err)
		}
		tensorNames := extractor.ListTensors()
		tensorSet := make(map[string]struct{}, len(tensorNames))
		for _, name := range tensorNames {
			tensorSet[name] = struct{}{}
		}
		quantizeMsg := ""
		if effectiveQuantize != "" {
			quantizeMsg = fmt.Sprintf(", quantizing to %s", effectiveQuantize)
		}
		fn(fmt.Sprintf("importing %s (%d tensors%s)", entry.Name(), len(tensorNames), quantizeMsg))
		// Track whether this extractor has expert tensors that need to stay open
		hasExpertTensors := false
		for _, tensorName := range tensorNames {
			if importTransform.skipTensor(tensorName) {
				continue
			}
			if shouldSkipSourceCompanion(tensorName, tensorSet) {
				continue
			}
			sourceFP8ScaleName, hasSourceFP8Scale := sourceFP8Companion(tensorName, tensorSet)
			td, err := extractor.GetTensor(tensorName)
			if err != nil {
				extractor.Close()
				closeExtractors()
				return fmt.Errorf("failed to get tensor %s: %w", tensorName, err)
			}
			if effectiveQuantize == "" {
				layer, ok, err := createPrequantizedLayer(extractor, td, tensorName, tensorSet, sourceQuantMetadata, createLayer)
				if err != nil {
					extractor.Close()
					closeExtractors()
					return err
				}
				if ok {
					layers = append(layers, layer)
					continue
				}
			}
			outputTensors, err := importTransform.transformTensor(td)
			if err != nil {
				extractor.Close()
				closeExtractors()
				return fmt.Errorf("failed to transform tensor %s: %w", tensorName, err)
			}
			for _, outTD := range outputTensors {
				// Determine quantization type for this tensor (empty string if not quantizing)
				// GetTensorQuantization handles mixed-precision (e.g., Q8 for attention, Q4 for FFN)
				quantizeType := ""
				switch {
				case sourceQuantKind == sourceQuantizedKindHFFP8 && hasSourceFP8Scale:
					quantizeType = "mxfp8"
				case sourceQuantKind == sourceQuantizedKindHFFP8:
					quantizeType = ""
				case effectiveQuantize != "":
					quantizeType = importTransform.quantizationType(outTD.Name, outTD.Shape, effectiveQuantize)
				}
				reader := outTD.SafetensorsReader()
				if hasSourceFP8Scale {
					if len(outputTensors) != 1 {
						extractor.Close()
						closeExtractors()
						return fmt.Errorf("source fp8 tensor %s rewrote into %d tensors; only 1:1 rewrites are supported", tensorName, len(outputTensors))
					}
					if quantizeType == "" {
						extractor.Close()
						closeExtractors()
						return fmt.Errorf("source fp8 tensor %s was not scheduled for mxfp8 conversion", tensorName)
					}
					scaleTD, err := extractor.GetTensor(sourceFP8ScaleName)
					if err != nil {
						extractor.Close()
						closeExtractors()
						return fmt.Errorf("failed to get fp8 scale tensor %s: %w", sourceFP8ScaleName, err)
					}
					reader = buildSourceFP8Reader(outTD, scaleTD.WithName(outTD.Name+".scale_inv"))
				}
				// Check if this tensor belongs to an expert group for packing
				groupPrefix := ""
				if packedCreator != nil {
					groupPrefix = ExpertGroupPrefix(outTD.Name)
				}
				if groupPrefix != "" {
					// Accumulate expert tensor for packed blob.
					// The Reader uses a file-backed SectionReader, so we must
					// keep the extractor open until this group is flushed.
					hasExpertTensors = true
					if _, exists := expertGroups[groupPrefix]; !exists {
						expertGroupOrder = append(expertGroupOrder, groupPrefix)
					}
					expertGroups[groupPrefix] = append(expertGroups[groupPrefix], PackedTensorInput{
						Name:     outTD.Name,
						Dtype:    outTD.Dtype,
						Shape:    outTD.Shape,
						Quantize: quantizeType,
						Reader:   reader,
					})
				} else {
					// Store as minimal safetensors format (88 bytes header overhead)
					// This enables native mmap loading via mlx_load_safetensors
					// createTensorLayer returns multiple layers if quantizing (weight + scales)
					newLayers, err := createTensorLayer(reader, outTD.Name, outTD.Dtype, outTD.Shape, quantizeType)
					if err != nil {
						extractor.Close()
						closeExtractors()
						return fmt.Errorf("failed to create layer for %s: %w", outTD.Name, err)
					}
					layers = append(layers, newLayers...)
				}
			}
		}
		if hasExpertTensors {
			// Keep extractor open - readers still reference its file handle
			openExtractors = append(openExtractors, extractor)
		} else {
			extractor.Close()
		}
	}
	// Process accumulated expert groups into packed blobs, then close extractors
	if packedCreator != nil {
		sort.Strings(expertGroupOrder)
		for _, groupName := range expertGroupOrder {
			tensors := expertGroups[groupName]
			fn(fmt.Sprintf("packing %s (%d tensors)", groupName, len(tensors)))
			layer, err := packedCreator(groupName, tensors)
			if err != nil {
				closeExtractors()
				return fmt.Errorf("failed to create packed layer for %s: %w", groupName, err)
			}
			layers = append(layers, layer)
		}
	}
	closeExtractors()
	// Process all JSON config files
	for _, entry := range entries {
		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".json") {
			continue
		}
		// Skip the index file as we don't need it after extraction
		if entry.Name() == "model.safetensors.index.json" {
			continue
		}
		cfgPath := entry.Name()
		fullPath := filepath.Join(modelDir, cfgPath)
		fn(fmt.Sprintf("importing config %s", cfgPath))
		f, err := os.Open(fullPath)
		if err != nil {
			return fmt.Errorf("failed to open %s: %w", cfgPath, err)
		}
		layer, err := createLayer(f, "application/vnd.ollama.image.json", cfgPath)
		f.Close()
		if err != nil {
			return fmt.Errorf("failed to create layer for %s: %w", cfgPath, err)
		}
		// Use config.json as the config layer
		if cfgPath == "config.json" {
			configLayer = layer
		}
		layers = append(layers, layer)
	}
	if configLayer.Digest == "" {
		return fmt.Errorf("config.json not found in %s", modelDir)
	}
	fn(fmt.Sprintf("writing manifest for %s", modelName))
	if err := writeManifest(modelName, configLayer, layers); err != nil {
		return fmt.Errorf("failed to write manifest: %w", err)
	}
	fn(fmt.Sprintf("successfully imported %s with %d layers", modelName, len(layers)))
	return nil
}
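
// shouldSkipSourceCompanion reports whether name is a companion tensor
// (".scales", ".biases", or ".weight_scale_inv") whose base weight tensor is
// also present; companions are imported alongside their weight rather than
// as standalone layers.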
func shouldSkipSourceCompanion(name string, tensorSet map[string]struct{}) bool {
	switch {
	case strings.HasSuffix(name, ".scales"):
		_, ok := tensorSet[strings.TrimSuffix(name, ".scales")+".weight"]
		return ok
	case strings.HasSuffix(name, ".biases"):
		_, ok := tensorSet[strings.TrimSuffix(name, ".biases")+".weight"]
		return ok
	case strings.HasSuffix(name, ".weight_scale_inv"):
		_, ok := tensorSet[strings.TrimSuffix(name, "_scale_inv")]
		return ok
	default:
		return false
	}
}
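
// sourceFP8Companion returns the "<name>_scale_inv" tensor paired with an
// HF fp8 weight, if one exists in the same file.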
func sourceFP8Companion(weightName string, tensorSet map[string]struct{}) (scaleName string, ok bool) {
	if !strings.HasSuffix(weightName, ".weight") {
		return "", false
	}
	scaleName = weightName + "_scale_inv"
	_, ok = tensorSet[scaleName]
	return scaleName, ok
}

func buildSourceFP8Reader(weightTD, scaleTD *safetensors.TensorData) io.Reader {
	return safetensors.BuildPackedSafetensorsReader([]*safetensors.TensorData{weightTD, scaleTD})
}
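
// createPrequantizedLayer packs a pre-quantized weight together with its
// ".scales" (and optional ".biases") companions into a single tensor layer,
// carrying the source quantization metadata. It returns ok=false when the
// weight has no companions and should be imported normally.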
func createPrequantizedLayer(
	extractor *safetensors.TensorExtractor,
	td *safetensors.TensorData,
	tensorName string,
	tensorSet map[string]struct{},
	metadata map[string]string,
	createLayer LayerCreator,
) (LayerInfo, bool, error) {
	scaleName, biasName, ok := prequantizedCompanions(tensorName, tensorSet)
	if !ok {
		return LayerInfo{}, false, nil
	}
	tensors := []*safetensors.TensorData{td.WithName(tensorName)}
	scaleTD, err := extractor.GetTensor(scaleName)
	if err != nil {
		return LayerInfo{}, false, fmt.Errorf("failed to get tensor %s: %w", scaleName, err)
	}
	tensors = append(tensors, scaleTD.WithName(tensorName+".scale"))
	if biasName != "" {
		biasTD, err := extractor.GetTensor(biasName)
		if err != nil {
			return LayerInfo{}, false, fmt.Errorf("failed to get tensor %s: %w", biasName, err)
		}
		tensors = append(tensors, biasTD.WithName(tensorName+".bias"))
	}
	layer, err := createLayer(
		safetensors.BuildPackedSafetensorsReaderWithMetadata(tensors, metadata),
		"application/vnd.ollama.image.tensor",
		tensorName,
	)
	if err != nil {
		return LayerInfo{}, false, fmt.Errorf("failed to create prequantized layer for %s: %w", tensorName, err)
	}
	return layer, true, nil
}
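
// prequantizedCompanions returns the ".scales" (required) and ".biases"
// (optional) companion names for a pre-quantized ".weight" tensor.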
func prequantizedCompanions(weightName string, tensorSet map[string]struct{}) (scaleName, biasName string, ok bool) {
	if !strings.HasSuffix(weightName, ".weight") {
		return "", "", false
	}
	base := strings.TrimSuffix(weightName, ".weight")
	scaleName = base + ".scales"
	if _, ok := tensorSet[scaleName]; !ok {
		return "", "", false
	}
	biasName = base + ".biases"
	if _, ok := tensorSet[biasName]; !ok {
		biasName = ""
	}
	return scaleName, biasName, true
}