ollama/x/server/show.go

package server

import (
	"encoding/binary"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/manifest"
	"github.com/ollama/ollama/types/model"
)

// modelConfig represents the HuggingFace config.json structure
type modelConfig struct {
	Architectures         []string `json:"architectures"`
	ModelType             string   `json:"model_type"`
	HiddenSize            int      `json:"hidden_size"`
	NumHiddenLayers       int      `json:"num_hidden_layers"`
	MaxPositionEmbeddings int      `json:"max_position_embeddings"`
	IntermediateSize      int      `json:"intermediate_size"`
	NumAttentionHeads     int      `json:"num_attention_heads"`
	NumKeyValueHeads      int      `json:"num_key_value_heads"`
	VocabSize             int      `json:"vocab_size"`
	RMSNormEps            float64  `json:"rms_norm_eps"`
	RopeTheta             float64  `json:"rope_theta"`
	TorchDtype            string   `json:"torch_dtype"`
	TextConfig            *struct {
		HiddenSize            int `json:"hidden_size"`
		MaxPositionEmbeddings int `json:"max_position_embeddings"`
		NumHiddenLayers       int `json:"num_hidden_layers"`
	} `json:"text_config"`
}

// GetSafetensorsLLMInfo extracts model information from safetensors LLM models.
// It reads the config.json layer and returns a map compatible with GGML's KV format.
func GetSafetensorsLLMInfo(name model.Name) (map[string]any, error) {
	mf, err := manifest.ParseNamedManifest(name)
	if err != nil {
		return nil, fmt.Errorf("failed to load manifest: %w", err)
	}

	var config modelConfig
	if err := mf.ReadConfigJSON("config.json", &config); err != nil {
		return nil, fmt.Errorf("failed to read config.json: %w", err)
	}

	// Calculate total tensor bytes from manifest layers
	var totalBytes int64
	var tensorCount int64
	for _, layer := range mf.Layers {
		if layer.MediaType == manifest.MediaTypeImageTensor {
			totalBytes += layer.Size
			tensorCount++
		}
	}

	return buildModelInfo(config, totalBytes, tensorCount), nil
}

// buildModelInfo constructs the model info map from config and tensor stats.
// This is separated for testability.
func buildModelInfo(config modelConfig, totalTensorBytes, tensorCount int64) map[string]any {
	// Determine architecture
	arch := config.ModelType
	if arch == "" && len(config.Architectures) > 0 {
		// Convert HuggingFace architecture name to Ollama format
		// e.g., "Gemma3ForCausalLM" -> "gemma3"
		hfArch := config.Architectures[0]
		arch = strings.ToLower(hfArch)
		arch = strings.TrimSuffix(arch, "forcausallm")
		arch = strings.TrimSuffix(arch, "forconditionalgeneration")
	}

	// Use text_config values if they exist (for multimodal models)
	hiddenSize := config.HiddenSize
	maxPosEmbed := config.MaxPositionEmbeddings
	numLayers := config.NumHiddenLayers

	if config.TextConfig != nil {
		if config.TextConfig.HiddenSize > 0 {
			hiddenSize = config.TextConfig.HiddenSize
		}
		if config.TextConfig.MaxPositionEmbeddings > 0 {
			maxPosEmbed = config.TextConfig.MaxPositionEmbeddings
		}
		if config.TextConfig.NumHiddenLayers > 0 {
			numLayers = config.TextConfig.NumHiddenLayers
		}
	}

	// Get dtype to determine bytes per parameter for count calculation
	dtype := config.TorchDtype

	// Determine bytes per parameter based on dtype
	var bytesPerParam int64 = 2 // default to float16/bfloat16
	switch strings.ToLower(dtype) {
	case "float32":
		bytesPerParam = 4
	case "float16", "bfloat16":
		bytesPerParam = 2
	case "int8", "uint8":
		bytesPerParam = 1
	}

	// Subtract safetensors header overhead (88 bytes per tensor file)
	// Each tensor is stored as a minimal safetensors file
	totalBytes := totalTensorBytes - tensorCount*88

	paramCount := totalBytes / bytesPerParam

	info := map[string]any{
		"general.architecture": arch,
	}

	if maxPosEmbed > 0 {
		info[fmt.Sprintf("%s.context_length", arch)] = maxPosEmbed
	}

	if hiddenSize > 0 {
		info[fmt.Sprintf("%s.embedding_length", arch)] = hiddenSize
	}

	if numLayers > 0 {
		info[fmt.Sprintf("%s.block_count", arch)] = numLayers
	}

	if config.NumAttentionHeads > 0 {
		info[fmt.Sprintf("%s.attention.head_count", arch)] = config.NumAttentionHeads
	}

	if config.NumKeyValueHeads > 0 {
		info[fmt.Sprintf("%s.attention.head_count_kv", arch)] = config.NumKeyValueHeads
	}

	if config.IntermediateSize > 0 {
		info[fmt.Sprintf("%s.feed_forward_length", arch)] = config.IntermediateSize
	}

	if config.VocabSize > 0 {
		info[fmt.Sprintf("%s.vocab_size", arch)] = config.VocabSize
	}

	if paramCount > 0 {
		info["general.parameter_count"] = paramCount
	}

	return info
}

// GetSafetensorsTensorInfo extracts tensor information from safetensors model layers.
// Each tensor is stored as a minimal safetensors file with an 88-byte header containing metadata.
func GetSafetensorsTensorInfo(name model.Name) ([]api.Tensor, error) {
	mf, err := manifest.ParseNamedManifest(name)
	if err != nil {
		return nil, fmt.Errorf("failed to load manifest: %w", err)
	}

	return getTensorInfoFromManifest(mf)
}

// getTensorInfoFromManifest extracts tensor info from a manifest.
// This is separated for testability.
// For quantized models, groups weight/scale/qbias into single entries with detected quantization type.
func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
	var tensors []api.Tensor

	// First pass: collect all tensor info and identify scale tensors
	type tensorData struct {
		info   *safetensorsTensorInfo
		digest string
	}
	tensorMap := make(map[string]*tensorData)
	scaleMap := make(map[string]*tensorData) // base name -> scale tensor info

	for _, layer := range mf.Layers {
		if layer.MediaType != manifest.MediaTypeImageTensor {
			continue
		}

		// Read the safetensors header from the blob
		blobPath, err := manifest.BlobsPath(layer.Digest)
		if err != nil {
			continue
		}
		info, err := readSafetensorsHeader(blobPath)
		if err != nil {
			continue
		}

		td := &tensorData{info: info, digest: layer.Digest}

		if strings.HasSuffix(layer.Name, "_scale") {
			baseName := strings.TrimSuffix(layer.Name, "_scale")
			scaleMap[baseName] = td
		} else if strings.HasSuffix(layer.Name, "_qbias") {
			// Skip qbias tensors - they're included with the quantized weight
			continue
		} else {
			tensorMap[layer.Name] = td
		}
	}

	// Second pass: build tensor list with quantization info
	for _, layer := range mf.Layers {
		if layer.MediaType != manifest.MediaTypeImageTensor {
			continue
		}

		// Skip scale and qbias tensors
		if strings.HasSuffix(layer.Name, "_scale") || strings.HasSuffix(layer.Name, "_qbias") {
			continue
		}

		td := tensorMap[layer.Name]
		if td == nil {
			continue
		}

		// Check if this tensor has a corresponding scale tensor (quantized)
		scaleTd := scaleMap[layer.Name]
		if scaleTd != nil && len(td.info.Shape) >= 2 && len(scaleTd.info.Shape) >= 2 {
			// Quantized tensor - detect bits from shapes
			weightCols := td.info.Shape[len(td.info.Shape)-1]
			scaleCols := scaleTd.info.Shape[len(scaleTd.info.Shape)-1]

			// Detect quantization: Q4 has pack_factor=8, Q8 has pack_factor=4
			// Q4 uses group_size=32: weightCols * 8 / scaleCols = 32
			// Q8 uses group_size=64: weightCols * 4 / scaleCols = 64
			var bits int
			var quantType string
			if weightCols*8/scaleCols == 32 {
				bits = 4
				quantType = "Q4"
			} else if weightCols*4/scaleCols == 64 {
				bits = 8
				quantType = "Q8"
			} else {
				// Unknown quantization, show raw
				quantType = td.info.Dtype
			}

			// Calculate unpacked shape
			shape := make([]uint64, len(td.info.Shape))
			for i, s := range td.info.Shape {
				shape[i] = uint64(s)
			}
			if bits > 0 {
				packFactor := int64(32 / bits)
				shape[len(shape)-1] = uint64(td.info.Shape[len(td.info.Shape)-1] * packFactor)
			}

			tensors = append(tensors, api.Tensor{
				Name:  layer.Name,
				Type:  quantType,
				Shape: shape,
			})
		} else {
			// Non-quantized tensor
			shape := make([]uint64, len(td.info.Shape))
			for i, s := range td.info.Shape {
				shape[i] = uint64(s)
			}

			tensors = append(tensors, api.Tensor{
				Name:  layer.Name,
				Type:  td.info.Dtype,
				Shape: shape,
			})
		}
	}

	return tensors, nil
}

// GetSafetensorsDtype returns the quantization type for a safetensors model.
// Reads from model_index.json first, falls back to detection from tensor names.
// Otherwise returns the torch_dtype from config.json.
func GetSafetensorsDtype(name model.Name) (string, error) {
	mf, err := manifest.ParseNamedManifest(name)
	if err != nil {
		return "", fmt.Errorf("failed to load manifest: %w", err)
	}

	// First try to read quantization from model_index.json
	var modelIndex struct {
		Quantization string `json:"quantization"`
	}
	if err := mf.ReadConfigJSON("model_index.json", &modelIndex); err == nil && modelIndex.Quantization != "" {
		return modelIndex.Quantization, nil
	}

	// Fallback: detect from tensor names
	hasScales := false
	hasQBias := false
	for _, layer := range mf.Layers {
		if layer.MediaType == manifest.MediaTypeImageTensor {
			if strings.HasSuffix(layer.Name, "_scale") {
				hasScales = true
			}
			if strings.HasSuffix(layer.Name, "_qbias") {
				hasQBias = true
			}
		}
	}

	if hasScales {
		if hasQBias {
			// Affine mode (has scale + qbias) - could be Q4 or Q8
			// Default to Q4 as it's more common
			return "Q4", nil
		}
		// No qbias = NVFP4
		return "NVFP4", nil
	}

	// Not quantized - return torch_dtype from config.json
	var cfg struct {
		TorchDtype string `json:"torch_dtype"`
	}
	if err := mf.ReadConfigJSON("config.json", &cfg); err != nil {
		return "", fmt.Errorf("failed to read config.json: %w", err)
	}

	return cfg.TorchDtype, nil
}

// safetensorsTensorInfo holds metadata about a tensor from a safetensors header
type safetensorsTensorInfo struct {
	Dtype string  `json:"dtype"`
	Shape []int64 `json:"shape"`
}

// readSafetensorsHeader reads the JSON header from a safetensors file to get tensor metadata.
// Safetensors format: 8-byte header size (little endian) + JSON header + tensor data
func readSafetensorsHeader(path string) (*safetensorsTensorInfo, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	return parseSafetensorsHeader(f)
}

// parseSafetensorsHeader parses a safetensors header from a reader.
// This is separated for testability.
func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
	// Read header size (8 bytes, little endian)
	var headerSize uint64
	if err := binary.Read(r, binary.LittleEndian, &headerSize); err != nil {
		return nil, fmt.Errorf("failed to read header size: %w", err)
	}

	// Sanity check - header shouldn't be too large
	if headerSize > 1024*1024 {
		return nil, fmt.Errorf("header size too large: %d", headerSize)
	}

	// Read header JSON
	headerBytes := make([]byte, headerSize)
	if _, err := io.ReadFull(r, headerBytes); err != nil {
		return nil, fmt.Errorf("failed to read header: %w", err)
	}

	// Parse as map of tensor name -> info
	var header map[string]json.RawMessage
	if err := json.Unmarshal(headerBytes, &header); err != nil {
		return nil, fmt.Errorf("failed to parse header: %w", err)
	}

	// Find the first (and should be only) tensor entry
	for name, raw := range header {
		if name == "__metadata__" {
			continue
		}
		var info safetensorsTensorInfo
		if err := json.Unmarshal(raw, &info); err != nil {
			return nil, fmt.Errorf("failed to parse tensor info: %w", err)
		}
		return &info, nil
	}

	return nil, fmt.Errorf("no tensor found in header")
}