mirror of
https://github.com/ollama/ollama.git
synced 2026-04-18 16:23:27 +02:00
* prefer rocm v6 on windows Avoid building with v7 - more changes are needed * MLX: add header vendoring and remove go build tag This switches to using a vendoring approach for the mlx-c headers so that Go can build without requiring a cmake first. This enables building the new MLX based code by default. Every time cmake runs, the headers are refreshed, so we can easily keep them in sync when we bump mlx versions. Basic Windows and Linux support are verified. * ci: harden for flaky choco repo servers CI sometimes fails due to choco not actually installing cache. Since it just speeds up the build, we can proceed without. * review comments
251 lines
5.9 KiB
Go
251 lines
5.9 KiB
Go
package model
|
|
|
|
import (
|
|
"encoding/binary"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/ollama/ollama/x/imagegen/manifest"
|
|
)
|
|
|
|
// TensorQuantInfo holds the quantization metadata recorded for a single tensor.
type TensorQuantInfo struct {
	// QuantType is the quantization type label, for example "INT4" or "INT8".
	QuantType string
	// GroupSize is the quantization group size that goes with QuantType.
	GroupSize int
}
|
|
|
|
// Root wraps a ModelManifest with pre-scanned quantization metadata.
|
|
type Root struct {
|
|
Manifest *manifest.ModelManifest
|
|
|
|
// Backwards-compatible model-level quant metadata (first tensor blob).
|
|
quantType string
|
|
groupSize int
|
|
|
|
// Per-tensor quantization metadata.
|
|
tensorQuant map[string]*TensorQuantInfo
|
|
}
|
|
|
|
// Open loads a manifest for the given model name and scans tensor blobs for
|
|
// quantization metadata.
|
|
func Open(modelName string) (*Root, error) {
|
|
m, err := manifest.LoadManifest(modelName)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
root := &Root{
|
|
Manifest: m,
|
|
tensorQuant: make(map[string]*TensorQuantInfo),
|
|
}
|
|
|
|
for _, layer := range m.GetTensorLayers("") {
|
|
blobPath := m.BlobPath(layer.Digest)
|
|
|
|
infos, blobQuantType, blobGroupSize, err := readBlobTensorQuantInfo(blobPath)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
for name, info := range infos {
|
|
root.tensorQuant[name] = info
|
|
}
|
|
|
|
if root.quantType == "" && blobQuantType != "" {
|
|
root.quantType = strings.ToUpper(blobQuantType)
|
|
root.groupSize = blobGroupSize
|
|
if root.groupSize == 0 {
|
|
root.groupSize = defaultGroupSize(root.quantType)
|
|
}
|
|
}
|
|
}
|
|
|
|
return root, nil
|
|
}
|
|
|
|
// Close is a no-op for now (future: release resources).
|
|
func (r *Root) Close() {}
|
|
|
|
// QuantType returns the quantization type detected from the first tensor blob metadata.
|
|
func (r *Root) QuantType() string { return r.quantType }
|
|
|
|
// GroupSize returns the quantization group size detected from the first tensor blob metadata.
|
|
func (r *Root) GroupSize() int { return r.groupSize }
|
|
|
|
// TensorQuant returns per-tensor quantization metadata if available.
|
|
func (r *Root) TensorQuant(name string) *TensorQuantInfo {
|
|
if r == nil {
|
|
return nil
|
|
}
|
|
return r.tensorQuant[name]
|
|
}
|
|
|
|
// AllTensorQuant returns a copy of the per-tensor quantization metadata.
|
|
func (r *Root) AllTensorQuant() map[string]*TensorQuantInfo {
|
|
out := make(map[string]*TensorQuantInfo, len(r.tensorQuant))
|
|
for k, v := range r.tensorQuant {
|
|
if v == nil {
|
|
continue
|
|
}
|
|
copy := *v
|
|
out[k] = ©
|
|
}
|
|
return out
|
|
}
|
|
|
|
func defaultGroupSize(quantType string) int {
|
|
groupSize, _, _ := QuantizationParams(quantType)
|
|
return groupSize
|
|
}
|
|
|
|
func readBlobTensorQuantInfo(path string) (map[string]*TensorQuantInfo, string, int, error) {
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
return nil, "", 0, err
|
|
}
|
|
defer f.Close()
|
|
|
|
var headerSize uint64
|
|
if err := binary.Read(f, binary.LittleEndian, &headerSize); err != nil {
|
|
return nil, "", 0, err
|
|
}
|
|
if headerSize > 100*1024*1024 {
|
|
return nil, "", 0, fmt.Errorf("header too large: %d", headerSize)
|
|
}
|
|
|
|
data := make([]byte, headerSize)
|
|
if _, err := io.ReadFull(f, data); err != nil {
|
|
return nil, "", 0, err
|
|
}
|
|
|
|
var header map[string]json.RawMessage
|
|
if err := json.Unmarshal(data, &header); err != nil {
|
|
return nil, "", 0, err
|
|
}
|
|
|
|
globalQuantType, globalGroupSize := parseGlobalQuantMetadata(header)
|
|
globalQuantType = strings.ToUpper(globalQuantType)
|
|
|
|
mainNames := mainTensorNames(header)
|
|
infos := make(map[string]*TensorQuantInfo)
|
|
for _, name := range mainNames {
|
|
if _, ok := header[name+".scale"]; !ok {
|
|
continue
|
|
}
|
|
|
|
quantType := globalQuantType
|
|
groupSize := globalGroupSize
|
|
|
|
inferredType, inferredGroup := inferQuantTypeFromShapes(header, name, quantType)
|
|
if quantType == "" {
|
|
quantType = inferredType
|
|
}
|
|
if groupSize == 0 {
|
|
groupSize = inferredGroup
|
|
}
|
|
if quantType == "" {
|
|
continue
|
|
}
|
|
if groupSize == 0 {
|
|
groupSize = defaultGroupSize(quantType)
|
|
}
|
|
|
|
infos[name] = &TensorQuantInfo{QuantType: quantType, GroupSize: groupSize}
|
|
}
|
|
|
|
return infos, globalQuantType, globalGroupSize, nil
|
|
}
|
|
|
|
// parseGlobalQuantMetadata extracts the blob-level quantization settings from
// the header's "__metadata__" entry.
//
// The entry is decoded as a string-to-string map. quantType is the value of
// "quant_type" exactly as stored (no case folding). groupSize is the value of
// "group_size" parsed as a decimal integer.
//
// Both results are zero values when the entry is absent or cannot be decoded.
// groupSize is 0 when "group_size" is missing, empty, not a valid integer,
// out of range, or not positive, so callers can treat 0 as "unknown".
func parseGlobalQuantMetadata(header map[string]json.RawMessage) (quantType string, groupSize int) {
	metaRaw, ok := header["__metadata__"]
	if !ok {
		return "", 0
	}

	var meta map[string]string
	if err := json.Unmarshal(metaRaw, &meta); err != nil {
		return "", 0
	}

	quantType = meta["quant_type"]

	// strconv.Atoi returns a clamped, non-zero value together with a range
	// error, so its result is only trusted on success. Non-positive sizes are
	// meaningless and are reported as unknown as well.
	if n, err := strconv.Atoi(meta["group_size"]); err == nil && n > 0 {
		groupSize = n
	}
	return quantType, groupSize
}
|
|
|
|
// mainTensorNames lists the header keys that name primary tensors, sorted in
// ascending order. The "__metadata__" key and any key ending in ".scale" or
// ".bias" are excluded. The result is never nil.
func mainTensorNames(header map[string]json.RawMessage) []string {
	names := make([]string, 0, len(header))
	for key := range header {
		switch {
		case key == "__metadata__":
			// Not a tensor entry.
		case strings.HasSuffix(key, ".scale"):
			// Companion entry of a primary tensor.
		case strings.HasSuffix(key, ".bias"):
			// Companion entry of a primary tensor.
		default:
			names = append(names, key)
		}
	}
	// Map iteration order is random; sort for a deterministic result.
	sort.Strings(names)
	return names
}
|
|
|
|
func inferQuantTypeFromShapes(header map[string]json.RawMessage, tensorName string, hintQuantType string) (string, int) {
|
|
type tensorShape struct {
|
|
Shape []int64 `json:"shape"`
|
|
}
|
|
|
|
mainRaw, ok := header[tensorName]
|
|
if !ok {
|
|
return "", 0
|
|
}
|
|
scaleRaw, ok := header[tensorName+".scale"]
|
|
if !ok {
|
|
return "", 0
|
|
}
|
|
|
|
var mainInfo tensorShape
|
|
if err := json.Unmarshal(mainRaw, &mainInfo); err != nil || len(mainInfo.Shape) == 0 {
|
|
return "", 0
|
|
}
|
|
|
|
var scaleInfo tensorShape
|
|
if err := json.Unmarshal(scaleRaw, &scaleInfo); err != nil || len(scaleInfo.Shape) == 0 {
|
|
return "", 0
|
|
}
|
|
|
|
weightCols := int(mainInfo.Shape[len(mainInfo.Shape)-1])
|
|
scalesCols := int(scaleInfo.Shape[len(scaleInfo.Shape)-1])
|
|
if weightCols <= 0 || scalesCols <= 0 {
|
|
return "", 0
|
|
}
|
|
|
|
groupSize4 := weightCols * 8 / scalesCols
|
|
groupSize8 := weightCols * 4 / scalesCols
|
|
|
|
switch {
|
|
case groupSize4 == 32:
|
|
return "INT4", 32
|
|
case groupSize8 == 64:
|
|
return "INT8", 64
|
|
case groupSize4 == 64 && groupSize8 == 32:
|
|
h := strings.ToUpper(hintQuantType)
|
|
if strings.Contains(h, "8") {
|
|
return "INT8", 32
|
|
}
|
|
if strings.Contains(h, "4") {
|
|
return "INT4", 64
|
|
}
|
|
}
|
|
|
|
if isCommonGroupSize(groupSize4) && !isCommonGroupSize(groupSize8) {
|
|
return "INT4", groupSize4
|
|
}
|
|
if isCommonGroupSize(groupSize8) && !isCommonGroupSize(groupSize4) {
|
|
return "INT8", groupSize8
|
|
}
|
|
|
|
return "", 0
|
|
}
|