merge ggml file decoding

This commit is contained in:
Michael Yang
2024-12-03 11:17:45 -08:00
parent 2c5fb24855
commit b7943d941d
30 changed files with 429 additions and 1733 deletions

View File

@@ -28,6 +28,7 @@ import (
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/runners"
)
@@ -72,7 +73,7 @@ type llmServer struct {
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func LoadModel(model string, maxArraySize int) (*GGML, error) {
func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@@ -83,13 +84,13 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
}
defer f.Close()
ggml, _, err := DecodeGGML(f, maxArraySize)
ggml, _, err := ggml.Decode(f, maxArraySize)
return ggml, err
}
// NewLlamaServer will run a server for the given GPUs
// The gpu list must be a single family.
func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
var err error
var cpuRunner string
var estimate MemoryEstimate
@@ -109,9 +110,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
}
if len(gpus) == 1 && gpus[0].Library == "cpu" {
cpuRunner = runners.ServerForCpu()
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
estimate = EstimateGPULayers(gpus, f, projectors, opts)
} else {
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
estimate = EstimateGPULayers(gpus, f, projectors, opts)
switch {
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
@@ -212,7 +213,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
fa = false
}
if fa && !ggml.SupportsFlashAttention() {
if fa && !f.SupportsFlashAttention() {
slog.Warn("flash attention enabled but not supported by model")
fa = false
}
@@ -225,7 +226,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
// Flash Attention also supports kv cache quantization
// Enable if the requested and kv cache type is supported by the model
if kvct != "" && ggml.SupportsKVCacheType(kvct) {
if kvct != "" && f.SupportsKVCacheType(kvct) {
params = append(params, "--kv-cache-type", kvct)
} else {
slog.Warn("kv cache type not supported by model", "type", kvct)
@@ -238,7 +239,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
for _, g := range gpus {
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
uint64(opts.NumGPU) < f.KV().BlockCount()+1 {
opts.UseMMap = new(bool)
*opts.UseMMap = false
}
@@ -330,7 +331,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
estimate: estimate,
numParallel: numParallel,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: ggml.KV().BlockCount() + 1,
totalLayers: f.KV().BlockCount() + 1,
gpus: gpus,
done: make(chan error, 1),
}