merge ggml file decoding
@@ -28,6 +28,7 @@ import (
     "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/format"
+    "github.com/ollama/ollama/fs/ggml"
     "github.com/ollama/ollama/llama"
     "github.com/ollama/ollama/runners"
 )
@@ -72,7 +73,7 @@ type llmServer struct {
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
 // the maxArraySize is negative, all arrays are collected.
-func LoadModel(model string, maxArraySize int) (*GGML, error) {
+func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
     if _, err := os.Stat(model); err != nil {
         return nil, err
     }
@@ -83,13 +84,13 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
     }
     defer f.Close()
 
-    ggml, _, err := DecodeGGML(f, maxArraySize)
+    ggml, _, err := ggml.Decode(f, maxArraySize)
     return ggml, err
 }
 
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
     var err error
     var cpuRunner string
     var estimate MemoryEstimate
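With this change, LoadModel returns the metadata decoded by the relocated fs/ggml package as a *ggml.GGML, and NewLlamaServer accepts that value as a parameter named f, so the identifier no longer shadows the ggml package. Below is a minimal caller sketch under the signatures shown above; the llm package path, discover.GetGPUInfo, and api.DefaultOptions are assumptions drawn from the surrounding repository rather than from this diff.

```go
package main

import (
	"log"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/llm"
)

func main() {
	modelPath := "/path/to/model.gguf" // illustrative path

	// Decode the model file's metadata; 0 keeps the default
	// maxArraySize of 1024 described in the doc comment above.
	f, err := llm.LoadModel(modelPath, 0)
	if err != nil {
		log.Fatal(err)
	}

	gpus := discover.GetGPUInfo() // assumed GPU discovery helper
	opts := api.DefaultOptions()  // assumed default generation options

	// The decoded *ggml.GGML is handed straight to the server constructor.
	server, err := llm.NewLlamaServer(gpus, modelPath, f, nil, nil, opts, 1)
	if err != nil {
		log.Fatal(err)
	}
	_ = server
}
```

Per the doc comment, passing 0 for maxArraySize keeps the default limit of 1024 collected array elements, while a negative value collects all arrays.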
@@ -109,9 +110,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
     }
     if len(gpus) == 1 && gpus[0].Library == "cpu" {
         cpuRunner = runners.ServerForCpu()
-        estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+        estimate = EstimateGPULayers(gpus, f, projectors, opts)
     } else {
-        estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+        estimate = EstimateGPULayers(gpus, f, projectors, opts)
 
         switch {
         case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
@@ -212,7 +213,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
         fa = false
     }
 
-    if fa && !ggml.SupportsFlashAttention() {
+    if fa && !f.SupportsFlashAttention() {
         slog.Warn("flash attention enabled but not supported by model")
         fa = false
     }
@@ -225,7 +226,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 
     // Flash Attention also supports kv cache quantization
     // Enable if the requested and kv cache type is supported by the model
-    if kvct != "" && ggml.SupportsKVCacheType(kvct) {
+    if kvct != "" && f.SupportsKVCacheType(kvct) {
         params = append(params, "--kv-cache-type", kvct)
     } else {
         slog.Warn("kv cache type not supported by model", "type", kvct)
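The flash-attention and kv-cache checks above now go through the decoded file value f instead of a variable that shadowed the ggml package. Below is a minimal sketch of querying the same information directly with the fs/ggml decoder, assuming only the calls visible in this diff (Decode, KV().BlockCount, SupportsFlashAttention, SupportsKVCacheType); Decode's second return value is ignored because its role is not shown here, and "q8_0" is only an illustrative cache type.

```go
package main

import (
	"fmt"
	"log"
	"os"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	r, err := os.Open("/path/to/model.gguf") // illustrative path
	if err != nil {
		log.Fatal(err)
	}
	defer r.Close()

	// Decode metadata; 1024 mirrors the default array-size limit
	// mentioned in LoadModel's doc comment.
	f, _, err := ggml.Decode(r, 1024)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("total layers:", f.KV().BlockCount()+1)
	fmt.Println("flash attention supported:", f.SupportsFlashAttention())
	fmt.Println("q8_0 kv cache supported:", f.SupportsKVCacheType("q8_0"))
}
```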
@@ -238,7 +239,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
     for _, g := range gpus {
         if g.Library == "metal" &&
             uint64(opts.NumGPU) > 0 &&
-            uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
+            uint64(opts.NumGPU) < f.KV().BlockCount()+1 {
             opts.UseMMap = new(bool)
             *opts.UseMMap = false
         }
@@ -330,7 +331,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
         estimate:    estimate,
         numParallel: numParallel,
         sem:         semaphore.NewWeighted(int64(numParallel)),
-        totalLayers: ggml.KV().BlockCount() + 1,
+        totalLayers: f.KV().BlockCount() + 1,
         gpus:        gpus,
         done:        make(chan error, 1),
     }