mirror of
https://github.com/ollama/ollama.git
synced 2026-04-17 15:53:27 +02:00
This change adds a new MLX based runner which includes: * Method-based MLX bindings * Subprocess-based MLX runner (x/mlxrunner) * KV cache with tree management * A basic sampler The GLM4-MoE-Lite model has been ported to use the new bindings. --------- Co-authored-by: Michael Yang <git@mxy.ng>
140 lines
3.5 KiB
Go
140 lines
3.5 KiB
Go
//go:build mlx
|
|
|
|
package mlxrunner
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log/slog"
|
|
"net"
|
|
"net/http"
|
|
"time"
|
|
|
|
"golang.org/x/sync/errgroup"
|
|
|
|
"github.com/ollama/ollama/x/imagegen/manifest"
|
|
"github.com/ollama/ollama/x/imagegen/tokenizer"
|
|
"github.com/ollama/ollama/x/mlxrunner/cache"
|
|
"github.com/ollama/ollama/x/mlxrunner/mlx"
|
|
"github.com/ollama/ollama/x/mlxrunner/sample"
|
|
"github.com/ollama/ollama/x/models/glm4_moe_lite"
|
|
)
|
|
|
|
// TextModel is the interface that model implementations must satisfy.
|
|
type TextModel interface {
|
|
Forward(inputs *mlx.Array, cache []cache.Cache) *mlx.Array
|
|
Unembed(x *mlx.Array) *mlx.Array
|
|
NumLayers() int
|
|
}
|
|
|
|
type Request struct {
|
|
TextCompletionsRequest
|
|
Responses chan Response
|
|
Pipeline func(Request) error
|
|
|
|
sample.Sampler
|
|
caches []cache.Cache
|
|
}
|
|
|
|
type TextCompletionsRequest struct {
|
|
Prompt string `json:"prompt"`
|
|
Options struct {
|
|
Temperature float32 `json:"temperature"`
|
|
TopP float32 `json:"top_p"`
|
|
MinP float32 `json:"min_p"`
|
|
TopK int `json:"top_k"`
|
|
MaxTokens int `json:"max_tokens"`
|
|
|
|
// Deprecated: use MaxTokens instead
|
|
NumPredict int `json:"num_predict"`
|
|
} `json:"options"`
|
|
}
|
|
|
|
type Response struct {
|
|
Text string `json:"content,omitempty"`
|
|
Token int `json:"token,omitempty"`
|
|
Logprobs []float32 `json:"logprobs,omitempty"`
|
|
Done bool `json:"done,omitempty"`
|
|
DoneReason int `json:"done_reason,omitempty"`
|
|
|
|
PromptTokens int `json:"prompt_eval_count,omitempty"`
|
|
PromptTokensDuration time.Duration `json:"prompt_eval_duration,omitempty"`
|
|
CompletionTokens int `json:"eval_count,omitempty"`
|
|
CompletionTokensDuration time.Duration `json:"eval_duration,omitempty"`
|
|
TotalTokens int `json:"total_tokens,omitempty"`
|
|
}
|
|
|
|
type Runner struct {
|
|
Model TextModel
|
|
Tokenizer *tokenizer.Tokenizer
|
|
Requests chan Request
|
|
CacheEntries map[int32]*CacheEntry
|
|
}
|
|
|
|
func (r *Runner) Load(modelName string) error {
|
|
modelManifest, err := manifest.LoadManifest(modelName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Read config to detect architecture
|
|
configData, err := modelManifest.ReadConfig("config.json")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to read config.json: %w", err)
|
|
}
|
|
|
|
var archConfig struct {
|
|
Architectures []string `json:"architectures"`
|
|
}
|
|
if err := json.Unmarshal(configData, &archConfig); err != nil {
|
|
return fmt.Errorf("failed to parse config.json: %w", err)
|
|
}
|
|
|
|
if len(archConfig.Architectures) == 0 {
|
|
return fmt.Errorf("no architectures found in config.json")
|
|
}
|
|
|
|
slog.Info("Model architecture", "arch", archConfig.Architectures[0])
|
|
|
|
switch archConfig.Architectures[0] {
|
|
case "Glm4MoeLiteForCausalLM", "GLM4MoeLite":
|
|
model, err := glm4_moe_lite.LoadFromManifest(modelManifest)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to load GLM4-MoE-Lite model: %w", err)
|
|
}
|
|
r.Model = model
|
|
r.Tokenizer = model.Tokenizer()
|
|
default:
|
|
return fmt.Errorf("unsupported architecture: %s", archConfig.Architectures[0])
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (r *Runner) Run(host, port string, mux http.Handler) error {
|
|
g, ctx := errgroup.WithContext(context.Background())
|
|
|
|
g.Go(func() error {
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil
|
|
case request := <-r.Requests:
|
|
if err := request.Pipeline(request); err != nil {
|
|
break
|
|
}
|
|
|
|
close(request.Responses)
|
|
}
|
|
}
|
|
})
|
|
|
|
g.Go(func() error {
|
|
slog.Info("Starting HTTP server", "host", host, "port", port)
|
|
return http.ListenAndServe(net.JoinHostPort(host, port), mux)
|
|
})
|
|
|
|
return g.Wait()
|
|
}
|