Add MLX runner with GLM4-MoE-Lite model support (#14185)

This change adds a new MLX based runner which includes: * Method-based MLX bindings * Subprocess-based MLX runner (x/mlxrunner) * KV cache with tree management * A basic sampler The GLM4-MoE-Lite model has been ported to use the new bindings. --------- Co-authored-by: Michael Yang <git@mxy.ng>
2026-04-17 21:54:08 +02:00 · 2026-02-10 14:57:57 -08:00
parent db493d6e5e
commit 44bdd9a2ef
42 changed files with 14900 additions and 9 deletions
--- a/go.mod
+++ b/go.mod
@@ -13,7 +13,7 @@ require (
 	github.com/mattn/go-sqlite3 v1.14.24
 	github.com/olekukonko/tablewriter v0.0.5
 	github.com/spf13/cobra v1.7.0
-	github.com/stretchr/testify v1.9.0
+	github.com/stretchr/testify v1.10.0
 	github.com/x448/float16 v0.8.4
 	golang.org/x/sync v0.17.0
 	golang.org/x/sys v0.37.0
@@ -31,6 +31,8 @@ require (
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
 	github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c
 	github.com/tkrajina/typescriptify-golang-structs v0.2.0
 	github.com/tree-sitter/go-tree-sitter v0.25.0
 	github.com/tree-sitter/tree-sitter-cpp v0.23.4
 	github.com/wk8/go-ordered-map/v2 v2.1.8
 	golang.org/x/image v0.22.0
 	golang.org/x/mod v0.30.0
@@ -60,6 +62,7 @@ require (
 	github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect
 	github.com/mattn/go-localereader v0.0.1 // indirect
 	github.com/mattn/go-pointer v0.0.1 // indirect
 	github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
 	github.com/muesli/cancelreader v0.2.2 // indirect
 	github.com/muesli/termenv v0.16.0 // indirect
--- a/go.sum
+++ b/go.sum
@@ -172,6 +172,8 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
 github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
 github.com/mattn/go-pointer v0.0.1 h1:n+XhsuGeVO6MEAp7xyEukFINEa+Quek5psIR/ylA6o0=
 github.com/mattn/go-pointer v0.0.1/go.mod h1:2zXcozF6qYGgmsG+SeTZz3oAbFLdD3OWqnUbNvJZAlc=
 github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
 github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
 github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
@@ -233,12 +235,39 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
 github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
 github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
 github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
 github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
 github.com/tkrajina/go-reflector v0.5.5 h1:gwoQFNye30Kk7NrExj8zm3zFtrGPqOkzFMLuQZg1DtQ=
 github.com/tkrajina/go-reflector v0.5.5/go.mod h1:ECbqLgccecY5kPmPmXg1MrHW585yMcDkVl6IvJe64T4=
 github.com/tkrajina/typescriptify-golang-structs v0.2.0 h1:ZedWk82egydDspGTryAatbX0/1NZDQbdiZLoCbOk4f8=
 github.com/tkrajina/typescriptify-golang-structs v0.2.0/go.mod h1:sjU00nti/PMEOZb07KljFlR+lJ+RotsC0GBQMv9EKls=
 github.com/tree-sitter/go-tree-sitter v0.25.0 h1:sx6kcg8raRFCvc9BnXglke6axya12krCJF5xJ2sftRU=
 github.com/tree-sitter/go-tree-sitter v0.25.0/go.mod h1:r77ig7BikoZhHrrsjAnv8RqGti5rtSyvDHPzgTPsUuU=
 github.com/tree-sitter/tree-sitter-c v0.23.4 h1:nBPH3FV07DzAD7p0GfNvXM+Y7pNIoPenQWBpvM++t4c=
 github.com/tree-sitter/tree-sitter-c v0.23.4/go.mod h1:MkI5dOiIpeN94LNjeCp8ljXN/953JCwAby4bClMr6bw=
 github.com/tree-sitter/tree-sitter-cpp v0.23.4 h1:LaWZsiqQKvR65yHgKmnaqA+uz6tlDJTJFCyFIeZU/8w=
 github.com/tree-sitter/tree-sitter-cpp v0.23.4/go.mod h1:doqNW64BriC7WBCQ1klf0KmJpdEvfxyXtoEybnBo6v8=
 github.com/tree-sitter/tree-sitter-embedded-template v0.23.2 h1:nFkkH6Sbe56EXLmZBqHHcamTpmz3TId97I16EnGy4rg=
 github.com/tree-sitter/tree-sitter-embedded-template v0.23.2/go.mod h1:HNPOhN0qF3hWluYLdxWs5WbzP/iE4aaRVPMsdxuzIaQ=
 github.com/tree-sitter/tree-sitter-go v0.23.4 h1:yt5KMGnTHS+86pJmLIAZMWxukr8W7Ae1STPvQUuNROA=
 github.com/tree-sitter/tree-sitter-go v0.23.4/go.mod h1:Jrx8QqYN0v7npv1fJRH1AznddllYiCMUChtVjxPK040=
 github.com/tree-sitter/tree-sitter-html v0.23.2 h1:1UYDV+Yd05GGRhVnTcbP58GkKLSHHZwVaN+lBZV11Lc=
 github.com/tree-sitter/tree-sitter-html v0.23.2/go.mod h1:gpUv/dG3Xl/eebqgeYeFMt+JLOY9cgFinb/Nw08a9og=
 github.com/tree-sitter/tree-sitter-java v0.23.5 h1:J9YeMGMwXYlKSP3K4Us8CitC6hjtMjqpeOf2GGo6tig=
 github.com/tree-sitter/tree-sitter-java v0.23.5/go.mod h1:NRKlI8+EznxA7t1Yt3xtraPk1Wzqh3GAIC46wxvc320=
 github.com/tree-sitter/tree-sitter-javascript v0.23.1 h1:1fWupaRC0ArlHJ/QJzsfQ3Ibyopw7ZfQK4xXc40Zveo=
 github.com/tree-sitter/tree-sitter-javascript v0.23.1/go.mod h1:lmGD1EJdCA+v0S1u2fFgepMg/opzSg/4pgFym2FPGAs=
 github.com/tree-sitter/tree-sitter-json v0.24.8 h1:tV5rMkihgtiOe14a9LHfDY5kzTl5GNUYe6carZBn0fQ=
 github.com/tree-sitter/tree-sitter-json v0.24.8/go.mod h1:F351KK0KGvCaYbZ5zxwx/gWWvZhIDl0eMtn+1r+gQbo=
 github.com/tree-sitter/tree-sitter-php v0.23.11 h1:iHewsLNDmznh8kgGyfWfujsZxIz1YGbSd2ZTEM0ZiP8=
 github.com/tree-sitter/tree-sitter-php v0.23.11/go.mod h1:T/kbfi+UcCywQfUNAJnGTN/fMSUjnwPXA8k4yoIks74=
 github.com/tree-sitter/tree-sitter-python v0.23.6 h1:qHnWFR5WhtMQpxBZRwiaU5Hk/29vGju6CVtmvu5Haas=
 github.com/tree-sitter/tree-sitter-python v0.23.6/go.mod h1:cpdthSy/Yoa28aJFBscFHlGiU+cnSiSh1kuDVtI8YeM=
 github.com/tree-sitter/tree-sitter-ruby v0.23.1 h1:T/NKHUA+iVbHM440hFx+lzVOzS4dV6z8Qw8ai+72bYo=
 github.com/tree-sitter/tree-sitter-ruby v0.23.1/go.mod h1:kUS4kCCQloFcdX6sdpr8p6r2rogbM6ZjTox5ZOQy8cA=
 github.com/tree-sitter/tree-sitter-rust v0.23.2 h1:6AtoooCW5GqNrRpfnvl0iUhxTAZEovEmLKDbyHlfw90=
 github.com/tree-sitter/tree-sitter-rust v0.23.2/go.mod h1:hfeGWic9BAfgTrc7Xf6FaOAguCFJRo3RBbs7QJ6D7MI=
 github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
 github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
 github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
--- a/runner/runner.go
+++ b/runner/runner.go
@@ -4,6 +4,7 @@ import (
 	"github.com/ollama/ollama/runner/llamarunner"
 	"github.com/ollama/ollama/runner/ollamarunner"
 	"github.com/ollama/ollama/x/imagegen"
 	"github.com/ollama/ollama/x/mlxrunner"
 )
 func Execute(args []string) error {
@@ -17,6 +18,8 @@ func Execute(args []string) error {
 			return ollamarunner.Execute(args[1:])
 		case "--imagegen-engine":
 			return imagegen.Execute(args[1:])
 		case "--mlx-engine":
 			return mlxrunner.Execute(args[1:])
 		}
 	}
 	return llamarunner.Execute(args)
--- a/x/imagegen/manifest/weights.go
+++ b/x/imagegen/manifest/weights.go
@@ -102,8 +102,15 @@ func (mw *ManifestWeights) Load(dtype mlx.Dtype) error {
 		for _, entry := range entries {
 			name := entry.name
-			// Try to get tensor by manifest name
+			// Try to get tensor by stripped name first, then with component prefix.
-			arr := sf.Get(name)
+			// Blobs may store tensors with the full prefixed name (e.g., "text_encoder/model.layers.0.weight")
 			// while the tensors map uses stripped names (e.g., "model.layers.0.weight").
 			lookupName := name
 			arr := sf.Get(lookupName)
 			if arr == nil && mw.component != "" {
 				lookupName = mw.component + "/" + name
 				arr = sf.Get(lookupName)
 			}
 			if arr != nil {
 				// Single-tensor blob or tensor found by name
 				if dtype != 0 && arr.Dtype() != dtype {
@@ -114,14 +121,14 @@ func (mw *ManifestWeights) Load(dtype mlx.Dtype) error {
 				arrays = append(arrays, arr)
 				// Check for scale tensor
-				if scale := sf.Get(name + ".scale"); scale != nil {
+				if scale := sf.Get(lookupName + ".scale"); scale != nil {
 					scale = mlx.Contiguous(scale)
 					mw.cache[name+"_scale"] = scale
 					arrays = append(arrays, scale)
 				}
 				// Check for bias tensor
-				if bias := sf.Get(name + ".bias"); bias != nil {
+				if bias := sf.Get(lookupName + ".bias"); bias != nil {
 					bias = mlx.Contiguous(bias)
 					mw.cache[name+"_qbias"] = bias
 					arrays = append(arrays, bias)
@@ -147,20 +154,27 @@ func (mw *ManifestWeights) Load(dtype mlx.Dtype) error {
 						tArr = mlx.AsType(tArr, dtype)
 					}
 					tArr = mlx.Contiguous(tArr)
-					mw.cache[tensorName] = tArr
+
 					// Strip component prefix from blob-internal names so cache keys
 					// match the stripped names used by LoadModule.
 					cacheName := tensorName
 					if mw.component != "" {
 						cacheName = strings.TrimPrefix(tensorName, mw.component+"/")
 					}
 					mw.cache[cacheName] = tArr
 					arrays = append(arrays, tArr)
 					// Check for scale tensor
 					if scale := sf.Get(tensorName + ".scale"); scale != nil {
 						scale = mlx.Contiguous(scale)
-						mw.cache[tensorName+"_scale"] = scale
+						mw.cache[cacheName+"_scale"] = scale
 						arrays = append(arrays, scale)
 					}
 					// Check for bias tensor
 					if bias := sf.Get(tensorName + ".bias"); bias != nil {
 						bias = mlx.Contiguous(bias)
-						mw.cache[tensorName+"_qbias"] = bias
+						mw.cache[cacheName+"_qbias"] = bias
 						arrays = append(arrays, bias)
 					}
 				}
--- a/x/mlxrunner/cache.go
+++ b/x/mlxrunner/cache.go
@@ -0,0 +1,96 @@
 //go:build mlx
 package mlxrunner
 import (
 	"log/slog"
 	"github.com/ollama/ollama/x/mlxrunner/cache"
 )
 // CacheEntry is one node of the token-keyed prefix tree that InsertCache
 // builds and FindNearestCache walks.
 type CacheEntry struct {
 	// Caches holds the caches stored at this node. It is empty for nodes that
 	// exist only as intermediate steps towards a longer token sequence.
 	Caches  []cache.Cache
 	// Count is incremented by InsertCache when a sequence ending at this node
 	// is inserted again after caches were already stored here.
 	Count   int
 	// Entries maps the next token to the child node for that token.
 	Entries map[int32]*CacheEntry
 }
 // FindNearestCache walks the prefix tree along tokens and returns the stored
 // caches that best match, together with the tokens still left to process.
 //
 // The result is one of:
 //   - exact hit: the node for the last token holds caches; they are returned
 //     as stored with an empty remainder.
 //   - partial hit: a node earlier on the matched path holds caches; they are
 //     returned as stored with the tokens after that node.
 //   - prefix hit: the matched path holds no caches, so the nearest descendant
 //     that does is cloned and trimmed before being returned.
 //   - miss: nil caches and the full token slice.
 func (s Runner) FindNearestCache(tokens []int32) ([]cache.Cache, []int32) {
 	current := &CacheEntry{Entries: s.CacheEntries}
 	// cached is the deepest node on the matched path that holds caches. It can
 	// differ from current when matching continues past that node.
 	var cached *CacheEntry
 	index, cacheIndex := 0, -1
 	for _, token := range tokens {
 		next, ok := current.Entries[token]
 		if !ok {
 			break
 		}
 		current = next
 		if len(current.Caches) > 0 {
 			cached, cacheIndex = current, index
 		}
 		index++
 	}
 	switch {
 	case cacheIndex == len(tokens)-1:
 		// Nothing is left to process on an exact hit.
 		slog.Info("Cache hit", "type", "exact", "total", len(tokens), "cached", len(tokens), "left", 0)
 		return current.Caches, []int32{}
 	case cacheIndex > 1:
 		// NOTE(review): matches that cover only the first one or two tokens
 		// fall through to a miss; the threshold is kept as in the original.
 		rest := tokens[cacheIndex+1:]
 		slog.Info("Cache hit", "type", "partial", "total", len(tokens), "cached", cacheIndex+1, "left", len(rest))
 		// Return the caches of the node that owns them, not of the deepest
 		// matched node, which may hold none.
 		return cached.Caches, rest
 	case index > 0 && cacheIndex < 0:
 		// Depth-first search below the last matched node for the closest
 		// descendant that holds caches. Only the distance is needed.
 		type stackItem struct {
 			entry *CacheEntry
 			depth int
 		}
 		var best stackItem
 		stack := []stackItem{{entry: current}}
 		for len(stack) > 0 {
 			item := stack[len(stack)-1]
 			stack = stack[:len(stack)-1]
 			if len(item.entry.Caches) > 0 {
 				if best.entry == nil || item.depth < best.depth {
 					best = item
 				}
 				continue
 			}
 			for _, entry := range item.entry.Entries {
 				stack = append(stack, stackItem{entry: entry, depth: item.depth + 1})
 			}
 		}
 		if best.entry == nil {
 			// No descendant holds caches; report a miss instead of
 			// dereferencing a nil entry.
 			break
 		}
 		prefix := min(len(tokens)-1, index)
 		caches := make([]cache.Cache, len(best.entry.Caches))
 		// NOTE(review): the trim drops the extra descendant tokens plus one
 		// more position; confirm this lines up with resuming at tokens[prefix:]
 		// for how callers populate the stored caches.
 		trim := best.depth + 1
 		for i := range caches {
 			// Clone so the stored caches are not altered by the trim.
 			caches[i] = best.entry.Caches[i].Clone()
 			caches[i].Trim(trim)
 		}
 		slog.Info("Cache hit", "type", "prefix", "total", len(tokens), "cached", prefix, "left", len(tokens[prefix:]), "trimmed", trim)
 		return caches, tokens[prefix:]
 	}
 	slog.Info("Cache miss", "left", len(tokens))
 	return nil, tokens
 }
 // InsertCache stores caches in the prefix tree under the path spelled out by
 // tokens, creating intermediate nodes as needed. When the final node already
 // holds caches they are kept and only the node's Count is incremented.
 func (s *Runner) InsertCache(tokens []int32, caches []cache.Cache) {
 	node := &CacheEntry{Entries: s.CacheEntries}
 	for _, tok := range tokens {
 		child, found := node.Entries[tok]
 		if !found {
 			child = &CacheEntry{Entries: make(map[int32]*CacheEntry)}
 			node.Entries[tok] = child
 		}
 		node = child
 	}
 	if len(node.Caches) == 0 {
 		node.Caches = caches
 		return
 	}
 	node.Count++
 }
--- a/x/mlxrunner/cache/cache.go
+++ b/x/mlxrunner/cache/cache.go
@@ -0,0 +1,198 @@
 //go:build mlx
 package cache
 import (
 	"log/slog"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 )
 // Cache is the key/value cache contract shared by KVCache and
 // RotatingKVCache.
 type Cache interface {
 	// Update stores the given keys and values and returns the keys and values
 	// to use from the cache after the update.
 	Update(keys, values *mlx.Array) (newKeys, newValues *mlx.Array)
 	// State returns the keys and values currently held by the cache.
 	State() (keys, values *mlx.Array)
 	// Trim removes up to the given number of positions and returns how many
 	// were actually removed.
 	Trim(int) int
 	// Clone returns a new Cache value with the same state.
 	Clone() Cache
 	// Offset returns the cache's running position counter.
 	Offset() int
 	// Len returns the number of positions the cache currently holds.
 	Len() int
 }
 // KVCache is a growable key/value cache indexed along axis 2 of its arrays.
 type KVCache struct {
 	// keys and values are the backing buffers; both are nil until the first Update.
 	keys, values *mlx.Array
 	// offset is the number of positions written so far (reduced by Trim).
 	offset       int
 	// step is the granularity, in positions, by which the buffers grow.
 	step         int
 }
 // NewKVCache returns an empty KVCache whose buffers grow in steps of 256
 // positions.
 func NewKVCache() *KVCache {
 	c := new(KVCache)
 	c.step = 256
 	return c
 }
 // Update writes keys and values into the cache at positions
 // [offset, offset+L) along axis 2, growing the buffers when needed, and
 // returns the cached keys and values for positions [0, new offset).
 func (c *KVCache) Update(keys, values *mlx.Array) (*mlx.Array, *mlx.Array) {
 	// NOTE(review): presumably the layout is (batch, heads, sequence, head_dim);
 	// only "axis 2 is the position axis" is established by this code.
 	B, H, L, Dk, Dv := keys.Dim(0), keys.Dim(1), keys.Dim(2), keys.Dim(3), values.Dim(3)
 	prev := c.offset
 	// Grow buffer if needed
 	if c.keys == nil || (prev+L) > c.keys.Dim(2) {
 		// Number of whole steps needed to hold the L incoming positions.
 		steps := (c.step + L - 1) / c.step
 		newKeys := mlx.Zeros(keys.DType(), B, H, steps*c.step, Dk)
 		newValues := mlx.Zeros(values.DType(), B, H, steps*c.step, Dv)
 		if c.keys != nil {
 			// When the write position is not on a step boundary, drop the
 			// unused tail first so the new block starts right at prev.
 			if prev%c.step != 0 {
 				c.keys.Set(c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, prev), mlx.Slice()))
 				c.values.Set(c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, prev), mlx.Slice()))
 			}
 			c.keys.Set(c.keys.Concatenate(2, newKeys))
 			c.values.Set(c.values.Concatenate(2, newValues))
 		} else {
 			c.keys, c.values = newKeys, newValues
 		}
 	}
 	c.offset += L
 	// Write the new entries into [prev, offset) of the buffers.
 	c.keys.Set(c.keys.SliceUpdate(keys, mlx.Slice(), mlx.Slice(), mlx.Slice(prev, c.offset), mlx.Slice()))
 	c.values.Set(c.values.SliceUpdate(values, mlx.Slice(), mlx.Slice(), mlx.Slice(prev, c.offset), mlx.Slice()))
 	// Return only the filled part of the buffers.
 	return c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice()),
 		c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice())
 }
 // State returns the cached keys and values for positions [0, offset). The
 // buffers are returned as they are when they are exactly full; otherwise a
 // slice along axis 2 is returned.
 func (c *KVCache) State() (*mlx.Array, *mlx.Array) {
 	k, v := c.keys, c.values
 	if c.offset != k.Dim(2) {
 		k = k.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice())
 		v = v.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice())
 	}
 	return k, v
 }
 // Trim moves the offset back by n positions, limited to the current offset,
 // and returns the number of positions actually removed. The buffers
 // themselves are left untouched.
 func (c *KVCache) Trim(n int) int {
 	if n > c.offset {
 		n = c.offset
 	}
 	c.offset -= n
 	return n
 }
 // Clone returns a new KVCache with cloned key and value arrays and the same
 // offset and step.
 func (c *KVCache) Clone() Cache {
 	dup := &KVCache{offset: c.offset, step: c.step}
 	dup.keys = c.keys.Clone()
 	dup.values = c.values.Clone()
 	return dup
 }
 // Offset returns the number of positions written to the cache so far.
 func (c *KVCache) Offset() int {
 	return c.offset
 }
 // Len returns the number of positions held, which for KVCache equals the offset.
 func (c *KVCache) Len() int {
 	return c.offset
 }
 // RotatingKVCache implements sliding window attention with bounded memory
 type RotatingKVCache struct {
 	// maxSize is the maximum number of positions kept in the window.
 	maxSize int
 	// idx is the position along axis 2 where the next single-step update is written.
 	idx     int
 	// KVCache supplies the buffers, offset and step, plus Offset().
 	*KVCache
 }
 // NewRotatingKVCache returns an empty rotating cache limited to maxSize
 // positions, backed by a fresh KVCache.
 func NewRotatingKVCache(maxSize int) *RotatingKVCache {
 	rc := &RotatingKVCache{KVCache: NewKVCache()}
 	rc.maxSize = maxSize
 	return rc
 }
 // Update dispatches on the number of incoming positions (axis 2 of keys):
 // a single position goes through the in-place rotating update, anything
 // longer is concatenated.
 func (c *RotatingKVCache) Update(keys, values *mlx.Array) (*mlx.Array, *mlx.Array) {
 	if keys.Dim(2) <= 1 {
 		return c.update(keys, values)
 	}
 	return c.concat(keys, values)
 }
 // concat handles multi-position updates: it appends keys and values to the
 // buffers along axis 2 and returns the resulting full buffers.
 func (c *RotatingKVCache) concat(keys, values *mlx.Array) (newK *mlx.Array, newV *mlx.Array) {
 	slog.Debug("(*RotatingKVCache).concat", "keys_dim", keys.Dims(), "values_dim", values.Dims(), "offset", c.offset, "idx", c.idx, "max_size", c.maxSize)
 	if c.keys == nil {
 		// First update: the caller's arrays become the buffers directly.
 		c.keys, c.values = keys, values
 	} else {
 		// Drop anything past the write position before appending.
 		if c.idx < c.keys.Dim(2) {
 			c.keys.Set(c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.idx), mlx.Slice()))
 			c.values.Set(c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.idx), mlx.Slice()))
 		}
 		// Trim to max_size to maintain sliding window
 		if trim := c.idx - c.maxSize + 1; trim > 0 {
 			c.keys.Set(c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(trim, c.keys.Dim(2)), mlx.Slice()))
 			c.values.Set(c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(trim, c.values.Dim(2)), mlx.Slice()))
 		}
 		c.keys.Set(c.keys.Concatenate(2, keys))
 		c.values.Set(c.values.Concatenate(2, values))
 		// NOTE(review): this assignment is repeated unconditionally below.
 		c.idx = c.keys.Dim(2)
 	}
 	c.offset += keys.Dim(2)
 	// The next write position is the end of the (possibly new) buffer.
 	c.idx = c.keys.Dim(2)
 	return c.keys, c.values
 }
 // update handles the in-place path used for single-position updates: it grows
 // the buffers until they reach maxSize, then overwrites positions starting
 // again from index 0. It returns the first min(offset, maxSize) positions of
 // the buffers.
 func (c *RotatingKVCache) update(keys, values *mlx.Array) (*mlx.Array, *mlx.Array) {
 	slog.Debug("(*RotatingKVCache).update", "keys_dim", keys.Dims(), "values_dim", values.Dims(), "offset", c.offset, "idx", c.idx, "max_size", c.maxSize)
 	B, H, L, Dk, Dv := keys.Dim(0), keys.Dim(1), keys.Dim(2), keys.Dim(3), values.Dim(3)
 	prev := c.offset
 	// Grow buffer if not yet at max
 	if c.keys == nil || (prev >= c.keys.Dim(2) && c.keys.Dim(2) < c.maxSize) {
 		// Grow by one step, but never beyond maxSize in total.
 		newSize := min(c.step, c.maxSize-prev)
 		newKeys := mlx.Zeros(keys.DType(), B, H, newSize, Dk)
 		newValues := mlx.Zeros(values.DType(), B, H, newSize, Dv)
 		if c.keys != nil {
 			c.keys.Set(c.keys.Concatenate(2, newKeys))
 			c.values.Set(c.values.Concatenate(2, newValues))
 		} else {
 			c.keys, c.values = newKeys, newValues
 		}
 		// Writing continues where the previous data ended.
 		c.idx = prev
 	}
 	// Trim to max_size to maintain sliding window
 	if trim := c.keys.Dim(2) - c.maxSize; trim > 0 {
 		c.keys.Set(c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(trim, c.keys.Dim(2)), mlx.Slice()))
 		c.values.Set(c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(trim, c.values.Dim(2)), mlx.Slice()))
 		c.idx = c.maxSize
 	}
 	// Rotate when hitting max
 	if c.idx >= c.maxSize {
 		c.idx = 0
 	}
 	// Overwrite the slot(s) at the current write position.
 	c.keys.Set(c.keys.SliceUpdate(keys, mlx.Slice(), mlx.Slice(), mlx.Slice(c.idx, c.idx+L), mlx.Slice()))
 	c.values.Set(c.values.SliceUpdate(values, mlx.Slice(), mlx.Slice(), mlx.Slice(c.idx, c.idx+L), mlx.Slice()))
 	c.offset += L
 	c.idx += L
 	// Only positions that have been written at least once are returned.
 	validLen := min(c.offset, c.maxSize)
 	return c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, validLen), mlx.Slice()),
 		c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, validLen), mlx.Slice())
 }
 // State returns the buffers as they are once offset has reached their length
 // along axis 2; before that it returns only positions [0, offset).
 func (c *RotatingKVCache) State() (*mlx.Array, *mlx.Array) {
 	if c.offset >= c.keys.Dim(2) {
 		return c.keys, c.values
 	}
 	k := c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice())
 	v := c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice())
 	return k, v
 }
 // Trim moves both the offset and the write index back by n positions,
 // limited to the current offset, and returns the number removed.
 func (c *RotatingKVCache) Trim(n int) int {
 	if n > c.offset {
 		n = c.offset
 	}
 	c.offset, c.idx = c.offset-n, c.idx-n
 	return n
 }
 // Clone returns a new RotatingKVCache with the same window size and write
 // index, wrapping a clone of the embedded KVCache.
 func (c *RotatingKVCache) Clone() Cache {
 	inner := c.KVCache.Clone().(*KVCache)
 	dup := &RotatingKVCache{KVCache: inner}
 	dup.maxSize, dup.idx = c.maxSize, c.idx
 	return dup
 }
 // Len returns the number of positions held: the offset, capped at maxSize.
 func (c *RotatingKVCache) Len() int {
 	if c.maxSize < c.offset {
 		return c.maxSize
 	}
 	return c.offset
 }
--- a/x/mlxrunner/client.go
+++ b/x/mlxrunner/client.go
@@ -0,0 +1,174 @@
 package mlxrunner
 import (
 	"bufio"
 	"bytes"
 	"context"
 	"encoding/json"
 	"errors"
 	"math"
 	"net"
 	"net/http"
 	"net/url"
 	"os/exec"
 	"strconv"
 	"strings"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/ml"
 )
 // Client talks over HTTP on 127.0.0.1 to a runner listening on Port and
 // embeds the command used for that runner process.
 type Client struct {
 	// Port is the local TCP port requests are sent to.
 	Port int
 	// Cmd is the embedded command; Close kills its process.
 	*exec.Cmd
 }
 // JoinPath returns the http URL for path on 127.0.0.1 at the client's port.
 func (c *Client) JoinPath(path string) string {
 	base := url.URL{
 		Scheme: "http",
 		Host:   net.JoinHostPort("127.0.0.1", strconv.Itoa(c.Port)),
 	}
 	return base.JoinPath(path).String()
 }
 // CheckError turns a response with status code 400 or above into an error
 // carrying the response's status text; any other response yields nil.
 func (c *Client) CheckError(w *http.Response) error {
 	if w.StatusCode < 400 {
 		return nil
 	}
 	return errors.New(w.Status)
 }
 // Close implements llm.LlamaServer.
 //
 // It kills the runner process. A client whose command was never set or never
 // started has no process to kill, so Close returns nil for it instead of
 // dereferencing a nil pointer.
 func (c *Client) Close() error {
 	if c.Cmd == nil || c.Cmd.Process == nil {
 		return nil
 	}
 	return c.Cmd.Process.Kill()
 }
 // Completion implements llm.LlamaServer.
 //
 // It posts req as JSON to /v1/completions and calls fn once for every
 // newline-delimited JSON response read from the body. The request is bound to
 // ctx, so cancelling ctx aborts the call. It returns an error when the request
 // cannot be encoded or sent, when the status code is 400 or above, when a line
 // fails to decode, or when reading the stream fails.
 func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
 	var b bytes.Buffer
 	if err := json.NewEncoder(&b).Encode(req); err != nil {
 		return err
 	}
 	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.JoinPath("/v1/completions"), &b)
 	if err != nil {
 		return err
 	}
 	httpReq.Header.Set("Content-Type", "application/json")
 	w, err := http.DefaultClient.Do(httpReq)
 	if err != nil {
 		return err
 	}
 	defer w.Body.Close()
 	if err := c.CheckError(w); err != nil {
 		return err
 	}
 	scanner := bufio.NewScanner(w.Body)
 	for scanner.Scan() {
 		var resp llm.CompletionResponse
 		if err := json.Unmarshal(scanner.Bytes(), &resp); err != nil {
 			return err
 		}
 		fn(resp)
 	}
 	// Scan returns false on both EOF and read errors; only Err tells them
 	// apart, so a broken stream is not reported as a successful completion.
 	return scanner.Err()
 }
 // ContextLength always returns math.MaxInt.
 func (c *Client) ContextLength() int {
 	const length = math.MaxInt
 	return length
 }
 // Detokenize implements llm.LlamaServer.
 //
 // It is not implemented yet: calling it panics.
 func (c *Client) Detokenize(ctx context.Context, tokens []int) (string, error) {
 	panic("unimplemented")
 }
 // Embedding implements llm.LlamaServer.
 //
 // It is not implemented yet: calling it panics.
 func (c *Client) Embedding(ctx context.Context, input string) ([]float32, int, error) {
 	panic("unimplemented")
 }
 // GetDeviceInfos implements llm.LlamaServer.
 //
 // It is not implemented yet: calling it panics.
 func (c *Client) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
 	panic("unimplemented")
 }
 // GetPort implements llm.LlamaServer.
 //
 // It returns the port the client sends its requests to.
 func (c *Client) GetPort() int {
 	return c.Port
 }
 // HasExited implements llm.LlamaServer.
 //
 // It is not implemented yet: calling it panics.
 func (c *Client) HasExited() bool {
 	panic("unimplemented")
 }
 // Load implements llm.LlamaServer.
 //
 // It sends an empty POST to /v1/models, bound to ctx, and ignores the system
 // and device arguments. It returns an error when the request fails or the
 // runner answers with a status code of 400 or above; on success it returns
 // an empty device list.
 func (c *Client) Load(ctx context.Context, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) ([]ml.DeviceID, error) {
 	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.JoinPath("/v1/models"), nil)
 	if err != nil {
 		return nil, err
 	}
 	req.Header.Set("Content-Type", "application/json")
 	w, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err
 	}
 	defer w.Body.Close()
 	// A failed load must not be reported as success.
 	if err := c.CheckError(w); err != nil {
 		return nil, err
 	}
 	return []ml.DeviceID{}, nil
 }
 // ModelPath implements llm.LlamaServer.
 //
 // It is not implemented yet: calling it panics.
 func (c *Client) ModelPath() string {
 	panic("unimplemented")
 }
 // Pid implements llm.LlamaServer.
 //
 // It is not implemented yet: calling it panics.
 func (c *Client) Pid() int {
 	panic("unimplemented")
 }
 // Ping implements llm.LlamaServer.
 //
 // It issues a GET to /v1/status bound to ctx. It returns an error when the
 // request fails or the runner answers with a status code of 400 or above.
 func (c *Client) Ping(ctx context.Context) error {
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.JoinPath("/v1/status"), nil)
 	if err != nil {
 		return err
 	}
 	w, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return err
 	}
 	defer w.Body.Close()
 	return c.CheckError(w)
 }
 // Tokenize implements llm.LlamaServer.
 //
 // It posts content as text/plain to /v1/tokenize, bound to ctx, and decodes
 // the JSON array of token ids from the response. It returns an error when the
 // request fails, when the status code is 400 or above, or when the body
 // cannot be decoded.
 func (c *Client) Tokenize(ctx context.Context, content string) ([]int, error) {
 	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.JoinPath("/v1/tokenize"), strings.NewReader(content))
 	if err != nil {
 		return nil, err
 	}
 	req.Header.Set("Content-Type", "text/plain")
 	w, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err
 	}
 	defer w.Body.Close()
 	// Check the status first so an error body is never decoded as tokens.
 	if err := c.CheckError(w); err != nil {
 		return nil, err
 	}
 	var tokens []int
 	if err := json.NewDecoder(w.Body).Decode(&tokens); err != nil {
 		return nil, err
 	}
 	return tokens, nil
 }
 // TotalSize implements llm.LlamaServer.
 //
 // It is not implemented yet: calling it panics.
 func (c *Client) TotalSize() uint64 {
 	panic("unimplemented")
 }
 // VRAMByGPU implements llm.LlamaServer.
 //
 // It is not implemented yet: calling it panics.
 func (c *Client) VRAMByGPU(id ml.DeviceID) uint64 {
 	panic("unimplemented")
 }
 // VRAMSize implements llm.LlamaServer.
 //
 // It is not implemented yet: calling it panics.
 func (c *Client) VRAMSize() uint64 {
 	panic("unimplemented")
 }
 // WaitUntilRunning implements llm.LlamaServer.
 //
 // It is not implemented yet: calling it panics.
 func (c *Client) WaitUntilRunning(ctx context.Context) error {
 	panic("unimplemented")
 }
 // Compile-time check that *Client satisfies llm.LlamaServer.
 var _ llm.LlamaServer = (*Client)(nil)
--- a/x/mlxrunner/mlx/.gitignore
+++ b/x/mlxrunner/mlx/.gitignore
@@ -0,0 +1,3 @@
 _deps
 build
 dist
--- a/x/mlxrunner/mlx/CMakeLists.txt
+++ b/x/mlxrunner/mlx/CMakeLists.txt
@@ -0,0 +1,26 @@
 cmake_minimum_required(VERSION 3.5)
 project(mlx)
 if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
  set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/dist" CACHE PATH "" FORCE)
 endif()
 set(MLX_BUILD_GGUF OFF CACHE BOOL "" FORCE)
 set(MLX_BUILD_SAFETENSORS ON CACHE BOOL "" FORCE)
 set(MLX_C_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
 set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE)
 set(CMAKE_INSTALL_RPATH "@loader_path")
 include(FetchContent)
 set(MLX_C_GIT_TAG "v0.4.1" CACHE STRING "")
 FetchContent_Declare(
  mlx-c
  GIT_REPOSITORY "https://github.com/ml-explore/mlx-c.git"
  GIT_TAG ${MLX_C_GIT_TAG}
 )
 FetchContent_MakeAvailable(mlx-c)
--- a/x/mlxrunner/mlx/act.go
+++ b/x/mlxrunner/mlx/act.go
@@ -0,0 +1,23 @@
 //go:build mlx
 package mlx
 // #include "generated.h"
 import "C"
 import "math"
 // GELUApprox computes the tanh approximation of GELU,
 // 0.5 * t * (1 + tanh(sqrt(2/pi) * (t + 0.044715 * t^3))),
 // and casts the result back to t's dtype.
 func GELUApprox(t *Array) *Array {
 	half := t.Multiply(FromValue[float32](0.5))
 	cubic := t.Power(FromValue[float32](3.0)).Multiply(FromValue[float32](0.044715))
 	inner := t.Add(cubic).Multiply(FromValue(float32(math.Sqrt(2 / math.Pi))))
 	gate := inner.Tanh().Add(FromValue[float32](1.0))
 	return half.Multiply(gate).AsType(t.DType())
 }
 // SILU computes t * sigmoid(t) and casts the result back to t's dtype.
 func SILU(t *Array) *Array {
 	gate := t.Sigmoid()
 	out := t.Multiply(gate)
 	return out.AsType(t.DType())
 }
--- a/x/mlxrunner/mlx/array.go
+++ b/x/mlxrunner/mlx/array.go
@@ -0,0 +1,273 @@
 //go:build mlx
 package mlx
 // #include "generated.h"
 import "C"
 import (
 	"encoding/binary"
 	"fmt"
 	"log/slog"
 	"reflect"
 	"strings"
 	"time"
 	"unsafe"
 	"github.com/ollama/ollama/logutil"
 )
 // tensorDesc is the Go-side bookkeeping attached to every Array.
 type tensorDesc struct {
 	// name is the label given to New.
 	name    string
 	// inputs are the arrays this one was created from; Free walks them.
 	inputs  []*Array
 	// numRefs counts how many times this array was recorded as an input
 	// (by New or Set); Free decrements it.
 	numRefs int
 }
 // LogValue renders the descriptor as a group of its name, input count and
 // reference count for structured logging.
 func (d tensorDesc) LogValue() slog.Value {
 	attrs := []slog.Attr{
 		slog.String("name", d.name),
 		slog.Int("inputs", len(d.inputs)),
 		slog.Int("num_refs", d.numRefs),
 	}
 	return slog.GroupValue(attrs...)
 }
 // Array pairs an mlx_array handle with the Go-side descriptor used for
 // logging and for reference tracking in Free.
 type Array struct {
 	// ctx is the underlying C handle; Valid reports whether it is set.
 	ctx  C.mlx_array
 	// desc holds the name, inputs and reference count.
 	desc tensorDesc
 }
 // constructor utilities
 // New returns an Array with no C handle yet, labelled name and recording
 // inputs. Each input's reference count is incremented.
 func New(name string, inputs ...*Array) *Array {
 	for i := range inputs {
 		inputs[i].desc.numRefs++
 	}
 	t := &Array{desc: tensorDesc{name: name, inputs: inputs}}
 	logutil.Trace("New", "t", t)
 	return t
 }
 // scalarTypes constrains the Go scalar types accepted by FromValue.
 type scalarTypes interface {
 	~bool | ~int | ~float32 | ~float64 | ~complex64
 }
 // FromValue creates a scalar Array from t.
 //
 // The value is dispatched on its reflect.Kind rather than its exact type, so
 // named types admitted by the scalarTypes constraint (for example a
 // user-defined "type Seconds float32") are handled like their underlying
 // type instead of panicking. It panics for kinds outside the constraint.
 func FromValue[T scalarTypes](t T) *Array {
 	tt := New("")
 	switch v := reflect.ValueOf(t); v.Kind() {
 	case reflect.Bool:
 		tt.ctx = C.mlx_array_new_bool(C.bool(v.Bool()))
 	case reflect.Int:
 		tt.ctx = C.mlx_array_new_int(C.int(v.Int()))
 	case reflect.Float32:
 		// v.Float() widens to float64 exactly, so narrowing back is lossless.
 		tt.ctx = C.mlx_array_new_float32(C.float(v.Float()))
 	case reflect.Float64:
 		tt.ctx = C.mlx_array_new_float64(C.double(v.Float()))
 	case reflect.Complex64:
 		c := v.Complex()
 		tt.ctx = C.mlx_array_new_complex(C.float(real(c)), C.float(imag(c)))
 	default:
 		panic("unsupported type")
 	}
 	return tt
 }
 // arrayTypes constrains the element types accepted by FromValues.
 type arrayTypes interface {
 	~bool | ~uint8 | ~uint16 | ~uint32 | ~uint64 |
 		~int8 | ~int16 | ~int32 | ~int64 |
 		~float32 | ~float64 |
 		~complex64
 }
 // FromValues creates an Array with the given shape from the elements of s.
 // The dtype is chosen from the element kind of s. It panics when no shape is
 // given, when the element kind is unsupported, or when encoding s fails.
 func FromValues[S ~[]E, E arrayTypes](s S, shape ...int) *Array {
 	if len(shape) == 0 {
 		panic("shape must be provided for non-scalar tensors")
 	}
 	cShape := make([]C.int, len(shape))
 	for i := range shape {
 		cShape[i] = C.int(shape[i])
 	}
 	// Map the element kind to the matching dtype; using Kind means named
 	// element types resolve to their underlying type.
 	var dtype DType
 	switch reflect.TypeOf(s).Elem().Kind() {
 	case reflect.Bool:
 		dtype = DTypeBool
 	case reflect.Uint8:
 		dtype = DTypeUint8
 	case reflect.Uint16:
 		dtype = DTypeUint16
 	case reflect.Uint32:
 		dtype = DTypeUint32
 	case reflect.Uint64:
 		dtype = DTypeUint64
 	case reflect.Int8:
 		dtype = DTypeInt8
 	case reflect.Int16:
 		dtype = DTypeInt16
 	case reflect.Int32:
 		dtype = DTypeInt32
 	case reflect.Int64:
 		dtype = DTypeInt64
 	case reflect.Float32:
 		dtype = DTypeFloat32
 	case reflect.Float64:
 		dtype = DTypeFloat64
 	case reflect.Complex64:
 		dtype = DTypeComplex64
 	default:
 		panic("unsupported type")
 	}
 	// Serialize the elements to little-endian bytes for the C constructor.
 	bts := make([]byte, binary.Size(s))
 	if _, err := binary.Encode(bts, binary.LittleEndian, s); err != nil {
 		panic(err)
 	}
 	tt := New("")
 	// NOTE(review): &bts[0] panics when s is empty — confirm callers never
 	// pass a zero-length slice.
 	tt.ctx = C.mlx_array_new_data(unsafe.Pointer(&bts[0]), unsafe.SliceData(cShape), C.int(len(cShape)), C.mlx_dtype(dtype))
 	return tt
 }
 // Set makes t refer to other's contents via mlx_array_set. It increments
 // other's reference count and replaces t's recorded inputs with other alone.
 func (t *Array) Set(other *Array) {
 	other.desc.numRefs++
 	t.desc.inputs = []*Array{other}
 	C.mlx_array_set(&t.ctx, other.ctx)
 }
 // Clone returns a new Array with t's name and inputs whose handle is set from
 // t's handle via mlx_array_set. Because it goes through New, the reference
 // count of each of t's inputs is incremented.
 func (t *Array) Clone() *Array {
 	tt := New(t.desc.name, t.desc.inputs...)
 	C.mlx_array_set(&tt.ctx, t.ctx)
 	return tt
 }
 // misc. utilities
 // Valid reports whether the array currently has a C handle; Free clears it.
 func (t *Array) Valid() bool {
 	if t.ctx.ctx == nil {
 		return false
 	}
 	return true
 }
 // String returns the array's textual form as produced by mlx_array_tostring,
 // with surrounding whitespace removed.
 func (t *Array) String() string {
 	str := C.mlx_string_new()
 	// Free through a closure so the handle is read when the function returns.
 	// Arguments of a plain deferred call are evaluated at the defer statement,
 	// which would free the handle as it was before mlx_array_tostring filled
 	// it in through &str.
 	defer func() { C.mlx_string_free(str) }()
 	C.mlx_array_tostring(&str, t.ctx)
 	return strings.TrimSpace(C.GoString(C.mlx_string_data(str)))
 }
 // LogValue renders the array for structured logging: always the descriptor,
 // plus dtype, shape and byte size when the array has a C handle.
 func (t *Array) LogValue() slog.Value {
 	if !t.Valid() {
 		return slog.GroupValue(slog.Any("", t.desc))
 	}
 	return slog.GroupValue(
 		slog.Any("", t.desc),
 		slog.Any("dtype", t.DType()),
 		slog.Any("shape", t.Dims()),
 		slog.Int("num_bytes", t.NumBytes()),
 	)
 }
 // shape utilities
 // Size returns the value reported by mlx_array_size for this array.
 func (t Array) Size() int {
 	n := C.mlx_array_size(t.ctx)
 	return int(n)
 }
 // NumBytes returns the value reported by mlx_array_nbytes for this array.
 func (t Array) NumBytes() int {
 	n := C.mlx_array_nbytes(t.ctx)
 	return int(n)
 }
 // NumDims returns the value reported by mlx_array_ndim for this array.
 func (t Array) NumDims() int {
 	n := C.mlx_array_ndim(t.ctx)
 	return int(n)
 }
 // Dims returns the size of every dimension, in axis order.
 func (t Array) Dims() []int {
 	n := t.NumDims()
 	shape := make([]int, 0, n)
 	for axis := 0; axis < n; axis++ {
 		shape = append(shape, t.Dim(axis))
 	}
 	return shape
 }
 // Dim returns the value reported by mlx_array_dim for axis dim.
 func (t Array) Dim(dim int) int {
 	axis := C.int(dim)
 	return int(C.mlx_array_dim(t.ctx, axis))
 }
 // DType returns the array's element type as reported by mlx_array_dtype.
 func (t Array) DType() DType {
 	raw := C.mlx_array_dtype(t.ctx)
 	return DType(raw)
 }
 // data utilities
 // Int reads the array's single item through mlx_array_item_int64 and returns
 // it as an int. The status of the C call is not inspected.
 func (t Array) Int() int {
 	var value C.int64_t
 	C.mlx_array_item_int64(&value, t.ctx)
 	return int(value)
 }
 // Float reads the array's single item through mlx_array_item_float64. The
 // status of the C call is not inspected.
 func (t Array) Float() float64 {
 	var value C.double
 	C.mlx_array_item_float64(&value, t.ctx)
 	return float64(value)
 }
 // Ints copies the array's data, read through mlx_array_data_int32, into a new
 // []int with one entry per element.
 func (t Array) Ints() []int {
 	out := make([]int, t.Size())
 	src := unsafe.Slice(C.mlx_array_data_int32(t.ctx), len(out))
 	for i := range src {
 		out[i] = int(src[i])
 	}
 	return out
 }
 // Floats copies the array's data, read through mlx_array_data_float32, into a
 // new []float32 with one entry per element.
 func (t Array) Floats() []float32 {
 	out := make([]float32, t.Size())
 	src := unsafe.Slice(C.mlx_array_data_float32(t.ctx), len(out))
 	for i := range src {
 		out[i] = float32(src[i])
 	}
 	return out
 }
 // Save writes the array to the file called name through mlx_save. It returns
 // an error when the C call reports a non-zero status, instead of always
 // reporting success.
 func (t Array) Save(name string) error {
 	cName := C.CString(name)
 	defer C.free(unsafe.Pointer(cName))
 	if rc := C.mlx_save(cName, t.ctx); rc != 0 {
 		return fmt.Errorf("saving array to %q: mlx_save returned status %d", name, int(rc))
 	}
 	return nil
 }
 // Free decrements the reference count of each given array and releases the C
 // handle of every array whose count drops to zero or below, then repeats the
 // same for all recorded inputs, transitively. It returns the total number of
 // bytes of the arrays it released.
 func Free(s ...*Array) (n int) {
 	now := time.Now()
 	// Log a summary only when something was actually released.
 	defer func() {
 		if n > 0 {
 			logutil.Trace("Freed tensors", "num_bytes", PrettyBytes(n), "took", time.Since(now))
 		}
 	}()
 	// free is the work list of inputs still to visit.
 	free := make([]*Array, 0, 8192)
 	fn := func(t *Array) {
 		if t.Valid() {
 			// NOTE(review): inputs are queued even when t itself stays alive
 			// (numRefs still positive), so their counts drop as well — confirm
 			// this is intended.
 			free = append(free, t.desc.inputs...)
 			t.desc.numRefs--
 			if t.desc.numRefs <= 0 {
 				logutil.Trace("Free", "t", t)
 				n += t.NumBytes()
 				C.mlx_array_free(t.ctx)
 				// Mark the array invalid so it is skipped if reached again.
 				t.ctx.ctx = nil
 			}
 		}
 	}
 	for _, t := range s {
 		fn(t)
 	}
 	// Drain the work list from the tail; fn may append more inputs to it.
 	for len(free) > 0 {
 		tail := free[len(free)-1]
 		free = free[:len(free)-1]
 		fn(tail)
 	}
 	return n
 }
--- a/x/mlxrunner/mlx/array_test.go
+++ b/x/mlxrunner/mlx/array_test.go
@@ -0,0 +1,45 @@
 //go:build mlx
 package mlx
 import "testing"
 // TestFromValue checks that FromValue picks the expected dtype for each
 // supported scalar type. Cases live in a slice so they run in a fixed order,
 // and a failure reports the dtype that was produced rather than the array.
 func TestFromValue(t *testing.T) {
 	cases := []struct {
 		got  *Array
 		want DType
 	}{
 		{FromValue(true), DTypeBool},
 		{FromValue(false), DTypeBool},
 		{FromValue(int(7)), DTypeInt32},
 		{FromValue(float32(3.14)), DTypeFloat32},
 		{FromValue(float64(2.71)), DTypeFloat64},
 		{FromValue(complex64(1 + 2i)), DTypeComplex64},
 	}
 	for _, tc := range cases {
 		t.Run(tc.want.String(), func(t *testing.T) {
 			if got := tc.got.DType(); got != tc.want {
 				t.Errorf("want %v, got %v", tc.want, got)
 			}
 		})
 	}
 }
 // TestFromValues checks that FromValues picks the expected dtype for each
 // supported element type. Cases live in a slice so they run in a fixed order,
 // and a failure reports the dtype that was produced rather than the array.
 func TestFromValues(t *testing.T) {
 	cases := []struct {
 		got  *Array
 		want DType
 	}{
 		{FromValues([]bool{true, false, true}, 3), DTypeBool},
 		{FromValues([]uint8{1, 2, 3}, 3), DTypeUint8},
 		{FromValues([]uint16{1, 2, 3}, 3), DTypeUint16},
 		{FromValues([]uint32{1, 2, 3}, 3), DTypeUint32},
 		{FromValues([]uint64{1, 2, 3}, 3), DTypeUint64},
 		{FromValues([]int8{-1, -2, -3}, 3), DTypeInt8},
 		{FromValues([]int16{-1, -2, -3}, 3), DTypeInt16},
 		{FromValues([]int32{-1, -2, -3}, 3), DTypeInt32},
 		{FromValues([]int64{-1, -2, -3}, 3), DTypeInt64},
 		{FromValues([]float32{3.14, 2.71, 1.61}, 3), DTypeFloat32},
 		{FromValues([]float64{3.14, 2.71, 1.61}, 3), DTypeFloat64},
 		{FromValues([]complex64{1 + 2i, 3 + 4i, 5 + 6i}, 3), DTypeComplex64},
 	}
 	for _, tc := range cases {
 		t.Run(tc.want.String(), func(t *testing.T) {
 			if got := tc.got.DType(); got != tc.want {
 				t.Errorf("want %v, got %v", tc.want, got)
 			}
 		})
 	}
 }
--- a/x/mlxrunner/mlx/dtype.go
+++ b/x/mlxrunner/mlx/dtype.go
@@ -0,0 +1,96 @@
 //go:build mlx
 package mlx
 // #include "generated.h"
 import "C"
 // DType identifies an array element type; its values are the C MLX_* dtype
 // constants.
 type DType int
 // String returns the short upper-case name of the dtype (for example "F32"
 // or "BF16"), or "Unknown" for a value that is not one of the DType constants.
 func (t DType) String() string {
 	switch t {
 	case DTypeBool:
 		return "BOOL"
 	case DTypeUint8:
 		return "U8"
 	case DTypeUint16:
 		return "U16"
 	case DTypeUint32:
 		return "U32"
 	case DTypeUint64:
 		return "U64"
 	case DTypeInt8:
 		return "I8"
 	case DTypeInt16:
 		return "I16"
 	case DTypeInt32:
 		return "I32"
 	case DTypeInt64:
 		return "I64"
 	case DTypeFloat16:
 		return "F16"
 	case DTypeFloat32:
 		return "F32"
 	case DTypeFloat64:
 		return "F64"
 	case DTypeBFloat16:
 		return "BF16"
 	case DTypeComplex64:
 		return "C64"
 	default:
 		return "Unknown"
 	}
 }
 // UnmarshalJSON decodes a JSON string holding one of the names produced by
 // String. The comparison is against the raw bytes, quotes included. An
 // unrecognised value leaves *t unchanged and is not reported as an error.
 func (t *DType) UnmarshalJSON(b []byte) error {
 	switch string(b) {
 	case `"BOOL"`:
 		*t = DTypeBool
 	case `"U8"`:
 		*t = DTypeUint8
 	case `"U16"`:
 		*t = DTypeUint16
 	case `"U32"`:
 		*t = DTypeUint32
 	case `"U64"`:
 		*t = DTypeUint64
 	case `"I8"`:
 		*t = DTypeInt8
 	case `"I16"`:
 		*t = DTypeInt16
 	case `"I32"`:
 		*t = DTypeInt32
 	case `"I64"`:
 		*t = DTypeInt64
 	case `"F16"`:
 		*t = DTypeFloat16
 	case `"F64"`:
 		*t = DTypeFloat64
 	case `"F32"`:
 		*t = DTypeFloat32
 	case `"BF16"`:
 		*t = DTypeBFloat16
 	case `"C64"`:
 		*t = DTypeComplex64
 	default:
 		// NOTE(review): unknown names are silently accepted — confirm this
 		// best-effort behaviour is intended.
 		return nil
 	}
 	return nil
 }
 // The DType constants take their values directly from the C MLX_* dtype
 // constants.
 const (
 	DTypeBool      DType = C.MLX_BOOL
 	DTypeUint8     DType = C.MLX_UINT8
 	DTypeUint16    DType = C.MLX_UINT16
 	DTypeUint32    DType = C.MLX_UINT32
 	DTypeUint64    DType = C.MLX_UINT64
 	DTypeInt8      DType = C.MLX_INT8
 	DTypeInt16     DType = C.MLX_INT16
 	DTypeInt32     DType = C.MLX_INT32
 	DTypeInt64     DType = C.MLX_INT64
 	DTypeFloat16   DType = C.MLX_FLOAT16
 	DTypeFloat32   DType = C.MLX_FLOAT32
 	DTypeFloat64   DType = C.MLX_FLOAT64
 	DTypeBFloat16  DType = C.MLX_BFLOAT16
 	DTypeComplex64 DType = C.MLX_COMPLEX64
 )
--- a/x/mlxrunner/mlx/dynamic.c
+++ b/x/mlxrunner/mlx/dynamic.c
@@ -0,0 +1,34 @@
 #include "dynamic.h"
 #include <stdio.h>
 // Platform shims: DLOPEN(path) opens a shared library and DLCLOSE(handle)
 // releases it. Windows maps them to LoadLibraryA/FreeLibrary; every other
 // platform uses dlopen (lazy binding, global symbol visibility) and dlclose.
 #ifdef _WIN32
 #include <windows.h>
 #define DLOPEN(path) LoadLibraryA(path)
 #define DLCLOSE(handle) FreeLibrary((HMODULE)(handle))
 #else
 #ifdef __APPLE__
 // NOTE(review): nothing from these two headers is referenced in this file.
 #include <mach-o/dyld.h>
 #include <libgen.h>
 #endif
 #include <dlfcn.h>
 #define DLOPEN(path) dlopen(path, RTLD_LAZY | RTLD_GLOBAL)
 #define DLCLOSE(handle) dlclose(handle)
 #endif
 // Opens the shared library at `path` and stores the resulting handle in
 // handle->ctx. Returns 0 on success. When the library cannot be opened,
 // CHECK prints a diagnostic to stderr and makes this function return 1.
 static int mlx_dynamic_open(mlx_dynamic_handle* handle, const char* path) {
    handle->ctx = (void*) DLOPEN(path);
    CHECK(handle->ctx != NULL);
    return 0;
 }
 // Public entry point for loading the library at `path` into `handle`.
 // Forwards to mlx_dynamic_open and returns its result (0 on success,
 // non-zero on failure).
 int mlx_dynamic_load(mlx_dynamic_handle* handle, const char *path) {
    return mlx_dynamic_open(handle, path);
 }
 // Closes the library held by `handle`, if any, and clears handle->ctx so a
 // second call is a no-op.
 void mlx_dynamic_unload(mlx_dynamic_handle* handle) {
    if (!handle->ctx) {
        return;
    }
    DLCLOSE(handle->ctx);
    handle->ctx = NULL;
 }
--- a/x/mlxrunner/mlx/dynamic.go
+++ b/x/mlxrunner/mlx/dynamic.go
@@ -0,0 +1,65 @@
 //go:build mlx
 package mlx
 // #include "dynamic.h"
 // #include "generated.h"
 // #include <stdlib.h>
 import "C"
 import (
 	"io/fs"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"runtime"
 	"unsafe"
 )
 // init loads the MLX C shared library and resolves its symbols when the
 // package is imported.
 //
 // It only does work on darwin and windows; on any other GOOS it returns
 // immediately. Every directory listed in OLLAMA_LIBRARY_PATH is searched for
 // files matching "libmlxc.*", and the first candidate that both opens and
 // resolves all generated symbols is kept. If the variable is unset nothing
 // is loaded. If it is set but no candidate can be loaded, init panics.
 func init() {
 	switch runtime.GOOS {
 	case "darwin", "windows":
 	default:
 		return
 	}
 	paths, ok := os.LookupEnv("OLLAMA_LIBRARY_PATH")
 	if !ok {
 		slog.Debug("OLLAMA_LIBRARY_PATH not set, skipping mlx dynamic loading")
 		return
 	}
 	for _, dir := range filepath.SplitList(paths) {
 		matches, err := fs.Glob(os.DirFS(dir), "libmlxc.*")
 		if err != nil {
 			panic(err)
 		}
 		for _, match := range matches {
 			// Join with the directory being scanned, not the whole list:
 			// the list may hold several entries separated by the OS list
 			// separator, which would yield a path that does not exist.
 			path := filepath.Join(dir, match)
 			slog.Info("Loading MLX dynamic library", "path", path)
 			var handle C.mlx_dynamic_handle
 			// The C string is only needed for the duration of the load
 			// call, so release it right away instead of deferring inside
 			// the loop.
 			cPath := C.CString(path)
 			rc := C.mlx_dynamic_load(&handle, cPath)
 			C.free(unsafe.Pointer(cPath))
 			if rc != 0 {
 				slog.Error("Failed to load MLX dynamic library", "path", path)
 				continue
 			}
 			if C.mlx_dynamic_load_symbols(handle) != 0 {
 				slog.Error("Failed to load MLX dynamic library symbols", "path", path)
 				C.mlx_dynamic_unload(&handle)
 				continue
 			}
 			slog.Info("Loaded MLX dynamic library", "path", path)
 			return
 		}
 	}
 	panic("Failed to load any MLX dynamic library")
 }
--- a/x/mlxrunner/mlx/dynamic.h
+++ b/x/mlxrunner/mlx/dynamic.h
@@ -0,0 +1,41 @@
 #ifndef MLX_DYNAMIC_H
 #define MLX_DYNAMIC_H
 // DLSYM(handle, symbol) looks up `symbol` in the library referenced by an
 // mlx_dynamic_handle passed BY VALUE (this is how CHECK_LOAD invokes it), so
 // both platform variants must go through the handle's ctx member.
 #ifdef _WIN32
 #include <windows.h>
 #define DLSYM(handle, symbol) GetProcAddress((HMODULE)(handle.ctx), symbol)
 #else
 #include <dlfcn.h>
 #define DLSYM(handle, symbol) dlsym(handle.ctx, symbol)
 #endif
 #include <stdint.h>
 // Provide fallback typedefs for float16_t and bfloat16_t on non-ARM64
 // platforms where arm_fp16.h and arm_bf16.h are not available. These are
 // only used as function pointer signature placeholders since MLX requires
 // Apple Silicon at runtime.
 #if !defined(__aarch64__) && !defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
 typedef uint16_t float16_t;
 #endif
 #if !defined(__aarch64__) && !defined(__ARM_FEATURE_BF16)
 typedef uint16_t bfloat16_t;
 #endif
 // ERROR prints a formatted diagnostic to stderr and then executes
 // `return 1`, so it may only be used inside functions that return int.
 // The including file must also include <stdio.h> for fprintf.
 #define ERROR(fmt, ...) fprintf(stderr, "%s %s - ERROR - %s:%d - " fmt "\n", __DATE__, __TIME__, __FILE__, __LINE__, ##__VA_ARGS__); return 1
 // CHECK(x) reports the stringified expression and returns 1 when x is false.
 #define CHECK(x) if (!(x)) { ERROR("CHECK failed: " #x); }
 // CHECK_LOAD(handle, x) resolves symbol `x` into the function pointer `x_`
 // and returns 1 from the enclosing function if the lookup yields NULL.
 #define CHECK_LOAD(handle, x) x##_ = DLSYM(handle, #x); CHECK(x##_)
 // Wrapper around the platform library handle; ctx is NULL when no library
 // is loaded.
 typedef struct {
    void* ctx;
 } mlx_dynamic_handle;
 // Opens the shared library at `path` into `handle`. Returns 0 on success.
 int mlx_dynamic_load(
    mlx_dynamic_handle* handle,
    const char *path);
 // Closes the library held by `handle` (if any) and clears handle->ctx.
 void mlx_dynamic_unload(
    mlx_dynamic_handle* handle);
 #endif // MLX_DYNAMIC_H
--- a/x/mlxrunner/mlx/fast.go
+++ b/x/mlxrunner/mlx/fast.go
@@ -0,0 +1,74 @@
 //go:build mlx
 package mlx
 // #include "generated.h"
 import "C"
 import (
 	"unsafe"
 )
 // ScaledDotProductAttention calls mlx_fast_scaled_dot_product_attention for
 // query, key and value with the given scale. A nil mask is replaced by an
 // empty array, and an empty array is always passed for the sinks argument.
 //
 // NOTE(review): the mask mode string is always "causal", even when an
 // explicit mask is supplied - confirm that this is intended.
 func ScaledDotProductAttention(query, key, value, mask *Array, scale float32) *Array {
 	cMode := C.CString("causal")
 	defer C.free(unsafe.Pointer(cMode))
 	if mask == nil {
 		mask = New("")
 	}
 	sinks := New("")
 	result := New("FAST_SDPA", query, key, value, mask, sinks)
 	C.mlx_fast_scaled_dot_product_attention(
 		&result.ctx,
 		query.ctx, key.ctx, value.ctx,
 		C.float(scale),
 		cMode,
 		mask.ctx, sinks.ctx,
 		DefaultStream().ctx,
 	)
 	return result
 }
 // LayerNorm holds the parameters of a layer-normalisation layer. The
 // `weight` struct tags name the tensors ("weight" and "bias") that populate
 // the fields.
 type LayerNorm struct {
 	Weight Array `weight:"weight"`
 	Bias   Array `weight:"bias"`
 }
 // Forward applies mlx_fast_layer_norm to x using the layer's Weight and Bias
 // and the given eps, on the stream returned by DefaultStream.
 func (r *LayerNorm) Forward(x *Array, eps float32) *Array {
 	out := New("FAST_LAYERNORM", x)
 	C.mlx_fast_layer_norm(&out.ctx, x.ctx, r.Weight.ctx, r.Bias.ctx, C.float(eps), DefaultStream().ctx)
 	return out
 }
 // RMSNorm holds the parameter of an RMS-normalisation layer. The `weight`
 // struct tag names the tensor ("weight") that populates the field.
 type RMSNorm struct {
 	Weight Array `weight:"weight"`
 }
 // Forward applies mlx_fast_rms_norm to x using the layer's Weight and the
 // given eps, on the stream returned by DefaultStream.
 func (r RMSNorm) Forward(x *Array, eps float32) *Array {
 	out := New("FAST_RMSNORM", x)
 	C.mlx_fast_rms_norm(&out.ctx, x.ctx, r.Weight.ctx, C.float(eps), DefaultStream().ctx)
 	return out
 }
 // RoPE bundles the arguments forwarded to mlx_fast_rope. Base is read from
 // the "rope_theta" JSON key.
 type RoPE struct {
 	Dims        int
 	Traditional bool
 	Base        float32 `json:"rope_theta"`
 	Scale       float32
 }
 // Forward applies mlx_fast_rope to t at the given position offset. Base is
 // passed as an optional value that is only marked present when it is
 // non-zero, and an empty array is passed for the frequencies argument.
 func (r RoPE) Forward(t *Array, offset int) *Array {
 	base := C.mlx_optional_float{
 		value:     C.float(r.Base),
 		has_value: C._Bool(r.Base != 0),
 	}
 	freqs := New("")
 	result := New("FAST_ROPE", t, freqs)
 	C.mlx_fast_rope(
 		&result.ctx,
 		t.ctx,
 		C.int(r.Dims),
 		C._Bool(r.Traditional),
 		base,
 		C.float(r.Scale),
 		C.int(offset),
 		freqs.ctx,
 		DefaultStream().ctx,
 	)
 	return result
 }
--- a/x/mlxrunner/mlx/generated.c
+++ b/x/mlxrunner/mlx/generated.c
--- a/x/mlxrunner/mlx/generated.h
+++ b/x/mlxrunner/mlx/generated.h
--- a/x/mlxrunner/mlx/generator/generated.c.gotmpl
+++ b/x/mlxrunner/mlx/generator/generated.c.gotmpl
@@ -0,0 +1,17 @@
 // This code is auto-generated; DO NOT EDIT.
 #include "generated.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 {{- /* Emit one NULL-initialised function pointer, named <Name>_, for every parsed function. */}}
 {{ range .Functions }}
 {{ .Type }} (*{{ .Name }}_){{ .Parameters }} = NULL;
 {{- end }}
 {{- /* The loader resolves each pointer via CHECK_LOAD and returns 0 only when every lookup succeeded. */}}
 int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
 {{- range .Functions }}
    CHECK_LOAD(handle, {{ .Name }});
 {{- end }}
    return 0;
 }
--- a/x/mlxrunner/mlx/generator/generated.h.gotmpl
+++ b/x/mlxrunner/mlx/generator/generated.h.gotmpl
@@ -0,0 +1,22 @@
 // This code is auto-generated; DO NOT EDIT.
 #ifndef MLX_GENERATED_H
 #define MLX_GENERATED_H
 #include "dynamic.h"
 #include "mlx/c/mlx.h"
 {{- /* Remove any macro definitions carrying the MLX function names before the wrappers below reuse them. */}}
 {{ range .Functions }}
 #undef {{ .Name }}
 {{- end }}
 {{- /* Declare the <Name>_ function pointers that the generated.c template defines. */}}
 {{ range .Functions }}
 extern {{ .Type }} (*{{ .Name }}_){{ .Parameters }};
 {{- end }}
 int mlx_dynamic_load_symbols(mlx_dynamic_handle handle);
 {{- /* Emit an inline wrapper per function that keeps the original name and forwards to the <Name>_ pointer. */}}
 {{ range .Functions }}
 static inline {{ .Type }} {{ .Name }}{{ .Parameters }} {{ "{" }}
    return {{ .Name }}_({{ .Args }});
 {{ "}" }}
 {{- end }}
 #endif // MLX_GENERATED_H
--- a/x/mlxrunner/mlx/generator/main.go
+++ b/x/mlxrunner/mlx/generator/main.go
@@ -0,0 +1,135 @@
 package main
 import (
 	"embed"
 	"flag"
 	"fmt"
 	"os"
 	"path/filepath"
 	"slices"
 	"strings"
 	"text/template"
 	tree_sitter "github.com/tree-sitter/go-tree-sitter"
 	tree_sitter_cpp "github.com/tree-sitter/tree-sitter-cpp/bindings/go"
 )
 // fsys holds the *.gotmpl template files from this directory, embedded into
 // the generator binary at build time.
 //go:embed *.gotmpl
 var fsys embed.FS
 // Function is the template model for one C function declaration found in
 // the parsed headers.
 type Function struct {
 	// Type is the text that precedes the declarator (the return type).
 	Type string
 	// Name is the function identifier.
 	Name string
 	// Parameters is the parameter list exactly as written in the source.
 	Parameters string
 	// Args is the comma-separated list of parameter names.
 	Args string
 }
 // ParseFunction builds a Function from a function_declarator node.
 //
 // The name comes from the node's "declarator" field, and the parameter text
 // and argument names from its "parameters" field when present. The return
 // type is reconstructed by walking up to the child of the enclosing
 // "declaration" node (adding "*" for each pointer_declarator passed on the
 // way) and then joining the text of every preceding sibling with spaces.
 func ParseFunction(node *tree_sitter.Node, tc *tree_sitter.TreeCursor, source []byte) Function {
 	fn := Function{Name: node.ChildByFieldName("declarator").Utf8Text(source)}
 	if params := node.ChildByFieldName("parameters"); params != nil {
 		fn.Parameters = params.Utf8Text(source)
 		fn.Args = ParseParameters(params, tc, source)
 	}
 	// Pieces are gathered innermost-first and reversed before joining.
 	var parts []string
 	for {
 		parent := node.Parent()
 		if parent == nil || parent.Kind() == "declaration" {
 			break
 		}
 		if parent.Kind() == "pointer_declarator" {
 			parts = append(parts, "*")
 		}
 		node = parent
 	}
 	for prev := node.PrevSibling(); prev != nil; prev = prev.PrevSibling() {
 		parts = append(parts, prev.Utf8Text(source))
 	}
 	slices.Reverse(parts)
 	fn.Type = strings.Join(parts, " ")
 	return fn
 }
 // ParseParameters returns the parameter names of a parameter list node as
 // a ", "-separated string.
 //
 // For every named child it follows the chain of "declarator" fields, taking
 // child 1 of any parenthesized_declarator, until it reaches an identifier.
 // Parameters for which no identifier is found are omitted.
 func ParseParameters(node *tree_sitter.Node, tc *tree_sitter.TreeCursor, source []byte) string {
 	var names []string
 	for _, param := range node.Children(tc) {
 		if !param.IsNamed() {
 			continue
 		}
 		decl := param.ChildByFieldName("declarator")
 		for decl != nil && decl.Kind() != "identifier" {
 			if decl.Kind() == "parenthesized_declarator" {
 				decl = decl.Child(1)
 				continue
 			}
 			decl = decl.ChildByFieldName("declarator")
 		}
 		if decl != nil {
 			names = append(names, decl.Utf8Text(source))
 		}
 	}
 	return strings.Join(names, ", ")
 }
 // main parses the header files named on the command line, collects every
 // function declarator found in them, and renders each embedded *.gotmpl
 // template into the -output directory (the ".gotmpl" suffix is stripped
 // from the file name).
 //
 // Unreadable inputs and per-template failures are reported on stderr and
 // skipped. A query that fails to compile is fatal, because nothing can be
 // extracted without it.
 func main() {
 	var output string
 	flag.StringVar(&output, "output", ".", "Output directory for generated files")
 	flag.Parse()
 	parser := tree_sitter.NewParser()
 	defer parser.Close()
 	language := tree_sitter.NewLanguage(tree_sitter_cpp.Language())
 	parser.SetLanguage(language)
 	// The query error used to be discarded, which left a nil query to be
 	// closed and executed below.
 	query, qerr := tree_sitter.NewQuery(language, `(function_declarator declarator: (identifier)) @func`)
 	if qerr != nil {
 		fmt.Fprintf(os.Stderr, "Error compiling query: %v\n", qerr)
 		os.Exit(1)
 	}
 	defer query.Close()
 	qc := tree_sitter.NewQueryCursor()
 	defer qc.Close()
 	var funs []Function
 	for _, arg := range flag.Args() {
 		bts, err := os.ReadFile(arg)
 		if err != nil {
 			fmt.Fprintf(os.Stderr, "Error reading file %s: %v\n", arg, err)
 			continue
 		}
 		tree := parser.Parse(bts, nil)
 		defer tree.Close()
 		tc := tree.Walk()
 		defer tc.Close()
 		matches := qc.Matches(query, tree.RootNode(), bts)
 		for match := matches.Next(); match != nil; match = matches.Next() {
 			for _, capture := range match.Captures {
 				funs = append(funs, ParseFunction(&capture.Node, tc, bts))
 			}
 		}
 	}
 	tmpl, err := template.New("").ParseFS(fsys, "*.gotmpl")
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "Error parsing template: %v\n", err)
 		return
 	}
 	for _, tmpl := range tmpl.Templates() {
 		name := filepath.Join(output, strings.TrimSuffix(tmpl.Name(), ".gotmpl"))
 		fmt.Println("Generating", name)
 		f, err := os.Create(name)
 		if err != nil {
 			fmt.Fprintf(os.Stderr, "Error creating file %s: %v\n", name, err)
 			continue
 		}
 		defer f.Close()
 		if err := tmpl.Execute(f, map[string]any{
 			"Functions": funs,
 		}); err != nil {
 			fmt.Fprintf(os.Stderr, "Error executing template %s: %v\n", tmpl.Name(), err)
 		}
 	}
 }
--- a/x/mlxrunner/mlx/io.go
+++ b/x/mlxrunner/mlx/io.go
@@ -0,0 +1,45 @@
 //go:build mlx
 package mlx
 // #include "generated.h"
 import "C"
 import (
 	"iter"
 	"unsafe"
 )
 // Load returns an iterator over the (name, array) pairs that
 // mlx_load_safetensors produces for the file at path. The file is loaded
 // when iteration starts, using a CPU stream, and the temporary C maps,
 // stream and iterator are released when iteration ends or the consumer
 // stops early.
 //
 // NOTE(review): the return value of mlx_load_safetensors is not checked, so
 // a failed load is indistinguishable here from a file with no tensors.
 func Load(path string) iter.Seq2[string, *Array] {
 	return func(yield func(string, *Array) bool) {
 		// Output maps filled by mlx_load_safetensors. Only the
 		// string->array map is read below.
 		string2array := C.mlx_map_string_to_array_new()
 		defer C.mlx_map_string_to_array_free(string2array)
 		string2string := C.mlx_map_string_to_string_new()
 		defer C.mlx_map_string_to_string_free(string2string)
 		cPath := C.CString(path)
 		defer C.free(unsafe.Pointer(cPath))
 		cpu := C.mlx_default_cpu_stream_new()
 		defer C.mlx_stream_free(cpu)
 		C.mlx_load_safetensors(&string2array, &string2string, cPath, cpu)
 		it := C.mlx_map_string_to_array_iterator_new(string2array)
 		defer C.mlx_map_string_to_array_iterator_free(it)
 		for {
 			var key *C.char
 			// A fresh handle is created for every step, including the
 			// final one on which the iterator reports a non-zero status.
 			// NOTE(review): that last handle does not appear to be freed.
 			value := C.mlx_array_new()
 			if C.mlx_map_string_to_array_iterator_next(&key, &value, it) != 0 {
 				break
 			}
 			name := C.GoString(key)
 			// NOTE(review): numRefs is set to 1000, presumably so loaded
 			// weights are never treated as unreferenced - confirm against
 			// tensorDesc's users.
 			if !yield(name, &Array{ctx: value, desc: tensorDesc{name: name, numRefs: 1000}}) {
 				break
 			}
 		}
 	}
 }
--- a/x/mlxrunner/mlx/memory.go
+++ b/x/mlxrunner/mlx/memory.go
@@ -0,0 +1,87 @@
 //go:build mlx
 package mlx
 // #include "generated.h"
 import "C"
 import (
 	"fmt"
 	"log/slog"
 	"strconv"
 )
 // String formats b as a whole number of bytes, e.g. "512 B".
 func (b Byte) String() string {
 	return strconv.Itoa(int(b)) + " B"
 }
 // String formats the byte count b in KiB with two decimals.
 func (b KibiByte) String() string {
 	const unit = 1 << 10
 	return strconv.FormatFloat(float64(b)/unit, 'f', 2, 64) + " KiB"
 }
 // String formats the byte count b in MiB with two decimals.
 func (b MebiByte) String() string {
 	const unit = 1 << 20
 	return strconv.FormatFloat(float64(b)/unit, 'f', 2, 64) + " MiB"
 }
 // String formats the byte count b in GiB with two decimals.
 func (b GibiByte) String() string {
 	const unit = 1 << 30
 	return strconv.FormatFloat(float64(b)/unit, 'f', 2, 64) + " GiB"
 }
 // String formats the byte count b in TiB with two decimals.
 func (b TebiByte) String() string {
 	const unit = 1 << 40
 	return strconv.FormatFloat(float64(b)/unit, 'f', 2, 64) + " TiB"
 }
 // PrettyBytes wraps the byte count n in the unit type whose String method
 // gives the most readable rendering: Byte below 1 KiB, KibiByte below
 // 1 MiB, MebiByte below 1 GiB, GibiByte below 1 TiB, TebiByte otherwise.
 // The wrapped value is always the raw byte count.
 func PrettyBytes(n int) fmt.Stringer {
 	const (
 		kib = 1 << 10
 		mib = 1 << 20
 		gib = 1 << 30
 		tib = 1 << 40
 	)
 	if n < kib {
 		return Byte(n)
 	}
 	if n < mib {
 		return KibiByte(n)
 	}
 	if n < gib {
 		return MebiByte(n)
 	}
 	if n < tib {
 		return GibiByte(n)
 	}
 	return TebiByte(n)
 }
 // ActiveMemory returns the value reported by mlx_get_active_memory.
 // The status returned by the C call is not inspected.
 func ActiveMemory() int {
 	var active C.size_t
 	C.mlx_get_active_memory(&active)
 	return int(active)
 }
 // CacheMemory returns the value reported by mlx_get_cache_memory.
 // The status returned by the C call is not inspected.
 func CacheMemory() int {
 	var cache C.size_t
 	C.mlx_get_cache_memory(&cache)
 	return int(cache)
 }
 // PeakMemory returns the value reported by mlx_get_peak_memory.
 // The status returned by the C call is not inspected.
 func PeakMemory() int {
 	var peak C.size_t
 	C.mlx_get_peak_memory(&peak)
 	return int(peak)
 }
 // Memory is a stateless value for structured logging: its LogValue queries
 // the current MLX memory counters each time it is logged.
 type Memory struct{}
 // LogValue returns a group with the pretty-printed "active", "cache" and
 // "peak" memory figures.
 func (Memory) LogValue() slog.Value {
 	active := slog.Any("active", PrettyBytes(ActiveMemory()))
 	cache := slog.Any("cache", PrettyBytes(CacheMemory()))
 	peak := slog.Any("peak", PrettyBytes(PeakMemory()))
 	return slog.GroupValue(active, cache, peak)
 }
 // Unit types used by PrettyBytes. Each one stores a raw byte count; only
 // the String method differs, scaling the count to the unit in the name.
 type (
 	Byte     int
 	KibiByte int
 	MebiByte int
 	GibiByte int
 	TebiByte int
 )
 // ClearCache calls mlx_clear_cache.
 func ClearCache() {
 	C.mlx_clear_cache()
 }
--- a/x/mlxrunner/mlx/mlx.go
+++ b/x/mlxrunner/mlx/mlx.go
@@ -0,0 +1,40 @@
 //go:build mlx
 package mlx
 //go:generate cmake -S . -B build -DCMAKE_INSTALL_PREFIX=dist -DCMAKE_BUILD_TYPE=Release
 //go:generate cmake --build build --parallel
 //go:generate cmake --install build
 //go:generate sh -c "go run generator/main.go -output=. ./dist/include/mlx/c/*.h"
 // #cgo CXXFLAGS: -std=c++17
 // #cgo CPPFLAGS: -I${SRCDIR}/dist/include
 // #cgo LDFLAGS: -L${SRCDIR}/dist/lib -lstdc++
 // #cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework Accelerate
 // #include "generated.h"
 import "C"
 // doEval hands every valid array in outputs to MLX for evaluation, using
 // mlx_async_eval when async is true and mlx_eval otherwise. Arrays for
 // which Valid reports false are skipped.
 func doEval(outputs []*Array, async bool) {
 	vec := C.mlx_vector_array_new()
 	defer C.mlx_vector_array_free(vec)
 	for _, arr := range outputs {
 		if !arr.Valid() {
 			continue
 		}
 		C.mlx_vector_array_append_value(vec, arr.ctx)
 	}
 	if !async {
 		C.mlx_eval(vec)
 		return
 	}
 	C.mlx_async_eval(vec)
 }
 // AsyncEval submits the valid arrays in outputs through mlx_async_eval.
 func AsyncEval(outputs ...*Array) {
 	doEval(outputs, true)
 }
 // Eval submits the valid arrays in outputs through mlx_eval.
 func Eval(outputs ...*Array) {
 	doEval(outputs, false)
 }
--- a/x/mlxrunner/mlx/nn.go
+++ b/x/mlxrunner/mlx/nn.go
@@ -0,0 +1,38 @@
 //go:build mlx
 package mlx
 // Linear holds the parameters of a linear layer. The `weight` struct tags
 // name the tensors ("weight" and "bias") that populate the fields. Forward
 // treats an invalid Bias as absent.
 type Linear struct {
 	Weight Array `weight:"weight"`
 	Bias   Array `weight:"bias"`
 }
 // Forward applies the layer to x: x @ Weight.T, plus Bias when the bias
 // array is valid (computed with a single Addmm using alpha = beta = 1).
 func (m Linear) Forward(x *Array) *Array {
 	transposed := m.Weight.Transpose(1, 0)
 	if !m.Bias.Valid() {
 		return x.Matmul(transposed)
 	}
 	return m.Bias.Addmm(x, transposed, 1.0, 1.0)
 }
 // Gather multiplies x with the weight, whose last two axes are swapped
 // first (Transpose(0, 2, 1)), through GatherMM using the lhs/rhs index
 // arrays. The bias is not applied.
 func (m Linear) Gather(x, lhs, rhs *Array, sorted bool) *Array {
 	// TODO: bias
 	return x.GatherMM(m.Weight.Transpose(0, 2, 1), lhs, rhs, sorted)
 }
 // Embedding holds an embedding table. The `weight` struct tag names the
 // tensor ("weight") that populates the field.
 type Embedding struct {
 	Weight Array `weight:"weight"`
 }
 // Forward looks up indices along axis 0 of the embedding table.
 func (e *Embedding) Forward(indices *Array) *Array {
 	return e.Weight.TakeAxis(indices, 0)
 }
 // AsLinear returns a bias-less Linear that uses the embedding table as its
 // weight.
 func (e *Embedding) AsLinear() Linear {
 	return Linear{
 		Weight: e.Weight,
 	}
 }
--- a/x/mlxrunner/mlx/ops.go
+++ b/x/mlxrunner/mlx/ops.go
@@ -0,0 +1,256 @@
 //go:build mlx
 package mlx
 // #include "generated.h"
 import "C"
 import (
 	"unsafe"
 )
 // Abs returns the array filled in by mlx_abs for t.
 func (t *Array) Abs() *Array {
 	out := New("ABS", t)
 	C.mlx_abs(&out.ctx, t.ctx, DefaultStream().ctx)
 	return out
 }
 // Add returns the array filled in by mlx_add for t and other.
 func (t *Array) Add(other *Array) *Array {
 	out := New("ADD", t, other)
 	C.mlx_add(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
 	return out
 }
 // Addmm returns the array filled in by mlx_addmm, called with t as the
 // first array argument followed by a, b, alpha and beta.
 func (t *Array) Addmm(a, b *Array, alpha, beta float32) *Array {
 	out := New("ADDMM", t, a, b)
 	C.mlx_addmm(&out.ctx, t.ctx, a.ctx, b.ctx, C.float(alpha), C.float(beta), DefaultStream().ctx)
 	return out
 }
 // Argmax returns the array filled in by mlx_argmax_axis for t along axis,
 // forwarding keepDims unchanged.
 func (t *Array) Argmax(axis int, keepDims bool) *Array {
 	out := New("ARGMAX", t)
 	C.mlx_argmax_axis(&out.ctx, t.ctx, C.int(axis), C.bool(keepDims), DefaultStream().ctx)
 	return out
 }
 // ArgpartitionAxis returns the array filled in by mlx_argpartition_axis for
 // t with the given kth and axis.
 func (t *Array) ArgpartitionAxis(kth int, axis int) *Array {
 	out := New("ARGPARTITION", t)
 	C.mlx_argpartition_axis(&out.ctx, t.ctx, C.int(kth), C.int(axis), DefaultStream().ctx)
 	return out
 }
 // ArgsortAxis returns the array filled in by mlx_argsort_axis for t along
 // axis.
 func (t *Array) ArgsortAxis(axis int) *Array {
 	out := New("ARGSORT_AXIS", t)
 	C.mlx_argsort_axis(&out.ctx, t.ctx, C.int(axis), DefaultStream().ctx)
 	return out
 }
 // AsType returns the array filled in by mlx_astype for t and dtype.
 func (t *Array) AsType(dtype DType) *Array {
 	out := New("AS_TYPE", t)
 	C.mlx_astype(&out.ctx, t.ctx, C.mlx_dtype(dtype), DefaultStream().ctx)
 	return out
 }
 // AsStrided returns the array filled in by mlx_as_strided for t. shape and
 // strides are converted to C int and int64_t arrays respectively and passed
 // together with offset.
 func (t *Array) AsStrided(shape []int, strides []int, offset int) *Array {
 	dims := make([]C.int, len(shape))
 	for i := range shape {
 		dims[i] = C.int(shape[i])
 	}
 	steps := make([]C.int64_t, len(strides))
 	for i := range strides {
 		steps[i] = C.int64_t(strides[i])
 	}
 	result := New("AS_STRIDED", t)
 	C.mlx_as_strided(
 		&result.ctx, t.ctx,
 		unsafe.SliceData(dims), C.size_t(len(shape)),
 		unsafe.SliceData(steps), C.size_t(len(strides)),
 		C.size_t(offset),
 		DefaultStream().ctx,
 	)
 	return result
 }
 // Concatenate returns the array filled in by mlx_concatenate_axis for t
 // followed by others, in that order, along axis.
 func (t *Array) Concatenate(axis int, others ...*Array) *Array {
 	arrays := make([]*Array, 0, len(others)+1)
 	arrays = append(arrays, t)
 	arrays = append(arrays, others...)
 	vec := C.mlx_vector_array_new()
 	defer C.mlx_vector_array_free(vec)
 	for _, arr := range arrays {
 		C.mlx_vector_array_append_value(vec, arr.ctx)
 	}
 	result := New("CONCATENATE", arrays...)
 	C.mlx_concatenate_axis(&result.ctx, vec, C.int(axis), DefaultStream().ctx)
 	return result
 }
 // Divide returns the array filled in by mlx_divide for t and other.
 func (t *Array) Divide(other *Array) *Array {
 	out := New("DIVIDE", t, other)
 	C.mlx_divide(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
 	return out
 }
 // ExpandDims returns the array filled in by mlx_expand_dims for t at axis.
 func (t *Array) ExpandDims(axis int) *Array {
 	out := New("EXPAND_DIMS", t)
 	C.mlx_expand_dims(&out.ctx, t.ctx, C.int(axis), DefaultStream().ctx)
 	return out
 }
 // Flatten returns the array filled in by mlx_flatten for t over the axis
 // range startAxis..endAxis.
 func (t *Array) Flatten(startAxis, endAxis int) *Array {
 	out := New("FLATTEN", t)
 	C.mlx_flatten(&out.ctx, t.ctx, C.int(startAxis), C.int(endAxis), DefaultStream().ctx)
 	return out
 }
 // FloorDivide returns the array filled in by mlx_floor_divide for t and
 // other.
 func (t *Array) FloorDivide(other *Array) *Array {
 	out := New("FLOOR_DIVIDE", t, other)
 	C.mlx_floor_divide(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
 	return out
 }
 // GatherMM returns the array filled in by mlx_gather_mm for t and other.
 // A nil lhs or rhs index array is replaced by an empty array before the
 // call.
 func (t *Array) GatherMM(other, lhs, rhs *Array, sorted bool) *Array {
 	lhsIdx := lhs
 	if lhsIdx == nil {
 		lhsIdx = New("")
 	}
 	rhsIdx := rhs
 	if rhsIdx == nil {
 		rhsIdx = New("")
 	}
 	result := New("GATHER_MM", t, other, lhsIdx, rhsIdx)
 	C.mlx_gather_mm(&result.ctx, t.ctx, other.ctx, lhsIdx.ctx, rhsIdx.ctx, C.bool(sorted), DefaultStream().ctx)
 	return result
 }
 // Logsumexp returns the array filled in by mlx_logsumexp for t, forwarding
 // keepDims unchanged.
 func (t *Array) Logsumexp(keepDims bool) *Array {
 	out := New("LOGSUMEXP", t)
 	C.mlx_logsumexp(&out.ctx, t.ctx, C.bool(keepDims), DefaultStream().ctx)
 	return out
 }
 // Matmul returns the array filled in by mlx_matmul for t and other.
 func (t *Array) Matmul(other *Array) *Array {
 	out := New("MATMUL", t, other)
 	C.mlx_matmul(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
 	return out
 }
 // Multiply returns the array filled in by mlx_multiply for t and other.
 func (t *Array) Multiply(other *Array) *Array {
 	out := New("MULTIPLY", t, other)
 	C.mlx_multiply(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
 	return out
 }
 // Negative returns the array filled in by mlx_negative for t.
 func (t *Array) Negative() *Array {
 	out := New("NEGATIVE", t)
 	C.mlx_negative(&out.ctx, t.ctx, DefaultStream().ctx)
 	return out
 }
 // Power returns the array filled in by mlx_power for t and exponent.
 func (t *Array) Power(exponent *Array) *Array {
 	out := New("POWER", t, exponent)
 	C.mlx_power(&out.ctx, t.ctx, exponent.ctx, DefaultStream().ctx)
 	return out
 }
 // PutAlongAxis returns the array filled in by mlx_put_along_axis for t,
 // indices and values along axis.
 func (t *Array) PutAlongAxis(indices, values *Array, axis int) *Array {
 	out := New("PUT_ALONG_AXIS", t, indices, values)
 	C.mlx_put_along_axis(&out.ctx, t.ctx, indices.ctx, values.ctx, C.int(axis), DefaultStream().ctx)
 	return out
 }
 // Reshape returns the array filled in by mlx_reshape for t, passing axes as
 // the requested shape.
 func (t *Array) Reshape(axes ...int) *Array {
 	dims := make([]C.int, len(axes))
 	for i, axis := range axes {
 		dims[i] = C.int(axis)
 	}
 	result := New("RESHAPE", t)
 	C.mlx_reshape(&result.ctx, t.ctx, unsafe.SliceData(dims), C.size_t(len(dims)), DefaultStream().ctx)
 	return result
 }
 // Sigmoid returns the array filled in by mlx_sigmoid for t.
 func (t *Array) Sigmoid() *Array {
 	out := New("SIGMOID", t)
 	C.mlx_sigmoid(&out.ctx, t.ctx, DefaultStream().ctx)
 	return out
 }
 // Sqrt returns the array filled in by mlx_sqrt for t.
 func (t *Array) Sqrt() *Array {
 	out := New("SQRT", t)
 	C.mlx_sqrt(&out.ctx, t.ctx, DefaultStream().ctx)
 	return out
 }
 // Squeeze returns the array filled in by mlx_squeeze_axis for t at axis.
 func (t *Array) Squeeze(axis int) *Array {
 	out := New("SQUEEZE", t)
 	C.mlx_squeeze_axis(&out.ctx, t.ctx, C.int(axis), DefaultStream().ctx)
 	return out
 }
 // StackAxis returns the array filled in by mlx_stack_axis for t followed by
 // others, in that order, along axis.
 func (t *Array) StackAxis(axis int, others ...*Array) *Array {
 	vectorData := make([]C.mlx_array, len(others)+1)
 	vectorData[0] = t.ctx
 	for i := range others {
 		vectorData[i+1] = others[i].ctx
 	}
 	vector := C.mlx_vector_array_new_data(unsafe.SliceData(vectorData), C.size_t(len(vectorData)))
 	defer C.mlx_vector_array_free(vector)
 	// Build the input list in a fresh slice. Appending to `others` directly
 	// could write into the caller's backing array when the variadic slice
 	// has spare capacity. The (others..., t) order is kept as before.
 	inputs := make([]*Array, 0, len(others)+1)
 	inputs = append(inputs, others...)
 	inputs = append(inputs, t)
 	out := New("STACK_AXIS", inputs...)
 	C.mlx_stack_axis(&out.ctx, vector, C.int(axis), DefaultStream().ctx)
 	return out
 }
 // Subtract returns the array filled in by mlx_subtract for t and other.
 func (t *Array) Subtract(other *Array) *Array {
 	out := New("SUBTRACT", t, other)
 	C.mlx_subtract(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
 	return out
 }
 // SumAxis returns the array filled in by mlx_sum_axis for t along axis,
 // forwarding keepDims unchanged.
 func (t *Array) SumAxis(axis int, keepDims bool) *Array {
 	out := New("SUM_AXIS", t)
 	C.mlx_sum_axis(&out.ctx, t.ctx, C.int(axis), C.bool(keepDims), DefaultStream().ctx)
 	return out
 }
 // TakeAxis returns the array filled in by mlx_take_axis for t and indices
 // along axis.
 func (t *Array) TakeAxis(indices *Array, axis int) *Array {
 	out := New("TAKE_AXIS", t, indices)
 	C.mlx_take_axis(&out.ctx, t.ctx, indices.ctx, C.int(axis), DefaultStream().ctx)
 	return out
 }
 // TakeAlongAxis returns the array filled in by mlx_take_along_axis for t
 // and indices along axis.
 func (t *Array) TakeAlongAxis(indices *Array, axis int) *Array {
 	out := New("TAKE_ALONG_AXIS", t, indices)
 	C.mlx_take_along_axis(&out.ctx, t.ctx, indices.ctx, C.int(axis), DefaultStream().ctx)
 	return out
 }
 // Tanh returns the array filled in by mlx_tanh for t.
 func (t *Array) Tanh() *Array {
 	out := New("TANH", t)
 	C.mlx_tanh(&out.ctx, t.ctx, DefaultStream().ctx)
 	return out
 }
 // Transpose returns the array filled in by mlx_transpose_axes for t with
 // the given axis permutation.
 func (t *Array) Transpose(axes ...int) *Array {
 	perm := make([]C.int, len(axes))
 	for i := range axes {
 		perm[i] = C.int(axes[i])
 	}
 	result := New("TRANSPOSE", t)
 	C.mlx_transpose_axes(&result.ctx, t.ctx, unsafe.SliceData(perm), C.size_t(len(perm)), DefaultStream().ctx)
 	return result
 }
 // Zeros returns the array filled in by mlx_zeros for the given shape and
 // dtype.
 func Zeros(dtype DType, shape ...int) *Array {
 	dims := make([]C.int, len(shape))
 	for i, dim := range shape {
 		dims[i] = C.int(dim)
 	}
 	result := New("ZEROS")
 	C.mlx_zeros(&result.ctx, unsafe.SliceData(dims), C.size_t(len(dims)), C.mlx_dtype(dtype), DefaultStream().ctx)
 	return result
 }
--- a/x/mlxrunner/mlx/ops_extra.go
+++ b/x/mlxrunner/mlx/ops_extra.go
@@ -0,0 +1,427 @@
 //go:build mlx
 package mlx
 // #include "generated.h"
 import "C"
 import (
 	"reflect"
 	"unsafe"
 )
 // Quantization operations
 // Quantize runs mlx_quantize on w with the given group size, bit width and
 // mode. It returns the first two arrays of the result vector as weights and
 // scales, and the third as biases when the vector holds at least three
 // entries; otherwise biases is nil.
 func Quantize(w *Array, groupSize, bits int, mode string) (weights, scales, biases *Array) {
 	cMode := C.CString(mode)
 	defer C.free(unsafe.Pointer(cMode))
 	vec := C.mlx_vector_array_new()
 	defer C.mlx_vector_array_free(vec)
 	C.mlx_quantize(
 		&vec,
 		w.ctx,
 		C.mlx_optional_int{value: C.int(groupSize), has_value: true},
 		C.mlx_optional_int{value: C.int(bits), has_value: true},
 		cMode,
 		DefaultStream().ctx,
 	)
 	count := int(C.mlx_vector_array_size(vec))
 	weights = New("QUANTIZE_W")
 	C.mlx_vector_array_get(&weights.ctx, vec, 0)
 	scales = New("QUANTIZE_S")
 	C.mlx_vector_array_get(&scales.ctx, vec, 1)
 	if count < 3 {
 		return weights, scales, nil
 	}
 	biases = New("QUANTIZE_B")
 	C.mlx_vector_array_get(&biases.ctx, vec, 2)
 	return weights, scales, biases
 }
 // Dequantize runs mlx_dequantize on w and scales with the given group size,
 // bit width and mode. biases may be nil, in which case a zero-valued
 // mlx_array is passed in its place. No output dtype is requested.
 func Dequantize(w, scales, biases *Array, groupSize, bits int, mode string) *Array {
 	cMode := C.CString(mode)
 	defer C.free(unsafe.Pointer(cMode))
 	var cBiases C.mlx_array
 	deps := []*Array{w, scales}
 	if biases != nil {
 		cBiases = biases.ctx
 		deps = append(deps, biases)
 	}
 	result := New("DEQUANTIZE", deps...)
 	C.mlx_dequantize(
 		&result.ctx,
 		w.ctx, scales.ctx, cBiases,
 		C.mlx_optional_int{value: C.int(groupSize), has_value: true},
 		C.mlx_optional_int{value: C.int(bits), has_value: true},
 		cMode,
 		C.mlx_optional_dtype{has_value: false},
 		DefaultStream().ctx,
 	)
 	return result
 }
 // QuantizedMatmul runs mlx_quantized_matmul on x, w and scales with the
 // given transpose flag, group size, bit width and mode. biases may be nil,
 // in which case a zero-valued mlx_array is passed in its place.
 func QuantizedMatmul(x, w, scales, biases *Array, transpose bool, groupSize, bits int, mode string) *Array {
 	cMode := C.CString(mode)
 	defer C.free(unsafe.Pointer(cMode))
 	var cBiases C.mlx_array
 	deps := []*Array{x, w, scales}
 	if biases != nil {
 		cBiases = biases.ctx
 		deps = append(deps, biases)
 	}
 	result := New("QUANTIZED_MATMUL", deps...)
 	C.mlx_quantized_matmul(
 		&result.ctx,
 		x.ctx, w.ctx, scales.ctx, cBiases,
 		C.bool(transpose),
 		C.mlx_optional_int{value: C.int(groupSize), has_value: true},
 		C.mlx_optional_int{value: C.int(bits), has_value: true},
 		cMode,
 		DefaultStream().ctx,
 	)
 	return result
 }
 // GatherQMM runs mlx_gather_qmm on x, w and scales. biases, lhsIndices and
 // rhsIndices are each optional: a nil pointer is passed to C as a
 // zero-valued mlx_array and is left out of the result's input list.
 func GatherQMM(x, w, scales *Array, biases, lhsIndices, rhsIndices *Array, transpose bool, groupSize, bits int, mode string, sortedIndices bool) *Array {
 	cMode := C.CString(mode)
 	defer C.free(unsafe.Pointer(cMode))
 	var cBiases, cLhs, cRhs C.mlx_array
 	deps := []*Array{x, w, scales}
 	if biases != nil {
 		cBiases = biases.ctx
 		deps = append(deps, biases)
 	}
 	if lhsIndices != nil {
 		cLhs = lhsIndices.ctx
 		deps = append(deps, lhsIndices)
 	}
 	if rhsIndices != nil {
 		cRhs = rhsIndices.ctx
 		deps = append(deps, rhsIndices)
 	}
 	result := New("GATHER_QMM", deps...)
 	C.mlx_gather_qmm(
 		&result.ctx,
 		x.ctx, w.ctx, scales.ctx, cBiases, cLhs, cRhs,
 		C.bool(transpose),
 		C.mlx_optional_int{value: C.int(groupSize), has_value: true},
 		C.mlx_optional_int{value: C.int(bits), has_value: true},
 		cMode,
 		C.bool(sortedIndices),
 		DefaultStream().ctx,
 	)
 	return result
 }
 // Missing tensor ops
 // Tile returns the array filled in by mlx_tile for a with the given
 // repetition counts.
 func Tile(a *Array, reps []int32) *Array {
 	counts := make([]C.int, len(reps))
 	for i := range reps {
 		counts[i] = C.int(reps[i])
 	}
 	result := New("TILE", a)
 	C.mlx_tile(&result.ctx, a.ctx, unsafe.SliceData(counts), C.size_t(len(reps)), DefaultStream().ctx)
 	return result
 }
 // Tri returns the array filled in by mlx_tri for the arguments n, m and k.
 // The dtype is always DTypeFloat32.
 func Tri(n, m int32, k int) *Array {
 	out := New("TRI")
 	C.mlx_tri(&out.ctx, C.int(n), C.int(m), C.int(k), C.mlx_dtype(DTypeFloat32), DefaultStream().ctx)
 	return out
 }
 // Where returns the array filled in by mlx_where for condition, a and b.
 func Where(condition, a, b *Array) *Array {
 	out := New("WHERE", condition, a, b)
 	C.mlx_where(&out.ctx, condition.ctx, a.ctx, b.ctx, DefaultStream().ctx)
 	return out
 }
 // Convenience wrappers (function-style for the model code)
 // Stack returns the array filled in by mlx_stack_axis for arrays, in the
 // given order, along axis.
 func Stack(arrays []*Array, axis int) *Array {
 	handles := make([]C.mlx_array, len(arrays))
 	for i, arr := range arrays {
 		handles[i] = arr.ctx
 	}
 	vec := C.mlx_vector_array_new_data(unsafe.SliceData(handles), C.size_t(len(handles)))
 	defer C.mlx_vector_array_free(vec)
 	result := New("STACK", arrays...)
 	C.mlx_stack_axis(&result.ctx, vec, C.int(axis), DefaultStream().ctx)
 	return result
 }
 // Neg is the function form of a.Negative().
 func Neg(a *Array) *Array {
 	return a.Negative()
 }
 // Sum is the function form of a.SumAxis(axis, keepDims).
 func Sum(a *Array, axis int, keepDims bool) *Array {
 	return a.SumAxis(axis, keepDims)
 }
 // Argsort is the function form of a.ArgsortAxis(axis).
 func Argsort(a *Array, axis int) *Array {
 	return a.ArgsortAxis(axis)
 }
 // Take is the function form of a.TakeAxis(indices, axis).
 func Take(a *Array, indices *Array, axis int) *Array {
 	return a.TakeAxis(indices, axis)
 }
 // RSqrt returns the array filled in by mlx_rsqrt for a.
 func RSqrt(a *Array) *Array {
 	out := New("RSQRT", a)
 	C.mlx_rsqrt(&out.ctx, a.ctx, DefaultStream().ctx)
 	return out
 }
 // Mean returns the array filled in by mlx_mean_axis for a along axis,
 // forwarding keepDims unchanged.
 func Mean(a *Array, axis int, keepDims bool) *Array {
 	out := New("MEAN_AXIS", a)
 	C.mlx_mean_axis(&out.ctx, a.ctx, C.int(axis), C.bool(keepDims), DefaultStream().ctx)
 	return out
 }
 // Argpartition is the function form of a.ArgpartitionAxis(kth, axis).
 func Argpartition(a *Array, kth int, axis int) *Array {
 	return a.ArgpartitionAxis(kth, axis)
 }
 // TakeAlongAxis is the function form of a.TakeAlongAxis(indices, axis).
 func TakeAlongAxis(a, indices *Array, axis int) *Array {
 	return a.TakeAlongAxis(indices, axis)
 }
 // Function-style wrappers matching imagegen API
 // Add is the function form of a.Add(b).
 func Add(a, b *Array) *Array {
 	return a.Add(b)
 }
 // Sub is the function form of a.Subtract(b).
 func Sub(a, b *Array) *Array {
 	return a.Subtract(b)
 }
 // Mul is the function form of a.Multiply(b).
 func Mul(a, b *Array) *Array {
 	return a.Multiply(b)
 }
 // Div is the function form of a.Divide(b).
 func Div(a, b *Array) *Array {
 	return a.Divide(b)
 }
 // Matmul is the function form of a.Matmul(b).
 func Matmul(a, b *Array) *Array {
 	return a.Matmul(b)
 }
 // Reshape is the function form of a.Reshape, accepting the shape as int32
 // values and widening each one to int.
 func Reshape(a *Array, shape ...int32) *Array {
 	dims := make([]int, len(shape))
 	for i := range shape {
 		dims[i] = int(shape[i])
 	}
 	return a.Reshape(dims...)
 }
 // Transpose is the function form of a.Transpose(axes...).
 func Transpose(a *Array, axes ...int) *Array {
 	return a.Transpose(axes...)
 }
 // ExpandDims is the function form of a.ExpandDims(axis).
 func ExpandDims(a *Array, axis int) *Array {
 	return a.ExpandDims(axis)
 }
 // Squeeze is the function form of a.Squeeze(axis).
 func Squeeze(a *Array, axis int) *Array {
 	return a.Squeeze(axis)
 }
 // Flatten flattens a over the full axis range by calling a.Flatten(0, -1).
 func Flatten(a *Array) *Array {
 	return a.Flatten(0, -1)
 }
 // Concatenate joins arrays along axis by calling Concatenate on the first
 // element with the remaining ones. It returns nil for an empty input.
 func Concatenate(arrays []*Array, axis int) *Array {
 	if len(arrays) == 0 {
 		return nil
 	}
 	first, rest := arrays[0], arrays[1:]
 	return first.Concatenate(axis, rest...)
 }
 // SliceStartStop returns the array filled in by mlx_slice for a using
 // start[i]..stop[i] on each axis with a stride of 1. The number of axes is
 // taken from len(start); stop must have at least that many entries.
 func SliceStartStop(a *Array, start, stop []int32) *Array {
 	count := len(start)
 	begins := make([]C.int, count)
 	ends := make([]C.int, count)
 	steps := make([]C.int, count)
 	for i := range begins {
 		begins[i] = C.int(start[i])
 		ends[i] = C.int(stop[i])
 		steps[i] = 1
 	}
 	result := New("SLICE", a)
 	C.mlx_slice(
 		&result.ctx, a.ctx,
 		unsafe.SliceData(begins), C.size_t(count),
 		unsafe.SliceData(ends), C.size_t(count),
 		unsafe.SliceData(steps), C.size_t(count),
 		DefaultStream().ctx,
 	)
 	return result
 }
 // GatherMM is the function form of a.GatherMM(b, ...). Nil index arrays are
 // replaced by empty arrays; that substitution is done by the method itself,
 // so the arguments are forwarded as given.
 func GatherMM(a, b *Array, lhsIndices, rhsIndices *Array, sortedIndices bool) *Array {
 	return a.GatherMM(b, lhsIndices, rhsIndices, sortedIndices)
 }
 // SiLU returns a multiplied by its own sigmoid.
 func SiLU(a *Array) *Array {
 	return a.Multiply(a.Sigmoid())
 }
 // RoPEWithBase applies mlx_fast_rope to x at the given position offset.
 // base is passed as an optional value that is only marked present when it
 // is non-zero, and an empty array is passed for the frequencies argument.
 func RoPEWithBase(x *Array, dims int, traditional bool, base, scale float32, offset int) *Array {
 	optBase := C.mlx_optional_float{
 		value:     C.float(base),
 		has_value: C.bool(base != 0),
 	}
 	freqs := New("")
 	result := New("FAST_ROPE", x, freqs)
 	C.mlx_fast_rope(
 		&result.ctx,
 		x.ctx,
 		C.int(dims),
 		C.bool(traditional),
 		optBase,
 		C.float(scale),
 		C.int(offset),
 		freqs.ctx,
 		DefaultStream().ctx,
 	)
 	return result
 }
 // Sigmoid is the function form of a.Sigmoid().
 func Sigmoid(a *Array) *Array {
 	return a.Sigmoid()
 }
 // ScaledDotProductAttentionCausal calls
 // mlx_fast_scaled_dot_product_attention for q, k and v with the given
 // scale. The mask mode string is "causal" when causalMask is true and empty
 // otherwise; empty arrays are passed for both the mask and the sinks.
 func ScaledDotProductAttentionCausal(q, k, v *Array, scale float32, causalMask bool) *Array {
 	var mode string
 	if causalMask {
 		mode = "causal"
 	}
 	cMode := C.CString(mode)
 	defer C.free(unsafe.Pointer(cMode))
 	mask := New("")
 	sinks := New("")
 	result := New("FAST_SDPA", q, k, v, mask, sinks)
 	C.mlx_fast_scaled_dot_product_attention(
 		&result.ctx,
 		q.ctx, k.ctx, v.ctx,
 		C.float(scale),
 		cMode,
 		mask.ctx, sinks.ctx,
 		DefaultStream().ctx,
 	)
 	return result
 }
 // RMSNormFn returns the array filled in by mlx_fast_rms_norm for x with the
 // given weight and eps.
 func RMSNormFn(x, weight *Array, eps float32) *Array {
 	out := New("FAST_RMSNORM", x)
 	C.mlx_fast_rms_norm(&out.ctx, x.ctx, weight.ctx, C.float(eps), DefaultStream().ctx)
 	return out
 }
 // AddMM is the function form of c.Addmm(a, b, alpha, beta).
 func AddMM(c, a, b *Array, alpha, beta float32) *Array {
 	return c.Addmm(a, b, alpha, beta)
 }
 // Scalar helpers
 // AddScalar adds s, wrapped in an array by FromValue, to a.
 func AddScalar(a *Array, s float32) *Array {
 	return a.Add(FromValue(s))
 }
 // MulScalar multiplies a by s, wrapped in an array by FromValue.
 func MulScalar(a *Array, s float32) *Array {
 	return a.Multiply(FromValue(s))
 }
 // DivScalar divides a by s, wrapped in an array by FromValue.
 func DivScalar(a *Array, s float32) *Array {
 	return a.Divide(FromValue(s))
 }
 // FloorDivideScalar floor-divides a by s. The divisor is converted to int
 // before being wrapped in an array by FromValue.
 func FloorDivideScalar(a *Array, s int32) *Array {
 	return a.FloorDivide(FromValue(int(s)))
 }
 // Array constructors
 // NewArrayInt32 creates an int32 array of the given shape from data via
 // mlx_array_new_data.
 //
 // It panics when data holds fewer elements than the product of shape
 // requires, since the C side would otherwise read past the end of the Go
 // slice. An empty data slice is accepted for shapes that need no elements
 // (previously this panicked on &data[0]).
 func NewArrayInt32(data []int32, shape []int32) *Array {
 	cShape := make([]C.int, len(shape))
 	elems := 1
 	for i, s := range shape {
 		cShape[i] = C.int(s)
 		elems *= int(s)
 	}
 	if len(data) < elems {
 		panic("data has fewer elements than the shape requires")
 	}
 	out := New("NEW_ARRAY_INT32")
 	// SliceData is nil-safe, unlike indexing the first element.
 	out.ctx = C.mlx_array_new_data(unsafe.Pointer(unsafe.SliceData(data)), unsafe.SliceData(cShape), C.int(len(shape)), C.mlx_dtype(DTypeInt32))
 	return out
 }
 // NewScalarArray returns an array whose handle is the result of
 // mlx_array_new_float32(value).
 func NewScalarArray(value float32) *Array {
 	out := New("SCALAR")
 	out.ctx = C.mlx_array_new_float32(C.float(value))
 	return out
 }
 // ZerosF32 calls Zeros with DTypeFloat32 and the given shape, widening each
 // int32 dimension to int.
 func ZerosF32(shape []int32) *Array {
 	dims := make([]int, len(shape))
 	for i := range shape {
 		dims[i] = int(shape[i])
 	}
 	return Zeros(DTypeFloat32, dims...)
 }
 // Utility
 // Collect walks v by reflection and returns every valid *Array it reaches
 // through pointers, structs, slices, maps and interfaces. Each pointer is
 // followed at most once.
 func Collect(v any) []*Array {
 	var found []*Array
 	collect(reflect.ValueOf(v), &found, map[uintptr]bool{})
 	return found
 }
 // collect is the recursive worker behind Collect. It appends every valid
 // Array reachable from v to *arrays.
 //
 //   - Pointers are followed once each (tracked in seen); a *Array is
 //     recorded and not descended into.
 //   - An addressable struct value that is itself an Array is recorded via
 //     its address.
 //   - Other structs are walked field by field (exported fields only), and
 //     slices, maps and interfaces element by element.
 func collect(v reflect.Value, arrays *[]*Array, seen map[uintptr]bool) {
 	if !v.IsValid() {
 		return
 	}
 	if v.Kind() == reflect.Ptr {
 		if v.IsNil() {
 			return
 		}
 		ptr := v.Pointer()
 		if seen[ptr] {
 			return
 		}
 		seen[ptr] = true
 		if arr, ok := v.Interface().(*Array); ok {
 			if arr != nil && arr.Valid() {
 				*arrays = append(*arrays, arr)
 			}
 			return
 		}
 		collect(v.Elem(), arrays, seen)
 		return
 	}
 	switch v.Kind() {
 	case reflect.Struct:
 		// Check if this struct IS an Array (not a pointer to one). Addr
 		// panics on non-addressable values (a struct passed by value, a map
 		// element, an interface payload), so only probe when the address
 		// can be taken. A non-addressable Array cannot be returned by
 		// pointer and falls through to the field walk below.
 		if v.CanAddr() {
 			if arr, ok := v.Addr().Interface().(*Array); ok {
 				if arr != nil && arr.Valid() {
 					*arrays = append(*arrays, arr)
 				}
 				return
 			}
 		}
 		for i := 0; i < v.NumField(); i++ {
 			field := v.Field(i)
 			if field.CanInterface() {
 				collect(field, arrays, seen)
 			}
 		}
 	case reflect.Slice:
 		for i := 0; i < v.Len(); i++ {
 			collect(v.Index(i), arrays, seen)
 		}
 	case reflect.Map:
 		for _, key := range v.MapKeys() {
 			collect(v.MapIndex(key), arrays, seen)
 		}
 	case reflect.Interface:
 		if !v.IsNil() {
 			collect(v.Elem(), arrays, seen)
 		}
 	}
 }
 // EnableCompile calls mlx_enable_compile.
 func EnableCompile() {
 	C.mlx_enable_compile()
 }
 // DisableCompile calls mlx_disable_compile.
 func DisableCompile() {
 	C.mlx_disable_compile()
 }
--- a/x/mlxrunner/mlx/random.go
+++ b/x/mlxrunner/mlx/random.go
@@ -0,0 +1,13 @@
 //go:build mlx
 package mlx
 // #include "generated.h"
 import "C"
 // Categorical calls mlx_random_categorical on t along axis, using the default
 // stream, and returns the result as a new Array.
 func (t *Array) Categorical(axis int) *Array {
 	// key is created with New("") and handed to MLX as the key argument without
 	// being initialised here.
 	// NOTE(review): presumably an empty key makes MLX use its global random
 	// state — confirm against the MLX C API.
 	key := New("")
 	out := New("", t, key)
 	C.mlx_random_categorical(&out.ctx, t.ctx, C.int(axis), key.ctx, DefaultStream().ctx)
 	return out
 }
--- a/x/mlxrunner/mlx/slice.go
+++ b/x/mlxrunner/mlx/slice.go
@@ -0,0 +1,86 @@
 //go:build mlx
 package mlx
 // #include "generated.h"
 import "C"
 import (
 	"cmp"
 	"unsafe"
 )
 // slice describes how one tensor dimension is indexed. args holds zero to
 // three integers that makeSlices interprets as (), (index), (start, stop) or
 // (start, stop, stride).
 type slice struct {
 	args []int
 }
 // Slice builds the per-dimension slice specification consumed by Array.Slice
 // and Array.SliceUpdate. The arguments are stored as given; they are only
 // validated later, when makeSlices panics on more than three values.
 func Slice(args ...int) slice {
 	return slice{args: args}
 }
 // makeSlices translates one slice specification per tensor dimension into the
 // parallel start/stop/stride vectors expected by the MLX slice functions.
 //
 // dims is the shape of the tensor being sliced; slices must contain exactly
 // one entry per dimension. Each entry is interpreted by its argument count:
 //
 //	Slice()        whole dimension: [0, dims[i]) with stride 1
 //	Slice(i)       single index i; a negative i counts from the end
 //	Slice(i, j)    [i, j) with stride 1; j == 0 means "to the end"
 //	Slice(i, j, k) [i, j) with stride k; j == 0 means "to the end"
 //
 // It panics when the number of slices differs from the number of dimensions
 // or when a slice carries more than three arguments.
 func makeSlices(dims []int, slices ...slice) (starts, stops, strides []C.int) {
 	if len(slices) != len(dims) {
 		panic("number of slice arguments must match number of tensor dimensions")
 	}
 	args := [3][]C.int{
 		make([]C.int, len(slices)),
 		make([]C.int, len(slices)),
 		make([]C.int, len(slices)),
 	}
 	for i, s := range slices {
 		switch len(s.args) {
 		case 0:
 			// slice[:]
 			args[0][i] = C.int(0)
 			args[1][i] = C.int(dims[i])
 			args[2][i] = C.int(1)
 		case 1:
 			// slice[i]
 			// Resolve a negative index up front: passing start=-1, stop=0
 			// would otherwise be normalised by MLX to [dim-1, 0), which is an
 			// empty range instead of the last element.
 			idx := s.args[0]
 			if idx < 0 {
 				idx += dims[i]
 			}
 			args[0][i] = C.int(idx)
 			args[1][i] = C.int(idx + 1)
 			args[2][i] = C.int(1)
 		case 2:
 			// slice[i:j]
 			// cmp.Or yields the first non-zero value, so a stop of 0 falls
 			// back to the full extent of the dimension.
 			args[0][i] = C.int(s.args[0])
 			args[1][i] = cmp.Or(C.int(s.args[1]), C.int(dims[i]))
 			args[2][i] = C.int(1)
 		case 3:
 			// slice[i:j:k]
 			args[0][i] = C.int(s.args[0])
 			args[1][i] = cmp.Or(C.int(s.args[1]), C.int(dims[i]))
 			args[2][i] = C.int(s.args[2])
 		default:
 			panic("invalid slice arguments")
 		}
 	}
 	return args[0], args[1], args[2]
 }
 // Slice returns a new Array holding the region of t selected by slices, one
 // specification per dimension of t (see makeSlices for how each one is
 // interpreted). The work is delegated to mlx_slice on the default stream.
 // It panics, via makeSlices, when len(slices) differs from the rank of t.
 func (t *Array) Slice(slices ...slice) *Array {
 	starts, stops, strides := makeSlices(t.Dims(), slices...)
 	out := New("SLICE", t)
 	C.mlx_slice(
 		&out.ctx, t.ctx,
 		unsafe.SliceData(starts), C.size_t(len(starts)),
 		unsafe.SliceData(stops), C.size_t(len(stops)),
 		unsafe.SliceData(strides), C.size_t(len(strides)),
 		DefaultStream().ctx,
 	)
 	return out
 }
 // SliceUpdate calls mlx_slice_update with t, other and the region described
 // by slices (one specification per dimension of t, interpreted by
 // makeSlices) and returns the result as a new Array.
 // NOTE(review): presumably the result is t with the selected region replaced
 // by other and t itself left untouched — confirm against the MLX C API.
 // It panics, via makeSlices, when len(slices) differs from the rank of t.
 func (t *Array) SliceUpdate(other *Array, slices ...slice) *Array {
 	starts, stops, strides := makeSlices(t.Dims(), slices...)
 	out := New("SLICE_UPDATE", t, other)
 	C.mlx_slice_update(
 		&out.ctx, t.ctx, other.ctx,
 		unsafe.SliceData(starts), C.size_t(len(starts)),
 		unsafe.SliceData(stops), C.size_t(len(stops)),
 		unsafe.SliceData(strides), C.size_t(len(strides)),
 		DefaultStream().ctx,
 	)
 	return out
 }
--- a/x/mlxrunner/mlx/stream.go
+++ b/x/mlxrunner/mlx/stream.go
@@ -0,0 +1,45 @@
 //go:build mlx
 package mlx
 // #include "generated.h"
 import "C"
 import (
 	"log/slog"
 	"sync"
 )
 // Device wraps an mlx_device handle from the MLX C library.
 type Device struct {
 	ctx C.mlx_device
 }
 // LogValue implements slog.LogValuer. It renders the device with
 // mlx_device_tostring; the temporary mlx_string is freed before returning.
 func (d Device) LogValue() slog.Value {
 	str := C.mlx_string_new()
 	defer C.mlx_string_free(str)
 	C.mlx_device_tostring(&str, d.ctx)
 	return slog.StringValue(C.GoString(C.mlx_string_data(str)))
 }
 // DefaultDevice returns the device reported by mlx_get_default_device. The
 // lookup runs once, on first call; sync.OnceValue caches the result for all
 // later calls.
 var DefaultDevice = sync.OnceValue(func() Device {
 	d := C.mlx_device_new()
 	C.mlx_get_default_device(&d)
 	return Device{d}
 })
 // Stream wraps an mlx_stream handle from the MLX C library.
 type Stream struct {
 	ctx C.mlx_stream
 }
 // LogValue implements slog.LogValuer. It renders the stream with
 // mlx_stream_tostring; the temporary mlx_string is freed before returning.
 func (s Stream) LogValue() slog.Value {
 	str := C.mlx_string_new()
 	defer C.mlx_string_free(str)
 	C.mlx_stream_tostring(&str, s.ctx)
 	return slog.StringValue(C.GoString(C.mlx_string_data(str)))
 }
 // DefaultStream returns the stream that mlx_get_default_stream reports for
 // DefaultDevice. The lookup runs once, on first call; sync.OnceValue caches
 // the result for all later calls.
 var DefaultStream = sync.OnceValue(func() Stream {
 	s := C.mlx_stream_new()
 	C.mlx_get_default_stream(&s, DefaultDevice().ctx)
 	return Stream{s}
 })
--- a/x/mlxrunner/pipeline.go
+++ b/x/mlxrunner/pipeline.go
@@ -0,0 +1,123 @@
 //go:build mlx
 package mlxrunner
 import (
 	"bytes"
 	"errors"
 	"log/slog"
 	"time"
 	"unicode/utf8"
 	"github.com/ollama/ollama/x/mlxrunner/cache"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 )
 // TextGenerationPipeline runs one completion request end to end: it tokenizes
 // the prompt, pre-fills the per-layer caches, then samples tokens one at a
 // time and streams each as a Response on request.Responses, followed by a
 // final Response carrying counts and timings. The channel is not closed here.
 //
 // It returns an error only when no model is loaded. Sends on
 // request.Responses block until the receiver reads them.
 func (r *Runner) TextGenerationPipeline(request Request) error {
 	if r.Model == nil {
 		return errors.New("model not loaded")
 	}
 	inputs := r.Tokenizer.Encode(request.Prompt, true)
 	// NOTE(review): tokens is presumably the part of inputs not already covered
 	// by the returned caches — confirm against FindNearestCache.
 	caches, tokens := r.FindNearestCache(inputs)
 	if len(caches) == 0 {
 		// No reusable cache: start with one fresh KV cache per model layer.
 		caches = make([]cache.Cache, r.Model.NumLayers())
 		for i := range caches {
 			caches[i] = cache.NewKVCache()
 		}
 	}
 	total, processed := len(tokens), 0
 	slog.Info("Prompt processing progress", "processed", processed, "total", total)
 	// Prefill: feed the prompt in chunks of at most 2<<10 (2048) tokens while
 	// always leaving at least one token for the first sampling step below.
 	for total-processed > 1 {
 		n := min(2<<10, total-processed-1)
 		temp := r.Model.Forward(mlx.FromValues(tokens[processed:processed+n], n).ExpandDims(0), caches)
 		// NOTE(review): this defer sits inside the loop, so every chunk's output
 		// is only freed when the whole function returns.
 		defer mlx.Free(temp)
 		// Evaluate the two state arrays of every cache for this chunk.
 		mlx.Eval(func() []*mlx.Array {
 			s := make([]*mlx.Array, 2*len(caches))
 			for i, c := range caches {
 				s[2*i], s[2*i+1] = c.State()
 			}
 			return s
 		}()...)
 		processed += n
 		slog.Info("Prompt processing progress", "processed", processed, "total", total)
 		mlx.ClearCache()
 	}
 	// step runs the model on token, keeps only the last position of the
 	// unembedded output, subtracts Logsumexp from the logits, and returns the
 	// sampled token together with the resulting log-probabilities.
 	step := func(token *mlx.Array) (*mlx.Array, *mlx.Array) {
 		logits := r.Model.Unembed(r.Model.Forward(token.ExpandDims(0), caches))
 		logits = logits.Slice(mlx.Slice(), mlx.Slice(logits.Dim(1)-1), mlx.Slice()).Squeeze(1)
 		logprobs := logits.Subtract(logits.Logsumexp(true))
 		return request.Sample(logprobs), logprobs
 	}
 	// First sample comes from whatever prompt tokens the prefill loop left over.
 	sample, logprobs := step(mlx.FromValues(tokens[processed:], total-processed))
 	mlx.AsyncEval(sample, logprobs)
 	var b bytes.Buffer
 	now := time.Now()
 	// Defaults describe a run that stops at MaxTokens; they are overwritten
 	// below when an EOS token ends generation early.
 	// NOTE(review): the meaning of DoneReason 1 vs 0 is not defined in this
 	// file — confirm with the consumer of Response.
 	final := Response{PromptTokens: total, CompletionTokens: request.Options.MaxTokens, DoneReason: 1}
 	outputs := make([]int32, 0, request.Options.MaxTokens)
 	for i := range request.Options.MaxTokens {
 		// The next step is built and handed to AsyncEval before the current
 		// sample is read back with sample.Int().
 		nextSample, nextLogprobs := step(sample)
 		mlx.AsyncEval(nextSample, nextLogprobs)
 		if i == 0 {
 			// The first sample being ready marks the end of prompt processing;
 			// restart the clock for completion timing.
 			slog.Info("Prompt processing progress", "processed", total, "total", total)
 			mlx.Eval(sample)
 			final.PromptTokensDuration = time.Since(now)
 			now = time.Now()
 		}
 		output := int32(sample.Int())
 		outputs = append(outputs, output)
 		if r.Tokenizer.IsEOS(output) {
 			// NOTE(review): nextSample/nextLogprobs are not passed to mlx.Free
 			// on this early-exit path; only sample/logprobs are freed after the
 			// loop.
 			final.Token = int(output)
 			final.DoneReason = 0
 			final.CompletionTokens = i
 			break
 		}
 		request.Responses <- Response{
 			Text:  r.Decode(output, &b),
 			Token: int(output),
 		}
 		mlx.Free(sample, logprobs)
 		if i%256 == 0 {
 			mlx.ClearCache()
 		}
 		sample, logprobs = nextSample, nextLogprobs
 	}
 	mlx.Free(sample, logprobs)
 	final.CompletionTokensDuration = time.Since(now)
 	request.Responses <- final
 	// Hand the caches back, keyed by the prompt followed by the generated tokens.
 	r.InsertCache(append(inputs, outputs...), caches)
 	return nil
 }
 // Decode converts one sampled token id to text, using b to carry bytes over
 // between calls. The decoded piece is appended to b; the accumulated text is
 // returned (and b cleared) once it is valid UTF-8 or once b holds at least
 // utf8.UTFMax bytes. Otherwise the bytes stay buffered and "" is returned so
 // the caller never emits a partial multi-byte sequence prematurely.
 func (r Runner) Decode(sample int32, b *bytes.Buffer) string {
 	piece := r.Tokenizer.Decode([]int32{sample})
 	_, err := b.WriteString(piece)
 	if err != nil {
 		slog.Error("Failed to write token to buffer", "error", err)
 		return ""
 	}
 	pending := b.String()
 	// Keep waiting only while the buffer is both invalid and still shorter
 	// than the longest possible UTF-8 encoding.
 	if !utf8.ValidString(pending) && b.Len() < utf8.UTFMax {
 		return ""
 	}
 	b.Reset()
 	return pending
 }
--- a/x/mlxrunner/runner.go
+++ b/x/mlxrunner/runner.go
@@ -0,0 +1,139 @@
 //go:build mlx
 package mlxrunner
 import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"log/slog"
 	"net"
 	"net/http"
 	"time"
 	"golang.org/x/sync/errgroup"
 	"github.com/ollama/ollama/x/imagegen/manifest"
 	"github.com/ollama/ollama/x/imagegen/tokenizer"
 	"github.com/ollama/ollama/x/mlxrunner/cache"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 	"github.com/ollama/ollama/x/mlxrunner/sample"
 	"github.com/ollama/ollama/x/models/glm4_moe_lite"
 )
 // TextModel is the interface that model implementations must satisfy.
 //
 // The pipeline calls Forward with token ids and one cache per layer, passes
 // the result to Unembed to obtain logits, and uses NumLayers to size the
 // cache slice.
 type TextModel interface {
 	Forward(inputs *mlx.Array, cache []cache.Cache) *mlx.Array
 	Unembed(x *mlx.Array) *mlx.Array
 	NumLayers() int
 }
 // Request is one unit of work queued on Runner.Requests. It bundles the
 // decoded client payload with everything the runner needs to execute it.
 type Request struct {
 	// TextCompletionsRequest is the JSON payload received from the client.
 	TextCompletionsRequest
 	// Responses carries streamed results back to the HTTP handler.
 	Responses chan Response
 	// Pipeline is the function the runner loop invokes with this request.
 	Pipeline  func(Request) error
 	// Sampler chooses the next token from the model's log-probabilities.
 	sample.Sampler
 	caches []cache.Cache
 }
 // TextCompletionsRequest is the JSON body accepted by the completions
 // endpoint: the prompt text plus sampling and length options.
 type TextCompletionsRequest struct {
 	Prompt  string `json:"prompt"`
 	Options struct {
 		Temperature float32 `json:"temperature"`
 		TopP        float32 `json:"top_p"`
 		MinP        float32 `json:"min_p"`
 		TopK        int     `json:"top_k"`
 		MaxTokens   int     `json:"max_tokens"`
 		// Deprecated: use MaxTokens instead
 		NumPredict int `json:"num_predict"`
 	} `json:"options"`
 }
 // Response is one JSON object streamed back to the client. Per-token
 // responses fill Text and Token; the final response fills the count and
 // duration fields. Every field is omitted from the JSON when it holds its
 // zero value.
 type Response struct {
 	Text       string    `json:"content,omitempty"`
 	Token      int       `json:"token,omitempty"`
 	Logprobs   []float32 `json:"logprobs,omitempty"`
 	Done       bool      `json:"done,omitempty"`
 	DoneReason int       `json:"done_reason,omitempty"`
 	PromptTokens             int           `json:"prompt_eval_count,omitempty"`
 	PromptTokensDuration     time.Duration `json:"prompt_eval_duration,omitempty"`
 	CompletionTokens         int           `json:"eval_count,omitempty"`
 	CompletionTokensDuration time.Duration `json:"eval_duration,omitempty"`
 	TotalTokens              int           `json:"total_tokens,omitempty"`
 }
 // Runner owns the loaded model and tokenizer and serialises inference work:
 // requests arrive on Requests and are executed one at a time by Run.
 type Runner struct {
 	Model        TextModel
 	Tokenizer    *tokenizer.Tokenizer
 	Requests     chan Request
 	CacheEntries map[int32]*CacheEntry
 }
 // Load resolves modelName to a manifest, reads its config.json to find the
 // declared architecture, and loads the matching model implementation into
 // r.Model and r.Tokenizer.
 //
 // Only the first entry of "architectures" is considered. An error is returned
 // when the manifest or config cannot be read or parsed, when no architecture
 // is listed, when the architecture is not supported, or when loading the
 // weights fails.
 func (r *Runner) Load(modelName string) error {
 	mf, err := manifest.LoadManifest(modelName)
 	if err != nil {
 		return err
 	}
 	// The architecture name in config.json selects the implementation.
 	raw, err := mf.ReadConfig("config.json")
 	if err != nil {
 		return fmt.Errorf("failed to read config.json: %w", err)
 	}
 	var parsed struct {
 		Architectures []string `json:"architectures"`
 	}
 	if err := json.Unmarshal(raw, &parsed); err != nil {
 		return fmt.Errorf("failed to parse config.json: %w", err)
 	}
 	if len(parsed.Architectures) == 0 {
 		return fmt.Errorf("no architectures found in config.json")
 	}
 	arch := parsed.Architectures[0]
 	slog.Info("Model architecture", "arch", arch)
 	switch arch {
 	case "Glm4MoeLiteForCausalLM", "GLM4MoeLite":
 		loaded, err := glm4_moe_lite.LoadFromManifest(mf)
 		if err != nil {
 			return fmt.Errorf("failed to load GLM4-MoE-Lite model: %w", err)
 		}
 		r.Model = loaded
 		r.Tokenizer = loaded.Tokenizer()
 		return nil
 	default:
 		return fmt.Errorf("unsupported architecture: %s", arch)
 	}
 }
 // Run starts the two long-lived goroutines of the runner and blocks until one
 // of them fails:
 //
 //   - a worker that takes requests from r.Requests one at a time and executes
 //     each request's Pipeline
 //   - an HTTP server listening on host:port that serves mux
 //
 // The returned error is the first non-nil error from either goroutine, in
 // practice the error from http.ListenAndServe.
 func (r *Runner) Run(host, port string, mux http.Handler) error {
 	g, ctx := errgroup.WithContext(context.Background())
 	g.Go(func() error {
 		for {
 			select {
 			case <-ctx.Done():
 				return nil
 			case request := <-r.Requests:
 				if err := request.Pipeline(request); err != nil {
 					slog.Error("Pipeline failed", "error", err)
 				}
 				// Close on success and on failure alike: the HTTP handler
 				// ranges over Responses and would otherwise wait forever when
 				// the pipeline bails out early.
 				close(request.Responses)
 			}
 		}
 	})
 	g.Go(func() error {
 		slog.Info("Starting HTTP server", "host", host, "port", port)
 		return http.ListenAndServe(net.JoinHostPort(host, port), mux)
 	})
 	return g.Wait()
 }
--- a/x/mlxrunner/sample/sample.go
+++ b/x/mlxrunner/sample/sample.go
@@ -0,0 +1,77 @@
 //go:build mlx
 package sample
 import (
 	"math"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 )
 // Sampler transforms an array of token scores. Implementations in this
 // package either filter the scores and return them for the next stage
 // (TopP, MinP, TopK) or pick token ids from them (greedy, Temperature).
 type Sampler interface {
 	Sample(*mlx.Array) *mlx.Array
 }
 // New assembles a Sampler from the request options.
 //
 // A temperature of exactly 0 selects greedy decoding and ignores every other
 // option. Otherwise the returned sampler applies, in order, the enabled
 // filters — TopP when 0 < top_p < 1, MinP when min_p != 0, TopK when
 // top_k > 0 — and always finishes with Temperature.
 func New(temp, top_p, min_p float32, top_k int) Sampler {
 	if temp == 0 {
 		return greedy{}
 	}
 	// At most three filters plus the final temperature stage.
 	stages := make(chain, 0, 4)
 	if top_p > 0 && top_p < 1 {
 		stages = append(stages, TopP(top_p))
 	}
 	if min_p != 0 {
 		stages = append(stages, MinP(min_p))
 	}
 	if top_k > 0 {
 		stages = append(stages, TopK(top_k))
 	}
 	return append(stages, Temperature(temp))
 }
 // greedy is the Sampler used when temperature is 0.
 type greedy struct{}
 // Sample returns the Argmax of logits over the last axis (axis -1).
 func (greedy) Sample(logits *mlx.Array) *mlx.Array {
 	return logits.Argmax(-1, false)
 }
 // chain is an ordered list of Samplers applied one after another.
 type chain []Sampler
 // Sample threads logits through every stage in order, feeding each stage the
 // previous stage's output, and returns the output of the last one. An empty
 // chain returns logits unchanged.
 func (c chain) Sample(logits *mlx.Array) *mlx.Array {
 	out := logits
 	for i := range c {
 		out = c[i].Sample(out)
 	}
 	return out
 }
 // Temperature is the final sampling stage: it scales and then samples.
 type Temperature float32
 // Sample multiplies logits by 1/t and returns the result of Categorical over
 // the last axis (axis -1).
 func (t Temperature) Sample(logits *mlx.Array) *mlx.Array {
 	return logits.Multiply(mlx.FromValue(1 / float32(t))).Categorical(-1)
 }
 // TopP is the placeholder for a top-p filter with threshold p.
 type TopP float32
 // Sample is not implemented yet: it returns logprobs unchanged, so the top_p
 // option currently has no effect.
 func (p TopP) Sample(logprobs *mlx.Array) *mlx.Array {
 	// TODO: implement
 	return logprobs
 }
 // MinP is the placeholder for a min-p filter with threshold p.
 type MinP float32
 // Sample is not implemented yet: it returns logprobs unchanged, so the min_p
 // option currently has no effect.
 func (p MinP) Sample(logprobs *mlx.Array) *mlx.Array {
 	// TODO: implement
 	return logprobs
 }
 // TopK keeps the k highest-scoring entries and suppresses the rest.
 type TopK int
 // Sample argpartitions the negated logprobs at position k-1 along the last
 // axis, takes the indices from column k to the end, and writes -Inf at those
 // indices along the last axis. The slice uses exactly two specifications, so
 // logprobs must be two-dimensional.
 func (k TopK) Sample(logprobs *mlx.Array) *mlx.Array {
 	// Slice(int(k), 0): a stop of 0 means "to the end of the dimension".
 	mask := logprobs.Negative().ArgpartitionAxis(int(k)-1, -1).Slice(mlx.Slice(), mlx.Slice(int(k), 0))
 	return logprobs.PutAlongAxis(mask, mlx.FromValue(float32(math.Inf(-1))), -1)
 }
--- a/x/mlxrunner/server.go
+++ b/x/mlxrunner/server.go
@@ -0,0 +1,176 @@
 //go:build mlx
 package mlxrunner
 import (
 	"bytes"
 	"cmp"
 	"encoding/json"
 	"flag"
 	"io"
 	"log/slog"
 	"net/http"
 	"os"
 	"strconv"
 	"time"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/x/mlxrunner/sample"
 )
 // Execute is the entry point of the MLX runner subprocess. It parses the
 // command-line flags in args (-model, -port, -verbose), loads the model,
 // registers the HTTP API and then serves on 127.0.0.1:<port> until the server
 // fails.
 //
 // Routes:
 //
 //	GET  /v1/status       readiness report
 //	     /v1/models       POST/GET acknowledge; DELETE is a no-op for now
 //	POST /v1/completions  streams one JSON object per line
 //	POST /v1/tokenize     tokenizes the raw request body
 //
 // GET /health, POST /load and POST /completion are permanent redirects to the
 // /v1 equivalents.
 //
 // It returns an error when the flags cannot be parsed, when the model fails to
 // load, or when the HTTP server stops.
 func Execute(args []string) error {
 	slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
 	var (
 		modelName string
 		port      int
 	)
 	flagSet := flag.NewFlagSet("mlxrunner", flag.ExitOnError)
 	flagSet.StringVar(&modelName, "model", "", "Model name")
 	flagSet.IntVar(&port, "port", 0, "Port to listen on")
 	// Accepted for command-line compatibility; the value is not used here.
 	_ = flagSet.Bool("verbose", false, "Enable debug logging")
 	if err := flagSet.Parse(args); err != nil {
 		return err
 	}
 	runner := Runner{
 		Requests:     make(chan Request),
 		CacheEntries: make(map[int32]*CacheEntry),
 	}
 	if err := runner.Load(modelName); err != nil {
 		return err
 	}
 	mux := http.NewServeMux()
 	mux.HandleFunc("GET /v1/status", func(w http.ResponseWriter, r *http.Request) {
 		if err := json.NewEncoder(w).Encode(map[string]any{
 			"status":   0,
 			"progress": 100,
 		}); err != nil {
 			slog.Error("Failed to encode response", "error", err)
 			http.Error(w, "Internal Server Error", http.StatusInternalServerError)
 			return
 		}
 	})
 	mux.HandleFunc("/v1/models", func(w http.ResponseWriter, r *http.Request) {
 		switch r.Method {
 		case "POST":
 			fallthrough
 		case "GET":
 			if err := json.NewEncoder(w).Encode(map[string]any{
 				"Success": true,
 			}); err != nil {
 				slog.Error("Failed to encode response", "error", err)
 				http.Error(w, "Internal Server Error", http.StatusInternalServerError)
 				return
 			}
 		case "DELETE":
 			// TODO: cleanup model and cache
 		}
 	})
 	mux.HandleFunc("POST /v1/completions", func(w http.ResponseWriter, r *http.Request) {
 		request := Request{Responses: make(chan Response)}
 		if err := json.NewDecoder(r.Body).Decode(&request.TextCompletionsRequest); err != nil {
 			slog.Error("Failed to decode request", "error", err)
 			http.Error(w, "Bad Request", http.StatusBadRequest)
 			return
 		}
 		// Prefer max_tokens, fall back to the deprecated num_predict, and use a
 		// 16Ki-token ceiling when neither is set to a positive value.
 		request.Options.MaxTokens = cmp.Or(request.Options.MaxTokens, request.Options.NumPredict)
 		if request.Options.MaxTokens < 1 {
 			request.Options.MaxTokens = 16 << 10
 		}
 		request.Pipeline = runner.TextGenerationPipeline
 		request.Sampler = sample.New(
 			request.Options.Temperature,
 			request.Options.TopP,
 			request.Options.MinP,
 			request.Options.TopK,
 		)
 		// Blocks until the runner loop is free to pick the request up.
 		runner.Requests <- request
 		w.Header().Set("Content-Type", "application/jsonl")
 		w.WriteHeader(http.StatusOK)
 		enc := json.NewEncoder(w)
 		for response := range request.Responses {
 			if err := enc.Encode(response); err != nil {
 				slog.Error("Failed to encode response", "error", err)
 				// The pipeline sends on an unbuffered channel. Keep receiving
 				// until it closes so the single worker goroutine is not left
 				// blocked on a send, which would stall every later request.
 				for range request.Responses {
 				}
 				return
 			}
 			if f, ok := w.(http.Flusher); ok {
 				f.Flush()
 			}
 		}
 	})
 	mux.HandleFunc("POST /v1/tokenize", func(w http.ResponseWriter, r *http.Request) {
 		var b bytes.Buffer
 		if _, err := io.Copy(&b, r.Body); err != nil {
 			slog.Error("Failed to read request body", "error", err)
 			http.Error(w, "Bad Request", http.StatusBadRequest)
 			return
 		}
 		tokens := runner.Tokenizer.Encode(b.String(), true)
 		if err := json.NewEncoder(w).Encode(tokens); err != nil {
 			slog.Error("Failed to encode response", "error", err)
 			http.Error(w, "Internal Server Error", http.StatusInternalServerError)
 			return
 		}
 	})
 	// Legacy paths are kept alive as permanent redirects to the /v1 routes.
 	for source, target := range map[string]string{
 		"GET /health":      "/v1/status",
 		"POST /load":       "/v1/models",
 		"POST /completion": "/v1/completions",
 	} {
 		mux.Handle(source, http.RedirectHandler(target, http.StatusPermanentRedirect))
 	}
 	// Wrap the mux to set a default content type and log every request with
 	// its status and latency.
 	return runner.Run("127.0.0.1", strconv.Itoa(port), http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		w.Header().Set("Content-Type", "application/json")
 		recorder := &statusRecorder{ResponseWriter: w, code: http.StatusOK}
 		t := time.Now()
 		mux.ServeHTTP(recorder, r)
 		// level starts at its zero value (Info) and is raised for client and
 		// server errors; other 3xx responses are not logged at all.
 		var level slog.Level
 		switch {
 		case recorder.code >= 500:
 			level = slog.LevelError
 		case recorder.code >= 400:
 			level = slog.LevelWarn
 		case recorder.code >= 300:
 			return
 		}
 		slog.Log(r.Context(), level, "ServeHTTP", "method", r.Method, "path", r.URL.Path, "took", time.Since(t), "status", recorder.Status())
 	}))
 }
 // statusRecorder decorates an http.ResponseWriter so the status code written
 // by a handler can be inspected afterwards. code should be initialised to the
 // status assumed when the handler never calls WriteHeader.
 type statusRecorder struct {
 	http.ResponseWriter
 	code int
 }
 // WriteHeader remembers code and then forwards it to the wrapped writer.
 func (w *statusRecorder) WriteHeader(code int) {
 	w.code = code
 	w.ResponseWriter.WriteHeader(code)
 }
 // Status formats the recorded code as "<code> <reason phrase>", for example
 // "404 Not Found". Codes without a standard reason phrase yield "<code> ".
 func (w *statusRecorder) Status() string {
 	code := w.code
 	reason := http.StatusText(code)
 	return strconv.Itoa(code) + " " + reason
 }
 // Flush forwards to the wrapped writer when it supports flushing and is a
 // no-op otherwise, so streaming handlers keep working through the recorder.
 func (w *statusRecorder) Flush() {
 	flusher, ok := w.ResponseWriter.(http.Flusher)
 	if !ok {
 		return
 	}
 	flusher.Flush()
 }
--- a/x/mlxrunner/server_stub.go
+++ b/x/mlxrunner/server_stub.go
@@ -0,0 +1,10 @@
 //go:build !mlx
 package mlxrunner
 import "errors"
 // Execute is the fallback used when the binary is built without the mlx
 // build tag. It ignores args and always reports that the MLX runner is
 // unavailable.
 func Execute(args []string) error {
 	const reason = "MLX runner not available: build with mlx tag"
 	return errors.New(reason)
 }
--- a/x/models/glm4_moe_lite/glm4_moe_lite.go
+++ b/x/models/glm4_moe_lite/glm4_moe_lite.go
@@ -0,0 +1,860 @@
 //go:build mlx
 // Package glm4_moe_lite provides the GLM4-MoE-Lite implementation for MLX.
 // This model uses Multi-head Latent Attention (MLA) and Mixture of Experts (MoE).
 package glm4_moe_lite
 import (
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
 	"io"
 	"math"
 	"os"
 	"strings"
 	"github.com/ollama/ollama/x/imagegen/manifest"
 	"github.com/ollama/ollama/x/imagegen/tokenizer"
 	"github.com/ollama/ollama/x/mlxrunner/cache"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 	"github.com/ollama/ollama/x/models/nn"
 )
 // RopeScaling holds RoPE scaling configuration as read from the
 // "rope_scaling" object of the model config. computeScale uses both fields
 // to adjust the attention scale.
 type RopeScaling struct {
 	Factor       float32 `json:"factor"`
 	MscaleAllDim float32 `json:"mscale_all_dim"`
 }
 // Config holds GLM4-MoE-Lite model configuration.
 //
 // Fields with a JSON tag are decoded from the model's config file. Fields
 // tagged `json:"-"` are never decoded; they are filled in by code (see the
 // per-group comments below).
 type Config struct {
 	HiddenSize            int32   `json:"hidden_size"`
 	NumHiddenLayers       int32   `json:"num_hidden_layers"`
 	IntermediateSize      int32   `json:"intermediate_size"`
 	MoEIntermediateSize   int32   `json:"moe_intermediate_size"`
 	NumAttentionHeads     int32   `json:"num_attention_heads"`
 	NumKeyValueHeads      int32   `json:"num_key_value_heads"`
 	VocabSize             int32   `json:"vocab_size"`
 	RMSNormEps            float32 `json:"rms_norm_eps"`
 	RopeTheta             float32 `json:"rope_theta"`
 	MaxPositionEmbeddings int32   `json:"max_position_embeddings"`
 	AttentionBias         bool    `json:"attention_bias"`
 	// MLA (Multi-head Latent Attention) parameters
 	QLoraRank     int32 `json:"q_lora_rank"`
 	KVLoraRank    int32 `json:"kv_lora_rank"`
 	QKRopeHeadDim int32 `json:"qk_rope_head_dim"`
 	QKNopeHeadDim int32 `json:"qk_nope_head_dim"`
 	VHeadDim      int32 `json:"v_head_dim"`
 	// MoE parameters
 	NRoutedExperts      int32   `json:"n_routed_experts"`
 	NSharedExperts      int32   `json:"n_shared_experts"`
 	NumExpertsPerTok    int32   `json:"num_experts_per_tok"`
 	RoutedScalingFactor float32 `json:"routed_scaling_factor"`
 	NormTopKProb        bool    `json:"norm_topk_prob"`
 	FirstKDenseReplace  int32   `json:"first_k_dense_replace"`
 	NGroup              int32   `json:"n_group"`
 	TopKGroup           int32   `json:"topk_group"`
 	// RoPE scaling; nil when the config has no "rope_scaling" entry
 	RopeScaling *RopeScaling `json:"rope_scaling"`
 	// Quantization parameters (set during load based on model quantization)
 	QuantGroupSize int    `json:"-"` // Group size for quantization (default 64)
 	QuantBits      int    `json:"-"` // Bits per weight (4 or 8)
 	QuantMode      string `json:"-"` // Quantization mode ("affine", etc.)
 	// Computed fields
 	QHeadDim int32   `json:"-"` // qk_nope_head_dim + qk_rope_head_dim
 	Scale    float32 `json:"-"` // 1/sqrt(QHeadDim) with mscale adjustment
 }
 // MLAAttention implements Multi-head Latent Attention with absorption.
 //
 // Queries go through a two-stage projection (QAProj, QALayerNorm, QBProj).
 // Keys/values come from a single compressed projection (KVAProjWithMQA) that
 // is split into a latent part, normalised by KVALayerNorm, and a rotary part.
 // EmbedQ and UnembedOut are the absorbed per-head matrices produced by
 // sanitizeMLAWeights, applied to the queries before attention and to the
 // attention output afterwards.
 type MLAAttention struct {
 	QAProj      nn.LinearLayer
 	QALayerNorm *nn.RMSNorm
 	QBProj      nn.LinearLayer
 	KVAProjWithMQA nn.LinearLayer
 	KVALayerNorm   *nn.RMSNorm
 	EmbedQ     *nn.MultiLinear
 	UnembedOut *nn.MultiLinear
 	OProj nn.LinearLayer
 }
 // Forward computes absorbed MLA attention output.
 //
 // x is treated as [B, L, hidden]; B and L are supplied by the caller rather
 // than read from x. c may be nil, in which case attention only covers the
 // current L positions and the RoPE offset is 0. The result is reshaped to
 // [B, L, NumAttentionHeads*VHeadDim] before the output projection OProj.
 func (a *MLAAttention) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Config) *mlx.Array {
 	// Query path: project, normalise, project, then lay out as
 	// [B, heads, L, QHeadDim].
 	q := a.QAProj.Forward(x)
 	q = a.QALayerNorm.Forward(q, cfg.RMSNormEps)
 	q = a.QBProj.Forward(q)
 	q = mlx.Reshape(q, B, L, cfg.NumAttentionHeads, cfg.QHeadDim)
 	q = mlx.Transpose(q, 0, 2, 1, 3)
 	// Split each query head into its first QKNopeHeadDim features (no rotary
 	// embedding) and the remaining features that receive RoPE.
 	qNope := mlx.SliceStartStop(q, []int32{0, 0, 0, 0}, []int32{B, cfg.NumAttentionHeads, L, cfg.QKNopeHeadDim})
 	qPE := mlx.SliceStartStop(q, []int32{0, 0, 0, cfg.QKNopeHeadDim}, []int32{B, cfg.NumAttentionHeads, L, cfg.QHeadDim})
 	// Key/value path: one projection yields KVLoraRank latent features
 	// followed by QKRopeHeadDim rotary key features.
 	compressedKV := a.KVAProjWithMQA.Forward(x)
 	kvCompressed := mlx.SliceStartStop(compressedKV, []int32{0, 0, 0}, []int32{B, L, cfg.KVLoraRank})
 	kPE := mlx.SliceStartStop(compressedKV, []int32{0, 0, cfg.KVLoraRank}, []int32{B, L, cfg.KVLoraRank + cfg.QKRopeHeadDim})
 	kPE = mlx.Reshape(kPE, B, L, 1, cfg.QKRopeHeadDim)
 	kPE = mlx.Transpose(kPE, 0, 2, 1, 3)
 	// The latent gets a singleton head axis: [B, 1, L, KVLoraRank].
 	kvLatent := a.KVALayerNorm.Forward(kvCompressed, cfg.RMSNormEps)
 	kvLatent = mlx.ExpandDims(kvLatent, 1)
 	// Rotary positions continue from the cache offset when a cache is present.
 	offset := 0
 	if c != nil {
 		offset = c.Offset()
 	}
 	qPE = mlx.RoPEWithBase(qPE, int(cfg.QKRopeHeadDim), true, cfg.RopeTheta, 1.0, offset)
 	kPE = mlx.RoPEWithBase(kPE, int(cfg.QKRopeHeadDim), true, cfg.RopeTheta, 1.0, offset)
 	// Absorption: map the non-rotary query part through EmbedQ so it can be
 	// compared directly against the latent keys.
 	qLatent := a.EmbedQ.Forward(qNope)
 	keys := mlx.Concatenate([]*mlx.Array{kvLatent, kPE}, 3)
 	cachedL := L
 	if c != nil {
 		// Only keys are stored: values are sliced out of the keys below, so the
 		// cache's value slot is fed a zero-width placeholder.
 		placeholderValues := mlx.ZerosF32([]int32{B, 1, L, 0})
 		keys, _ = c.Update(keys, placeholderValues)
 		cachedL = int32(keys.Dim(2))
 	}
 	// Values are the first KVLoraRank features of the keys.
 	values := mlx.SliceStartStop(keys, []int32{0, 0, 0, 0}, []int32{B, 1, cachedL, cfg.KVLoraRank})
 	queries := mlx.Concatenate([]*mlx.Array{qLatent, qPE}, 3)
 	// The final argument (L > 1) is true only when several positions are
 	// processed at once; single-token steps pass false.
 	out := mlx.ScaledDotProductAttentionCausal(queries, keys, values, cfg.Scale, L > 1)
 	out = a.UnembedOut.Forward(out)
 	out = mlx.Reshape(mlx.Transpose(out, 0, 2, 1, 3), B, L, cfg.NumAttentionHeads*cfg.VHeadDim)
 	return a.OProj.Forward(out)
 }
 // DenseMLP is the feed-forward network used by dense (non-MoE) layers: a
 // SiLU-gated pair of projections followed by a down projection.
 type DenseMLP struct {
 	GateProj nn.LinearLayer
 	UpProj   nn.LinearLayer
 	DownProj nn.LinearLayer
 }
 // Forward returns DownProj(SiLU(GateProj(x)) * UpProj(x)).
 func (m *DenseMLP) Forward(x *mlx.Array) *mlx.Array {
 	gated := m.GateProj.Forward(x)
 	activated := mlx.SiLU(gated)
 	lifted := m.UpProj.Forward(x)
 	return m.DownProj.Forward(mlx.Mul(activated, lifted))
 }
 // MoEGate is the router of a MoE layer. Gate produces one score per expert;
 // EScoreCorrectionBias, when present, is added to the scores used for
 // choosing experts but not to the weights returned for mixing.
 type MoEGate struct {
 	Gate                 nn.LinearLayer
 	EScoreCorrectionBias *mlx.Array
 }
 // Forward picks cfg.NumExpertsPerTok experts per token and returns their
 // indices together with their mixing weights.
 //
 // Selection uses the sigmoid gate scores plus the optional correction bias;
 // the returned weights are the unbiased sigmoid scores of the chosen experts,
 // normalised to sum to one when cfg.NormTopKProb is set and more than one
 // expert is chosen, then multiplied by cfg.RoutedScalingFactor.
 func (g *MoEGate) Forward(x *mlx.Array, cfg *Config) (*mlx.Array, *mlx.Array) {
 	rawScores := mlx.Sigmoid(g.Gate.Forward(x))
 	selection := rawScores
 	if bias := g.EScoreCorrectionBias; bias != nil {
 		selection = mlx.Add(rawScores, bias)
 	}
 	k := cfg.NumExpertsPerTok
 	// Partitioning the negated scores puts the k largest first (unordered).
 	ranked := mlx.Argpartition(mlx.Neg(selection), int(k)-1, -1)
 	shape := ranked.Dims()
 	experts := mlx.SliceStartStop(ranked, []int32{0, 0, 0}, []int32{int32(shape[0]), int32(shape[1]), k})
 	weights := mlx.TakeAlongAxis(rawScores, experts, -1)
 	if k > 1 && cfg.NormTopKProb {
 		weights = mlx.Div(weights, mlx.Sum(weights, -1, true))
 	}
 	return experts, mlx.MulScalar(weights, cfg.RoutedScalingFactor)
 }
 // SwitchMLP implements the MoE expert computation using stacked weights.
 //
 // Two storage layouts are supported and selected by UseQuantized: plain
 // weights (GateWeight/UpWeight/DownWeight), or quantized weights with their
 // scales and biases plus per-projection bit width and group size.
 type SwitchMLP struct {
 	GateWeight *mlx.Array
 	UpWeight   *mlx.Array
 	DownWeight *mlx.Array
 	GateWeightQ, GateScales, GateBiases *mlx.Array
 	UpWeightQ, UpScales, UpBiases       *mlx.Array
 	DownWeightQ, DownScales, DownBiases *mlx.Array
 	GateBits int
 	UpBits   int
 	DownBits int
 	GateGroupSize int
 	UpGroupSize   int
 	DownGroupSize int
 	UseQuantized bool
 }
 // Forward applies the switched expert MLP.
 //
 // x is read as [B, L, HiddenSize] and indices is reshaped to
 // [B*L, NumExpertsPerTok]. The result has shape
 // [B, L, NumExpertsPerTok, HiddenSize]: one expert output per selected expert
 // per token, not yet weighted or summed.
 func (s *SwitchMLP) Forward(x *mlx.Array, indices *mlx.Array, cfg *Config) *mlx.Array {
 	dims := x.Dims()
 	B, L := int32(dims[0]), int32(dims[1])
 	topK := cfg.NumExpertsPerTok
 	// Flatten tokens and add two singleton axes: [B*L, 1, 1, HiddenSize].
 	xExpanded := mlx.ExpandDims(mlx.ExpandDims(x, -2), -2)
 	xFlat := mlx.Reshape(xExpanded, B*L, 1, 1, cfg.HiddenSize)
 	idxFlat := mlx.Reshape(indices, B*L, topK)
 	// With 64 or more tokens, (token, expert) pairs are reordered by expert
 	// index before the gather-matmul and restored afterwards via invOrder.
 	doSort := B*L >= 64
 	var invOrder *mlx.Array
 	n := B * L * topK
 	if doSort {
 		idxAll := mlx.Flatten(idxFlat)
 		order := mlx.Argsort(idxAll, 0)
 		invOrder = mlx.Argsort(order, 0)
 		// order/topK maps each sorted pair back to its source token row.
 		xFlat = mlx.ExpandDims(mlx.Take(mlx.Squeeze(xFlat, 1), mlx.FloorDivideScalar(order, topK), 0), 1)
 		idxFlat = mlx.Reshape(mlx.Take(idxAll, order, 0), n, 1)
 	}
 	var gate, up, hidden, down *mlx.Array
 	if s.UseQuantized {
 		gate = mlx.GatherQMM(xFlat, s.GateWeightQ, s.GateScales, s.GateBiases,
 			nil, idxFlat, true, s.GateGroupSize, s.GateBits, cfg.QuantMode, doSort)
 		up = mlx.GatherQMM(xFlat, s.UpWeightQ, s.UpScales, s.UpBiases,
 			nil, idxFlat, true, s.UpGroupSize, s.UpBits, cfg.QuantMode, doSort)
 		hidden = mlx.Mul(mlx.SiLU(gate), up)
 		down = mlx.GatherQMM(hidden, s.DownWeightQ, s.DownScales, s.DownBiases,
 			nil, idxFlat, true, s.DownGroupSize, s.DownBits, cfg.QuantMode, doSort)
 	} else {
 		// The plain weights have their last two axes swapped before the
 		// gather-matmul.
 		gate = mlx.GatherMM(xFlat, mlx.Transpose(s.GateWeight, 0, 2, 1), nil, idxFlat, doSort)
 		up = mlx.GatherMM(xFlat, mlx.Transpose(s.UpWeight, 0, 2, 1), nil, idxFlat, doSort)
 		hidden = mlx.Mul(mlx.SiLU(gate), up)
 		down = mlx.GatherMM(hidden, mlx.Transpose(s.DownWeight, 0, 2, 1), nil, idxFlat, doSort)
 	}
 	if doSort {
 		// Undo the expert-sorted order and regroup the rows per token.
 		down = mlx.Reshape(mlx.Take(mlx.Squeeze(mlx.Squeeze(down, 2), 1), invOrder, 0), B*L, topK, cfg.HiddenSize)
 	} else {
 		down = mlx.Squeeze(down, 2)
 	}
 	return mlx.Reshape(down, B, L, topK, cfg.HiddenSize)
 }
 // SharedExperts is the MLP whose output MoE.Forward adds on top of the routed
 // experts' output. It has the same SiLU-gated structure as DenseMLP.
 type SharedExperts struct {
 	GateProj nn.LinearLayer
 	UpProj   nn.LinearLayer
 	DownProj nn.LinearLayer
 }
 // Forward returns DownProj(SiLU(GateProj(x)) * UpProj(x)).
 func (s *SharedExperts) Forward(x *mlx.Array) *mlx.Array {
 	gated := s.GateProj.Forward(x)
 	activated := mlx.SiLU(gated)
 	lifted := s.UpProj.Forward(x)
 	return s.DownProj.Forward(mlx.Mul(activated, lifted))
 }
 // MoE is a complete Mixture-of-Experts feed-forward layer: a router (Gate),
 // the routed experts (SwitchMLP) and an optional always-on SharedExperts MLP.
 type MoE struct {
 	Gate          *MoEGate
 	SwitchMLP     *SwitchMLP
 	SharedExperts *SharedExperts
 }
 // Forward routes x through the selected experts, combines their outputs as a
 // weighted sum using the router's weights, adds the shared-expert output when
 // SharedExperts is set, and returns the result reshaped to
 // [B, L, HiddenSize], where B and L are the first two dimensions of x.
 func (m *MoE) Forward(x *mlx.Array, cfg *Config) *mlx.Array {
 	shape := x.Dims()
 	batch, seqLen := int32(shape[0]), int32(shape[1])
 	experts, weights := m.Gate.Forward(x, cfg)
 	routed := m.SwitchMLP.Forward(x, experts, cfg)
 	// Broadcast each expert weight over the hidden axis, then sum over the
 	// expert axis (axis 2).
 	weighted := mlx.Mul(routed, mlx.ExpandDims(weights, -1))
 	combined := mlx.Sum(weighted, 2, false)
 	if shared := m.SharedExperts; shared != nil {
 		combined = mlx.Add(combined, shared.Forward(x))
 	}
 	return mlx.Reshape(combined, batch, seqLen, cfg.HiddenSize)
 }
 // DenseBlock is a transformer block with a dense MLP (used for the
 // first_k_dense_replace layers): pre-norm MLA attention followed by a
 // pre-norm DenseMLP, each wrapped in a residual connection.
 type DenseBlock struct {
 	Attention              *MLAAttention
 	MLP                    *DenseMLP
 	InputLayerNorm         *nn.RMSNorm
 	PostAttentionLayerNorm *nn.RMSNorm
 }
 // Forward computes h = x + Attention(InputLayerNorm(x)) and returns
 // h + MLP(PostAttentionLayerNorm(h)).
 func (b *DenseBlock) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Config) *mlx.Array {
 	normed := b.InputLayerNorm.Forward(x, cfg.RMSNormEps)
 	attnOut := b.Attention.Forward(normed, c, B, L, cfg)
 	hidden := mlx.Add(x, attnOut)
 	normed = b.PostAttentionLayerNorm.Forward(hidden, cfg.RMSNormEps)
 	mlpOut := b.MLP.Forward(normed)
 	return mlx.Add(hidden, mlpOut)
 }
 // MoEBlock is a transformer block whose feed-forward part is a MoE layer:
 // pre-norm MLA attention followed by a pre-norm MoE, each wrapped in a
 // residual connection.
 type MoEBlock struct {
 	Attention              *MLAAttention
 	MoE                    *MoE
 	InputLayerNorm         *nn.RMSNorm
 	PostAttentionLayerNorm *nn.RMSNorm
 }
 // Forward computes h = x + Attention(InputLayerNorm(x)) and returns
 // h + MoE(PostAttentionLayerNorm(h)).
 func (b *MoEBlock) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Config) *mlx.Array {
 	normed := b.InputLayerNorm.Forward(x, cfg.RMSNormEps)
 	attnOut := b.Attention.Forward(normed, c, B, L, cfg)
 	hidden := mlx.Add(x, attnOut)
 	normed = b.PostAttentionLayerNorm.Forward(hidden, cfg.RMSNormEps)
 	moeOut := b.MoE.Forward(normed, cfg)
 	return mlx.Add(hidden, moeOut)
 }
 // Block is the common interface of DenseBlock and MoEBlock so that a model
 // can keep both kinds in one layer slice. Forward takes the hidden states x,
 // the layer's cache c, the batch size B and sequence length L, and the model
 // configuration.
 type Block interface {
 	Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Config) *mlx.Array
 }
 // Model represents the complete GLM4-MoE-Lite model: token embedding, the
 // stack of transformer blocks, the final norm and the output head, together
 // with its tokenizer. The embedded *Config promotes all configuration fields
 // onto Model.
 type Model struct {
 	EmbedTokens *nn.Embedding
 	Layers      []Block
 	Norm        *nn.RMSNorm
 	LMHead      nn.LinearLayer
 	tok *tokenizer.Tokenizer
 	*Config
 }
 // computeScale returns the attention scale 1/sqrt(d), where d is
 // QKNopeHeadDim + QKRopeHeadDim. When RoPE scaling is configured with
 // MscaleAllDim > 0 and Factor > 1, the scale is additionally multiplied by
 // the square of (0.1 * MscaleAllDim * ln(Factor) + 1).
 func computeScale(cfg *Config) float32 {
 	headDim := float64(cfg.QKNopeHeadDim + cfg.QKRopeHeadDim)
 	scale := float32(1.0 / math.Sqrt(headDim))
 	if rs := cfg.RopeScaling; rs != nil && rs.MscaleAllDim > 0 && rs.Factor > 1 {
 		mscale := 0.1*rs.MscaleAllDim*float32(math.Log(float64(rs.Factor))) + 1.0
 		scale *= mscale * mscale
 	}
 	return scale
 }
 // supportsGatherQMM reports whether the quantized gather-matmul path can be
 // used for the given quantization: only "affine" mode at 4 or 8 bits
 // qualifies.
 func supportsGatherQMM(mode string, bits int) bool {
 	if mode != "affine" {
 		return false
 	}
 	switch bits {
 	case 4, 8:
 		return true
 	}
 	return false
 }
 // quantizationParams maps a quantization type name (case-insensitive) to the
 // group size, bit width and mode used for it:
 //
 //	NVFP4               16, 4, "nvfp4"
 //	FP4, Q4, INT4       32, 4, "affine"
 //	MXFP8               32, 8, "mxfp8"
 //	FP8, Q8, INT8, ""   64, 8, "affine"
 //	anything else       32, 8, "affine"
 func quantizationParams(quantization string) (groupSize, bits int, mode string) {
 	// Start from the fallback for unrecognised names and override per name.
 	groupSize, bits, mode = 32, 8, "affine"
 	switch strings.ToUpper(quantization) {
 	case "NVFP4":
 		groupSize, bits, mode = 16, 4, "nvfp4"
 	case "FP4", "Q4", "INT4":
 		groupSize, bits = 32, 4
 	case "MXFP8":
 		mode = "mxfp8"
 	case "FP8", "Q8", "INT8", "":
 		groupSize = 64
 	}
 	return groupSize, bits, mode
 }
 // readBlobMetadata returns the "__metadata__" string map stored in the header
 // of the safetensors file at path.
 //
 // The file starts with an 8-byte little-endian header length followed by that
 // many bytes of JSON. Headers larger than 1 MiB are rejected. When the header
 // has no "__metadata__" key the function returns (nil, nil). Any I/O or JSON
 // error is returned unchanged.
 func readBlobMetadata(path string) (map[string]string, error) {
 	blob, err := os.Open(path)
 	if err != nil {
 		return nil, err
 	}
 	defer blob.Close()
 	// Length prefix: 8 bytes, little endian.
 	var prefix [8]byte
 	if _, err := io.ReadFull(blob, prefix[:]); err != nil {
 		return nil, err
 	}
 	headerSize := binary.LittleEndian.Uint64(prefix[:])
 	const maxHeaderSize = 1024 * 1024
 	if headerSize > maxHeaderSize {
 		return nil, fmt.Errorf("header too large: %d", headerSize)
 	}
 	raw := make([]byte, headerSize)
 	if _, err := io.ReadFull(blob, raw); err != nil {
 		return nil, err
 	}
 	// Decode lazily: only the "__metadata__" entry is of interest, the tensor
 	// descriptors stay as raw JSON.
 	var entries map[string]json.RawMessage
 	if err := json.Unmarshal(raw, &entries); err != nil {
 		return nil, err
 	}
 	metaJSON, found := entries["__metadata__"]
 	if !found {
 		return nil, nil
 	}
 	var meta map[string]string
 	if err := json.Unmarshal(metaJSON, &meta); err != nil {
 		return nil, err
 	}
 	return meta, nil
 }
 // ExpertWeight holds a single expert's weight with optional quantization
 // components. As produced by loadExpertWeight, Scales, Biases, Bits and
 // GroupSize are only populated when the weight is kept in quantized form;
 // for a plain or dequantized weight they are left at their zero values.
 type ExpertWeight struct {
 	Weight    *mlx.Array
 	Scales    *mlx.Array
 	Biases    *mlx.Array
 	Bits      int
 	GroupSize int
 }
 // loadExpertWeight looks up the tensors stored under path and packages them
 // as an ExpertWeight. It returns nil when path+".weight" is absent.
 //
 // A weight without a ".weight_scale" companion is returned as is. A
 // quantized weight is kept quantized (with scales, biases, bits and group
 // size taken from cfg) only when useQuantized is set and the configured mode
 // is supported by supportsGatherQMM; otherwise it is dequantized up front.
 func loadExpertWeight(tensors map[string]*mlx.Array, path string, useQuantized bool, cfg *Config) *ExpertWeight {
 	weight := tensors[path+".weight"]
 	if weight == nil {
 		return nil
 	}
 	scales := tensors[path+".weight_scale"]
 	if scales == nil {
 		// Not quantized.
 		return &ExpertWeight{Weight: weight}
 	}
 	qbiases := tensors[path+".weight_qbias"]
 	if useQuantized && supportsGatherQMM(cfg.QuantMode, cfg.QuantBits) {
 		return &ExpertWeight{
 			Weight:    weight,
 			Scales:    scales,
 			Biases:    qbiases,
 			Bits:      cfg.QuantBits,
 			GroupSize: cfg.QuantGroupSize,
 		}
 	}
 	dense := mlx.Dequantize(weight, scales, qbiases, cfg.QuantGroupSize, cfg.QuantBits, cfg.QuantMode)
 	return &ExpertWeight{Weight: dense}
 }
 // StackedExpertWeights holds stacked weights for all experts of one
 // projection. As built by collectAndStackExpertWeights, Weight, Scales and
 // Biases are the per-expert tensors stacked along a new leading axis (each
 // nil when no expert supplied that tensor), and Bits and GroupSize are
 // copied from expert 0.
 type StackedExpertWeights struct {
 	Weight    *mlx.Array
 	Scales    *mlx.Array
 	Biases    *mlx.Array
 	Bits      int
 	GroupSize int
 }
 // collectAndStackExpertWeights gathers the projName weights of experts
 // 0..numExperts-1 under prefix (tensor paths of the form
 // "<prefix>.mlp.experts.<e>.<projName>") and stacks them along a new leading
 // axis.
 //
 // Experts whose weight tensor is missing are skipped. Bits and GroupSize of
 // the result come from expert 0 only. When no expert weight is found the
 // returned struct has nil tensors.
 func collectAndStackExpertWeights(
 	tensors map[string]*mlx.Array,
 	prefix string,
 	projName string,
 	numExperts int32,
 	useQuantized bool,
 	cfg *Config,
 ) *StackedExpertWeights {
 	var (
 		weights, scales, biases []*mlx.Array
 		bits, groupSize         int
 	)
 	for e := range numExperts {
 		expertPath := fmt.Sprintf("%s.mlp.experts.%d.%s", prefix, e, projName)
 		loaded := loadExpertWeight(tensors, expertPath, useQuantized, cfg)
 		if loaded == nil {
 			continue
 		}
 		if e == 0 {
 			bits, groupSize = loaded.Bits, loaded.GroupSize
 		}
 		weights = append(weights, loaded.Weight)
 		if loaded.Scales != nil {
 			scales = append(scales, loaded.Scales)
 		}
 		if loaded.Biases != nil {
 			biases = append(biases, loaded.Biases)
 		}
 	}
 	stacked := &StackedExpertWeights{Bits: bits, GroupSize: groupSize}
 	if len(weights) == 0 {
 		return stacked
 	}
 	stacked.Weight = mlx.Stack(weights, 0)
 	if len(scales) > 0 {
 		stacked.Scales = mlx.Stack(scales, 0)
 	}
 	if len(biases) > 0 {
 		stacked.Biases = mlx.Stack(biases, 0)
 	}
 	return stacked
 }
 // sanitizeExpertWeights stacks the per-expert gate_proj, up_proj and down_proj
 // weights of one layer (in that order) into one StackedExpertWeights each.
 func sanitizeExpertWeights(tensors map[string]*mlx.Array, prefix string, numExperts int32, useQuantized bool, cfg *Config) (gate, up, down *StackedExpertWeights) {
 	stack := func(projName string) *StackedExpertWeights {
 		return collectAndStackExpertWeights(tensors, prefix, projName, numExperts, useQuantized, cfg)
 	}
 	return stack("gate_proj"), stack("up_proj"), stack("down_proj")
 }
 // sanitizeMLAWeights transforms kv_b_proj weights into absorbed MLA format.
 //
 // The "<prefix>.self_attn.kv_b_proj" weight is dequantized if scales are
 // stored for it, reshaped to (NumAttentionHeads, QKNopeHeadDim+VHeadDim,
 // KVLoraRank) and split along the middle axis. The first QKNopeHeadDim rows
 // of every head are returned with their last two axes swapped; the remaining
 // rows are returned unchanged. Both results are nil when the weight is absent.
 func sanitizeMLAWeights(tensors map[string]*mlx.Array, prefix string, cfg *Config) (*mlx.Array, *mlx.Array) {
 	base := prefix + ".self_attn.kv_b_proj"
 	kvB := tensors[base+".weight"]
 	if kvB == nil {
 		return nil, nil
 	}
 	// A stored scale tensor marks the weight as quantized.
 	if scales := tensors[base+".weight_scale"]; scales != nil {
 		kvB = mlx.Dequantize(kvB, scales, tensors[base+".weight_qbias"], cfg.QuantGroupSize, cfg.QuantBits, cfg.QuantMode)
 	}
 	heads, nope, rank := cfg.NumAttentionHeads, cfg.QKNopeHeadDim, cfg.KVLoraRank
 	perHead := nope + cfg.VHeadDim
 	kvB = mlx.Reshape(kvB, heads, perHead, rank)
 	keyPart := mlx.SliceStartStop(kvB, []int32{0, 0, 0}, []int32{heads, nope, rank})
 	valuePart := mlx.SliceStartStop(kvB, []int32{0, nope, 0}, []int32{heads, perHead, rank})
 	return mlx.Transpose(keyPart, 0, 2, 1), valuePart
 }
 // makeLinear builds the linear layer stored under path in the tensor map.
 // It returns nil when path+".weight" is missing, a *nn.QuantizedLinear
 // (configured from cfg) when path+".weight_scale" exists, and the result of
 // nn.NewLinear otherwise. The optional path+".bias" tensor is passed through
 // in both cases.
 func makeLinear(tensors map[string]*mlx.Array, path string, cfg *Config) nn.LinearLayer {
 	weight := tensors[path+".weight"]
 	if weight == nil {
 		return nil
 	}
 	bias := tensors[path+".bias"]
 	scales := tensors[path+".weight_scale"]
 	if scales == nil {
 		return nn.NewLinear(weight, bias)
 	}
 	return &nn.QuantizedLinear{
 		Weight:    weight,
 		Scales:    scales,
 		QBiases:   tensors[path+".weight_qbias"],
 		Bias:      bias,
 		GroupSize: cfg.QuantGroupSize,
 		Bits:      cfg.QuantBits,
 		Mode:      cfg.QuantMode,
 	}
 }
 // LoadFromManifest loads a GLM4-MoE-Lite model from a manifest (Ollama blob storage).
 //
 // It reads config.json, loads every tensor blob into a flat name->array map,
 // derives the quantization parameters from blob metadata, loads the
 // tokenizer, and then assembles the embedding, final norm, LM head and each
 // transformer layer (dense for the first FirstKDenseReplace layers, MoE after
 // that). Components whose tensors are missing are left nil. An error is
 // returned when the config or tokenizer cannot be read or parsed.
 func LoadFromManifest(modelManifest *manifest.ModelManifest) (*Model, error) {
 	configData, err := modelManifest.ReadConfig("config.json")
 	if err != nil {
 		return nil, fmt.Errorf("load config: %w", err)
 	}
 	var cfg Config
 	if err := json.Unmarshal(configData, &cfg); err != nil {
 		return nil, fmt.Errorf("parse config: %w", err)
 	}
 	cfg.QHeadDim = cfg.QKNopeHeadDim + cfg.QKRopeHeadDim
 	cfg.Scale = computeScale(&cfg)
 	// Load all tensors from manifest blobs into a flat map
 	allTensors := make(map[string]*mlx.Array)
 	// Tensors named "<base>.bias" are either a quantization bias or a regular
 	// layer bias. Which one is only known once every ".scale" tensor has been
 	// seen, so they are parked here and classified after loading; this keeps
 	// the result independent of the order in which tensors are yielded.
 	pendingBiases := make(map[string]*mlx.Array)
 	seen := make(map[string]bool) // dedupe by digest
 	var quantType string
 	var quantGroupSize int
 	for _, layer := range modelManifest.GetTensorLayers("") {
 		if seen[layer.Digest] {
 			continue
 		}
 		seen[layer.Digest] = true
 		blobPath := modelManifest.BlobPath(layer.Digest)
 		// Read quantization metadata from the first blob that declares a quant type
 		if quantType == "" {
 			if meta, err := readBlobMetadata(blobPath); err == nil && meta != nil {
 				if qt := meta["quant_type"]; qt != "" {
 					quantType = strings.ToUpper(qt)
 				}
 				if gs := meta["group_size"]; gs != "" {
 					if _, err := fmt.Sscanf(gs, "%d", &quantGroupSize); err != nil {
 						// Unparseable group size: use the quant type's default below.
 						quantGroupSize = 0
 					}
 				}
 			}
 		}
 		for name, arr := range mlx.Load(blobPath) {
 			// Map safetensors key naming to our naming convention
 			// Combined blobs use ".scale" and ".bias" suffixes
 			switch {
 			case strings.HasSuffix(name, ".scale"):
 				allTensors[strings.TrimSuffix(name, ".scale")+"_scale"] = arr
 			case strings.HasSuffix(name, ".bias"):
 				pendingBiases[name] = arr
 			default:
 				allTensors[name] = arr
 			}
 		}
 	}
 	// A ".bias" tensor whose base name also has a scale is a quantization
 	// bias; anything else is a regular bias and keeps its name.
 	for name, arr := range pendingBiases {
 		baseName := strings.TrimSuffix(name, ".bias")
 		if _, hasScale := allTensors[baseName+"_scale"]; hasScale {
 			allTensors[baseName+"_qbias"] = arr
 		} else {
 			allTensors[name] = arr
 		}
 	}
 	// Set up quantization parameters
 	useQuantized := false
 	if quantType != "" {
 		_, cfg.QuantBits, cfg.QuantMode = quantizationParams(quantType)
 		if quantGroupSize > 0 {
 			cfg.QuantGroupSize = quantGroupSize
 		} else {
 			cfg.QuantGroupSize, _, _ = quantizationParams(quantType)
 		}
 		useQuantized = supportsGatherQMM(cfg.QuantMode, cfg.QuantBits)
 	}
 	// Load tokenizer
 	tokData, err := modelManifest.ReadConfig("tokenizer.json")
 	if err != nil {
 		return nil, fmt.Errorf("load tokenizer config: %w", err)
 	}
 	tokConfig := &tokenizer.TokenizerConfig{
 		ConfigJSON: configData,
 	}
 	// generation_config.json and tokenizer_config.json are optional.
 	if genConfigData, err := modelManifest.ReadConfig("generation_config.json"); err == nil {
 		tokConfig.GenerationConfigJSON = genConfigData
 	}
 	if tokConfigData, err := modelManifest.ReadConfig("tokenizer_config.json"); err == nil {
 		tokConfig.TokenizerConfigJSON = tokConfigData
 	}
 	tok, err := tokenizer.LoadFromBytesWithConfig(tokData, tokConfig)
 	if err != nil {
 		return nil, fmt.Errorf("parse tokenizer: %w", err)
 	}
 	m := &Model{
 		Layers: make([]Block, cfg.NumHiddenLayers),
 		Config: &cfg,
 		tok:    tok,
 	}
 	// Load embedding
 	if w := allTensors["model.embed_tokens.weight"]; w != nil {
 		m.EmbedTokens = nn.NewEmbedding(w)
 	}
 	// Load final norm
 	if w := allTensors["model.norm.weight"]; w != nil {
 		m.Norm = nn.NewRMSNorm(w, cfg.RMSNormEps)
 	}
 	// Load LM head
 	m.LMHead = makeLinear(allTensors, "lm_head", &cfg)
 	// Load layers
 	for i := int32(0); i < cfg.NumHiddenLayers; i++ {
 		prefix := fmt.Sprintf("model.layers.%d", i)
 		// Load attention (same for both block types)
 		attn := &MLAAttention{}
 		attn.QAProj = makeLinear(allTensors, prefix+".self_attn.q_a_proj", &cfg)
 		if w := allTensors[prefix+".self_attn.q_a_layernorm.weight"]; w != nil {
 			attn.QALayerNorm = nn.NewRMSNorm(w, cfg.RMSNormEps)
 		}
 		attn.QBProj = makeLinear(allTensors, prefix+".self_attn.q_b_proj", &cfg)
 		attn.KVAProjWithMQA = makeLinear(allTensors, prefix+".self_attn.kv_a_proj_with_mqa", &cfg)
 		if w := allTensors[prefix+".self_attn.kv_a_layernorm.weight"]; w != nil {
 			attn.KVALayerNorm = nn.NewRMSNorm(w, cfg.RMSNormEps)
 		}
 		attn.OProj = makeLinear(allTensors, prefix+".self_attn.o_proj", &cfg)
 		// Sanitize MLA weights for absorbed attention
 		embedQ, unembedOut := sanitizeMLAWeights(allTensors, prefix, &cfg)
 		attn.EmbedQ = nn.NewMultiLinear(embedQ)
 		attn.UnembedOut = nn.NewMultiLinear(unembedOut)
 		inputLN := allTensors[prefix+".input_layernorm.weight"]
 		postAttnLN := allTensors[prefix+".post_attention_layernorm.weight"]
 		if i < cfg.FirstKDenseReplace {
 			// Dense block
 			block := &DenseBlock{Attention: attn}
 			if inputLN != nil {
 				block.InputLayerNorm = nn.NewRMSNorm(inputLN, cfg.RMSNormEps)
 			}
 			if postAttnLN != nil {
 				block.PostAttentionLayerNorm = nn.NewRMSNorm(postAttnLN, cfg.RMSNormEps)
 			}
 			block.MLP = &DenseMLP{
 				GateProj: makeLinear(allTensors, prefix+".mlp.gate_proj", &cfg),
 				UpProj:   makeLinear(allTensors, prefix+".mlp.up_proj", &cfg),
 				DownProj: makeLinear(allTensors, prefix+".mlp.down_proj", &cfg),
 			}
 			m.Layers[i] = block
 		} else {
 			// MoE block
 			block := &MoEBlock{Attention: attn}
 			if inputLN != nil {
 				block.InputLayerNorm = nn.NewRMSNorm(inputLN, cfg.RMSNormEps)
 			}
 			if postAttnLN != nil {
 				block.PostAttentionLayerNorm = nn.NewRMSNorm(postAttnLN, cfg.RMSNormEps)
 			}
 			// Stack expert weights
 			gate, up, down := sanitizeExpertWeights(allTensors, prefix, cfg.NRoutedExperts, useQuantized, &cfg)
 			switchMLP := &SwitchMLP{UseQuantized: useQuantized}
 			if useQuantized {
 				switchMLP.GateWeightQ = gate.Weight
 				switchMLP.GateScales = gate.Scales
 				switchMLP.GateBiases = gate.Biases
 				switchMLP.GateBits = gate.Bits
 				switchMLP.GateGroupSize = gate.GroupSize
 				switchMLP.UpWeightQ = up.Weight
 				switchMLP.UpScales = up.Scales
 				switchMLP.UpBiases = up.Biases
 				switchMLP.UpBits = up.Bits
 				switchMLP.UpGroupSize = up.GroupSize
 				switchMLP.DownWeightQ = down.Weight
 				switchMLP.DownScales = down.Scales
 				switchMLP.DownBiases = down.Biases
 				switchMLP.DownBits = down.Bits
 				switchMLP.DownGroupSize = down.GroupSize
 			} else {
 				switchMLP.GateWeight = gate.Weight
 				switchMLP.UpWeight = up.Weight
 				switchMLP.DownWeight = down.Weight
 			}
 			moeGate := &MoEGate{}
 			moeGate.Gate = makeLinear(allTensors, prefix+".mlp.gate", &cfg)
 			if bias := allTensors[prefix+".mlp.gate.e_score_correction_bias"]; bias != nil {
 				moeGate.EScoreCorrectionBias = bias
 			}
 			block.MoE = &MoE{
 				Gate:      moeGate,
 				SwitchMLP: switchMLP,
 			}
 			// Load shared experts if present
 			if cfg.NSharedExperts > 0 {
 				block.MoE.SharedExperts = &SharedExperts{
 					GateProj: makeLinear(allTensors, prefix+".mlp.shared_experts.gate_proj", &cfg),
 					UpProj:   makeLinear(allTensors, prefix+".mlp.shared_experts.up_proj", &cfg),
 					DownProj: makeLinear(allTensors, prefix+".mlp.shared_experts.down_proj", &cfg),
 				}
 			}
 			m.Layers[i] = block
 		}
 	}
 	// Evaluate every array collected from the assembled model before returning.
 	mlx.Eval(mlx.Collect(m)...)
 	return m, nil
 }
 // Forward runs the token ids through the embedding, every layer in order and
 // the final norm, and returns the normalized hidden states (the LM head is
 // applied separately by Unembed). The first two dimensions of tokens are
 // passed to the layers as batch size and sequence length. When caches is
 // non-nil, caches[i] is handed to layer i; otherwise layers get a nil cache.
 func (m *Model) Forward(tokens *mlx.Array, caches []cache.Cache) *mlx.Array {
 	shape := tokens.Dims()
 	batch, seqLen := int32(shape[0]), int32(shape[1])
 	hidden := m.EmbedTokens.Forward(tokens)
 	for idx := range m.Layers {
 		var layerCache cache.Cache
 		if caches != nil {
 			layerCache = caches[idx]
 		}
 		hidden = m.Layers[idx].Forward(hidden, layerCache, batch, seqLen, m.Config)
 	}
 	return m.Norm.Forward(hidden, m.RMSNormEps)
 }
 // Unembed projects hidden states through the LM head and returns the logits.
 func (m *Model) Unembed(x *mlx.Array) *mlx.Array {
 	logits := m.LMHead.Forward(x)
 	return logits
 }
 // NumLayers reports how many transformer layers the model holds.
 func (m *Model) NumLayers() int {
 	count := len(m.Layers)
 	return count
 }
 // MaxContextLength reports the model's MaxPositionEmbeddings value as the
 // maximum context length.
 func (m *Model) MaxContextLength() int32 {
 	limit := m.MaxPositionEmbeddings
 	return limit
 }
 // VocabSize reports the vocabulary size recorded in the model config.
 func (m *Model) VocabSize() int32 {
 	size := m.Config.VocabSize
 	return size
 }
 // Tokenizer exposes the tokenizer that was loaded together with the model.
 func (m *Model) Tokenizer() *tokenizer.Tokenizer {
 	tok := m.tok
 	return tok
 }
 // NewCache returns one fresh KV cache per transformer layer.
 // maxSeqLen is accepted but not used by this implementation.
 func (m *Model) NewCache(maxSeqLen int32) []cache.Cache {
 	layerCaches := make([]cache.Cache, 0, len(m.Layers))
 	for range m.Layers {
 		layerCaches = append(layerCaches, cache.NewKVCache())
 	}
 	return layerCaches
 }
 // FormatPrompt wraps a single user prompt in the GLM-4 chat template and ends
 // it with an opening <think> tag, i.e. with thinking enabled.
 func (m *Model) FormatPrompt(prompt string) string {
 	const (
 		header = "[gMASK]<sop><|user|>"
 		footer = "<|assistant|><think>"
 	)
 	return header + prompt + footer
 }
 // FormatPromptWithThinking wraps a single user prompt in the GLM-4 chat
 // template. The prompt ends with <think> when think is true and with
 // </think> when it is false.
 func (m *Model) FormatPromptWithThinking(prompt string, think bool) string {
 	marker := "</think>"
 	if think {
 		marker = "<think>"
 	}
 	return "[gMASK]<sop><|user|>" + prompt + "<|assistant|>" + marker
 }
 // NewRenderer returns a fresh Renderer for formatting multi-turn conversations.
 func (m *Model) NewRenderer() *Renderer {
 	return new(Renderer)
 }
 // NewParser returns a fresh, zero-valued Parser for extracting thinking and
 // tool calls from model output.
 func (m *Model) NewParser() *Parser {
 	return new(Parser)
 }
--- a/x/models/glm4_moe_lite/parser.go
+++ b/x/models/glm4_moe_lite/parser.go
@@ -0,0 +1,479 @@
 //go:build mlx
 package glm4_moe_lite
 import (
 	"context"
 	"encoding/json"
 	"encoding/xml"
 	"fmt"
 	"log/slog"
 	"strconv"
 	"strings"
 	"unicode"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/logutil"
 )
 // parserState identifies which part of the model output the Parser is
 // currently consuming. The zero value is parserState_LookingForThinkingOpen.
 type parserState int
 const (
 	// Looking for a leading <think> tag; falls through to content if none is found.
 	parserState_LookingForThinkingOpen parserState = iota
 	// <think> was consumed; skipping whitespace before the thinking text.
 	parserState_ThinkingStartedEatingWhitespace
 	// Emitting thinking text until </think> is seen.
 	parserState_CollectingThinking
 	// </think> was consumed; skipping whitespace before the content.
 	parserState_ThinkingDoneEatingWhitespace
 	// Emitting content until <tool_call> is seen.
 	parserState_CollectingContent
 	// <tool_call> was consumed; skipping whitespace before the call body.
 	parserState_ToolStartedEatingWhitespace
 	// Buffering the raw tool call until </tool_call> is seen.
 	parserState_CollectingToolContent
 )
 // Literal tags the parser searches for in the model output.
 const (
 	thinkingOpenTag  = "<think>"      // starts a thinking section
 	thinkingCloseTag = "</think>"     // ends a thinking section
 	toolOpenTag      = "<tool_call>"  // starts a tool call
 	toolCloseTag     = "</tool_call>" // ends a tool call
 )
 // Parser parses GLM4-MoE-Lite model output to extract thinking and tool calls.
 // GLM-4's prompt ends with <think> when thinking is enabled, so the parser
 // must start in CollectingThinking state (the model outputs thinking content directly).
 // Init selects that state; a zero-valued Parser starts out looking for an
 // explicit <think> tag instead.
 type Parser struct {
 	state  parserState     // current position in the output state machine
 	buffer strings.Builder // text received but not yet emitted as an event
 	tools  []api.Tool      // tool definitions used to type-coerce tool call arguments
 }
 // HasToolSupport reports whether the parser can extract tool calls.
 // It always returns true, as GLM4 supports tool calling.
 func (p *Parser) HasToolSupport() bool {
 	return true
 }
 // HasThinkingSupport reports whether the parser can extract thinking content.
 // It always returns true, as GLM4 supports thinking mode.
 func (p *Parser) HasThinkingSupport() bool {
 	return true
 }
 // Init stores the tool definitions and selects the parser's starting state.
 //
 // With thinking enabled (thinkValue nil or true) the prompt already ends with
 // <think>, so the model's output begins with thinking text and the parser
 // starts in the CollectingThinking state. Otherwise the state is left as is.
 // lastMessage is not consulted. The tools are returned unchanged.
 func (p *Parser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
 	p.tools = tools
 	thinkingOn := thinkValue == nil || thinkValue.Bool()
 	if thinkingOn {
 		p.state = parserState_CollectingThinking
 	}
 	return tools
 }
 // parserEvent is the closed set of events produced by Parser.eat. The
 // unexported marker method restricts implementations to this package.
 type parserEvent interface {
 	isParserEvent()
 }
 // eventContent carries a piece of regular (non-thinking) output text.
 type eventContent struct {
 	content string
 }
 func (eventContent) isParserEvent() {}
 // eventRawToolCall carries the unparsed text found between <tool_call> and
 // </tool_call>.
 type eventRawToolCall struct {
 	raw string
 }
 func (eventRawToolCall) isParserEvent() {}
 // eventThinkingContent carries a piece of thinking text.
 type eventThinkingContent struct {
 	content string
 }
 func (eventThinkingContent) isParserEvent() {}
 // Add appends s to the parser's buffer, runs the state machine and returns
 // the content text, thinking text and tool calls that became unambiguous.
 //
 // Text that might still turn out to be part of a tag (or whitespace before
 // one) stays buffered for a later call. The done flag is not consulted. If a
 // tool call fails to parse, the failure is logged and returned, and the
 // content and thinking gathered during this call are discarded.
 func (p *Parser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
 	p.buffer.WriteString(s)
 	var (
 		contentOut  strings.Builder
 		thinkingOut strings.Builder
 		parsedCalls []api.ToolCall
 	)
 	for _, ev := range p.parseEvents() {
 		switch ev := ev.(type) {
 		case eventContent:
 			contentOut.WriteString(ev.content)
 		case eventThinkingContent:
 			thinkingOut.WriteString(ev.content)
 		case eventRawToolCall:
 			call, parseErr := parseToolCall(ev, p.tools)
 			if parseErr != nil {
 				slog.Warn("glm-4 tool call parsing failed", "error", parseErr)
 				return "", "", nil, parseErr
 			}
 			parsedCalls = append(parsedCalls, call)
 		}
 	}
 	return contentOut.String(), thinkingOut.String(), parsedCalls, nil
 }
 // parseEvents calls eat repeatedly until it reports that no further progress
 // is possible with the buffered text, and returns all events produced along
 // the way. A trace-level log entry is written when at least one event was
 // produced.
 func (p *Parser) parseEvents() []parserEvent {
 	var collected []parserEvent
 	for {
 		batch, more := p.eat()
 		collected = append(collected, batch...)
 		if !more {
 			break
 		}
 	}
 	if len(collected) > 0 {
 		slog.Log(context.TODO(), logutil.LevelTrace, "glm-4 events parsed", "events", collected, "state", p.state, "buffer", p.buffer.String())
 	}
 	return collected
 }
 // eatLeadingWhitespaceAndTransitionTo drops leading whitespace from the
 // buffer. If anything remains, the parser moves to nextState and (nil, true)
 // is returned; if the buffer held only whitespace it is left empty, the state
 // is unchanged and (nil, false) signals that more input is needed.
 func (p *Parser) eatLeadingWhitespaceAndTransitionTo(nextState parserState) ([]parserEvent, bool) {
 	rest := strings.TrimLeftFunc(p.buffer.String(), unicode.IsSpace)
 	p.buffer.Reset()
 	if rest != "" {
 		p.buffer.WriteString(rest)
 		p.state = nextState
 		return nil, true
 	}
 	// Nothing but whitespace so far; wait for more input.
 	return nil, false
 }
 // splitAtTag cuts the buffer at the first occurrence of tag. It returns the
 // text before the tag with trailing whitespace removed and the text after it
 // (with leading whitespace removed when trimAfter is set), and replaces the
 // buffer with that remainder. The caller must have verified that the buffer
 // contains tag; otherwise the index into the split result panics.
 func (p *Parser) splitAtTag(tag string, trimAfter bool) (string, string) {
 	parts := strings.SplitN(p.buffer.String(), tag, 2)
 	head, tail := parts[0], parts[1]
 	head = strings.TrimRightFunc(head, unicode.IsSpace)
 	if trimAfter {
 		tail = strings.TrimLeftFunc(tail, unicode.IsSpace)
 	}
 	p.buffer.Reset()
 	p.buffer.WriteString(tail)
 	return head, tail
 }
 // eat performs one step of the state machine on the buffered text. It returns
 // the events produced by that step and whether the caller should call eat
 // again (true) or wait for more input (false).
 func (p *Parser) eat() ([]parserEvent, bool) {
 	var events []parserEvent
 	switch p.state {
 	case parserState_LookingForThinkingOpen:
 		trimmed := strings.TrimLeftFunc(p.buffer.String(), unicode.IsSpace)
 		switch {
 		case strings.HasPrefix(trimmed, thinkingOpenTag):
 			// Found <think> opening tag
 			after := strings.TrimPrefix(trimmed, thinkingOpenTag)
 			after = strings.TrimLeftFunc(after, unicode.IsSpace)
 			p.buffer.Reset()
 			p.buffer.WriteString(after)
 			if after == "" {
 				p.state = parserState_ThinkingStartedEatingWhitespace
 			} else {
 				p.state = parserState_CollectingThinking
 			}
 			return events, true
 		case strings.HasPrefix(thinkingOpenTag, trimmed):
 			// Either only whitespace so far (trimmed is empty) or a partial
 			// opening tag: keep accumulating.
 			return events, false
 		default:
 			// No thinking tag found, skip to content collection.
 			// The buffer is left untrimmed to keep the original content.
 			p.state = parserState_CollectingContent
 			return events, true
 		}
 	case parserState_ThinkingStartedEatingWhitespace:
 		return p.eatLeadingWhitespaceAndTransitionTo(parserState_CollectingThinking)
 	case parserState_CollectingThinking:
 		if strings.Contains(p.buffer.String(), thinkingCloseTag) {
 			thinking, remaining := p.splitAtTag(thinkingCloseTag, true)
 			if len(thinking) > 0 {
 				events = append(events, eventThinkingContent{content: thinking})
 			}
 			if remaining == "" {
 				p.state = parserState_ThinkingDoneEatingWhitespace
 			} else {
 				p.state = parserState_CollectingContent
 			}
 			return events, true
 		}
 		if text := p.withholdAmbiguousTail(thinkingCloseTag); len(text) > 0 {
 			events = append(events, eventThinkingContent{content: text})
 		}
 		return events, false
 	case parserState_ThinkingDoneEatingWhitespace:
 		return p.eatLeadingWhitespaceAndTransitionTo(parserState_CollectingContent)
 	case parserState_CollectingContent:
 		if strings.Contains(p.buffer.String(), toolOpenTag) {
 			before, after := p.splitAtTag(toolOpenTag, true)
 			if len(before) > 0 {
 				events = append(events, eventContent{content: before})
 			}
 			if after == "" {
 				p.state = parserState_ToolStartedEatingWhitespace
 			} else {
 				p.state = parserState_CollectingToolContent
 			}
 			return events, true
 		}
 		if text := p.withholdAmbiguousTail(toolOpenTag); len(text) > 0 {
 			events = append(events, eventContent{content: text})
 		}
 		return events, false
 	case parserState_ToolStartedEatingWhitespace:
 		return p.eatLeadingWhitespaceAndTransitionTo(parserState_CollectingToolContent)
 	case parserState_CollectingToolContent:
 		if !strings.Contains(p.buffer.String(), toolCloseTag) {
 			// Keep accumulating - tool calls are not streamed.
 			// We just wait for the closing tag.
 			return events, false
 		}
 		toolContent, _ := p.splitAtTag(toolCloseTag, true)
 		if len(toolContent) == 0 {
 			slog.Warn("glm4 tool call closing tag found but no content before it")
 		}
 		events = append(events, eventRawToolCall{raw: toolContent})
 		p.state = parserState_CollectingContent
 		return events, true
 	default:
 		panic("unreachable")
 	}
 }
 // withholdAmbiguousTail splits the buffer into text that can be emitted now
 // and a tail that must wait for more input. The tail is any partial
 // occurrence of tag at the end of the buffer plus the whitespace directly
 // before it (or just the trailing whitespace when there is no partial tag),
 // because that whitespace is trimmed if the tag completes. The tail is kept
 // in the buffer and the emittable text is returned. It is meant to be called
 // only when the buffer does not contain the complete tag.
 func (p *Parser) withholdAmbiguousTail(tag string) string {
 	acc := p.buffer.String()
 	beforePartialTag := acc[:len(acc)-overlap(acc, tag)]
 	ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen(beforePartialTag)
 	p.buffer.Reset()
 	p.buffer.WriteString(acc[ambiguousStart:])
 	return acc[:ambiguousStart]
 }
 // overlap returns the length of the longest suffix of s that is also a prefix
 // of tag, or 0 when there is none. Candidates are tried from longest to
 // shortest so that a tag whose prefix repeats (for example "aab" after "xaa")
 // reports the full partial match rather than a shorter one.
 func overlap(s, tag string) int {
 	longest := len(tag)
 	if len(s) < longest {
 		longest = len(s)
 	}
 	for i := longest; i > 0; i-- {
 		if strings.HasSuffix(s, tag[:i]) {
 			return i
 		}
 	}
 	return 0
 }
 // trailingWhitespaceLen reports how many bytes of Unicode whitespace s ends with.
 func trailingWhitespaceLen(s string) int {
 	return len(s) - len(strings.TrimRightFunc(s, unicode.IsSpace))
 }
 // ToolCallXML represents the structure of a GLM-4 tool call for XML parsing.
 // parseToolCall unmarshals "<tool_call>…</tool_call>" into it and pairs
 // Keys[i] with Values[i] to build the call's arguments.
 type ToolCallXML struct {
 	XMLName xml.Name `xml:"tool_call"`
 	Content string   `xml:",chardata"` // Function name (text nodes between tags)
 	Keys    []string `xml:"arg_key"`   // All arg_key elements in document order
 	Values  []string `xml:"arg_value"` // All arg_value elements in document order
 }
 // escapeContent makes raw tool call text safe for the XML decoder. The four
 // structural tags (<arg_key>, </arg_key>, <arg_value>, </arg_value>) are
 // copied through unchanged; everywhere else '&', '<' and '>' are replaced by
 // their XML entities. All other bytes are copied as they are.
 func escapeContent(s string) string {
 	knownTags := [...]string{"<arg_key>", "</arg_key>", "<arg_value>", "</arg_value>"}
 	var out strings.Builder
 	for pos := 0; pos < len(s); {
 		if s[pos] == '<' {
 			// Copy a recognised structural tag through in one piece.
 			matched := ""
 			for _, tag := range knownTags {
 				if strings.HasPrefix(s[pos:], tag) {
 					matched = tag
 					break
 				}
 			}
 			if matched != "" {
 				out.WriteString(matched)
 				pos += len(matched)
 				continue
 			}
 		}
 		// Plain text: escape the characters that are special in XML.
 		switch c := s[pos]; c {
 		case '&':
 			out.WriteString("&amp;")
 		case '<':
 			out.WriteString("&lt;")
 		case '>':
 			out.WriteString("&gt;")
 		default:
 			out.WriteByte(c)
 		}
 		pos++
 	}
 	return out.String()
 }
 // parseToolCall converts the raw text of a tool call into an api.ToolCall.
 //
 // The text is escaped with escapeContent, wrapped in a <tool_call> root
 // element and decoded as XML. The trimmed character data is the function
 // name; arg_key/arg_value elements are paired by position. When a tool with
 // that name exists in tools, each value is coerced according to the declared
 // parameter type (the union of all anyOf types if anyOf is present);
 // otherwise values stay strings. An error is returned when the XML is
 // invalid, the function name is empty, or the key and value counts differ.
 func parseToolCall(raw eventRawToolCall, tools []api.Tool) (api.ToolCall, error) {
 	// Escape stray entities, then add a root element to form valid XML.
 	doc := "<tool_call>" + escapeContent(raw.raw) + "</tool_call>"
 	var parsed ToolCallXML
 	if err := xml.Unmarshal([]byte(doc), &parsed); err != nil {
 		return api.ToolCall{}, fmt.Errorf("failed to parse XML: %w", err)
 	}
 	name := strings.TrimSpace(parsed.Content)
 	if name == "" {
 		return api.ToolCall{}, fmt.Errorf("empty function name")
 	}
 	if nKeys, nValues := len(parsed.Keys), len(parsed.Values); nKeys != nValues {
 		return api.ToolCall{}, fmt.Errorf("mismatched arg_key and arg_value counts: %d keys, %d values", nKeys, nValues)
 	}
 	// Locate the tool definition so argument types can be looked up.
 	var matchedTool *api.Tool
 	for idx := range tools {
 		if tools[idx].Function.Name == name {
 			matchedTool = &tools[idx]
 			break
 		}
 	}
 	// typeOf returns the declared type(s) of a parameter, or nil if unknown.
 	typeOf := func(key string) api.PropertyType {
 		var declared api.PropertyType
 		if matchedTool == nil || matchedTool.Function.Parameters.Properties == nil {
 			return declared
 		}
 		prop, ok := matchedTool.Function.Parameters.Properties.Get(key)
 		if !ok {
 			return declared
 		}
 		if len(prop.AnyOf) == 0 {
 			return prop.Type
 		}
 		// anyOf: collect every type of the union.
 		for _, alt := range prop.AnyOf {
 			declared = append(declared, alt.Type...)
 		}
 		return declared
 	}
 	call := api.ToolCall{
 		Function: api.ToolCallFunction{
 			Name:      name,
 			Arguments: api.NewToolCallFunctionArguments(),
 		},
 	}
 	for idx, rawKey := range parsed.Keys {
 		key := strings.TrimSpace(rawKey)
 		// The value is passed untrimmed; parseValue trims it.
 		call.Function.Arguments.Set(key, parseValue(parsed.Values[idx], typeOf(key)))
 	}
 	return call, nil
 }
 // parseValue trims value and coerces it to the first type in paramType that
 // it fully matches:
 //
 //   - "boolean": exactly "true" or "false"
 //   - "integer": a base-10 integer that fits in int64 (returned as int64)
 //   - "number":  a floating-point number (returned as float64)
 //   - "array" / "object": any valid JSON document (returned as decoded)
 //
 // The whole string must match: inputs such as "12abc" or, for "integer",
 // "3.14" are not partially converted. When paramType is empty or nothing
 // matches, the trimmed string itself is returned.
 func parseValue(value string, paramType api.PropertyType) any {
 	value = strings.TrimSpace(value)
 	for _, t := range paramType {
 		switch t {
 		case "boolean":
 			switch value {
 			case "true":
 				return true
 			case "false":
 				return false
 			}
 		case "integer":
 			if i, err := strconv.ParseInt(value, 10, 64); err == nil {
 				return i
 			}
 		case "number":
 			if f, err := strconv.ParseFloat(value, 64); err == nil {
 				return f
 			}
 		case "array", "object":
 			// Try to parse as JSON
 			var result any
 			if err := json.Unmarshal([]byte(value), &result); err == nil {
 				return result
 			}
 		}
 	}
 	// Default to string
 	return value
 }
--- a/x/models/glm4_moe_lite/parser_test.go
+++ b/x/models/glm4_moe_lite/parser_test.go
@@ -0,0 +1,192 @@
 //go:build mlx
 package glm4_moe_lite
 import (
 	"testing"
 	"github.com/ollama/ollama/api"
 )
 func TestParserThinking(t *testing.T) {
 	tests := []struct {
 		name          string
 		input         string
 		thinkEnabled  bool
 		wantContent   string
 		wantThinking  string
 		wantToolCalls int
 	}{
 		{
 			name:         "thinking enabled - simple thinking then content",
 			input:        "Let me think about this...</think>Here is my answer.",
 			thinkEnabled: true,
 			wantThinking: "Let me think about this...",
 			wantContent:  "Here is my answer.",
 		},
 		{
 			name:         "thinking enabled - only thinking",
 			input:        "I need to consider multiple factors...",
 			thinkEnabled: true,
 			wantThinking: "I need to consider multiple factors...",
 			wantContent:  "",
 		},
 		{
 			name:         "thinking disabled - direct content",
 			input:        "Here is my direct answer.",
 			thinkEnabled: false,
 			wantThinking: "",
 			wantContent:  "Here is my direct answer.",
 		},
 		{
 			name:          "thinking with tool call",
 			input:         "Let me search for that...</think>I'll use a tool.<tool_call>search<arg_key>query</arg_key><arg_value>test</arg_value></tool_call>",
 			thinkEnabled:  true,
 			wantThinking:  "Let me search for that...",
 			wantContent:   "I'll use a tool.",
 			wantToolCalls: 1,
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			p := &Parser{}
 			var thinkValue *api.ThinkValue
 			if tt.thinkEnabled {
 				thinkValue = &api.ThinkValue{Value: true}
 			} else {
 				thinkValue = &api.ThinkValue{Value: false}
 			}
 			// Define tools for tool call tests
 			props := api.NewToolPropertiesMap()
 			props.Set("query", api.ToolProperty{Type: api.PropertyType{"string"}})
 			tools := []api.Tool{
 				{
 					Function: api.ToolFunction{
 						Name: "search",
 						Parameters: api.ToolFunctionParameters{
 							Properties: props,
 						},
 					},
 				},
 			}
 			p.Init(tools, nil, thinkValue)
 			content, thinking, calls, err := p.Add(tt.input, true)
 			if err != nil {
 				t.Fatalf("unexpected error: %v", err)
 			}
 			if thinking != tt.wantThinking {
 				t.Errorf("thinking = %q, want %q", thinking, tt.wantThinking)
 			}
 			if content != tt.wantContent {
 				t.Errorf("content = %q, want %q", content, tt.wantContent)
 			}
 			if len(calls) != tt.wantToolCalls {
 				t.Errorf("len(calls) = %d, want %d", len(calls), tt.wantToolCalls)
 			}
 		})
 	}
 }
 func TestParserToolCall(t *testing.T) {
 	p := &Parser{}
 	props := api.NewToolPropertiesMap()
 	props.Set("location", api.ToolProperty{Type: api.PropertyType{"string"}})
 	props.Set("unit", api.ToolProperty{Type: api.PropertyType{"string"}})
 	tools := []api.Tool{
 		{
 			Function: api.ToolFunction{
 				Name: "get_weather",
 				Parameters: api.ToolFunctionParameters{
 					Properties: props,
 				},
 			},
 		},
 	}
 	// Initialize with thinking disabled
 	tv := &api.ThinkValue{Value: false}
 	p.Init(tools, nil, tv)
 	input := "<tool_call>get_weather<arg_key>location</arg_key><arg_value>San Francisco</arg_value><arg_key>unit</arg_key><arg_value>celsius</arg_value></tool_call>"
 	_, _, calls, err := p.Add(input, true)
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
 	if len(calls) != 1 {
 		t.Fatalf("expected 1 tool call, got %d", len(calls))
 	}
 	call := calls[0]
 	if call.Function.Name != "get_weather" {
 		t.Errorf("function name = %q, want %q", call.Function.Name, "get_weather")
 	}
 	location, ok := call.Function.Arguments.Get("location")
 	if !ok || location != "San Francisco" {
 		t.Errorf("location = %v, want %q", location, "San Francisco")
 	}
 	unit, ok := call.Function.Arguments.Get("unit")
 	if !ok || unit != "celsius" {
 		t.Errorf("unit = %v, want %q", unit, "celsius")
 	}
 }
 // TestOverlap checks overlap for every partial prefix of "</think>" at the end
 // of the input, for the complete tag, and for inputs with no overlap.
 func TestOverlap(t *testing.T) {
 	const closeTag = "</think>"
 	cases := []struct {
 		input string
 		tag   string
 		want  int
 	}{
 		{input: "hello<", tag: closeTag, want: 1},
 		{input: "hello</", tag: closeTag, want: 2},
 		{input: "hello</t", tag: closeTag, want: 3},
 		{input: "hello</th", tag: closeTag, want: 4},
 		{input: "hello</thi", tag: closeTag, want: 5},
 		{input: "hello</thin", tag: closeTag, want: 6},
 		{input: "hello</think", tag: closeTag, want: 7},
 		{input: "hello</think>", tag: closeTag, want: 8}, // Complete tag at end returns full length
 		{input: "hello", tag: closeTag, want: 0},
 		{input: "", tag: closeTag, want: 0},
 	}
 	for _, tc := range cases {
 		t.Run(tc.input+"_"+tc.tag, func(t *testing.T) {
 			if got := overlap(tc.input, tc.tag); got != tc.want {
 				t.Errorf("overlap(%q, %q) = %d, want %d", tc.input, tc.tag, got, tc.want)
 			}
 		})
 	}
 }
 // TestTrailingWhitespaceLen checks trailingWhitespaceLen on inputs with
 // trailing spaces, mixed whitespace, no whitespace, the empty string and a
 // whitespace-only string.
 func TestTrailingWhitespaceLen(t *testing.T) {
 	cases := []struct {
 		input string
 		want  int
 	}{
 		{input: "hello   ", want: 3},
 		{input: "hello\n\t ", want: 3},
 		{input: "hello", want: 0},
 		{input: "", want: 0},
 		{input: "   ", want: 3},
 	}
 	for _, tc := range cases {
 		t.Run(tc.input, func(t *testing.T) {
 			if got := trailingWhitespaceLen(tc.input); got != tc.want {
 				t.Errorf("trailingWhitespaceLen(%q) = %d, want %d", tc.input, got, tc.want)
 			}
 		})
 	}
 }
--- a/x/models/glm4_moe_lite/render.go
+++ b/x/models/glm4_moe_lite/render.go
@@ -0,0 +1,175 @@
 //go:build mlx
 package glm4_moe_lite
 import (
 	"encoding/json"
 	"fmt"
 	"strings"
 	"github.com/ollama/ollama/api"
 )
 // Renderer renders messages for GLM4-MoE-Lite models.
 //
 // Renderer has no fields, so the zero value is ready to use.
 //
 // GLM-4 Thinking Modes (ref: https://docs.z.ai/guides/capabilities/thinking-mode):
 //
 //  1. INTERLEAVED THINKING
 //     The model thinks between tool calls and after receiving tool results.
 //     This enables complex step-by-step reasoning: interpreting each tool output
 //     before deciding what to do next. Thinking blocks are preserved and returned
 //     with tool results to maintain reasoning continuity.
 //
 //  2. PRESERVED THINKING
 //     The model retains reasoning content from previous assistant turns in context.
 //     This preserves reasoning continuity across multi-turn conversations. The
 //     upstream API has a "clear_thinking" parameter to control this:
 //     - clear_thinking=true:  clears reasoning from previous turns (outputs </think>)
 //     - clear_thinking=false: preserves <think>...</think> blocks from previous turns
 //
 //  3. TURN-LEVEL THINKING
 //     Controls whether the model should reason on each turn. The upstream API
 //     uses "enable_thinking" parameter:
 //     - enable_thinking=true:  outputs <think> to start reasoning
 //     - enable_thinking=false: outputs </think> to skip reasoning
 //
 // OLLAMA DEFAULTS:
 //   - Thinking is ENABLED by default (thinkValue=nil or true outputs <think>)
 //   - Thinking is PRESERVED by default (reasoning content from previous turns is always
 //     included in <think>...</think> blocks, equivalent to clear_thinking=false).
 //     A previous assistant turn with no reasoning content is rendered with a bare
 //     </think> instead.
 //   - Users can disable thinking per-turn via thinkValue=false
 type Renderer struct{}
 // Render renders messages into the GLM4 chat format.
 //
 // The prompt always starts with "[gMASK]<sop>". When tools are supplied, a
 // system block describing them (one JSON definition per line inside
 // <tools></tools>) and the expected <tool_call> format is emitted before any
 // message. Each message is then written behind its role tag; consecutive tool
 // messages share a single <|observation|> tag, and messages with any other
 // role are skipped. The prompt ends with an open assistant turn: "<think>" when
 // thinking is enabled (thinkValue is nil or true) and "</think>" otherwise.
 //
 // An error is returned if a tool definition cannot be marshaled to JSON.
 func (r *Renderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
 	var sb strings.Builder
 	sb.WriteString("[gMASK]<sop>")
 	if len(tools) > 0 {
 		sb.WriteString("<|system|>\n")
 		sb.WriteString("# Tools\n\n")
 		sb.WriteString("You may call one or more functions to assist with the user query.\n\n")
 		sb.WriteString("You are provided with function signatures within <tools></tools> XML tags:\n")
 		sb.WriteString("<tools>\n")
 		for _, tool := range tools {
 			d, err := json.Marshal(tool)
 			if err != nil {
 				// Surface the failure instead of silently emitting an empty tool definition.
 				return "", fmt.Errorf("marshaling tool %q: %w", tool.Function.Name, err)
 			}
 			sb.WriteString(formatToolJSON(d))
 			sb.WriteString("\n")
 		}
 		sb.WriteString("</tools>\n\n")
 		sb.WriteString("For each function call, output the function name and arguments within the following XML format:\n")
 		sb.WriteString("<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>")
 	}
 	// Thinking is on unless the caller explicitly turned it off.
 	think := thinkValue == nil || thinkValue.Bool()
 	for i, message := range messages {
 		switch message.Role {
 		case "user":
 			sb.WriteString("<|user|>")
 			sb.WriteString(message.Content)
 		case "assistant":
 			sb.WriteString("<|assistant|>")
 			// Previous reasoning is preserved; a turn without reasoning gets a bare close tag.
 			if message.Thinking != "" {
 				sb.WriteString("<think>" + message.Thinking + "</think>")
 			} else {
 				sb.WriteString("</think>")
 			}
 			sb.WriteString(message.Content)
 			for _, toolCall := range message.ToolCalls {
 				sb.WriteString("<tool_call>" + toolCall.Function.Name)
 				sb.WriteString(renderToolArguments(toolCall.Function.Arguments))
 				sb.WriteString("</tool_call>")
 			}
 		case "tool":
 			// Only the first tool message of a run opens the observation turn.
 			if i == 0 || messages[i-1].Role != "tool" {
 				sb.WriteString("<|observation|>")
 			}
 			sb.WriteString("<tool_response>")
 			sb.WriteString(message.Content)
 			sb.WriteString("</tool_response>")
 		case "system":
 			sb.WriteString("<|system|>")
 			sb.WriteString(message.Content)
 		}
 	}
 	sb.WriteString("<|assistant|>")
 	if think {
 		sb.WriteString("<think>")
 	} else {
 		sb.WriteString("</think>")
 	}
 	return sb.String(), nil
 }
 // renderToolArguments converts tool call arguments to GLM4 XML format.
 //
 // Each argument becomes an <arg_key>/<arg_value> pair. String values are
 // written verbatim; every other value is JSON-encoded, falling back to its
 // default fmt representation if encoding fails.
 func renderToolArguments(args api.ToolCallFunctionArguments) string {
 	var out strings.Builder
 	for name, raw := range args.All() {
 		var rendered string
 		switch v := raw.(type) {
 		case string:
 			rendered = v
 		default:
 			if encoded, err := json.Marshal(raw); err == nil {
 				rendered = string(encoded)
 			} else {
 				rendered = fmt.Sprintf("%v", raw)
 			}
 		}
 		out.WriteString("<arg_key>")
 		out.WriteString(name)
 		out.WriteString("</arg_key>")
 		out.WriteString("<arg_value>")
 		out.WriteString(rendered)
 		out.WriteString("</arg_value>")
 	}
 	return out.String()
 }
 // formatToolJSON formats JSON for GLM4 tool definitions by adding spaces after : and ,
 //
 // Only separators outside of string literals get a trailing space; bytes inside
 // a quoted string, including backslash-escaped quotes, are copied unchanged.
 func formatToolJSON(raw []byte) string {
 	var out strings.Builder
 	out.Grow(len(raw) + len(raw)/10)
 	quoted, pendingEscape := false, false
 	for _, b := range raw {
 		out.WriteByte(b)
 		switch {
 		case !quoted:
 			switch b {
 			case '"':
 				quoted = true
 			case ':', ',':
 				out.WriteByte(' ')
 			}
 		case pendingEscape:
 			// The byte after a backslash never terminates the string.
 			pendingEscape = false
 		case b == '\\':
 			pendingEscape = true
 		case b == '"':
 			quoted = false
 		}
 	}
 	return out.String()
 }
--- a/x/models/glm4_moe_lite/render_test.go
+++ b/x/models/glm4_moe_lite/render_test.go
@@ -0,0 +1,205 @@
 //go:build mlx
 package glm4_moe_lite
 import (
 	"strings"
 	"testing"
 	"github.com/ollama/ollama/api"
 )
 // TestRendererSimple checks the exact prompt for a single user message with
 // thinking left at its default (enabled).
 func TestRendererSimple(t *testing.T) {
 	const want = "[gMASK]<sop><|user|>Hello<|assistant|><think>"
 	var renderer Renderer
 	// A nil think value means thinking is enabled.
 	got, err := renderer.Render([]api.Message{{Role: "user", Content: "Hello"}}, nil, nil)
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
 	if got != want {
 		t.Errorf("result = %q, want %q", got, want)
 	}
 }
 // TestRendererThinkingDisabled checks that an explicit false think value makes
 // the prompt end with a closing think tag.
 func TestRendererThinkingDisabled(t *testing.T) {
 	const want = "[gMASK]<sop><|user|>Hello<|assistant|></think>"
 	var renderer Renderer
 	disabled := &api.ThinkValue{Value: false}
 	got, err := renderer.Render([]api.Message{{Role: "user", Content: "Hello"}}, nil, disabled)
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
 	if got != want {
 		t.Errorf("result = %q, want %q", got, want)
 	}
 }
 // TestRendererMultiTurn checks that a conversation with a previous assistant
 // turn keeps that turn's reasoning and ends with an open thinking block.
 func TestRendererMultiTurn(t *testing.T) {
 	r := &Renderer{}
 	messages := []api.Message{
 		{Role: "user", Content: "What is 2+2?"},
 		{Role: "assistant", Content: "4", Thinking: "Let me calculate: 2+2=4"},
 		{Role: "user", Content: "And 3+3?"},
 	}
 	result, err := r.Render(messages, nil, nil)
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
 	// Check key parts
 	if !strings.Contains(result, "[gMASK]<sop>") {
 		t.Error("missing [gMASK]<sop> prefix")
 	}
 	if !strings.Contains(result, "<|user|>What is 2+2?") {
 		t.Error("missing first user message")
 	}
 	if !strings.Contains(result, "<|assistant|><think>Let me calculate: 2+2=4</think>4") {
 		t.Error("missing assistant message with thinking")
 	}
 	if !strings.Contains(result, "<|user|>And 3+3?") {
 		t.Error("missing second user message")
 	}
 	if !strings.HasSuffix(result, "<|assistant|><think>") {
 		// Clamp the tail so a short result reports a failure instead of
 		// panicking with a slice-bounds error.
 		tail := result
 		if len(tail) > 30 {
 			tail = tail[len(tail)-30:]
 		}
 		t.Errorf("should end with <|assistant|><think>, got suffix: %q", tail)
 	}
 }
 // TestRendererWithSystem checks that a system message is rendered behind the
 // system role tag.
 func TestRendererWithSystem(t *testing.T) {
 	var renderer Renderer
 	conversation := []api.Message{
 		{Role: "system", Content: "You are a helpful assistant."},
 		{Role: "user", Content: "Hello"},
 	}
 	got, err := renderer.Render(conversation, nil, nil)
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
 	const wantSystem = "<|system|>You are a helpful assistant."
 	if !strings.Contains(got, wantSystem) {
 		t.Error("missing system message")
 	}
 }
 // TestRendererWithTools checks that supplying a tool produces the tool system
 // prompt, including the <tools> wrapper and the tool's name.
 func TestRendererWithTools(t *testing.T) {
 	var renderer Renderer
 	properties := api.NewToolPropertiesMap()
 	properties.Set("location", api.ToolProperty{Type: api.PropertyType{"string"}, Description: "The city"})
 	weatherTool := api.Tool{
 		Function: api.ToolFunction{
 			Name:        "get_weather",
 			Description: "Get the weather for a location",
 			Parameters: api.ToolFunctionParameters{
 				Type:       "object",
 				Properties: properties,
 				Required:   []string{"location"},
 			},
 		},
 	}
 	conversation := []api.Message{{Role: "user", Content: "What's the weather?"}}
 	got, err := renderer.Render(conversation, []api.Tool{weatherTool}, nil)
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
 	// Each fragment of the tool system prompt is checked independently.
 	expectations := []struct {
 		fragment string
 		failure  string
 	}{
 		{"<|system|>", "missing system tag for tools"},
 		{"# Tools", "missing tools header"},
 		{"<tools>", "missing tools tag"},
 		{"get_weather", "missing tool name"},
 		{"</tools>", "missing closing tools tag"},
 	}
 	for _, exp := range expectations {
 		if !strings.Contains(got, exp.fragment) {
 			t.Error(exp.failure)
 		}
 	}
 }
 // TestRendererWithToolCalls checks the rendering of an assistant tool call and
 // the tool response that follows it.
 func TestRendererWithToolCalls(t *testing.T) {
 	var renderer Renderer
 	callArgs := api.NewToolCallFunctionArguments()
 	callArgs.Set("location", "San Francisco")
 	weatherCall := api.ToolCall{
 		Function: api.ToolCallFunction{
 			Name:      "get_weather",
 			Arguments: callArgs,
 		},
 	}
 	conversation := []api.Message{
 		{Role: "user", Content: "What's the weather in SF?"},
 		{Role: "assistant", ToolCalls: []api.ToolCall{weatherCall}},
 		{Role: "tool", Content: "Sunny, 72F"},
 	}
 	got, err := renderer.Render(conversation, nil, nil)
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
 	expectations := []struct {
 		fragment string
 		failure  string
 	}{
 		{"<tool_call>get_weather", "missing tool call"},
 		{"<arg_key>location</arg_key>", "missing arg_key"},
 		{"<arg_value>San Francisco</arg_value>", "missing arg_value"},
 		{"</tool_call>", "missing tool call closing tag"},
 		{"<|observation|>", "missing observation tag"},
 		{"<tool_response>Sunny, 72F</tool_response>", "missing tool response"},
 	}
 	for _, exp := range expectations {
 		if !strings.Contains(got, exp.fragment) {
 			t.Error(exp.failure)
 		}
 	}
 }
 // TestFormatToolJSON verifies that formatToolJSON inserts a space after every
 // structural ':' and ',' while leaving the contents of string literals,
 // including escaped quotes, untouched. Exact outputs are compared so that a
 // missed separator or a space injected into a string is caught.
 func TestFormatToolJSON(t *testing.T) {
 	tests := []struct {
 		name  string
 		input string
 		want  string
 	}{
 		{"flat object", `{"name":"test","value":123}`, `{"name": "test", "value": 123}`},
 		{"separators inside strings", `{"a":"x:y,z"}`, `{"a": "x:y,z"}`},
 		{"escaped quote inside string", `{"a":"q\":,","b":[1,2]}`, `{"a": "q\":,", "b": [1, 2]}`},
 		{"empty input", ``, ``},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			if got := formatToolJSON([]byte(tt.input)); got != tt.want {
 				t.Errorf("formatToolJSON(%q) = %q, want %q", tt.input, got, tt.want)
 			}
 		})
 	}
 }
--- a/x/models/nn/nn.go
+++ b/x/models/nn/nn.go
@@ -0,0 +1,188 @@
 //go:build mlx
 package nn
 import "github.com/ollama/ollama/x/mlxrunner/mlx"
 // Layer is the interface for neural network layers with a Forward method.
 type Layer interface {
 	// Forward applies the layer to x and returns the resulting array.
 	Forward(x *mlx.Array) *mlx.Array
 }
 // LinearLayer is an interface for linear layers (both regular and quantized).
 // In this file it is implemented by *Linear and *QuantizedLinear.
 type LinearLayer interface {
 	// Forward applies the linear transformation to x.
 	Forward(x *mlx.Array) *mlx.Array
 	// OutputDim reports the layer's output dimension.
 	OutputDim() int32
 }
 // Linear applies an affine transformation: y = x @ W.T + b
 type Linear struct {
 	// Weight is transposed by Forward before the multiplication; OutputDim
 	// reports its first dimension.
 	Weight *mlx.Array
 	// Bias is optional: Forward skips it when it is nil or not valid.
 	Bias   *mlx.Array
 }
 // NewLinear returns a Linear layer holding the given weight and bias.
 func NewLinear(weight *mlx.Array, bias *mlx.Array) *Linear {
 	layer := Linear{Weight: weight, Bias: bias}
 	return &layer
 }
 // Forward multiplies x by the transposed weight. When a valid bias is present
 // the product is computed through the bias's Addmm with both scale factors 1.
 func (l *Linear) Forward(x *mlx.Array) *mlx.Array {
 	transposed := l.Weight.Transpose(1, 0)
 	if l.Bias == nil || !l.Bias.Valid() {
 		return x.Matmul(transposed)
 	}
 	return l.Bias.Addmm(x, transposed, 1.0, 1.0)
 }
 // OutputDim returns the size of the weight's first dimension.
 func (l *Linear) OutputDim() int32 {
 	rows := l.Weight.Dim(0)
 	return int32(rows)
 }
 // QuantizedLinear applies an affine transformation using quantized weights.
 //
 // NewQuantizedLinear fills Weight, Scales and QBiases from mlx.Quantize;
 // GroupSize, Bits and Mode are the same quantization parameters and are
 // passed on to mlx.QuantizedMatmul by Forward.
 type QuantizedLinear struct {
 	Weight    *mlx.Array // Quantized weight data
 	Scales    *mlx.Array // Scale factors for dequantization
 	QBiases   *mlx.Array // Quantization biases (nil for nvfp4)
 	Bias      *mlx.Array // Layer bias [output_dims] or nil
 	GroupSize int
 	Bits      int
 	Mode      string
 }
 // NewQuantizedLinear quantizes weight with the given group size, bit width and
 // mode, evaluates the resulting arrays, and returns a layer wrapping them
 // together with the optional bias.
 func NewQuantizedLinear(weight *mlx.Array, bias *mlx.Array, groupSize, bits int, mode string) *QuantizedLinear {
 	quantized, scales, qbiases := mlx.Quantize(weight, groupSize, bits, mode)
 	// Quantization biases may be absent; only evaluate what exists.
 	if qbiases == nil {
 		mlx.Eval(quantized, scales)
 	} else {
 		mlx.Eval(quantized, scales, qbiases)
 	}
 	layer := &QuantizedLinear{
 		GroupSize: groupSize,
 		Bits:      bits,
 		Mode:      mode,
 	}
 	layer.Weight = quantized
 	layer.Scales = scales
 	layer.QBiases = qbiases
 	layer.Bias = bias
 	return layer
 }
 // Forward runs the quantized matmul of x against the stored weight (with the
 // transpose flag set) and adds the layer bias when it is non-nil and valid.
 func (ql *QuantizedLinear) Forward(x *mlx.Array) *mlx.Array {
 	product := mlx.QuantizedMatmul(x, ql.Weight, ql.Scales, ql.QBiases, true, ql.GroupSize, ql.Bits, ql.Mode)
 	if ql.Bias == nil || !ql.Bias.Valid() {
 		return product
 	}
 	return product.Add(ql.Bias)
 }
 // OutputDim returns the size of the quantized weight's first dimension.
 func (ql *QuantizedLinear) OutputDim() int32 {
 	rows := ql.Weight.Dim(0)
 	return int32(rows)
 }
 // RMSNorm represents an RMS normalization layer.
 type RMSNorm struct {
 	// Weight is passed to mlx.RMSNormFn by Forward.
 	Weight *mlx.Array
 	// Eps is the epsilon Forward uses when it is called with eps == 0.
 	Eps    float32
 }
 // NewRMSNorm returns an RMSNorm layer with the given weight and default epsilon.
 func NewRMSNorm(weight *mlx.Array, eps float32) *RMSNorm {
 	norm := RMSNorm{Weight: weight, Eps: eps}
 	return &norm
 }
 // Forward applies RMS normalization to x using the layer weight. A non-zero
 // eps overrides the layer's configured epsilon; zero selects rn.Eps.
 func (rn *RMSNorm) Forward(x *mlx.Array, eps float32) *mlx.Array {
 	if eps != 0 {
 		return mlx.RMSNormFn(x, rn.Weight, eps)
 	}
 	return mlx.RMSNormFn(x, rn.Weight, rn.Eps)
 }
 // Embedding represents an embedding layer.
 type Embedding struct {
 	// Weight is the table Forward indexes along axis 0.
 	Weight *mlx.Array
 }
 // NewEmbedding returns an Embedding layer backed by weight.
 func NewEmbedding(weight *mlx.Array) *Embedding {
 	table := Embedding{Weight: weight}
 	return &table
 }
 // Forward looks up indices in the embedding weight along axis 0.
 func (e *Embedding) Forward(indices *mlx.Array) *mlx.Array {
 	table := e.Weight
 	return table.TakeAxis(indices, 0)
 }
 // LayerNorm represents a standard layer normalization layer (with bias).
 type LayerNorm struct {
 	// Weight scales the normalized values in Forward.
 	Weight *mlx.Array
 	// Bias is optional: Forward skips it when it is nil or not valid.
 	Bias   *mlx.Array
 	// Eps is the variance epsilon; Forward substitutes 1e-5 when it is zero.
 	Eps    float32
 }
 // Forward normalizes x over its last axis: it subtracts the mean, divides by
 // the square root of the variance plus epsilon, scales by Weight, and adds
 // Bias when it is non-nil and valid. A zero Eps is replaced by 1e-5.
 func (ln *LayerNorm) Forward(x *mlx.Array) *mlx.Array {
 	epsilon := ln.Eps
 	if epsilon == 0 {
 		epsilon = 1e-5
 	}
 	mu := mlx.Mean(x, -1, true)
 	shifted := x.Subtract(mu)
 	squared := shifted.Multiply(shifted)
 	sigma2 := mlx.Mean(squared, -1, true)
 	invStd := mlx.RSqrt(mlx.AddScalar(sigma2, epsilon))
 	scaled := shifted.Multiply(invStd).Multiply(ln.Weight)
 	if ln.Bias == nil || !ln.Bias.Valid() {
 		return scaled
 	}
 	return scaled.Add(ln.Bias)
 }
 // MultiLinearLayer is an interface for per-head linear layers.
 // In this file it is implemented by *MultiLinear.
 type MultiLinearLayer interface {
 	// Forward applies the per-head projection to x.
 	Forward(x *mlx.Array) *mlx.Array
 }
 // MultiLinear performs per-head linear projections.
 // Weight shape: [num_heads, output_dims, input_dims]
 type MultiLinear struct {
 	// Weight has its last two axes swapped by Forward before the matmul.
 	Weight *mlx.Array
 }
 // NewMultiLinear returns a MultiLinear layer backed by weight.
 func NewMultiLinear(weight *mlx.Array) *MultiLinear {
 	layer := MultiLinear{Weight: weight}
 	return &layer
 }
 // Forward multiplies x by the weight with its last two axes swapped.
 func (ml *MultiLinear) Forward(x *mlx.Array) *mlx.Array {
 	return x.Matmul(ml.Weight.Transpose(0, 2, 1))
 }
 // RepeatKV repeats K/V tensors for grouped query attention.
 //
 // A factor of 1 returns x unchanged. Otherwise a new axis is inserted at
 // position 2, tiled repeatFactor times, and folded into axis 1, so the result
 // has dims [d0, d1*repeatFactor, d2, d3] for an input with dims [d0, d1, d2, d3].
 func RepeatKV(x *mlx.Array, repeatFactor int32) *mlx.Array {
 	if repeatFactor == 1 {
 		return x
 	}
 	dims := x.Dims()
 	tiled := mlx.Tile(x.ExpandDims(2), []int32{1, 1, repeatFactor, 1, 1})
 	return mlx.Reshape(tiled, int32(dims[0]), int32(dims[1])*repeatFactor, int32(dims[2]), int32(dims[3]))
 }
 // ApplyCausalMask applies causal (lower triangular) mask to attention scores.
 //
 // Dimension 2 of scores is read as the sequence length; a square triangular
 // mask of that size, expanded with two leading axes, is handed to mlx.Where
 // with scores and a -1e9 scalar as its two alternatives.
 func ApplyCausalMask(scores *mlx.Array) *mlx.Array {
 	n := int32(scores.Dims()[2])
 	causal := mlx.Tri(n, n, 0)
 	fill := mlx.NewScalarArray(float32(-1e9))
 	causal = causal.ExpandDims(0).ExpandDims(0)
 	return mlx.Where(causal, scores, fill)
 }
 // ApplyCausalMaskWithOffset applies causal mask for cached attention.
 //
 // With a zero offset it defers to ApplyCausalMask. Otherwise dimensions 2 and
 // 3 of scores are read as the query and key lengths, and a triangular mask of
 // that shape shifted by offset, expanded with two leading axes, is handed to
 // mlx.Where with scores and a -1e9 scalar as its two alternatives.
 func ApplyCausalMaskWithOffset(scores *mlx.Array, offset int32) *mlx.Array {
 	if offset == 0 {
 		return ApplyCausalMask(scores)
 	}
 	dims := scores.Dims()
 	rows, cols := int32(dims[2]), int32(dims[3])
 	causal := mlx.Tri(rows, cols, int(offset))
 	fill := mlx.NewScalarArray(float32(-1e9))
 	causal = causal.ExpandDims(0).ExpandDims(0)
 	return mlx.Where(causal, scores, fill)
 }