mirror of
https://github.com/ollama/ollama.git
synced 2026-04-24 09:46:01 +02:00
Enable multiple conversations to reuse cached computations when they share token prefixes (e.g. the same system prompt). A prefix trie tracks shared regions so switching between conversations only recomputes tokens that diverge. Inactive conversation state is paged from active GPU memory to other memory and restored on demand, with LRU eviction to keep memory usage bounded.
48 lines
991 B
Go
48 lines
991 B
Go
package mlx
|
|
|
|
//go:generate go run generator/main.go -output=. ./include/mlx/c/*.h
|
|
|
|
// #cgo CXXFLAGS: -std=c++17
|
|
// #cgo CPPFLAGS: -I${SRCDIR}/include
|
|
// #cgo LDFLAGS: -lstdc++
|
|
// #cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework Accelerate
|
|
// #include "generated.h"
|
|
import "C"
|
|
|
|
// Version returns the MLX core library version string.
|
|
func Version() string {
|
|
str := C.mlx_string_new()
|
|
defer C.mlx_string_free(str)
|
|
C.mlx_version(&str)
|
|
return C.GoString(C.mlx_string_data(str))
|
|
}
|
|
|
|
func doEval(outputs []*Array, async bool) {
|
|
if len(outputs) == 0 {
|
|
return
|
|
}
|
|
|
|
vector := C.mlx_vector_array_new()
|
|
defer C.mlx_vector_array_free(vector)
|
|
|
|
for _, output := range outputs {
|
|
if output != nil && output.Valid() {
|
|
C.mlx_vector_array_append_value(vector, output.ctx)
|
|
}
|
|
}
|
|
|
|
if async {
|
|
C.mlx_async_eval(vector)
|
|
} else {
|
|
C.mlx_eval(vector)
|
|
}
|
|
}
|
|
|
|
func AsyncEval(outputs ...*Array) {
|
|
doEval(outputs, true)
|
|
}
|
|
|
|
func Eval(outputs ...*Array) {
|
|
doEval(outputs, false)
|
|
}
|