mirror of
https://github.com/ollama/ollama.git
synced 2026-04-26 02:36:09 +02:00
This change adds support for qwen3.5-next-moe models (qwen3-next/qwen3.5-next/qwen3-coder) to the MLX runner. It also: * introduces recurrent cache support and related MLX ops * updates pipeline/runner integration and adds tests * properly quantizes stacked expert tensors * adds a Gated Delta Metal kernel for fast SSM inference * adds new MLX calls for Conv1d, DepthwiseConv1d, Contiguous, Exp, Log, SoftmaxAxis
41 lines
975 B
Go
41 lines
975 B
Go
//go:build mlx
|
|
|
|
package mlx
|
|
|
|
//go:generate cmake -S . -B build -DCMAKE_INSTALL_PREFIX=dist -DCMAKE_BUILD_TYPE=Release
|
|
//go:generate cmake --build build --parallel
|
|
//go:generate cmake --install build
|
|
//go:generate sh -c "go run generator/main.go -output=. ./dist/include/mlx/c/*.h"
|
|
|
|
// #cgo CXXFLAGS: -std=c++17
|
|
// #cgo CPPFLAGS: -I${SRCDIR}/dist/include
|
|
// #cgo LDFLAGS: -L${SRCDIR}/dist/lib -lstdc++
|
|
// #cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework Accelerate
|
|
// #include "generated.h"
|
|
import "C"
|
|
|
|
func doEval(outputs []*Array, async bool) {
|
|
vector := C.mlx_vector_array_new()
|
|
defer C.mlx_vector_array_free(vector)
|
|
|
|
for _, output := range outputs {
|
|
if output != nil && output.Valid() {
|
|
C.mlx_vector_array_append_value(vector, output.ctx)
|
|
}
|
|
}
|
|
|
|
if async {
|
|
C.mlx_async_eval(vector)
|
|
} else {
|
|
C.mlx_eval(vector)
|
|
}
|
|
}
|
|
|
|
func AsyncEval(outputs ...*Array) {
|
|
doEval(outputs, true)
|
|
}
|
|
|
|
func Eval(outputs ...*Array) {
|
|
doEval(outputs, false)
|
|
}
|