launch: add kimi cli integration with installer flow

mlx: apply repeat penalties in sampler (#15631 )
docs: update hermes (#15655 )
2026-04-21 00:05:40 +02:00 · 2026-04-20 13:30:44 -07:00 · 2026-04-18 07:49:38 -07:00 · 2026-04-17 14:20:59 -07:00 · 2026-04-16 17:18:04 -07:00 · 2026-04-16 16:20:42 -07:00
57 changed files with 5287 additions and 431 deletions
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
 ollama
 ```

-You'll be prompted to run a model or connect Ollama to your existing agents or applications such as `claude`, `codex`, `openclaw` and more.
+You'll be prompted to run a model or connect Ollama to your existing agents or applications such as `Claude Code`, `OpenClaw`, `OpenCode`, `Codex`, `Copilot`, and more.

 ### Coding

@@ -65,7 +65,7 @@ To launch a specific integration:
 ollama launch claude
 ```

-Supported integrations include [Claude Code](https://docs.ollama.com/integrations/claude-code), [Codex](https://docs.ollama.com/integrations/codex), [Droid](https://docs.ollama.com/integrations/droid), and [OpenCode](https://docs.ollama.com/integrations/opencode).
+Supported integrations include [Claude Code](https://docs.ollama.com/integrations/claude-code), [Codex](https://docs.ollama.com/integrations/codex), [Copilot CLI](https://docs.ollama.com/integrations/copilot-cli), [Droid](https://docs.ollama.com/integrations/droid), and [OpenCode](https://docs.ollama.com/integrations/opencode).

 ### AI assistant

--- a/cmd/launch/command_test.go
+++ b/cmd/launch/command_test.go
@@ -58,6 +58,12 @@ func TestLaunchCmd(t *testing.T) {
 		if cmd.Long == "" {
 			t.Error("Long description should not be empty")
 		}
+		if !strings.Contains(cmd.Long, "hermes") {
+			t.Error("Long description should mention hermes")
+		}
+		if !strings.Contains(cmd.Long, "kimi") {
+			t.Error("Long description should mention kimi")
+		}
 	})

 	t.Run("flags exist", func(t *testing.T) {
--- a/cmd/launch/copilot.go
+++ b/cmd/launch/copilot.go
@@ -0,0 +1,76 @@
+package launch
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+
+	"github.com/ollama/ollama/envconfig"
+)
+
+// Copilot is the launch Runner for the GitHub Copilot CLI.
+type Copilot struct{}
+
+// String returns the display name of the integration.
+func (c *Copilot) String() string {
+	return "Copilot CLI"
+}
+
+// args builds the Copilot CLI argument list: an optional "--model <model>"
+// pair followed by the caller's extra arguments.
+func (c *Copilot) args(model string, extra []string) []string {
+	var out []string
+	if model != "" {
+		out = append(out, "--model", model)
+	}
+	return append(out, extra...)
+}
+
+// findPath locates the copilot executable. PATH is consulted first; when the
+// binary is not there, ~/.local/bin/copilot (copilot.exe on Windows) is
+// tried. The error from resolving the home directory or from stat-ing the
+// fallback is returned when no binary is found.
+func (c *Copilot) findPath() (string, error) {
+	onPath, lookErr := exec.LookPath("copilot")
+	if lookErr == nil {
+		return onPath, nil
+	}
+
+	home, err := os.UserHomeDir()
+	if err != nil {
+		return "", err
+	}
+
+	binName := "copilot"
+	if runtime.GOOS == "windows" {
+		binName = "copilot.exe"
+	}
+
+	candidate := filepath.Join(home, ".local", "bin", binName)
+	if _, statErr := os.Stat(candidate); statErr != nil {
+		return "", statErr
+	}
+	return candidate, nil
+}
+
+// Run starts Copilot CLI with the given model and extra arguments. The child
+// process shares this process's standard streams and receives the current
+// environment plus the variables produced by envVars. An install hint is
+// returned when the copilot binary cannot be located.
+func (c *Copilot) Run(model string, args []string) error {
+	bin, err := c.findPath()
+	if err != nil {
+		return fmt.Errorf("copilot is not installed, install from https://docs.github.com/en/copilot/how-tos/set-up/install-copilot-cli")
+	}
+
+	proc := exec.Command(bin, c.args(model, args)...)
+	proc.Stdin, proc.Stdout, proc.Stderr = os.Stdin, os.Stdout, os.Stderr
+	proc.Env = append(os.Environ(), c.envVars(model)...)
+
+	return proc.Run()
+}
+
+// envVars lists the KEY=VALUE pairs that point Copilot CLI at Ollama: the
+// provider base URL (the Ollama host with "/v1" appended), an empty API key,
+// and the "responses" wire API. COPILOT_MODEL is added only when model is
+// non-empty.
+func (c *Copilot) envVars(model string) []string {
+	baseURL := envconfig.Host().String() + "/v1"
+	vars := []string{
+		"COPILOT_PROVIDER_BASE_URL=" + baseURL,
+		"COPILOT_PROVIDER_API_KEY=",
+		"COPILOT_PROVIDER_WIRE_API=responses",
+	}
+	if model == "" {
+		return vars
+	}
+	return append(vars, "COPILOT_MODEL="+model)
+}
--- a/cmd/launch/copilot_test.go
+++ b/cmd/launch/copilot_test.go
@@ -0,0 +1,161 @@
+package launch
+
+import (
+	"os"
+	"path/filepath"
+	"runtime"
+	"slices"
+	"strings"
+	"testing"
+)
+
+// TestCopilotIntegration checks the display name and that *Copilot satisfies
+// the Runner interface.
+func TestCopilotIntegration(t *testing.T) {
+	c := &Copilot{}
+
+	t.Run("String", func(t *testing.T) {
+		const want = "Copilot CLI"
+		if got := c.String(); got != want {
+			t.Errorf("String() = %q, want %q", got, want)
+		}
+	})
+
+	t.Run("implements Runner", func(t *testing.T) {
+		var _ Runner = c
+	})
+}
+
+// TestCopilotFindPath covers the PATH lookup and the ~/.local/bin fallback of
+// Copilot.findPath. Subtests that can reach the fallback point the home
+// directory at a temporary location so a copilot binary installed for the
+// user running the tests cannot change the outcome.
+func TestCopilotFindPath(t *testing.T) {
+	c := &Copilot{}
+
+	name := "copilot"
+	if runtime.GOOS == "windows" {
+		name = "copilot.exe"
+	}
+
+	// writeFakeBin creates a placeholder executable (and its parent
+	// directory), failing the test when the fixture cannot be written.
+	writeFakeBin := func(t *testing.T, path string) {
+		t.Helper()
+		if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+			t.Fatalf("create fixture dir: %v", err)
+		}
+		if err := os.WriteFile(path, []byte("#!/bin/sh\n"), 0o755); err != nil {
+			t.Fatalf("write fixture binary: %v", err)
+		}
+	}
+
+	t.Run("finds copilot in PATH", func(t *testing.T) {
+		tmpDir := t.TempDir()
+		fakeBin := filepath.Join(tmpDir, name)
+		writeFakeBin(t, fakeBin)
+		t.Setenv("PATH", tmpDir)
+
+		got, err := c.findPath()
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got != fakeBin {
+			t.Errorf("findPath() = %q, want %q", got, fakeBin)
+		}
+	})
+
+	t.Run("returns error when not in PATH", func(t *testing.T) {
+		// Without an isolated home the fallback would probe the real
+		// ~/.local/bin and succeed on machines that have copilot installed.
+		setTestHome(t, t.TempDir())
+		t.Setenv("PATH", t.TempDir()) // empty dir, no copilot binary
+
+		_, err := c.findPath()
+		if err == nil {
+			t.Fatal("expected error, got nil")
+		}
+	})
+
+	t.Run("falls back to ~/.local/bin/copilot", func(t *testing.T) {
+		tmpDir := t.TempDir()
+		setTestHome(t, tmpDir)
+		t.Setenv("PATH", t.TempDir()) // empty dir, no copilot binary
+
+		fallback := filepath.Join(tmpDir, ".local", "bin", name)
+		writeFakeBin(t, fallback)
+
+		got, err := c.findPath()
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got != fallback {
+			t.Errorf("findPath() = %q, want %q", got, fallback)
+		}
+	})
+
+	t.Run("returns error when neither PATH nor fallback exists", func(t *testing.T) {
+		tmpDir := t.TempDir()
+		setTestHome(t, tmpDir)
+		t.Setenv("PATH", t.TempDir()) // empty dir, no copilot binary
+
+		_, err := c.findPath()
+		if err == nil {
+			t.Fatal("expected error, got nil")
+		}
+	})
+}
+
+func TestCopilotArgs(t *testing.T) {
+	c := &Copilot{}
+
+	tests := []struct {
+		name  string
+		model string
+		args  []string
+		want  []string
+	}{
+		{"with model", "llama3.2", nil, []string{"--model", "llama3.2"}},
+		{"empty model", "", nil, nil},
+		{"with model and extra", "llama3.2", []string{"--verbose"}, []string{"--model", "llama3.2", "--verbose"}},
+		{"empty model with help", "", []string{"--help"}, []string{"--help"}},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := c.args(tt.model, tt.args)
+			if !slices.Equal(got, tt.want) {
+				t.Errorf("args(%q, %v) = %v, want %v", tt.model, tt.args, got, tt.want)
+			}
+		})
+	}
+}
+
+// TestCopilotEnvVars checks the provider environment produced by
+// Copilot.envVars for a set model, an empty model, and a custom OLLAMA_HOST.
+func TestCopilotEnvVars(t *testing.T) {
+	c := &Copilot{}
+
+	// toMap indexes KEY=VALUE entries by key.
+	toMap := func(entries []string) map[string]string {
+		indexed := make(map[string]string)
+		for _, entry := range entries {
+			key, value, _ := strings.Cut(entry, "=")
+			indexed[key] = value
+		}
+		return indexed
+	}
+
+	t.Run("sets required provider env vars with model", func(t *testing.T) {
+		got := toMap(c.envVars("llama3.2"))
+
+		baseURL := got["COPILOT_PROVIDER_BASE_URL"]
+		if baseURL == "" {
+			t.Error("COPILOT_PROVIDER_BASE_URL should be set")
+		}
+		if !strings.HasSuffix(baseURL, "/v1") {
+			t.Errorf("COPILOT_PROVIDER_BASE_URL = %q, want /v1 suffix", baseURL)
+		}
+		if _, present := got["COPILOT_PROVIDER_API_KEY"]; !present {
+			t.Error("COPILOT_PROVIDER_API_KEY should be set (empty)")
+		}
+		if wire := got["COPILOT_PROVIDER_WIRE_API"]; wire != "responses" {
+			t.Errorf("COPILOT_PROVIDER_WIRE_API = %q, want %q", wire, "responses")
+		}
+		if model := got["COPILOT_MODEL"]; model != "llama3.2" {
+			t.Errorf("COPILOT_MODEL = %q, want %q", model, "llama3.2")
+		}
+	})
+
+	t.Run("omits COPILOT_MODEL when model is empty", func(t *testing.T) {
+		got := toMap(c.envVars(""))
+		if model, present := got["COPILOT_MODEL"]; present {
+			t.Errorf("COPILOT_MODEL should not be set for empty model, got %q", model)
+		}
+	})
+
+	t.Run("uses custom OLLAMA_HOST", func(t *testing.T) {
+		t.Setenv("OLLAMA_HOST", "http://myhost:9999")
+		baseURL := toMap(c.envVars("test"))["COPILOT_PROVIDER_BASE_URL"]
+		if !strings.Contains(baseURL, "myhost:9999") {
+			t.Errorf("COPILOT_PROVIDER_BASE_URL = %q, want custom host", baseURL)
+		}
+	})
+}
--- a/cmd/launch/hermes.go
+++ b/cmd/launch/hermes.go
@@ -0,0 +1,679 @@
+package launch
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"fmt"
+	"net/http"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"slices"
+	"strconv"
+	"strings"
+
+	"gopkg.in/yaml.v3"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/cmd/config"
+	"github.com/ollama/ollama/cmd/internal/fileutil"
+	"github.com/ollama/ollama/envconfig"
+)
+
+const (
+	// hermesInstallScript is the shell pipeline ensureInstalled runs through
+	// "bash -lc" to install Hermes.
+	hermesInstallScript     = "curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash -s -- --skip-setup"
+	// hermesProviderName is the "name" written on the managed provider entry
+	// and the name used to recognize managed custom_providers entries.
+	hermesProviderName      = "Ollama"
+	// hermesProviderKey is the providers: key and model.provider value that
+	// launch writes and checks.
+	hermesProviderKey       = "ollama-launch"
+	// hermesLegacyKey is an older providers: key; its entry is reused when
+	// present and the key is removed on configure.
+	hermesLegacyKey         = "ollama"
+	// hermesPlaceholderKey is the value written to model.api_key.
+	hermesPlaceholderKey    = "ollama"
+	// hermesGatewaySetupHint is the command suggested when gateway setup fails.
+	hermesGatewaySetupHint  = "hermes gateway setup"
+	// hermesGatewaySetupTitle is the prompt shown before running gateway setup.
+	hermesGatewaySetupTitle = "Connect a messaging app now?"
+)
+
+// Indirections over runtime, exec, os and envconfig that the Hermes code in
+// this file calls instead of the underlying functions.
+// NOTE(review): presumably these exist so tests can substitute them — confirm
+// against hermes_test.go.
+var (
+	hermesGOOS      = runtime.GOOS
+	hermesLookPath  = exec.LookPath
+	hermesCommand   = exec.Command
+	hermesUserHome  = os.UserHomeDir
+	hermesOllamaURL = envconfig.ConnectableHost
+)
+
+// hermesMessagingEnvGroups lists the environment variable names consulted by
+// gatewayEnvVars and messagingConfigured. A non-blank value for any listed
+// key is treated as messaging already being configured.
+var hermesMessagingEnvGroups = [][]string{
+	{"TELEGRAM_BOT_TOKEN"},
+	{"DISCORD_BOT_TOKEN"},
+	{"SLACK_BOT_TOKEN"},
+	{"SIGNAL_ACCOUNT"},
+	{"EMAIL_ADDRESS"},
+	{"TWILIO_ACCOUNT_SID"},
+	{"MATRIX_ACCESS_TOKEN", "MATRIX_PASSWORD"},
+	{"MATTERMOST_TOKEN"},
+	{"WHATSAPP_PHONE_NUMBER_ID"},
+	{"DINGTALK_CLIENT_ID"},
+	{"FEISHU_APP_ID"},
+	{"WECOM_BOT_ID"},
+	{"WEIXIN_ACCOUNT_ID"},
+	{"BLUEBUBBLES_SERVER_URL"},
+	{"WEBHOOK_ENABLED"},
+}
+
+// Hermes deliberately is not an Editor integration. launch owns a single
+// primary model plus the local Ollama endpoint; Hermes keeps its own model
+// discovery and switching UX once it is running.
+type Hermes struct{}
+
+// String returns the display name of the integration.
+func (h *Hermes) String() string {
+	return "Hermes Agent"
+}
+
+// Run starts Hermes with the given arguments after the optional gateway
+// setup preflight. The model argument is ignored: Hermes reads its primary
+// model from config.yaml, which launch configures ahead of time, and can
+// still discover further models through its own UX.
+func (h *Hermes) Run(_ string, args []string) error {
+	bin, err := h.binary()
+	if err != nil {
+		return err
+	}
+
+	gatewaySetup := func() error {
+		return hermesAttachedCommand(bin, "gateway", "setup").Run()
+	}
+	if err := h.runGatewaySetupPreflight(args, gatewaySetup); err != nil {
+		return err
+	}
+
+	return hermesAttachedCommand(bin, args...).Run()
+}
+
+// Paths returns the Hermes config file path, or nil when it cannot be
+// resolved.
+func (h *Hermes) Paths() []string {
+	if configPath, err := hermesConfigPath(); err == nil {
+		return []string{configPath}
+	}
+	return nil
+}
+
+// Configure updates the Hermes config file so Hermes uses the given model
+// through the launch-managed Ollama provider. Settings it does not manage are
+// carried over from the existing file, and the result is written with
+// fileutil.WriteWithBackup.
+//
+// It returns an error when the config path cannot be resolved, the existing
+// file cannot be read or parsed, or the updated file cannot be written.
+func (h *Hermes) Configure(model string) error {
+	configPath, err := hermesConfigPath()
+	if err != nil {
+		return err
+	}
+
+	cfg := map[string]any{}
+	if data, err := os.ReadFile(configPath); err == nil {
+		if err := yaml.Unmarshal(data, &cfg); err != nil {
+			return fmt.Errorf("parse hermes config: %w", err)
+		}
+	} else if !os.IsNotExist(err) {
+		return err
+	}
+	// A file whose YAML document is empty or an explicit null can leave cfg
+	// nil after decoding; the writes below would then panic, so fall back to
+	// an empty map.
+	if cfg == nil {
+		cfg = map[string]any{}
+	}
+
+	modelSection, _ := cfg["model"].(map[string]any)
+	if modelSection == nil {
+		modelSection = make(map[string]any)
+	}
+	models := h.listModels(model)
+	applyHermesManagedProviders(cfg, hermesBaseURL(), model, models)
+
+	// launch writes the minimum provider/default-model settings needed to
+	// bootstrap Hermes against Ollama. The active provider stays on a
+	// launch-owned key so /model stays aligned with the launcher-managed entry,
+	// and the Ollama endpoint lives in providers: so the picker shows one row.
+	modelSection["provider"] = hermesProviderKey
+	modelSection["default"] = model
+	modelSection["base_url"] = hermesBaseURL()
+	modelSection["api_key"] = hermesPlaceholderKey
+	cfg["model"] = modelSection
+
+	// use Hermes' built-in web toolset for now.
+	// TODO(parthsareen): move this to using Ollama web search
+	cfg["toolsets"] = mergeHermesToolsets(cfg["toolsets"])
+
+	data, err := yaml.Marshal(cfg)
+	if err != nil {
+		return err
+	}
+	if err := os.MkdirAll(filepath.Dir(configPath), 0o755); err != nil {
+		return err
+	}
+	return fileutil.WriteWithBackup(configPath, data)
+}
+
+// CurrentModel reports the model that the launch-managed provider in the
+// Hermes config currently selects. It returns "" when the config cannot be
+// located, read or parsed, or when hermesManagedCurrentModel finds no
+// launch-managed selection for the current Ollama base URL.
+func (h *Hermes) CurrentModel() string {
+	configPath, err := hermesConfigPath()
+	if err != nil {
+		return ""
+	}
+
+	raw, err := os.ReadFile(configPath)
+	if err != nil {
+		return ""
+	}
+
+	parsed := map[string]any{}
+	if err := yaml.Unmarshal(raw, &parsed); err != nil {
+		return ""
+	}
+	return hermesManagedCurrentModel(parsed, hermesBaseURL())
+}
+
+// Onboard records the "hermes" integration as onboarded by calling
+// config.MarkIntegrationOnboarded and returns its error.
+func (h *Hermes) Onboard() error {
+	return config.MarkIntegrationOnboarded("hermes")
+}
+
+// RequiresInteractiveOnboarding always reports false for Hermes.
+func (h *Hermes) RequiresInteractiveOnboarding() bool {
+	return false
+}
+
+// RefreshRuntimeAfterConfigure restarts the Hermes messaging gateway when the
+// gateway status check reports it as running, and does nothing otherwise.
+// Failures of the status check or the restart are returned wrapped.
+func (h *Hermes) RefreshRuntimeAfterConfigure() error {
+	switch running, err := h.gatewayRunning(); {
+	case err != nil:
+		return fmt.Errorf("check Hermes gateway status: %w", err)
+	case !running:
+		return nil
+	}
+
+	fmt.Fprintf(os.Stderr, "%sRefreshing Hermes messaging gateway...%s\n", ansiGray, ansiReset)
+	if restartErr := h.restartGateway(); restartErr != nil {
+		return fmt.Errorf("restart Hermes gateway: %w", restartErr)
+	}
+	fmt.Fprintln(os.Stderr)
+	return nil
+}
+
+// installed reports whether binary can locate a hermes executable.
+func (h *Hermes) installed() bool {
+	if _, err := h.binary(); err != nil {
+		return false
+	}
+	return true
+}
+
+func (h *Hermes) ensureInstalled() error {
+	if h.installed() {
+		return nil
+	}
+
+	if hermesGOOS == "windows" {
+		return hermesWindowsHint()
+	}
+
+	var missing []string
+	for _, dep := range []string{"bash", "curl", "git"} {
+		if _, err := hermesLookPath(dep); err != nil {
+			missing = append(missing, dep)
+		}
+	}
+	if len(missing) > 0 {
+		return fmt.Errorf("Hermes is not installed and required dependencies are missing\n\nInstall the following first:\n  %s\n\nThen re-run:\n  ollama launch hermes", strings.Join(missing, "\n  "))
+	}
+
+	ok, err := ConfirmPrompt("Hermes is not installed. Install now?")
+	if err != nil {
+		return err
+	}
+	if !ok {
+		return fmt.Errorf("hermes installation cancelled")
+	}
+
+	fmt.Fprintf(os.Stderr, "\nInstalling Hermes...\n")
+	if err := hermesAttachedCommand("bash", "-lc", hermesInstallScript).Run(); err != nil {
+		return fmt.Errorf("failed to install hermes: %w", err)
+	}
+
+	if !h.installed() {
+		return fmt.Errorf("hermes was installed but the binary was not found on PATH\n\nYou may need to restart your shell")
+	}
+
+	fmt.Fprintf(os.Stderr, "%sHermes installed successfully%s\n\n", ansiGreen, ansiReset)
+	return nil
+}
+
+// listModels returns defaultModel followed by the model names reported by the
+// Ollama server, trimmed, with blanks and duplicates removed. When the server
+// cannot be queried, or no usable name remains, the result is just
+// defaultModel.
+func (h *Hermes) listModels(defaultModel string) []string {
+	resp, err := hermesOllamaClient().List(context.Background())
+	if err != nil {
+		return []string{defaultModel}
+	}
+
+	capacity := len(resp.Models) + 1
+	candidates := make([]string, 0, capacity)
+	candidates = append(candidates, defaultModel)
+	for _, entry := range resp.Models {
+		candidates = append(candidates, entry.Name)
+	}
+
+	names := make([]string, 0, capacity)
+	seen := make(map[string]struct{}, capacity)
+	for _, raw := range candidates {
+		name := strings.TrimSpace(raw)
+		if name == "" {
+			continue
+		}
+		if _, dup := seen[name]; dup {
+			continue
+		}
+		seen[name] = struct{}{}
+		names = append(names, name)
+	}
+
+	if len(names) == 0 {
+		return []string{defaultModel}
+	}
+	return names
+}
+
+// binary resolves the hermes executable: PATH first, then (except on
+// Windows, which gets the WSL hint) ~/.local/bin/hermes.
+func (h *Hermes) binary() (string, error) {
+	onPath, lookErr := hermesLookPath("hermes")
+	switch {
+	case lookErr == nil:
+		return onPath, nil
+	case hermesGOOS == "windows":
+		return "", hermesWindowsHint()
+	}
+
+	home, err := hermesUserHome()
+	if err != nil {
+		return "", err
+	}
+
+	candidate := filepath.Join(home, ".local", "bin", "hermes")
+	if _, statErr := os.Stat(candidate); statErr != nil {
+		return "", fmt.Errorf("hermes is not installed")
+	}
+	return candidate, nil
+}
+
+// hermesConfigPath returns <home>/.hermes/config.yaml.
+func hermesConfigPath() (string, error) {
+	homeDir, err := hermesUserHome()
+	if err != nil {
+		return "", err
+	}
+	configPath := filepath.Join(homeDir, ".hermes", "config.yaml")
+	return configPath, nil
+}
+
+// hermesBaseURL returns the Ollama host URL with trailing slashes removed and
+// "/v1" appended.
+func hermesBaseURL() string {
+	host := hermesOllamaURL().String()
+	return strings.TrimRight(host, "/") + "/v1"
+}
+
+// hermesEnvPath returns <home>/.hermes/.env.
+func hermesEnvPath() (string, error) {
+	homeDir, err := hermesUserHome()
+	if err != nil {
+		return "", err
+	}
+	envPath := filepath.Join(homeDir, ".hermes", ".env")
+	return envPath, nil
+}
+
+// runGatewaySetupPreflight offers to run messaging gateway setup before
+// Hermes starts. It is skipped when extra arguments were passed, the session
+// is not interactive, the launch confirm policy has yes or requireYesMessage
+// set, or messaging is already configured. runSetup is invoked only when the
+// user accepts the prompt; its failure is returned with a retry hint.
+func (h *Hermes) runGatewaySetupPreflight(args []string, runSetup func() error) error {
+	if len(args) > 0 || !isInteractiveSession() {
+		return nil
+	}
+	if currentLaunchConfirmPolicy.yes || currentLaunchConfirmPolicy.requireYesMessage {
+		return nil
+	}
+	if h.messagingConfigured() {
+		return nil
+	}
+
+	fmt.Fprintf(os.Stderr, "\nHermes can message you on Telegram, Discord, Slack, and more.\n\n")
+	options := ConfirmOptions{
+		YesLabel: "Yes",
+		NoLabel:  "Set up later",
+	}
+	accepted, err := ConfirmPromptWithOptions(hermesGatewaySetupTitle, options)
+	if err != nil {
+		return err
+	}
+	if !accepted {
+		return nil
+	}
+
+	if setupErr := runSetup(); setupErr != nil {
+		return fmt.Errorf("hermes messaging setup failed: %w\n\nTry running: %s", setupErr, hermesGatewaySetupHint)
+	}
+	return nil
+}
+
+// messagingConfigured reports whether any variable listed in
+// hermesMessagingEnvGroups has a non-blank value. It reports false when the
+// gateway environment cannot be loaded.
+func (h *Hermes) messagingConfigured() bool {
+	vars, err := h.gatewayEnvVars()
+	if err != nil {
+		return false
+	}
+
+	hasValue := func(key string) bool {
+		return strings.TrimSpace(vars[key]) != ""
+	}
+	for _, group := range hermesMessagingEnvGroups {
+		if slices.ContainsFunc(group, hasValue) {
+			return true
+		}
+	}
+	return false
+}
+
+// gatewayEnvVars merges the entries of the Hermes .env file with the process
+// environment. A missing .env file is not an error. For the keys listed in
+// hermesMessagingEnvGroups, a value set in the process environment overrides
+// the one from the file.
+func (h *Hermes) gatewayEnvVars() (map[string]string, error) {
+	envFilePath, err := hermesEnvPath()
+	if err != nil {
+		return nil, err
+	}
+
+	merged := make(map[string]string)
+	data, readErr := os.ReadFile(envFilePath)
+	if readErr != nil && !os.IsNotExist(readErr) {
+		return nil, readErr
+	}
+	if readErr == nil {
+		for name, val := range hermesParseEnvFile(data) {
+			merged[name] = val
+		}
+	}
+
+	for _, group := range hermesMessagingEnvGroups {
+		for _, name := range group {
+			if val, present := os.LookupEnv(name); present {
+				merged[name] = val
+			}
+		}
+	}
+	return merged, nil
+}
+
+// gatewayRunning runs the gateway status command and interprets its output
+// with hermesGatewayStatusRunning.
+func (h *Hermes) gatewayRunning() (bool, error) {
+	output, err := h.gatewayStatusOutput()
+	if err != nil {
+		return false, err
+	}
+	running := hermesGatewayStatusRunning(output)
+	return running, nil
+}
+
+// gatewayStatusOutput returns the combined stdout and stderr of
+// "hermes gateway status" together with the command's error.
+func (h *Hermes) gatewayStatusOutput() (string, error) {
+	bin, err := h.binary()
+	if err != nil {
+		return "", err
+	}
+
+	statusCmd := hermesCommand(bin, "gateway", "status")
+	raw, runErr := statusCmd.CombinedOutput()
+	return string(raw), runErr
+}
+
+// restartGateway runs "hermes gateway restart" attached to this process's
+// standard streams.
+func (h *Hermes) restartGateway() error {
+	bin, err := h.binary()
+	if err != nil {
+		return err
+	}
+
+	restart := hermesAttachedCommand(bin, "gateway", "restart")
+	return restart.Run()
+}
+
+// hermesGatewayStatusRunning interprets gateway status output,
+// case-insensitively. The "not running" / "stopped" / "not loaded" phrases
+// are checked before the positive ones, and unrecognized output counts as not
+// running.
+func hermesGatewayStatusRunning(output string) bool {
+	text := strings.ToLower(output)
+
+	stopped := []string{
+		"gateway is not running",
+		"gateway service is stopped",
+		"gateway service is not loaded",
+	}
+	for _, phrase := range stopped {
+		if strings.Contains(text, phrase) {
+			return false
+		}
+	}
+
+	active := []string{
+		"gateway is running",
+		"gateway service is running",
+		"gateway service is loaded",
+	}
+	for _, phrase := range active {
+		if strings.Contains(text, phrase) {
+			return true
+		}
+	}
+	return false
+}
+
+// hermesParseEnvFile parses dotenv-style content into a key/value map.
+//
+// Blank lines and lines starting with "#" are skipped, a leading "export "
+// is dropped, and lines without "=" or with an empty key are ignored. Keys
+// and values are trimmed of surrounding whitespace. Double-quoted values are
+// unquoted with strconv.Unquote (left as-is when that fails) and
+// single-quoted values have their quotes stripped. When a key appears more
+// than once the last occurrence wins.
+func hermesParseEnvFile(data []byte) map[string]string {
+	out := make(map[string]string)
+	scanner := bufio.NewScanner(bytes.NewReader(data))
+	// The default Scanner token limit is 64 KiB: one longer line would stop
+	// the scan and silently drop every entry after it. Allow a single line to
+	// be as large as the whole input instead.
+	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), len(data)+1)
+	for scanner.Scan() {
+		// Strip a UTF-8 byte order mark before trimming whitespace.
+		line := strings.TrimSpace(strings.TrimPrefix(scanner.Text(), "\ufeff"))
+		if line == "" || strings.HasPrefix(line, "#") {
+			continue
+		}
+		if strings.HasPrefix(line, "export ") {
+			line = strings.TrimSpace(strings.TrimPrefix(line, "export "))
+		}
+
+		key, value, ok := strings.Cut(line, "=")
+		if !ok {
+			continue
+		}
+
+		key = strings.TrimSpace(key)
+		if key == "" {
+			continue
+		}
+
+		value = strings.TrimSpace(value)
+		if len(value) >= 2 {
+			switch {
+			case value[0] == '"' && value[len(value)-1] == '"':
+				if unquoted, err := strconv.Unquote(value); err == nil {
+					value = unquoted
+				}
+			case value[0] == '\'' && value[len(value)-1] == '\'':
+				value = value[1 : len(value)-1]
+			}
+		}
+
+		out[key] = value
+	}
+	return out
+}
+
+// hermesOllamaClient builds an API client for the same launch-resolved Ollama
+// host that launch writes into the Hermes config, so model discovery follows
+// the configured endpoint.
+func hermesOllamaClient() *api.Client {
+	host := hermesOllamaURL()
+	return api.NewClient(host, http.DefaultClient)
+}
+
+// applyHermesManagedProviders rewrites cfg in place so that providers holds
+// the launch-managed entry under hermesProviderKey (reusing an existing
+// managed or legacy entry when present), the legacy key is removed, and
+// custom_providers no longer contains managed entries. custom_providers is
+// deleted entirely when nothing remains in it.
+func applyHermesManagedProviders(cfg map[string]any, baseURL string, model string, models []string) {
+	providers := hermesUserProviders(cfg["providers"])
+
+	managed := hermesManagedProviderEntry(providers)
+	if managed == nil {
+		managed = map[string]any{}
+	}
+	managed["name"] = hermesProviderName
+	managed["api"] = baseURL
+	managed["default_model"] = model
+	managed["models"] = hermesStringListAny(models)
+
+	providers[hermesProviderKey] = managed
+	delete(providers, hermesLegacyKey)
+	cfg["providers"] = providers
+
+	if remaining := hermesWithoutManagedCustomProviders(cfg["custom_providers"]); len(remaining) > 0 {
+		cfg["custom_providers"] = remaining
+	} else {
+		delete(cfg, "custom_providers")
+	}
+}
+
+// hermesManagedCurrentModel returns the trimmed model.default from cfg only
+// when the config is fully in the launch-managed shape: model.provider is the
+// launch key, model.base_url and the managed provider's api both match
+// baseURL (ignoring surrounding space and trailing slashes), the managed
+// provider's default_model equals the current model, and no managed entry
+// lingers in custom_providers. Otherwise it returns "".
+func hermesManagedCurrentModel(cfg map[string]any, baseURL string) string {
+	// field reads a string from m, yielding "" when absent or not a string.
+	field := func(m map[string]any, key string) string {
+		s, _ := m[key].(string)
+		return s
+	}
+	wantURL := hermesNormalizeURL(baseURL)
+
+	modelCfg, _ := cfg["model"].(map[string]any)
+	if modelCfg == nil {
+		return ""
+	}
+	if strings.TrimSpace(strings.ToLower(field(modelCfg, "provider"))) != hermesProviderKey {
+		return ""
+	}
+	if hermesNormalizeURL(field(modelCfg, "base_url")) != wantURL {
+		return ""
+	}
+	current := strings.TrimSpace(field(modelCfg, "default"))
+	if current == "" {
+		return ""
+	}
+
+	entry, _ := hermesUserProviders(cfg["providers"])[hermesProviderKey].(map[string]any)
+	if entry == nil || hermesHasManagedCustomProvider(cfg["custom_providers"]) {
+		return ""
+	}
+	if hermesNormalizeURL(field(entry, "api")) != wantURL {
+		return ""
+	}
+	if strings.TrimSpace(field(entry, "default_model")) != current {
+		return ""
+	}
+
+	return current
+}
+
+func hermesUserProviders(current any) map[string]any {
+	switch existing := current.(type) {
+	case map[string]any:
+		out := make(map[string]any, len(existing))
+		for key, value := range existing {
+			out[key] = value
+		}
+		return out
+	case map[any]any:
+		out := make(map[string]any, len(existing))
+		for key, value := range existing {
+			if s, ok := key.(string); ok {
+				out[s] = value
+			}
+		}
+		return out
+	default:
+		return make(map[string]any)
+	}
+}
+
+func hermesCustomProviders(current any) []any {
+	switch existing := current.(type) {
+	case []any:
+		return append([]any(nil), existing...)
+	case []map[string]any:
+		out := make([]any, 0, len(existing))
+		for _, entry := range existing {
+			out = append(out, entry)
+		}
+		return out
+	default:
+		return nil
+	}
+}
+
+// hermesManagedProviderEntry returns the provider entry stored under the
+// launch key, falling back to the legacy key, or nil when neither holds a
+// non-nil map.
+func hermesManagedProviderEntry(providers map[string]any) map[string]any {
+	if current, ok := providers[hermesProviderKey].(map[string]any); ok && current != nil {
+		return current
+	}
+	if legacy, ok := providers[hermesLegacyKey].(map[string]any); ok && legacy != nil {
+		return legacy
+	}
+	return nil
+}
+
+// hermesWithoutManagedCustomProviders returns the custom_providers items with
+// launch-managed entries removed. Items that are not maps are kept untouched.
+func hermesWithoutManagedCustomProviders(current any) []any {
+	all := hermesCustomProviders(current)
+	kept := make([]any, 0, len(all))
+
+	for _, item := range all {
+		if entry, ok := item.(map[string]any); ok && entry != nil && hermesManagedCustomProvider(entry) {
+			continue
+		}
+		kept = append(kept, item)
+	}
+
+	return kept
+}
+
+// hermesHasManagedCustomProvider reports whether custom_providers contains a
+// launch-managed entry.
+func hermesHasManagedCustomProvider(current any) bool {
+	return slices.ContainsFunc(hermesCustomProviders(current), func(item any) bool {
+		entry, _ := item.(map[string]any)
+		return entry != nil && hermesManagedCustomProvider(entry)
+	})
+}
+
+// hermesManagedCustomProvider reports whether the entry's "name", trimmed and
+// compared case-insensitively, equals the launch-managed provider name.
+func hermesManagedCustomProvider(entry map[string]any) bool {
+	raw, _ := entry["name"].(string)
+	trimmed := strings.TrimSpace(raw)
+	return strings.EqualFold(trimmed, hermesProviderName)
+}
+
+// hermesNormalizeURL trims surrounding whitespace and then any trailing
+// slashes so URLs can be compared.
+func hermesNormalizeURL(raw string) string {
+	trimmed := strings.TrimSpace(raw)
+	return strings.TrimRight(trimmed, "/")
+}
+
+// hermesStringListAny converts model names to a []any: the list is passed
+// through dedupeModelList, each name is trimmed, and blank names are skipped.
+func hermesStringListAny(models []string) []any {
+	list := make([]any, 0, len(models))
+	for _, raw := range dedupeModelList(models) {
+		if name := strings.TrimSpace(raw); name != "" {
+			list = append(list, name)
+		}
+	}
+	return list
+}
+
+// mergeHermesToolsets returns the toolsets list as []any with "web" present.
+//
+// A []any or []string is copied and "web" appended when missing. A
+// comma-separated string is split into trimmed, non-empty parts with "web"
+// appended when missing. A blank string or any other type yields the default
+// {"hermes-cli", "web"}.
+func mergeHermesToolsets(current any) any {
+	var items []any
+	switch existing := current.(type) {
+	case []any:
+		items = append(make([]any, 0, len(existing)+1), existing...)
+	case []string:
+		items = make([]any, 0, len(existing)+1)
+		for _, name := range existing {
+			items = append(items, name)
+		}
+	case string:
+		if strings.TrimSpace(existing) == "" {
+			return []any{"hermes-cli", "web"}
+		}
+		for _, part := range strings.Split(existing, ",") {
+			if trimmed := strings.TrimSpace(part); trimmed != "" {
+				items = append(items, trimmed)
+			}
+		}
+	default:
+		return []any{"hermes-cli", "web"}
+	}
+
+	hasWeb := slices.ContainsFunc(items, func(item any) bool {
+		name, _ := item.(string)
+		return name == "web"
+	})
+	if !hasWeb {
+		items = append(items, "web")
+	}
+	return items
+}
+
+// hermesAttachedCommand builds a command through hermesCommand and connects
+// it to this process's stdin, stdout and stderr.
+func hermesAttachedCommand(name string, args ...string) *exec.Cmd {
+	attached := hermesCommand(name, args...)
+	attached.Stdin, attached.Stdout, attached.Stderr = os.Stdin, os.Stdout, os.Stderr
+	return attached
+}
+
+func hermesWindowsHint() error {
+	return fmt.Errorf("Hermes on Windows requires WSL2. Install WSL with: wsl --install\n" +
+		"Then run 'ollama launch hermes' from inside your WSL shell.\n" +
+		"Docs: https://hermes-agent.nousresearch.com/docs/getting-started/installation/")
+}
--- a/cmd/launch/hermes_test.go
+++ b/cmd/launch/hermes_test.go
--- a/cmd/launch/integrations_test.go
+++ b/cmd/launch/integrations_test.go
@@ -54,6 +54,7 @@ func TestIntegrationLookup(t *testing.T) {
 		{"claude uppercase", "CLAUDE", true, "Claude Code"},
 		{"claude mixed case", "Claude", true, "Claude Code"},
 		{"codex", "codex", true, "Codex"},
+		{"kimi", "kimi", true, "Kimi Code CLI"},
 		{"droid", "droid", true, "Droid"},
 		{"opencode", "opencode", true, "OpenCode"},
 		{"unknown integration", "unknown", false, ""},
@@ -74,7 +75,7 @@ func TestIntegrationLookup(t *testing.T) {
 }

 func TestIntegrationRegistry(t *testing.T) {
-	expectedIntegrations := []string{"claude", "codex", "droid", "opencode"}
+	expectedIntegrations := []string{"claude", "codex", "kimi", "droid", "opencode", "hermes"}

 	for _, name := range expectedIntegrations {
 		t.Run(name, func(t *testing.T) {
@@ -89,6 +90,15 @@ func TestIntegrationRegistry(t *testing.T) {
 	}
 }

+// TestHiddenIntegrationsExcludedFromVisibleLists verifies that the cline,
+// vscode and kimi integrations never appear in ListIntegrationInfos.
+func TestHiddenIntegrationsExcludedFromVisibleLists(t *testing.T) {
+	hidden := []string{"cline", "vscode", "kimi"}
+	for _, info := range ListIntegrationInfos() {
+		if slices.Contains(hidden, info.Name) {
+			t.Fatalf("hidden integration %q should not appear in ListIntegrationInfos", info.Name)
+		}
+	}
+}
+
 func TestHasLocalModel(t *testing.T) {
 	tests := []struct {
 		name   string
@@ -329,7 +339,7 @@ func TestBuildModelList_NoExistingModels(t *testing.T) {
 	}
 }

-func TestBuildModelList_OnlyLocalModels_CloudRecsAtBottom(t *testing.T) {
+func TestBuildModelList_OnlyLocalModels_CloudRecsStillFirst(t *testing.T) {
 	existing := []modelInfo{
 		{Name: "llama3.2:latest", Remote: false},
 		{Name: "qwen2.5:latest", Remote: false},
@@ -338,10 +348,11 @@ func TestBuildModelList_OnlyLocalModels_CloudRecsAtBottom(t *testing.T) {
 	items, _, _, _ := buildModelList(existing, nil, "")
 	got := names(items)

-	// Recommended pinned at top (local recs first, then cloud recs when only-local), then installed non-recs
-	want := []string{"gemma4", "qwen3.5", "kimi-k2.5:cloud", "qwen3.5:cloud", "glm-5.1:cloud", "minimax-m2.7:cloud", "llama3.2", "qwen2.5"}
+	// Cloud recs always come first among recommended, regardless of installed inventory.
+	// Cloud disablement is handled upstream in loadSelectableModels via filterCloudItems.
+	want := []string{"kimi-k2.5:cloud", "qwen3.5:cloud", "glm-5.1:cloud", "minimax-m2.7:cloud", "gemma4", "qwen3.5", "llama3.2", "qwen2.5"}
 	if diff := cmp.Diff(want, got); diff != "" {
-		t.Errorf("recs pinned at top, local recs before cloud recs (-want +got):\n%s", diff)
+		t.Errorf("cloud recs pinned first even when no cloud models installed (-want +got):\n%s", diff)
 	}
 }

@@ -588,7 +599,7 @@ func TestBuildModelList_MixedCase_CloudRecsFirst(t *testing.T) {
 	}
 }

-func TestBuildModelList_OnlyLocal_LocalRecsFirst(t *testing.T) {
+func TestBuildModelList_OnlyLocal_CloudRecsStillFirst(t *testing.T) {
 	existing := []modelInfo{
 		{Name: "llama3.2:latest", Remote: false},
 	}
@@ -596,11 +607,11 @@ func TestBuildModelList_OnlyLocal_LocalRecsFirst(t *testing.T) {
 	items, _, _, _ := buildModelList(existing, nil, "")
 	got := names(items)

-	// Local recs should sort before cloud recs in only-local case
+	// Cloud recs sort before local recs regardless of installed inventory.
 	localIdx := slices.Index(got, "gemma4")
 	cloudIdx := slices.Index(got, "glm-5.1:cloud")
-	if localIdx > cloudIdx {
-		t.Errorf("local recs should be before cloud recs in only-local case, got %v", got)
+	if cloudIdx > localIdx {
+		t.Errorf("cloud recs should be before local recs even when only local models installed, got %v", got)
 	}
 }

@@ -1509,27 +1520,13 @@ func TestListIntegrationInfos(t *testing.T) {
 		}
 	})

-	t.Run("sorted with custom order at end", func(t *testing.T) {
-		// integrationOrder entries (cline, opencode) should appear last, in that order.
-		// All other entries should be sorted alphabetically before them.
-		orderRank := make(map[string]int)
-		for i, name := range integrationOrder {
-			orderRank[name] = i + 1
+	t.Run("follows launcher order", func(t *testing.T) {
+		got := make([]string, 0, len(infos))
+		for _, info := range infos {
+			got = append(got, info.Name)
 		}
-		for i := 1; i < len(infos); i++ {
-			aRank, bRank := orderRank[infos[i-1].Name], orderRank[infos[i].Name]
-			switch {
-			case aRank == 0 && bRank == 0:
-				if infos[i-1].Name >= infos[i].Name {
-					t.Errorf("non-ordered items not sorted: %q >= %q", infos[i-1].Name, infos[i].Name)
-				}
-			case aRank > 0 && bRank == 0:
-				t.Errorf("ordered item %q should come after non-ordered %q", infos[i-1].Name, infos[i].Name)
-			case aRank > 0 && bRank > 0:
-				if aRank >= bRank {
-					t.Errorf("ordered items wrong: %q (rank %d) before %q (rank %d)", infos[i-1].Name, aRank, infos[i].Name, bRank)
-				}
-			}
+		if diff := compareStrings(got, integrationOrder); diff != "" {
+			t.Fatalf("launcher integration order mismatch: %s", diff)
 		}
 	})

@@ -1557,6 +1554,28 @@ func TestListIntegrationInfos(t *testing.T) {
 			}
 		}
 	})
+
+	t.Run("includes hermes", func(t *testing.T) {
+		for _, info := range infos {
+			if info.Name == "hermes" {
+				return
+			}
+		}
+		t.Fatal("expected hermes to be included in ListIntegrationInfos")
+	})
+
+	t.Run("hermes still resolves explicitly", func(t *testing.T) {
+		name, runner, err := LookupIntegration("hermes")
+		if err != nil {
+			t.Fatalf("expected explicit hermes integration lookup to work, got %v", err)
+		}
+		if name != "hermes" {
+			t.Fatalf("expected canonical name hermes, got %q", name)
+		}
+		if runner.String() == "" {
+			t.Fatal("expected hermes integration runner to be present")
+		}
+	})
 }

 func TestBuildModelList_Descriptions(t *testing.T) {
@@ -1645,6 +1664,7 @@ func TestIntegration_AutoInstallable(t *testing.T) {
 	}{
 		{"openclaw", true},
 		{"pi", true},
+		{"hermes", true},
 		{"claude", false},
 		{"codex", false},
 		{"opencode", false},
--- a/cmd/launch/kimi.go
+++ b/cmd/launch/kimi.go
@@ -0,0 +1,230 @@
+package launch
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"runtime"
+	"strings"
+	"time"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
+)
+
+// Kimi implements Runner for Kimi Code CLI integration.
+type Kimi struct{}
+
+// kimiDefaultModelAlias is the one name used for default_model, the provider
+// entry, and the model entry of the inline config handed to the kimi binary.
+// kimiDefaultMaxContextSize is the max_context_size fallback used when no
+// context length can be determined for the selected model.
+const (
+	kimiDefaultModelAlias     = "ollama"
+	kimiDefaultMaxContextSize = 32768
+)
+
+// kimiGOOS selects the installer dependencies and installer command; it is a
+// variable (not runtime.GOOS directly) so tests can simulate other platforms.
+// kimiModelShowTimeout bounds the model "show" request issued while resolving
+// the context size; tests shorten it.
+var (
+	kimiGOOS             = runtime.GOOS
+	kimiModelShowTimeout = 5 * time.Second
+)
+
+// String returns the display name of the integration, as used in launcher
+// prompts and messages that format the runner with %s.
+func (k *Kimi) String() string { return "Kimi Code CLI" }
+
+// args assembles the kimi command line: the launcher-owned inline config is
+// passed via --config first, followed by the caller's extra arguments in their
+// original order.
+func (k *Kimi) args(config string, extra []string) []string {
+	return append([]string{"--config", config}, extra...)
+}
+
+// Run starts the Kimi CLI wired to the given Ollama model.
+//
+// It validates the model name and passthrough args, builds the inline config,
+// makes sure the kimi binary is available (offering to install it), and then
+// runs kimi attached to the current process's stdin/stdout/stderr. The error
+// from the kimi process, if any, is returned as-is.
+func (k *Kimi) Run(model string, args []string) error {
+	if strings.TrimSpace(model) == "" {
+		return fmt.Errorf("model is required")
+	}
+	// Conflicting flags are rejected before anything else so a bad invocation
+	// never reaches the install prompt.
+	if conflict := validateKimiPassthroughArgs(args); conflict != nil {
+		return conflict
+	}
+
+	contextSize := resolveKimiMaxContextSize(model)
+	inlineConfig, err := buildKimiInlineConfig(model, contextSize)
+	if err != nil {
+		return fmt.Errorf("failed to build kimi config: %w", err)
+	}
+
+	kimiBin, err := ensureKimiInstalled()
+	if err != nil {
+		return err
+	}
+
+	proc := exec.Command(kimiBin, k.args(inlineConfig, args)...)
+	proc.Stdin, proc.Stdout, proc.Stderr = os.Stdin, os.Stdout, os.Stderr
+	return proc.Run()
+}
+
+// validateKimiPassthroughArgs returns an error for the first extra argument
+// that would override a flag the launcher sets itself. Both the bare form
+// ("--model") and the "flag=value" form ("--model=foo") are rejected; any
+// other argument is allowed through.
+func validateKimiPassthroughArgs(args []string) error {
+	// Each entry pairs a launcher-owned flag with the label shown in the error.
+	managedFlags := [...]struct{ flag, label string }{
+		{"--config", "--config"},
+		{"--config-file", "--config-file"},
+		{"--model", "--model"},
+		{"-m", "-m/--model"},
+	}
+
+	for _, arg := range args {
+		for _, managed := range managedFlags {
+			if arg != managed.flag && !strings.HasPrefix(arg, managed.flag+"=") {
+				continue
+			}
+			return fmt.Errorf("conflicting extra argument %q: ollama launch kimi manages %s", arg, managed.label)
+		}
+	}
+	return nil
+}
+
+// buildKimiInlineConfig renders the JSON document passed to kimi via --config.
+//
+// The document declares a single provider and a single model, both keyed by
+// kimiDefaultModelAlias, and selects that alias as default_model. The provider
+// points at the configured Ollama host's /v1 endpoint; the model entry carries
+// the Ollama model name and its max context size.
+func buildKimiInlineConfig(model string, maxContextSize int) (string, error) {
+	provider := map[string]any{
+		"type":     "openai_legacy",
+		"base_url": envconfig.Host().String() + "/v1",
+		"api_key":  "ollama",
+	}
+	modelEntry := map[string]any{
+		"provider":         kimiDefaultModelAlias,
+		"model":            model,
+		"max_context_size": maxContextSize,
+	}
+
+	encoded, err := json.Marshal(map[string]any{
+		"default_model": kimiDefaultModelAlias,
+		"providers":     map[string]any{kimiDefaultModelAlias: provider},
+		"models":        map[string]any{kimiDefaultModelAlias: modelEntry},
+	})
+	if err != nil {
+		return "", err
+	}
+	return string(encoded), nil
+}
+
+// resolveKimiMaxContextSize picks the max_context_size to advertise for model.
+//
+// A known cloud model limit wins. Otherwise the model is looked up through the
+// Ollama API (bounded by kimiModelShowTimeout) and its reported context length
+// is used. Any failure along the way falls back to kimiDefaultMaxContextSize.
+func resolveKimiMaxContextSize(model string) int {
+	if limit, known := lookupCloudModelLimit(model); known {
+		return limit.Context
+	}
+
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return kimiDefaultMaxContextSize
+	}
+
+	showCtx, cancel := context.WithTimeout(context.Background(), kimiModelShowTimeout)
+	defer cancel()
+
+	shown, err := client.Show(showCtx, &api.ShowRequest{Model: model})
+	if err == nil {
+		if size, found := modelInfoContextLength(shown.ModelInfo); found {
+			return size
+		}
+	}
+	return kimiDefaultMaxContextSize
+}
+
+// modelInfoContextLength extracts a positive context length from model
+// metadata, reporting false when none is present.
+//
+// The key "<general.architecture>.context_length" is preferred when the
+// metadata names an architecture. Otherwise any key ending in
+// ".context_length" with a positive numeric value qualifies; if several do,
+// the lexicographically smallest key wins so the result does not depend on
+// Go's randomized map iteration order.
+func modelInfoContextLength(modelInfo map[string]any) (int, bool) {
+	const suffix = ".context_length"
+
+	if arch, ok := modelInfo["general.architecture"].(string); ok && arch != "" {
+		if n, ok := kimiPositiveInt(modelInfo[arch+suffix]); ok {
+			return n, true
+		}
+	}
+
+	var (
+		bestKey string
+		best    int
+		found   bool
+	)
+	for key, val := range modelInfo {
+		if !strings.HasSuffix(key, suffix) {
+			continue
+		}
+		n, ok := kimiPositiveInt(val)
+		if !ok {
+			continue
+		}
+		if !found || key < bestKey {
+			bestKey, best, found = key, n, true
+		}
+	}
+	return best, found
+}
+
+// kimiPositiveInt converts a decoded metadata value to int when it is a
+// float64, int, or int64 greater than zero.
+func kimiPositiveInt(val any) (int, bool) {
+	switch v := val.(type) {
+	case float64:
+		if v > 0 {
+			return int(v), true
+		}
+	case int:
+		if v > 0 {
+			return v, true
+		}
+	case int64:
+		if v > 0 {
+			return int(v), true
+		}
+	}
+	return 0, false
+}
+
+// ensureKimiInstalled returns the command name used to run Kimi, installing
+// the CLI first when it is not already on PATH.
+//
+// When kimi is missing, the steps are: resolve the installer command for
+// kimiGOOS, verify the installer's own dependencies, ask the user to confirm,
+// run the installer attached to the terminal, and finally re-check PATH.
+// Platform support is resolved first so that users on an unsupported platform
+// get that error immediately instead of being told to install dependencies or
+// asked to confirm an install that cannot run.
+func ensureKimiInstalled() (string, error) {
+	if _, err := exec.LookPath("kimi"); err == nil {
+		return "kimi", nil
+	}
+
+	bin, args, err := kimiInstallerCommand(kimiGOOS)
+	if err != nil {
+		return "", err
+	}
+
+	if err := checkKimiInstallerDependencies(); err != nil {
+		return "", err
+	}
+
+	ok, err := ConfirmPrompt("Kimi is not installed. Install now?")
+	if err != nil {
+		return "", err
+	}
+	if !ok {
+		return "", fmt.Errorf("kimi installation cancelled")
+	}
+
+	fmt.Fprintf(os.Stderr, "\nInstalling Kimi...\n")
+	cmd := exec.Command(bin, args...)
+	cmd.Stdin = os.Stdin
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		return "", fmt.Errorf("failed to install kimi: %w", err)
+	}
+
+	// The installer may place the binary in a directory that is not on the
+	// current process's PATH; report that rather than failing later at exec.
+	if _, err := exec.LookPath("kimi"); err != nil {
+		return "", fmt.Errorf("kimi was installed but the binary was not found on PATH\n\nYou may need to restart your shell")
+	}
+
+	fmt.Fprintf(os.Stderr, "%sKimi installed successfully%s\n\n", ansiGreen, ansiReset)
+	return "kimi", nil
+}
+
+// checkKimiInstallerDependencies verifies that the tools the Kimi installer
+// relies on are on PATH: powershell when kimiGOOS is "windows", curl and bash
+// otherwise. The returned error lists what is missing and how to retry.
+func checkKimiInstallerDependencies() error {
+	missingTool := func(name string) bool {
+		_, err := exec.LookPath(name)
+		return err != nil
+	}
+
+	if kimiGOOS == "windows" {
+		if missingTool("powershell") {
+			return fmt.Errorf("kimi is not installed and required dependencies are missing\n\nInstall the following first:\n  PowerShell: https://learn.microsoft.com/powershell/\n\nThen re-run:\n  ollama launch kimi")
+		}
+		return nil
+	}
+
+	var missing []string
+	if missingTool("curl") {
+		missing = append(missing, "curl: https://curl.se/")
+	}
+	if missingTool("bash") {
+		missing = append(missing, "bash: https://www.gnu.org/software/bash/")
+	}
+	if len(missing) == 0 {
+		return nil
+	}
+	return fmt.Errorf("kimi is not installed and required dependencies are missing\n\nInstall the following first:\n  %s\n\nThen re-run:\n  ollama launch kimi", strings.Join(missing, "\n  "))
+}
+
+// kimiInstallerCommand returns the program and arguments that download and run
+// the Kimi install script for goos: PowerShell with install.ps1 on "windows",
+// bash with install.sh on "darwin" and "linux". Any other value is an error.
+func kimiInstallerCommand(goos string) (string, []string, error) {
+	if goos == "windows" {
+		psArgs := []string{
+			"-NoProfile",
+			"-ExecutionPolicy",
+			"Bypass",
+			"-Command",
+			"Invoke-RestMethod https://code.kimi.com/install.ps1 | Invoke-Expression",
+		}
+		return "powershell", psArgs, nil
+	}
+	if goos == "darwin" || goos == "linux" {
+		return "bash", []string{"-c", "curl -LsSf https://code.kimi.com/install.sh | bash"}, nil
+	}
+	return "", nil, fmt.Errorf("unsupported platform for kimi install: %s", goos)
+}
--- a/cmd/launch/kimi_test.go
+++ b/cmd/launch/kimi_test.go
@@ -0,0 +1,456 @@
+package launch
+
+import (
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"runtime"
+	"slices"
+	"strings"
+	"testing"
+)
+
+// TestKimiIntegration checks the basic identity of the Kimi runner: its
+// display name and that *Kimi satisfies the Runner interface.
+func TestKimiIntegration(t *testing.T) {
+	k := &Kimi{}
+
+	t.Run("String", func(t *testing.T) {
+		if got := k.String(); got != "Kimi Code CLI" {
+			t.Errorf("String() = %q, want %q", got, "Kimi Code CLI")
+		}
+	})
+
+	t.Run("implements Runner", func(t *testing.T) {
+		// Compile-time check; the subtest has no runtime assertion.
+		var _ Runner = k
+	})
+}
+
+// TestKimiArgs checks that the inline config is passed via --config ahead of
+// the passthrough arguments, which keep their order.
+func TestKimiArgs(t *testing.T) {
+	const inlineConfig = `{"foo":"bar"}`
+	passthrough := []string{"--quiet", "--print"}
+
+	runner := &Kimi{}
+	expected := []string{"--config", inlineConfig, "--quiet", "--print"}
+	if actual := runner.args(inlineConfig, passthrough); !slices.Equal(actual, expected) {
+		t.Fatalf("args() = %v, want %v", actual, expected)
+	}
+}
+
+// TestValidateKimiPassthroughArgs_RejectsConflicts checks that every
+// launcher-managed flag, in both bare and "flag=value" form, is rejected with
+// an error that names the offending argument.
+func TestValidateKimiPassthroughArgs_RejectsConflicts(t *testing.T) {
+	type conflictCase struct {
+		name string
+		args []string
+		want string
+	}
+	cases := []conflictCase{
+		{name: "--config", args: []string{"--config", "{}"}, want: "--config"},
+		{name: "--config=", args: []string{"--config={}"}, want: "--config={"},
+		{name: "--config-file", args: []string{"--config-file", "x.toml"}, want: "--config-file"},
+		{name: "--config-file=", args: []string{"--config-file=x.toml"}, want: "--config-file=x.toml"},
+		{name: "--model", args: []string{"--model", "foo"}, want: "--model"},
+		{name: "--model=", args: []string{"--model=foo"}, want: "--model=foo"},
+		{name: "-m", args: []string{"-m", "foo"}, want: "-m"},
+		{name: "-m=", args: []string{"-m=foo"}, want: "-m=foo"},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			err := validateKimiPassthroughArgs(tc.args)
+			switch {
+			case err == nil:
+				t.Fatalf("expected error for args %v", tc.args)
+			case !strings.Contains(err.Error(), tc.want):
+				t.Fatalf("error %q does not contain %q", err.Error(), tc.want)
+			}
+		})
+	}
+}
+
+// TestBuildKimiInlineConfig checks the full shape of the generated inline
+// config: the default model alias, the provider entry pointing at the
+// configured Ollama host, and the model entry with its context size.
+func TestBuildKimiInlineConfig(t *testing.T) {
+	// Pin the host so the expected base_url is deterministic.
+	t.Setenv("OLLAMA_HOST", "http://127.0.0.1:11434")
+
+	cfg, err := buildKimiInlineConfig("llama3.2", 65536)
+	if err != nil {
+		t.Fatalf("buildKimiInlineConfig() error = %v", err)
+	}
+
+	var parsed map[string]any
+	if err := json.Unmarshal([]byte(cfg), &parsed); err != nil {
+		t.Fatalf("config is not valid JSON: %v", err)
+	}
+
+	if parsed["default_model"] != "ollama" {
+		t.Fatalf("default_model = %v, want ollama", parsed["default_model"])
+	}
+
+	providers, ok := parsed["providers"].(map[string]any)
+	if !ok {
+		t.Fatalf("providers missing or wrong type: %T", parsed["providers"])
+	}
+	ollamaProvider, ok := providers["ollama"].(map[string]any)
+	if !ok {
+		t.Fatalf("providers.ollama missing or wrong type: %T", providers["ollama"])
+	}
+	if ollamaProvider["type"] != "openai_legacy" {
+		t.Fatalf("provider type = %v, want openai_legacy", ollamaProvider["type"])
+	}
+	if ollamaProvider["base_url"] != "http://127.0.0.1:11434/v1" {
+		t.Fatalf("provider base_url = %v, want http://127.0.0.1:11434/v1", ollamaProvider["base_url"])
+	}
+	if ollamaProvider["api_key"] != "ollama" {
+		t.Fatalf("provider api_key = %v, want ollama", ollamaProvider["api_key"])
+	}
+
+	models, ok := parsed["models"].(map[string]any)
+	if !ok {
+		t.Fatalf("models missing or wrong type: %T", parsed["models"])
+	}
+	ollamaModel, ok := models["ollama"].(map[string]any)
+	if !ok {
+		t.Fatalf("models.ollama missing or wrong type: %T", models["ollama"])
+	}
+	if ollamaModel["provider"] != "ollama" {
+		t.Fatalf("model provider = %v, want ollama", ollamaModel["provider"])
+	}
+	if ollamaModel["model"] != "llama3.2" {
+		t.Fatalf("model model = %v, want llama3.2", ollamaModel["model"])
+	}
+	// encoding/json decodes JSON numbers into float64 when the target is any.
+	if ollamaModel["max_context_size"] != float64(65536) {
+		t.Fatalf("model max_context_size = %v, want 65536", ollamaModel["max_context_size"])
+	}
+}
+
+// TestResolveKimiMaxContextSize covers the three resolution paths: a known
+// cloud model limit, the context length reported by the model show endpoint,
+// and the default fallback when the show request fails.
+func TestResolveKimiMaxContextSize(t *testing.T) {
+	t.Run("uses cloud limit when known", func(t *testing.T) {
+		got := resolveKimiMaxContextSize("kimi-k2.5:cloud")
+		if got != 262_144 {
+			t.Fatalf("resolveKimiMaxContextSize() = %d, want 262144", got)
+		}
+	})
+
+	t.Run("uses model show context length for local models", func(t *testing.T) {
+		// Fake Ollama server that only answers /api/show.
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.URL.Path != "/api/show" {
+				http.NotFound(w, r)
+				return
+			}
+			fmt.Fprint(w, `{"model_info":{"llama.context_length":131072}}`)
+		}))
+		defer srv.Close()
+		t.Setenv("OLLAMA_HOST", srv.URL)
+
+		got := resolveKimiMaxContextSize("llama3.2")
+		if got != 131_072 {
+			t.Fatalf("resolveKimiMaxContextSize() = %d, want 131072", got)
+		}
+	})
+
+	t.Run("falls back to default when show fails", func(t *testing.T) {
+		// Every request, including /api/show, gets a 404.
+		srv := httptest.NewServer(http.NotFoundHandler())
+		defer srv.Close()
+		t.Setenv("OLLAMA_HOST", srv.URL)
+
+		// Shorten the package-level timeout for this subtest and restore it
+		// afterwards. The literal is a nanosecond count because this file does
+		// not import the time package.
+		oldTimeout := kimiModelShowTimeout
+		kimiModelShowTimeout = 100 * 1000 * 1000 // 100ms
+		t.Cleanup(func() { kimiModelShowTimeout = oldTimeout })
+
+		got := resolveKimiMaxContextSize("llama3.2")
+		if got != kimiDefaultMaxContextSize {
+			t.Fatalf("resolveKimiMaxContextSize() = %d, want %d", got, kimiDefaultMaxContextSize)
+		}
+	})
+}
+
+// TestKimiRun_RejectsConflictingArgsBeforeInstall checks that Run fails on a
+// launcher-managed flag without ever reaching the install confirmation prompt.
+func TestKimiRun_RejectsConflictingArgsBeforeInstall(t *testing.T) {
+	k := &Kimi{}
+
+	// Any prompt during this test means validation ran too late.
+	oldConfirm := DefaultConfirmPrompt
+	DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
+		t.Fatalf("did not expect install prompt, got %q", prompt)
+		return false, nil
+	}
+	t.Cleanup(func() { DefaultConfirmPrompt = oldConfirm })
+
+	err := k.Run("llama3.2", []string{"--model", "other"})
+	if err == nil || !strings.Contains(err.Error(), "--model") {
+		t.Fatalf("expected conflict error mentioning --model, got %v", err)
+	}
+}
+
+// TestKimiRun_PassesInlineConfigAndExtraArgs runs Kimi against a fake "kimi"
+// shell script that logs its arguments one per line, then checks that the
+// script received --config, a JSON config, and the extra args in order.
+func TestKimiRun_PassesInlineConfigAndExtraArgs(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("uses POSIX shell fake binary")
+	}
+
+	tmpDir := t.TempDir()
+	setTestHome(t, tmpDir)
+	logPath := filepath.Join(tmpDir, "kimi-args.log")
+	// %%s survives Sprintf as the shell printf's %s; %q quotes the log path.
+	script := fmt.Sprintf(`#!/bin/sh
+for arg in "$@"; do
+  printf "%%s\n" "$arg" >> %q
+done
+exit 0
+`, logPath)
+	if err := os.WriteFile(filepath.Join(tmpDir, "kimi"), []byte(script), 0o755); err != nil {
+		t.Fatalf("failed to write fake kimi: %v", err)
+	}
+	// PATH holds only tmpDir, so the fake script is the kimi that gets found.
+	t.Setenv("PATH", tmpDir)
+
+	// The show request gets a 404, so the default context size is used.
+	srv := httptest.NewServer(http.NotFoundHandler())
+	defer srv.Close()
+	t.Setenv("OLLAMA_HOST", srv.URL)
+
+	k := &Kimi{}
+	if err := k.Run("llama3.2", []string{"--quiet", "--print"}); err != nil {
+		t.Fatalf("Run() error = %v", err)
+	}
+
+	data, err := os.ReadFile(logPath)
+	if err != nil {
+		t.Fatalf("failed to read args log: %v", err)
+	}
+	lines := strings.Split(strings.TrimSpace(string(data)), "\n")
+	if len(lines) < 4 {
+		t.Fatalf("expected at least 4 args, got %v", lines)
+	}
+	if lines[0] != "--config" {
+		t.Fatalf("first arg = %q, want --config", lines[0])
+	}
+
+	var cfg map[string]any
+	if err := json.Unmarshal([]byte(lines[1]), &cfg); err != nil {
+		t.Fatalf("config arg is not valid JSON: %v", err)
+	}
+	// NOTE(review): these type assertions are unchecked, so a malformed config
+	// would panic here instead of producing a t.Fatalf message.
+	providers := cfg["providers"].(map[string]any)
+	ollamaProvider := providers["ollama"].(map[string]any)
+	if ollamaProvider["type"] != "openai_legacy" {
+		t.Fatalf("provider type = %v, want openai_legacy", ollamaProvider["type"])
+	}
+
+	if lines[2] != "--quiet" || lines[3] != "--print" {
+		t.Fatalf("extra args = %v, want [--quiet --print]", lines[2:])
+	}
+}
+
+// TestEnsureKimiInstalled walks through the install flow using a PATH that
+// contains only fake binaries: already installed, missing dependencies, user
+// declining, a successful install, a failing installer, and an installer that
+// succeeds without putting kimi on PATH.
+func TestEnsureKimiInstalled(t *testing.T) {
+	// Subtests overwrite kimiGOOS; restore it once when the whole test ends.
+	oldGOOS := kimiGOOS
+	t.Cleanup(func() { kimiGOOS = oldGOOS })
+
+	// withConfirm swaps DefaultConfirmPrompt for fn until the calling subtest
+	// finishes.
+	withConfirm := func(t *testing.T, fn func(prompt string) (bool, error)) {
+		t.Helper()
+		oldConfirm := DefaultConfirmPrompt
+		DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
+			return fn(prompt)
+		}
+		t.Cleanup(func() { DefaultConfirmPrompt = oldConfirm })
+	}
+
+	t.Run("already installed", func(t *testing.T) {
+		tmpDir := t.TempDir()
+		t.Setenv("PATH", tmpDir)
+		writeFakeBinary(t, tmpDir, "kimi")
+		kimiGOOS = runtime.GOOS
+
+		withConfirm(t, func(prompt string) (bool, error) {
+			t.Fatalf("did not expect prompt, got %q", prompt)
+			return false, nil
+		})
+
+		bin, err := ensureKimiInstalled()
+		if err != nil {
+			t.Fatalf("ensureKimiInstalled() error = %v", err)
+		}
+		if bin != "kimi" {
+			t.Fatalf("bin = %q, want kimi", bin)
+		}
+	})
+
+	t.Run("missing dependencies", func(t *testing.T) {
+		// Empty PATH directory: neither kimi, curl, nor bash can be found.
+		tmpDir := t.TempDir()
+		t.Setenv("PATH", tmpDir)
+		kimiGOOS = "linux"
+
+		withConfirm(t, func(prompt string) (bool, error) {
+			t.Fatalf("did not expect prompt, got %q", prompt)
+			return false, nil
+		})
+
+		_, err := ensureKimiInstalled()
+		if err == nil || !strings.Contains(err.Error(), "required dependencies are missing") {
+			t.Fatalf("expected missing dependency error, got %v", err)
+		}
+	})
+
+	t.Run("missing and user declines install", func(t *testing.T) {
+		tmpDir := t.TempDir()
+		t.Setenv("PATH", tmpDir)
+		writeFakeBinary(t, tmpDir, "curl")
+		writeFakeBinary(t, tmpDir, "bash")
+		kimiGOOS = "linux"
+
+		withConfirm(t, func(prompt string) (bool, error) {
+			if !strings.Contains(prompt, "Kimi is not installed.") {
+				t.Fatalf("unexpected prompt: %q", prompt)
+			}
+			return false, nil
+		})
+
+		_, err := ensureKimiInstalled()
+		if err == nil || !strings.Contains(err.Error(), "installation cancelled") {
+			t.Fatalf("expected cancellation error, got %v", err)
+		}
+	})
+
+	t.Run("missing and user confirms install succeeds", func(t *testing.T) {
+		if runtime.GOOS == "windows" {
+			t.Skip("uses POSIX shell fake binaries")
+		}
+
+		tmpDir := t.TempDir()
+		t.Setenv("PATH", tmpDir)
+		kimiGOOS = "linux"
+
+		writeFakeBinary(t, tmpDir, "curl")
+
+		// The fake bash logs its arguments and, when invoked with -c, writes an
+		// executable kimi script into tmpDir to simulate a completed install.
+		// cat and chmod are called by absolute path because PATH is tmpDir only.
+		installLog := filepath.Join(tmpDir, "bash.log")
+		kimiPath := filepath.Join(tmpDir, "kimi")
+		bashScript := fmt.Sprintf(`#!/bin/sh
+echo "$@" >> %q
+if [ "$1" = "-c" ]; then
+  /bin/cat > %q <<'EOS'
+#!/bin/sh
+exit 0
+EOS
+  /bin/chmod +x %q
+fi
+exit 0
+`, installLog, kimiPath, kimiPath)
+		if err := os.WriteFile(filepath.Join(tmpDir, "bash"), []byte(bashScript), 0o755); err != nil {
+			t.Fatalf("failed to write fake bash: %v", err)
+		}
+
+		withConfirm(t, func(prompt string) (bool, error) {
+			return true, nil
+		})
+
+		bin, err := ensureKimiInstalled()
+		if err != nil {
+			t.Fatalf("ensureKimiInstalled() error = %v", err)
+		}
+		if bin != "kimi" {
+			t.Fatalf("bin = %q, want kimi", bin)
+		}
+
+		logData, err := os.ReadFile(installLog)
+		if err != nil {
+			t.Fatalf("failed to read install log: %v", err)
+		}
+		if !strings.Contains(string(logData), "https://code.kimi.com/install.sh") {
+			t.Fatalf("expected install.sh command in log, got:\n%s", string(logData))
+		}
+	})
+
+	t.Run("install command fails", func(t *testing.T) {
+		if runtime.GOOS == "windows" {
+			t.Skip("uses POSIX shell fake binaries")
+		}
+
+		tmpDir := t.TempDir()
+		t.Setenv("PATH", tmpDir)
+		kimiGOOS = "linux"
+		writeFakeBinary(t, tmpDir, "curl")
+		// Fake bash that always exits non-zero, standing in for a failed install.
+		if err := os.WriteFile(filepath.Join(tmpDir, "bash"), []byte("#!/bin/sh\nexit 1\n"), 0o755); err != nil {
+			t.Fatalf("failed to write fake bash: %v", err)
+		}
+
+		withConfirm(t, func(prompt string) (bool, error) {
+			return true, nil
+		})
+
+		_, err := ensureKimiInstalled()
+		if err == nil || !strings.Contains(err.Error(), "failed to install kimi") {
+			t.Fatalf("expected install failure error, got %v", err)
+		}
+	})
+
+	t.Run("install succeeds but binary missing on PATH", func(t *testing.T) {
+		if runtime.GOOS == "windows" {
+			t.Skip("uses POSIX shell fake binaries")
+		}
+
+		tmpDir := t.TempDir()
+		t.Setenv("PATH", tmpDir)
+		kimiGOOS = "linux"
+		writeFakeBinary(t, tmpDir, "curl")
+		// Fake bash that exits 0 but never creates a kimi binary.
+		if err := os.WriteFile(filepath.Join(tmpDir, "bash"), []byte("#!/bin/sh\nexit 0\n"), 0o755); err != nil {
+			t.Fatalf("failed to write fake bash: %v", err)
+		}
+
+		withConfirm(t, func(prompt string) (bool, error) {
+			return true, nil
+		})
+
+		_, err := ensureKimiInstalled()
+		if err == nil || !strings.Contains(err.Error(), "binary was not found on PATH") {
+			t.Fatalf("expected PATH guidance error, got %v", err)
+		}
+	})
+}
+
+// TestKimiInstallerCommand checks the installer program chosen per platform
+// and that its arguments mention the expected flag and install script. An
+// unsupported platform must produce an error.
+func TestKimiInstallerCommand(t *testing.T) {
+	tests := []struct {
+		name      string
+		goos      string
+		wantBin   string
+		wantParts []string
+		wantErr   bool
+	}{
+		{
+			name:      "linux",
+			goos:      "linux",
+			wantBin:   "bash",
+			wantParts: []string{"-c", "install.sh"},
+		},
+		{
+			name:      "darwin",
+			goos:      "darwin",
+			wantBin:   "bash",
+			wantParts: []string{"-c", "install.sh"},
+		},
+		{
+			name:      "windows",
+			goos:      "windows",
+			wantBin:   "powershell",
+			wantParts: []string{"-Command", "install.ps1"},
+		},
+		{
+			name:    "unsupported",
+			goos:    "freebsd",
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			bin, args, err := kimiInstallerCommand(tt.goos)
+			if tt.wantErr {
+				if err == nil {
+					t.Fatal("expected error")
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("kimiInstallerCommand() error = %v", err)
+			}
+			if bin != tt.wantBin {
+				t.Fatalf("bin = %q, want %q", bin, tt.wantBin)
+			}
+			// Substring match over the joined args: wantParts are fragments,
+			// not complete arguments.
+			joined := strings.Join(args, " ")
+			for _, part := range tt.wantParts {
+				if !strings.Contains(joined, part) {
+					t.Fatalf("args %q missing %q", joined, part)
+				}
+			}
+		})
+	}
+}
--- a/cmd/launch/launch.go
+++ b/cmd/launch/launch.go
@@ -141,6 +141,36 @@ type Editor interface {
 	Models() []string
 }

+// ManagedSingleModel is the narrow launch-owned config path for integrations
+// like Hermes that have one primary model selected by launcher, need launcher
+// to persist minimal config, and still keep their own model discovery and
+// onboarding UX. This stays separate from Runner-only integrations and the
+// multi-model Editor flow so Hermes-specific behavior stays scoped to one path.
+type ManagedSingleModel interface {
+	// Paths is not called in the launch flow shown in this file; presumably it
+	// lists the config files the integration owns (TODO confirm against
+	// implementations).
+	Paths() []string
+	// Configure applies model to the integration's own configuration.
+	Configure(model string) error
+	// CurrentModel returns the model from the integration's live config. An
+	// empty string is treated by the launcher as "no live model", in which case
+	// it falls back to the launcher's saved config.
+	CurrentModel() string
+	// Onboard runs the integration's onboarding step. The launcher calls it
+	// when the integration is not yet considered onboarded.
+	Onboard() error
+}
+
+// ManagedRuntimeRefresher lets managed integrations refresh any long-lived
+// background runtime after launch rewrites their config.
+type ManagedRuntimeRefresher interface {
+	// RefreshRuntimeAfterConfigure is called right after a successful
+	// reconfiguration; its error aborts the launch.
+	RefreshRuntimeAfterConfigure() error
+}
+
+// ManagedOnboardingValidator lets managed integrations re-check saved
+// onboarding state when launcher needs a stronger live readiness signal.
+type ManagedOnboardingValidator interface {
+	// OnboardingComplete is consulted only when the saved config is already
+	// marked as onboarded; returning false makes the launcher onboard again.
+	OnboardingComplete() bool
+}
+
+// ManagedInteractiveOnboarding lets a managed integration declare whether its
+// onboarding step really requires an interactive terminal. Hermes does not.
+type ManagedInteractiveOnboarding interface {
+	// RequiresInteractiveOnboarding reports whether Onboard needs a terminal.
+	// Integrations that do not implement this interface are treated as
+	// requiring one.
+	RequiresInteractiveOnboarding() bool
+}
+
 type modelInfo struct {
 	Name        string
 	Remote      bool
@@ -176,7 +206,10 @@ Supported integrations:
  claude    Claude Code
  cline     Cline
  codex     Codex
+  copilot   Copilot CLI (aliases: copilot-cli)
  droid     Droid
+  hermes    Hermes Agent
+  kimi      Kimi Code CLI
  opencode  OpenCode
  openclaw  OpenClaw (aliases: clawdbot, moltbot)
  pi        Pi
@@ -186,6 +219,7 @@ Examples:
  ollama launch
  ollama launch claude
  ollama launch claude --model <model>
+  ollama launch hermes
  ollama launch droid --config (does not auto-launch)
  ollama launch codex -- -p myprofile (pass extra args to integration)
  ollama launch codex -- --sandbox workspace-write`,
@@ -308,36 +342,54 @@ func LaunchIntegration(ctx context.Context, req IntegrationLaunchRequest) error
 	if err != nil {
 		return err
 	}
+
+	policy := launchIntegrationPolicy(req)
+	if policy.Confirm == LaunchConfirmAutoApprove && !isInteractiveSession() && req.ModelOverride == "" {
+		return fmt.Errorf("headless --yes launch for %s requires --model <model>", name)
+	}
+
+	launchClient, saved, err := prepareIntegrationLaunch(name, policy)
+	if err != nil {
+		return err
+	}
+
+	if managed, ok := runner.(ManagedSingleModel); ok {
+		if err := EnsureIntegrationInstalled(name, runner); err != nil {
+			return err
+		}
+		return launchClient.launchManagedSingleIntegration(ctx, name, runner, managed, saved, req)
+	}
+
 	if !req.ConfigureOnly {
 		if err := EnsureIntegrationInstalled(name, runner); err != nil {
 			return err
 		}
 	}

-	var policy LaunchPolicy
-	// TUI does not set a policy, whereas ollama launch <app> does as it can have flags which change the behavior
-	if req.Policy == nil {
-		policy = defaultLaunchPolicy(isInteractiveSession(), false)
-	} else {
-		policy = *req.Policy
-	}
-
-	launchClient, err := newLauncherClient(policy)
-	if err != nil {
-		return err
-	}
-	saved, _ := loadStoredIntegrationConfig(name)
-	// In headless --yes mode we cannot prompt, so require an explicit --model.
-	if policy.Confirm == LaunchConfirmAutoApprove && !isInteractiveSession() && req.ModelOverride == "" {
-		return fmt.Errorf("headless --yes launch for %s requires --model <model>", name)
-	}
-
 	if editor, ok := runner.(Editor); ok {
 		return launchClient.launchEditorIntegration(ctx, name, runner, editor, saved, req)
 	}
 	return launchClient.launchSingleIntegration(ctx, name, runner, saved, req)
 }

+// launchIntegrationPolicy returns the policy for a launch request. The TUI
+// leaves req.Policy unset and gets the default policy for the current session;
+// `ollama launch <app>` sets it explicitly because its flags can change the
+// behavior.
+func launchIntegrationPolicy(req IntegrationLaunchRequest) LaunchPolicy {
+	if req.Policy == nil {
+		return defaultLaunchPolicy(isInteractiveSession(), false)
+	}
+	return *req.Policy
+}
+
+// prepareIntegrationLaunch builds the launcher client for policy and loads
+// whatever config the launcher has stored for the named integration.
+func prepareIntegrationLaunch(name string, policy LaunchPolicy) (*launcherClient, *config.IntegrationConfig, error) {
+	client, err := newLauncherClient(policy)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// The load error is deliberately dropped: a missing or unreadable saved
+	// config is handled downstream the same way as "nothing saved yet".
+	stored, _ := loadStoredIntegrationConfig(name)
+	return client, stored, nil
+}
+
 func (c *launcherClient) buildLauncherState(ctx context.Context) (*LauncherState, error) {
 	_ = c.loadModelInventoryOnce(ctx)

@@ -368,9 +420,18 @@ func (c *launcherClient) buildLauncherIntegrationState(ctx context.Context, info
 	if err != nil {
 		return LauncherIntegrationState{}, err
 	}
-	currentModel, usable, err := c.launcherModelState(ctx, info.Name, integration.editor)
-	if err != nil {
-		return LauncherIntegrationState{}, err
+	var currentModel string
+	var usable bool
+	if managed, ok := integration.spec.Runner.(ManagedSingleModel); ok {
+		currentModel, usable, err = c.launcherManagedModelState(ctx, info.Name, managed)
+		if err != nil {
+			return LauncherIntegrationState{}, err
+		}
+	} else {
+		currentModel, usable, err = c.launcherModelState(ctx, info.Name, integration.editor)
+		if err != nil {
+			return LauncherIntegrationState{}, err
+		}
 	}

 	return LauncherIntegrationState{
@@ -408,6 +469,28 @@ func (c *launcherClient) launcherModelState(ctx context.Context, name string, is
 	return model, usableErr == nil && usable, nil
 }

+// launcherManagedModelState reports the model to display for a managed
+// integration and whether it is usable right now.
+//
+// The integration's live config is authoritative: when it names a model, that
+// model's usability is checked. When the live config has no model, the model
+// from the launcher's saved config (if any) is returned but always marked
+// unusable.
+func (c *launcherClient) launcherManagedModelState(ctx context.Context, name string, managed ManagedSingleModel) (string, bool, error) {
+	if live := managed.CurrentModel(); live != "" {
+		usable, err := c.savedModelUsable(ctx, live)
+		if err != nil {
+			return live, false, err
+		}
+		return live, usable, nil
+	}
+
+	cfg, loadErr := loadStoredIntegrationConfig(name)
+	if loadErr != nil {
+		// A saved config that cannot be loaded is treated as no saved model,
+		// not as a failure.
+		return "", false, nil
+	}
+	return primaryModelFromConfig(cfg), false, nil
+}
+
 func (c *launcherClient) resolveRunModel(ctx context.Context, req RunModelRequest) (string, error) {
 	current := config.LastModel()
 	if !req.ForcePicker && current != "" && c.policy.Confirm == LaunchConfirmAutoApprove && !isInteractiveSession() {
@@ -444,35 +527,15 @@ func (c *launcherClient) resolveRunModel(ctx context.Context, req RunModelReques
 }

 func (c *launcherClient) launchSingleIntegration(ctx context.Context, name string, runner Runner, saved *config.IntegrationConfig, req IntegrationLaunchRequest) error {
-	current := primaryModelFromConfig(saved)
-	target := req.ModelOverride
-	needsConfigure := req.ForceConfigure
-
-	if target == "" {
-		target = current
-		usable, err := c.savedModelUsable(ctx, target)
-		if err != nil {
-			return err
-		}
-		if !usable {
-			needsConfigure = true
-		}
-	}
-
-	if needsConfigure {
-		selected, err := c.selectSingleModelWithSelector(ctx, fmt.Sprintf("Select model for %s:", runner), target, DefaultSingleSelector)
-		if err != nil {
-			return err
-		}
-		target = selected
-	} else if err := c.ensureModelsReady(ctx, []string{target}); err != nil {
+	target, _, err := c.resolveSingleIntegrationTarget(ctx, runner, primaryModelFromConfig(saved), req)
+	if err != nil {
 		return err
 	}
-
 	if target == "" {
 		return nil
 	}

+	current := primaryModelFromConfig(saved)
 	if target != current {
 		if err := config.SaveIntegration(name, []string{target}); err != nil {
 			return fmt.Errorf("failed to save: %w", err)
@@ -510,6 +573,102 @@ func (c *launcherClient) launchEditorIntegration(ctx context.Context, name strin
 	return launchAfterConfiguration(name, runner, models[0], req)
 }

+// launchManagedSingleIntegration drives the launch flow for an integration
+// that implements ManagedSingleModel: resolve the target model, rewrite the
+// integration's config when needed, run onboarding when needed, and finally
+// start the integration unless the request is configure-only.
+func (c *launcherClient) launchManagedSingleIntegration(ctx context.Context, name string, runner Runner, managed ManagedSingleModel, saved *config.IntegrationConfig, req IntegrationLaunchRequest) error {
+	// Prefer the model from the integration's live config; fall back to the
+	// launcher's saved config only as the starting point for selection.
+	current := managed.CurrentModel()
+	selectionCurrent := current
+	if selectionCurrent == "" {
+		selectionCurrent = primaryModelFromConfig(saved)
+	}
+
+	target, needsConfigure, err := c.resolveSingleIntegrationTarget(ctx, runner, selectionCurrent, req)
+	if err != nil {
+		return err
+	}
+	// An empty target means no model was chosen; there is nothing to launch.
+	if target == "" {
+		return nil
+	}
+
+	// Reconfigure when the live config is missing or differs from the target,
+	// or when configuration was forced or overridden, but only if
+	// savedMatchesModels reports false for the target.
+	// NOTE(review): savedMatchesModels and prepareManagedSingleIntegration are
+	// defined elsewhere; confirm that skipping on a saved match is intended
+	// when the live config has no model.
+	if (current == "" || needsConfigure || req.ModelOverride != "" || target != current) && !savedMatchesModels(saved, []string{target}) {
+		if err := prepareManagedSingleIntegration(name, runner, managed, target); err != nil {
+			return err
+		}
+		// Optional hook: let the integration refresh its runtime after the
+		// config was rewritten.
+		if refresher, ok := managed.(ManagedRuntimeRefresher); ok {
+			if err := refresher.RefreshRuntimeAfterConfigure(); err != nil {
+				return err
+			}
+		}
+	}
+
+	if !managedIntegrationOnboarded(saved, managed) {
+		// Refuse to onboard without a terminal unless the integration declares
+		// that its onboarding does not need one.
+		if !isInteractiveSession() && managedRequiresInteractiveOnboarding(managed) {
+			return fmt.Errorf("%s still needs interactive gateway setup; run 'ollama launch %s' in a terminal to finish onboarding", runner, name)
+		}
+		if err := managed.Onboard(); err != nil {
+			return err
+		}
+	}
+
+	if req.ConfigureOnly {
+		return nil
+	}
+
+	return runIntegration(runner, target, req.ExtraArgs)
+}
+
+// resolveSingleIntegrationTarget decides which model a single-model
+// integration should use and whether the user had to (re)select it.
+//
+// An explicit req.ModelOverride is used as-is; otherwise current is used and
+// checked for usability. If configuration is forced, or the current model is
+// not usable, the model selector is shown. Otherwise the chosen model is made
+// ready. It returns the target model and whether selection took place.
+func (c *launcherClient) resolveSingleIntegrationTarget(ctx context.Context, runner Runner, current string, req IntegrationLaunchRequest) (string, bool, error) {
+	model := req.ModelOverride
+	reconfigure := req.ForceConfigure
+
+	if model == "" {
+		model = current
+		ok, err := c.savedModelUsable(ctx, model)
+		if err != nil {
+			return "", false, err
+		}
+		reconfigure = reconfigure || !ok
+	}
+
+	if !reconfigure {
+		if err := c.ensureModelsReady(ctx, []string{model}); err != nil {
+			return "", false, err
+		}
+		return model, false, nil
+	}
+
+	picked, err := c.selectSingleModelWithSelector(ctx, fmt.Sprintf("Select model for %s:", runner), model, DefaultSingleSelector)
+	if err != nil {
+		return "", false, err
+	}
+	return picked, true, nil
+}
+
+// savedIntegrationOnboarded reports whether the saved launcher config exists
+// and is marked as onboarded.
+func savedIntegrationOnboarded(saved *config.IntegrationConfig) bool {
+	if saved == nil {
+		return false
+	}
+	return saved.Onboarded
+}
+
+// managedIntegrationOnboarded reports whether a managed integration can skip
+// onboarding. The saved config must be marked onboarded, and an integration
+// that implements ManagedOnboardingValidator must also confirm it.
+func managedIntegrationOnboarded(saved *config.IntegrationConfig, managed ManagedSingleModel) bool {
+	if !savedIntegrationOnboarded(saved) {
+		return false
+	}
+	if validator, ok := managed.(ManagedOnboardingValidator); ok {
+		return validator.OnboardingComplete()
+	}
+	return true
+}
+
+// managedRequiresInteractiveOnboarding reports whether onboarding for managed
+// needs an interactive terminal. The default is true; an integration opts out
+// by implementing ManagedInteractiveOnboarding and returning false. Hermes
+// does so because its launch-owned onboarding is only bookkeeping, so headless
+// launches are not blocked once config is already prepared.
+func managedRequiresInteractiveOnboarding(managed ManagedSingleModel) bool {
+	if onboarding, ok := managed.(ManagedInteractiveOnboarding); ok {
+		return onboarding.RequiresInteractiveOnboarding()
+	}
+	return true
+}
+
 func (c *launcherClient) selectSingleModelWithSelector(ctx context.Context, title, current string, selector SingleSelector) (string, error) {
 	if selector == nil {
 		return "", fmt.Errorf("no selector configured")
--- a/cmd/launch/launch_test.go
+++ b/cmd/launch/launch_test.go
@@ -49,6 +49,55 @@ func (r *launcherSingleRunner) Run(model string, args []string) error {

 func (r *launcherSingleRunner) String() string { return "StubSingle" }

+// launcherManagedRunner is a test double that implements Runner,
+// ManagedSingleModel, ManagedOnboardingValidator, and ManagedRuntimeRefresher.
+// It records what the launcher asked it to do (configured models, the model
+// it was run with, onboarding and refresh call counts) so tests can assert on
+// the launch flow. refreshErr is returned from RefreshRuntimeAfterConfigure.
+type launcherManagedRunner struct {
+	paths              []string
+	currentModel       string
+	configured         []string
+	ranModel           string
+	onboarded          bool
+	onboardCalls       int
+	onboardingComplete bool
+	refreshCalls       int
+	refreshErr         error
+}
+
+// Run records the model it was launched with; args are ignored.
+func (r *launcherManagedRunner) Run(model string, args []string) error {
+	r.ranModel = model
+	return nil
+}
+
+func (r *launcherManagedRunner) String() string { return "StubManaged" }
+
+func (r *launcherManagedRunner) Paths() []string { return r.paths }
+
+// Configure records the model and makes it the stub's current model.
+func (r *launcherManagedRunner) Configure(model string) error {
+	r.configured = append(r.configured, model)
+	r.currentModel = model
+	return nil
+}
+
+func (r *launcherManagedRunner) CurrentModel() string { return r.currentModel }
+
+// Onboard counts the call and marks the stub as onboarded and complete.
+func (r *launcherManagedRunner) Onboard() error {
+	r.onboardCalls++
+	r.onboarded = true
+	r.onboardingComplete = true
+	return nil
+}
+
+func (r *launcherManagedRunner) OnboardingComplete() bool { return r.onboardingComplete }
+
+// RefreshRuntimeAfterConfigure counts the call and returns refreshErr.
+func (r *launcherManagedRunner) RefreshRuntimeAfterConfigure() error {
+	r.refreshCalls++
+	return r.refreshErr
+}
+
+// launcherHeadlessManagedRunner is launcherManagedRunner plus an opt-out from
+// interactive onboarding.
+type launcherHeadlessManagedRunner struct {
+	launcherManagedRunner
+}
+
+func (r *launcherHeadlessManagedRunner) RequiresInteractiveOnboarding() bool { return false }
+
 func setLaunchTestHome(t *testing.T, dir string) {
 	t.Helper()
 	t.Setenv("HOME", dir)
@@ -141,6 +190,451 @@ func TestDefaultLaunchPolicy(t *testing.T) {
 	}
 }

+func TestBuildLauncherState_ManagedSingleIntegrationUsesCurrentModel(t *testing.T) {
+	tmpDir := t.TempDir()
+	setLaunchTestHome(t, tmpDir)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/tags":
+			fmt.Fprint(w, `{"models":[{"name":"gemma4"}]}`)
+		case "/api/show":
+			fmt.Fprint(w, `{"model_info":{"general.context_length":131072}}`)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+	t.Setenv("OLLAMA_HOST", srv.URL)
+
+	runner := &launcherManagedRunner{currentModel: "gemma4"}
+	withIntegrationOverride(t, "pi", runner)
+
+	state, err := BuildLauncherState(context.Background())
+	if err != nil {
+		t.Fatalf("BuildLauncherState returned error: %v", err)
+	}
+
+	if state.Integrations["pi"].CurrentModel != "gemma4" {
+		t.Fatalf("expected managed current model from integration config, got %q", state.Integrations["pi"].CurrentModel)
+	}
+	if !state.Integrations["pi"].ModelUsable {
+		t.Fatal("expected managed current model to be usable")
+	}
+}
+
+func TestBuildLauncherState_ManagedSingleIntegrationShowsSavedModelWhenLiveConfigMissing(t *testing.T) {
+	tmpDir := t.TempDir()
+	setLaunchTestHome(t, tmpDir)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/tags":
+			fmt.Fprint(w, `{"models":[{"name":"gemma4"}]}`)
+		case "/api/show":
+			fmt.Fprint(w, `{"model_info":{"general.context_length":131072}}`)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+	t.Setenv("OLLAMA_HOST", srv.URL)
+
+	if err := config.SaveIntegration("pi", []string{"gemma4"}); err != nil {
+		t.Fatalf("failed to save managed integration config: %v", err)
+	}
+
+	runner := &launcherManagedRunner{}
+	withIntegrationOverride(t, "pi", runner)
+
+	state, err := BuildLauncherState(context.Background())
+	if err != nil {
+		t.Fatalf("BuildLauncherState returned error: %v", err)
+	}
+
+	if state.Integrations["pi"].CurrentModel != "gemma4" {
+		t.Fatalf("expected saved model to remain visible, got %q", state.Integrations["pi"].CurrentModel)
+	}
+	if state.Integrations["pi"].ModelUsable {
+		t.Fatal("expected missing live config to mark managed model unusable")
+	}
+}
+
+func TestLaunchIntegration_ManagedSingleIntegrationConfiguresOnboardsAndRuns(t *testing.T) {
+	tmpDir := t.TempDir()
+	setLaunchTestHome(t, tmpDir)
+	withInteractiveSession(t, true)
+	withLauncherHooks(t)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/tags":
+			fmt.Fprint(w, `{"models":[{"name":"gemma4"}]}`)
+		case "/api/show":
+			fmt.Fprint(w, `{"model_info":{"general.context_length":131072}}`)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+	t.Setenv("OLLAMA_HOST", srv.URL)
+
+	runner := &launcherManagedRunner{
+		paths: nil,
+	}
+	withIntegrationOverride(t, "stubmanaged", runner)
+
+	DefaultSingleSelector = func(title string, items []ModelItem, current string) (string, error) {
+		return "gemma4", nil
+	}
+	DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
+		return true, nil
+	}
+
+	if err := LaunchIntegration(context.Background(), IntegrationLaunchRequest{Name: "stubmanaged"}); err != nil {
+		t.Fatalf("LaunchIntegration returned error: %v", err)
+	}
+
+	if diff := compareStrings(runner.configured, []string{"gemma4"}); diff != "" {
+		t.Fatalf("configured models mismatch: %s", diff)
+	}
+	if runner.refreshCalls != 1 {
+		t.Fatalf("expected runtime refresh once after configure, got %d", runner.refreshCalls)
+	}
+	if runner.onboardCalls != 1 {
+		t.Fatalf("expected onboarding to run once, got %d", runner.onboardCalls)
+	}
+	if runner.ranModel != "gemma4" {
+		t.Fatalf("expected launch to run configured model, got %q", runner.ranModel)
+	}
+
+	saved, err := config.LoadIntegration("stubmanaged")
+	if err != nil {
+		t.Fatalf("failed to reload managed integration config: %v", err)
+	}
+	if diff := compareStrings(saved.Models, []string{"gemma4"}); diff != "" {
+		t.Fatalf("saved models mismatch: %s", diff)
+	}
+}
+
+func TestLaunchIntegration_ManagedSingleIntegrationReOnboardsWhenSavedFlagIsStale(t *testing.T) {
+	tmpDir := t.TempDir()
+	setLaunchTestHome(t, tmpDir)
+	withInteractiveSession(t, true)
+	withLauncherHooks(t)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/tags":
+			fmt.Fprint(w, `{"models":[{"name":"gemma4"}]}`)
+		case "/api/show":
+			fmt.Fprint(w, `{"model_info":{"general.context_length":131072}}`)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+	t.Setenv("OLLAMA_HOST", srv.URL)
+
+	runner := &launcherManagedRunner{
+		currentModel:       "gemma4",
+		onboardingComplete: false,
+	}
+	withIntegrationOverride(t, "stubmanaged", runner)
+
+	if err := config.SaveIntegration("stubmanaged", []string{"gemma4"}); err != nil {
+		t.Fatalf("failed to save managed integration config: %v", err)
+	}
+	if err := config.MarkIntegrationOnboarded("stubmanaged"); err != nil {
+		t.Fatalf("failed to mark managed integration onboarded: %v", err)
+	}
+
+	if err := LaunchIntegration(context.Background(), IntegrationLaunchRequest{Name: "stubmanaged"}); err != nil {
+		t.Fatalf("LaunchIntegration returned error: %v", err)
+	}
+
+	if runner.onboardCalls != 1 {
+		t.Fatalf("expected stale onboarded flag to trigger onboarding, got %d calls", runner.onboardCalls)
+	}
+	if runner.refreshCalls != 0 {
+		t.Fatalf("expected no runtime refresh when config is unchanged, got %d", runner.refreshCalls)
+	}
+	if runner.ranModel != "gemma4" {
+		t.Fatalf("expected launch to run saved model after onboarding repair, got %q", runner.ranModel)
+	}
+}
+
+func TestLaunchIntegration_ManagedSingleIntegrationConfigOnlySkipsFinalRun(t *testing.T) {
+	tmpDir := t.TempDir()
+	setLaunchTestHome(t, tmpDir)
+	withInteractiveSession(t, true)
+	withLauncherHooks(t)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/show":
+			fmt.Fprint(w, `{"model_info":{"general.context_length":131072}}`)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+	t.Setenv("OLLAMA_HOST", srv.URL)
+
+	runner := &launcherManagedRunner{
+		paths: nil,
+	}
+	withIntegrationOverride(t, "stubmanaged", runner)
+
+	DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
+		return true, nil
+	}
+
+	if err := LaunchIntegration(context.Background(), IntegrationLaunchRequest{
+		Name:          "stubmanaged",
+		ModelOverride: "gemma4",
+		ConfigureOnly: true,
+	}); err != nil {
+		t.Fatalf("LaunchIntegration returned error: %v", err)
+	}
+
+	if runner.ranModel != "" {
+		t.Fatalf("expected configure-only flow to skip final launch, got %q", runner.ranModel)
+	}
+	if runner.refreshCalls != 1 {
+		t.Fatalf("expected configure-only flow to refresh runtime once, got %d", runner.refreshCalls)
+	}
+	if runner.onboardCalls != 1 {
+		t.Fatalf("expected configure-only flow to onboard once, got %d", runner.onboardCalls)
+	}
+}
+
+func TestLaunchIntegration_ManagedSingleIntegrationSkipsRewriteWhenSavedMatches(t *testing.T) {
+	tmpDir := t.TempDir()
+	setLaunchTestHome(t, tmpDir)
+	withInteractiveSession(t, true)
+	withLauncherHooks(t)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/tags":
+			fmt.Fprint(w, `{"models":[{"name":"gemma4"}]}`)
+		case "/api/show":
+			fmt.Fprint(w, `{"model_info":{"general.context_length":131072}}`)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+	t.Setenv("OLLAMA_HOST", srv.URL)
+
+	if err := config.SaveIntegration("stubmanaged", []string{"gemma4"}); err != nil {
+		t.Fatalf("failed to save managed integration config: %v", err)
+	}
+
+	runner := &launcherManagedRunner{}
+	withIntegrationOverride(t, "stubmanaged", runner)
+
+	DefaultSingleSelector = func(title string, items []ModelItem, current string) (string, error) {
+		t.Fatal("selector should not be called when saved model matches target")
+		return "", nil
+	}
+	DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
+		t.Fatal("confirm prompt should not run when saved model matches target")
+		return false, nil
+	}
+
+	if err := LaunchIntegration(context.Background(), IntegrationLaunchRequest{Name: "stubmanaged"}); err != nil {
+		t.Fatalf("LaunchIntegration returned error: %v", err)
+	}
+
+	if len(runner.configured) != 0 {
+		t.Fatalf("expected Configure to be skipped when saved matches, got %v", runner.configured)
+	}
+	if runner.refreshCalls != 0 {
+		t.Fatalf("expected no runtime refresh when config is unchanged, got %d", runner.refreshCalls)
+	}
+	if runner.ranModel != "gemma4" {
+		t.Fatalf("expected launch to run saved model, got %q", runner.ranModel)
+	}
+}
+
+func TestLaunchIntegration_ManagedSingleIntegrationRewritesWhenSavedDiffers(t *testing.T) {
+	tmpDir := t.TempDir()
+	setLaunchTestHome(t, tmpDir)
+	withInteractiveSession(t, true)
+	withLauncherHooks(t)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/tags":
+			fmt.Fprint(w, `{"models":[{"name":"gemma4"}]}`)
+		case "/api/show":
+			fmt.Fprint(w, `{"model_info":{"general.context_length":131072}}`)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+	t.Setenv("OLLAMA_HOST", srv.URL)
+
+	if err := config.SaveIntegration("stubmanaged", []string{"old-model"}); err != nil {
+		t.Fatalf("failed to save managed integration config: %v", err)
+	}
+
+	runner := &launcherManagedRunner{}
+	withIntegrationOverride(t, "stubmanaged", runner)
+
+	DefaultSingleSelector = func(title string, items []ModelItem, current string) (string, error) {
+		t.Fatal("selector should not be called when model override is provided")
+		return "", nil
+	}
+	DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
+		return true, nil
+	}
+
+	if err := LaunchIntegration(context.Background(), IntegrationLaunchRequest{
+		Name:          "stubmanaged",
+		ModelOverride: "gemma4",
+	}); err != nil {
+		t.Fatalf("LaunchIntegration returned error: %v", err)
+	}
+
+	if diff := compareStrings(runner.configured, []string{"gemma4"}); diff != "" {
+		t.Fatalf("expected Configure to run when saved differs from target: %s", diff)
+	}
+	if runner.refreshCalls != 1 {
+		t.Fatalf("expected runtime refresh once after configure, got %d", runner.refreshCalls)
+	}
+	if runner.ranModel != "gemma4" {
+		t.Fatalf("expected launch to run configured model, got %q", runner.ranModel)
+	}
+}
+
+func TestLaunchIntegration_ManagedSingleIntegrationStopsWhenRuntimeRefreshFails(t *testing.T) {
+	tmpDir := t.TempDir()
+	setLaunchTestHome(t, tmpDir)
+	withInteractiveSession(t, true)
+	withLauncherHooks(t)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/show":
+			fmt.Fprint(w, `{"model_info":{"general.context_length":131072}}`)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+	t.Setenv("OLLAMA_HOST", srv.URL)
+
+	runner := &launcherManagedRunner{
+		refreshErr: fmt.Errorf("boom"),
+	}
+	withIntegrationOverride(t, "stubmanaged", runner)
+
+	DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
+		return true, nil
+	}
+
+	err := LaunchIntegration(context.Background(), IntegrationLaunchRequest{
+		Name:          "stubmanaged",
+		ModelOverride: "gemma4",
+	})
+	if err == nil || !strings.Contains(err.Error(), "boom") {
+		t.Fatalf("expected runtime refresh error, got %v", err)
+	}
+	if runner.ranModel != "" {
+		t.Fatalf("expected final launch to stop on runtime refresh failure, got %q", runner.ranModel)
+	}
+	if runner.refreshCalls != 1 {
+		t.Fatalf("expected one runtime refresh attempt, got %d", runner.refreshCalls)
+	}
+	if runner.onboardCalls != 0 {
+		t.Fatalf("expected onboarding to stop after refresh failure, got %d", runner.onboardCalls)
+	}
+}
+
+func TestLaunchIntegration_ManagedSingleIntegrationHeadlessNeedsInteractiveOnboarding(t *testing.T) {
+	tmpDir := t.TempDir()
+	setLaunchTestHome(t, tmpDir)
+	withInteractiveSession(t, false)
+	withLauncherHooks(t)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/show":
+			fmt.Fprint(w, `{"model_info":{"general.context_length":131072}}`)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+	t.Setenv("OLLAMA_HOST", srv.URL)
+
+	runner := &launcherManagedRunner{
+		paths: nil,
+	}
+	withIntegrationOverride(t, "stubmanaged", runner)
+
+	err := LaunchIntegration(context.Background(), IntegrationLaunchRequest{
+		Name:          "stubmanaged",
+		ModelOverride: "gemma4",
+		Policy:        &LaunchPolicy{Confirm: LaunchConfirmAutoApprove, MissingModel: LaunchMissingModelAutoPull},
+	})
+	if err == nil {
+		t.Fatal("expected headless onboarding requirement to fail")
+	}
+	if !strings.Contains(err.Error(), "interactive gateway setup") {
+		t.Fatalf("expected interactive onboarding guidance, got %v", err)
+	}
+	if runner.ranModel != "" {
+		t.Fatalf("expected no final launch when onboarding is still required, got %q", runner.ranModel)
+	}
+	if runner.onboardCalls != 0 {
+		t.Fatalf("expected no onboarding attempts in headless mode, got %d", runner.onboardCalls)
+	}
+}
+
+func TestLaunchIntegration_ManagedSingleIntegrationHeadlessAllowsNonInteractiveOnboarding(t *testing.T) {
+	tmpDir := t.TempDir()
+	setLaunchTestHome(t, tmpDir)
+	withInteractiveSession(t, false)
+	withLauncherHooks(t)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/show":
+			fmt.Fprint(w, `{"model_info":{"general.context_length":131072}}`)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+	t.Setenv("OLLAMA_HOST", srv.URL)
+
+	runner := &launcherHeadlessManagedRunner{}
+	withIntegrationOverride(t, "stubmanaged", runner)
+
+	err := LaunchIntegration(context.Background(), IntegrationLaunchRequest{
+		Name:          "stubmanaged",
+		ModelOverride: "gemma4",
+		Policy:        &LaunchPolicy{Confirm: LaunchConfirmAutoApprove, MissingModel: LaunchMissingModelAutoPull},
+	})
+	if err != nil {
+		t.Fatalf("expected non-interactive onboarding to succeed headlessly, got %v", err)
+	}
+	if diff := compareStrings(runner.configured, []string{"gemma4"}); diff != "" {
+		t.Fatalf("configured models mismatch: %s", diff)
+	}
+	if runner.onboardCalls != 1 {
+		t.Fatalf("expected onboarding to run once, got %d", runner.onboardCalls)
+	}
+	if runner.ranModel != "gemma4" {
+		t.Fatalf("expected launch to run configured model, got %q", runner.ranModel)
+	}
+}
+
 func TestBuildLauncherState_InstalledAndCloudDisabled(t *testing.T) {
 	tmpDir := t.TempDir()
 	setLaunchTestHome(t, tmpDir)
--- a/cmd/launch/models.go
+++ b/cmd/launch/models.go
@@ -230,7 +230,7 @@ func pullMissingModel(ctx context.Context, client *api.Client, model string) err

 // prepareEditorIntegration persists models and applies editor-managed config files.
 func prepareEditorIntegration(name string, runner Runner, editor Editor, models []string) error {
-	if ok, err := confirmEditorEdit(runner, editor); err != nil {
+	if ok, err := confirmConfigEdit(runner, editor.Paths()); err != nil {
 		return err
 	} else if !ok {
 		return errCancelled
@@ -244,8 +244,22 @@ func prepareEditorIntegration(name string, runner Runner, editor Editor, models
 	return nil
 }

-func confirmEditorEdit(runner Runner, editor Editor) (bool, error) {
-	paths := editor.Paths()
+func prepareManagedSingleIntegration(name string, runner Runner, managed ManagedSingleModel, model string) error {
+	if ok, err := confirmConfigEdit(runner, managed.Paths()); err != nil {
+		return err
+	} else if !ok {
+		return errCancelled
+	}
+	if err := managed.Configure(model); err != nil {
+		return fmt.Errorf("setup failed: %w", err)
+	}
+	if err := config.SaveIntegration(name, []string{model}); err != nil {
+		return fmt.Errorf("failed to save: %w", err)
+	}
+	return nil
+}
+
+func confirmConfigEdit(runner Runner, paths []string) (bool, error) {
 	if len(paths) == 0 {
 		return true, nil
 	}
@@ -345,8 +359,6 @@ func buildModelList(existing []modelInfo, preChecked []string, current string) (
 		recRank[rec.Name] = i + 1
 	}

-	onlyLocal := hasLocalModel && !hasCloudModel
-
 	if hasLocalModel || hasCloudModel {
 		slices.SortStableFunc(items, func(a, b ModelItem) int {
 			ac, bc := checked[a.Name], checked[b.Name]
@@ -368,12 +380,6 @@ func buildModelList(existing []modelInfo, preChecked []string, current string) (
 			}
 			if aRec && bRec {
 				if aCloud != bCloud {
-					if onlyLocal {
-						if aCloud {
-							return 1
-						}
-						return -1
-					}
 					if aCloud {
 						return -1
 					}
--- a/cmd/launch/registry.go
+++ b/cmd/launch/registry.go
@@ -33,7 +33,7 @@ type IntegrationInfo struct {
 	Description string
 }

-var launcherIntegrationOrder = []string{"opencode", "droid", "pi"}
+var launcherIntegrationOrder = []string{"openclaw", "claude", "opencode", "hermes", "codex", "copilot", "droid", "pi"}

 var integrationSpecs = []*IntegrationSpec{
 	{
@@ -74,6 +74,36 @@ var integrationSpecs = []*IntegrationSpec{
 			Command: []string{"npm", "install", "-g", "@openai/codex"},
 		},
 	},
+	{
+		Name:        "kimi",
+		Runner:      &Kimi{},
+		Description: "Moonshot's coding agent for terminal and IDEs",
+		Hidden:      true,
+		Install: IntegrationInstallSpec{
+			CheckInstalled: func() bool {
+				_, err := exec.LookPath("kimi")
+				return err == nil
+			},
+			EnsureInstalled: func() error {
+				_, err := ensureKimiInstalled()
+				return err
+			},
+			URL: "https://moonshotai.github.io/kimi-cli/en/guides/getting-started.html",
+		},
+	},
+	{
+		Name:        "copilot",
+		Runner:      &Copilot{},
+		Aliases:     []string{"copilot-cli"},
+		Description: "GitHub's AI coding agent for the terminal",
+		Install: IntegrationInstallSpec{
+			CheckInstalled: func() bool {
+				_, err := (&Copilot{}).findPath()
+				return err == nil
+			},
+			URL: "https://github.com/features/copilot/cli/",
+		},
+	},
 	{
 		Name:        "droid",
 		Runner:      &Droid{},
@@ -136,6 +166,20 @@ var integrationSpecs = []*IntegrationSpec{
 			Command: []string{"npm", "install", "-g", "@mariozechner/pi-coding-agent@latest"},
 		},
 	},
+	{
+		Name:        "hermes",
+		Runner:      &Hermes{},
+		Description: "Self-improving AI agent built by Nous Research",
+		Install: IntegrationInstallSpec{
+			CheckInstalled: func() bool {
+				return (&Hermes{}).installed()
+			},
+			EnsureInstalled: func() error {
+				return (&Hermes{}).ensureInstalled()
+			},
+			URL: "https://hermes-agent.nousresearch.com/docs/getting-started/installation/",
+		},
+	},
 	{
 		Name:        "vscode",
 		Runner:      &VSCode{},
@@ -255,10 +299,10 @@ func ListVisibleIntegrationSpecs() []IntegrationSpec {
 			return aRank - bRank
 		}
 		if aRank > 0 {
-			return 1
+			return -1
 		}
 		if bRank > 0 {
-			return -1
+			return 1
 		}
 		return strings.Compare(a.Name, b.Name)
 	})
--- a/cmd/launch/runner_exec_only_test.go
+++ b/cmd/launch/runner_exec_only_test.go
@@ -45,6 +45,14 @@ func TestEditorRunsDoNotRewriteConfig(t *testing.T) {
 				return filepath.Join(home, ".pi", "agent", "models.json")
 			},
 		},
+		{
+			name:   "kimi",
+			binary: "kimi",
+			runner: &Kimi{},
+			checkPath: func(home string) string {
+				return filepath.Join(home, ".kimi", "config.toml")
+			},
+		},
 	}

 	for _, tt := range tests {
@@ -57,6 +65,10 @@ func TestEditorRunsDoNotRewriteConfig(t *testing.T) {
 			if tt.name == "pi" {
 				writeFakeBinary(t, binDir, "npm")
 			}
+			if tt.name == "kimi" {
+				writeFakeBinary(t, binDir, "curl")
+				writeFakeBinary(t, binDir, "bash")
+			}
 			t.Setenv("PATH", binDir)

 			configPath := tt.checkPath(home)
--- a/cmd/tui/tui.go
+++ b/cmd/tui/tui.go
@@ -45,21 +45,12 @@ type menuItem struct {
 	isOthers    bool
 }

-var mainMenuItems = []menuItem{
-	{
-		title:       "Chat with a model",
-		description: "Start an interactive chat with a model",
-		isRunModel:  true,
-	},
-	{
-		integration: "openclaw",
-	},
-	{
-		integration: "claude",
-	},
-	{
-		integration: "opencode",
-	},
+const pinnedIntegrationCount = 3
+
+var runModelMenuItem = menuItem{
+	title:       "Chat with a model",
+	description: "Start an interactive chat with a model",
+	isRunModel:  true,
 }

 var othersMenuItem = menuItem{
@@ -102,20 +93,14 @@ func shouldExpandOthers(state *launch.LauncherState) bool {
 }

 func buildMenuItems(state *launch.LauncherState, showOthers bool) []menuItem {
-	items := make([]menuItem, 0, len(mainMenuItems)+1)
-	for _, item := range mainMenuItems {
-		if item.integration == "" {
-			items = append(items, item)
-			continue
-		}
-		if integrationState, ok := state.Integrations[item.integration]; ok {
-			items = append(items, integrationMenuItem(integrationState))
-		}
-	}
+	items := []menuItem{runModelMenuItem}
+	items = append(items, pinnedIntegrationItems(state)...)

-	if showOthers {
-		items = append(items, otherIntegrationItems(state)...)
-	} else {
+	otherItems := otherIntegrationItems(state)
+	switch {
+	case showOthers:
+		items = append(items, otherItems...)
+	case len(otherItems) > 0:
 		items = append(items, othersMenuItem)
 	}

@@ -135,17 +120,28 @@ func integrationMenuItem(state launch.LauncherIntegrationState) menuItem {
 }

 func otherIntegrationItems(state *launch.LauncherState) []menuItem {
-	pinned := map[string]bool{
-		"openclaw": true,
-		"claude":   true,
-		"opencode": true,
+	ordered := orderedIntegrationItems(state)
+	if len(ordered) <= pinnedIntegrationCount {
+		return nil
+	}
+	return ordered[pinnedIntegrationCount:]
+}
+
+func pinnedIntegrationItems(state *launch.LauncherState) []menuItem {
+	ordered := orderedIntegrationItems(state)
+	if len(ordered) <= pinnedIntegrationCount {
+		return ordered
+	}
+	return ordered[:pinnedIntegrationCount]
+}
+
+func orderedIntegrationItems(state *launch.LauncherState) []menuItem {
+	if state == nil {
+		return nil
 	}

-	var items []menuItem
+	items := make([]menuItem, 0, len(state.Integrations))
 	for _, info := range launch.ListIntegrationInfos() {
-		if pinned[info.Name] {
-			continue
-		}
 		integrationState, ok := state.Integrations[info.Name]
 		if !ok {
 			continue
@@ -155,6 +151,10 @@ func otherIntegrationItems(state *launch.LauncherState) []menuItem {
 	return items
 }

+func primaryMenuItemCount(state *launch.LauncherState) int {
+	return 1 + len(pinnedIntegrationItems(state))
+}
+
 func initialCursor(state *launch.LauncherState, items []menuItem) int {
 	if state == nil || state.LastSelection == "" {
 		return 0
@@ -190,7 +190,7 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
 			if m.cursor > 0 {
 				m.cursor--
 			}
-			if m.showOthers && m.cursor < len(mainMenuItems) {
+			if m.showOthers && m.cursor < primaryMenuItemCount(m.state) {
 				m.showOthers = false
 				m.items = buildMenuItems(m.state, false)
 				m.cursor = min(m.cursor, len(m.items)-1)
--- a/cmd/tui/tui_test.go
+++ b/cmd/tui/tui_test.go
@@ -5,6 +5,7 @@ import (
 	"testing"

 	tea "github.com/charmbracelet/bubbletea"
+	"github.com/google/go-cmp/cmp"
 	"github.com/ollama/ollama/cmd/launch"
 )

@@ -43,6 +44,13 @@ func launcherTestState() *launch.LauncherState {
 				Selectable:  true,
 				Changeable:  true,
 			},
+			"hermes": {
+				Name:        "hermes",
+				DisplayName: "Hermes Agent",
+				Description: "Self-improving AI agent built by Nous Research",
+				Selectable:  true,
+				Changeable:  true,
+			},
 			"droid": {
 				Name:        "droid",
 				DisplayName: "Droid",
@@ -70,8 +78,28 @@ func findMenuCursorByIntegration(items []menuItem, name string) int {
 	return -1
 }

+func integrationSequence(items []menuItem) []string {
+	sequence := make([]string, 0, len(items))
+	for _, item := range items {
+		switch {
+		case item.isRunModel:
+			sequence = append(sequence, "run")
+		case item.isOthers:
+			sequence = append(sequence, "more")
+		case item.integration != "":
+			sequence = append(sequence, item.integration)
+		}
+	}
+	return sequence
+}
+
+func compareStrings(got, want []string) string {
+	return cmp.Diff(want, got)
+}
+
 func TestMenuRendersPinnedItemsAndMore(t *testing.T) {
-	view := newModel(launcherTestState()).View()
+	menu := newModel(launcherTestState())
+	view := menu.View()
 	for _, want := range []string{"Chat with a model", "Launch OpenClaw", "Launch Claude Code", "Launch OpenCode", "More..."} {
 		if !strings.Contains(view, want) {
 			t.Fatalf("expected menu view to contain %q\n%s", want, view)
@@ -80,23 +108,31 @@ func TestMenuRendersPinnedItemsAndMore(t *testing.T) {
 	if strings.Contains(view, "Launch Codex") {
 		t.Fatalf("expected Codex to be under More, not pinned\n%s", view)
 	}
+	wantOrder := []string{"run", "openclaw", "claude", "opencode", "more"}
+	if diff := compareStrings(integrationSequence(menu.items), wantOrder); diff != "" {
+		t.Fatalf("unexpected pinned order: %s", diff)
+	}
 }

 func TestMenuExpandsOthersFromLastSelection(t *testing.T) {
 	state := launcherTestState()
-	state.LastSelection = "pi"
+	state.LastSelection = "codex"

 	menu := newModel(state)
 	if !menu.showOthers {
 		t.Fatal("expected others section to expand when last selection is in the overflow list")
 	}
 	view := menu.View()
-	if !strings.Contains(view, "Launch Pi") {
+	if !strings.Contains(view, "Launch Codex") {
 		t.Fatalf("expected expanded view to contain overflow integration\n%s", view)
 	}
 	if strings.Contains(view, "More...") {
 		t.Fatalf("expected expanded view to replace More... item\n%s", view)
 	}
+	wantOrder := []string{"run", "openclaw", "claude", "opencode", "hermes", "codex", "droid", "pi"}
+	if diff := compareStrings(integrationSequence(menu.items), wantOrder); diff != "" {
+		t.Fatalf("unexpected expanded order: %s", diff)
+	}
 }

 func TestMenuEnterOnRunSelectsRun(t *testing.T) {
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -120,6 +120,7 @@
                "pages": [
                  "/integrations/claude-code",
                  "/integrations/codex",
+                  "/integrations/copilot-cli",
                  "/integrations/opencode",
                  "/integrations/droid",
                  "/integrations/goose",
--- a/docs/images/hermes.png
+++ b/docs/images/hermes.png
--- a/docs/integrations/copilot-cli.mdx
+++ b/docs/integrations/copilot-cli.mdx
@@ -0,0 +1,93 @@
+---
+title: Copilot CLI
+---
+
+GitHub Copilot CLI is GitHub's AI coding agent for the terminal. It can understand your codebase, make edits, run commands, and help you build software faster.
+
+Open models can be used with Copilot CLI through Ollama, enabling you to use models such as `qwen3.5`, `glm-5.1:cloud`, and `kimi-k2.5:cloud`.
+
+## Install
+
+Install [Copilot CLI](https://github.com/features/copilot/cli/):
+
+<CodeGroup>
+
+```shell macOS / Linux (Homebrew)
+brew install copilot-cli
+```
+
+```shell npm (all platforms)
+npm install -g @github/copilot
+```
+
+```shell macOS / Linux (script)
+curl -fsSL https://gh.io/copilot-install | bash
+```
+
+```powershell Windows (WinGet)
+winget install GitHub.Copilot
+```
+
+</CodeGroup>
+
+## Usage with Ollama
+
+### Quick setup
+
+```shell
+ollama launch copilot
+```
+
+### Run directly with a model
+
+```shell
+ollama launch copilot --model kimi-k2.5:cloud
+```
+
+## Recommended Models
+
+- `kimi-k2.5:cloud`
+- `glm-5:cloud`
+- `minimax-m2.7:cloud`
+- `qwen3.5:cloud`
+- `glm-4.7-flash`
+- `qwen3.5`
+
+Cloud models are also available at [ollama.com/search?c=cloud](https://ollama.com/search?c=cloud).
+
+## Non-interactive (headless) mode
+
+Run Copilot CLI without interaction for use in Docker, CI/CD, or scripts:
+
+```shell
+ollama launch copilot --model kimi-k2.5:cloud --yes -- -p "how does this repository work?"
+```
+
+The `--yes` flag auto-pulls the model, skips selectors, and requires `--model` to be specified. Arguments after `--` are passed directly to Copilot CLI.
+
+## Manual setup
+
+Copilot CLI connects to Ollama using the OpenAI-compatible API via environment variables.
+
+1. Set the environment variables:
+
+```shell
+export COPILOT_PROVIDER_BASE_URL=http://localhost:11434/v1
+export COPILOT_PROVIDER_API_KEY=
+export COPILOT_PROVIDER_WIRE_API=responses
+export COPILOT_MODEL=qwen3.5
+```
+
+2. Run Copilot CLI:
+
+```shell
+copilot
+```
+
+Or run with environment variables inline:
+
+```shell
+COPILOT_PROVIDER_BASE_URL=http://localhost:11434/v1 COPILOT_PROVIDER_API_KEY= COPILOT_PROVIDER_WIRE_API=responses COPILOT_MODEL=glm-5:cloud copilot
+```
+
+**Note:** Copilot requires a large context window. We recommend at least 64k tokens. See the [context length documentation](/context-length) for how to adjust context length in Ollama.
--- a/docs/integrations/hermes.mdx
+++ b/docs/integrations/hermes.mdx
@@ -2,29 +2,66 @@
 title: Hermes Agent
 ---

-Hermes Agent is a self-improving AI agent built by Nous Research. It features automatic skill creation, cross-session memory, and connects messaging platforms (Telegram, Discord, Slack, WhatsApp, Signal, Email) to models through a unified gateway.
+Hermes Agent is a self-improving AI agent built by Nous Research. It features automatic skill creation and cross-session memory, and ships with 70+ skills by default.
+
+![Hermes Agent with Ollama](/images/hermes.png)

 ## Quick start

-### Pull a model
-
-Before running the setup wizard, make sure you have a model available. Hermes will auto-detect models downloaded through Ollama.
-
 ```bash
-ollama pull kimi-k2.5:cloud
+ollama launch hermes
 ```

-See [Recommended models](#recommended-models) for more options.
+Ollama handles everything automatically:

-### Install
+1. **Install** — If Hermes isn't installed, Ollama prompts to install it via the Nous Research install script
+2. **Model** — Pick a model from the selector (local or cloud)
+3. **Onboarding** — Ollama configures the Ollama provider, points Hermes at `http://127.0.0.1:11434/v1`, and sets your model as the primary
+4. **Gateway** — Optionally connects a messaging platform (Telegram, Discord, Slack, WhatsApp, Signal, Email) and launches the Hermes chat
+
+<Note>Hermes on Windows requires WSL2. Install it with `wsl --install` and re-run from inside the WSL shell.</Note>
+
+## Recommended models
+
+**Cloud models:**
+
+- `kimi-k2.5:cloud` — Multimodal reasoning with subagents
+- `glm-5.1:cloud` — Reasoning and code generation
+- `qwen3.5:cloud` — Reasoning, coding, and agentic tool use with vision
+- `minimax-m2.7:cloud` — Fast, efficient coding and real-world productivity
+
+**Local models:**
+
+- `gemma4` — Reasoning and code generation locally (~16 GB VRAM)
+- `qwen3.6` — Reasoning, coding, and visual understanding locally (~24 GB VRAM)
+
+More models at [ollama.com/search](https://ollama.com/search?c=cloud).
+
+## Connect messaging apps
+
+Link Telegram, Discord, Slack, WhatsApp, Signal, or Email to chat with your models from anywhere:
+
+```bash
+hermes gateway setup
+```
+
+## Reconfigure
+
+Re-run the full setup wizard at any time:
+
+```bash
+hermes setup
+```
+
+## Manual setup
+
+If you'd rather drive Hermes's own wizard instead of `ollama launch hermes`, install it directly:

 ```bash
 curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash
 ```

-### Set up
-
-After installation, Hermes launches the setup wizard automatically. Choose **Quick setup**:
+Hermes launches the setup wizard automatically. Choose **Quick setup**:

 ```
 How would you like to set up Hermes?
@@ -80,32 +117,3 @@ Connect a messaging platform? (Telegram, Discord, etc.)
 Launch hermes chat now? [Y/n]: Y
 ```

-## Recommended models
-
-**Cloud models**:
-
- `kimi-k2.5:cloud` — Multimodal reasoning with subagents
- `qwen3.5:cloud` — Reasoning, coding, and agentic tool use with vision
- `glm-5.1:cloud` — Reasoning and code generation
- `minimax-m2.7:cloud` — Fast, efficient coding and real-world productivity
-
-**Local models:**
-
- `gemma4` — Reasoning and code generation locally (~16 GB VRAM)
- `qwen3.5` — Reasoning, coding, and visual understanding locally (~11 GB VRAM)
-
-More models at [ollama.com/search](https://ollama.com/models).
-
-## Configure later
-
-Re-run the setup wizard at any time:
-
-```bash
-hermes setup
-```
-
-To configure just messaging:
-
-```bash
-hermes setup gateway
-```
--- a/docs/integrations/index.mdx
+++ b/docs/integrations/index.mdx
@@ -10,6 +10,7 @@ Coding assistants that can read, modify, and execute code in your projects.

 - [Claude Code](/integrations/claude-code)
 - [Codex](/integrations/codex)
+- [Copilot CLI](/integrations/copilot-cli)
 - [OpenCode](/integrations/opencode)
 - [Droid](/integrations/droid)
 - [Goose](/integrations/goose)
--- a/go.mod
+++ b/go.mod
@@ -106,5 +106,5 @@ require (
 	golang.org/x/term v0.36.0
 	golang.org/x/text v0.30.0
 	google.golang.org/protobuf v1.34.1
-	gopkg.in/yaml.v3 v3.0.1 // indirect
+	gopkg.in/yaml.v3 v3.0.1
 )
--- a/manifest/layer.go
+++ b/manifest/layer.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"time"
 )

 type Layer struct {
@@ -60,6 +61,9 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) {
 			return Layer{}, err
 		}
 	}
+	if err := touchLayer(blob); err != nil {
+		return Layer{}, err
+	}

 	return Layer{
 		MediaType: mediatype,
@@ -83,6 +87,9 @@ func NewLayerFromLayer(digest, mediatype, from string) (Layer, error) {
 	if err != nil {
 		return Layer{}, err
 	}
+	if err := touchLayer(blob); err != nil {
+		return Layer{}, err
+	}

 	return Layer{
 		MediaType: mediatype,
@@ -93,6 +100,11 @@ func NewLayerFromLayer(digest, mediatype, from string) (Layer, error) {
 	}, nil
 }

+func touchLayer(path string) error {
+	now := time.Now()
+	return os.Chtimes(path, now, now)
+}
+
 func (l *Layer) Open() (io.ReadSeekCloser, error) {
 	if l.Digest == "" {
 		return nil, errors.New("opening layer with empty digest")
--- a/model/renderers/gemma4.go
+++ b/model/renderers/gemma4.go
@@ -12,7 +12,8 @@ import (
 // <|turn>/<turn|> markers, <|"|> string delimiters, and <|tool>/
 // <|tool_call>/<|tool_response> tags for function calling.
 type Gemma4Renderer struct {
-	useImgTags bool
+	useImgTags          bool
+	emptyBlockOnNothink bool
 }

 const (
@@ -124,6 +125,9 @@ func (r *Gemma4Renderer) Render(messages []api.Message, tools []api.Tool, thinkV
 	// Generation prompt.
 	if prevMessageType != "tool_response" && prevMessageType != "tool_call" {
 		sb.WriteString("<|turn>model\n")
+		if r.emptyBlockOnNothink && !hasThink {
+			sb.WriteString("<|channel>thought\n<channel|>")
+		}
 	}

 	return sb.String(), nil
--- a/model/renderers/gemma4_reference_test.go
+++ b/model/renderers/gemma4_reference_test.go
@@ -3,9 +3,9 @@ package renderers
 // TestGemma4RendererMatchesReference verifies our renderer matches the checked-in
 // Gemma 4 reference template.
 //
-// Current upstream Gemma 4 chat templates differ by model size, so the checked-in
-// reference intentionally uses the shared baseline without an empty generation-time
-// thought channel until renderer selection is split by size.
+// Current upstream Gemma 4 chat templates differ by model size. The checked-in
+// reference cases below use the small (e2b/e4b-style) baseline, with large
+// (26b/31b-style) checks covered separately in this file.
 //
 // To regenerate expected values, save the E2B template to
 // gemma4_e2b_chat_template.jinja2 and run:
@@ -1474,6 +1474,47 @@ Hi<turn|>
 	}
 }

+func TestGemma4RendererVariantsMatchExpectedGenerationPrompt(t *testing.T) {
+	messages := []api.Message{{Role: "user", Content: "Hello"}}
+
+	tests := []struct {
+		name         string
+		rendererName string
+		expected     string
+	}{
+		{
+			name:         "legacy_alias",
+			rendererName: "gemma4",
+			expected:     "<bos><|turn>user\nHello<turn|>\n<|turn>model\n",
+		},
+		{
+			name:         "small",
+			rendererName: "gemma4-small",
+			expected:     "<bos><|turn>user\nHello<turn|>\n<|turn>model\n",
+		},
+		{
+			name:         "large",
+			rendererName: "gemma4-large",
+			expected:     "<bos><|turn>user\nHello<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := RenderWithRenderer(tt.rendererName, messages, nil, nil)
+			assert.NoError(t, err)
+			assert.Equal(t, tt.expected, got)
+		})
+	}
+}
+
+func TestGemma4LargeRendererOmitsEmptyThoughtBlockWhenThinkingEnabled(t *testing.T) {
+	got, err := RenderWithRenderer("gemma4-large", []api.Message{{Role: "user", Content: "Hello"}}, nil, thinkTrue())
+	assert.NoError(t, err)
+	assert.Equal(t, "<bos><|turn>system\n<|think|>\n<turn|>\n<|turn>user\nHello<turn|>\n<|turn>model\n", got)
+	assert.NotContains(t, got, "<|channel>thought\n<channel|>")
+}
+
 func TestGemma4RendererMatchesJinja2ExpandedParity(t *testing.T) {
 	if os.Getenv("VERIFY_JINJA2") == "" {
 		t.Skip("set VERIFY_JINJA2=1 to run expanded Jinja2 parity checks")
@@ -1616,15 +1657,35 @@ func TestGemma4RendererMatchesJinja2ExpandedParity(t *testing.T) {
 		},
 	}

-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			renderer := &Gemma4Renderer{useImgTags: RenderImgTags}
-			got, err := renderer.Render(tt.messages, tt.tools, tt.think)
-			assert.NoError(t, err)
+	variants := []struct {
+		name        string
+		renderer    *Gemma4Renderer
+		templateRel string
+	}{
+		{
+			name:        "small",
+			renderer:    &Gemma4Renderer{useImgTags: RenderImgTags},
+			templateRel: gemma4E2BTemplate,
+		},
+		{
+			name:        "large",
+			renderer:    &Gemma4Renderer{useImgTags: RenderImgTags, emptyBlockOnNothink: true},
+			templateRel: gemma431BTemplate,
+		},
+	}

-			jinja2Output := renderWithJinja2(t, tt.messages, tt.tools, tt.think)
-			assert.Equal(t, jinja2Output, got,
-				"renderer output doesn't match Jinja2 template output")
+	for _, variant := range variants {
+		t.Run(variant.name, func(t *testing.T) {
+			for _, tt := range tests {
+				t.Run(tt.name, func(t *testing.T) {
+					got, err := variant.renderer.Render(tt.messages, tt.tools, tt.think)
+					assert.NoError(t, err)
+
+					jinja2Output := renderWithJinja2Template(t, variant.templateRel, tt.messages, tt.tools, tt.think)
+					assert.Equal(t, jinja2Output, got,
+						"renderer output doesn't match Jinja2 template output")
+				})
+			}
 		})
 	}
 }
--- a/model/renderers/renderer.go
+++ b/model/renderers/renderer.go
@@ -81,8 +81,10 @@ func rendererForName(name string) Renderer {
 		return renderer
 	case "nemotron-3-nano":
 		return &Nemotron3NanoRenderer{}
-	case "gemma4":
+	case "gemma4", "gemma4-small":
 		return &Gemma4Renderer{useImgTags: RenderImgTags}
+	case "gemma4-large":
+		return &Gemma4Renderer{useImgTags: RenderImgTags, emptyBlockOnNothink: true}
 	case "functiongemma":
 		return &FunctionGemmaRenderer{}
 	case "glm-4.7":
--- a/server/create.go
+++ b/server/create.go
@@ -523,7 +523,7 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML,
 				arch := layer.GGML.KV().Architecture()
 				switch arch {
 				case "gemma4":
-					config.Renderer = cmp.Or(config.Renderer, "gemma4")
+					config.Renderer = cmp.Or(config.Renderer, gemma4RendererLegacy)
 					config.Parser = cmp.Or(config.Parser, "gemma4")
 					if _, ok := r.Parameters["stop"]; !ok {
 						if r.Parameters == nil {
--- a/server/gemma4_test.go
+++ b/server/gemma4_test.go
@@ -0,0 +1,78 @@
+package server
+
+import "testing"
+
+func TestResolveGemma4Renderer(t *testing.T) {
+	tests := []struct {
+		name  string
+		model *Model
+		want  string
+	}{
+		{
+			name:  "nil model falls back to legacy alias",
+			model: nil,
+			want:  gemma4RendererLegacy,
+		},
+		{
+			name: "explicit small passes through",
+			model: &Model{
+				Config: testConfigWithRenderer(gemma4RendererSmall),
+			},
+			want: gemma4RendererSmall,
+		},
+		{
+			name: "explicit large passes through",
+			model: &Model{
+				Config: testConfigWithRenderer(gemma4RendererLarge),
+			},
+			want: gemma4RendererLarge,
+		},
+		{
+			name: "legacy e4b tag resolves small",
+			model: &Model{
+				Name:      "gemma4:e4b",
+				ShortName: "gemma4:e4b",
+				Config:    testConfigWithRenderer(gemma4RendererLegacy),
+			},
+			want: gemma4RendererSmall,
+		},
+		{
+			name: "legacy 31b tag resolves large",
+			model: &Model{
+				Name:      "gemma4:31b-cloud",
+				ShortName: "gemma4:31b-cloud",
+				Config:    testConfigWithRenderer(gemma4RendererLegacy),
+			},
+			want: gemma4RendererLarge,
+		},
+		{
+			name: "legacy model type resolves small",
+			model: &Model{
+				Config: testConfigWithRendererAndType(gemma4RendererLegacy, "4.3B"),
+			},
+			want: gemma4RendererSmall,
+		},
+		{
+			name: "legacy model type resolves large",
+			model: &Model{
+				Config: testConfigWithRendererAndType(gemma4RendererLegacy, "25.2B"),
+			},
+			want: gemma4RendererLarge,
+		},
+		{
+			name: "legacy unknown defaults small",
+			model: &Model{
+				Config: testConfigWithRenderer(gemma4RendererLegacy),
+			},
+			want: gemma4RendererSmall,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := resolveGemma4Renderer(tt.model); got != tt.want {
+				t.Fatalf("resolveGemma4Renderer() = %q, want %q", got, tt.want)
+			}
+		})
+	}
+}
--- a/server/images.go
+++ b/server/images.go
@@ -19,6 +19,7 @@ import (
 	"slices"
 	"strconv"
 	"strings"
+	"time"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
@@ -33,6 +34,10 @@ import (
 	"github.com/ollama/ollama/x/imagegen/transfer"
 )

+// Blobs newer than this may belong to another process that has not written its
+// manifest yet. They become eligible for the normal mark-and-sweep pass later.
+const layerPruneGracePeriod = time.Hour
+
 var (
 	errCapabilities         = errors.New("does not support")
 	errCapabilityCompletion = errors.New("completion")
@@ -156,7 +161,7 @@ func (m *Model) Capabilities() []model.Capability {

 	// Temporary workaround — suppress vision/audio for gemma4 MLX models
 	// until multimodal runtime pipeline lands. Remove when imageproc.go is wired up.
-	if m.Config.ModelFormat == "safetensors" && m.Config.Renderer == "gemma4" {
+	if m.Config.ModelFormat == "safetensors" && isGemma4Renderer(m.Config.Renderer) {
 		capabilities = slices.DeleteFunc(capabilities, func(c model.Capability) bool {
 			return c == model.CapabilityVision || c == "audio"
 		})
@@ -478,10 +483,23 @@ func PruneLayers() error {
 	}

 	for _, blob := range blobs {
+		if blob.IsDir() {
+			continue
+		}
+
+		info, err := blob.Info()
+		if err != nil {
+			slog.Error("couldn't stat blob", "blob", blob.Name(), "error", err)
+			continue
+		}
+		if time.Since(info.ModTime()) < layerPruneGracePeriod {
+			continue
+		}
+
 		name := blob.Name()
 		name = strings.ReplaceAll(name, "-", ":")

-		_, err := manifest.BlobsPath(name)
+		_, err = manifest.BlobsPath(name)
 		if err != nil {
 			if errors.Is(err, manifest.ErrInvalidDigestFormat) {
 				// remove invalid blobs (e.g. partial downloads)
--- a/server/images_test.go
+++ b/server/images_test.go
@@ -5,14 +5,58 @@ import (
 	"fmt"
 	"net/http"
 	"net/http/httptest"
+	"os"
 	"strings"
 	"testing"
+	"time"

 	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/manifest"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/model"
 )

+func TestPruneLayersSkipsRecentOrphans(t *testing.T) {
+	t.Setenv("OLLAMA_MODELS", t.TempDir())
+
+	recentDigest := "sha256:0000000000000000000000000000000000000000000000000000000000000001"
+	oldDigest := "sha256:0000000000000000000000000000000000000000000000000000000000000002"
+
+	for _, digest := range []string{recentDigest, oldDigest} {
+		p, err := manifest.BlobsPath(digest)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if err := os.WriteFile(p, nil, 0o644); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	oldPath, err := manifest.BlobsPath(oldDigest)
+	if err != nil {
+		t.Fatal(err)
+	}
+	oldTime := time.Now().Add(-layerPruneGracePeriod - time.Hour)
+	if err := os.Chtimes(oldPath, oldTime, oldTime); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := PruneLayers(); err != nil {
+		t.Fatal(err)
+	}
+
+	recentPath, err := manifest.BlobsPath(recentDigest)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if _, err := os.Stat(recentPath); err != nil {
+		t.Fatalf("recent orphan was pruned: %v", err)
+	}
+	if _, err := os.Stat(oldPath); !os.IsNotExist(err) {
+		t.Fatalf("old orphan still exists: %v", err)
+	}
+}
+
 func TestModelCapabilities(t *testing.T) {
 	// Create completion model (llama architecture without vision)
 	completionModelPath, _ := createBinFile(t, ggml.KV{
@@ -118,6 +162,39 @@ func TestModelCapabilities(t *testing.T) {
 			},
 			expectedCaps: []model.Capability{model.CapabilityEmbedding},
 		},
+		{
+			name: "gemma4 small safetensors suppresses vision and audio",
+			model: Model{
+				Config: model.ConfigV2{
+					ModelFormat:  "safetensors",
+					Renderer:     gemma4RendererSmall,
+					Capabilities: []string{"vision", "audio"},
+				},
+				Template: chatTemplate,
+			},
+		},
+		{
+			name: "gemma4 large safetensors suppresses vision and audio",
+			model: Model{
+				Config: model.ConfigV2{
+					ModelFormat:  "safetensors",
+					Renderer:     gemma4RendererLarge,
+					Capabilities: []string{"vision", "audio"},
+				},
+				Template: chatTemplate,
+			},
+		},
+		{
+			name: "legacy gemma4 safetensors suppresses vision and audio",
+			model: Model{
+				Config: model.ConfigV2{
+					ModelFormat:  "safetensors",
+					Renderer:     gemma4RendererLegacy,
+					Capabilities: []string{"vision", "audio"},
+				},
+				Template: chatTemplate,
+			},
+		},
 	}

 	// compare two slices of model.Capability regardless of order
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -115,7 +115,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.

 func renderPrompt(m *Model, msgs []api.Message, tools []api.Tool, think *api.ThinkValue) (string, error) {
 	if m.Config.Renderer != "" {
-		rendered, err := renderers.RenderWithRenderer(m.Config.Renderer, msgs, tools, think)
+		rendererName := resolveRendererName(m)
+		rendered, err := renderers.RenderWithRenderer(rendererName, msgs, tools, think)
 		if err != nil {
 			return "", err
 		}
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -13,6 +13,14 @@ import (
 	"github.com/ollama/ollama/types/model"
 )

+func testConfigWithRenderer(renderer string) model.ConfigV2 {
+	return model.ConfigV2{Renderer: renderer}
+}
+
+func testConfigWithRendererAndType(renderer, modelType string) model.ConfigV2 {
+	return model.ConfigV2{Renderer: renderer, ModelType: modelType}
+}
+
 func TestChatPrompt(t *testing.T) {
 	type expect struct {
 		prompt string
@@ -397,3 +405,43 @@ func TestChatPromptGLMOcrRendererAddsImageTags(t *testing.T) {
 		t.Fatalf("prompt missing glm-ocr image tags, got: %q", prompt)
 	}
 }
+
+func TestRenderPromptResolvesDynamicGemma4Renderer(t *testing.T) {
+	msgs := []api.Message{{Role: "user", Content: "Hello"}}
+
+	tests := []struct {
+		name  string
+		model Model
+		want  string
+	}{
+		{
+			name: "small from name",
+			model: Model{
+				Name:      "gemma4:e4b",
+				ShortName: "gemma4:e4b",
+				Config:    testConfigWithRenderer(gemma4RendererLegacy),
+			},
+			want: "<bos><|turn>user\nHello<turn|>\n<|turn>model\n",
+		},
+		{
+			name: "large from model type",
+			model: Model{
+				Config: testConfigWithRendererAndType(gemma4RendererLegacy, "25.2B"),
+			},
+			want: "<bos><|turn>user\nHello<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := renderPrompt(&tt.model, msgs, nil, nil)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			if diff := cmp.Diff(got, tt.want); diff != "" {
+				t.Fatalf("rendered prompt mismatch (-got +want):\n%s", diff)
+			}
+		})
+	}
+}
--- a/server/renderer_resolution.go
+++ b/server/renderer_resolution.go
@@ -0,0 +1,110 @@
+package server
+
+import (
+	"strconv"
+	"strings"
+
+	"github.com/ollama/ollama/format"
+)
+
+const (
+	gemma4RendererLegacy = "gemma4"
+	gemma4RendererSmall  = "gemma4-small"
+	gemma4RendererLarge  = "gemma4-large"
+
+	// Gemma 4 small templates cover the e2b/e4b family, while 26b/31b use the
+	// large template. Default to the small prompt unless the model is clearly in
+	// the large range.
+	gemma4LargeMinParameterCount = 16_000_000_000
+)
+
+// resolveRendererName returns the renderer name to use for m, expanding the
+// legacy gemma4 alias per model; it returns "" when no renderer is configured.
+func resolveRendererName(m *Model) string {
+	if m == nil {
+		return ""
+	}
+
+	if name := m.Config.Renderer; name != gemma4RendererLegacy {
+		return name
+	}
+	return resolveGemma4Renderer(m)
+}
+
+// resolveGemma4Renderer maps the legacy "gemma4" alias to a sized variant using
+// the model's names first, then its parameter count, defaulting to small.
+func resolveGemma4Renderer(m *Model) string {
+	if m == nil {
+		return gemma4RendererLegacy
+	}
+	if m.Config.Renderer != gemma4RendererLegacy {
+		return m.Config.Renderer
+	}
+
+	for _, name := range []string{m.ShortName, m.Name} {
+		if renderer, ok := gemma4RendererFromName(name); ok {
+			return renderer
+		}
+	}
+
+	if count, ok := parseHumanParameterCount(m.Config.ModelType); ok {
+		return gemma4RendererForParameterCount(count)
+	}
+
+	return gemma4RendererSmall
+}
+
+// gemma4RendererForParameterCount selects the variant by parameter count.
+func gemma4RendererForParameterCount(parameterCount uint64) string {
+	if parameterCount < gemma4LargeMinParameterCount {
+		return gemma4RendererSmall
+	}
+	return gemma4RendererLarge
+}
+
+// gemma4RendererFromName infers the variant from a size tag in the model name.
+func gemma4RendererFromName(name string) (string, bool) {
+	lower := strings.ToLower(name)
+	if strings.Contains(lower, "e2b") || strings.Contains(lower, "e4b") {
+		return gemma4RendererSmall, true
+	}
+	if strings.Contains(lower, "26b") || strings.Contains(lower, "31b") {
+		return gemma4RendererLarge, true
+	}
+	return "", false
+}
+
+// parseHumanParameterCount parses sizes like "4.3B"; it rejects negative, NaN,
+// and overflowing values, whose float-to-uint64 conversion is unspecified.
+func parseHumanParameterCount(s string) (uint64, bool) {
+	if s == "" {
+		return 0, false
+	}
+
+	var multiplier float64
+	switch strings.ToUpper(s[len(s)-1:]) {
+	case "B":
+		multiplier = float64(format.Billion)
+	case "M":
+		multiplier = float64(format.Million)
+	case "K":
+		multiplier = float64(format.Thousand)
+	default:
+		return 0, false
+	}
+
+	value, err := strconv.ParseFloat(s[:len(s)-1], 64)
+	if scaled := value * multiplier; err == nil && scaled >= 0 && scaled < 1<<64 {
+		return uint64(scaled), true
+	}
+	return 0, false
+}
+
+// isGemma4Renderer reports whether renderer names a Gemma 4 renderer: the
+// legacy alias or either of the sized variants. Any other name, including
+// the empty string, yields false.
+func isGemma4Renderer(renderer string) bool {
+	return renderer == gemma4RendererLegacy ||
+		renderer == gemma4RendererSmall ||
+		renderer == gemma4RendererLarge
+}
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@@ -928,6 +928,59 @@ func TestCreateDetectTemplate(t *testing.T) {
 	})
 }

+func TestCreateGemma4KeepsDynamicRendererAlias(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+
+	p := t.TempDir()
+	t.Setenv("OLLAMA_MODELS", p)
+	var s Server
+
+	_, digest := createBinFile(t, ggml.KV{
+		"general.architecture":    "gemma4",
+		"general.parameter_count": uint64(25_200_000_000),
+	}, nil)
+
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+		Name:   "test",
+		Files:  map[string]string{"test.gguf": digest},
+		Stream: &stream,
+	})
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected status code 200, actual %d", w.Code)
+	}
+
+	mf, err := manifest.ParseNamedManifest(model.ParseName("test"))
+	if err != nil {
+		t.Fatalf("parse manifest: %v", err)
+	}
+	if mf.Config.Digest == "" {
+		t.Fatalf("unexpected empty config digest for manifest")
+	}
+
+	configPath, err := manifest.BlobsPath(mf.Config.Digest)
+	if err != nil {
+		t.Fatalf("config blob path: %v", err)
+	}
+
+	cfgFile, err := os.Open(configPath)
+	if err != nil {
+		t.Fatalf("open config blob: %v", err)
+	}
+	defer cfgFile.Close()
+
+	var cfg model.ConfigV2
+	if err := json.NewDecoder(cfgFile).Decode(&cfg); err != nil {
+		t.Fatalf("decode config: %v", err)
+	}
+
+	if cfg.Renderer != gemma4RendererLegacy {
+		t.Fatalf("expected renderer %q, got %q", gemma4RendererLegacy, cfg.Renderer)
+	}
+	if cfg.Parser != "gemma4" {
+		t.Fatalf("expected parser %q, got %q", "gemma4", cfg.Parser)
+	}
+}
+
 func TestDetectModelTypeFromFiles(t *testing.T) {
 	t.Run("gguf file", func(t *testing.T) {
 		_, digest := createBinFile(t, nil, nil)
--- a/x/create/gemma4.go
+++ b/x/create/gemma4.go
@@ -93,6 +93,13 @@ func (t gemma4ImportTransform) quantizationType(name string, shape []int32, quan
 		return ""
 	}

+	// MoE router logits choose the top-k expert set. Quantization noise here
+	// can flip expert selection, after which downstream activations diverge
+	// sharply. The tensor is small, so leave it in source precision.
+	if isGemma4RouterProjection(name) {
+		return ""
+	}
+
 	// Mixed-precision quantization: sensitive tensors get higher precision.
 	//
 	// Value projections (v_proj) directly determine attention output quality.
@@ -170,6 +177,12 @@ func isEmbedTokensWeight(name string) bool {
 		!strings.Contains(name, "per_layer")
 }

+// isGemma4RouterProjection matches router projection weights outside the audio/vision towers.
+func isGemma4RouterProjection(name string) bool {
+	inTower := strings.Contains(name, "audio_tower") || strings.Contains(name, "vision_tower")
+	return !inTower && strings.HasSuffix(name, ".router.proj.weight")
+}
+
 func (t gemma4ImportTransform) transformTensor(td *safetensors.TensorData) ([]*safetensors.TensorData, error) {
 	if td == nil {
 		return nil, nil
--- a/x/create/gemma4_test.go
+++ b/x/create/gemma4_test.go
@@ -68,6 +68,11 @@ func TestGemma4QuantizationType(t *testing.T) {
 		{"expert gate_up nvfp4", transform26B, "model.layers.0.moe.experts.42.gate_up_proj.weight", aligned, "nvfp4", "nvfp4"},
 		{"expert gate_up mxfp4", transform26B, "model.layers.0.moe.experts.42.gate_up_proj.weight", aligned, "mxfp4", "mxfp4"},

+		// === Router projection: expert selection is sensitive; keep source precision ===
+		{"router proj int4", transform26B, "model.layers.0.router.proj.weight", aligned, "int4", ""},
+		{"router proj nvfp4", transform26B, "model.layers.0.router.proj.weight", aligned, "nvfp4", ""},
+		{"router proj mxfp4", transform26B, "model.layers.0.router.proj.weight", aligned, "mxfp4", ""},
+
 		// === k_proj: promoted only for 8-expert models ===
 		{"k_proj 128 experts int4", transform26B, "model.layers.0.self_attn.k_proj.weight", aligned, "int4", "int4"},
 		{"k_proj 8 experts int4", transform8E, "model.layers.0.self_attn.k_proj.weight", aligned, "int4", "int8"},
--- a/x/imagegen/server.go
+++ b/x/imagegen/server.go
@@ -115,36 +115,7 @@ func (s *Server) Load(ctx context.Context, _ ml.SystemInfo, gpus []ml.DeviceInfo
 	// Spawn subprocess: ollama runner --imagegen-engine --model <path> --port <port>
 	cmd := exec.Command(exe, "runner", "--imagegen-engine", "--model", s.modelName, "--port", strconv.Itoa(port))
 	cmd.Env = os.Environ()
-
-	// On Linux, set LD_LIBRARY_PATH to include MLX library directories
-	if runtime.GOOS == "linux" {
-		// Build library paths: start with LibOllamaPath, then add any mlx_* subdirectories
-		libraryPaths := []string{ml.LibOllamaPath}
-		if mlxDirs, err := filepath.Glob(filepath.Join(ml.LibOllamaPath, "mlx_*")); err == nil {
-			libraryPaths = append(libraryPaths, mlxDirs...)
-		}
-
-		// Append existing LD_LIBRARY_PATH if set
-		if existingPath, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
-			libraryPaths = append(libraryPaths, filepath.SplitList(existingPath)...)
-		}
-
-		pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
-
-		// Update or add LD_LIBRARY_PATH in cmd.Env
-		found := false
-		for i := range cmd.Env {
-			if strings.HasPrefix(cmd.Env[i], "LD_LIBRARY_PATH=") {
-				cmd.Env[i] = "LD_LIBRARY_PATH=" + pathEnvVal
-				found = true
-				break
-			}
-		}
-		if !found {
-			cmd.Env = append(cmd.Env, "LD_LIBRARY_PATH="+pathEnvVal)
-		}
-		slog.Debug("mlx subprocess library path", "LD_LIBRARY_PATH", pathEnvVal)
-	}
+	configureMLXSubprocessEnv(cmd, ml.LibraryPaths(gpus))

 	s.cmd = cmd

@@ -200,6 +171,53 @@ func (s *Server) Ping(ctx context.Context) error {
 	return nil
 }

+// mlxLibraryPathEnv returns the library search path variable name for runtime.GOOS.
+func mlxLibraryPathEnv() string {
+	if runtime.GOOS == "windows" {
+		return "PATH"
+	}
+	if runtime.GOOS == "darwin" {
+		return "DYLD_LIBRARY_PATH"
+	}
+	return "LD_LIBRARY_PATH"
+}
+
+// configureMLXSubprocessEnv puts libraryPaths at the front of two variables in
+// cmd.Env: the platform library search variable and OLLAMA_LIBRARY_PATH. Any
+// value already present in this process's environment for either variable is
+// split and kept after the new entries. It leaves cmd untouched when
+// libraryPaths is empty.
+func configureMLXSubprocessEnv(cmd *exec.Cmd, libraryPaths []string) {
+	if len(libraryPaths) == 0 {
+		return
+	}
+
+	// Search order for the imagegen runner is:
+	//   1. bundled lib/ollama root
+	//   2. backend-specific library dirs selected during GPU discovery
+	//   3. any existing caller-provided library path values
+	for _, key := range []string{mlxLibraryPathEnv(), "OLLAMA_LIBRARY_PATH"} {
+		paths := append([]string{}, libraryPaths...)
+		if existing, ok := os.LookupEnv(key); ok {
+			paths = append(paths, filepath.SplitList(existing)...)
+		}
+		value := strings.Join(paths, string(filepath.ListSeparator))
+		setSubprocessEnv(cmd, key, value)
+		slog.Debug("mlx subprocess library path", key, value)
+	}
+}
+
+// setSubprocessEnv overwrites the first cmd.Env entry whose name matches key case-insensitively, else appends one.
+func setSubprocessEnv(cmd *exec.Cmd, key, value string) {
+	for i, kv := range cmd.Env {
+		if name, _, ok := strings.Cut(kv, "="); ok && strings.EqualFold(name, key) {
+			cmd.Env[i] = key + "=" + value
+			return
+		}
+	}
+	cmd.Env = append(cmd.Env, key+"="+value)
+}
+
 // getLastErr returns the last stderr line.
 func (s *Server) getLastErr() string {
 	s.lastErrLock.Lock()
--- a/x/mlxrunner/cache/cache.go
+++ b/x/mlxrunner/cache/cache.go
@@ -254,8 +254,23 @@ func (c *RotatingKVCache) concat(keys, values *mlx.Array) (newK *mlx.Array, newV
 		mlx.Pin(c.keys, c.values)
 	} else {
 		if c.idx < c.keys.Dim(2) {
-			c.keys.Set(c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.idx), mlx.Slice()))
-			c.values.Set(c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.idx), mlx.Slice()))
+			if c.offset <= c.maxSize {
+				// Not yet wrapped: slots [c.idx, Dim) are grow padding
+				// or stale post-rewind data, not live window content.
+				c.keys.Set(c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.idx), mlx.Slice()))
+				c.values.Set(c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.idx), mlx.Slice()))
+			} else {
+				// Wrapped: logical order is slots[idx..Dim) then slots[0..idx).
+				// Linearize so the trim + concat below operate on contiguous
+				// positions and preserve the last (maxSize - 1) old tokens.
+				tailK := c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(c.idx, c.keys.Dim(2)), mlx.Slice())
+				tailV := c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(c.idx, c.values.Dim(2)), mlx.Slice())
+				headK := c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.idx), mlx.Slice())
+				headV := c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.idx), mlx.Slice())
+				c.keys.Set(tailK.Concatenate(2, headK))
+				c.values.Set(tailV.Concatenate(2, headV))
+				c.idx = c.keys.Dim(2)
+			}
 		}

 		// Trim to max_size to maintain sliding window
@@ -322,9 +337,10 @@ func (c *RotatingKVCache) State() []*mlx.Array {
 	if c.keys == nil || c.values == nil {
 		return nil
 	}
+	liveLen := min(c.offset, c.keys.Dim(2))
 	return []*mlx.Array{
-		c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice()),
-		c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice()),
+		c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, liveLen), mlx.Slice()),
+		c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, liveLen), mlx.Slice()),
 	}
 }

--- a/x/mlxrunner/cache/rotating_multiturn_test.go
+++ b/x/mlxrunner/cache/rotating_multiturn_test.go
@@ -0,0 +1,338 @@
+package cache
+
+import (
+	"testing"
+
+	"github.com/ollama/ollama/x/mlxrunner/mlx"
+)
+
+// singleTokenKV, like its batched sibling multiTokenKV, returns a key/value
+// pair shaped [B=1, H=1, L, D=2] whose channels all carry the token id, which
+// lets stateIDs report the ids still present in the cache. Here L is 1.
+func singleTokenKV(id float32) (*mlx.Array, *mlx.Array) {
+	keys := mlx.FromValues([]float32{id, id}, 1, 1, 1, 2)
+	values := mlx.FromValues([]float32{id, id}, 1, 1, 1, 2)
+	return keys, values
+}
+
+func multiTokenKV(ids []float32) (*mlx.Array, *mlx.Array) {
+	// Each token occupies two adjacent channels, both set to its id, giving
+	// a [1, 1, len(ids), 2] layout for the keys and the values alike.
+	flat := make([]float32, 2*len(ids))
+	for i, id := range ids {
+		flat[2*i], flat[2*i+1] = id, id
+	}
+	return mlx.FromValues(flat, 1, 1, len(ids), 2), mlx.FromValues(flat, 1, 1, len(ids), 2)
+}
+
+// stateIDs reports the id held in each slot of the key tensor from c.State(),
+// in slot order (logical after a concat, physical/rotated after decode steps).
+func stateIDs(t *testing.T, c *RotatingKVCache) []float32 {
+	t.Helper()
+	state := c.State()
+	if state == nil {
+		return nil
+	}
+	keys := state[0]
+	mlx.Eval(keys)
+	channels := keys.Floats()
+	ids := make([]float32, keys.Dim(2))
+	for slot := range ids {
+		ids[slot] = channels[2*slot] // first of the two channels per slot
+	}
+	return ids
+}
+
+// equalSlice reports whether a and b hold the same ids in the same order.
+// Slices of different length are never equal; two empty slices (nil or not)
+// are equal.
+func equalSlice(a, b []float32) bool {
+	// Walk both slices in lockstep and stop at the first mismatch.
+	same := len(a) == len(b)
+	for i := 0; same && i < len(a); i++ {
+		same = a[i] == b[i]
+	}
+	return same
+}
+
+// feedMulti feeds ids startID..startID+n-1 in one Update; returns the next id.
+func feedMulti(c *RotatingKVCache, startID float32, n int) float32 {
+	batch := make([]float32, 0, n)
+	for i := 0; i < n; i++ {
+		batch = append(batch, startID+float32(i))
+	}
+	c.Update(multiTokenKV(batch))
+	return startID + float32(n)
+}
+
+// feedSingle pushes one token carrying id through a single-token Update.
+func feedSingle(c *RotatingKVCache, id float32) {
+	c.Update(singleTokenKV(id))
+}
+
+// TestRotatingKVCacheConcatMidRotationPreservesContext: after the buffer
+// has wrapped, a multi-token concat must keep the (maxSize-1) most recent
+// pre-existing tokens in logical order so the first Q of the new batch
+// has a full sliding window.
+func TestRotatingKVCacheConcatMidRotationPreservesContext(t *testing.T) {
+	skipIfNoMLX(t)
+
+	const window = 4
+	c := NewRotatingKVCache(window)
+
+	nextID := feedMulti(c, 1, 3)
+	for range 6 { // ids 4..9, one token per Update
+		feedSingle(c, nextID)
+		nextID++
+	}
+	if c.Offset() != 9 {
+		t.Fatalf("setup: offset=%d want 9", c.Offset())
+	}
+	if c.idx >= c.maxSize { // the scenario needs the write index strictly inside the window
+		t.Fatalf("setup: expected mid-rotation idx (<%d), got %d", c.maxSize, c.idx)
+	}
+
+	feedMulti(c, 10, 2)
+	got := stateIDs(t, c)
+	want := []float32{7, 8, 9, 10, 11} // window-1 = 3 old ids, then the 2 new ids
+	if !equalSlice(got, want) {
+		t.Fatalf("post-concat window=%v want %v", got, want)
+	}
+	if c.Offset() != 11 {
+		t.Fatalf("offset=%d want 11", c.Offset())
+	}
+}
+
+// TestRotatingKVCacheConcatAlignedInvariant: with an aligned buffer
+// (c.idx == Dim), an L>1 concat keeps the last (maxSize-1) pre-existing
+// tokens plus the full new batch. This is the chunked-prefill contract
+// x/mlxrunner/pipeline.go relies on.
+func TestRotatingKVCacheConcatAlignedInvariant(t *testing.T) {
+	skipIfNoMLX(t)
+
+	const window = 4
+	c := NewRotatingKVCache(window)
+
+	// Chunk 1 fills past maxSize, leaving Dim == maxSize aligned.
+	feedMulti(c, 1, 6)
+	// Chunk 2: the buffer is intentionally oversized to (maxSize-1) + L
+	// so the first new Q has its full window in scope for this forward.
+	feedMulti(c, 7, 3)
+	got := stateIDs(t, c)
+	want := []float32{4, 5, 6, 7, 8, 9} // window-1 = 3 old ids, then the 3 new ids
+	if !equalSlice(got, want) {
+		t.Fatalf("post-chunk-2 buffer=%v want %v", got, want)
+	}
+
+	// The next decode trims oversize back to maxSize; order may be
+	// physical (rotated), so check as a set.
+	feedSingle(c, 10)
+	got = stateIDs(t, c)
+	if len(got) != window {
+		t.Fatalf("post-decode Dim=%d want %d", len(got), window)
+	}
+	seen := map[float32]bool{}
+	for _, v := range got {
+		seen[v] = true
+	}
+	for _, w := range []float32{7, 8, 9, 10} { // the `window` most recent ids
+		if !seen[w] {
+			t.Fatalf("post-decode window missing %v (got %v)", w, got)
+		}
+	}
+}
+
+// TestRotatingKVCacheConcatAfterDecodeGrowsBuffer: update() grows the
+// underlying buffer by `step` slots via mlx.Zeros before writing, so
+// after one decode on a short prefill c.idx < Dim even though the cache
+// has not wrapped. Those trailing slots are zero padding and must not
+// be pulled back into the live window on the next concat.
+func TestRotatingKVCacheConcatAfterDecodeGrowsBuffer(t *testing.T) {
+	skipIfNoMLX(t)
+
+	const window = 512
+	c := NewRotatingKVCache(window)
+
+	feedMulti(c, 1, 3)
+	feedSingle(c, 4) // the single decode step described above
+	feedMulti(c, 5, 3)
+
+	got := stateIDs(t, c)
+	want := []float32{1, 2, 3, 4, 5, 6, 7} // contiguous ids: no zero padding between 4 and 5
+	if !equalSlice(got, want) {
+		t.Fatalf("growing-buffer concat=%v want %v", got, want)
+	}
+}
+
+// TestRotatingKVCacheConcatAfterLiveRewind: x/mlxrunner/cache.go calls
+// Restore(nil, target) between conversation turns to rewind the cache to
+// the matched prefix. Restore moves c.offset/c.idx without trimming the
+// underlying buffer, so slots [c.idx, Dim) still hold stale pre-rewind
+// tokens. A subsequent concat must drop those, not treat them as wrapped
+// window content.
+func TestRotatingKVCacheConcatAfterLiveRewind(t *testing.T) {
+	skipIfNoMLX(t)
+
+	const window = 8
+	c := NewRotatingKVCache(window)
+
+	// Grow the buffer to exactly maxSize without wrapping.
+	feedMulti(c, 1, 2)
+	for id := float32(3); id <= 8; id++ {
+		feedSingle(c, id)
+	}
+	if c.Offset() != window {
+		t.Fatalf("setup: offset=%d want %d", c.Offset(), window)
+	}
+
+	if !c.Restore(nil, 2) { // rewind to the first two tokens
+		t.Fatalf("live rewind to 2 failed")
+	}
+	if c.Offset() != 2 {
+		t.Fatalf("post-rewind offset=%d want 2", c.Offset())
+	}
+
+	feedMulti(c, 9, 3)
+	got := stateIDs(t, c)
+	want := []float32{1, 2, 9, 10, 11} // kept prefix, then the new batch; stale ids 3..8 are gone
+	if !equalSlice(got, want) {
+		t.Fatalf("post-rewind concat=%v want %v", got, want)
+	}
+	if c.Offset() != 5 {
+		t.Fatalf("offset=%d want 5", c.Offset())
+	}
+}
+
+// TestRotatingKVCacheConcatGrowingBuffer: when oldLen < maxSize the trim
+// formula drops to non-positive and all pre-existing tokens are kept.
+func TestRotatingKVCacheConcatGrowingBuffer(t *testing.T) {
+	skipIfNoMLX(t)
+
+	const window = 4
+	c := NewRotatingKVCache(window)
+
+	feedMulti(c, 1, 2)
+	feedMulti(c, 3, 2) // second batch arrives while only 2 of 4 window slots are used
+	got := stateIDs(t, c)
+	want := []float32{1, 2, 3, 4}
+	if !equalSlice(got, want) {
+		t.Fatalf("growing buffer=%v want %v", got, want)
+	}
+}
+
+// TestRotatingKVCacheRunnerChunkedPrefill mirrors the
+// x/mlxrunner/pipeline.go prefill loop: a long prompt fed through
+// repeated L>1 Update() calls on a single cache. Scaled-down proxy for
+// the Gemma 4 26B case (sliding_window=1024, prefillChunkSize=2048).
+func TestRotatingKVCacheRunnerChunkedPrefill(t *testing.T) {
+	skipIfNoMLX(t)
+
+	const window = 4
+	c := NewRotatingKVCache(window)
+
+	feedMulti(c, 1, 8)
+	if c.Offset() != 8 {
+		t.Fatalf("chunk 1: offset=%d want 8", c.Offset())
+	}
+
+	feedMulti(c, 9, 8)
+	got := stateIDs(t, c)
+	want := []float32{6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} // window-1 = 3 old ids + all 8 new ids
+	if !equalSlice(got, want) {
+		t.Fatalf("chunk 2: buffer=%v want %v", got, want)
+	}
+
+	feedMulti(c, 17, 4)
+	got = stateIDs(t, c)
+	want = []float32{14, 15, 16, 17, 18, 19, 20} // window-1 = 3 old ids + the 4 new ids
+	if !equalSlice(got, want) {
+		t.Fatalf("chunk 3: buffer=%v want %v", got, want)
+	}
+
+	// Decode trims oversize back to maxSize; order may be physical.
+	feedSingle(c, 21)
+	got = stateIDs(t, c)
+	if len(got) != window {
+		t.Fatalf("post-decode Dim=%d want %d", len(got), window)
+	}
+	seen := map[float32]bool{}
+	for _, v := range got {
+		seen[v] = true
+	}
+	for _, w := range []float32{18, 19, 20, 21} { // the `window` most recent ids
+		if !seen[w] {
+			t.Fatalf("post-decode window missing %v (got %v)", w, got)
+		}
+	}
+}
+
+// TestRotatingKVCacheMultiTurnChatSimulation walks a prefill → decode →
+// prefill sequence and checks that each new prefill retains the last
+// (maxSize-1) pre-existing tokens in logical order.
+func TestRotatingKVCacheMultiTurnChatSimulation(t *testing.T) {
+	skipIfNoMLX(t)
+
+	const window = 4
+	c := NewRotatingKVCache(window)
+
+	nextID := feedMulti(c, 1, 2)
+	for range 5 { // turn 1 decode: ids 3..7
+		feedSingle(c, nextID)
+		nextID++
+	}
+	if c.Offset() != 7 {
+		t.Fatalf("turn 1: offset=%d want 7", c.Offset())
+	}
+
+	feedMulti(c, nextID, 3)
+	nextID += 3
+	got := stateIDs(t, c)
+	want := []float32{5, 6, 7, 8, 9, 10} // window-1 = 3 old ids + the 3 new ids
+	if !equalSlice(got, want) {
+		t.Fatalf("turn 2 prefill buffer=%v want %v", got, want)
+	}
+
+	for range 4 { // turn 2 decode: ids 11..14
+		feedSingle(c, nextID)
+		nextID++
+	}
+	if c.Offset() != 14 {
+		t.Fatalf("turn 2 decode: offset=%d want 14", c.Offset())
+	}
+
+	feedMulti(c, nextID, 2)
+	got = stateIDs(t, c)
+	want = []float32{12, 13, 14, 15, 16} // window-1 = 3 old ids + the 2 new ids
+	if !equalSlice(got, want) {
+		t.Fatalf("turn 3 prefill buffer=%v want %v", got, want)
+	}
+}
+
+// TestRotatingKVCacheOffsetTracking: Offset() is the monotonic logical
+// token count through any mix of Update() calls — Gemma 4 uses
+// donorEntry.Offset - L for the consumer's RoPE offset.
+func TestRotatingKVCacheOffsetTracking(t *testing.T) {
+	skipIfNoMLX(t)
+
+	c := NewRotatingKVCache(4)
+	nextID := feedMulti(c, 1, 3)
+	if c.Offset() != 3 {
+		t.Fatalf("after prefill 3: offset=%d want 3", c.Offset())
+	}
+	for i := range 5 { // single-token steps: the offset must advance by one each time
+		feedSingle(c, nextID)
+		nextID++
+		if c.Offset() != 3+i+1 {
+			t.Fatalf("after decode %d: offset=%d want %d", i, c.Offset(), 3+i+1)
+		}
+	}
+	nextID = feedMulti(c, nextID, 2)
+	if c.Offset() != 10 {
+		t.Fatalf("after turn-2 prefill: offset=%d want 10", c.Offset())
+	}
+	// L > maxSize concat.
+	feedMulti(c, nextID, 7)
+	if c.Offset() != 17 {
+		t.Fatalf("after large prefill: offset=%d want 17", c.Offset())
+	}
+}
--- a/x/mlxrunner/client.go
+++ b/x/mlxrunner/client.go
@@ -158,13 +158,15 @@ type completionRequest struct {
 }

 type completionOpts struct {
-	Temperature     float32 `json:"temperature,omitempty"`
-	TopP            float32 `json:"top_p,omitempty"`
-	MinP            float32 `json:"min_p,omitempty"`
-	TopK            int     `json:"top_k,omitempty"`
-	RepeatLastN     int     `json:"repeat_last_n,omitempty"`
-	PresencePenalty float32 `json:"presence_penalty,omitempty"`
-	NumPredict      int     `json:"num_predict,omitempty"`
+	Temperature      float32 `json:"temperature,omitempty"`
+	TopP             float32 `json:"top_p,omitempty"`
+	MinP             float32 `json:"min_p,omitempty"`
+	TopK             int     `json:"top_k,omitempty"`
+	RepeatLastN      int     `json:"repeat_last_n,omitempty"`
+	RepeatPenalty    float32 `json:"repeat_penalty,omitempty"`
+	PresencePenalty  float32 `json:"presence_penalty,omitempty"`
+	FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
+	NumPredict       int     `json:"num_predict,omitempty"`
 }

 type CompletionResponse struct {
@@ -206,13 +208,15 @@ func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn f
 	}
 	if req.Options != nil {
 		creq.Options = &completionOpts{
-			Temperature:     req.Options.Temperature,
-			TopP:            req.Options.TopP,
-			MinP:            req.Options.MinP,
-			TopK:            req.Options.TopK,
-			RepeatLastN:     req.Options.RepeatLastN,
-			PresencePenalty: req.Options.PresencePenalty,
-			NumPredict:      req.Options.NumPredict,
+			Temperature:      req.Options.Temperature,
+			TopP:             req.Options.TopP,
+			MinP:             req.Options.MinP,
+			TopK:             req.Options.TopK,
+			RepeatLastN:      req.Options.RepeatLastN,
+			RepeatPenalty:    req.Options.RepeatPenalty,
+			PresencePenalty:  req.Options.PresencePenalty,
+			FrequencyPenalty: req.Options.FrequencyPenalty,
+			NumPredict:       req.Options.NumPredict,
 		}
 	}

--- a/x/mlxrunner/mlx/act.go
+++ b/x/mlxrunner/mlx/act.go
@@ -1,62 +1,64 @@
 package mlx

-// #include "generated.h"
-import "C"
 import "math"

 var geluCoeff = float32(math.Sqrt(2 / math.Pi))

-// GELUApprox matches mlx.nn.gelu_approx:
-//
-//	0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
-func GELUApprox(x *Array) *Array {
-	// Use dtype-matched scalars to avoid implicit upcasts on bf16 inputs.
-	half := scalarWithDtype(0.5, x)
-	defer C.mlx_array_free(half)
-	coeff := scalarWithDtype(geluCoeff, x)
-	defer C.mlx_array_free(coeff)
-	c := scalarWithDtype(0.044715, x)
-	defer C.mlx_array_free(c)
+// GELUApprox returns 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+// as a fused kernel.
+var GELUApprox = Compile1(
+	"GELUApprox",
+	func(x *Array) *Array {
+		// Dtype-matched scalars avoid implicit upcasts on bf16 inputs.
+		dt := x.DType()
+		half := FromValue[float32](0.5).AsType(dt)
+		coeff := FromValue(geluCoeff).AsType(dt)
+		c := FromValue[float32](0.044715).AsType(dt)
+		one := FromValue[float32](1.0).AsType(dt)

-	// x^3 via x*x*x (avoids general Power which is slower)
-	x3 := New("GELU_X3")
-	C.mlx_multiply(&x3.ctx, x.ctx, x.ctx, DefaultStream().ctx)
-	tmp := New("GELU_X3b")
-	C.mlx_multiply(&tmp.ctx, x3.ctx, x.ctx, DefaultStream().ctx)
-	x3 = tmp
+		// x^3 via x*x*x (avoids general Power which is slower).
+		x3 := x.Multiply(x).Multiply(x)
+		inner := x.Add(c.Multiply(x3))
+		tanh := coeff.Multiply(inner).Tanh()
+		return half.Multiply(x).Multiply(one.Add(tanh))
+	},
+	Shapeless(),
+)

-	// 0.044715 * x^3
-	cx3 := New("GELU_CX3")
-	C.mlx_multiply(&cx3.ctx, c, x3.ctx, DefaultStream().ctx)
+// SiLU returns a * sigmoid(a) as a fused kernel.
+var SiLU = Compile1(
+	"SiLU",
+	func(a *Array) *Array {
+		return a.Multiply(a.Sigmoid())
+	},
+	Shapeless(),
+)

-	// x + 0.044715 * x^3
-	inner := New("GELU_INNER")
-	C.mlx_add(&inner.ctx, x.ctx, cx3.ctx, DefaultStream().ctx)
+// SwiGLU returns silu(gate) * up as a fused kernel.
+var SwiGLU = Compile2(
+	"SwiGLU",
+	func(gate, up *Array) *Array {
+		return SiLU(gate).Multiply(up)
+	},
+	Shapeless(),
+)

-	// sqrt(2/pi) * (x + 0.044715 * x^3)
-	scaled := New("GELU_SCALED")
-	C.mlx_multiply(&scaled.ctx, coeff, inner.ctx, DefaultStream().ctx)
+// GeGLU returns gelu_approx(gate) * up as a fused kernel. Matches mlx_lm's
+// geglu, used by Gemma-family MLP and MoE paths.
+var GeGLU = Compile2(
+	"GeGLU",
+	func(gate, up *Array) *Array {
+		return GELUApprox(gate).Multiply(up)
+	},
+	Shapeless(),
+)

-	// tanh(...)
-	th := New("GELU_TANH")
-	C.mlx_tanh(&th.ctx, scaled.ctx, DefaultStream().ctx)
-
-	// 1 + tanh(...)
-	one := scalarWithDtype(1.0, x)
-	defer C.mlx_array_free(one)
-	onePlusTanh := New("GELU_1PT")
-	C.mlx_add(&onePlusTanh.ctx, one, th.ctx, DefaultStream().ctx)
-
-	// 0.5 * x
-	halfX := New("GELU_HALFX")
-	C.mlx_multiply(&halfX.ctx, half, x.ctx, DefaultStream().ctx)
-
-	// 0.5 * x * (1 + tanh(...))
-	out := New("GELU_APPROX")
-	C.mlx_multiply(&out.ctx, halfX.ctx, onePlusTanh.ctx, DefaultStream().ctx)
-	return out
-}
-
-func SILU(t *Array) *Array {
-	return t.Multiply(t.Sigmoid()).AsType(t.DType())
-}
+// LogitSoftcap returns tanh(x / cap) * cap as a fused kernel. Matches
+// mlx_lm's logit_softcap. cap must have the same dtype as x.
+var LogitSoftcap = Compile2(
+	"LogitSoftcap",
+	func(x, cap *Array) *Array {
+		return x.Divide(cap).Tanh().Multiply(cap)
+	},
+	Shapeless(),
+)
--- a/x/mlxrunner/mlx/array.go
+++ b/x/mlxrunner/mlx/array.go
@@ -27,7 +27,11 @@ var arrays []*Array

 func New(name string) *Array {
 	t := &Array{name: name}
-	arrays = append(arrays, t)
+	if tracing {
+		traceScratch = append(traceScratch, t)
+	} else {
+		arrays = append(arrays, t)
+	}
 	return t
 }

--- a/x/mlxrunner/mlx/compile.go
+++ b/x/mlxrunner/mlx/compile.go
@@ -0,0 +1,192 @@
+package mlx
+
+// #include <stdlib.h>
+// #include "generated.h"
+//
+// extern int closureCallback(mlx_vector_array* res, mlx_vector_array input, void* payload);
+// extern void closureDestructor(void* payload);
+import "C"
+
+import (
+	"log/slog"
+	"runtime/cgo"
+	"sync"
+	"unsafe"
+)
+
+// CompileFunc is the signature of a function that can be compiled.
+type CompileFunc func(inputs ...*Array) []*Array
+
+// CompileOption configures Compile behavior.
+type CompileOption func(*compileConfig)
+
+type compileConfig struct {
+	shapeless bool
+}
+
+// Shapeless makes Compile trace the function a single time against symbolic
+// shapes, so the compiled graph is reused for inputs of any shape. Without it
+// MLX re-traces, and caches, one specialization per (shape, dtype) pair.
+func Shapeless() CompileOption {
+	return func(cfg *compileConfig) { cfg.shapeless = true }
+}
+
+// Compile returns a compiled version of fn. The MLX closure is built lazily
+// on the first call, and compile or apply failures panic via mlxCheck. When
+// called during another compile's trace, fn is inlined directly so outer
+// compiles can fuse through inner ones.
+//
+// Compiled functions must not have side effects, and must not read anything
+// but their arguments (Go data or MLX arrays) unless it is a constant.
+func Compile(name string, fn CompileFunc, opts ...CompileOption) CompileFunc {
+	var cfg compileConfig
+	for _, o := range opts {
+		o(&cfg)
+	}
+
+	var closure C.mlx_closure
+	var once sync.Once
+
+	return func(inputs ...*Array) []*Array {
+		if tracing {
+			return fn(inputs...) // already inside a trace: run fn inline
+		}
+
+		once.Do(func() {
+			payload := (*cgo.Handle)(C.malloc(C.size_t(unsafe.Sizeof(cgo.Handle(0)))))
+			*payload = cgo.NewHandle(fn) // deleted, and payload freed, by closureDestructor
+			src := C.mlx_closure_new_func_payload(
+				(*[0]byte)(C.closureCallback),
+				unsafe.Pointer(payload),
+				(*[0]byte)(C.closureDestructor),
+			)
+			defer C.mlx_closure_free(src)
+
+			closure = C.mlx_closure_new()
+			mlxCheck(name+": compile failed", func() C.int {
+				return C.mlx_compile(&closure, src, C.bool(cfg.shapeless))
+			})
+		})
+
+		inVec := C.mlx_vector_array_new()
+		defer C.mlx_vector_array_free(inVec)
+		for _, in := range inputs {
+			C.mlx_vector_array_append_value(inVec, in.ctx)
+		}
+
+		outVec := C.mlx_vector_array_new()
+		defer C.mlx_vector_array_free(outVec)
+		mlxCheck(name+": closure apply failed", func() C.int {
+			return C.mlx_closure_apply(&outVec, closure, inVec)
+		})
+
+		n := int(C.mlx_vector_array_size(outVec))
+		outputs := make([]*Array, n)
+		for i := range n {
+			outputs[i] = New(name) // each result is a tracked array named after the function
+			C.mlx_vector_array_get(&outputs[i].ctx, outVec, C.size_t(i))
+		}
+		return outputs
+	}
+}
+
+// Compile1 is Compile for one-argument functions. See Compile.
+func Compile1(name string, fn func(*Array) *Array, opts ...CompileOption) func(*Array) *Array {
+	// Adapt fn to the variadic CompileFunc shape, then unwrap the one result.
+	variadic := func(args ...*Array) []*Array { return []*Array{fn(args[0])} }
+	compiled := Compile(name, variadic, opts...)
+	return func(a *Array) *Array {
+		return compiled(a)[0]
+	}
+}
+
+// Compile2 is Compile for two-argument functions. See Compile.
+func Compile2(name string, fn func(*Array, *Array) *Array, opts ...CompileOption) func(*Array, *Array) *Array {
+	// Adapt fn to the variadic CompileFunc shape, then unwrap the one result.
+	variadic := func(args ...*Array) []*Array { return []*Array{fn(args[0], args[1])} }
+	compiled := Compile(name, variadic, opts...)
+	return func(a, b *Array) *Array {
+		return compiled(a, b)[0]
+	}
+}
+
+// Compile3 is Compile for three-argument functions. See Compile.
+func Compile3(name string, fn func(*Array, *Array, *Array) *Array, opts ...CompileOption) func(*Array, *Array, *Array) *Array {
+	// Adapt fn to the variadic CompileFunc shape, then unwrap the one result.
+	variadic := func(args ...*Array) []*Array { return []*Array{fn(args[0], args[1], args[2])} }
+	compiled := Compile(name, variadic, opts...)
+	return func(a, b, c *Array) *Array {
+		return compiled(a, b, c)[0]
+	}
+}
+
+// tracing is true while a compile callback is running. Since MLX is
+// single-threaded at this level a plain Go bool suffices.
+var tracing bool
+
+// traceScratch collects arrays created during a compile trace so they can be
+// freed as a group when the callback returns.
+var traceScratch []*Array
+
+// closureCallback is the trace entry point MLX calls for a compiled closure.
+// It returns 0 on success and 1 when the Go side panicked.
+//
+//export closureCallback
+func closureCallback(res *C.mlx_vector_array, input C.mlx_vector_array, payload unsafe.Pointer) (rc C.int) {
+	defer func() {
+		if r := recover(); r != nil {
+			slog.Error("mlx closure callback panicked", "panic", r)
+			rc = 1
+		}
+	}()
+
+	handle := *(*cgo.Handle)(payload)
+	fn := handle.Value().(CompileFunc)
+
+	// While tracing, New collects every array in traceScratch; they are freed
+	// as a group when the callback returns, so the trace acts like one op.
+	// Inputs stay owned by the caller and outputs are handed back to MLX.
+	if tracing {
+		panic("mlx: nested compile trace")
+	}
+	tracing, traceScratch = true, nil
+	defer func() {
+		// Leave tracing mode first so the panic below cannot leave it stuck on.
+		scratch := traceScratch
+		tracing, traceScratch = false, nil
+		for _, a := range scratch {
+			if a.pinned > 0 {
+				panic("mlx: traced array was pinned during compilation")
+			}
+			if a.Valid() {
+				C.mlx_array_free(a.ctx)
+				a.ctx.ctx = nil
+			}
+		}
+	}()
+
+	inputs := make([]*Array, int(C.mlx_vector_array_size(input)))
+	for i := range inputs {
+		inputs[i] = New("")
+		C.mlx_vector_array_get(&inputs[i].ctx, input, C.size_t(i))
+	}
+	outputs := fn(inputs...)
+
+	var arrPtr *C.mlx_array
+	if len(outputs) > 0 {
+		handles := make([]C.mlx_array, len(outputs))
+		for i, out := range outputs {
+			handles[i] = out.ctx
+		}
+		arrPtr = &handles[0]
+	}
+	C.mlx_vector_array_set_data(res, arrPtr, C.size_t(len(outputs)))
+	return 0
+}
+
+//export closureDestructor
+func closureDestructor(payload unsafe.Pointer) {
+	handle := *(*cgo.Handle)(payload) // payload is the malloc'd cgo.Handle cell written by Compile
+	handle.Delete()
+	C.free(payload) // release the C allocation that carried the handle
+}
--- a/x/mlxrunner/mlx/compile_test.go
+++ b/x/mlxrunner/mlx/compile_test.go
@@ -0,0 +1,147 @@
+package mlx
+
+import (
+	"testing"
+)
+
+func TestCompileFusion(t *testing.T) {
+	skipIfNoMLX(t)
+
+	// Compile fuses the ops inside a function body into a single kernel,
+	// eliminating intermediate buffers. Use a diamond-shaped graph where
+	// two branches must be materialized simultaneously without fusion,
+	// then compare peak memory against the compiled version which fuses
+	// everything into one kernel with no intermediates.
+	const n = 1024 * 1024 // 4MB per float32 array
+	data := make([]float32, n)
+	for i := range data {
+		data[i] = float32(i + 1)
+	}
+
+	// Diamond: both a*b and a+b must be live for the final multiply.
+	// Without fusion: peak includes both intermediates (~8MB extra).
+	// With fusion: single kernel, no intermediates.
+	body := func(a, b *Array) *Array {
+		return a.Multiply(b).Multiply(a.Add(b))
+	}
+
+	a := FromValues(data, n)
+	b := FromValues(data, n)
+	Pin(a, b)
+	defer Unpin(a, b)
+
+	// Compiled: ops fused into a single kernel.
+	EnableCompile()
+	fn := Compile2("diamond", body, Shapeless())
+	warm := fn(a, b) // first call builds the closure; keep it out of the measured run
+	Eval(warm)
+	Sweep()
+	ClearCache()
+	ResetPeakMemory()
+	y := fn(a, b)
+	Eval(y)
+	compiledPeak := PeakMemory()
+	Sweep()
+
+	// Uncompiled: ops evaluated individually, intermediates materialized.
+	ClearCache()
+	ResetPeakMemory()
+	z := body(a, b) // plain Go call, bypassing the compiled wrapper
+	Eval(z)
+	uncompiledPeak := PeakMemory()
+	Sweep()
+
+	if compiledPeak == 0 && uncompiledPeak == 0 {
+		t.Skip("peak memory tracking not available")
+	}
+
+	t.Logf("peak memory: compiled=%d uncompiled=%d", compiledPeak, uncompiledPeak)
+
+	if compiledPeak >= uncompiledPeak {
+		t.Fatalf("compilation did not reduce peak memory: compiled=%d uncompiled=%d", compiledPeak, uncompiledPeak)
+	}
+}
+
+func TestCompileNested(t *testing.T) {
+	skipIfNoMLX(t)
+
+	// A compiled function that calls another compiled function should
+	// produce correct results. During the outer's trace the package-level
+	// tracing flag is set, so the inner function runs inline.
+	inner := Compile1("silu", func(a *Array) *Array {
+		return a.Multiply(a.Sigmoid())
+	}, Shapeless())
+
+	outer := Compile2("swiglu", func(gate, up *Array) *Array {
+		return inner(gate).Multiply(up)
+	}, Shapeless())
+
+	gate := FromValues([]float32{0, 1, 2}, 3)
+	up := FromValues([]float32{1, 1, 1}, 3)
+	Pin(gate, up)
+	defer Unpin(gate, up)
+
+	y := outer(gate, up)
+	Eval(y)
+
+	// silu(x) = x * sigmoid(x); for x=0 → 0, x=1 → ~0.7311, x=2 → ~1.7616
+	got := y.Floats()
+	want := []float32{0, 0.7310586, 1.7615942}
+	for i, v := range got {
+		if v-want[i] > 1e-4 || want[i]-v > 1e-4 {
+			t.Fatalf("got[%d]=%v want %v", i, v, want[i])
+		}
+	}
+}
+
+func TestCompileCallbackPanicRecovers(t *testing.T) {
+	skipIfNoMLX(t)
+
+	boom := Compile1("boom", func(a *Array) *Array {
+		panic("intentional test panic")
+	})
+
+	x := FromValues([]float32{1}, 1)
+	Pin(x)
+	defer Unpin(x)
+
+	defer func() {
+		r := recover()
+		if r == nil {
+			t.Fatal("expected panic from Call, got none")
+		}
+		if _, ok := r.(string); !ok { // mlxCheck panics with a string message
+			t.Fatalf("expected string panic, got %T: %v", r, r)
+		}
+	}()
+	boom(x) // the callback recovers the panic and returns 1; presumably MLX then fails the apply
+}
+
+func TestCompileNoTrackingGrowth(t *testing.T) {
+	skipIfNoMLX(t)
+
+	// Repeated invocations of a compiled kernel should not grow the
+	// tracked-arrays list — the callback's traceScratch collects
+	// intermediates during tracing and frees them when the callback returns.
+	fn := Compile2("mul_add", func(a, b *Array) *Array {
+		return a.Multiply(b).Add(b)
+	})
+
+	a := FromValues([]float32{1, 2}, 2)
+	b := FromValues([]float32{3, 4}, 2)
+	Pin(a, b)
+	defer Unpin(a, b)
+
+	Sweep()
+	before := len(arrays)
+
+	for range 100 {
+		_ = fn(a, b) // each call adds one tracked output array via New
+		Sweep()
+	}
+
+	after := len(arrays)
+	if after > before+2 { // small slack rather than strict equality
+		t.Fatalf("tracked arrays grew from %d to %d across 100 calls (includes initial trace)", before, after)
+	}
+}
--- a/x/mlxrunner/mlx/mlx.go
+++ b/x/mlxrunner/mlx/mlx.go
@@ -9,8 +9,8 @@ package mlx
 // #include "generated.h"
 // #include <string.h>
 //
-// static char _mlx_last_error_msg[1024] = {0};
-// static int  _mlx_last_error_flag = 0;
+// static __thread char _mlx_last_error_msg[1024] = {0};
+// static __thread int  _mlx_last_error_flag = 0;
 //
 // static void _mlx_capture_error_handler(const char* msg, void* data) {
 //     (void)data;
@@ -30,15 +30,13 @@ package mlx
 //     _mlx_last_error_msg[0] = '\0';
 // }
 //
-// static int mlx_had_last_error(void) {
-//     return _mlx_last_error_flag;
-// }
-//
 // static const char* mlx_get_last_error(void) {
-//     return _mlx_last_error_flag ? _mlx_last_error_msg : NULL;
+//     return _mlx_last_error_flag ? _mlx_last_error_msg : "";
 // }
 import "C"

+import "runtime"
+
 func init() {
 	// Replace the default exit(-1) error handler with one that captures
 	// the error message so we can surface it in Go.
@@ -53,6 +51,24 @@ func Version() string {
 	return C.GoString(C.mlx_string_data(str))
 }

+// mlxCheck pins the goroutine to its OS thread (the captured error state is
+// thread-local), clears that state, and runs fn. A non-zero result panics
+// with the captured message, or with fallback when nothing was captured.
+func mlxCheck(fallback string, fn func() C.int) {
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	C.mlx_clear_last_error()
+	if rc := fn(); rc == 0 {
+		return
+	}
+	reason := fallback
+	if captured := C.GoString(C.mlx_get_last_error()); captured != "" {
+		reason = captured
+	}
+	panic("mlx: " + reason)
+}
+
 func doEval(outputs []*Array, async bool) {
 	if len(outputs) == 0 {
 		return
@@ -67,20 +83,12 @@ func doEval(outputs []*Array, async bool) {
 		}
 	}

-	C.mlx_clear_last_error()
-	var rc C.int
-	if async {
-		rc = C.mlx_async_eval(vector)
-	} else {
-		rc = C.mlx_eval(vector)
-	}
-	if rc != 0 {
-		msg := "mlx eval failed"
-		if C.mlx_had_last_error() != 0 {
-			msg = C.GoString(C.mlx_get_last_error())
+	mlxCheck("eval failed", func() C.int {
+		if async {
+			return C.mlx_async_eval(vector)
 		}
-		panic("mlx: " + msg)
-	}
+		return C.mlx_eval(vector)
+	})
 }

 func AsyncEval(outputs ...*Array) {
--- a/x/mlxrunner/mlx/ops.go
+++ b/x/mlxrunner/mlx/ops.go
@@ -169,6 +169,12 @@ func (t *Array) PutAlongAxis(indices, values *Array, axis int) *Array {
 	return out
 }

+func (t *Array) ScatterAddAxis(indices, values *Array, axis int) *Array {
+	out := New("SCATTER_ADD_AXIS")
+	C.mlx_scatter_add_axis(&out.ctx, t.ctx, indices.ctx, values.ctx, C.int(axis), DefaultStream().ctx) // NOTE(review): the C return code is discarded
+	return out
+}
+
 func (t *Array) Reshape(axes ...int) *Array {
 	cAxes := make([]C.int, len(axes))
 	for i := range axes {
--- a/x/mlxrunner/mlx/ops_extra.go
+++ b/x/mlxrunner/mlx/ops_extra.go
@@ -404,11 +404,6 @@ func GatherMM(a, b *Array, lhsIndices, rhsIndices *Array, sortedIndices bool) *A
 	return a.GatherMM(b, lhsIndices, rhsIndices, sortedIndices)
 }

-func SiLU(a *Array) *Array {
-	sig := a.Sigmoid()
-	return a.Multiply(sig)
-}
-
 func RoPEWithBase(x *Array, dims int, traditional bool, base, scale float32, offset int) *Array {
 	return RoPEWithFreqs(x, dims, traditional, base, scale, offset, nil)
 }
--- a/x/mlxrunner/pipeline.go
+++ b/x/mlxrunner/pipeline.go
@@ -23,28 +23,19 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		return errors.New("model not loaded")
 	}

-	enableCompile := true
-	if modelCompile, ok := r.Model.(interface{ EnableCompile() bool }); ok {
-		enableCompile = modelCompile.EnableCompile()
-	}
-	if enableCompile {
-		mlx.EnableCompile()
-	} else {
-		mlx.DisableCompile()
-	}
 	mlx.ResetPeakMemory()
 	ctx := request.Ctx
 	var (
-		sample, logprobs         *mlx.Array
-		nextSample, nextLogprobs *mlx.Array
+		sample     *mlx.Array
+		nextSample *mlx.Array
 	)

 	defer func() {
 		if request.Sampler != nil {
 			request.Sampler.Free()
 		}
-		mlx.Unpin(sample, logprobs)
-		mlx.Unpin(nextSample, nextLogprobs)
+		mlx.Unpin(sample)
+		mlx.Unpin(nextSample)
 		mlx.Sweep()
 		mlx.ClearCache()

@@ -144,22 +135,21 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		mlx.ClearCache()
 	}

-	step := func(token *mlx.Array) (*mlx.Array, *mlx.Array) {
+	step := func(token *mlx.Array) *mlx.Array {
 		fwd := r.Model.Forward(token.ExpandDims(0), caches)
 		logits := r.Model.Unembed(fwd)
 		logits = logits.Slice(mlx.Slice(), mlx.Slice(logits.Dim(1)-1), mlx.Slice()).Squeeze(1)

-		logprobs := logits.Subtract(logits.Logsumexp(true))
-		sample := request.Sampler.Sample(logprobs)
+		sample := request.Sampler.Sample(logits)

-		mlx.Pin(sample, logprobs)
+		mlx.Pin(sample)
 		mlx.Sweep()
-		mlx.AsyncEval(sample, logprobs)
+		mlx.AsyncEval(sample)

-		return sample, logprobs
+		return sample
 	}

-	sample, logprobs = step(mlx.FromValues(tokens[processed:], total-processed))
+	sample = step(mlx.FromValues(tokens[processed:], total-processed))

 	var b bytes.Buffer

@@ -170,7 +160,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		}

 		request.Sampler.AppendToken(sample)
-		nextSample, nextLogprobs = step(sample)
+		nextSample = step(sample)

 		if i == 0 {
 			mlx.Eval(sample)
@@ -195,9 +185,9 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		}:
 		}

-		mlx.Unpin(sample, logprobs)
-		sample, logprobs = nextSample, nextLogprobs
-		nextSample, nextLogprobs = nil, nil
+		mlx.Unpin(sample)
+		sample = nextSample
+		nextSample = nil

 		if i%256 == 0 {
 			mlx.ClearCache()
--- a/x/mlxrunner/runner.go
+++ b/x/mlxrunner/runner.go
@@ -31,13 +31,15 @@ type Request struct {
 type TextCompletionsRequest struct {
 	Prompt  string `json:"prompt"`
 	Options struct {
-		Temperature     float32 `json:"temperature"`
-		TopP            float32 `json:"top_p"`
-		MinP            float32 `json:"min_p"`
-		TopK            int     `json:"top_k"`
-		RepeatLastN     int     `json:"repeat_last_n"`
-		PresencePenalty float32 `json:"presence_penalty"`
-		MaxTokens       int     `json:"max_tokens"`
+		Temperature      float32 `json:"temperature"`
+		TopP             float32 `json:"top_p"`
+		MinP             float32 `json:"min_p"`
+		TopK             int     `json:"top_k"`
+		RepeatLastN      int     `json:"repeat_last_n"`
+		RepeatPenalty    float32 `json:"repeat_penalty"`
+		PresencePenalty  float32 `json:"presence_penalty"`
+		FrequencyPenalty float32 `json:"frequency_penalty"`
+		MaxTokens        int     `json:"max_tokens"`

 		// Deprecated: use MaxTokens instead
 		NumPredict int `json:"num_predict"`
@@ -79,6 +81,8 @@ func (r *Runner) Load(modelName string) error {
 	r.Model = m
 	r.Tokenizer = m.Tokenizer()
 	r.contextLength = m.MaxContextLength()
+
+	mlx.EnableCompile()
 	return nil
 }

--- a/x/mlxrunner/sample/sample.go
+++ b/x/mlxrunner/sample/sample.go
@@ -9,30 +9,38 @@ import (
 type Transform func(*Sampler, *mlx.Array) *mlx.Array

 type Sampler struct {
-	Temperature     float32
-	TopP            float32
-	MinP            float32
-	TopK            int
-	RepeatLastN     int
-	PresencePenalty float32
+	Temperature      float32
+	TopP             float32
+	MinP             float32
+	TopK             int
+	RepeatLastN      int
+	RepeatPenalty    float32
+	PresencePenalty  float32
+	FrequencyPenalty float32

 	history    *mlx.Array
 	historyLen int
 	transforms []Transform
 }

-func New(temp, top_p, min_p float32, top_k, repeatLastN int, presencePenalty float32) *Sampler {
+func New(temp, top_p, min_p float32, top_k, repeatLastN int, repeatPenalty, presencePenalty, frequencyPenalty float32) *Sampler {
+	if repeatPenalty <= 0 {
+		repeatPenalty = 1
+	}
+
 	s := &Sampler{
-		Temperature:     temp,
-		TopP:            top_p,
-		MinP:            min_p,
-		TopK:            top_k,
-		RepeatLastN:     repeatLastN,
-		PresencePenalty: presencePenalty,
+		Temperature:      temp,
+		TopP:             top_p,
+		MinP:             min_p,
+		TopK:             top_k,
+		RepeatLastN:      repeatLastN,
+		RepeatPenalty:    repeatPenalty,
+		PresencePenalty:  presencePenalty,
+		FrequencyPenalty: frequencyPenalty,
 	}

 	var transforms []Transform
-	if presencePenalty != 0 {
+	if s.usesHistory() {
 		transforms = append(transforms, penalty)
 	}

@@ -59,7 +67,7 @@ func New(temp, top_p, min_p float32, top_k, repeatLastN int, presencePenalty flo
 }

 func (s *Sampler) usesHistory() bool {
-	return s.PresencePenalty != 0
+	return s.RepeatPenalty != 1 || s.PresencePenalty != 0 || s.FrequencyPenalty != 0
 }

 func (s *Sampler) setHistory(history *mlx.Array, historyLen int) {
@@ -130,60 +138,78 @@ func temperature(s *Sampler, logits *mlx.Array) *mlx.Array {
 	return mlx.DivScalar(logits, s.Temperature).Categorical(-1)
 }

-func topP(s *Sampler, logprobs *mlx.Array) *mlx.Array {
+func topP(s *Sampler, logits *mlx.Array) *mlx.Array {
 	if s.TopP <= 0 || s.TopP >= 1 {
-		return logprobs
+		return logits
 	}

-	order := logprobs.Negative().ArgsortAxis(-1)
-	sortedLogprobs := logprobs.TakeAlongAxis(order, -1)
-	sortedProbs := mlx.SoftmaxAxis(sortedLogprobs, -1, true)
+	order := logits.Negative().ArgsortAxis(-1)
+	sortedLogits := logits.TakeAlongAxis(order, -1)
+	sortedProbs := mlx.SoftmaxAxis(sortedLogits, -1, true)
 	prevCumProbs := sortedProbs.Cumsum(-1, false, true).Subtract(sortedProbs)
 	keep := prevCumProbs.Less(mlx.FromValue(s.TopP))
-	filtered := mlx.Where(keep, sortedLogprobs, mlx.FromValue(float32(math.Inf(-1))))
-	return logprobs.PutAlongAxis(order, filtered, -1)
+	filtered := mlx.Where(keep, sortedLogits, mlx.FromValue(float32(math.Inf(-1))))
+	return logits.PutAlongAxis(order, filtered, -1)
 }

-func minP(s *Sampler, logprobs *mlx.Array) *mlx.Array {
+func minP(s *Sampler, logits *mlx.Array) *mlx.Array {
 	if s.MinP <= 0 || s.MinP > 1 {
-		return logprobs
+		return logits
 	}

-	maxLogprobs := logprobs.TakeAlongAxis(logprobs.Argmax(-1, true), -1)
-	minLogprobs := mlx.AddScalar(maxLogprobs, float32(math.Log(float64(s.MinP))))
+	maxLogits := logits.TakeAlongAxis(logits.Argmax(-1, true), -1)
+	minLogits := mlx.AddScalar(maxLogits, float32(math.Log(float64(s.MinP))))

 	return mlx.Where(
-		logprobs.Less(minLogprobs),
+		logits.Less(minLogits),
 		mlx.FromValue(float32(math.Inf(-1))),
-		logprobs,
+		logits,
 	)
 }

-func topK(s *Sampler, logprobs *mlx.Array) *mlx.Array {
+func topK(s *Sampler, logits *mlx.Array) *mlx.Array {
 	if s.TopK <= 0 {
-		return logprobs
+		return logits
 	}

-	vocab := logprobs.Dim(logprobs.NumDims() - 1)
+	vocab := logits.Dim(logits.NumDims() - 1)
 	if s.TopK >= vocab {
-		return logprobs
+		return logits
 	}

-	mask := logprobs.Negative().ArgpartitionAxis(s.TopK-1, -1).Slice(mlx.Slice(), mlx.Slice(s.TopK, mlx.End))
-	return logprobs.PutAlongAxis(mask, mlx.FromValue(float32(math.Inf(-1))), -1)
+	mask := logits.Negative().ArgpartitionAxis(s.TopK-1, -1).Slice(mlx.Slice(), mlx.Slice(s.TopK, mlx.End))
+	return logits.PutAlongAxis(mask, mlx.FromValue(float32(math.Inf(-1))), -1)
 }

-func penalty(s *Sampler, logprobs *mlx.Array) *mlx.Array {
-	if s.history == nil || s.historyLen == 0 || s.PresencePenalty == 0 {
-		return logprobs
+func penalty(s *Sampler, logits *mlx.Array) *mlx.Array {
+	if s.historyLen == 0 {
+		return logits
 	}

 	tokenIndices := s.history
-	if logprobs.NumDims() > 1 {
+	if logits.NumDims() > 1 {
 		tokenIndices = tokenIndices.ExpandDims(0)
 	}

-	selected := logprobs.TakeAlongAxis(tokenIndices, -1)
-	adjusted := mlx.AddScalar(selected, -s.PresencePenalty)
-	return logprobs.PutAlongAxis(tokenIndices, adjusted, -1)
+	if s.RepeatPenalty != 1 || s.PresencePenalty != 0 {
+		adjusted := logits.TakeAlongAxis(tokenIndices, -1)
+		if s.RepeatPenalty != 1 {
+			factor := mlx.Where(
+				adjusted.Less(mlx.FromValue(float32(0))),
+				mlx.FromValue(s.RepeatPenalty),
+				mlx.FromValue(1/s.RepeatPenalty),
+			)
+			adjusted = adjusted.Multiply(factor)
+		}
+		if s.PresencePenalty != 0 {
+			adjusted = mlx.AddScalar(adjusted, -s.PresencePenalty)
+		}
+		logits = logits.PutAlongAxis(tokenIndices, adjusted, -1)
+	}
+
+	if s.FrequencyPenalty != 0 {
+		logits = logits.ScatterAddAxis(tokenIndices, mlx.FromValue(-s.FrequencyPenalty), -1)
+	}
+
+	return logits
 }
--- a/x/mlxrunner/sample/sample_test.go
+++ b/x/mlxrunner/sample/sample_test.go
@@ -11,7 +11,7 @@ import (

 func TestPresencePenaltyUsesAppendedTokenImmediately(t *testing.T) {
 	// RepeatLastN = 1, PresencePenalty = 6
-	s := New(0, 0, 0, 0, 1, 6)
+	s := New(0, 0, 0, 0, 1, 1, 6, 0)
 	defer func() {
 		s.Free()
 		mlx.Sweep()
@@ -20,11 +20,11 @@ func TestPresencePenaltyUsesAppendedTokenImmediately(t *testing.T) {
 	s.ResetHistory([]int32{0})
 	s.AppendToken(mlx.NewArrayInt32([]int32{1}, []int32{1}))

-	logprobs := mlx.FromValues([]float32{0, 5, 4}, 3)
-	got := s.Sample(logprobs)
+	logits := mlx.FromValues([]float32{0, 5, 4}, 3)
+	got := s.Sample(logits)
 	mlx.Eval(got)

-	// logprobs will be [0, -1, 4] after the penalty
+	// logits will be [0, -1, 4] after the penalty
 	// and then (index) 2 after the greedy sampler
 	gotInt := got.Int()
 	if gotInt != 2 {
@@ -32,19 +32,59 @@ func TestPresencePenaltyUsesAppendedTokenImmediately(t *testing.T) {
 	}
 }

-func TestMinPMasksTokensBelowThreshold(t *testing.T) {
-	s := New(0, 0, 0.5, 0, 0, 0)
+func TestRepeatPenaltyUsesHistoryWithoutPresencePenalty(t *testing.T) {
+	s := New(0, 0, 0, 0, 1, 2, 0, 0)
 	defer func() {
 		s.Free()
 		mlx.Sweep()
 	}()

-	logprobs := mlx.FromValues([]float32{
+	s.ResetHistory([]int32{1})
+
+	logits := mlx.FromValues([]float32{0, 5, 4}, 3)
+	got := s.Sample(logits)
+	mlx.Eval(got)
+
+	// token 1 is repeated and positive, so 5 / 2 falls below token 2.
+	gotInt := got.Int()
+	if gotInt != 2 {
+		t.Fatalf("got %d, want 2", gotInt)
+	}
+}
+
+func TestFrequencyPenaltyUsesTokenCounts(t *testing.T) {
+	s := New(0, 0, 0, 0, 4, 1, 0, 2)
+	defer func() {
+		s.Free()
+		mlx.Sweep()
+	}()
+
+	s.ResetHistory([]int32{1, 1})
+
+	logits := mlx.FromValues([]float32{0, 5, 4}, 3)
+	got := s.Sample(logits)
+	mlx.Eval(got)
+
+	// token 1 appears twice, so 5 - (2 * 2) falls below token 2.
+	gotInt := got.Int()
+	if gotInt != 2 {
+		t.Fatalf("got %d, want 2", gotInt)
+	}
+}
+
+func TestMinPMasksTokensBelowThreshold(t *testing.T) {
+	s := New(0, 0, 0.5, 0, 0, 1, 0, 0)
+	defer func() {
+		s.Free()
+		mlx.Sweep()
+	}()
+
+	logits := mlx.FromValues([]float32{
 		float32(math.Log(0.5)),
 		float32(math.Log(0.3)),
 		float32(math.Log(0.2)),
 	}, 3)
-	got := minP(s, logprobs)
+	got := minP(s, logits)
 	mlx.Eval(got)

 	gotFloats := got.Floats()
--- a/x/mlxrunner/server.go
+++ b/x/mlxrunner/server.go
@@ -102,7 +102,9 @@ func Execute(args []string) error {
 			request.Options.MinP,
 			request.Options.TopK,
 			request.Options.RepeatLastN,
+			request.Options.RepeatPenalty,
 			request.Options.PresencePenalty,
+			request.Options.FrequencyPenalty,
 		)

 		var cancel context.CancelFunc
--- a/x/models/gemma4/gemma4.go
+++ b/x/models/gemma4/gemma4.go
@@ -80,7 +80,6 @@ type TextConfig struct {
 	PLEProjScale    float32 `json:"-"` // 1/sqrt(hidden_size)
 	PLECombineScale float32 `json:"-"` // 2^(-0.5) = 0.7071...
 	RouterScale     float32 `json:"-"` // 1/sqrt(hidden_size)
-	SoftcapInv      float32 `json:"-"` // 1/final_logit_softcapping

 	// KV sharing: maps shared layer index -> donor layer index.
 	KVShareMap map[int32]int32 `json:"-"`
@@ -455,9 +454,6 @@ func parseTextConfig(configData []byte) (TextConfig, error) {
 		cfg.PLECombineScale = float32(math.Pow(2.0, -0.5))
 	}
 	cfg.RouterScale = float32(1.0 / math.Sqrt(float64(cfg.HiddenSize)))
-	if cfg.FinalLogitSoftcapping > 0 {
-		cfg.SoftcapInv = 1.0 / cfg.FinalLogitSoftcapping
-	}

 	// Compute KV sharing map.
 	cfg.KVShareMap = make(map[int32]int32)
@@ -1065,14 +1061,12 @@ func (m *Model) Forward(tokens *mlx.Array, caches []cache.Cache) *mlx.Array {
 			}
 		}

-		h = layer.Forward(h, c, B, L, m.TextConfig, pleInput, donorEntry, smc)
+		var donorKV *sharedKVEntry
+		h, donorKV = layer.Forward(h, c, B, L, m.TextConfig, pleInput, donorEntry, smc)

 		// If this layer is a donor, store its cached KV for later shared layers.
-		if layer.IsDonor && c != nil {
-			state := c.State()
-			if len(state) >= 2 && state[0] != nil && state[1] != nil {
-				sharedKV[layer.LayerIdx] = sharedKVEntry{K: state[0], V: state[1], Offset: c.Offset()}
-			}
+		if layer.IsDonor && donorKV != nil {
+			sharedKV[layer.LayerIdx] = *donorKV
 		}
 	}

@@ -1114,9 +1108,8 @@ func (m *Model) Unembed(x *mlx.Array) *mlx.Array {
 	logits := m.LMHead.Forward(x)

 	if m.FinalLogitSoftcapping > 0 {
-		logits = mlx.MulScalar(logits, m.SoftcapInv)
-		logits = logits.Tanh()
-		logits = mlx.MulScalar(logits, m.FinalLogitSoftcapping)
+		cap := mlx.FromValue(m.FinalLogitSoftcapping).AsType(logits.DType())
+		logits = mlx.LogitSoftcap(logits, cap)
 	}

 	return logits
@@ -1195,9 +1188,9 @@ func sliceLayerDim(combined *mlx.Array, layerIdx, B, L, pleDim int32) *mlx.Array
 	return mlx.Squeeze(sliced, 2)
 }

-func (l *DecoderLayer) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *TextConfig, pleInput *mlx.Array, donorEntry *sharedKVEntry, slidingMaskCache *slidingMaskCache) *mlx.Array {
+func (l *DecoderLayer) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *TextConfig, pleInput *mlx.Array, donorEntry *sharedKVEntry, slidingMaskCache *slidingMaskCache) (*mlx.Array, *sharedKVEntry) {
 	normed := mlx.RMSNormFn(x, l.InputNormScaled, cfg.RMSNormEps)
-	attnOut := l.Attention.Forward(normed, c, B, L, l.IsSliding, cfg, donorEntry, slidingMaskCache)
+	attnOut, donorKV := l.Attention.Forward(normed, c, B, L, l.IsSliding, cfg, donorEntry, slidingMaskCache)
 	attnOut = mlx.RMSNormFn(attnOut, l.PostAttnNormScaled, cfg.RMSNormEps)
 	h := mlx.Add(x, attnOut)

@@ -1231,8 +1224,7 @@ func (l *DecoderLayer) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Tex
 	// PLE injection (after MLP residual).
 	if l.PLE != nil && pleInput != nil {
 		residual := h
-		gate := mlx.GELUApprox(l.PLE.InputGate.Forward(h))
-		gated := mlx.Mul(gate, pleInput)
+		gated := mlx.GeGLU(l.PLE.InputGate.Forward(h), pleInput)
 		projected := l.PLE.Projection.Forward(gated)
 		projected = mlx.RMSNormFn(projected, l.PLE.PostNormScaled, cfg.RMSNormEps)
 		h = mlx.Add(residual, projected)
@@ -1243,10 +1235,10 @@ func (l *DecoderLayer) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Tex
 		h = mlx.Mul(h, l.LayerScalar)
 	}

-	return h
+	return h, donorKV
 }

-func (a *Attention) Forward(x *mlx.Array, c cache.Cache, B, L int32, isSliding bool, cfg *TextConfig, donorEntry *sharedKVEntry, slidingMaskCache *slidingMaskCache) *mlx.Array {
+func (a *Attention) Forward(x *mlx.Array, c cache.Cache, B, L int32, isSliding bool, cfg *TextConfig, donorEntry *sharedKVEntry, slidingMaskCache *slidingMaskCache) (*mlx.Array, *sharedKVEntry) {
 	// Determine head dim and scale based on layer type.
 	headDim := cfg.HeadDim
 	scale := cfg.SlidingScale
@@ -1280,6 +1272,7 @@ func (a *Attention) Forward(x *mlx.Array, c cache.Cache, B, L int32, isSliding b
 	q = mlx.RoPEWithFreqs(q, ropeDims, false, ropeBase, 1.0, offset, ropeFreqs)

 	var k, v *mlx.Array
+	var donorKV *sharedKVEntry

 	if donorEntry != nil {
 		// Shared layer: use donor's cached K/V.
@@ -1318,6 +1311,7 @@ func (a *Attention) Forward(x *mlx.Array, c cache.Cache, B, L int32, isSliding b
 		// Update cache.
 		if c != nil {
 			k, v = c.Update(k, v)
+			donorKV = &sharedKVEntry{K: k, V: v, Offset: c.Offset()}
 		}
 	}

@@ -1371,13 +1365,13 @@ func (a *Attention) Forward(x *mlx.Array, c cache.Cache, B, L int32, isSliding b
 		// strided views differently. Metal handles them natively.
 		out = mlx.Contiguous(out, false)
 	}
-	return a.OProj.Forward(out)
+	return a.OProj.Forward(out), donorKV
 }

 func (m *MLP) Forward(x *mlx.Array) *mlx.Array {
-	gate := mlx.GELUApprox(m.GateProj.Forward(x))
+	gate := m.GateProj.Forward(x)
 	up := m.UpProj.Forward(x)
-	return m.DownProj.Forward(mlx.Mul(gate, up))
+	return m.DownProj.Forward(mlx.GeGLU(gate, up))
 }

 // Forward runs the router to select top-k experts per token.
@@ -1457,13 +1451,13 @@ func (m *MoEBlock) Forward(x *mlx.Array, scores, inds *mlx.Array, cfg *TextConfi
 			up := mlx.SliceStartStop(gateUp,
 				[]int32{0, 0, 0, mid},
 				[]int32{int32(guDims[0]), int32(guDims[1]), int32(guDims[2]), int32(guDims[len(guDims)-1])})
-			hidden = mlx.Mul(mlx.GELUApprox(gate), up)
+			hidden = mlx.GeGLU(gate, up)
 		} else {
 			gate := mlx.GatherQMM(xFlat, m.GateWeightQ, m.GateScales, m.GateBiases,
 				nil, idxFlat, true, m.GateGroupSize, m.GateBits, m.QuantMode, doSort)
 			up := mlx.GatherQMM(xFlat, m.UpWeightQ, m.UpScales, m.UpBiases,
 				nil, idxFlat, true, m.UpGroupSize, m.UpBits, m.QuantMode, doSort)
-			hidden = mlx.Mul(mlx.GELUApprox(gate), up)
+			hidden = mlx.GeGLU(gate, up)
 		}
 		downMode := m.DownQuantMode
 		if downMode == "" {
@@ -1482,11 +1476,11 @@ func (m *MoEBlock) Forward(x *mlx.Array, scores, inds *mlx.Array, cfg *TextConfi
 			up := mlx.SliceStartStop(gateUp,
 				[]int32{0, 0, 0, mid},
 				[]int32{int32(guDims[0]), int32(guDims[1]), int32(guDims[2]), int32(guDims[len(guDims)-1])})
-			hidden = mlx.Mul(mlx.GELUApprox(gate), up)
+			hidden = mlx.GeGLU(gate, up)
 		} else {
 			gate := mlx.GatherMM(xFlat, m.GateWeight, nil, idxFlat, doSort)
 			up := mlx.GatherMM(xFlat, m.UpWeight, nil, idxFlat, doSort)
-			hidden = mlx.Mul(mlx.GELUApprox(gate), up)
+			hidden = mlx.GeGLU(gate, up)
 		}
 		down = mlx.GatherMM(hidden, m.DownWeight, nil, idxFlat, doSort)
 	}
--- a/x/models/glm4_moe_lite/glm4_moe_lite.go
+++ b/x/models/glm4_moe_lite/glm4_moe_lite.go
@@ -148,9 +148,7 @@ type DenseMLP struct {

 // Forward applies the SwiGLU MLP
 func (m *DenseMLP) Forward(x *mlx.Array) *mlx.Array {
-	gate := mlx.SiLU(m.GateProj.Forward(x))
-	up := m.UpProj.Forward(x)
-	return m.DownProj.Forward(mlx.Mul(gate, up))
+	return m.DownProj.Forward(mlx.SwiGLU(m.GateProj.Forward(x), m.UpProj.Forward(x)))
 }

 // MoEGate implements the expert gating mechanism
@@ -242,7 +240,7 @@ func (s *SwitchMLP) Forward(x *mlx.Array, indices *mlx.Array, cfg *Config) *mlx.
 		up = mlx.GatherQMM(xFlat, s.UpWeightQ, s.UpScales, s.UpBiases,
 			nil, idxFlat, true, s.UpGroupSize, s.UpBits, cfg.QuantMode, doSort)

-		hidden = mlx.Mul(mlx.SiLU(gate), up)
+		hidden = mlx.SwiGLU(gate, up)

 		down = mlx.GatherQMM(hidden, s.DownWeightQ, s.DownScales, s.DownBiases,
 			nil, idxFlat, true, s.DownGroupSize, s.DownBits, cfg.QuantMode, doSort)
@@ -250,7 +248,7 @@ func (s *SwitchMLP) Forward(x *mlx.Array, indices *mlx.Array, cfg *Config) *mlx.
 		gate = mlx.GatherMM(xFlat, mlx.Transpose(s.GateWeight, 0, 2, 1), nil, idxFlat, doSort)
 		up = mlx.GatherMM(xFlat, mlx.Transpose(s.UpWeight, 0, 2, 1), nil, idxFlat, doSort)

-		hidden = mlx.Mul(mlx.SiLU(gate), up)
+		hidden = mlx.SwiGLU(gate, up)

 		down = mlx.GatherMM(hidden, mlx.Transpose(s.DownWeight, 0, 2, 1), nil, idxFlat, doSort)
 	}
@@ -273,9 +271,7 @@ type SharedExperts struct {

 // Forward applies the shared expert MLP
 func (s *SharedExperts) Forward(x *mlx.Array) *mlx.Array {
-	gate := mlx.SiLU(s.GateProj.Forward(x))
-	up := s.UpProj.Forward(x)
-	return s.DownProj.Forward(mlx.Mul(gate, up))
+	return s.DownProj.Forward(mlx.SwiGLU(s.GateProj.Forward(x), s.UpProj.Forward(x)))
 }

 // MoE implements the full Mixture of Experts layer
--- a/x/models/llama/llama.go
+++ b/x/models/llama/llama.go
@@ -314,5 +314,5 @@ func (a *Attention) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Config
 }

 func (m *MLP) Forward(x *mlx.Array) *mlx.Array {
-	return m.DownProj.Forward(mlx.Mul(mlx.SiLU(m.GateProj.Forward(x)), m.UpProj.Forward(x)))
+	return m.DownProj.Forward(mlx.SwiGLU(m.GateProj.Forward(x), m.UpProj.Forward(x)))
 }
--- a/x/models/qwen3/qwen3.go
+++ b/x/models/qwen3/qwen3.go
@@ -333,5 +333,5 @@ func (a *Attention) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Config
 }

 func (m *MLP) Forward(x *mlx.Array) *mlx.Array {
-	return m.DownProj.Forward(mlx.Mul(mlx.SiLU(m.GateProj.Forward(x)), m.UpProj.Forward(x)))
+	return m.DownProj.Forward(mlx.SwiGLU(m.GateProj.Forward(x), m.UpProj.Forward(x)))
 }
--- a/x/models/qwen3_5/qwen3_5.go
+++ b/x/models/qwen3_5/qwen3_5.go
@@ -1253,7 +1253,7 @@ func (g *GatedDeltaNet) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Co
 }

 func (m *DenseMLP) Forward(x *mlx.Array, _ *Config) *mlx.Array {
-	return m.DownProj.Forward(mlx.Mul(mlx.SiLU(m.GateProj.Forward(x)), m.UpProj.Forward(x)))
+	return m.DownProj.Forward(mlx.SwiGLU(m.GateProj.Forward(x), m.UpProj.Forward(x)))
 }

 func (s *SwitchMLP) Forward(x *mlx.Array, indices *mlx.Array, cfg *Config) *mlx.Array {
@@ -1283,13 +1283,13 @@ func (s *SwitchMLP) Forward(x *mlx.Array, indices *mlx.Array, cfg *Config) *mlx.
 			nil, idxFlat, true, s.GateGroupSize, s.GateBits, cfg.QuantMode, doSort)
 		up = mlx.GatherQMM(xFlat, s.UpWeightQ, s.UpScales, s.UpBiases,
 			nil, idxFlat, true, s.UpGroupSize, s.UpBits, cfg.QuantMode, doSort)
-		hidden = mlx.Mul(mlx.SiLU(gate), up)
+		hidden = mlx.SwiGLU(gate, up)
 		down = mlx.GatherQMM(hidden, s.DownWeightQ, s.DownScales, s.DownBiases,
 			nil, idxFlat, true, s.DownGroupSize, s.DownBits, cfg.QuantMode, doSort)
 	} else {
 		gate = mlx.GatherMM(xFlat, s.GateWeight, nil, idxFlat, doSort)
 		up = mlx.GatherMM(xFlat, s.UpWeight, nil, idxFlat, doSort)
-		hidden = mlx.Mul(mlx.SiLU(gate), up)
+		hidden = mlx.SwiGLU(gate, up)
 		down = mlx.GatherMM(hidden, s.DownWeight, nil, idxFlat, doSort)
 	}
Author	SHA1	Message	Date
ParthSareen	1222a4effd	launch: add kimi cli integration with installer flow	2026-04-20 13:30:44 -07:00
Daniel Hiltgen	ff23dd343f	mlx: apply repeat penalties in sampler (#15631)	2026-04-18 07:49:38 -07:00
Parth Sareen	123b300af6	docs: update hermes (#15655)	2026-04-17 14:20:59 -07:00
Parth Sareen	57653b8e42	cmd/launch: show WSL guidance on Windows instead of handing off (#15637)	2026-04-16 17:18:04 -07:00
Parth Sareen	a50ce61c54	launch: skip unchanged managed-single rewrite (#15633)	2026-04-16 16:20:42 -07:00
Daniel Hiltgen	2bb7ea00d2	create: avoid gc race with create (#15628) — If you have a long-running create and start another ollama server with the same model dir, the GC algorithm deletes the pending blobs and breaks the create. This adds a 1h grace period to avoid deleting in-flight creation operations.	2026-04-16 13:29:16 -07:00
Daniel Hiltgen	55fa80d07a	mlx: additional gemma4 cache fixes (#15607) — Harden additional corner cases	2026-04-16 13:07:19 -07:00
Daniel Hiltgen	b9cb535407	mlx: fix gemma4 cache to use logical view (#15617)	2026-04-16 11:54:30 -07:00
Daniel Hiltgen	031baef094	mlx: fix imagegen lookup (#15588) — * mlx: fix imagegen lookup: Fixes #15533 - imagegen had fallen out of sync with the new layout for multiple mlx libraries on Metal. * review comments	2026-04-16 10:39:00 -07:00
Mike Wallio	7d271e6dc9	cmd/launch: add Copilot CLI integration (#15583 ) --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: ParthSareen <parth.sareen@ollama.com>	2026-04-15 17:22:53 -07:00
Devon Rifkin	c88dae2d6b	Merge pull request #15612 from ollama/drifkin/gemma4-split-templates — gemma4: render differently based on model size	2026-04-15 17:15:35 -07:00
Devon Rifkin	9e3618d663	make empty block conditional	2026-04-15 15:35:25 -07:00
Daniel Hiltgen	5d920cc6bc	Keep Gemma4 router projection in source precision (#15613)	2026-04-15 15:04:23 -07:00
Devon Rifkin	e585ecd11f	gemma4: render differently based on model size — Following up on #15560, this change now has e2b/e4b render differently from 26b/31b. For backwards compatibility, we take the existing renderer name `gemma4` and make it do dynamic resolution based on the model name/size, but the intended use is for the models to be republished with the renderer variant specified explicitly: `gemma4-small` or `gemma4-large`.	2026-04-15 14:37:16 -07:00
Eva H	cdddea0592	launch: always list cloud recommendations first (#15593)	2026-04-15 13:17:35 -07:00
Parth Sareen	43f90def04	launch: add hermes (#15569)	2026-04-15 12:00:23 -07:00
Daniel Hiltgen	06ae6367bd	mlx: fix RotatingKVCache.concat() dropping context on mid-rotation (#15591) — After the rotating buffer has wrapped (c.offset > c.maxSize) a subsequent L>1 Update() went through a slice-to-[0, c.idx) path that discarded all slots in [c.idx, Dim), losing the older-but-still-in-window tokens the first Q of the new batch needs for its sliding-window attention. Linearize the circular buffer to logical order in that wrapped case so the existing trim + concat preserves the last (maxSize - 1) old tokens. When the buffer has not yet wrapped (c.offset <= c.maxSize), slots [c.idx, Dim) are grow padding or stale post-rewind data, so keep dropping them.	2026-04-14 18:29:06 -07:00
Daniel Hiltgen	48ad7085c4	mlx: Improve gemma4 performance with fused operations (#15587) — * mlx: Improve gemma4 performance with fused operations * review comments	2026-04-14 18:04:04 -07:00
Jesse Gross	e1e3cec8d0	models: fuse MLP activation functions via mlx_compile — Converts SiLU/GELUApprox to compiled kernels and adds SwiGLU, matching upstream mlx/mlx_lm's activations pattern. Routes llama, qwen3, qwen3_5 (dense + MoE), and glm4_moe_lite MLP paths through mlx.SwiGLU so each MLP invocation runs as one fused Metal/CUDA kernel rather than a chain of per-op launches.	2026-04-14 16:38:32 -07:00
Jesse Gross	d3e67e305c	mlx: add compiled closure support — Wraps MLX's mlx_compile API so Go functions can be traced into fused kernels. Contiguous elementwise chains collapse into a single Metal/CUDA kernel instead of launching one per op. Exposes Compile plus arity helpers (Compile1/2/3) that mirror Python's @mx.compile decorator shape, lazily building the closure on first call so package-level declarations work before the MLX dylib loads.	2026-04-14 16:38:32 -07:00