openai: map responses reasoning effort to think (#15789)

api: accept "max" as a think value (#15787)
launch: harden OpenClaw onboarding flow (#15777)
2026-04-26 02:36:09 +02:00 · 2026-04-24 02:49:36 -07:00 · 2026-04-24 01:49:39 -07:00 · 2026-04-23 16:47:20 -07:00 · 2026-04-23 19:32:36 -04:00 · 2026-04-23 16:33:00 -04:00
32 changed files with 1603 additions and 621 deletions
--- a/api/types.go
+++ b/api/types.go
@@ -1080,7 +1080,7 @@ func DefaultOptions() Options {
 	}
 }

-// ThinkValue represents a value that can be a boolean or a string ("high", "medium", "low")
+// ThinkValue represents a value that can be a boolean or a string ("high", "medium", "low", "max")
 type ThinkValue struct {
 	// Value can be a bool or string
 	Value interface{}
@@ -1096,7 +1096,7 @@ func (t *ThinkValue) IsValid() bool {
 	case bool:
 		return true
 	case string:
-		return v == "high" || v == "medium" || v == "low"
+		return v == "high" || v == "medium" || v == "low" || v == "max"
 	default:
 		return false
 	}
@@ -1130,8 +1130,8 @@ func (t *ThinkValue) Bool() bool {
 	case bool:
 		return v
 	case string:
-		// Any string value ("high", "medium", "low") means thinking is enabled
-		return v == "high" || v == "medium" || v == "low"
+		// Any string value ("high", "medium", "low", "max") means thinking is enabled
+		return v == "high" || v == "medium" || v == "low" || v == "max"
 	default:
 		return false
 	}
@@ -1169,14 +1169,14 @@ func (t *ThinkValue) UnmarshalJSON(data []byte) error {
 	var s string
 	if err := json.Unmarshal(data, &s); err == nil {
 		// Validate string values
-		if s != "high" && s != "medium" && s != "low" {
-			return fmt.Errorf("invalid think value: %q (must be \"high\", \"medium\", \"low\", true, or false)", s)
+		if s != "high" && s != "medium" && s != "low" && s != "max" {
+			return fmt.Errorf("invalid think value: %q (must be \"high\", \"medium\", \"low\", \"max\", true, or false)", s)
 		}
 		t.Value = s
 		return nil
 	}

-	return fmt.Errorf("think must be a boolean or string (\"high\", \"medium\", \"low\", true, or false)")
+	return fmt.Errorf("think must be a boolean or string (\"high\", \"medium\", \"low\", \"max\", true, or false)")
 }

 // MarshalJSON implements json.Marshaler
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -495,6 +495,11 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
 			input:            `{ "think": "low" }`,
 			expectedThinking: &ThinkValue{Value: "low"},
 		},
+		{
+			name:             "string_max",
+			input:            `{ "think": "max" }`,
+			expectedThinking: &ThinkValue{Value: "max"},
+		},
 		{
 			name:             "invalid_string",
 			input:            `{ "think": "invalid" }`,
--- a/app/ui/app/src/hooks/useChats.ts
+++ b/app/ui/app/src/hooks/useChats.ts
@@ -381,7 +381,7 @@ export const useSendMessage = (chatId: string) => {
                    role: "assistant",
                    content: "",
                    thinking: "",
-                    model: effectiveModel,
+                    model: effectiveModel.model,
                  }),
                );
                lastMessage = newMessages[newMessages.length - 1];
@@ -433,7 +433,7 @@ export const useSendMessage = (chatId: string) => {
                    role: "assistant",
                    content: "",
                    thinking: "",
-                    model: effectiveModel,
+                    model: effectiveModel.model,
                  }),
                );
                lastMessage = newMessages[newMessages.length - 1];
@@ -520,7 +520,7 @@ export const useSendMessage = (chatId: string) => {
                    thinkingTimeStart:
                      lastMessage.thinkingTimeStart || event.thinkingTimeStart,
                    thinkingTimeEnd: event.thinkingTimeEnd,
-                    model: selectedModel,
+                    model: selectedModel.model,
                  });
                  newMessages[newMessages.length - 1] = updatedMessage;
                } else {
@@ -533,7 +533,7 @@ export const useSendMessage = (chatId: string) => {
                      tool_calls: event.toolCalls,
                      thinkingTimeStart: event.thinkingTimeStart,
                      thinkingTimeEnd: event.thinkingTimeEnd,
-                      model: selectedModel,
+                      model: selectedModel.model,
                    }),
                  );
                }
@@ -699,7 +699,7 @@ export const useSendMessage = (chatId: string) => {
            queryClient.setQueryData(["chat", newId], {
              chat: new Chat({
                id: newId,
-                model: effectiveModel,
+                model: effectiveModel.model,
                messages: [
                  new Message({
                    role: "user",
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -582,10 +582,10 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 			opts.Think = &api.ThinkValue{Value: true}
 		case "false":
 			opts.Think = &api.ThinkValue{Value: false}
-		case "high", "medium", "low":
+		case "high", "medium", "low", "max":
 			opts.Think = &api.ThinkValue{Value: thinkStr}
 		default:
-			return fmt.Errorf("invalid value for --think: %q (must be true, false, high, medium, or low)", thinkStr)
+			return fmt.Errorf("invalid value for --think: %q (must be true, false, high, medium, low, or max)", thinkStr)
 		}
 	} else {
 		opts.Think = nil
@@ -1975,8 +1975,61 @@ func launchInteractiveModel(cmd *cobra.Command, modelName string) error {
 		Options:     map[string]any{},
 		ShowConnect: true,
 	}
-	// loadOrUnloadModel is cloud-safe here: remote/cloud models skip local preload
-	// and only validate auth/connectivity before interactive chat starts.
+
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
+	}
+
+	requestedCloud := modelref.HasExplicitCloudSource(modelName)
+
+	info, err := func() (*api.ShowResponse, error) {
+		showReq := &api.ShowRequest{Name: modelName}
+		info, err := client.Show(cmd.Context(), showReq)
+		var se api.StatusError
+		if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
+			if requestedCloud {
+				return nil, err
+			}
+			if err := PullHandler(cmd, []string{modelName}); err != nil {
+				return nil, err
+			}
+			return client.Show(cmd.Context(), &api.ShowRequest{Name: modelName})
+		}
+		return info, err
+	}()
+	if err != nil {
+		if handleCloudAuthorizationError(err) {
+			return nil
+		}
+		return err
+	}
+
+	ensureCloudStub(cmd.Context(), client, modelName)
+
+	opts.Think, err = inferThinkingOption(&info.Capabilities, &opts, false)
+	if err != nil {
+		return err
+	}
+
+	audioCapable := slices.Contains(info.Capabilities, model.CapabilityAudio)
+	opts.MultiModal = slices.Contains(info.Capabilities, model.CapabilityVision) || audioCapable
+
+	// TODO: remove the projector info and vision info checks below,
+	// these are left in for backwards compatibility with older servers
+	// that don't have the capabilities field in the model info
+	if len(info.ProjectorInfo) != 0 {
+		opts.MultiModal = true
+	}
+	for k := range info.ModelInfo {
+		if strings.Contains(k, ".vision.") {
+			opts.MultiModal = true
+			break
+		}
+	}
+
+	applyShowResponseToRunOptions(&opts, info)
+
 	if err := loadOrUnloadModel(cmd, &opts); err != nil {
 		return fmt.Errorf("error loading model: %w", err)
 	}
--- a/cmd/launch/integrations_test.go
+++ b/cmd/launch/integrations_test.go
@@ -301,7 +301,7 @@ func TestParseArgs(t *testing.T) {
 func TestIsCloudModel(t *testing.T) {
 	// isCloudModel now only uses Show API, so nil client always returns false
 	t.Run("nil client returns false", func(t *testing.T) {
-		models := []string{"glm-5.1:cloud", "kimi-k2.5:cloud", "local-model"}
+		models := []string{"glm-5.1:cloud", "kimi-k2.6:cloud", "local-model"}
 		for _, model := range models {
 			if isCloudModel(context.Background(), nil, model) {
 				t.Errorf("isCloudModel(%q) with nil client should return false", model)
@@ -318,10 +318,18 @@ func names(items []ModelItem) []string {
 	return out
 }

+func recommendedNames(extra ...string) []string {
+	out := make([]string, 0, len(recommendedModels)+len(extra))
+	for _, item := range recommendedModels {
+		out = append(out, item.Name)
+	}
+	return append(out, extra...)
+}
+
 func TestBuildModelList_NoExistingModels(t *testing.T) {
 	items, _, _, _ := buildModelList(nil, nil, "")

-	want := []string{"kimi-k2.5:cloud", "qwen3.5:cloud", "glm-5.1:cloud", "minimax-m2.7:cloud", "gemma4", "qwen3.5"}
+	want := recommendedNames()
 	if diff := cmp.Diff(want, names(items)); diff != "" {
 		t.Errorf("with no existing models, items should be recommended in order (-want +got):\n%s", diff)
 	}
@@ -350,7 +358,7 @@ func TestBuildModelList_OnlyLocalModels_CloudRecsStillFirst(t *testing.T) {

 	// Cloud recs always come first among recommended, regardless of installed inventory.
 	// Cloud disablement is handled upstream in loadSelectableModels via filterCloudItems.
-	want := []string{"kimi-k2.5:cloud", "qwen3.5:cloud", "glm-5.1:cloud", "minimax-m2.7:cloud", "gemma4", "qwen3.5", "llama3.2", "qwen2.5"}
+	want := recommendedNames("llama3.2", "qwen2.5")
 	if diff := cmp.Diff(want, got); diff != "" {
 		t.Errorf("cloud recs pinned first even when no cloud models installed (-want +got):\n%s", diff)
 	}
@@ -366,13 +374,13 @@ func TestBuildModelList_BothCloudAndLocal_RegularSort(t *testing.T) {
 	got := names(items)

 	// All recs pinned at top (cloud before local in mixed case), then non-recs
-	want := []string{"kimi-k2.5:cloud", "qwen3.5:cloud", "glm-5.1:cloud", "minimax-m2.7:cloud", "gemma4", "qwen3.5", "llama3.2"}
+	want := recommendedNames("llama3.2")
 	if diff := cmp.Diff(want, got); diff != "" {
 		t.Errorf("recs pinned at top, cloud recs first in mixed case (-want +got):\n%s", diff)
 	}
 }

-func TestBuildModelList_PreCheckedFirst(t *testing.T) {
+func TestBuildModelList_PreCheckedNonRecommendedFirstInMore(t *testing.T) {
 	existing := []modelInfo{
 		{Name: "llama3.2:latest", Remote: false},
 		{Name: "glm-5.1:cloud", Remote: true},
@@ -381,8 +389,9 @@ func TestBuildModelList_PreCheckedFirst(t *testing.T) {
 	items, _, _, _ := buildModelList(existing, []string{"llama3.2"}, "")
 	got := names(items)

-	if got[0] != "llama3.2" {
-		t.Errorf("pre-checked model should be first, got %v", got)
+	want := recommendedNames("llama3.2")
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("recommended block should stay fixed while checked non-recommended models lead More (-want +got):\n%s", diff)
 	}
 }

@@ -437,7 +446,7 @@ func TestBuildModelList_ExistingRecommendedMarked(t *testing.T) {
 			if !strings.HasSuffix(item.Description, "(not downloaded)") {
 				t.Errorf("non-installed recommended %q should have '(not downloaded)' suffix, got %q", item.Name, item.Description)
 			}
-		case "minimax-m2.7:cloud", "kimi-k2.5:cloud", "qwen3.5:cloud":
+		case "minimax-m2.7:cloud", "kimi-k2.6:cloud", "qwen3.5:cloud":
 			if strings.HasSuffix(item.Description, "(not downloaded)") {
 				t.Errorf("cloud model %q should not have '(not downloaded)' suffix, got %q", item.Name, item.Description)
 			}
@@ -455,9 +464,9 @@ func TestBuildModelList_ExistingCloudModelsNotPushedToBottom(t *testing.T) {
 	got := names(items)

 	// gemma4 and glm-5.1:cloud are installed so they sort normally;
-	// kimi-k2.5:cloud, qwen3.5:cloud, and qwen3.5 are not installed so they go to the bottom
+	// qwen3.5:cloud and qwen3.5 are not installed, but recommended models keep their pinned order
 	// All recs: cloud first in mixed case, then local, in rec order within each
-	want := []string{"kimi-k2.5:cloud", "qwen3.5:cloud", "glm-5.1:cloud", "minimax-m2.7:cloud", "gemma4", "qwen3.5"}
+	want := recommendedNames()
 	if diff := cmp.Diff(want, got); diff != "" {
 		t.Errorf("all recs, cloud first in mixed case (-want +got):\n%s", diff)
 	}
@@ -466,23 +475,23 @@ func TestBuildModelList_ExistingCloudModelsNotPushedToBottom(t *testing.T) {
 func TestBuildModelList_HasRecommendedCloudModel_OnlyNonInstalledAtBottom(t *testing.T) {
 	existing := []modelInfo{
 		{Name: "llama3.2:latest", Remote: false},
-		{Name: "kimi-k2.5:cloud", Remote: true},
+		{Name: "kimi-k2.6:cloud", Remote: true},
 	}

 	items, _, _, _ := buildModelList(existing, nil, "")
 	got := names(items)

-	// kimi-k2.5:cloud is installed so it sorts normally;
+	// kimi-k2.6:cloud is installed so it sorts normally;
 	// the rest of the recommendations are not installed so they go to the bottom
 	// All recs pinned at top (cloud first in mixed case), then non-recs
-	want := []string{"kimi-k2.5:cloud", "qwen3.5:cloud", "glm-5.1:cloud", "minimax-m2.7:cloud", "gemma4", "qwen3.5", "llama3.2"}
+	want := recommendedNames("llama3.2")
 	if diff := cmp.Diff(want, got); diff != "" {
 		t.Errorf("recs pinned at top, cloud first in mixed case (-want +got):\n%s", diff)
 	}

 	for _, item := range items {
 		isCloud := strings.HasSuffix(item.Name, ":cloud")
-		isInstalled := slices.Contains([]string{"kimi-k2.5:cloud", "llama3.2"}, item.Name)
+		isInstalled := slices.Contains([]string{"kimi-k2.6:cloud", "llama3.2"}, item.Name)
 		if isInstalled || isCloud {
 			if strings.HasSuffix(item.Description, "(not downloaded)") {
 				t.Errorf("installed or cloud model %q should not have '(not downloaded)' suffix, got %q", item.Name, item.Description)
@@ -549,8 +558,8 @@ func TestBuildModelList_ReturnsExistingAndCloudMaps(t *testing.T) {
 	if !cloudModels["glm-5.1:cloud"] {
 		t.Error("glm-5.1:cloud should be in cloudModels")
 	}
-	if !cloudModels["kimi-k2.5:cloud"] {
-		t.Error("kimi-k2.5:cloud should be in cloudModels (recommended cloud)")
+	if !cloudModels["kimi-k2.6:cloud"] {
+		t.Error("kimi-k2.6:cloud should be in cloudModels (recommended cloud)")
 	}
 	if !cloudModels["qwen3.5:cloud"] {
 		t.Error("qwen3.5:cloud should be in cloudModels (recommended cloud)")
@@ -570,7 +579,7 @@ func TestBuildModelList_RecommendedFieldSet(t *testing.T) {

 	for _, item := range items {
 		switch item.Name {
-		case "gemma4", "qwen3.5", "glm-5.1:cloud", "kimi-k2.5:cloud", "qwen3.5:cloud":
+		case "gemma4", "qwen3.5", "glm-5.1:cloud", "kimi-k2.6:cloud", "qwen3.5:cloud":
 			if !item.Recommended {
 				t.Errorf("%q should have Recommended=true", item.Name)
 			}
@@ -628,7 +637,7 @@ func TestBuildModelList_RecsAboveNonRecs(t *testing.T) {
 	lastRecIdx := -1
 	firstNonRecIdx := len(got)
 	for i, name := range got {
-		isRec := name == "gemma4" || name == "qwen3.5" || name == "minimax-m2.7:cloud" || name == "glm-5.1:cloud" || name == "kimi-k2.5:cloud" || name == "qwen3.5:cloud"
+		isRec := name == "gemma4" || name == "qwen3.5" || name == "minimax-m2.7:cloud" || name == "glm-5.1:cloud" || name == "kimi-k2.6:cloud" || name == "qwen3.5:cloud"
 		if isRec && i > lastRecIdx {
 			lastRecIdx = i
 		}
@@ -641,17 +650,32 @@ func TestBuildModelList_RecsAboveNonRecs(t *testing.T) {
 	}
 }

-func TestBuildModelList_CheckedBeforeRecs(t *testing.T) {
+func TestBuildModelList_CheckedRecommendedDoesNotReshuffleRecommendedOrder(t *testing.T) {
 	existing := []modelInfo{
 		{Name: "llama3.2:latest", Remote: false},
 		{Name: "glm-5.1:cloud", Remote: true},
 	}

-	items, _, _, _ := buildModelList(existing, []string{"llama3.2"}, "")
+	items, _, _, _ := buildModelList(existing, []string{"qwen3.5:cloud", "glm-5.1:cloud"}, "")
 	got := names(items)

-	if got[0] != "llama3.2" {
-		t.Errorf("checked model should be first even before recs, got %v", got)
+	want := recommendedNames("llama3.2")
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("checked recommended models should not reshuffle the fixed recommended order (-want +got):\n%s", diff)
+	}
+}
+
+func TestBuildModelList_StaleSavedKimiK25DoesNotReshuffleRecommendedOrder(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "kimi-k2.5:cloud", Remote: true},
+	}
+
+	items, _, _, _ := buildModelList(existing, []string{"kimi-k2.5:cloud", "qwen3.5:cloud", "glm-5.1:cloud", "minimax-m2.7:cloud"}, "kimi-k2.5:cloud")
+	got := names(items)
+
+	want := recommendedNames("kimi-k2.5:cloud")
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("stale saved kimi-k2.5 should stay in More without reshuffling the fixed recommended order (-want +got):\n%s", diff)
 	}
 }

--- a/cmd/launch/launch.go
+++ b/cmd/launch/launch.go
@@ -588,7 +588,7 @@ func (c *launcherClient) launchManagedSingleIntegration(ctx context.Context, nam
 		return nil
 	}

-	if (current == "" || needsConfigure || req.ModelOverride != "" || target != current) && !savedMatchesModels(saved, []string{target}) {
+	if needsConfigure || req.ModelOverride != "" || (current != "" && target != current) || !savedMatchesModels(saved, []string{target}) {
 		if err := prepareManagedSingleIntegration(name, runner, managed, target); err != nil {
 			return err
 		}
--- a/cmd/launch/launch_test.go
+++ b/cmd/launch/launch_test.go
@@ -13,6 +13,7 @@ import (
 	"strings"
 	"testing"

+	"github.com/google/go-cmp/cmp"
 	"github.com/ollama/ollama/cmd/config"
 )

@@ -511,6 +512,65 @@ func TestLaunchIntegration_ManagedSingleIntegrationRewritesWhenSavedDiffers(t *t
 	}
 }

+func TestLaunchIntegration_ManagedSingleIntegrationRewritesWhenLiveConfigDrifts(t *testing.T) {
+	tmpDir := t.TempDir()
+	setLaunchTestHome(t, tmpDir)
+	withInteractiveSession(t, true)
+	withLauncherHooks(t)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/tags":
+			fmt.Fprint(w, `{"models":[{"name":"gemma4"},{"name":"qwen3:8b"}]}`)
+		case "/api/show":
+			fmt.Fprint(w, `{"model_info":{"general.context_length":131072}}`)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+	t.Setenv("OLLAMA_HOST", srv.URL)
+
+	if err := config.SaveIntegration("stubmanaged", []string{"gemma4"}); err != nil {
+		t.Fatalf("failed to save managed integration config: %v", err)
+	}
+
+	runner := &launcherManagedRunner{
+		currentModel: "qwen3:8b",
+	}
+	withIntegrationOverride(t, "stubmanaged", runner)
+
+	DefaultSingleSelector = func(title string, items []ModelItem, current string) (string, error) {
+		t.Fatal("selector should not be called when live config already provides the target")
+		return "", nil
+	}
+	DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
+		return true, nil
+	}
+
+	if err := LaunchIntegration(context.Background(), IntegrationLaunchRequest{Name: "stubmanaged"}); err != nil {
+		t.Fatalf("LaunchIntegration returned error: %v", err)
+	}
+
+	if diff := compareStrings(runner.configured, []string{"qwen3:8b"}); diff != "" {
+		t.Fatalf("expected Configure to reconcile stale saved config to live target: %s", diff)
+	}
+	if runner.refreshCalls != 1 {
+		t.Fatalf("expected runtime refresh once after drift reconciliation, got %d", runner.refreshCalls)
+	}
+	if runner.ranModel != "qwen3:8b" {
+		t.Fatalf("expected launch to run live configured model, got %q", runner.ranModel)
+	}
+
+	saved, err := config.LoadIntegration("stubmanaged")
+	if err != nil {
+		t.Fatalf("failed to reload managed integration config: %v", err)
+	}
+	if diff := compareStrings(saved.Models, []string{"qwen3:8b"}); diff != "" {
+		t.Fatalf("saved models mismatch after drift reconciliation: %s", diff)
+	}
+}
+
 func TestLaunchIntegration_ManagedSingleIntegrationStopsWhenRuntimeRefreshFails(t *testing.T) {
 	tmpDir := t.TempDir()
 	setLaunchTestHome(t, tmpDir)
@@ -1219,8 +1279,9 @@ func TestLaunchIntegration_EditorForceConfigure_FloatsCheckedModelsInPicker(t *t
 	if len(gotItems) == 0 {
 		t.Fatal("expected multi selector to receive items")
 	}
-	if gotItems[0] != "qwen3.5:cloud" {
-		t.Fatalf("expected checked models floated to top with qwen3.5:cloud first, got %v", gotItems)
+	wantItems := recommendedNames()
+	if diff := cmp.Diff(wantItems, gotItems); diff != "" {
+		t.Fatalf("expected fixed recommended order in selector items (-want +got):\n%s", diff)
 	}
 	if len(gotPreChecked) < 2 {
 		t.Fatalf("expected prechecked models to be preserved, got %v", gotPreChecked)
--- a/cmd/launch/models.go
+++ b/cmd/launch/models.go
@@ -21,7 +21,7 @@ import (
 )

 var recommendedModels = []ModelItem{
-	{Name: "kimi-k2.5:cloud", Description: "Multimodal reasoning with subagents", Recommended: true},
+	{Name: "kimi-k2.6:cloud", Description: "State-of-the-art coding, long-horizon execution, and multimodal agent swarm capability", Recommended: true},
 	{Name: "qwen3.5:cloud", Description: "Reasoning, coding, and agentic tool use with vision", Recommended: true},
 	{Name: "glm-5.1:cloud", Description: "Reasoning and code generation", Recommended: true},
 	{Name: "minimax-m2.7:cloud", Description: "Fast, efficient coding and real-world productivity", Recommended: true},
@@ -56,6 +56,7 @@ var cloudModelLimits = map[string]cloudModelLimit{
 	"gpt-oss:20b":         {Context: 131_072, Output: 131_072},
 	"kimi-k2:1t":          {Context: 262_144, Output: 262_144},
 	"kimi-k2.5":           {Context: 262_144, Output: 262_144},
+	"kimi-k2.6":           {Context: 262_144, Output: 262_144},
 	"kimi-k2-thinking":    {Context: 262_144, Output: 262_144},
 	"nemotron-3-nano:30b": {Context: 1_048_576, Output: 131_072},
 	"qwen3-coder:480b":    {Context: 262_144, Output: 65_536},
@@ -360,18 +361,12 @@ func buildModelList(existing []modelInfo, preChecked []string, current string) (
 	}

 	if hasLocalModel || hasCloudModel {
+		// Keep the Recommended section pinned to recommendedModels order. Checked
+		// and default-model priority only apply within the More section.
 		slices.SortStableFunc(items, func(a, b ModelItem) int {
 			ac, bc := checked[a.Name], checked[b.Name]
 			aNew, bNew := notInstalled[a.Name], notInstalled[b.Name]
 			aRec, bRec := recRank[a.Name] > 0, recRank[b.Name] > 0
-			aCloud, bCloud := cloudModels[a.Name], cloudModels[b.Name]
-
-			if ac != bc {
-				if ac {
-					return -1
-				}
-				return 1
-			}
 			if aRec != bRec {
 				if aRec {
 					return -1
@@ -379,14 +374,14 @@ func buildModelList(existing []modelInfo, preChecked []string, current string) (
 				return 1
 			}
 			if aRec && bRec {
-				if aCloud != bCloud {
-					if aCloud {
-						return -1
-					}
-					return 1
-				}
 				return recRank[a.Name] - recRank[b.Name]
 			}
+			if ac != bc {
+				if ac {
+					return -1
+				}
+				return 1
+			}
 			// Among checked non-recommended items - put the default first
 			if ac && !aRec && current != "" {
 				aCurrent := a.Name == current
--- a/cmd/launch/openclaw.go
+++ b/cmd/launch/openclaw.go
@@ -14,8 +14,6 @@ import (
 	"strings"
 	"time"

-	"golang.org/x/mod/semver"
-
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/cmd/internal/fileutil"
 	"github.com/ollama/ollama/envconfig"
@@ -30,6 +28,8 @@ var openclawModelShowTimeout = 5 * time.Second
 // openclawFreshInstall is set to true when ensureOpenclawInstalled performs an install
 var openclawFreshInstall bool

+var openclawCanInstallDaemon = canInstallDaemon
+
 type Openclaw struct{}

 func (c *Openclaw) String() string { return "OpenClaw" }
@@ -60,6 +60,7 @@ func (c *Openclaw) Run(model string, args []string) error {
 		// the newest wizard flags (e.g. --auth-choice ollama).
 		if !openclawFreshInstall {
 			update := exec.Command(bin, "update")
+			update.Env = openclawInstallEnv()
 			update.Stdout = os.Stdout
 			update.Stderr = os.Stderr
 			_ = update.Run() // best-effort; continue even if update fails
@@ -75,19 +76,18 @@ func (c *Openclaw) Run(model string, args []string) error {
 			"--auth-choice", "ollama",
 			"--custom-base-url", envconfig.Host().String(),
 			"--custom-model-id", model,
+			// Launch owns the first real gateway startup immediately after onboarding,
+			// so don't let OpenClaw fail the whole first-run flow on a transient
+			// daemon health probe.
+			"--skip-health",
 			"--skip-channels",
 			"--skip-skills",
 		}
-		if canInstallDaemon() {
+		if openclawCanInstallDaemon() {
 			onboardArgs = append(onboardArgs, "--install-daemon")
-		} else {
-			// When we can't install a daemon (e.g. no systemd, sudo dropped
-			// XDG_RUNTIME_DIR, or container environment), skip the gateway
-			// health check so non-interactive onboarding completes. The
-			// gateway is started as a foreground child process after onboarding.
-			onboardArgs = append(onboardArgs, "--skip-health")
 		}
 		cmd := exec.Command(bin, onboardArgs...)
+		cmd.Env = openclawInstallEnv()
 		cmd.Stdin = os.Stdin
 		cmd.Stdout = os.Stdout
 		cmd.Stderr = os.Stderr
@@ -98,13 +98,23 @@ func (c *Openclaw) Run(model string, args []string) error {
 		patchDeviceScopes()
 	}

-	if ensureWebSearchPlugin() {
-		registerWebSearchPlugin()
-	}
+	configureOllamaWebSearch()

 	// When extra args are passed through, run exactly what the user asked for
 	// after setup and skip the built-in gateway+TUI convenience flow.
 	if len(args) > 0 {
+		cleanup := func() {}
+		if shouldEnsureGatewayForArgs(args) {
+			cleanupFn, _, _, err := c.ensureGatewayReady(bin)
+			if err != nil {
+				return windowsHint(err)
+			}
+			if cleanupFn != nil {
+				cleanup = cleanupFn
+			}
+		}
+		defer cleanup()
+
 		cmd := exec.Command(bin, args...)
 		cmd.Env = openclawEnv()
 		cmd.Stdin = os.Stdin
@@ -125,41 +135,11 @@ func (c *Openclaw) Run(model string, args []string) error {

 	fmt.Fprintf(os.Stderr, "\n%sStarting your assistant — this may take a moment...%s\n\n", ansiGray, ansiReset)

-	token, port := c.gatewayInfo()
-	addr := fmt.Sprintf("localhost:%d", port)
-
-	// If the gateway is already running (e.g. via the daemon), restart it
-	// so it picks up any config changes (model, provider, etc.).
-	if portOpen(addr) {
-		restart := exec.Command(bin, "daemon", "restart")
-		restart.Env = openclawEnv()
-		if err := restart.Run(); err != nil {
-			fmt.Fprintf(os.Stderr, "%s  Warning: daemon restart failed: %v%s\n", ansiYellow, err, ansiReset)
-		}
-		if !waitForPort(addr, 10*time.Second) {
-			fmt.Fprintf(os.Stderr, "%s  Warning: gateway did not come back after restart%s\n", ansiYellow, ansiReset)
-		}
-	}
-
-	// If the gateway isn't running, start it as a background child process.
-	if !portOpen(addr) {
-		gw := exec.Command(bin, "gateway", "run", "--force")
-		gw.Env = openclawEnv()
-		if err := gw.Start(); err != nil {
-			return windowsHint(fmt.Errorf("failed to start gateway: %w", err))
-		}
-		defer func() {
-			if gw.Process != nil {
-				_ = gw.Process.Kill()
-				_ = gw.Wait()
-			}
-		}()
-	}
-
-	fmt.Fprintf(os.Stderr, "%sStarting gateway...%s\n", ansiGray, ansiReset)
-	if !waitForPort(addr, 30*time.Second) {
-		return windowsHint(fmt.Errorf("gateway did not start on %s", addr))
+	cleanup, token, port, err := c.ensureGatewayReady(bin)
+	if err != nil {
+		return windowsHint(err)
 	}
+	defer cleanup()

 	printOpenclawReady(bin, token, port, firstLaunch)

@@ -179,6 +159,66 @@ func (c *Openclaw) Run(model string, args []string) error {
 	return nil
 }

+func shouldEnsureGatewayForArgs(args []string) bool {
+	return len(args) > 0 && args[0] == "tui"
+}
+
+func (c *Openclaw) ensureGatewayReady(bin string) (func(), string, int, error) {
+	token, port := c.gatewayInfo()
+	addr := fmt.Sprintf("localhost:%d", port)
+
+	// If the gateway is already running (e.g. via the daemon), restart it
+	// so it picks up any config changes (model, provider, etc.).
+	if portOpen(addr) {
+		restart := exec.Command(bin, "daemon", "restart")
+		restart.Env = openclawEnv()
+		if err := restart.Run(); err != nil {
+			fmt.Fprintf(os.Stderr, "%s  Warning: daemon restart failed: %v%s\n", ansiYellow, err, ansiReset)
+		}
+		if !waitForPort(addr, 10*time.Second) {
+			fmt.Fprintf(os.Stderr, "%s  Warning: gateway did not come back after restart%s\n", ansiYellow, ansiReset)
+		}
+	}
+
+	// If the daemon is installed but not currently listening, try to bring it
+	// up before falling back to a foreground child process.
+	if openclawCanInstallDaemon() && !portOpen(addr) {
+		start := exec.Command(bin, "daemon", "start")
+		start.Env = openclawEnv()
+		if err := start.Run(); err != nil {
+			fmt.Fprintf(os.Stderr, "%s  Warning: daemon start failed: %v%s\n", ansiYellow, err, ansiReset)
+		} else if waitForPort(addr, 10*time.Second) {
+			fmt.Fprintf(os.Stderr, "%sStarting gateway...%s\n", ansiGray, ansiReset)
+			return func() {}, token, port, nil
+		}
+	}
+
+	cleanup := func() {}
+
+	// If the gateway still isn't running, start it as a background child process.
+	if !portOpen(addr) {
+		gw := exec.Command(bin, "gateway", "run", "--force")
+		gw.Env = openclawEnv()
+		if err := gw.Start(); err != nil {
+			return nil, "", 0, fmt.Errorf("failed to start gateway: %w", err)
+		}
+		cleanup = func() {
+			if gw.Process != nil {
+				_ = gw.Process.Kill()
+				_ = gw.Wait()
+			}
+		}
+	}
+
+	fmt.Fprintf(os.Stderr, "%sStarting gateway...%s\n", ansiGray, ansiReset)
+	if !waitForPort(addr, 30*time.Second) {
+		cleanup()
+		return nil, "", 0, fmt.Errorf("gateway did not start on %s", addr)
+	}
+
+	return cleanup, token, port, nil
+}
+
 // runChannelSetupPreflight prompts users to connect a messaging channel before
 // starting the built-in gateway+TUI flow. In interactive sessions, it loops
 // until a channel is configured, unless the user chooses "Set up later".
@@ -339,9 +379,30 @@ func openclawEnv() []string {
 			env = append(env, e)
 		}
 	}
+	if _, ok := os.LookupEnv("OPENCLAW_PLUGIN_STAGE_DIR"); !ok {
+		if dir := openclawPluginStageDir(); dir != "" {
+			env = append(env, "OPENCLAW_PLUGIN_STAGE_DIR="+dir)
+		}
+	}
 	return env
 }

+func openclawInstallEnv() []string {
+	env := openclawEnv()
+	if _, ok := os.LookupEnv("OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"); !ok {
+		env = append(env, "OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS=1")
+	}
+	return env
+}
+
+func openclawPluginStageDir() string {
+	home, err := os.UserHomeDir()
+	if err != nil {
+		return ""
+	}
+	return filepath.Join(home, ".openclaw", "plugin-runtime-deps")
+}
+
 // portOpen checks if a TCP port is currently accepting connections.
 func portOpen(addr string) bool {
 	conn, err := net.DialTimeout("tcp", addr, 500*time.Millisecond)
@@ -565,6 +626,7 @@ func ensureOpenclawInstalled() (string, error) {

 	fmt.Fprintf(os.Stderr, "\nInstalling OpenClaw...\n")
 	cmd := exec.Command("npm", "install", "-g", "openclaw@latest")
+	cmd.Env = openclawInstallEnv()
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
@@ -738,89 +800,13 @@ func clearSessionModelOverride(primary string) {
 	_ = os.WriteFile(path, out, 0o600)
 }

-const (
-	webSearchNpmPackage = "@ollama/openclaw-web-search"
-	webSearchMinVersion = "0.2.1"
-)
-
-// ensureWebSearchPlugin installs the openclaw-web-search extension into the
-// user-level extensions directory (~/.openclaw/extensions/) if it isn't already
-// present, or re-installs if the installed version is older than webSearchMinVersion.
-// Returns true if the extension is available.
-func ensureWebSearchPlugin() bool {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return false
-	}
-
-	pluginDir := filepath.Join(home, ".openclaw", "extensions", "openclaw-web-search")
-	if webSearchPluginUpToDate(pluginDir) {
-		return true
-	}
-
-	npmBin, err := exec.LookPath("npm")
-	if err != nil {
-		return false
-	}
-
-	if err := os.MkdirAll(pluginDir, 0o755); err != nil {
-		return false
-	}
-
-	// Download the tarball via `npm pack`, extract it flat into the plugin dir.
-	pack := exec.Command(npmBin, "pack", webSearchNpmPackage, "--pack-destination", pluginDir)
-	out, err := pack.Output()
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "%s  Warning: could not download web search plugin: %v%s\n", ansiYellow, err, ansiReset)
-		return false
-	}
-
-	tgzName := strings.TrimSpace(string(out))
-	tgzPath := filepath.Join(pluginDir, tgzName)
-	defer os.Remove(tgzPath)
-
-	tar := exec.Command("tar", "xzf", tgzPath, "--strip-components=1", "-C", pluginDir)
-	if err := tar.Run(); err != nil {
-		fmt.Fprintf(os.Stderr, "%s  Warning: could not extract web search plugin: %v%s\n", ansiYellow, err, ansiReset)
-		return false
-	}
-
-	fmt.Fprintf(os.Stderr, "%s  ✓ Installed Ollama web search %s\n", ansiGreen, ansiReset)
-	return true
-}
-
-// webSearchPluginUpToDate returns true if the plugin is installed and its
-// package.json version is >= webSearchMinVersion.
-func webSearchPluginUpToDate(pluginDir string) bool {
-	data, err := os.ReadFile(filepath.Join(pluginDir, "package.json"))
-	if err != nil {
-		return false
-	}
-	var pkg struct {
-		Version string `json:"version"`
-	}
-	if json.Unmarshal(data, &pkg) != nil || pkg.Version == "" {
-		return false
-	}
-	return !versionLessThan(pkg.Version, webSearchMinVersion)
-}
-
-// versionLessThan compares two semver version strings (major.minor.patch).
-// Inputs may omit the "v" prefix; it is added automatically for semver.Compare.
-func versionLessThan(a, b string) bool {
-	if !strings.HasPrefix(a, "v") {
-		a = "v" + a
-	}
-	if !strings.HasPrefix(b, "v") {
-		b = "v" + b
-	}
-	return semver.Compare(a, b) < 0
-}
-
-// registerWebSearchPlugin adds plugins.entries.openclaw-web-search to the OpenClaw
-// config so the gateway activates it on next start. Best-effort; silently returns
-// on any error.
-func registerWebSearchPlugin() {
+// configureOllamaWebSearch keeps launch-managed OpenClaw installs on the
+// bundled Ollama web_search provider. Older launch builds installed an
+// external openclaw-web-search plugin that added custom ollama_web_search and
+// ollama_web_fetch tools. Current OpenClaw versions ship Ollama web_search as
+// the bundled "ollama" plugin instead, so we migrate stale config and ensure
+// fresh installs select the bundled provider.
+func configureOllamaWebSearch() {
 	home, err := os.UserHomeDir()
 	if err != nil {
 		return
@@ -835,6 +821,8 @@ func registerWebSearchPlugin() {
 		return
 	}

+	stalePluginConfigured := false
+
 	plugins, _ := config["plugins"].(map[string]any)
 	if plugins == nil {
 		plugins = make(map[string]any)
@@ -843,68 +831,100 @@ func registerWebSearchPlugin() {
 	if entries == nil {
 		entries = make(map[string]any)
 	}
-	entries["openclaw-web-search"] = map[string]any{"enabled": true}
-	plugins["entries"] = entries
-
-	// Pin trust so the gateway doesn't warn about untracked plugins.
-	allow, _ := plugins["allow"].([]any)
-	hasAllow := false
-	for _, v := range allow {
-		if s, ok := v.(string); ok && s == "openclaw-web-search" {
-			hasAllow = true
-			break
-		}
-	}
-	if !hasAllow {
-		allow = append(allow, "openclaw-web-search")
-	}
-	plugins["allow"] = allow
-
-	// Record install provenance so the loader can verify the plugin origin.
-	installs, _ := plugins["installs"].(map[string]any)
-	if installs == nil {
-		installs = make(map[string]any)
-	}
-	pluginDir := filepath.Join(home, ".openclaw", "extensions", "openclaw-web-search")
-	installs["openclaw-web-search"] = map[string]any{
-		"source":      "npm",
-		"spec":        webSearchNpmPackage,
-		"installPath": pluginDir,
-	}
-	plugins["installs"] = installs
-
-	config["plugins"] = plugins
-
-	// Add plugin tools to tools.alsoAllow so they survive the coding profile's
-	// policy pipeline (which has an explicit allow list of core tools only).
 	tools, _ := config["tools"].(map[string]any)
 	if tools == nil {
 		tools = make(map[string]any)
 	}
-
-	alsoAllow, _ := tools["alsoAllow"].([]any)
-	needed := []string{"ollama_web_search", "ollama_web_fetch"}
-	have := make(map[string]bool, len(alsoAllow))
-	for _, v := range alsoAllow {
-		if s, ok := v.(string); ok {
-			have[s] = true
-		}
-	}
-	for _, name := range needed {
-		if !have[name] {
-			alsoAllow = append(alsoAllow, name)
-		}
-	}
-	tools["alsoAllow"] = alsoAllow
-
-	// Disable built-in web search/fetch since our plugin replaces them.
 	web, _ := tools["web"].(map[string]any)
 	if web == nil {
 		web = make(map[string]any)
 	}
-	web["search"] = map[string]any{"enabled": false}
-	web["fetch"] = map[string]any{"enabled": false}
+	search, _ := web["search"].(map[string]any)
+	if search == nil {
+		search = make(map[string]any)
+	}
+	fetch, _ := web["fetch"].(map[string]any)
+	if fetch == nil {
+		fetch = make(map[string]any)
+	}
+
+	alsoAllow, _ := tools["alsoAllow"].([]any)
+	var filteredAlsoAllow []any
+	for _, v := range alsoAllow {
+		s, ok := v.(string)
+		if !ok {
+			filteredAlsoAllow = append(filteredAlsoAllow, v)
+			continue
+		}
+		if s == "ollama_web_search" || s == "ollama_web_fetch" {
+			stalePluginConfigured = true
+			continue
+		}
+		filteredAlsoAllow = append(filteredAlsoAllow, v)
+	}
+	if len(filteredAlsoAllow) > 0 {
+		tools["alsoAllow"] = filteredAlsoAllow
+	} else {
+		delete(tools, "alsoAllow")
+	}
+
+	if _, ok := entries["openclaw-web-search"]; ok {
+		delete(entries, "openclaw-web-search")
+		stalePluginConfigured = true
+	}
+	ollamaEntry, _ := entries["ollama"].(map[string]any)
+	if ollamaEntry == nil {
+		ollamaEntry = make(map[string]any)
+	}
+	ollamaEntry["enabled"] = true
+	entries["ollama"] = ollamaEntry
+	plugins["entries"] = entries
+
+	if allow, ok := plugins["allow"].([]any); ok {
+		var nextAllow []any
+		hasOllama := false
+		for _, v := range allow {
+			s, ok := v.(string)
+			if ok && s == "openclaw-web-search" {
+				stalePluginConfigured = true
+				continue
+			}
+			if ok && s == "ollama" {
+				hasOllama = true
+			}
+			nextAllow = append(nextAllow, v)
+		}
+		if !hasOllama {
+			nextAllow = append(nextAllow, "ollama")
+		}
+		plugins["allow"] = nextAllow
+	}
+
+	if installs, ok := plugins["installs"].(map[string]any); ok {
+		if _, exists := installs["openclaw-web-search"]; exists {
+			delete(installs, "openclaw-web-search")
+			stalePluginConfigured = true
+		}
+		if len(installs) > 0 {
+			plugins["installs"] = installs
+		} else {
+			delete(plugins, "installs")
+		}
+	}
+
+	if stalePluginConfigured || search["provider"] == nil {
+		search["provider"] = "ollama"
+	}
+	if stalePluginConfigured {
+		fetch["enabled"] = true
+	}
+	search["enabled"] = true
+	web["search"] = search
+	if len(fetch) > 0 {
+		web["fetch"] = fetch
+	}
 	tools["web"] = web
+	config["plugins"] = plugins
 	config["tools"] = tools

 	out, err := json.MarshalIndent(config, "", "  ")
--- a/cmd/launch/openclaw_test.go
+++ b/cmd/launch/openclaw_test.go
@@ -251,6 +251,359 @@ func TestOpenclawRun_SetupLaterContinuesToGatewayAndTUI(t *testing.T) {
 	}
 }

+func TestOpenclawRun_FirstLaunchOnboardUsesLaunchManagedHealthFlow(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("uses a POSIX shell test binary")
+	}
+
+	tmpDir := t.TempDir()
+	setTestHome(t, tmpDir)
+	t.Setenv("PATH", tmpDir)
+
+	bin := filepath.Join(tmpDir, "openclaw")
+	script := fmt.Sprintf(`#!/bin/sh
+printf '%%s\n' "$*" >> "$HOME/invocations.log"
+if [ "$1" = "onboard" ]; then
+  /usr/bin/env | /usr/bin/sort > "$HOME/onboard-env.log"
+  /bin/mkdir -p "$HOME/.openclaw"
+  /bin/cat > "$HOME/.openclaw/openclaw.json" <<'EOF'
+{"wizard":{"lastRunAt":"2026-01-01T00:00:00Z"},"gateway":{"port":18789,"mode":"local"}}
+EOF
+fi
+exit 0
+`)
+	if err := os.WriteFile(bin, []byte(script), 0o755); err != nil {
+		t.Fatal(err)
+	}
+
+	oldConfirmPrompt := DefaultConfirmPrompt
+	DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
+		if prompt != "I understand the risks. Continue?" {
+			t.Fatalf("unexpected prompt: %q", prompt)
+		}
+		return true, nil
+	}
+	defer func() { DefaultConfirmPrompt = oldConfirmPrompt }()
+
+	c := &Openclaw{}
+	if err := c.Run("llama3.2", []string{"status"}); err != nil {
+		t.Fatalf("Run() error = %v", err)
+	}
+
+	data, err := os.ReadFile(filepath.Join(tmpDir, "invocations.log"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	lines := strings.Split(strings.TrimSpace(string(data)), "\n")
+	if len(lines) < 2 {
+		t.Fatalf("expected onboard + passthrough invocations, got %v", lines)
+	}
+	onboardInvocation := ""
+	for _, line := range lines {
+		if strings.HasPrefix(line, "onboard ") {
+			onboardInvocation = line
+			break
+		}
+	}
+	if onboardInvocation == "" {
+		t.Fatalf("expected onboard invocation, got %v", lines)
+	}
+	if !strings.Contains(onboardInvocation, "--skip-health") {
+		t.Fatalf("expected onboard invocation to include --skip-health, got %q", onboardInvocation)
+	}
+
+	envData, err := os.ReadFile(filepath.Join(tmpDir, "onboard-env.log"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	env := envSliceToMap(strings.Split(strings.TrimSpace(string(envData)), "\n"))
+	if env["OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"] != "1" {
+		t.Fatalf("OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS = %q, want %q", env["OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"], "1")
+	}
+	if env["OPENCLAW_PLUGIN_STAGE_DIR"] != filepath.Join(tmpDir, ".openclaw", "plugin-runtime-deps") {
+		t.Fatalf("OPENCLAW_PLUGIN_STAGE_DIR = %q, want %q", env["OPENCLAW_PLUGIN_STAGE_DIR"], filepath.Join(tmpDir, ".openclaw", "plugin-runtime-deps"))
+	}
+}
+
+func TestOpenclawRun_FirstLaunchTUIArgsEnsureGatewayBeforePassthrough(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("uses a POSIX shell test binary")
+	}
+
+	tmpDir := t.TempDir()
+	setTestHome(t, tmpDir)
+	t.Setenv("PATH", tmpDir)
+
+	ln, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer ln.Close()
+	port := ln.Addr().(*net.TCPAddr).Port
+
+	bin := filepath.Join(tmpDir, "openclaw")
+	script := fmt.Sprintf(`#!/bin/sh
+printf '%%s\n' "$*" >> "$HOME/invocations.log"
+if [ "$1" = "onboard" ]; then
+  /bin/mkdir -p "$HOME/.openclaw"
+  /bin/cat > "$HOME/.openclaw/openclaw.json" <<'EOF'
+{"wizard":{"lastRunAt":"2026-01-01T00:00:00Z"},"gateway":{"port":%d,"mode":"local"}}
+EOF
+fi
+exit 0
+`, port)
+	if err := os.WriteFile(bin, []byte(script), 0o755); err != nil {
+		t.Fatal(err)
+	}
+
+	oldConfirmPrompt := DefaultConfirmPrompt
+	DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
+		if prompt != "I understand the risks. Continue?" {
+			t.Fatalf("unexpected prompt: %q", prompt)
+		}
+		return true, nil
+	}
+	defer func() { DefaultConfirmPrompt = oldConfirmPrompt }()
+
+	c := &Openclaw{}
+	if err := c.Run("llama3.2", []string{"tui"}); err != nil {
+		t.Fatalf("Run() error = %v", err)
+	}
+
+	data, err := os.ReadFile(filepath.Join(tmpDir, "invocations.log"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	lines := strings.Split(strings.TrimSpace(string(data)), "\n")
+	if len(lines) < 3 {
+		t.Fatalf("expected at least 3 invocations (update, onboard, daemon restart, tui), got %v", lines)
+	}
+	onboardIdx, daemonRestartIdx, tuiIdx := -1, -1, -1
+	for i, line := range lines {
+		if onboardIdx == -1 && strings.HasPrefix(line, "onboard ") {
+			onboardIdx = i
+		}
+		if daemonRestartIdx == -1 && line == "daemon restart" {
+			daemonRestartIdx = i
+		}
+		if tuiIdx == -1 && line == "tui" {
+			tuiIdx = i
+		}
+	}
+	if onboardIdx == -1 {
+		t.Fatalf("expected an onboarding invocation, got %v", lines)
+	}
+	if daemonRestartIdx == -1 {
+		t.Fatalf("expected a daemon restart before tui, got %v", lines)
+	}
+	if tuiIdx == -1 {
+		t.Fatalf("expected a tui invocation, got %v", lines)
+	}
+	if !(onboardIdx < daemonRestartIdx && daemonRestartIdx < tuiIdx) {
+		t.Fatalf("expected onboarding, then daemon restart, then tui; got %v", lines)
+	}
+}
+
+func TestOpenclawEnsureGatewayReady_UsesDaemonStartFallback(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("uses a POSIX shell test binary")
+	}
+
+	tmpDir := t.TempDir()
+	setTestHome(t, tmpDir)
+	t.Setenv("PATH", tmpDir)
+
+	portProbe, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatal(err)
+	}
+	port := portProbe.Addr().(*net.TCPAddr).Port
+	_ = portProbe.Close()
+
+	configDir := filepath.Join(tmpDir, ".openclaw")
+	if err := os.MkdirAll(configDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(fmt.Sprintf(`{
+		"wizard": {"lastRunAt": "2026-01-01T00:00:00Z"},
+		"gateway": {"port": %d, "mode": "local"}
+	}`, port)), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	bin := filepath.Join(tmpDir, "openclaw")
+	if err := os.WriteFile(bin, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> \"$HOME/invocations.log\"\n"), 0o755); err != nil {
+		t.Fatal(err)
+	}
+
+	oldCanInstallDaemon := openclawCanInstallDaemon
+	openclawCanInstallDaemon = func() bool { return true }
+	defer func() { openclawCanInstallDaemon = oldCanInstallDaemon }()
+
+	triggeredBy := make(chan string, 1)
+	listenerReady := make(chan net.Listener, 1)
+	go func() {
+		invocationsPath := filepath.Join(tmpDir, "invocations.log")
+		deadline := time.Now().Add(5 * time.Second)
+		for time.Now().Before(deadline) {
+			data, err := os.ReadFile(invocationsPath)
+			if err == nil {
+				lines := strings.Split(strings.TrimSpace(string(data)), "\n")
+				for _, line := range lines {
+					if line != "daemon start" && line != "gateway run --force" {
+						continue
+					}
+					ln, err := net.Listen("tcp", fmt.Sprintf("127.0.0.1:%d", port))
+					if err != nil {
+						return
+					}
+					go func() {
+						for {
+							conn, err := ln.Accept()
+							if err != nil {
+								return
+							}
+							_ = conn.Close()
+						}
+					}()
+					triggeredBy <- line
+					listenerReady <- ln
+					return
+				}
+			}
+			time.Sleep(10 * time.Millisecond)
+		}
+	}()
+
+	c := &Openclaw{}
+	cleanup, _, gotPort, err := c.ensureGatewayReady(bin)
+	if err != nil {
+		t.Fatalf("ensureGatewayReady() error = %v", err)
+	}
+	defer cleanup()
+	if gotPort != port {
+		t.Fatalf("ensureGatewayReady() port = %d, want %d", gotPort, port)
+	}
+
+	var ln net.Listener
+	select {
+	case which := <-triggeredBy:
+		if which != "daemon start" {
+			t.Fatalf("expected daemon start fallback, got %q", which)
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("timed out waiting for gateway startup trigger")
+	}
+	select {
+	case ln = <-listenerReady:
+		defer ln.Close()
+	case <-time.After(2 * time.Second):
+		t.Fatal("timed out waiting for test listener")
+	}
+
+	data, err := os.ReadFile(filepath.Join(tmpDir, "invocations.log"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	lines := strings.Split(strings.TrimSpace(string(data)), "\n")
+	if len(lines) == 0 || lines[0] != "daemon start" {
+		t.Fatalf("expected daemon start invocation, got %v", lines)
+	}
+	for _, line := range lines {
+		if line == "gateway run --force" {
+			t.Fatalf("did not expect gateway run fallback when daemon start succeeds, got %v", lines)
+		}
+	}
+}
+
+func TestOpenclawEnv_StagesBundledPluginRuntimeDeps(t *testing.T) {
+	tmpDir := t.TempDir()
+	setTestHome(t, tmpDir)
+	t.Setenv("OPENAI_API_KEY", "should-be-cleared")
+
+	env := envSliceToMap(openclawEnv())
+
+	if env["OPENCLAW_PLUGIN_STAGE_DIR"] != filepath.Join(tmpDir, ".openclaw", "plugin-runtime-deps") {
+		t.Fatalf("OPENCLAW_PLUGIN_STAGE_DIR = %q, want %q", env["OPENCLAW_PLUGIN_STAGE_DIR"], filepath.Join(tmpDir, ".openclaw", "plugin-runtime-deps"))
+	}
+	if _, ok := env["OPENAI_API_KEY"]; ok {
+		t.Fatal("expected OPENAI_API_KEY to be cleared from openclaw environment")
+	}
+}
+
+func TestOpenclawInstallEnv_PreservesExplicitStageDirAndAddsEagerDeps(t *testing.T) {
+	t.Setenv("OPENCLAW_PLUGIN_STAGE_DIR", "/tmp/custom-stage")
+
+	env := envSliceToMap(openclawInstallEnv())
+
+	if env["OPENCLAW_PLUGIN_STAGE_DIR"] != "/tmp/custom-stage" {
+		t.Fatalf("OPENCLAW_PLUGIN_STAGE_DIR = %q, want %q", env["OPENCLAW_PLUGIN_STAGE_DIR"], "/tmp/custom-stage")
+	}
+	if env["OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"] != "1" {
+		t.Fatalf("OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS = %q, want %q", env["OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"], "1")
+	}
+}
+
+func TestEnsureOpenclawInstalled_UsesBundledPluginInstallEnv(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("uses a POSIX shell test binary")
+	}
+
+	tmpDir := t.TempDir()
+	setTestHome(t, tmpDir)
+	t.Setenv("PATH", tmpDir)
+
+	writeScript := func(path, content string) {
+		t.Helper()
+		if err := os.WriteFile(path, []byte(content), 0o755); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	openclawPath := filepath.Join(tmpDir, "openclaw")
+	npmScript := fmt.Sprintf(`#!/bin/sh
+/usr/bin/env | /usr/bin/sort > "$HOME/npm-env.log"
+/bin/cat > %q <<'EOF'
+#!/bin/sh
+exit 0
+EOF
+/bin/chmod +x %q
+exit 0
+`, openclawPath, openclawPath)
+	writeScript(filepath.Join(tmpDir, "npm"), npmScript)
+	writeScript(filepath.Join(tmpDir, "git"), "#!/bin/sh\nexit 0\n")
+
+	oldConfirmPrompt := DefaultConfirmPrompt
+	DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
+		if prompt != "OpenClaw is not installed. Install with npm?" {
+			t.Fatalf("unexpected prompt: %q", prompt)
+		}
+		return true, nil
+	}
+	defer func() { DefaultConfirmPrompt = oldConfirmPrompt }()
+
+	openclawFreshInstall = false
+	bin, err := ensureOpenclawInstalled()
+	if err != nil {
+		t.Fatalf("ensureOpenclawInstalled() error = %v", err)
+	}
+	if bin != "openclaw" {
+		t.Fatalf("ensureOpenclawInstalled() bin = %q, want %q", bin, "openclaw")
+	}
+
+	envData, err := os.ReadFile(filepath.Join(tmpDir, "npm-env.log"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	env := envSliceToMap(strings.Split(strings.TrimSpace(string(envData)), "\n"))
+	if env["OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"] != "1" {
+		t.Fatalf("OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS = %q, want %q", env["OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"], "1")
+	}
+	if env["OPENCLAW_PLUGIN_STAGE_DIR"] != filepath.Join(tmpDir, ".openclaw", "plugin-runtime-deps") {
+		t.Fatalf("OPENCLAW_PLUGIN_STAGE_DIR = %q, want %q", env["OPENCLAW_PLUGIN_STAGE_DIR"], filepath.Join(tmpDir, ".openclaw", "plugin-runtime-deps"))
+	}
+}
+
 func TestOpenclawEdit(t *testing.T) {
 	c := &Openclaw{}
 	tmpDir := t.TempDir()
@@ -1227,6 +1580,18 @@ func TestOpenclawChannelsConfigured(t *testing.T) {
 	})
 }

+func envSliceToMap(entries []string) map[string]string {
+	env := make(map[string]string, len(entries))
+	for _, entry := range entries {
+		key, value, ok := strings.Cut(entry, "=")
+		if !ok {
+			continue
+		}
+		env[key] = value
+	}
+	return env
+}
+
 func TestOpenclawChannelSetupPreflight(t *testing.T) {
 	if runtime.GOOS == "windows" {
 		t.Skip("uses a POSIX shell test binary")
@@ -2242,95 +2607,7 @@ func TestIntegrationOnboarded(t *testing.T) {
 	})
 }

-func TestVersionLessThan(t *testing.T) {
-	tests := []struct {
-		a, b string
-		want bool
-	}{
-		{"0.1.7", "0.2.1", true},
-		{"0.2.0", "0.2.1", true},
-		{"0.2.1", "0.2.1", false},
-		{"0.2.2", "0.2.1", false},
-		{"1.0.0", "0.2.1", false},
-		{"0.2.1", "1.0.0", true},
-		{"v0.1.7", "0.2.1", true},
-		{"0.2.1", "v0.2.1", false},
-	}
-	for _, tt := range tests {
-		t.Run(tt.a+"_vs_"+tt.b, func(t *testing.T) {
-			if got := versionLessThan(tt.a, tt.b); got != tt.want {
-				t.Errorf("versionLessThan(%q, %q) = %v, want %v", tt.a, tt.b, got, tt.want)
-			}
-		})
-	}
-}
-
-func TestWebSearchPluginUpToDate(t *testing.T) {
-	t.Run("missing directory", func(t *testing.T) {
-		if webSearchPluginUpToDate(filepath.Join(t.TempDir(), "nonexistent")) {
-			t.Error("expected false for missing directory")
-		}
-	})
-
-	t.Run("missing package.json", func(t *testing.T) {
-		dir := t.TempDir()
-		if webSearchPluginUpToDate(dir) {
-			t.Error("expected false for missing package.json")
-		}
-	})
-
-	t.Run("old version", func(t *testing.T) {
-		dir := t.TempDir()
-		if err := os.WriteFile(filepath.Join(dir, "package.json"), []byte(`{"version":"0.1.7"}`), 0o644); err != nil {
-			t.Fatal(err)
-		}
-		if webSearchPluginUpToDate(dir) {
-			t.Error("expected false for old version 0.1.7")
-		}
-	})
-
-	t.Run("exact minimum version", func(t *testing.T) {
-		dir := t.TempDir()
-		if err := os.WriteFile(filepath.Join(dir, "package.json"), []byte(`{"version":"0.2.1"}`), 0o644); err != nil {
-			t.Fatal(err)
-		}
-		if !webSearchPluginUpToDate(dir) {
-			t.Error("expected true for exact minimum version 0.2.1")
-		}
-	})
-
-	t.Run("newer version", func(t *testing.T) {
-		dir := t.TempDir()
-		if err := os.WriteFile(filepath.Join(dir, "package.json"), []byte(`{"version":"1.0.0"}`), 0o644); err != nil {
-			t.Fatal(err)
-		}
-		if !webSearchPluginUpToDate(dir) {
-			t.Error("expected true for newer version 1.0.0")
-		}
-	})
-
-	t.Run("invalid json", func(t *testing.T) {
-		dir := t.TempDir()
-		if err := os.WriteFile(filepath.Join(dir, "package.json"), []byte(`not json`), 0o644); err != nil {
-			t.Fatal(err)
-		}
-		if webSearchPluginUpToDate(dir) {
-			t.Error("expected false for invalid json")
-		}
-	})
-
-	t.Run("empty version", func(t *testing.T) {
-		dir := t.TempDir()
-		if err := os.WriteFile(filepath.Join(dir, "package.json"), []byte(`{"version":""}`), 0o644); err != nil {
-			t.Fatal(err)
-		}
-		if webSearchPluginUpToDate(dir) {
-			t.Error("expected false for empty version")
-		}
-	})
-}
-
-func TestRegisterWebSearchPlugin(t *testing.T) {
+func TestConfigureOllamaWebSearch(t *testing.T) {
 	home := t.TempDir()
 	setTestHome(t, home)

@@ -2345,7 +2622,7 @@ func TestRegisterWebSearchPlugin(t *testing.T) {
 			t.Fatal(err)
 		}

-		registerWebSearchPlugin()
+		configureOllamaWebSearch()

 		data, err := os.ReadFile(configPath)
 		if err != nil {
@@ -2361,40 +2638,30 @@ func TestRegisterWebSearchPlugin(t *testing.T) {
 			t.Fatal("plugins section missing")
 		}

-		// Check entries
 		entries, _ := plugins["entries"].(map[string]any)
-		entry, _ := entries["openclaw-web-search"].(map[string]any)
+		entry, _ := entries["ollama"].(map[string]any)
 		if enabled, _ := entry["enabled"].(bool); !enabled {
-			t.Error("expected entries.openclaw-web-search.enabled = true")
+			t.Error("expected entries.ollama.enabled = true")
+		}
+		if _, ok := entries["openclaw-web-search"]; ok {
+			t.Error("expected stale openclaw-web-search entry to be absent")
 		}

-		// Check allow list
-		allow, _ := plugins["allow"].([]any)
-		found := false
-		for _, v := range allow {
-			if s, ok := v.(string); ok && s == "openclaw-web-search" {
-				found = true
-			}
+		if _, ok := plugins["allow"]; ok {
+			t.Error("did not expect plugins.allow to be created when no allowlist exists")
 		}
-		if !found {
-			t.Error("expected plugins.allow to contain openclaw-web-search")
+		if _, ok := plugins["installs"]; ok {
+			t.Error("did not expect plugins.installs to be created")
 		}

-		// Check install provenance
-		installs, _ := plugins["installs"].(map[string]any)
-		record, _ := installs["openclaw-web-search"].(map[string]any)
-		if record == nil {
-			t.Fatal("expected plugins.installs.openclaw-web-search")
+		tools, _ := config["tools"].(map[string]any)
+		web, _ := tools["web"].(map[string]any)
+		search, _ := web["search"].(map[string]any)
+		if got, _ := search["provider"].(string); got != "ollama" {
+			t.Errorf("search provider = %q, want %q", got, "ollama")
 		}
-		if source, _ := record["source"].(string); source != "npm" {
-			t.Errorf("install source = %q, want %q", source, "npm")
-		}
-		if spec, _ := record["spec"].(string); spec != webSearchNpmPackage {
-			t.Errorf("install spec = %q, want %q", spec, webSearchNpmPackage)
-		}
-		expectedPath := filepath.Join(home, ".openclaw", "extensions", "openclaw-web-search")
-		if installPath, _ := record["installPath"].(string); installPath != expectedPath {
-			t.Errorf("installPath = %q, want %q", installPath, expectedPath)
+		if enabled, _ := search["enabled"].(bool); !enabled {
+			t.Error("expected tools.web.search.enabled = true")
 		}
 	})

@@ -2403,8 +2670,8 @@ func TestRegisterWebSearchPlugin(t *testing.T) {
 			t.Fatal(err)
 		}

-		registerWebSearchPlugin()
-		registerWebSearchPlugin()
+		configureOllamaWebSearch()
+		configureOllamaWebSearch()

 		data, err := os.ReadFile(configPath)
 		if err != nil {
@@ -2416,30 +2683,39 @@ func TestRegisterWebSearchPlugin(t *testing.T) {
 		}

 		plugins, _ := config["plugins"].(map[string]any)
-		allow, _ := plugins["allow"].([]any)
-		count := 0
-		for _, v := range allow {
-			if s, ok := v.(string); ok && s == "openclaw-web-search" {
-				count++
-			}
+		entries, _ := plugins["entries"].(map[string]any)
+		if len(entries) != 1 {
+			t.Fatalf("expected only bundled ollama entry, got %v", entries)
 		}
-		if count != 1 {
-			t.Errorf("expected exactly 1 openclaw-web-search in allow, got %d", count)
+		if _, ok := entries["ollama"]; !ok {
+			t.Fatalf("expected entries.ollama to exist, got %v", entries)
 		}
 	})

-	t.Run("preserves existing config", func(t *testing.T) {
+	t.Run("migrates stale plugin config and preserves unrelated settings", func(t *testing.T) {
 		initial := map[string]any{
 			"plugins": map[string]any{
-				"allow": []any{"some-other-plugin"},
+				"allow": []any{"some-other-plugin", "openclaw-web-search"},
 				"entries": map[string]any{
-					"some-other-plugin": map[string]any{"enabled": true},
+					"some-other-plugin":   map[string]any{"enabled": true},
+					"openclaw-web-search": map[string]any{"enabled": true},
 				},
 				"installs": map[string]any{
 					"some-other-plugin": map[string]any{
 						"source":      "npm",
 						"installPath": "/some/path",
 					},
+					"openclaw-web-search": map[string]any{
+						"source":      "npm",
+						"installPath": "/old/path",
+					},
+				},
+			},
+			"tools": map[string]any{
+				"alsoAllow": []any{"ollama_web_search", "ollama_web_fetch", "browser"},
+				"web": map[string]any{
+					"search": map[string]any{"enabled": false},
+					"fetch":  map[string]any{"enabled": false},
 				},
 			},
 			"customField": "preserved",
@@ -2449,7 +2725,7 @@ func TestRegisterWebSearchPlugin(t *testing.T) {
 			t.Fatal(err)
 		}

-		registerWebSearchPlugin()
+		configureOllamaWebSearch()

 		out, err := os.ReadFile(configPath)
 		if err != nil {
@@ -2469,28 +2745,61 @@ func TestRegisterWebSearchPlugin(t *testing.T) {
 		if entries["some-other-plugin"] == nil {
 			t.Error("existing plugin entry was lost")
 		}
+		if entries["openclaw-web-search"] != nil {
+			t.Error("stale openclaw-web-search entry should be removed")
+		}
+		if ollamaEntry, _ := entries["ollama"].(map[string]any); ollamaEntry == nil {
+			t.Fatal("expected bundled ollama entry to be enabled")
+		}

 		installs, _ := plugins["installs"].(map[string]any)
 		if installs["some-other-plugin"] == nil {
 			t.Error("existing install record was lost")
 		}
+		if installs["openclaw-web-search"] != nil {
+			t.Error("stale openclaw-web-search install record should be removed")
+		}

 		allow, _ := plugins["allow"].([]any)
-		hasOther, hasWebSearch := false, false
+		hasOther, hasStalePlugin, hasOllama := false, false, false
 		for _, v := range allow {
 			s, _ := v.(string)
 			if s == "some-other-plugin" {
 				hasOther = true
 			}
 			if s == "openclaw-web-search" {
-				hasWebSearch = true
+				hasStalePlugin = true
+			}
+			if s == "ollama" {
+				hasOllama = true
 			}
 		}
 		if !hasOther {
 			t.Error("existing allow entry was lost")
 		}
-		if !hasWebSearch {
-			t.Error("openclaw-web-search not added to allow")
+		if hasStalePlugin {
+			t.Error("stale openclaw-web-search allow entry should be removed")
+		}
+		if !hasOllama {
+			t.Error("expected plugins.allow to contain bundled ollama plugin")
+		}
+
+		tools, _ := config["tools"].(map[string]any)
+		alsoAllow, _ := tools["alsoAllow"].([]any)
+		if len(alsoAllow) != 1 || alsoAllow[0] != "browser" {
+			t.Errorf("expected stale custom web tools to be removed, got %v", alsoAllow)
+		}
+		web, _ := tools["web"].(map[string]any)
+		search, _ := web["search"].(map[string]any)
+		fetch, _ := web["fetch"].(map[string]any)
+		if got, _ := search["provider"].(string); got != "ollama" {
+			t.Errorf("search provider = %q, want %q", got, "ollama")
+		}
+		if enabled, _ := search["enabled"].(bool); !enabled {
+			t.Error("expected migrated tools.web.search.enabled = true")
+		}
+		if enabled, _ := fetch["enabled"].(bool); !enabled {
+			t.Error("expected migrated tools.web.fetch.enabled = true")
 		}
 	})
 }
--- a/docs/capabilities/structured-outputs.mdx
+++ b/docs/capabilities/structured-outputs.mdx
@@ -2,6 +2,10 @@
 title: Structured Outputs
 ---

+<Note>
+  Ollama's cloud does not currently support structured outputs.
+</Note>
+
 Structured outputs let you enforce a JSON schema on model responses so you can reliably extract structured data, describe images, or keep every reply consistent.

 ## Generating structured JSON
--- a/docs/integrations/openclaw.mdx
+++ b/docs/integrations/openclaw.mdx
@@ -15,7 +15,7 @@ Ollama handles everything automatically:
 1. **Install** — If OpenClaw isn't installed, Ollama prompts to install it via npm
 2. **Security** — On the first launch, a security notice explains the risks of tool access
 3. **Model** — Pick a model from the selector (local or cloud)
-4. **Onboarding** — Ollama configures the provider, installs the gateway daemon, sets your model as the primary, and installs the web search and fetch plugin
+4. **Onboarding** — Ollama configures the provider, installs the gateway daemon, sets your model as the primary, and enables OpenClaw's bundled Ollama web search
 5. **Gateway** — Starts in the background and opens the OpenClaw TUI

 <Note>OpenClaw requires a larger context window. It is recommended to use a context window of at least 64k tokens if using local models. See [Context length](/context-length) for more information.</Note>
@@ -24,19 +24,19 @@ Ollama handles everything automatically:

 ## Web search and fetch

-OpenClaw ships with a web search and fetch plugin that gives local or cloud models the ability to search the web and extract readable page content.
+OpenClaw ships with a bundled Ollama `web_search` provider that lets local or cloud-backed Ollama setups search the web through the configured Ollama host.

 ```bash
 ollama launch openclaw
 ```

-Web search and fetch is enabled automatically when launching OpenClaw through Ollama. To install the plugin directly:
+Ollama web search is enabled automatically when launching OpenClaw through Ollama. To configure it manually:

 ```bash
-openclaw plugins install @ollama/openclaw-web-search
+openclaw configure --section web
 ```

-<Note>Web search for local models requires `ollama signin`.</Note>
+<Note>Ollama web search for local models requires `ollama signin`.</Note>

 ## Configure without launching

@@ -93,4 +93,3 @@ Link WhatsApp, Telegram, Slack, Discord, or iMessage to chat with your local mod
 ```bash
 openclaw gateway stop
 ```
-
--- a/integration/api_test.go
+++ b/integration/api_test.go
@@ -406,10 +406,6 @@ func TestAPIShowModel(t *testing.T) {
 }

 func TestAPIGenerateLogprobs(t *testing.T) {
-	if testModel != "" {
-		// Logprobs requires runner support (e.g. llama.cpp has it, MLX does not).
-		t.Skip("logprobs not supported by all runners")
-	}
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()

@@ -523,10 +519,6 @@ func TestAPIGenerateLogprobs(t *testing.T) {
 }

 func TestAPIChatLogprobs(t *testing.T) {
-	if testModel != "" {
-		// Logprobs requires runner support (e.g. llama.cpp has it, MLX does not).
-		t.Skip("logprobs not supported by all runners")
-	}
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()

--- a/openai/openai.go
+++ b/openai/openai.go
@@ -632,8 +632,8 @@ func FromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 	}

 	if effort != "" {
-		if !slices.Contains([]string{"high", "medium", "low", "none"}, effort) {
-			return nil, fmt.Errorf("invalid reasoning value: '%s' (must be \"high\", \"medium\", \"low\", or \"none\")", effort)
+		if !slices.Contains([]string{"high", "medium", "low", "max", "none"}, effort) {
+			return nil, fmt.Errorf("invalid reasoning value: '%s' (must be \"high\", \"medium\", \"low\", \"max\", or \"none\")", effort)
 		}

 		if effort == "none" {
--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@@ -55,6 +55,57 @@ func TestFromChatRequest_Basic(t *testing.T) {
 	}
 }

+func TestFromChatRequest_ReasoningEffort(t *testing.T) {
+	effort := func(s string) *string { return &s }
+
+	cases := []struct {
+		name    string
+		effort  *string
+		want    any // expected ThinkValue.Value; nil means result.Think should be nil
+		wantErr bool
+	}{
+		{name: "unset", effort: nil, want: nil},
+		{name: "high", effort: effort("high"), want: "high"},
+		{name: "medium", effort: effort("medium"), want: "medium"},
+		{name: "low", effort: effort("low"), want: "low"},
+		{name: "max", effort: effort("max"), want: "max"},
+		{name: "none disables", effort: effort("none"), want: false},
+		{name: "invalid", effort: effort("extreme"), wantErr: true},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			req := ChatCompletionRequest{
+				Model:           "test-model",
+				Messages:        []Message{{Role: "user", Content: "hi"}},
+				ReasoningEffort: tc.effort,
+			}
+			result, err := FromChatRequest(req)
+			if tc.wantErr {
+				if err == nil {
+					t.Fatalf("expected error for effort=%v, got none", *tc.effort)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if tc.want == nil {
+				if result.Think != nil {
+					t.Fatalf("expected nil Think, got %+v", result.Think)
+				}
+				return
+			}
+			if result.Think == nil {
+				t.Fatalf("expected Think=%v, got nil", tc.want)
+			}
+			if result.Think.Value != tc.want {
+				t.Fatalf("got Think.Value=%v, want %v", result.Think.Value, tc.want)
+			}
+		})
+	}
+}
+
 func TestFromChatRequest_WithImage(t *testing.T) {
 	imgData, _ := base64.StdEncoding.DecodeString(image)

--- a/openai/responses.go
+++ b/openai/responses.go
@@ -525,6 +525,18 @@ func FromResponsesRequest(r ResponsesRequest) (*api.ChatRequest, error) {
 		options["num_predict"] = *r.MaxOutputTokens
 	}

+	var think *api.ThinkValue
+	if effort := r.Reasoning.Effort; effort != "" {
+		switch effort {
+		case "none":
+			think = &api.ThinkValue{Value: false}
+		case "low", "medium", "high", "max":
+			think = &api.ThinkValue{Value: effort}
+		default:
+			return nil, fmt.Errorf("invalid reasoning value: %q (must be \"high\", \"medium\", \"low\", \"max\", or \"none\")", effort)
+		}
+	}
+
 	// Convert tools from Responses API format to api.Tool format
 	var tools []api.Tool
 	for _, t := range r.Tools {
@@ -552,6 +564,7 @@ func FromResponsesRequest(r ResponsesRequest) (*api.ChatRequest, error) {
 		Options:  options,
 		Tools:    tools,
 		Format:   format,
+		Think:    think,
 	}, nil
 }

--- a/openai/responses_test.go
+++ b/openai/responses_test.go
@@ -415,6 +415,86 @@ func TestFromResponsesRequest_Tools(t *testing.T) {
 	}
 }

+func TestFromResponsesRequest_ReasoningEffort(t *testing.T) {
+	tests := []struct {
+		name      string
+		effort    string
+		wantThink any
+		wantErr   bool
+	}{
+		{
+			name: "unset",
+		},
+		{
+			name:      "low",
+			effort:    "low",
+			wantThink: "low",
+		},
+		{
+			name:      "medium",
+			effort:    "medium",
+			wantThink: "medium",
+		},
+		{
+			name:      "high",
+			effort:    "high",
+			wantThink: "high",
+		},
+		{
+			name:      "max",
+			effort:    "max",
+			wantThink: "max",
+		},
+		{
+			name:      "none",
+			effort:    "none",
+			wantThink: false,
+		},
+		{
+			name:    "invalid",
+			effort:  "extreme",
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			req := ResponsesRequest{
+				Model: "deepseek-v4-flash",
+				Input: ResponsesInput{Text: "hi"},
+			}
+			if tt.effort != "" {
+				req.Reasoning.Effort = tt.effort
+			}
+
+			chatReq, err := FromResponsesRequest(req)
+			if tt.wantErr {
+				if err == nil {
+					t.Fatal("expected error, got nil")
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+
+			if tt.wantThink == nil {
+				if chatReq.Think != nil {
+					t.Fatalf("Think = %#v, want nil", chatReq.Think)
+				}
+				return
+			}
+
+			if chatReq.Think == nil {
+				t.Fatalf("Think = nil, want %v", tt.wantThink)
+			}
+			if chatReq.Think.Value != tt.wantThink {
+				t.Errorf("Think.Value = %v, want %v", chatReq.Think.Value, tt.wantThink)
+			}
+		})
+	}
+}
+
 func TestFromResponsesRequest_FunctionCallOutput(t *testing.T) {
 	// Test a complete tool call round-trip:
 	// 1. User message asking about weather
--- a/server/routes.go
+++ b/server/routes.go
@@ -375,8 +375,16 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	}

 	var builtinParser parsers.Parser
-	if shouldUseHarmony(m) && m.Config.Parser == "" {
-		m.Config.Parser = "harmony"
+	if shouldUseHarmony(m) {
+		// harmony's Reasoning field only understands low/medium/high; map "max" to "high"
+		if req.Think != nil {
+			if s, ok := req.Think.Value.(string); ok && s == "max" {
+				req.Think.Value = "high"
+			}
+		}
+		if m.Config.Parser == "" {
+			m.Config.Parser = "harmony"
+		}
 	}

 	if !req.Raw && m.Config.Parser != "" {
@@ -2320,8 +2328,16 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	}
 	msgs = filterThinkTags(msgs, m)

-	if shouldUseHarmony(m) && m.Config.Parser == "" {
-		m.Config.Parser = "harmony"
+	if shouldUseHarmony(m) {
+		// harmony's Reasoning field only understands low/medium/high; map "max" to "high"
+		if req.Think != nil {
+			if s, ok := req.Think.Value.(string); ok && s == "max" {
+				req.Think.Value = "high"
+			}
+		}
+		if m.Config.Parser == "" {
+			m.Config.Parser = "harmony"
+		}
 	}

 	var builtinParser parsers.Parser
--- a/x/mlxrunner/client.go
+++ b/x/mlxrunner/client.go
@@ -151,22 +151,11 @@ func (c *Client) WaitUntilRunning(ctx context.Context) error {
 	}
 }

-// completionRequest is a properly-tagged version of llm.CompletionRequest for JSON serialization.
-type completionRequest struct {
-	Prompt  string          `json:"prompt"`
-	Options *completionOpts `json:"options,omitempty"`
-}
-
-type completionOpts struct {
-	Temperature      float32 `json:"temperature,omitempty"`
-	TopP             float32 `json:"top_p,omitempty"`
-	MinP             float32 `json:"min_p,omitempty"`
-	TopK             int     `json:"top_k,omitempty"`
-	RepeatLastN      int     `json:"repeat_last_n,omitempty"`
-	RepeatPenalty    float32 `json:"repeat_penalty,omitempty"`
-	PresencePenalty  float32 `json:"presence_penalty,omitempty"`
-	FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
-	NumPredict       int     `json:"num_predict,omitempty"`
+type CompletionRequest struct {
+	Prompt      string
+	Options     api.Options
+	Logprobs    bool
+	TopLogprobs int
 }

 type CompletionResponse struct {
@@ -179,6 +168,8 @@ type CompletionResponse struct {
 	EvalCount          int
 	EvalDuration       time.Duration

+	Logprobs []llm.Logprob
+
 	Error *api.StatusError
 }

@@ -203,21 +194,13 @@ func (c *Client) Close() error {

 // Completion implements llm.LlamaServer.
 func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
-	creq := completionRequest{
-		Prompt: req.Prompt,
+	creq := CompletionRequest{
+		Prompt:      req.Prompt,
+		Logprobs:    req.Logprobs,
+		TopLogprobs: req.TopLogprobs,
 	}
 	if req.Options != nil {
-		creq.Options = &completionOpts{
-			Temperature:      req.Options.Temperature,
-			TopP:             req.Options.TopP,
-			MinP:             req.Options.MinP,
-			TopK:             req.Options.TopK,
-			RepeatLastN:      req.Options.RepeatLastN,
-			RepeatPenalty:    req.Options.RepeatPenalty,
-			PresencePenalty:  req.Options.PresencePenalty,
-			FrequencyPenalty: req.Options.FrequencyPenalty,
-			NumPredict:       req.Options.NumPredict,
-		}
+		creq.Options = *req.Options
 	}

 	body, err := json.Marshal(creq)
@@ -243,7 +226,7 @@ func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn f

 	if resp.StatusCode != http.StatusOK {
 		respBody, _ := io.ReadAll(resp.Body)
-		return fmt.Errorf("%s", strings.TrimSpace(string(respBody)))
+		return api.StatusError{StatusCode: resp.StatusCode, ErrorMessage: strings.TrimSpace(string(respBody))}
 	}

 	scanner := bufio.NewScanner(resp.Body)
@@ -266,6 +249,7 @@ func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn f
 			PromptEvalDuration: raw.PromptEvalDuration,
 			EvalCount:          raw.EvalCount,
 			EvalDuration:       raw.EvalDuration,
+			Logprobs:           raw.Logprobs,
 		}

 		fn(cresp)
--- a/x/mlxrunner/mlx/array.go
+++ b/x/mlxrunner/mlx/array.go
@@ -10,6 +10,8 @@ import (
 	"reflect"
 	"sort"
 	"strings"
+	"sync"
+	"sync/atomic"
 	"unsafe"

 	"github.com/ollama/ollama/logutil"
@@ -18,20 +20,28 @@ import (
 type Array struct {
 	ctx    C.mlx_array
 	name   string
-	pinned int
+	pinned atomic.Int32
 }

-var arrays []*Array
+var (
+	arrays   []*Array
+	arraysMu sync.Mutex
+)

 // constructor utilities

 func New(name string) *Array {
 	t := &Array{name: name}
+
 	if tracing {
 		traceScratch = append(traceScratch, t)
 	} else {
+		arraysMu.Lock()
+		defer arraysMu.Unlock()
+
 		arrays = append(arrays, t)
 	}
+
 	return t
 }

@@ -131,7 +141,7 @@ func (t *Array) Clone() *Array {
 func Pin(s ...*Array) {
 	for _, t := range s {
 		if t != nil {
-			t.pinned++
+			t.pinned.Add(1)
 		}
 	}
 }
@@ -140,8 +150,7 @@ func Pin(s ...*Array) {
 func Unpin(s ...*Array) {
 	for _, t := range s {
 		if t != nil {
-			t.pinned--
-			if t.pinned < 0 {
+			if t.pinned.Add(-1) < 0 {
 				panic(fmt.Sprintf("mlx.Unpin: negative pin count on array %q", t.name))
 			}
 		}
@@ -151,9 +160,11 @@ func Unpin(s ...*Array) {
 // Sweep releases all unpinned arrays, primarily intermediate tensors. MLX will truly
 // free them when there are no other references, including dependencies in the graph.
 func Sweep() {
+	arraysMu.Lock()
+	defer arraysMu.Unlock()
 	n := 0
 	for _, t := range arrays {
-		if t.pinned > 0 && t.Valid() {
+		if t.pinned.Load() > 0 && t.Valid() {
 			arrays[n] = t
 			n++
 		} else if t.Valid() {
@@ -180,7 +191,7 @@ func (t *Array) String() string {
 func (t *Array) LogValue() slog.Value {
 	attrs := []slog.Attr{
 		slog.String("name", t.name),
-		slog.Int("pinned", t.pinned),
+		slog.Int("pinned", int(t.pinned.Load())),
 	}
 	if t.Valid() {
 		attrs = append(attrs,
@@ -194,19 +205,19 @@ func (t *Array) LogValue() slog.Value {

 // shape utilities

-func (t Array) Size() int {
+func (t *Array) Size() int {
 	return int(C.mlx_array_size(t.ctx))
 }

-func (t Array) NumBytes() int {
+func (t *Array) NumBytes() int {
 	return int(C.mlx_array_nbytes(t.ctx))
 }

-func (t Array) NumDims() int {
+func (t *Array) NumDims() int {
 	return int(C.mlx_array_ndim(t.ctx))
 }

-func (t Array) Dims() []int {
+func (t *Array) Dims() []int {
 	dims := make([]int, t.NumDims())
 	for i := range dims {
 		dims[i] = t.Dim(i)
@@ -215,29 +226,32 @@ func (t Array) Dims() []int {
 	return dims
 }

-func (t Array) Dim(dim int) int {
+func (t *Array) Dim(dim int) int {
 	return int(C.mlx_array_dim(t.ctx, C.int(dim)))
 }

-func (t Array) DType() DType {
+func (t *Array) DType() DType {
 	return DType(C.mlx_array_dtype(t.ctx))
 }

 // data utilities

-func (t Array) Int() int {
+func (t *Array) Int() int {
 	var item C.int64_t
 	C.mlx_array_item_int64(&item, t.ctx)
 	return int(item)
 }

-func (t Array) Float() float64 {
+func (t *Array) Float() float64 {
 	var item C.double
 	C.mlx_array_item_float64(&item, t.ctx)
 	return float64(item)
 }

-func (t Array) Ints() []int {
+func (t *Array) Ints() []int {
+	if dt := t.DType(); dt != DTypeInt32 {
+		panic(fmt.Sprintf("mlx: Ints requires DTypeInt32, got %v", dt))
+	}
 	ints := make([]int, t.Size())
 	for i, f := range unsafe.Slice(C.mlx_array_data_int32(t.ctx), len(ints)) {
 		ints[i] = int(f)
@@ -245,7 +259,10 @@ func (t Array) Ints() []int {
 	return ints
 }

-func (t Array) Floats() []float32 {
+func (t *Array) Floats() []float32 {
+	if dt := t.DType(); dt != DTypeFloat32 {
+		panic(fmt.Sprintf("mlx: Floats requires DTypeFloat32, got %v", dt))
+	}
 	floats := make([]float32, t.Size())
 	for i, f := range unsafe.Slice(C.mlx_array_data_float32(t.ctx), len(floats)) {
 		floats[i] = float32(f)
@@ -253,7 +270,7 @@ func (t Array) Floats() []float32 {
 	return floats
 }

-func (t Array) Save(name string) error {
+func (t *Array) Save(name string) error {
 	cName := C.CString(name)
 	defer C.free(unsafe.Pointer(cName))
 	C.mlx_save(cName, t.ctx)
@@ -262,6 +279,8 @@ func (t Array) Save(name string) error {

 // LogArrays logs all live arrays, sorted by size
 func LogArrays() {
+	arraysMu.Lock()
+	defer arraysMu.Unlock()
 	sort.Slice(arrays, func(i, j int) bool {
 		return arrays[i].NumBytes() > arrays[j].NumBytes()
 	})
@@ -270,7 +289,7 @@ func LogArrays() {
 	for _, t := range arrays {
 		nb := t.NumBytes()
 		total += nb
-		logutil.Trace(fmt.Sprintf("tensor %-60s %5s %5s pinned=%d %v", t.name, t.DType(), PrettyBytes(nb), t.pinned, t.Dims()))
+		logutil.Trace(fmt.Sprintf("tensor %-60s %5s %5s pinned=%d %v", t.name, t.DType(), PrettyBytes(nb), t.pinned.Load(), t.Dims()))
 	}
 	logutil.Trace(fmt.Sprintf("tensors total: %d, size: %s, active: %s", len(arrays), PrettyBytes(total), PrettyBytes(ActiveMemory())))
 }
--- a/x/mlxrunner/mlx/compile.go
+++ b/x/mlxrunner/mlx/compile.go
@@ -150,7 +150,7 @@ func closureCallback(res *C.mlx_vector_array, input C.mlx_vector_array, payload
 	traceScratch = nil
 	defer func() {
 		for _, a := range traceScratch {
-			if a.pinned > 0 {
+			if a.pinned.Load() > 0 {
 				panic("mlx: traced array was pinned during compilation")
 			}
 			if a.Valid() {
--- a/x/mlxrunner/mlx/fast.go
+++ b/x/mlxrunner/mlx/fast.go
@@ -24,8 +24,8 @@ func ScaledDotProductAttention(query, key, value, mask *Array, scale float32) *A
 }

 type LayerNorm struct {
-	Weight Array `weight:"weight"`
-	Bias   Array `weight:"bias"`
+	Weight *Array `weight:"weight"`
+	Bias   *Array `weight:"bias"`
 }

 func (r *LayerNorm) Forward(x *Array, eps float32) *Array {
@@ -35,10 +35,10 @@ func (r *LayerNorm) Forward(x *Array, eps float32) *Array {
 }

 type RMSNorm struct {
-	Weight Array `weight:"weight"`
+	Weight *Array `weight:"weight"`
 }

-func (r RMSNorm) Forward(x *Array, eps float32) *Array {
+func (r *RMSNorm) Forward(x *Array, eps float32) *Array {
 	out := New("FAST_RMSNORM")
 	C.mlx_fast_rms_norm(&out.ctx, x.ctx, r.Weight.ctx, C.float(eps), DefaultStream().ctx)
 	return out
--- a/x/mlxrunner/mlx/nn.go
+++ b/x/mlxrunner/mlx/nn.go
@@ -1,12 +1,12 @@
 package mlx

 type Linear struct {
-	Weight Array `weight:"weight"`
-	Bias   Array `weight:"bias"`
+	Weight *Array `weight:"weight"`
+	Bias   *Array `weight:"bias"`
 }

 // Forward computes the linear transformation: x @ Weight.T + Bias
-func (m Linear) Forward(x *Array) *Array {
+func (m *Linear) Forward(x *Array) *Array {
 	w := m.Weight.Transpose(1, 0)
 	if m.Bias.Valid() {
 		return m.Bias.Addmm(x, w, 1.0, 1.0)
@@ -15,14 +15,14 @@ func (m Linear) Forward(x *Array) *Array {
 	return x.Matmul(w)
 }

-func (m Linear) Gather(x, lhs, rhs *Array, sorted bool) *Array {
+func (m *Linear) Gather(x, lhs, rhs *Array, sorted bool) *Array {
 	w := m.Weight.Transpose(0, 2, 1)
 	// TODO: bias
 	return x.GatherMM(w, lhs, rhs, sorted)
 }

 type Embedding struct {
-	Weight Array `weight:"weight"`
+	Weight *Array `weight:"weight"`
 }

 func (e *Embedding) Forward(indices *Array) *Array {
--- a/x/mlxrunner/mlx/ops.go
+++ b/x/mlxrunner/mlx/ops.go
@@ -139,6 +139,12 @@ func (t *Array) Less(other *Array) *Array {
 	return out
 }

+func (t *Array) MaxAxis(axis int, keepDims bool) *Array {
+	out := New("MAX_AXIS")
+	C.mlx_max_axis(&out.ctx, t.ctx, C.int(axis), C.bool(keepDims), DefaultStream().ctx)
+	return out
+}
+
 func (t *Array) Matmul(other *Array) *Array {
 	out := New("MATMUL")
 	C.mlx_matmul(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
--- a/x/mlxrunner/pipeline.go
+++ b/x/mlxrunner/pipeline.go
@@ -6,36 +6,59 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
-	"net/http"
+	"sort"
 	"time"

-	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
+	sampler "github.com/ollama/ollama/x/mlxrunner/sample"
+	"github.com/ollama/ollama/x/tokenizer"
 )

 func prefillChunkSize() int {
 	return 2 << 10
 }

-func (r *Runner) TextGenerationPipeline(request Request) error {
+// Prepare tokenizes the prompt and validates it against the model's
+// context length. It is safe to call from any goroutine. On success it
+// populates request.Tokens and adjusts request.Options.NumPredict.
+func (r *Runner) Prepare(request *Request) error {
 	if r.Model == nil {
 		return errors.New("model not loaded")
 	}

+	tokens := r.Tokenizer.Encode(request.Prompt, r.Tokenizer.AddBOS())
+	if len(tokens) == 0 {
+		return errors.New("empty prompt")
+	}
+
+	if len(tokens) >= r.contextLength {
+		return fmt.Errorf("input length (%d tokens) exceeds the model's maximum context length (%d tokens)", len(tokens), r.contextLength)
+	}
+
+	// Cap generation to stay within the model's context length
+	maxGenerate := r.contextLength - len(tokens)
+	if request.Options.NumPredict <= 0 {
+		request.Options.NumPredict = maxGenerate
+	} else {
+		request.Options.NumPredict = min(request.Options.NumPredict, maxGenerate)
+	}
+
+	request.Tokens = tokens
+	return nil
+}
+
+func (r *Runner) TextGenerationPipeline(ctx context.Context, request Request) error {
 	mlx.ResetPeakMemory()
-	ctx := request.Ctx
-	var (
-		sample     *mlx.Array
-		nextSample *mlx.Array
-	)
+	var sample, nextSample sampler.Result

 	defer func() {
 		if request.Sampler != nil {
 			request.Sampler.Free()
 		}
-		mlx.Unpin(sample)
-		mlx.Unpin(nextSample)
+		mlx.Unpin(sample.Arrays()...)
+		mlx.Unpin(nextSample.Arrays()...)
 		mlx.Sweep()
 		mlx.ClearCache()

@@ -46,26 +69,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		slog.Info("peak memory", "size", mlx.PrettyBytes(mlx.PeakMemory()))
 	}()

-	inputs := r.Tokenizer.Encode(request.Prompt, r.Tokenizer.AddBOS())
-	if len(inputs) == 0 {
-		return errors.New("empty prompt")
-	}
-
-	if len(inputs) >= r.contextLength {
-		return api.StatusError{
-			StatusCode:   http.StatusBadRequest,
-			ErrorMessage: fmt.Sprintf("input length (%d tokens) exceeds the model's maximum context length (%d tokens)", len(inputs), r.contextLength),
-		}
-	}
-
-	// Cap generation to stay within the model's context length
-	maxGenerate := r.contextLength - len(inputs)
-	if request.Options.MaxTokens <= 0 {
-		request.Options.MaxTokens = maxGenerate
-	} else {
-		request.Options.MaxTokens = min(request.Options.MaxTokens, maxGenerate)
-	}
-
+	inputs := request.Tokens
 	request.Sampler.ResetHistory(inputs)

 	session := r.cache.begin(r.Model, inputs)
@@ -135,40 +139,38 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		mlx.ClearCache()
 	}

-	step := func(token *mlx.Array) *mlx.Array {
+	step := func(token *mlx.Array) sampler.Result {
 		fwd := r.Model.Forward(token.ExpandDims(0), caches)
 		logits := r.Model.Unembed(fwd)
 		logits = logits.Slice(mlx.Slice(), mlx.Slice(logits.Dim(1)-1), mlx.Slice()).Squeeze(1)

 		sample := request.Sampler.Sample(logits)
-
-		mlx.Pin(sample)
+		mlx.Pin(sample.Arrays()...)
 		mlx.Sweep()
-		mlx.AsyncEval(sample)
-
+		mlx.AsyncEval(sample.Arrays()...)
 		return sample
 	}

 	sample = step(mlx.FromValues(tokens[processed:], total-processed))

-	var b bytes.Buffer
+	dec := decoder{tokenizer: r.Tokenizer}

-	final := CompletionResponse{Done: true, PromptEvalCount: len(inputs), EvalCount: request.Options.MaxTokens, DoneReason: 1}
-	for i := range request.Options.MaxTokens {
+	final := CompletionResponse{Done: true, PromptEvalCount: len(inputs), EvalCount: request.Options.NumPredict, DoneReason: 1}
+	for i := range request.Options.NumPredict {
 		if err := ctx.Err(); err != nil {
 			return err
 		}

-		request.Sampler.AppendToken(sample)
-		nextSample = step(sample)
+		request.Sampler.AppendToken(sample.Token)
+		nextSample = step(sample.Token)

 		if i == 0 {
-			mlx.Eval(sample)
+			mlx.Eval(sample.Arrays()...)
 			final.PromptEvalDuration = time.Since(now)
 			now = time.Now()
 		}

-		output := int32(sample.Int())
+		output := int32(sample.Token.Int())
 		session.outputs = append(session.outputs, output)

 		if r.Tokenizer.IsEOS(output) {
@@ -177,17 +179,16 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 			break
 		}

-		select {
-		case <-ctx.Done():
-			return ctx.Err()
-		case request.Responses <- CompletionResponse{
-			Content: r.Decode(output, &b),
-		}:
+		if resp, ok := dec.decode(sample); ok {
+			select {
+			case <-ctx.Done():
+				return ctx.Err()
+			case request.Responses <- resp:
+			}
 		}

-		mlx.Unpin(sample)
-		sample = nextSample
-		nextSample = nil
+		mlx.Unpin(sample.Arrays()...)
+		sample, nextSample = nextSample, sampler.Result{}

 		if i%256 == 0 {
 			mlx.ClearCache()
@@ -203,13 +204,57 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	}
 }

-func (r Runner) Decode(sample int32, b *bytes.Buffer) string {
-	token := r.Tokenizer.Decode([]int32{sample})
+// decoder serializes sampled tokens into response chunks, holding bytes
+// whose UTF-8 sequence hasn't completed yet and the logprobs that belong
+// with those bytes so Content and Logprobs stay aligned when a chunk does
+// flush.
+type decoder struct {
+	tokenizer *tokenizer.Tokenizer
+	buf       bytes.Buffer
+	logprobs  []llm.Logprob
+}

-	if _, err := b.WriteString(token); err != nil {
-		slog.Error("Failed to write token to buffer", "error", err)
-		return ""
+func (d *decoder) decode(res sampler.Result) (CompletionResponse, bool) {
+	output := int32(res.Token.Int())
+	d.buf.WriteString(d.tokenizer.Decode([]int32{output}))
+	d.logprobs = append(d.logprobs, buildLogprob(res, d.tokenizer.Decode)...)
+
+	content := flushValidUTF8Prefix(&d.buf)
+	if content == "" {
+		return CompletionResponse{}, false
+	}
+	resp := CompletionResponse{Content: content, Logprobs: d.logprobs}
+	d.logprobs = nil
+	return resp, true
+}
+
+func buildLogprob(sample sampler.Result, decode func([]int32) string) []llm.Logprob {
+	if sample.Logprob == nil {
+		return nil
+	}
+	tok := func(id int32) string { return decode([]int32{id}) }
+
+	out := llm.Logprob{
+		TokenLogprob: llm.TokenLogprob{
+			Token:   tok(int32(sample.Token.Int())),
+			Logprob: float64(sample.Logprob.Floats()[0]),
+		},
 	}

-	return flushValidUTF8Prefix(b)
+	if sample.TopTokens != nil {
+		ids := sample.TopTokens.Ints()
+		vals := sample.TopLogprobs.Floats()
+		pairs := make([]llm.TokenLogprob, len(ids))
+		for i, id := range ids {
+			pairs[i] = llm.TokenLogprob{
+				Token:   tok(int32(id)),
+				Logprob: float64(vals[i]),
+			}
+		}
+		sort.Slice(pairs, func(i, j int) bool {
+			return pairs[i].Logprob > pairs[j].Logprob
+		})
+		out.TopLogprobs = pairs
+	}
+	return []llm.Logprob{out}
 }
--- a/x/mlxrunner/runner.go
+++ b/x/mlxrunner/runner.go
@@ -18,34 +18,20 @@ import (
 	"github.com/ollama/ollama/x/tokenizer"
 )

+// Request is a short-lived struct that carries a completion request through
+// a channel from the HTTP handler to the runner goroutine. The ctx field
+// must travel with the request so that cancellation propagates across the
+// channel boundary.
 type Request struct {
-	TextCompletionsRequest
+	CompletionRequest
 	Responses chan CompletionResponse
-	Pipeline  func(Request) error
-
-	Ctx context.Context
+	Pipeline  func(context.Context, Request) error

+	Ctx     context.Context //nolint:containedctx
+	Tokens  []int32
 	Sampler *sample.Sampler
 }

-type TextCompletionsRequest struct {
-	Prompt  string `json:"prompt"`
-	Options struct {
-		Temperature      float32 `json:"temperature"`
-		TopP             float32 `json:"top_p"`
-		MinP             float32 `json:"min_p"`
-		TopK             int     `json:"top_k"`
-		RepeatLastN      int     `json:"repeat_last_n"`
-		RepeatPenalty    float32 `json:"repeat_penalty"`
-		PresencePenalty  float32 `json:"presence_penalty"`
-		FrequencyPenalty float32 `json:"frequency_penalty"`
-		MaxTokens        int     `json:"max_tokens"`
-
-		// Deprecated: use MaxTokens instead
-		NumPredict int `json:"num_predict"`
-	} `json:"options"`
-}
-
 type Runner struct {
 	Model         base.Model
 	Tokenizer     *tokenizer.Tokenizer
@@ -149,7 +135,7 @@ func (r *Runner) Run(host, port string, mux http.Handler) error {
 			case <-ctx.Done():
 				return nil
 			case request := <-r.Requests:
-				if err := request.Pipeline(request); err != nil {
+				if err := request.Pipeline(request.Ctx, request); err != nil {
 					slog.Info("Request terminated", "error", err)
 					var statusErr api.StatusError
 					if !errors.As(err, &statusErr) {
--- a/x/mlxrunner/sample/logprob_test.go
+++ b/x/mlxrunner/sample/logprob_test.go
@@ -0,0 +1,249 @@
+//go:build mlx
+
+package sample
+
+import (
+	"math"
+	"sort"
+	"testing"
+
+	"github.com/ollama/ollama/x/mlxrunner/mlx"
+)
+
+// logprobEntry is the (token id, logprob) pair returned by the sampler's
+// top-K extraction, used after the test-side descending sort.
+type logprobEntry struct {
+	id      int
+	logprob float64
+}
+
+// runSampleLogprobs drives Sample on a fresh Sampler configured for logprobs
+// and returns the greedily-sampled token id, its logprob, and the top-K
+// entries sorted descending by logprob. Logits must be a [vocab]-shaped
+// slice; the helper reshapes it to [1, vocab] before calling the sampler.
+func runSampleLogprobs(t *testing.T, logits []float32, topK int) (int, float64, []logprobEntry) {
+	t.Helper()
+
+	s := New(Options{Logprobs: true, TopLogprobs: topK})
+	defer func() {
+		s.Free()
+		mlx.Sweep()
+	}()
+
+	tensor := mlx.FromValues(logits, 1, len(logits))
+	res := s.Sample(tensor)
+
+	mlx.Pin(res.Arrays()...)
+	defer mlx.Unpin(res.Arrays()...)
+	mlx.Sweep()
+	mlx.Eval(res.Arrays()...)
+
+	selected := res.Token.Int()
+	selLP := float64(res.Logprob.Floats()[0])
+
+	var top []logprobEntry
+	if topK > 0 && res.TopTokens != nil {
+		ids := res.TopTokens.Ints()
+		vals := res.TopLogprobs.Floats()
+		top = make([]logprobEntry, len(ids))
+		for i, id := range ids {
+			top[i] = logprobEntry{id: id, logprob: float64(vals[i])}
+		}
+		sort.Slice(top, func(i, j int) bool { return top[i].logprob > top[j].logprob })
+	}
+	return selected, selLP, top
+}
+
+func TestSampleLogprobsBasic(t *testing.T) {
+	tests := []struct {
+		name           string
+		logits         []float32
+		topK           int
+		wantSelectedID int
+		wantTopLen     int
+	}{
+		{
+			name:           "single token without top logprobs",
+			logits:         []float32{1.0, 0.5, 0.3, 0.1},
+			topK:           0,
+			wantSelectedID: 0,
+			wantTopLen:     0,
+		},
+		{
+			name:           "single token with top logprobs",
+			logits:         []float32{1.0, 0.5, 0.3, 0.1},
+			topK:           3,
+			wantSelectedID: 0,
+			wantTopLen:     3,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			selected, _, top := runSampleLogprobs(t, tt.logits, tt.topK)
+			if selected != tt.wantSelectedID {
+				t.Errorf("selected = %d, want %d", selected, tt.wantSelectedID)
+			}
+			if len(top) != tt.wantTopLen {
+				t.Errorf("top-K length = %d, want %d", len(top), tt.wantTopLen)
+			}
+		})
+	}
+}
+
+func TestSampleLogprobsNumericalStability(t *testing.T) {
+	logits := []float32{1000.0, 999.0, 998.0}
+	_, selLP, top := runSampleLogprobs(t, logits, 3)
+
+	if math.IsInf(selLP, 0) || math.IsNaN(selLP) {
+		t.Errorf("selected logprob is not finite: %f", selLP)
+	}
+	for i, e := range top {
+		if math.IsInf(e.logprob, 0) || math.IsNaN(e.logprob) {
+			t.Errorf("top[%d] logprob is not finite: %f", i, e.logprob)
+		}
+	}
+	for i := 1; i < len(top); i++ {
+		if top[i].logprob > top[i-1].logprob {
+			t.Errorf("top logprobs not descending: %f > %f", top[i].logprob, top[i-1].logprob)
+		}
+	}
+}
+
+func TestSampleLogprobsProbabilityCorrectness(t *testing.T) {
+	tests := []struct {
+		name   string
+		logits []float32
+	}{
+		{"uniform", []float32{1.0, 1.0, 1.0, 1.0}},
+		{"different", []float32{2.0, 1.0, 0.5, 0.1}},
+		{"negative", []float32{-1.0, -2.0, -3.0, -4.0}},
+		{"mixed", []float32{5.0, -5.0, 0.0, 2.5}},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			selected, selLP, top := runSampleLogprobs(t, tt.logits, len(tt.logits))
+
+			if selLP > 0 {
+				t.Errorf("selected logprob should be <= 0, got %f", selLP)
+			}
+			for i, e := range top {
+				if e.logprob > 0 {
+					t.Errorf("top[%d] logprob should be <= 0, got %f", i, e.logprob)
+				}
+			}
+
+			if tt.name == "uniform" {
+				want := 1.0 / float64(len(tt.logits))
+				got := math.Exp(selLP)
+				if math.Abs(got-want) > 1e-6 {
+					t.Errorf("uniform logits: selected prob = %f, want %f", got, want)
+				}
+			}
+
+			for i := 1; i < len(top); i++ {
+				if top[i].logprob > top[i-1].logprob {
+					t.Errorf("top logprobs not descending at %d: %f > %f",
+						i, top[i].logprob, top[i-1].logprob)
+				}
+			}
+
+			found := false
+			for _, e := range top {
+				if e.id == selected {
+					found = true
+					if math.Abs(e.logprob-selLP) > 1e-6 {
+						t.Errorf("selected logprob mismatch: selLP=%f top=%f", selLP, e.logprob)
+					}
+					break
+				}
+			}
+			if !found {
+				t.Errorf("selected token %d not present in top-K", selected)
+			}
+		})
+	}
+}
+
+func TestSampleLogprobsSoftmaxCorrectness(t *testing.T) {
+	tests := []struct {
+		name   string
+		logits []float32
+	}{
+		{"small vocabulary", []float32{1.0, 2.0, 3.0}},
+		{"large differences", []float32{10.0, 0.0, -10.0}},
+		{"all equal", []float32{5.0, 5.0, 5.0, 5.0, 5.0}},
+		{"very large values", []float32{500.0, 499.0, 498.0}},
+		{"very small values", []float32{-500.0, -499.0, -498.0}},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, _, top := runSampleLogprobs(t, tt.logits, len(tt.logits))
+			if len(top) != len(tt.logits) {
+				t.Fatalf("top-K length = %d, want %d", len(top), len(tt.logits))
+			}
+
+			var sum float64
+			for _, e := range top {
+				p := math.Exp(e.logprob)
+				if p < 0 || p > 1 {
+					t.Errorf("token %d: probability %f out of [0,1]", e.id, p)
+				}
+				sum += p
+			}
+
+			if math.Abs(sum-1.0) > 1e-5 {
+				t.Errorf("probabilities sum = %f, want 1.0", sum)
+			}
+		})
+	}
+}
+
+func TestSampleLogprobsSelectedTokenCorrectness(t *testing.T) {
+	logits := []float32{3.0, 1.0, 2.0, 0.5}
+
+	maxIdx := 0
+	for i, v := range logits[1:] {
+		if v > logits[maxIdx] {
+			maxIdx = i + 1
+		}
+	}
+
+	selected, selLP, top := runSampleLogprobs(t, logits, len(logits))
+
+	if selected != maxIdx {
+		t.Errorf("selected = %d, want argmax %d", selected, maxIdx)
+	}
+
+	if top[0].id != maxIdx {
+		t.Errorf("top[0].id = %d, want argmax %d", top[0].id, maxIdx)
+	}
+	if math.Abs(top[0].logprob-selLP) > 1e-6 {
+		t.Errorf("top[0].logprob = %f, want selected %f", top[0].logprob, selLP)
+	}
+}
+
+func TestSampleLogprobsTopKOrdering(t *testing.T) {
+	// Logits chosen so argmax order differs from index order.
+	logits := []float32{2.0, 5.0, 1.0, 4.0, 3.0}
+	wantOrder := []int{1, 3, 4, 0, 2}
+
+	_, _, top := runSampleLogprobs(t, logits, len(logits))
+
+	if len(top) != len(wantOrder) {
+		t.Fatalf("top-K length = %d, want %d", len(top), len(wantOrder))
+	}
+	for i, e := range top {
+		if e.id != wantOrder[i] {
+			t.Errorf("top[%d].id = %d, want %d", i, e.id, wantOrder[i])
+		}
+	}
+	for i := 1; i < len(top); i++ {
+		if top[i].logprob > top[i-1].logprob {
+			t.Errorf("top[%d].logprob (%f) > top[%d].logprob (%f)",
+				i, top[i].logprob, i-1, top[i-1].logprob)
+		}
+	}
+}
--- a/x/mlxrunner/sample/sample.go
+++ b/x/mlxrunner/sample/sample.go
@@ -8,7 +8,7 @@ import (

 type Transform func(*Sampler, *mlx.Array) *mlx.Array

-type Sampler struct {
+type Options struct {
 	Temperature      float32
 	TopP             float32
 	MinP             float32
@@ -18,45 +18,66 @@ type Sampler struct {
 	PresencePenalty  float32
 	FrequencyPenalty float32

+	// Logprobs causes Sample to populate Result.Logprob with the selected
+	// token's log-probability. TopLogprobs (when > 0 and Logprobs is set) adds top-K pairs.
+	Logprobs    bool
+	TopLogprobs int
+}
+
+type Sampler struct {
+	Options
+
 	history    *mlx.Array
 	historyLen int
 	transforms []Transform
 }

-func New(temp, top_p, min_p float32, top_k, repeatLastN int, repeatPenalty, presencePenalty, frequencyPenalty float32) *Sampler {
-	if repeatPenalty <= 0 {
-		repeatPenalty = 1
+// Result bundles the outputs of one decode step. The logprob tensors are
+// populated only when the sampler is configured to report them.
+type Result struct {
+	Token       *mlx.Array // sampled token id, shape [B]
+	Logprob     *mlx.Array // sampled-token logprob, shape [B,1]; nil unless Logprobs
+	TopTokens   *mlx.Array // top-K token ids, shape [B,K]; nil unless Logprobs is set and TopLogprobs > 0
+	TopLogprobs *mlx.Array // top-K logprobs, shape [B,K]; nil unless Logprobs is set and TopLogprobs > 0
+}
+
+// Arrays returns the tensor fields as a slice so callers can drive the mlx
+// lifecycle verbs (Pin, Unpin, Eval, AsyncEval) over the whole group. Unset
+// fields stay nil; the mlx helpers skip them.
+func (r Result) Arrays() []*mlx.Array {
+	return []*mlx.Array{r.Token, r.Logprob, r.TopTokens, r.TopLogprobs}
+}
+
+func New(opts Options) *Sampler {
+	if opts.RepeatPenalty <= 0 {
+		opts.RepeatPenalty = 1
 	}

-	s := &Sampler{
-		Temperature:      temp,
-		TopP:             top_p,
-		MinP:             min_p,
-		TopK:             top_k,
-		RepeatLastN:      repeatLastN,
-		RepeatPenalty:    repeatPenalty,
-		PresencePenalty:  presencePenalty,
-		FrequencyPenalty: frequencyPenalty,
-	}
+	s := &Sampler{Options: opts}

 	var transforms []Transform
 	if s.usesHistory() {
 		transforms = append(transforms, penalty)
 	}

-	if top_p > 0 && top_p < 1 {
-		transforms = append(transforms, topP)
-	}
-
-	if min_p != 0 {
-		transforms = append(transforms, minP)
-	}
-
-	if top_k > 0 {
+	hasTopP := opts.TopP > 0 && opts.TopP < 1
+	hasTopK := opts.TopK > 0
+	switch {
+	case hasTopP:
+		// topKTopP always does a full descending sort for the top-P
+		// cumulative mask and opportunistically masks top-K during the
+		// same pass when it is also configured.
+		transforms = append(transforms, topKTopP)
+	case hasTopK:
+		// Argpartition (partial sort) is cheaper than a full sort.
 		transforms = append(transforms, topK)
 	}

-	if temp == 0 {
+	if opts.MinP != 0 {
+		transforms = append(transforms, minP)
+	}
+
+	if opts.Temperature == 0 {
 		transforms = append(transforms, greedy)
 	} else {
 		transforms = append(transforms, temperature)
@@ -123,76 +144,121 @@ func (s *Sampler) Free() {
 	s.setHistory(nil, 0)
 }

-func (s *Sampler) Sample(logits *mlx.Array) *mlx.Array {
+// Sample runs the configured transform chain on the raw per-token logits
+// and returns the sampled token id plus, when configured, the reported
+// log-probability tensors for the selected token and the top-K tokens.
+func (s *Sampler) Sample(logits *mlx.Array) Result {
+	scores := logits
 	for _, transform := range s.transforms {
-		logits = transform(s, logits)
+		scores = transform(s, scores)
 	}
-	return logits
-}
+	res := Result{Token: scores}

-func greedy(_ *Sampler, logits *mlx.Array) *mlx.Array {
-	return logits.Argmax(-1, false)
-}
-
-func temperature(s *Sampler, logits *mlx.Array) *mlx.Array {
-	return mlx.DivScalar(logits, s.Temperature).Categorical(-1)
-}
-
-func topP(s *Sampler, logits *mlx.Array) *mlx.Array {
-	if s.TopP <= 0 || s.TopP >= 1 {
-		return logits
+	if s.Logprobs {
+		// Compute log_softmax in fp32 and subtract the max before
+		// logsumexp so the final subtraction stays on small values.
+		// Otherwise it cancels two large numbers and loses precision.
+		lp := logits.AsType(mlx.DTypeFloat32)
+		lp = lp.Subtract(lp.MaxAxis(-1, true))
+		lp = lp.Subtract(lp.Logsumexp(true))
+		res.Logprob = lp.TakeAlongAxis(res.Token.ExpandDims(-1), -1)
+		if k := s.TopLogprobs; k > 0 {
+			if vocab := lp.Dim(lp.NumDims() - 1); k > vocab {
+				k = vocab
+			}
+			// Argpartition on the negated values places the K largest
+			// (unsorted) in positions [0:K].
+			idx := lp.Negative().ArgpartitionAxis(k-1, -1).Slice(mlx.Slice(), mlx.Slice(0, k))
+			res.TopTokens = idx.AsType(mlx.DTypeInt32)
+			res.TopLogprobs = lp.TakeAlongAxis(idx, -1)
+		}
 	}
+	return res
+}

-	order := logits.Negative().ArgsortAxis(-1)
-	sortedLogits := logits.TakeAlongAxis(order, -1)
-	sortedProbs := mlx.SoftmaxAxis(sortedLogits, -1, true)
-	prevCumProbs := sortedProbs.Cumsum(-1, false, true).Subtract(sortedProbs)
+func greedy(_ *Sampler, scores *mlx.Array) *mlx.Array {
+	return scores.Argmax(-1, false)
+}
+
+func temperature(s *Sampler, scores *mlx.Array) *mlx.Array {
+	return mlx.DivScalar(scores, s.Temperature).Categorical(-1)
+}
+
+// topKTopP applies top-P in a descending sort pass and, when top-K is also
+// configured, masks every position past the K-th in that same sort order.
+// Callers dispatch here whenever top-P is enabled — the top-K-only case
+// uses a cheaper partial sort via the topK transform.
+func topKTopP(s *Sampler, scores *mlx.Array) *mlx.Array {
+	vocab := scores.Dim(scores.NumDims() - 1)
+	applyTopK := s.TopK > 0 && s.TopK < vocab
+
+	order := scores.Negative().ArgsortAxis(-1)
+	sorted := scores.TakeAlongAxis(order, -1)
+	negInf := mlx.FromValue(float32(math.Inf(-1)))
+
+	// Top-P: in descending order, keep tokens whose exclusive cumulative
+	// probability is still below s.TopP.
+	probs := mlx.SoftmaxAxis(sorted, -1, true)
+	prevCumProbs := probs.Cumsum(-1, false, true).Subtract(probs)
 	keep := prevCumProbs.Less(mlx.FromValue(s.TopP))
-	filtered := mlx.Where(keep, sortedLogits, mlx.FromValue(float32(math.Inf(-1))))
-	return logits.PutAlongAxis(order, filtered, -1)
-}
+	sorted = mlx.Where(keep, sorted, negInf)

-func minP(s *Sampler, logits *mlx.Array) *mlx.Array {
-	if s.MinP <= 0 || s.MinP > 1 {
-		return logits
+	out := scores.PutAlongAxis(order, sorted, -1)
+
+	// Top-K: sorted is already in descending order, so positions [K, V)
+	// are the ones to drop. Scatter -inf through their original-layout
+	// indices (order[K:]). Positional (not value-based) so exactly K
+	// tokens survive — ties at the K-th logit get broken by the sort
+	// order rather than promoted through the filter.
+	if applyTopK {
+		dropOrder := order.Slice(mlx.Slice(), mlx.Slice(s.TopK, mlx.End))
+		out = out.PutAlongAxis(dropOrder, negInf, -1)
 	}

-	maxLogits := logits.TakeAlongAxis(logits.Argmax(-1, true), -1)
-	minLogits := mlx.AddScalar(maxLogits, float32(math.Log(float64(s.MinP))))
+	return out
+}
+
+func minP(s *Sampler, scores *mlx.Array) *mlx.Array {
+	if s.MinP <= 0 || s.MinP > 1 {
+		return scores
+	}
+
+	maxScore := scores.MaxAxis(-1, true)
+	threshold := mlx.AddScalar(maxScore, float32(math.Log(float64(s.MinP))))

 	return mlx.Where(
-		logits.Less(minLogits),
+		scores.Less(threshold),
 		mlx.FromValue(float32(math.Inf(-1))),
-		logits,
+		scores,
 	)
 }

-func topK(s *Sampler, logits *mlx.Array) *mlx.Array {
+func topK(s *Sampler, scores *mlx.Array) *mlx.Array {
 	if s.TopK <= 0 {
-		return logits
+		return scores
 	}

-	vocab := logits.Dim(logits.NumDims() - 1)
+	vocab := scores.Dim(scores.NumDims() - 1)
 	if s.TopK >= vocab {
-		return logits
+		return scores
 	}

-	mask := logits.Negative().ArgpartitionAxis(s.TopK-1, -1).Slice(mlx.Slice(), mlx.Slice(s.TopK, mlx.End))
-	return logits.PutAlongAxis(mask, mlx.FromValue(float32(math.Inf(-1))), -1)
+	mask := scores.Negative().ArgpartitionAxis(s.TopK-1, -1).Slice(mlx.Slice(), mlx.Slice(s.TopK, mlx.End))
+	return scores.PutAlongAxis(mask, mlx.FromValue(float32(math.Inf(-1))), -1)
 }

-func penalty(s *Sampler, logits *mlx.Array) *mlx.Array {
+func penalty(s *Sampler, scores *mlx.Array) *mlx.Array {
 	if s.historyLen == 0 {
-		return logits
+		return scores
 	}

 	tokenIndices := s.history
-	if logits.NumDims() > 1 {
+	if scores.NumDims() > 1 {
 		tokenIndices = tokenIndices.ExpandDims(0)
 	}

 	if s.RepeatPenalty != 1 || s.PresencePenalty != 0 {
-		adjusted := logits.TakeAlongAxis(tokenIndices, -1)
+		adjusted := scores.TakeAlongAxis(tokenIndices, -1)
 		if s.RepeatPenalty != 1 {
 			factor := mlx.Where(
 				adjusted.Less(mlx.FromValue(float32(0))),
@@ -204,12 +270,12 @@ func penalty(s *Sampler, logits *mlx.Array) *mlx.Array {
 		if s.PresencePenalty != 0 {
 			adjusted = mlx.AddScalar(adjusted, -s.PresencePenalty)
 		}
-		logits = logits.PutAlongAxis(tokenIndices, adjusted, -1)
+		scores = scores.PutAlongAxis(tokenIndices, adjusted, -1)
 	}

 	if s.FrequencyPenalty != 0 {
-		logits = logits.ScatterAddAxis(tokenIndices, mlx.FromValue(-s.FrequencyPenalty), -1)
+		scores = scores.ScatterAddAxis(tokenIndices, mlx.FromValue(-s.FrequencyPenalty), -1)
 	}

-	return logits
+	return scores
 }
--- a/x/mlxrunner/sample/sample_test.go
+++ b/x/mlxrunner/sample/sample_test.go
@@ -10,8 +10,7 @@ import (
 )

 func TestPresencePenaltyUsesAppendedTokenImmediately(t *testing.T) {
-	// RepeatLastN = 1, PresencePenalty = 6
-	s := New(0, 0, 0, 0, 1, 1, 6, 0)
+	s := New(Options{RepeatLastN: 1, PresencePenalty: 6})
 	defer func() {
 		s.Free()
 		mlx.Sweep()
@@ -21,7 +20,7 @@ func TestPresencePenaltyUsesAppendedTokenImmediately(t *testing.T) {
 	s.AppendToken(mlx.NewArrayInt32([]int32{1}, []int32{1}))

 	logits := mlx.FromValues([]float32{0, 5, 4}, 3)
-	got := s.Sample(logits)
+	got := s.Sample(logits).Token
 	mlx.Eval(got)

 	// logits will be [0, -1, 4] after the penalty
@@ -33,7 +32,7 @@ func TestPresencePenaltyUsesAppendedTokenImmediately(t *testing.T) {
 }

 func TestRepeatPenaltyUsesHistoryWithoutPresencePenalty(t *testing.T) {
-	s := New(0, 0, 0, 0, 1, 2, 0, 0)
+	s := New(Options{RepeatLastN: 1, RepeatPenalty: 2})
 	defer func() {
 		s.Free()
 		mlx.Sweep()
@@ -42,7 +41,7 @@ func TestRepeatPenaltyUsesHistoryWithoutPresencePenalty(t *testing.T) {
 	s.ResetHistory([]int32{1})

 	logits := mlx.FromValues([]float32{0, 5, 4}, 3)
-	got := s.Sample(logits)
+	got := s.Sample(logits).Token
 	mlx.Eval(got)

 	// token 1 is repeated and positive, so 5 / 2 falls below token 2.
@@ -53,7 +52,7 @@ func TestRepeatPenaltyUsesHistoryWithoutPresencePenalty(t *testing.T) {
 }

 func TestFrequencyPenaltyUsesTokenCounts(t *testing.T) {
-	s := New(0, 0, 0, 0, 4, 1, 0, 2)
+	s := New(Options{RepeatLastN: 4, FrequencyPenalty: 2})
 	defer func() {
 		s.Free()
 		mlx.Sweep()
@@ -62,7 +61,7 @@ func TestFrequencyPenaltyUsesTokenCounts(t *testing.T) {
 	s.ResetHistory([]int32{1, 1})

 	logits := mlx.FromValues([]float32{0, 5, 4}, 3)
-	got := s.Sample(logits)
+	got := s.Sample(logits).Token
 	mlx.Eval(got)

 	// token 1 appears twice, so 5 - (2 * 2) falls below token 2.
@@ -73,7 +72,7 @@ func TestFrequencyPenaltyUsesTokenCounts(t *testing.T) {
 }

 func TestMinPMasksTokensBelowThreshold(t *testing.T) {
-	s := New(0, 0, 0.5, 0, 0, 1, 0, 0)
+	s := New(Options{MinP: 0.5})
 	defer func() {
 		s.Free()
 		mlx.Sweep()
--- a/x/mlxrunner/server.go
+++ b/x/mlxrunner/server.go
@@ -2,7 +2,6 @@ package mlxrunner

 import (
 	"bytes"
-	"cmp"
 	"context"
 	"encoding/json"
 	"flag"
@@ -87,25 +86,30 @@ func Execute(args []string) error {
 	mux.HandleFunc("POST /v1/completions", func(w http.ResponseWriter, r *http.Request) {
 		request := Request{Responses: make(chan CompletionResponse)}

-		if err := json.NewDecoder(r.Body).Decode(&request.TextCompletionsRequest); err != nil {
+		if err := json.NewDecoder(r.Body).Decode(&request.CompletionRequest); err != nil {
 			slog.Error("Failed to decode request", "error", err)
 			http.Error(w, "Bad Request", http.StatusBadRequest)
 			return
 		}

-		request.Options.MaxTokens = cmp.Or(request.Options.MaxTokens, request.Options.NumPredict)
-
 		request.Pipeline = runner.TextGenerationPipeline
-		request.Sampler = sample.New(
-			request.Options.Temperature,
-			request.Options.TopP,
-			request.Options.MinP,
-			request.Options.TopK,
-			request.Options.RepeatLastN,
-			request.Options.RepeatPenalty,
-			request.Options.PresencePenalty,
-			request.Options.FrequencyPenalty,
-		)
+		request.Sampler = sample.New(sample.Options{
+			Temperature:      request.Options.Temperature,
+			TopP:             request.Options.TopP,
+			MinP:             request.Options.MinP,
+			TopK:             request.Options.TopK,
+			RepeatLastN:      request.Options.RepeatLastN,
+			RepeatPenalty:    request.Options.RepeatPenalty,
+			PresencePenalty:  request.Options.PresencePenalty,
+			FrequencyPenalty: request.Options.FrequencyPenalty,
+			Logprobs:         request.Logprobs,
+			TopLogprobs:      request.TopLogprobs,
+		})
+
+		if err := runner.Prepare(&request); err != nil {
+			http.Error(w, err.Error(), http.StatusBadRequest)
+			return
+		}

 		var cancel context.CancelFunc
 		request.Ctx, cancel = context.WithCancel(r.Context())
--- a/x/models/gemma4/gemma4_moe_test.go
+++ b/x/models/gemma4/gemma4_moe_test.go
@@ -144,6 +144,8 @@ func TestRouterForwardMatchesLegacy(t *testing.T) {

 	gotScores, gotInds := r.Forward(x, cfg)
 	wantScores, wantInds := legacyRouterForward(r, x, cfg)
+	gotInds = gotInds.AsType(mlx.DTypeInt32)
+	wantInds = wantInds.AsType(mlx.DTypeInt32)
 	mlx.Eval(gotScores, gotInds, wantScores, wantInds)

 	if got, want := gotInds.Ints(), wantInds.Ints(); !intSlicesEqual(got, want) {
--- a/x/models/nn/nn_test.go
+++ b/x/models/nn/nn_test.go
@@ -169,8 +169,8 @@ func TestQuantizedLinearMXFP4MatchesDequantizedWeight(t *testing.T) {
 	dequantizedWeight := mlx.Dequantize(ql.Weight, ql.Scales, ql.QBiases, 32, 4, "mxfp4")
 	mlx.Eval(dequantizedWeight)

-	qOut := ql.Forward(input)
-	dOut := NewLinear(dequantizedWeight, nil).Forward(input)
+	qOut := ql.Forward(input).AsType(mlx.DTypeFloat32)
+	dOut := NewLinear(dequantizedWeight, nil).Forward(input).AsType(mlx.DTypeFloat32)
 	mlx.Eval(qOut, dOut)

 	got := qOut.Floats()
Author	SHA1	Message	Date
Parth Sareen	ea01af6f76	openai: map responses reasoning effort to think (#15789 )	2026-04-24 02:49:36 -07:00
Parth Sareen	c2ebb4d57c	api: accept "max" as a think value (#15787 )	2026-04-24 01:49:39 -07:00
Parth Sareen	590109c835	launch: harden OpenClaw onboarding flow (#15777 )	2026-04-23 16:47:20 -07:00
Eva H	b4442c6d17	launch: resave managed integration config when live config drifts (#15776 )	2026-04-23 19:32:36 -04:00
Eva H	85ff8e4a21	launch: keep launch recommended models in a fixed canonical order (#15750 )	2026-04-23 16:33:00 -04:00
Parth Sareen	160660e572	launch: use bundled OpenClaw ollama web search (#15757 )	2026-04-22 16:34:19 -07:00
madflow	3b43b9bc4b	docs: update structured outputs doc for cloud (#15733 ) --------- Co-authored-by: Parth Sareen <parth.sareen@ollama.com>	2026-04-22 00:42:39 -07:00
Parth Sareen	21883571b7	launch: replace kimi-k2.5 with k2.6 as top recommended model (#15737 )	2026-04-21 15:13:20 -07:00
Jesse Gross	ce99f24731	mlxrunner: tokenize prompts in request handler goroutines Move tokenization out of the single GPU processing goroutine and into each request's HTTP handler goroutine. This allows the next request's prompt to be tokenized on the CPU while the current request is executing on the GPU.	2026-04-21 14:38:49 -07:00
Jesse Gross	04f5f0cdb4	mlx: improve thread safety of array management Use atomic.Int32 for Array.pinned and a sync.Mutex for the global arrays slice so MLX arrays can be created and pinned from multiple goroutines without racing on those structures. Convert Array value receivers to pointer receivers and struct fields from Array to *Array to avoid copying the atomic. This does not fully achieve thread safety even when building completely independent graphs. The tracing flag and traceScratch slice in compile.go are unprotected, so concurrent Compile calls will race. MLX itself is not fully thread-safe either although it is working to improve.	2026-04-21 14:38:49 -07:00
Matteo Celani	fb36a01ffe	app/ui: fix model picker showing stale model after switching chats (#15280 ) * app/ui: fix model picker showing stale model after switching chats Optimistic messages created during streaming were storing the full Model object instead of the model name string. When switching back to a chat with cached streaming data, the restore effect read an object where it expected a string, causing the model picker to fail matching and remain stuck on the previous chat's model. * app/ui: fix two more instances of Model object passed as model name Fix the same bug at lines 523 and 536 in the assistant_with_tools event handler, where selectedModel (object) was used instead of selectedModel.model (string).	2026-04-21 15:08:06 -04:00
Michael Verrilli	0c65ed33bc	cmd: populate model capabilities in launchInteractiveModel (#15712 ) launchInteractiveModel was introduced in PR #14609 without the client.Show() capability-detection block that RunHandler uses. This left opts.MultiModal always false in the TUI path, causing image/audio file paths to always be treated as unknown commands instead of being loaded as multimodal attachments. Mirror the Show() call, pull-on-404 fallback, cloud auth handling, and MultiModal/Think population from RunHandler into launchInteractiveModel. Fixes #15711	2026-04-21 14:37:36 -04:00
Jesse Gross	22d6c817f8	mlxrunner: fuse top-P and top-K into a single sort pass When both filters are active, avoid paying for a full sort in top-P and a partial sort in top-K. Single-filter paths are unchanged. Improves generation throughput on gemma4:e4b by 1.5%.	2026-04-20 17:43:00 -07:00
Jesse Gross	ca01373b28	mlxrunner: use MaxAxis in the min-P sampler One reduction op instead of Argmax + TakeAlongAxis.	2026-04-20 17:43:00 -07:00
Jesse Gross	24e038d56a	mlxrunner: add logprobs support Match the ollamarunner and OpenAI semantics: raw, full-vocab log-softmax with the top-K ranked by probability. Skipped on the GPU when the request doesn't ask for logprobs so decode doesn't pay for it otherwise.	2026-04-20 17:43:00 -07:00