From b00bd1dfd4e9c8cf012eb8a1d2e406565f00d13e Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Fri, 27 Mar 2026 15:01:33 -0700 Subject: [PATCH] launch: skip context length warning for MLX models and show model name (#15102) --- cmd/launch/launch_test.go | 33 ++++++++++++++++++++++++++++++--- cmd/launch/models.go | 28 ++++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 5 deletions(-) diff --git a/cmd/launch/launch_test.go b/cmd/launch/launch_test.go index dfb8a9497..7e1146454 100644 --- a/cmd/launch/launch_test.go +++ b/cmd/launch/launch_test.go @@ -1506,6 +1506,7 @@ func TestConfirmLowContextLength(t *testing.T) { statusBody string statusCode int showParams string // Parameters field returned by /api/show + showBody string // full JSON body for /api/show (overrides showParams when set) wantWarning bool wantModelfile bool // true if warning should mention Modelfile }{ @@ -1596,6 +1597,28 @@ func TestConfirmLowContextLength(t *testing.T) { statusCode: http.StatusOK, showParams: "SHOW_ERROR", // sentinel to make show return 500 }, + { + name: "no warning for safetensors model with high context length", + models: []string{"qwen3.5"}, + statusBody: `{"cloud":{},"context_length":32768}`, + statusCode: http.StatusOK, + showBody: `{"details":{"format":"safetensors"},"model_info":{"qwen3_5_moe.context_length":262144}}`, + }, + { + name: "warns for safetensors model with low context length", + models: []string{"small-model"}, + statusBody: `{"cloud":{},"context_length":32768}`, + statusCode: http.StatusOK, + showBody: `{"details":{"format":"safetensors"},"model_info":{"small.context_length":4096}}`, + wantWarning: true, + }, + { + name: "no warning for safetensors model even when server context is low", + models: []string{"qwen3.5"}, + statusBody: `{"cloud":{},"context_length":4096}`, + statusCode: http.StatusOK, + showBody: `{"details":{"format":"safetensors"},"model_info":{"qwen3_5_moe.context_length":262144}}`, + }, } for _, tt := range tests { @@ -1614,7 
+1637,11 @@ func TestConfirmLowContextLength(t *testing.T) { return } w.WriteHeader(http.StatusOK) - fmt.Fprintf(w, `{"parameters":%q}`, tt.showParams) + if tt.showBody != "" { + fmt.Fprint(w, tt.showBody) + } else { + fmt.Fprintf(w, `{"parameters":%q}`, tt.showParams) + } return } http.NotFound(w, r) @@ -1648,12 +1675,12 @@ func TestConfirmLowContextLength(t *testing.T) { t.Fatalf("expected warning=%v, got output: %q", tt.wantWarning, output) } if tt.wantWarning && tt.wantModelfile { - if !strings.Contains(output, "Use the model:") { + if !strings.Contains(output, "Use the base model") { t.Fatalf("expected parent model hint in output: %q", output) } } if tt.wantWarning && !tt.wantModelfile { - if strings.Contains(output, "Use the model:") { + if strings.Contains(output, "Use the base model") { t.Fatalf("expected server hint, not parent model hint in output: %q", output) } } diff --git a/cmd/launch/models.go b/cmd/launch/models.go index ab2b21e3e..7452fc2cd 100644 --- a/cmd/launch/models.go +++ b/cmd/launch/models.go @@ -479,16 +479,24 @@ func lowContextLength(ctx context.Context, client *api.Client, models []string) modelfileOverride := false var info *api.ShowResponse if info, err = client.Show(ctx, &api.ShowRequest{Model: m}); err == nil { + // Safetensors (MLX) models always load at their full max context + // length, so the server default num_ctx doesn't apply. 
+			if info.Details.Format == "safetensors" {
+				// Context length check in case models with low context length are added
+				if modelCtx := modelInfoContextLength(info.ModelInfo); modelCtx >= recommendedContextLength {
+					continue
+				}
+			}
 			if numCtx := parseNumCtx(info.Parameters); numCtx > 0 {
 				effectiveCtx = numCtx
 				modelfileOverride = true
 			}
 		}
 		if effectiveCtx < recommendedContextLength {
-			fmt.Fprintf(os.Stderr, "\n%sWarning: context window is %d tokens (recommended: %d+)%s\n", ansiYellow, effectiveCtx, recommendedContextLength, ansiReset)
+			fmt.Fprintf(os.Stderr, "\n%sWarning: %s has a context length of %d tokens, which is below the recommended %d.%s\n", ansiYellow, m, effectiveCtx, recommendedContextLength, ansiReset)
 			if modelfileOverride {
 				parentModel := info.Details.ParentModel
-				fmt.Fprintf(os.Stderr, "%sUse the model: %s and increase the context length to at least %d in Ollama App Settings.%s\n\n", ansiYellow, parentModel, recommendedContextLength, ansiReset)
+				fmt.Fprintf(os.Stderr, "%sUse the base model %s and increase the context length in Ollama App Settings.%s\n\n", ansiYellow, parentModel, ansiReset)
 			} else {
 				if runtime.GOOS == "windows" {
 					fmt.Fprintf(os.Stderr, "%sIncrease it in Ollama App Settings or with $env:OLLAMA_CONTEXT_LENGTH=%d; ollama serve%s\n\n", ansiYellow, recommendedContextLength, ansiReset)
@@ -515,6 +523,22 @@ func parseNumCtx(parameters string) int {
 	return 0
 }
 
+// modelInfoContextLength returns the largest positive "*.context_length" value in
+// modelInfo (e.g. "qwen3_5_moe.context_length" → 262144), or 0 if there is none.
+func modelInfoContextLength(modelInfo map[string]any) int {
+	best := 0
+	for k, v := range modelInfo {
+		n, _ := v.(int) // n stays 0 for any non-int value
+		if f, ok := v.(float64); ok { // encoding/json decodes JSON numbers as float64
+			n = int(f)
+		}
+		if n > best && strings.HasSuffix(k, ".context_length") {
+			best = n // keep the max, not the first hit: map iteration order is random
+		}
+	}
+	return best
+}
+
 // TODO(parthsareen): this duplicates the pull progress UI in cmd.PullHandler.
 // Move the shared pull rendering to a small utility once the package boundary settles.
func pullModel(ctx context.Context, client *api.Client, model string, insecure bool) error {