bugfix: fix crash bug in token cache logic

This change fixes a problem in the token cache logic that caused panics on empty token arrays: the relevant function now ensures at least one token remains on full cache hits. This happens if there is an exact match in the cache on subsequent generations.
model: fix qwen3 tool calling in thinking (#14477)
2026-04-18 13:54:11 +02:00 · 2026-02-26 18:35:44 -08:00 · 2026-02-26 16:13:18 -08:00
5 changed files with 119 additions and 16 deletions
--- a/model/parsers/qwen3.go
+++ b/model/parsers/qwen3.go
@@ -204,6 +204,24 @@ func (p *Qwen3Parser) eat() ([]qwen3Event, bool) {
 			p.maybeThinkingOpenAtBOL = false
 		}
 		thinkingCloseIdx := strings.Index(acc, qwen3ThinkingCloseTag)
 		toolOpenIdx := strings.Index(acc, qwen3ToolOpenTag)
 		// If a tool call starts before </think>, treat that as the end of thinking
 		// for parsing purposes and continue in tool-call mode.
 		if toolOpenIdx != -1 && (thinkingCloseIdx == -1 || toolOpenIdx < thinkingCloseIdx) {
 			before, after := p.splitAtTag(qwen3ToolOpenTag, true)
 			if len(before) > 0 {
 				events = append(events, qwen3EventThinkingContent{content: before})
 			}
 			if after == "" {
 				p.state = qwen3ParserStateToolStartedEatingWhitespace
 			} else {
 				p.state = qwen3ParserStateCollectingToolContent
 			}
 			return events, true
 		}
 		if strings.Contains(acc, qwen3ThinkingCloseTag) {
 			thinking, remaining := p.splitAtTag(qwen3ThinkingCloseTag, true)
 			if len(thinking) > 0 {
@@ -215,7 +233,7 @@ func (p *Qwen3Parser) eat() ([]qwen3Event, bool) {
 				p.state = qwen3ParserStateCollectingContent
 			}
 			return events, true
-		} else if overlapLen := overlap(acc, qwen3ThinkingCloseTag); overlapLen > 0 {
+		} else if overlapLen := max(overlap(acc, qwen3ThinkingCloseTag), overlap(acc, qwen3ToolOpenTag)); overlapLen > 0 {
 			beforePartialTag := acc[:len(acc)-overlapLen]
 			trailingWsLen := trailingWhitespaceLen(beforePartialTag)
 			ambiguousStart := len(beforePartialTag) - trailingWsLen
--- a/model/parsers/qwen3_test.go
+++ b/model/parsers/qwen3_test.go
@@ -146,6 +146,68 @@ func TestQwen3ParserToolCall(t *testing.T) {
 	}
 }
 // TestQwen3ParserThinkingWithToolCallBeforeThinkingClose feeds a thinking-enabled
 // parser a single chunk in which <tool_call> follows the thinking text directly,
 // with no </think> anywhere in the input. It expects the text before the tag to
 // be reported as thinking, no regular content, and exactly one parsed tool call.
 func TestQwen3ParserThinkingWithToolCallBeforeThinkingClose(t *testing.T) {
 	parser := &Qwen3Parser{hasThinkingSupport: true, defaultThinking: true}
 	parser.Init(nil, nil, &api.ThinkValue{Value: true})
 	// The input never closes the thinking section before the tool call begins.
 	input := "Let me think<tool_call>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"San Francisco\",\"unit\":\"celsius\"}}</tool_call>"
 	// NOTE(review): the second argument is presumably the "done"/final-chunk
 	// flag — confirm against the Add signature.
 	content, thinking, calls, err := parser.Add(input, true)
 	if err != nil {
 		t.Fatalf("parse failed: %v", err)
 	}
 	// Nothing in the input should be classified as regular content.
 	if content != "" {
 		t.Fatalf("expected empty content, got %q", content)
 	}
 	// Only the text preceding <tool_call> counts as thinking.
 	if thinking != "Let me think" {
 		t.Fatalf("expected thinking %q, got %q", "Let me think", thinking)
 	}
 	if len(calls) != 1 {
 		t.Fatalf("expected 1 tool call, got %d", len(calls))
 	}
 	if calls[0].Function.Name != "get_weather" {
 		t.Fatalf("expected tool name %q, got %q", "get_weather", calls[0].Function.Name)
 	}
 }
 // TestQwen3ParserThinkingWithSplitToolOpenTag covers the streaming case where
 // the <tool_call> opening tag is split across two Add calls while the input has
 // not yet contained a </think>. The first chunk ends in the partial tag
 // "<tool_ca"; the second chunk completes the tag and carries the tool call body.
 func TestQwen3ParserThinkingWithSplitToolOpenTag(t *testing.T) {
 	parser := &Qwen3Parser{hasThinkingSupport: true, defaultThinking: true}
 	parser.Init(nil, nil, &api.ThinkValue{Value: true})
 	// First chunk: thinking text followed by an incomplete tool-open tag.
 	// NOTE(review): the second argument is presumably the "done"/final-chunk
 	// flag — confirm against the Add signature.
 	content, thinking, calls, err := parser.Add("Let me think<tool_ca", false)
 	if err != nil {
 		t.Fatalf("parse failed on first chunk: %v", err)
 	}
 	// The text before the partial tag is expected as thinking; the partial tag
 	// itself must not surface as content or thinking, and no call exists yet.
 	if content != "" || thinking != "Let me think" || len(calls) != 0 {
 		t.Fatalf(
 			"expected content=%q thinking=%q calls=%d, got content=%q thinking=%q calls=%d",
 			"",
 			"Let me think",
 			0,
 			content,
 			thinking,
 			len(calls),
 		)
 	}
 	// Second chunk: the remainder of the tag plus the complete tool call.
 	content, thinking, calls, err = parser.Add("ll>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"SF\"}}</tool_call>", true)
 	if err != nil {
 		t.Fatalf("parse failed on second chunk: %v", err)
 	}
 	if content != "" {
 		t.Fatalf("expected empty content, got %q", content)
 	}
 	// The fragments of the split tag must not be emitted as thinking text.
 	if thinking != "" {
 		t.Fatalf("expected no additional thinking on second chunk, got %q", thinking)
 	}
 	if len(calls) != 1 {
 		t.Fatalf("expected 1 tool call, got %d", len(calls))
 	}
 	if calls[0].Function.Name != "get_weather" {
 		t.Fatalf("expected tool name %q, got %q", "get_weather", calls[0].Function.Name)
 	}
 }
 func TestQwen35ParserRespectsNoThink(t *testing.T) {
 	parser := ParserForName("qwen3.5")
 	if parser == nil {
--- a/model/parsers/qwen3vl.go
+++ b/model/parsers/qwen3vl.go
@@ -180,7 +180,22 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
 			return events, false
 		}
 	case CollectingThinkingContent:
-		if strings.Contains(p.buffer.String(), thinkingCloseTag) {
+		acc := p.buffer.String()
 		thinkingCloseIdx := strings.Index(acc, thinkingCloseTag)
 		toolOpenIdx := strings.Index(acc, toolOpenTag)
 		// If a tool call starts before </think>, treat that as the end of thinking
 		// for parsing purposes and continue in tool-call mode.
 		if toolOpenIdx != -1 && (thinkingCloseIdx == -1 || toolOpenIdx < thinkingCloseIdx) {
 			before, _ := splitAtTag(&p.buffer, toolOpenTag, false)
 			if len(before) > 0 {
 				events = append(events, qwenEventThinkingContent{content: before})
 			}
 			p.state = CollectingToolContent
 			return events, true
 		}
 		if strings.Contains(acc, thinkingCloseTag) {
 			thinking, remaining := splitAtTag(&p.buffer, thinkingCloseTag, true)
 			if len(thinking) > 0 {
 				events = append(events, qwenEventThinkingContent{content: thinking})
@@ -191,13 +206,13 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
 				p.state = CollectingContent
 			}
 			return events, true
-		} else if overlapLen := overlap(p.buffer.String(), thinkingCloseTag); overlapLen > 0 {
+		} else if overlapLen := max(overlap(acc, thinkingCloseTag), overlap(acc, toolOpenTag)); overlapLen > 0 {
-			beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
+			beforePartialTag := acc[:len(acc)-overlapLen]
 			trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
 			ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
-			unambiguous := p.buffer.String()[:ambiguousStart]
+			unambiguous := acc[:ambiguousStart]
-			ambiguous := p.buffer.String()[ambiguousStart:]
+			ambiguous := acc[ambiguousStart:]
 			p.buffer.Reset()
 			p.buffer.WriteString(ambiguous)
 			if len(unambiguous) > 0 {
@@ -205,11 +220,11 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
 			}
 			return events, false
 		} else {
-			whitespaceLen := trailingWhitespaceLen(p.buffer.String())
+			whitespaceLen := trailingWhitespaceLen(acc)
-			ambiguousStart := len(p.buffer.String()) - whitespaceLen
+			ambiguousStart := len(acc) - whitespaceLen
-			unambiguous := p.buffer.String()[:ambiguousStart]
+			unambiguous := acc[:ambiguousStart]
-			ambiguous := p.buffer.String()[ambiguousStart:]
+			ambiguous := acc[ambiguousStart:]
 			p.buffer.Reset()
 			p.buffer.WriteString(ambiguous)
 			if len(unambiguous) > 0 {
--- a/model/parsers/qwen3vl_thinking_test.go
+++ b/model/parsers/qwen3vl_thinking_test.go
@@ -98,8 +98,12 @@ func TestQwen3VLThinkingParserStreaming(t *testing.T) {
 			desc: "nested thinking and tool call (outside thinking, inside tool call)",
 			steps: []step{
 				{
-					input:      "I'm thinking<tool_call>I'm nested tool call</tool_call></think>",
+					input: "I'm thinking<tool_call>I'm nested tool call</tool_call></think>",
-					wantEvents: []qwenEvent{qwenEventThinkingContent{content: "I'm thinking<tool_call>I'm nested tool call</tool_call>"}},
+					wantEvents: []qwenEvent{
 						qwenEventThinkingContent{content: "I'm thinking"},
 						qwenEventRawToolCall{raw: "I'm nested tool call"},
 						qwenEventContent{content: "</think>"},
 					},
 				},
 			},
 		},
@@ -109,8 +113,7 @@ func TestQwen3VLThinkingParserStreaming(t *testing.T) {
 				{
 					input: "<tool_call>I'm nested tool call<think>I'm thinking</think></tool_call>",
 					wantEvents: []qwenEvent{
-						qwenEventThinkingContent{content: "<tool_call>I'm nested tool call<think>I'm thinking"},
+						qwenEventRawToolCall{raw: "I'm nested tool call<think>I'm thinking</think>"},
 						qwenEventContent{content: "</tool_call>"},
 					},
 				},
 			},
@@ -121,8 +124,8 @@ func TestQwen3VLThinkingParserStreaming(t *testing.T) {
 				{
 					input: "I'm thinking<tool_call>I'm NOT a nested tool call</think></tool_call><tool_call>I'm nested tool call 2<think></tool_call></think>",
 					wantEvents: []qwenEvent{
-						qwenEventThinkingContent{content: "I'm thinking<tool_call>I'm NOT a nested tool call"},
+						qwenEventThinkingContent{content: "I'm thinking"},
-						qwenEventContent{content: "</tool_call>"},
+						qwenEventRawToolCall{raw: "I'm NOT a nested tool call</think>"},
 						qwenEventRawToolCall{raw: "I'm nested tool call 2<think>"},
 						qwenEventContent{content: "</think>"},
 					},
--- a/x/mlxrunner/cache.go
+++ b/x/mlxrunner/cache.go
@@ -78,6 +78,11 @@ func (c *kvCache) findRemaining(tokens []int32) []int32 {
 		prefix++
 	}
 	if prefix == len(tokens) && prefix > 0 {
 		// Leave one token to run through the model so we can sample a response.
 		prefix--
 	}
 	if prefix < len(c.tokens) {
 		trim := len(c.tokens) - prefix
 		for _, kv := range c.caches {