var (
	// gemma4QuotedStringRe matches one Gemma-quoted string, <|"|>...<|"|>, and
	// captures its raw content. (?s) lets the content span newlines; the lazy
	// quantifier ends the string at the first closing token.
	gemma4QuotedStringRe = regexp.MustCompile(`(?s)<\|"\|>(.*?)<\|"\|>`)

	// gemma4BareKeyRe matches an unquoted object key that directly follows
	// "{" or "," (no intervening whitespace) and is terminated by ":".
	gemma4BareKeyRe = regexp.MustCompile(`([,{])(\w+):`)
)

// gemma4ArgsToJSON converts Gemma 4's custom argument format to JSON text.
//
// Two rewrites are applied:
//
//   - Every <|"|>...<|"|> string becomes a JSON string literal. Its content is
//     taken literally and escaped with json.Marshal, so backslashes are data,
//     not escape introducers: <|"|>C:\\Temp<|"|> becomes "C:\\\\Temp".
//   - Outside those strings, every bare key directly after "{" or "," is
//     wrapped in double quotes: {n:1,b:true} becomes {"n":1,"b":true}.
//
// Everything else is copied through unchanged. In particular, raw
// double-quoted text gets no special treatment: the key rewrite applies inside
// it as well, so {q:"a,b:c"} yields text that is not valid JSON. The result is
// therefore only valid JSON when the input is well formed; callers are expected
// to parse it and handle the error.
//
// The conversion is a single left-to-right pass. Quoted strings are emitted in
// place rather than being swapped for placeholder tokens and restored later,
// which keeps the work linear in len(s) and means no byte sequence in the
// input (such as NUL bytes) can be mistaken for a placeholder.
func gemma4ArgsToJSON(s string) string {
	matches := gemma4QuotedStringRe.FindAllStringSubmatchIndex(s, -1)
	if len(matches) == 0 {
		return gemma4QuoteBareKeys(s)
	}

	var buf strings.Builder
	buf.Grow(len(s) + 32)

	last := 0
	for _, m := range matches {
		// m[0]:m[1] is the whole token-delimited string, m[2]:m[3] its content.
		buf.WriteString(gemma4QuoteBareKeys(s[last:m[0]]))

		escaped, err := json.Marshal(s[m[2]:m[3]])
		if err != nil {
			// Marshaling a string is not expected to fail. If it ever does,
			// keep the original token text so the caller's JSON parse fails
			// loudly instead of seeing a silently altered value.
			buf.WriteString(s[m[0]:m[1]])
		} else {
			buf.Write(escaped)
		}
		last = m[1]
	}
	buf.WriteString(gemma4QuoteBareKeys(s[last:]))

	return buf.String()
}

// gemma4QuoteBareKeys wraps each bare object key in segment in double quotes.
// segment must not contain Gemma-quoted strings; the caller handles those.
func gemma4QuoteBareKeys(segment string) string {
	if segment == "" {
		return ""
	}
	return gemma4BareKeyRe.ReplaceAllString(segment, `$1"$2":`)
}