server: decompress zstd request bodies in cloud passthrough middleware (#14827)

When a zstd-compressed request (e.g. from Codex CLI) hit /v1/responses
with a cloud model, the request failed.

Fix by decompressing zstd bodies before model extraction, so cloud
models are detected and proxied directly without the response writer
being wrapped.
This commit is contained in:
Bruce MacDonald
2026-03-13 15:06:47 -07:00
committed by GitHub
parent 870599f5da
commit 3980c0217d
3 changed files with 117 additions and 1 deletions

View File

@@ -16,6 +16,7 @@ import (
"time"
"github.com/gin-gonic/gin"
"github.com/klauspost/compress/zstd"
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/envconfig"
@@ -29,6 +30,9 @@ const (
cloudProxyBaseURLEnv = "OLLAMA_CLOUD_BASE_URL"
legacyCloudAnthropicKey = "legacy_cloud_anthropic_web_search"
cloudProxyClientVersionHeader = "X-Ollama-Client-Version"
// maxDecompressedBodySize limits the size of a decompressed request body
maxDecompressedBodySize = 20 << 20
)
var (
@@ -73,6 +77,19 @@ func cloudPassthroughMiddleware(disabledOperation string) gin.HandlerFunc {
return
}
// Decompress zstd-encoded request bodies so we can inspect the model
if c.GetHeader("Content-Encoding") == "zstd" {
reader, err := zstd.NewReader(c.Request.Body, zstd.WithDecoderMaxMemory(8<<20))
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "failed to decompress request body"})
c.Abort()
return
}
defer reader.Close()
c.Request.Body = http.MaxBytesReader(c.Writer, io.NopCloser(reader), maxDecompressedBodySize)
c.Request.Header.Del("Content-Encoding")
}
// TODO(drifkin): Avoid full-body buffering here for model detection.
// A future optimization can parse just enough JSON to read "model" (and
// optionally short-circuit cloud-disabled explicit-cloud requests) while