server: decompress zstd request bodies in cloud passthrough middleware (#14827)

When a zstd-compressed request (e.g. from Codex CLI) hit /v1/responses
with a cloud model, the request failed.

Fix by decompressing zstd bodies before model extraction, so cloud
models are detected and proxied directly without the response writer
being wrapped.
This commit is contained in:
Bruce MacDonald
2026-03-13 15:06:47 -07:00
committed by GitHub
parent 870599f5da
commit 3980c0217d
3 changed files with 117 additions and 1 deletions

View File

@@ -16,6 +16,7 @@ import (
"time"
"github.com/gin-gonic/gin"
"github.com/klauspost/compress/zstd"
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/envconfig"
@@ -29,6 +30,9 @@ const (
cloudProxyBaseURLEnv = "OLLAMA_CLOUD_BASE_URL"
legacyCloudAnthropicKey = "legacy_cloud_anthropic_web_search"
cloudProxyClientVersionHeader = "X-Ollama-Client-Version"
// maxDecompressedBodySize limits the size of a decompressed request body
maxDecompressedBodySize = 20 << 20
)
var (
@@ -73,6 +77,19 @@ func cloudPassthroughMiddleware(disabledOperation string) gin.HandlerFunc {
return
}
// Decompress zstd-encoded request bodies so we can inspect the model
if c.GetHeader("Content-Encoding") == "zstd" {
reader, err := zstd.NewReader(c.Request.Body, zstd.WithDecoderMaxMemory(8<<20))
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "failed to decompress request body"})
c.Abort()
return
}
defer reader.Close()
c.Request.Body = http.MaxBytesReader(c.Writer, io.NopCloser(reader), maxDecompressedBodySize)
c.Request.Header.Del("Content-Encoding")
}
// TODO(drifkin): Avoid full-body buffering here for model detection.
// A future optimization can parse just enough JSON to read "model" (and
// optionally short-circuit cloud-disabled explicit-cloud requests) while