mirror of
https://github.com/ollama/ollama.git
synced 2026-04-21 08:15:42 +02:00
new runner
This commit is contained in:
246
runner/oldrunner/cache.go
Normal file
246
runner/oldrunner/cache.go
Normal file
@@ -0,0 +1,246 @@
|
||||
package oldrunner
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"reflect"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/llama"
|
||||
)
|
||||
|
||||
// InputCache divides a llama.Context's KV cache into per-sequence slots and
// tracks which prompt inputs each slot currently holds, so that prompts
// sharing a prefix with cached content can skip re-decoding that prefix.
type InputCache struct {
	// context window size (per slot)
	numCtx int

	// individual KV caches
	slots []InputCacheSlot

	// optimize cache eviction for multiple users
	multiUserCache bool

	// underlying llama context whose KV cache the slot Ids index into
	lc *llama.Context
}
|
||||
|
||||
func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) (*InputCache, error) {
|
||||
if kvSize/numSlots < 1 {
|
||||
return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
|
||||
}
|
||||
|
||||
slots := make([]InputCacheSlot, numSlots)
|
||||
|
||||
for i := range slots {
|
||||
slots[i] = InputCacheSlot{
|
||||
Id: i,
|
||||
Inputs: make([]input, 0),
|
||||
}
|
||||
}
|
||||
|
||||
return &InputCache{
|
||||
numCtx: kvSize / numSlots,
|
||||
slots: slots,
|
||||
multiUserCache: multiUserCache,
|
||||
lc: lc,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be held that serializes
// these operations with each other and llama.Decode

// InputCacheSlot is the bookkeeping for one sequence's share of the KV
// cache: which inputs it holds, whether it is busy, and when it was last
// touched (used for LRU-style eviction in findBestCacheSlot).
type InputCacheSlot struct {
	// Index in the KV cache
	Id int

	// Inputs that are stored in the KV cache
	Inputs []input

	// is this cache actively being processed as part of a sequence?
	InUse bool

	// last time this cache was used (as of start of processing)
	lastUsed time.Time
}
|
||||
|
||||
// LoadCacheSlot selects a cache slot for prompt, marks it in use, trims the
// slot's KV cache and Inputs down to the prefix it shares with prompt, and
// returns the slot along with the remaining (uncached) portion of prompt
// that still needs to be decoded. If cachePrompt is false the cached prefix
// is ignored and the whole prompt is reprocessed.
//
// Callers must hold the lock described in the locking note above.
func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
	var slot *InputCacheSlot
	var numPast int
	var err error

	// In single-user scenarios, the longest cache slot works fine for getting good input
	// cache hit rates and it reuses the same VRAM over and over again, which is good for
	// GPU performance in situations where we miss the input cache.
	// For multiple users, the "best" cache slot produces better input cache hit rates
	// at the cost of worse performance when we miss the input cache (because it causes
	// GPU L2 cache misses due to spreading out accesses across VRAM).
	if !c.multiUserCache {
		slot, numPast, err = c.findLongestCacheSlot(prompt)
	} else {
		slot, numPast, err = c.findBestCacheSlot(prompt)
	}
	if err != nil {
		return nil, nil, err
	}

	// Prompt caching disabled: pretend nothing matched.
	if !cachePrompt {
		numPast = 0
	}

	slot.InUse = true
	slot.lastUsed = time.Now()

	if numPast == len(prompt) {
		// Leave one input to sample so we can get a response
		numPast--
	}

	// Drop the KV entries past the reused prefix.
	if !c.lc.KvCacheSeqRm(slot.Id, numPast, -1) {
		// Some models don't support partial erasure
		c.lc.KvCacheSeqRm(slot.Id, 0, -1)
		numPast = 0
	}

	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
		"used", numPast, "remaining", len(prompt)-numPast)

	// Hand back only the uncached tail; keep the reused prefix in the slot.
	prompt = prompt[numPast:]
	slot.Inputs = slot.Inputs[:numPast]

	return slot, prompt, nil
}
|
||||
|
||||
func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int, error) {
|
||||
longest := -1
|
||||
var longestSlot *InputCacheSlot
|
||||
|
||||
for i, s := range c.slots {
|
||||
if s.InUse {
|
||||
continue
|
||||
}
|
||||
|
||||
count := countCommonPrefix(s.Inputs, prompt)
|
||||
if count > longest {
|
||||
longest = count
|
||||
longestSlot = &c.slots[i]
|
||||
}
|
||||
}
|
||||
|
||||
if longestSlot == nil {
|
||||
return nil, 0, errors.New("no available cache slots")
|
||||
}
|
||||
|
||||
return longestSlot, longest, nil
|
||||
}
|
||||
|
||||
func (c *InputCache) findBestCacheSlot(prompt []input) (*InputCacheSlot, int, error) {
|
||||
oldest := time.Now()
|
||||
var oldestSlot *InputCacheSlot
|
||||
|
||||
longest := -1
|
||||
var longestSlot *InputCacheSlot
|
||||
|
||||
for i, s := range c.slots {
|
||||
count := countCommonPrefix(s.Inputs, prompt)
|
||||
if count > longest {
|
||||
longest = count
|
||||
longestSlot = &c.slots[i]
|
||||
}
|
||||
|
||||
if s.lastUsed.Compare(oldest) < 0 && !s.InUse {
|
||||
oldest = s.lastUsed
|
||||
oldestSlot = &c.slots[i]
|
||||
}
|
||||
}
|
||||
|
||||
if longest == len(longestSlot.Inputs) && !longestSlot.InUse {
|
||||
return longestSlot, longest, nil
|
||||
}
|
||||
|
||||
if oldestSlot.InUse {
|
||||
return nil, 0, errors.New("no available cache slots")
|
||||
}
|
||||
|
||||
if len(oldestSlot.Inputs) != 0 {
|
||||
slog.Debug("evicting cache slot", "id", oldestSlot.Id, "inputs", len(oldestSlot.Inputs),
|
||||
"used", oldestSlot.lastUsed)
|
||||
}
|
||||
|
||||
if longest > 0 && longestSlot != oldestSlot {
|
||||
slog.Debug("forking cache slot", "src", longestSlot.Id, "dst", oldestSlot.Id, "inputs", longest, "total",
|
||||
len(longestSlot.Inputs))
|
||||
oldestSlot.Inputs = make([]input, longest)
|
||||
copy(oldestSlot.Inputs, longestSlot.Inputs[:longest])
|
||||
// This is only nil for unit tests
|
||||
if c.lc != nil {
|
||||
c.lc.KvCacheSeqRm(oldestSlot.Id, 0, -1)
|
||||
c.lc.KvCacheSeqCp(longestSlot.Id, oldestSlot.Id, 0, longest)
|
||||
}
|
||||
}
|
||||
|
||||
return oldestSlot, longest, nil
|
||||
}
|
||||
|
||||
func countCommonPrefix(a []input, b []input) int {
|
||||
var count int
|
||||
|
||||
for i := range a {
|
||||
if i >= len(b) {
|
||||
break
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(a[i], b[i]) {
|
||||
break
|
||||
}
|
||||
|
||||
count++
|
||||
}
|
||||
|
||||
return count
|
||||
}
|
||||
|
||||
func (c *InputCache) ShiftDiscard(inputLen int, numKeep int) int {
|
||||
targetFree := (c.numCtx - numKeep) / 2
|
||||
targetFree = max(targetFree, 1)
|
||||
|
||||
currentFree := c.numCtx - inputLen
|
||||
discard := targetFree - currentFree
|
||||
|
||||
if discard < 0 {
|
||||
discard = 0
|
||||
}
|
||||
|
||||
return discard
|
||||
}
|
||||
|
||||
// Frees up space in the KV cache by deleting the oldest half of history and shifting
|
||||
// the newest half into that space (saving numKeep inputs at the beginning).
|
||||
//
|
||||
// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
|
||||
func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) error {
|
||||
if numKeep >= c.numCtx {
|
||||
return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx)
|
||||
}
|
||||
|
||||
discard := c.ShiftDiscard(len(slot.Inputs), numKeep)
|
||||
|
||||
if discard <= 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
slog.Debug("context limit hit - shifting", "id", slot.Id, "limit", c.numCtx, "input", len(slot.Inputs),
|
||||
"keep", numKeep, "discard", discard)
|
||||
|
||||
// TODO (jessegross): KV cache removal can fail for certain types of models
|
||||
if !c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard) {
|
||||
return fmt.Errorf("unable to remove old kv cache entries (id: %v, keep: %v discard: %v)", slot.Id, numKeep, discard)
|
||||
}
|
||||
c.lc.KvCacheSeqAdd(slot.Id, numKeep+discard, len(slot.Inputs), -discard)
|
||||
|
||||
for i := numKeep + discard; i < len(slot.Inputs); i++ {
|
||||
slot.Inputs[i-discard] = slot.Inputs[i]
|
||||
}
|
||||
slot.Inputs = slot.Inputs[:len(slot.Inputs)-discard]
|
||||
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user