Merge pull request #9703 from ollama/mxyng/gemma3-memory

count gemma3 vision tensors
count non-repeating vision layers
2026-04-27 11:15:40 +02:00 · 2025-03-13 16:56:34 -07:00 · 2025-03-13 16:53:29 -07:00 · 2025-03-13 16:35:20 -07:00 · 2025-03-13 16:35:00 -07:00 · 2025-03-13 16:35:00 -07:00
3 changed files with 46 additions and 26 deletions
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -187,6 +187,13 @@ cloudflared tunnel --url http://localhost:11434 --http-host-header="localhost:11
 Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Additional origins can be configured with `OLLAMA_ORIGINS`.
 For browser extensions, you'll need to explicitly allow the extension's origin pattern. Set `OLLAMA_ORIGINS` to include `chrome-extension://*`, `moz-extension://*`, and `safari-web-extension://*` if you wish to allow all browser extensions access, or specific extensions as needed:
 ```
 # Allow all Chrome, Firefox, and Safari extensions
 OLLAMA_ORIGINS=chrome-extension://*,moz-extension://*,safari-web-extension://* ollama serve
 ```
 Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
 ## Where are models stored?
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -583,39 +583,52 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO
 }
 func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
-	switch llm.KV().Architecture() {
+	if llm.KV().Uint("vision.block_count") == 0 {
-	case "mllama":
+		return
 		for _, layer := range llm.Tensors().GroupLayers()["v"] {
 			weights += layer.Size()
 	}
-		kv := func(n string) uint64 {
+	for name, layer := range llm.Tensors().GroupLayers() {
-			if v, ok := llm.KV()["mllama.vision."+n].(uint32); ok {
+		if name == "v" || strings.HasPrefix(name, "v.") {
-				return uint64(v)
+			for _, tensor := range layer {
 				weights += tensor.Size()
 			}
 		}
 	}
-			return 0
+	imageSize := uint64(llm.KV().Uint("vision.image_size"))
 	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
 	if patchSize == 0 {
 		slog.Warn("unknown patch size for vision model")
 		return
 	}
-		imageSize := kv("image_size")
+	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
-		maxNumTiles := kv("max_num_tiles")
+	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
 		embeddingLength := kv("embedding_length")
 		headCount := kv("attention.head_count")
 		numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
 	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
 		numPatches++
 	}
 	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
 	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
 	switch llm.KV().Architecture() {
 	case "mllama":
 		numPaddedPatches := numPatches + 8 - (numPatches%8)%8
 		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))
 		graphSize = 4 * (8 +
-			imageSize*imageSize*kv("num_channels")*maxNumTiles +
+			imageSize*imageSize*numChannels*maxNumTiles +
 			embeddingLength*numPatches*maxNumTiles +
 			9*embeddingLength*numPaddedPatches*maxNumTiles +
 			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
 	case "gemma3":
 		graphSize = 4 * (imageSize*imageSize*numChannels +
 			embeddingLength*patchSize +
 			numPatches*numPatches*headCount)
 	}
 	return weights, graphSize
 }
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -218,8 +218,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
 			layerSize = blk.Size()
 			layerSize += kv / f.KV().BlockCount()
 			memoryWeights += blk.Size()
 		}
 		memoryWeights += layerSize
 		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
 			// Stop allocating on GPU(s) once we hit the users target NumGPU
@@ -376,7 +376,7 @@ func (m MemoryEstimate) LogValue() slog.Value {
 				// memory of the weights
 				"total", format.HumanBytes2(m.memoryWeights),
 				// memory of repeating layers
-				"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
+				"repeating", format.HumanBytes2(m.memoryWeights),
 				// memory of non-repeating layers
 				"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
 			),
Author	SHA1	Message	Date
Michael Yang	4ea4d2b189	Merge pull request #9703 from ollama/mxyng/gemma3-memory count gemma3 vision tensors	2025-03-13 16:56:34 -07:00
Michael Yang	8d76fa23ef	count non-repeating vision layers	2025-03-13 16:53:29 -07:00
Bradley Erickson	74b44fdf8f	docs: Add OLLAMA_ORIGINS for browser extension support (#9643)	2025-03-13 16:35:20 -07:00
Michael Yang	65b88c544f	fix divide by zero	2025-03-13 16:35:00 -07:00
Michael Yang	a422ba39c9	roughly count gemma3 graph: the largest operation is by far (q @ k), so just count that for simplicity	2025-03-13 16:35:00 -07:00
Michael Yang	d2ec22371e	count all vision tensors	2025-03-13 16:35:00 -07:00
Michael Yang	033cec232a	count gemma3 vision tensors	2025-03-13 16:34:42 -07:00