Compare commits

...

7 Commits

Author SHA1 Message Date
Michael Yang
4ea4d2b189 Merge pull request #9703 from ollama/mxyng/gemma3-memory
count gemma3 vision tensors
2025-03-13 16:56:34 -07:00
Michael Yang
8d76fa23ef count non-repeating vision layers 2025-03-13 16:53:29 -07:00
Bradley Erickson
74b44fdf8f docs: Add OLLAMA_ORIGINS for browser extension support (#9643) 2025-03-13 16:35:20 -07:00
Michael Yang
65b88c544f fix divide by zero 2025-03-13 16:35:00 -07:00
Michael Yang
a422ba39c9 roughly count gemma3 graph
the largest operation is by far (q @ k) so just count that for
simplicity
2025-03-13 16:35:00 -07:00
Michael Yang
d2ec22371e count all vision tensors 2025-03-13 16:35:00 -07:00
Michael Yang
033cec232a count gemma3 vision tensors 2025-03-13 16:34:42 -07:00
3 changed files with 46 additions and 26 deletions

View File

@@ -187,6 +187,13 @@ cloudflared tunnel --url http://localhost:11434 --http-host-header="localhost:11
Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Additional origins can be configured with `OLLAMA_ORIGINS`.
For browser extensions, you'll need to explicitly allow the extension's origin pattern. Set `OLLAMA_ORIGINS` to include `chrome-extension://*`, `moz-extension://*`, and `safari-web-extension://*` if you wish to allow all browser extensions access, or specific extensions as needed:
```
# Allow all Chrome, Firefox, and Safari extensions
OLLAMA_ORIGINS=chrome-extension://*,moz-extension://*,safari-web-extension://* ollama serve
```
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
## Where are models stored?

View File

@@ -583,39 +583,52 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO
} }
func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
switch llm.KV().Architecture() { if llm.KV().Uint("vision.block_count") == 0 {
case "mllama": return
for _, layer := range llm.Tensors().GroupLayers()["v"] {
weights += layer.Size()
} }
kv := func(n string) uint64 { for name, layer := range llm.Tensors().GroupLayers() {
if v, ok := llm.KV()["mllama.vision."+n].(uint32); ok { if name == "v" || strings.HasPrefix(name, "v.") {
return uint64(v) for _, tensor := range layer {
weights += tensor.Size()
}
}
} }
return 0 imageSize := uint64(llm.KV().Uint("vision.image_size"))
patchSize := uint64(llm.KV().Uint("vision.patch_size"))
if patchSize == 0 {
slog.Warn("unknown patch size for vision model")
return
} }
imageSize := kv("image_size") numChannels := uint64(llm.KV().Uint("vision.num_channels"))
maxNumTiles := kv("max_num_tiles") numPatches := (imageSize / patchSize) * (imageSize / patchSize)
embeddingLength := kv("embedding_length")
headCount := kv("attention.head_count")
numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok { if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
numPatches++ numPatches++
} }
headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
switch llm.KV().Architecture() {
case "mllama":
numPaddedPatches := numPatches + 8 - (numPatches%8)%8 numPaddedPatches := numPatches + 8 - (numPatches%8)%8
maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))
graphSize = 4 * (8 + graphSize = 4 * (8 +
imageSize*imageSize*kv("num_channels")*maxNumTiles + imageSize*imageSize*numChannels*maxNumTiles +
embeddingLength*numPatches*maxNumTiles + embeddingLength*numPatches*maxNumTiles +
9*embeddingLength*numPaddedPatches*maxNumTiles + 9*embeddingLength*numPaddedPatches*maxNumTiles +
numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
case "gemma3":
graphSize = 4 * (imageSize*imageSize*numChannels +
embeddingLength*patchSize +
numPatches*numPatches*headCount)
} }
return weights, graphSize return weights, graphSize
} }

View File

@@ -218,8 +218,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.Size() layerSize = blk.Size()
layerSize += kv / f.KV().BlockCount() layerSize += kv / f.KV().BlockCount()
memoryWeights += blk.Size()
} }
memoryWeights += layerSize
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
// Stop allocating on GPU(s) once we hit the users target NumGPU // Stop allocating on GPU(s) once we hit the users target NumGPU
@@ -376,7 +376,7 @@ func (m MemoryEstimate) LogValue() slog.Value {
// memory of the weights // memory of the weights
"total", format.HumanBytes2(m.memoryWeights), "total", format.HumanBytes2(m.memoryWeights),
// memory of repeating layers // memory of repeating layers
"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput), "repeating", format.HumanBytes2(m.memoryWeights),
// memory of non-repeating layers // memory of non-repeating layers
"nonrepeating", format.HumanBytes2(m.memoryLayerOutput), "nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
), ),