mirror of
https://github.com/ollama/ollama.git
synced 2026-04-25 18:25:42 +02:00
Add skipIfNoVisionOverride() to skip vision tests when OLLAMA_TEST_MODEL is set to a non-vision model. Add Think:false to context exhaustion test to prevent thinking models from using all context before the test can measure it. Add third test image (ollama homepage) and replace OCR test with ImageDescription test using it. Relax match strings for broader model compatibility. Add TestThinkingEnabled and TestThinkingSuppressed to verify thinking output and channel tag handling.
386 lines
12 KiB
Go
386 lines
12 KiB
Go
//go:build integration
|
|
|
|
package integration
|
|
|
|
import (
|
|
"context"
|
|
"encoding/base64"
|
|
"slices"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/ollama/ollama/api"
|
|
"github.com/ollama/ollama/types/model"
|
|
)
|
|
|
|
// defaultVisionModels is the default set of vision models to test. When
// OLLAMA_TEST_MODEL is set, only that model is tested (with a capability
// check for vision).
var defaultVisionModels = []string{"gemma3", "llama3.2-vision", "qwen2.5vl", "qwen3-vl:8b"}
|
|
|
|
// decodeTestImages returns the test images.
|
|
func decodeTestImages(t *testing.T) (abbeyRoad, docs, ollamaHome api.ImageData) {
|
|
t.Helper()
|
|
var err error
|
|
abbeyRoad, err = base64.StdEncoding.DecodeString(imageEncoding)
|
|
if err != nil {
|
|
t.Fatalf("decode abbey road image: %v", err)
|
|
}
|
|
docs, err = base64.StdEncoding.DecodeString(imageEncodingDocs)
|
|
if err != nil {
|
|
t.Fatalf("decode docs image: %v", err)
|
|
}
|
|
ollamaHome, err = base64.StdEncoding.DecodeString(imageEncodingOllamaHome)
|
|
if err != nil {
|
|
t.Fatalf("decode ollama home image: %v", err)
|
|
}
|
|
return
|
|
}
|
|
|
|
// skipIfNoVisionOverride skips the entire test (at parent level) when
|
|
// OLLAMA_TEST_MODEL is set to a non-vision model. This prevents the parent
|
|
// test from reporting PASS when all subtests are skipped.
|
|
func skipIfNoVisionOverride(t *testing.T) {
|
|
t.Helper()
|
|
if testModel == "" {
|
|
return
|
|
}
|
|
// Check actual model capabilities via the API rather than a hardcoded list.
|
|
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
|
defer cancel()
|
|
client, _, cleanup := InitServerConnection(ctx, t)
|
|
defer cleanup()
|
|
resp, err := client.Show(ctx, &api.ShowRequest{Name: testModel})
|
|
if err != nil {
|
|
return // let the test proceed and fail naturally
|
|
}
|
|
if len(resp.Capabilities) > 0 && !slices.Contains(resp.Capabilities, model.CapabilityVision) {
|
|
t.Skipf("model override %q does not have vision capability (has %v)", testModel, resp.Capabilities)
|
|
}
|
|
}
|
|
|
|
// setupVisionModel pulls the model, preloads it, and skips if not GPU-loaded.
|
|
func setupVisionModel(ctx context.Context, t *testing.T, client *api.Client, model string) {
|
|
t.Helper()
|
|
if testModel != "" {
|
|
requireCapability(ctx, t, client, model, "vision")
|
|
}
|
|
pullOrSkip(ctx, t, client, model)
|
|
err := client.Generate(ctx, &api.GenerateRequest{Model: model}, func(response api.GenerateResponse) error { return nil })
|
|
if err != nil {
|
|
t.Fatalf("failed to load model %s: %s", model, err)
|
|
}
|
|
skipIfNotGPULoaded(ctx, t, client, model, 80)
|
|
}
|
|
|
|
// TestVisionMultiTurn sends an image, gets a response, then asks follow-up
|
|
// questions about the same image. This verifies that the KV cache correctly
|
|
// handles cached image tokens across turns.
|
|
func TestVisionMultiTurn(t *testing.T) {
|
|
skipUnderMinVRAM(t, 6)
|
|
skipIfNoVisionOverride(t)
|
|
|
|
// Models that fail on multi-turn detail questions (e.g. misidentifying objects).
|
|
skipModels := map[string]string{
|
|
"gemma3": "misidentifies briefcase as smartphone on turn 3",
|
|
"llama3.2-vision": "miscounts animals (says 3 instead of 4) on turn 2",
|
|
}
|
|
|
|
for _, model := range testModels(defaultVisionModels) {
|
|
t.Run(model, func(t *testing.T) {
|
|
if reason, ok := skipModels[model]; ok && testModel == "" {
|
|
t.Skipf("skipping: %s", reason)
|
|
}
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
|
defer cancel()
|
|
client, _, cleanup := InitServerConnection(ctx, t)
|
|
defer cleanup()
|
|
|
|
setupVisionModel(ctx, t, client, model)
|
|
abbeyRoad, _, _ := decodeTestImages(t)
|
|
|
|
// Turn 1: describe the image
|
|
req := api.ChatRequest{
|
|
Model: model,
|
|
Messages: []api.Message{
|
|
{
|
|
Role: "user",
|
|
Content: "Describe this image briefly.",
|
|
Images: []api.ImageData{abbeyRoad},
|
|
},
|
|
},
|
|
Stream: &stream,
|
|
Options: map[string]any{"temperature": 0.0, "seed": 42},
|
|
}
|
|
resp1 := DoChat(ctx, t, client, req, []string{
|
|
"llama", "cross", "walk", "road", "animal", "cartoon",
|
|
}, 120*time.Second, 30*time.Second)
|
|
if resp1 == nil {
|
|
t.Fatal("no response from turn 1")
|
|
}
|
|
|
|
// Turn 2: follow-up about count
|
|
req.Messages = append(req.Messages,
|
|
*resp1,
|
|
api.Message{Role: "user", Content: "How many animals are in the image?"},
|
|
)
|
|
resp2 := DoChat(ctx, t, client, req, []string{
|
|
"four", "4", "three", "3",
|
|
}, 60*time.Second, 30*time.Second)
|
|
if resp2 == nil {
|
|
t.Fatal("no response from turn 2")
|
|
}
|
|
|
|
// Turn 3: follow-up about specific detail
|
|
req.Messages = append(req.Messages,
|
|
*resp2,
|
|
api.Message{Role: "user", Content: "Is any animal carrying something? What is it?"},
|
|
)
|
|
DoChat(ctx, t, client, req, []string{
|
|
"briefcase", "suitcase", "bag", "case", "luggage",
|
|
}, 60*time.Second, 30*time.Second)
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestVisionObjectCounting asks the model to count objects in an image.
|
|
func TestVisionObjectCounting(t *testing.T) {
|
|
skipUnderMinVRAM(t, 6)
|
|
skipIfNoVisionOverride(t)
|
|
|
|
skipModels := map[string]string{
|
|
"llama3.2-vision": "consistently miscounts (says 3 instead of 4)",
|
|
}
|
|
|
|
for _, model := range testModels(defaultVisionModels) {
|
|
t.Run(model, func(t *testing.T) {
|
|
if reason, ok := skipModels[model]; ok && testModel == "" {
|
|
t.Skipf("skipping: %s", reason)
|
|
}
|
|
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
|
defer cancel()
|
|
client, _, cleanup := InitServerConnection(ctx, t)
|
|
defer cleanup()
|
|
|
|
setupVisionModel(ctx, t, client, model)
|
|
_, docs, _ := decodeTestImages(t)
|
|
|
|
req := api.ChatRequest{
|
|
Model: model,
|
|
Messages: []api.Message{
|
|
{
|
|
Role: "user",
|
|
Content: "How many animals are shown in this image? Answer with just the number.",
|
|
Images: []api.ImageData{docs},
|
|
},
|
|
},
|
|
Stream: &stream,
|
|
Options: map[string]any{"temperature": 0.0, "seed": 42},
|
|
}
|
|
DoChat(ctx, t, client, req, []string{"4", "four"}, 120*time.Second, 30*time.Second)
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestVisionSceneUnderstanding tests whether the model can identify
|
|
// cultural references and scene context from an image.
|
|
func TestVisionSceneUnderstanding(t *testing.T) {
|
|
skipUnderMinVRAM(t, 6)
|
|
skipIfNoVisionOverride(t)
|
|
|
|
// Models known to be too small or not capable enough for cultural reference detection.
|
|
skipModels := map[string]string{
|
|
"llama3.2-vision": "3B model lacks cultural reference knowledge",
|
|
"minicpm-v": "too small for cultural reference detection",
|
|
}
|
|
|
|
for _, model := range testModels(defaultVisionModels) {
|
|
t.Run(model, func(t *testing.T) {
|
|
if reason, ok := skipModels[model]; ok && testModel == "" {
|
|
t.Skipf("skipping: %s", reason)
|
|
}
|
|
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
|
defer cancel()
|
|
client, _, cleanup := InitServerConnection(ctx, t)
|
|
defer cleanup()
|
|
|
|
setupVisionModel(ctx, t, client, model)
|
|
abbeyRoad, _, _ := decodeTestImages(t)
|
|
|
|
req := api.ChatRequest{
|
|
Model: model,
|
|
Messages: []api.Message{
|
|
{
|
|
Role: "user",
|
|
Content: "What famous image or album cover is this a parody of?",
|
|
Images: []api.ImageData{abbeyRoad},
|
|
},
|
|
},
|
|
Stream: &stream,
|
|
Options: map[string]any{"temperature": 0.0, "seed": 42},
|
|
}
|
|
DoChat(ctx, t, client, req, []string{
|
|
"abbey road", "beatles", "abbey", "llama",
|
|
}, 120*time.Second, 30*time.Second)
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestVisionSpatialReasoning tests the model's ability to identify
|
|
// objects based on their spatial position in the image.
|
|
func TestVisionSpatialReasoning(t *testing.T) {
|
|
skipUnderMinVRAM(t, 6)
|
|
skipIfNoVisionOverride(t)
|
|
|
|
for _, model := range testModels(defaultVisionModels) {
|
|
t.Run(model, func(t *testing.T) {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
|
defer cancel()
|
|
client, _, cleanup := InitServerConnection(ctx, t)
|
|
defer cleanup()
|
|
|
|
setupVisionModel(ctx, t, client, model)
|
|
_, docs, _ := decodeTestImages(t)
|
|
|
|
// The docs image has: leftmost llama on laptop with glasses,
|
|
// rightmost llama sleeping.
|
|
req := api.ChatRequest{
|
|
Model: model,
|
|
Messages: []api.Message{
|
|
{
|
|
Role: "user",
|
|
Content: "What is the animal on the far left doing in this image?",
|
|
Images: []api.ImageData{docs},
|
|
},
|
|
},
|
|
Stream: &stream,
|
|
Options: map[string]any{"temperature": 0.0, "seed": 42},
|
|
}
|
|
DoChat(ctx, t, client, req, []string{
|
|
"laptop", "computer", "typing", "working",
|
|
}, 120*time.Second, 30*time.Second)
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestVisionDetailRecognition tests whether the model can identify
|
|
// small details like accessories in an image.
|
|
func TestVisionDetailRecognition(t *testing.T) {
|
|
skipUnderMinVRAM(t, 6)
|
|
skipIfNoVisionOverride(t)
|
|
|
|
for _, model := range testModels(defaultVisionModels) {
|
|
t.Run(model, func(t *testing.T) {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
|
defer cancel()
|
|
client, _, cleanup := InitServerConnection(ctx, t)
|
|
defer cleanup()
|
|
|
|
setupVisionModel(ctx, t, client, model)
|
|
_, docs, _ := decodeTestImages(t)
|
|
|
|
req := api.ChatRequest{
|
|
Model: model,
|
|
Messages: []api.Message{
|
|
{
|
|
Role: "user",
|
|
Content: "Are any of the animals wearing glasses? Describe what you see.",
|
|
Images: []api.ImageData{docs},
|
|
},
|
|
},
|
|
Stream: &stream,
|
|
Options: map[string]any{"temperature": 0.0, "seed": 42},
|
|
}
|
|
DoChat(ctx, t, client, req, []string{
|
|
"glasses", "spectacles", "eyeglasses",
|
|
}, 120*time.Second, 30*time.Second)
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestVisionMultiImage sends two images in a single message and asks
|
|
// the model to compare and contrast them. This exercises multi-image
|
|
// encoding and cross-image reasoning.
|
|
func TestVisionMultiImage(t *testing.T) {
|
|
skipUnderMinVRAM(t, 6)
|
|
skipIfNoVisionOverride(t)
|
|
|
|
// Multi-image support varies across models.
|
|
skipModels := map[string]string{
|
|
"llama3.2-vision": "does not support multi-image input",
|
|
}
|
|
|
|
for _, model := range testModels(defaultVisionModels) {
|
|
t.Run(model, func(t *testing.T) {
|
|
if reason, ok := skipModels[model]; ok && testModel == "" {
|
|
t.Skipf("skipping: %s", reason)
|
|
}
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
|
defer cancel()
|
|
client, _, cleanup := InitServerConnection(ctx, t)
|
|
defer cleanup()
|
|
|
|
setupVisionModel(ctx, t, client, model)
|
|
abbeyRoad, docs, _ := decodeTestImages(t)
|
|
|
|
req := api.ChatRequest{
|
|
Model: model,
|
|
Messages: []api.Message{
|
|
{
|
|
Role: "user",
|
|
Content: "I'm showing you two images. What do they have in common, and how are they different?",
|
|
Images: []api.ImageData{abbeyRoad, docs},
|
|
},
|
|
},
|
|
Stream: &stream,
|
|
Options: map[string]any{"temperature": 0.0, "seed": 42},
|
|
}
|
|
// Both images feature cartoon llamas/alpacas — the model should
|
|
// note the common subject and the different settings.
|
|
DoChat(ctx, t, client, req, []string{
|
|
"llama", "alpaca", "animal", "cartoon",
|
|
}, 120*time.Second, 30*time.Second)
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestVisionImageDescription verifies that the model can describe the contents
|
|
// of the ollama homepage image (a cartoon llama with "Start building with
|
|
// open models" text). Basic sanity check that the vision pipeline works.
|
|
func TestVisionImageDescription(t *testing.T) {
|
|
skipUnderMinVRAM(t, 6)
|
|
skipIfNoVisionOverride(t)
|
|
|
|
for _, model := range testModels(defaultVisionModels) {
|
|
t.Run(model, func(t *testing.T) {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
|
defer cancel()
|
|
client, _, cleanup := InitServerConnection(ctx, t)
|
|
defer cleanup()
|
|
|
|
setupVisionModel(ctx, t, client, model)
|
|
_, _, ollamaHome := decodeTestImages(t)
|
|
|
|
req := api.ChatRequest{
|
|
Model: model,
|
|
Messages: []api.Message{
|
|
{
|
|
Role: "user",
|
|
Content: "Describe what you see in this image briefly.",
|
|
Images: []api.ImageData{ollamaHome},
|
|
},
|
|
},
|
|
Stream: &stream,
|
|
Options: map[string]any{"temperature": 0.0, "seed": 42},
|
|
}
|
|
DoChat(ctx, t, client, req, []string{
|
|
"llama", "animal", "build", "model", "open", "cartoon", "character",
|
|
}, 120*time.Second, 30*time.Second)
|
|
})
|
|
}
|
|
}
|