//go:build integration package integration import ( "context" "encoding/base64" "testing" "time" "github.com/ollama/ollama/api" ) // Default set of vision models to test. When OLLAMA_TEST_MODEL is set, // only that model is tested (with a capability check for vision). var defaultVisionModels = []string{ "gemma3", "llama3.2-vision", "qwen2.5vl", "qwen3-vl:8b", } // decodeTestImages returns the two test images (Abbey Road llamas, docs llamas). func decodeTestImages(t *testing.T) (abbeyRoad, docs api.ImageData) { t.Helper() var err error abbeyRoad, err = base64.StdEncoding.DecodeString(imageEncoding) if err != nil { t.Fatalf("decode abbey road image: %v", err) } docs, err = base64.StdEncoding.DecodeString(imageEncodingDocs) if err != nil { t.Fatalf("decode docs image: %v", err) } return } // setupVisionModel pulls the model, preloads it, and skips if not GPU-loaded. func setupVisionModel(ctx context.Context, t *testing.T, client *api.Client, model string) { t.Helper() if testModel != "" { requireCapability(ctx, t, client, model, "vision") } pullOrSkip(ctx, t, client, model) err := client.Generate(ctx, &api.GenerateRequest{Model: model}, func(response api.GenerateResponse) error { return nil }) if err != nil { t.Fatalf("failed to load model %s: %s", model, err) } skipIfNotGPULoaded(ctx, t, client, model, 80) } // TestVisionMultiTurn sends an image, gets a response, then asks follow-up // questions about the same image. This verifies that the KV cache correctly // handles cached image tokens across turns. func TestVisionMultiTurn(t *testing.T) { skipUnderMinVRAM(t, 6) // Models that fail on multi-turn detail questions (e.g. misidentifying objects). skipModels := map[string]string{ "gemma3": "misidentifies briefcase as smartphone on turn 3", "llama3.2-vision": "miscounts animals (says 3 instead of 4) on turn 2", } for _, model := range testModels(defaultVisionModels) { t.Run(model, func(t *testing.T) { if reason, ok := skipModels[model]; ok && testModel == "" { t.Skipf("skipping: %s", reason) } ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() client, _, cleanup := InitServerConnection(ctx, t) defer cleanup() setupVisionModel(ctx, t, client, model) abbeyRoad, _ := decodeTestImages(t) // Turn 1: describe the image req := api.ChatRequest{ Model: model, Messages: []api.Message{ { Role: "user", Content: "Describe this image briefly.", Images: []api.ImageData{abbeyRoad}, }, }, Stream: &stream, Options: map[string]any{"temperature": 0.0, "seed": 42}, } resp1 := DoChat(ctx, t, client, req, []string{ "llama", "cross", "walk", "road", "animal", "cartoon", }, 120*time.Second, 30*time.Second) if resp1 == nil { t.Fatal("no response from turn 1") } // Turn 2: follow-up about count req.Messages = append(req.Messages, *resp1, api.Message{Role: "user", Content: "How many animals are in the image?"}, ) resp2 := DoChat(ctx, t, client, req, []string{ "four", "4", }, 60*time.Second, 30*time.Second) if resp2 == nil { t.Fatal("no response from turn 2") } // Turn 3: follow-up about specific detail req.Messages = append(req.Messages, *resp2, api.Message{Role: "user", Content: "Is any animal carrying something? What is it?"}, ) DoChat(ctx, t, client, req, []string{ "briefcase", "suitcase", "bag", "case", "luggage", }, 60*time.Second, 30*time.Second) }) } } // TestVisionObjectCounting asks the model to count objects in an image. func TestVisionObjectCounting(t *testing.T) { skipUnderMinVRAM(t, 6) skipModels := map[string]string{ "llama3.2-vision": "consistently miscounts (says 3 instead of 4)", } for _, model := range testModels(defaultVisionModels) { t.Run(model, func(t *testing.T) { if reason, ok := skipModels[model]; ok && testModel == "" { t.Skipf("skipping: %s", reason) } ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) defer cancel() client, _, cleanup := InitServerConnection(ctx, t) defer cleanup() setupVisionModel(ctx, t, client, model) _, docs := decodeTestImages(t) req := api.ChatRequest{ Model: model, Messages: []api.Message{ { Role: "user", Content: "How many animals are shown in this image? Answer with just the number.", Images: []api.ImageData{docs}, }, }, Stream: &stream, Options: map[string]any{"temperature": 0.0, "seed": 42}, } DoChat(ctx, t, client, req, []string{"4", "four"}, 120*time.Second, 30*time.Second) }) } } // TestVisionSceneUnderstanding tests whether the model can identify // cultural references and scene context from an image. func TestVisionSceneUnderstanding(t *testing.T) { skipUnderMinVRAM(t, 6) // Models known to be too small or not capable enough for cultural reference detection. skipModels := map[string]string{ "llama3.2-vision": "3B model lacks cultural reference knowledge", "minicpm-v": "too small for cultural reference detection", } for _, model := range testModels(defaultVisionModels) { t.Run(model, func(t *testing.T) { if reason, ok := skipModels[model]; ok && testModel == "" { t.Skipf("skipping: %s", reason) } ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) defer cancel() client, _, cleanup := InitServerConnection(ctx, t) defer cleanup() setupVisionModel(ctx, t, client, model) abbeyRoad, _ := decodeTestImages(t) req := api.ChatRequest{ Model: model, Messages: []api.Message{ { Role: "user", Content: "What famous image or album cover is this a parody of?", Images: []api.ImageData{abbeyRoad}, }, }, Stream: &stream, Options: map[string]any{"temperature": 0.0, "seed": 42}, } DoChat(ctx, t, client, req, []string{ "abbey road", "beatles", "abbey", }, 120*time.Second, 30*time.Second) }) } } // TestVisionSpatialReasoning tests the model's ability to identify // objects based on their spatial position in the image. func TestVisionSpatialReasoning(t *testing.T) { skipUnderMinVRAM(t, 6) for _, model := range testModels(defaultVisionModels) { t.Run(model, func(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) defer cancel() client, _, cleanup := InitServerConnection(ctx, t) defer cleanup() setupVisionModel(ctx, t, client, model) _, docs := decodeTestImages(t) // The docs image has: leftmost llama on laptop with glasses, // rightmost llama sleeping. req := api.ChatRequest{ Model: model, Messages: []api.Message{ { Role: "user", Content: "What is the animal on the far left doing in this image?", Images: []api.ImageData{docs}, }, }, Stream: &stream, Options: map[string]any{"temperature": 0.0, "seed": 42}, } DoChat(ctx, t, client, req, []string{ "laptop", "computer", "typing", "working", }, 120*time.Second, 30*time.Second) }) } } // TestVisionDetailRecognition tests whether the model can identify // small details like accessories in an image. func TestVisionDetailRecognition(t *testing.T) { skipUnderMinVRAM(t, 6) for _, model := range testModels(defaultVisionModels) { t.Run(model, func(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) defer cancel() client, _, cleanup := InitServerConnection(ctx, t) defer cleanup() setupVisionModel(ctx, t, client, model) _, docs := decodeTestImages(t) req := api.ChatRequest{ Model: model, Messages: []api.Message{ { Role: "user", Content: "Are any of the animals wearing glasses? Describe what you see.", Images: []api.ImageData{docs}, }, }, Stream: &stream, Options: map[string]any{"temperature": 0.0, "seed": 42}, } DoChat(ctx, t, client, req, []string{ "glasses", "spectacles", "eyeglasses", }, 120*time.Second, 30*time.Second) }) } } // TestVisionMultiImage sends two images in a single message and asks // the model to compare and contrast them. This exercises multi-image // encoding and cross-image reasoning. func TestVisionMultiImage(t *testing.T) { skipUnderMinVRAM(t, 6) // Multi-image support varies across models. skipModels := map[string]string{ "llama3.2-vision": "does not support multi-image input", } for _, model := range testModels(defaultVisionModels) { t.Run(model, func(t *testing.T) { if reason, ok := skipModels[model]; ok && testModel == "" { t.Skipf("skipping: %s", reason) } ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() client, _, cleanup := InitServerConnection(ctx, t) defer cleanup() setupVisionModel(ctx, t, client, model) abbeyRoad, docs := decodeTestImages(t) req := api.ChatRequest{ Model: model, Messages: []api.Message{ { Role: "user", Content: "I'm showing you two images. What do they have in common, and how are they different?", Images: []api.ImageData{abbeyRoad, docs}, }, }, Stream: &stream, Options: map[string]any{"temperature": 0.0, "seed": 42}, } // Both images feature cartoon llamas/alpacas — the model should // note the common subject and the different settings. DoChat(ctx, t, client, req, []string{ "llama", "alpaca", "animal", "cartoon", }, 120*time.Second, 30*time.Second) }) } } // TestVisionOCR tests text extraction from an image. The docs image // contains the text "Ollama's documentation" in a header. func TestVisionOCR(t *testing.T) { skipUnderMinVRAM(t, 6) for _, model := range testModels(defaultVisionModels) { t.Run(model, func(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) defer cancel() client, _, cleanup := InitServerConnection(ctx, t) defer cleanup() setupVisionModel(ctx, t, client, model) _, docs := decodeTestImages(t) req := api.ChatRequest{ Model: model, Messages: []api.Message{ { Role: "user", Content: "What text appears in this image? Read all visible text.", Images: []api.ImageData{docs}, }, }, Stream: &stream, Options: map[string]any{"temperature": 0.0, "seed": 42}, } DoChat(ctx, t, client, req, []string{ "ollama", "documentation", }, 120*time.Second, 30*time.Second) }) } }