restructure

image processing Update model.go Update model.go Update model.go no projector no projector vision model scaffold ... ... wip ... rebase fix patch merger tidy ... Update model_vision.go server: do not attempt to parse offset file as gguf This logic was causing issues for me when importing a gguf that had some padding at the end of the file. The valid gguf would be read, but then it would try to read the offset as a different gguf file. This does not seem right. Update process_image_test.go apply norm prompt processing prompt processing fix post tokenize fix gguf padding + populate the split patch embeddings ... ... another shot at patch embeddings ... patch embedding Update model_vision.go split pixels
2026-04-23 01:05:47 +02:00 · 2025-04-02 10:41:51 -07:00
parent 198b1e6db9
commit c1f9bcb4dd
17 changed files with 1194 additions and 208 deletions
--- a/model/models/qwen25vl/process_image.go
+++ b/model/models/qwen25vl/process_image.go
@@ -0,0 +1,196 @@
+package qwen25vl
+
+import (
+	"fmt"
+	"image"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+// ImageProcessor contains configuration for the Qwen 2.5 VL image processing
+type ImageProcessor struct {
+	imageSize         int
+	numChannels       int
+	patchSize         int
+	temporalPatchSize int
+	mergeSize         int
+	minPixels         int
+	maxPixels         int
+	factor            int
+	rescaleFactor     float32
+	imageMean         []float32
+	imageStd          []float32
+}
+
+// newImageProcessor creates a new image processor with default values
+func newImageProcessor(c fs.Config) ImageProcessor {
+
+	patchSize := int(c.Uint("vision.patch_size", 14))
+	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
+
+	return ImageProcessor{
+		imageSize:         int(c.Uint("vision.image_size", 560)),
+		numChannels:       3,
+		patchSize:         patchSize,
+		temporalPatchSize: 2,
+		mergeSize:         mergeSize,
+		minPixels:         56 * 56,
+		maxPixels:         28 * 28 * 4 * 1280,
+		factor:            patchSize * mergeSize,
+		rescaleFactor:     1.0 / 255.0,
+		imageMean:         []float32{0.48145466, 0.4578275, 0.40821073},
+		imageStd:          []float32{0.26862954, 0.26130258, 0.27577711},
+	}
+}
+
+// SmartResize implements the smart resize algorithm
+func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
+	factor := p.factor
+
+	if height < factor || width < factor {
+		panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
+	} else if float64(max(height, width))/float64(min(height, width)) > 200 {
+		aspectRatio := float64(max(height, width)) / float64(min(height, width))
+		panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %f", aspectRatio))
+	}
+
+	round := func(x float64) int {
+		return int(math.Round(x))
+	}
+	hBar := round(float64(height)/float64(factor)) * factor
+	wBar := round(float64(width)/float64(factor)) * factor
+
+	if hBar*wBar > p.maxPixels {
+		beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
+
+		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
+		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
+	} else if hBar*wBar < p.minPixels {
+		beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
+
+		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
+		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
+	}
+
+	return hBar, wBar
+}
+
+func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, int, int, int, error) {
+	origWidth := img.Bounds().Dx()
+	origHeight := img.Bounds().Dy()
+
+	// Calculate smart resize dimensions
+	resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
+
+	// Resize image using existing functions
+	resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
+
+	normalizedPixels := imageproc.Normalize(
+		resizedImg,
+		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
+		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
+		true, // rescale
+		true, // channelFirst
+	)
+
+	// Calculate grid dimensions
+	gridH := resizedHeight / p.patchSize
+	gridW := resizedWidth / p.patchSize
+	gridT := 1 // For single images, temporal dimension is 1
+
+	patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, gridH, gridW, gridT)
+	if err != nil {
+		return nil, 0, 0, 0, fmt.Errorf("failed to create patches: %v", err)
+	}
+
+	// Return patches and grid dimensions
+	return patches, gridT, gridH, gridW, nil
+}
+
+func (p *ImageProcessor) createPatches(pixels []float32, height, width, gridH, gridW, gridT int) ([]float32, error) {
+	channels := p.numChannels
+	patchSize := p.patchSize
+	mergeSize := p.mergeSize
+	temporalPatchSize := p.temporalPatchSize
+
+	// Calculate output dimensions
+	numPatches := gridT * gridH * gridW
+	patchDim := channels * temporalPatchSize * patchSize * patchSize
+
+	// Create output tensor
+	result := make([]float32, numPatches*patchDim)
+
+	// Instead of the complex 9D reshape+transpose, directly extract patches
+	// in the format expected by the forward pass
+	patchIndex := 0
+
+	for t := 0; t < gridT; t++ {
+		// For each patch in the grid
+		for h := 0; h < gridH; h += mergeSize {
+			for w := 0; w < gridW; w += mergeSize {
+				// Handle the 2x2 merged patches
+				for mh := 0; mh < mergeSize; mh++ {
+					for mw := 0; mw < mergeSize; mw++ {
+						// For each pixel in the patch
+						for py := 0; py < patchSize; py++ {
+							for px := 0; px < patchSize; px++ {
+								// Calculate source coordinates
+								y := (h+mh)*patchSize + py
+								x := (w+mw)*patchSize + px
+
+								// For each channel
+								for c := 0; c < channels; c++ {
+									// Channel-first format (CHW)
+									srcIdx := c*height*width + y*width + x
+
+									// Calculate destination index based on the expected layout
+									// This is the key part that matches what the model expects
+									dstIdx := patchIndex*patchDim +
+										(c * temporalPatchSize * patchSize * patchSize) +
+										(0 * patchSize * patchSize) + // temporal dim
+										(py * patchSize) +
+										px
+
+									if srcIdx < len(pixels) && dstIdx < len(result) {
+										result[dstIdx] = pixels[srcIdx]
+									}
+								}
+							}
+						}
+
+						// Handle temporal dimension padding (if needed)
+						for tp := 1; tp < temporalPatchSize; tp++ {
+							for py := 0; py < patchSize; py++ {
+								for px := 0; px < patchSize; px++ {
+									for c := 0; c < channels; c++ {
+										srcIdx := patchIndex*patchDim +
+											(c * temporalPatchSize * patchSize * patchSize) +
+											(0 * patchSize * patchSize) + // first temporal frame
+											(py * patchSize) +
+											px
+
+										dstIdx := patchIndex*patchDim +
+											(c * temporalPatchSize * patchSize * patchSize) +
+											(tp * patchSize * patchSize) + // current temporal frame
+											(py * patchSize) +
+											px
+
+										if srcIdx < len(result) && dstIdx < len(result) {
+											result[dstIdx] = result[srcIdx] // Copy from first frame
+										}
+									}
+								}
+							}
+						}
+
+						patchIndex++
+					}
+				}
+			}
+		}
+	}
+
+	return result, nil
+}