package gemma4

import (
	"image"
	"math"

	"golang.org/x/image/draw"

	"github.com/ollama/ollama/fs"
)

// ImageProcessor carries the vision preprocessing parameters used to resize
// an image and flatten it into normalized float32 planes.
type ImageProcessor struct {
	// patchSize is the edge length, in pixels, of one vision patch.
	patchSize int
	// numChannels is read from the model config ("vision.num_channels").
	numChannels int
	// nMerge is the projector scale factor; output dimensions are aligned to
	// multiples of patchSize*nMerge.
	nMerge int
	// minPixels and maxPixels are the token limits expressed as pixel counts.
	minPixels, maxPixels int
}

// newImageProcessor builds an ImageProcessor from the model configuration,
// falling back to patch size 16, scale factor 3 and 3 channels when a key is
// absent.
func newImageProcessor(c fs.Config) ImageProcessor {
	// Token limits from the reference: between 40 and 280 output tokens after
	// pooling.
	const (
		minTokens = 40
		maxTokens = 280
	)

	proc := ImageProcessor{
		patchSize:   int(c.Uint("vision.patch_size", 16)),
		nMerge:      int(c.Uint("vision.projector.scale_factor", 3)),
		numChannels: int(c.Uint("vision.num_channels", 3)),
	}

	// One output token covers a (patchSize*nMerge) x (patchSize*nMerge) square,
	// so pixels = tokens * nMerge^2 * patchSize^2.
	tokenSide := proc.patchSize * proc.nMerge
	pixelsPerToken := tokenSide * tokenSide
	proc.minPixels = minTokens * pixelsPerToken
	proc.maxPixels = maxTokens * pixelsPerToken

	return proc
}

// ProcessImage resizes img to the dimensions chosen by smartResize (aspect
// ratio preserved, both sides aligned to patchSize*nMerge) and converts the
// result to channel-first float32 data normalized to [-1, 1].
//
// It returns the pixel data followed by the output width and height. The
// error result is currently always nil.
func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, int, int, error) {
	srcBounds := img.Bounds()
	width, height := p.smartResize(srcBounds.Dx(), srcBounds.Dy(), p.patchSize*p.nMerge)

	// Scale straight into a fresh RGBA buffer; no separate background
	// compositing pass runs beforehand (noted as matching the MLX reference).
	canvas := image.NewRGBA(image.Rect(0, 0, width, height))
	draw.BiLinear.Scale(canvas, canvas.Bounds(), img, srcBounds, draw.Over, nil)

	// pack applies mean=0.5, std=0.5 normalization:
	// (pixel/255 - 0.5) / 0.5 = 2*pixel/255 - 1.
	return p.pack(canvas), width, height, nil
}

// smartResize computes target dimensions that preserve aspect ratio and
// align to alignSize boundaries. It scales the image to fill the maximum
// patch budget (maxPixels), matching the MLX reference.
func (p *ImageProcessor) smartResize(origW, origH, alignSize int) (int, int) { totalPx := origW * origH var targetW, targetH int if p.maxPixels > 0 && totalPx > 0 { factor := math.Sqrt(float64(p.maxPixels) / float64(totalPx)) targetH = max(alignSize, int(math.Floor(factor*float64(origH)/float64(alignSize)))*alignSize) targetW = max(alignSize, int(math.Floor(factor*float64(origW)/float64(alignSize)))*alignSize) } else { targetH = max(alignSize, (origH/alignSize)*alignSize) targetW = max(alignSize, (origW/alignSize)*alignSize) } return targetW, targetH } // pack extracts RGB values from an image and normalizes to [-1, 1]. // Returns channel-first layout: [R..., G..., B...]. func (p *ImageProcessor) pack(img image.Image) []float32 { bounds := img.Bounds() w := bounds.Dx() h := bounds.Dy() size := w * h pixelVals := make([]float32, 3*size) rOff, gOff, bOff := 0, size, 2*size for y := bounds.Min.Y; y < bounds.Max.Y; y++ { for x := bounds.Min.X; x < bounds.Max.X; x++ { c := img.At(x, y) r, g, b, _ := c.RGBA() idx := (y-bounds.Min.Y)*w + (x - bounds.Min.X) // Normalize [0, 255] -> [-1, 1]: 2 * (val/255) - 1 pixelVals[rOff+idx] = float32(r>>8)/255.0*2.0 - 1.0 pixelVals[gOff+idx] = float32(g>>8)/255.0*2.0 - 1.0 pixelVals[bOff+idx] = float32(b>>8)/255.0*2.0 - 1.0 } } return pixelVals }