From 4d14b0ff92cc2116f91f33b2f85d78f7ef263f29 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Tue, 31 Mar 2026 16:46:30 -0700
Subject: [PATCH] mlx: respect tokenizer add_bos_token setting in pipeline
 (#15185)

Replace hardcoded Encode(prompt, true) with Encode(prompt,
r.Tokenizer.AddBOS()) so the pipeline respects each model's tokenizer
configuration.

Models with add_bos_token=true (gemma3, llama): unchanged, tokenizer
still prepends BOS.

Models with bos_token=null (qwen3, qwen3.5): unchanged, the BOS guard
(vocab.BOS >= 0) already prevented prepending regardless of the flag.

This aligns the pipeline with the /v1/tokenize endpoint which already
uses Tokenizer.AddBOS().
---
 x/mlxrunner/pipeline.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/x/mlxrunner/pipeline.go b/x/mlxrunner/pipeline.go
index d98d25ccd..4dcfad01d 100644
--- a/x/mlxrunner/pipeline.go
+++ b/x/mlxrunner/pipeline.go
@@ -55,7 +55,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		slog.Info("peak memory", "size", mlx.PrettyBytes(mlx.PeakMemory()))
 	}()
 
-	inputs := r.Tokenizer.Encode(request.Prompt, true)
+	inputs := r.Tokenizer.Encode(request.Prompt, r.Tokenizer.AddBOS())
 	if len(inputs) == 0 {
 		return errors.New("empty prompt")
 	}