mlx: respect tokenizer add_bos_token setting in pipeline (#15185)

Replace hardcoded Encode(prompt, true) with
Encode(prompt, r.Tokenizer.AddBOS()) so the pipeline respects each
model's tokenizer configuration.

Models with add_bos_token=true (gemma3, llama): unchanged; the
tokenizer still prepends BOS.

Models with bos_token=null (qwen3, qwen3.5): unchanged; the BOS
guard (vocab.BOS >= 0) already prevented prepending regardless of
the flag.

This aligns the pipeline with the /v1/tokenize endpoint, which already
uses Tokenizer.AddBOS().
This commit is contained in:
Daniel Hiltgen
2026-03-31 16:46:30 -07:00
committed by GitHub
parent d9cb70c270
commit 4d14b0ff92

View File

@@ -55,7 +55,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
slog.Info("peak memory", "size", mlx.PrettyBytes(mlx.PeakMemory()))
}()
-inputs := r.Tokenizer.Encode(request.Prompt, true)
+inputs := r.Tokenizer.Encode(request.Prompt, r.Tokenizer.AddBOS())
if len(inputs) == 0 {
return errors.New("empty prompt")
}