x/grammar: add experimental GPU accelerated constrained decoding package

This commit is contained in:
jmorganca
2026-01-10 16:42:45 -08:00
parent 7cc2a653f2
commit e23ddd84b8
38 changed files with 5819 additions and 36 deletions

View File

@@ -0,0 +1,45 @@
# Grammar for one JSON object whose fields appear in a fixed order:
# id, kind, items, alt, flags, meta, priority.
root ::= ws "{" ws id-field "," ws kind-field "," ws items-field "," ws alt-field "," ws flags-field "," ws meta-field "," ws priority-field ws "}" ws
# Top-level fields: a quoted key, a colon, then the rule for the field's value.
id-field ::= "\"id\"" ws ":" ws uuid
kind-field ::= "\"kind\"" ws ":" ws kind
items-field ::= "\"items\"" ws ":" ws items
alt-field ::= "\"alt\"" ws ":" ws alt
flags-field ::= "\"flags\"" ws ":" ws flags
meta-field ::= "\"meta\"" ws ":" ws meta
priority-field ::= "\"priority\"" ws ":" ws int
# Enumerated string values.
kind ::= "\"order\"" | "\"invoice\"" | "\"shipment\""
status ::= "\"new\"" | "\"backorder\"" | "\"shipped\""
flag ::= "\"fragile\"" | "\"gift\"" | "\"priority\"" | "\"insured\""
source ::= "\"api\"" | "\"batch\"" | "\"import\""
# Arrays: items holds one to three item objects; flags holds zero to four flag values.
items ::= "[" ws item ( "," ws item )? ( "," ws item )? ws "]"
flags ::= "[" ws "]" | "[" ws flag ( "," ws flag )? ( "," ws flag )? ( "," ws flag )? ws "]"
# An item object with sku, qty, status and notes, in that order.
item ::= "{" ws item-sku "," ws item-qty "," ws item-status "," ws item-notes ws "}"
item-sku ::= "\"sku\"" ws ":" ws string
item-qty ::= "\"qty\"" ws ":" ws int
item-status ::= "\"status\"" ws ":" ws status
item-notes ::= "\"notes\"" ws ":" ws string
# The meta object with created, source and ip, in that order.
meta ::= "{" ws meta-created "," ws meta-source "," ws meta-ip ws "}"
meta-created ::= "\"created\"" ws ":" ws date-time
meta-source ::= "\"source\"" ws ":" ws source
meta-ip ::= "\"ip\"" ws ":" ws ipv4
# alt is a string, an integer, or the literal null.
alt ::= string | int | "null"
# Formatted strings. uuid is 8-4-4-4-12 hex digits; date-time ends in "Z" or a
# signed hh:mm offset; ipv4 is four dot-separated digit runs (not range-checked).
uuid ::= "\"" hex hex hex hex hex hex hex hex "-" hex hex hex hex "-" hex hex hex hex "-" hex hex hex hex "-" hex hex hex hex hex hex hex hex hex hex hex hex "\""
date-time ::= "\"" digit digit digit digit "-" digit digit "-" digit digit "T" digit digit ":" digit digit ":" digit digit ( "Z" | ( "+" | "-" ) digit digit ":" digit digit ) "\""
ipv4 ::= "\"" digit+ "." digit+ "." digit+ "." digit+ "\""
# Primitive building blocks. int permits leading zeros; ws may be empty.
string ::= "\"" characters "\""
characters ::= character*
character ::= [^"\\] | "\\" escape
escape ::= ["\\bfnrt]
int ::= "-"? digit+
digit ::= [0-9]
hex ::= [0-9a-fA-F]
ws ::= [ \t\n\r]*

View File

@@ -0,0 +1,46 @@
{
"type": "object",
"properties": {
"id": { "type": "string", "format": "uuid" },
"kind": { "enum": ["order", "invoice", "shipment"] },
"items": {
"type": "array",
"minItems": 1,
"maxItems": 3,
"items": {
"type": "object",
"properties": {
"sku": { "type": "string" },
"qty": { "type": "integer" },
"status": { "enum": ["new", "backorder", "shipped"] },
"notes": { "type": "string" }
},
"required": ["sku", "qty", "status", "notes"]
}
},
"alt": {
"oneOf": [
{ "type": "string" },
{ "type": "null" },
{ "type": "integer" }
]
},
"flags": {
"type": "array",
"minItems": 0,
"maxItems": 4,
"items": { "enum": ["fragile", "gift", "priority", "insured"] }
},
"meta": {
"type": "object",
"properties": {
"created": { "type": "string", "format": "date-time" },
"source": { "enum": ["api", "batch", "import"] },
"ip": { "type": "string", "format": "ipv4" }
},
"required": ["created", "source", "ip"]
},
"priority": { "type": "integer" }
},
"required": ["id", "kind", "items", "alt", "flags", "meta", "priority"]
}

View File

@@ -0,0 +1,235 @@
//go:build mlx
package main
import (
"encoding/json"
"flag"
"fmt"
"os"
"time"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/x/grammar"
"github.com/ollama/ollama/x/grammar/schema"
"github.com/ollama/ollama/x/imagegen/mlx"
)
// jsonGBNF is the built-in GBNF grammar describing generic JSON values
// (objects, arrays, strings, numbers, true, false, null). main hands it to
// llama.NewGrammar when no -gbnf file is given.
const jsonGBNF = `
root ::= value
value ::= object | array | string | number | "true" | "false" | "null"
object ::= "{" ws "}" | "{" members "}"
members ::= member ("," member)*
member ::= ws string ws ":" element
array ::= "[" ws "]" | "[" elements "]"
elements ::= element ("," element)*
element ::= ws value ws
string ::= "\"" characters "\""
characters ::= character*
character ::= [^"\\] | "\\" escape
escape ::= ["\\bfnrt]
number ::= "-"? integer fraction? exponent?
integer ::= "0" | [1-9] [0-9]*
fraction ::= "." [0-9]+
exponent ::= [eE] [+-]? [0-9]+
ws ::= [ \t\n\r]*
`
// result is the benchmark report written to stdout as a single JSON object.
// Timings are stored as strings produced by time.Duration.String.
type result struct {
	// VocabSize must be exported: encoding/json skips unexported fields, so
	// an unexported field would never appear as "vocab_size" in the output.
	VocabSize           int    `json:"vocab_size"`
	Iterations          int    `json:"iterations"`
	Warmup              int    `json:"warmup"`
	ConstrainedSource   string `json:"constrained_source"`
	LlamaSource         string `json:"llama_source"`
	LlamaApply          string `json:"llama_apply"`
	ConstrainedGraph    string `json:"constrained_graph"`
	ConstrainedWithEval string `json:"constrained_with_eval,omitempty"`
	EvalOnly            string `json:"eval_only,omitempty"`
	ConstrainedEvalNet  string `json:"constrained_eval_net,omitempty"`
}

// main runs the benchmark and exits with the status it returns.
//
// The work lives in runBenchmark because os.Exit does not run deferred
// functions: returning a status lets the deferred engine.Close and
// llamaGrammar.Free calls finish before the process exits.
func main() {
	os.Exit(runBenchmark())
}

// runBenchmark parses the command-line flags, times mask application for the
// grammar engine and for the llama grammar over a synthetic vocabulary, and
// writes a result as JSON to stdout.
//
// It returns the process exit status: 0 on success, 2 for invalid flag
// values, and 1 for any other failure. Failures are reported on stderr.
func runBenchmark() int {
	var (
		vocabSize  = flag.Int("vocab-size", 128000, "Vocabulary size")
		iterations = flag.Int("iterations", 500, "Benchmark iterations")
		warmup     = flag.Int("warmup", 50, "Warmup iterations")
		withEval   = flag.Bool("eval", true, "Measure ApplyMask with mlx.Eval")
		gbnfPath   = flag.String("gbnf", "", "GBNF grammar file for llama.cpp")
		schemaPath = flag.String("schema", "", "JSON Schema file for grammar constraints")
		ebnfPath   = flag.String("ebnf", "", "EBNF grammar file for grammar constraints")
		startRule  = flag.String("start", "root", "Start rule for EBNF")
	)
	flag.Parse()

	// Reject bad flag combinations before allocating the vocabulary.
	if *vocabSize <= 0 || *iterations <= 0 || *warmup < 0 {
		fmt.Fprintln(os.Stderr, "invalid flags")
		return 2
	}
	if *schemaPath != "" && *ebnfPath != "" {
		fmt.Fprintln(os.Stderr, "only one of -schema or -ebnf may be set")
		return 2
	}

	vocab := createVocab(*vocabSize)

	// Pick the grammar for the constrained engine: a JSON Schema file, an
	// EBNF file, or the package's built-in JSON grammar.
	var (
		constrainedSource string
		compiled          *grammar.Grammar
		err               error
	)
	switch {
	case *schemaPath != "":
		data, readErr := os.ReadFile(*schemaPath)
		if readErr != nil {
			fmt.Fprintf(os.Stderr, "read schema: %v\n", readErr)
			return 1
		}
		compiled, err = schema.Grammar(string(data))
		constrainedSource = "schema:" + *schemaPath
	case *ebnfPath != "":
		data, readErr := os.ReadFile(*ebnfPath)
		if readErr != nil {
			fmt.Fprintf(os.Stderr, "read ebnf: %v\n", readErr)
			return 1
		}
		compiled, err = grammar.ParseEBNF(string(data), *startRule)
		constrainedSource = "ebnf:" + *ebnfPath
	default:
		compiled, err = grammar.JSONGrammar()
		constrainedSource = "json"
	}
	if err != nil {
		fmt.Fprintf(os.Stderr, "grammar: %v\n", err)
		return 1
	}

	engine, err := grammar.NewEngine(compiled, vocab)
	if err != nil {
		fmt.Fprintf(os.Stderr, "engine: %v\n", err)
		return 1
	}
	defer engine.Close()

	logits := mlx.Ones(int32(*vocabSize))
	mlx.Keep(logits)

	// NOTE(review): masked arrays are never freed explicitly, unlike the
	// baseline array below - confirm who owns ApplyMask's result.
	for i := 0; i < *warmup; i++ {
		masked := engine.ApplyMask(logits)
		if *withEval {
			mlx.Eval(masked)
		}
	}

	// ApplyMask alone, without forcing evaluation of its result.
	graphAvg := measure(*iterations, func() {
		_ = engine.ApplyMask(logits)
	})

	var evalAvg, evalOnlyAvg time.Duration
	if *withEval {
		// Baseline: evaluate a multiply-by-one on the logits so that the
		// evaluation overhead can be subtracted from the masked timing.
		evalOnlyAvg = measure(*iterations, func() {
			baseline := mlx.MulScalar(logits, 1)
			mlx.Eval(baseline)
			baseline.Free()
		})
		evalAvg = measure(*iterations, func() {
			masked := engine.ApplyMask(logits)
			mlx.Eval(masked)
		})
	}

	vocabIDs := make([]uint32, *vocabSize)
	for i := range vocabIDs {
		vocabIDs[i] = uint32(i)
	}
	// NOTE(review): id 0 is also "{" in the synthetic vocabulary built by
	// createVocab - confirm this end-of-generation id is intended.
	eogTokens := []int32{0}

	gbnf := jsonGBNF
	llamaSource := "json"
	if *gbnfPath != "" {
		data, readErr := os.ReadFile(*gbnfPath)
		if readErr != nil {
			fmt.Fprintf(os.Stderr, "read gbnf: %v\n", readErr)
			return 1
		}
		gbnf = string(data)
		llamaSource = *gbnfPath
	}

	llamaGrammar := llama.NewGrammar(gbnf, vocabIDs, vocab, eogTokens)
	if llamaGrammar == nil {
		fmt.Fprintln(os.Stderr, "llama grammar initialization failed")
		return 1
	}
	defer llamaGrammar.Free()

	// Every logit is reset to 1.0 before each Apply call, in the warmup and
	// in the timed loop alike, so the reset is part of the llama timing.
	llamaTokens := make([]llama.TokenData, *vocabSize)
	applyLlama := func() {
		for j := range llamaTokens {
			llamaTokens[j].Logit = 1.0
		}
		llamaGrammar.Apply(llamaTokens)
	}
	for i := 0; i < *warmup; i++ {
		applyLlama()
	}
	llamaAvg := measure(*iterations, applyLlama)

	out := result{
		VocabSize:         *vocabSize,
		Iterations:        *iterations,
		Warmup:            *warmup,
		LlamaApply:        llamaAvg.String(),
		ConstrainedGraph:  graphAvg.String(),
		ConstrainedSource: constrainedSource,
		LlamaSource:       llamaSource,
	}
	if *withEval {
		out.ConstrainedWithEval = evalAvg.String()
		out.EvalOnly = evalOnlyAvg.String()
		// Net cost is clamped at zero when the baseline was not faster.
		if evalAvg > evalOnlyAvg {
			out.ConstrainedEvalNet = (evalAvg - evalOnlyAvg).String()
		} else {
			out.ConstrainedEvalNet = "0s"
		}
	}

	if err := json.NewEncoder(os.Stdout).Encode(out); err != nil {
		fmt.Fprintf(os.Stderr, "encode: %v\n", err)
		return 1
	}
	return 0
}
// measure calls fn iterations times back to back and returns the mean
// wall-clock duration of one call.
//
// A non-positive iterations count returns 0 without calling fn; without this
// guard, iterations == 0 would panic on an integer division by zero.
func measure(iterations int, fn func()) time.Duration {
	if iterations <= 0 {
		return 0
	}
	start := time.Now()
	for i := 0; i < iterations; i++ {
		fn()
	}
	return time.Since(start) / time.Duration(iterations)
}
// createVocab returns a synthetic vocabulary with exactly size entries.
//
// The leading entries are JSON structural tokens, literals and whitespace;
// every remaining slot i is filled with the placeholder "tok<i>". When size
// is smaller than the fixed token list, only the first size tokens are used.
func createVocab(size int) []string {
	fixed := [...]string{
		"{", "}", "[", "]", ":", ",",
		"true", "false", "null",
		" ", "\n", "\t", "\r",
		"\"",
	}

	tokens := make([]string, size)
	// copy stops at the shorter of the two slices, so a small size simply
	// truncates the fixed token list.
	copy(tokens, fixed[:])
	for id := len(fixed); id < size; id++ {
		tokens[id] = fmt.Sprintf("tok%d", id)
	}
	return tokens
}