x/grammar: add experimental GPU accelerated constrained decoding package

2026-04-19 20:54:25 +02:00 · 2026-01-10 16:42:45 -08:00
parent 7cc2a653f2
commit e23ddd84b8
38 changed files with 5819 additions and 36 deletions
--- a/x/grammar/cmd/compare/complex.gbnf
+++ b/x/grammar/cmd/compare/complex.gbnf
@@ -0,0 +1,45 @@
+root ::= ws "{" ws id-field "," ws kind-field "," ws items-field "," ws alt-field "," ws flags-field "," ws meta-field "," ws priority-field ws "}" ws
+
+id-field ::= "\"id\"" ws ":" ws uuid
+kind-field ::= "\"kind\"" ws ":" ws kind
+items-field ::= "\"items\"" ws ":" ws items
+alt-field ::= "\"alt\"" ws ":" ws alt
+flags-field ::= "\"flags\"" ws ":" ws flags
+meta-field ::= "\"meta\"" ws ":" ws meta
+priority-field ::= "\"priority\"" ws ":" ws int
+
+kind ::= "\"order\"" | "\"invoice\"" | "\"shipment\""
+status ::= "\"new\"" | "\"backorder\"" | "\"shipped\""
+flag ::= "\"fragile\"" | "\"gift\"" | "\"priority\"" | "\"insured\""
+source ::= "\"api\"" | "\"batch\"" | "\"import\""
+
+items ::= "[" ws item ( "," ws item )? ( "," ws item )? ws "]"
+flags ::= "[" ws "]" | "[" ws flag ( "," ws flag )? ( "," ws flag )? ( "," ws flag )? ws "]"
+
+item ::= "{" ws item-sku "," ws item-qty "," ws item-status "," ws item-notes ws "}"
+item-sku ::= "\"sku\"" ws ":" ws string
+item-qty ::= "\"qty\"" ws ":" ws int
+item-status ::= "\"status\"" ws ":" ws status
+item-notes ::= "\"notes\"" ws ":" ws string
+
+meta ::= "{" ws meta-created "," ws meta-source "," ws meta-ip ws "}"
+meta-created ::= "\"created\"" ws ":" ws date-time
+meta-source ::= "\"source\"" ws ":" ws source
+meta-ip ::= "\"ip\"" ws ":" ws ipv4
+
+alt ::= string | int | "null"
+
+uuid ::= "\"" hex hex hex hex hex hex hex hex "-" hex hex hex hex "-" hex hex hex hex "-" hex hex hex hex "-" hex hex hex hex hex hex hex hex hex hex hex hex "\""
+date-time ::= "\"" digit digit digit digit "-" digit digit "-" digit digit "T" digit digit ":" digit digit ":" digit digit ( "Z" | ( "+" | "-" ) digit digit ":" digit digit ) "\""
+ipv4 ::= "\"" digit+ "." digit+ "." digit+ "." digit+ "\""
+
+string ::= "\"" characters "\""
+characters ::= character*
+character ::= [^"\\] | "\\" escape
+escape ::= ["\\bfnrt]
+
+int ::= "-"? digit+
+digit ::= [0-9]
+hex ::= [0-9a-fA-F]
+
+ws ::= [ \t\n\r]*
--- a/x/grammar/cmd/compare/complex.schema.json
+++ b/x/grammar/cmd/compare/complex.schema.json
@@ -0,0 +1,46 @@
+{
+  "type": "object",
+  "properties": {
+    "id": { "type": "string", "format": "uuid" },
+    "kind": { "enum": ["order", "invoice", "shipment"] },
+    "items": {
+      "type": "array",
+      "minItems": 1,
+      "maxItems": 3,
+      "items": {
+        "type": "object",
+        "properties": {
+          "sku": { "type": "string" },
+          "qty": { "type": "integer" },
+          "status": { "enum": ["new", "backorder", "shipped"] },
+          "notes": { "type": "string" }
+        },
+        "required": ["sku", "qty", "status", "notes"]
+      }
+    },
+    "alt": {
+      "oneOf": [
+        { "type": "string" },
+        { "type": "null" },
+        { "type": "integer" }
+      ]
+    },
+    "flags": {
+      "type": "array",
+      "minItems": 0,
+      "maxItems": 4,
+      "items": { "enum": ["fragile", "gift", "priority", "insured"] }
+    },
+    "meta": {
+      "type": "object",
+      "properties": {
+        "created": { "type": "string", "format": "date-time" },
+        "source": { "enum": ["api", "batch", "import"] },
+        "ip": { "type": "string", "format": "ipv4" }
+      },
+      "required": ["created", "source", "ip"]
+    },
+    "priority": { "type": "integer" }
+  },
+  "required": ["id", "kind", "items", "alt", "flags", "meta", "priority"]
+}
--- a/x/grammar/cmd/compare/main.go
+++ b/x/grammar/cmd/compare/main.go
@@ -0,0 +1,235 @@
+//go:build mlx
+
+package main
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"os"
+	"time"
+
+	"github.com/ollama/ollama/llama"
+	"github.com/ollama/ollama/x/grammar"
+	"github.com/ollama/ollama/x/grammar/schema"
+	"github.com/ollama/ollama/x/imagegen/mlx"
+)
+
+const jsonGBNF = `
+root ::= value
+value ::= object | array | string | number | "true" | "false" | "null"
+object ::= "{" ws "}" | "{" members "}"
+members ::= member ("," member)*
+member ::= ws string ws ":" element
+array ::= "[" ws "]" | "[" elements "]"
+elements ::= element ("," element)*
+element ::= ws value ws
+string ::= "\"" characters "\""
+characters ::= character*
+character ::= [^"\\] | "\\" escape
+escape ::= ["\\bfnrt]
+number ::= "-"? integer fraction? exponent?
+integer ::= "0" | [1-9] [0-9]*
+fraction ::= "." [0-9]+
+exponent ::= [eE] [+-]? [0-9]+
+ws ::= [ \t\n\r]*
+`
+
+type result struct {
+	vocabSize           int    `json:"vocab_size"`
+	Iterations          int    `json:"iterations"`
+	Warmup              int    `json:"warmup"`
+	ConstrainedSource   string `json:"constrained_source"`
+	LlamaSource         string `json:"llama_source"`
+	LlamaApply          string `json:"llama_apply"`
+	ConstrainedGraph    string `json:"constrained_graph"`
+	ConstrainedWithEval string `json:"constrained_with_eval,omitempty"`
+	EvalOnly            string `json:"eval_only,omitempty"`
+	ConstrainedEvalNet  string `json:"constrained_eval_net,omitempty"`
+}
+
+func main() {
+	var (
+		vocabSize  = flag.Int("vocab-size", 128000, "Vocabulary size")
+		iterations = flag.Int("iterations", 500, "Benchmark iterations")
+		warmup     = flag.Int("warmup", 50, "Warmup iterations")
+		withEval   = flag.Bool("eval", true, "Measure ApplyMask with mlx.Eval")
+		gbnfPath   = flag.String("gbnf", "", "GBNF grammar file for llama.cpp")
+		schemaPath = flag.String("schema", "", "JSON Schema file for grammar constraints")
+		ebnfPath   = flag.String("ebnf", "", "EBNF grammar file for grammar constraints")
+		startRule  = flag.String("start", "root", "Start rule for EBNF")
+	)
+	flag.Parse()
+
+	if *vocabSize <= 0 || *iterations <= 0 || *warmup < 0 {
+		fmt.Fprintln(os.Stderr, "invalid flags")
+		os.Exit(2)
+	}
+
+	vocab := createVocab(*vocabSize)
+
+	if *schemaPath != "" && *ebnfPath != "" {
+		fmt.Fprintln(os.Stderr, "only one of -schema or -ebnf may be set")
+		os.Exit(2)
+	}
+
+	var constrainedSource string
+	var compiled *grammar.Grammar
+	var err error
+	switch {
+	case *schemaPath != "":
+		data, readErr := os.ReadFile(*schemaPath)
+		if readErr != nil {
+			fmt.Fprintf(os.Stderr, "read schema: %v\n", readErr)
+			os.Exit(1)
+		}
+		compiled, err = schema.Grammar(string(data))
+		constrainedSource = "schema:" + *schemaPath
+	case *ebnfPath != "":
+		data, readErr := os.ReadFile(*ebnfPath)
+		if readErr != nil {
+			fmt.Fprintf(os.Stderr, "read ebnf: %v\n", readErr)
+			os.Exit(1)
+		}
+		compiled, err = grammar.ParseEBNF(string(data), *startRule)
+		constrainedSource = "ebnf:" + *ebnfPath
+	default:
+		compiled, err = grammar.JSONGrammar()
+		constrainedSource = "json"
+	}
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "grammar: %v\n", err)
+		os.Exit(1)
+	}
+	engine, err := grammar.NewEngine(compiled, vocab)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "engine: %v\n", err)
+		os.Exit(1)
+	}
+	defer engine.Close()
+
+	logits := mlx.Ones(int32(*vocabSize))
+	mlx.Keep(logits)
+
+	for i := 0; i < *warmup; i++ {
+		masked := engine.ApplyMask(logits)
+		if *withEval {
+			mlx.Eval(masked)
+		}
+	}
+
+	graphAvg := measure(*iterations, func() {
+		_ = engine.ApplyMask(logits)
+	})
+
+	var evalAvg time.Duration
+	var evalOnlyAvg time.Duration
+	if *withEval {
+		evalOnlyAvg = measure(*iterations, func() {
+			baseline := mlx.MulScalar(logits, 1)
+			mlx.Eval(baseline)
+			baseline.Free()
+		})
+
+		evalAvg = measure(*iterations, func() {
+			masked := engine.ApplyMask(logits)
+			mlx.Eval(masked)
+		})
+	}
+
+	vocabIDs := make([]uint32, *vocabSize)
+	for i := range vocabIDs {
+		vocabIDs[i] = uint32(i)
+	}
+	eogTokens := []int32{0}
+
+	gbnf := jsonGBNF
+	llamaSource := "json"
+	if *gbnfPath != "" {
+		data, readErr := os.ReadFile(*gbnfPath)
+		if readErr != nil {
+			fmt.Fprintf(os.Stderr, "read gbnf: %v\n", readErr)
+			os.Exit(1)
+		}
+		gbnf = string(data)
+		llamaSource = *gbnfPath
+	}
+
+	llamaGrammar := llama.NewGrammar(gbnf, vocabIDs, vocab, eogTokens)
+	if llamaGrammar == nil {
+		fmt.Fprintln(os.Stderr, "llama grammar initialization failed")
+		os.Exit(1)
+	}
+	defer llamaGrammar.Free()
+
+	llamaTokens := make([]llama.TokenData, *vocabSize)
+
+	for i := 0; i < *warmup; i++ {
+		for j := range llamaTokens {
+			llamaTokens[j].Logit = 1.0
+		}
+		llamaGrammar.Apply(llamaTokens)
+	}
+
+	llamaAvg := measure(*iterations, func() {
+		for j := range llamaTokens {
+			llamaTokens[j].Logit = 1.0
+		}
+		llamaGrammar.Apply(llamaTokens)
+	})
+
+	out := result{
+		vocabSize:         *vocabSize,
+		Iterations:        *iterations,
+		Warmup:            *warmup,
+		LlamaApply:        llamaAvg.String(),
+		ConstrainedGraph:  graphAvg.String(),
+		ConstrainedSource: constrainedSource,
+		LlamaSource:       llamaSource,
+	}
+	if *withEval {
+		out.ConstrainedWithEval = evalAvg.String()
+		out.EvalOnly = evalOnlyAvg.String()
+		if evalAvg > evalOnlyAvg {
+			out.ConstrainedEvalNet = (evalAvg - evalOnlyAvg).String()
+		} else {
+			out.ConstrainedEvalNet = "0s"
+		}
+	}
+
+	enc := json.NewEncoder(os.Stdout)
+	if err := enc.Encode(out); err != nil {
+		fmt.Fprintf(os.Stderr, "encode: %v\n", err)
+		os.Exit(1)
+	}
+}
+
+func measure(iterations int, fn func()) time.Duration {
+	start := time.Now()
+	for i := 0; i < iterations; i++ {
+		fn()
+	}
+	return time.Since(start) / time.Duration(iterations)
+}
+
+func createVocab(size int) []string {
+	vocab := make([]string, size)
+
+	jsonTokens := []string{
+		"{", "}", "[", "]", ":", ",",
+		"true", "false", "null",
+		" ", "\n", "\t", "\r",
+		"\"",
+	}
+	for i, t := range jsonTokens {
+		if i < size {
+			vocab[i] = t
+		}
+	}
+
+	for i := len(jsonTokens); i < size; i++ {
+		vocab[i] = fmt.Sprintf("tok%d", i)
+	}
+
+	return vocab
+}