mirror of
https://github.com/ollama/ollama.git
synced 2026-04-20 06:54:29 +02:00
consolidate the tokenizer (#14327)
This change adds a new x/tokenizer package which includes: * New BPE and SentencePiece tokenizers * Removing the dependency on the imagegen tokenizers * Fixes to multibyte decoding in the pipeline * Various correctness and benchmark tests Not included in this PR is the WordPiece tokenizer for BERT models which will be added when we add embedding models. The imagegen tokenizers will also be removed in a follow-up PR.
This commit is contained in:
47
x/mlxrunner/utf8_buffer.go
Normal file
47
x/mlxrunner/utf8_buffer.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package mlxrunner
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// flushValidUTF8Prefix returns and consumes the longest valid UTF-8 prefix
|
||||
// currently buffered, leaving any incomplete trailing bytes in place.
|
||||
func flushValidUTF8Prefix(b *bytes.Buffer) string {
|
||||
data := b.Bytes()
|
||||
if len(data) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
prefix := validUTF8PrefixLen(data)
|
||||
if prefix == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
text := string(data[:prefix])
|
||||
b.Next(prefix)
|
||||
return text
|
||||
}
|
||||
|
||||
// validUTF8PrefixLen returns the number of leading bytes of data that form
// complete UTF-8 sequences. A byte that is not valid UTF-8 but is still a
// "full" width-1 error rune counts toward the prefix; only a truncated
// multi-byte sequence at the tail of data is excluded.
func validUTF8PrefixLen(data []byte) int {
	n := 0
	for n < len(data) {
		rest := data[n:]
		r, size := utf8.DecodeRune(rest)

		// DecodeRune signals both "invalid byte" and "incomplete sequence"
		// as (RuneError, 1); FullRune distinguishes the two. An incomplete
		// sequence can only occur at the tail, so stop in front of it.
		if r == utf8.RuneError && size == 1 && !utf8.FullRune(rest) {
			break
		}

		// Either a well-formed rune or a complete invalid byte (size == 1):
		// advancing by size always makes forward progress.
		n += size
	}
	return n
}
|
||||
Reference in New Issue
Block a user