mirror of
https://github.com/ollama/ollama.git
synced 2026-04-28 03:39:48 +02:00
Compare commits
8 Commits
codex/fix-
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
03aee88186 | ||
|
|
ec9b4e9e47 | ||
|
|
4656a07e56 | ||
|
|
30f86cb9dd | ||
|
|
ea01af6f76 | ||
|
|
c2ebb4d57c | ||
|
|
590109c835 | ||
|
|
b4442c6d17 |
14
api/types.go
14
api/types.go
@@ -1080,7 +1080,7 @@ func DefaultOptions() Options {
|
||||
}
|
||||
}
|
||||
|
||||
// ThinkValue represents a value that can be a boolean or a string ("high", "medium", "low")
|
||||
// ThinkValue represents a value that can be a boolean or a string ("high", "medium", "low", "max")
|
||||
type ThinkValue struct {
|
||||
// Value can be a bool or string
|
||||
Value interface{}
|
||||
@@ -1096,7 +1096,7 @@ func (t *ThinkValue) IsValid() bool {
|
||||
case bool:
|
||||
return true
|
||||
case string:
|
||||
return v == "high" || v == "medium" || v == "low"
|
||||
return v == "high" || v == "medium" || v == "low" || v == "max"
|
||||
default:
|
||||
return false
|
||||
}
|
||||
@@ -1130,8 +1130,8 @@ func (t *ThinkValue) Bool() bool {
|
||||
case bool:
|
||||
return v
|
||||
case string:
|
||||
// Any string value ("high", "medium", "low") means thinking is enabled
|
||||
return v == "high" || v == "medium" || v == "low"
|
||||
// Any string value ("high", "medium", "low", "max") means thinking is enabled
|
||||
return v == "high" || v == "medium" || v == "low" || v == "max"
|
||||
default:
|
||||
return false
|
||||
}
|
||||
@@ -1169,14 +1169,14 @@ func (t *ThinkValue) UnmarshalJSON(data []byte) error {
|
||||
var s string
|
||||
if err := json.Unmarshal(data, &s); err == nil {
|
||||
// Validate string values
|
||||
if s != "high" && s != "medium" && s != "low" {
|
||||
return fmt.Errorf("invalid think value: %q (must be \"high\", \"medium\", \"low\", true, or false)", s)
|
||||
if s != "high" && s != "medium" && s != "low" && s != "max" {
|
||||
return fmt.Errorf("invalid think value: %q (must be \"high\", \"medium\", \"low\", \"max\", true, or false)", s)
|
||||
}
|
||||
t.Value = s
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("think must be a boolean or string (\"high\", \"medium\", \"low\", true, or false)")
|
||||
return fmt.Errorf("think must be a boolean or string (\"high\", \"medium\", \"low\", \"max\", true, or false)")
|
||||
}
|
||||
|
||||
// MarshalJSON implements json.Marshaler
|
||||
|
||||
@@ -495,6 +495,11 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
|
||||
input: `{ "think": "low" }`,
|
||||
expectedThinking: &ThinkValue{Value: "low"},
|
||||
},
|
||||
{
|
||||
name: "string_max",
|
||||
input: `{ "think": "max" }`,
|
||||
expectedThinking: &ThinkValue{Value: "max"},
|
||||
},
|
||||
{
|
||||
name: "invalid_string",
|
||||
input: `{ "think": "invalid" }`,
|
||||
|
||||
@@ -582,10 +582,10 @@ func RunHandler(cmd *cobra.Command, args []string) error {
|
||||
opts.Think = &api.ThinkValue{Value: true}
|
||||
case "false":
|
||||
opts.Think = &api.ThinkValue{Value: false}
|
||||
case "high", "medium", "low":
|
||||
case "high", "medium", "low", "max":
|
||||
opts.Think = &api.ThinkValue{Value: thinkStr}
|
||||
default:
|
||||
return fmt.Errorf("invalid value for --think: %q (must be true, false, high, medium, or low)", thinkStr)
|
||||
return fmt.Errorf("invalid value for --think: %q (must be true, false, high, medium, low, or max)", thinkStr)
|
||||
}
|
||||
} else {
|
||||
opts.Think = nil
|
||||
|
||||
@@ -588,7 +588,7 @@ func (c *launcherClient) launchManagedSingleIntegration(ctx context.Context, nam
|
||||
return nil
|
||||
}
|
||||
|
||||
if (current == "" || needsConfigure || req.ModelOverride != "" || target != current) && !savedMatchesModels(saved, []string{target}) {
|
||||
if needsConfigure || req.ModelOverride != "" || (current != "" && target != current) || !savedMatchesModels(saved, []string{target}) {
|
||||
if err := prepareManagedSingleIntegration(name, runner, managed, target); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -512,6 +512,65 @@ func TestLaunchIntegration_ManagedSingleIntegrationRewritesWhenSavedDiffers(t *t
|
||||
}
|
||||
}
|
||||
|
||||
func TestLaunchIntegration_ManagedSingleIntegrationRewritesWhenLiveConfigDrifts(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setLaunchTestHome(t, tmpDir)
|
||||
withInteractiveSession(t, true)
|
||||
withLauncherHooks(t)
|
||||
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch r.URL.Path {
|
||||
case "/api/tags":
|
||||
fmt.Fprint(w, `{"models":[{"name":"gemma4"},{"name":"qwen3:8b"}]}`)
|
||||
case "/api/show":
|
||||
fmt.Fprint(w, `{"model_info":{"general.context_length":131072}}`)
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
t.Setenv("OLLAMA_HOST", srv.URL)
|
||||
|
||||
if err := config.SaveIntegration("stubmanaged", []string{"gemma4"}); err != nil {
|
||||
t.Fatalf("failed to save managed integration config: %v", err)
|
||||
}
|
||||
|
||||
runner := &launcherManagedRunner{
|
||||
currentModel: "qwen3:8b",
|
||||
}
|
||||
withIntegrationOverride(t, "stubmanaged", runner)
|
||||
|
||||
DefaultSingleSelector = func(title string, items []ModelItem, current string) (string, error) {
|
||||
t.Fatal("selector should not be called when live config already provides the target")
|
||||
return "", nil
|
||||
}
|
||||
DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
if err := LaunchIntegration(context.Background(), IntegrationLaunchRequest{Name: "stubmanaged"}); err != nil {
|
||||
t.Fatalf("LaunchIntegration returned error: %v", err)
|
||||
}
|
||||
|
||||
if diff := compareStrings(runner.configured, []string{"qwen3:8b"}); diff != "" {
|
||||
t.Fatalf("expected Configure to reconcile stale saved config to live target: %s", diff)
|
||||
}
|
||||
if runner.refreshCalls != 1 {
|
||||
t.Fatalf("expected runtime refresh once after drift reconciliation, got %d", runner.refreshCalls)
|
||||
}
|
||||
if runner.ranModel != "qwen3:8b" {
|
||||
t.Fatalf("expected launch to run live configured model, got %q", runner.ranModel)
|
||||
}
|
||||
|
||||
saved, err := config.LoadIntegration("stubmanaged")
|
||||
if err != nil {
|
||||
t.Fatalf("failed to reload managed integration config: %v", err)
|
||||
}
|
||||
if diff := compareStrings(saved.Models, []string{"qwen3:8b"}); diff != "" {
|
||||
t.Fatalf("saved models mismatch after drift reconciliation: %s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLaunchIntegration_ManagedSingleIntegrationStopsWhenRuntimeRefreshFails(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setLaunchTestHome(t, tmpDir)
|
||||
|
||||
@@ -28,6 +28,8 @@ var openclawModelShowTimeout = 5 * time.Second
|
||||
// openclawFreshInstall is set to true when ensureOpenclawInstalled performs an install
|
||||
var openclawFreshInstall bool
|
||||
|
||||
var openclawCanInstallDaemon = canInstallDaemon
|
||||
|
||||
type Openclaw struct{}
|
||||
|
||||
func (c *Openclaw) String() string { return "OpenClaw" }
|
||||
@@ -58,6 +60,7 @@ func (c *Openclaw) Run(model string, args []string) error {
|
||||
// the newest wizard flags (e.g. --auth-choice ollama).
|
||||
if !openclawFreshInstall {
|
||||
update := exec.Command(bin, "update")
|
||||
update.Env = openclawInstallEnv()
|
||||
update.Stdout = os.Stdout
|
||||
update.Stderr = os.Stderr
|
||||
_ = update.Run() // best-effort; continue even if update fails
|
||||
@@ -73,19 +76,18 @@ func (c *Openclaw) Run(model string, args []string) error {
|
||||
"--auth-choice", "ollama",
|
||||
"--custom-base-url", envconfig.Host().String(),
|
||||
"--custom-model-id", model,
|
||||
// Launch owns the first real gateway startup immediately after onboarding,
|
||||
// so don't let OpenClaw fail the whole first-run flow on a transient
|
||||
// daemon health probe.
|
||||
"--skip-health",
|
||||
"--skip-channels",
|
||||
"--skip-skills",
|
||||
}
|
||||
if canInstallDaemon() {
|
||||
if openclawCanInstallDaemon() {
|
||||
onboardArgs = append(onboardArgs, "--install-daemon")
|
||||
} else {
|
||||
// When we can't install a daemon (e.g. no systemd, sudo dropped
|
||||
// XDG_RUNTIME_DIR, or container environment), skip the gateway
|
||||
// health check so non-interactive onboarding completes. The
|
||||
// gateway is started as a foreground child process after onboarding.
|
||||
onboardArgs = append(onboardArgs, "--skip-health")
|
||||
}
|
||||
cmd := exec.Command(bin, onboardArgs...)
|
||||
cmd.Env = openclawInstallEnv()
|
||||
cmd.Stdin = os.Stdin
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
@@ -101,6 +103,18 @@ func (c *Openclaw) Run(model string, args []string) error {
|
||||
// When extra args are passed through, run exactly what the user asked for
|
||||
// after setup and skip the built-in gateway+TUI convenience flow.
|
||||
if len(args) > 0 {
|
||||
cleanup := func() {}
|
||||
if shouldEnsureGatewayForArgs(args) {
|
||||
cleanupFn, _, _, err := c.ensureGatewayReady(bin)
|
||||
if err != nil {
|
||||
return windowsHint(err)
|
||||
}
|
||||
if cleanupFn != nil {
|
||||
cleanup = cleanupFn
|
||||
}
|
||||
}
|
||||
defer cleanup()
|
||||
|
||||
cmd := exec.Command(bin, args...)
|
||||
cmd.Env = openclawEnv()
|
||||
cmd.Stdin = os.Stdin
|
||||
@@ -121,41 +135,11 @@ func (c *Openclaw) Run(model string, args []string) error {
|
||||
|
||||
fmt.Fprintf(os.Stderr, "\n%sStarting your assistant — this may take a moment...%s\n\n", ansiGray, ansiReset)
|
||||
|
||||
token, port := c.gatewayInfo()
|
||||
addr := fmt.Sprintf("localhost:%d", port)
|
||||
|
||||
// If the gateway is already running (e.g. via the daemon), restart it
|
||||
// so it picks up any config changes (model, provider, etc.).
|
||||
if portOpen(addr) {
|
||||
restart := exec.Command(bin, "daemon", "restart")
|
||||
restart.Env = openclawEnv()
|
||||
if err := restart.Run(); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s Warning: daemon restart failed: %v%s\n", ansiYellow, err, ansiReset)
|
||||
}
|
||||
if !waitForPort(addr, 10*time.Second) {
|
||||
fmt.Fprintf(os.Stderr, "%s Warning: gateway did not come back after restart%s\n", ansiYellow, ansiReset)
|
||||
}
|
||||
}
|
||||
|
||||
// If the gateway isn't running, start it as a background child process.
|
||||
if !portOpen(addr) {
|
||||
gw := exec.Command(bin, "gateway", "run", "--force")
|
||||
gw.Env = openclawEnv()
|
||||
if err := gw.Start(); err != nil {
|
||||
return windowsHint(fmt.Errorf("failed to start gateway: %w", err))
|
||||
}
|
||||
defer func() {
|
||||
if gw.Process != nil {
|
||||
_ = gw.Process.Kill()
|
||||
_ = gw.Wait()
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
fmt.Fprintf(os.Stderr, "%sStarting gateway...%s\n", ansiGray, ansiReset)
|
||||
if !waitForPort(addr, 30*time.Second) {
|
||||
return windowsHint(fmt.Errorf("gateway did not start on %s", addr))
|
||||
cleanup, token, port, err := c.ensureGatewayReady(bin)
|
||||
if err != nil {
|
||||
return windowsHint(err)
|
||||
}
|
||||
defer cleanup()
|
||||
|
||||
printOpenclawReady(bin, token, port, firstLaunch)
|
||||
|
||||
@@ -175,6 +159,66 @@ func (c *Openclaw) Run(model string, args []string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func shouldEnsureGatewayForArgs(args []string) bool {
|
||||
return len(args) > 0 && args[0] == "tui"
|
||||
}
|
||||
|
||||
func (c *Openclaw) ensureGatewayReady(bin string) (func(), string, int, error) {
|
||||
token, port := c.gatewayInfo()
|
||||
addr := fmt.Sprintf("localhost:%d", port)
|
||||
|
||||
// If the gateway is already running (e.g. via the daemon), restart it
|
||||
// so it picks up any config changes (model, provider, etc.).
|
||||
if portOpen(addr) {
|
||||
restart := exec.Command(bin, "daemon", "restart")
|
||||
restart.Env = openclawEnv()
|
||||
if err := restart.Run(); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s Warning: daemon restart failed: %v%s\n", ansiYellow, err, ansiReset)
|
||||
}
|
||||
if !waitForPort(addr, 10*time.Second) {
|
||||
fmt.Fprintf(os.Stderr, "%s Warning: gateway did not come back after restart%s\n", ansiYellow, ansiReset)
|
||||
}
|
||||
}
|
||||
|
||||
// If the daemon is installed but not currently listening, try to bring it
|
||||
// up before falling back to a foreground child process.
|
||||
if openclawCanInstallDaemon() && !portOpen(addr) {
|
||||
start := exec.Command(bin, "daemon", "start")
|
||||
start.Env = openclawEnv()
|
||||
if err := start.Run(); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s Warning: daemon start failed: %v%s\n", ansiYellow, err, ansiReset)
|
||||
} else if waitForPort(addr, 10*time.Second) {
|
||||
fmt.Fprintf(os.Stderr, "%sStarting gateway...%s\n", ansiGray, ansiReset)
|
||||
return func() {}, token, port, nil
|
||||
}
|
||||
}
|
||||
|
||||
cleanup := func() {}
|
||||
|
||||
// If the gateway still isn't running, start it as a background child process.
|
||||
if !portOpen(addr) {
|
||||
gw := exec.Command(bin, "gateway", "run", "--force")
|
||||
gw.Env = openclawEnv()
|
||||
if err := gw.Start(); err != nil {
|
||||
return nil, "", 0, fmt.Errorf("failed to start gateway: %w", err)
|
||||
}
|
||||
cleanup = func() {
|
||||
if gw.Process != nil {
|
||||
_ = gw.Process.Kill()
|
||||
_ = gw.Wait()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Fprintf(os.Stderr, "%sStarting gateway...%s\n", ansiGray, ansiReset)
|
||||
if !waitForPort(addr, 30*time.Second) {
|
||||
cleanup()
|
||||
return nil, "", 0, fmt.Errorf("gateway did not start on %s", addr)
|
||||
}
|
||||
|
||||
return cleanup, token, port, nil
|
||||
}
|
||||
|
||||
// runChannelSetupPreflight prompts users to connect a messaging channel before
|
||||
// starting the built-in gateway+TUI flow. In interactive sessions, it loops
|
||||
// until a channel is configured, unless the user chooses "Set up later".
|
||||
@@ -335,9 +379,30 @@ func openclawEnv() []string {
|
||||
env = append(env, e)
|
||||
}
|
||||
}
|
||||
if _, ok := os.LookupEnv("OPENCLAW_PLUGIN_STAGE_DIR"); !ok {
|
||||
if dir := openclawPluginStageDir(); dir != "" {
|
||||
env = append(env, "OPENCLAW_PLUGIN_STAGE_DIR="+dir)
|
||||
}
|
||||
}
|
||||
return env
|
||||
}
|
||||
|
||||
func openclawInstallEnv() []string {
|
||||
env := openclawEnv()
|
||||
if _, ok := os.LookupEnv("OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"); !ok {
|
||||
env = append(env, "OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS=1")
|
||||
}
|
||||
return env
|
||||
}
|
||||
|
||||
func openclawPluginStageDir() string {
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return filepath.Join(home, ".openclaw", "plugin-runtime-deps")
|
||||
}
|
||||
|
||||
// portOpen checks if a TCP port is currently accepting connections.
|
||||
func portOpen(addr string) bool {
|
||||
conn, err := net.DialTimeout("tcp", addr, 500*time.Millisecond)
|
||||
@@ -561,6 +626,7 @@ func ensureOpenclawInstalled() (string, error) {
|
||||
|
||||
fmt.Fprintf(os.Stderr, "\nInstalling OpenClaw...\n")
|
||||
cmd := exec.Command("npm", "install", "-g", "openclaw@latest")
|
||||
cmd.Env = openclawInstallEnv()
|
||||
cmd.Stdin = os.Stdin
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
|
||||
@@ -251,6 +251,359 @@ func TestOpenclawRun_SetupLaterContinuesToGatewayAndTUI(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpenclawRun_FirstLaunchOnboardUsesLaunchManagedHealthFlow(t *testing.T) {
|
||||
if runtime.GOOS == "windows" {
|
||||
t.Skip("uses a POSIX shell test binary")
|
||||
}
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
t.Setenv("PATH", tmpDir)
|
||||
|
||||
bin := filepath.Join(tmpDir, "openclaw")
|
||||
script := fmt.Sprintf(`#!/bin/sh
|
||||
printf '%%s\n' "$*" >> "$HOME/invocations.log"
|
||||
if [ "$1" = "onboard" ]; then
|
||||
/usr/bin/env | /usr/bin/sort > "$HOME/onboard-env.log"
|
||||
/bin/mkdir -p "$HOME/.openclaw"
|
||||
/bin/cat > "$HOME/.openclaw/openclaw.json" <<'EOF'
|
||||
{"wizard":{"lastRunAt":"2026-01-01T00:00:00Z"},"gateway":{"port":18789,"mode":"local"}}
|
||||
EOF
|
||||
fi
|
||||
exit 0
|
||||
`)
|
||||
if err := os.WriteFile(bin, []byte(script), 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
oldConfirmPrompt := DefaultConfirmPrompt
|
||||
DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
|
||||
if prompt != "I understand the risks. Continue?" {
|
||||
t.Fatalf("unexpected prompt: %q", prompt)
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
defer func() { DefaultConfirmPrompt = oldConfirmPrompt }()
|
||||
|
||||
c := &Openclaw{}
|
||||
if err := c.Run("llama3.2", []string{"status"}); err != nil {
|
||||
t.Fatalf("Run() error = %v", err)
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(filepath.Join(tmpDir, "invocations.log"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
lines := strings.Split(strings.TrimSpace(string(data)), "\n")
|
||||
if len(lines) < 2 {
|
||||
t.Fatalf("expected onboard + passthrough invocations, got %v", lines)
|
||||
}
|
||||
onboardInvocation := ""
|
||||
for _, line := range lines {
|
||||
if strings.HasPrefix(line, "onboard ") {
|
||||
onboardInvocation = line
|
||||
break
|
||||
}
|
||||
}
|
||||
if onboardInvocation == "" {
|
||||
t.Fatalf("expected onboard invocation, got %v", lines)
|
||||
}
|
||||
if !strings.Contains(onboardInvocation, "--skip-health") {
|
||||
t.Fatalf("expected onboard invocation to include --skip-health, got %q", onboardInvocation)
|
||||
}
|
||||
|
||||
envData, err := os.ReadFile(filepath.Join(tmpDir, "onboard-env.log"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
env := envSliceToMap(strings.Split(strings.TrimSpace(string(envData)), "\n"))
|
||||
if env["OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"] != "1" {
|
||||
t.Fatalf("OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS = %q, want %q", env["OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"], "1")
|
||||
}
|
||||
if env["OPENCLAW_PLUGIN_STAGE_DIR"] != filepath.Join(tmpDir, ".openclaw", "plugin-runtime-deps") {
|
||||
t.Fatalf("OPENCLAW_PLUGIN_STAGE_DIR = %q, want %q", env["OPENCLAW_PLUGIN_STAGE_DIR"], filepath.Join(tmpDir, ".openclaw", "plugin-runtime-deps"))
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpenclawRun_FirstLaunchTUIArgsEnsureGatewayBeforePassthrough(t *testing.T) {
|
||||
if runtime.GOOS == "windows" {
|
||||
t.Skip("uses a POSIX shell test binary")
|
||||
}
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
t.Setenv("PATH", tmpDir)
|
||||
|
||||
ln, err := net.Listen("tcp", "127.0.0.1:0")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer ln.Close()
|
||||
port := ln.Addr().(*net.TCPAddr).Port
|
||||
|
||||
bin := filepath.Join(tmpDir, "openclaw")
|
||||
script := fmt.Sprintf(`#!/bin/sh
|
||||
printf '%%s\n' "$*" >> "$HOME/invocations.log"
|
||||
if [ "$1" = "onboard" ]; then
|
||||
/bin/mkdir -p "$HOME/.openclaw"
|
||||
/bin/cat > "$HOME/.openclaw/openclaw.json" <<'EOF'
|
||||
{"wizard":{"lastRunAt":"2026-01-01T00:00:00Z"},"gateway":{"port":%d,"mode":"local"}}
|
||||
EOF
|
||||
fi
|
||||
exit 0
|
||||
`, port)
|
||||
if err := os.WriteFile(bin, []byte(script), 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
oldConfirmPrompt := DefaultConfirmPrompt
|
||||
DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
|
||||
if prompt != "I understand the risks. Continue?" {
|
||||
t.Fatalf("unexpected prompt: %q", prompt)
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
defer func() { DefaultConfirmPrompt = oldConfirmPrompt }()
|
||||
|
||||
c := &Openclaw{}
|
||||
if err := c.Run("llama3.2", []string{"tui"}); err != nil {
|
||||
t.Fatalf("Run() error = %v", err)
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(filepath.Join(tmpDir, "invocations.log"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
lines := strings.Split(strings.TrimSpace(string(data)), "\n")
|
||||
if len(lines) < 3 {
|
||||
t.Fatalf("expected at least 3 invocations (update, onboard, daemon restart, tui), got %v", lines)
|
||||
}
|
||||
onboardIdx, daemonRestartIdx, tuiIdx := -1, -1, -1
|
||||
for i, line := range lines {
|
||||
if onboardIdx == -1 && strings.HasPrefix(line, "onboard ") {
|
||||
onboardIdx = i
|
||||
}
|
||||
if daemonRestartIdx == -1 && line == "daemon restart" {
|
||||
daemonRestartIdx = i
|
||||
}
|
||||
if tuiIdx == -1 && line == "tui" {
|
||||
tuiIdx = i
|
||||
}
|
||||
}
|
||||
if onboardIdx == -1 {
|
||||
t.Fatalf("expected an onboarding invocation, got %v", lines)
|
||||
}
|
||||
if daemonRestartIdx == -1 {
|
||||
t.Fatalf("expected a daemon restart before tui, got %v", lines)
|
||||
}
|
||||
if tuiIdx == -1 {
|
||||
t.Fatalf("expected a tui invocation, got %v", lines)
|
||||
}
|
||||
if !(onboardIdx < daemonRestartIdx && daemonRestartIdx < tuiIdx) {
|
||||
t.Fatalf("expected onboarding, then daemon restart, then tui; got %v", lines)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpenclawEnsureGatewayReady_UsesDaemonStartFallback(t *testing.T) {
|
||||
if runtime.GOOS == "windows" {
|
||||
t.Skip("uses a POSIX shell test binary")
|
||||
}
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
t.Setenv("PATH", tmpDir)
|
||||
|
||||
portProbe, err := net.Listen("tcp", "127.0.0.1:0")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
port := portProbe.Addr().(*net.TCPAddr).Port
|
||||
_ = portProbe.Close()
|
||||
|
||||
configDir := filepath.Join(tmpDir, ".openclaw")
|
||||
if err := os.MkdirAll(configDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(fmt.Sprintf(`{
|
||||
"wizard": {"lastRunAt": "2026-01-01T00:00:00Z"},
|
||||
"gateway": {"port": %d, "mode": "local"}
|
||||
}`, port)), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
bin := filepath.Join(tmpDir, "openclaw")
|
||||
if err := os.WriteFile(bin, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> \"$HOME/invocations.log\"\n"), 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
oldCanInstallDaemon := openclawCanInstallDaemon
|
||||
openclawCanInstallDaemon = func() bool { return true }
|
||||
defer func() { openclawCanInstallDaemon = oldCanInstallDaemon }()
|
||||
|
||||
triggeredBy := make(chan string, 1)
|
||||
listenerReady := make(chan net.Listener, 1)
|
||||
go func() {
|
||||
invocationsPath := filepath.Join(tmpDir, "invocations.log")
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
data, err := os.ReadFile(invocationsPath)
|
||||
if err == nil {
|
||||
lines := strings.Split(strings.TrimSpace(string(data)), "\n")
|
||||
for _, line := range lines {
|
||||
if line != "daemon start" && line != "gateway run --force" {
|
||||
continue
|
||||
}
|
||||
ln, err := net.Listen("tcp", fmt.Sprintf("127.0.0.1:%d", port))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
go func() {
|
||||
for {
|
||||
conn, err := ln.Accept()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
_ = conn.Close()
|
||||
}
|
||||
}()
|
||||
triggeredBy <- line
|
||||
listenerReady <- ln
|
||||
return
|
||||
}
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
}()
|
||||
|
||||
c := &Openclaw{}
|
||||
cleanup, _, gotPort, err := c.ensureGatewayReady(bin)
|
||||
if err != nil {
|
||||
t.Fatalf("ensureGatewayReady() error = %v", err)
|
||||
}
|
||||
defer cleanup()
|
||||
if gotPort != port {
|
||||
t.Fatalf("ensureGatewayReady() port = %d, want %d", gotPort, port)
|
||||
}
|
||||
|
||||
var ln net.Listener
|
||||
select {
|
||||
case which := <-triggeredBy:
|
||||
if which != "daemon start" {
|
||||
t.Fatalf("expected daemon start fallback, got %q", which)
|
||||
}
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("timed out waiting for gateway startup trigger")
|
||||
}
|
||||
select {
|
||||
case ln = <-listenerReady:
|
||||
defer ln.Close()
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("timed out waiting for test listener")
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(filepath.Join(tmpDir, "invocations.log"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
lines := strings.Split(strings.TrimSpace(string(data)), "\n")
|
||||
if len(lines) == 0 || lines[0] != "daemon start" {
|
||||
t.Fatalf("expected daemon start invocation, got %v", lines)
|
||||
}
|
||||
for _, line := range lines {
|
||||
if line == "gateway run --force" {
|
||||
t.Fatalf("did not expect gateway run fallback when daemon start succeeds, got %v", lines)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpenclawEnv_StagesBundledPluginRuntimeDeps(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
t.Setenv("OPENAI_API_KEY", "should-be-cleared")
|
||||
|
||||
env := envSliceToMap(openclawEnv())
|
||||
|
||||
if env["OPENCLAW_PLUGIN_STAGE_DIR"] != filepath.Join(tmpDir, ".openclaw", "plugin-runtime-deps") {
|
||||
t.Fatalf("OPENCLAW_PLUGIN_STAGE_DIR = %q, want %q", env["OPENCLAW_PLUGIN_STAGE_DIR"], filepath.Join(tmpDir, ".openclaw", "plugin-runtime-deps"))
|
||||
}
|
||||
if _, ok := env["OPENAI_API_KEY"]; ok {
|
||||
t.Fatal("expected OPENAI_API_KEY to be cleared from openclaw environment")
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpenclawInstallEnv_PreservesExplicitStageDirAndAddsEagerDeps(t *testing.T) {
|
||||
t.Setenv("OPENCLAW_PLUGIN_STAGE_DIR", "/tmp/custom-stage")
|
||||
|
||||
env := envSliceToMap(openclawInstallEnv())
|
||||
|
||||
if env["OPENCLAW_PLUGIN_STAGE_DIR"] != "/tmp/custom-stage" {
|
||||
t.Fatalf("OPENCLAW_PLUGIN_STAGE_DIR = %q, want %q", env["OPENCLAW_PLUGIN_STAGE_DIR"], "/tmp/custom-stage")
|
||||
}
|
||||
if env["OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"] != "1" {
|
||||
t.Fatalf("OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS = %q, want %q", env["OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"], "1")
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnsureOpenclawInstalled_UsesBundledPluginInstallEnv(t *testing.T) {
|
||||
if runtime.GOOS == "windows" {
|
||||
t.Skip("uses a POSIX shell test binary")
|
||||
}
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
t.Setenv("PATH", tmpDir)
|
||||
|
||||
writeScript := func(path, content string) {
|
||||
t.Helper()
|
||||
if err := os.WriteFile(path, []byte(content), 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
openclawPath := filepath.Join(tmpDir, "openclaw")
|
||||
npmScript := fmt.Sprintf(`#!/bin/sh
|
||||
/usr/bin/env | /usr/bin/sort > "$HOME/npm-env.log"
|
||||
/bin/cat > %q <<'EOF'
|
||||
#!/bin/sh
|
||||
exit 0
|
||||
EOF
|
||||
/bin/chmod +x %q
|
||||
exit 0
|
||||
`, openclawPath, openclawPath)
|
||||
writeScript(filepath.Join(tmpDir, "npm"), npmScript)
|
||||
writeScript(filepath.Join(tmpDir, "git"), "#!/bin/sh\nexit 0\n")
|
||||
|
||||
oldConfirmPrompt := DefaultConfirmPrompt
|
||||
DefaultConfirmPrompt = func(prompt string, options ConfirmOptions) (bool, error) {
|
||||
if prompt != "OpenClaw is not installed. Install with npm?" {
|
||||
t.Fatalf("unexpected prompt: %q", prompt)
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
defer func() { DefaultConfirmPrompt = oldConfirmPrompt }()
|
||||
|
||||
openclawFreshInstall = false
|
||||
bin, err := ensureOpenclawInstalled()
|
||||
if err != nil {
|
||||
t.Fatalf("ensureOpenclawInstalled() error = %v", err)
|
||||
}
|
||||
if bin != "openclaw" {
|
||||
t.Fatalf("ensureOpenclawInstalled() bin = %q, want %q", bin, "openclaw")
|
||||
}
|
||||
|
||||
envData, err := os.ReadFile(filepath.Join(tmpDir, "npm-env.log"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
env := envSliceToMap(strings.Split(strings.TrimSpace(string(envData)), "\n"))
|
||||
if env["OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"] != "1" {
|
||||
t.Fatalf("OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS = %q, want %q", env["OPENCLAW_EAGER_BUNDLED_PLUGIN_DEPS"], "1")
|
||||
}
|
||||
if env["OPENCLAW_PLUGIN_STAGE_DIR"] != filepath.Join(tmpDir, ".openclaw", "plugin-runtime-deps") {
|
||||
t.Fatalf("OPENCLAW_PLUGIN_STAGE_DIR = %q, want %q", env["OPENCLAW_PLUGIN_STAGE_DIR"], filepath.Join(tmpDir, ".openclaw", "plugin-runtime-deps"))
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpenclawEdit(t *testing.T) {
|
||||
c := &Openclaw{}
|
||||
tmpDir := t.TempDir()
|
||||
@@ -1227,6 +1580,18 @@ func TestOpenclawChannelsConfigured(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func envSliceToMap(entries []string) map[string]string {
|
||||
env := make(map[string]string, len(entries))
|
||||
for _, entry := range entries {
|
||||
key, value, ok := strings.Cut(entry, "=")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
env[key] = value
|
||||
}
|
||||
return env
|
||||
}
|
||||
|
||||
func TestOpenclawChannelSetupPreflight(t *testing.T) {
|
||||
if runtime.GOOS == "windows" {
|
||||
t.Skip("uses a POSIX shell test binary")
|
||||
|
||||
@@ -632,8 +632,8 @@ func FromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
|
||||
}
|
||||
|
||||
if effort != "" {
|
||||
if !slices.Contains([]string{"high", "medium", "low", "none"}, effort) {
|
||||
return nil, fmt.Errorf("invalid reasoning value: '%s' (must be \"high\", \"medium\", \"low\", or \"none\")", effort)
|
||||
if !slices.Contains([]string{"high", "medium", "low", "max", "none"}, effort) {
|
||||
return nil, fmt.Errorf("invalid reasoning value: '%s' (must be \"high\", \"medium\", \"low\", \"max\", or \"none\")", effort)
|
||||
}
|
||||
|
||||
if effort == "none" {
|
||||
|
||||
@@ -55,6 +55,57 @@ func TestFromChatRequest_Basic(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromChatRequest_ReasoningEffort(t *testing.T) {
|
||||
effort := func(s string) *string { return &s }
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
effort *string
|
||||
want any // expected ThinkValue.Value; nil means req.Think should be nil
|
||||
wantErr bool
|
||||
}{
|
||||
{name: "unset", effort: nil, want: nil},
|
||||
{name: "high", effort: effort("high"), want: "high"},
|
||||
{name: "medium", effort: effort("medium"), want: "medium"},
|
||||
{name: "low", effort: effort("low"), want: "low"},
|
||||
{name: "max", effort: effort("max"), want: "max"},
|
||||
{name: "none disables", effort: effort("none"), want: false},
|
||||
{name: "invalid", effort: effort("extreme"), wantErr: true},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
req := ChatCompletionRequest{
|
||||
Model: "test-model",
|
||||
Messages: []Message{{Role: "user", Content: "hi"}},
|
||||
ReasoningEffort: tc.effort,
|
||||
}
|
||||
result, err := FromChatRequest(req)
|
||||
if tc.wantErr {
|
||||
if err == nil {
|
||||
t.Fatalf("expected error for effort=%v, got none", *tc.effort)
|
||||
}
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if tc.want == nil {
|
||||
if result.Think != nil {
|
||||
t.Fatalf("expected nil Think, got %+v", result.Think)
|
||||
}
|
||||
return
|
||||
}
|
||||
if result.Think == nil {
|
||||
t.Fatalf("expected Think=%v, got nil", tc.want)
|
||||
}
|
||||
if result.Think.Value != tc.want {
|
||||
t.Fatalf("got Think.Value=%v, want %v", result.Think.Value, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromChatRequest_WithImage(t *testing.T) {
|
||||
imgData, _ := base64.StdEncoding.DecodeString(image)
|
||||
|
||||
|
||||
@@ -525,6 +525,18 @@ func FromResponsesRequest(r ResponsesRequest) (*api.ChatRequest, error) {
|
||||
options["num_predict"] = *r.MaxOutputTokens
|
||||
}
|
||||
|
||||
var think *api.ThinkValue
|
||||
if effort := r.Reasoning.Effort; effort != "" {
|
||||
switch effort {
|
||||
case "none":
|
||||
think = &api.ThinkValue{Value: false}
|
||||
case "low", "medium", "high", "max":
|
||||
think = &api.ThinkValue{Value: effort}
|
||||
default:
|
||||
return nil, fmt.Errorf("invalid reasoning value: %q (must be \"high\", \"medium\", \"low\", \"max\", or \"none\")", effort)
|
||||
}
|
||||
}
|
||||
|
||||
// Convert tools from Responses API format to api.Tool format
|
||||
var tools []api.Tool
|
||||
for _, t := range r.Tools {
|
||||
@@ -552,6 +564,7 @@ func FromResponsesRequest(r ResponsesRequest) (*api.ChatRequest, error) {
|
||||
Options: options,
|
||||
Tools: tools,
|
||||
Format: format,
|
||||
Think: think,
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -415,6 +415,86 @@ func TestFromResponsesRequest_Tools(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromResponsesRequest_ReasoningEffort(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
effort string
|
||||
wantThink any
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "unset",
|
||||
},
|
||||
{
|
||||
name: "low",
|
||||
effort: "low",
|
||||
wantThink: "low",
|
||||
},
|
||||
{
|
||||
name: "medium",
|
||||
effort: "medium",
|
||||
wantThink: "medium",
|
||||
},
|
||||
{
|
||||
name: "high",
|
||||
effort: "high",
|
||||
wantThink: "high",
|
||||
},
|
||||
{
|
||||
name: "max",
|
||||
effort: "max",
|
||||
wantThink: "max",
|
||||
},
|
||||
{
|
||||
name: "none",
|
||||
effort: "none",
|
||||
wantThink: false,
|
||||
},
|
||||
{
|
||||
name: "invalid",
|
||||
effort: "extreme",
|
||||
wantErr: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
req := ResponsesRequest{
|
||||
Model: "deepseek-v4-flash",
|
||||
Input: ResponsesInput{Text: "hi"},
|
||||
}
|
||||
if tt.effort != "" {
|
||||
req.Reasoning.Effort = tt.effort
|
||||
}
|
||||
|
||||
chatReq, err := FromResponsesRequest(req)
|
||||
if tt.wantErr {
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if tt.wantThink == nil {
|
||||
if chatReq.Think != nil {
|
||||
t.Fatalf("Think = %#v, want nil", chatReq.Think)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if chatReq.Think == nil {
|
||||
t.Fatalf("Think = nil, want %v", tt.wantThink)
|
||||
}
|
||||
if chatReq.Think.Value != tt.wantThink {
|
||||
t.Errorf("Think.Value = %v, want %v", chatReq.Think.Value, tt.wantThink)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromResponsesRequest_FunctionCallOutput(t *testing.T) {
|
||||
// Test a complete tool call round-trip:
|
||||
// 1. User message asking about weather
|
||||
|
||||
@@ -375,8 +375,16 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
}
|
||||
|
||||
var builtinParser parsers.Parser
|
||||
if shouldUseHarmony(m) && m.Config.Parser == "" {
|
||||
m.Config.Parser = "harmony"
|
||||
if shouldUseHarmony(m) {
|
||||
// harmony's Reasoning field only understands low/medium/high; map "max" to "high"
|
||||
if req.Think != nil {
|
||||
if s, ok := req.Think.Value.(string); ok && s == "max" {
|
||||
req.Think.Value = "high"
|
||||
}
|
||||
}
|
||||
if m.Config.Parser == "" {
|
||||
m.Config.Parser = "harmony"
|
||||
}
|
||||
}
|
||||
|
||||
if !req.Raw && m.Config.Parser != "" {
|
||||
@@ -2320,8 +2328,16 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
}
|
||||
msgs = filterThinkTags(msgs, m)
|
||||
|
||||
if shouldUseHarmony(m) && m.Config.Parser == "" {
|
||||
m.Config.Parser = "harmony"
|
||||
if shouldUseHarmony(m) {
|
||||
// harmony's Reasoning field only understands low/medium/high; map "max" to "high"
|
||||
if req.Think != nil {
|
||||
if s, ok := req.Think.Value.(string); ok && s == "max" {
|
||||
req.Think.Value = "high"
|
||||
}
|
||||
}
|
||||
if m.Config.Parser == "" {
|
||||
m.Config.Parser = "harmony"
|
||||
}
|
||||
}
|
||||
|
||||
var builtinParser parsers.Parser
|
||||
|
||||
@@ -84,7 +84,7 @@ func (bpe *BytePairEncoding) split(s string) iter.Seq[string] {
|
||||
var offset int
|
||||
for m, _ := re.FindRunesMatch(r); m != nil; m, _ = re.FindNextMatch(m) {
|
||||
if offset-m.Index != 0 {
|
||||
if !yield(string(r[:m.Index])) {
|
||||
if !yield(string(r[offset:m.Index])) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
@@ -545,6 +545,61 @@ func BenchmarkBytePairEncoding(b *testing.B) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBytePairEncodingSplitMultipleRegexpsPreservesOffsets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
bpe := NewBytePairEncoding(
|
||||
nil,
|
||||
`(?:\r?\n)+(?!\r?\n)`,
|
||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||
)
|
||||
|
||||
input := "One line\nTwo lines\n\nThree"
|
||||
got := slices.Collect(bpe.split(input))
|
||||
want := []string{"One", " line", "\n", "Two", " lines", "\n\n", "Three"}
|
||||
|
||||
if diff := cmp.Diff(want, got); diff != "" {
|
||||
t.Fatalf("split mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBytePairEncodingSplitRefactPreservesOffsets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
bpe := NewBytePairEncoding(
|
||||
nil,
|
||||
`\p{N}`,
|
||||
`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`,
|
||||
)
|
||||
|
||||
input := "One line\nTwo lines\n\nThree"
|
||||
got := slices.Collect(bpe.split(input))
|
||||
want := []string{"One", " line", "\n", "Two", " lines", "\n", "\n", "Three"}
|
||||
|
||||
if diff := cmp.Diff(want, got); diff != "" {
|
||||
t.Fatalf("split mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBytePairEncodingSplitDeepSeekV3PreservesOffsets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
bpe := NewBytePairEncoding(
|
||||
nil,
|
||||
"\\p{N}{1,3}",
|
||||
`[一-龥-ゟ゠-ヿ]+`,
|
||||
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
)
|
||||
|
||||
input := "One line\nTwo lines\n\nThree"
|
||||
got := slices.Collect(bpe.split(input))
|
||||
want := []string{"One", " line", "\n", "Two", " lines", "\n\n", "Three"}
|
||||
|
||||
if diff := cmp.Diff(want, got); diff != "" {
|
||||
t.Fatalf("split mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplit(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
|
||||
@@ -70,9 +70,13 @@ func loadAndQuantizeArray(r io.Reader, name, quantize string, arrays map[string]
|
||||
if info, ok := header[inputKey]; ok && info.Dtype == "F8_E4M3" {
|
||||
scaleKey := inputKey + ".scale_inv"
|
||||
scaleInv := st.Get(scaleKey)
|
||||
if scaleInv == nil {
|
||||
scaleKey = inputKey + ".scale"
|
||||
scaleInv = st.Get(scaleKey)
|
||||
}
|
||||
if scaleInv == nil {
|
||||
st.Free()
|
||||
return tmpPath, nil, nil, fmt.Errorf("missing companion tensor %q for fp8 source tensor %q", scaleKey, inputKey)
|
||||
return tmpPath, nil, nil, fmt.Errorf("missing companion tensor %q or %q for fp8 source tensor %q", inputKey+".scale_inv", inputKey+".scale", inputKey)
|
||||
}
|
||||
arr, err = decodeSourceFP8Tensor(arr, scaleInv)
|
||||
if err != nil {
|
||||
@@ -560,13 +564,13 @@ func safetensorsKey(preferred string, header map[string]safetensorsHeaderEntry)
|
||||
return keys[0], nil
|
||||
}
|
||||
|
||||
func decodeSourceFP8Tensor(weight, scaleInv *mlx.Array) (*mlx.Array, error) {
|
||||
if weight == nil || scaleInv == nil {
|
||||
func decodeSourceFP8Tensor(weight, scale *mlx.Array) (*mlx.Array, error) {
|
||||
if weight == nil || scale == nil {
|
||||
return nil, fmt.Errorf("fp8 weight and scale tensors are required")
|
||||
}
|
||||
|
||||
weightShape := weight.Dims()
|
||||
scaleShape := scaleInv.Dims()
|
||||
scaleShape := scale.Dims()
|
||||
if len(weightShape) != 2 || len(scaleShape) != 2 {
|
||||
return nil, fmt.Errorf("expected 2D fp8 weight and scale tensors, got %v and %v", weightShape, scaleShape)
|
||||
}
|
||||
@@ -596,7 +600,7 @@ func decodeSourceFP8Tensor(weight, scaleInv *mlx.Array) (*mlx.Array, error) {
|
||||
}
|
||||
|
||||
decoded = mlx.Reshape(decoded, int32(scaleShape[0]), int32(blockRows), int32(scaleShape[1]), int32(blockCols))
|
||||
decoded = mlx.Mul(decoded, mlx.ExpandDims(mlx.ExpandDims(scaleInv, 1), 3))
|
||||
decoded = mlx.Mul(decoded, mlx.ExpandDims(mlx.ExpandDims(scale, 1), 3))
|
||||
decoded = mlx.Reshape(decoded, int32(rows+padBottom), int32(cols+padSide))
|
||||
if padBottom > 0 || padSide > 0 {
|
||||
decoded = mlx.SliceStartStop(decoded, []int32{0, 0}, []int32{int32(rows), int32(cols)})
|
||||
|
||||
24
x/create/client/quantize_test.go
Normal file
24
x/create/client/quantize_test.go
Normal file
@@ -0,0 +1,24 @@
|
||||
package client
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/x/mlxrunner/mlx"
|
||||
)
|
||||
|
||||
func TestDecodeSourceFP8TensorAcceptsWeightScale(t *testing.T) {
|
||||
if err := mlx.CheckInit(); err != nil {
|
||||
t.Skipf("MLX unavailable: %v", err)
|
||||
}
|
||||
|
||||
weight := mlx.FromValues([]uint8{0, 1, 2, 3}, 2, 2)
|
||||
scale := mlx.FromValues([]float32{1}, 1, 1).AsType(mlx.DTypeBFloat16)
|
||||
got, err := decodeSourceFP8Tensor(weight, scale)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
mlx.Eval(got)
|
||||
if dims := got.Dims(); len(dims) != 2 || dims[0] != 2 || dims[1] != 2 {
|
||||
t.Fatalf("decoded dims = %v, want [2 2]", dims)
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,9 @@ import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
@@ -59,6 +61,43 @@ func TestIsTensorModelDir(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateScalarFloat32TensorData(t *testing.T) {
|
||||
td := st.NewTensorDataFromBytes("linear.weight_scale_2", "F32", []int32{}, encodeFloat32s(2))
|
||||
|
||||
got, err := validateScalarFloat32TensorData(td, "linear.weight.global_scale")
|
||||
if err != nil {
|
||||
t.Fatalf("validateScalarFloat32TensorData returned error: %v", err)
|
||||
}
|
||||
|
||||
if got.Name != "linear.weight.global_scale" {
|
||||
t.Fatalf("name = %q, want %q", got.Name, "linear.weight.global_scale")
|
||||
}
|
||||
if got.Dtype != "F32" {
|
||||
t.Fatalf("dtype = %q, want F32", got.Dtype)
|
||||
}
|
||||
if len(got.Shape) != 0 {
|
||||
t.Fatalf("shape = %v, want scalar", got.Shape)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateScalarFloat32TensorDataRejectsNonScalar(t *testing.T) {
|
||||
td := st.NewTensorDataFromBytes("linear.weight_scale_2", "F32", []int32{2}, encodeFloat32s(2, 4))
|
||||
|
||||
_, err := validateScalarFloat32TensorData(td, "linear.weight.global_scale")
|
||||
if err == nil || !strings.Contains(err.Error(), "expected scalar F32 tensor") {
|
||||
t.Fatalf("validateScalarFloat32TensorData error = %v, want scalar-shape failure", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInvertScalarFloat32TensorDataRejectsNonF32(t *testing.T) {
|
||||
td := st.NewTensorDataFromBytes("linear.weight_global_scale", "BF16", []int32{}, []byte{0, 0})
|
||||
|
||||
_, err := invertScalarFloat32TensorData(td, "linear.weight.global_scale")
|
||||
if err == nil || !strings.Contains(err.Error(), "expected F32 tensor") {
|
||||
t.Fatalf("invertScalarFloat32TensorData error = %v, want dtype failure", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsSafetensorsModelDir(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -246,6 +285,41 @@ func readSingleTensorRaw(t *testing.T, data []byte) []byte {
|
||||
return nil
|
||||
}
|
||||
|
||||
func encodeFloat32s(vals ...float32) []byte {
|
||||
raw := make([]byte, 4*len(vals))
|
||||
for i, v := range vals {
|
||||
binary.LittleEndian.PutUint32(raw[i*4:(i+1)*4], math.Float32bits(v))
|
||||
}
|
||||
return raw
|
||||
}
|
||||
|
||||
func readPackedTensorRaw(t *testing.T, data []byte, tensorName string) []byte {
|
||||
t.Helper()
|
||||
|
||||
var headerSize uint64
|
||||
if err := binary.Read(bytes.NewReader(data[:8]), binary.LittleEndian, &headerSize); err != nil {
|
||||
t.Fatalf("failed to read header size: %v", err)
|
||||
}
|
||||
|
||||
var header map[string]struct {
|
||||
Dtype string `json:"dtype"`
|
||||
Shape []int32 `json:"shape"`
|
||||
DataOffsets [2]int `json:"data_offsets"`
|
||||
}
|
||||
if err := json.Unmarshal(data[8:8+headerSize], &header); err != nil {
|
||||
t.Fatalf("failed to parse header: %v", err)
|
||||
}
|
||||
|
||||
info, ok := header[tensorName]
|
||||
if !ok {
|
||||
t.Fatalf("tensor %q not found in header", tensorName)
|
||||
}
|
||||
|
||||
start := 8 + int(headerSize) + info.DataOffsets[0]
|
||||
end := 8 + int(headerSize) + info.DataOffsets[1]
|
||||
return data[start:end]
|
||||
}
|
||||
|
||||
func readSafetensorsHeaderNames(t *testing.T, data []byte) []string {
|
||||
t.Helper()
|
||||
|
||||
@@ -612,10 +686,22 @@ func TestCreateSafetensorsModel_HFFP8AutoConvertsToMXFP8(t *testing.T) {
|
||||
|
||||
writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
|
||||
|
||||
if err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, func(string) {}); err != nil {
|
||||
var statusMessages []string
|
||||
progressFn := func(status string) {
|
||||
statusMessages = append(statusMessages, status)
|
||||
}
|
||||
|
||||
if err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, progressFn); err != nil {
|
||||
t.Fatalf("CreateSafetensorsModel failed: %v", err)
|
||||
}
|
||||
|
||||
if len(statusMessages) == 0 {
|
||||
t.Fatal("no status messages received")
|
||||
}
|
||||
if got, want := statusMessages[0], "importing model.safetensors (4 tensors, converting source E4M3 block-FP8 to MLX mxfp8)"; got != want {
|
||||
t.Fatalf("status = %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if got := quantizeByName["linear.weight"]; got != "mxfp8" {
|
||||
t.Fatalf("linear.weight quantization = %q, want %q", got, "mxfp8")
|
||||
}
|
||||
@@ -643,6 +729,166 @@ func TestCreateSafetensorsModel_HFFP8AutoConvertsToMXFP8(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateSafetensorsModel_CompressedTensorsFP8WeightScale(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
configJSON := `{
|
||||
"model_type": "test",
|
||||
"architectures": ["TestModel"],
|
||||
"compression_config": {
|
||||
"quant_method": "compressed-tensors",
|
||||
"format": "float-quantized",
|
||||
"config_groups": {
|
||||
"group_0": {
|
||||
"format": "float-quantized",
|
||||
"weights": {
|
||||
"type": "float",
|
||||
"num_bits": 8,
|
||||
"block_structure": [128, 128]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
|
||||
t.Fatalf("failed to write config.json: %v", err)
|
||||
}
|
||||
|
||||
createTestSafetensors(t, filepath.Join(dir, "model.safetensors"), []*st.TensorData{
|
||||
st.NewTensorDataFromBytes("linear.weight", "F8_E4M3", []int32{2, 2}, []byte{1, 2, 3, 4}),
|
||||
st.NewTensorDataFromBytes("linear.weight_scale", "BF16", []int32{1, 1}, make([]byte, 2)),
|
||||
st.NewTensorDataFromBytes("norm.weight", "BF16", []int32{2}, make([]byte, 4)),
|
||||
})
|
||||
|
||||
quantizeByName := make(map[string]string)
|
||||
headerNamesByName := make(map[string][]string)
|
||||
|
||||
createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
|
||||
if _, err := io.ReadAll(r); err != nil {
|
||||
return LayerInfo{}, err
|
||||
}
|
||||
return LayerInfo{Name: name, Digest: "sha256:" + name, MediaType: mediaType}, nil
|
||||
}
|
||||
createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
quantizeByName[name] = quantize
|
||||
headerNamesByName[name] = readSafetensorsHeaderNames(t, data)
|
||||
return []LayerInfo{{Name: name, Digest: "sha256:tensor_" + name, MediaType: "application/vnd.ollama.image.tensor"}}, nil
|
||||
}
|
||||
writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
|
||||
|
||||
var statusMessages []string
|
||||
progressFn := func(status string) {
|
||||
statusMessages = append(statusMessages, status)
|
||||
}
|
||||
|
||||
if err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, progressFn); err != nil {
|
||||
t.Fatalf("CreateSafetensorsModel failed: %v", err)
|
||||
}
|
||||
if len(statusMessages) == 0 {
|
||||
t.Fatal("no status messages received")
|
||||
}
|
||||
if got, want := statusMessages[0], "importing model.safetensors (3 tensors, converting source E4M3 block-FP8 to MLX mxfp8)"; got != want {
|
||||
t.Fatalf("status = %q, want %q", got, want)
|
||||
}
|
||||
if got := quantizeByName["linear.weight"]; got != "mxfp8" {
|
||||
t.Fatalf("linear.weight quantization = %q, want mxfp8", got)
|
||||
}
|
||||
if _, ok := quantizeByName["linear.weight_scale"]; ok {
|
||||
t.Fatal("linear.weight_scale should not be imported as a standalone tensor")
|
||||
}
|
||||
if got := headerNamesByName["linear.weight"]; !slices.Equal(got, []string{"linear.weight", "linear.weight.scale"}) {
|
||||
t.Fatalf("linear.weight blob tensors = %v, want %v", got, []string{"linear.weight", "linear.weight.scale"})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateSafetensorsModel_HFFP8SourceCanConvertToNVFP4(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
configJSON := `{
|
||||
"model_type": "test",
|
||||
"architectures": ["TestModel"],
|
||||
"quantization_config": {"quant_method": "fp8", "weight_block_size": [128, 128]}
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
|
||||
t.Fatalf("failed to write config.json: %v", err)
|
||||
}
|
||||
|
||||
createTestSafetensors(t, filepath.Join(dir, "model.safetensors"), []*st.TensorData{
|
||||
st.NewTensorDataFromBytes("linear.weight", "F8_E4M3", []int32{128, 128}, make([]byte, 128*128)),
|
||||
st.NewTensorDataFromBytes("linear.weight_scale_inv", "BF16", []int32{1, 1}, make([]byte, 2)),
|
||||
st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.down_proj.weight", "F8_E4M3", []int32{128, 128}, make([]byte, 128*128)),
|
||||
st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.down_proj.weight_scale_inv", "BF16", []int32{1, 1}, make([]byte, 2)),
|
||||
st.NewTensorDataFromBytes("model.layers.0.self_attn.q_proj.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
||||
st.NewTensorDataFromBytes("model.embed_tokens.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
||||
st.NewTensorDataFromBytes("lm_head.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
||||
st.NewTensorDataFromBytes("model.layers.0.mlp.gate.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
||||
st.NewTensorDataFromBytes("norm.weight", "BF16", []int32{128}, make([]byte, 256)),
|
||||
})
|
||||
|
||||
quantizeByName := make(map[string]string)
|
||||
headerNamesByName := make(map[string][]string)
|
||||
|
||||
createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
|
||||
if _, err := io.ReadAll(r); err != nil {
|
||||
return LayerInfo{}, err
|
||||
}
|
||||
return LayerInfo{Name: name, Digest: "sha256:" + name, MediaType: mediaType}, nil
|
||||
}
|
||||
createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
quantizeByName[name] = quantize
|
||||
headerNamesByName[name] = readSafetensorsHeaderNames(t, data)
|
||||
return []LayerInfo{{Name: name, Digest: "sha256:tensor_" + name, MediaType: "application/vnd.ollama.image.tensor"}}, nil
|
||||
}
|
||||
writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
|
||||
|
||||
var statusMessages []string
|
||||
progressFn := func(status string) {
|
||||
statusMessages = append(statusMessages, status)
|
||||
}
|
||||
|
||||
if err := CreateSafetensorsModel("test-model", dir, "nvfp4", createLayer, createTensorLayer, writeManifest, progressFn); err != nil {
|
||||
t.Fatalf("CreateSafetensorsModel failed: %v", err)
|
||||
}
|
||||
if len(statusMessages) == 0 {
|
||||
t.Fatal("no status messages received")
|
||||
}
|
||||
if got, want := statusMessages[0], "importing model.safetensors (9 tensors, converting source E4M3 block-FP8 to MLX nvfp4)"; got != want {
|
||||
t.Fatalf("status = %q, want %q", got, want)
|
||||
}
|
||||
if got := quantizeByName["linear.weight"]; got != "nvfp4" {
|
||||
t.Fatalf("linear.weight quantization = %q, want nvfp4", got)
|
||||
}
|
||||
if got := quantizeByName["model.layers.0.mlp.experts.0.down_proj.weight"]; got != "mxfp8" {
|
||||
t.Fatalf("source fp8 down_proj quantization = %q, want mxfp8", got)
|
||||
}
|
||||
for _, name := range []string{
|
||||
"model.layers.0.self_attn.q_proj.weight",
|
||||
"model.embed_tokens.weight",
|
||||
"lm_head.weight",
|
||||
} {
|
||||
if got := quantizeByName[name]; got != "mxfp8" {
|
||||
t.Fatalf("%s quantization = %q, want mxfp8", name, got)
|
||||
}
|
||||
}
|
||||
if got := quantizeByName["model.layers.0.mlp.gate.weight"]; got != "" {
|
||||
t.Fatalf("router gate quantization = %q, want empty", got)
|
||||
}
|
||||
if got := quantizeByName["norm.weight"]; got != "" {
|
||||
t.Fatalf("norm.weight quantization = %q, want empty", got)
|
||||
}
|
||||
if got := headerNamesByName["linear.weight"]; !slices.Equal(got, []string{"linear.weight", "linear.weight.scale_inv"}) {
|
||||
t.Fatalf("linear.weight blob tensors = %v, want %v", got, []string{"linear.weight", "linear.weight.scale_inv"})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateSafetensorsModel_RejectsRequantizingQuantizedSources(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -670,7 +916,20 @@ func TestCreateSafetensorsModel_RejectsRequantizingQuantizedSources(t *testing.T
|
||||
st.NewTensorDataFromBytes("linear.weight", "F8_E4M3", []int32{2, 2}, []byte{1, 2, 3, 4}),
|
||||
st.NewTensorDataFromBytes("linear.weight_scale_inv", "BF16", []int32{1, 1}, make([]byte, 2)),
|
||||
},
|
||||
wantErr: `cannot requantize already-quantized fp8 source model with --quantize "int4"`,
|
||||
wantErr: `cannot convert already-quantized fp8 source model with --quantize "int4"`,
|
||||
},
|
||||
{
|
||||
name: "packed nvfp4 source",
|
||||
configJSON: `{
|
||||
"model_type": "test",
|
||||
"architectures": ["TestModel"],
|
||||
"compression_config": {"format": "nvfp4-pack-quantized"}
|
||||
}`,
|
||||
tensors: []*st.TensorData{
|
||||
st.NewTensorDataFromBytes("linear.weight_packed", "U8", []int32{16, 8}, make([]byte, 128)),
|
||||
st.NewTensorDataFromBytes("linear.weight_scale", "F8_E4M3", []int32{16, 1}, make([]byte, 16)),
|
||||
},
|
||||
wantErr: `cannot requantize already-quantized source model with --quantize "int4"`,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -701,6 +960,317 @@ func TestCreateSafetensorsModel_RejectsRequantizingQuantizedSources(t *testing.T
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateSafetensorsModel_PackedNVFP4PreservesSourceLayout(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
configJSON := `{
|
||||
"model_type": "test",
|
||||
"architectures": ["TestModel"],
|
||||
"compression_config": {"format": "nvfp4-pack-quantized"}
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
|
||||
t.Fatalf("failed to write config.json: %v", err)
|
||||
}
|
||||
|
||||
createTestSafetensors(t, filepath.Join(dir, "model.safetensors"), []*st.TensorData{
|
||||
st.NewTensorDataFromBytes("linear.weight_packed", "U8", []int32{16, 8}, make([]byte, 128)),
|
||||
st.NewTensorDataFromBytes("linear.weight_scale", "F8_E4M3", []int32{16, 1}, make([]byte, 16)),
|
||||
st.NewTensorDataFromBytes("linear.weight_global_scale", "F32", []int32{}, encodeFloat32s(4)),
|
||||
st.NewTensorDataFromBytes("linear.input_global_scale", "F32", []int32{}, encodeFloat32s(8)),
|
||||
st.NewTensorDataFromBytes("norm.weight", "BF16", []int32{16}, make([]byte, 32)),
|
||||
})
|
||||
|
||||
var statusMessages []string
|
||||
layerHeaders := make(map[string]map[string]json.RawMessage)
|
||||
layerData := make(map[string][]byte)
|
||||
var tensorLayerNames []string
|
||||
|
||||
createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return LayerInfo{}, err
|
||||
}
|
||||
if mediaType == "application/vnd.ollama.image.tensor" {
|
||||
if len(data) < 8 {
|
||||
return LayerInfo{}, io.ErrUnexpectedEOF
|
||||
}
|
||||
var headerSize uint64
|
||||
if err := binary.Read(bytes.NewReader(data[:8]), binary.LittleEndian, &headerSize); err != nil {
|
||||
return LayerInfo{}, err
|
||||
}
|
||||
var header map[string]json.RawMessage
|
||||
if err := json.Unmarshal(data[8:8+headerSize], &header); err != nil {
|
||||
return LayerInfo{}, err
|
||||
}
|
||||
layerHeaders[name] = header
|
||||
layerData[name] = data
|
||||
}
|
||||
return LayerInfo{Name: name, Digest: "sha256:" + name, MediaType: mediaType}, nil
|
||||
}
|
||||
createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
|
||||
if _, err := io.ReadAll(r); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tensorLayerNames = append(tensorLayerNames, name)
|
||||
return []LayerInfo{{Name: name, Digest: "sha256:tensor_" + name, MediaType: "application/vnd.ollama.image.tensor"}}, nil
|
||||
}
|
||||
writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
|
||||
progressFn := func(status string) { statusMessages = append(statusMessages, status) }
|
||||
|
||||
if err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, progressFn); err != nil {
|
||||
t.Fatalf("CreateSafetensorsModel failed: %v", err)
|
||||
}
|
||||
|
||||
if len(statusMessages) == 0 {
|
||||
t.Fatal("no status messages received")
|
||||
}
|
||||
if got, want := statusMessages[0], "importing model.safetensors (5 tensors, preserving source quantization)"; got != want {
|
||||
t.Fatalf("status = %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if slices.Contains(tensorLayerNames, "linear.weight_scale") || slices.Contains(tensorLayerNames, "linear.weight_global_scale") || slices.Contains(tensorLayerNames, "linear.input_global_scale") {
|
||||
t.Fatalf("packed nvfp4 companions unexpectedly emitted as standalone tensor layers: %v", tensorLayerNames)
|
||||
}
|
||||
|
||||
packedHeader := layerHeaders["linear.weight"]
|
||||
if packedHeader == nil {
|
||||
t.Fatalf("missing packed layer header for linear.weight")
|
||||
}
|
||||
for _, key := range []string{
|
||||
"linear.weight",
|
||||
"linear.weight.scale",
|
||||
"linear.weight.global_scale",
|
||||
} {
|
||||
if _, ok := packedHeader[key]; !ok {
|
||||
t.Fatalf("packed header missing %s: %v", key, packedHeader)
|
||||
}
|
||||
}
|
||||
if _, ok := packedHeader["linear.weight.input_global_scale"]; ok {
|
||||
t.Fatalf("packed header unexpectedly includes input_global_scale: %v", packedHeader)
|
||||
}
|
||||
globalRaw := readPackedTensorRaw(t, layerData["linear.weight"], "linear.weight.global_scale")
|
||||
if got := math.Float32frombits(binary.LittleEndian.Uint32(globalRaw)); got != 0.25 {
|
||||
t.Fatalf("linear.weight.global_scale = %v, want 0.25", got)
|
||||
}
|
||||
|
||||
var metadata map[string]string
|
||||
if metaRaw, ok := packedHeader["__metadata__"]; ok {
|
||||
if err := json.Unmarshal(metaRaw, &metadata); err != nil {
|
||||
t.Fatalf("failed to parse metadata: %v", err)
|
||||
}
|
||||
}
|
||||
if metadata["quant_type"] != "nvfp4" {
|
||||
t.Fatalf("quant_type = %q, want %q", metadata["quant_type"], "nvfp4")
|
||||
}
|
||||
if metadata["group_size"] != "16" {
|
||||
t.Fatalf("group_size = %q, want %q", metadata["group_size"], "16")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateSafetensorsModel_PackedNVFP4CrossShardCompanions(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
configJSON := `{
|
||||
"model_type": "test",
|
||||
"architectures": ["TestModel"],
|
||||
"compression_config": {"format": "nvfp4-pack-quantized"}
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
|
||||
t.Fatalf("failed to write config.json: %v", err)
|
||||
}
|
||||
|
||||
createTestSafetensors(t, filepath.Join(dir, "model-00001-of-00002.safetensors"), []*st.TensorData{
|
||||
st.NewTensorDataFromBytes("linear.weight_packed", "U8", []int32{16, 8}, make([]byte, 128)),
|
||||
st.NewTensorDataFromBytes("norm.weight", "BF16", []int32{16}, make([]byte, 32)),
|
||||
})
|
||||
createTestSafetensors(t, filepath.Join(dir, "model-00002-of-00002.safetensors"), []*st.TensorData{
|
||||
st.NewTensorDataFromBytes("linear.weight_scale", "F8_E4M3", []int32{16, 1}, make([]byte, 16)),
|
||||
st.NewTensorDataFromBytes("linear.weight_global_scale", "F32", []int32{}, encodeFloat32s(2)),
|
||||
st.NewTensorDataFromBytes("linear.input_global_scale", "F32", []int32{}, encodeFloat32s(8)),
|
||||
})
|
||||
indexJSON := `{
|
||||
"metadata": {"total_size": 152},
|
||||
"weight_map": {
|
||||
"linear.weight_packed": "model-00001-of-00002.safetensors",
|
||||
"norm.weight": "model-00001-of-00002.safetensors",
|
||||
"linear.weight_scale": "model-00002-of-00002.safetensors",
|
||||
"linear.weight_global_scale": "model-00002-of-00002.safetensors",
|
||||
"linear.input_global_scale": "model-00002-of-00002.safetensors"
|
||||
}
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(dir, "model.safetensors.index.json"), []byte(indexJSON), 0o644); err != nil {
|
||||
t.Fatalf("failed to write index: %v", err)
|
||||
}
|
||||
|
||||
layerHeaders := make(map[string]map[string]json.RawMessage)
|
||||
var tensorLayerNames []string
|
||||
|
||||
createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return LayerInfo{}, err
|
||||
}
|
||||
if mediaType == "application/vnd.ollama.image.tensor" {
|
||||
var headerSize uint64
|
||||
if err := binary.Read(bytes.NewReader(data[:8]), binary.LittleEndian, &headerSize); err != nil {
|
||||
return LayerInfo{}, err
|
||||
}
|
||||
var header map[string]json.RawMessage
|
||||
if err := json.Unmarshal(data[8:8+headerSize], &header); err != nil {
|
||||
return LayerInfo{}, err
|
||||
}
|
||||
layerHeaders[name] = header
|
||||
}
|
||||
return LayerInfo{Name: name, Digest: "sha256:" + name, MediaType: mediaType}, nil
|
||||
}
|
||||
createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
|
||||
if _, err := io.ReadAll(r); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tensorLayerNames = append(tensorLayerNames, name)
|
||||
return []LayerInfo{{Name: name, Digest: "sha256:tensor_" + name, MediaType: "application/vnd.ollama.image.tensor"}}, nil
|
||||
}
|
||||
writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
|
||||
|
||||
packedCreator := func(groupName string, tensors []PackedTensorInput) (LayerInfo, error) {
|
||||
return LayerInfo{}, fmt.Errorf("unexpected packedCreator call for %s", groupName)
|
||||
}
|
||||
if err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, func(string) {}, packedCreator); err != nil {
|
||||
t.Fatalf("CreateSafetensorsModel failed: %v", err)
|
||||
}
|
||||
|
||||
if slices.Contains(tensorLayerNames, "linear.weight_packed") || slices.Contains(tensorLayerNames, "linear.weight_scale") || slices.Contains(tensorLayerNames, "linear.weight_global_scale") || slices.Contains(tensorLayerNames, "linear.input_global_scale") {
|
||||
t.Fatalf("packed nvfp4 tensors unexpectedly emitted as standalone tensor layers: %v", tensorLayerNames)
|
||||
}
|
||||
|
||||
packedHeader := layerHeaders["linear.weight"]
|
||||
if packedHeader == nil {
|
||||
t.Fatalf("missing packed layer header for linear.weight")
|
||||
}
|
||||
for _, key := range []string{
|
||||
"linear.weight",
|
||||
"linear.weight.scale",
|
||||
"linear.weight.global_scale",
|
||||
} {
|
||||
if _, ok := packedHeader[key]; !ok {
|
||||
t.Fatalf("packed header missing %s: %v", key, packedHeader)
|
||||
}
|
||||
}
|
||||
if _, ok := packedHeader["linear.weight.input_global_scale"]; ok {
|
||||
t.Fatalf("packed header unexpectedly includes input_global_scale: %v", packedHeader)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateSafetensorsModel_PackedNVFP4StacksExperts(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
configJSON := `{
|
||||
"model_type": "test",
|
||||
"architectures": ["TestModel"],
|
||||
"compression_config": {"format": "nvfp4-pack-quantized"}
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
|
||||
t.Fatalf("failed to write config.json: %v", err)
|
||||
}
|
||||
|
||||
createTestSafetensors(t, filepath.Join(dir, "model.safetensors"), []*st.TensorData{
|
||||
st.NewTensorDataFromBytes("model.layers.1.mlp.experts.0.gate_proj.weight_packed", "U8", []int32{2, 8}, make([]byte, 16)),
|
||||
st.NewTensorDataFromBytes("model.layers.1.mlp.experts.0.gate_proj.weight_scale", "F8_E4M3", []int32{2, 1}, make([]byte, 2)),
|
||||
st.NewTensorDataFromBytes("model.layers.1.mlp.experts.0.gate_proj.weight_global_scale", "F32", []int32{1}, encodeFloat32s(2)),
|
||||
st.NewTensorDataFromBytes("model.layers.1.mlp.experts.0.gate_proj.input_global_scale", "F32", []int32{1}, encodeFloat32s(32)),
|
||||
st.NewTensorDataFromBytes("model.layers.1.mlp.experts.1.gate_proj.weight_packed", "U8", []int32{2, 8}, make([]byte, 16)),
|
||||
st.NewTensorDataFromBytes("model.layers.1.mlp.experts.1.gate_proj.weight_scale", "F8_E4M3", []int32{2, 1}, make([]byte, 2)),
|
||||
st.NewTensorDataFromBytes("model.layers.1.mlp.experts.1.gate_proj.weight_global_scale", "F32", []int32{1}, encodeFloat32s(4)),
|
||||
st.NewTensorDataFromBytes("model.layers.1.mlp.experts.1.gate_proj.input_global_scale", "F32", []int32{1}, encodeFloat32s(64)),
|
||||
st.NewTensorDataFromBytes("norm.weight", "BF16", []int32{2}, make([]byte, 4)),
|
||||
})
|
||||
|
||||
layerHeaders := make(map[string]map[string]json.RawMessage)
|
||||
layerData := make(map[string][]byte)
|
||||
createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return LayerInfo{}, err
|
||||
}
|
||||
if mediaType == "application/vnd.ollama.image.tensor" {
|
||||
var headerSize uint64
|
||||
if err := binary.Read(bytes.NewReader(data[:8]), binary.LittleEndian, &headerSize); err != nil {
|
||||
return LayerInfo{}, err
|
||||
}
|
||||
var header map[string]json.RawMessage
|
||||
if err := json.Unmarshal(data[8:8+headerSize], &header); err != nil {
|
||||
return LayerInfo{}, err
|
||||
}
|
||||
layerHeaders[name] = header
|
||||
layerData[name] = data
|
||||
}
|
||||
return LayerInfo{Name: name, Digest: "sha256:" + name, MediaType: mediaType}, nil
|
||||
}
|
||||
createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
|
||||
if _, err := io.ReadAll(r); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return []LayerInfo{{Name: name, Digest: "sha256:tensor_" + name, MediaType: "application/vnd.ollama.image.tensor"}}, nil
|
||||
}
|
||||
writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
|
||||
packedCreator := func(groupName string, tensors []PackedTensorInput) (LayerInfo, error) {
|
||||
return LayerInfo{}, fmt.Errorf("unexpected packedCreator call for %s", groupName)
|
||||
}
|
||||
|
||||
if err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, func(string) {}, packedCreator); err != nil {
|
||||
t.Fatalf("CreateSafetensorsModel failed: %v", err)
|
||||
}
|
||||
|
||||
header := layerHeaders["model.layers.1.mlp.experts"]
|
||||
if header == nil {
|
||||
t.Fatalf("missing packed expert layer header")
|
||||
}
|
||||
for _, key := range []string{
|
||||
"model.layers.1.mlp.switch_mlp.gate_proj.weight",
|
||||
"model.layers.1.mlp.switch_mlp.gate_proj.weight.scale",
|
||||
"model.layers.1.mlp.switch_mlp.gate_proj.weight.global_scale",
|
||||
} {
|
||||
if _, ok := header[key]; !ok {
|
||||
t.Fatalf("stacked header missing %s: %v", key, header)
|
||||
}
|
||||
}
|
||||
if _, ok := header["model.layers.1.mlp.switch_mlp.gate_proj.weight.input_global_scale"]; ok {
|
||||
t.Fatalf("stacked header unexpectedly includes input_global_scale: %v", header)
|
||||
}
|
||||
if _, ok := header["model.layers.1.mlp.experts.0.gate_proj.weight"]; ok {
|
||||
t.Fatalf("unexpected per-expert tensor left in packed header: %v", header)
|
||||
}
|
||||
|
||||
var weightInfo struct {
|
||||
Dtype string `json:"dtype"`
|
||||
Shape []int32 `json:"shape"`
|
||||
}
|
||||
if err := json.Unmarshal(header["model.layers.1.mlp.switch_mlp.gate_proj.weight"], &weightInfo); err != nil {
|
||||
t.Fatalf("failed to unmarshal stacked weight info: %v", err)
|
||||
}
|
||||
if weightInfo.Dtype != "U32" || !slices.Equal(weightInfo.Shape, []int32{2, 2, 2}) {
|
||||
t.Fatalf("stacked weight = dtype %s shape %v, want U32 [2 2 2]", weightInfo.Dtype, weightInfo.Shape)
|
||||
}
|
||||
|
||||
var globalInfo struct {
|
||||
Dtype string `json:"dtype"`
|
||||
Shape []int32 `json:"shape"`
|
||||
}
|
||||
if err := json.Unmarshal(header["model.layers.1.mlp.switch_mlp.gate_proj.weight.global_scale"], &globalInfo); err != nil {
|
||||
t.Fatalf("failed to unmarshal stacked global scale info: %v", err)
|
||||
}
|
||||
if globalInfo.Dtype != "F32" || !slices.Equal(globalInfo.Shape, []int32{2, 1, 1}) {
|
||||
t.Fatalf("stacked global scale = dtype %s shape %v, want F32 [2 1 1]", globalInfo.Dtype, globalInfo.Shape)
|
||||
}
|
||||
globalRaw := readPackedTensorRaw(t, layerData["model.layers.1.mlp.experts"], "model.layers.1.mlp.switch_mlp.gate_proj.weight.global_scale")
|
||||
if got0 := math.Float32frombits(binary.LittleEndian.Uint32(globalRaw[0:4])); got0 != 0.5 {
|
||||
t.Fatalf("stacked global scale[0] = %v, want 0.5", got0)
|
||||
}
|
||||
if got1 := math.Float32frombits(binary.LittleEndian.Uint32(globalRaw[4:8])); got1 != 0.25 {
|
||||
t.Fatalf("stacked global scale[1] = %v, want 0.25", got1)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateSafetensorsModel_HFFP8PacksExperts(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
@@ -777,6 +1347,26 @@ func TestCreateSafetensorsModel_HFFP8PacksExperts(t *testing.T) {
|
||||
t.Fatalf("expected mxfp8 quantize for %s, got %q", tensor.Name, tensor.Quantize)
|
||||
}
|
||||
}
|
||||
|
||||
packedLayerNames = nil
|
||||
packedLayerTensors = nil
|
||||
if err := CreateSafetensorsModel("test-model", dir, "nvfp4", createLayer, createTensorLayer, writeManifest, func(string) {}, createPackedLayer); err != nil {
|
||||
t.Fatalf("CreateSafetensorsModel nvfp4 failed: %v", err)
|
||||
}
|
||||
|
||||
if len(packedLayerNames) != 1 {
|
||||
t.Fatalf("expected 1 packed layer for nvfp4, got %d: %v", len(packedLayerNames), packedLayerNames)
|
||||
}
|
||||
|
||||
for _, tensor := range packedLayerTensors[0] {
|
||||
want := "nvfp4"
|
||||
if strings.Contains(tensor.Name, "down_proj") {
|
||||
want = "mxfp8"
|
||||
}
|
||||
if tensor.Quantize != want {
|
||||
t.Fatalf("nvfp4 packed tensor %s quantize = %q, want %q", tensor.Name, tensor.Quantize, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateSafetensorsModel_Qwen35Transforms(t *testing.T) {
|
||||
|
||||
@@ -19,6 +19,10 @@ func DTypeSize(dtype string) (int, error) {
|
||||
return 4, nil
|
||||
case "F64":
|
||||
return 8, nil
|
||||
case "U8", "I8":
|
||||
return 1, nil
|
||||
case "F8_E4M3", "F8_E5M2", "F8_E4M3FN", "F8_E5M2FNUZ":
|
||||
return 1, nil
|
||||
default:
|
||||
return 0, fmt.Errorf("unsupported dtype %q", dtype)
|
||||
}
|
||||
|
||||
@@ -64,6 +64,8 @@ func dtypeFromString(s string) mlx.Dtype {
|
||||
return mlx.DtypeInt64
|
||||
case "U8", "UINT8":
|
||||
return mlx.DtypeUint8
|
||||
case "F8_E4M3", "F8_E5M2", "F8_E4M3FN", "F8_E5M2FNUZ":
|
||||
return mlx.DtypeUint8 // FP8 types stored as raw uint8 bytes
|
||||
default:
|
||||
return mlx.DtypeFloat32
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"fmt"
|
||||
"iter"
|
||||
"runtime"
|
||||
"sort"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
@@ -121,10 +122,17 @@ func SaveSafetensorsWithMetadata(path string, arrays map[string]*Array, metadata
|
||||
cArrays := C.mlx_map_string_to_array_new()
|
||||
defer C.mlx_map_string_to_array_free(cArrays)
|
||||
|
||||
arrayNames := make([]string, 0, len(arrays))
|
||||
for name, arr := range arrays {
|
||||
if arr == nil {
|
||||
continue
|
||||
}
|
||||
arrayNames = append(arrayNames, name)
|
||||
}
|
||||
sort.Strings(arrayNames)
|
||||
|
||||
for _, name := range arrayNames {
|
||||
arr := arrays[name]
|
||||
cName := C.CString(name)
|
||||
C.mlx_map_string_to_array_insert(cArrays, cName, arr.ctx)
|
||||
C.free(unsafe.Pointer(cName))
|
||||
@@ -133,7 +141,14 @@ func SaveSafetensorsWithMetadata(path string, arrays map[string]*Array, metadata
|
||||
cMetadata := C.mlx_map_string_to_string_new()
|
||||
defer C.mlx_map_string_to_string_free(cMetadata)
|
||||
|
||||
for key, value := range metadata {
|
||||
metadataKeys := make([]string, 0, len(metadata))
|
||||
for key := range metadata {
|
||||
metadataKeys = append(metadataKeys, key)
|
||||
}
|
||||
sort.Strings(metadataKeys)
|
||||
|
||||
for _, key := range metadataKeys {
|
||||
value := metadata[key]
|
||||
cKey := C.CString(key)
|
||||
cValue := C.CString(value)
|
||||
C.mlx_map_string_to_string_insert(cMetadata, cKey, cValue)
|
||||
|
||||
@@ -72,6 +72,10 @@ func (t *Array) AsStrided(shape []int, strides []int, offset int) *Array {
|
||||
}
|
||||
|
||||
func (t *Array) Concatenate(axis int, others ...*Array) *Array {
|
||||
if len(others) == 0 {
|
||||
return t.Clone()
|
||||
}
|
||||
|
||||
vector := C.mlx_vector_array_new()
|
||||
defer C.mlx_vector_array_free(vector)
|
||||
|
||||
@@ -127,9 +131,9 @@ func (t *Array) GatherMM(other, lhs, rhs *Array, sorted bool) *Array {
|
||||
return out
|
||||
}
|
||||
|
||||
func (t *Array) Logsumexp(keepDims bool) *Array {
|
||||
out := New("LOGSUMEXP")
|
||||
C.mlx_logsumexp(&out.ctx, t.ctx, C.bool(keepDims), DefaultStream().ctx)
|
||||
func (t *Array) LogsumexpAxis(axis int, keepDims bool) *Array {
|
||||
out := New("LOGSUMEXP_AXIS")
|
||||
C.mlx_logsumexp_axis(&out.ctx, t.ctx, C.int(axis), C.bool(keepDims), DefaultStream().ctx)
|
||||
return out
|
||||
}
|
||||
|
||||
|
||||
@@ -376,6 +376,9 @@ func Concatenate(arrays []*Array, axis int) *Array {
|
||||
if len(arrays) == 0 {
|
||||
return nil
|
||||
}
|
||||
if len(arrays) == 1 {
|
||||
return arrays[0].Clone()
|
||||
}
|
||||
return arrays[0].Concatenate(axis, arrays[1:]...)
|
||||
}
|
||||
|
||||
|
||||
@@ -74,14 +74,23 @@ func MakeLinearLayer(
|
||||
scales,
|
||||
)
|
||||
|
||||
// Check for per-tensor global scale (NVIDIA double-scale nvfp4).
|
||||
// NVIDIA ModelOpt stores this as "weight_scale_2"; our import
|
||||
// pipeline maps it to "weight.global_scale".
|
||||
globalScale := tensors[path+".weight.global_scale"]
|
||||
if globalScale == nil {
|
||||
globalScale = tensors[path+".weight_scale_2"]
|
||||
}
|
||||
|
||||
return &nn.QuantizedLinear{
|
||||
Weight: w,
|
||||
Scales: scales,
|
||||
QBiases: qbiases,
|
||||
Bias: bias,
|
||||
GroupSize: groupSize,
|
||||
Bits: bits,
|
||||
Mode: mode,
|
||||
Weight: w,
|
||||
Scales: scales,
|
||||
QBiases: qbiases,
|
||||
Bias: bias,
|
||||
GlobalScale: globalScale,
|
||||
GroupSize: groupSize,
|
||||
Bits: bits,
|
||||
Mode: mode,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -49,14 +49,15 @@ func (r *Runner) Prepare(request *Request) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// The runner serializes requests today so we just use a fixed slot ID.
|
||||
const pipelineSlot = 0
|
||||
|
||||
func (r *Runner) TextGenerationPipeline(ctx context.Context, request Request) error {
|
||||
mlx.ResetPeakMemory()
|
||||
var sample, nextSample sampler.Result
|
||||
|
||||
defer func() {
|
||||
if request.Sampler != nil {
|
||||
request.Sampler.Free()
|
||||
}
|
||||
r.Sampler.Remove(pipelineSlot)
|
||||
mlx.Unpin(sample.Arrays()...)
|
||||
mlx.Unpin(nextSample.Arrays()...)
|
||||
mlx.Sweep()
|
||||
@@ -70,7 +71,6 @@ func (r *Runner) TextGenerationPipeline(ctx context.Context, request Request) er
|
||||
}()
|
||||
|
||||
inputs := request.Tokens
|
||||
request.Sampler.ResetHistory(inputs)
|
||||
|
||||
session := r.cache.begin(r.Model, inputs)
|
||||
defer session.close()
|
||||
@@ -122,7 +122,7 @@ func (r *Runner) TextGenerationPipeline(ctx context.Context, request Request) er
|
||||
}
|
||||
}
|
||||
|
||||
r.Model.Forward(mlx.FromValues(tokens[processed:processed+n], n).ExpandDims(0), caches)
|
||||
r.Model.Forward(mlx.FromValues(tokens[processed:processed+n], 1, n), caches)
|
||||
mlx.Sweep()
|
||||
materializeCaches()
|
||||
processed += n
|
||||
@@ -139,21 +139,28 @@ func (r *Runner) TextGenerationPipeline(ctx context.Context, request Request) er
|
||||
mlx.ClearCache()
|
||||
}
|
||||
|
||||
// Register the sampler after prefill completes.
|
||||
r.Sampler.Add(pipelineSlot, request.SamplerOpts, inputs)
|
||||
|
||||
step := func(token *mlx.Array) sampler.Result {
|
||||
fwd := r.Model.Forward(token.ExpandDims(0), caches)
|
||||
fwd := r.Model.Forward(token, caches)
|
||||
logits := r.Model.Unembed(fwd)
|
||||
logits = logits.Slice(mlx.Slice(), mlx.Slice(logits.Dim(1)-1), mlx.Slice()).Squeeze(1)
|
||||
|
||||
sample := request.Sampler.Sample(logits)
|
||||
sample := r.Sampler.Sample([]int{pipelineSlot}, logits)
|
||||
mlx.Pin(sample.Arrays()...)
|
||||
mlx.Sweep()
|
||||
mlx.AsyncEval(sample.Arrays()...)
|
||||
return sample
|
||||
}
|
||||
|
||||
sample = step(mlx.FromValues(tokens[processed:], total-processed))
|
||||
sample = step(mlx.FromValues(tokens[processed:], 1, total-processed))
|
||||
|
||||
dec := decoder{tokenizer: r.Tokenizer}
|
||||
dec := decoder{
|
||||
tokenizer: r.Tokenizer,
|
||||
wantLogprobs: request.SamplerOpts.Logprobs,
|
||||
wantTopLogprobs: request.SamplerOpts.TopLogprobs,
|
||||
}
|
||||
|
||||
final := CompletionResponse{Done: true, PromptEvalCount: len(inputs), EvalCount: request.Options.NumPredict, DoneReason: 1}
|
||||
for i := range request.Options.NumPredict {
|
||||
@@ -161,8 +168,7 @@ func (r *Runner) TextGenerationPipeline(ctx context.Context, request Request) er
|
||||
return err
|
||||
}
|
||||
|
||||
request.Sampler.AppendToken(sample.Token)
|
||||
nextSample = step(sample.Token)
|
||||
nextSample = step(sample.Token.ExpandDims(-1))
|
||||
|
||||
if i == 0 {
|
||||
mlx.Eval(sample.Arrays()...)
|
||||
@@ -209,15 +215,17 @@ func (r *Runner) TextGenerationPipeline(ctx context.Context, request Request) er
|
||||
// with those bytes so Content and Logprobs stay aligned when a chunk does
|
||||
// flush.
|
||||
type decoder struct {
|
||||
tokenizer *tokenizer.Tokenizer
|
||||
buf bytes.Buffer
|
||||
logprobs []llm.Logprob
|
||||
tokenizer *tokenizer.Tokenizer
|
||||
buf bytes.Buffer
|
||||
logprobs []llm.Logprob
|
||||
wantLogprobs bool
|
||||
wantTopLogprobs int
|
||||
}
|
||||
|
||||
func (d *decoder) decode(res sampler.Result) (CompletionResponse, bool) {
|
||||
output := int32(res.Token.Int())
|
||||
d.buf.WriteString(d.tokenizer.Decode([]int32{output}))
|
||||
d.logprobs = append(d.logprobs, buildLogprob(res, d.tokenizer.Decode)...)
|
||||
d.logprobs = append(d.logprobs, buildLogprob(res, d.wantLogprobs, d.wantTopLogprobs, d.tokenizer.Decode)...)
|
||||
|
||||
content := flushValidUTF8Prefix(&d.buf)
|
||||
if content == "" {
|
||||
@@ -228,8 +236,13 @@ func (d *decoder) decode(res sampler.Result) (CompletionResponse, bool) {
|
||||
return resp, true
|
||||
}
|
||||
|
||||
func buildLogprob(sample sampler.Result, decode func([]int32) string) []llm.Logprob {
|
||||
if sample.Logprob == nil {
|
||||
// buildLogprob converts the sampler's logprob tensors into the wire-format
|
||||
// llm.Logprob entries the caller wants. The sampler populates its logprob
|
||||
// tensors whenever any registered slot requested them, so the caller must
|
||||
// gate emission on its own request config (wantLogprobs / wantTopLogprobs)
|
||||
// rather than on whether the tensors happen to be non-nil.
|
||||
func buildLogprob(sample sampler.Result, wantLogprobs bool, wantTopLogprobs int, decode func([]int32) string) []llm.Logprob {
|
||||
if !wantLogprobs || sample.Logprob == nil {
|
||||
return nil
|
||||
}
|
||||
tok := func(id int32) string { return decode([]int32{id}) }
|
||||
@@ -241,7 +254,7 @@ func buildLogprob(sample sampler.Result, decode func([]int32) string) []llm.Logp
|
||||
},
|
||||
}
|
||||
|
||||
if sample.TopTokens != nil {
|
||||
if wantTopLogprobs > 0 && sample.TopTokens != nil {
|
||||
ids := sample.TopTokens.Ints()
|
||||
vals := sample.TopLogprobs.Floats()
|
||||
pairs := make([]llm.TokenLogprob, len(ids))
|
||||
@@ -251,9 +264,14 @@ func buildLogprob(sample sampler.Result, decode func([]int32) string) []llm.Logp
|
||||
Logprob: float64(vals[i]),
|
||||
}
|
||||
}
|
||||
// The sampler emits the top maxK across registered slots via
|
||||
// Argpartition, which leaves entries unsorted.
|
||||
sort.Slice(pairs, func(i, j int) bool {
|
||||
return pairs[i].Logprob > pairs[j].Logprob
|
||||
})
|
||||
if wantTopLogprobs < len(pairs) {
|
||||
pairs = pairs[:wantTopLogprobs]
|
||||
}
|
||||
out.TopLogprobs = pairs
|
||||
}
|
||||
return []llm.Logprob{out}
|
||||
|
||||
@@ -27,15 +27,16 @@ type Request struct {
|
||||
Responses chan CompletionResponse
|
||||
Pipeline func(context.Context, Request) error
|
||||
|
||||
Ctx context.Context //nolint:containedctx
|
||||
Tokens []int32
|
||||
Sampler *sample.Sampler
|
||||
Ctx context.Context //nolint:containedctx
|
||||
Tokens []int32
|
||||
SamplerOpts sample.Options
|
||||
}
|
||||
|
||||
type Runner struct {
|
||||
Model base.Model
|
||||
Tokenizer *tokenizer.Tokenizer
|
||||
Requests chan Request
|
||||
Sampler *sample.Sampler
|
||||
cache kvCache
|
||||
contextLength int
|
||||
}
|
||||
@@ -67,6 +68,7 @@ func (r *Runner) Load(modelName string) error {
|
||||
r.Model = m
|
||||
r.Tokenizer = m.Tokenizer()
|
||||
r.contextLength = m.MaxContextLength()
|
||||
r.Sampler = sample.New(r.contextLength)
|
||||
|
||||
mlx.EnableCompile()
|
||||
return nil
|
||||
|
||||
@@ -24,14 +24,15 @@ type logprobEntry struct {
|
||||
func runSampleLogprobs(t *testing.T, logits []float32, topK int) (int, float64, []logprobEntry) {
|
||||
t.Helper()
|
||||
|
||||
s := New(Options{Logprobs: true, TopLogprobs: topK})
|
||||
s := New(128)
|
||||
defer func() {
|
||||
s.Free()
|
||||
mlx.Sweep()
|
||||
}()
|
||||
s.Add(0, Options{Logprobs: true, TopLogprobs: topK}, nil)
|
||||
|
||||
tensor := mlx.FromValues(logits, 1, len(logits))
|
||||
res := s.Sample(tensor)
|
||||
res := s.Sample([]int{0}, tensor)
|
||||
|
||||
mlx.Pin(res.Arrays()...)
|
||||
defer mlx.Unpin(res.Arrays()...)
|
||||
@@ -55,6 +56,8 @@ func runSampleLogprobs(t *testing.T, logits []float32, topK int) (int, float64,
|
||||
}
|
||||
|
||||
func TestSampleLogprobsBasic(t *testing.T) {
|
||||
skipIfNoMLX(t)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
logits []float32
|
||||
@@ -92,6 +95,8 @@ func TestSampleLogprobsBasic(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSampleLogprobsNumericalStability(t *testing.T) {
|
||||
skipIfNoMLX(t)
|
||||
|
||||
logits := []float32{1000.0, 999.0, 998.0}
|
||||
_, selLP, top := runSampleLogprobs(t, logits, 3)
|
||||
|
||||
@@ -111,6 +116,8 @@ func TestSampleLogprobsNumericalStability(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSampleLogprobsProbabilityCorrectness(t *testing.T) {
|
||||
skipIfNoMLX(t)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
logits []float32
|
||||
@@ -167,6 +174,8 @@ func TestSampleLogprobsProbabilityCorrectness(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSampleLogprobsSoftmaxCorrectness(t *testing.T) {
|
||||
skipIfNoMLX(t)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
logits []float32
|
||||
@@ -202,6 +211,8 @@ func TestSampleLogprobsSoftmaxCorrectness(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSampleLogprobsSelectedTokenCorrectness(t *testing.T) {
|
||||
skipIfNoMLX(t)
|
||||
|
||||
logits := []float32{3.0, 1.0, 2.0, 0.5}
|
||||
|
||||
maxIdx := 0
|
||||
@@ -225,7 +236,47 @@ func TestSampleLogprobsSelectedTokenCorrectness(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestBatchedLogprobsPerRow verifies that per-row logprobs in a batched
|
||||
// sample call match the per-slot reference. The numerically-stable softmax
|
||||
// must reduce along the last axis only, not over the whole batch.
|
||||
func TestBatchedLogprobsPerRow(t *testing.T) {
|
||||
skipIfNoMLX(t)
|
||||
|
||||
rowA := []float32{2, 1, 0}
|
||||
rowB := []float32{0, 5, 0}
|
||||
|
||||
_, wantA, _ := runSampleLogprobs(t, rowA, 0)
|
||||
_, wantB, _ := runSampleLogprobs(t, rowB, 0)
|
||||
|
||||
s := New(128)
|
||||
t.Cleanup(func() {
|
||||
s.Free()
|
||||
mlx.Sweep()
|
||||
})
|
||||
s.Add(1, Options{Logprobs: true}, nil)
|
||||
s.Add(2, Options{Logprobs: true}, nil)
|
||||
|
||||
logits := mlx.FromValues(append(append([]float32{}, rowA...), rowB...), 2, 3)
|
||||
res := s.Sample([]int{1, 2}, logits)
|
||||
mlx.Pin(res.Arrays()...)
|
||||
t.Cleanup(func() { mlx.Unpin(res.Arrays()...) })
|
||||
mlx.Eval(res.Arrays()...)
|
||||
|
||||
got := res.Logprob.Floats()
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("Logprob length = %d, want 2", len(got))
|
||||
}
|
||||
if math.Abs(float64(got[0])-wantA) > 1e-5 {
|
||||
t.Errorf("row 0 logprob = %f, want %f (per-slot reference)", got[0], wantA)
|
||||
}
|
||||
if math.Abs(float64(got[1])-wantB) > 1e-5 {
|
||||
t.Errorf("row 1 logprob = %f, want %f (per-slot reference)", got[1], wantB)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSampleLogprobsTopKOrdering(t *testing.T) {
|
||||
skipIfNoMLX(t)
|
||||
|
||||
// Logits chosen so argmax order differs from index order.
|
||||
logits := []float32{2.0, 5.0, 1.0, 4.0, 3.0}
|
||||
wantOrder := []int{1, 3, 4, 0, 2}
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
package sample
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/x/mlxrunner/mlx"
|
||||
)
|
||||
|
||||
type Transform func(*Sampler, *mlx.Array) *mlx.Array
|
||||
|
||||
type Options struct {
|
||||
Temperature float32
|
||||
TopP float32
|
||||
@@ -24,21 +24,15 @@ type Options struct {
|
||||
TopLogprobs int
|
||||
}
|
||||
|
||||
type Sampler struct {
|
||||
Options
|
||||
|
||||
history *mlx.Array
|
||||
historyLen int
|
||||
transforms []Transform
|
||||
}
|
||||
|
||||
// Result bundles the outputs of one decode step. The logprob tensors are
|
||||
// populated only when the sampler is configured to report them.
|
||||
// Result bundles the outputs of one decode step. Logprob/TopTokens/
|
||||
// TopLogprobs are populated whenever any registered slot has Logprobs
|
||||
// (respectively TopLogprobs>0). Consumers need to filter by their
|
||||
// per-slot Options.
|
||||
type Result struct {
|
||||
Token *mlx.Array // sampled token id, shape [B]
|
||||
Logprob *mlx.Array // sampled-token logprob, shape [B,1]; nil unless Logprobs
|
||||
TopTokens *mlx.Array // top-K token ids, shape [B,K]; nil unless TopLogprobs > 0
|
||||
TopLogprobs *mlx.Array // top-K logprobs, shape [B,K]; nil unless TopLogprobs > 0
|
||||
Token *mlx.Array // sampled token ids, shape [B]
|
||||
Logprob *mlx.Array // sampled-token logprobs, shape [B,1]; nil unless any registered slot has Logprobs
|
||||
TopTokens *mlx.Array // top-K token ids, shape [B,maxK]; nil unless any registered slot has TopLogprobs>0
|
||||
TopLogprobs *mlx.Array // top-K logprobs, shape [B,maxK]; same
|
||||
}
|
||||
|
||||
// Arrays returns the tensor fields as a slice so callers can drive the mlx
|
||||
@@ -48,121 +42,300 @@ func (r Result) Arrays() []*mlx.Array {
|
||||
return []*mlx.Array{r.Token, r.Logprob, r.TopTokens, r.TopLogprobs}
|
||||
}
|
||||
|
||||
func New(opts Options) *Sampler {
|
||||
if opts.RepeatPenalty <= 0 {
|
||||
opts.RepeatPenalty = 1
|
||||
// Sampler is a batched, slot-based sampler. Sequences are registered with
|
||||
// Add and released with Remove. Each Sample call takes a subset of
|
||||
// registered slots (in any order) with their [B,V] logits, samples one
|
||||
// token per row, and appends it to that slot's ring-buffer history. Slots
|
||||
// not named in a given call are untouched.
|
||||
type Sampler struct {
|
||||
slots []*slotState
|
||||
byID map[int]*slotState
|
||||
|
||||
// history is the pooled ring-buffer storage, [B, W] int32. Row i
|
||||
// belongs to slots[i]; W is max(RepeatLastN) across penalty slots.
|
||||
// Allocated on the first penalty slot, rebuilt only in Add/Remove.
|
||||
history *mlx.Array
|
||||
|
||||
// allSameOpts: every registered slot shares Options. When true the
|
||||
// canonical shared value is s.slots[0].opts.
|
||||
allSameOpts bool
|
||||
|
||||
// anyLogprobs / maxTopLogprobs: compute-for-all output config.
|
||||
// Sample populates Logprob (and Top* when maxTopLogprobs>0) whenever
|
||||
// any registered slot requests them, even if that slot isn't in the
|
||||
// current call.
|
||||
anyLogprobs bool
|
||||
maxTopLogprobs int
|
||||
|
||||
// numCtx is the runner's context window; normalize uses it to
|
||||
// resolve the repeat_last_n == -1 sentinel.
|
||||
numCtx int
|
||||
}
|
||||
|
||||
type slotState struct {
|
||||
opts Options
|
||||
transforms []transform
|
||||
historyLen int
|
||||
}
|
||||
|
||||
type slotCtx struct {
|
||||
opts Options
|
||||
history *mlx.Array // 2D [B, W] when penalties are configured; nil otherwise
|
||||
}
|
||||
|
||||
type transform func(*slotCtx, *mlx.Array) *mlx.Array
|
||||
|
||||
// New constructs an empty sampler with no registered slots. numCtx is
|
||||
// the runner's context window and must be positive.
|
||||
func New(numCtx int) *Sampler {
|
||||
return &Sampler{
|
||||
byID: make(map[int]*slotState),
|
||||
allSameOpts: true,
|
||||
numCtx: numCtx,
|
||||
}
|
||||
}
|
||||
|
||||
// historyWidth returns the column count of the pooled history tensor,
|
||||
// or 0 when no penalty slot has forced it to be allocated.
|
||||
func (s *Sampler) historyWidth() int {
|
||||
if s.history == nil {
|
||||
return 0
|
||||
}
|
||||
return s.history.Dim(1)
|
||||
}
|
||||
|
||||
func (o Options) usesHistory() bool {
|
||||
// RepeatLastN == 0 disables the penalty ring per the repeat_last_n API
|
||||
// contract (0 = disabled), overriding any penalty coefficients.
|
||||
if o.RepeatLastN == 0 {
|
||||
return false
|
||||
}
|
||||
return o.RepeatPenalty != 1 || o.PresencePenalty != 0 || o.FrequencyPenalty != 0
|
||||
}
|
||||
|
||||
func (o Options) normalize(numCtx int) Options {
|
||||
if o.RepeatPenalty <= 0 {
|
||||
o.RepeatPenalty = 1
|
||||
}
|
||||
// Resolve the repeat_last_n == -1 sentinel ("-1 = num_ctx") against
|
||||
// the caller's context window.
|
||||
if o.RepeatLastN < 0 {
|
||||
o.RepeatLastN = numCtx
|
||||
}
|
||||
if !o.usesHistory() {
|
||||
// Zero the ring capacity so slots that differ only in a spurious
|
||||
// RepeatLastN still batch together and don't inflate pool width.
|
||||
o.RepeatLastN = 0
|
||||
}
|
||||
return o
|
||||
}
|
||||
|
||||
func (o Options) buildTransforms() []transform {
|
||||
var ts []transform
|
||||
if o.usesHistory() {
|
||||
ts = append(ts, penalty)
|
||||
}
|
||||
|
||||
s := &Sampler{Options: opts}
|
||||
|
||||
var transforms []Transform
|
||||
if s.usesHistory() {
|
||||
transforms = append(transforms, penalty)
|
||||
}
|
||||
|
||||
hasTopP := opts.TopP > 0 && opts.TopP < 1
|
||||
hasTopK := opts.TopK > 0
|
||||
hasTopP := o.TopP > 0 && o.TopP < 1
|
||||
hasTopK := o.TopK > 0
|
||||
switch {
|
||||
case hasTopP:
|
||||
// topKTopP always does a full descending sort for the top-P
|
||||
// cumulative mask and opportunistically masks top-K during the
|
||||
// same pass when it is also configured.
|
||||
transforms = append(transforms, topKTopP)
|
||||
ts = append(ts, topKTopP)
|
||||
case hasTopK:
|
||||
// Argpartition (partial sort) is cheaper than a full sort.
|
||||
transforms = append(transforms, topK)
|
||||
ts = append(ts, topK)
|
||||
}
|
||||
|
||||
if opts.MinP != 0 {
|
||||
transforms = append(transforms, minP)
|
||||
if o.MinP != 0 {
|
||||
ts = append(ts, minP)
|
||||
}
|
||||
|
||||
if opts.Temperature == 0 {
|
||||
transforms = append(transforms, greedy)
|
||||
if o.Temperature == 0 {
|
||||
ts = append(ts, greedy)
|
||||
} else {
|
||||
transforms = append(transforms, temperature)
|
||||
ts = append(ts, temperature)
|
||||
}
|
||||
|
||||
s.transforms = transforms
|
||||
return s
|
||||
return ts
|
||||
}
|
||||
|
||||
func (s *Sampler) usesHistory() bool {
|
||||
return s.RepeatPenalty != 1 || s.PresencePenalty != 0 || s.FrequencyPenalty != 0
|
||||
}
|
||||
|
||||
func (s *Sampler) setHistory(history *mlx.Array, historyLen int) {
|
||||
if history != nil {
|
||||
mlx.Pin(history)
|
||||
// Add registers a sequence under seqID. The last RepeatLastN entries of
|
||||
// priorTokens seed the ring buffer.
|
||||
func (s *Sampler) Add(seqID int, opts Options, priorTokens []int32) {
|
||||
if _, dup := s.byID[seqID]; dup {
|
||||
panic(fmt.Sprintf("sample.Sampler.Add: seqID %d already registered", seqID))
|
||||
}
|
||||
if s.history != nil {
|
||||
|
||||
opts = opts.normalize(s.numCtx)
|
||||
slot := &slotState{
|
||||
opts: opts,
|
||||
transforms: opts.buildTransforms(),
|
||||
}
|
||||
|
||||
// Grow the pool to hold this slot's row. The pool is lazy — the first
|
||||
// penalty slot allocates it — and thereafter every registered slot
|
||||
// gets a row (rows for non-penalty slots are zero and never read).
|
||||
// Invariant: s.history is pinned whenever non-nil.
|
||||
if s.history != nil || opts.usesHistory() {
|
||||
targetWidth := max(opts.RepeatLastN, s.historyWidth())
|
||||
newRow := makeHistoryRow(priorTokens, opts.RepeatLastN, targetWidth)
|
||||
|
||||
var pool *mlx.Array
|
||||
switch {
|
||||
case s.history == nil && len(s.slots) == 0:
|
||||
pool = newRow
|
||||
case s.history == nil:
|
||||
// First penalty slot with non-penalty slots already registered;
|
||||
// seed zero rows so s.slots and pool row indices stay aligned.
|
||||
zeros := mlx.Zeros(mlx.DTypeInt32, len(s.slots), targetWidth)
|
||||
pool = zeros.Concatenate(0, newRow)
|
||||
case targetWidth > s.historyWidth():
|
||||
pad := mlx.Zeros(mlx.DTypeInt32, s.history.Dim(0), targetWidth-s.historyWidth())
|
||||
pool = s.history.Concatenate(1, pad).Concatenate(0, newRow)
|
||||
default:
|
||||
pool = s.history.Concatenate(0, newRow)
|
||||
}
|
||||
|
||||
mlx.Pin(pool)
|
||||
mlx.Unpin(s.history)
|
||||
s.history = pool
|
||||
|
||||
if opts.usesHistory() {
|
||||
// Cap on seed so the next write's ring position
|
||||
// (historyLen % RepeatLastN) lands at 0, overwriting the
|
||||
// oldest entry when the ring was filled from priors.
|
||||
slot.historyLen = min(len(priorTokens), opts.RepeatLastN)
|
||||
}
|
||||
}
|
||||
s.history = history
|
||||
s.historyLen = historyLen
|
||||
|
||||
s.slots = append(s.slots, slot)
|
||||
s.byID[seqID] = slot
|
||||
s.recomputeInvariants()
|
||||
}
|
||||
|
||||
func (s *Sampler) ResetHistory(history []int32) {
|
||||
if !s.usesHistory() {
|
||||
// makeHistoryRow builds a [1, width] int32 row with the last repeatLastN
|
||||
// entries of priorTokens packed into [0, min(len, repeatLastN)), zeros
|
||||
// elsewhere.
|
||||
func makeHistoryRow(priorTokens []int32, repeatLastN, width int) *mlx.Array {
|
||||
take := min(len(priorTokens), repeatLastN)
|
||||
if take <= 0 {
|
||||
return mlx.Zeros(mlx.DTypeInt32, 1, width)
|
||||
}
|
||||
row := make([]int32, width)
|
||||
copy(row, priorTokens[len(priorTokens)-take:])
|
||||
return mlx.NewArrayInt32(row, []int32{1, int32(width)})
|
||||
}
|
||||
|
||||
// recomputeInvariants refreshes allSameOpts and anyLogprobs/maxTopLogprobs
|
||||
// from s.slots. Called at the end of Add and Remove.
|
||||
func (s *Sampler) recomputeInvariants() {
|
||||
if len(s.slots) == 0 {
|
||||
s.allSameOpts = true
|
||||
s.anyLogprobs = false
|
||||
s.maxTopLogprobs = 0
|
||||
return
|
||||
}
|
||||
if s.RepeatLastN > 0 && len(history) > s.RepeatLastN {
|
||||
history = history[len(history)-s.RepeatLastN:]
|
||||
first := s.slots[0].opts
|
||||
s.allSameOpts = true
|
||||
s.anyLogprobs = false
|
||||
s.maxTopLogprobs = 0
|
||||
for _, slot := range s.slots {
|
||||
if slot.opts != first {
|
||||
s.allSameOpts = false
|
||||
}
|
||||
if slot.opts.Logprobs {
|
||||
s.anyLogprobs = true
|
||||
if slot.opts.TopLogprobs > s.maxTopLogprobs {
|
||||
s.maxTopLogprobs = slot.opts.TopLogprobs
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(history) == 0 {
|
||||
s.setHistory(nil, 0)
|
||||
}
|
||||
|
||||
// Remove releases the slot. The pool tensor is rebuilt to drop the row.
|
||||
func (s *Sampler) Remove(seqID int) {
|
||||
slot, ok := s.byID[seqID]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
delete(s.byID, seqID)
|
||||
|
||||
row := slices.Index(s.slots, slot)
|
||||
s.slots = slices.Delete(s.slots, row, row+1)
|
||||
s.recomputeInvariants()
|
||||
|
||||
if s.history == nil {
|
||||
return
|
||||
}
|
||||
|
||||
tokens := append([]int32(nil), history...)
|
||||
s.setHistory(mlx.NewArrayInt32(tokens, []int32{int32(len(tokens))}), len(tokens))
|
||||
}
|
||||
|
||||
func (s *Sampler) AppendToken(token *mlx.Array) {
|
||||
if !s.usesHistory() || token == nil {
|
||||
return
|
||||
}
|
||||
|
||||
next := token.AsType(mlx.DTypeInt32)
|
||||
nextLen := next.Size()
|
||||
|
||||
if s.history != nil && s.historyLen > 0 {
|
||||
next = s.history.Concatenate(0, next)
|
||||
nextLen += s.historyLen
|
||||
}
|
||||
|
||||
if s.RepeatLastN > 0 && nextLen > s.RepeatLastN {
|
||||
trim := nextLen - s.RepeatLastN
|
||||
next = next.Slice(mlx.Slice(trim, nextLen))
|
||||
nextLen = s.RepeatLastN
|
||||
}
|
||||
|
||||
s.setHistory(next, nextLen)
|
||||
n := s.history.Dim(0)
|
||||
var newHistory *mlx.Array
|
||||
switch {
|
||||
case n == 1:
|
||||
newHistory = nil
|
||||
case row == 0:
|
||||
newHistory = s.history.Slice(mlx.Slice(1, n), mlx.Slice())
|
||||
case row == n-1:
|
||||
newHistory = s.history.Slice(mlx.Slice(0, row), mlx.Slice())
|
||||
default:
|
||||
before := s.history.Slice(mlx.Slice(0, row), mlx.Slice())
|
||||
after := s.history.Slice(mlx.Slice(row+1, n), mlx.Slice())
|
||||
newHistory = before.Concatenate(0, after)
|
||||
}
|
||||
|
||||
mlx.Pin(newHistory)
|
||||
mlx.Unpin(s.history)
|
||||
s.history = newHistory
|
||||
}
|
||||
|
||||
// Free releases the pooled history tensor and resets the sampler to the
|
||||
// New-equivalent state so it may be reused.
|
||||
func (s *Sampler) Free() {
|
||||
s.setHistory(nil, 0)
|
||||
mlx.Unpin(s.history)
|
||||
*s = Sampler{
|
||||
byID: make(map[int]*slotState),
|
||||
allSameOpts: true,
|
||||
numCtx: s.numCtx,
|
||||
}
|
||||
}
|
||||
|
||||
// Sample runs the configured transform chain on the raw per-token logits
|
||||
// and returns the sampled token id plus, when configured, the reported
|
||||
// log-probability tensors for the selected token and the top-K tokens.
|
||||
func (s *Sampler) Sample(logits *mlx.Array) Result {
|
||||
scores := logits
|
||||
for _, transform := range s.transforms {
|
||||
scores = transform(s, scores)
|
||||
// Sample draws one token per row of logits ([B,V]); seqIDs[i] names the
|
||||
// slot whose logits live at row i. Each sampled token is appended to its
|
||||
// slot's ring. Slots not named in seqIDs are untouched.
|
||||
func (s *Sampler) Sample(seqIDs []int, logits *mlx.Array) Result {
|
||||
if len(seqIDs) == 0 {
|
||||
return Result{}
|
||||
}
|
||||
res := Result{Token: scores}
|
||||
|
||||
if s.Logprobs {
|
||||
// Compute log_softmax in fp32 and subtract the max before
|
||||
// logsumexp so the final subtraction stays on small values.
|
||||
// Otherwise it cancels two large numbers and loses precision.
|
||||
slots := make([]*slotState, len(seqIDs))
|
||||
for i, id := range seqIDs {
|
||||
slot, ok := s.byID[id]
|
||||
if !ok {
|
||||
panic(fmt.Sprintf("sample.Sampler.Sample: seqID %d not registered", id))
|
||||
}
|
||||
slots[i] = slot
|
||||
}
|
||||
|
||||
var token *mlx.Array
|
||||
if opts0, ok := s.canBatch(slots); ok {
|
||||
token = s.sampleTokensUniform(slots, opts0, logits)
|
||||
} else {
|
||||
token = s.sampleTokensSerial(slots, logits)
|
||||
}
|
||||
|
||||
res := Result{Token: token}
|
||||
if s.anyLogprobs {
|
||||
// Log-softmax over original logits so every row holds a truthful
|
||||
// value (compute-for-all; consumers filter per-slot). Subtract
|
||||
// max first for numerical stability in the logsumexp.
|
||||
lp := logits.AsType(mlx.DTypeFloat32)
|
||||
lp = lp.Subtract(lp.MaxAxis(-1, true))
|
||||
lp = lp.Subtract(lp.Logsumexp(true))
|
||||
res.Logprob = lp.TakeAlongAxis(res.Token.ExpandDims(-1), -1)
|
||||
if k := s.TopLogprobs; k > 0 {
|
||||
lp = lp.Subtract(lp.LogsumexpAxis(-1, true))
|
||||
res.Logprob = lp.TakeAlongAxis(token.ExpandDims(-1), -1)
|
||||
if s.maxTopLogprobs > 0 {
|
||||
k := s.maxTopLogprobs
|
||||
if vocab := lp.Dim(lp.NumDims() - 1); k > vocab {
|
||||
k = vocab
|
||||
}
|
||||
@@ -176,55 +349,180 @@ func (s *Sampler) Sample(logits *mlx.Array) Result {
|
||||
return res
|
||||
}
|
||||
|
||||
func greedy(_ *Sampler, scores *mlx.Array) *mlx.Array {
|
||||
return scores.Argmax(-1, false)
|
||||
// canBatch reports whether the call can take the uniform batched path.
|
||||
// All slots must share Options; when penalties are active the call must
|
||||
// additionally cover every registered slot in registration order with a
|
||||
// full ring, because the uniform path indexes the pool positionally.
|
||||
func (s *Sampler) canBatch(slots []*slotState) (Options, bool) {
|
||||
if !s.allSameOpts {
|
||||
return Options{}, false
|
||||
}
|
||||
// slots is non-empty (Sample guards) and every slot is registered,
|
||||
// so s.slots[0].opts is the canonical shared value.
|
||||
shared := s.slots[0].opts
|
||||
if !shared.usesHistory() {
|
||||
return shared, true
|
||||
}
|
||||
if len(slots) != len(s.slots) {
|
||||
return Options{}, false
|
||||
}
|
||||
for i, slot := range slots {
|
||||
if s.slots[i] != slot || slot.historyLen < shared.RepeatLastN {
|
||||
return Options{}, false
|
||||
}
|
||||
}
|
||||
return shared, true
|
||||
}
|
||||
|
||||
func temperature(s *Sampler, scores *mlx.Array) *mlx.Array {
|
||||
return mlx.DivScalar(scores, s.Temperature).Categorical(-1)
|
||||
// sampleTokensUniform runs one fused transform pass over the whole batch.
|
||||
// Reached only when canBatch is true, which lets the pool be used in place
|
||||
// with a single PutAlongAxis write-back and no gather.
|
||||
func (s *Sampler) sampleTokensUniform(slots []*slotState, opts Options, logits *mlx.Array) *mlx.Array {
|
||||
B := len(slots)
|
||||
|
||||
var hist *mlx.Array
|
||||
if opts.usesHistory() {
|
||||
hist = s.history
|
||||
if s.historyWidth() > opts.RepeatLastN {
|
||||
hist = hist.Slice(mlx.Slice(), mlx.Slice(0, opts.RepeatLastN))
|
||||
}
|
||||
}
|
||||
|
||||
ctx := &slotCtx{opts: opts, history: hist}
|
||||
scores := logits
|
||||
for _, t := range slots[0].transforms {
|
||||
scores = t(ctx, scores)
|
||||
}
|
||||
token := scores
|
||||
|
||||
if !opts.usesHistory() {
|
||||
return token
|
||||
}
|
||||
|
||||
writeIdxData := make([]int32, B)
|
||||
for i, slot := range slots {
|
||||
writeIdxData[i] = int32(slot.historyLen % opts.RepeatLastN)
|
||||
slot.historyLen++
|
||||
}
|
||||
writeIdx := mlx.NewArrayInt32(writeIdxData, []int32{int32(B), 1})
|
||||
|
||||
s.history.Set(s.history.PutAlongAxis(writeIdx, token.ExpandDims(-1), 1))
|
||||
return token
|
||||
}
|
||||
|
||||
// sampleTokensSerial runs each slot's transforms against its own row of
|
||||
// logits.
|
||||
func (s *Sampler) sampleTokensSerial(slots []*slotState, logits *mlx.Array) *mlx.Array {
|
||||
perSlotTokens := make([]*mlx.Array, len(slots))
|
||||
|
||||
rowOf := make(map[*slotState]int, len(s.slots))
|
||||
for i, slot := range s.slots {
|
||||
rowOf[slot] = i
|
||||
}
|
||||
|
||||
for i, slot := range slots {
|
||||
row := logits.Slice(mlx.Slice(i, i+1), mlx.Slice())
|
||||
|
||||
var hist *mlx.Array
|
||||
if slot.opts.usesHistory() && slot.historyLen > 0 && s.history != nil {
|
||||
poolRow := rowOf[slot]
|
||||
fill := min(slot.historyLen, slot.opts.RepeatLastN)
|
||||
hist = s.history.Slice(
|
||||
mlx.Slice(poolRow, poolRow+1),
|
||||
mlx.Slice(0, fill),
|
||||
)
|
||||
}
|
||||
|
||||
ctx := &slotCtx{opts: slot.opts, history: hist}
|
||||
scores := row
|
||||
for _, t := range slot.transforms {
|
||||
scores = t(ctx, scores)
|
||||
}
|
||||
perSlotTokens[i] = scores
|
||||
}
|
||||
|
||||
token := mlx.Concatenate(perSlotTokens, 0)
|
||||
|
||||
if s.history != nil {
|
||||
// For each writing slot collect its flat (row-major) pool offset
|
||||
// and the call-order position of its token. One PutAlongAxis on a
|
||||
// flat view of the pool scatters all writes in a single op.
|
||||
flatOffsets := make([]int32, 0, len(slots))
|
||||
tokenPos := make([]int32, 0, len(slots))
|
||||
for i, slot := range slots {
|
||||
if !slot.opts.usesHistory() {
|
||||
continue
|
||||
}
|
||||
ringPos := slot.historyLen % slot.opts.RepeatLastN
|
||||
flatOffsets = append(flatOffsets, int32(rowOf[slot]*s.historyWidth()+ringPos))
|
||||
tokenPos = append(tokenPos, int32(i))
|
||||
slot.historyLen++
|
||||
}
|
||||
|
||||
if len(flatOffsets) > 0 {
|
||||
m := len(flatOffsets)
|
||||
flatIdx := mlx.NewArrayInt32(flatOffsets, []int32{int32(m), 1})
|
||||
writingTokens := token
|
||||
if m != len(slots) {
|
||||
tokenPosIdx := mlx.NewArrayInt32(tokenPos, []int32{int32(m)})
|
||||
writingTokens = token.TakeAxis(tokenPosIdx, 0)
|
||||
}
|
||||
flatHist := s.history.Reshape(s.history.Dim(0)*s.historyWidth(), 1)
|
||||
s.history.Set(flatHist.PutAlongAxis(flatIdx, writingTokens.ExpandDims(-1), 0).Reshape(s.history.Dim(0), s.historyWidth()))
|
||||
}
|
||||
}
|
||||
return token
|
||||
}
|
||||
|
||||
func greedy(_ *slotCtx, scores *mlx.Array) *mlx.Array {
|
||||
return scores.Argmax(-1, false).AsType(mlx.DTypeInt32)
|
||||
}
|
||||
|
||||
func temperature(ctx *slotCtx, scores *mlx.Array) *mlx.Array {
|
||||
return mlx.DivScalar(scores, ctx.opts.Temperature).Categorical(-1).AsType(mlx.DTypeInt32)
|
||||
}
|
||||
|
||||
// topKTopP applies top-P in a descending sort pass and, when top-K is also
|
||||
// configured, masks any surviving value below the K-th largest in the same
|
||||
// pass. Callers dispatch here whenever top-P is enabled — the top-K-only
|
||||
// case uses a cheaper partial sort via the topK transform.
|
||||
func topKTopP(s *Sampler, scores *mlx.Array) *mlx.Array {
|
||||
// pass. Callers dispatch here whenever top-P is enabled — the top-K-only case
|
||||
// uses a cheaper partial sort via the topK transform.
|
||||
func topKTopP(ctx *slotCtx, scores *mlx.Array) *mlx.Array {
|
||||
vocab := scores.Dim(scores.NumDims() - 1)
|
||||
applyTopK := s.TopK > 0 && s.TopK < vocab
|
||||
applyTopK := ctx.opts.TopK > 0 && ctx.opts.TopK < vocab
|
||||
|
||||
order := scores.Negative().ArgsortAxis(-1)
|
||||
sorted := scores.TakeAlongAxis(order, -1)
|
||||
negInf := mlx.FromValue(float32(math.Inf(-1)))
|
||||
|
||||
// Top-P: in descending order, keep tokens whose exclusive cumulative
|
||||
// probability is still below s.TopP.
|
||||
// probability is still below TopP.
|
||||
probs := mlx.SoftmaxAxis(sorted, -1, true)
|
||||
prevCumProbs := probs.Cumsum(-1, false, true).Subtract(probs)
|
||||
keep := prevCumProbs.Less(mlx.FromValue(s.TopP))
|
||||
keep := prevCumProbs.Less(mlx.FromValue(ctx.opts.TopP))
|
||||
sorted = mlx.Where(keep, sorted, negInf)
|
||||
|
||||
out := scores.PutAlongAxis(order, sorted, -1)
|
||||
|
||||
// Top-K: sorted is already in descending order, so positions [K, V)
|
||||
// are the ones to drop. Scatter -inf through their original-layout
|
||||
// indices (order[K:]). Positional (not value-based) so exactly K
|
||||
// tokens survive — ties at the K-th logit get broken by the sort
|
||||
// order rather than promoted through the filter.
|
||||
// Top-K: sorted is already in descending order, so positions [K, V) are
|
||||
// the ones to drop. Scatter -inf through their original-layout indices
|
||||
// (order[K:]). Positional (not value-based) so exactly K tokens survive —
|
||||
// ties at the K-th logit get broken by the sort order rather than
|
||||
// promoted through the filter.
|
||||
if applyTopK {
|
||||
dropOrder := order.Slice(mlx.Slice(), mlx.Slice(s.TopK, mlx.End))
|
||||
dropOrder := order.Slice(mlx.Slice(), mlx.Slice(ctx.opts.TopK, mlx.End))
|
||||
out = out.PutAlongAxis(dropOrder, negInf, -1)
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func minP(s *Sampler, scores *mlx.Array) *mlx.Array {
|
||||
if s.MinP <= 0 || s.MinP > 1 {
|
||||
func minP(ctx *slotCtx, scores *mlx.Array) *mlx.Array {
|
||||
if ctx.opts.MinP <= 0 || ctx.opts.MinP > 1 {
|
||||
return scores
|
||||
}
|
||||
|
||||
maxScore := scores.MaxAxis(-1, true)
|
||||
threshold := mlx.AddScalar(maxScore, float32(math.Log(float64(s.MinP))))
|
||||
threshold := mlx.AddScalar(maxScore, float32(math.Log(float64(ctx.opts.MinP))))
|
||||
|
||||
return mlx.Where(
|
||||
scores.Less(threshold),
|
||||
@@ -233,48 +531,43 @@ func minP(s *Sampler, scores *mlx.Array) *mlx.Array {
|
||||
)
|
||||
}
|
||||
|
||||
func topK(s *Sampler, scores *mlx.Array) *mlx.Array {
|
||||
if s.TopK <= 0 {
|
||||
func topK(ctx *slotCtx, scores *mlx.Array) *mlx.Array {
|
||||
if ctx.opts.TopK <= 0 {
|
||||
return scores
|
||||
}
|
||||
|
||||
vocab := scores.Dim(scores.NumDims() - 1)
|
||||
if s.TopK >= vocab {
|
||||
if ctx.opts.TopK >= vocab {
|
||||
return scores
|
||||
}
|
||||
|
||||
mask := scores.Negative().ArgpartitionAxis(s.TopK-1, -1).Slice(mlx.Slice(), mlx.Slice(s.TopK, mlx.End))
|
||||
mask := scores.Negative().ArgpartitionAxis(ctx.opts.TopK-1, -1).Slice(mlx.Slice(), mlx.Slice(ctx.opts.TopK, mlx.End))
|
||||
return scores.PutAlongAxis(mask, mlx.FromValue(float32(math.Inf(-1))), -1)
|
||||
}
|
||||
|
||||
func penalty(s *Sampler, scores *mlx.Array) *mlx.Array {
|
||||
if s.historyLen == 0 {
|
||||
func penalty(ctx *slotCtx, scores *mlx.Array) *mlx.Array {
|
||||
tokenIndices := ctx.history
|
||||
if tokenIndices == nil {
|
||||
return scores
|
||||
}
|
||||
|
||||
tokenIndices := s.history
|
||||
if scores.NumDims() > 1 {
|
||||
tokenIndices = tokenIndices.ExpandDims(0)
|
||||
}
|
||||
|
||||
if s.RepeatPenalty != 1 || s.PresencePenalty != 0 {
|
||||
if ctx.opts.RepeatPenalty != 1 || ctx.opts.PresencePenalty != 0 {
|
||||
adjusted := scores.TakeAlongAxis(tokenIndices, -1)
|
||||
if s.RepeatPenalty != 1 {
|
||||
if ctx.opts.RepeatPenalty != 1 {
|
||||
factor := mlx.Where(
|
||||
adjusted.Less(mlx.FromValue(float32(0))),
|
||||
mlx.FromValue(s.RepeatPenalty),
|
||||
mlx.FromValue(1/s.RepeatPenalty),
|
||||
mlx.FromValue(ctx.opts.RepeatPenalty),
|
||||
mlx.FromValue(1/ctx.opts.RepeatPenalty),
|
||||
)
|
||||
adjusted = adjusted.Multiply(factor)
|
||||
}
|
||||
if s.PresencePenalty != 0 {
|
||||
adjusted = mlx.AddScalar(adjusted, -s.PresencePenalty)
|
||||
if ctx.opts.PresencePenalty != 0 {
|
||||
adjusted = mlx.AddScalar(adjusted, -ctx.opts.PresencePenalty)
|
||||
}
|
||||
scores = scores.PutAlongAxis(tokenIndices, adjusted, -1)
|
||||
}
|
||||
|
||||
if s.FrequencyPenalty != 0 {
|
||||
scores = scores.ScatterAddAxis(tokenIndices, mlx.FromValue(-s.FrequencyPenalty), -1)
|
||||
if ctx.opts.FrequencyPenalty != 0 {
|
||||
scores = scores.ScatterAddAxis(tokenIndices, mlx.FromValue(-ctx.opts.FrequencyPenalty), -1)
|
||||
}
|
||||
|
||||
return scores
|
||||
|
||||
@@ -9,93 +9,298 @@ import (
|
||||
"github.com/ollama/ollama/x/mlxrunner/mlx"
|
||||
)
|
||||
|
||||
func TestPresencePenaltyUsesAppendedTokenImmediately(t *testing.T) {
|
||||
s := New(Options{RepeatLastN: 1, PresencePenalty: 6})
|
||||
defer func() {
|
||||
s.Free()
|
||||
mlx.Sweep()
|
||||
}()
|
||||
|
||||
s.ResetHistory([]int32{0})
|
||||
s.AppendToken(mlx.NewArrayInt32([]int32{1}, []int32{1}))
|
||||
|
||||
logits := mlx.FromValues([]float32{0, 5, 4}, 3)
|
||||
got := s.Sample(logits).Token
|
||||
mlx.Eval(got)
|
||||
|
||||
// logits will be [0, -1, 4] after the penalty
|
||||
// and then (index) 2 after the greedy sampler
|
||||
gotInt := got.Int()
|
||||
if gotInt != 2 {
|
||||
t.Fatalf("got %d, want 2", gotInt)
|
||||
func skipIfNoMLX(t *testing.T) {
|
||||
t.Helper()
|
||||
if err := mlx.CheckInit(); err != nil {
|
||||
t.Skipf("MLX not available: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRepeatPenaltyUsesHistoryWithoutPresencePenalty(t *testing.T) {
|
||||
s := New(Options{RepeatLastN: 1, RepeatPenalty: 2})
|
||||
defer func() {
|
||||
// slotLogits builds a [1, V] logits tensor for a single-slot Sample call.
|
||||
func slotLogits(values []float32) *mlx.Array {
|
||||
return mlx.FromValues(values, 1, len(values))
|
||||
}
|
||||
|
||||
// batchLogits stacks per-row float32 slices of equal length into a [B, V]
|
||||
// logits tensor.
|
||||
func batchLogits(rows ...[]float32) *mlx.Array {
|
||||
v := len(rows[0])
|
||||
flat := make([]float32, 0, len(rows)*v)
|
||||
for _, r := range rows {
|
||||
if len(r) != v {
|
||||
panic("batchLogits: rows must share vocab size")
|
||||
}
|
||||
flat = append(flat, r...)
|
||||
}
|
||||
return mlx.FromValues(flat, len(rows), v)
|
||||
}
|
||||
|
||||
// sampleOne runs Sample on a freshly-added single slot and returns the
|
||||
// sampled token id. Used both for the single-slot options table and as the
|
||||
// reference oracle for the batched-equivalence test.
|
||||
func sampleOne(t *testing.T, opts Options, priorTokens []int32, values []float32) int {
|
||||
t.Helper()
|
||||
s := New(128)
|
||||
t.Cleanup(func() {
|
||||
s.Free()
|
||||
mlx.Sweep()
|
||||
}()
|
||||
})
|
||||
s.Add(0, opts, priorTokens)
|
||||
|
||||
s.ResetHistory([]int32{1})
|
||||
|
||||
logits := mlx.FromValues([]float32{0, 5, 4}, 3)
|
||||
got := s.Sample(logits).Token
|
||||
got := s.Sample([]int{0}, slotLogits(values)).Token
|
||||
mlx.Eval(got)
|
||||
return got.Int()
|
||||
}
|
||||
|
||||
// token 1 is repeated and positive, so 5 / 2 falls below token 2.
|
||||
gotInt := got.Int()
|
||||
if gotInt != 2 {
|
||||
t.Fatalf("got %d, want 2", gotInt)
|
||||
// logOf returns log(p) as a float32 so tests can build logits that softmax to
|
||||
// a chosen probability distribution.
|
||||
func logOf(p float64) float32 { return float32(math.Log(p)) }
|
||||
|
||||
// TestSampleSingleSlotOptions pins the per-slot behavior of each Options
|
||||
// knob against a concrete expected token. Expected values are worked out by
|
||||
// hand from the math of each transform, not from a second call into the
|
||||
// sampler — so a regression in any single transform shows up here.
|
||||
func TestSampleSingleSlotOptions(t *testing.T) {
|
||||
skipIfNoMLX(t)
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
opts Options
|
||||
priors []int32
|
||||
logits []float32
|
||||
want int
|
||||
}{
|
||||
{
|
||||
name: "presence penalty",
|
||||
opts: Options{RepeatLastN: 1, PresencePenalty: 6},
|
||||
priors: []int32{1},
|
||||
logits: []float32{0, 5, 4},
|
||||
want: 2, // token 1: 5 - 6 = -1, argmax shifts to 2
|
||||
},
|
||||
{
|
||||
name: "repeat penalty on positive logits",
|
||||
opts: Options{RepeatLastN: 1, RepeatPenalty: 2},
|
||||
priors: []int32{1},
|
||||
logits: []float32{0, 5, 4},
|
||||
want: 2, // token 1 positive → divided: 5/2 = 2.5, argmax shifts to 2
|
||||
},
|
||||
{
|
||||
name: "repeat penalty on negative logits",
|
||||
opts: Options{RepeatLastN: 1, RepeatPenalty: 4},
|
||||
priors: []int32{1},
|
||||
logits: []float32{-5, -1, -3},
|
||||
want: 2, // token 1 negative → multiplied: -1*4 = -4, argmax shifts to 2
|
||||
},
|
||||
{
|
||||
name: "frequency penalty",
|
||||
opts: Options{RepeatLastN: 4, FrequencyPenalty: 2},
|
||||
priors: []int32{1, 1},
|
||||
logits: []float32{0, 5, 4},
|
||||
want: 2, // 5 - 2*count(1)=2*2=4 → 1, argmax shifts to 2
|
||||
},
|
||||
{
|
||||
name: "top-k",
|
||||
opts: Options{Temperature: 1, TopK: 1},
|
||||
logits: []float32{1, 5, 4},
|
||||
want: 1, // only argmax survives → deterministic even with temperature
|
||||
},
|
||||
{
|
||||
name: "top-p",
|
||||
opts: Options{Temperature: 1, TopP: 0.4},
|
||||
logits: []float32{logOf(0.5), logOf(0.3), logOf(0.2)},
|
||||
want: 0, // exclusive cumsum below 0.4 keeps only token 0
|
||||
},
|
||||
{
|
||||
name: "min-p",
|
||||
opts: Options{Temperature: 1, MinP: 0.7},
|
||||
logits: []float32{logOf(0.5), logOf(0.3), logOf(0.2)},
|
||||
want: 0, // threshold 0.5*0.7=0.35 drops all but the top token
|
||||
},
|
||||
{
|
||||
name: "RepeatLastN=0 disables penalties",
|
||||
opts: Options{RepeatLastN: 0, RepeatPenalty: 2, PresencePenalty: 10},
|
||||
priors: []int32{1},
|
||||
logits: []float32{0, 5, 4},
|
||||
want: 1, // 0 = disabled per API contract, argmax unchanged
|
||||
},
|
||||
{
|
||||
name: "RepeatLastN=-1 resolves to num_ctx",
|
||||
opts: Options{RepeatLastN: -1, PresencePenalty: 6},
|
||||
priors: []int32{1},
|
||||
logits: []float32{0, 5, 4},
|
||||
want: 2, // -1 → num_ctx (128); penalty applies, argmax shifts
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := sampleOne(t, tc.opts, tc.priors, tc.logits); got != tc.want {
|
||||
t.Errorf("got %d, want %d", got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFrequencyPenaltyUsesTokenCounts(t *testing.T) {
|
||||
s := New(Options{RepeatLastN: 4, FrequencyPenalty: 2})
|
||||
defer func() {
|
||||
// TestSampleHistoryWindow verifies that penalty history respects the
|
||||
// RepeatLastN window: priors longer than RepeatLastN are trimmed on Add,
|
||||
// and once the ring wraps, tokens that rotate out no longer contribute
|
||||
// to penalties.
|
||||
func TestSampleHistoryWindow(t *testing.T) {
|
||||
skipIfNoMLX(t)
|
||||
|
||||
s := New(128)
|
||||
t.Cleanup(func() {
|
||||
s.Free()
|
||||
mlx.Sweep()
|
||||
}()
|
||||
})
|
||||
|
||||
s.ResetHistory([]int32{1, 1})
|
||||
// RepeatLastN=2 with priors {1, 2, 3}: makeHistoryRow keeps only
|
||||
// {2, 3}. Token 1 was trimmed — its penalty is NOT active.
|
||||
s.Add(0, Options{RepeatLastN: 2, PresencePenalty: 10}, []int32{1, 2, 3})
|
||||
|
||||
logits := mlx.FromValues([]float32{0, 5, 4}, 3)
|
||||
got := s.Sample(logits).Token
|
||||
mlx.Eval(got)
|
||||
// Step 1: logits favor token 1 (trimmed). If the trim were broken it
|
||||
// would be penalized and the argmax would move.
|
||||
step1 := s.Sample([]int{0}, slotLogits([]float32{0, 5, 0, 0, 0})).Token
|
||||
mlx.Eval(step1)
|
||||
if got := step1.Int(); got != 1 {
|
||||
t.Fatalf("step 1 = %d, want 1 (token 1 trimmed from priors)", got)
|
||||
}
|
||||
// After step 1 the ring holds {1, 3}; token 2 has rotated out.
|
||||
|
||||
// token 1 appears twice, so 5 - (2 * 2) falls below token 2.
|
||||
gotInt := got.Int()
|
||||
if gotInt != 2 {
|
||||
t.Fatalf("got %d, want 2", gotInt)
|
||||
// Step 2: logits favor token 2 (rotated out). If the ring wrap were
|
||||
// wrong, token 2 would still be penalized.
|
||||
step2 := s.Sample([]int{0}, slotLogits([]float32{0, 0, 5, 0, 0})).Token
|
||||
mlx.Eval(step2)
|
||||
if got := step2.Int(); got != 2 {
|
||||
t.Fatalf("step 2 = %d, want 2 (token 2 rotated out of ring)", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinPMasksTokensBelowThreshold(t *testing.T) {
|
||||
s := New(Options{MinP: 0.5})
|
||||
defer func() {
|
||||
s.Free()
|
||||
mlx.Sweep()
|
||||
}()
|
||||
// TestBatchSamplingPreservesPerSlotBehavior is the core equivalence test:
|
||||
// for every representative dispatch branch (uniform, serial on mixed opts,
|
||||
// serial on partial ring, subset/out-of-order), a batched Sample call must
|
||||
// produce the same token per row as running the same slot alone.
|
||||
func TestBatchSamplingPreservesPerSlotBehavior(t *testing.T) {
|
||||
skipIfNoMLX(t)
|
||||
|
||||
logits := mlx.FromValues([]float32{
|
||||
float32(math.Log(0.5)),
|
||||
float32(math.Log(0.3)),
|
||||
float32(math.Log(0.2)),
|
||||
}, 3)
|
||||
got := minP(s, logits)
|
||||
mlx.Eval(got)
|
||||
|
||||
gotFloats := got.Floats()
|
||||
if len(gotFloats) != 3 {
|
||||
t.Fatalf("got %d scores, want 3", len(gotFloats))
|
||||
type slot struct {
|
||||
id int
|
||||
opts Options
|
||||
priors []int32
|
||||
}
|
||||
|
||||
if math.IsInf(float64(gotFloats[0]), -1) || math.IsInf(float64(gotFloats[1]), -1) {
|
||||
t.Fatalf("kept tokens were masked: %v", gotFloats)
|
||||
cases := []struct {
|
||||
name string
|
||||
slots []slot
|
||||
sample []int
|
||||
rows [][]float32
|
||||
}{
|
||||
{
|
||||
name: "uniform",
|
||||
slots: []slot{
|
||||
{10, Options{RepeatLastN: 2, PresencePenalty: 5}, []int32{1, 2}},
|
||||
{20, Options{RepeatLastN: 2, PresencePenalty: 5}, []int32{0, 2}},
|
||||
},
|
||||
sample: []int{10, 20},
|
||||
rows: [][]float32{{0, 5, 4}, {3, 0, 0}},
|
||||
},
|
||||
{
|
||||
name: "serial — mixed opts",
|
||||
slots: []slot{
|
||||
{1, Options{RepeatLastN: 1, RepeatPenalty: 2}, []int32{1}},
|
||||
{2, Options{Temperature: 1, TopK: 1}, nil},
|
||||
},
|
||||
sample: []int{1, 2},
|
||||
rows: [][]float32{{0, 5, 4, 1}, {2, 1, 5, 3}},
|
||||
},
|
||||
{
|
||||
name: "serial — partial ring",
|
||||
slots: []slot{
|
||||
{1, Options{RepeatLastN: 4, PresencePenalty: 5}, []int32{1, 1, 1, 1}},
|
||||
{2, Options{RepeatLastN: 4, PresencePenalty: 5}, []int32{2}},
|
||||
},
|
||||
sample: []int{1, 2},
|
||||
rows: [][]float32{{0, 5, 4}, {0, 4, 5}},
|
||||
},
|
||||
{
|
||||
name: "subset out-of-order",
|
||||
slots: []slot{
|
||||
{10, Options{RepeatLastN: 2, PresencePenalty: 10}, []int32{1, 1}},
|
||||
{20, Options{RepeatLastN: 2, PresencePenalty: 10}, []int32{2, 2}},
|
||||
{30, Options{RepeatLastN: 2, PresencePenalty: 10}, []int32{3, 3}},
|
||||
},
|
||||
sample: []int{30, 10},
|
||||
rows: [][]float32{{5, 5, 5, 0, 5, 5}, {5, 0, 5, 5, 0, 5}},
|
||||
},
|
||||
}
|
||||
|
||||
if !math.IsInf(float64(gotFloats[2]), -1) {
|
||||
t.Fatalf("lowest-probability token should be masked, got %v", gotFloats)
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
// Per-slot reference for each sampled seq.
|
||||
want := make([]int, len(tc.sample))
|
||||
for i, id := range tc.sample {
|
||||
var spec slot
|
||||
for _, s := range tc.slots {
|
||||
if s.id == id {
|
||||
spec = s
|
||||
break
|
||||
}
|
||||
}
|
||||
want[i] = sampleOne(t, spec.opts, spec.priors, tc.rows[i])
|
||||
}
|
||||
|
||||
// Batched call.
|
||||
s := New(128)
|
||||
t.Cleanup(func() {
|
||||
s.Free()
|
||||
mlx.Sweep()
|
||||
})
|
||||
for _, spec := range tc.slots {
|
||||
s.Add(spec.id, spec.opts, spec.priors)
|
||||
}
|
||||
res := s.Sample(tc.sample, batchLogits(tc.rows...))
|
||||
mlx.Eval(res.Token)
|
||||
got := res.Token.Ints()
|
||||
|
||||
for i, id := range tc.sample {
|
||||
if got[i] != want[i] {
|
||||
t.Errorf("seq %d: batched = %d, per-slot = %d", id, got[i], want[i])
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestRemoveDoesNotLeakHistory: after Remove, a newly-added slot at the
|
||||
// recycled row must start from its own priors only — no carryover from
|
||||
// the removed slot's history.
|
||||
func TestRemoveDoesNotLeakHistory(t *testing.T) {
|
||||
skipIfNoMLX(t)
|
||||
|
||||
opts := Options{RepeatLastN: 1, PresencePenalty: 10}
|
||||
s := New(128)
|
||||
t.Cleanup(func() {
|
||||
s.Free()
|
||||
mlx.Sweep()
|
||||
})
|
||||
s.Add(1, opts, []int32{1})
|
||||
s.Add(2, opts, []int32{2})
|
||||
s.Remove(1)
|
||||
s.Add(3, opts, []int32{0})
|
||||
|
||||
// Slot 2 retains history {2}; slot 3 retains history {0}. With
|
||||
// equal logits and PresencePenalty=10 the argmax drops to the first
|
||||
// unpenalized token.
|
||||
res := s.Sample([]int{2, 3}, batchLogits(
|
||||
[]float32{3, 3, 0},
|
||||
[]float32{3, 3, 0},
|
||||
))
|
||||
mlx.Eval(res.Token)
|
||||
tokens := res.Token.Ints()
|
||||
if tokens[0] != 0 {
|
||||
t.Errorf("slot 2 = %d, want 0 (token 2 penalized)", tokens[0])
|
||||
}
|
||||
if tokens[1] != 1 {
|
||||
t.Errorf("slot 3 = %d, want 1 (token 0 penalized, no slot-1 carryover)", tokens[1])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,7 +93,7 @@ func Execute(args []string) error {
|
||||
}
|
||||
|
||||
request.Pipeline = runner.TextGenerationPipeline
|
||||
request.Sampler = sample.New(sample.Options{
|
||||
request.SamplerOpts = sample.Options{
|
||||
Temperature: request.Options.Temperature,
|
||||
TopP: request.Options.TopP,
|
||||
MinP: request.Options.MinP,
|
||||
@@ -104,7 +104,7 @@ func Execute(args []string) error {
|
||||
FrequencyPenalty: request.Options.FrequencyPenalty,
|
||||
Logprobs: request.Logprobs,
|
||||
TopLogprobs: request.TopLogprobs,
|
||||
})
|
||||
}
|
||||
|
||||
if err := runner.Prepare(&request); err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
|
||||
@@ -78,13 +78,14 @@ func (l *Linear) OutputDim() int32 {
|
||||
|
||||
// QuantizedLinear applies an affine transformation using quantized weights.
|
||||
type QuantizedLinear struct {
|
||||
Weight *mlx.Array // Quantized weight data
|
||||
Scales *mlx.Array // Scale factors for dequantization
|
||||
QBiases *mlx.Array // Quantization biases (nil for nvfp4)
|
||||
Bias *mlx.Array // Layer bias [output_dims] or nil
|
||||
GroupSize int
|
||||
Bits int
|
||||
Mode string
|
||||
Weight *mlx.Array // Quantized weight data
|
||||
Scales *mlx.Array // Scale factors for dequantization
|
||||
QBiases *mlx.Array // Quantization biases (nil for nvfp4)
|
||||
Bias *mlx.Array // Layer bias [output_dims] or nil
|
||||
GlobalScale *mlx.Array // Per-tensor global scale for double-scale nvfp4 (nil for standard)
|
||||
GroupSize int
|
||||
Bits int
|
||||
Mode string
|
||||
}
|
||||
|
||||
func NewQuantizedLinear(weight *mlx.Array, bias *mlx.Array, groupSize, bits int, mode string) *QuantizedLinear {
|
||||
@@ -106,7 +107,18 @@ func NewQuantizedLinear(weight *mlx.Array, bias *mlx.Array, groupSize, bits int,
|
||||
}
|
||||
|
||||
func (ql *QuantizedLinear) Forward(x *mlx.Array) *mlx.Array {
|
||||
out := mlx.QuantizedMatmul(x, ql.Weight, ql.Scales, ql.QBiases, true, ql.GroupSize, ql.Bits, ql.Mode)
|
||||
var out *mlx.Array
|
||||
if ql.GlobalScale != nil {
|
||||
// Double-scale nvfp4 (e.g., NVIDIA ModelOpt): standard quantized_matmul
|
||||
// followed by global_scale multiply. The global_scale is a per-tensor
|
||||
// F32 scalar (weight_scale_2 in NVIDIA's format).
|
||||
// TODO: switch to a fused double-scale matmul once MLX has kernel
|
||||
// coverage for this path.
|
||||
out = mlx.QuantizedMatmul(x, ql.Weight, ql.Scales, ql.QBiases, true, ql.GroupSize, ql.Bits, ql.Mode)
|
||||
out = mlx.Mul(out, ql.GlobalScale)
|
||||
} else {
|
||||
out = mlx.QuantizedMatmul(x, ql.Weight, ql.Scales, ql.QBiases, true, ql.GroupSize, ql.Bits, ql.Mode)
|
||||
}
|
||||
if ql.Bias != nil && ql.Bias.Valid() {
|
||||
out = out.Add(ql.Bias)
|
||||
}
|
||||
|
||||
@@ -110,6 +110,19 @@ func NewTensorDataFromBytes(name, dtype string, shape []int32, rawData []byte) *
|
||||
}
|
||||
}
|
||||
|
||||
// NewTensorDataFromReaderAt creates a TensorData backed by an arbitrary
|
||||
// io.ReaderAt. This is useful for constructing large synthetic tensors from
|
||||
// temporary files without loading the full payload into memory.
|
||||
func NewTensorDataFromReaderAt(name, dtype string, shape []int32, readerAt io.ReaderAt, size int64) *TensorData {
|
||||
return &TensorData{
|
||||
Name: name,
|
||||
Dtype: dtype,
|
||||
Shape: shape,
|
||||
Size: size,
|
||||
reader: io.NewSectionReader(readerAt, 0, size),
|
||||
}
|
||||
}
|
||||
|
||||
// ExtractRawFromSafetensors reads a safetensors-wrapped reader and extracts
|
||||
// the raw tensor data bytes (stripping the header).
|
||||
func ExtractRawFromSafetensors(r io.Reader) ([]byte, error) {
|
||||
|
||||
102
x/server/show.go
102
x/server/show.go
@@ -306,15 +306,16 @@ func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
|
||||
}
|
||||
|
||||
// GetSafetensorsDtype returns the quantization type for a safetensors model.
|
||||
// Reads quant_type from the first tensor blob's __metadata__.
|
||||
// Falls back to torch_dtype from config.json if no quant metadata.
|
||||
// Reads tensor headers until quantized weights are found.
|
||||
// Falls back to torch_dtype from config.json if no quant metadata exists.
|
||||
func GetSafetensorsDtype(name model.Name) (string, error) {
|
||||
mf, err := manifest.ParseNamedManifest(name)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to load manifest: %w", err)
|
||||
}
|
||||
|
||||
// Check first tensor blob for quant_type metadata
|
||||
// Mixed models can start with unquantized embeddings or heads, so scan until
|
||||
// any tensor blob reports quantized weight metadata.
|
||||
for _, layer := range mf.Layers {
|
||||
if layer.MediaType != manifest.MediaTypeImageTensor {
|
||||
continue
|
||||
@@ -323,15 +324,20 @@ func GetSafetensorsDtype(name model.Name) (string, error) {
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
info, err := readSafetensorsHeader(blobPath)
|
||||
f, err := os.Open(blobPath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if quantType := canonicalQuantType(info.QuantType); quantType != "" {
|
||||
return quantType, nil
|
||||
infos, err := parseSafetensorsAllHeaders(f)
|
||||
_ = f.Close()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
for _, info := range infos {
|
||||
if quantType := canonicalQuantType(info.QuantType); quantType != "" {
|
||||
return quantType, nil
|
||||
}
|
||||
}
|
||||
// Only check the first tensor blob
|
||||
break
|
||||
}
|
||||
|
||||
// Not quantized - return torch_dtype from config.json
|
||||
@@ -354,86 +360,6 @@ type safetensorsTensorInfo struct {
|
||||
GroupSize string // from __metadata__.group_size (e.g., "32", "64")
|
||||
}
|
||||
|
||||
// readSafetensorsHeader reads the JSON header from a safetensors file to get tensor metadata.
|
||||
// Safetensors format: 8-byte header size (little endian) + JSON header + tensor data
|
||||
func readSafetensorsHeader(path string) (*safetensorsTensorInfo, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return parseSafetensorsHeader(f)
|
||||
}
|
||||
|
||||
// parseSafetensorsHeader parses a safetensors header from a reader.
|
||||
// This is separated for testability.
|
||||
// Parses __metadata__ for quant_type and group_size if present.
|
||||
func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
|
||||
// Read header size (8 bytes, little endian)
|
||||
var headerSize uint64
|
||||
if err := binary.Read(r, binary.LittleEndian, &headerSize); err != nil {
|
||||
return nil, fmt.Errorf("failed to read header size: %w", err)
|
||||
}
|
||||
|
||||
// Sanity check - header shouldn't be too large
|
||||
if headerSize > 1024*1024 {
|
||||
return nil, fmt.Errorf("header size too large: %d", headerSize)
|
||||
}
|
||||
|
||||
// Read header JSON
|
||||
headerBytes := make([]byte, headerSize)
|
||||
if _, err := io.ReadFull(r, headerBytes); err != nil {
|
||||
return nil, fmt.Errorf("failed to read header: %w", err)
|
||||
}
|
||||
|
||||
// Parse as map of tensor name -> info
|
||||
var header map[string]json.RawMessage
|
||||
if err := json.Unmarshal(headerBytes, &header); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse header: %w", err)
|
||||
}
|
||||
|
||||
// Parse metadata if present
|
||||
var quantType, groupSize string
|
||||
if metaRaw, ok := header["__metadata__"]; ok {
|
||||
var meta map[string]string
|
||||
if json.Unmarshal(metaRaw, &meta) == nil {
|
||||
quantType = meta["quant_type"]
|
||||
groupSize = meta["group_size"]
|
||||
}
|
||||
}
|
||||
|
||||
// Find the main tensor entry (not __metadata__, .scale, or .bias)
|
||||
for name, raw := range header {
|
||||
if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") {
|
||||
continue
|
||||
}
|
||||
var info safetensorsTensorInfo
|
||||
if err := json.Unmarshal(raw, &info); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse tensor info: %w", err)
|
||||
}
|
||||
info.QuantType = quantType
|
||||
info.GroupSize = groupSize
|
||||
return &info, nil
|
||||
}
|
||||
|
||||
// Fall back to first non-metadata tensor entry
|
||||
for name, raw := range header {
|
||||
if name == "__metadata__" {
|
||||
continue
|
||||
}
|
||||
var info safetensorsTensorInfo
|
||||
if err := json.Unmarshal(raw, &info); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse tensor info: %w", err)
|
||||
}
|
||||
info.QuantType = quantType
|
||||
info.GroupSize = groupSize
|
||||
return &info, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("no tensor found in header")
|
||||
}
|
||||
|
||||
// parseSafetensorsAllHeaders parses all tensor entries from a safetensors header.
|
||||
// Returns one safetensorsTensorInfo per main tensor (skipping __metadata__, .scale, .bias).
|
||||
// For packed blobs this returns multiple entries; for single-tensor blobs, one entry.
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/manifest"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
)
|
||||
|
||||
func TestBuildModelInfo(t *testing.T) {
|
||||
@@ -286,168 +287,7 @@ func TestBuildModelInfo_BytesPerParam(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSafetensorsHeader(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
header map[string]any
|
||||
wantDtype string
|
||||
wantShape []int64
|
||||
wantQuantType string
|
||||
wantGroupSize string
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "simple tensor",
|
||||
header: map[string]any{
|
||||
"weight": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 262144},
|
||||
"data_offsets": []int64{0, 1342177280},
|
||||
},
|
||||
},
|
||||
wantDtype: "BF16",
|
||||
wantShape: []int64{2560, 262144},
|
||||
},
|
||||
{
|
||||
name: "tensor keyed by name",
|
||||
header: map[string]any{
|
||||
"model.layers.0.weight": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 2560},
|
||||
"data_offsets": []int64{0, 13107200},
|
||||
},
|
||||
},
|
||||
wantDtype: "BF16",
|
||||
wantShape: []int64{2560, 2560},
|
||||
},
|
||||
{
|
||||
name: "with int4 quant metadata",
|
||||
header: map[string]any{
|
||||
"__metadata__": map[string]any{
|
||||
"quant_type": "int4",
|
||||
"group_size": "32",
|
||||
},
|
||||
"model.layers.0.mlp.up_proj.weight": map[string]any{
|
||||
"dtype": "U32",
|
||||
"shape": []int64{2560, 320},
|
||||
"data_offsets": []int64{0, 3276800},
|
||||
},
|
||||
"model.layers.0.mlp.up_proj.weight.scale": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 80},
|
||||
"data_offsets": []int64{3276800, 3686400},
|
||||
},
|
||||
"model.layers.0.mlp.up_proj.weight.bias": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 80},
|
||||
"data_offsets": []int64{3686400, 4096000},
|
||||
},
|
||||
},
|
||||
wantDtype: "U32",
|
||||
wantShape: []int64{2560, 320},
|
||||
wantQuantType: "int4",
|
||||
wantGroupSize: "32",
|
||||
},
|
||||
{
|
||||
name: "int8 quant metadata",
|
||||
header: map[string]any{
|
||||
"__metadata__": map[string]any{
|
||||
"quant_type": "int8",
|
||||
"group_size": "64",
|
||||
},
|
||||
"model.layers.0.mlp.down_proj.weight": map[string]any{
|
||||
"dtype": "U32",
|
||||
"shape": []int64{2560, 640},
|
||||
"data_offsets": []int64{0, 6553600},
|
||||
},
|
||||
"model.layers.0.mlp.down_proj.weight.scale": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 40},
|
||||
"data_offsets": []int64{6553600, 6963200},
|
||||
},
|
||||
},
|
||||
wantDtype: "U32",
|
||||
wantShape: []int64{2560, 640},
|
||||
wantQuantType: "int8",
|
||||
wantGroupSize: "64",
|
||||
},
|
||||
{
|
||||
name: "with old-style format metadata",
|
||||
header: map[string]any{
|
||||
"__metadata__": map[string]any{
|
||||
"format": "pt",
|
||||
},
|
||||
"bias": map[string]any{
|
||||
"dtype": "F32",
|
||||
"shape": []int64{1024},
|
||||
"data_offsets": []int64{0, 4096},
|
||||
},
|
||||
},
|
||||
wantDtype: "F32",
|
||||
wantShape: []int64{1024},
|
||||
},
|
||||
{
|
||||
name: "float16 tensor",
|
||||
header: map[string]any{
|
||||
"layer.weight": map[string]any{
|
||||
"dtype": "F16",
|
||||
"shape": []int64{512, 512, 3, 3},
|
||||
"data_offsets": []int64{0, 4718592},
|
||||
},
|
||||
},
|
||||
wantDtype: "F16",
|
||||
wantShape: []int64{512, 512, 3, 3},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
// Create safetensors format: 8-byte size + JSON header
|
||||
headerJSON, err := json.Marshal(tt.header)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to marshal header: %v", err)
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
if err := binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON))); err != nil {
|
||||
t.Fatalf("failed to write header size: %v", err)
|
||||
}
|
||||
buf.Write(headerJSON)
|
||||
|
||||
info, err := parseSafetensorsHeader(&buf)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("parseSafetensorsHeader() error = %v, wantErr %v", err, tt.wantErr)
|
||||
return
|
||||
}
|
||||
if tt.wantErr {
|
||||
return
|
||||
}
|
||||
|
||||
if info.Dtype != tt.wantDtype {
|
||||
t.Errorf("Dtype = %v, want %v", info.Dtype, tt.wantDtype)
|
||||
}
|
||||
|
||||
if len(info.Shape) != len(tt.wantShape) {
|
||||
t.Errorf("Shape length = %v, want %v", len(info.Shape), len(tt.wantShape))
|
||||
} else {
|
||||
for i, s := range info.Shape {
|
||||
if s != tt.wantShape[i] {
|
||||
t.Errorf("Shape[%d] = %v, want %v", i, s, tt.wantShape[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if info.QuantType != tt.wantQuantType {
|
||||
t.Errorf("QuantType = %v, want %v", info.QuantType, tt.wantQuantType)
|
||||
}
|
||||
if info.GroupSize != tt.wantGroupSize {
|
||||
t.Errorf("GroupSize = %v, want %v", info.GroupSize, tt.wantGroupSize)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSafetensorsHeader_Errors(t *testing.T) {
|
||||
func TestParseSafetensorsAllHeaders_Errors(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
data []byte
|
||||
@@ -467,7 +307,7 @@ func TestParseSafetensorsHeader_Errors(t *testing.T) {
|
||||
name: "header size too large",
|
||||
data: func() []byte {
|
||||
var buf bytes.Buffer
|
||||
binary.Write(&buf, binary.LittleEndian, uint64(2*1024*1024)) // 2MB
|
||||
binary.Write(&buf, binary.LittleEndian, uint64(200*1024*1024)) // 200 MiB
|
||||
return buf.Bytes()
|
||||
}(),
|
||||
wantErr: "header size too large",
|
||||
@@ -510,7 +350,7 @@ func TestParseSafetensorsHeader_Errors(t *testing.T) {
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
_, err := parseSafetensorsHeader(bytes.NewReader(tt.data))
|
||||
_, err := parseSafetensorsAllHeaders(bytes.NewReader(tt.data))
|
||||
if err == nil {
|
||||
t.Error("expected error, got nil")
|
||||
return
|
||||
@@ -1209,44 +1049,77 @@ func TestGetTensorInfoFromManifest_Packed(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadSafetensorsHeader(t *testing.T) {
|
||||
// Create a temp file with a valid safetensors header
|
||||
tempDir := t.TempDir()
|
||||
func TestGetSafetensorsDtypeScansPastUnquantizedFirstBlob(t *testing.T) {
|
||||
t.Setenv("OLLAMA_MODELS", t.TempDir())
|
||||
|
||||
header := map[string]any{
|
||||
"test_tensor": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{1024, 768},
|
||||
"data_offsets": []int64{0, 1572864},
|
||||
},
|
||||
}
|
||||
headerJSON, _ := json.Marshal(header)
|
||||
writeSafetensorsLayer := func(t *testing.T, header map[string]any, name string) manifest.Layer {
|
||||
t.Helper()
|
||||
|
||||
var buf bytes.Buffer
|
||||
binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON)))
|
||||
buf.Write(headerJSON)
|
||||
headerJSON, err := json.Marshal(header)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to marshal header: %v", err)
|
||||
}
|
||||
|
||||
filePath := filepath.Join(tempDir, "test.safetensors")
|
||||
if err := os.WriteFile(filePath, buf.Bytes(), 0o644); err != nil {
|
||||
t.Fatalf("failed to write test file: %v", err)
|
||||
var buf bytes.Buffer
|
||||
if err := binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON))); err != nil {
|
||||
t.Fatalf("failed to write header size: %v", err)
|
||||
}
|
||||
buf.Write(headerJSON)
|
||||
|
||||
layer, err := manifest.NewLayer(&buf, manifest.MediaTypeImageTensor)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create tensor layer: %v", err)
|
||||
}
|
||||
layer.Name = name
|
||||
return layer
|
||||
}
|
||||
|
||||
info, err := readSafetensorsHeader(filePath)
|
||||
configData, err := json.Marshal(map[string]any{
|
||||
"model_format": "safetensors",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("readSafetensorsHeader() error = %v", err)
|
||||
t.Fatalf("failed to marshal config: %v", err)
|
||||
}
|
||||
configLayer, err := manifest.NewLayer(bytes.NewReader(configData), "application/vnd.docker.container.image.v1+json")
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create config layer: %v", err)
|
||||
}
|
||||
|
||||
if info.Dtype != "BF16" {
|
||||
t.Errorf("Dtype = %v, want BF16", info.Dtype)
|
||||
}
|
||||
if len(info.Shape) != 2 || info.Shape[0] != 1024 || info.Shape[1] != 768 {
|
||||
t.Errorf("Shape = %v, want [1024, 768]", info.Shape)
|
||||
}
|
||||
}
|
||||
unquantized := writeSafetensorsLayer(t, map[string]any{
|
||||
"model.embed_tokens.weight": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{16, 8},
|
||||
"data_offsets": []int64{0, 256},
|
||||
},
|
||||
}, "model.embed_tokens.weight")
|
||||
|
||||
func TestReadSafetensorsHeader_FileNotFound(t *testing.T) {
|
||||
_, err := readSafetensorsHeader("/nonexistent/path/file.safetensors")
|
||||
if err == nil {
|
||||
t.Error("expected error for nonexistent file")
|
||||
quantized := writeSafetensorsLayer(t, map[string]any{
|
||||
"__metadata__": map[string]string{
|
||||
"quant_type": "mxfp8",
|
||||
"group_size": "32",
|
||||
},
|
||||
"model.layers.0.mlp.down_proj.weight": map[string]any{
|
||||
"dtype": "U32",
|
||||
"shape": []int64{16, 4},
|
||||
"data_offsets": []int64{0, 256},
|
||||
},
|
||||
"model.layers.0.mlp.down_proj.weight.scale": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{16, 1},
|
||||
"data_offsets": []int64{256, 288},
|
||||
},
|
||||
}, "model.layers.0.mlp.down_proj.weight")
|
||||
|
||||
name := model.ParseName("mixed-fp8-safetensors")
|
||||
if err := manifest.WriteManifest(name, configLayer, []manifest.Layer{unquantized, quantized}); err != nil {
|
||||
t.Fatalf("failed to write manifest: %v", err)
|
||||
}
|
||||
|
||||
got, err := GetSafetensorsDtype(name)
|
||||
if err != nil {
|
||||
t.Fatalf("GetSafetensorsDtype() error = %v", err)
|
||||
}
|
||||
if got != "mxfp8" {
|
||||
t.Fatalf("GetSafetensorsDtype() = %q, want mxfp8", got)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user