106 changes: 94 additions & 12 deletions docs/providers/dmr/index.md
@@ -64,29 +64,111 @@ models:
model: ai/qwen3
max_tokens: 8192
provider_opts:
runtime_flags: ["--ngl=33", "--top-p=0.9"]
runtime_flags: ["--threads", "8"]
```

Runtime flags also accept a single string:

```yaml
provider_opts:
runtime_flags: "--ngl=33 --top-p=0.9"
runtime_flags: "--threads 8"
```

## Parameter Mapping
Use only flags your Model Runner backend allows (see `docker model configure --help` and the backend docs). **Do not** put sampling parameters (`temperature`, `top_p`, penalties) in `runtime_flags`. Set them on the model config (`temperature`, `top_p`, etc.) instead; they are sent **per request** via the OpenAI-compatible chat API.
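
For example, sampling stays on the model config while engine flags go in `provider_opts` (illustrative values):

```yaml
models:
  local:
    provider: dmr
    model: ai/qwen3
    temperature: 0.7   # sent per request via the chat API
    top_p: 0.9         # sent per request via the chat API
    provider_opts:
      runtime_flags: ["--threads", "8"]   # engine-level flags only
```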

docker-agent model config fields map to llama.cpp flags automatically:
## Context size

| Config | llama.cpp Flag |
| ------------------- | --------------------- |
| `temperature` | `--temp` |
| `top_p` | `--top-p` |
| `frequency_penalty` | `--frequency-penalty` |
| `presence_penalty` | `--presence-penalty` |
| `max_tokens` | `--context-size` |
`max_tokens` controls the **maximum output tokens** per chat completion request. To set the engine's **total context window**, use `provider_opts.context_size`:

`runtime_flags` always take priority over derived flags on conflict.
```yaml
models:
local:
provider: dmr
model: ai/qwen3
max_tokens: 4096 # max output tokens (per-request)
provider_opts:
context_size: 32768 # total context window (sent via _configure)
```

If `context_size` is omitted, Model Runner uses its default. `max_tokens` is **not** used as the context window.

## Thinking / reasoning budget

When using the **llama.cpp** backend, `thinking_budget` is sent as a structured `llamacpp.reasoning-budget` field on `_configure` (which maps to `--reasoning-budget`). Named effort levels use the same token mapping as other providers; `adaptive` maps to unlimited (`-1`).

When using the **vLLM** backend, `thinking_budget` is sent as `thinking_token_budget` in each chat completion request. Effort levels map to token counts using the same scale as other providers; `adaptive` maps to unlimited (`-1`).

```yaml
models:
local:
provider: dmr
model: ai/qwen3
thinking_budget: medium # llama.cpp: reasoning-budget=8192; vLLM: thinking_token_budget=8192
```

On **MLX** and **SGLang** backends, `thinking_budget` is silently ignored — those engines do not currently expose a per-request reasoning token budget knob.
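
The effort-to-token mapping can be sketched as follows. Only `medium` (8192) and `adaptive` (`-1`) are stated on this page, so the `low` and `high` values below are illustrative placeholders, not the provider's actual scale:

```go
package main

import "fmt"

// thinkingBudgetTokens sketches the effort→token mapping described above.
// "medium" (8192) and "adaptive" (-1) come from the docs; "low" and
// "high" are placeholder values for illustration only.
func thinkingBudgetTokens(effort string) int {
	switch effort {
	case "adaptive":
		return -1 // unlimited
	case "low":
		return 2048 // placeholder
	case "medium":
		return 8192
	case "high":
		return 32768 // placeholder
	default:
		return -1
	}
}

func main() {
	fmt.Println(thinkingBudgetTokens("medium")) // 8192
}
```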

## vLLM-specific configuration

When running a model on the **vLLM** backend, additional engine-level settings can be passed via `provider_opts` and are forwarded to model-runner's `_configure` endpoint:

- `gpu_memory_utilization` — fraction of GPU memory (0.0–1.0) vLLM may use. Values outside this range are rejected.
- `hf_overrides` — map of Hugging Face config overrides applied when vLLM loads the model.

```yaml
models:
vllm-local:
provider: dmr
model: ai/some-model-safetensors
provider_opts:
gpu_memory_utilization: 0.9
hf_overrides:
max_model_len: 8192
dtype: bfloat16
```

`hf_overrides` keys (including nested ones) must match `^[a-zA-Z_][a-zA-Z0-9_]*$` — the same rule model-runner enforces server-side to block injection via flags. Invalid keys are rejected at client creation time so you fail fast instead of after a round-trip.
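
A minimal client-side check equivalent to that rule might look like this. This is a sketch, and `checkOverrideKeys` is a hypothetical helper, not the actual implementation:

```go
package main

import (
	"fmt"
	"regexp"
)

// validKey mirrors the server-side rule described above: keys must match
// ^[a-zA-Z_][a-zA-Z0-9_]*$ at every nesting level.
var validKey = regexp.MustCompile(`^[a-zA-Z_][a-zA-Z0-9_]*$`)

// checkOverrideKeys walks a decoded hf_overrides map and rejects any key
// (including nested ones) that fails the pattern.
func checkOverrideKeys(m map[string]any) error {
	for k, v := range m {
		if !validKey.MatchString(k) {
			return fmt.Errorf("invalid hf_overrides key: %q", k)
		}
		if nested, ok := v.(map[string]any); ok {
			if err := checkOverrideKeys(nested); err != nil {
				return err
			}
		}
	}
	return nil
}

func main() {
	ok := map[string]any{"max_model_len": 8192, "rope_scaling": map[string]any{"factor": 2}}
	fmt.Println(checkOverrideKeys(ok)) // <nil>
	bad := map[string]any{"--gpu-memory-utilization": 1}
	fmt.Println(checkOverrideKeys(bad) != nil) // true
}
```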

These options are ignored on non-vLLM backends.

## Keeping models resident in memory (`keep_alive`)

By default model-runner unloads idle models after a few minutes. Override the idle timeout via `provider_opts.keep_alive`:

```yaml
models:
sticky:
provider: dmr
model: ai/qwen3
provider_opts:
keep_alive: "30m" # duration string
# keep_alive: "0" # unload immediately after each request
# keep_alive: "-1" # keep loaded forever
```

Accepted values: any Go duration string (`"30s"`, `"5m"`, `"1h"`, `"2h30m"`), `"0"` (immediate unload), or `"-1"` (never unload). Invalid values are rejected before the configure request is sent.

## Operating mode (`mode`)

Model-runner normally infers the backend mode from the request path. You can pin it explicitly via `provider_opts.mode`:

```yaml
provider_opts:
mode: embedding # one of: completion, embedding, reranking, image-generation
```

Most agents don't need this — leave it unset unless you know you need it.

## Raw runtime flags (`raw_runtime_flags`)

`runtime_flags` (a list) is the preferred way to pass flags. If you have a pre-built command-line string you'd rather ship verbatim, use `raw_runtime_flags` instead:

```yaml
provider_opts:
raw_runtime_flags: "--threads 8 --batch-size 512"
```

Model-runner parses the string with shell-style word splitting. `runtime_flags` and `raw_runtime_flags` are mutually exclusive — setting both is an error.
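
The splitting behavior can be approximated as below. This is an illustrative sketch only: whitespace separates words and single or double quotes group words containing spaces; the real parser may handle escapes and edge cases differently:

```go
package main

import (
	"fmt"
	"strings"
)

// splitRawFlags approximates shell-style word splitting of a raw flag
// string: unquoted whitespace separates words, quoted runs stay together.
func splitRawFlags(s string) []string {
	var out []string
	var cur strings.Builder
	var quote rune // active quote character, 0 if none
	flush := func() {
		if cur.Len() > 0 {
			out = append(out, cur.String())
			cur.Reset()
		}
	}
	for _, r := range s {
		switch {
		case quote != 0: // inside quotes
			if r == quote {
				quote = 0
			} else {
				cur.WriteRune(r)
			}
		case r == '\'' || r == '"':
			quote = r
		case r == ' ' || r == '\t':
			flush()
		default:
			cur.WriteRune(r)
		}
	}
	flush()
	return out
}

func main() {
	fmt.Println(splitRawFlags("--threads 8 --batch-size 512"))
	// [--threads 8 --batch-size 512]
}
```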

## Speculative Decoding

68 changes: 59 additions & 9 deletions pkg/model/provider/dmr/client.go
@@ -7,6 +7,7 @@ import (
"errors"
"fmt"
"log/slog"
"maps"
"net/http"
"os"
"time"
@@ -54,6 +55,7 @@ type Client struct {
client openai.Client
baseURL string
httpClient *http.Client
engine string
}

// NewClient creates a new DMR client from the provided configuration
@@ -103,18 +105,28 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt

clientOptions = append(clientOptions, option.WithBaseURL(baseURL), option.WithAPIKey("")) // DMR doesn't need auth

// Build runtime flags from ModelConfig and engine
contextSize, providerRuntimeFlags, specOpts := parseDMRProviderOpts(cfg)
configFlags := buildRuntimeFlagsFromModelConfig(engine, cfg)
finalFlags, warnings := mergeRuntimeFlagsPreferUser(configFlags, providerRuntimeFlags)
for _, w := range warnings {
slog.Warn(w)
parsed, err := parseDMRProviderOpts(engine, cfg)
if err != nil {
slog.Error("DMR provider_opts invalid", "error", err, "model", cfg.Model)
return nil, err
}
slog.Debug("DMR provider_opts parsed", "model", cfg.Model, "context_size", contextSize, "runtime_flags", finalFlags, "speculative_opts", specOpts, "engine", engine)
backendCfg := buildConfigureBackendConfig(parsed.contextSize, parsed.runtimeFlags, parsed.specOpts, parsed.llamaCpp, parsed.vllm, parsed.keepAlive)
slog.Debug("DMR provider_opts parsed",
"model", cfg.Model,
"engine", engine,
"context_size", derefInt64(parsed.contextSize),
"runtime_flags", parsed.runtimeFlags,
"raw_runtime_flags", parsed.rawRuntimeFlags,
"mode", derefString(parsed.mode),
"keep_alive", derefString(parsed.keepAlive),
"speculative_opts", parsed.specOpts,
"llamacpp", parsed.llamaCpp,
"vllm", parsed.vllm,
)
// Skip model configuration when generating titles to avoid reconfiguring the model
// with different settings (e.g., smaller max_tokens) that would affect the main agent.
if !globalOptions.GeneratingTitle() {
if err := configureModel(ctx, httpClient, baseURL, cfg.Model, contextSize, finalFlags, specOpts); err != nil {
if err := configureModel(ctx, httpClient, baseURL, cfg.Model, backendCfg, parsed.mode, parsed.rawRuntimeFlags); err != nil {
slog.Debug("model configure via API skipped or failed", "error", err)
}
}
@@ -129,6 +141,7 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt
client: openai.NewClient(clientOptions...),
baseURL: baseURL,
httpClient: httpClient,
engine: engine,
}, nil
}

@@ -214,6 +227,43 @@ func (c *Client) CreateChatCompletionStream(ctx context.Context, messages []chat
}
}

// Collect per-request extra JSON fields. SetExtraFields replaces the map
// wholesale, so merge all contributors before a single Set call.
extraFields := map[string]any{}

// NoThinking: disable reasoning at the chat-template level. llama.cpp and
// vLLM both honor chat_template_kwargs.enable_thinking=false for Qwen3 /
// Hermes / DeepSeek-R1 style templates; other engines ignore unknown keys.
//
// When the caller has also set a small MaxTokens (e.g. session title
// generation sets max_tokens=20), raise it to noThinkingMinOutputTokens
// so any residual reasoning tokens the engine/template still emits can't
// starve the visible output. The nil-guard is intentional: if MaxTokens
// is unset the caller has imposed no cap, so there is nothing to floor
// and we leave max_tokens off the request (letting the engine use its
// own output budget). Mirrors the OpenAI provider (see
// pkg/model/provider/openai/client.go).
if c.ModelOptions.NoThinking() {
extraFields["chat_template_kwargs"] = map[string]any{"enable_thinking": false}
if c.ModelConfig.MaxTokens != nil && *c.ModelConfig.MaxTokens < noThinkingMinOutputTokens {
params.MaxTokens = openai.Int(noThinkingMinOutputTokens)
slog.Debug("DMR NoThinking: bumped max_tokens floor",
"from", *c.ModelConfig.MaxTokens, "to", noThinkingMinOutputTokens)
}
}

// vLLM-specific per-request fields (e.g. thinking_token_budget).
if c.engine == engineVLLM {
if fields := buildVLLMRequestFields(&c.ModelConfig); fields != nil {
maps.Copy(extraFields, fields)
}
}

if len(extraFields) > 0 {
params.SetExtraFields(extraFields)
slog.Debug("DMR extra request fields applied", "fields", extraFields)
}

// Log the request in JSON format for debugging
if requestJSON, err := json.Marshal(params); err == nil {
slog.Debug("DMR chat completion request", "request", string(requestJSON))
@@ -222,7 +272,7 @@ func (c *Client) CreateChatCompletionStream(ctx context.Context, messages []chat
}

if structuredOutput := c.ModelOptions.StructuredOutput(); structuredOutput != nil {
slog.Debug("Adding structured output to DMR request", "structured_output", structuredOutput)
slog.Debug("Adding structured output to DMR request", "name", structuredOutput.Name, "strict", structuredOutput.Strict)

params.ResponseFormat.OfJSONSchema = &openai.ResponseFormatJSONSchemaParam{
JSONSchema: openai.ResponseFormatJSONSchemaJSONSchemaParam{