Skip to content

Commit 2e7bbfe

Browse files
committed
improve DMR support
- adds 'context_size' provider_opt for DMR usage, instead of giving 'max_tokens' double responsibility, to avoid confusion
- improves how flags are sent to the DMR model/runtime configuration endpoint

Signed-off-by: Christopher Petito <chrisjpetito@gmail.com>
1 parent 42ca212 commit 2e7bbfe

File tree

4 files changed

+597
-207
lines changed

4 files changed

+597
-207
lines changed

docs/providers/dmr/index.md

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -64,29 +64,49 @@ models:
6464
model: ai/qwen3
6565
max_tokens: 8192
6666
provider_opts:
67-
runtime_flags: ["--ngl=33", "--top-p=0.9"]
67+
runtime_flags: ["--threads", "8"]
6868
```
6969

7070
Runtime flags also accept a single string:
7171

7272
```yaml
7373
provider_opts:
74-
runtime_flags: "--ngl=33 --top-p=0.9"
74+
runtime_flags: "--threads 8"
7575
```
7676

77-
## Parameter Mapping
77+
Use only flags your Model Runner backend allows (see `docker model configure --help` and backend docs). **Do not** put sampling parameters (`temperature`, `top_p`, penalties) in `runtime_flags` — set them on the model (`temperature`, `top_p`, etc.); they are sent **per request** via the OpenAI-compatible chat API.
7878

79-
docker-agent model config fields map to llama.cpp flags automatically:
79+
## Context size
8080

81-
| Config | llama.cpp Flag |
82-
| ------------------- | --------------------- |
83-
| `temperature` | `--temp` |
84-
| `top_p` | `--top-p` |
85-
| `frequency_penalty` | `--frequency-penalty` |
86-
| `presence_penalty` | `--presence-penalty` |
87-
| `max_tokens` | `--context-size` |
81+
`max_tokens` controls the **maximum output tokens** per chat completion request. To set the engine's **total context window**, use `provider_opts.context_size`:
8882

89-
`runtime_flags` always take priority over derived flags on conflict.
83+
```yaml
84+
models:
85+
local:
86+
provider: dmr
87+
model: ai/qwen3
88+
max_tokens: 4096 # max output tokens (per-request)
89+
provider_opts:
90+
context_size: 32768 # total context window (sent via _configure)
91+
```
92+
93+
If `context_size` is omitted, Model Runner uses its default. `max_tokens` is **not** used as the context window.
94+
95+
## Thinking / reasoning budget
96+
97+
When using the **llama.cpp** backend, `thinking_budget` is sent as structured `llamacpp.reasoning-budget` on `_configure` (maps to `--reasoning-budget`). String effort levels use the same token mapping as other providers; `adaptive` maps to unlimited (`-1`).
98+
99+
When using the **vLLM** backend, `thinking_budget` is sent as `thinking_token_budget` in each chat completion request. Effort levels map to token counts using the same scale as other providers; `adaptive` maps to unlimited (`-1`).
100+
101+
```yaml
102+
models:
103+
local:
104+
provider: dmr
105+
model: ai/qwen3
106+
thinking_budget: medium # llama.cpp: reasoning-budget=8192; vLLM: thinking_token_budget=8192
107+
```
108+
109+
On **MLX** and **SGLang** backends, `thinking_budget` is silently ignored — those engines do not currently expose a per-request reasoning token budget knob.
90110

91111
## Speculative Decoding
92112

pkg/model/provider/dmr/client.go

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ type Client struct {
5454
client openai.Client
5555
baseURL string
5656
httpClient *http.Client
57+
engine string
5758
}
5859

5960
// NewClient creates a new DMR client from the provided configuration
@@ -103,18 +104,13 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt
103104

104105
clientOptions = append(clientOptions, option.WithBaseURL(baseURL), option.WithAPIKey("")) // DMR doesn't need auth
105106

106-
// Build runtime flags from ModelConfig and engine
107-
contextSize, providerRuntimeFlags, specOpts := parseDMRProviderOpts(cfg)
108-
configFlags := buildRuntimeFlagsFromModelConfig(engine, cfg)
109-
finalFlags, warnings := mergeRuntimeFlagsPreferUser(configFlags, providerRuntimeFlags)
110-
for _, w := range warnings {
111-
slog.Warn(w)
112-
}
113-
slog.Debug("DMR provider_opts parsed", "model", cfg.Model, "context_size", contextSize, "runtime_flags", finalFlags, "speculative_opts", specOpts, "engine", engine)
107+
contextSize, runtimeFlags, specOpts, llamaCpp := parseDMRProviderOpts(engine, cfg)
108+
backendCfg := buildConfigureBackendConfig(contextSize, runtimeFlags, specOpts, llamaCpp)
109+
slog.Debug("DMR provider_opts parsed", "model", cfg.Model, "context_size", derefInt64(contextSize), "runtime_flags", runtimeFlags, "speculative_opts", specOpts, "llamacpp", llamaCpp, "engine", engine)
114110
// Skip model configuration when generating titles to avoid reconfiguring the model
115111
// with different settings (e.g., smaller max_tokens) that would affect the main agent.
116112
if !globalOptions.GeneratingTitle() {
117-
if err := configureModel(ctx, httpClient, baseURL, cfg.Model, contextSize, finalFlags, specOpts); err != nil {
113+
if err := configureModel(ctx, httpClient, baseURL, cfg.Model, backendCfg); err != nil {
118114
slog.Debug("model configure via API skipped or failed", "error", err)
119115
}
120116
}
@@ -129,6 +125,7 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt
129125
client: openai.NewClient(clientOptions...),
130126
baseURL: baseURL,
131127
httpClient: httpClient,
128+
engine: engine,
132129
}, nil
133130
}
134131

@@ -214,6 +211,14 @@ func (c *Client) CreateChatCompletionStream(ctx context.Context, messages []chat
214211
}
215212
}
216213

214+
// For vLLM, apply engine-specific per-request fields (e.g. thinking_token_budget).
215+
if c.engine == engineVLLM {
216+
if fields := buildVLLMRequestFields(&c.ModelConfig); fields != nil {
217+
params.SetExtraFields(fields)
218+
slog.Debug("DMR vLLM extra request fields applied", "fields", fields)
219+
}
220+
}
221+
217222
// Log the request in JSON format for debugging
218223
if requestJSON, err := json.Marshal(params); err == nil {
219224
slog.Debug("DMR chat completion request", "request", string(requestJSON))
@@ -222,7 +227,7 @@ func (c *Client) CreateChatCompletionStream(ctx context.Context, messages []chat
222227
}
223228

224229
if structuredOutput := c.ModelOptions.StructuredOutput(); structuredOutput != nil {
225-
slog.Debug("Adding structured output to DMR request", "structured_output", structuredOutput)
230+
slog.Debug("Adding structured output to DMR request", "name", structuredOutput.Name, "strict", structuredOutput.Strict)
226231

227232
params.ResponseFormat.OfJSONSchema = &openai.ResponseFormatJSONSchemaParam{
228233
JSONSchema: openai.ResponseFormatJSONSchemaJSONSchemaParam{

0 commit comments

Comments
 (0)