Skip to content

Commit ac7f76c

Browse files
authored
fix(drilldown): remove unknown_service from non-service metrics (#232)
* fix(drilldown): suppress high-cardinality timestamp terminal fields * fix(drilldown): suppress unstable fields and unknown service injection * docs(changelog): add unreleased drilldown and metrics fixes
1 parent 4c20431 commit ac7f76c

9 files changed

Lines changed: 194 additions & 7 deletions

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Bug Fixes
11+
12+
- metrics/query-range: stop injecting synthetic `service_name="unknown_service"` into metric `query`/`query_range` responses when the result labelset has no real service signal (for example `sum by(cluster)(rate(...))`).
13+
- drilldown/fields: suppress high-cardinality terminal timestamp fields (`timestamp_end`, `observed_timestamp_end`) from `detected_fields` responses to keep Drilldown field discovery stable under repeated refreshes.
14+
15+
### Tests
16+
17+
- drilldown/compat: add unit and e2e regressions for non-service metric aggregations (no synthetic `unknown_service`) and detected-field suppression of terminal timestamp keys.
18+
1019
## [1.12.0] - 2026-04-21
1120

1221
### Bug Fixes

docs/compatibility-drilldown.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ Promotion criteria for a new family:
8989
- `index/volume_range` must expose non-empty `detected_level` series names
9090
- `detected_fields` must show parsed fields like `method`, `path`, `status`, `duration_ms`
9191
- `detected_fields` must not leak indexed labels like `app`, `cluster`, or `namespace`
92+
- `detected_fields` must suppress high-cardinality terminal timestamp fields (`timestamp_end`, `observed_timestamp_end`) so Drilldown field discovery does not trigger expensive backend stats paths that can flap into intermittent no-data responses
9293
- In hybrid field mode, `detected_fields` may expose both native dotted fields and translated aliases such as `service.name` and `service_name`
9394
- `labels` and `label/{name}/values` should stay stream-shaped; they should prefer VictoriaLogs stream metadata endpoints and only fall back to generic field endpoints for older backend versions
9495
- `detected_fields`, `detected_labels`, and `detected_field/{name}/values` should prefer native VictoriaLogs metadata lookups where they map cleanly, then fall back to bounded raw-log sampling for parsed and derived fields
@@ -103,6 +104,7 @@ Promotion criteria for a new family:
103104
- Mixed parser query path: `| json ... | logfmt | drop __error__, __error_details__`
104105
- Labels object parsing in returned log frames
105106
- App-level field suppression for `detected_level`, `level`, and `level_extracted`
107+
- High-cardinality terminal timestamp keys (`timestamp_end`, `observed_timestamp_end`) are excluded from Drilldown detected-field responses while regular parsed fields stay visible
106108
- `1.x` service-selection buckets, detected-fields filtering, and labels field parsing stay explicit in the source-contract checks
107109
- `2.x` detected-level default columns, field-values breakdown scenes, and additional label-tab wiring stay explicit in the source-contract checks
108110
- Grafana runtime `11.x` explicitly asserts `1.x`-style service buckets, filtered detected fields, and extra label values at runtime

docs/compatibility-loki.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ The required matrix is intentionally not limited to happy-path selectors. It now
120120
- `pattern` parser extraction semantics
121121
- set-style binary operators such as `or` and `unless`
122122
- `bool` comparison semantics on metric expressions
123+
- metric aggregations that do not group by service labels must not receive synthetic `service_name="unknown_service"` in query/query_range responses
123124
- invalid log/metric shape rejections that must fail with the same class of error as Loki
124125

125126
When a new LogQL family is implemented or fixed in the proxy, the expectation is to add:

internal/proxy/drilldown.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ import (
1818
const unknownServiceName = "unknown_service"
1919
const detectedFieldsSampleLimit = 500
2020

21+
var suppressedDetectedFieldNames = map[string]struct{}{
22+
"timestamp_end": {},
23+
"observed_timestamp_end": {},
24+
}
25+
2126
var serviceNameSourceFields = []string{
2227
"service_name",
2328
"service.name",
@@ -52,6 +57,27 @@ type detectedLabelSummary struct {
5257
values map[string]struct{}
5358
}
5459

60+
func shouldSuppressDetectedField(label string) bool {
61+
normalized := strings.ToLower(strings.TrimSpace(label))
62+
if normalized == "" {
63+
return false
64+
}
65+
_, suppressed := suppressedDetectedFieldNames[normalized]
66+
return suppressed
67+
}
68+
69+
func hasServiceSignal(labels map[string]string) bool {
70+
if len(labels) == 0 {
71+
return false
72+
}
73+
for _, key := range serviceNameSourceFields {
74+
if strings.TrimSpace(labels[key]) != "" {
75+
return true
76+
}
77+
}
78+
return false
79+
}
80+
5581
func deriveServiceName(labels map[string]string) string {
5682
for _, key := range serviceNameSourceFields {
5783
if value := strings.TrimSpace(labels[key]); value != "" {
@@ -177,6 +203,9 @@ func addDetectedField(fields map[string]*detectedFieldSummary, label, parser, ty
177203
if label == "" || strings.TrimSpace(value) == "" {
178204
return
179205
}
206+
if shouldSuppressDetectedField(label) {
207+
return
208+
}
180209
summary := fields[label]
181210
if summary == nil {
182211
summary = &detectedFieldSummary{
@@ -1411,6 +1440,9 @@ func (p *Proxy) detectNativeFields(ctx context.Context, query, start, end string
14111440
out := make(map[string]*detectedFieldSummary, len(fieldNames))
14121441
for _, field := range fieldNames {
14131442
for _, exposure := range p.metadataFieldExposures(field) {
1443+
if shouldSuppressDetectedField(exposure.name) {
1444+
continue
1445+
}
14141446
out[exposure.name] = &detectedFieldSummary{
14151447
label: exposure.name,
14161448
typ: "string",
@@ -1443,6 +1475,9 @@ func mergeNativeDetectedFields(scanned []map[string]interface{}, scannedValues m
14431475
merged[label] = copied
14441476
}
14451477
for label, summary := range native {
1478+
if shouldSuppressDetectedField(label) {
1479+
continue
1480+
}
14461481
if existing, ok := merged[label]; ok {
14471482
if card, ok := existing["cardinality"].(int); ok && card > 0 {
14481483
continue

internal/proxy/drilldown_compat_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -925,6 +925,47 @@ func TestDrilldown_DetectedFields_ParseStructuredLogsInsteadOfIndexedLabels(t *t
925925
}
926926
}
927927

928+
func TestDrilldown_DetectedFields_SuppressHighCardinalityTimestampTerminals(t *testing.T) {
929+
vlBackend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
930+
switch r.URL.Path {
931+
case "/select/logsql/field_names":
932+
w.Header().Set("Content-Type", "application/json")
933+
w.Write([]byte(`{"values":[{"value":"app","hits":1},{"value":"cluster","hits":1},{"value":"timestamp_end","hits":1},{"value":"observed_timestamp_end","hits":1}]}`))
934+
case "/select/logsql/query":
935+
w.Header().Set("Content-Type", "application/x-ndjson")
936+
w.Write([]byte(`{"_time":"2026-04-04T17:18:49.971082Z","_msg":"{\"method\":\"GET\",\"path\":\"/api/v1/users\",\"timestamp_end\":\"2026-04-04T17:18:49.971082Z\",\"observed_timestamp_end\":\"2026-04-04T17:18:49.971082Z\"}","_stream":"{app=\"api-gateway\",cluster=\"us-east-1\"}","app":"api-gateway","cluster":"us-east-1","timestamp_end":"2026-04-04T17:18:49.971082Z","observed_timestamp_end":"2026-04-04T17:18:49.971082Z","level":"info"}` + "\n"))
937+
default:
938+
t.Fatalf("unexpected backend path %s", r.URL.Path)
939+
}
940+
}))
941+
defer vlBackend.Close()
942+
943+
resp := doGet(t, vlBackend.URL, "/loki/api/v1/detected_fields?query=%7Bapp%3D%22api-gateway%22%7D")
944+
fields, ok := resp["fields"].([]interface{})
945+
if !ok {
946+
t.Fatalf("expected fields array, got %v", resp)
947+
}
948+
949+
got := make(map[string]struct{}, len(fields))
950+
for _, field := range fields {
951+
obj := field.(map[string]interface{})
952+
got[obj["label"].(string)] = struct{}{}
953+
}
954+
955+
if _, exists := got["timestamp_end"]; exists {
956+
t.Fatalf("timestamp_end must be suppressed from detected_fields: %v", got)
957+
}
958+
if _, exists := got["observed_timestamp_end"]; exists {
959+
t.Fatalf("observed_timestamp_end must be suppressed from detected_fields: %v", got)
960+
}
961+
if _, exists := got["method"]; !exists {
962+
t.Fatalf("expected parsed field method to remain discoverable, got %v", got)
963+
}
964+
if _, exists := got["detected_level"]; !exists {
965+
t.Fatalf("expected detected_level summary to remain discoverable, got %v", got)
966+
}
967+
}
968+
928969
func TestDrilldown_DetectedFields_ExposeStructuredMetadataWithDottedNames(t *testing.T) {
929970
vlBackend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
930971
switch r.URL.Path {

internal/proxy/drilldown_coverage_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ func TestDrilldownHelpers_AdditionalCoverage(t *testing.T) {
5454
addDetectedField(fields, "", "", "string", nil, "ignored")
5555
addDetectedField(fields, "duration", "json", "int", []string{"duration"}, "10")
5656
addDetectedField(fields, "duration", "logfmt", "string", nil, "ten")
57+
addDetectedField(fields, "timestamp_end", "json", "string", nil, "2026-04-21T10:00:00Z")
58+
addDetectedField(fields, "observed_timestamp_end", "json", "string", nil, "2026-04-21T10:00:00Z")
5759

5860
summary := fields["duration"]
5961
if summary == nil {
@@ -68,6 +70,15 @@ func TestDrilldownHelpers_AdditionalCoverage(t *testing.T) {
6870
if len(summary.jsonPath) != 1 || summary.jsonPath[0] != "duration" {
6971
t.Fatalf("expected original json path to be preserved, got %#v", summary.jsonPath)
7072
}
73+
if _, exists := fields["timestamp_end"]; exists {
74+
t.Fatal("expected suppressed high-cardinality timestamp_end field to be ignored")
75+
}
76+
if _, exists := fields["observed_timestamp_end"]; exists {
77+
t.Fatal("expected suppressed high-cardinality observed_timestamp_end field to be ignored")
78+
}
79+
if !shouldSuppressDetectedField("timestamp_end") {
80+
t.Fatal("expected timestamp_end to be part of suppression list")
81+
}
7182
if got := asString(map[string]any{"line": "hello\nworld"}); !strings.Contains(got, "hello") || strings.Contains(got, "\n") {
7283
t.Fatalf("expected asString to flatten JSON, got %q", got)
7384
}

internal/proxy/proxy.go

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10249,13 +10249,7 @@ func (p *Proxy) translateVolumeMetric(fields map[string]string) map[string]strin
1024910249
if translated == nil {
1025010250
return nil
1025110251
}
10252-
serviceSignal := false
10253-
for _, key := range serviceNameSourceFields {
10254-
if strings.TrimSpace(translated[key]) != "" {
10255-
serviceSignal = true
10256-
break
10257-
}
10258-
}
10252+
serviceSignal := hasServiceSignal(translated)
1025910253
ensureSyntheticServiceName(translated)
1026010254
if !serviceSignal && strings.TrimSpace(translated["service_name"]) == unknownServiceName {
1026110255
delete(translated, "service_name")
@@ -13095,9 +13089,13 @@ func (p *Proxy) translateStatsResponseLabelsWithContext(ctx context.Context, bod
1309513089
syntheticLabels[key] = s
1309613090
}
1309713091
}
13092+
serviceSignal := hasServiceSignal(syntheticLabels)
1309813093
beforeSyntheticCount := len(syntheticLabels)
1309913094
ensureDetectedLevel(syntheticLabels)
1310013095
ensureSyntheticServiceName(syntheticLabels)
13096+
if !serviceSignal && strings.TrimSpace(syntheticLabels["service_name"]) == unknownServiceName {
13097+
delete(syntheticLabels, "service_name")
13098+
}
1310113099
if len(syntheticLabels) != beforeSyntheticCount {
1310213100
changed = true
1310313101
}

internal/proxy/request_helpers_coverage_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,4 +105,23 @@ func TestTranslateStatsResponseLabelsWithContext_Coverage(t *testing.T) {
105105
if metric["service_name"] != "api" || metric["detected_level"] != "warn" {
106106
t.Fatalf("unexpected translated metric labels %#v", metric)
107107
}
108+
109+
noServiceBody := []byte(`{"result":[{"metric":{"k8s_cluster_name":"ops-sand"}}]}`)
110+
noServiceGot := p.translateStatsResponseLabelsWithContext(context.Background(), noServiceBody, `sum by(k8s_cluster_name) (rate({deployment_environment="dev"}[1m]))`)
111+
112+
var noServiceResp struct {
113+
Result []struct {
114+
Metric map[string]interface{} `json:"metric"`
115+
} `json:"result"`
116+
}
117+
if err := json.Unmarshal(noServiceGot, &noServiceResp); err != nil {
118+
t.Fatalf("decode no-service stats response: %v", err)
119+
}
120+
noServiceMetric := noServiceResp.Result[0].Metric
121+
if _, exists := noServiceMetric["service_name"]; exists {
122+
t.Fatalf("unexpected synthetic service_name for metric without service signal: %#v", noServiceMetric)
123+
}
124+
if noServiceMetric["k8s_cluster_name"] != "ops-sand" {
125+
t.Fatalf("expected original metric labels to remain intact, got %#v", noServiceMetric)
126+
}
108127
}

test/e2e-compat/drilldown_compat_test.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,77 @@ func TestDrilldown_DetectedFieldsMatchStructuredLogs(t *testing.T) {
203203
}
204204
}
205205

206+
func TestDrilldown_DetectedFieldsSuppressTimestampTerminalFields(t *testing.T) {
207+
ensureDataIngested(t)
208+
209+
now := time.Now()
210+
pushStream(t, now, streamDef{
211+
Labels: map[string]string{
212+
"app": "timestamp-heavy-service", "namespace": "prod", "env": "production",
213+
"cluster": "us-east-1", "pod": "timestamp-heavy-service-abc123",
214+
"container": "timestamp-heavy-service", "level": "info",
215+
},
216+
Lines: []string{
217+
`{"method":"GET","path":"/api/v1/slow","status":200,"timestamp_end":"2026-04-21T08:34:59.661518472Z","observed_timestamp_end":"2026-04-21T08:34:59.661518472Z"}`,
218+
`{"method":"POST","path":"/api/v1/slow","status":201,"timestamp_end":"2026-04-21T08:35:00.139166515Z","observed_timestamp_end":"2026-04-21T08:35:00.139166515Z"}`,
219+
},
220+
})
221+
time.Sleep(3 * time.Second)
222+
223+
params := url.Values{}
224+
params.Set("query", `{service_name="timestamp-heavy-service"}`)
225+
params.Set("start", fmt.Sprintf("%d", now.Add(-15*time.Minute).UnixNano()))
226+
params.Set("end", fmt.Sprintf("%d", now.Add(15*time.Minute).UnixNano()))
227+
228+
proxyResp := getJSON(t, proxyURL+"/loki/api/v1/detected_fields?"+params.Encode())
229+
proxyFields, _ := proxyResp["fields"].([]interface{})
230+
if len(proxyFields) == 0 {
231+
t.Fatalf("proxy detected_fields empty for timestamp-heavy service query: %v", proxyResp)
232+
}
233+
234+
proxyLabels := make(map[string]bool, len(proxyFields))
235+
for _, field := range proxyFields {
236+
proxyLabels[field.(map[string]interface{})["label"].(string)] = true
237+
}
238+
239+
if !proxyLabels["method"] {
240+
t.Fatalf("proxy detected_fields missing parsed method field: %v", proxyResp)
241+
}
242+
for _, suppressed := range []string{"timestamp_end", "observed_timestamp_end"} {
243+
if proxyLabels[suppressed] {
244+
t.Fatalf("proxy detected_fields must suppress high-cardinality field %q: %v", suppressed, proxyResp)
245+
}
246+
}
247+
}
248+
249+
func TestDrilldown_StatsQueryDoesNotInjectUnknownServiceName(t *testing.T) {
250+
ensureDataIngested(t)
251+
now := time.Now()
252+
253+
params := url.Values{}
254+
params.Set("query", `sum by(cluster) (rate({env="production",cluster="us-east-1"}[1m]))`)
255+
params.Set("start", fmt.Sprintf("%d", now.Add(-15*time.Minute).UnixNano()))
256+
params.Set("end", fmt.Sprintf("%d", now.UnixNano()))
257+
params.Set("step", "60")
258+
259+
proxyResp := getJSON(t, proxyURL+"/loki/api/v1/query_range?"+params.Encode())
260+
data := extractMap(proxyResp, "data")
261+
if data == nil {
262+
t.Fatalf("expected query_range data envelope, got %v", proxyResp)
263+
}
264+
result := extractArray(data, "result")
265+
if len(result) == 0 {
266+
t.Fatalf("expected non-empty stats result for cluster aggregation query, got %v", proxyResp)
267+
}
268+
269+
for _, item := range result {
270+
metric := item.(map[string]interface{})["metric"].(map[string]interface{})
271+
if _, exists := metric["service_name"]; exists {
272+
t.Fatalf("stats metric without service grouping must not inject synthetic service_name: %v", metric)
273+
}
274+
}
275+
}
276+
206277
func TestDrilldown_GrafanaResourceContracts(t *testing.T) {
207278
ensureDataIngested(t)
208279
ingestPatternData(t)

0 commit comments

Comments
 (0)