Skip to content

Commit 94d9740

Browse files
committed
fix(drilldown): suppress unstable fields and unknown service injection
1 parent e4d2cdc commit 94d9740

7 files changed

Lines changed: 151 additions & 7 deletions

File tree

docs/compatibility-drilldown.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ Promotion criteria for a new family:
8989
- `index/volume_range` must expose non-empty `detected_level` series names
9090
- `detected_fields` must show parsed fields like `method`, `path`, `status`, `duration_ms`
9191
- `detected_fields` must not leak indexed labels like `app`, `cluster`, or `namespace`
92+
- `detected_fields` must suppress high-cardinality terminal timestamp fields (`timestamp_end`, `observed_timestamp_end`) so Drilldown field discovery does not trigger expensive backend stats paths that can flap into intermittent no-data responses
9293
- In hybrid field mode, `detected_fields` may expose both native dotted fields and translated aliases such as `service.name` and `service_name`
9394
- `labels` and `label/{name}/values` should stay stream-shaped; they should prefer VictoriaLogs stream metadata endpoints and only fall back to generic field endpoints for older backend versions
9495
- `detected_fields`, `detected_labels`, and `detected_field/{name}/values` should prefer native VictoriaLogs metadata lookups where they map cleanly, then fall back to bounded raw-log sampling for parsed and derived fields
@@ -103,6 +104,7 @@ Promotion criteria for a new family:
103104
- Mixed parser query path: `| json ... | logfmt | drop __error__, __error_details__`
104105
- Labels object parsing in returned log frames
105106
- App-level field suppression for `detected_level`, `level`, and `level_extracted`
107+
- High-cardinality terminal timestamp keys (`timestamp_end`, `observed_timestamp_end`) are excluded from Drilldown detected-field responses while regular parsed fields stay visible
106108
- `1.x` service-selection buckets, detected-fields filtering, and labels field parsing stay explicit in the source-contract checks
107109
- `2.x` detected-level default columns, field-values breakdown scenes, and additional label-tab wiring stay explicit in the source-contract checks
108110
- Grafana runtime `11.x` explicitly asserts `1.x`-style service buckets, filtered detected fields, and extra label values at runtime

docs/compatibility-loki.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ The required matrix is intentionally not limited to happy-path selectors. It now
120120
- `pattern` parser extraction semantics
121121
- set-style binary operators such as `or` and `unless`
122122
- `bool` comparison semantics on metric expressions
123+
- metric aggregations that do not group by service labels must not receive synthetic `service_name="unknown_service"` in query/query_range responses
123124
- invalid log/metric shape rejections that must fail with the same class of error as Loki
124125

125126
When a new LogQL family is implemented or fixed in the proxy, the expectation is to add:

internal/proxy/drilldown.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,18 @@ func shouldSuppressDetectedField(label string) bool {
6666
return suppressed
6767
}
6868

69+
func hasServiceSignal(labels map[string]string) bool {
70+
if len(labels) == 0 {
71+
return false
72+
}
73+
for _, key := range serviceNameSourceFields {
74+
if strings.TrimSpace(labels[key]) != "" {
75+
return true
76+
}
77+
}
78+
return false
79+
}
80+
6981
func deriveServiceName(labels map[string]string) string {
7082
for _, key := range serviceNameSourceFields {
7183
if value := strings.TrimSpace(labels[key]); value != "" {

internal/proxy/drilldown_compat_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -925,6 +925,47 @@ func TestDrilldown_DetectedFields_ParseStructuredLogsInsteadOfIndexedLabels(t *t
925925
}
926926
}
927927

928+
func TestDrilldown_DetectedFields_SuppressHighCardinalityTimestampTerminals(t *testing.T) {
929+
vlBackend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
930+
switch r.URL.Path {
931+
case "/select/logsql/field_names":
932+
w.Header().Set("Content-Type", "application/json")
933+
w.Write([]byte(`{"values":[{"value":"app","hits":1},{"value":"cluster","hits":1},{"value":"timestamp_end","hits":1},{"value":"observed_timestamp_end","hits":1}]}`))
934+
case "/select/logsql/query":
935+
w.Header().Set("Content-Type", "application/x-ndjson")
936+
w.Write([]byte(`{"_time":"2026-04-04T17:18:49.971082Z","_msg":"{\"method\":\"GET\",\"path\":\"/api/v1/users\",\"timestamp_end\":\"2026-04-04T17:18:49.971082Z\",\"observed_timestamp_end\":\"2026-04-04T17:18:49.971082Z\"}","_stream":"{app=\"api-gateway\",cluster=\"us-east-1\"}","app":"api-gateway","cluster":"us-east-1","timestamp_end":"2026-04-04T17:18:49.971082Z","observed_timestamp_end":"2026-04-04T17:18:49.971082Z","level":"info"}` + "\n"))
937+
default:
938+
t.Fatalf("unexpected backend path %s", r.URL.Path)
939+
}
940+
}))
941+
defer vlBackend.Close()
942+
943+
resp := doGet(t, vlBackend.URL, "/loki/api/v1/detected_fields?query=%7Bapp%3D%22api-gateway%22%7D")
944+
fields, ok := resp["fields"].([]interface{})
945+
if !ok {
946+
t.Fatalf("expected fields array, got %v", resp)
947+
}
948+
949+
got := make(map[string]struct{}, len(fields))
950+
for _, field := range fields {
951+
obj := field.(map[string]interface{})
952+
got[obj["label"].(string)] = struct{}{}
953+
}
954+
955+
if _, exists := got["timestamp_end"]; exists {
956+
t.Fatalf("timestamp_end must be suppressed from detected_fields: %v", got)
957+
}
958+
if _, exists := got["observed_timestamp_end"]; exists {
959+
t.Fatalf("observed_timestamp_end must be suppressed from detected_fields: %v", got)
960+
}
961+
if _, exists := got["method"]; !exists {
962+
t.Fatalf("expected parsed field method to remain discoverable, got %v", got)
963+
}
964+
if _, exists := got["detected_level"]; !exists {
965+
t.Fatalf("expected detected_level summary to remain discoverable, got %v", got)
966+
}
967+
}
968+
928969
func TestDrilldown_DetectedFields_ExposeStructuredMetadataWithDottedNames(t *testing.T) {
929970
vlBackend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
930971
switch r.URL.Path {

internal/proxy/proxy.go

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10249,13 +10249,7 @@ func (p *Proxy) translateVolumeMetric(fields map[string]string) map[string]strin
1024910249
if translated == nil {
1025010250
return nil
1025110251
}
10252-
serviceSignal := false
10253-
for _, key := range serviceNameSourceFields {
10254-
if strings.TrimSpace(translated[key]) != "" {
10255-
serviceSignal = true
10256-
break
10257-
}
10258-
}
10252+
serviceSignal := hasServiceSignal(translated)
1025910253
ensureSyntheticServiceName(translated)
1026010254
if !serviceSignal && strings.TrimSpace(translated["service_name"]) == unknownServiceName {
1026110255
delete(translated, "service_name")
@@ -13095,9 +13089,13 @@ func (p *Proxy) translateStatsResponseLabelsWithContext(ctx context.Context, bod
1309513089
syntheticLabels[key] = s
1309613090
}
1309713091
}
13092+
serviceSignal := hasServiceSignal(syntheticLabels)
1309813093
beforeSyntheticCount := len(syntheticLabels)
1309913094
ensureDetectedLevel(syntheticLabels)
1310013095
ensureSyntheticServiceName(syntheticLabels)
13096+
if !serviceSignal && strings.TrimSpace(syntheticLabels["service_name"]) == unknownServiceName {
13097+
delete(syntheticLabels, "service_name")
13098+
}
1310113099
if len(syntheticLabels) != beforeSyntheticCount {
1310213100
changed = true
1310313101
}

internal/proxy/request_helpers_coverage_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,4 +105,23 @@ func TestTranslateStatsResponseLabelsWithContext_Coverage(t *testing.T) {
105105
if metric["service_name"] != "api" || metric["detected_level"] != "warn" {
106106
t.Fatalf("unexpected translated metric labels %#v", metric)
107107
}
108+
109+
noServiceBody := []byte(`{"result":[{"metric":{"k8s_cluster_name":"ops-sand"}}]}`)
110+
noServiceGot := p.translateStatsResponseLabelsWithContext(context.Background(), noServiceBody, `sum by(k8s_cluster_name) (rate({deployment_environment="dev"}[1m]))`)
111+
112+
var noServiceResp struct {
113+
Result []struct {
114+
Metric map[string]interface{} `json:"metric"`
115+
} `json:"result"`
116+
}
117+
if err := json.Unmarshal(noServiceGot, &noServiceResp); err != nil {
118+
t.Fatalf("decode no-service stats response: %v", err)
119+
}
120+
noServiceMetric := noServiceResp.Result[0].Metric
121+
if _, exists := noServiceMetric["service_name"]; exists {
122+
t.Fatalf("unexpected synthetic service_name for metric without service signal: %#v", noServiceMetric)
123+
}
124+
if noServiceMetric["k8s_cluster_name"] != "ops-sand" {
125+
t.Fatalf("expected original metric labels to remain intact, got %#v", noServiceMetric)
126+
}
108127
}

test/e2e-compat/drilldown_compat_test.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,77 @@ func TestDrilldown_DetectedFieldsMatchStructuredLogs(t *testing.T) {
203203
}
204204
}
205205

206+
func TestDrilldown_DetectedFieldsSuppressTimestampTerminalFields(t *testing.T) {
207+
ensureDataIngested(t)
208+
209+
now := time.Now()
210+
pushStream(t, now, streamDef{
211+
Labels: map[string]string{
212+
"app": "timestamp-heavy-service", "namespace": "prod", "env": "production",
213+
"cluster": "us-east-1", "pod": "timestamp-heavy-service-abc123",
214+
"container": "timestamp-heavy-service", "level": "info",
215+
},
216+
Lines: []string{
217+
`{"method":"GET","path":"/api/v1/slow","status":200,"timestamp_end":"2026-04-21T08:34:59.661518472Z","observed_timestamp_end":"2026-04-21T08:34:59.661518472Z"}`,
218+
`{"method":"POST","path":"/api/v1/slow","status":201,"timestamp_end":"2026-04-21T08:35:00.139166515Z","observed_timestamp_end":"2026-04-21T08:35:00.139166515Z"}`,
219+
},
220+
})
221+
time.Sleep(3 * time.Second)
222+
223+
params := url.Values{}
224+
params.Set("query", `{service_name="timestamp-heavy-service"}`)
225+
params.Set("start", fmt.Sprintf("%d", now.Add(-15*time.Minute).UnixNano()))
226+
params.Set("end", fmt.Sprintf("%d", now.Add(15*time.Minute).UnixNano()))
227+
228+
proxyResp := getJSON(t, proxyURL+"/loki/api/v1/detected_fields?"+params.Encode())
229+
proxyFields, _ := proxyResp["fields"].([]interface{})
230+
if len(proxyFields) == 0 {
231+
t.Fatalf("proxy detected_fields empty for timestamp-heavy service query: %v", proxyResp)
232+
}
233+
234+
proxyLabels := make(map[string]bool, len(proxyFields))
235+
for _, field := range proxyFields {
236+
proxyLabels[field.(map[string]interface{})["label"].(string)] = true
237+
}
238+
239+
if !proxyLabels["method"] {
240+
t.Fatalf("proxy detected_fields missing parsed method field: %v", proxyResp)
241+
}
242+
for _, suppressed := range []string{"timestamp_end", "observed_timestamp_end"} {
243+
if proxyLabels[suppressed] {
244+
t.Fatalf("proxy detected_fields must suppress high-cardinality field %q: %v", suppressed, proxyResp)
245+
}
246+
}
247+
}
248+
249+
func TestDrilldown_StatsQueryDoesNotInjectUnknownServiceName(t *testing.T) {
250+
ensureDataIngested(t)
251+
now := time.Now()
252+
253+
params := url.Values{}
254+
params.Set("query", `sum by(cluster) (rate({env="production",cluster="us-east-1"}[1m]))`)
255+
params.Set("start", fmt.Sprintf("%d", now.Add(-15*time.Minute).UnixNano()))
256+
params.Set("end", fmt.Sprintf("%d", now.UnixNano()))
257+
params.Set("step", "60")
258+
259+
proxyResp := getJSON(t, proxyURL+"/loki/api/v1/query_range?"+params.Encode())
260+
data := extractMap(proxyResp, "data")
261+
if data == nil {
262+
t.Fatalf("expected query_range data envelope, got %v", proxyResp)
263+
}
264+
result := extractArray(data, "result")
265+
if len(result) == 0 {
266+
t.Fatalf("expected non-empty stats result for cluster aggregation query, got %v", proxyResp)
267+
}
268+
269+
for _, item := range result {
270+
metric := item.(map[string]interface{})["metric"].(map[string]interface{})
271+
if _, exists := metric["service_name"]; exists {
272+
t.Fatalf("stats metric without service grouping must not inject synthetic service_name: %v", metric)
273+
}
274+
}
275+
}
276+
206277
func TestDrilldown_GrafanaResourceContracts(t *testing.T) {
207278
ensureDataIngested(t)
208279
ingestPatternData(t)

0 commit comments

Comments
 (0)